Source code for filip.semantics.ontology_parser.post_processer

"""
Post-processing is run after the vocabulary has been parsed from its sources.

Its goal is to add predefined values, compute the CombinedRelations, reload
user settings, and precompute information such as duplicate labels or the
sort order of relations.
"""

import datetime
import re
from typing import List, Optional

import stringcase

from filip.semantics.ontology_parser.vocabulary_builder import VocabularyBuilder
from filip.semantics.vocabulary import (
    Source,
    IdType,
    Vocabulary,
    DatatypeType,
    Datatype,
    Class,
)
from filip.semantics.vocabulary import (
    CombinedDataRelation,
    CombinedObjectRelation,
    CombinedRelation,
)


class PostProcessor:
    """Class offering postprocessing as cls-methods for a vocabulary"""

    @classmethod
    def post_process_vocabulary(
        cls, vocabulary: Vocabulary, old_vocabulary: Optional[Vocabulary] = None
    ) -> None:
        """Main method to be called for post processing

        Runs all post-processing steps of this class on the given vocabulary,
        in a fixed order. The vocabulary is edited in place.

        Args:
            vocabulary (Vocabulary): Freshly parsed Vocabulary
            old_vocabulary (Vocabulary): Existing Vocabulary of which the
                settings should be overtaken. If None, no settings are
                transferred.

        Returns:
            None
        """

        # all methods have to reset the state that they are editing first.
        # consecutive calls of post_process_vocabulary need to have the same
        # result
        voc_builder = VocabularyBuilder(vocabulary=vocabulary)

        # labels and predefined content (source, datatypes, owl:Thing)
        cls._set_labels(voc_builder)
        cls._add_predefined_source(voc_builder)
        cls._add_predefined_datatypes(voc_builder)
        cls._add_owl_thing(voc_builder)
        cls._remove_duplicate_parents(voc_builder)

        # class hierarchy and relations; the ancestors are computed first as
        # the child classes and the combined relations are derived from them
        cls._log_and_clear_dependencies(voc_builder)
        cls._compute_ancestor_classes(voc_builder)
        cls._compute_child_classes(voc_builder)
        cls._combine_relations(voc_builder)

        # carry over the user made settings of the previous vocabulary, if
        # one was given
        if old_vocabulary is not None:
            cls.transfer_settings(
                new_vocabulary=vocabulary, old_vocabulary=old_vocabulary
            )
        cls._apply_vocabulary_settings(voc_builder)

        # classes that ended up without any parent get owl:Thing as parent
        cls._ensure_parent_class(voc_builder)

        cls._sort_relations(voc_builder)
        cls._mirror_object_property_inverses(voc_builder)

        # remember the label conflicts as they are directly after parsing
        cls._save_initial_label_summary(vocabulary)

    @classmethod
    def _set_labels(cls, voc_builder: VocabularyBuilder):
        """If entities have no label, extract their label from the iri

        Args:
            voc_builder: Builder object for Vocabulary

        Returns:
            None
        """
        for entity in voc_builder.vocabulary.get_all_entities():
            entity.label = entity.get_original_label()

    @classmethod
    def _add_predefined_source(cls, voc_builder: VocabularyBuilder):
        """Add a special source to the vocabulary: PREDEFINED

        Args:
            voc_builder: Builder object for Vocabulary

        Returns:
            None
        """
        if "PREDEFINED" not in voc_builder.vocabulary.sources:
            source = Source(
                source_name="Predefined",
                timestamp=datetime.datetime.now(),
                predefined=True,
            )
            voc_builder.add_source(source, "PREDEFINED")

    @classmethod
    def _log_and_clear_dependencies(cls, voc_builder: VocabularyBuilder):
        """
        remove all references to entities that are not in the vocabulary to
        prevent program errrors as we remove information we need to reparse
        the source each time a new source is added as than the dependency
        could be valid. Further log the found dependencies for the user to
        display

        Args:
            voc_builder: Builder object for Vocabulary

        Returns:
            None
        """
        for ontology in voc_builder.vocabulary.sources.values():
            ontology.treat_dependency_statements(voc_builder.vocabulary)

    @classmethod
    def _add_predefined_datatypes(cls, voc_builder: VocabularyBuilder):
        """
        Add predefinded datatype_catalogue to the PREDEFINED source; they
        are not included in an OWL file

        Args:
            voc_builder: Builder object for Vocabulary

        Returns:
            None
        """
        # Test if datatype_catalogue were already added, if yes skip
        if (
            "http://www.w3.org/2002/07/owl#rational"
            in voc_builder.vocabulary.datatypes.keys()
        ):
            return

        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2002/07/owl#rational",
                comment="All numbers allowed",
                type=DatatypeType.number,
                number_decimal_allowed=True,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2002/07/owl#real",
                comment="All whole numbers allowed",
                type=DatatypeType.number,
                number_decimal_allowed=False,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/1999/02/22-rdf-syntax-ns#PlainLiteral",
                comment="All strings allowed",
                type=DatatypeType.string,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral",
                comment="XML Syntax required",
                type=DatatypeType.string,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2000/01/rdf-schema#Literal",
                comment="All strings allowed",
                type=DatatypeType.string,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#anyURI",
                comment="Needs to start with http://",
                type=DatatypeType.string,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#base64Binary",
                comment="Base64Binary",
                type=DatatypeType.string,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#boolean",
                comment="True or False",
                type=DatatypeType.enum,
                enum_values=["True", "False"],
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#byte",
                comment="Byte Number",
                type=DatatypeType.number,
                number_has_range=True,
                number_range_min=-128,
                number_range_max=127,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#dateTime",
                comment="Date with possible timezone",
                type=DatatypeType.date,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#dateTimeStamp",
                comment="Date",
                type=DatatypeType.date,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#decimal",
                comment="All decimal numbers",
                type=DatatypeType.number,
                number_decimal_allowed=True,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#double",
                comment="64 bit decimal",
                type=DatatypeType.number,
                number_decimal_allowed=True,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#float",
                comment="32 bit decimal",
                type=DatatypeType.number,
                number_decimal_allowed=True,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#hexBinary",
                comment="Hexadecimal",
                type=DatatypeType.string,
                allowed_chars=[
                    "0",
                    "1",
                    "2",
                    "3",
                    "4",
                    "5",
                    "6",
                    "7",
                    "8",
                    "9",
                    "A",
                    "B",
                    "C",
                    "D",
                    "E",
                    "F",
                ],
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#int",
                comment="Signed 32 bit number",
                type=DatatypeType.number,
                number_has_range=True,
                number_range_min=-2147483648,
                number_range_max=2147483647,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#integer",
                comment="All whole numbers",
                type=DatatypeType.number,
                number_decimal_allowed=False,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#language",
                comment="Language code, e.g: en, en-US, fr, or fr-FR",
                type=DatatypeType.string,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#long",
                comment="Signed 64 bit integer",
                type=DatatypeType.number,
                number_has_range=True,
                number_range_min=-9223372036854775808,
                number_range_max=9223372036854775807,
                number_decimal_allowed=False,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#Name",
                comment="Name string (dont start with number)",
                type=DatatypeType.string,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#NCName",
                comment="Name string : forbidden",
                type=DatatypeType.string,
                forbidden_chars=[":"],
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#negativeInteger",
                comment="All negative whole numbers",
                type=DatatypeType.number,
                number_has_range=True,
                number_range_max=-1,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#NMTOKEN",
                comment="Token string",
                type=DatatypeType.string,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#nonNegativeInteger",
                comment="All positive whole numbers",
                type=DatatypeType.number,
                number_has_range=True,
                number_range_min=0,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#nonPositiveInteger",
                comment="All negative whole numbers",
                type=DatatypeType.number,
                number_has_range=True,
                number_range_max=-1,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#normalizedString",
                comment="normalized String",
                type=DatatypeType.string,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#positiveInteger",
                comment="All positive whole numbers",
                type=DatatypeType.number,
                number_has_range=True,
                number_range_min=0,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#short",
                comment="signed 16 bit number",
                type=DatatypeType.number,
                number_has_range=True,
                number_range_min=-32768,
                number_range_max=32767,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#string",
                comment="String",
                type=DatatypeType.string,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#token",
                comment="String",
                type=DatatypeType.string,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#unsignedByte",
                comment="unsigned 8 bit number",
                type=DatatypeType.number,
                number_has_range=True,
                number_range_min=0,
                number_range_max=255,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#unsignedInt",
                comment="unsigned 32 bit number",
                type=DatatypeType.number,
                number_has_range=True,
                number_range_min=0,
                number_range_max=4294967295,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#unsignedLong",
                comment="unsigned 64 bit number",
                type=DatatypeType.number,
                number_has_range=True,
                number_range_min=0,
                number_range_max=18446744073709551615,
            )
        )
        voc_builder.add_predefined_datatype(
            Datatype(
                iri="http://www.w3.org/2001/XMLSchema#unsignedShort",
                comment="unsigned 16 bit number",
                type=DatatypeType.number,
                number_has_range=True,
                number_range_min=0,
                number_range_max=65535,
            )
        )

    @classmethod
    def _add_owl_thing(cls, voc_builder: VocabularyBuilder):
        """Add owl_thing class to the vocabulary in the predefined source

        By definition each class is a subclass of owl:thing and owl:thing can be
        a target of relation but owl thing is never mentioned explicitly in
        ontology files.

        Args:
            voc_builder: Builder object for Vocabulary
        Returns:
            None
        """
        root_class = Class(
            iri="http://www.w3.org/2002/07/owl#Thing",
            comment="Predefined root_class",
            label="Thing",
            predefined=True,
        )

        # as it is the root object it is only a parent of classes which have no
        # parents yet
        for class_ in voc_builder.vocabulary.get_classes():
            if class_.parent_class_iris == []:
                class_.parent_class_iris.insert(0, root_class.iri)

        if root_class.iri not in voc_builder.vocabulary.classes:
            voc_builder.add_class(root_class)
            root_class.source_ids.add("PREDEFINED")

    @classmethod
    def _remove_duplicate_parents(cls, voc_builder: VocabularyBuilder):
        """Prevent that a class_ has the same parent iri multiple times

        Args:
            voc_builder: Builder object for Vocabulary
        Returns:
            None
        """
        for class_ in voc_builder.vocabulary.classes.values():
            class_.parent_class_iris = list(dict.fromkeys(class_.parent_class_iris))

    @classmethod
    def _ensure_parent_class(cls, voc_builder: VocabularyBuilder):
        """If a class has a parent class, which was provided by an other
        ontology. And that ontology is not given, it will have no parents.
        In that case give him Thing as direct parent

        Args:
            voc_builder: Builder object for Vocabulary
        Returns:
            None
        """
        for class_ in voc_builder.vocabulary.classes.values():
            # Thing is the root of all
            if not class_.iri == "http://www.w3.org/2002/07/owl#Thing":
                if len(class_.parent_class_iris) == 0:
                    class_.parent_class_iris.append(
                        "http://www.w3.org/2002/07/owl#Thing"
                    )

    @classmethod
    def _apply_vocabulary_settings(cls, voc_builder: VocabularyBuilder):
        """
        Make the labels of all entities FIWARE safe, so that they can be used
        as field keys

        Args:
            voc_builder: Builder object for Vocabulary
        Returns:
            None
        """
        vocabulary = voc_builder.vocabulary
        settings = vocabulary.settings

        def to_pascal_case(string: str) -> str:
            return (
                stringcase.pascalcase(string)
                .replace("_", "")
                .replace(" ", "")
                .replace("-", "")
            )

        def to_camel_case(string: str) -> str:
            camel_string = stringcase.camelcase(string)
            return camel_string

        def to_snake_case(string: str) -> str:
            camel_string = to_pascal_case(string)
            return re.sub(r"(?<!^)(?=[A-Z])", "_", camel_string).lower()

        # replace all whitespaces
        for entity in vocabulary.get_all_entities():
            entity.label = entity.label.replace(" ", "_")

        # replace al whitespaces in enum_values
        for datatype in vocabulary.datatypes.values():
            new_enums = []
            for enum in datatype.enum_values:
                new_enums.append(enum.replace(" ", "_"))
            datatype.enum_values = new_enums

        if settings.pascal_case_class_labels:
            for class_ in vocabulary.get_classes():
                class_.label = to_pascal_case(class_.label)

        if settings.pascal_case_individual_labels:
            for individual in vocabulary.individuals.values():
                individual.label = to_pascal_case(individual.label)

        if settings.camel_case_property_labels:
            props = list(vocabulary.data_properties.values())
            props.extend(vocabulary.object_properties.values())
            for prop in props:
                prop.label = to_camel_case(prop.label)

        if settings.camel_case_datatype_labels:
            for datatype in vocabulary.datatypes.values():
                datatype.label = to_camel_case(datatype.label)

        if settings.pascal_case_datatype_enum_labels:
            for datatype in vocabulary.get_enum_dataytypes().values():
                datatype.label = to_pascal_case(datatype.label)

    @classmethod
    def _save_initial_label_summary(cls, vocabulary: Vocabulary):
        """
        Store the label summary as it exists after parsing, before the user
        changed labels

        The result of
        ``VocabularyConfigurator.get_label_conflicts_in_vocabulary`` is saved
        in ``vocabulary.original_label_summary``.

        Args:
            vocabulary: vocabulary of which the label summary should be saved

        Returns:
            None
        """
        # imported inside the method and not at module level
        # (presumably to avoid a circular import - TODO confirm)
        from filip.semantics.vocabulary_configurator import VocabularyConfigurator

        label_summary = VocabularyConfigurator.get_label_conflicts_in_vocabulary(
            vocabulary=vocabulary
        )
        vocabulary.original_label_summary = label_summary

    @classmethod
    def _compute_ancestor_classes(cls, voc_builder: VocabularyBuilder):
        """Compute all ancestor classes of classes

        Args:
            voc_builder: Builder object for Vocabulary
        Returns:
            None
        """
        vocabulary = voc_builder.vocabulary
        # clear state
        for class_ in vocabulary.get_classes():
            class_.ancestor_class_iris = []

        for class_ in vocabulary.get_classes():
            queue: List[str] = []
            queue.extend(class_.parent_class_iris)

            while len(queue) > 0:
                parent = queue.pop()

                if not voc_builder.entity_is_known(parent):
                    continue

                class_.ancestor_class_iris.append(parent)
                grand_parents = vocabulary.get_class_by_iri(parent).parent_class_iris

                for grand_parent in grand_parents:
                    if grand_parent not in class_.ancestor_class_iris:
                        # prevent infinite loop if inheritance circle
                        queue.append(grand_parent)

    @classmethod
    def _compute_child_classes(cls, voc_builder: VocabularyBuilder):
        """Compute all child classes of classes

        Args:
            voc_builder: Builder object for Vocabulary
        Returns:
            None
        """
        vocabulary = voc_builder.vocabulary
        # clear state
        for class_ in vocabulary.get_classes():
            class_.child_class_iris = []

        for class_ in vocabulary.get_classes():
            for parent in class_.ancestor_class_iris:

                if not voc_builder.entity_is_known(parent):
                    continue

                parent_class = vocabulary.get_class_by_iri(parent)
                parent_class.child_class_iris.append(class_.iri)

    @classmethod
    def _combine_relations(cls, voc_builder: VocabularyBuilder):
        """Compute all CombinedRelations

        Args:
            voc_builder: Builder object for Vocabulary
        Returns:
            None
        """
        vocabulary = voc_builder.vocabulary
        # clear state
        vocabulary.combined_object_relations.clear()
        vocabulary.combined_data_relations.clear()

        for class_ in vocabulary.get_classes():
            class_.combined_object_relation_ids = []
            class_.combined_data_relation_ids = []

        for class_ in vocabulary.get_classes():

            relations_with_property_iri = {}

            all_relation_ids = []
            all_relation_ids.extend(class_.get_relation_ids())
            for ancestor_iri in class_.ancestor_class_iris:

                if not voc_builder.entity_is_known(ancestor_iri):
                    continue
                ancestor = vocabulary.get_class_by_iri(ancestor_iri)
                all_relation_ids.extend(ancestor.get_relation_ids())

            for relation_id in all_relation_ids:
                relation = vocabulary.get_relation_by_id(id=relation_id)
                property_iri = relation.property_iri

                if property_iri not in relations_with_property_iri:
                    relations_with_property_iri[property_iri] = []

                relations_with_property_iri[property_iri].append(relation_id)

            for property_iri, rel_list in relations_with_property_iri.items():

                # These ids should be derived, so that the same combined
                # relation always ends up with the same id as a class can
                # only have 1 combined relation of a property these ids are
                # unique by keeping the ids always the same, we can store
                # information more efficiently in the database (settings)

                # if a property iri is not known while parsing an ontology
                # (dependency not yet parsed) the relations with that
                # property are going to get ignored, maybe a not should be
                # displayed
                if vocabulary.is_id_of_type(property_iri, IdType.data_property):
                    id = "combined-data-relation|{}|{}".format(class_.iri, property_iri)
                    combi = CombinedDataRelation(
                        id=id,
                        property_iri=property_iri,
                        relation_ids=rel_list,
                        class_iri=class_.iri,
                    )
                    voc_builder.add_combined_data_relation_for_class(
                        class_iri=class_.iri, cdata=combi
                    )
                elif vocabulary.is_id_of_type(property_iri, IdType.object_property):
                    id = "combined-object-relation|{}|{}".format(
                        class_.iri, property_iri
                    )
                    combi = CombinedObjectRelation(
                        id=id,
                        property_iri=property_iri,
                        relation_ids=rel_list,
                        class_iri=class_.iri,
                    )
                    voc_builder.add_combined_object_relation_for_class(
                        class_iri=class_.iri, crel=combi
                    )
                else:
                    pass

    @classmethod
    def _sort_relations(cls, voc_builder: VocabularyBuilder):
        """sort relations alphabetically according to their labels

        Args:
            voc_builder: Builder object for Vocabulary
        Returns:
            None
        """
        vocabulary = voc_builder.vocabulary

        for class_ in vocabulary.get_classes():
            cors = class_.get_combined_object_relations(vocabulary)
            class_.combined_object_relation_ids = cls._sort_list_of_combined_relations(
                cors, vocabulary
            )

            cdrs = class_.get_combined_data_relations(vocabulary)
            class_.combined_data_relation_ids = cls._sort_list_of_combined_relations(
                cdrs, vocabulary
            )

    @classmethod
    def _sort_list_of_combined_relations(
        cls, combined_relations: List[CombinedRelation], vocabulary: Vocabulary
    ) -> List[str]:
        """sort given CombinedRelations according to their labels

        Args:
            vocabulary (Vocabulary)
            combined_relations (List[CombinedRelation]): CRs to sort
        Returns:
            List[str], list of cr_id, sorted according to their label
        """

        property_dic = {}

        for cor in combined_relations:
            property_iri = cor.property_iri
            label = cor.get_property_label(vocabulary=vocabulary)
            property_dic[label + property_iri] = cor.id
            # combine label with iri to prevent an error due to two identical
            # labels
        sorted_property_dic = sorted(property_dic.items())

        sorted_cor_ids = []
        for pair in sorted_property_dic:
            sorted_cor_ids.append(pair[1])
        return sorted_cor_ids

    @classmethod
    def _mirror_object_property_inverses(cls, voc_builder: VocabularyBuilder):
        """
        inverses could only be given for 1 obj_prop of the pair and needs to
        be derived for the other also we could have the inverse inside an other
        import (there for done in postprocessing)

        Args:
            voc_builder: Builder object for Vocabulary

        Returns:
            None
        """
        # the state is not cleared, instead add_inverse_property_iri() makes
        # sure that there will be no duplicates as it is a set
        vocabulary = voc_builder.vocabulary

        for obj_prop_iri in vocabulary.object_properties:
            obj_prop = vocabulary.get_object_property(obj_prop_iri)

            for inverse_iri in obj_prop.inverse_property_iris:
                inverse_prop = vocabulary.get_object_property(inverse_iri)
                inverse_prop.add_inverse_property_iri(obj_prop_iri)

    @classmethod
    def transfer_settings(cls, new_vocabulary: Vocabulary, old_vocabulary: Vocabulary):
        """
        Transfer all the user made settings (labels, ..)
        from an old vocabulary to a new vocabulary

        Args:
            new_vocabulary (Vocabulary): Vocabulary to which the settings should
                be transferred
            old_vocabulary (Vocabulary): Vocabulary of which the settings should
                be transferred

        Returns:
            None
        """

        # label settings
        for entity in old_vocabulary.get_all_entities():
            new_entity = new_vocabulary.get_entity_by_iri(entity.iri)
            if new_entity is not None:
                new_entity.user_set_label = entity.user_set_label

        # device settings
        for iri, data_property in old_vocabulary.data_properties.items():
            if iri in new_vocabulary.data_properties:
                new_data_property = new_vocabulary.data_properties[iri]
                new_data_property.field_type = data_property.field_type