diff --git a/docs/source/ocpa.algo.discovery.neo4j_discovery.rst b/docs/source/ocpa.algo.discovery.neo4j_discovery.rst new file mode 100644 index 0000000..bea363e --- /dev/null +++ b/docs/source/ocpa.algo.discovery.neo4j_discovery.rst @@ -0,0 +1,75 @@ +Neo4j Discovery Functions +========================= + +This document explains the usage of the Neo4j-based discovery functions +available in the `ocpa.algo.discovery.neo4j_discovery` module: + +- `discover_proclet_model_neo4j` +- `discover_dfg_neo4j` + +These functions enable discovery of process models directly from event logs stored in a Neo4j database. + +Usage Example +------------- + +.. code-block:: python + + from ocpa.algo.discovery.neo4j_discovery.discover_proclet_model import discover_proclet_model_neo4j + from ocpa.algo.discovery.neo4j_discovery.discover_dfg import discover_dfg_neo4j + + # Define Neo4j connection URL + url = 'bolt://neo4j:neo4jpass@localhost:7687' + + # Discover Proclet model from Neo4j + proclet_neo = discover_proclet_model_neo4j(url) + print("Proclet model discovered from Neo4j:") + print(proclet_neo) + + # Discover Directly Follows Graph (DFG) from Neo4j + dfg_neo = discover_dfg_neo4j(url) + print("\nDirectly Follows Graph (DFG) discovered from Neo4j:") + print(dfg_neo) + + +Neo4j Queries for Visual Inspection +----------------------------------- + +After running the discovery functions, you can also execute the following Cypher queries +directly in your Neo4j database to visually inspect the discovered models. + +1. Proclet Model Query +~~~~~~~~~~~~~~~~~~~~~~ + +This query retrieves activity and entity type classes, their directly-follows relationships, +and synchronization relationships from the Neo4j database: + +.. code-block:: cypher + + MATCH (c1:Class) + WHERE c1.Type = "activity,EntityType" + OPTIONAL MATCH (c1)-[df:DF_C]->(c2) + WHERE c1.Type = c2.Type + OPTIONAL MATCH (c1)-[sync:SYNC]->(c3) + WHERE c1.Type = c3.Type + RETURN c1, df, c2, sync, c3 + + +2. 
Directly Follows Graph (DFG) Query +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This query retrieves activity classes and their directly-follows relationships: + +.. code-block:: cypher + + MATCH (c1:Class) WHERE c1.Type = "Activity" + OPTIONAL MATCH (c1)-[df:DF_C]->(c2) + RETURN c1, df, c2 + + +Notes +----- + +- The `url` parameter should point to your running Neo4j instance, including username and password (for example `bolt://neo4j:neo4jpass@localhost:7687`). +- The discovered models are returned as raw Cypher query results (the tuple produced by neomodel's `db.cypher_query`) that can be further processed or visualized. +- The Cypher queries provided are useful for manual inspection within the Neo4j Browser or other visualization tools. + diff --git a/docs/source/ocpa.algo.filtering.rst b/docs/source/ocpa.algo.filtering.rst index 9e97a2a..046f599 100644 --- a/docs/source/ocpa.algo.filtering.rst +++ b/docs/source/ocpa.algo.filtering.rst @@ -1,19 +1,199 @@ ocpa.algo.filtering package =========================== -Subpackages ------------ +OCPA offers various filtering techniques for object-centric event logs, allowing you to select subsets of the data based on activities, objects, time, attributes, lifecycle, performance, and variants. -.. toctree:: - :maxdepth: 4 +Activity Filtering +__________________ - ocpa.algo.filtering.graph - ocpa.algo.filtering.log +Filters an Object-Centric Event Log to retain only events corresponding to specified activities, preserving related objects and event-object relationships while removing all unrelated data. In the following example, only events for 'Create Purchase Requisition', 'Receive Goods', and 'Issue Goods Receipt' are retained. -Module contents ---------------- +.. 
code-block:: python + + from ocpa.objects.log.importer.ocel import factory as ocel_import_factory + from ocpa.algo.util.filtering.log.index_based_filtering import activity_filtering + + filename = "sample_logs/jsonocel/exported-p2p-normal.jsonocel" + ocel = ocel_import_factory.apply(filename) + + filtered_using_list_of_activities = activity_filtering( + ocel, + ['Create Purchase Requisition', 'Receive Goods', 'Issue Goods Receipt'] + ) + +Activity Frequency Filtering +____________________________ + +Filters an Object-Centric Event Log by retaining only the most frequent activities until the specified cumulative frequency threshold is met. In the following example, activities are kept until they account for 80% of all events, and the rest are removed. + +.. code-block:: python + + from ocpa.objects.log.importer.ocel import factory as ocel_import_factory + from ocpa.algo.util.filtering.log.index_based_filtering import activity_freq_filtering + + filename = "sample_logs/jsonocel/exported-p2p-normal.jsonocel" + ocel = ocel_import_factory.apply(filename) + + filtered_using_activity_frequencies = activity_freq_filtering(ocel, 0.8) + +Object Type Filtering +_____________________ + +Filters an Object-Centric Event Log by retaining only specified object types and all events related to them. In the following example, only objects of types 'PURCHORD' and 'INVOICE' and their associated events are kept; all other object types and unrelated events are removed. + +.. 
code-block:: python + + from ocpa.objects.log.importer.ocel import factory as ocel_import_factory + from ocpa.algo.util.filtering.log.index_based_filtering import object_type_filtering + + filename = "sample_logs/jsonocel/exported-p2p-normal.jsonocel" + ocel = ocel_import_factory.apply(filename) + + filtered_using_list_of_object_types = object_type_filtering( + ocel, + ['PURCHORD', 'INVOICE'] + ) + +Object Frequency Filtering +__________________________ + +Filters object types in an Object-Centric Event Log based on their frequency of participation in events, removing those whose involvement falls below a given threshold. In the example below, object types participating in less than 20% of events are filtered out. + +.. code-block:: python + + from ocpa.objects.log.importer.ocel import factory as ocel_import_factory + from ocpa.algo.util.filtering.log.index_based_filtering import object_freq_filtering + + filename = "sample_logs/jsonocel/exported-p2p-normal.jsonocel" + ocel = ocel_import_factory.apply(filename) + + filtered_using_object_type_frequencies = object_freq_filtering(ocel, 0.2) + +Time-based Filtering +____________________ + +Filters cases in an Object-Centric Event Log based on specified time intervals using different strategies, such as filtering by case start time, end time, full containment within the interval, or cases spanning the interval. In the example, cases starting between May 4 and July 6, 2021, are retained. + +.. 
code-block:: python + + from ocpa.objects.log.importer.ocel import factory as ocel_import_factory + from datetime import datetime + from ocpa.algo.util.filtering.log.index_based_filtering import time_filtering + + filename = "sample_logs/jsonocel/exported-p2p-normal.jsonocel" + ocel = ocel_import_factory.apply(filename) + + start = datetime.fromisoformat('2021-05-04 09:02:00+01:00') + end = datetime.fromisoformat('2021-07-06 09:00:00+01:00') + + filtered_based_on_time = time_filtering( + ocel, + start, + end, + strategy_name="start" # Alternatives: "end", "contained", "spanning" + ) + +Event Attribute Filtering +_________________________ + +Filters an Object-Centric Event Log by retaining only events that match specified attribute values. In the following example, only events with the activity 'Create Purchase Order' or 'Create Purchase Requisition' are retained. + +.. code-block:: python + + from ocpa.objects.log.importer.ocel import factory as ocel_import_factory + from ocpa.algo.util.filtering.log.index_based_filtering import event_attribute_filtering + + filename = "sample_logs/jsonocel/exported-p2p-normal.jsonocel" + ocel = ocel_import_factory.apply(filename) + + attr_filter = {"event_activity": ["Create Purchase Order", "Create Purchase Requisition"]} + filtered_based_on_event_attributes = event_attribute_filtering(ocel, attr_filter) + +Object Attribute Filtering +__________________________ + +Filters an Object-Centric Event Log by retaining only events linked to objects that meet specified attribute cardinality conditions. In the example below, only events associated with more than two 'MATERIAL' objects and exactly one 'PURCHORD' object are retained. + +.. 
code-block:: python + + from ocpa.objects.log.importer.ocel import factory as ocel_import_factory + from ocpa.algo.util.filtering.log.index_based_filtering import object_attribute_filtering + + filename = "sample_logs/jsonocel/exported-p2p-normal.jsonocel" + ocel = ocel_import_factory.apply(filename) + + vmap = {'MATERIAL': ['more than', 2], 'PURCHORD': ['exactly', 1]} + filtered_based_on_object_attributes = object_attribute_filtering(ocel, vmap) + +Object Lifecycle Filtering +__________________________ + +Filters an Object-Centric Event Log to retain only objects of a specified type that follow a given sequence of activities. In the following example, only 'PURCHORD' objects that go through 'Create Purchase Order', 'Receive Invoice', and 'Clear Invoice' in that order are retained. + +.. code-block:: python + + from ocpa.objects.log.importer.ocel import factory as ocel_import_factory + from ocpa.algo.util.filtering.log.index_based_filtering import object_lifecycle_filtering + + filename = "sample_logs/jsonocel/exported-p2p-normal.jsonocel" + ocel = ocel_import_factory.apply(filename) + + filtered_using_control_flow_of_objects = object_lifecycle_filtering( + ocel, + object_type="PURCHORD", + list_of_activities=["Create Purchase Order", "Receive Invoice", "Clear Invoice"] + ) + +Event Performance-based Filtering +________________________________ + +Filters an Object-Centric Event Log based on performance measures (e.g., synchronization, flow, or sojourn time), retaining only events that meet a specified condition. In the following example, only 'Create Purchase Order' events with a synchronization time of less than 24 hours are kept. + +.. 
code-block:: python + + from ocpa.objects.log.importer.ocel import factory as ocel_import_factory + from ocpa.algo.util.filtering.log.index_based_filtering import event_performance_based_filtering + + filename = "sample_logs/jsonocel/exported-p2p-normal.jsonocel" + ocel = ocel_import_factory.apply(filename) + + parameters = { + 'measure': 'synchronization', + 'activity': 'Create Purchase Order', + 'condition': lambda x: x < 86400 # 24-hour threshold + } + filtered_using_event_performance = event_performance_based_filtering(ocel, parameters) + +Variant Frequency Filtering +____________________________ + +Filters an Object-Centric Event Log by removing infrequent variants based on the given cumulative frequency threshold. In the following example, only the most common variants that together make up 80% of the total cases are retained. + +.. code-block:: python + + from ocpa.objects.log.importer.ocel import factory as ocel_import_factory + from ocpa.algo.util.filtering.log.index_based_filtering import variant_frequency_filtering + + filename = "sample_logs/jsonocel/exported-p2p-normal.jsonocel" + ocel = ocel_import_factory.apply(filename) + + filtered_ocel_variant_freq = variant_frequency_filtering(ocel, 0.8) + +Variant Activity Sequence Filtering +___________________________________ + +Filters an Object-Centric Event Log to retain only process executions (variants) that contain specific activity transitions. In the following example, only executions that include the transition from 'Verify Material' to 'Plan Goods Issue' are kept. + +.. 
code-block:: python + + from ocpa.objects.log.importer.ocel import factory as ocel_import_factory + from ocpa.algo.util.filtering.log.index_based_filtering import variant_activity_sequence_filtering + + filename = "sample_logs/jsonocel/exported-p2p-normal.jsonocel" + ocel = ocel_import_factory.apply(filename) + + filtered_ocel_with_act_to_act = variant_activity_sequence_filtering( + ocel, + [('Verify Material', 'Plan Goods Issue')] + ) -.. automodule:: ocpa.algo.filtering - :members: - :undoc-members: - :show-inheritance: diff --git a/example-scripts/event-log-management/filtering.py b/example-scripts/event-log-management/filtering.py index 884ffdf..81e3dd9 100644 --- a/example-scripts/event-log-management/filtering.py +++ b/example-scripts/event-log-management/filtering.py @@ -10,33 +10,36 @@ object_attribute_filtering, object_lifecycle_filtering, event_performance_based_filtering, - variant_infrequent_filtering, + variant_frequency_filtering, variant_activity_sequence_filtering ) filename = "../../sample_logs/jsonocel/exported-p2p-normal.jsonocel" ocel = ocel_import_factory.apply(filename) -# 1. Filter by explicitly removing specific activities from the log -# Removes all events related to 'Create Purchase Requisition', 'Receive Goods', and 'Issue Goods Receipt' +# 1. Inclusive activity filter: Retain all events belonging to specified activities +# Preserves only 'Create Purchase Requisition', 'Receive Goods', and 'Issue Goods Receipt' activities +# along with their associated objects and relationships. All other activities are permanently removed. filtered_using_list_of_activities = activity_filtering( ocel, ['Create Purchase Requisition', 'Receive Goods', 'Issue Goods Receipt'] ) -# 2. Filter activities by frequency - keep most frequent activities until cumulative 20% threshold -# Retains activities that together account for ≥20% of total activity occurrences +# 2. 
Frequency-based activity filter: Maintain most common activities covering ≥20% event coverage +# Keeps highest-frequency activities until cumulative frequency reaches 20% of total events +# Removes low-frequency activities while preserving the majority of common business process steps filtered_using_activity_frequencies = activity_freq_filtering(ocel, 0.2) -# 3. Filter by removing specific object types and their related events -# Removes all PURCHORD and INVOICE objects and their associated events +# 3. Object-centric filter: Preserve complete lifecycle of specified object types +# Retains all events and relationships involving 'PURCHORD' (Purchase Orders) and 'INVOICE' objects filtered_using_list_of_object_types = object_type_filtering( ocel, ['PURCHORD', 'INVOICE'] ) -# 4. Filter object types by participation frequency - remove types with <20% relative frequency -# Eliminates object types that participate in less than 20% of total object-event relationships +# 4. Participation threshold filter: Remove infrequently involved object types +# Eliminates object types participating in <20% of object-event relationships +# Maintains only object types with significant process involvement (≥20% relative frequency) filtered_using_object_type_frequencies = object_freq_filtering(ocel, 0.2) # 5. Temporal filtering using "start" strategy between 2021-05-04 and 2021-07-06 @@ -85,7 +88,7 @@ # 10. Filter infrequent variants based on cumulative frequency threshold (e.g., top 80%) # Retains only the most frequent behavioral variants whose combined frequency reaches ≥80% of total variant occurrences -filtered_ocel_variant_freq = variant_infrequent_filtering(ocel, 0.8) +filtered_ocel_variant_freq = variant_frequency_filtering(ocel, 0.8) # 11. 
Filter log by keeping only process executions that include specified activity transitions # Retains only executions where the activity sequence ('Verify Material' → 'Plan Goods Issue') occurs diff --git a/example-scripts/event-log-management/neo4j_to_ocel_converter.py b/example-scripts/event-log-management/neo4j_to_ocel_converter.py new file mode 100644 index 0000000..247f131 --- /dev/null +++ b/example-scripts/event-log-management/neo4j_to_ocel_converter.py @@ -0,0 +1,14 @@ +# Import the function to convert Neo4j data back into an OCEL log +from ocpa.objects.log.converter.versions.neo4j_to_ocel import neo4j_to_ocel + +# Specify the Neo4j connection URL +# Format: 'bolt://<username>:<password>@<host>:<port>' +# Example assumes Neo4j is running locally with username 'neo4j' and password 'password' +url = 'bolt://neo4j:password@localhost:7687' + +# Retrieve the OCEL event log from the Neo4j database +# This function queries the graph stored in Neo4j and reconstructs the OCEL event log, +# including events, objects, and their relationships. +ocel_from_neo4j = neo4j_to_ocel(url) + +# The 'ocel_from_neo4j' object now holds the event log in OCEL format. 
diff --git a/example-scripts/event-log-management/ocel_to_neo4j_converter.py b/example-scripts/event-log-management/ocel_to_neo4j_converter.py new file mode 100644 index 0000000..f26fdde --- /dev/null +++ b/example-scripts/event-log-management/ocel_to_neo4j_converter.py @@ -0,0 +1,22 @@ +# Import necessary functions for loading OCEL logs and uploading to Neo4j +from ocpa.objects.log.importer.ocel import factory as ocel_import_factory +from ocpa.objects.log.converter.versions.ocel_to_neo4j import ocel_to_neo4j + +# Specify the path to the OCEL log file +# This file contains the event data and object relationships in JSON-OCEL format +filename = "../../sample_logs/jsonocel/exported-p2p-normal.jsonocel" + +# Load the OCEL log into a Python object +# The 'ocel' object now holds events, objects, and their relations for further processing +ocel = ocel_import_factory.apply(filename) + +# Define the connection URL to the Neo4j database +# Format: 'bolt://<username>:<password>@<host>:<port>' +# Example: 'bolt://neo4j:neo4jpass@localhost:7687' +url = 'bolt://neo4j:neo4jpass@localhost:7687' + +# Upload the OCEL event log into the Neo4j database (WARNING: ocel_to_neo4j first deletes ALL existing nodes in that database) +# This converts the OCEL structure into nodes (events, objects) and relationships in the graph database +# The returned 'db' object represents the connection to Neo4j, allowing further queries if needed. +# Importing the classes Entity and Event is necessary for Cypher querying. 
+db = ocel_to_neo4j(url, ocel) diff --git a/example-scripts/process-discovery/neo4j_discovery.py b/example-scripts/process-discovery/neo4j_discovery.py new file mode 100644 index 0000000..83dc648 --- /dev/null +++ b/example-scripts/process-discovery/neo4j_discovery.py @@ -0,0 +1,17 @@ +from ocpa.algo.discovery.neo4j_discovery.discover_proclet_model import discover_proclet_model_neo4j +from ocpa.algo.discovery.neo4j_discovery.discover_dfg import discover_dfg_neo4j + +# Example usage script for Neo4j-based discovery functions + +# Define Neo4j connection URL +url = 'bolt://neo4j:neo4jpass@localhost:7687' + +# Discover Proclet model from Neo4j +proclet_neo = discover_proclet_model_neo4j(url) +print("Proclet model discovered from Neo4j:") +print(proclet_neo) + +# Discover Directly Follows Graph (DFG) from Neo4j +dfg_neo = discover_dfg_neo4j(url) +print("\nDirectly Follows Graph (DFG) discovered from Neo4j:") +print(dfg_neo) diff --git a/ocpa/algo/discovery/__init__.py b/ocpa/algo/discovery/__init__.py index 16a4742..d6b8f6b 100644 --- a/ocpa/algo/discovery/__init__.py +++ b/ocpa/algo/discovery/__init__.py @@ -1,2 +1,3 @@ import ocpa.algo.discovery.enhanced_ocpn import ocpa.algo.discovery.ocpn +import ocpa.algo.discovery.neo4j_discovery diff --git a/ocpa/algo/discovery/neo4j_discovery/__init__.py b/ocpa/algo/discovery/neo4j_discovery/__init__.py new file mode 100644 index 0000000..4ce3328 --- /dev/null +++ b/ocpa/algo/discovery/neo4j_discovery/__init__.py @@ -0,0 +1,2 @@ +from ocpa.algo.discovery.neo4j_discovery import discover_dfg +from ocpa.algo.discovery.neo4j_discovery import discover_proclet_model \ No newline at end of file diff --git a/ocpa/algo/discovery/neo4j_discovery/discover_dfg.py b/ocpa/algo/discovery/neo4j_discovery/discover_dfg.py new file mode 100644 index 0000000..14c7c99 --- /dev/null +++ b/ocpa/algo/discovery/neo4j_discovery/discover_dfg.py @@ -0,0 +1,79 @@ +from neomodel import ( + StructuredNode, + StringProperty, + DateTimeProperty, + 
UniqueIdProperty, + Relationship, + RelationshipTo, + config, + db +) + +def discover_dfg_neo4j(url): + """ + Constructs and retrieves a Directly Follows Graph (DFG) from event logs stored + in a Neo4j database. + + This function performs the following steps: + 1. Creates unique Activity Class nodes for each distinct activity in the event log. + 2. Links each Event node to its corresponding Activity Class node via an 'OBSERVED' relationship. + 3. Builds 'DF_C' (Directly Follows Class) relationships between Activity Class nodes, + representing the directly-follows behavior aggregated from the event-level + directly-follows (DF) relationships. These relationships are annotated with + frequency counts and entity types. + 4. Retrieves the DFG, which consists of Activity Class nodes and the 'DF_C' + relationships between them. + + Args: + url (str): The Neo4j database connection URL. + + Returns: + tuple: The (results, meta) pair returned by neomodel's db.cypher_query for the DFG query. + Each row in results includes: + - Source Activity Class node (c1) + - Directly-follows relationship (df) with frequency count and entity type, or null if c1 has none + - Target Activity Class node (c2), or null if c1 has no outgoing DF_C relationship + + Note: + This function assumes the existence of 'Event' nodes with an 'activity' attribute, + 'DF' (Directly Follows) relationships between events that carry an 'EntityType', + and 'CORR' relationships from events to entity nodes that carry a matching 'EntityType'. 
+ """ + + # Set the Neo4j database connection URL + config.DATABASE_URL = url + + # Query 1: Create unique Activity Class nodes based on distinct event activity names + db.cypher_query(""" + MATCH (e:Event) + WITH DISTINCT e.activity AS actName + MERGE (c:Class {Name: actName, Type: "Activity", ID: actName}) + """) + + # Query 2: Connect each Event node to its corresponding Activity Class node (MERGE avoids duplicate OBSERVED edges on reruns) + db.cypher_query(""" + MATCH (c:Class) WHERE c.Type = "Activity" + MATCH (e:Event) WHERE c.Name = e.activity + MERGE (e)-[:OBSERVED]->(c) + """) + + # Query 3: Create directly-follows relationships (DF_C) between Activity Class nodes only (other Class types, e.g. proclet classes, are skipped) + # The relationship includes a frequency count of how often one activity directly follows another + db.cypher_query(""" + MATCH (c1:Class)<-[:OBSERVED]-(e1:Event)-[df:DF]->(e2:Event)-[:OBSERVED]->(c2:Class) + MATCH (e1)-[:CORR]->(n)<-[:CORR]-(e2) + WHERE c1.Type = "Activity" AND c2.Type = "Activity" AND n.EntityType = df.EntityType + WITH n.EntityType as EntityType, c1, count(df) AS df_freq, c2 + MERGE (c1)-[rel2:DF_C {EntityType:EntityType}]->(c2) + ON CREATE SET rel2.count = df_freq + """) + + # Query 4: Retrieve the Directly Follows Graph (DFG) as relationships between Activity Class nodes + dfg_neo = db.cypher_query(""" + MATCH (c1:Class) WHERE c1.Type = "Activity" + OPTIONAL MATCH (c1)-[df:DF_C]->(c2) + RETURN c1, df, c2 + """) + + # Return the DFG result + return dfg_neo diff --git a/ocpa/algo/discovery/neo4j_discovery/discover_proclet_model.py b/ocpa/algo/discovery/neo4j_discovery/discover_proclet_model.py new file mode 100644 index 0000000..0fb79b4 --- /dev/null +++ b/ocpa/algo/discovery/neo4j_discovery/discover_proclet_model.py @@ -0,0 +1,94 @@ +from neomodel import ( + StructuredNode, + StringProperty, + DateTimeProperty, + UniqueIdProperty, + Relationship, + RelationshipTo, + config, + db +) + +def discover_proclet_model_neo4j(url): + """ + Discover and construct a proclet model from a Neo4j event log database. 
+ + This function connects to the specified Neo4j database URL and performs a series of + Cypher queries to: + 1. Create Class nodes representing unique combinations of activities and entity types. + 2. Link events to their corresponding Class nodes with OBSERVED relationships. + 3. Create DF_C (directly-follows constrained) relationships between classes based on + direct event flows that share the same entity instance. + 4. Create SYNC relationships between classes that share the same activity but belong + to different entity types. + 5. Retrieve the full proclet model consisting of classes, DF_C, and SYNC relationships. + + Args: + url (str): The Neo4j database URL. + + Returns: + proclet_neo (tuple): The result of the Cypher query that retrieves the proclet model, + containing class nodes and their relationships. + """ + # Set the Neo4j database URL for the connection + config.DATABASE_URL = url + + # Query 1: Create Class nodes for each unique combination of activity and entity type + db.cypher_query(""" + MATCH (e:Event)-[:CORR]->(n:Entity) + WITH DISTINCT e.activity AS actName, n.EntityType AS EType + MERGE (c:Class { + ID: actName + "_" + EType, + Name: actName, + EntityType: EType, + Type: "activity,EntityType" + }) + """) + + # Query 2: Link Events to their Class nodes; MERGE keeps a single OBSERVED edge per (event, class) even when the event has several entities of one type + db.cypher_query(""" + MATCH (c:Class) + WHERE c.Type = "activity,EntityType" + MATCH (e:Event)-[:CORR]->(n:Entity) + WHERE c.Name = e.activity AND c.EntityType = n.EntityType + MERGE (e)-[:OBSERVED]->(c) + """) + + # Query 3: Create DF_C relationships between Class nodes + # based on directly-follows relationships within the same entity instance + db.cypher_query(""" + MATCH (c1:Class)<-[:OBSERVED]-(e1:Event)-[df:DF]->(e2:Event)-[:OBSERVED]->(c2:Class) + MATCH (e1)-[:CORR]->(n)<-[:CORR]-(e2) + WHERE + c1.Type = c2.Type AND + n.EntityType = df.EntityType AND + c1.EntityType = n.EntityType AND + c2.EntityType = n.EntityType + WITH n.EntityType AS 
EType, c1, COUNT(df) AS df_freq, c2 + MERGE (c1)-[rel2:DF_C {EntityType: EType}]->(c2) + ON CREATE SET rel2.count = df_freq + """) + + # Query 4: Create SYNC relationships between Class nodes + # that represent the same activity but different entity types + db.cypher_query(""" + MATCH (c1:Class), (c2:Class) + WHERE + c1.Name = c2.Name AND + c1.EntityType <> c2.EntityType + MERGE (c1)-[:SYNC]->(c2) + """) + + # Query 5: Retrieve the proclet model including Class nodes, + # DF_C relationships, and SYNC relationships + proclet_neo = db.cypher_query(""" + MATCH (c1:Class) + WHERE c1.Type = "activity,EntityType" + OPTIONAL MATCH (c1)-[df:DF_C]->(c2) + WHERE c1.Type = c2.Type + OPTIONAL MATCH (c1)-[sync:SYNC]->(c3) + WHERE c1.Type = c3.Type + RETURN c1, df, c2, sync, c3 + """) + + return proclet_neo diff --git a/ocpa/algo/util/filtering/log/__init__.py b/ocpa/algo/util/filtering/log/__init__.py index a18656d..dba523d 100644 --- a/ocpa/algo/util/filtering/log/__init__.py +++ b/ocpa/algo/util/filtering/log/__init__.py @@ -1,4 +1,5 @@ import ocpa.algo.util.filtering.log.activity_filtering import ocpa.algo.util.filtering.log.case_filtering import ocpa.algo.util.filtering.log.time_filtering -import ocpa.algo.util.filtering.log.variant_filtering \ No newline at end of file +import ocpa.algo.util.filtering.log.variant_filtering +import ocpa.algo.util.filtering.log.index_based_filtering diff --git a/ocpa/algo/util/filtering/log/index_based_filtering.py b/ocpa/algo/util/filtering/log/index_based_filtering.py index 159a09c..6fd2097 100644 --- a/ocpa/algo/util/filtering/log/index_based_filtering.py +++ b/ocpa/algo/util/filtering/log/index_based_filtering.py @@ -878,7 +878,7 @@ def compute_object_freq(event_id): return filtered_ocel -def variant_infrequent_filtering(ocel, threshold): +def variant_frequency_filtering(ocel, threshold): ''' Filters infrequent behavioral variants from an Object-Centric Event Log (OCEL) based on a cumulative frequency threshold. 
diff --git a/ocpa/objects/log/converter/versions/__init__.py b/ocpa/objects/log/converter/versions/__init__.py index 2d14f79..3a6b041 100644 --- a/ocpa/objects/log/converter/versions/__init__.py +++ b/ocpa/objects/log/converter/versions/__init__.py @@ -1,2 +1,4 @@ from ocpa.objects.log.converter.versions import jsonocel_to_csv from ocpa.objects.log.converter.versions import df_to_ocel +from ocpa.objects.log.converter.versions import ocel_to_neo4j +from ocpa.objects.log.converter.versions import neo4j_to_ocel \ No newline at end of file diff --git a/ocpa/objects/log/converter/versions/neo4j_to_ocel.py b/ocpa/objects/log/converter/versions/neo4j_to_ocel.py new file mode 100644 index 0000000..8728ceb --- /dev/null +++ b/ocpa/objects/log/converter/versions/neo4j_to_ocel.py @@ -0,0 +1,96 @@ +from ocpa.objects.log.converter.versions.ocel_to_neo4j import Entity, Event +from neomodel import (db, config, StructuredNode, StringProperty, IntegerProperty, + UniqueIdProperty, RelationshipTo, DateTimeProperty, Relationship) +from collections import defaultdict +import pandas as pd +from ocpa.objects.log.util import misc as log_util + + +def neo4j_to_ocel(url): + """ + Converts an event log stored in a Neo4j database (using the OCEL data model) into an OCEL object in Python. + + Args: + url (str): The Neo4j database URL in the format 'bolt://username:password@host:port'. + + Returns: + OCEL object: An OCEL log object generated from the Neo4j database. + + The function performs the following steps: + 1. Connects to the Neo4j database using neomodel. + 2. Queries events, objects, and their correlations. + 3. Builds an event log dataframe with one list-valued column of object IDs per object type. + 4. Converts the dataframe into an OCEL object for further processing or analysis. 
+ """ + + # Set neomodel database URL + config.DATABASE_URL = url + + # Helper function to convert Neo4j datetime to native Python datetime + def convert_neo4j_datetime(neo_time): + return neo_time.to_native() if hasattr(neo_time, "to_native") else neo_time + + # Query all events from the Neo4j database + results_events, _ = db.cypher_query("MATCH (e:Event) RETURN e") + + # Query all objects with their IDs and types + results_object_types, _ = db.cypher_query(""" + MATCH (o:Entity) + RETURN o.oid AS oid, o.EntityType AS type + """) + + # Query all object-event correlations + results_o_e_mapping, _ = db.cypher_query(""" + MATCH (e:Event)-[:CORR]->(o:Entity) + RETURN o.oid AS object_id, e.eid AS event_id + """) + + # Build a mapping from object ID to its type + object_id_to_type = {record[0]: record[1] for record in results_object_types} + object_types = set(object_id_to_type.values()) # Get all unique object types + + # Build a mapping from event_id to {object_type: [list of object_ids]} + event_to_objects_by_type = defaultdict(lambda: defaultdict(list)) + for record in results_o_e_mapping: + obj_id, event_id = record + obj_type = object_id_to_type.get(obj_id) + if obj_type: + event_to_objects_by_type[event_id][obj_type].append(obj_id) + + # Construct event dataframe with event_id, activity, and timestamp + event_rows = [] + for record in results_events: + node = record[0] + event_rows.append({ + "event_id": node['eid'], + "event_activity": node['activity'], + "event_timestamp": convert_neo4j_datetime(node['timestamp']), + }) + + events_df = pd.DataFrame(event_rows, columns=["event_id", "event_activity", "event_timestamp"]) # explicit columns keep an empty log from raising KeyError below + events_df['event_start_timestamp'] = events_df['event_timestamp'] # Duplicate column for start timestamp + + # Add columns for each object type containing lists of associated object IDs + for obj_type in object_types: + events_df[obj_type] = events_df['event_id'].apply( + lambda eid: event_to_objects_by_type.get(eid, {}).get(obj_type, []) + ) + + # Prepare dataframe: set index, sort, and ensure 
from neomodel import (
    StructuredNode,
    StringProperty,
    DateTimeProperty,
    UniqueIdProperty,
    Relationship,
    RelationshipTo,
    config,
    db
)
from datetime import datetime
from collections import defaultdict
from itertools import combinations
import pytz


class Entity(StructuredNode):
    """neomodel mapping for ``Entity`` nodes (``oid``, ``EntityType``)."""
    oid = UniqueIdProperty()
    EntityType = StringProperty(index=True)
    Related = Relationship('Entity', 'RELATED')


class Event(StructuredNode):
    """neomodel mapping for ``Event`` nodes.

    NOTE(review): ``ocel_to_neo4j`` below writes the event properties as
    ``activity`` and ``timestamp`` (lowercase) through raw Cypher, whereas this
    model declares ``Activity`` and ``Timestamp``. Confirm which spelling
    consumers of this model expect before relying on these two attributes.
    """
    eid = UniqueIdProperty()
    Activity = StringProperty(index=True)
    Timestamp = DateTimeProperty(index=True)
    corr = RelationshipTo('Entity', 'CORR')
    df = RelationshipTo('Event', 'DF')


def _to_utc(timestamp):
    """Return *timestamp* as a UTC-aware value without shifting aware instants."""
    if timestamp.tzinfo is None:
        # Naive values are labelled as UTC, as before.
        return timestamp.replace(tzinfo=pytz.UTC)
    # Aware values are converted; replace() would keep the wall-clock time and
    # therefore silently move the instant.
    return timestamp.astimezone(pytz.UTC)


def _run_batched(query, rows, batch_size):
    """Run *query* once per slice of at most *batch_size* rows (as ``$rows``)."""
    for start in range(0, len(rows), batch_size):
        db.cypher_query(query, {'rows': rows[start:start + batch_size]})


def ocel_to_neo4j(url, ocel, *, batch_size=500, clear_db=True):
    """
    Converts an OCEL (Object-Centric Event Log) into a Neo4j graph database.

    This function creates nodes for Entities and Events, and relationships
    between them:
    - CORR: Links Events to related Entities.
    - DF (Directly Follows): Links consecutive Events of the same Entity,
      one relationship per Entity.
    - RELATED: Links Entities that co-occur in the same Events.

    The graph schema:
    - Nodes:
        * Entity (oid, EntityType)
        * Event (eid, activity, timestamp)
    - Relationships:
        * (Event)-[:CORR]->(Entity)
        * (Event)-[:DF {ent_id, EntityType}]->(Event)
        * (Entity)-[:RELATED {count, event_ids}]-(Entity)

    Parameters:
    ----------
    url : str
        The Neo4j connection URL in the form 'bolt://user:password@host:port'.
    ocel : Ocel object
        The OCEL object containing event data, object data, and mappings
        (read from ``ocel.obj.raw``).
    batch_size : int, optional
        Maximum number of rows sent per Cypher query (default 500).
    clear_db : bool, optional
        When True (default) ALL nodes and relationships in the target
        database are deleted before the log is inserted.

    Returns:
    -------
    db : neomodel.db
        The Neo4j database connection object after populating the graph.

    Raises:
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A non-positive step would either raise deep inside range() or
        # silently insert nothing.
        raise ValueError("batch_size must be a positive integer")

    # Configure Neo4j connection
    config.DATABASE_URL = url
    if clear_db:
        # Destructive: wipes the whole database, not only Event/Entity nodes.
        db.cypher_query("MATCH (n) DETACH DELETE n")

    # Extract objects, events, and object-event mappings from the OCEL
    obj_ocel = ocel.obj.raw.objects
    events_ocel = ocel.obj.raw.events
    obj_event_mapping = ocel.obj.raw.obj_event_mapping

    # Reverse mapping: event -> list of objects. dict.fromkeys() drops repeated
    # event ids per object (keeping order) so that no duplicate CORR rows, DF
    # self-loops or RELATED self-pairs are produced further down.
    event_obj_mapping = {}
    for obj_id, event_ids in obj_event_mapping.items():
        for event_id in dict.fromkeys(event_ids):
            event_obj_mapping.setdefault(event_id, []).append(obj_id)

    # NOTE(review): the MERGE/MATCH lookups on oid/eid below presumably benefit
    # from an index or uniqueness constraint on those properties - confirm one
    # exists in the target database for large logs.

    # ---------------------------
    # 1. Insert Entity nodes
    # ---------------------------
    entity_rows = [{'oid': obj.id, 'EntityType': obj.type} for obj in obj_ocel.values()]
    entity_query = """
    UNWIND $rows AS row
    MERGE (e:Entity {oid: row.oid})
    SET e.EntityType = row.EntityType
    """
    _run_batched(entity_query, entity_rows, batch_size)

    # ---------------------------
    # 2. Insert Event nodes
    # ---------------------------
    event_rows = [
        {
            'eid': eid,
            'activity': ev.act,
            'timestamp': _to_utc(ev.time),  # Ensure UTC timezone
        }
        for eid, ev in events_ocel.items()
    ]
    event_query = """
    UNWIND $rows AS row
    MERGE (e:Event {eid: row.eid})
    SET e.activity = row.activity, e.timestamp = row.timestamp
    """
    _run_batched(event_query, event_rows, batch_size)

    # ---------------------------
    # 3. Create CORR relationships (Event -> Entity)
    # ---------------------------
    corr_rows = [
        {'eid': eid, 'oid': oid}
        for eid, oids in event_obj_mapping.items()
        for oid in oids
    ]
    corr_query = """
    UNWIND $rows AS row
    MATCH (e:Event {eid: row.eid})
    MATCH (o:Entity {oid: row.oid})
    MERGE (e)-[:CORR]->(o)
    """
    _run_batched(corr_query, corr_rows, batch_size)

    # ---------------------------
    # 4. Create DF (Directly Follows) relationships between events
    # ---------------------------

    # Step 1: collect (event_id, timestamp) pairs per entity
    entity_events = {oid: [] for oid in obj_ocel}
    for eid, ev in events_ocel.items():
        for oid in event_obj_mapping.get(eid, []):
            entity_events[oid].append((eid, ev.time))

    # Step 2: sort each entity's events by timestamp and link neighbours
    follows_rel_rows = []
    for oid, events in entity_events.items():
        if len(events) < 2:
            continue  # No DF relationship needed if only one event

        entity_type = obj_ocel[oid].type
        # sorted() is stable, so events with equal timestamps keep log order.
        ordered = sorted(events, key=lambda pair: pair[1])
        for (src_id, _), (dst_id, _) in zip(ordered, ordered[1:]):
            follows_rel_rows.append({
                'src': src_id,
                'dst': dst_id,
                'ent_id': oid,
                'EntityType': entity_type
            })

    # ent_id is part of the MERGE pattern: two entities that share the same
    # consecutive event pair each get their own DF relationship instead of
    # overwriting one shared relationship's properties.
    follows_query = """
    UNWIND $rows AS row
    MATCH (e1:Event {eid: row.src})
    MATCH (e2:Event {eid: row.dst})
    MERGE (e1)-[r:DF {ent_id: row.ent_id}]->(e2)
    SET r.EntityType = row.EntityType
    """
    _run_batched(follows_query, follows_rel_rows, batch_size)

    # ---------------------------
    # 5. Create RELATED relationships between Entities
    # ---------------------------

    # Step 1: find entity pairs that co-occur in the same event. Sorting first
    # makes combinations() yield each pair in canonical (small, large) order,
    # so (A, B) and (B, A) never both appear.
    cooccurrence_pairs = defaultdict(list)
    for eid, oids in event_obj_mapping.items():
        for pair in combinations(sorted(oids), 2):
            cooccurrence_pairs[pair].append(eid)

    # Step 2: one row per pair with metadata (count and event_ids)
    related_rows = [
        {
            'oid1': oid1,
            'oid2': oid2,
            'count': len(shared_events),  # Number of co-occurrences
            'event_ids': shared_events    # Event IDs where both entities co-occur
        }
        for (oid1, oid2), shared_events in cooccurrence_pairs.items()
    ]

    related_query = """
    UNWIND $rows AS row
    MATCH (e1:Entity {oid: row.oid1})
    MATCH (e2:Entity {oid: row.oid2})
    MERGE (e1)-[r:RELATED]-(e2)
    SET r.count = row.count, r.event_ids = row.event_ids
    """
    _run_batched(related_query, related_rows, batch_size)

    # ---------------------------
    # Print summary of created graph
    # ---------------------------
    print('Graph created with:')
    print(f"- Entities: {len(entity_rows)}")
    print(f"- Events: {len(event_rows)}")
    print(f"- CORR relationships: {len(corr_rows)}")
    print(f"- DF relationships: {len(follows_rel_rows)}")
    print(f"- RELATED relationships: {len(related_rows)}")

    return db