From 9e808dd9d43aec75f0855ae149348c5849ae15d4 Mon Sep 17 00:00:00 2001 From: Ville Laitila Date: Sat, 28 Mar 2026 16:54:45 +0200 Subject: [PATCH] Add exclude_attrs support to ModelCompare and fix set intersection bug - Fix attributecomparison.py: set.intersection() was called without assignment, so intersection was always equal to keys1. Use & operator. - Fix keys-only-in-A/B filtering to properly exclude ignoredAttrs. - Add exclude_attrs parameter to compare() and compareModels() for suppressing noisy time-windowed metrics during comparison. - Add SLIDING_WINDOW_ATTRS preset in compareutils.py. - Improve docstrings for SElement, SElementAssociation, and ModelCompare. - Add comprehensive tests for the new functionality. - Document model comparison workflow in CLAUDE.md. --- CLAUDE.md | 22 ++++++ src/sgraph/compare/attributecomparison.py | 11 ++- src/sgraph/compare/compareutils.py | 27 +++++++ src/sgraph/compare/modelcompare.py | 68 +++++++++++------ src/sgraph/selement.py | 51 +++++++++---- src/sgraph/selementassociation.py | 44 +++++++++++ tests/compare/modelcompare_tests.py | 92 ++++++++++++++++++++++- 7 files changed, 274 insertions(+), 41 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 5d4c1c4..7902469 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -192,6 +192,28 @@ results = cypher_query(model, ''' CLI: `python -m sgraph.cypher model.xml.zip [query]` — supports interactive REPL and 11 output formats (table, csv, tsv, json, jsonl, xml, deps, dot, plantuml, graphml, cytoscape). See `docs/cypher.md` for full documentation. +### Comparing Models +```python +from sgraph.compare.modelcompare import ModelCompare +from sgraph.compare.compareutils import SLIDING_WINDOW_ATTRS + +mc = ModelCompare() + +# Basic comparison from files +compare_model = mc.compare('model_a.xml', 'model_b.xml') + +# Exclude noisy time-windowed metrics (author counts, commit counts, etc.) +compare_model = mc.compare('model_a.xml', 'model_b.xml', exclude_attrs=SLIDING_WINDOW_ATTRS) + +# Compare in-memory models +compare_model = mc.compareModels(model1, model2, exclude_attrs={'commit_count_30', 'author_list_7'}) + +# Inspect results +mc.printCompareInfos(compare_model) +``` + +The `exclude_attrs` parameter accepts a set of attribute names to ignore during comparison. Use `SLIDING_WINDOW_ATTRS` as a preset to suppress time-windowed metric noise (author/commit/bug counts at various time windows). + ## File Locations - Source code: `src/sgraph/` diff --git a/src/sgraph/compare/attributecomparison.py b/src/sgraph/compare/attributecomparison.py index 79b9b9f..c668419 100644 --- a/src/sgraph/compare/attributecomparison.py +++ b/src/sgraph/compare/attributecomparison.py @@ -34,8 +34,7 @@ def isListAttribute(attrName): keys1 = set(attributes1.keys()) keys2 = set(attributes2.keys()) - intersection = set(keys1) - intersection.intersection(keys2) + intersection = keys1 & keys2 intersection -= ignoredAttrs # TODO: Similar logic needed as implemented in desktop @@ -78,15 +77,15 @@ def isListAttribute(attrName): attributes2[attrName]) attrs.add(attrName) - keys1 = filter(lambda x: x not in intersection, keys1) - for attribute_in_a in keys1: + keys1_only = keys1 - intersection - ignoredAttrs + for attribute_in_a in keys1_only: val = attributes1[attribute_in_a] if val != '': outmap[c + '_' + attribute_in_a] = '{};--'.format(val) attrs.add(attribute_in_a) - keys2 = filter(lambda x: x not in intersection, keys2) - for attribute_in_b in keys2: + keys2_only = keys2 - intersection - ignoredAttrs + for attribute_in_b in keys2_only: val = attributes2[attribute_in_b] if val != '': outmap[c + '_' + attribute_in_b] = '--;{}'.format(val) diff --git a/src/sgraph/compare/compareutils.py b/src/sgraph/compare/compareutils.py index c07e7a2..da42ac2 100644 --- a/src/sgraph/compare/compareutils.py +++ b/src/sgraph/compare/compareutils.py @@ -4,6 +4,33 @@ ignoredAttrs = {'days_since_modified'} +# Attrs that are always noise in time-windowed models (sliding window metrics). +# Useful as a preset for exclude_attrs parameter in ModelCompare.compare(). +SLIDING_WINDOW_ATTRS = { + 'days_since_modified', + 'author_list_1', 'author_list_7', 'author_list_30', 'author_list_90', + 'author_list_180', 'author_list_365', + 'author_count_1', 'author_count_7', 'author_count_30', 'author_count_90', + 'author_count_180', 'author_count_365', + 'commit_count_1', 'commit_count_7', 'commit_count_30', 'commit_count_90', + 'commit_count_180', 'commit_count_365', + 'bug_fix_ratio_1', 'bug_fix_ratio_7', 'bug_fix_ratio_30', + 'bug_fix_ratio_90', 'bug_fix_ratio_180', 'bug_fix_ratio_365', + 'bug_fix_commit_count_1', 'bug_fix_commit_count_7', 'bug_fix_commit_count_30', + 'bug_fix_commit_count_90', 'bug_fix_commit_count_180', 'bug_fix_commit_count_365', + 'bug_count_1', 'bug_count_7', 'bug_count_30', 'bug_count_90', + 'bug_count_180', 'bug_count_365', + 'bug_list_1', 'bug_list_7', 'bug_list_30', 'bug_list_90', + 'bug_list_180', 'bug_list_365', + 'feature_count_1', 'feature_count_7', 'feature_count_30', 'feature_count_90', + 'feature_count_180', 'feature_count_365', + 'feature_list_1', 'feature_list_7', 'feature_list_30', 'feature_list_90', + 'feature_list_180', 'feature_list_365', + 'tech_debt_1', 'tech_debt_7', 'tech_debt_30', 'tech_debt_90', + 'tech_debt_180', 'tech_debt_365', + 'last_modified', 'latest_commits', +} + def tag_change_count(compareElement: SElement, changecount: int): if changecount > 0: diff --git a/src/sgraph/compare/modelcompare.py b/src/sgraph/compare/modelcompare.py index 7f06014..c18b5bc 100644 --- a/src/sgraph/compare/modelcompare.py +++ b/src/sgraph/compare/modelcompare.py @@ -5,7 +5,7 @@ from sgraph import SElement, SElementAssociation, SGraph from sgraph.compare.attributecomparison import compare_attrs from sgraph.compare.comparegraphattrs import CompareGraphAttrs -from sgraph.compare.compareutils import tag_change_count, debunk_uniqueness +from sgraph.compare.compareutils import tag_change_count, debunk_uniqueness, ignoredAttrs from sgraph.compare.renamedetector import RenameDetector @@ -14,28 +14,54 @@ class ModelCompare: def __init__(self): pass - def compare(self, path1: str, path2: str): + def compare(self, path1: str, path2: str, exclude_attrs: set[str] | None = None): + """Compare two models loaded from file paths. + + Args: + exclude_attrs: Attribute names to ignore during comparison (e.g. SLIDING_WINDOW_ATTRS). + """ model1 = SGraph.parse_xml_or_zipped_xml(path1) model2 = SGraph.parse_xml_or_zipped_xml(path2) - return self.compareModels(model1, model2) - - def compareModels(self, model1: SGraph, model2: SGraph, rename_detection: bool = False): - rootNode = SElement(None, '') - compareModel = SGraph(rootNode) - createdDeps: list[SElementAssociation] = [] - removedDeps: list[SElementAssociation] = [] - - # If there is already a attr_temporary.csv that has been "spoiling" the compare, - # by introducing some removed elements to the B model that should not be there.. - model2.rootNode.removeDescendantsIf( - lambda x: 'compare' in x.attrs and x.attrs['compare'] == 'removed') - self.compareWith(model1, model2, compareModel, True, createdDeps, removedDeps, - rename_detection) - for r in createdDeps: - r.addAttribute("compare", "added") - for r in removedDeps: - r.addAttribute("compare", "removed") - return compareModel + return self.compareModels(model1, model2, exclude_attrs=exclude_attrs) + + def compareModels(self, model1: SGraph, model2: SGraph, rename_detection: bool = False, + exclude_attrs: set[str] | None = None): + """Compare two in-memory models and produce a compare model annotating differences. + + Args: + exclude_attrs: Additional attribute names to ignore during comparison. + Use ``SLIDING_WINDOW_ATTRS`` from ``compareutils`` to suppress + time-windowed metric noise. + + Note: + exclude_attrs temporarily extends the global ``ignoredAttrs`` set for the + duration of the comparison and restores it afterwards. Not thread-safe if + called concurrently with different exclude_attrs values. + """ + added_to_ignored = set() + if exclude_attrs: + added_to_ignored = exclude_attrs - ignoredAttrs + ignoredAttrs.update(added_to_ignored) + + try: + rootNode = SElement(None, '') + compareModel = SGraph(rootNode) + createdDeps: list[SElementAssociation] = [] + removedDeps: list[SElementAssociation] = [] + + # If there is already a attr_temporary.csv that has been "spoiling" the compare, + # by introducing some removed elements to the B model that should not be there.. + model2.rootNode.removeDescendantsIf( + lambda x: 'compare' in x.attrs and x.attrs['compare'] == 'removed') + self.compareWith(model1, model2, compareModel, True, createdDeps, removedDeps, + rename_detection) + for r in createdDeps: + r.addAttribute("compare", "added") + for r in removedDeps: + r.addAttribute("compare", "removed") + return compareModel + finally: + ignoredAttrs.difference_update(added_to_ignored) # self == SGraph def compareWith( diff --git a/src/sgraph/selement.py b/src/sgraph/selement.py index bb28cf1..189de80 100644 --- a/src/sgraph/selement.py +++ b/src/sgraph/selement.py @@ -25,17 +25,25 @@ class SElement: _incoming_index: dict[tuple[int, str], "SElementAssociation"] def __init__(self, parent: Optional['SElement'], name: str): - """ - Creates an element and attach it under the given parent. - Only case when the parent may be None, is when creating the root element for the graph, - or in case of detached elements. + """Create an element and **immediately** attach it under *parent*. + + The new element is added to ``parent.children`` and + ``parent.childrenDict`` during construction. There is no need to + call :meth:`addChild` afterwards — doing so would trigger a merge + or raise :class:`SElementMergedException`. + + Pass ``parent=None`` only when creating the graph root or a + temporarily detached element. + + Args: + parent: Parent element (``None`` for root / detached). + name: Element name. ``'/'`` characters are normalised to + ``'__slash__'``. - :param parent: the parent element (optional), in the end, every element needs parent except the root. - :param name: name of the element, '/' is normalized to '__slash__' Raises: - Exception: If parent equals self (self loop). - SElementMergedException: If an element with the same name already exists - under the parent (in non-DEBUG mode). + Exception: If *parent* equals *self* (self-loop). + SElementMergedException: If an element with the same *name* + already exists under *parent*. """ if name == '': # sys.stderr.write('Creating with empty name\n') @@ -85,10 +93,27 @@ def __str__(self): return f'{self.name} ({self.getType()}) {children_info} {outbound_info} {inbound_info}' def addChild(self, child: "SElement") -> Optional["SElement"]: - """ - Add child, but if there is an overlapping element, merge instead and return merged element. - :param child: the child to be added. - :return: None or the element where the child has been merged with (differs from child) + """Add *child* to this element, merging on name collision. + + If a child with the same name already exists, the new child is + **merged** into the existing one and the existing element is + returned. + + .. note:: + ``SElement(parent, name)`` already attaches the new element to + *parent* during construction. Calling ``parent.addChild(elem)`` + **again** will trigger the merge/duplicate path — this is almost + never what you want for freshly constructed elements. + + Args: + child: Element to add. + + Returns: + ``None`` if the child was added as-is, or the **existing** + element that *child* was merged into. + + Raises: + Exception: If *child* is *self* (self-loop). """ if child == self: sys.stderr.write('Error with data model loop\n') diff --git a/src/sgraph/selementassociation.py b/src/sgraph/selementassociation.py index a2dcbfe..8b723ae 100644 --- a/src/sgraph/selementassociation.py +++ b/src/sgraph/selementassociation.py @@ -9,6 +9,28 @@ class SElementAssociation: + """Represents a directed dependency between two SElements. + + Attributes: + fromElement: The source element of the dependency. + toElement: The target element of the dependency. + deptype: The dependency type string (e.g. ``'call'``, ``'import'``). + Named ``deptype`` (not ``type``) to avoid shadowing the Python builtin. + Use :meth:`getType` as an alias. + attrs: Arbitrary key-value metadata attached to this association. + + Two-phase construction: + The constructor stores the endpoints but does **not** register the + association on the elements' ``outgoing`` / ``incoming`` lists. + Call :meth:`initElems` afterwards to complete registration:: + + ea = SElementAssociation(src, tgt, 'import') + ea.initElems() # now visible in src.outgoing & tgt.incoming + + Alternatively, use :meth:`create_unique_element_association` which + handles both steps and deduplication in one call. + """ + __slots__ = 'deptype', 'fromElement', 'toElement', 'attrs' fromElement: SElement @@ -57,6 +79,19 @@ def __init__( deptype: str, depattrs: dict[str, str | int | list[str]] | None = None, ): + """Create an association object **without** registering it on the elements. + + After construction the association is an inert object — it does not + appear in ``fr.outgoing`` or ``to.incoming``. Call :meth:`initElems` + to complete registration, or use + :meth:`create_unique_element_association` for a one-step alternative. + + Args: + fr: Source / from-element. + to: Target / to-element. + deptype: Dependency kind (e.g. ``'call'``, ``'import'``). + depattrs: Optional metadata dict. ``None`` becomes ``{}``. + """ self.deptype = deptype # Good to have this decommented when testing new analyzers: @@ -103,6 +138,15 @@ def getAttributes(self): return self.attrs def initElems(self): + """Register this association on both endpoint elements. + + Appends ``self`` to :attr:`fromElement.outgoing` and + :attr:`toElement.incoming`, and updates the incoming index used + for O(1) duplicate detection. + + Must be called exactly once after :meth:`__init__`. + :meth:`create_unique_element_association` calls this automatically. + """ self.fromElement.outgoing.append(self) self.toElement.incoming.append(self) # Maintain index for O(1) duplicate lookup diff --git a/tests/compare/modelcompare_tests.py b/tests/compare/modelcompare_tests.py index f35b603..6e27ee3 100644 --- a/tests/compare/modelcompare_tests.py +++ b/tests/compare/modelcompare_tests.py @@ -1,5 +1,7 @@ -from sgraph import SGraph, ModelApi +from sgraph import SGraph, SElement, SElementAssociation, ModelApi +from sgraph.compare.attributecomparison import compare_attrs from sgraph.compare.modelcompare import ModelCompare +from sgraph.compare.compareutils import ignoredAttrs, SLIDING_WINDOW_ATTRS def test_compareModels(): @@ -17,3 +19,91 @@ def test_compareModels(): model_compare = ModelCompare() compare_output = model_compare.compareModels(model1, model2) print(compare_output.to_deps(None)) + + +def test_compare_attrs_intersection_correctness(): + """Verify that intersection is computed correctly (regression for set.intersection() bug).""" + attrs1 = {'hash': 'abc', 'loc': '100', 'only_in_a': 'yes'} + attrs2 = {'hash': 'def', 'loc': '100', 'only_in_b': 'yes'} + outmap = {} + + changed_attrs, change_count = compare_attrs(attrs1, attrs2, outmap, 'file', 'file') + + # 'hash' changed → should appear in changed_attrs + assert 'hash' in changed_attrs + # 'loc' identical → should NOT appear + assert 'loc' not in changed_attrs + # 'only_in_a' should be reported as removed (val;--) + assert '_attr_diff_only_in_a' in outmap + assert outmap['_attr_diff_only_in_a'].endswith(';--') + # 'only_in_b' should be reported as added (--;val) + assert '_attr_diff_only_in_b' in outmap + assert outmap['_attr_diff_only_in_b'].startswith('--;') + + +def test_compare_attrs_ignored_attrs_excluded_from_only_in(): + """Ignored attrs should not appear in 'only in A/B' diffs.""" + attrs1 = {'hash': 'abc', 'days_since_modified': '10'} + attrs2 = {'hash': 'abc'} + outmap = {} + + changed_attrs, _ = compare_attrs(attrs1, attrs2, outmap, 'file', 'file') + + # days_since_modified is in ignoredAttrs and only in attrs1 → should be excluded + assert 'days_since_modified' not in changed_attrs + assert '_attr_diff_days_since_modified' not in outmap + + +def test_exclude_attrs_in_compareModels(): + """exclude_attrs should suppress specified attributes from comparison output.""" + model1 = SGraph(SElement(None, '')) + e1 = model1.createOrGetElementFromPath('/proj/src/file.py') + e1.addAttribute('hash', 'aaa') + e1.addAttribute('commit_count_30', '5') + + model2 = SGraph(SElement(None, '')) + e2 = model2.createOrGetElementFromPath('/proj/src/file.py') + e2.addAttribute('hash', 'bbb') + e2.addAttribute('commit_count_30', '15') + + mc = ModelCompare() + + # Without exclude_attrs: both hash and commit_count_30 should show as changed + result_full = mc.compareModels(model1, model2) + file_elem = result_full.findElementFromPath('/proj/src/file.py') + full_diff = file_elem.attrs.get('_attr_diff', '') + assert 'hash' in full_diff + assert 'commit_count_30' in full_diff + + # With exclude_attrs: commit_count_30 should be suppressed + result_filtered = mc.compareModels(model1, model2, exclude_attrs={'commit_count_30'}) + file_elem2 = result_filtered.findElementFromPath('/proj/src/file.py') + filtered_diff = file_elem2.attrs.get('_attr_diff', '') + assert 'hash' in filtered_diff + assert 'commit_count_30' not in filtered_diff + + +def test_exclude_attrs_restores_ignoredAttrs(): + """ignoredAttrs must be restored after compareModels, even if exclude_attrs was used.""" + original = ignoredAttrs.copy() + + mc = ModelCompare() + model1 = SGraph(SElement(None, '')) + model1.createOrGetElementFromPath('/proj/a.py') + model2 = SGraph(SElement(None, '')) + model2.createOrGetElementFromPath('/proj/a.py') + + mc.compareModels(model1, model2, exclude_attrs={'custom_metric_1', 'custom_metric_2'}) + + assert ignoredAttrs == original, "ignoredAttrs was not restored after compareModels" + + +def test_sliding_window_attrs_preset(): + """SLIDING_WINDOW_ATTRS should contain expected metric families.""" + assert 'days_since_modified' in SLIDING_WINDOW_ATTRS + assert 'commit_count_30' in SLIDING_WINDOW_ATTRS + assert 'author_list_365' in SLIDING_WINDOW_ATTRS + assert 'last_modified' in SLIDING_WINDOW_ATTRS + # Should not contain structural attrs + assert 'hash' not in SLIDING_WINDOW_ATTRS + assert 'loc' not in SLIDING_WINDOW_ATTRS