From bb272aa001d60fe1383baf9d8649bdfdc1e4b3c2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 6 Mar 2026 09:56:50 +0000 Subject: [PATCH 1/4] Initial plan From 56ae9f1ea5fe3be493e6ec68a309654a341fef4c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 6 Mar 2026 10:03:12 +0000 Subject: [PATCH 2/4] feat: apply blacklist file to metacheck analysis stage Co-authored-by: vuillaut <4263646+vuillaut@users.noreply.github.com> --- src/sw_metadata_bot/metacheck_wrapper.py | 70 +++++++++++++- src/sw_metadata_bot/pipeline.py | 2 + tests/test_metacheck_wrapper.py | 114 +++++++++++++++++++++++ tests/test_pipeline.py | 2 + 4 files changed, 186 insertions(+), 2 deletions(-) create mode 100644 tests/test_metacheck_wrapper.py diff --git a/src/sw_metadata_bot/metacheck_wrapper.py b/src/sw_metadata_bot/metacheck_wrapper.py index 5c72b25..5214ba5 100644 --- a/src/sw_metadata_bot/metacheck_wrapper.py +++ b/src/sw_metadata_bot/metacheck_wrapper.py @@ -1,11 +1,60 @@ """Wrapper for metacheck CLI to integrate with sw-metadata-bot.""" +import json import sys +import tempfile +from pathlib import Path import click from metacheck import cli as metacheck_cli +def _filter_blacklisted_repos(input_path: str, blacklist_file: Path) -> str: + """Return a temp file path with blacklisted repos removed from the input JSON. + + :param input_path: Path to the input JSON file containing repository list. + :param blacklist_file: Path to the JSON file containing blacklisted repos. + :return: Path to a temporary filtered JSON file. + :raises click.ClickException: If blacklist file has invalid format. + """ + with open(blacklist_file, encoding="utf-8") as f: + blacklist_data = json.load(f) + + blacklisted = blacklist_data.get("repositories", []) + if not isinstance(blacklisted, list): + raise click.ClickException( + f"Invalid format in {blacklist_file}: 'repositories' must be a list" + ) + blacklisted_set = {url.strip().rstrip("/") for url in blacklisted if isinstance(url, str)} + + with open(input_path, encoding="utf-8") as f: + input_data = json.load(f) + + original_repos = input_data.get("repositories", []) + filtered_repos = [ + url + for url in original_repos + if isinstance(url, str) and url.strip().rstrip("/") not in blacklisted_set + ] + + skipped = len(original_repos) - len(filtered_repos) + if skipped > 0: + click.echo( + f"Blacklist: skipping {skipped} blacklisted " + f"{'repository' if skipped == 1 else 'repositories'} from analysis." + ) + + filtered_data = {**input_data, "repositories": filtered_repos} + + tmp_file = tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False, encoding="utf-8" + ) + json.dump(filtered_data, tmp_file) + tmp_file.flush() + tmp_file.close() + return tmp_file.name + + @click.command() @click.option( "--input", @@ -35,13 +84,28 @@ default=0.8, help="SoMEF confidence threshold (default: 0.8).", ) -def metacheck_command(input, skip_somef, pitfalls_output, analysis_output, threshold): +@click.option( + "--blacklist-file", + type=click.Path(exists=True, dir_okay=False, path_type=Path), + default=None, + help="JSON file listing repositories to exclude from analysis.", +) +def metacheck_command( + input, skip_somef, pitfalls_output, analysis_output, threshold, blacklist_file +): """Run metacheck to detect metadata pitfalls in repositories.""" + # Apply blacklist filtering when input is a JSON file (not a single URL) + effective_input = input.strip() + tmp_input_path = None + if blacklist_file is not None and Path(effective_input).is_file(): + tmp_input_path = _filter_blacklisted_repos(effective_input, blacklist_file) + effective_input = tmp_input_path + # Convert click arguments to sys.argv format for metacheck's argparse argv = ["metacheck"] # Add input files - argv.extend(["--input", input.strip()]) + argv.extend(["--input", effective_input]) if skip_somef: argv.append("--skip-somef") @@ -57,3 +121,5 @@ def metacheck_command(input, skip_somef, pitfalls_output, analysis_output, thres metacheck_cli() finally: sys.argv = original_argv + if tmp_input_path is not None: + Path(tmp_input_path).unlink(missing_ok=True) diff --git a/src/sw_metadata_bot/pipeline.py b/src/sw_metadata_bot/pipeline.py index a7a8f20..6d4b721 100644 --- a/src/sw_metadata_bot/pipeline.py +++ b/src/sw_metadata_bot/pipeline.py @@ -56,6 +56,8 @@ def run_pipeline( str(pitfalls_output_dir), "--analysis-output", str(analysis_output_file), + "--blacklist-file", + str(opt_outs_file), ], standalone_mode=False, ) diff --git a/tests/test_metacheck_wrapper.py b/tests/test_metacheck_wrapper.py new file mode 100644 index 0000000..3723328 --- /dev/null +++ b/tests/test_metacheck_wrapper.py @@ -0,0 +1,114 @@ +"""Tests for metacheck_wrapper module.""" + +import json +import os + +import click +import pytest + +from sw_metadata_bot import metacheck_wrapper + + +def test_filter_blacklisted_repos_removes_matching_urls(tmp_path): + """Repos in the blacklist are excluded from the filtered input file.""" + input_file = tmp_path / "repos.json" + input_file.write_text( + json.dumps( + { + "repositories": [ + "https://github.com/org/keep-me", + "https://github.com/org/blacklisted", + "https://gitlab.com/group/also-blacklisted/", + ] + } + ) + ) + + blacklist_file = tmp_path / "blacklist.json" + blacklist_file.write_text( + json.dumps( + { + "repositories": [ + "https://github.com/org/blacklisted", + "https://gitlab.com/group/also-blacklisted", + ] + } + ) + ) + + filtered_path = metacheck_wrapper._filter_blacklisted_repos( + str(input_file), blacklist_file + ) + + try: + with open(filtered_path, encoding="utf-8") as f: + filtered_data = json.load(f) + + assert filtered_data["repositories"] == ["https://github.com/org/keep-me"] + finally: + os.unlink(filtered_path) + + +def test_filter_blacklisted_repos_preserves_extra_keys(tmp_path): + """Non-repositories keys in the input file are preserved after filtering.""" + input_file = tmp_path / "repos.json" + input_file.write_text( + json.dumps( + { + "repositories": ["https://github.com/org/keep", "https://github.com/org/skip"], + "custom_message": "hello", + } + ) + ) + + blacklist_file = tmp_path / "blacklist.json" + blacklist_file.write_text( + json.dumps({"repositories": ["https://github.com/org/skip"]}) + ) + + filtered_path = metacheck_wrapper._filter_blacklisted_repos( + str(input_file), blacklist_file + ) + + try: + with open(filtered_path, encoding="utf-8") as f: + filtered_data = json.load(f) + + assert filtered_data["repositories"] == ["https://github.com/org/keep"] + assert filtered_data["custom_message"] == "hello" + finally: + os.unlink(filtered_path) + + +def test_filter_blacklisted_repos_empty_blacklist_keeps_all(tmp_path): + """Empty blacklist leaves the repository list unchanged.""" + input_file = tmp_path / "repos.json" + repos = ["https://github.com/org/a", "https://github.com/org/b"] + input_file.write_text(json.dumps({"repositories": repos})) + + blacklist_file = tmp_path / "blacklist.json" + blacklist_file.write_text(json.dumps({"repositories": []})) + + filtered_path = metacheck_wrapper._filter_blacklisted_repos( + str(input_file), blacklist_file + ) + + try: + with open(filtered_path, encoding="utf-8") as f: + filtered_data = json.load(f) + + assert filtered_data["repositories"] == repos + finally: + os.unlink(filtered_path) + + +def test_filter_blacklisted_repos_invalid_blacklist_format_raises(tmp_path): + """Invalid 'repositories' type in blacklist file raises ClickException.""" + input_file = tmp_path / "repos.json" + input_file.write_text(json.dumps({"repositories": ["https://github.com/org/a"]})) + + blacklist_file = tmp_path / "blacklist.json" + blacklist_file.write_text(json.dumps({"repositories": "not-a-list"})) + + with pytest.raises(click.ClickException, match="repositories' must be a list"): + metacheck_wrapper._filter_blacklisted_repos(str(input_file), blacklist_file) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 559c222..57d2fe0 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -83,6 +83,8 @@ def fake_create_issues_main(*, args, standalone_mode): str(output_root / "batch-a" / "202603" / "pitfalls_outputs"), "--analysis-output", str(output_root / "batch-a" / "202603" / "analysis_results.json"), + "--blacklist-file", + str(opt_outs_file), ] assert calls["create_issues"]["standalone_mode"] is False From f4c4809ac90d3bc3215c98a0a98b2099e2bb4074 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 6 Mar 2026 10:17:22 +0000 Subject: [PATCH 3/4] feat: rename --blacklist-file to --blacklist, default to .blacklist, add wildcard support Co-authored-by: vuillaut <4263646+vuillaut@users.noreply.github.com> --- src/sw_metadata_bot/metacheck_wrapper.py | 67 +++++++-- src/sw_metadata_bot/pipeline.py | 2 +- tests/test_metacheck_wrapper.py | 177 +++++++++++++++++++++-- tests/test_pipeline.py | 2 +- 4 files changed, 225 insertions(+), 23 deletions(-) diff --git a/src/sw_metadata_bot/metacheck_wrapper.py b/src/sw_metadata_bot/metacheck_wrapper.py index 5214ba5..cdfc7be 100644 --- a/src/sw_metadata_bot/metacheck_wrapper.py +++ b/src/sw_metadata_bot/metacheck_wrapper.py @@ -1,6 +1,7 @@ """Wrapper for metacheck CLI to integrate with sw-metadata-bot.""" import json +import re import sys import tempfile from pathlib import Path @@ -8,10 +9,53 @@ import click from metacheck import cli as metacheck_cli +DEFAULT_BLACKLIST_FILE = Path(".blacklist") + + +def _pattern_to_regex(pattern: str) -> re.Pattern: + """Compile a blacklist pattern into a regex. + + Entries may be plain URLs (exact match) or patterns using ``*`` as a + wildcard (e.g. ``https://github.com/MyOrg/*``). The pattern is first + escaped with :func:`re.escape` so that URL characters such as ``.`` and + ``/`` are treated as literals, then the escaped form of ``*`` (``\\*``) is + replaced with ``.*`` to restore wildcard behaviour. + + :param pattern: A blacklist entry after trailing-slash stripping. + :return: Compiled regex pattern for ``re.fullmatch``. + """ + escaped = re.escape(pattern) + regex = escaped.replace(r"\*", ".*") + return re.compile(regex) + + +def _is_blacklisted(url: str, patterns: list[str]) -> bool: + """Return True if *url* matches any blacklist pattern. + + :param url: Repository URL to test (trailing slash already stripped). + :param patterns: List of raw blacklist entries from the blacklist file. + :return: True when url matches at least one pattern. + """ + normalized = url.strip().rstrip("/") + for pattern in patterns: + try: + if re.fullmatch(_pattern_to_regex(pattern.strip().rstrip("/")), normalized): + return True + except re.error as exc: + click.echo( + f"Warning: invalid blacklist pattern ignored ({pattern!r}): {exc}", + err=True, + ) + return False + def _filter_blacklisted_repos(input_path: str, blacklist_file: Path) -> str: """Return a temp file path with blacklisted repos removed from the input JSON. + Each entry in the blacklist ``repositories`` list is treated as a pattern + where ``*`` acts as a wildcard (e.g. ``https://github.com/MyOrg/*``). + Full Python regex syntax is also accepted. + :param input_path: Path to the input JSON file containing repository list. :param blacklist_file: Path to the JSON file containing blacklisted repos. :return: Path to a temporary filtered JSON file. @@ -25,7 +69,7 @@ def _filter_blacklisted_repos(input_path: str, blacklist_file: Path) -> str: raise click.ClickException( f"Invalid format in {blacklist_file}: 'repositories' must be a list" ) - blacklisted_set = {url.strip().rstrip("/") for url in blacklisted if isinstance(url, str)} + patterns = [url for url in blacklisted if isinstance(url, str)] with open(input_path, encoding="utf-8") as f: input_data = json.load(f) @@ -34,7 +78,7 @@ def _filter_blacklisted_repos(input_path: str, blacklist_file: Path) -> str: filtered_repos = [ url for url in original_repos - if isinstance(url, str) and url.strip().rstrip("/") not in blacklisted_set + if isinstance(url, str) and not _is_blacklisted(url, patterns) ] skipped = len(original_repos) - len(filtered_repos) @@ -85,20 +129,25 @@ def _filter_blacklisted_repos(input_path: str, blacklist_file: Path) -> str: help="SoMEF confidence threshold (default: 0.8).", ) @click.option( - "--blacklist-file", - type=click.Path(exists=True, dir_okay=False, path_type=Path), - default=None, - help="JSON file listing repositories to exclude from analysis.", + "--blacklist", + type=click.Path(dir_okay=False, path_type=Path), + default=DEFAULT_BLACKLIST_FILE, + show_default=True, + help=( + "JSON file listing repositories (or patterns) to exclude from analysis. " + "Defaults to .blacklist in the current directory; " + "silently ignored when the file does not exist." + ), ) def metacheck_command( - input, skip_somef, pitfalls_output, analysis_output, threshold, blacklist_file + input, skip_somef, pitfalls_output, analysis_output, threshold, blacklist ): """Run metacheck to detect metadata pitfalls in repositories.""" # Apply blacklist filtering when input is a JSON file (not a single URL) effective_input = input.strip() tmp_input_path = None - if blacklist_file is not None and Path(effective_input).is_file(): - tmp_input_path = _filter_blacklisted_repos(effective_input, blacklist_file) + if blacklist is not None and blacklist.is_file() and Path(effective_input).is_file(): + tmp_input_path = _filter_blacklisted_repos(effective_input, blacklist) effective_input = tmp_input_path # Convert click arguments to sys.argv format for metacheck's argparse diff --git a/src/sw_metadata_bot/pipeline.py b/src/sw_metadata_bot/pipeline.py index 6d4b721..9117830 100644 --- a/src/sw_metadata_bot/pipeline.py +++ b/src/sw_metadata_bot/pipeline.py @@ -56,7 +56,7 @@ def run_pipeline( str(pitfalls_output_dir), "--analysis-output", str(analysis_output_file), - "--blacklist-file", + "--blacklist", str(opt_outs_file), ], standalone_mode=False, diff --git a/tests/test_metacheck_wrapper.py b/tests/test_metacheck_wrapper.py index 3723328..97c4f75 100644 --- a/tests/test_metacheck_wrapper.py +++ b/tests/test_metacheck_wrapper.py @@ -24,8 +24,8 @@ def test_filter_blacklisted_repos_removes_matching_urls(tmp_path): ) ) - blacklist_file = tmp_path / "blacklist.json" - blacklist_file.write_text( + blacklist = tmp_path / "blacklist.json" + blacklist.write_text( json.dumps( { "repositories": [ @@ -37,7 +37,7 @@ def test_filter_blacklisted_repos_removes_matching_urls(tmp_path): ) filtered_path = metacheck_wrapper._filter_blacklisted_repos( - str(input_file), blacklist_file + str(input_file), blacklist ) try: @@ -49,6 +49,105 @@ def test_filter_blacklisted_repos_removes_matching_urls(tmp_path): os.unlink(filtered_path) +def test_filter_blacklisted_repos_wildcard_pattern(tmp_path): + """Glob-style wildcard patterns match all repos in an organisation.""" + input_file = tmp_path / "repos.json" + input_file.write_text( + json.dumps( + { + "repositories": [ + "https://github.com/SoftwareUnderstanding/repo-a", + "https://github.com/SoftwareUnderstanding/repo-b", + "https://github.com/other-org/keep-me", + ] + } + ) + ) + + blacklist = tmp_path / "blacklist.json" + blacklist.write_text( + json.dumps({"repositories": ["https://github.com/SoftwareUnderstanding/*"]}) + ) + + filtered_path = metacheck_wrapper._filter_blacklisted_repos( + str(input_file), blacklist + ) + + try: + with open(filtered_path, encoding="utf-8") as f: + filtered_data = json.load(f) + + assert filtered_data["repositories"] == ["https://github.com/other-org/keep-me"] + finally: + os.unlink(filtered_path) + + +def test_filter_blacklisted_repos_wildcard_suffix(tmp_path): + """Wildcard suffix on a prefix matches repos whose name starts with the prefix.""" + input_file = tmp_path / "repos.json" + input_file.write_text( + json.dumps( + { + "repositories": [ + "https://github.com/org/skip-123", + "https://github.com/org/skip-456", + "https://github.com/org/keep-me", + ] + } + ) + ) + + blacklist = tmp_path / "blacklist.json" + blacklist.write_text( + json.dumps({"repositories": ["https://github.com/org/skip-*"]}) + ) + + filtered_path = metacheck_wrapper._filter_blacklisted_repos( + str(input_file), blacklist + ) + + try: + with open(filtered_path, encoding="utf-8") as f: + filtered_data = json.load(f) + + assert filtered_data["repositories"] == ["https://github.com/org/keep-me"] + finally: + os.unlink(filtered_path) + + +def test_filter_blacklisted_repos_dot_in_url_is_literal(tmp_path): + """Dots in URLs are treated as literals, not regex 'any character'.""" + input_file = tmp_path / "repos.json" + input_file.write_text( + json.dumps( + { + "repositories": [ + "https://github.com/org/repo", + "https://githubXcom/org/repo", # dot replaced by X + ] + } + ) + ) + + blacklist = tmp_path / "blacklist.json" + blacklist.write_text( + json.dumps({"repositories": ["https://github.com/org/repo"]}) + ) + + filtered_path = metacheck_wrapper._filter_blacklisted_repos( + str(input_file), blacklist + ) + + try: + with open(filtered_path, encoding="utf-8") as f: + filtered_data = json.load(f) + + # Only the exact-match URL is removed; the X-variant is kept + assert filtered_data["repositories"] == ["https://githubXcom/org/repo"] + finally: + os.unlink(filtered_path) + + def test_filter_blacklisted_repos_preserves_extra_keys(tmp_path): """Non-repositories keys in the input file are preserved after filtering.""" input_file = tmp_path / "repos.json" @@ -61,13 +160,13 @@ def test_filter_blacklisted_repos_preserves_extra_keys(tmp_path): ) ) - blacklist_file = tmp_path / "blacklist.json" - blacklist_file.write_text( + blacklist = tmp_path / "blacklist.json" + blacklist.write_text( json.dumps({"repositories": ["https://github.com/org/skip"]}) ) filtered_path = metacheck_wrapper._filter_blacklisted_repos( - str(input_file), blacklist_file + str(input_file), blacklist ) try: @@ -86,11 +185,11 @@ def test_filter_blacklisted_repos_empty_blacklist_keeps_all(tmp_path): repos = ["https://github.com/org/a", "https://github.com/org/b"] input_file.write_text(json.dumps({"repositories": repos})) - blacklist_file = tmp_path / "blacklist.json" - blacklist_file.write_text(json.dumps({"repositories": []})) + blacklist = tmp_path / "blacklist.json" + blacklist.write_text(json.dumps({"repositories": []})) filtered_path = metacheck_wrapper._filter_blacklisted_repos( - str(input_file), blacklist_file + str(input_file), blacklist ) try: @@ -107,8 +206,62 @@ def test_filter_blacklisted_repos_invalid_blacklist_format_raises(tmp_path): input_file = tmp_path / "repos.json" input_file.write_text(json.dumps({"repositories": ["https://github.com/org/a"]})) - blacklist_file = tmp_path / "blacklist.json" - blacklist_file.write_text(json.dumps({"repositories": "not-a-list"})) + blacklist = tmp_path / "blacklist.json" + blacklist.write_text(json.dumps({"repositories": "not-a-list"})) with pytest.raises(click.ClickException, match="repositories' must be a list"): - metacheck_wrapper._filter_blacklisted_repos(str(input_file), blacklist_file) + metacheck_wrapper._filter_blacklisted_repos(str(input_file), blacklist) + + +def test_is_blacklisted_wildcard(): + """Wildcard pattern matches repos in the specified organisation.""" + patterns = ["https://github.com/MyOrg/*"] + assert metacheck_wrapper._is_blacklisted("https://github.com/MyOrg/repo-a", patterns) + assert metacheck_wrapper._is_blacklisted("https://github.com/MyOrg/repo-b", patterns) + assert not metacheck_wrapper._is_blacklisted("https://github.com/OtherOrg/repo", patterns) + + +def test_is_blacklisted_exact_url(): + """Exact URL (no regex) is matched correctly.""" + patterns = ["https://github.com/org/repo"] + assert metacheck_wrapper._is_blacklisted("https://github.com/org/repo", patterns) + assert metacheck_wrapper._is_blacklisted("https://github.com/org/repo/", patterns) + assert not metacheck_wrapper._is_blacklisted("https://github.com/org/other", patterns) + + +def test_is_blacklisted_trailing_slash_normalization(): + """Trailing slashes are stripped before matching.""" + patterns = ["https://github.com/org/repo/"] + assert metacheck_wrapper._is_blacklisted("https://github.com/org/repo", patterns) + + +def test_missing_default_blacklist_is_silently_skipped(tmp_path, monkeypatch): + """When the default .blacklist file is absent, no filtering is applied.""" + monkeypatch.chdir(tmp_path) # ensure no .blacklist exists in CWD + + def fake_metacheck_cli(): + pass + + monkeypatch.setattr(metacheck_wrapper, "metacheck_cli", fake_metacheck_cli) + + input_file = tmp_path / "repos.json" + input_file.write_text(json.dumps({"repositories": ["https://github.com/org/a"]})) + + # Invoke directly without providing --blacklist; default .blacklist does not exist + from click.testing import CliRunner + + runner = CliRunner() + result = runner.invoke( + metacheck_wrapper.metacheck_command, + [ + "--input", + str(input_file), + "--pitfalls-output", + str(tmp_path / "out"), + "--analysis-output", + str(tmp_path / "results.json"), + ], + ) + # Should not error due to missing default .blacklist + assert "Error" not in result.output + assert result.exit_code == 0 diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 57d2fe0..008e716 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -83,7 +83,7 @@ def fake_create_issues_main(*, args, standalone_mode): str(output_root / "batch-a" / "202603" / "pitfalls_outputs"), "--analysis-output", str(output_root / "batch-a" / "202603" / "analysis_results.json"), - "--blacklist-file", + "--blacklist", str(opt_outs_file), ] From 7639cd6e8be323de883baabe02ff6b4d583c5287 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 6 Mar 2026 10:45:51 +0000 Subject: [PATCH 4/4] refactor: rename blacklist to opt-outs throughout metacheck_wrapper and tests Co-authored-by: vuillaut <4263646+vuillaut@users.noreply.github.com> --- src/sw_metadata_bot/metacheck_wrapper.py | 54 +++++------ src/sw_metadata_bot/pipeline.py | 2 +- tests/test_metacheck_wrapper.py | 112 +++++++++++------------ tests/test_pipeline.py | 2 +- 4 files changed, 85 insertions(+), 85 deletions(-) diff --git a/src/sw_metadata_bot/metacheck_wrapper.py b/src/sw_metadata_bot/metacheck_wrapper.py index cdfc7be..eae984f 100644 --- a/src/sw_metadata_bot/metacheck_wrapper.py +++ b/src/sw_metadata_bot/metacheck_wrapper.py @@ -9,11 +9,11 @@ import click from metacheck import cli as metacheck_cli -DEFAULT_BLACKLIST_FILE = Path(".blacklist") +DEFAULT_OPT_OUTS_FILE = Path(".opt-outs") def _pattern_to_regex(pattern: str) -> re.Pattern: - """Compile a blacklist pattern into a regex. + """Compile an opt-outs pattern into a regex. Entries may be plain URLs (exact match) or patterns using ``*`` as a wildcard (e.g. ``https://github.com/MyOrg/*``). The pattern is first @@ -21,7 +21,7 @@ def _pattern_to_regex(pattern: str) -> re.Pattern: ``/`` are treated as literals, then the escaped form of ``*`` (``\\*``) is replaced with ``.*`` to restore wildcard behaviour. - :param pattern: A blacklist entry after trailing-slash stripping. + :param pattern: An opt-outs entry after trailing-slash stripping. :return: Compiled regex pattern for ``re.fullmatch``. """ escaped = re.escape(pattern) @@ -29,11 +29,11 @@ def _pattern_to_regex(pattern: str) -> re.Pattern: return re.compile(regex) -def _is_blacklisted(url: str, patterns: list[str]) -> bool: - """Return True if *url* matches any blacklist pattern. +def _is_opted_out(url: str, patterns: list[str]) -> bool: + """Return True if *url* matches any opt-outs pattern. :param url: Repository URL to test (trailing slash already stripped). - :param patterns: List of raw blacklist entries from the blacklist file. + :param patterns: List of raw opt-outs entries from the opt-outs file. :return: True when url matches at least one pattern. """ normalized = url.strip().rstrip("/") @@ -43,33 +43,33 @@ def _is_blacklisted(url: str, patterns: list[str]) -> bool: return True except re.error as exc: click.echo( - f"Warning: invalid blacklist pattern ignored ({pattern!r}): {exc}", + f"Warning: invalid opt-outs pattern ignored ({pattern!r}): {exc}", err=True, ) return False -def _filter_blacklisted_repos(input_path: str, blacklist_file: Path) -> str: - """Return a temp file path with blacklisted repos removed from the input JSON. +def _filter_opt_out_repos(input_path: str, opt_outs_file: Path) -> str: + """Return a temp file path with opted-out repos removed from the input JSON. - Each entry in the blacklist ``repositories`` list is treated as a pattern + Each entry in the opt-outs ``repositories`` list is treated as a pattern where ``*`` acts as a wildcard (e.g. ``https://github.com/MyOrg/*``). Full Python regex syntax is also accepted. :param input_path: Path to the input JSON file containing repository list. - :param blacklist_file: Path to the JSON file containing blacklisted repos. + :param opt_outs_file: Path to the JSON file containing opted-out repos. :return: Path to a temporary filtered JSON file. - :raises click.ClickException: If blacklist file has invalid format. + :raises click.ClickException: If opt-outs file has invalid format. """ - with open(blacklist_file, encoding="utf-8") as f: - blacklist_data = json.load(f) + with open(opt_outs_file, encoding="utf-8") as f: + opt_outs_data = json.load(f) - blacklisted = blacklist_data.get("repositories", []) - if not isinstance(blacklisted, list): + opted_out = opt_outs_data.get("repositories", []) + if not isinstance(opted_out, list): raise click.ClickException( - f"Invalid format in {blacklist_file}: 'repositories' must be a list" + f"Invalid format in {opt_outs_file}: 'repositories' must be a list" ) - patterns = [url for url in blacklisted if isinstance(url, str)] + patterns = [url for url in opted_out if isinstance(url, str)] with open(input_path, encoding="utf-8") as f: input_data = json.load(f) @@ -78,13 +78,13 @@ def _filter_blacklisted_repos(input_path: str, blacklist_file: Path) -> str: filtered_repos = [ url for url in original_repos - if isinstance(url, str) and not _is_blacklisted(url, patterns) + if isinstance(url, str) and not _is_opted_out(url, patterns) ] skipped = len(original_repos) - len(filtered_repos) if skipped > 0: click.echo( - f"Blacklist: skipping {skipped} blacklisted " + f"Opt-outs: skipping {skipped} opted-out " f"{'repository' if skipped == 1 else 'repositories'} from analysis." ) @@ -129,25 +129,25 @@ def _filter_blacklisted_repos(input_path: str, blacklist_file: Path) -> str: help="SoMEF confidence threshold (default: 0.8).", ) @click.option( - "--blacklist", + "--opt-outs", type=click.Path(dir_okay=False, path_type=Path), - default=DEFAULT_BLACKLIST_FILE, + default=DEFAULT_OPT_OUTS_FILE, show_default=True, help=( "JSON file listing repositories (or patterns) to exclude from analysis. " - "Defaults to .blacklist in the current directory; " + "Defaults to .opt-outs in the current directory; " "silently ignored when the file does not exist." ), ) def metacheck_command( - input, skip_somef, pitfalls_output, analysis_output, threshold, blacklist + input, skip_somef, pitfalls_output, analysis_output, threshold, opt_outs ): """Run metacheck to detect metadata pitfalls in repositories.""" - # Apply blacklist filtering when input is a JSON file (not a single URL) + # Apply opt-outs filtering when input is a JSON file (not a single URL) effective_input = input.strip() tmp_input_path = None - if blacklist is not None and blacklist.is_file() and Path(effective_input).is_file(): - tmp_input_path = _filter_blacklisted_repos(effective_input, blacklist) + if opt_outs is not None and opt_outs.is_file() and Path(effective_input).is_file(): + tmp_input_path = _filter_opt_out_repos(effective_input, opt_outs) effective_input = tmp_input_path # Convert click arguments to sys.argv format for metacheck's argparse diff --git a/src/sw_metadata_bot/pipeline.py b/src/sw_metadata_bot/pipeline.py index 9117830..edd1b31 100644 --- a/src/sw_metadata_bot/pipeline.py +++ b/src/sw_metadata_bot/pipeline.py @@ -56,7 +56,7 @@ def run_pipeline( str(pitfalls_output_dir), "--analysis-output", str(analysis_output_file), - "--blacklist", + "--opt-outs", str(opt_outs_file), ], standalone_mode=False, diff --git a/tests/test_metacheck_wrapper.py b/tests/test_metacheck_wrapper.py index 97c4f75..499b5d9 100644 --- a/tests/test_metacheck_wrapper.py +++ b/tests/test_metacheck_wrapper.py @@ -9,35 +9,35 @@ from sw_metadata_bot import metacheck_wrapper -def test_filter_blacklisted_repos_removes_matching_urls(tmp_path): - """Repos in the blacklist are excluded from the filtered input file.""" +def test_filter_opt_out_repos_removes_matching_urls(tmp_path): + """Repos in the opt-outs list are excluded from the filtered input file.""" input_file = tmp_path / "repos.json" input_file.write_text( json.dumps( { "repositories": [ "https://github.com/org/keep-me", - "https://github.com/org/blacklisted", - "https://gitlab.com/group/also-blacklisted/", + "https://github.com/org/opted-out", + "https://gitlab.com/group/also-opted-out/", ] } ) ) - blacklist = tmp_path / "blacklist.json" - blacklist.write_text( + opt_outs = tmp_path / "opt-outs.json" + opt_outs.write_text( json.dumps( { "repositories": [ - "https://github.com/org/blacklisted", - "https://gitlab.com/group/also-blacklisted", + "https://github.com/org/opted-out", + "https://gitlab.com/group/also-opted-out", ] } ) ) - filtered_path = metacheck_wrapper._filter_blacklisted_repos( - str(input_file), blacklist + filtered_path = metacheck_wrapper._filter_opt_out_repos( + str(input_file), opt_outs ) try: @@ -49,7 +49,7 @@ def test_filter_blacklisted_repos_removes_matching_urls(tmp_path): os.unlink(filtered_path) -def test_filter_blacklisted_repos_wildcard_pattern(tmp_path): +def test_filter_opt_out_repos_wildcard_pattern(tmp_path): """Glob-style wildcard patterns match all repos in an organisation.""" input_file = tmp_path / "repos.json" input_file.write_text( @@ -64,13 +64,13 @@ def test_filter_blacklisted_repos_wildcard_pattern(tmp_path): ) ) - blacklist = tmp_path / "blacklist.json" - blacklist.write_text( + opt_outs = tmp_path / "opt-outs.json" + opt_outs.write_text( json.dumps({"repositories": ["https://github.com/SoftwareUnderstanding/*"]}) ) - filtered_path = metacheck_wrapper._filter_blacklisted_repos( - str(input_file), blacklist + filtered_path = metacheck_wrapper._filter_opt_out_repos( + str(input_file), opt_outs ) try: @@ -82,7 +82,7 @@ def test_filter_blacklisted_repos_wildcard_pattern(tmp_path): os.unlink(filtered_path) -def test_filter_blacklisted_repos_wildcard_suffix(tmp_path): +def test_filter_opt_out_repos_wildcard_suffix(tmp_path): """Wildcard suffix on a prefix matches repos whose name starts with the prefix.""" input_file = tmp_path / "repos.json" input_file.write_text( @@ -97,13 +97,13 @@ def test_filter_blacklisted_repos_wildcard_suffix(tmp_path): ) ) - blacklist = tmp_path / "blacklist.json" - blacklist.write_text( + opt_outs = tmp_path / "opt-outs.json" + opt_outs.write_text( json.dumps({"repositories": ["https://github.com/org/skip-*"]}) ) - filtered_path = metacheck_wrapper._filter_blacklisted_repos( - str(input_file), blacklist + filtered_path = metacheck_wrapper._filter_opt_out_repos( + str(input_file), opt_outs ) try: @@ -115,7 +115,7 @@ def test_filter_blacklisted_repos_wildcard_suffix(tmp_path): os.unlink(filtered_path) -def test_filter_blacklisted_repos_dot_in_url_is_literal(tmp_path): +def test_filter_opt_out_repos_dot_in_url_is_literal(tmp_path): """Dots in URLs are treated as literals, not regex 'any character'.""" input_file = tmp_path / "repos.json" input_file.write_text( @@ -129,13 +129,13 @@ def test_filter_blacklisted_repos_dot_in_url_is_literal(tmp_path): ) ) - blacklist = tmp_path / "blacklist.json" - blacklist.write_text( + opt_outs = tmp_path / "opt-outs.json" + opt_outs.write_text( json.dumps({"repositories": ["https://github.com/org/repo"]}) ) - filtered_path = metacheck_wrapper._filter_blacklisted_repos( - str(input_file), blacklist + filtered_path = metacheck_wrapper._filter_opt_out_repos( + str(input_file), opt_outs ) try: @@ -148,7 +148,7 @@ def test_filter_blacklisted_repos_dot_in_url_is_literal(tmp_path): os.unlink(filtered_path) -def test_filter_blacklisted_repos_preserves_extra_keys(tmp_path): +def test_filter_opt_out_repos_preserves_extra_keys(tmp_path): """Non-repositories keys in the input file are preserved after filtering.""" input_file = tmp_path / "repos.json" input_file.write_text( @@ -160,13 +160,13 @@ def test_filter_blacklisted_repos_preserves_extra_keys(tmp_path): ) ) - blacklist = tmp_path / "blacklist.json" - blacklist.write_text( + opt_outs = tmp_path / "opt-outs.json" + opt_outs.write_text( json.dumps({"repositories": ["https://github.com/org/skip"]}) ) - filtered_path = metacheck_wrapper._filter_blacklisted_repos( - str(input_file), blacklist + filtered_path = metacheck_wrapper._filter_opt_out_repos( + str(input_file), opt_outs ) try: @@ -179,17 +179,17 @@ def test_filter_blacklisted_repos_preserves_extra_keys(tmp_path): os.unlink(filtered_path) -def test_filter_blacklisted_repos_empty_blacklist_keeps_all(tmp_path): - """Empty blacklist leaves the repository list unchanged.""" +def test_filter_opt_out_repos_empty_list_keeps_all(tmp_path): + """Empty opt-outs list leaves the repository list unchanged.""" input_file = tmp_path / "repos.json" repos = ["https://github.com/org/a", "https://github.com/org/b"] input_file.write_text(json.dumps({"repositories": repos})) - blacklist = tmp_path / "blacklist.json" - blacklist.write_text(json.dumps({"repositories": []})) + opt_outs = tmp_path / "opt-outs.json" + opt_outs.write_text(json.dumps({"repositories": []})) - filtered_path = metacheck_wrapper._filter_blacklisted_repos( - str(input_file), blacklist + filtered_path = metacheck_wrapper._filter_opt_out_repos( + str(input_file), opt_outs ) try: @@ -201,43 +201,43 @@ def test_filter_blacklisted_repos_empty_blacklist_keeps_all(tmp_path): os.unlink(filtered_path) -def test_filter_blacklisted_repos_invalid_blacklist_format_raises(tmp_path): - """Invalid 'repositories' type in blacklist file raises ClickException.""" +def test_filter_opt_out_repos_invalid_format_raises(tmp_path): + """Invalid 'repositories' type in opt-outs file raises ClickException.""" input_file = tmp_path / "repos.json" input_file.write_text(json.dumps({"repositories": ["https://github.com/org/a"]})) - blacklist = tmp_path / "blacklist.json" - blacklist.write_text(json.dumps({"repositories": "not-a-list"})) + opt_outs = tmp_path / "opt-outs.json" + opt_outs.write_text(json.dumps({"repositories": "not-a-list"})) with pytest.raises(click.ClickException, match="repositories' must be a list"): - metacheck_wrapper._filter_blacklisted_repos(str(input_file), blacklist) + metacheck_wrapper._filter_opt_out_repos(str(input_file), opt_outs) -def test_is_blacklisted_wildcard(): +def test_is_opted_out_wildcard(): """Wildcard pattern matches repos in the specified organisation.""" patterns = ["https://github.com/MyOrg/*"] - assert metacheck_wrapper._is_blacklisted("https://github.com/MyOrg/repo-a", patterns) - assert metacheck_wrapper._is_blacklisted("https://github.com/MyOrg/repo-b", patterns) - assert not metacheck_wrapper._is_blacklisted("https://github.com/OtherOrg/repo", patterns) + assert metacheck_wrapper._is_opted_out("https://github.com/MyOrg/repo-a", patterns) + assert metacheck_wrapper._is_opted_out("https://github.com/MyOrg/repo-b", patterns) + assert not metacheck_wrapper._is_opted_out("https://github.com/OtherOrg/repo", patterns) -def test_is_blacklisted_exact_url(): +def test_is_opted_out_exact_url(): """Exact URL (no regex) is matched correctly.""" patterns = ["https://github.com/org/repo"] - assert metacheck_wrapper._is_blacklisted("https://github.com/org/repo", patterns) - assert metacheck_wrapper._is_blacklisted("https://github.com/org/repo/", patterns) - assert not metacheck_wrapper._is_blacklisted("https://github.com/org/other", patterns) + assert metacheck_wrapper._is_opted_out("https://github.com/org/repo", patterns) + assert metacheck_wrapper._is_opted_out("https://github.com/org/repo/", patterns) + assert not metacheck_wrapper._is_opted_out("https://github.com/org/other", patterns) -def test_is_blacklisted_trailing_slash_normalization(): +def test_is_opted_out_trailing_slash_normalization(): """Trailing slashes are stripped before matching.""" patterns = ["https://github.com/org/repo/"] - assert metacheck_wrapper._is_blacklisted("https://github.com/org/repo", patterns) + assert metacheck_wrapper._is_opted_out("https://github.com/org/repo", patterns) -def test_missing_default_blacklist_is_silently_skipped(tmp_path, monkeypatch): - """When the default .blacklist file is absent, no filtering is applied.""" - monkeypatch.chdir(tmp_path) # ensure no .blacklist exists in CWD +def test_missing_default_opt_outs_is_silently_skipped(tmp_path, monkeypatch): + """When the default .opt-outs file is absent, no filtering is applied.""" + monkeypatch.chdir(tmp_path) # ensure no .opt-outs exists in CWD def fake_metacheck_cli(): pass @@ -247,7 +247,7 @@ def fake_metacheck_cli(): input_file = tmp_path / "repos.json" input_file.write_text(json.dumps({"repositories": ["https://github.com/org/a"]})) - # Invoke directly without providing --blacklist; default .blacklist does not exist + # Invoke directly without providing --opt-outs; default .opt-outs does not exist from click.testing import CliRunner runner = CliRunner() @@ -262,6 +262,6 @@ def fake_metacheck_cli(): str(tmp_path / "results.json"), ], ) - # Should not error due to missing default .blacklist + # Should not error due to missing default .opt-outs assert "Error" not in result.output assert result.exit_code == 0 diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 008e716..391bc7c 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -83,7 +83,7 @@ def fake_create_issues_main(*, args, standalone_mode): str(output_root / "batch-a" / "202603" / "pitfalls_outputs"), "--analysis-output", str(output_root / "batch-a" / "202603" / "analysis_results.json"), - "--blacklist", + "--opt-outs", str(opt_outs_file), ]