From 0f2fe24dcd116ee98a4547cbe3824fb93890216e Mon Sep 17 00:00:00 2001 From: Alex Dryden Date: Wed, 11 Feb 2026 14:13:05 -0500 Subject: [PATCH 01/14] feat(arclight#29): Add creator/agent indexing system for ArcLight MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement complete ETL pipeline for ArchivesSpace agents: - Extract all agent records via ArchivesSpace API - Generate EAC-CPF XML documents for each agent - Auto-discover and configure traject indexing - Batch index to Solr (100 files per call for performance) - Support multiple processing modes (agents-only, collections-only, both) - Add 11 new Solr fields for agent metadata - Include 271-line traject config for EAC-CPF → Solr mapping Key features: - Parallel to existing collection record indexing - Dynamic Solr field mapping for ArcLight compatibility - Robust error handling and logging - Configurable traject config discovery paths This allows ArcLight to provide dedicated agent/creator pages with full biographical information, related collections, and authority control. --- README.md | 189 ++++++++++++++++- arcflow/main.py | 430 ++++++++++++++++++++++++++++++++++++-- traject_config_eac_cpf.rb | 275 ++++++++++++++++++++++++ 3 files changed, 880 insertions(+), 14 deletions(-) create mode 100644 traject_config_eac_cpf.rb diff --git a/README.md b/README.md index f6397ac..6c570bf 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,190 @@ # ArcFlow -Code for exporting data from ArchivesSpace to ArcLight, along with additional utility scripts for data handling and transformation. \ No newline at end of file +Code for exporting data from ArchivesSpace to ArcLight, along with additional utility scripts for data handling and transformation. + +## Quick Start + +This directory contains a complete, working installation of arcflow with creator records support. To run it: + +```bash +# 1. Install dependencies +pip install -r requirements.txt + +# 2. 
Configure credentials
cp .archivessnake.yml.example .archivessnake.yml
nano .archivessnake.yml # Add your ArchivesSpace credentials

# 3. Set environment variables
export ARCLIGHT_DIR=/path/to/your/arclight-app
export ASPACE_DIR=/path/to/your/archivesspace
export SOLR_URL=http://localhost:8983/solr/blacklight-core

# 4. Run arcflow
python -m arcflow.main

```

---

## Features

- **Collection Indexing**: Exports EAD XML from ArchivesSpace and indexes to ArcLight Solr
- **Creator Records**: Extracts creator agent information and indexes as standalone documents
- **Biographical Notes**: Injects creator biographical/historical notes into collection EAD XML
- **PDF Generation**: Generates finding aid PDFs via ArchivesSpace jobs
- **Incremental Updates**: Supports modified-since filtering for efficient updates

## Creator Records

ArcFlow now generates standalone creator documents in addition to collection records. Creator documents:

- Include biographical/historical notes from ArchivesSpace agent records
- Link to all collections where the creator is listed
- Can be searched and displayed independently in ArcLight
- Are marked with `is_creator: true` to distinguish from collections
- Must be indexed into a Solr instance whose schema defines the matching creator fields and facets (see "Configure Solr Schema" below)

### How Creator Records Work

1. **Extraction**: `get_all_agents()` fetches all agents from ArchivesSpace
2. **Processing**: `task_agent()` generates an EAC-CPF XML document for each agent with bioghist notes
3. **Linking**: Handled via Solr using the persistent_id field (agents and collections linked through bioghist references)
4. 
**Indexing**: Creator XML files are indexed to Solr using `traject_config_eac_cpf.rb` + +### Creator Document Format + +Creator documents are stored as XML files in `agents/` directory using the ArchivesSpace EAC-CPF export: + +```xml + + + + + + corporateBody + + Core: Leadership, Infrastructure, Futures + local + + + + + 2020- + + +

Founded on September 1, 2020, the Core: Leadership, Infrastructure, Futures division of the American Library Association has a mission to cultivate and amplify the collective expertise of library workers in core functions through community building, advocacy, and learning. + In June 2020, the ALA Council voted to approve Core: Leadership, Infrastructure, Futures as a new ALA division beginning September 1, 2020, and to dissolve the Association for Library Collections and Technical Services (ALCTS), the Library Information Technology Association (LITA) and the Library Leadership and Management Association (LLAMA) effective August 31, 2020. The vote to form Core was 163 to 1.(1)

+ 1. "ALA Council approves Core; dissolves ALCTS, LITA and LLAMA," July 1, 2020, http://www.ala.org/news/member-news/2020/07/ala-council-approves-core-dissolves-alcts-lita-and-llama. +
+
+ +
+
+``` + +### Indexing Creator Documents + +#### Configure Solr Schema (Required Before Indexing) + +⚠️ **CRITICAL PREREQUISITE** - Before you can index creator records to Solr, you must configure the Solr schema. + +**See [SOLR_SCHEMA.md](SOLR_SCHEMA.md) for complete instructions on:** +- Which fields to add (is_creator, creator_persistent_id, etc.) +- Three methods to add them (Schema API recommended, managed-schema, or schema.xml) +- How to verify they're added +- Troubleshooting "unknown field" errors + +**Quick Schema Setup (Schema API method):** +```bash +# Add is_creator field +curl -X POST -H 'Content-type:application/json' \ + http://localhost:8983/solr/blacklight-core/schema \ + -d '{"add-field": {"name": "is_creator", "type": "boolean", "indexed": true, "stored": true}}' + +# Add other required fields (see SOLR_SCHEMA.md for complete list) +``` + +**Verify schema is configured:** +```bash +curl "http://localhost:8983/solr/blacklight-core/schema/fields/is_creator" +# Should return field definition, not 404 +``` + +⚠️ **If you skip this step, you'll get:** +``` +ERROR: [doc=creator_corporate_entities_584] unknown field 'is_creator' +``` + +This is a **one-time setup** per Solr instance. + +--- + +To index creator documents to Solr: + +```bash +bundle exec traject \ + -u http://localhost:8983/solr/blacklight-core \ + -i xml \ + -c traject_config_eac_cpf.rb \ + /path/to/agents/*.xml +``` + +Or integrate into your ArcFlow deployment workflow. + +## Installation + +See the original installation instructions in your deployment documentation. + +## Configuration + +- `.archivessnake.yml` - ArchivesSpace API credentials +- `.arcflow.yml` - Last update timestamp tracking + +## Usage + +```bash +python -m arcflow.main --arclight-dir /path --aspace-dir /path --solr-url http://... 
[options]
```

### Command Line Options

Required arguments:
- `--arclight-dir` - Path to ArcLight installation directory
- `--aspace-dir` - Path to ArchivesSpace installation directory
- `--solr-url` - URL of the Solr core (e.g., http://localhost:8983/solr/blacklight-core)

Optional arguments:
- `--force-update` - Force update of all data (recreates everything from scratch)
- `--traject-extra-config` - Path to extra Traject configuration file
- `--agents-only` - Process only agent records, skip collections (useful for testing agents)
- `--collections-only` - Skips creator processing; exports EADs, generates PDF finding aids, and indexes collections
- `--skip-creator-indexing` - Collects EAC-CPF files only, does not index into Solr

### Examples

**Normal run (process all collections and agents):**
```bash
python -m arcflow.main \
    --arclight-dir /path/to/arclight \
    --aspace-dir /path/to/archivesspace \
    --solr-url http://localhost:8983/solr/blacklight-core
```

**Process only agents (skip collections):**
```bash
python -m arcflow.main \
    --arclight-dir /path/to/arclight \
    --aspace-dir /path/to/archivesspace \
    --solr-url http://localhost:8983/solr/blacklight-core \
    --agents-only
```

**Force full update:**
```bash
python -m arcflow.main \
    --arclight-dir /path/to/arclight \
    --aspace-dir /path/to/archivesspace \
    --solr-url http://localhost:8983/solr/blacklight-core \
    --force-update
```

See `--help` for all available options. 
\ No newline at end of file diff --git a/arcflow/main.py b/arcflow/main.py index a8621fa..292d049 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -9,12 +9,14 @@ import re import logging import math +import sys from xml.dom.pulldom import parse, START_ELEMENT from xml.sax.saxutils import escape as xml_escape +from xml.etree import ElementTree as ET from datetime import datetime, timezone from asnake.client import ASnakeClient from multiprocessing.pool import ThreadPool as Pool -from utils.stage_classifications import extract_labels +from .utils.stage_classifications import extract_labels base_dir = os.path.abspath((__file__) + "/../../") @@ -38,14 +40,19 @@ class ArcFlow: """ - def __init__(self, arclight_dir, aspace_dir, solr_url, traject_extra_config='', force_update=False): + def __init__(self, arclight_dir, aspace_dir, solr_url, traject_extra_config='', force_update=False, agents_only=False, collections_only=False, arcuit_dir=None, skip_creator_indexing=False): self.solr_url = solr_url self.batch_size = 1000 - self.traject_extra_config = f'-c {traject_extra_config}' if traject_extra_config.strip() else '' + clean_extra_config = traject_extra_config.strip() + self.traject_extra_config = clean_extra_config or None self.arclight_dir = arclight_dir self.aspace_jobs_dir = f'{aspace_dir}/data/shared/job_files' self.job_type = 'print_to_pdf_job' self.force_update = force_update + self.agents_only = agents_only + self.collections_only = collections_only + self.arcuit_dir = arcuit_dir + self.skip_creator_indexing = skip_creator_indexing self.log = logging.getLogger('arcflow') self.pid = os.getpid() self.pid_file_path = os.path.join(base_dir, 'arcflow.pid') @@ -395,6 +402,7 @@ def update_eads(self): pdf_dir = f'{self.arclight_dir}/public/pdf' modified_since = int(self.last_updated.timestamp()) + if self.force_update or modified_since <= 0: modified_since = 0 # delete all EADs and related files in ArcLight Solr @@ -454,7 +462,7 @@ def update_eads(self): # Tasks for 
indexing pending resources results_3 = [pool.apply_async( - self.index, + self.index_collections, args=(repo_id, f'{xml_dir}/{repo_id}_*_batch_{batch_num}.xml', indent_size)) for repo_id, batch_num in batches] @@ -527,17 +535,58 @@ def update_eads(self): page += 1 - def index(self, repo_id, xml_file_path, indent_size=0): + def index_collections(self, repo_id, xml_file_path, indent_size=0): + """Index collection XML files to Solr using traject.""" indent = ' ' * indent_size self.log.info(f'{indent}Indexing pending resources in repository ID {repo_id} to ArcLight Solr...') try: + # Get arclight traject config path + result_show = subprocess.run( + ['bundle', 'show', 'arclight'], + capture_output=True, + text=True, + cwd=self.arclight_dir + ) + arclight_path = result_show.stdout.strip() if result_show.returncode == 0 else '' + + if not arclight_path: + self.log.error(f'{indent}Could not find arclight gem path') + return + + traject_config = f'{arclight_path}/lib/arclight/traject/ead2_config.rb' + + cmd = [ + 'bundle', 'exec', 'traject', + '-u', self.solr_url, + '-s', 'processing_thread_pool=8', + '-s', 'solr_writer.thread_pool=8', + '-s', f'solr_writer.batch_size={self.batch_size}', + '-s', 'solr_writer.commit_on_close=true', + '-i', 'xml', + '-c', traject_config + ] + + if self.traject_extra_config: + if isinstance(self.traject_extra_config, (list, tuple)): + cmd.extend(self.traject_extra_config) + else: + # Treat a string extra config as a path and pass it with -c + cmd.extend(['-c', self.traject_extra_config]) + + cmd.append(xml_file_path) + + env = os.environ.copy() + env['REPOSITORY_ID'] = str(repo_id) + result = subprocess.run( - f'REPOSITORY_ID={repo_id} bundle exec traject -u {self.solr_url} -s processing_thread_pool=8 -s solr_writer.thread_pool=8 -s solr_writer.batch_size={self.batch_size} -s solr_writer.commit_on_close=true -i xml -c $(bundle show arclight)/lib/arclight/traject/ead2_config.rb {self.traject_extra_config} {xml_file_path}', -# 
f'FILE={xml_file_path} SOLR_URL={self.solr_url} REPOSITORY_ID={repo_id} TRAJECT_SETTINGS="processing_thread_pool=8 solr_writer.thread_pool=8 solr_writer.batch_size=1000 solr_writer.commit_on_close=false" bundle exec rake arcuit:index', - shell=True, + cmd, cwd=self.arclight_dir, - stderr=subprocess.PIPE,) - self.log.error(f'{indent}{result.stderr.decode("utf-8")}') + env=env, + stderr=subprocess.PIPE, + ) + + if result.stderr: + self.log.error(f'{indent}{result.stderr.decode("utf-8")}') if result.returncode != 0: self.log.error(f'{indent}Failed to index pending resources in repository ID {repo_id} to ArcLight Solr. Return code: {result.returncode}') else: @@ -625,6 +674,324 @@ def get_creator_bioghist(self, resource, indent_size=0): return None + def get_all_agents(self, agent_types=None, modified_since=0, indent_size=0): + """ + Fetch ALL agents from ArchivesSpace (not just creators). + Uses direct agent API endpoints for comprehensive coverage. + + Args: + agent_types: List of agent types to fetch. 
Default: ['corporate_entities', 'people', 'families'] + modified_since: Unix timestamp to filter agents modified since this time (if API supports it) + indent_size: Indentation size for logging + + Returns: + set: Set of agent URIs (e.g., '/agents/corporate_entities/123') + """ + if agent_types is None: + agent_types = ['corporate_entities', 'people', 'families'] + + indent = ' ' * indent_size + all_agents = set() + + self.log.info(f'{indent}Fetching ALL agents from ArchivesSpace...') + + for agent_type in agent_types: + try: + # Try with modified_since parameter first + params = {'all_ids': True} + if modified_since > 0: + params['modified_since'] = modified_since + + response = self.client.get(f'/agents/{agent_type}', params=params) + agent_ids = response.json() + + self.log.info(f'{indent}Found {len(agent_ids)} {agent_type} agents') + + # Add agent URIs to set + for agent_id in agent_ids: + agent_uri = f'/agents/{agent_type}/{agent_id}' + all_agents.add(agent_uri) + + except Exception as e: + self.log.error(f'{indent}Error fetching {agent_type} agents: {e}') + # If modified_since fails, try without it + if modified_since > 0: + self.log.warning(f'{indent}Retrying {agent_type} without modified_since filter...') + try: + response = self.client.get(f'/agents/{agent_type}', params={'all_ids': True}) + agent_ids = response.json() + self.log.info(f'{indent}Found {len(agent_ids)} {agent_type} agents (no date filter)') + for agent_id in agent_ids: + agent_uri = f'/agents/{agent_type}/{agent_id}' + all_agents.add(agent_uri) + except Exception as e2: + self.log.error(f'{indent}Failed to fetch {agent_type} agents: {e2}') + + self.log.info(f'{indent}Found {len(all_agents)} total agents across all types.') + return all_agents + + + def task_agent(self, agent_uri, agents_dir, repo_id=1, indent_size=0): + """ + Process a single agent and generate a creator document in EAC-CPF XML format. + Retrieves EAC-CPF directly from ArchivesSpace archival_contexts endpoint. 
+ + Args: + agent_uri: Agent URI from ArchivesSpace (e.g., '/agents/corporate_entities/123') + agents_dir: Directory to save agent XML files + repo_id: Repository ID to use for archival_contexts endpoint (default: 1) + indent_size: Indentation size for logging + + Returns: + str: Creator document ID if successful, None otherwise + """ + indent = ' ' * indent_size + + try: + # Parse agent URI to extract type and ID + # URI format: /agents/{agent_type}/{id} + parts = agent_uri.strip('/').split('/') + if len(parts) != 3 or parts[0] != 'agents': + self.log.error(f'{indent}Invalid agent URI format: {agent_uri}') + return None + + agent_type = parts[1] # e.g., 'corporate_entities', 'people', 'families' + agent_id = parts[2] + + # Construct EAC-CPF endpoint + # Format: /repositories/{repo_id}/archival_contexts/{agent_type}/{id}.xml + eac_cpf_endpoint = f'/repositories/{repo_id}/archival_contexts/{agent_type}/{agent_id}.xml' + + self.log.debug(f'{indent}Fetching EAC-CPF from: {eac_cpf_endpoint}') + + # Fetch EAC-CPF XML + response = self.client.get(eac_cpf_endpoint) + + if response.status_code != 200: + self.log.error(f'{indent}Failed to fetch EAC-CPF for {agent_uri}: HTTP {response.status_code}') + return None + + eac_cpf_xml = response.text + + # Parse the EAC-CPF XML to validate and inspect its structure + try: + root = ET.fromstring(eac_cpf_xml) + self.log.debug(f'{indent}Parsed EAC-CPF XML root element: {root.tag}') + except ET.ParseError as e: + self.log.error(f'{indent}Failed to parse EAC-CPF XML for {agent_uri}: {e}') + return None + + # Generate creator ID + creator_id = f'creator_{agent_type}_{agent_id}' + + # Save EAC-CPF XML to file + filename = f'{agents_dir}/{creator_id}.xml' + with open(filename, 'w', encoding='utf-8') as f: + f.write(eac_cpf_xml) + + self.log.info(f'{indent}Created creator document: {creator_id}') + return creator_id + + except Exception as e: + self.log.error(f'{indent}Error processing agent {agent_uri}: {e}') + import traceback + 
self.log.error(f'{indent}{traceback.format_exc()}') + return None + + def process_creators(self): + """ + Process creator agents and generate standalone creator documents. + + Returns: + list: List of created creator document IDs + """ + + xml_dir = f'{self.arclight_dir}/public/xml' + agents_dir = f'{xml_dir}/agents' + modified_since = int(self.last_updated.timestamp()) + indent_size = 0 + indent = ' ' * indent_size + + self.log.info(f'{indent}Processing creator agents...') + + # Create agents directory if it doesn't exist + os.makedirs(agents_dir, exist_ok=True) + + # Get agents to process + agents = self.get_all_agents(modified_since=modified_since, indent_size=indent_size) + + # Process agents in parallel + with Pool(processes=10) as pool: + results_agents = [pool.apply_async( + self.task_agent, + args=(agent_uri_item, agents_dir, 1, indent_size)) # Use repo_id=1 + for agent_uri_item in agents] + + creator_ids = [r.get() for r in results_agents] + creator_ids = [cid for cid in creator_ids if cid is not None] + + self.log.info(f'{indent}Created {len(creator_ids)} creator documents.') + + # NOTE: Collection links are NOT added to creator XML files. + # Instead, linking is handled via Solr using the persistent_id field: + # - Creator bioghist has persistent_id as the 'id' attribute + # - Collection EADs reference creators via bioghist with persistent_id + # - Solr indexes both, allowing queries to link them + # This avoids the expensive operation of scanning all resources to build a linkage map. 
+ + # Index creators to Solr (if not skipped) + if not self.skip_creator_indexing and creator_ids: + self.log.info(f'{indent}Indexing {len(creator_ids)} creator records to Solr...') + traject_config = self.find_traject_config() + if traject_config: + indexed = self.index_creators(agents_dir, creator_ids) + self.log.info(f'{indent}Creator indexing complete: {indexed}/{len(creator_ids)} indexed') + else: + self.log.info(f'{indent}Skipping creator indexing (traject config not found)') + self.log.info(f'{indent}To index manually:') + self.log.info(f'{indent} cd {self.arclight_dir}') + self.log.info(f'{indent} bundle exec traject -u {self.solr_url} -i xml \\') + self.log.info(f'{indent} -c /path/to/arcuit/arcflow/traject_config_eac_cpf.rb \\') + self.log.info(f'{indent} {agents_dir}/*.xml') + elif self.skip_creator_indexing: + self.log.info(f'{indent}Skipping creator indexing (--skip-creator-indexing flag set)') + + return creator_ids + + + def find_traject_config(self): + """ + Find the traject config for creator indexing. + + Tries: + 1. bundle show arcuit (finds installed gem) + 2. self.arcuit_dir (explicit path) + 3. 
Returns None if neither works + + Returns: + str: Path to traject config, or None if not found + """ + # Try bundle show arcuit first + try: + result = subprocess.run( + ['bundle', 'show', 'arcuit'], + cwd=self.arclight_dir, + capture_output=True, + text=True, + timeout=10 + ) + if result.returncode == 0: + arcuit_path = result.stdout.strip() + # Prefer config at gem root, fall back to legacy subdirectory layout + candidate_paths = [ + os.path.join(arcuit_path, 'traject_config_eac_cpf.rb'), + os.path.join(arcuit_path, 'arcflow', 'traject_config_eac_cpf.rb'), + ] + for traject_config in candidate_paths: + if os.path.exists(traject_config): + self.log.info(f'Found traject config via bundle show: {traject_config}') + return traject_config + self.log.warning( + 'bundle show arcuit succeeded but traject_config_eac_cpf.rb ' + 'was not found in any expected location under the gem root' + ) + else: + self.log.debug('bundle show arcuit failed (gem not installed?)') + except Exception as e: + self.log.debug(f'Error running bundle show arcuit: {e}') + # Fall back to arcuit_dir if provided + if self.arcuit_dir: + candidate_paths = [ + os.path.join(self.arcuit_dir, 'traject_config_eac_cpf.rb'), + os.path.join(self.arcuit_dir, 'arcflow', 'traject_config_eac_cpf.rb'), + ] + for traject_config in candidate_paths: + if os.path.exists(traject_config): + self.log.info(f'Using traject config from arcuit_dir: {traject_config}') + return traject_config + self.log.warning( + 'arcuit_dir provided but traject_config_eac_cpf.rb was not found ' + 'in any expected location' + ) + # No config found + self.log.warning('Could not find traject config (bundle show arcuit failed and arcuit_dir not provided or invalid)') + return None + + + def index_creators(self, agents_dir, creator_ids, batch_size=100): + """ + Index creator XML files to Solr using traject. 
+ + Args: + agents_dir: Directory containing creator XML files + creator_ids: List of creator IDs to index + batch_size: Number of files to index per traject call (default: 100) + + Returns: + int: Number of successfully indexed creators + """ + traject_config = self.find_traject_config() + if not traject_config: + return 0 + + indexed_count = 0 + failed_count = 0 + + # Process in batches to avoid command line length limits + total_batches = math.ceil(len(creator_ids) / batch_size) + for i in range(0, len(creator_ids), batch_size): + batch = creator_ids[i:i+batch_size] + batch_num = (i // batch_size) + 1 + + # Build list of XML files for this batch + xml_files = [f'{agents_dir}/{cid}.xml' for cid in batch] + + # Filter to only existing files + existing_files = [f for f in xml_files if os.path.exists(f)] + + if not existing_files: + self.log.warning(f' Batch {batch_num}/{total_batches}: No files found, skipping') + continue + + try: + cmd = [ + 'bundle', 'exec', 'traject', + '-u', self.solr_url, + '-i', 'xml', + '-c', traject_config + ] + existing_files + + self.log.info(f' Indexing batch {batch_num}/{total_batches}: {len(existing_files)} files') + + result = subprocess.run( + cmd, + cwd=self.arclight_dir, + stderr=subprocess.PIPE, + timeout=300 # 5 minute timeout per batch + ) + + if result.returncode == 0: + indexed_count += len(existing_files) + self.log.info(f' Successfully indexed {len(existing_files)} creators') + else: + failed_count += len(existing_files) + self.log.error(f' Traject failed with exit code {result.returncode}') + if result.stderr: + self.log.error(f' STDERR: {result.stderr.decode("utf-8")}') + + except subprocess.TimeoutExpired: + self.log.error(f' Traject timed out for batch {batch_num}/{total_batches}') + failed_count += len(existing_files) + except Exception as e: + self.log.error(f' Error indexing batch {batch_num}/{total_batches}: {e}') + failed_count += len(existing_files) + + if failed_count > 0: + self.log.warning(f'Creator indexing 
completed with errors: {indexed_count} succeeded, {failed_count} failed') + + return indexed_count + + def get_repo_id(self, repo): """ Get the repository ID from the repository URI. @@ -753,11 +1120,24 @@ def run(self): Run the ArcFlow process. """ self.log.info(f'ArcFlow process started (PID: {self.pid}).') - self.update_repositories() - self.update_eads() + + # Update repositories (unless agents-only mode) + if not self.agents_only: + self.update_repositories() + + # Update collections/EADs (unless agents-only mode) + if not self.agents_only: + self.update_eads() + + # Update creator records (unless collections-only mode) + if not self.collections_only: + self.process_creators() + self.save_config_file() self.log.info(f'ArcFlow process completed (PID: {self.pid}). Elapsed time: {time.strftime("%H:%M:%S", time.gmtime(int(time.time()) - self.start_time))}.') + + def main(): parser = argparse.ArgumentParser(description='ArcFlow') @@ -781,14 +1161,38 @@ def main(): '--traject-extra-config', default='', help='Path to extra Traject configuration file',) + parser.add_argument( + '--agents-only', + action='store_true', + help='Process only agent records, skip collections (for testing)',) + parser.add_argument( + '--collections-only', + action='store_true', + help='Process only repositories and collections, skip creator processing',) + parser.add_argument( + '--arcuit-dir', + default=None, + help='Path to arcuit repository (for traject config). 
If not provided, will try bundle show arcuit.',) + parser.add_argument( + '--skip-creator-indexing', + action='store_true', + help='Generate creator XML files but skip Solr indexing (for testing)',) args = parser.parse_args() + + # Validate mutually exclusive flags + if args.agents_only and args.collections_only: + parser.error('Cannot use both --agents-only and --collections-only') arcflow = ArcFlow( arclight_dir=args.arclight_dir, aspace_dir=args.aspace_dir, solr_url=args.solr_url, traject_extra_config=args.traject_extra_config, - force_update=args.force_update) + force_update=args.force_update, + agents_only=args.agents_only, + collections_only=args.collections_only, + arcuit_dir=args.arcuit_dir, + skip_creator_indexing=args.skip_creator_indexing) arcflow.run() diff --git a/traject_config_eac_cpf.rb b/traject_config_eac_cpf.rb new file mode 100644 index 0000000..62c9a5a --- /dev/null +++ b/traject_config_eac_cpf.rb @@ -0,0 +1,275 @@ +# Traject configuration for indexing EAC-CPF creator records to Solr +# +# This config file processes EAC-CPF (Encoded Archival Context - Corporate Bodies, +# Persons, and Families) XML documents from ArchivesSpace archival_contexts endpoint. 
+# +# Usage: +# bundle exec traject -u $SOLR_URL -c traject_config_eac_cpf.rb /path/to/agents/*.xml +# +# The EAC-CPF XML documents are retrieved directly from ArchivesSpace via: +# /repositories/{repo_id}/archival_contexts/{agent_type}/{id}.xml + +require 'traject' +require 'traject_plus' +require 'traject_plus/macros' +require 'time' + +# Use TrajectPlus macros (provides extract_xpath and other helpers) +extend TrajectPlus::Macros + +# EAC-CPF namespace - used consistently throughout this config +EAC_NS = { 'eac' => 'urn:isbn:1-931666-33-4' } + +settings do + provide "solr.url", ENV['SOLR_URL'] || "http://localhost:8983/solr/blacklight-core" + provide "solr_writer.commit_on_close", "true" + provide "solr_writer.thread_pool", "8" + provide "solr_writer.batch_size", "100" + provide "processing_thread_pool", "4" + + # Use NokogiriReader for XML processing + provide "reader_class_name", "Traject::NokogiriReader" +end + +# Each record from reader +each_record do |record, context| + context.clipboard[:is_creator] = true +end + +# Core identity field +# CRITICAL: The 'id' field is required by Solr's schema (uniqueKey) +# Must ensure this field is never empty or indexing will fail +# +# IMPORTANT: Real EAC-CPF from ArchivesSpace has empty element! +# Cannot rely on recordId being present. Must extract from filename or generate. +to_field 'id' do |record, accumulator, context| + # Try 1: Extract from control/recordId (if present) + record_id = record.xpath('//eac:control/eac:recordId', EAC_NS).first + record_id ||= record.xpath('//control/recordId').first + + if record_id && !record_id.text.strip.empty? 
+ accumulator << record_id.text.strip + else + # Try 2: Extract from source filename (most reliable for ArchivesSpace exports) + # Filename format: creator_corporate_entities_584.xml or similar + source_file = context.source_record_id || context.input_name + if source_file + # Remove .xml extension and any path + id_from_filename = File.basename(source_file, '.xml') + # Check if it looks valid (starts with creator_ or agent_) + if id_from_filename =~ /^(creator_|agent_)/ + accumulator << id_from_filename + context.logger.info("Using filename-based ID: #{id_from_filename}") + else + # Try 3: Generate from entity type and name + entity_type = record.xpath('//eac:cpfDescription/eac:identity/eac:entityType', EAC_NS).first&.text&.strip + name_entry = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS).first&.text&.strip + + if entity_type && name_entry + # Create stable ID from type and name + type_short = case entity_type + when 'corporateBody' then 'corporate' + when 'person' then 'person' + when 'family' then 'family' + else 'entity' + end + name_id = name_entry.gsub(/[^a-z0-9]/i, '_').downcase[0..50] # Limit length + generated_id = "creator_#{type_short}_#{name_id}" + accumulator << generated_id + context.logger.warn("Generated ID from name: #{generated_id}") + else + # Last resort: timestamp-based unique ID + fallback_id = "creator_unknown_#{Time.now.to_i}_#{rand(10000)}" + accumulator << fallback_id + context.logger.error("Using fallback ID: #{fallback_id}") + end + end + else + # No filename available, generate from name + entity_type = record.xpath('//eac:cpfDescription/eac:identity/eac:entityType', EAC_NS).first&.text&.strip + name_entry = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS).first&.text&.strip + + if entity_type && name_entry + type_short = case entity_type + when 'corporateBody' then 'corporate' + when 'person' then 'person' + when 'family' then 'family' + else 'entity' + end + name_id = 
name_entry.gsub(/[^a-z0-9]/i, '_').downcase[0..50] + generated_id = "creator_#{type_short}_#{name_id}" + accumulator << generated_id + context.logger.warn("Generated ID from name: #{generated_id}") + else + # Absolute last resort + fallback_id = "creator_unknown_#{Time.now.to_i}_#{rand(10000)}" + accumulator << fallback_id + context.logger.error("Using fallback ID: #{fallback_id}") + end + end + end +end + +# Add is_creator marker field +to_field 'is_creator' do |record, accumulator| + accumulator << 'true' +end + +# Record type +to_field 'record_type' do |record, accumulator| + accumulator << 'creator' +end + +# Entity type (corporateBody, person, family) +to_field 'entity_type' do |record, accumulator| + entity = record.xpath('//eac:cpfDescription/eac:identity/eac:entityType', EAC_NS).first + accumulator << entity.text if entity +end + +# Title/name fields - using ArcLight dynamic field naming convention +# _tesim = text, stored, indexed, multiValued (for full-text search) +# _ssm = string, stored, multiValued (for display) +# _ssi = string, stored, indexed (for faceting/sorting) +to_field 'title_tesim' do |record, accumulator| + name = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS) + accumulator << name.map(&:text).join(' ') if name.any? +end + +to_field 'title_ssm' do |record, accumulator| + name = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS) + accumulator << name.map(&:text).join(' ') if name.any? +end + +to_field 'title_filing_ssi' do |record, accumulator| + name = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS) + if name.any? 
+ text = name.map(&:text).join(' ') + # Remove leading articles and convert to lowercase for filing + accumulator << text.gsub(/^(a|an|the)\s+/i, '').downcase + end +end + +# Dates of existence - using ArcLight standard field unitdate_ssm +# (matches what ArcLight uses for collection dates) +to_field 'unitdate_ssm' do |record, accumulator| + # Try existDates element + base_path = '//eac:cpfDescription/eac:description/eac:existDates' + dates = record.xpath("#{base_path}/eac:dateRange/eac:fromDate | #{base_path}/eac:dateRange/eac:toDate | #{base_path}/eac:date", EAC_NS) + if dates.any? + from_date = record.xpath("#{base_path}/eac:dateRange/eac:fromDate", EAC_NS).first + to_date = record.xpath("#{base_path}/eac:dateRange/eac:toDate", EAC_NS).first + + if from_date || to_date + from_text = from_date ? from_date.text : '' + to_text = to_date ? to_date.text : '' + accumulator << "#{from_text}-#{to_text}".gsub(/^-|-$/, '') + else + # Single date + dates.each { |d| accumulator << d.text } + end + end +end + +# Biographical/historical note - using ArcLight conventions +# _tesim for searchable plain text +# _tesm for searchable HTML (text, stored, multiValued but not for display) +# _ssm for section heading display +to_field 'bioghist_tesim' do |record, accumulator| + # Extract text from biogHist elements for full-text search + bioghist = record.xpath('//eac:cpfDescription/eac:description/eac:biogHist//eac:p', EAC_NS) + if bioghist.any? + text = bioghist.map(&:text).join(' ') + accumulator << text + end +end + +# Biographical/historical note - HTML +to_field 'bioghist_html_tesm' do |record, accumulator| + # Extract HTML for searchable content (matches ArcLight's bioghist_html_tesm) + bioghist = record.xpath('//eac:cpfDescription/eac:description/eac:biogHist//eac:p', EAC_NS) + if bioghist.any? + html = bioghist.map { |p| "
<p>
#{p.text}
</p>
" }.join("\n") + accumulator << html + end +end + +to_field 'bioghist_heading_ssm' do |record, accumulator| + # Extract section heading (matches ArcLight's bioghist_heading_ssm pattern) + heading = record.xpath('//eac:cpfDescription/eac:description/eac:biogHist//eac:head', EAC_NS).first + accumulator << heading.text if heading +end + +# Full-text search field +to_field 'text' do |record, accumulator| + # Title + name = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS) + accumulator << name.map(&:text).join(' ') if name.any? + + # Bioghist + bioghist = record.xpath('//eac:cpfDescription/eac:description/eac:biogHist//eac:p', EAC_NS) + accumulator << bioghist.map(&:text).join(' ') if bioghist.any? +end + +# Related agents (from cpfRelation elements) +to_field 'related_agents_ssim' do |record, accumulator| + relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS) + relations.each do |rel| + # Get the related entity href/identifier + href = rel['href'] || rel['xlink:href'] + relation_type = rel['cpfRelationType'] + + if href + # Store as: "uri|type" for easy parsing later + accumulator << "#{href}|#{relation_type}" + elsif relation_entry = rel.xpath('eac:relationEntry', EAC_NS).first + # If no href, at least store the name + name = relation_entry.text + accumulator << "#{name}|#{relation_type}" if name + end + end +end + +# Related agents - just URIs (for simpler queries) +to_field 'related_agent_uris_ssim' do |record, accumulator| + relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS) + relations.each do |rel| + href = rel['href'] || rel['xlink:href'] + accumulator << href if href + end +end + +# Relationship types +to_field 'relationship_types_ssim' do |record, accumulator| + relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS) + relations.each do |rel| + relation_type = rel['cpfRelationType'] + accumulator << relation_type if relation_type 
&& !accumulator.include?(relation_type) + end +end + +# Agent source URI (from original ArchivesSpace) +to_field 'agent_uri' do |record, accumulator| + # Try to extract from control section or otherRecordId + other_id = record.xpath('//eac:control/eac:otherRecordId[@localType="archivesspace_uri"]', EAC_NS).first + if other_id + accumulator << other_id.text + end +end + +# Timestamp +to_field 'timestamp' do |record, accumulator| + accumulator << Time.now.utc.iso8601 +end + +# Document type marker +to_field 'document_type' do |record, accumulator| + accumulator << 'creator' +end + +# Log successful indexing +each_record do |record, context| + record_id = record.xpath('//eac:control/eac:recordId', EAC_NS).first + if record_id + context.logger.info("Indexed creator: #{record_id.text}") + end +end From ad666adef8d6ce1d1ece99beb9aa1b29e243fc2a Mon Sep 17 00:00:00 2001 From: Alex Dryden Date: Fri, 20 Feb 2026 21:00:13 -0500 Subject: [PATCH 02/14] feat: Optimize agent filtering with ArchivesSpace Solr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace per-agent API calls with single Solr query for better performance: - Query ArchivesSpace Solr to filter agents in bulk - Exclude system users (publish=false) - Exclude donors (linked_agent_role includes "dnr") - Exclude software agents (agent_type="agent_software") - Use consistent EAC namespace prefixes in XPath queries - Refactor dates extraction for improved readability Performance improvement: O(n) API calls → O(1) Solr query Reduces processing time from minutes to seconds for large repositories. 
--- README.md | 43 +- arcflow/main.py | 404 ++++++++++++------ ...pf.rb => example_traject_config_eac_cpf.rb | 7 +- 3 files changed, 320 insertions(+), 134 deletions(-) rename traject_config_eac_cpf.rb => example_traject_config_eac_cpf.rb (97%) diff --git a/README.md b/README.md index 6c570bf..710ddc4 100644 --- a/README.md +++ b/README.md @@ -42,14 +42,32 @@ ArcFlow now generates standalone creator documents in addition to collection rec - Link to all collections where the creator is listed - Can be searched and displayed independently in ArcLight - Are marked with `is_creator: true` to distinguish from collections -- Must be fed into a Solr instance with fields to match their specific facets (See:Configure Solr Schema below ) +- Must be fed into a Solr instance with fields to match their specific facets (See: Configure Solr Schema below) + +### Agent Filtering + +**ArcFlow automatically filters agents to include only legitimate creators** of archival materials. The following agent types are **excluded** from indexing: + +- ✗ **System users** - ArchivesSpace software users (identified by `is_user` field) +- ✗ **System-generated agents** - Auto-created for users (identified by `system_generated` field) +- ✗ **Software agents** - Excluded by not querying the `/agents/software` endpoint +- ✗ **Repository agents** - Corporate entities representing the repository itself (identified by `is_repo_agent` field) +- ✗ **Donor-only agents** - Agents with only the 'donor' role and no creator role + +**Agents are included if they meet any of these criteria:** + +- ✓ Have the **'creator' role** in linked_agent_roles +- ✓ Are **linked to published records** (and not excluded by filters above) + +This filtering ensures that only legitimate archival creators are discoverable in ArcLight, while protecting privacy and security by excluding system users and donors. ### How Creator Records Work 1. **Extraction**: `get_all_agents()` fetches all agents from ArchivesSpace -2. 
**Processing**: `task_agent()` generates an EAC-CPF XML document for each agent with bioghist notes -3. **Linking**: Handled via Solr using the persistent_id field (agents and collections linked through bioghist references) -4. **Indexing**: Creator XML files are indexed to Solr using `traject_config_eac_cpf.rb` +2. **Filtering**: `is_target_agent()` filters out system users, donors, and non-creator agents +3. **Processing**: `task_agent()` generates an EAC-CPF XML document for each target agent with bioghist notes +4. **Linking**: Handled via Solr using the persistent_id field (agents and collections linked through bioghist references) +5. **Indexing**: Creator XML files are indexed to Solr using `traject_config_eac_cpf.rb` ### Creator Document Format @@ -119,7 +137,22 @@ This is a **one-time setup** per Solr instance. --- -To index creator documents to Solr: +### Traject Configuration for Creator Indexing + +The `traject_config_eac_cpf.rb` file defines how EAC-CPF creator records are mapped to Solr fields. + +**Search Order**: arcflow searches for the traject config following the collection records pattern: +1. **arcuit_dir parameter** (if provided via `--arcuit-dir`) - Highest priority, most up-to-date user control +2. **arcuit gem** (via `bundle show arcuit`) - For backward compatibility when arcuit_dir not provided +3. **example_traject_config_eac_cpf.rb** in arcflow - Fallback for module usage without arcuit + +**Example File**: arcflow includes `example_traject_config_eac_cpf.rb` as a reference implementation. For production: +- Copy this file to your arcuit gem as `traject_config_eac_cpf.rb`, or +- Specify the location with `--arcuit-dir /path/to/arcuit` + +**Logging**: arcflow clearly logs which traject config file is being used when creator indexing runs. 
+ +To index creator documents to Solr manually: ```bash bundle exec traject \ diff --git a/arcflow/main.py b/arcflow/main.py index 292d049..39af7b3 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -16,7 +16,7 @@ from datetime import datetime, timezone from asnake.client import ASnakeClient from multiprocessing.pool import ThreadPool as Pool -from .utils.stage_classifications import extract_labels +from utils.stage_classifications import extract_labels base_dir = os.path.abspath((__file__) + "/../../") @@ -40,8 +40,9 @@ class ArcFlow: """ - def __init__(self, arclight_dir, aspace_dir, solr_url, traject_extra_config='', force_update=False, agents_only=False, collections_only=False, arcuit_dir=None, skip_creator_indexing=False): + def __init__(self, arclight_dir, aspace_dir, solr_url, aspace_solr_url, traject_extra_config='', force_update=False, agents_only=False, collections_only=False, arcuit_dir=None, skip_creator_indexing=False): self.solr_url = solr_url + self.aspace_solr_url = aspace_solr_url self.batch_size = 1000 clean_extra_config = traject_extra_config.strip() self.traject_extra_config = clean_extra_config or None @@ -217,6 +218,9 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): 'resolve': ['classifications', 'classification_terms', 'linked_agents'], }).json() + if "ead_id" not in resource: + self.log.error(f'{indent}Resource {resource_id} is missing an ead_id. Skipping.') + return pdf_job xml_file_path = f'{xml_dir}/{resource["ead_id"]}.xml' # replace dots with dashes in EAD ID to avoid issues with Solr @@ -399,6 +403,7 @@ def update_eads(self): ArchivesSpace. 
""" xml_dir = f'{self.arclight_dir}/public/xml' + resource_dir = f'{xml_dir}/resources' pdf_dir = f'{self.arclight_dir}/public/pdf' modified_since = int(self.last_updated.timestamp()) @@ -412,7 +417,7 @@ def update_eads(self): json={'delete': {'query': '*:*'}}, ) if response.status_code == 200: - self.log.info('Deleted all EADs from ArcLight Solr.') + self.log.info('Deleted all EADs and Creators from ArcLight Solr.') # delete related directories after suscessful # deletion from solr for dir_path, dir_name in [(xml_dir, 'XMLs'), (pdf_dir, 'PDFs')]: @@ -424,10 +429,10 @@ def update_eads(self): else: self.log.error(f'Failed to delete all EADs from Arclight Solr. Status code: {response.status_code}') except requests.exceptions.RequestException as e: - self.log.error(f'Error deleting all EADs from ArcLight Solr: {e}') + self.log.error(f'Error deleting all EADs and Creators from ArcLight Solr: {e}') # create directories if don't exist - for dir_path in (xml_dir, pdf_dir): + for dir_path in (resource_dir, pdf_dir): os.makedirs(dir_path, exist_ok=True) # process resources that have been modified in ArchivesSpace since last update @@ -440,7 +445,7 @@ def update_eads(self): # Tasks for processing repositories results_1 = [pool.apply_async( self.task_repository, - args=(repo, xml_dir, modified_since, indent_size)) + args=(repo, resource_dir, modified_since, indent_size)) for repo in repos] # Collect outputs from repository tasks outputs_1 = [r.get() for r in results_1] @@ -448,7 +453,7 @@ def update_eads(self): # Tasks for processing resources results_2 = [pool.apply_async( self.task_resource, - args=(repo, resource_id, xml_dir, pdf_dir, indent_size)) + args=(repo, resource_id, resource_dir, pdf_dir, indent_size)) for repo, resources in outputs_1 for resource_id in resources] # Collect outputs from resource tasks outputs_2 = [r.get() for r in results_2] @@ -463,7 +468,7 @@ def update_eads(self): # Tasks for indexing pending resources results_3 = [pool.apply_async( 
self.index_collections, - args=(repo_id, f'{xml_dir}/{repo_id}_*_batch_{batch_num}.xml', indent_size)) + args=(repo_id, f'{resource_dir}/{repo_id}_*_batch_{batch_num}.xml', indent_size)) for repo_id, batch_num in batches] # Wait for indexing tasks to complete @@ -472,7 +477,7 @@ def update_eads(self): # Remove pending symlinks after indexing for repo_id, batch_num in batches: - xml_file_path = f'{xml_dir}/{repo_id}_*_batch_{batch_num}.xml' + xml_file_path = f'{resource_dir}/{repo_id}_*_batch_{batch_num}.xml' try: result = subprocess.run( f'rm {xml_file_path}', @@ -495,14 +500,23 @@ def update_eads(self): for r in results_4: r.get() - # processing deleted resources is not needed when - # force-update is set or modified_since is set to 0 - if self.force_update or modified_since <= 0: - self.log.info('Skipping deleted resources processing.') - return + return + + + + def process_deleted_records(self): + + xml_dir = f'{self.arclight_dir}/public/xml' + resource_dir = f'{xml_dir}/resources' + agent_dir = f'{xml_dir}/agents' + pdf_dir = f'{self.arclight_dir}/public/pdf' + modified_since = int(self.last_updated.timestamp()) + + # process records that have been deleted since last update in ArchivesSpace + resource_pattern = r'^/repositories/(?P\d+)/resources/(?P\d+)$' + agent_pattern = r'^/agents/(?Ppeople|corporate_entities|families)/(?P\d+)$' + - # process resources that have been deleted since last update in ArchivesSpace - pattern = r'^/repositories/(?P\d+)/resources/(?P\d+)$' page = 1 while True: deleted_records = self.client.get( @@ -513,12 +527,13 @@ def update_eads(self): } ).json() for record in deleted_records['results']: - match = re.match(pattern, record) - if match: + resource_match = re.match(resource_pattern, record) + agent_match = re.match(agent_pattern, record) + if resource_match and not self.agents_only: resource_id = match.group('resource_id') self.log.info(f'{" " * indent_size}Processing deleted resource ID {resource_id}...') - symlink_path = 
f'{xml_dir}/{resource_id}.xml' + symlink_path = f'{resource_dir}/{resource_id}.xml' ead_id = self.get_ead_from_symlink(symlink_path) if ead_id: self.delete_ead( @@ -530,6 +545,14 @@ def update_eads(self): else: self.log.error(f'{" " * (indent_size+2)}Symlink {symlink_path} not found. Unable to delete the associated EAD from Arclight Solr.') + if agent_match and not self.collections_only: + agent_id = match.group('agent_id') + self.log.info(f'{" " * indent_size}Processing deleted agent ID {agent_id}...') + file_path = f'{agent_dir}/{agent_id}.xml' + agent_solr_id = f'creator_{agent_type}_{agent_id}' + self.delete_creator(file_path, agent_solr_id, indent_size) + + if deleted_records['last_page'] == page: break page += 1 @@ -577,9 +600,10 @@ def index_collections(self, repo_id, xml_file_path, indent_size=0): env = os.environ.copy() env['REPOSITORY_ID'] = str(repo_id) - + cmd_string = ' '.join(cmd) result = subprocess.run( - cmd, + cmd_string, + shell=True, cwd=self.arclight_dir, env=env, stderr=subprocess.PIPE, @@ -673,63 +697,142 @@ def get_creator_bioghist(self, resource, indent_size=0): return '\n'.join(bioghist_elements) return None - - def get_all_agents(self, agent_types=None, modified_since=0, indent_size=0): + def _get_target_agent_criteria(self, modified_since=0): """ - Fetch ALL agents from ArchivesSpace (not just creators). - Uses direct agent API endpoints for comprehensive coverage. - + Defines the Solr query criteria for "target" agents. + These are agents we want to process. 
+ """ + # Basic filters for agents to include + criteria = [ + "system_generated:false", + "is_user:false", + "is_repo_agent:false", + # Include agents that are creators OR are linked to published records + "(linked_agent_roles:creator OR is_linked_to_published_record:true)", + # Exclude agents whose ONLY role is 'donor' + # This logic says: "NOT (role is only donor)" + "(*:* -linked_agent_roles:donor OR (*:* AND linked_agent_roles:[* TO *] AND (*:* -linked_agent_roles:donor)))" + ] + + # Add time filter if applicable + if modified_since > 0 and not self.force_update: + mtime_utc = datetime.fromtimestamp(modified_since, tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') + criteria.append(f"system_mtime:[{mtime_utc} TO *]") + + return criteria + + def _get_nontarget_agent_criteria(self, modified_since=0): + """ + Defines the Solr query criteria for "non-target" (excluded) agents. + This is the logical inverse of the target criteria. + """ + # The core logic for what makes an agent a "target" + target_logic = " AND ".join([ + "system_generated:false", + "is_user:false", + "is_repo_agent:false", + "(linked_agent_roles:creator OR is_linked_to_published_record:true)", + "(*:* -linked_agent_roles:donor OR (*:* AND linked_agent_roles:[* TO *] AND (*:* -linked_agent_roles:donor)))" + ]) + + # We find non-targets by negating the entire block of target logic + criteria = [f"NOT ({target_logic})"] + + # We still apply the time filter to the overall query + if modified_since > 0 and not self.force_update: + mtime_utc = datetime.fromtimestamp(modified_since, tz=timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') + criteria.append(f"system_mtime:[{mtime_utc} TO *]") + + return criteria + + def _execute_solr_query(self, query_parts, solr_url=None, fields=['id'], indent_size=0): + """ + A generic function to execute a query against the Solr index. + Args: - agent_types: List of agent types to fetch. 
Default: ['corporate_entities', 'people', 'families'] - modified_since: Unix timestamp to filter agents modified since this time (if API supports it) - indent_size: Indentation size for logging - + query_parts (list): A list of strings that will be joined with " AND ". + fields (list): A list of Solr fields to return in the response. + Returns: - set: Set of agent URIs (e.g., '/agents/corporate_entities/123') + list: A list of dictionaries, where each dictionary contains the requested fields. + Returns an empty list on failure. + """ + indent = ' ' * indent_size + if not query_parts: + self.log.error("Cannot execute Solr query with empty criteria.") + return [] + + if not solr_url: + solr_url = self.solr_url + + query_string = " AND ".join(query_parts) + self.log.info(f"{indent}Executing Solr query: {query_string}") + + try: + # First, get the total count of matching documents + count_params = {'q': query_string, 'rows': 0, 'wt': 'json'} + count_response = requests.get(f'{solr_url}/select', params=count_params) + self.log.info(f" [Solr Count Request]: {count_response.request.url}") + + count_response.raise_for_status() + num_found = count_response.json()['response']['numFound'] + + if num_found == 0: + return [] # No need to query again if nothing was found + + # Now, fetch the actual data for the documents + data_params = { + 'q': query_string, + 'rows': num_found, # Use the exact count to fetch all results + 'fl': ','.join(fields), # Join field list into a comma-separated string + 'wt': 'json' + } + response = requests.get(f'{solr_url}/select', params=data_params) + response.raise_for_status() + # Log the exact URL for the data request + self.log.info(f" [Solr Data Request]: {response.request.url}") + + return response.json()['response']['docs'] + + except requests.exceptions.RequestException as e: + self.log.error(f"Failed to execute Solr query: {e}") + self.log.error(f" Failed query string: {query_string}") + return [] + + def get_all_agents(self, 
agent_types=None, modified_since=0, indent_size=0): + """ + Fetch target agent URIs from the Solr index and log non-target agents. """ if agent_types is None: - agent_types = ['corporate_entities', 'people', 'families'] - + agent_types = ['agent_person', 'agent_corporate_entity', 'agent_family'] + + + modified_since = 0 if self.force_update else int(self.last_updated.timestamp()) indent = ' ' * indent_size - all_agents = set() - - self.log.info(f'{indent}Fetching ALL agents from ArchivesSpace...') - - for agent_type in agent_types: - try: - # Try with modified_since parameter first - params = {'all_ids': True} - if modified_since > 0: - params['modified_since'] = modified_since - - response = self.client.get(f'/agents/{agent_type}', params=params) - agent_ids = response.json() - - self.log.info(f'{indent}Found {len(agent_ids)} {agent_type} agents') - - # Add agent URIs to set - for agent_id in agent_ids: - agent_uri = f'/agents/{agent_type}/{agent_id}' - all_agents.add(agent_uri) - - except Exception as e: - self.log.error(f'{indent}Error fetching {agent_type} agents: {e}') - # If modified_since fails, try without it - if modified_since > 0: - self.log.warning(f'{indent}Retrying {agent_type} without modified_since filter...') - try: - response = self.client.get(f'/agents/{agent_type}', params={'all_ids': True}) - agent_ids = response.json() - self.log.info(f'{indent}Found {len(agent_ids)} {agent_type} agents (no date filter)') - for agent_id in agent_ids: - agent_uri = f'/agents/{agent_type}/{agent_id}' - all_agents.add(agent_uri) - except Exception as e2: - self.log.error(f'{indent}Failed to fetch {agent_type} agents: {e2}') - - self.log.info(f'{indent}Found {len(all_agents)} total agents across all types.') - return all_agents + self.log.info(f'{indent}Fetching agent data from Solr...') + + # Base criteria for all queries in this function + base_criteria = [f"primary_type:({' OR '.join(agent_types)})"] + + # Get and log the non-target agents + nontarget_criteria 
= base_criteria + self._get_nontarget_agent_criteria(modified_since) + excluded_docs = self._execute_solr_query(nontarget_criteria,self.aspace_solr_url, fields=['id']) + if excluded_docs: + excluded_ids = [doc['id'] for doc in excluded_docs] + self.log.info(f"{indent} Found {len(excluded_ids)} non-target (excluded) agents.") + # Optional: Log the actual IDs if the list isn't too long + # for agent_id in excluded_ids: + # self.log.debug(f"{indent} - Excluded: {agent_id}") + + # Get and return the target agents + target_criteria = base_criteria + self._get_target_agent_criteria(modified_since) + self.log.info('Target Criteria:') + target_docs = self._execute_solr_query(target_criteria, self.aspace_solr_url, fields=['id']) + target_agents = [doc['id'] for doc in target_docs] + self.log.info(f"{indent} Found {len(target_agents)} target agents to process.") + + return target_agents def task_agent(self, agent_uri, agents_dir, repo_id=1, indent_size=0): """ @@ -844,14 +947,15 @@ def process_creators(self): self.log.info(f'{indent}Indexing {len(creator_ids)} creator records to Solr...') traject_config = self.find_traject_config() if traject_config: + self.log.info(f'{indent}Using traject config: {traject_config}') indexed = self.index_creators(agents_dir, creator_ids) self.log.info(f'{indent}Creator indexing complete: {indexed}/{len(creator_ids)} indexed') else: - self.log.info(f'{indent}Skipping creator indexing (traject config not found)') + self.log.warning(f'{indent}Skipping creator indexing (traject config not found)') self.log.info(f'{indent}To index manually:') self.log.info(f'{indent} cd {self.arclight_dir}') self.log.info(f'{indent} bundle exec traject -u {self.solr_url} -i xml \\') - self.log.info(f'{indent} -c /path/to/arcuit/arcflow/traject_config_eac_cpf.rb \\') + self.log.info(f'{indent} -c /path/to/arcuit-gem/traject_config_eac_cpf.rb \\') self.log.info(f'{indent} {agents_dir}/*.xml') elif self.skip_creator_indexing: self.log.info(f'{indent}Skipping creator 
indexing (--skip-creator-indexing flag set)') @@ -863,15 +967,32 @@ def find_traject_config(self): """ Find the traject config for creator indexing. - Tries: - 1. bundle show arcuit (finds installed gem) - 2. self.arcuit_dir (explicit path) - 3. Returns None if neither works + Search order (follows collection records pattern): + 1. arcuit_dir if provided (most up-to-date user control) + 2. arcuit gem via bundle show (for backward compatibility) + 3. example_traject_config_eac_cpf.rb in arcflow (fallback when used as module without arcuit) Returns: str: Path to traject config, or None if not found """ - # Try bundle show arcuit first + self.log.info('Searching for traject_config_eac_cpf.rb...') + searched_paths = [] + + # Try 1: arcuit_dir if provided (highest priority - user's explicit choice) + if self.arcuit_dir: + self.log.debug(f' Checking arcuit_dir parameter: {self.arcuit_dir}') + candidate_paths = [ + os.path.join(self.arcuit_dir, 'traject_config_eac_cpf.rb'), + os.path.join(self.arcuit_dir, 'lib', 'arcuit', 'traject', 'traject_config_eac_cpf.rb'), + ] + searched_paths.extend(candidate_paths) + for traject_config in candidate_paths: + if os.path.exists(traject_config): + self.log.info(f'✓ Using traject config from arcuit_dir: {traject_config}') + return traject_config + self.log.debug(' traject_config_eac_cpf.rb not found in arcuit_dir') + + # Try 2: bundle show arcuit (for backward compatibility when arcuit_dir not provided) try: result = subprocess.run( ['bundle', 'show', 'arcuit'], @@ -882,39 +1003,46 @@ def find_traject_config(self): ) if result.returncode == 0: arcuit_path = result.stdout.strip() - # Prefer config at gem root, fall back to legacy subdirectory layout + self.log.debug(f' Found arcuit gem at: {arcuit_path}') candidate_paths = [ os.path.join(arcuit_path, 'traject_config_eac_cpf.rb'), - os.path.join(arcuit_path, 'arcflow', 'traject_config_eac_cpf.rb'), + os.path.join(arcuit_path, 'lib', 'arcuit', 'traject', 'traject_config_eac_cpf.rb'), ] + 
searched_paths.extend(candidate_paths) for traject_config in candidate_paths: if os.path.exists(traject_config): - self.log.info(f'Found traject config via bundle show: {traject_config}') + self.log.info(f'✓ Using traject config from arcuit gem: {traject_config}') return traject_config - self.log.warning( - 'bundle show arcuit succeeded but traject_config_eac_cpf.rb ' - 'was not found in any expected location under the gem root' + self.log.debug( + ' traject_config_eac_cpf.rb not found in arcuit gem ' + '(checked root and lib/arcuit/traject/ subdirectory)' ) else: - self.log.debug('bundle show arcuit failed (gem not installed?)') + self.log.debug(' arcuit gem not found via bundle show') except Exception as e: - self.log.debug(f'Error running bundle show arcuit: {e}') - # Fall back to arcuit_dir if provided - if self.arcuit_dir: - candidate_paths = [ - os.path.join(self.arcuit_dir, 'traject_config_eac_cpf.rb'), - os.path.join(self.arcuit_dir, 'arcflow', 'traject_config_eac_cpf.rb'), - ] - for traject_config in candidate_paths: - if os.path.exists(traject_config): - self.log.info(f'Using traject config from arcuit_dir: {traject_config}') - return traject_config - self.log.warning( - 'arcuit_dir provided but traject_config_eac_cpf.rb was not found ' - 'in any expected location' + self.log.debug(f' Error checking for arcuit gem: {e}') + + # Try 3: example file in arcflow package (fallback for module usage without arcuit) + # We know exactly where this file is located - at the repo root + arcflow_package_dir = os.path.dirname(os.path.abspath(__file__)) + arcflow_repo_root = os.path.dirname(arcflow_package_dir) + traject_config = os.path.join(arcflow_repo_root, 'example_traject_config_eac_cpf.rb') + searched_paths.append(traject_config) + + if os.path.exists(traject_config): + self.log.info(f'✓ Using example traject config from arcflow: {traject_config}') + self.log.info( + ' Note: Using example config. 
For production, copy this file to your ' + 'arcuit gem or specify location with --arcuit-dir.' ) - # No config found - self.log.warning('Could not find traject config (bundle show arcuit failed and arcuit_dir not provided or invalid)') + return traject_config + + # No config found anywhere - show all paths searched + self.log.error('✗ Could not find traject_config_eac_cpf.rb in any of these locations:') + for i, path in enumerate(searched_paths, 1): + self.log.error(f' {i}. {path}') + self.log.error('') + self.log.error(' Add traject_config_eac_cpf.rb to your arcuit gem or specify with --arcuit-dir.') return None @@ -1068,37 +1196,47 @@ def create_symlink(self, target_path, symlink_path, indent_size=0): self.log.info(f'{indent}{e}') return False - - def delete_ead(self, resource_id, ead_id, - xml_file_path, pdf_file_path, indent_size=0): - indent = ' ' * indent_size - # delete from solr + def delete_arclight_solr_record(self, solr_record_id, indent_size=0): try: response = requests.post( f'{self.solr_url}/update?commit=true', - json={'delete': {'id': ead_id}}, + json={'delete': {'id': solr_record_id}}, ) if response.status_code == 200: - self.log.info(f'{indent}Deleted EAD "{ead_id}" from ArcLight Solr.') - # delete related files after suscessful deletion from solr - for file_path in (xml_file_path, pdf_file_path): - try: - os.remove(file_path) - self.log.info(f'{indent}Deleted file {file_path}.') - except FileNotFoundError: - self.log.error(f'{indent}File {file_path} not found.') - - # delete symlink if exists - symlink_path = f'{os.path.dirname(xml_file_path)}/{resource_id}.xml' - try: - os.remove(symlink_path) - self.log.info(f'{indent}Deleted symlink {symlink_path}.') - except FileNotFoundError: - self.log.info(f'{indent}Symlink {symlink_path} not found.') + self.log.info(f'{indent}Deleted Solr record {solr_record_id}. from ArcLight Solr') + return True else: - self.log.error(f'{indent}Failed to delete EAD "{ead_id}" from Arclight Solr. 
Status code: {response.status_code}') + self.log.error( + f'{indent}Failed to Solr record {solr_record_id} from Arclight Solr. Status code: {response.status_code}') + return False except requests.exceptions.RequestException as e: - self.log.error(f'{indent}Error deleting EAD "{ead_id}" from ArcLight Solr: {e}') + self.log.error(f'{indent}Error deleting Solr record {solr_record_id} from ArcLight Solr: {e}') + + def delete_file(self, file_path, indent=0): + try: + os.remove(file_path) + self.log.info(f'{indent}Deleted file {file_path}.') + except FileNotFoundError: + self.log.error(f'{indent}File {file_path} not found.') + + def delete_ead(self, resource_id, ead_id, + xml_file_path, pdf_file_path, indent_size=0): + indent = ' ' * indent_size + # delete from solr + deleted_solr_record = self.delete_arclight_solr_record(ead_id, indent_size=indent_size) + if deleted_solr_record: + self.delete_file(pdf_file_path, indent=indent) + self.delete_file(xml_file_path, indent=indent) + # delete symlink if exists + symlink_path = f'{os.path.dirname(xml_file_path)}/{resource_id}.xml' + self.delete_file(symlink_path, indent=indent) + + def delete_creator(self, file_path, solr_id, indent_size=0): + indent = ' ' * indent_size + deleted_solr_record = self.delete_arclight_solr_record(solr_id, indent_size=indent_size) + if deleted_solr_record: + self.delete_file(file_path, indent=indent) + def save_config_file(self): @@ -1132,7 +1270,14 @@ def run(self): # Update creator records (unless collections-only mode) if not self.collections_only: self.process_creators() - + + # processing deleted resources is not needed when + # force-update is set or modified_since is set to 0 + if self.force_update or int(self.last_updated.timestamp()) <= 0: + self.log.info('Skipping deleted record processing.') + else: + self.process_deleted_records() + self.save_config_file() self.log.info(f'ArcFlow process completed (PID: {self.pid}). 
Elapsed time: {time.strftime("%H:%M:%S", time.gmtime(int(time.time()) - self.start_time))}.') @@ -1156,7 +1301,11 @@ def main(): parser.add_argument( '--solr-url', required=True, - help='URL of the Solr core',) + help='URL of the ArcLight Solr core',) + parser.add_argument( + '--aspace-solr-url', + required=True, + help='URL of the ASpace Solr core',) parser.add_argument( '--traject-extra-config', default='', @@ -1187,6 +1336,7 @@ def main(): arclight_dir=args.arclight_dir, aspace_dir=args.aspace_dir, solr_url=args.solr_url, + aspace_solr_url=args.aspace_solr_url, traject_extra_config=args.traject_extra_config, force_update=args.force_update, agents_only=args.agents_only, diff --git a/traject_config_eac_cpf.rb b/example_traject_config_eac_cpf.rb similarity index 97% rename from traject_config_eac_cpf.rb rename to example_traject_config_eac_cpf.rb index 62c9a5a..6234ccd 100644 --- a/traject_config_eac_cpf.rb +++ b/example_traject_config_eac_cpf.rb @@ -4,7 +4,9 @@ # Persons, and Families) XML documents from ArchivesSpace archival_contexts endpoint. # # Usage: -# bundle exec traject -u $SOLR_URL -c traject_config_eac_cpf.rb /path/to/agents/*.xml +# bundle exec traject -u $SOLR_URL -c example_traject_config_eac_cpf.rb /path/to/agents/*.xml +# +# For production, copy this file to your arcuit gem as traject_config_eac_cpf.rb # # The EAC-CPF XML documents are retrieved directly from ArchivesSpace via: # /repositories/{repo_id}/archival_contexts/{agent_type}/{id}.xml @@ -188,7 +190,8 @@ # Extract HTML for searchable content (matches ArcLight's bioghist_html_tesm) bioghist = record.xpath('//eac:cpfDescription/eac:description/eac:biogHist//eac:p', EAC_NS) if bioghist.any? - html = bioghist.map { |p| "
<p>
#{p.text}
</p>
" }.join("\n") + # Preserve inline EAC markup inside by serializing child nodes + html = bioghist.map { |p| "

#{p.inner_html}

" }.join("\n") accumulator << html end end From a6053f53697eb3fdbfd7c33c8ba754a9204b9cd4 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Thu, 26 Feb 2026 11:39:16 -0500 Subject: [PATCH 03/14] Replace non-deterministic fallback IDs with explicit skip logic in EAC-CPF indexing (#13) * Skip indexing records without valid IDs instead of generating non-deterministic fallbacks Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> Co-authored-by: Alex Dryden --- example_traject_config_eac_cpf.rb | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/example_traject_config_eac_cpf.rb b/example_traject_config_eac_cpf.rb index 6234ccd..84a956d 100644 --- a/example_traject_config_eac_cpf.rb +++ b/example_traject_config_eac_cpf.rb @@ -80,10 +80,14 @@ accumulator << generated_id context.logger.warn("Generated ID from name: #{generated_id}") else - # Last resort: timestamp-based unique ID - fallback_id = "creator_unknown_#{Time.now.to_i}_#{rand(10000)}" - accumulator << fallback_id - context.logger.error("Using fallback ID: #{fallback_id}") + # No valid ID available - skip indexing this record + # If we reach here, something has gone wrong with the data pipeline: + # - No recordId in XML + # - Filename doesn't match expected pattern + # - No entity type or name in XML to generate from + # Skipping ensures we don't create non-deterministic IDs that break idempotent indexing + context.logger.error("Cannot generate valid ID for record - skipping indexing. 
Source: #{source_file}") + context.skip!("Missing required ID data") end end else @@ -103,10 +107,14 @@ accumulator << generated_id context.logger.warn("Generated ID from name: #{generated_id}") else - # Absolute last resort - fallback_id = "creator_unknown_#{Time.now.to_i}_#{rand(10000)}" - accumulator << fallback_id - context.logger.error("Using fallback ID: #{fallback_id}") + # No valid ID available - skip indexing this record + # If we reach here, something has gone wrong with the data pipeline: + # - No recordId in XML + # - No filename available + # - No entity type or name in XML to generate from + # Skipping ensures we don't create non-deterministic IDs that break idempotent indexing + context.logger.error("Cannot generate valid ID for record - skipping indexing. No filename or entity data available.") + context.skip!("Missing required ID data") end end end From 58b390dd9fcc41eaa504d27783e53e720bcaa07f Mon Sep 17 00:00:00 2001 From: Alex Dryden Date: Thu, 26 Feb 2026 16:08:49 -0500 Subject: [PATCH 04/14] Fix typo Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- arcflow/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arcflow/main.py b/arcflow/main.py index 39af7b3..09c7e53 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -1207,7 +1207,7 @@ def delete_arclight_solr_record(self, solr_record_id, indent_size=0): return True else: self.log.error( - f'{indent}Failed to Solr record {solr_record_id} from Arclight Solr. Status code: {response.status_code}') + f'{indent}Failed to delete Solr record {solr_record_id} from Arclight Solr. 
Status code: {response.status_code}') return False except requests.exceptions.RequestException as e: self.log.error(f'{indent}Error deleting Solr record {solr_record_id} from ArcLight Solr: {e}') From 758145d8116c9252262142e2ca89c3ed82691112 Mon Sep 17 00:00:00 2001 From: Alex Dryden Date: Thu, 26 Feb 2026 16:10:29 -0500 Subject: [PATCH 05/14] Update README.md to reflect the required command line arguments Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- README.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 710ddc4..f634756 100644 --- a/README.md +++ b/README.md @@ -14,14 +14,12 @@ pip install -r requirements.txt cp .archivessnake.yml.example .archivessnake.yml nano .archivessnake.yml # Add your ArchivesSpace credentials -# 3. Set environment variables -export ARCLIGHT_DIR=/path/to/your/arclight-app -export ASPACE_DIR=/path/to/your/archivesspace -export SOLR_URL=http://localhost:8983/solr/blacklight-core - -# 4. Run arcflow -python -m arcflow.main - +# 3. 
Run arcflow +python -m arcflow.main \ + --arclight-dir /path/to/your/arclight-app \ + --aspace-dir /path/to/your/archivesspace \ + --solr-url http://localhost:8983/solr/blacklight-core \ + --aspace-solr-url http://localhost:8983/solr/archivesspace ``` --- From ca12130be030bd8fb5c9a768c47c43664975f230 Mon Sep 17 00:00:00 2001 From: Alex Dryden Date: Thu, 26 Feb 2026 16:13:20 -0500 Subject: [PATCH 06/14] fix: use correct variable name --- arcflow/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arcflow/main.py b/arcflow/main.py index 09c7e53..3d503c0 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -530,7 +530,7 @@ def process_deleted_records(self): resource_match = re.match(resource_pattern, record) agent_match = re.match(agent_pattern, record) if resource_match and not self.agents_only: - resource_id = match.group('resource_id') + resource_id = resource_match.group('resource_id') self.log.info(f'{" " * indent_size}Processing deleted resource ID {resource_id}...') symlink_path = f'{resource_dir}/{resource_id}.xml' @@ -546,7 +546,7 @@ def process_deleted_records(self): self.log.error(f'{" " * (indent_size+2)}Symlink {symlink_path} not found. 
Unable to delete the associated EAD from Arclight Solr.') if agent_match and not self.collections_only: - agent_id = match.group('agent_id') + agent_id = agent_match.group('agent_id') self.log.info(f'{" " * indent_size}Processing deleted agent ID {agent_id}...') file_path = f'{agent_dir}/{agent_id}.xml' agent_solr_id = f'creator_{agent_type}_{agent_id}' From 4bd4bcd5934c175079cfe080656493c22d3a3efa Mon Sep 17 00:00:00 2001 From: Alex Dryden Date: Thu, 26 Feb 2026 16:18:17 -0500 Subject: [PATCH 07/14] update README to include required aspace solr url --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f634756..2a42f18 100644 --- a/README.md +++ b/README.md @@ -197,7 +197,9 @@ Optional arguments: python -m arcflow.main \ --arclight-dir /path/to/arclight \ --aspace-dir /path/to/archivesspace \ - --solr-url http://localhost:8983/solr/blacklight-core + --solr-url http://localhost:8983/solr/blacklight-core \ + --aspace-solr-url http://localhost:8983/solr/archivesspace + ``` **Process only agents (skip collections):** @@ -206,6 +208,7 @@ python -m arcflow.main \ --arclight-dir /path/to/arclight \ --aspace-dir /path/to/archivesspace \ --solr-url http://localhost:8983/solr/blacklight-core \ + --aspace-solr-url http://localhost:8983/solr/archivesspace \ --agents-only ``` @@ -215,6 +218,7 @@ python -m arcflow.main \ --arclight-dir /path/to/arclight \ --aspace-dir /path/to/archivesspace \ --solr-url http://localhost:8983/solr/blacklight-core \ + --aspace-solr-url http://localhost:8983/solr/archivesspace \ --force-update ``` From a7f63d3e3704f7df46d44e035dea5c4b843ef8a0 Mon Sep 17 00:00:00 2001 From: Alex Dryden Date: Thu, 26 Feb 2026 16:21:03 -0500 Subject: [PATCH 08/14] fix: typo --- arcflow/main.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arcflow/main.py b/arcflow/main.py index 3d503c0..1c5da0f 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -1197,6 +1197,8 @@ def 
create_symlink(self, target_path, symlink_path, indent_size=0): return False def delete_arclight_solr_record(self, solr_record_id, indent_size=0): + indent = ' ' * indent_size + try: response = requests.post( f'{self.solr_url}/update?commit=true', @@ -1212,7 +1214,9 @@ def delete_arclight_solr_record(self, solr_record_id, indent_size=0): except requests.exceptions.RequestException as e: self.log.error(f'{indent}Error deleting Solr record {solr_record_id} from ArcLight Solr: {e}') - def delete_file(self, file_path, indent=0): + def delete_file(self, file_path, indent=''): + # callers pass a pre-built indent string (e.g. ' ' * indent_size), not a size + try: os.remove(file_path) self.log.info(f'{indent}Deleted file {file_path}.') From 6c0942dd515b66a2d2417eea720b12adc9e2a8e6 Mon Sep 17 00:00:00 2001 From: Alex Dryden Date: Thu, 26 Feb 2026 16:31:27 -0500 Subject: [PATCH 09/14] remove shell execution from index_collections --- arcflow/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/arcflow/main.py b/arcflow/main.py index 1c5da0f..a94ef40 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -603,7 +603,6 @@ def index_collections(self, repo_id, xml_file_path, indent_size=0): cmd_string = ' '.join(cmd) result = subprocess.run( cmd_string, - shell=True, cwd=self.arclight_dir, env=env, stderr=subprocess.PIPE, From 3b8b7145559716fecc68f5e339a2f4d1cd741ba8 Mon Sep 17 00:00:00 2001 From: Alex Dryden Date: Thu, 26 Feb 2026 16:41:50 -0500 Subject: [PATCH 10/14] fix: modify since is already passed in as an integer in get_all_agent --- arcflow/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arcflow/main.py b/arcflow/main.py index a94ef40..f8b3921 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -806,7 +806,8 @@ def get_all_agents(self, agent_types=None, modified_since=0, indent_size=0): agent_types = ['agent_person', 'agent_corporate_entity', 'agent_family'] - modified_since = 0 if self.force_update else int(self.last_updated.timestamp()) + if self.force_update: + modified_since = 0 
indent = ' ' * indent_size self.log.info(f'{indent}Fetching agent data from Solr...') From ff41cdb8a3198acb70231025257dd41f664d1d2c Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Thu, 26 Feb 2026 17:03:57 -0500 Subject: [PATCH 11/14] Enforce strict creator_{type}_{id} pattern using filename only (#26) Use filename for id --- example_traject_config_eac_cpf.rb | 91 +++++-------------------------- 1 file changed, 15 insertions(+), 76 deletions(-) diff --git a/example_traject_config_eac_cpf.rb b/example_traject_config_eac_cpf.rb index 84a956d..be544e9 100644 --- a/example_traject_config_eac_cpf.rb +++ b/example_traject_config_eac_cpf.rb @@ -22,6 +22,9 @@ # EAC-CPF namespace - used consistently throughout this config EAC_NS = { 'eac' => 'urn:isbn:1-931666-33-4' } +# Pattern matching arcflow's creator file naming: creator_{entity_type}_{id} +CREATOR_ID_PATTERN = /^creator_(corporate_entities|people|families)_\d+$/ + settings do provide "solr.url", ENV['SOLR_URL'] || "http://localhost:8983/solr/blacklight-core" provide "solr_writer.commit_on_close", "true" @@ -38,85 +41,21 @@ context.clipboard[:is_creator] = true end -# Core identity field -# CRITICAL: The 'id' field is required by Solr's schema (uniqueKey) -# Must ensure this field is never empty or indexing will fail -# -# IMPORTANT: Real EAC-CPF from ArchivesSpace has empty element! -# Cannot rely on recordId being present. Must extract from filename or generate. +# Solr uniqueKey - extract ID from filename using arcflow's creator_{entity_type}_{id} pattern to_field 'id' do |record, accumulator, context| - # Try 1: Extract from control/recordId (if present) - record_id = record.xpath('//eac:control/eac:recordId', EAC_NS).first - record_id ||= record.xpath('//control/recordId').first - - if record_id && !record_id.text.strip.empty? 
- accumulator << record_id.text.strip - else - # Try 2: Extract from source filename (most reliable for ArchivesSpace exports) - # Filename format: creator_corporate_entities_584.xml or similar - source_file = context.source_record_id || context.input_name - if source_file - # Remove .xml extension and any path - id_from_filename = File.basename(source_file, '.xml') - # Check if it looks valid (starts with creator_ or agent_) - if id_from_filename =~ /^(creator_|agent_)/ - accumulator << id_from_filename - context.logger.info("Using filename-based ID: #{id_from_filename}") - else - # Try 3: Generate from entity type and name - entity_type = record.xpath('//eac:cpfDescription/eac:identity/eac:entityType', EAC_NS).first&.text&.strip - name_entry = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS).first&.text&.strip - - if entity_type && name_entry - # Create stable ID from type and name - type_short = case entity_type - when 'corporateBody' then 'corporate' - when 'person' then 'person' - when 'family' then 'family' - else 'entity' - end - name_id = name_entry.gsub(/[^a-z0-9]/i, '_').downcase[0..50] # Limit length - generated_id = "creator_#{type_short}_#{name_id}" - accumulator << generated_id - context.logger.warn("Generated ID from name: #{generated_id}") - else - # No valid ID available - skip indexing this record - # If we reach here, something has gone wrong with the data pipeline: - # - No recordId in XML - # - Filename doesn't match expected pattern - # - No entity type or name in XML to generate from - # Skipping ensures we don't create non-deterministic IDs that break idempotent indexing - context.logger.error("Cannot generate valid ID for record - skipping indexing. 
Source: #{source_file}") - context.skip!("Missing required ID data") - end - end + source_file = context.source_record_id || context.input_name + if source_file + id_from_filename = File.basename(source_file, '.xml') + if id_from_filename =~ CREATOR_ID_PATTERN + accumulator << id_from_filename + context.logger.info("Using filename-based ID: #{id_from_filename}") else - # No filename available, generate from name - entity_type = record.xpath('//eac:cpfDescription/eac:identity/eac:entityType', EAC_NS).first&.text&.strip - name_entry = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS).first&.text&.strip - - if entity_type && name_entry - type_short = case entity_type - when 'corporateBody' then 'corporate' - when 'person' then 'person' - when 'family' then 'family' - else 'entity' - end - name_id = name_entry.gsub(/[^a-z0-9]/i, '_').downcase[0..50] - generated_id = "creator_#{type_short}_#{name_id}" - accumulator << generated_id - context.logger.warn("Generated ID from name: #{generated_id}") - else - # No valid ID available - skip indexing this record - # If we reach here, something has gone wrong with the data pipeline: - # - No recordId in XML - # - No filename available - # - No entity type or name in XML to generate from - # Skipping ensures we don't create non-deterministic IDs that break idempotent indexing - context.logger.error("Cannot generate valid ID for record - skipping indexing. 
No filename or entity data available.") - context.skip!("Missing required ID data") - end + context.logger.error("Filename doesn't match expected pattern 'creator_{type}_{id}': #{id_from_filename}") + context.skip!("Invalid ID format in filename") end + else + context.logger.error("No source filename available for record") + context.skip!("Missing source filename") end end From 92cf74fa6618d2b2ab5ef5d1678b9f687e79f9c3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 26 Feb 2026 22:13:20 +0000 Subject: [PATCH 12/14] Initial plan From 9d390af947214978de5a3ea268450c1b9b1eca71 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 26 Feb 2026 22:16:05 +0000 Subject: [PATCH 13/14] Remove dead exception handler in index_collections method Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 103 +++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 53 deletions(-) diff --git a/arcflow/main.py b/arcflow/main.py index f8b3921..73cf1f1 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -562,60 +562,57 @@ def index_collections(self, repo_id, xml_file_path, indent_size=0): """Index collection XML files to Solr using traject.""" indent = ' ' * indent_size self.log.info(f'{indent}Indexing pending resources in repository ID {repo_id} to ArcLight Solr...') - try: - # Get arclight traject config path - result_show = subprocess.run( - ['bundle', 'show', 'arclight'], - capture_output=True, - text=True, - cwd=self.arclight_dir - ) - arclight_path = result_show.stdout.strip() if result_show.returncode == 0 else '' - - if not arclight_path: - self.log.error(f'{indent}Could not find arclight gem path') - return - - traject_config = f'{arclight_path}/lib/arclight/traject/ead2_config.rb' - - cmd = [ - 'bundle', 'exec', 'traject', - '-u', self.solr_url, - '-s', 'processing_thread_pool=8', - '-s', 
'solr_writer.thread_pool=8', - '-s', f'solr_writer.batch_size={self.batch_size}', - '-s', 'solr_writer.commit_on_close=true', - '-i', 'xml', - '-c', traject_config - ] - - if self.traject_extra_config: - if isinstance(self.traject_extra_config, (list, tuple)): - cmd.extend(self.traject_extra_config) - else: - # Treat a string extra config as a path and pass it with -c - cmd.extend(['-c', self.traject_extra_config]) - - cmd.append(xml_file_path) - - env = os.environ.copy() - env['REPOSITORY_ID'] = str(repo_id) - cmd_string = ' '.join(cmd) - result = subprocess.run( - cmd_string, - cwd=self.arclight_dir, - env=env, - stderr=subprocess.PIPE, - ) - - if result.stderr: - self.log.error(f'{indent}{result.stderr.decode("utf-8")}') - if result.returncode != 0: - self.log.error(f'{indent}Failed to index pending resources in repository ID {repo_id} to ArcLight Solr. Return code: {result.returncode}') + # Get arclight traject config path + result_show = subprocess.run( + ['bundle', 'show', 'arclight'], + capture_output=True, + text=True, + cwd=self.arclight_dir + ) + arclight_path = result_show.stdout.strip() if result_show.returncode == 0 else '' + + if not arclight_path: + self.log.error(f'{indent}Could not find arclight gem path') + return + + traject_config = f'{arclight_path}/lib/arclight/traject/ead2_config.rb' + + cmd = [ + 'bundle', 'exec', 'traject', + '-u', self.solr_url, + '-s', 'processing_thread_pool=8', + '-s', 'solr_writer.thread_pool=8', + '-s', f'solr_writer.batch_size={self.batch_size}', + '-s', 'solr_writer.commit_on_close=true', + '-i', 'xml', + '-c', traject_config + ] + + if self.traject_extra_config: + if isinstance(self.traject_extra_config, (list, tuple)): + cmd.extend(self.traject_extra_config) else: - self.log.info(f'{indent}Finished indexing pending resources in repository ID {repo_id} to ArcLight Solr.') - except subprocess.CalledProcessError as e: - self.log.error(f'{indent}Error indexing pending resources in repository ID {repo_id} to ArcLight 
Solr: {e}') + # Treat a string extra config as a path and pass it with -c + cmd.extend(['-c', self.traject_extra_config]) + + cmd.append(xml_file_path) + + env = os.environ.copy() + env['REPOSITORY_ID'] = str(repo_id) + cmd_string = ' '.join(cmd) + result = subprocess.run( + cmd_string, + cwd=self.arclight_dir, + env=env, + stderr=subprocess.PIPE, + ) + + if result.stderr: + self.log.error(f'{indent}{result.stderr.decode("utf-8")}') + if result.returncode != 0: + self.log.error(f'{indent}Failed to index pending resources in repository ID {repo_id} to ArcLight Solr. Return code: {result.returncode}') + else: + self.log.info(f'{indent}Finished indexing pending resources in repository ID {repo_id} to ArcLight Solr.') def get_creator_bioghist(self, resource, indent_size=0): From 84fe34d00509e36f7b5ede0bf03b4664d066b63b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 26 Feb 2026 22:16:48 +0000 Subject: [PATCH 14/14] Pass cmd list directly to subprocess.run instead of joining to string Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arcflow/main.py b/arcflow/main.py index 73cf1f1..fd3ddea 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -599,9 +599,8 @@ def index_collections(self, repo_id, xml_file_path, indent_size=0): env = os.environ.copy() env['REPOSITORY_ID'] = str(repo_id) - cmd_string = ' '.join(cmd) result = subprocess.run( - cmd_string, + cmd, cwd=self.arclight_dir, env=env, stderr=subprocess.PIPE,