Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
ef0dda7
feat: Add creator biographical information to EAD XML exports
Copilot Dec 23, 2025
b3f77eb
feat(arclight#29): Add creator/agent indexing system for ArcLight
alexdryden Feb 11, 2026
74df557
feat: Optimize agent filtering with ArchivesSpace Solr
alexdryden Feb 21, 2026
3ef2b0a
fix: clean up traject config
Copilot Feb 26, 2026
89057a9
feat(arclight#29): run orchestration for threaded and single-scope runs
alexdryden Mar 4, 2026
551e563
Merge branch 'main' into index_creators
alexdryden Mar 4, 2026
fa0c562
find eac_cpf traject in arcuit, fallback to example in arcflow
alexdryden Mar 6, 2026
2a8e88a
Merge branch 'main' into index_creators
alexdryden Mar 9, 2026
256a19b
Initial plan
Copilot Mar 9, 2026
221f569
Add bidirectional creator-collection links in EAD and EAC-CPF
Copilot Mar 9, 2026
22d2cec
chore: add rg and sg to the example ead extra config
alexdryden Mar 10, 2026
56c619c
update log message for clarity
Copilot Mar 10, 2026
1b35516
Fix idempotent check to match specific ead_id pattern
Copilot Mar 10, 2026
d636edc
Add XmlTransformService and AgentService with comprehensive tests
Copilot Mar 10, 2026
920431d
Refactor main.py to use XmlTransformService and AgentService
Copilot Mar 10, 2026
6deae57
Address code review: use relative imports and improve documentation
Copilot Mar 10, 2026
e078182
refactor: use xml transform service
alexdryden Mar 10, 2026
a503bbd
Merge branch 'main' into copilot/add-bidirectional-creator-collection…
alexdryden Mar 10, 2026
5431103
Apply suggestions from code review
alexdryden Mar 10, 2026
3465192
Refactor XML transformations to use ElementTree parser for clarity
Copilot Mar 10, 2026
7a1e9ab
feat: use custom namespace attribute for creator id
alexdryden Mar 11, 2026
d832204
fix: extract and use ead and eac_cpf namespace
alexdryden Mar 11, 2026
da7f161
fix: use correct namespace
alexdryden Mar 11, 2026
2116f31
update documentation
alexdryden Mar 11, 2026
a423017
fix: preserve nested bioghists for multiple creators
alexdryden Mar 11, 2026
6621ae4
chore: remove dead code
alexdryden Mar 11, 2026
0463690
fix: make XML transformations namespace-aware for EAD and EAC-CPF
Copilot Mar 11, 2026
f2a451a
fix: check for namespace explicitly
alexdryden Mar 11, 2026
1a76bd7
fix: formatting
alexdryden Mar 11, 2026
1d4674a
fix: remove unused import
alexdryden Mar 11, 2026
92e0c45
fix: ensure consistent namespace in sub-elements
alexdryden Mar 11, 2026
c221ccf
fix: properly escape XML special characters in bioghist paragraphs
Copilot Mar 11, 2026
170699a
remove unused import
alexdryden Mar 11, 2026
e511635
fix: use custom namespace notation
alexdryden Mar 11, 2026
675ccb4
fix: use correct namespace
alexdryden Mar 11, 2026
8cf6e66
fix: formatting
alexdryden Mar 11, 2026
82c2e23
fix: address code review feedback
Copilot Mar 11, 2026
651ab95
refactor: migrate from ElementTree to lxml for better namespace handling
Copilot Mar 17, 2026
f0fac40
fix: use quoted string for lxml._Element type hint
Copilot Mar 17, 2026
088b65d
feat: add additional fields for creator relationships to objects
alexdryden Mar 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion arcflow/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,15 @@
from .main import ArcFlow
"""
ArcFlow package for syncing ArchivesSpace to ArcLight.

To use ArcFlow, import directly from the main module:
from arcflow.main import ArcFlow

Services can be imported independently:
from arcflow.services.xml_transform_service import XmlTransformService
from arcflow.services.agent_service import AgentService

The top-level import is disabled to avoid eager loading of dependencies.
"""

# Avoid eager imports to allow services to be imported independently
# from .main import ArcFlow
Comment on lines +7 to +15
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

arcflow/__init__.py no longer re-exports ArcFlow, so from arcflow import ArcFlow will break for any downstream code relying on the package-level import. If the goal is to avoid eager loading, consider preserving backwards compatibility via a lazy import mechanism (e.g., module __getattr__) or explicitly documenting this as a breaking change.

Suggested change
Services can be imported independently:
from arcflow.services.xml_transform_service import XmlTransformService
from arcflow.services.agent_service import AgentService
The top-level import is disabled to avoid eager loading of dependencies.
"""
# Avoid eager imports to allow services to be imported independently
# from .main import ArcFlow
Or, for backwards compatibility, from the package root (lazy loaded):
from arcflow import ArcFlow
Services can be imported independently:
from arcflow.services.xml_transform_service import XmlTransformService
from arcflow.services.agent_service import AgentService
"""
# Avoid eager imports to allow services to be imported independently,
# but preserve backwards compatibility for `from arcflow import ArcFlow`
# via a lazy module-level attribute lookup.
__all__ = ["ArcFlow"]
def __getattr__(name: str):
"""
Lazily provide ArcFlow at the package level to avoid eager imports.
This allows:
from arcflow import ArcFlow
without importing .main until ArcFlow is actually requested.
"""
if name == "ArcFlow":
from .main import ArcFlow # type: ignore[attr-defined]
return ArcFlow
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

Copilot uses AI. Check for mistakes.
143 changes: 40 additions & 103 deletions arcflow/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
from asnake.client import ASnakeClient
from multiprocessing.pool import ThreadPool as Pool
from utils.stage_classifications import extract_labels
from .services.xml_transform_service import XmlTransformService
from .services.agent_service import AgentService
import glob

base_dir = os.path.abspath((__file__) + "/../../")
Expand Down Expand Up @@ -115,6 +117,10 @@ def __init__(self, arclight_dir, aspace_dir, solr_url, aspace_solr_url, ead_extr
self.log.error(f'Error authorizing ASnakeClient: {e}')
exit(0)

# Initialize services
self.xml_transform = XmlTransformService(client=self.client, log=self.log)
self.agent_service = AgentService(client=self.client, log=self.log)


def is_running(self):
"""
Expand Down Expand Up @@ -262,50 +268,24 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0):
# (record group/subgroup labels and biographical/historical notes)
if xml.content:
xml_content = xml.content.decode('utf-8')
insert_pos = xml_content.find('<archdesc level="collection">')

if insert_pos != -1:
# Find the position after the closing </did> tag
did_end_pos = xml_content.find('</did>', insert_pos)

if did_end_pos != -1:
# Move to after the </did> tag
did_end_pos += len('</did>')
extra_xml = ''

# Add record group and subgroup labels
rg_label, sg_label = extract_labels(resource)[1:3]
if rg_label:
extra_xml += f'\n<recordgroup>{xml_escape(rg_label)}</recordgroup>'
if sg_label:
extra_xml += f'\n<subgroup>{xml_escape(sg_label)}</subgroup>'

# Handle biographical/historical notes from creator agents
bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size)
if bioghist_content:
# Check if there's already a bioghist element in the EAD
# Search for existing bioghist after </did> but before </archdesc>
archdesc_end = xml_content.find('</archdesc>', did_end_pos)
search_section = xml_content[did_end_pos:archdesc_end] if archdesc_end != -1 else xml_content[did_end_pos:]

# Look for closing </bioghist> tag
existing_bioghist_end = search_section.rfind('</bioghist>')

if existing_bioghist_end != -1:
# Found existing bioghist - insert agent elements INSIDE it (before closing tag)
insert_pos = did_end_pos + existing_bioghist_end
xml_content = (xml_content[:insert_pos] +
f'\n{bioghist_content}\n' +
xml_content[insert_pos:])
else:
# No existing bioghist - wrap agent elements in parent container
wrapped_content = f'<bioghist>\n{bioghist_content}\n</bioghist>'
extra_xml += f'\n{wrapped_content}'

if extra_xml:
xml_content = (xml_content[:did_end_pos] +
extra_xml +
xml_content[did_end_pos:])

# Add arcuit:creator_id attributes (in a custom namespace) to origination name elements
# (links creator names in EAD to their corresponding creator records, e.g., in Solr)
xml_content = self.xml_transform.add_creator_ids_to_ead(xml_content, resource, indent_size=indent_size)

# Get record group and subgroup labels
rg_label, sg_label = extract_labels(resource)[1:3]

# Get biographical/historical notes from creator agents
bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size)

# Inject all collection metadata using XmlTransformService
xml_content = self.xml_transform.inject_collection_metadata(
xml_content,
record_group=rg_label,
subgroup=sg_label,
bioghist_content=bioghist_content
)

xml_content = xml_content.encode('utf-8')
else:
Expand Down Expand Up @@ -634,7 +614,6 @@ def get_creator_bioghist(self, resource, indent_size=0):
Returns nested bioghist elements for each creator, or None if no creator agents have notes.
Each bioghist element includes the creator name in a head element and an id attribute.
"""
indent = ' ' * indent_size
bioghist_elements = []

if 'linked_agents' not in resource:
Expand All @@ -646,58 +625,16 @@ def get_creator_bioghist(self, resource, indent_size=0):
if linked_agent.get('role') == 'creator':
agent_ref = linked_agent.get('ref')
if agent_ref:
try:
agent = self.client.get(agent_ref).json()

# Get agent name for head element
agent_name = agent.get('title') or agent.get('display_name', {}).get('sort_name', 'Unknown')

# Check for notes in the agent record
if 'notes' in agent:
for note in agent['notes']:
# Look for biographical/historical notes
if note.get('jsonmodel_type') == 'note_bioghist':
# Get persistent_id for the id attribute
persistent_id = note.get('persistent_id', '')
if not persistent_id:
self.log.error(f'{indent}**ASSUMPTION VIOLATION**: Expected persistent_id in note_bioghist for agent {agent_ref}')
# Skip creating id attribute if persistent_id is missing
persistent_id = None

# Extract note content from subnotes
paragraphs = []
if 'subnotes' in note:
for subnote in note['subnotes']:
if 'content' in subnote:
# Split content on single newlines to create paragraphs
content = subnote['content']
# Handle content as either string or list with explicit type checking
if isinstance(content, str):
# Split on newline and filter out empty strings
lines = [line.strip() for line in content.split('\n') if line.strip()]
elif isinstance(content, list):
# Content is already a list - use as is
lines = [str(item).strip() for item in content if str(item).strip()]
else:
# Log unexpected content type prominently
self.log.error(f'{indent}**ASSUMPTION VIOLATION**: Expected string or list for subnote content in agent {agent_ref}, got {type(content).__name__}')
continue
# Wrap each line in <p> tags
for line in lines:
paragraphs.append(f'<p>{line}</p>')

# Create nested bioghist element if we have paragraphs
if paragraphs:
paragraphs_xml = '\n'.join(paragraphs)
heading = f'Historical Note from {xml_escape(agent_name)} Creator Record'
# Only include id attribute if persistent_id is available
if persistent_id:
bioghist_el = f'<bioghist id="aspace_{persistent_id}"><head>{heading}</head>\n{paragraphs_xml}\n</bioghist>'
else:
bioghist_el = f'<bioghist><head>{heading}</head>\n{paragraphs_xml}\n</bioghist>'
bioghist_elements.append(bioghist_el)
except Exception as e:
self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}')
bioghist_data = self.agent_service.get_agent_bioghist_data(
agent_ref, indent_size=indent_size
)
if bioghist_data:
bioghist_xml = self.xml_transform.build_bioghist_element(
bioghist_data['agent_name'],
bioghist_data['persistent_id'],
bioghist_data['paragraphs']
)
bioghist_elements.append(bioghist_xml)

if bioghist_elements:
# Return the agent bioghist elements (unwrapped)
Expand Down Expand Up @@ -879,14 +816,14 @@ def task_agent(self, agent_uri, agents_dir, repo_id=1, indent_size=0):

eac_cpf_xml = response.text

# Parse the EAC-CPF XML to validate and inspect its structure
try:
root = ET.fromstring(eac_cpf_xml)
self.log.debug(f'{indent}Parsed EAC-CPF XML root element: {root.tag}')
except ET.ParseError as e:
self.log.error(f'{indent}Failed to parse EAC-CPF XML for {agent_uri}: {e}')
# Validate EAC-CPF XML structure
if not self.xml_transform.validate_eac_cpf_xml(eac_cpf_xml, agent_uri, indent_size=indent_size):
self.log.error(f'{indent}Invalid EAC-CPF XML for {agent_uri}, skipping')
return None

# Add collection ead_ids to resourceRelation creatorOf elements
eac_cpf_xml = self.xml_transform.add_collection_links_to_eac_cpf(eac_cpf_xml, indent_size=indent_size)

# Generate creator ID
Comment on lines 817 to 827
Copy link

Copilot AI Mar 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR description mentions refactoring task_agent() to use a validate_eac_cpf_xml() service, but the updated code no longer validates the EAC-CPF document before writing it (it only attempts a parse inside add_collection_links_to_eac_cpf). If validation is still required, consider calling validate_eac_cpf_xml() here and aborting on invalid XML, or update the PR description if validation was intentionally removed.

Copilot uses AI. Check for mistakes.
creator_id = f'creator_{agent_type}_{agent_id}'

Expand Down
Empty file added arcflow/services/__init__.py
Empty file.
115 changes: 115 additions & 0 deletions arcflow/services/agent_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""
Service for fetching and processing agent data from ArchivesSpace.

Handles agent-related operations including:
- Fetching agent biographical/historical notes
- Processing note content into structured data
"""

import logging
from typing import Optional, List, Dict


class AgentService:
    """Fetches agent records from ArchivesSpace and distills their
    biographical/historical notes into plain structured data."""

    def __init__(self, client, log=None):
        """
        Set up the service.

        Args:
            client: ASnake client used to look up agent records
            log: Logger instance; a module-level logger is created when omitted
        """
        self.client = client
        self.log = log if log is not None else logging.getLogger(__name__)

    def get_agent_bioghist_data(self, agent_uri: str, indent_size: int = 0) -> Optional[Dict]:
        """
        Fetch bioghist DATA for an agent.

        Deliberately returns structured data rather than XML so callers can
        render it however they need (EAD, EAC-CPF, web UI, JSON export).

        Args:
            agent_uri: Agent URI from ArchivesSpace (e.g., '/agents/corporate_entities/123')
            indent_size: Indentation size for logging

        Returns:
            dict with keys: 'agent_name', 'persistent_id', 'paragraphs'
            or None if no bioghist found or on error
        """
        pad = ' ' * indent_size

        # The try block spans the fetch AND the note processing: any failure
        # is logged and reported as "no bioghist" rather than raised.
        try:
            record = self.client.get(agent_uri).json()

            # Prefer the display title; fall back to the sort name, then a placeholder.
            display_name = record.get('title') or record.get('display_name', {}).get('sort_name', 'Unknown')

            # Return the first biographical/historical note that yields any text.
            for note in record.get('notes', []):
                if note.get('jsonmodel_type') != 'note_bioghist':
                    continue
                body = self._extract_paragraphs(note, agent_uri, indent_size)
                if body:
                    return {
                        'agent_name': display_name,
                        'persistent_id': note.get('persistent_id'),
                        'paragraphs': body,
                    }

            # No bioghist note produced content.
            return None

        except Exception as e:
            self.log.error(f'{pad}Error fetching agent {agent_uri}: {e}')
            return None

    def _extract_paragraphs(self, note: dict, agent_uri: str, indent_size: int = 0) -> List[str]:
        """
        Extract paragraph content from a bioghist note.

        Args:
            note: Note dictionary from ArchivesSpace
            agent_uri: Agent URI, used only for log messages
            indent_size: Indentation size for logging

        Returns:
            List of plain text paragraph strings (not wrapped in <p> tags)
        """
        pad = ' ' * indent_size
        collected: List[str] = []

        if 'subnotes' in note:
            for subnote in note['subnotes']:
                if 'content' not in subnote:
                    continue
                raw = subnote['content']

                # content may arrive as a newline-delimited string or a list;
                # anything else is logged loudly and skipped.
                if isinstance(raw, str):
                    pieces = [part.strip() for part in raw.split('\n') if part.strip()]
                elif isinstance(raw, list):
                    pieces = [str(item).strip() for item in raw if str(item).strip()]
                else:
                    self.log.error(
                        f'{pad}**ASSUMPTION VIOLATION**: Expected string or list for subnote content '
                        f'in agent {agent_uri}, got {type(raw).__name__}'
                    )
                    continue

                # Plain text only here; <p> wrapping happens in build_bioghist_element.
                collected.extend(pieces)

        # A missing persistent_id is tolerated but flagged for investigation.
        if not note.get('persistent_id'):
            self.log.error(
                f'{pad}**ASSUMPTION VIOLATION**: Expected persistent_id in note_bioghist '
                f'for agent {agent_uri}'
            )

        return collected
Loading