-
Notifications
You must be signed in to change notification settings - Fork 0
Add bidirectional creator↔collection links via EAD authfilenumber and EAC-CPF descriptiveNote #35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
ef0dda7
b3f77eb
74df557
3ef2b0a
89057a9
551e563
fa0c562
2a8e88a
256a19b
221f569
22d2cec
56c619c
1b35516
d636edc
920431d
6deae57
e078182
a503bbd
5431103
3465192
7a1e9ab
d832204
da7f161
2116f31
a423017
6621ae4
0463690
f2a451a
1a76bd7
1d4674a
92e0c45
c221ccf
170699a
e511635
675ccb4
8cf6e66
82c2e23
651ab95
f0fac40
088b65d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1 +1,15 @@ | ||
| from .main import ArcFlow | ||
| """ | ||
| ArcFlow package for syncing ArchivesSpace to ArcLight. | ||
|
|
||
| To use ArcFlow, import directly from the main module: | ||
| from arcflow.main import ArcFlow | ||
|
|
||
| Services can be imported independently: | ||
| from arcflow.services.xml_transform_service import XmlTransformService | ||
| from arcflow.services.agent_service import AgentService | ||
|
|
||
| The top-level import is disabled to avoid eager loading of dependencies. | ||
| """ | ||
|
|
||
| # Avoid eager imports to allow services to be imported independently | ||
| # from .main import ArcFlow | ||
alexdryden marked this conversation as resolved.
Show resolved
Hide resolved
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,8 @@ | |
| from asnake.client import ASnakeClient | ||
| from multiprocessing.pool import ThreadPool as Pool | ||
| from utils.stage_classifications import extract_labels | ||
| from .services.xml_transform_service import XmlTransformService | ||
| from .services.agent_service import AgentService | ||
| import glob | ||
alexdryden marked this conversation as resolved.
Show resolved
Hide resolved
alexdryden marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| base_dir = os.path.abspath((__file__) + "/../../") | ||
|
|
@@ -115,6 +117,10 @@ def __init__(self, arclight_dir, aspace_dir, solr_url, aspace_solr_url, ead_extr | |
| self.log.error(f'Error authorizing ASnakeClient: {e}') | ||
| exit(0) | ||
|
|
||
| # Initialize services | ||
| self.xml_transform = XmlTransformService(client=self.client, log=self.log) | ||
| self.agent_service = AgentService(client=self.client, log=self.log) | ||
|
|
||
|
|
||
| def is_running(self): | ||
| """ | ||
|
|
@@ -262,50 +268,24 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): | |
| # (record group/subgroup labels and biographical/historical notes) | ||
| if xml.content: | ||
| xml_content = xml.content.decode('utf-8') | ||
| insert_pos = xml_content.find('<archdesc level="collection">') | ||
|
|
||
| if insert_pos != -1: | ||
| # Find the position after the closing </did> tag | ||
| did_end_pos = xml_content.find('</did>', insert_pos) | ||
|
|
||
| if did_end_pos != -1: | ||
| # Move to after the </did> tag | ||
| did_end_pos += len('</did>') | ||
| extra_xml = '' | ||
|
|
||
| # Add record group and subgroup labels | ||
| rg_label, sg_label = extract_labels(resource)[1:3] | ||
| if rg_label: | ||
| extra_xml += f'\n<recordgroup>{xml_escape(rg_label)}</recordgroup>' | ||
| if sg_label: | ||
| extra_xml += f'\n<subgroup>{xml_escape(sg_label)}</subgroup>' | ||
|
|
||
| # Handle biographical/historical notes from creator agents | ||
| bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size) | ||
| if bioghist_content: | ||
| # Check if there's already a bioghist element in the EAD | ||
| # Search for existing bioghist after </did> but before </archdesc> | ||
| archdesc_end = xml_content.find('</archdesc>', did_end_pos) | ||
| search_section = xml_content[did_end_pos:archdesc_end] if archdesc_end != -1 else xml_content[did_end_pos:] | ||
|
|
||
| # Look for closing </bioghist> tag | ||
| existing_bioghist_end = search_section.rfind('</bioghist>') | ||
|
|
||
| if existing_bioghist_end != -1: | ||
| # Found existing bioghist - insert agent elements INSIDE it (before closing tag) | ||
| insert_pos = did_end_pos + existing_bioghist_end | ||
| xml_content = (xml_content[:insert_pos] + | ||
| f'\n{bioghist_content}\n' + | ||
| xml_content[insert_pos:]) | ||
| else: | ||
| # No existing bioghist - wrap agent elements in parent container | ||
| wrapped_content = f'<bioghist>\n{bioghist_content}\n</bioghist>' | ||
| extra_xml += f'\n{wrapped_content}' | ||
|
|
||
| if extra_xml: | ||
| xml_content = (xml_content[:did_end_pos] + | ||
| extra_xml + | ||
| xml_content[did_end_pos:]) | ||
|
|
||
| # Add arcuit:creator_id attributes (in a custom namespace) to origination name elements | ||
| # (links creator names in EAD to their corresponding creator records, e.g., in Solr) | ||
| xml_content = self.xml_transform.add_creator_ids_to_ead(xml_content, resource, indent_size=indent_size) | ||
|
|
||
| # Get record group and subgroup labels | ||
| rg_label, sg_label = extract_labels(resource)[1:3] | ||
|
|
||
| # Get biographical/historical notes from creator agents | ||
| bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size) | ||
|
|
||
| # Inject all collection metadata using XmlTransformService | ||
| xml_content = self.xml_transform.inject_collection_metadata( | ||
| xml_content, | ||
| record_group=rg_label, | ||
| subgroup=sg_label, | ||
| bioghist_content=bioghist_content | ||
| ) | ||
|
|
||
| xml_content = xml_content.encode('utf-8') | ||
| else: | ||
|
|
@@ -634,7 +614,6 @@ def get_creator_bioghist(self, resource, indent_size=0): | |
| Returns nested bioghist elements for each creator, or None if no creator agents have notes. | ||
| Each bioghist element includes the creator name in a head element and an id attribute. | ||
| """ | ||
| indent = ' ' * indent_size | ||
| bioghist_elements = [] | ||
|
|
||
| if 'linked_agents' not in resource: | ||
|
|
@@ -646,58 +625,16 @@ def get_creator_bioghist(self, resource, indent_size=0): | |
| if linked_agent.get('role') == 'creator': | ||
| agent_ref = linked_agent.get('ref') | ||
| if agent_ref: | ||
| try: | ||
| agent = self.client.get(agent_ref).json() | ||
|
|
||
| # Get agent name for head element | ||
| agent_name = agent.get('title') or agent.get('display_name', {}).get('sort_name', 'Unknown') | ||
|
|
||
| # Check for notes in the agent record | ||
| if 'notes' in agent: | ||
| for note in agent['notes']: | ||
| # Look for biographical/historical notes | ||
| if note.get('jsonmodel_type') == 'note_bioghist': | ||
| # Get persistent_id for the id attribute | ||
| persistent_id = note.get('persistent_id', '') | ||
| if not persistent_id: | ||
| self.log.error(f'{indent}**ASSUMPTION VIOLATION**: Expected persistent_id in note_bioghist for agent {agent_ref}') | ||
| # Skip creating id attribute if persistent_id is missing | ||
| persistent_id = None | ||
|
|
||
| # Extract note content from subnotes | ||
| paragraphs = [] | ||
| if 'subnotes' in note: | ||
| for subnote in note['subnotes']: | ||
| if 'content' in subnote: | ||
| # Split content on single newlines to create paragraphs | ||
| content = subnote['content'] | ||
| # Handle content as either string or list with explicit type checking | ||
| if isinstance(content, str): | ||
| # Split on newline and filter out empty strings | ||
| lines = [line.strip() for line in content.split('\n') if line.strip()] | ||
| elif isinstance(content, list): | ||
| # Content is already a list - use as is | ||
| lines = [str(item).strip() for item in content if str(item).strip()] | ||
| else: | ||
| # Log unexpected content type prominently | ||
| self.log.error(f'{indent}**ASSUMPTION VIOLATION**: Expected string or list for subnote content in agent {agent_ref}, got {type(content).__name__}') | ||
| continue | ||
| # Wrap each line in <p> tags | ||
| for line in lines: | ||
| paragraphs.append(f'<p>{line}</p>') | ||
|
|
||
| # Create nested bioghist element if we have paragraphs | ||
| if paragraphs: | ||
| paragraphs_xml = '\n'.join(paragraphs) | ||
| heading = f'Historical Note from {xml_escape(agent_name)} Creator Record' | ||
| # Only include id attribute if persistent_id is available | ||
| if persistent_id: | ||
| bioghist_el = f'<bioghist id="aspace_{persistent_id}"><head>{heading}</head>\n{paragraphs_xml}\n</bioghist>' | ||
| else: | ||
| bioghist_el = f'<bioghist><head>{heading}</head>\n{paragraphs_xml}\n</bioghist>' | ||
| bioghist_elements.append(bioghist_el) | ||
| except Exception as e: | ||
| self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}') | ||
| bioghist_data = self.agent_service.get_agent_bioghist_data( | ||
| agent_ref, indent_size=indent_size | ||
| ) | ||
| if bioghist_data: | ||
| bioghist_xml = self.xml_transform.build_bioghist_element( | ||
| bioghist_data['agent_name'], | ||
| bioghist_data['persistent_id'], | ||
| bioghist_data['paragraphs'] | ||
| ) | ||
| bioghist_elements.append(bioghist_xml) | ||
|
|
||
| if bioghist_elements: | ||
| # Return the agent bioghist elements (unwrapped) | ||
|
|
@@ -879,14 +816,14 @@ def task_agent(self, agent_uri, agents_dir, repo_id=1, indent_size=0): | |
|
|
||
| eac_cpf_xml = response.text | ||
|
|
||
| # Parse the EAC-CPF XML to validate and inspect its structure | ||
| try: | ||
| root = ET.fromstring(eac_cpf_xml) | ||
| self.log.debug(f'{indent}Parsed EAC-CPF XML root element: {root.tag}') | ||
| except ET.ParseError as e: | ||
| self.log.error(f'{indent}Failed to parse EAC-CPF XML for {agent_uri}: {e}') | ||
| # Validate EAC-CPF XML structure | ||
| if not self.xml_transform.validate_eac_cpf_xml(eac_cpf_xml, agent_uri, indent_size=indent_size): | ||
| self.log.error(f'{indent}Invalid EAC-CPF XML for {agent_uri}, skipping') | ||
| return None | ||
|
|
||
| # Add collection ead_ids to resourceRelation creatorOf elements | ||
| eac_cpf_xml = self.xml_transform.add_collection_links_to_eac_cpf(eac_cpf_xml, indent_size=indent_size) | ||
|
|
||
alexdryden marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| # Generate creator ID | ||
|
Comment on lines
817
to
827
|
||
| creator_id = f'creator_{agent_type}_{agent_id}' | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,115 @@ | ||
| """ | ||
| Service for fetching and processing agent data from ArchivesSpace. | ||
|
|
||
| Handles agent-related operations including: | ||
| - Fetching agent biographical/historical notes | ||
| - Processing note content into structured data | ||
| """ | ||
|
|
||
| import logging | ||
| from typing import Optional, List, Dict | ||
|
|
||
|
|
||
| class AgentService: | ||
| """Service for agent data fetching and processing.""" | ||
|
|
||
| def __init__(self, client, log=None): | ||
| """ | ||
| Initialize the agent service. | ||
|
|
||
| Args: | ||
| client: ASnake client for fetching agent data | ||
| log: Logger instance (optional, creates default if not provided) | ||
| """ | ||
| self.client = client | ||
| self.log = log or logging.getLogger(__name__) | ||
|
|
||
| def get_agent_bioghist_data(self, agent_uri: str, indent_size: int = 0) -> Optional[Dict]: | ||
| """ | ||
| Fetch bioghist DATA for an agent. | ||
|
|
||
| Returns structured data (not XML) so it can be used in different contexts: | ||
| - Build EAD XML for collections | ||
| - Build EAC-CPF XML for creator records | ||
| - Display in a web UI | ||
| - Export as JSON | ||
|
|
||
| Args: | ||
| agent_uri: Agent URI from ArchivesSpace (e.g., '/agents/corporate_entities/123') | ||
| indent_size: Indentation size for logging | ||
|
|
||
| Returns: | ||
| dict with keys: 'agent_name', 'persistent_id', 'paragraphs' | ||
| or None if no bioghist found or on error | ||
| """ | ||
| indent = ' ' * indent_size | ||
|
|
||
| try: | ||
| agent = self.client.get(agent_uri).json() | ||
| agent_name = agent.get('title') or agent.get('display_name', {}).get('sort_name', 'Unknown') | ||
|
|
||
| for note in agent.get('notes', []): | ||
| if note.get('jsonmodel_type') == 'note_bioghist': | ||
| persistent_id = note.get('persistent_id') | ||
| paragraphs = self._extract_paragraphs(note, agent_uri, indent_size) | ||
|
|
||
| if paragraphs: | ||
| return { | ||
| 'agent_name': agent_name, | ||
| 'persistent_id': persistent_id, | ||
| 'paragraphs': paragraphs | ||
| } | ||
|
|
||
| return None # No bioghist | ||
|
|
||
| except Exception as e: | ||
| self.log.error(f'{indent}Error fetching agent {agent_uri}: {e}') | ||
| return None | ||
|
|
||
| def _extract_paragraphs(self, note: dict, agent_uri: str, indent_size: int = 0) -> List[str]: | ||
| """ | ||
| Extract paragraph content from a bioghist note. | ||
|
|
||
| Args: | ||
| note: Note dictionary from ArchivesSpace | ||
| agent_uri: Agent URI for logging purposes | ||
| indent_size: Indentation size for logging | ||
|
|
||
| Returns: | ||
| List of plain text paragraph strings (not wrapped in <p> tags) | ||
| """ | ||
| indent = ' ' * indent_size | ||
| paragraphs = [] | ||
|
|
||
| if 'subnotes' in note: | ||
| for subnote in note['subnotes']: | ||
| if 'content' in subnote: | ||
| content = subnote['content'] | ||
|
|
||
| # Handle content as either string or list with explicit type checking | ||
| if isinstance(content, str): | ||
| # Split on newline and filter out empty strings | ||
| lines = [line.strip() for line in content.split('\n') if line.strip()] | ||
| elif isinstance(content, list): | ||
| # Content is already a list - use as is | ||
| lines = [str(item).strip() for item in content if str(item).strip()] | ||
| else: | ||
| # Log unexpected content type prominently | ||
| self.log.error( | ||
| f'{indent}**ASSUMPTION VIOLATION**: Expected string or list for subnote content ' | ||
| f'in agent {agent_uri}, got {type(content).__name__}' | ||
| ) | ||
| continue | ||
|
|
||
| # Add plain text lines (will be wrapped in <p> tags by build_bioghist_element) | ||
| for line in lines: | ||
| paragraphs.append(line) | ||
|
|
||
alexdryden marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| # Log if persistent_id is missing | ||
| if not note.get('persistent_id'): | ||
| self.log.error( | ||
| f'{indent}**ASSUMPTION VIOLATION**: Expected persistent_id in note_bioghist ' | ||
| f'for agent {agent_uri}' | ||
| ) | ||
|
|
||
| return paragraphs | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`arcflow/__init__.py` no longer re-exports `ArcFlow`, so `from arcflow import ArcFlow` will break for any downstream code relying on the package-level import. If the goal is to avoid eager loading, consider preserving backwards compatibility via a lazy import mechanism (e.g., module `__getattr__`) or explicitly documenting this as a breaking change.