diff --git a/arcflow/__init__.py b/arcflow/__init__.py
index f80bba7..7856a3d 100644
--- a/arcflow/__init__.py
+++ b/arcflow/__init__.py
@@ -1 +1,15 @@
-from .main import ArcFlow
\ No newline at end of file
+"""
+ArcFlow package for syncing ArchivesSpace to ArcLight.
+
+To use ArcFlow, import directly from the main module:
+ from arcflow.main import ArcFlow
+
+Services can be imported independently:
+ from arcflow.services.xml_transform_service import XmlTransformService
+ from arcflow.services.agent_service import AgentService
+
+The top-level import is disabled to avoid eager loading of dependencies.
+"""
+
+# Avoid eager imports to allow services to be imported independently
+# from .main import ArcFlow
\ No newline at end of file
diff --git a/arcflow/main.py b/arcflow/main.py
index 430539a..acac689 100644
--- a/arcflow/main.py
+++ b/arcflow/main.py
@@ -19,6 +19,8 @@
from asnake.client import ASnakeClient
from multiprocessing.pool import ThreadPool as Pool
from utils.stage_classifications import extract_labels
+from .services.xml_transform_service import XmlTransformService
+from .services.agent_service import AgentService
import glob
base_dir = os.path.abspath((__file__) + "/../../")
@@ -115,6 +117,10 @@ def __init__(self, arclight_dir, aspace_dir, solr_url, aspace_solr_url, ead_extr
self.log.error(f'Error authorizing ASnakeClient: {e}')
exit(0)
+ # Initialize services
+ self.xml_transform = XmlTransformService(client=self.client, log=self.log)
+ self.agent_service = AgentService(client=self.client, log=self.log)
+
def is_running(self):
"""
@@ -262,50 +268,24 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0):
# (record group/subgroup labels and biographical/historical notes)
if xml.content:
xml_content = xml.content.decode('utf-8')
- insert_pos = xml_content.find('<did>')
-
- if insert_pos != -1:
- # Find the position after the closing </did> tag
- did_end_pos = xml_content.find('</did>', insert_pos)
-
- if did_end_pos != -1:
- # Move to after the </did> tag
- did_end_pos += len('</did>')
- extra_xml = ''
-
- # Add record group and subgroup labels
- rg_label, sg_label = extract_labels(resource)[1:3]
- if rg_label:
- extra_xml += f'\n<recordgroup>{xml_escape(rg_label)}</recordgroup>'
- if sg_label:
- extra_xml += f'\n<subgroup>{xml_escape(sg_label)}</subgroup>'
-
- # Handle biographical/historical notes from creator agents
- bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size)
- if bioghist_content:
- # Check if there's already a bioghist element in the EAD
- # Search for existing bioghist after </did> but before </archdesc>
- archdesc_end = xml_content.find('</archdesc>', did_end_pos)
- search_section = xml_content[did_end_pos:archdesc_end] if archdesc_end != -1 else xml_content[did_end_pos:]
-
- # Look for closing </bioghist> tag
- existing_bioghist_end = search_section.rfind('</bioghist>')
-
- if existing_bioghist_end != -1:
- # Found existing bioghist - insert agent elements INSIDE it (before closing tag)
- insert_pos = did_end_pos + existing_bioghist_end
- xml_content = (xml_content[:insert_pos] +
- f'\n{bioghist_content}\n' +
- xml_content[insert_pos:])
- else:
- # No existing bioghist - wrap agent elements in parent <bioghist> container
- wrapped_content = f'<bioghist>\n{bioghist_content}\n</bioghist>'
- extra_xml += f'\n{wrapped_content}'
-
- if extra_xml:
- xml_content = (xml_content[:did_end_pos] +
- extra_xml +
- xml_content[did_end_pos:])
+
+ # Add arcuit:creator_id attributes (in a custom namespace) to origination name elements
+ # (links creator names in EAD to their corresponding creator records, e.g., in Solr)
+ xml_content = self.xml_transform.add_creator_ids_to_ead(xml_content, resource, indent_size=indent_size)
+
+ # Get record group and subgroup labels
+ rg_label, sg_label = extract_labels(resource)[1:3]
+
+ # Get biographical/historical notes from creator agents
+ bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size)
+
+ # Inject all collection metadata using XmlTransformService
+ xml_content = self.xml_transform.inject_collection_metadata(
+ xml_content,
+ record_group=rg_label,
+ subgroup=sg_label,
+ bioghist_content=bioghist_content
+ )
xml_content = xml_content.encode('utf-8')
else:
@@ -634,7 +614,6 @@ def get_creator_bioghist(self, resource, indent_size=0):
Returns nested bioghist elements for each creator, or None if no creator agents have notes.
Each bioghist element includes the creator name in a head element and an id attribute.
"""
- indent = ' ' * indent_size
bioghist_elements = []
if 'linked_agents' not in resource:
@@ -646,58 +625,16 @@ def get_creator_bioghist(self, resource, indent_size=0):
if linked_agent.get('role') == 'creator':
agent_ref = linked_agent.get('ref')
if agent_ref:
- try:
- agent = self.client.get(agent_ref).json()
-
- # Get agent name for head element
- agent_name = agent.get('title') or agent.get('display_name', {}).get('sort_name', 'Unknown')
-
- # Check for notes in the agent record
- if 'notes' in agent:
- for note in agent['notes']:
- # Look for biographical/historical notes
- if note.get('jsonmodel_type') == 'note_bioghist':
- # Get persistent_id for the id attribute
- persistent_id = note.get('persistent_id', '')
- if not persistent_id:
- self.log.error(f'{indent}**ASSUMPTION VIOLATION**: Expected persistent_id in note_bioghist for agent {agent_ref}')
- # Skip creating id attribute if persistent_id is missing
- persistent_id = None
-
- # Extract note content from subnotes
- paragraphs = []
- if 'subnotes' in note:
- for subnote in note['subnotes']:
- if 'content' in subnote:
- # Split content on single newlines to create paragraphs
- content = subnote['content']
- # Handle content as either string or list with explicit type checking
- if isinstance(content, str):
- # Split on newline and filter out empty strings
- lines = [line.strip() for line in content.split('\n') if line.strip()]
- elif isinstance(content, list):
- # Content is already a list - use as is
- lines = [str(item).strip() for item in content if str(item).strip()]
- else:
- # Log unexpected content type prominently
- self.log.error(f'{indent}**ASSUMPTION VIOLATION**: Expected string or list for subnote content in agent {agent_ref}, got {type(content).__name__}')
- continue
- # Wrap each line in <p> tags
- for line in lines:
- paragraphs.append(f'<p>{line}</p>')
-
- # Create nested bioghist element if we have paragraphs
- if paragraphs:
- paragraphs_xml = '\n'.join(paragraphs)
- heading = f'<head>Historical Note from {xml_escape(agent_name)} Creator Record</head>'
- # Only include id attribute if persistent_id is available
- if persistent_id:
- bioghist_el = f'<bioghist id="aspace_{persistent_id}">{heading}\n{paragraphs_xml}\n</bioghist>'
- else:
- bioghist_el = f'<bioghist>{heading}\n{paragraphs_xml}\n</bioghist>'
- bioghist_elements.append(bioghist_el)
- except Exception as e:
- self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}')
+ bioghist_data = self.agent_service.get_agent_bioghist_data(
+ agent_ref, indent_size=indent_size
+ )
+ if bioghist_data:
+ bioghist_xml = self.xml_transform.build_bioghist_element(
+ bioghist_data['agent_name'],
+ bioghist_data['persistent_id'],
+ bioghist_data['paragraphs']
+ )
+ bioghist_elements.append(bioghist_xml)
if bioghist_elements:
# Return the agent bioghist elements (unwrapped)
@@ -879,14 +816,14 @@ def task_agent(self, agent_uri, agents_dir, repo_id=1, indent_size=0):
eac_cpf_xml = response.text
- # Parse the EAC-CPF XML to validate and inspect its structure
- try:
- root = ET.fromstring(eac_cpf_xml)
- self.log.debug(f'{indent}Parsed EAC-CPF XML root element: {root.tag}')
- except ET.ParseError as e:
- self.log.error(f'{indent}Failed to parse EAC-CPF XML for {agent_uri}: {e}')
+ # Validate EAC-CPF XML structure
+ if not self.xml_transform.validate_eac_cpf_xml(eac_cpf_xml, agent_uri, indent_size=indent_size):
+ self.log.error(f'{indent}Invalid EAC-CPF XML for {agent_uri}, skipping')
return None
+ # Add collection ead_ids to resourceRelation creatorOf elements
+ eac_cpf_xml = self.xml_transform.add_collection_links_to_eac_cpf(eac_cpf_xml, indent_size=indent_size)
+
# Generate creator ID
creator_id = f'creator_{agent_type}_{agent_id}'
diff --git a/arcflow/services/__init__.py b/arcflow/services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/arcflow/services/agent_service.py b/arcflow/services/agent_service.py
new file mode 100644
index 0000000..35e6a16
--- /dev/null
+++ b/arcflow/services/agent_service.py
@@ -0,0 +1,115 @@
+"""
+Service for fetching and processing agent data from ArchivesSpace.
+
+Handles agent-related operations including:
+- Fetching agent biographical/historical notes
+- Processing note content into structured data
+"""
+
+import logging
+from typing import Optional, List, Dict
+
+
+class AgentService:
+ """Service for agent data fetching and processing."""
+
+ def __init__(self, client, log=None):
+ """
+ Initialize the agent service.
+
+ Args:
+ client: ASnake client for fetching agent data
+ log: Logger instance (optional, creates default if not provided)
+ """
+ self.client = client
+ self.log = log or logging.getLogger(__name__)
+
+ def get_agent_bioghist_data(self, agent_uri: str, indent_size: int = 0) -> Optional[Dict]:
+ """
+ Fetch bioghist DATA for an agent.
+
+ Returns structured data (not XML) so it can be used in different contexts:
+ - Build EAD XML for collections
+ - Build EAC-CPF XML for creator records
+ - Display in a web UI
+ - Export as JSON
+
+ Args:
+ agent_uri: Agent URI from ArchivesSpace (e.g., '/agents/corporate_entities/123')
+ indent_size: Indentation size for logging
+
+ Returns:
+ dict with keys: 'agent_name', 'persistent_id', 'paragraphs'
+ or None if no bioghist found or on error
+ """
+ indent = ' ' * indent_size
+
+ try:
+ agent = self.client.get(agent_uri).json()
+ agent_name = agent.get('title') or agent.get('display_name', {}).get('sort_name', 'Unknown')
+
+ for note in agent.get('notes', []):
+ if note.get('jsonmodel_type') == 'note_bioghist':
+ persistent_id = note.get('persistent_id')
+ paragraphs = self._extract_paragraphs(note, agent_uri, indent_size)
+
+ if paragraphs:
+ return {
+ 'agent_name': agent_name,
+ 'persistent_id': persistent_id,
+ 'paragraphs': paragraphs
+ }
+
+ return None # No bioghist
+
+ except Exception as e:
+ self.log.error(f'{indent}Error fetching agent {agent_uri}: {e}')
+ return None
+
+ def _extract_paragraphs(self, note: dict, agent_uri: str, indent_size: int = 0) -> List[str]:
+ """
+ Extract paragraph content from a bioghist note.
+
+ Args:
+ note: Note dictionary from ArchivesSpace
+ agent_uri: Agent URI for logging purposes
+ indent_size: Indentation size for logging
+
+ Returns:
+ List of plain text paragraph strings (not wrapped in <p> tags)
+ """
+ indent = ' ' * indent_size
+ paragraphs = []
+
+ if 'subnotes' in note:
+ for subnote in note['subnotes']:
+ if 'content' in subnote:
+ content = subnote['content']
+
+ # Handle content as either string or list with explicit type checking
+ if isinstance(content, str):
+ # Split on newline and filter out empty strings
+ lines = [line.strip() for line in content.split('\n') if line.strip()]
+ elif isinstance(content, list):
+ # Content is already a list - use as is
+ lines = [str(item).strip() for item in content if str(item).strip()]
+ else:
+ # Log unexpected content type prominently
+ self.log.error(
+ f'{indent}**ASSUMPTION VIOLATION**: Expected string or list for subnote content '
+ f'in agent {agent_uri}, got {type(content).__name__}'
+ )
+ continue
+
+ # Add plain text lines (will be wrapped in <p> tags by build_bioghist_element)
+ for line in lines:
+ paragraphs.append(line)
+
+ # Log if persistent_id is missing
+ if not note.get('persistent_id'):
+ self.log.error(
+ f'{indent}**ASSUMPTION VIOLATION**: Expected persistent_id in note_bioghist '
+ f'for agent {agent_uri}'
+ )
+
+ return paragraphs
diff --git a/arcflow/services/xml_transform_service.py b/arcflow/services/xml_transform_service.py
new file mode 100644
index 0000000..ea2e3fb
--- /dev/null
+++ b/arcflow/services/xml_transform_service.py
@@ -0,0 +1,445 @@
+"""
+Service for transforming and manipulating XML content.
+
+Handles EAD and EAC-CPF XML transformations including:
+- Adding creator IDs to origination elements
+- Injecting collection metadata (record groups, subgroups, bioghist)
+- Adding collection links to EAC-CPF resourceRelation elements
+- Building bioghist XML elements from structured data
+"""
+
+import re
+from typing import Optional, List
+from lxml import etree
+import logging
+
+
+class XmlTransformService:
+ """Service for XML transformations and manipulations."""
+
+ def __init__(self, client=None, log=None):
+ """
+ Initialize the XML transform service.
+
+ Args:
+ client: ASnake client for fetching resources (optional, needed for some operations)
+ log: Logger instance (optional, creates default if not provided)
+ """
+ self.client = client
+ self.log = log or logging.getLogger(__name__)
+
+ def add_creator_ids_to_ead(self, ead: str, resource: dict, indent_size: int = 0) -> str:
+ """
+ Add arcuit:creator_id attributes to name elements inside <origination> elements in EAD XML.
+
+ Uses a custom namespace (xmlns:arcuit="https://arcuit.library.illinois.edu/ead-extensions") to avoid
+ collisions with standard EAD attributes like authfilenumber.
+
+ Maps linked_agents with role='creator' to origination elements by index order.
+ The arcuit:creator_id value is a creator ID in the format creator_{type}_{id}.
+
+ Args:
+ ead: EAD XML as a string
+ resource: ArchivesSpace resource record with resolved linked_agents
+ indent_size: Indentation size for logging
+
+ Returns:
+ str: Modified EAD XML string with arcuit namespace and creator_id attributes
+ """
+ indent = ' ' * indent_size
+
+ # Extract creator IDs from linked_agents in order
+ creator_ids = []
+ for linked_agent in resource.get('linked_agents', []):
+ if linked_agent.get('role') == 'creator':
+ agent_ref = linked_agent.get('ref', '')
+ match = re.match(r'.*/agents/(corporate_entities|people|families)/(\d+)$', agent_ref)
+ if match:
+ creator_ids.append(f'creator_{match.group(1)}_{match.group(2)}')
+ else:
+ self.log.warning(f'{indent}Could not parse creator ID from agent ref: {agent_ref}')
+
+ if not creator_ids:
+ return ead
+
+ try:
+ # Define the Arcuit namespace
+ arcuit_ns = "https://arcuit.library.illinois.edu/ead-extensions"
+
+ # Parse the XML with lxml
+ parser = etree.XMLParser(remove_blank_text=False)
+ root = etree.fromstring(ead.encode('utf-8'), parser)
+ namespace = ''
+ if root.tag.startswith('{'):
+ namespace = root.tag.split('}')[0] + '}'
+
+ # Add arcuit namespace declaration to root element if not present
+ nsmap = root.nsmap.copy() if root.nsmap else {}
+ if 'arcuit' not in nsmap:
+ nsmap['arcuit'] = arcuit_ns
+ # Create a new root element with updated namespace map
+ new_root = etree.Element(root.tag, nsmap=nsmap, attrib=root.attrib)
+ new_root.text = root.text
+ new_root.tail = root.tail
+ for child in root:
+ new_root.append(child)
+ root = new_root
+
+ # Find all origination elements with label="Creator"
+ creator_idx = 0
+ for origination in root.iter(f'{namespace}origination'):
+ if origination.get('label') == 'Creator' and creator_idx < len(creator_ids):
+ creator_id = creator_ids[creator_idx]
+
+ # Find the first name element (corpname, persname, or famname)
+ name_elem = None
+ for tag in ['corpname', 'persname', 'famname']:
+ name_elem = origination.find(f'{namespace}{tag}')
+ if name_elem is not None:
+ break
+
+ if name_elem is not None:
+ # Add the arcuit:creator_id attribute (always, never skip)
+ name_elem.set(f'{{{arcuit_ns}}}creator_id', creator_id)
+ creator_idx += 1
+ else:
+ # No eligible name element found
+ self.log.debug(
+ f'{indent}No eligible name element in <origination> for creator ID {creator_id}'
+ )
+
+ # Convert back to string with lxml, preserving XML declaration and namespaces
+ # Serialize to bytes first (which allows xml_declaration), then decode
+ result_bytes = etree.tostring(
+ root,
+ encoding='UTF-8',
+ method='xml',
+ pretty_print=False,
+ xml_declaration=True
+ )
+ result = result_bytes.decode('utf-8')
+ return result
+
+ except etree.ParseError as e:
+ self.log.error(f'{indent}Failed to parse EAD XML: {e}. Returning original content.')
+ return ead
+
+ def inject_collection_metadata(
+ self,
+ ead: str,
+ record_group: Optional[str],
+ subgroup: Optional[str],
+ bioghist_content: Optional[str]
+ ) -> str:
+ """
+ Inject ArcFlow metadata into collection EAD XML after the </did> tag.
+
+ Adds:
+ - Record group and subgroup classification labels
+ - Biographical/historical notes from creator agents
+
+ Args:
+ ead: EAD XML as a string
+ record_group: Record group label (e.g., "ALA 52 — Library Periodicals")
+ subgroup: Subgroup label (e.g., "ALA 52.2 — Publications")
+ bioghist_content: XML string of bioghist elements to inject
+
+ Returns:
+ str: Modified EAD XML string
+ """
+ try:
+ # Parse the XML with lxml
+ parser = etree.XMLParser(remove_blank_text=False)
+ root = etree.fromstring(ead.encode('utf-8'), parser)
+
+ # Get the namespace, if any
+ namespace = ''
+ if root.tag.startswith('{'):
+ namespace = root.tag.split('}')[0] + '}'
+
+ archdesc = None
+ for elem in root.iter(f'{namespace}archdesc'):
+ if elem.get('level') == 'collection':
+ archdesc = elem
+ break
+
+ if archdesc is None:
+ return ead
+
+ did = archdesc.find(f'{namespace}did')
+ if did is None:
+ return ead
+
+ did_index = list(archdesc).index(did)
+ insert_index = did_index + 1
+
+ if record_group:
+ recordgroup = etree.Element(f'{namespace}recordgroup')
+ recordgroup.text = record_group
+ archdesc.insert(insert_index, recordgroup)
+ insert_index += 1
+
+ if subgroup:
+ subgroup_elem = etree.Element(f'{namespace}subgroup')
+ subgroup_elem.text = subgroup
+ archdesc.insert(insert_index, subgroup_elem)
+ insert_index += 1
+
+ if bioghist_content:
+ existing_bioghist = None
+ for elem in archdesc:
+ if elem.tag == f'{namespace}bioghist':
+ existing_bioghist = elem
+ break
+
+ try:
+ # Wrap in a temporary root to handle multiple bioghist elements
+ bioghist_wrapper = etree.fromstring(f'<wrapper>{bioghist_content}</wrapper>'.encode('utf-8'))
+ bioghist_elements = list(bioghist_wrapper)
+
+ def _qualify_namespace(elem):
+ """
+ Ensure elem and its descendants use the same namespace as the
+ source EAD document when a default namespace is present.
+ """
+ if not namespace:
+ return
+ for child in elem.iter():
+ if isinstance(child.tag, str) and not child.tag.startswith('{'):
+ child.tag = f'{namespace}{child.tag}'
+
+ if existing_bioghist is not None:
+ for bioghist_elem in bioghist_elements:
+ _qualify_namespace(bioghist_elem)
+ existing_bioghist.append(bioghist_elem)
+ else:
+ # No existing bioghist: insert each parsed bioghist element
+ # directly into archdesc to preserve creator-level wrappers
+ # and attributes (e.g., id) returned by get_creator_bioghist.
+ for bioghist_elem in bioghist_elements:
+ _qualify_namespace(bioghist_elem)
+ archdesc.insert(insert_index, bioghist_elem)
+ insert_index += 1
+
+ except etree.ParseError as e:
+ self.log.warning(f'Failed to parse bioghist content: {e}')
+
+ result_bytes = etree.tostring(
+ root,
+ encoding='UTF-8',
+ method='xml',
+ pretty_print=False,
+ xml_declaration=True
+ )
+ result = result_bytes.decode('utf-8')
+ return result
+
+ except etree.ParseError as e:
+ self.log.error(f'Failed to parse EAD XML: {e}. Returning original content.')
+ return ead
+
+ def add_collection_links_to_eac_cpf(self, eac_cpf_xml: str, indent_size: int = 0) -> str:
+ """
+ Add <descriptiveNote><p>ead_id:{ead_id}</p></descriptiveNote> to
+ <resourceRelation resourceRelationType="creatorOf">
+ elements in EAC-CPF XML.
+
+ For each creatorOf resourceRelation, fetches the linked ArchivesSpace resource
+ to obtain its ead_id. If a resource cannot be fetched (deleted, unpublished, etc.),
+ logs a warning and skips that collection link.
+
+ Args:
+ eac_cpf_xml: EAC-CPF XML as a string
+ indent_size: Indentation size for logging
+
+ Returns:
+ str: Modified EAC-CPF XML string
+
+ Raises:
+ ValueError: If client is not configured (required for fetching resources)
+ """
+ if not self.client:
+ raise ValueError("Client is required for add_collection_links_to_eac_cpf operation")
+
+ indent = ' ' * indent_size
+
+ # Save the original XML to return if no changes are made
+ original_xml = eac_cpf_xml
+
+ try:
+ # Parse the XML with lxml, handling potential namespace issues
+ parser = etree.XMLParser(remove_blank_text=False)
+ try:
+ root = etree.fromstring(eac_cpf_xml.encode('utf-8'), parser)
+ except etree.ParseError:
+ # If parsing fails, it might be due to undeclared namespaces
+ # Try to fix by adding namespace declarations
+ if 'xlink:' in eac_cpf_xml and 'xmlns:xlink' not in eac_cpf_xml:
+ # Add xlink namespace declaration to root element
+ eac_cpf_xml = eac_cpf_xml.replace('<eac-cpf', '<eac-cpf xmlns:xlink="http://www.w3.org/1999/xlink"', 1)
+ root = etree.fromstring(eac_cpf_xml.encode('utf-8'), parser)
+
+ # Detect EAC-CPF namespace
+ namespace = ''
+ if root.tag.startswith('{'):
+ namespace = root.tag.split('}')[0] + '}'
+
+ # Track if any changes were made
+ changes_made = False
+
+ # Find all resourceRelation elements with resourceRelationType="creatorOf"
+ for resource_relation in root.iter(f'{namespace}resourceRelation'):
+ if resource_relation.get('resourceRelationType') != 'creatorOf':
+ continue
+
+ # Check if descriptiveNote with ead_id pattern already exists
+ has_ead_id_note = False
+ for desc_note in resource_relation.findall(f'{namespace}descriptiveNote'):
+ for p in desc_note.findall(f'{namespace}p'):
+ if p.text and p.text.startswith('ead_id:'):
+ has_ead_id_note = True
+ break
+ if has_ead_id_note:
+ break
+
+ if has_ead_id_note:
+ # Already has our descriptiveNote, skip
+ continue
+
+ # Extract href attribute - try multiple variations
+ href = None
+ # Try with xlink namespace
+ for attr_key in resource_relation.attrib:
+ if 'href' in attr_key:
+ href = resource_relation.attrib[attr_key]
+ break
+
+ if not href:
+ continue
+
+ # Only process resource URLs (skip digital_objects, etc.)
+ # Pattern: repositories/{number}/resources/{number}
+ uri_match = re.search(r'/repositories/(\d+)/resources/(\d+)', href)
+ if not uri_match:
+ # Not a resource URL (likely digital_object or other type) - skip silently
+ continue
+
+ res_repo_id = uri_match.group(1)
+ res_resource_id = uri_match.group(2)
+
+ # Fetch resource to get ead_id; skip on any error
+ try:
+ response = self.client.get(f'/repositories/{res_repo_id}/resources/{res_resource_id}')
+ if response.status_code != 200:
+ self.log.warning(
+ f'{indent}Could not fetch resource {href}: HTTP {response.status_code}. '
+ 'Skipping collection link.')
+ continue
+
+ resource = response.json()
+ ead_id = resource.get('ead_id')
+ if not ead_id:
+ self.log.warning(
+ f'{indent}Resource /repositories/{res_repo_id}/resources/{res_resource_id} '
+ 'has no ead_id. Skipping collection link.')
+ continue
+
+ # Create descriptiveNote element with ead_id (namespace-aware)
+ descriptive_note = etree.Element(f'{namespace}descriptiveNote')
+ p = etree.SubElement(descriptive_note, f'{namespace}p')
+ p.text = f'ead_id:{ead_id}'
+
+ # Append to resourceRelation
+ resource_relation.append(descriptive_note)
+ changes_made = True
+
+ except Exception as e:
+ self.log.warning(f'{indent}Could not fetch resource for {href}: {e}. Skipping collection link.')
+ continue
+
+ # Only convert back to string if changes were made
+ if changes_made:
+ result_bytes = etree.tostring(
+ root,
+ encoding='UTF-8',
+ method='xml',
+ pretty_print=False,
+ xml_declaration=True
+ )
+ result = result_bytes.decode('utf-8')
+ return result
+ else:
+ # Return original XML (not the potentially modified version with namespace)
+ return original_xml
+
+ except etree.ParseError as e:
+ self.log.error(f'{indent}Failed to parse EAC-CPF XML: {e}. Returning original content.')
+ return original_xml
+
+ def build_bioghist_element(
+ self,
+ agent_name: str,
+ persistent_id: Optional[str],
+ paragraphs: List[str]
+ ) -> str:
+ """
+ Build bioghist XML element from structured data using lxml for proper escaping.
+
+ Args:
+ agent_name: Name of the agent for the head element
+ persistent_id: Persistent ID for the bioghist element (optional)
+ paragraphs: List of plain text paragraph strings (will be wrapped in <p> tags with proper escaping)
+
+ Returns:
+ str: Bioghist XML element as a string
+ """
+ # Create bioghist element
+ bioghist = etree.Element('bioghist')
+
+ # Add id attribute if persistent_id is available
+ if persistent_id:
+ bioghist.set('id', f'aspace_{persistent_id}')
+
+ # Create head element with escaped text
+ head = etree.SubElement(bioghist, 'head')
+ head.text = f'Historical Note from {agent_name} Creator Record'
+
+ # Create <p> elements from plain text paragraphs
+ # lxml automatically handles XML escaping
+ for paragraph_text in paragraphs:
+ p = etree.SubElement(bioghist, 'p')
+ p.text = paragraph_text
+
+ # Convert to string (no XML declaration for fragments)
+ return etree.tostring(bioghist, encoding='unicode', method='xml')
+
+ def validate_eac_cpf_xml(self, eac_cpf_xml: str, agent_uri: str, indent_size: int = 0) -> Optional['etree._Element']:
+ """
+ Parse and validate EAC-CPF XML structure.
+
+ Args:
+ eac_cpf_xml: EAC-CPF XML as a string
+ agent_uri: Agent URI for logging purposes
+ indent_size: Indentation size for logging
+
+ Returns:
+ lxml Element if valid, None if parsing fails
+ """
+ indent = ' ' * indent_size
+
+ try:
+ # Try to parse with lxml, with fallback for missing xlink namespace
+ parser = etree.XMLParser(remove_blank_text=False)
+ try:
+ root = etree.fromstring(eac_cpf_xml.encode('utf-8'), parser)
+ except etree.ParseError:
+ # If parsing fails, it might be due to undeclared namespaces
+ if 'xlink:' in eac_cpf_xml and 'xmlns:xlink' not in eac_cpf_xml:
+ # Add xlink namespace declaration to root element
+ eac_cpf_xml = eac_cpf_xml.replace('<eac-cpf', '<eac-cpf xmlns:xlink="http://www.w3.org/1999/xlink"', 1)
+ root = etree.fromstring(eac_cpf_xml.encode('utf-8'), parser)
+
+ self.log.debug(f'{indent}Parsed EAC-CPF XML root element: {root.tag}')
+ return root
+ except etree.ParseError as e:
+ self.log.error(f'{indent}Failed to parse EAC-CPF XML for {agent_uri}: {e}')
+ return None
\ No newline at end of file
diff --git a/example_traject_config_eac_cpf.rb b/example_traject_config_eac_cpf.rb
index 177da4f..7b804f3 100644
--- a/example_traject_config_eac_cpf.rb
+++ b/example_traject_config_eac_cpf.rb
@@ -203,6 +203,14 @@
end
end
+# Related Agents - Parallel array of names to match relationship ids, uris and type
+to_field 'related_agent_names_ssim' do |record, accumulator|
+ relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation/eac:relationEntry', EAC_NS)
+ relations.each do |rel|
+ accumulator << rel.text
+ end
+end
+
# Related Agents - Parallel array of relationship types to match relationship ids and uris
to_field 'related_agent_relationship_types_ssim' do |record, accumulator|
relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS)
@@ -224,6 +232,66 @@
end
end
+# Collections this creator is responsible for - EAD IDs injected by arcflow
+# into <resourceRelation resourceRelationType="creatorOf"> elements as:
+#   <descriptiveNote><p>ead_id:{ead_id}</p></descriptiveNote>
+# Indexed as an array of EAD IDs (e.g., ["ALA.9.5.16"]) for bidirectional
+# creator↔collection linking in Solr.
+to_field 'creator_of_collection__collection_ids_ssim' do |record, accumulator|
+ relations = record.xpath(
+ '//eac:cpfDescription/eac:relations/eac:resourceRelation[@resourceRelationType="creatorOf"]',
+ EAC_NS
+ )
+ relations.each do |rel|
+ note = rel.xpath('eac:descriptiveNote/eac:p', EAC_NS).first
+ if note && note.text =~ /\Aead_id:(.+)\z/
+ accumulator << $1.strip
+ end
+ end
+end
+
+to_field 'creator_of_collection__collection_name_ssim' do |record, accumulator|
+ relations = record.xpath(
+ '//eac:cpfDescription/eac:relations/eac:resourceRelation[@resourceRelationType="creatorOf"]',
+ EAC_NS
+ )
+ relations.each do |rel|
+ note = rel.xpath('eac:descriptiveNote/eac:p', EAC_NS).first
+ if note && note.text =~ /\Aead_id:(.+)\z/
+ name = rel.xpath('eac:relationEntry', EAC_NS)
+ accumulator << name.text
+ end
+ end
+end
+
+
+to_field 'creator_of_digital_object__do_ids_ssim' do |record, accumulator|
+ relations = record.xpath(
+ '//eac:cpfDescription/eac:relations/eac:resourceRelation[@resourceRelationType="creatorOf"]',
+ EAC_NS
+ )
+ relations.each do |rel|
+ href = rel['href'] || rel['xlink:href']
+ if href.include? "digital_object"
+ accumulator << href
+ end
+ end
+end
+
+to_field 'subject_of_digital_object__do_ids_ssim' do |record, accumulator|
+ relations = record.xpath(
+ '//eac:cpfDescription/eac:relations/eac:resourceRelation[@resourceRelationType="subjectOf"]',
+ EAC_NS
+ )
+ relations.each do |rel|
+ href = rel['href'] || rel['xlink:href']
+ if href.include? "digital_object"
+ accumulator << href
+ end
+ end
+end
+
+
# Agent source URI (from original ArchivesSpace)
to_field 'agent_uri_ssi' do |record, accumulator|
# Try to extract from control section or otherRecordId
@@ -238,11 +306,6 @@
accumulator << Time.now.utc.iso8601
end
-# # Document type marker
-# to_field 'document_type' do |record, accumulator|
-# accumulator << 'creator'
-# end
-
# Log successful indexing
each_record do |record, context|
record_id = record.xpath('//eac:control/eac:recordId', EAC_NS).first
diff --git a/example_traject_config_ead_extra.rb b/example_traject_config_ead_extra.rb
new file mode 100644
index 0000000..8ad70a7
--- /dev/null
+++ b/example_traject_config_ead_extra.rb
@@ -0,0 +1,66 @@
+# Example Traject extra config for EAD collection indexing.
+# You can copy this file into Arclight (or a theme you have modifying Arclight,
+# e.g., Arcuit):
+# {arclight_dir}/lib/arcuit/traject/ead_extra_config.rb
+#
+# Any additional Traject commands you add to this file will be added to collection
+# records in Arclight.
+#
+# This file shows the fields that arcflow injects into EAD XML to support:
+# 1. Record group and sub-group categories
+# 2. Solr ID for the creator records also created by arcflow
+#
+# GROUP + SUB-GROUP
+# Arcflow adds <recordgroup> and <subgroup> elements directly after </did>:
+#   <recordgroup>ALA 52 — Library Periodicals Round Table</recordgroup>
+#   <subgroup>ALA 52.2 — Publications</subgroup>
+#
+# CREATOR RECORDS
+# Arcflow adds arcuit:creator_id attributes to origination name elements
+# using a custom namespace to avoid collisions with existing authfilenumber values:
+#   <ead xmlns:arcuit="https://arcuit.library.illinois.edu/ead-extensions">
+#     <archdesc level="collection"><did>
+#       <origination label="Creator">
+#         <corpname arcuit:creator_id="creator_corporate_entities_123">
+#           ALA Allied Professional Association
+#         </corpname>
+#       </origination>
+#     </did></archdesc>
+#   </ead>
+
+# Creator ArcLight IDs - extracted from arcuit:creator_id attributes on origination
+# name elements (<corpname>, <persname>, <famname>) injected by arcflow.
+# Uses custom namespace xmlns:arcuit="https://arcuit.library.illinois.edu/ead-extensions"
+# Indexed as an array of creator IDs (e.g., ["creator_corporate_entities_123"])
+# for bidirectional creator↔collection linking in Solr.
+to_field 'creator_arclight_ids_ssim' do |record, accumulator|
+ # Define namespace
+ arcuit_ns = {'arcuit' => 'https://arcuit.library.illinois.edu/ead-extensions',
+ 'ead' => 'urn:isbn:1-931666-22-9'}
+
+ # Extract arcuit:creator_id from origination name elements
+ record.xpath('//ead:archdesc/ead:did/ead:origination/ead:corpname[@arcuit:creator_id] |
+ //ead:archdesc/ead:did/ead:origination/ead:persname[@arcuit:creator_id] |
+ //ead:archdesc/ead:did/ead:origination/ead:famname[@arcuit:creator_id]',
+ arcuit_ns).each do |node|
+ accumulator << node['arcuit:creator_id']
+ end
+
+ # Also check without EAD namespace (some ASpace EAD exports omit it)
+ if accumulator.empty?
+ record.xpath('//archdesc/did/origination/corpname[@arcuit:creator_id] |
+ //archdesc/did/origination/persname[@arcuit:creator_id] |
+ //archdesc/did/origination/famname[@arcuit:creator_id]',
+ arcuit_ns).each do |node|
+ accumulator << node['arcuit:creator_id']
+ end
+ end
+end
+
+# Record group and sub-group - extracted from recordgroup and subgroup elements
+# injected by Arcflow into EAD documents created by ArchivesSpace
+to_field 'record_group_ssim', extract_xpath('/ead/archdesc/recordgroup')
+to_field 'subgroup_ssim', extract_xpath('/ead/archdesc/subgroup')
diff --git a/requirements.txt b/requirements.txt
index 6efbe65..84174a0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
ArchivesSnake
-pyyaml
\ No newline at end of file
+pyyaml
+lxml
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_agent_service.py b/tests/test_agent_service.py
new file mode 100644
index 0000000..fde3792
--- /dev/null
+++ b/tests/test_agent_service.py
@@ -0,0 +1,257 @@
+"""
+Tests for AgentService.
+"""
+
+import unittest
+from unittest.mock import Mock
+from arcflow.services.agent_service import AgentService
+
+
+class TestAgentService(unittest.TestCase):
+ """Test cases for AgentService."""
+
+ def setUp(self):
+ """Set up test fixtures."""
+ self.mock_client = Mock()
+ self.mock_log = Mock()
+ self.service = AgentService(client=self.mock_client, log=self.mock_log)
+
+ def test_get_agent_bioghist_data_success(self):
+ """Test successfully fetching agent bioghist data."""
+ # Mock agent response
+ mock_response = Mock()
+ mock_response.json.return_value = {
+ 'title': 'Test Agent',
+ 'notes': [
+ {
+ 'jsonmodel_type': 'note_bioghist',
+ 'persistent_id': 'abc123',
+ 'subnotes': [
+ {'content': 'First paragraph.\nSecond paragraph.'}
+ ]
+ }
+ ]
+ }
+ self.mock_client.get.return_value = mock_response
+
+ result = self.service.get_agent_bioghist_data('/agents/corporate_entities/123')
+
+ self.assertIsNotNone(result)
+ self.assertEqual(result['agent_name'], 'Test Agent')
+ self.assertEqual(result['persistent_id'], 'abc123')
+ self.assertEqual(len(result['paragraphs']), 2)
+ self.assertIn('First paragraph.', result['paragraphs'])
+ self.assertIn('Second paragraph.', result['paragraphs'])
+
+ def test_get_agent_bioghist_data_no_bioghist(self):
+ """Test fetching agent with no bioghist notes."""
+ mock_response = Mock()
+ mock_response.json.return_value = {
+ 'title': 'Test Agent',
+ 'notes': []
+ }
+ self.mock_client.get.return_value = mock_response
+
+ result = self.service.get_agent_bioghist_data('/agents/corporate_entities/123')
+
+ self.assertIsNone(result)
+
+ def test_get_agent_bioghist_data_with_list_content(self):
+ """Test handling subnote content as a list."""
+ mock_response = Mock()
+ mock_response.json.return_value = {
+ 'title': 'Test Agent',
+ 'notes': [
+ {
+ 'jsonmodel_type': 'note_bioghist',
+ 'persistent_id': 'xyz789',
+ 'subnotes': [
+ {'content': ['First item', 'Second item']}
+ ]
+ }
+ ]
+ }
+ self.mock_client.get.return_value = mock_response
+
+ result = self.service.get_agent_bioghist_data('/agents/people/456')
+
+ self.assertIsNotNone(result)
+ self.assertEqual(len(result['paragraphs']), 2)
+ self.assertIn('First item', result['paragraphs'])
+ self.assertIn('Second item', result['paragraphs'])
+
+ def test_get_agent_bioghist_data_filters_empty_lines(self):
+ """Test that empty lines are filtered out."""
+ mock_response = Mock()
+ mock_response.json.return_value = {
+ 'title': 'Test Agent',
+ 'notes': [
+ {
+ 'jsonmodel_type': 'note_bioghist',
+ 'persistent_id': 'def456',
+ 'subnotes': [
+ {'content': 'Line 1\n\n\nLine 2\n \nLine 3'}
+ ]
+ }
+ ]
+ }
+ self.mock_client.get.return_value = mock_response
+
+ result = self.service.get_agent_bioghist_data('/agents/families/789')
+
+ self.assertIsNotNone(result)
+ self.assertEqual(len(result['paragraphs']), 3)
+ self.assertIn('Line 1', result['paragraphs'])
+ self.assertIn('Line 2', result['paragraphs'])
+ self.assertIn('Line 3', result['paragraphs'])
+
+ def test_get_agent_bioghist_data_missing_persistent_id(self):
+ """Test handling bioghist note without persistent_id."""
+ mock_response = Mock()
+ mock_response.json.return_value = {
+ 'title': 'Test Agent',
+ 'notes': [
+ {
+ 'jsonmodel_type': 'note_bioghist',
+ # No persistent_id
+ 'subnotes': [
+ {'content': 'Some content'}
+ ]
+ }
+ ]
+ }
+ self.mock_client.get.return_value = mock_response
+
+ result = self.service.get_agent_bioghist_data('/agents/corporate_entities/999')
+
+ self.assertIsNotNone(result)
+ self.assertIsNone(result['persistent_id'])
+ # Should log error about missing persistent_id
+ self.mock_log.error.assert_called()
+ error_call = str(self.mock_log.error.call_args)
+ self.assertIn('ASSUMPTION VIOLATION', error_call)
+ self.assertIn('persistent_id', error_call)
+
+ def test_get_agent_bioghist_data_invalid_content_type(self):
+ """Test handling unexpected content type."""
+ mock_response = Mock()
+ mock_response.json.return_value = {
+ 'title': 'Test Agent',
+ 'notes': [
+ {
+ 'jsonmodel_type': 'note_bioghist',
+ 'persistent_id': 'ghi123',
+ 'subnotes': [
+ {'content': {'unexpected': 'dict'}} # Invalid type
+ ]
+ }
+ ]
+ }
+ self.mock_client.get.return_value = mock_response
+
+ result = self.service.get_agent_bioghist_data('/agents/corporate_entities/111')
+
+ # Should return None when no valid paragraphs are extracted
+ self.assertIsNone(result)
+ # Should log error about unexpected type
+ self.mock_log.error.assert_called()
+ error_calls = [str(call) for call in self.mock_log.error.call_args_list]
+ error_text = ''.join(error_calls)
+ self.assertIn('ASSUMPTION VIOLATION', error_text)
+ self.assertIn('dict', error_text)
+
+ def test_get_agent_bioghist_data_uses_display_name_fallback(self):
+ """Test using display_name.sort_name when title is missing."""
+ mock_response = Mock()
+ mock_response.json.return_value = {
+ # No 'title' field
+ 'display_name': {'sort_name': 'Fallback Name'},
+ 'notes': [
+ {
+ 'jsonmodel_type': 'note_bioghist',
+ 'persistent_id': 'jkl456',
+ 'subnotes': [
+ {'content': 'Some content'}
+ ]
+ }
+ ]
+ }
+ self.mock_client.get.return_value = mock_response
+
+ result = self.service.get_agent_bioghist_data('/agents/people/222')
+
+ self.assertIsNotNone(result)
+ self.assertEqual(result['agent_name'], 'Fallback Name')
+
+ def test_get_agent_bioghist_data_handles_exception(self):
+ """Test handling exceptions during agent fetch."""
+ self.mock_client.get.side_effect = Exception('Network error')
+
+ result = self.service.get_agent_bioghist_data('/agents/corporate_entities/333')
+
+ self.assertIsNone(result)
+ self.mock_log.error.assert_called()
+ error_call = str(self.mock_log.error.call_args)
+ self.assertIn('Network error', error_call)
+
+ def test_get_agent_bioghist_data_multiple_subnotes(self):
+ """Test handling multiple subnotes in a bioghist note."""
+ mock_response = Mock()
+ mock_response.json.return_value = {
+ 'title': 'Test Agent',
+ 'notes': [
+ {
+ 'jsonmodel_type': 'note_bioghist',
+ 'persistent_id': 'mno789',
+ 'subnotes': [
+ {'content': 'First subnote'},
+ {'content': 'Second subnote'},
+ {'content': 'Third subnote'}
+ ]
+ }
+ ]
+ }
+ self.mock_client.get.return_value = mock_response
+
+ result = self.service.get_agent_bioghist_data('/agents/families/444')
+
+ self.assertIsNotNone(result)
+ self.assertEqual(len(result['paragraphs']), 3)
+ self.assertIn('First subnote', result['paragraphs'])
+ self.assertIn('Second subnote', result['paragraphs'])
+ self.assertIn('Third subnote', result['paragraphs'])
+
+ def test_get_agent_bioghist_data_returns_first_bioghist_only(self):
+ """Test that only the first bioghist note is returned."""
+ mock_response = Mock()
+ mock_response.json.return_value = {
+ 'title': 'Test Agent',
+ 'notes': [
+ {
+ 'jsonmodel_type': 'note_bioghist',
+ 'persistent_id': 'first123',
+ 'subnotes': [
+ {'content': 'First bioghist'}
+ ]
+ },
+ {
+ 'jsonmodel_type': 'note_bioghist',
+ 'persistent_id': 'second456',
+ 'subnotes': [
+ {'content': 'Second bioghist'}
+ ]
+ }
+ ]
+ }
+ self.mock_client.get.return_value = mock_response
+
+ result = self.service.get_agent_bioghist_data('/agents/corporate_entities/555')
+
+ self.assertIsNotNone(result)
+ self.assertEqual(result['persistent_id'], 'first123')
+ self.assertIn('First bioghist', result['paragraphs'])
+ self.assertNotIn('Second bioghist', result['paragraphs'])
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_xml_transform_service.py b/tests/test_xml_transform_service.py
new file mode 100644
index 0000000..b49ec10
--- /dev/null
+++ b/tests/test_xml_transform_service.py
@@ -0,0 +1,506 @@
+"""
+Tests for XmlTransformService.
+"""
+
+import unittest
+from unittest.mock import Mock
+from arcflow.services.xml_transform_service import XmlTransformService
+
+# Real ArchivesSpace EAD fixture with namespace
+REAL_EAD_WITH_NAMESPACE = '''<?xml version="1.0" encoding="UTF-8"?>
+<ead xmlns="urn:isbn:1-931666-22-9">
+  <eadheader>
+    <eadid>test-collection</eadid>
+  </eadheader>
+  <archdesc level="collection">
+    <did>
+      <unittitle>Test Collection with Namespace</unittitle>
+      <origination label="creator">
+        <corpname>Test Corporation</corpname>
+      </origination>
+    </did>
+  </archdesc>
+</ead>'''
+
+# Real EAC-CPF fixture with namespace
+REAL_EAC_CPF_WITH_NAMESPACE = '''<?xml version="1.0" encoding="UTF-8"?>
+<eac-cpf xmlns="urn:isbn:1-931666-33-4" xmlns:xlink="http://www.w3.org/1999/xlink">
+  <control>
+    <recordId>test-agent</recordId>
+  </control>
+  <cpfDescription>
+    <relations>
+      <resourceRelation resourceRelationType="creatorOf" xlink:href="/repositories/2/resources/123">
+        <relationEntry>Test Collection</relationEntry>
+      </resourceRelation>
+    </relations>
+  </cpfDescription>
+</eac-cpf>'''
+
+class TestXmlTransformService(unittest.TestCase):
+ """Test cases for XmlTransformService."""
+
+ def setUp(self):
+ """Set up test fixtures."""
+ self.mock_client = Mock()
+ self.mock_log = Mock()
+ self.service = XmlTransformService(client=self.mock_client, log=self.mock_log)
+
+ def test_add_creator_ids_to_ead(self):
+ """Test adding arcuit:creator_id attributes to origination elements."""
+
+ resource = {
+ 'linked_agents': [
+ {'role': 'creator', 'ref': '/agents/corporate_entities/123'}
+ ]
+ }
+
+ result = self.service.add_creator_ids_to_ead(REAL_EAD_WITH_NAMESPACE, resource)
+
+ # Should contain arcuit namespace declaration
+ self.assertIn('xmlns:arcuit', result)
+ self.assertIn('https://arcuit.library.illinois.edu/ead-extensions', result)
+ # Should contain the creator_id attribute
+ self.assertIn('creator_id="creator_corporate_entities_123"', result)
+ # Should preserve EAD namespace
+ self.assertIn('urn:isbn:1-931666-22-9', result)
+ # Should still find and modify the corpname element
+ self.assertIn('corpname', result)
+
+ def test_add_creator_ids_multiple_creators(self):
+ """Test adding arcuit:creator_id to multiple origination elements."""
+        xml_content = '''<ead><archdesc><did>
+            <origination label="creator">
+                <corpname>First Corp</corpname>
+            </origination>
+            <origination label="creator">
+                <persname>Second Person</persname>
+            </origination>
+        </did></archdesc></ead>'''
+
+ resource = {
+ 'linked_agents': [
+ {'role': 'creator', 'ref': '/agents/corporate_entities/123'},
+ {'role': 'creator', 'ref': '/agents/people/456'}
+ ]
+ }
+
+ result = self.service.add_creator_ids_to_ead(xml_content, resource)
+
+ self.assertIn('creator_id="creator_corporate_entities_123"', result)
+ self.assertIn('creator_id="creator_people_456"', result)
+ self.assertIn('xmlns:arcuit', result)
+
+ def test_add_creator_ids_no_creators(self):
+ """Test that XML is unchanged when there are no creators."""
+        xml_content = '<ead><archdesc><did><unittitle>Test</unittitle></did></archdesc></ead>'
+ resource = {'linked_agents': []}
+
+ result = self.service.add_creator_ids_to_ead(xml_content, resource)
+
+ self.assertEqual(xml_content, result)
+
+ def test_inject_collection_metadata_with_all_fields(self):
+ """Test injecting record group, subgroup, and bioghist."""
+        xml_content = '''<?xml version="1.0" encoding="UTF-8"?>
+<ead xmlns="urn:isbn:1-931666-22-9">
+    <archdesc level="collection">
+        <did>
+            <unittitle>Test Collection</unittitle>
+        </did>
+    </archdesc>
+</ead>'''
+
+ result = self.service.inject_collection_metadata(
+ xml_content,
+ record_group='RG 1 — Test Group',
+ subgroup='SG 1.1 — Test Subgroup',
+            bioghist_content='<p>Test bioghist</p>'
+ )
+
+ # Should add recordgroup with namespace
+ self.assertIn('recordgroup', result)
+ self.assertIn('RG 1 — Test Group', result)
+ # Should add subgroup with namespace
+ self.assertIn('subgroup', result)
+ self.assertIn('SG 1.1 — Test Subgroup', result)
+ # Should add bioghist with EAD namespace
+ self.assertIn('bioghist', result)
+ self.assertIn('Test bioghist', result)
+ # Should preserve original namespace
+ self.assertIn('xmlns', result)
+ self.assertIn('urn:isbn:1-931666-22-9', result)
+
+ def test_inject_collection_metadata_into_existing_bioghist(self):
+ """Test that bioghist content is inserted into existing bioghist element."""
+        xml_content = '''<ead xmlns="urn:isbn:1-931666-22-9">
+    <archdesc level="collection">
+        <did>
+            <unittitle>Test Collection</unittitle>
+        </did>
+        <bioghist>
+            <p>Existing content</p>
+        </bioghist>
+    </archdesc>
+</ead>'''
+
+ result = self.service.inject_collection_metadata(
+ xml_content,
+ record_group=None,
+ subgroup=None,
+            bioghist_content='<p>New content</p>'
+ )
+
+        # Should insert new content before the closing </bioghist> tag
+        self.assertIn('Existing content', result)
+        self.assertIn('New content', result)
+        # Should not create a new bioghist wrapper
+        self.assertEqual(result.count('<p>'), 2)  # Original + inserted
+
+ def test_inject_collection_metadata_xml_escaping(self):
+ """Test that special XML characters are properly escaped."""
+        xml_content = '''<ead xmlns="urn:isbn:1-931666-22-9">
+    <archdesc level="collection">
+        <did>
+            <unittitle>Test</unittitle>
+        </did>
+    </archdesc>
+</ead>'''
+
+        result = self.service.inject_collection_metadata(
+            xml_content,
+            record_group='Group & Co <test>',
+            subgroup=None,
+            bioghist_content=None
+        )
+
+        self.assertIn('Group &amp; Co &lt;test&gt;', result)
+        self.assertNotIn('Group & Co <test>', result)
+
+ def test_add_collection_links_to_eac_cpf(self):
+ """Test adding ead_id descriptiveNote to resourceRelation elements."""
+
+ # Mock the client response
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.json.return_value = {'ead_id': 'TEST.1.2.3'}
+ self.mock_client.get.return_value = mock_response
+
+ result = self.service.add_collection_links_to_eac_cpf(REAL_EAC_CPF_WITH_NAMESPACE)
+
+ # Should add descriptiveNote (namespace-aware check)
+ self.assertIn('descriptiveNote', result)
+ self.assertIn('ead_id:TEST.1.2.3', result)
+ # Should preserve EAC-CPF namespace
+ self.assertIn('urn:isbn:1-931666-33-4', result)
+
+ def test_multiple_creators_with_namespace(self):
+ """Test handling multiple creators when EAD has default namespace."""
+        xml_with_namespace = '''<?xml version="1.0" encoding="UTF-8"?>
+<ead xmlns="urn:isbn:1-931666-22-9">
+    <archdesc level="collection">
+        <did>
+            <origination label="creator">
+                <corpname>First Corp</corpname>
+            </origination>
+            <origination label="creator">
+                <persname>Second Person</persname>
+            </origination>
+        </did>
+    </archdesc>
+</ead>'''
+
+ resource = {
+ 'linked_agents': [
+ {'role': 'creator', 'ref': '/agents/corporate_entities/123'},
+ {'role': 'creator', 'ref': '/agents/people/456'}
+ ]
+ }
+
+ result = self.service.add_creator_ids_to_ead(xml_with_namespace, resource)
+
+ # Should add both creator IDs
+ self.assertIn('creator_id="creator_corporate_entities_123"', result)
+ self.assertIn('creator_id="creator_people_456"', result)
+ # Should preserve namespace
+ self.assertIn('urn:isbn:1-931666-22-9', result)
+
+ def test_add_collection_links_idempotent(self):
+ """Test that adding collection links is idempotent."""
+        eac_cpf_xml = '''<eac-cpf xmlns="urn:isbn:1-931666-33-4" xmlns:xlink="http://www.w3.org/1999/xlink">
+            <resourceRelation resourceRelationType="creatorOf" xlink:href="/repositories/2/resources/123">
+                <relationEntry>Test Collection</relationEntry>
+                <descriptiveNote>
+                    <p>ead_id:TEST.1.2.3</p>
+                </descriptiveNote>
+            </resourceRelation>
+</eac-cpf>'''
+
+ result = self.service.add_collection_links_to_eac_cpf(eac_cpf_xml)
+
+ # Should not call the client since descriptiveNote already exists
+ self.mock_client.get.assert_not_called()
+ # Should return unchanged XML
+ self.assertEqual(eac_cpf_xml, result)
+
+ def test_add_collection_links_skips_digital_objects(self):
+ """Test that digital object URLs are skipped silently."""
+        eac_cpf_xml = '''<eac-cpf xmlns="urn:isbn:1-931666-33-4" xmlns:xlink="http://www.w3.org/1999/xlink">
+            <resourceRelation resourceRelationType="creatorOf" xlink:href="/repositories/2/digital_objects/1">
+                <relationEntry>Test Digital Object</relationEntry>
+            </resourceRelation>
+</eac-cpf>'''
+
+ result = self.service.add_collection_links_to_eac_cpf(eac_cpf_xml)
+
+ # Should not call the client
+ self.mock_client.get.assert_not_called()
+ # Should return unchanged XML
+ self.assertEqual(eac_cpf_xml, result)
+
+ def test_add_collection_links_handles_fetch_errors(self):
+ """Test that fetch errors are handled gracefully."""
+        eac_cpf_xml = '''<eac-cpf xmlns="urn:isbn:1-931666-33-4" xmlns:xlink="http://www.w3.org/1999/xlink">
+            <resourceRelation resourceRelationType="creatorOf" xlink:href="/repositories/2/resources/999">
+                <relationEntry>Test Collection</relationEntry>
+            </resourceRelation>
+</eac-cpf>'''
+
+ # Mock a 404 response
+ mock_response = Mock()
+ mock_response.status_code = 404
+ self.mock_client.get.return_value = mock_response
+
+ result = self.service.add_collection_links_to_eac_cpf(eac_cpf_xml)
+
+ # Should log a warning
+ self.mock_log.warning.assert_called()
+ # Should return unchanged XML
+        self.assertNotIn('<descriptiveNote>', result)
+
+ def test_build_bioghist_element(self):
+ """Test building bioghist XML element from structured data."""
+ result = self.service.build_bioghist_element(
+ agent_name='Test Agent',
+ persistent_id='abc123',
+ paragraphs=['First paragraph', 'Second paragraph']
+ )
+
+        self.assertIn('<bioghist id="abc123">', result)
+        self.assertIn('Historical Note from Test Agent Creator Record', result)
+        self.assertIn('<p>First paragraph</p>', result)
+        self.assertIn('<p>Second paragraph</p>', result)
+        self.assertIn('</bioghist>', result)
+
+ def test_build_bioghist_element_without_persistent_id(self):
+ """Test building bioghist without persistent_id."""
+ result = self.service.build_bioghist_element(
+ agent_name='Test Agent',
+ persistent_id=None,
+ paragraphs=['Content']
+ )
+
+        self.assertIn('<bioghist>', result)
+        self.assertNotIn('id=', result)
+        self.assertIn('<p>Content</p>', result)
+
+ def test_build_bioghist_element_escapes_agent_name(self):
+ """Test that agent name is properly XML-escaped."""
+        result = self.service.build_bioghist_element(
+            agent_name='Agent & Co <test>',
+            persistent_id='abc',
+            paragraphs=['Content']
+        )
+
+        self.assertIn('Agent &amp; Co &lt;test&gt;', result)
+
+ def test_build_bioghist_element_escapes_paragraph_content(self):
+ """Test that paragraph content with special XML characters is properly escaped."""
+        result = self.service.build_bioghist_element(
+            agent_name='Test Agent',
+            persistent_id='abc',
+            paragraphs=['Content with & ampersand', 'Content with <tags> and "quotes"']
+        )
+
+        self.assertIn('<p>Content with &amp; ampersand</p>', result)
+        self.assertIn('<p>Content with &lt;tags&gt; and "quotes"</p>', result)
+
+ def test_validate_eac_cpf_xml_valid(self):
+ """Test validating valid EAC-CPF XML."""
+        eac_cpf_xml = '<eac-cpf></eac-cpf>'
+
+ root = self.service.validate_eac_cpf_xml(eac_cpf_xml, '/agents/corporate_entities/123')
+
+ self.assertIsNotNone(root)
+ self.assertEqual(root.tag, 'eac-cpf')
+
+ def test_validate_eac_cpf_xml_invalid(self):
+ """Test validating invalid EAC-CPF XML."""
+        eac_cpf_xml = '<eac-cpf><control>' # Missing closing tags
+
+ root = self.service.validate_eac_cpf_xml(eac_cpf_xml, '/agents/corporate_entities/123')
+
+ self.assertIsNone(root)
+ self.mock_log.error.assert_called()
+
+ def test_add_collection_links_requires_client(self):
+ """Test that add_collection_links_to_eac_cpf requires a client."""
+ service_no_client = XmlTransformService(client=None)
+
+ with self.assertRaises(ValueError) as context:
+ service_no_client.add_collection_links_to_eac_cpf('')
+
+ self.assertIn('Client is required', str(context.exception))
+
+ def test_namespace_preservation_ead_with_declaration(self):
+ """Test that EAD namespace prefixes and XML declaration are preserved."""
+        xml_input = '''<?xml version="1.0" encoding="UTF-8"?>
+<ead xmlns="urn:isbn:1-931666-22-9">
+    <eadheader>
+        <eadid>test-collection</eadid>
+    </eadheader>
+    <archdesc level="collection">
+        <did>
+            <unittitle>Test Collection</unittitle>
+            <origination label="creator">
+                <corpname>Test Corporation</corpname>
+            </origination>
+        </did>
+    </archdesc>
+</ead>'''
+
+ resource = {
+ 'linked_agents': [
+ {'role': 'creator', 'ref': '/agents/corporate_entities/123'}
+ ]
+ }
+
+ result = self.service.add_creator_ids_to_ead(xml_input, resource)
+
+        # Should have XML declaration
+        self.assertTrue(result.startswith('<?xml'))
+
+    def test_namespace_preservation_eac_cpf_with_declaration(self):
+        """Test that EAC-CPF namespaces and XML declaration are preserved."""
+        xml_input = '''<?xml version="1.0" encoding="UTF-8"?>
+<eac-cpf xmlns="urn:isbn:1-931666-33-4" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <control>
+        <recordId>test-agent</recordId>
+    </control>
+    <cpfDescription>
+        <relations>
+            <resourceRelation resourceRelationType="creatorOf" xlink:href="/repositories/2/resources/1">
+                <relationEntry>Test Collection</relationEntry>
+            </resourceRelation>
+        </relations>
+    </cpfDescription>
+</eac-cpf>'''
+
+ # Mock the client response
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.json.return_value = {'ead_id': 'TEST.1.2.3'}
+ self.mock_client.get.return_value = mock_response
+
+ result = self.service.add_collection_links_to_eac_cpf(xml_input)
+
+        # Should have XML declaration
+        self.assertTrue(result.startswith('<?xml'))
+
+    def test_inject_collection_metadata_preserves_declaration(self):
+        """Test that the XML declaration survives metadata injection."""
+        xml_input = '''<?xml version="1.0" encoding="UTF-8"?>
+<ead xmlns="urn:isbn:1-931666-22-9">
+    <eadheader>
+        <eadid>test-collection</eadid>
+    </eadheader>
+    <archdesc level="collection">
+        <did>
+            <unittitle>Test Collection</unittitle>
+        </did>
+    </archdesc>
+</ead>'''
+
+        bioghist_content = '''<bioghist>
+  <head>Historical Note from Test Agent Creator Record</head>
+  <p>Test paragraph</p>
+</bioghist>'''
+
+ result = self.service.inject_collection_metadata(
+ xml_input,
+ record_group="Test Group",
+ subgroup="Test Subgroup",
+ bioghist_content=bioghist_content
+ )
+
+        # Should have XML declaration
+        self.assertTrue(result.startswith('<?xml'))
+        self.assertIn('<recordgroup>', result)
+        self.assertIn('<subgroup>', result)
+        self.assertIn('<bioghist>', result)
+
+    def test_no_declaration_added_when_original_lacks_one(self):
+        """Test that no XML declaration is invented for input that had none."""
+        xml_input = '''<eac-cpf xmlns="urn:isbn:1-931666-33-4">
+    <control>
+        <recordId>test-agent</recordId>
+    </control>
+</eac-cpf>'''
+
+ # No changes will be made (no resourceRelations)
+ result = self.service.add_collection_links_to_eac_cpf(xml_input)
+
+ # Should not add XML declaration when original didn't have one and no changes made
+ self.assertEqual(xml_input, result, 'Unchanged XML should be returned as-is')
+        self.assertFalse(result.startswith('<?xml'))