diff --git a/arcflow/__init__.py b/arcflow/__init__.py index f80bba7..7856a3d 100644 --- a/arcflow/__init__.py +++ b/arcflow/__init__.py @@ -1 +1,15 @@ -from .main import ArcFlow \ No newline at end of file +""" +ArcFlow package for syncing ArchivesSpace to ArcLight. + +To use ArcFlow, import directly from the main module: + from arcflow.main import ArcFlow + +Services can be imported independently: + from arcflow.services.xml_transform_service import XmlTransformService + from arcflow.services.agent_service import AgentService + +The top-level import is disabled to avoid eager loading of dependencies. +""" + +# Avoid eager imports to allow services to be imported independently +# from .main import ArcFlow \ No newline at end of file diff --git a/arcflow/main.py b/arcflow/main.py index 430539a..acac689 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -19,6 +19,8 @@ from asnake.client import ASnakeClient from multiprocessing.pool import ThreadPool as Pool from utils.stage_classifications import extract_labels +from .services.xml_transform_service import XmlTransformService +from .services.agent_service import AgentService import glob base_dir = os.path.abspath((__file__) + "/../../") @@ -115,6 +117,10 @@ def __init__(self, arclight_dir, aspace_dir, solr_url, aspace_solr_url, ead_extr self.log.error(f'Error authorizing ASnakeClient: {e}') exit(0) + # Initialize services + self.xml_transform = XmlTransformService(client=self.client, log=self.log) + self.agent_service = AgentService(client=self.client, log=self.log) + def is_running(self): """ @@ -262,50 +268,24 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): # (record group/subgroup labels and biographical/historical notes) if xml.content: xml_content = xml.content.decode('utf-8') - insert_pos = xml_content.find('') - - if insert_pos != -1: - # Find the position after the closing tag - did_end_pos = xml_content.find('', insert_pos) - - if did_end_pos != -1: - # Move to after the 
tag - did_end_pos += len('') - extra_xml = '' - - # Add record group and subgroup labels - rg_label, sg_label = extract_labels(resource)[1:3] - if rg_label: - extra_xml += f'\n{xml_escape(rg_label)}' - if sg_label: - extra_xml += f'\n{xml_escape(sg_label)}' - - # Handle biographical/historical notes from creator agents - bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size) - if bioghist_content: - # Check if there's already a bioghist element in the EAD - # Search for existing bioghist after but before - archdesc_end = xml_content.find('', did_end_pos) - search_section = xml_content[did_end_pos:archdesc_end] if archdesc_end != -1 else xml_content[did_end_pos:] - - # Look for closing tag - existing_bioghist_end = search_section.rfind('') - - if existing_bioghist_end != -1: - # Found existing bioghist - insert agent elements INSIDE it (before closing tag) - insert_pos = did_end_pos + existing_bioghist_end - xml_content = (xml_content[:insert_pos] + - f'\n{bioghist_content}\n' + - xml_content[insert_pos:]) - else: - # No existing bioghist - wrap agent elements in parent container - wrapped_content = f'\n{bioghist_content}\n' - extra_xml += f'\n{wrapped_content}' - - if extra_xml: - xml_content = (xml_content[:did_end_pos] + - extra_xml + - xml_content[did_end_pos:]) + + # Add arcuit:creator_id attributes (in a custom namespace) to origination name elements + # (links creator names in EAD to their corresponding creator records, e.g., in Solr) + xml_content = self.xml_transform.add_creator_ids_to_ead(xml_content, resource, indent_size=indent_size) + + # Get record group and subgroup labels + rg_label, sg_label = extract_labels(resource)[1:3] + + # Get biographical/historical notes from creator agents + bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size) + + # Inject all collection metadata using XmlTransformService + xml_content = self.xml_transform.inject_collection_metadata( + xml_content, + record_group=rg_label, 
+ subgroup=sg_label, + bioghist_content=bioghist_content + ) xml_content = xml_content.encode('utf-8') else: @@ -634,7 +614,6 @@ def get_creator_bioghist(self, resource, indent_size=0): Returns nested bioghist elements for each creator, or None if no creator agents have notes. Each bioghist element includes the creator name in a head element and an id attribute. """ - indent = ' ' * indent_size bioghist_elements = [] if 'linked_agents' not in resource: @@ -646,58 +625,16 @@ def get_creator_bioghist(self, resource, indent_size=0): if linked_agent.get('role') == 'creator': agent_ref = linked_agent.get('ref') if agent_ref: - try: - agent = self.client.get(agent_ref).json() - - # Get agent name for head element - agent_name = agent.get('title') or agent.get('display_name', {}).get('sort_name', 'Unknown') - - # Check for notes in the agent record - if 'notes' in agent: - for note in agent['notes']: - # Look for biographical/historical notes - if note.get('jsonmodel_type') == 'note_bioghist': - # Get persistent_id for the id attribute - persistent_id = note.get('persistent_id', '') - if not persistent_id: - self.log.error(f'{indent}**ASSUMPTION VIOLATION**: Expected persistent_id in note_bioghist for agent {agent_ref}') - # Skip creating id attribute if persistent_id is missing - persistent_id = None - - # Extract note content from subnotes - paragraphs = [] - if 'subnotes' in note: - for subnote in note['subnotes']: - if 'content' in subnote: - # Split content on single newlines to create paragraphs - content = subnote['content'] - # Handle content as either string or list with explicit type checking - if isinstance(content, str): - # Split on newline and filter out empty strings - lines = [line.strip() for line in content.split('\n') if line.strip()] - elif isinstance(content, list): - # Content is already a list - use as is - lines = [str(item).strip() for item in content if str(item).strip()] - else: - # Log unexpected content type prominently - 
self.log.error(f'{indent}**ASSUMPTION VIOLATION**: Expected string or list for subnote content in agent {agent_ref}, got {type(content).__name__}') - continue - # Wrap each line in
<p></p>
tags - for line in lines: - paragraphs.append(f'
<p>
{line}
</p>
') - - # Create nested bioghist element if we have paragraphs - if paragraphs: - paragraphs_xml = '\n'.join(paragraphs) - heading = f'Historical Note from {xml_escape(agent_name)} Creator Record' - # Only include id attribute if persistent_id is available - if persistent_id: - bioghist_el = f'{heading}\n{paragraphs_xml}\n' - else: - bioghist_el = f'{heading}\n{paragraphs_xml}\n' - bioghist_elements.append(bioghist_el) - except Exception as e: - self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}') + bioghist_data = self.agent_service.get_agent_bioghist_data( + agent_ref, indent_size=indent_size + ) + if bioghist_data: + bioghist_xml = self.xml_transform.build_bioghist_element( + bioghist_data['agent_name'], + bioghist_data['persistent_id'], + bioghist_data['paragraphs'] + ) + bioghist_elements.append(bioghist_xml) if bioghist_elements: # Return the agent bioghist elements (unwrapped) @@ -879,14 +816,14 @@ def task_agent(self, agent_uri, agents_dir, repo_id=1, indent_size=0): eac_cpf_xml = response.text - # Parse the EAC-CPF XML to validate and inspect its structure - try: - root = ET.fromstring(eac_cpf_xml) - self.log.debug(f'{indent}Parsed EAC-CPF XML root element: {root.tag}') - except ET.ParseError as e: - self.log.error(f'{indent}Failed to parse EAC-CPF XML for {agent_uri}: {e}') + # Validate EAC-CPF XML structure + if not self.xml_transform.validate_eac_cpf_xml(eac_cpf_xml, agent_uri, indent_size=indent_size): + self.log.error(f'{indent}Invalid EAC-CPF XML for {agent_uri}, skipping') return None + # Add collection ead_ids to resourceRelation creatorOf elements + eac_cpf_xml = self.xml_transform.add_collection_links_to_eac_cpf(eac_cpf_xml, indent_size=indent_size) + # Generate creator ID creator_id = f'creator_{agent_type}_{agent_id}' diff --git a/arcflow/services/__init__.py b/arcflow/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/arcflow/services/agent_service.py 
b/arcflow/services/agent_service.py new file mode 100644 index 0000000..35e6a16 --- /dev/null +++ b/arcflow/services/agent_service.py @@ -0,0 +1,115 @@ +""" +Service for fetching and processing agent data from ArchivesSpace. + +Handles agent-related operations including: +- Fetching agent biographical/historical notes +- Processing note content into structured data +""" + +import logging +from typing import Optional, List, Dict + + +class AgentService: + """Service for agent data fetching and processing.""" + + def __init__(self, client, log=None): + """ + Initialize the agent service. + + Args: + client: ASnake client for fetching agent data + log: Logger instance (optional, creates default if not provided) + """ + self.client = client + self.log = log or logging.getLogger(__name__) + + def get_agent_bioghist_data(self, agent_uri: str, indent_size: int = 0) -> Optional[Dict]: + """ + Fetch bioghist DATA for an agent. + + Returns structured data (not XML) so it can be used in different contexts: + - Build EAD XML for collections + - Build EAC-CPF XML for creator records + - Display in a web UI + - Export as JSON + + Args: + agent_uri: Agent URI from ArchivesSpace (e.g., '/agents/corporate_entities/123') + indent_size: Indentation size for logging + + Returns: + dict with keys: 'agent_name', 'persistent_id', 'paragraphs' + or None if no bioghist found or on error + """ + indent = ' ' * indent_size + + try: + agent = self.client.get(agent_uri).json() + agent_name = agent.get('title') or agent.get('display_name', {}).get('sort_name', 'Unknown') + + for note in agent.get('notes', []): + if note.get('jsonmodel_type') == 'note_bioghist': + persistent_id = note.get('persistent_id') + paragraphs = self._extract_paragraphs(note, agent_uri, indent_size) + + if paragraphs: + return { + 'agent_name': agent_name, + 'persistent_id': persistent_id, + 'paragraphs': paragraphs + } + + return None # No bioghist + + except Exception as e: + self.log.error(f'{indent}Error fetching 
agent {agent_uri}: {e}') + return None + + def _extract_paragraphs(self, note: dict, agent_uri: str, indent_size: int = 0) -> List[str]: + """ + Extract paragraph content from a bioghist note. + + Args: + note: Note dictionary from ArchivesSpace + agent_uri: Agent URI for logging purposes + indent_size: Indentation size for logging + + Returns: + List of plain text paragraph strings (not wrapped in
<p></p>
tags) + """ + indent = ' ' * indent_size + paragraphs = [] + + if 'subnotes' in note: + for subnote in note['subnotes']: + if 'content' in subnote: + content = subnote['content'] + + # Handle content as either string or list with explicit type checking + if isinstance(content, str): + # Split on newline and filter out empty strings + lines = [line.strip() for line in content.split('\n') if line.strip()] + elif isinstance(content, list): + # Content is already a list - use as is + lines = [str(item).strip() for item in content if str(item).strip()] + else: + # Log unexpected content type prominently + self.log.error( + f'{indent}**ASSUMPTION VIOLATION**: Expected string or list for subnote content ' + f'in agent {agent_uri}, got {type(content).__name__}' + ) + continue + + # Add plain text lines (will be wrapped in
<p></p>
tags by build_bioghist_element) + for line in lines: + paragraphs.append(line) + + # Log if persistent_id is missing + if not note.get('persistent_id'): + self.log.error( + f'{indent}**ASSUMPTION VIOLATION**: Expected persistent_id in note_bioghist ' + f'for agent {agent_uri}' + ) + + return paragraphs diff --git a/arcflow/services/xml_transform_service.py b/arcflow/services/xml_transform_service.py new file mode 100644 index 0000000..ea2e3fb --- /dev/null +++ b/arcflow/services/xml_transform_service.py @@ -0,0 +1,445 @@ +""" +Service for transforming and manipulating XML content. + +Handles EAD and EAC-CPF XML transformations including: +- Adding creator IDs to origination elements +- Injecting collection metadata (record groups, subgroups, bioghist) +- Adding collection links to EAC-CPF resourceRelation elements +- Building bioghist XML elements from structured data +""" + +import re +from typing import Optional, List +from lxml import etree +import logging + + +class XmlTransformService: + """Service for XML transformations and manipulations.""" + + def __init__(self, client=None, log=None): + """ + Initialize the XML transform service. + + Args: + client: ASnake client for fetching resources (optional, needed for some operations) + log: Logger instance (optional, creates default if not provided) + """ + self.client = client + self.log = log or logging.getLogger(__name__) + + def add_creator_ids_to_ead(self, ead: str, resource: dict, indent_size: int = 0) -> str: + """ + Add arcuit:creator_id attributes to name elements inside elements in EAD XML. + + Uses a custom namespace (xmlns:arcuit="https://arcuit.library.illinois.edu/ead-extensions") to avoid + collisions with standard EAD attributes like authfilenumber. + + Maps linked_agents with role='creator' to origination elements by index order. + The arcuit:creator_id value is a creator ID in the format creator_{type}_{id}. 
+ + Args: + ead: EAD XML as a string + resource: ArchivesSpace resource record with resolved linked_agents + indent_size: Indentation size for logging + + Returns: + str: Modified EAD XML string with arcuit namespace and creator_id attributes + """ + indent = ' ' * indent_size + + # Extract creator IDs from linked_agents in order + creator_ids = [] + for linked_agent in resource.get('linked_agents', []): + if linked_agent.get('role') == 'creator': + agent_ref = linked_agent.get('ref', '') + match = re.match(r'.*/agents/(corporate_entities|people|families)/(\d+)$', agent_ref) + if match: + creator_ids.append(f'creator_{match.group(1)}_{match.group(2)}') + else: + self.log.warning(f'{indent}Could not parse creator ID from agent ref: {agent_ref}') + + if not creator_ids: + return ead + + try: + # Define the Arcuit namespace + arcuit_ns = "https://arcuit.library.illinois.edu/ead-extensions" + + # Parse the XML with lxml + parser = etree.XMLParser(remove_blank_text=False) + root = etree.fromstring(ead.encode('utf-8'), parser) + namespace = '' + if root.tag.startswith('{'): + namespace = root.tag.split('}')[0] + '}' + + # Add arcuit namespace declaration to root element if not present + nsmap = root.nsmap.copy() if root.nsmap else {} + if 'arcuit' not in nsmap: + nsmap['arcuit'] = arcuit_ns + # Create a new root element with updated namespace map + new_root = etree.Element(root.tag, nsmap=nsmap, attrib=root.attrib) + new_root.text = root.text + new_root.tail = root.tail + for child in root: + new_root.append(child) + root = new_root + + # Find all origination elements with label="Creator" + creator_idx = 0 + for origination in root.iter(f'{namespace}origination'): + if origination.get('label') == 'Creator' and creator_idx < len(creator_ids): + creator_id = creator_ids[creator_idx] + + # Find the first name element (corpname, persname, or famname) + name_elem = None + for tag in ['corpname', 'persname', 'famname']: + name_elem = origination.find(f'{namespace}{tag}') + if 
name_elem is not None: + break + + if name_elem is not None: + # Add the arcuit:creator_id attribute (always, never skip) + name_elem.set(f'{{{arcuit_ns}}}creator_id', creator_id) + creator_idx += 1 + else: + # No eligible name element found + self.log.debug( + f'{indent}No eligible name element in for creator ID {creator_id}' + ) + + # Convert back to string with lxml, preserving XML declaration and namespaces + # Serialize to bytes first (which allows xml_declaration), then decode + result_bytes = etree.tostring( + root, + encoding='UTF-8', + method='xml', + pretty_print=False, + xml_declaration=True + ) + result = result_bytes.decode('utf-8') + return result + + except etree.ParseError as e: + self.log.error(f'{indent}Failed to parse EAD XML: {e}. Returning original content.') + return ead + + def inject_collection_metadata( + self, + ead: str, + record_group: Optional[str], + subgroup: Optional[str], + bioghist_content: Optional[str] + ) -> str: + """ + Inject ArcFlow metadata into collection EAD XML after tag. 
+ + Adds: + - Record group and subgroup classification labels + - Biographical/historical notes from creator agents + + Args: + ead: EAD XML as a string + record_group: Record group label (e.g., "ALA 52 — Library Periodicals") + subgroup: Subgroup label (e.g., "ALA 52.2 — Publications") + bioghist_content: XML string of bioghist elements to inject + + Returns: + str: Modified EAD XML string + """ + try: + # Parse the XML with lxml + parser = etree.XMLParser(remove_blank_text=False) + root = etree.fromstring(ead.encode('utf-8'), parser) + + # Get the namespace, if any + namespace = '' + if root.tag.startswith('{'): + namespace = root.tag.split('}')[0] + '}' + + archdesc = None + for elem in root.iter(f'{namespace}archdesc'): + if elem.get('level') == 'collection': + archdesc = elem + break + + if archdesc is None: + return ead + + did = archdesc.find(f'{namespace}did') + if did is None: + return ead + + did_index = list(archdesc).index(did) + insert_index = did_index + 1 + + if record_group: + recordgroup = etree.Element(f'{namespace}recordgroup') + recordgroup.text = record_group + archdesc.insert(insert_index, recordgroup) + insert_index += 1 + + if subgroup: + subgroup_elem = etree.Element(f'{namespace}subgroup') + subgroup_elem.text = subgroup + archdesc.insert(insert_index, subgroup_elem) + insert_index += 1 + + if bioghist_content: + existing_bioghist = None + for elem in archdesc: + if elem.tag == f'{namespace}bioghist': + existing_bioghist = elem + break + + try: + # Wrap in a temporary root to handle multiple bioghist elements + bioghist_wrapper = etree.fromstring(f'{bioghist_content}'.encode('utf-8')) + bioghist_elements = list(bioghist_wrapper) + + def _qualify_namespace(elem): + """ + Ensure elem and its descendants use the same namespace as the + source EAD document when a default namespace is present. 
+ """ + if not namespace: + return + for child in elem.iter(): + if isinstance(child.tag, str) and not child.tag.startswith('{'): + child.tag = f'{namespace}{child.tag}' + + if existing_bioghist is not None: + for bioghist_elem in bioghist_elements: + _qualify_namespace(bioghist_elem) + existing_bioghist.append(bioghist_elem) + else: + # No existing bioghist: insert each parsed bioghist element + # directly into archdesc to preserve creator-level wrappers + # and attributes (e.g., id) returned by get_creator_bioghist. + for bioghist_elem in bioghist_elements: + _qualify_namespace(bioghist_elem) + archdesc.insert(insert_index, bioghist_elem) + insert_index += 1 + + except etree.ParseError as e: + self.log.warning(f'Failed to parse bioghist content: {e}') + + result_bytes = etree.tostring( + root, + encoding='UTF-8', + method='xml', + pretty_print=False, + xml_declaration=True + ) + result = result_bytes.decode('utf-8') + return result + + except etree.ParseError as e: + self.log.error(f'Failed to parse EAD XML: {e}. Returning original content.') + return ead + + def add_collection_links_to_eac_cpf(self, eac_cpf_xml: str, indent_size: int = 0) -> str: + """ + Add
<descriptiveNote><p>
ead_id:{ead_id}
</p></descriptiveNote>
to + elements in EAC-CPF XML. + + For each creatorOf resourceRelation, fetches the linked ArchivesSpace resource + to obtain its ead_id. If a resource cannot be fetched (deleted, unpublished, etc.), + logs a warning and skips that collection link. + + Args: + eac_cpf_xml: EAC-CPF XML as a string + indent_size: Indentation size for logging + + Returns: + str: Modified EAC-CPF XML string + + Raises: + ValueError: If client is not configured (required for fetching resources) + """ + if not self.client: + raise ValueError("Client is required for add_collection_links_to_eac_cpf operation") + + indent = ' ' * indent_size + + # Save the original XML to return if no changes are made + original_xml = eac_cpf_xml + + try: + # Parse the XML with lxml, handling potential namespace issues + parser = etree.XMLParser(remove_blank_text=False) + try: + root = etree.fromstring(eac_cpf_xml.encode('utf-8'), parser) + except etree.ParseError: + # If parsing fails, it might be due to undeclared namespaces + # Try to fix by adding namespace declarations + if 'xlink:' in eac_cpf_xml and 'xmlns:xlink' not in eac_cpf_xml: + # Add xlink namespace declaration to root element + eac_cpf_xml = eac_cpf_xml.replace('', '', 1) + root = etree.fromstring(eac_cpf_xml.encode('utf-8'), parser) + + # Detect EAC-CPF namespace + namespace = '' + if root.tag.startswith('{'): + namespace = root.tag.split('}')[0] + '}' + + # Track if any changes were made + changes_made = False + + # Find all resourceRelation elements with resourceRelationType="creatorOf" + for resource_relation in root.iter(f'{namespace}resourceRelation'): + if resource_relation.get('resourceRelationType') != 'creatorOf': + continue + + # Check if descriptiveNote with ead_id pattern already exists + has_ead_id_note = False + for desc_note in resource_relation.findall(f'{namespace}descriptiveNote'): + for p in desc_note.findall(f'{namespace}p'): + if p.text and p.text.startswith('ead_id:'): + has_ead_id_note = True + break + if 
has_ead_id_note: + break + + if has_ead_id_note: + # Already has our descriptiveNote, skip + continue + + # Extract href attribute - try multiple variations + href = None + # Try with xlink namespace + for attr_key in resource_relation.attrib: + if 'href' in attr_key: + href = resource_relation.attrib[attr_key] + break + + if not href: + continue + + # Only process resource URLs (skip digital_objects, etc.) + # Pattern: repositories/{number}/resources/{number} + uri_match = re.search(r'/repositories/(\d+)/resources/(\d+)', href) + if not uri_match: + # Not a resource URL (likely digital_object or other type) - skip silently + continue + + res_repo_id = uri_match.group(1) + res_resource_id = uri_match.group(2) + + # Fetch resource to get ead_id; skip on any error + try: + response = self.client.get(f'/repositories/{res_repo_id}/resources/{res_resource_id}') + if response.status_code != 200: + self.log.warning( + f'{indent}Could not fetch resource {href}: HTTP {response.status_code}. ' + 'Skipping collection link.') + continue + + resource = response.json() + ead_id = resource.get('ead_id') + if not ead_id: + self.log.warning( + f'{indent}Resource /repositories/{res_repo_id}/resources/{res_resource_id} ' + 'has no ead_id. Skipping collection link.') + continue + + # Create descriptiveNote element with ead_id (namespace-aware) + descriptive_note = etree.Element(f'{namespace}descriptiveNote') + p = etree.SubElement(descriptive_note, f'{namespace}p') + p.text = f'ead_id:{ead_id}' + + # Append to resourceRelation + resource_relation.append(descriptive_note) + changes_made = True + + except Exception as e: + self.log.warning(f'{indent}Could not fetch resource for {href}: {e}. 
Skipping collection link.') + continue + + # Only convert back to string if changes were made + if changes_made: + result_bytes = etree.tostring( + root, + encoding='UTF-8', + method='xml', + pretty_print=False, + xml_declaration=True + ) + result = result_bytes.decode('utf-8') + return result + else: + # Return original XML (not the potentially modified version with namespace) + return original_xml + + except etree.ParseError as e: + self.log.error(f'{indent}Failed to parse EAC-CPF XML: {e}. Returning original content.') + return original_xml + + def build_bioghist_element( + self, + agent_name: str, + persistent_id: Optional[str], + paragraphs: List[str] + ) -> str: + """ + Build bioghist XML element from structured data using lxml for proper escaping. + + Args: + agent_name: Name of the agent for the head element + persistent_id: Persistent ID for the bioghist element (optional) + paragraphs: List of plain text paragraph strings (will be wrapped in
<p></p>
tags with proper escaping) + + Returns: + str: Bioghist XML element as a string + """ + # Create bioghist element + bioghist = etree.Element('bioghist') + + # Add id attribute if persistent_id is available + if persistent_id: + bioghist.set('id', f'aspace_{persistent_id}') + + # Create head element with escaped text + head = etree.SubElement(bioghist, 'head') + head.text = f'Historical Note from {agent_name} Creator Record' + + # Create
<p>
elements from plain text paragraphs + # lxml automatically handles XML escaping + for paragraph_text in paragraphs: + p = etree.SubElement(bioghist, 'p') + p.text = paragraph_text + + # Convert to string (no XML declaration for fragments) + return etree.tostring(bioghist, encoding='unicode', method='xml') + + def validate_eac_cpf_xml(self, eac_cpf_xml: str, agent_uri: str, indent_size: int = 0) -> Optional['etree._Element']: + """ + Parse and validate EAC-CPF XML structure. + + Args: + eac_cpf_xml: EAC-CPF XML as a string + agent_uri: Agent URI for logging purposes + indent_size: Indentation size for logging + + Returns: + lxml Element if valid, None if parsing fails + """ + indent = ' ' * indent_size + + try: + # Try to parse with lxml, with fallback for missing xlink namespace + parser = etree.XMLParser(remove_blank_text=False) + try: + root = etree.fromstring(eac_cpf_xml.encode('utf-8'), parser) + except etree.ParseError: + # If parsing fails, it might be due to undeclared namespaces + if 'xlink:' in eac_cpf_xml and 'xmlns:xlink' not in eac_cpf_xml: + # Add xlink namespace declaration to root element + eac_cpf_xml = eac_cpf_xml.replace('', '', 1) + root = etree.fromstring(eac_cpf_xml.encode('utf-8'), parser) + + self.log.debug(f'{indent}Parsed EAC-CPF XML root element: {root.tag}') + return root + except etree.ParseError as e: + self.log.error(f'{indent}Failed to parse EAC-CPF XML for {agent_uri}: {e}') + return None \ No newline at end of file diff --git a/example_traject_config_eac_cpf.rb b/example_traject_config_eac_cpf.rb index 177da4f..7b804f3 100644 --- a/example_traject_config_eac_cpf.rb +++ b/example_traject_config_eac_cpf.rb @@ -203,6 +203,14 @@ end end +# Related Agents - Parallel array of names to match relationship ids, uris and type +to_field 'related_agent_names_ssim' do |record, accumulator| + relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation/eac:relationEntry', EAC_NS) + relations.each do |rel| + accumulator << rel.text 
+ end +end + # Related Agents - Parallel array of relationship types to match relationship ids and uris to_field 'related_agent_relationship_types_ssim' do |record, accumulator| relations = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS) @@ -224,6 +232,66 @@ end end +# Collections this creator is responsible for - EAD IDs injected by arcflow +# into elements as: +#
<descriptiveNote><p>
ead_id:{ead_id}
</p></descriptiveNote>
+# Indexed as an array of EAD IDs (e.g., ["ALA.9.5.16"]) for bidirectional +# creator↔collection linking in Solr. +to_field 'creator_of_collection__collection_ids_ssim' do |record, accumulator| + relations = record.xpath( + '//eac:cpfDescription/eac:relations/eac:resourceRelation[@resourceRelationType="creatorOf"]', + EAC_NS + ) + relations.each do |rel| + note = rel.xpath('eac:descriptiveNote/eac:p', EAC_NS).first + if note && note.text =~ /\Aead_id:(.+)\z/ + accumulator << $1.strip + end + end +end + +to_field 'creator_of_collection__collection_name_ssim' do |record, accumulator| + relations = record.xpath( + '//eac:cpfDescription/eac:relations/eac:resourceRelation[@resourceRelationType="creatorOf"]', + EAC_NS + ) + relations.each do |rel| + note = rel.xpath('eac:descriptiveNote/eac:p', EAC_NS).first + if note && note.text =~ /\Aead_id:(.+)\z/ + name = rel.xpath('eac:relationEntry', EAC_NS) + accumulator << name.text + end + end +end + + +to_field 'creator_of_digital_object__do_ids_ssim' do |record, accumulator| + relations = record.xpath( + '//eac:cpfDescription/eac:relations/eac:resourceRelation[@resourceRelationType="creatorOf"]', + EAC_NS + ) + relations.each do |rel| + href = rel['href'] || rel['xlink:href'] + if href.include? "digital_object" + accumulator << href + end + end +end + +to_field 'subject_of_digital_object__do_ids_ssim' do |record, accumulator| + relations = record.xpath( + '//eac:cpfDescription/eac:relations/eac:resourceRelation[@resourceRelationType="subjectOf"]', + EAC_NS + ) + relations.each do |rel| + href = rel['href'] || rel['xlink:href'] + if href.include? 
"digital_object" + accumulator << href + end + end +end + + # Agent source URI (from original ArchivesSpace) to_field 'agent_uri_ssi' do |record, accumulator| # Try to extract from control section or otherRecordId @@ -238,11 +306,6 @@ accumulator << Time.now.utc.iso8601 end -# # Document type marker -# to_field 'document_type' do |record, accumulator| -# accumulator << 'creator' -# end - # Log successful indexing each_record do |record, context| record_id = record.xpath('//eac:control/eac:recordId', EAC_NS).first diff --git a/example_traject_config_ead_extra.rb b/example_traject_config_ead_extra.rb new file mode 100644 index 0000000..8ad70a7 --- /dev/null +++ b/example_traject_config_ead_extra.rb @@ -0,0 +1,66 @@ +# Example Traject extra config for EAD collection indexing. +# You can copy this file into Arclight (or a theme you have modifying Arclight, +# e.g., Arcuit): +# {arclight_dir}/lib/arcuit/traject/ead_extra_config.rb +# +# Any additional Traject commands you add to this file will be added to collection +# records in Arclight. +# +# This file shows the fields that arcflow injects into EAD XML to support: +# 1. Record group and sub-group categories +# 2. Solr ID for the creator records also created by arcflow +# +# GROUP + SUB-GROUP +# Arcflow adds and elements directly after +# ALA 52 — Library Periodicals Round Table +# ALA 52.2 — Publications +# +# CREATOR RECORDS +# Arcflow adds arcuit:creator_id attributes to origination name elements +# using a custom namespace to avoid collisions with existing authfilenumber values: +# +# +# +# ALA Allied Professional Association +# +# +# + +# + +# Creator ArcLight IDs - extracted from arcuit:creator_id attributes on origination +# name elements (, , ) injected by arcflow. +# Uses custom namespace xmlns:arcuit="https://arcuit.library.illinois.edu/ead-extensions" +# Indexed as an array of creator IDs (e.g., ["creator_corporate_entities_123"]) +# for bidirectional creator↔collection linking in Solr. 
+to_field 'creator_arclight_ids_ssim' do |record, accumulator| + # Define namespace + arcuit_ns = {'arcuit' => 'https://arcuit.library.illinois.edu/ead-extensions', + 'ead' => 'urn:isbn:1-931666-22-9'} + + # Extract arcuit:creator_id from origination name elements + record.xpath('//ead:archdesc/ead:did/ead:origination/ead:corpname[@arcuit:creator_id] | + //ead:archdesc/ead:did/ead:origination/ead:persname[@arcuit:creator_id] | + //ead:archdesc/ead:did/ead:origination/ead:famname[@arcuit:creator_id]', + arcuit_ns).each do |node| + accumulator << node['arcuit:creator_id'] + end + + # Also check without EAD namespace (some ASpace EAD exports omit it) + if accumulator.empty? + record.xpath('//archdesc/did/origination/corpname[@arcuit:creator_id] | + //archdesc/did/origination/persname[@arcuit:creator_id] | + //archdesc/did/origination/famname[@arcuit:creator_id]', + arcuit_ns).each do |node| + accumulator << node['arcuit:creator_id'] + end + end +end + +# Record group and sub-group - extracted from recordgroup and subgroup elements +# injected by Arcflow into EAD documents created by ArchivesSpace +to_field 'record_group_ssim', extract_xpath('/ead/archdesc/recordgroup') +to_field 'subgroup_ssim', extract_xpath('/ead/archdesc/subgroup') diff --git a/requirements.txt b/requirements.txt index 6efbe65..84174a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ ArchivesSnake -pyyaml \ No newline at end of file +pyyaml +lxml \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_agent_service.py b/tests/test_agent_service.py new file mode 100644 index 0000000..fde3792 --- /dev/null +++ b/tests/test_agent_service.py @@ -0,0 +1,257 @@ +""" +Tests for AgentService. 
+""" + +import unittest +from unittest.mock import Mock +from arcflow.services.agent_service import AgentService + + +class TestAgentService(unittest.TestCase): + """Test cases for AgentService.""" + + def setUp(self): + """Set up test fixtures.""" + self.mock_client = Mock() + self.mock_log = Mock() + self.service = AgentService(client=self.mock_client, log=self.mock_log) + + def test_get_agent_bioghist_data_success(self): + """Test successfully fetching agent bioghist data.""" + # Mock agent response + mock_response = Mock() + mock_response.json.return_value = { + 'title': 'Test Agent', + 'notes': [ + { + 'jsonmodel_type': 'note_bioghist', + 'persistent_id': 'abc123', + 'subnotes': [ + {'content': 'First paragraph.\nSecond paragraph.'} + ] + } + ] + } + self.mock_client.get.return_value = mock_response + + result = self.service.get_agent_bioghist_data('/agents/corporate_entities/123') + + self.assertIsNotNone(result) + self.assertEqual(result['agent_name'], 'Test Agent') + self.assertEqual(result['persistent_id'], 'abc123') + self.assertEqual(len(result['paragraphs']), 2) + self.assertIn('First paragraph.', result['paragraphs']) + self.assertIn('Second paragraph.', result['paragraphs']) + + def test_get_agent_bioghist_data_no_bioghist(self): + """Test fetching agent with no bioghist notes.""" + mock_response = Mock() + mock_response.json.return_value = { + 'title': 'Test Agent', + 'notes': [] + } + self.mock_client.get.return_value = mock_response + + result = self.service.get_agent_bioghist_data('/agents/corporate_entities/123') + + self.assertIsNone(result) + + def test_get_agent_bioghist_data_with_list_content(self): + """Test handling subnote content as a list.""" + mock_response = Mock() + mock_response.json.return_value = { + 'title': 'Test Agent', + 'notes': [ + { + 'jsonmodel_type': 'note_bioghist', + 'persistent_id': 'xyz789', + 'subnotes': [ + {'content': ['First item', 'Second item']} + ] + } + ] + } + self.mock_client.get.return_value = mock_response 
+ + result = self.service.get_agent_bioghist_data('/agents/people/456') + + self.assertIsNotNone(result) + self.assertEqual(len(result['paragraphs']), 2) + self.assertIn('First item', result['paragraphs']) + self.assertIn('Second item', result['paragraphs']) + + def test_get_agent_bioghist_data_filters_empty_lines(self): + """Test that empty lines are filtered out.""" + mock_response = Mock() + mock_response.json.return_value = { + 'title': 'Test Agent', + 'notes': [ + { + 'jsonmodel_type': 'note_bioghist', + 'persistent_id': 'def456', + 'subnotes': [ + {'content': 'Line 1\n\n\nLine 2\n \nLine 3'} + ] + } + ] + } + self.mock_client.get.return_value = mock_response + + result = self.service.get_agent_bioghist_data('/agents/families/789') + + self.assertIsNotNone(result) + self.assertEqual(len(result['paragraphs']), 3) + self.assertIn('Line 1', result['paragraphs']) + self.assertIn('Line 2', result['paragraphs']) + self.assertIn('Line 3', result['paragraphs']) + + def test_get_agent_bioghist_data_missing_persistent_id(self): + """Test handling bioghist note without persistent_id.""" + mock_response = Mock() + mock_response.json.return_value = { + 'title': 'Test Agent', + 'notes': [ + { + 'jsonmodel_type': 'note_bioghist', + # No persistent_id + 'subnotes': [ + {'content': 'Some content'} + ] + } + ] + } + self.mock_client.get.return_value = mock_response + + result = self.service.get_agent_bioghist_data('/agents/corporate_entities/999') + + self.assertIsNotNone(result) + self.assertIsNone(result['persistent_id']) + # Should log error about missing persistent_id + self.mock_log.error.assert_called() + error_call = str(self.mock_log.error.call_args) + self.assertIn('ASSUMPTION VIOLATION', error_call) + self.assertIn('persistent_id', error_call) + + def test_get_agent_bioghist_data_invalid_content_type(self): + """Test handling unexpected content type.""" + mock_response = Mock() + mock_response.json.return_value = { + 'title': 'Test Agent', + 'notes': [ + { + 
'jsonmodel_type': 'note_bioghist', + 'persistent_id': 'ghi123', + 'subnotes': [ + {'content': {'unexpected': 'dict'}} # Invalid type + ] + } + ] + } + self.mock_client.get.return_value = mock_response + + result = self.service.get_agent_bioghist_data('/agents/corporate_entities/111') + + # Should return None when no valid paragraphs are extracted + self.assertIsNone(result) + # Should log error about unexpected type + self.mock_log.error.assert_called() + error_calls = [str(call) for call in self.mock_log.error.call_args_list] + error_text = ''.join(error_calls) + self.assertIn('ASSUMPTION VIOLATION', error_text) + self.assertIn('dict', error_text) + + def test_get_agent_bioghist_data_uses_display_name_fallback(self): + """Test using display_name.sort_name when title is missing.""" + mock_response = Mock() + mock_response.json.return_value = { + # No 'title' field + 'display_name': {'sort_name': 'Fallback Name'}, + 'notes': [ + { + 'jsonmodel_type': 'note_bioghist', + 'persistent_id': 'jkl456', + 'subnotes': [ + {'content': 'Some content'} + ] + } + ] + } + self.mock_client.get.return_value = mock_response + + result = self.service.get_agent_bioghist_data('/agents/people/222') + + self.assertIsNotNone(result) + self.assertEqual(result['agent_name'], 'Fallback Name') + + def test_get_agent_bioghist_data_handles_exception(self): + """Test handling exceptions during agent fetch.""" + self.mock_client.get.side_effect = Exception('Network error') + + result = self.service.get_agent_bioghist_data('/agents/corporate_entities/333') + + self.assertIsNone(result) + self.mock_log.error.assert_called() + error_call = str(self.mock_log.error.call_args) + self.assertIn('Network error', error_call) + + def test_get_agent_bioghist_data_multiple_subnotes(self): + """Test handling multiple subnotes in a bioghist note.""" + mock_response = Mock() + mock_response.json.return_value = { + 'title': 'Test Agent', + 'notes': [ + { + 'jsonmodel_type': 'note_bioghist', + 'persistent_id': 
'mno789', + 'subnotes': [ + {'content': 'First subnote'}, + {'content': 'Second subnote'}, + {'content': 'Third subnote'} + ] + } + ] + } + self.mock_client.get.return_value = mock_response + + result = self.service.get_agent_bioghist_data('/agents/families/444') + + self.assertIsNotNone(result) + self.assertEqual(len(result['paragraphs']), 3) + self.assertIn('First subnote', result['paragraphs']) + self.assertIn('Second subnote', result['paragraphs']) + self.assertIn('Third subnote', result['paragraphs']) + + def test_get_agent_bioghist_data_returns_first_bioghist_only(self): + """Test that only the first bioghist note is returned.""" + mock_response = Mock() + mock_response.json.return_value = { + 'title': 'Test Agent', + 'notes': [ + { + 'jsonmodel_type': 'note_bioghist', + 'persistent_id': 'first123', + 'subnotes': [ + {'content': 'First bioghist'} + ] + }, + { + 'jsonmodel_type': 'note_bioghist', + 'persistent_id': 'second456', + 'subnotes': [ + {'content': 'Second bioghist'} + ] + } + ] + } + self.mock_client.get.return_value = mock_response + + result = self.service.get_agent_bioghist_data('/agents/corporate_entities/555') + + self.assertIsNotNone(result) + self.assertEqual(result['persistent_id'], 'first123') + self.assertIn('First bioghist', result['paragraphs']) + self.assertNotIn('Second bioghist', result['paragraphs']) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_xml_transform_service.py b/tests/test_xml_transform_service.py new file mode 100644 index 0000000..b49ec10 --- /dev/null +++ b/tests/test_xml_transform_service.py @@ -0,0 +1,506 @@ +""" +Tests for XmlTransformService. 
+""" + +import unittest +from unittest.mock import Mock +from arcflow.services.xml_transform_service import XmlTransformService + +# Real ArchivesSpace EAD fixture with namespace +REAL_EAD_WITH_NAMESPACE = ''' + + + test-collection + + + + Test Collection with Namespace + + Test Corporation + + + +''' + +# Real EAC-CPF fixture with namespace +REAL_EAC_CPF_WITH_NAMESPACE = ''' + + + test-agent + + + + + Test Collection + + + +''' + +class TestXmlTransformService(unittest.TestCase): + """Test cases for XmlTransformService.""" + + def setUp(self): + """Set up test fixtures.""" + self.mock_client = Mock() + self.mock_log = Mock() + self.service = XmlTransformService(client=self.mock_client, log=self.mock_log) + + def test_add_creator_ids_to_ead(self): + """Test adding arcuit:creator_id attributes to origination elements.""" + + resource = { + 'linked_agents': [ + {'role': 'creator', 'ref': '/agents/corporate_entities/123'} + ] + } + + result = self.service.add_creator_ids_to_ead(REAL_EAD_WITH_NAMESPACE, resource) + + # Should contain arcuit namespace declaration + self.assertIn('xmlns:arcuit', result) + self.assertIn('https://arcuit.library.illinois.edu/ead-extensions', result) + # Should contain the creator_id attribute + self.assertIn('creator_id="creator_corporate_entities_123"', result) + # Should preserve EAD namespace + self.assertIn('urn:isbn:1-931666-22-9', result) + # Should still find and modify the corpname element + self.assertIn('corpname', result) + + def test_add_creator_ids_multiple_creators(self): + """Test adding arcuit:creator_id to multiple origination elements.""" + xml_content = ''' + + First Corp + + + Second Person + +''' + + resource = { + 'linked_agents': [ + {'role': 'creator', 'ref': '/agents/corporate_entities/123'}, + {'role': 'creator', 'ref': '/agents/people/456'} + ] + } + + result = self.service.add_creator_ids_to_ead(xml_content, resource) + + self.assertIn('creator_id="creator_corporate_entities_123"', result) + 
self.assertIn('creator_id="creator_people_456"', result) + self.assertIn('xmlns:arcuit', result) + + def test_add_creator_ids_no_creators(self): + """Test that XML is unchanged when there are no creators.""" + xml_content = 'Test' + resource = {'linked_agents': []} + + result = self.service.add_creator_ids_to_ead(xml_content, resource) + + self.assertEqual(xml_content, result) + + def test_inject_collection_metadata_with_all_fields(self): + """Test injecting record group, subgroup, and bioghist.""" + xml_content = ''' + + + + Test Collection + + + ''' + + result = self.service.inject_collection_metadata( + xml_content, + record_group='RG 1 — Test Group', + subgroup='SG 1.1 — Test Subgroup', + bioghist_content='

Test bioghist

' + ) + + # Should add recordgroup with namespace + self.assertIn('recordgroup', result) + self.assertIn('RG 1 — Test Group', result) + # Should add subgroup with namespace + self.assertIn('subgroup', result) + self.assertIn('SG 1.1 — Test Subgroup', result) + # Should add bioghist with EAD namespace + self.assertIn('bioghist', result) + self.assertIn('Test bioghist', result) + # Should preserve original namespace + self.assertIn('xmlns', result) + self.assertIn('urn:isbn:1-931666-22-9', result) + + def test_inject_collection_metadata_into_existing_bioghist(self): + """Test that bioghist content is inserted into existing bioghist element.""" + xml_content = ''' + + + Test Collection + + +
<p>Existing content</p>
+
+
+
''' + + result = self.service.inject_collection_metadata( + xml_content, + record_group=None, + subgroup=None, + bioghist_content='
<p>New content</p>
' + ) + + # Should insert before + self.assertIn('Existing content', result) + self.assertIn('New content', result) + # Should not create a new bioghist wrapper + self.assertEqual(result.count(''), 2) # Original + inserted + + def test_inject_collection_metadata_xml_escaping(self): + """Test that special XML characters are properly escaped.""" + xml_content = ''' + + + Test + + +''' + + result = self.service.inject_collection_metadata( + xml_content, + record_group='Group & Co ', + subgroup=None, + bioghist_content=None + ) + + self.assertIn('Group & Co <test>', result) + self.assertNotIn('Group & Co ', result) + + def test_add_collection_links_to_eac_cpf(self): + """Test adding ead_id descriptiveNote to resourceRelation elements.""" + + # Mock the client response + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {'ead_id': 'TEST.1.2.3'} + self.mock_client.get.return_value = mock_response + + result = self.service.add_collection_links_to_eac_cpf(REAL_EAC_CPF_WITH_NAMESPACE) + + # Should add descriptiveNote (namespace-aware check) + self.assertIn('descriptiveNote', result) + self.assertIn('ead_id:TEST.1.2.3', result) + # Should preserve EAC-CPF namespace + self.assertIn('urn:isbn:1-931666-33-4', result) + + def test_multiple_creators_with_namespace(self): + """Test handling multiple creators when EAD has default namespace.""" + xml_with_namespace = ''' + + + + + First Corp + + + Second Person + + + +''' + + resource = { + 'linked_agents': [ + {'role': 'creator', 'ref': '/agents/corporate_entities/123'}, + {'role': 'creator', 'ref': '/agents/people/456'} + ] + } + + result = self.service.add_creator_ids_to_ead(xml_with_namespace, resource) + + # Should add both creator IDs + self.assertIn('creator_id="creator_corporate_entities_123"', result) + self.assertIn('creator_id="creator_people_456"', result) + # Should preserve namespace + self.assertIn('urn:isbn:1-931666-22-9', result) + + def 
test_add_collection_links_idempotent(self): + """Test that adding collection links is idempotent.""" + eac_cpf_xml = ''' + + Test Collection + +
<descriptiveNote>
<p>ead_id:TEST.1.2.3</p>
</descriptiveNote>
+
+
+
''' + + result = self.service.add_collection_links_to_eac_cpf(eac_cpf_xml) + + # Should not call the client since descriptiveNote already exists + self.mock_client.get.assert_not_called() + # Should return unchanged XML + self.assertEqual(eac_cpf_xml, result) + + def test_add_collection_links_skips_digital_objects(self): + """Test that digital object URLs are skipped silently.""" + eac_cpf_xml = ''' + + Test Digital Object + +''' + + result = self.service.add_collection_links_to_eac_cpf(eac_cpf_xml) + + # Should not call the client + self.mock_client.get.assert_not_called() + # Should return unchanged XML + self.assertEqual(eac_cpf_xml, result) + + def test_add_collection_links_handles_fetch_errors(self): + """Test that fetch errors are handled gracefully.""" + eac_cpf_xml = ''' + + Test Collection + +''' + + # Mock a 404 response + mock_response = Mock() + mock_response.status_code = 404 + self.mock_client.get.return_value = mock_response + + result = self.service.add_collection_links_to_eac_cpf(eac_cpf_xml) + + # Should log a warning + self.mock_log.warning.assert_called() + # Should return unchanged XML + self.assertNotIn('', result) + + def test_build_bioghist_element(self): + """Test building bioghist XML element from structured data.""" + result = self.service.build_bioghist_element( + agent_name='Test Agent', + persistent_id='abc123', + paragraphs=['First paragraph', 'Second paragraph'] + ) + + self.assertIn('', result) + self.assertIn('Historical Note from Test Agent Creator Record', result) + self.assertIn('
<p>First paragraph</p>
', result) + self.assertIn('
<p>Second paragraph</p>
', result) + self.assertIn('
', result) + + def test_build_bioghist_element_without_persistent_id(self): + """Test building bioghist without persistent_id.""" + result = self.service.build_bioghist_element( + agent_name='Test Agent', + persistent_id=None, + paragraphs=['Content'] + ) + + self.assertIn('', result) + self.assertNotIn('id=', result) + self.assertIn('
<p>Content</p>
', result) + + def test_build_bioghist_element_escapes_agent_name(self): + """Test that agent name is properly XML-escaped.""" + result = self.service.build_bioghist_element( + agent_name='Agent & Co ', + persistent_id='abc', + paragraphs=['Content'] + ) + + self.assertIn('Agent & Co <test>', result) + + def test_build_bioghist_element_escapes_paragraph_content(self): + """Test that paragraph content with special XML characters is properly escaped.""" + result = self.service.build_bioghist_element( + agent_name='Test Agent', + persistent_id='abc', + paragraphs=['Content with & ampersand', 'Content with and "quotes"'] + ) + + self.assertIn('
<p>Content with &amp; ampersand</p>
', result) + self.assertIn('
<p>Content with &lt;tags&gt; and "quotes"</p>
', result) + + def test_validate_eac_cpf_xml_valid(self): + """Test validating valid EAC-CPF XML.""" + eac_cpf_xml = '' + + root = self.service.validate_eac_cpf_xml(eac_cpf_xml, '/agents/corporate_entities/123') + + self.assertIsNotNone(root) + self.assertEqual(root.tag, 'eac-cpf') + + def test_validate_eac_cpf_xml_invalid(self): + """Test validating invalid EAC-CPF XML.""" + eac_cpf_xml = '' # Missing closing tags + + root = self.service.validate_eac_cpf_xml(eac_cpf_xml, '/agents/corporate_entities/123') + + self.assertIsNone(root) + self.mock_log.error.assert_called() + + def test_add_collection_links_requires_client(self): + """Test that add_collection_links_to_eac_cpf requires a client.""" + service_no_client = XmlTransformService(client=None) + + with self.assertRaises(ValueError) as context: + service_no_client.add_collection_links_to_eac_cpf('') + + self.assertIn('Client is required', str(context.exception)) + + def test_namespace_preservation_ead_with_declaration(self): + """Test that EAD namespace prefixes and XML declaration are preserved.""" + xml_input = ''' + + + test-collection + + + + Test Collection + + Test Corporation + + + +''' + + resource = { + 'linked_agents': [ + {'role': 'creator', 'ref': '/agents/corporate_entities/123'} + ] + } + + result = self.service.add_creator_ids_to_ead(xml_input, resource) + + # Should have XML declaration + self.assertTrue(result.startswith(' + + + test-agent + + + + + Test Collection + + + +''' + + # Mock the client response + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {'ead_id': 'TEST.1.2.3'} + self.mock_client.get.return_value = mock_response + + result = self.service.add_collection_links_to_eac_cpf(xml_input) + + # Should have XML declaration + self.assertTrue(result.startswith(' + + + test-collection + + + + Test Collection + + +''' + + bioghist_content = ''' + Historical Note from Test Agent Creator Record +
<p>Test paragraph</p>
+
''' + + result = self.service.inject_collection_metadata( + xml_input, + record_group="Test Group", + subgroup="Test Subgroup", + bioghist_content=bioghist_content + ) + + # Should have XML declaration + self.assertTrue(result.startswith('', result) + self.assertIn('', result) + self.assertIn(' + + test-agent + +
''' + + # No changes will be made (no resourceRelations) + result = self.service.add_collection_links_to_eac_cpf(xml_input) + + # Should not add XML declaration when original didn't have one and no changes made + self.assertEqual(xml_input, result, 'Unchanged XML should be returned as-is') + self.assertFalse(result.startswith('