Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
ef0dda7
feat: Add creator biographical information to EAD XML exports
Copilot Dec 23, 2025
b3f77eb
feat(arclight#29): Add creator/agent indexing system for ArcLight
alexdryden Feb 11, 2026
74df557
feat: Optimize agent filtering with ArchivesSpace Solr
alexdryden Feb 21, 2026
3ef2b0a
fix: clean up traject config
Copilot Feb 26, 2026
89057a9
feat(arclight#29): run orchestration for threaded and single-scope runs
alexdryden Mar 4, 2026
551e563
Merge branch 'main' into index_creators
alexdryden Mar 4, 2026
fa0c562
find eac_cpf traject in arcuit, fallback to example in arcflow
alexdryden Mar 6, 2026
2a8e88a
Merge branch 'main' into index_creators
alexdryden Mar 9, 2026
256a19b
Initial plan
Copilot Mar 9, 2026
221f569
Add bidirectional creator-collection links in EAD and EAC-CPF
Copilot Mar 9, 2026
22d2cec
chore: add rg and sg to the example ead extra config
alexdryden Mar 10, 2026
56c619c
update log message for clarity
Copilot Mar 10, 2026
1b35516
Fix idempotent check to match specific ead_id pattern
Copilot Mar 10, 2026
d636edc
Add XmlTransformService and AgentService with comprehensive tests
Copilot Mar 10, 2026
920431d
Refactor main.py to use XmlTransformService and AgentService
Copilot Mar 10, 2026
6deae57
Address code review: use relative imports and improve documentation
Copilot Mar 10, 2026
e078182
refactor: use xml transform service
alexdryden Mar 10, 2026
a503bbd
Merge branch 'main' into copilot/add-bidirectional-creator-collection…
alexdryden Mar 10, 2026
5431103
Apply suggestions from code review
alexdryden Mar 10, 2026
3465192
Refactor XML transformations to use ElementTree parser for clarity
Copilot Mar 10, 2026
7a1e9ab
feat: use custom namespace attribute for creator id
alexdryden Mar 11, 2026
d832204
fix: extract and use ead and eac_cpf namespace
alexdryden Mar 11, 2026
da7f161
fix: use correct namespace
alexdryden Mar 11, 2026
2116f31
update documentation
alexdryden Mar 11, 2026
a423017
fix: preserve nested bioghists for multiple creators
alexdryden Mar 11, 2026
6621ae4
chore: remove dead code
alexdryden Mar 11, 2026
0463690
fix: make XML transformations namespace-aware for EAD and EAC-CPF
Copilot Mar 11, 2026
f2a451a
fix: check for namespace explicitly
alexdryden Mar 11, 2026
1a76bd7
fix: formatting
alexdryden Mar 11, 2026
1d4674a
fix: remove unused import
alexdryden Mar 11, 2026
92e0c45
fix: ensure consistent namespace in sub-elements
alexdryden Mar 11, 2026
c221ccf
fix: properly escape XML special characters in bioghist paragraphs
Copilot Mar 11, 2026
170699a
remove unused import
alexdryden Mar 11, 2026
e511635
fix: use custom namespace notation
alexdryden Mar 11, 2026
675ccb4
fix: use correct namespace
alexdryden Mar 11, 2026
8cf6e66
fix: formatting
alexdryden Mar 11, 2026
82c2e23
fix: address code review feedback
Copilot Mar 11, 2026
651ab95
refactor: migrate from ElementTree to lxml for better namespace handling
Copilot Mar 17, 2026
f0fac40
fix: use quoted string for lxml._Element type hint
Copilot Mar 17, 2026
088b65d
feat: add additional fields for creator relationships to objects
alexdryden Mar 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion arcflow/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,15 @@
from .main import ArcFlow
"""
ArcFlow package for syncing ArchivesSpace to ArcLight.

To use ArcFlow, import directly from the main module:
from arcflow.main import ArcFlow

Services can be imported independently:
from arcflow.services.xml_transform_service import XmlTransformService
from arcflow.services.agent_service import AgentService

The top-level import is disabled to avoid eager loading of dependencies.
"""

# Avoid eager imports to allow services to be imported independently
# from .main import ArcFlow
Comment on lines +7 to +15
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

arcflow/__init__.py no longer re-exports ArcFlow, so from arcflow import ArcFlow will break for any downstream code relying on the package-level import. If the goal is to avoid eager loading, consider preserving backwards compatibility via a lazy import mechanism (e.g., module __getattr__) or explicitly documenting this as a breaking change.

Suggested change
Services can be imported independently:
from arcflow.services.xml_transform_service import XmlTransformService
from arcflow.services.agent_service import AgentService
The top-level import is disabled to avoid eager loading of dependencies.
"""
# Avoid eager imports to allow services to be imported independently
# from .main import ArcFlow
Or, for backwards compatibility, from the package root (lazy loaded):
from arcflow import ArcFlow
Services can be imported independently:
from arcflow.services.xml_transform_service import XmlTransformService
from arcflow.services.agent_service import AgentService
"""
# Avoid eager imports to allow services to be imported independently,
# but preserve backwards compatibility for `from arcflow import ArcFlow`
# via a lazy module-level attribute lookup.
__all__ = ["ArcFlow"]
def __getattr__(name: str):
"""
Lazily provide ArcFlow at the package level to avoid eager imports.
This allows:
from arcflow import ArcFlow
without importing .main until ArcFlow is actually requested.
"""
if name == "ArcFlow":
from .main import ArcFlow # type: ignore[attr-defined]
return ArcFlow
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

Copilot uses AI. Check for mistakes.
143 changes: 40 additions & 103 deletions arcflow/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
from asnake.client import ASnakeClient
from multiprocessing.pool import ThreadPool as Pool
from utils.stage_classifications import extract_labels
from .services.xml_transform_service import XmlTransformService
from .services.agent_service import AgentService
import glob

base_dir = os.path.abspath((__file__) + "/../../")
Expand Down Expand Up @@ -115,6 +117,10 @@ def __init__(self, arclight_dir, aspace_dir, solr_url, aspace_solr_url, ead_extr
self.log.error(f'Error authorizing ASnakeClient: {e}')
exit(0)

# Initialize services
self.xml_transform = XmlTransformService(client=self.client, log=self.log)
self.agent_service = AgentService(client=self.client, log=self.log)


def is_running(self):
"""
Expand Down Expand Up @@ -262,50 +268,24 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0):
# (record group/subgroup labels and biographical/historical notes)
if xml.content:
xml_content = xml.content.decode('utf-8')
insert_pos = xml_content.find('<archdesc level="collection">')

if insert_pos != -1:
# Find the position after the closing </did> tag
did_end_pos = xml_content.find('</did>', insert_pos)

if did_end_pos != -1:
# Move to after the </did> tag
did_end_pos += len('</did>')
extra_xml = ''

# Add record group and subgroup labels
rg_label, sg_label = extract_labels(resource)[1:3]
if rg_label:
extra_xml += f'\n<recordgroup>{xml_escape(rg_label)}</recordgroup>'
if sg_label:
extra_xml += f'\n<subgroup>{xml_escape(sg_label)}</subgroup>'

# Handle biographical/historical notes from creator agents
bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size)
if bioghist_content:
# Check if there's already a bioghist element in the EAD
# Search for existing bioghist after </did> but before </archdesc>
archdesc_end = xml_content.find('</archdesc>', did_end_pos)
search_section = xml_content[did_end_pos:archdesc_end] if archdesc_end != -1 else xml_content[did_end_pos:]

# Look for closing </bioghist> tag
existing_bioghist_end = search_section.rfind('</bioghist>')

if existing_bioghist_end != -1:
# Found existing bioghist - insert agent elements INSIDE it (before closing tag)
insert_pos = did_end_pos + existing_bioghist_end
xml_content = (xml_content[:insert_pos] +
f'\n{bioghist_content}\n' +
xml_content[insert_pos:])
else:
# No existing bioghist - wrap agent elements in parent container
wrapped_content = f'<bioghist>\n{bioghist_content}\n</bioghist>'
extra_xml += f'\n{wrapped_content}'

if extra_xml:
xml_content = (xml_content[:did_end_pos] +
extra_xml +
xml_content[did_end_pos:])

# Add arcuit:creator_id attributes (in a custom namespace) to origination name elements
# (links creator names in EAD to their corresponding creator records, e.g., in Solr)
xml_content = self.xml_transform.add_creator_ids_to_ead(xml_content, resource, indent_size=indent_size)

# Get record group and subgroup labels
rg_label, sg_label = extract_labels(resource)[1:3]

# Get biographical/historical notes from creator agents
bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size)

# Inject all collection metadata using XmlTransformService
xml_content = self.xml_transform.inject_collection_metadata(
xml_content,
record_group=rg_label,
subgroup=sg_label,
bioghist_content=bioghist_content
)

xml_content = xml_content.encode('utf-8')
else:
Expand Down Expand Up @@ -634,7 +614,6 @@ def get_creator_bioghist(self, resource, indent_size=0):
Returns nested bioghist elements for each creator, or None if no creator agents have notes.
Each bioghist element includes the creator name in a head element and an id attribute.
"""
indent = ' ' * indent_size
bioghist_elements = []

if 'linked_agents' not in resource:
Expand All @@ -646,58 +625,16 @@ def get_creator_bioghist(self, resource, indent_size=0):
if linked_agent.get('role') == 'creator':
agent_ref = linked_agent.get('ref')
if agent_ref:
try:
agent = self.client.get(agent_ref).json()

# Get agent name for head element
agent_name = agent.get('title') or agent.get('display_name', {}).get('sort_name', 'Unknown')

# Check for notes in the agent record
if 'notes' in agent:
for note in agent['notes']:
# Look for biographical/historical notes
if note.get('jsonmodel_type') == 'note_bioghist':
# Get persistent_id for the id attribute
persistent_id = note.get('persistent_id', '')
if not persistent_id:
self.log.error(f'{indent}**ASSUMPTION VIOLATION**: Expected persistent_id in note_bioghist for agent {agent_ref}')
# Skip creating id attribute if persistent_id is missing
persistent_id = None

# Extract note content from subnotes
paragraphs = []
if 'subnotes' in note:
for subnote in note['subnotes']:
if 'content' in subnote:
# Split content on single newlines to create paragraphs
content = subnote['content']
# Handle content as either string or list with explicit type checking
if isinstance(content, str):
# Split on newline and filter out empty strings
lines = [line.strip() for line in content.split('\n') if line.strip()]
elif isinstance(content, list):
# Content is already a list - use as is
lines = [str(item).strip() for item in content if str(item).strip()]
else:
# Log unexpected content type prominently
self.log.error(f'{indent}**ASSUMPTION VIOLATION**: Expected string or list for subnote content in agent {agent_ref}, got {type(content).__name__}')
continue
# Wrap each line in <p> tags
for line in lines:
paragraphs.append(f'<p>{line}</p>')

# Create nested bioghist element if we have paragraphs
if paragraphs:
paragraphs_xml = '\n'.join(paragraphs)
heading = f'Historical Note from {xml_escape(agent_name)} Creator Record'
# Only include id attribute if persistent_id is available
if persistent_id:
bioghist_el = f'<bioghist id="aspace_{persistent_id}"><head>{heading}</head>\n{paragraphs_xml}\n</bioghist>'
else:
bioghist_el = f'<bioghist><head>{heading}</head>\n{paragraphs_xml}\n</bioghist>'
bioghist_elements.append(bioghist_el)
except Exception as e:
self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}')
bioghist_data = self.agent_service.get_agent_bioghist_data(
agent_ref, indent_size=indent_size
)
if bioghist_data:
bioghist_xml = self.xml_transform.build_bioghist_element(
bioghist_data['agent_name'],
bioghist_data['persistent_id'],
bioghist_data['paragraphs']
)
bioghist_elements.append(bioghist_xml)

if bioghist_elements:
# Return the agent bioghist elements (unwrapped)
Expand Down Expand Up @@ -879,14 +816,14 @@ def task_agent(self, agent_uri, agents_dir, repo_id=1, indent_size=0):

eac_cpf_xml = response.text

# Parse the EAC-CPF XML to validate and inspect its structure
try:
root = ET.fromstring(eac_cpf_xml)
self.log.debug(f'{indent}Parsed EAC-CPF XML root element: {root.tag}')
except ET.ParseError as e:
self.log.error(f'{indent}Failed to parse EAC-CPF XML for {agent_uri}: {e}')
# Validate EAC-CPF XML structure
if not self.xml_transform.validate_eac_cpf_xml(eac_cpf_xml, agent_uri, indent_size=indent_size):
self.log.error(f'{indent}Invalid EAC-CPF XML for {agent_uri}, skipping')
return None

# Add collection ead_ids to resourceRelation creatorOf elements
eac_cpf_xml = self.xml_transform.add_collection_links_to_eac_cpf(eac_cpf_xml, indent_size=indent_size)

# Generate creator ID
Comment on lines 817 to 827
Copy link

Copilot AI Mar 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR description mentions refactoring task_agent() to use a validate_eac_cpf_xml() service, but the updated code no longer validates the EAC-CPF document before writing it (it only attempts a parse inside add_collection_links_to_eac_cpf). If validation is still required, consider calling validate_eac_cpf_xml() here and aborting on invalid XML, or update the PR description if validation was intentionally removed.

Copilot uses AI. Check for mistakes.
creator_id = f'creator_{agent_type}_{agent_id}'

Expand Down
Empty file added arcflow/services/__init__.py
Empty file.
115 changes: 115 additions & 0 deletions arcflow/services/agent_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""
Service for fetching and processing agent data from ArchivesSpace.

Handles agent-related operations including:
- Fetching agent biographical/historical notes
- Processing note content into structured data
"""

import logging
from typing import Optional, List, Dict


class AgentService:
    """Fetches agent records from ArchivesSpace and distills their
    biographical/historical notes into plain structured data."""

    def __init__(self, client, log=None):
        """
        Set up the service.

        Args:
            client: ASnake client used to look up agent records
            log: Logger instance; a module-level logger is created when omitted
        """
        self.client = client
        self.log = log if log is not None else logging.getLogger(__name__)

    def get_agent_bioghist_data(self, agent_uri: str, indent_size: int = 0) -> Optional[Dict]:
        """
        Fetch bioghist DATA for an agent.

        Deliberately returns structured data rather than XML so callers can
        render it however they need (EAD, EAC-CPF, web UI, JSON export).

        Args:
            agent_uri: Agent URI from ArchivesSpace (e.g., '/agents/corporate_entities/123')
            indent_size: Indentation size for logging

        Returns:
            dict with keys: 'agent_name', 'persistent_id', 'paragraphs'
            or None if no bioghist found or on error
        """
        pad = ' ' * indent_size

        # The try block spans the fetch AND the note processing: any failure
        # is logged and reported as "no bioghist" rather than raised.
        try:
            record = self.client.get(agent_uri).json()

            # Prefer the display title; fall back to the sort name, then a placeholder.
            display_name = record.get('title') or record.get('display_name', {}).get('sort_name', 'Unknown')

            # Return the first biographical/historical note that yields any text.
            for note in record.get('notes', []):
                if note.get('jsonmodel_type') != 'note_bioghist':
                    continue
                body = self._extract_paragraphs(note, agent_uri, indent_size)
                if body:
                    return {
                        'agent_name': display_name,
                        'persistent_id': note.get('persistent_id'),
                        'paragraphs': body,
                    }

            # No bioghist note produced content.
            return None

        except Exception as e:
            self.log.error(f'{pad}Error fetching agent {agent_uri}: {e}')
            return None

    def _extract_paragraphs(self, note: dict, agent_uri: str, indent_size: int = 0) -> List[str]:
        """
        Extract paragraph content from a bioghist note.

        Args:
            note: Note dictionary from ArchivesSpace
            agent_uri: Agent URI, used only for log messages
            indent_size: Indentation size for logging

        Returns:
            List of plain text paragraph strings (not wrapped in <p> tags)
        """
        pad = ' ' * indent_size
        collected: List[str] = []

        if 'subnotes' in note:
            for subnote in note['subnotes']:
                if 'content' not in subnote:
                    continue
                raw = subnote['content']

                # content may arrive as a newline-delimited string or a list;
                # anything else is logged loudly and skipped.
                if isinstance(raw, str):
                    pieces = [part.strip() for part in raw.split('\n') if part.strip()]
                elif isinstance(raw, list):
                    pieces = [str(item).strip() for item in raw if str(item).strip()]
                else:
                    self.log.error(
                        f'{pad}**ASSUMPTION VIOLATION**: Expected string or list for subnote content '
                        f'in agent {agent_uri}, got {type(raw).__name__}'
                    )
                    continue

                # Plain text only here; <p> wrapping happens in build_bioghist_element.
                collected.extend(pieces)

        # A missing persistent_id is tolerated but flagged for investigation.
        if not note.get('persistent_id'):
            self.log.error(
                f'{pad}**ASSUMPTION VIOLATION**: Expected persistent_id in note_bioghist '
                f'for agent {agent_uri}'
            )

        return collected
Loading