diff --git a/CHANGELOG.md b/CHANGELOG.md index aef98b3..6c6392b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,30 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added +- client method to write search results to an XML file, with validation against expected number of records to be written +- client method to return an iterator of XML records from a search, to support streaming results for large result sets +- xml fixture files for testing +- tests for the new client methods, including edge cases for validation + +### Changed +- README examples + +### Deprecated +- N/A + +### Removed +- N/A + +### Fixed +- N/A + +### Security +- N/A + + ## [0.1.1] ### Added diff --git a/README.md b/README.md index 45483ef..5926422 100644 --- a/README.md +++ b/README.md @@ -75,8 +75,12 @@ ids = client.fetch_ids_search("collection:'Disabled Students Program Photos'") records = client.fetch_search_metadata("collection:'Disabled Students Program Photos'") # return raw XML or PyMARC records from a paginated search +# NOTE: for large result sets, use the write_search_results_to_file() method and then parse that file xml_results = client.search("collection:'Disabled Students Program Photos'", result_format="xml") pymarc_results = client.search("collection:'Disabled Students Program Photos'", result_format="pymarc") + +# search Tind with a query and write results to an XML file in the default storage directory +records_written = client.write_search_results_to_file("Old Emperor Norton", "full_norton_results.xml") ``` ## Running tests diff --git a/pyproject.toml b/pyproject.toml index 5268abe..8e65718 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,3 +61,5 @@ allow-init-docstring = true skip-checking-raises = true style = "sphinx" +[tool.pylint.format] +max-line-length = 100 diff --git a/tests/fixtures/1st-batch-tind-response.xml b/tests/fixtures/1st-batch-tind-response.xml new file mode 100644 index 0000000..8dddf73 --- /dev/null +++ b/tests/fixtures/1st-batch-tind-response.xml @@ -0,0 +1,174 @@ + + 3 + FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFm04dVFEUk1JUjVpaHgzV2VIdTBOclEAAAAAAbR0aRZZQVkyRloyLVJtRzVJT1ZZTmdmMFpn + + + 27320 + 20250429135007.0 + + BANC PIC 1996.003:Volume 24:42a--fALB + + + calher_cubanc_25_132_00180699 + + + Old Emperor Norton in 1876 + + + Image + + + Researchers may make free and open use of the UC Berkeley Library’s digitized public domain materials. However, some materials in our online collections may be protected by U.S. copyright law (Title 17, U.S.C.). Use or reproduction of materials protected by copyright beyond that allowed by fair use (Title 17, U.S.C. § 107) requires permission from the copyright owners. The use or reproduction of some materials may also be restricted by terms of University of California gift or purchase agreements, privacy and publicity rights, or trademark law. Responsibility for determining rights status and permissibility of any use or reproduction rests exclusively with the researcher. To learn more or make inquiries, please see our permissions policies (https://www.lib.berkeley.edu/about/permissions-policies). + + + The Bancroft Library + + + http://www.oac.cdlib.org/findaid/ark:/13030/tf129005j4 + View collection guide + + + c6cf7b70-4acd-466f-ab1a-3ca53b4a2789 + 196176 + https://digicoll.lib.berkeley.edu/record/27320/files/I0051200A.jpg + + + ark:/13030/tf9m3nb8s8 + ark:/13030/tf129005j4 + 25:132 + + + cubanc_25_132_00180699.xml + + + oai:digicoll.lib.berkeley.edu:27320 + sfg + calher:cookscrapbook + mcleanCalisphere_oai + + + ark:/13030/m5s5429m + Merritt + + + CalHer: Cook Scrapbook + + + Jesse Brown Cook Scrapbooks + Jesse Brown Cook Scrapbooks Documenting San Francisco History and Law Enforcement + + + + 28819 + 20250429135229.0 + + BANC PIC 1996.003:Volume 24:41b--fALB + + + calher_cubanc_25_132_00180698 + + + Old Emperor Norton in 1876 + + + Image + + + Researchers may make free and open use of the UC Berkeley Library’s digitized public domain materials. However, some materials in our online collections may be protected by U.S. copyright law (Title 17, U.S.C.). Use or reproduction of materials protected by copyright beyond that allowed by fair use (Title 17, U.S.C. § 107) requires permission from the copyright owners. The use or reproduction of some materials may also be restricted by terms of University of California gift or purchase agreements, privacy and publicity rights, or trademark law. Responsibility for determining rights status and permissibility of any use or reproduction rests exclusively with the researcher. To learn more or make inquiries, please see our permissions policies (https://www.lib.berkeley.edu/about/permissions-policies). + + + The Bancroft Library + + + http://www.oac.cdlib.org/findaid/ark:/13030/tf129005j4 + View collection guide + + + 5e1f7508-117f-4120-9acf-88f09c2c20d8 + 170081 + https://digicoll.lib.berkeley.edu/record/28819/files/I0051199A.jpg + + + ark:/13030/tf496nb4j6 + ark:/13030/tf129005j4 + 25:132 + + + cubanc_25_132_00180698.xml + + + oai:digicoll.lib.berkeley.edu:28819 + sfg + calher:cookscrapbook + mcleanCalisphere_oai + + + ark:/13030/m57159jp + Merritt + + + CalHer: Cook Scrapbook + + + Jesse Brown Cook Scrapbooks + Jesse Brown Cook Scrapbooks Documenting San Francisco History and Law Enforcement + + + + 29563 + 20250429135339.0 + + BANC PIC 1996.003:Volume 24:41a--fALB + + + calher_cubanc_25_132_00180697 + + + Old Emperor Norton in 1876 + + + Image + + + Researchers may make free and open use of the UC Berkeley Library’s digitized public domain materials. However, some materials in our online collections may be protected by U.S. copyright law (Title 17, U.S.C.). Use or reproduction of materials protected by copyright beyond that allowed by fair use (Title 17, U.S.C. § 107) requires permission from the copyright owners. The use or reproduction of some materials may also be restricted by terms of University of California gift or purchase agreements, privacy and publicity rights, or trademark law. Responsibility for determining rights status and permissibility of any use or reproduction rests exclusively with the researcher. To learn more or make inquiries, please see our permissions policies (https://www.lib.berkeley.edu/about/permissions-policies). + + + The Bancroft Library + + + http://www.oac.cdlib.org/findaid/ark:/13030/tf129005j4 + View collection guide + + + 7f7df611-bbe6-4855-8725-2fbd8f9e3d90 + 199139 + https://digicoll.lib.berkeley.edu/record/29563/files/I0051198A.jpg + + + ark:/13030/tf7g5010k5 + ark:/13030/tf129005j4 + 25:132 + + + cubanc_25_132_00180697.xml + + + oai:digicoll.lib.berkeley.edu:29563 + sfg + calher:cookscrapbook + mcleanCalisphere_oai + + + ark:/13030/m5bp7bwt + Merritt + + + CalHer: Cook Scrapbook + + + Jesse Brown Cook Scrapbooks + Jesse Brown Cook Scrapbooks Documenting San Francisco History and Law Enforcement + + + + diff --git a/tests/fixtures/end-of-batch-tind-response.xml b/tests/fixtures/end-of-batch-tind-response.xml new file mode 100644 index 0000000..4c7614d --- /dev/null +++ b/tests/fixtures/end-of-batch-tind-response.xml @@ -0,0 +1,5 @@ + + 3 + FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFm04dVFEUk1JUjVpaHgzV2VIdTBOclEAAAAAAbR0aRZZQVkyRloyLVJtRzVJT1ZZTmdmMFpn + + \ No newline at end of file diff --git a/tests/fixtures/tind_results.xml b/tests/fixtures/tind_results.xml new file mode 100644 index 0000000..e3d11a8 --- /dev/null +++ b/tests/fixtures/tind_results.xml @@ -0,0 +1,174 @@ + + + + 27320 + 20250429135007.0 + + BANC PIC 1996.003:Volume 24:42a--fALB + + + calher_cubanc_25_132_00180699 + + + Old Emperor Norton in 1876 + + + Image + + + Researchers may make free and open use of the UC Berkeley Library’s digitized public domain materials. However, some materials in our online collections may be protected by U.S. copyright law (Title 17, U.S.C.). Use or reproduction of materials protected by copyright beyond that allowed by fair use (Title 17, U.S.C. § 107) requires permission from the copyright owners. The use or reproduction of some materials may also be restricted by terms of University of California gift or purchase agreements, privacy and publicity rights, or trademark law. Responsibility for determining rights status and permissibility of any use or reproduction rests exclusively with the researcher. To learn more or make inquiries, please see our permissions policies (https://www.lib.berkeley.edu/about/permissions-policies). + + + The Bancroft Library + + + http://www.oac.cdlib.org/findaid/ark:/13030/tf129005j4 + View collection guide + + + c6cf7b70-4acd-466f-ab1a-3ca53b4a2789 + 196176 + https://digicoll.lib.berkeley.edu/record/27320/files/I0051200A.jpg + + + ark:/13030/tf9m3nb8s8 + ark:/13030/tf129005j4 + 25:132 + + + cubanc_25_132_00180699.xml + + + oai:digicoll.lib.berkeley.edu:27320 + sfg + calher:cookscrapbook + mcleanCalisphere_oai + + + ark:/13030/m5s5429m + Merritt + + + CalHer: Cook Scrapbook + + + Jesse Brown Cook Scrapbooks + Jesse Brown Cook Scrapbooks Documenting San Francisco History and Law Enforcement + + + + + 28819 + 20250429135229.0 + + BANC PIC 1996.003:Volume 24:41b--fALB + + + calher_cubanc_25_132_00180698 + + + Old Emperor Norton in 1876 + + + Image + + + Researchers may make free and open use of the UC Berkeley Library’s digitized public domain materials. However, some materials in our online collections may be protected by U.S. copyright law (Title 17, U.S.C.). Use or reproduction of materials protected by copyright beyond that allowed by fair use (Title 17, U.S.C. § 107) requires permission from the copyright owners. The use or reproduction of some materials may also be restricted by terms of University of California gift or purchase agreements, privacy and publicity rights, or trademark law. Responsibility for determining rights status and permissibility of any use or reproduction rests exclusively with the researcher. To learn more or make inquiries, please see our permissions policies (https://www.lib.berkeley.edu/about/permissions-policies). + + + The Bancroft Library + + + http://www.oac.cdlib.org/findaid/ark:/13030/tf129005j4 + View collection guide + + + 5e1f7508-117f-4120-9acf-88f09c2c20d8 + 170081 + https://digicoll.lib.berkeley.edu/record/28819/files/I0051199A.jpg + + + ark:/13030/tf496nb4j6 + ark:/13030/tf129005j4 + 25:132 + + + cubanc_25_132_00180698.xml + + + oai:digicoll.lib.berkeley.edu:28819 + sfg + calher:cookscrapbook + mcleanCalisphere_oai + + + ark:/13030/m57159jp + Merritt + + + CalHer: Cook Scrapbook + + + Jesse Brown Cook Scrapbooks + Jesse Brown Cook Scrapbooks Documenting San Francisco History and Law Enforcement + + + + + 29563 + 20250429135339.0 + + BANC PIC 1996.003:Volume 24:41a--fALB + + + calher_cubanc_25_132_00180697 + + + Old Emperor Norton in 1876 + + + Image + + + Researchers may make free and open use of the UC Berkeley Library’s digitized public domain materials. However, some materials in our online collections may be protected by U.S. copyright law (Title 17, U.S.C.). Use or reproduction of materials protected by copyright beyond that allowed by fair use (Title 17, U.S.C. § 107) requires permission from the copyright owners. The use or reproduction of some materials may also be restricted by terms of University of California gift or purchase agreements, privacy and publicity rights, or trademark law. Responsibility for determining rights status and permissibility of any use or reproduction rests exclusively with the researcher. To learn more or make inquiries, please see our permissions policies (https://www.lib.berkeley.edu/about/permissions-policies). + + + The Bancroft Library + + + http://www.oac.cdlib.org/findaid/ark:/13030/tf129005j4 + View collection guide + + + 7f7df611-bbe6-4855-8725-2fbd8f9e3d90 + 199139 + https://digicoll.lib.berkeley.edu/record/29563/files/I0051198A.jpg + + + ark:/13030/tf7g5010k5 + ark:/13030/tf129005j4 + 25:132 + + + cubanc_25_132_00180697.xml + + + oai:digicoll.lib.berkeley.edu:29563 + sfg + calher:cookscrapbook + mcleanCalisphere_oai + + + ark:/13030/m5bp7bwt + Merritt + + + CalHer: Cook Scrapbook + + + Jesse Brown Cook Scrapbooks + Jesse Brown Cook Scrapbooks Documenting San Francisco History and Law Enforcement + + + + diff --git a/tests/test_fetch.py b/tests/test_fetch.py index 4655ad0..927f3ac 100644 --- a/tests/test_fetch.py +++ b/tests/test_fetch.py @@ -3,6 +3,9 @@ """ import json +import xml.etree.ElementTree as E + +from pathlib import Path import pytest import requests_mock as req_mock # noqa: F401 — activates the requests_mock fixture @@ -25,9 +28,7 @@ def test_fetch_metadata_success( client: TINDClient, ) -> None: """fetch_metadata returns a PyMARC Record for a valid record ID.""" - requests_mock.get( - f"{BASE_URL}/record/12345/", text=sample_marc_xml, status_code=200 - ) + requests_mock.get(f"{BASE_URL}/record/12345/", text=sample_marc_xml, status_code=200) record = client.fetch_metadata("12345") assert record["245"]["a"] == "Sample Title" @@ -39,9 +40,7 @@ def test_fetch_metadata_404(requests_mock: req_mock.Mocker, client: TINDClient) client.fetch_metadata("99999") -def test_fetch_metadata_empty_body( - requests_mock: req_mock.Mocker, client: TINDClient -) -> None: +def test_fetch_metadata_empty_body(requests_mock: req_mock.Mocker, client: TINDClient) -> None: """fetch_metadata raises RecordNotFoundError when the response body is empty.""" requests_mock.get(f"{BASE_URL}/record/11111/", text=" ", status_code=200) with pytest.raises(RecordNotFoundError): @@ -61,7 +60,7 @@ def test_fetch_file_invalid_url(client: TINDClient) -> None: def test_fetch_file_success( requests_mock: req_mock.Mocker, - tmp_path: pytest.TempPathFactory, + tmp_path: Path, client: TINDClient, ) -> None: """fetch_file downloads and saves a file, returning its local path.""" @@ -78,7 +77,7 @@ def test_fetch_file_success( def test_fetch_file_not_found( requests_mock: req_mock.Mocker, - tmp_path: pytest.TempPathFactory, + tmp_path: Path, client: TINDClient, ) -> None: """fetch_file raises RecordNotFoundError when the download returns non-200.""" @@ -93,9 +92,7 @@ def test_fetch_file_not_found( # --------------------------------------------------------------------------- -def test_fetch_file_metadata_success( - requests_mock: req_mock.Mocker, client: TINDClient -) -> None: +def test_fetch_file_metadata_success(requests_mock: req_mock.Mocker, client: TINDClient) -> None: """fetch_file_metadata returns a list of file metadata dicts.""" payload = [{"name": "file.pdf", "size": 1024}] requests_mock.get( @@ -107,9 +104,7 @@ def test_fetch_file_metadata_success( assert result[0]["name"] == "file.pdf" -def test_fetch_file_metadata_error( - requests_mock: req_mock.Mocker, client: TINDClient -) -> None: +def test_fetch_file_metadata_error(requests_mock: req_mock.Mocker, client: TINDClient) -> None: """fetch_file_metadata raises TINDError on non-200 responses.""" requests_mock.get( f"{BASE_URL}/record/12345/files", @@ -125,9 +120,7 @@ def test_fetch_file_metadata_error( # --------------------------------------------------------------------------- -def test_fetch_ids_search_success( - requests_mock: req_mock.Mocker, client: TINDClient -) -> None: +def test_fetch_ids_search_success(requests_mock: req_mock.Mocker, client: TINDClient) -> None: """fetch_ids_search returns the list of record IDs from the search response.""" requests_mock.get( f"{BASE_URL}/search", @@ -138,9 +131,7 @@ def test_fetch_ids_search_success( assert ids == ["1", "2", "3"] -def test_fetch_ids_search_error( - requests_mock: req_mock.Mocker, client: TINDClient -) -> None: +def test_fetch_ids_search_error(requests_mock: req_mock.Mocker, client: TINDClient) -> None: """fetch_ids_search raises TINDError on non-200 responses.""" requests_mock.get( f"{BASE_URL}/search", @@ -180,3 +171,126 @@ def test_search_returns_xml( assert isinstance(results, list) assert len(results) >= 1 assert requests_mock.call_count == 1 + + +# --------------------------------------------------------------------------- +# write_search_results_to_file / _iter_xml_records +# --------------------------------------------------------------------------- + +FIXTURES = Path(__file__).parent / "fixtures" + + +def test_write_search_results_to_file_empty_filename(client: TINDClient) -> None: + """write_search_results_to_file raises ValueError for a blank output filename.""" + with pytest.raises(ValueError): + client.write_search_results_to_file("title:foo", output_file_name=" ") + + +def test_write_search_results_to_file_zero_hits( + requests_mock: req_mock.Mocker, + client: TINDClient, + tmp_path: Path, +) -> None: + """write_search_results_to_file returns 0 immediately when the query has no hits.""" + client.default_storage_dir = str(tmp_path) + requests_mock.get( + f"{BASE_URL}/search", + text=json.dumps({"hits": []}), + status_code=200, + ) + assert client.write_search_results_to_file("collection:'empty'") == 0 + assert not (tmp_path / "tind.xml").exists() + + +def test_write_search_results_to_file_success( + requests_mock: req_mock.Mocker, + client: TINDClient, + tmp_path: Path, +) -> None: + """write_search_results_to_file writes 3 records and returns 3.""" + client.default_storage_dir = str(tmp_path) + requests_mock.get( + f"{BASE_URL}/search", + response_list=[ + # fetch_ids_search call (JSON) + {"text": json.dumps({"hits": ["27320", "28819", "29563"]}), "status_code": 200}, + # first paginated XML batch + {"text": (FIXTURES / "1st-batch-tind-response.xml").read_text(), "status_code": 200}, + # end-of-results sentinel (empty collection) + {"text": (FIXTURES / "end-of-batch-tind-response.xml").read_text(), "status_code": 200}, + ], + ) + count = client.write_search_results_to_file("collection:'test'", "out.xml") + assert count == 3 + + marc21_ns = "http://www.loc.gov/MARC21/slim" + tree = E.parse(tmp_path / "out.xml") + records = tree.getroot().findall(f"{{{marc21_ns}}}record") + assert len(records) == 3 + assert ( + tree.getroot().findtext(f"{{{marc21_ns}}}record/{{{marc21_ns}}}controlfield[@tag='001']") + == "27320" + ) + + +def test_write_search_results_to_file_matched_but_no_records_returned( + requests_mock: req_mock.Mocker, + client: TINDClient, + tmp_path: Path, +) -> None: + """write_search_results_to_file raises TINDError when API returns no records for matched IDs""" + client.default_storage_dir = str(tmp_path) + requests_mock.get( + f"{BASE_URL}/search", + response_list=[ + # fetch_ids_search says 3 hits + {"text": json.dumps({"hits": ["27320", "28819", "29563"]}), "status_code": 200}, + # but the XML stream returns nothing immediately + {"text": (FIXTURES / "end-of-batch-tind-response.xml").read_text(), "status_code": 200}, + ], + ) + with pytest.raises(TINDError, match="API did not return any."): + client.write_search_results_to_file("collection:'test'", "mismatch.xml") + + +def test_write_search_results_to_file_matched_but_api_mismatch( + requests_mock: req_mock.Mocker, + client: TINDClient, + tmp_path: Path, +) -> None: + """write_search_results_to_file raises TINDError when streamed record count != ID count.""" + client.default_storage_dir = str(tmp_path) + requests_mock.get( + f"{BASE_URL}/search", + response_list=[ + # fetch_ids_search says 3 hits + { + "text": json.dumps({"hits": ["27320", "28819", "29563", "123123"]}), + "status_code": 200, + }, + # first paginated XML batch, but only 3 records instead of 4 as expected from the IDs + {"text": (FIXTURES / "1st-batch-tind-response.xml").read_text(), "status_code": 200}, + # but the XML stream returns nothing immediately + {"text": (FIXTURES / "end-of-batch-tind-response.xml").read_text(), "status_code": 200}, + ], + ) + with pytest.raises(TINDError, match="Expected 4 records"): + client.write_search_results_to_file("collection:'test'", "mismatch.xml") + + +def test_write_search_results_to_file_malformed_xml_response( + requests_mock: req_mock.Mocker, + client: TINDClient, + tmp_path: Path, +) -> None: + """write_search_results_to_file raises TINDError when the API returns malformed XML.""" + client.default_storage_dir = str(tmp_path) + requests_mock.get( + f"{BASE_URL}/search", + response_list=[ + {"text": json.dumps({"hits": ["1"]}), "status_code": 200}, + {"text": "this is not xml <<<", "status_code": 200}, + ], + ) + with pytest.raises(TINDError, match="Failed to parse"): + client.write_search_results_to_file("collection:'test'", "malformed.xml") diff --git a/tind_client/client.py b/tind_client/client.py index 3194a14..eae9581 100644 --- a/tind_client/client.py +++ b/tind_client/client.py @@ -6,7 +6,8 @@ import os import re from io import StringIO -from typing import Any +from pathlib import Path +from typing import Any, Iterator import xml.etree.ElementTree as E from pymarc import Record @@ -16,6 +17,13 @@ from .errors import RecordNotFoundError, TINDError +NS = "http://www.loc.gov/MARC21/slim" +E.register_namespace("", NS) + +# remove namespace that ElementTree adds to records when passed +_NS_DECL: str = f' xmlns="{NS}"' + + class TINDClient: """Client for interacting with a TIND DA instance. @@ -57,9 +65,7 @@ def fetch_metadata(self, record: str) -> Record: # records. Additionally, if the XML is malformed, the parser function may return # multiple records. We need to ensure that exactly one record is parsed. if len(records) != 1: - raise RecordNotFoundError( - f"Record {record} did not match exactly one record in TIND." - ) + raise RecordNotFoundError(f"Record {record} did not match exactly one record in TIND.") return records[0] @@ -78,9 +84,7 @@ def fetch_file(self, file_url: str, output_dir: str = "") -> str: raise ValueError("URL is not a valid TIND file download URL.") output_target = output_dir or self.default_storage_dir - (status, saved_to) = tind_download( - file_url, output_dir=output_target, api_key=self.api_key - ) + (status, saved_to) = tind_download(file_url, output_dir=output_target, api_key=self.api_key) if status != 200: raise RecordNotFoundError("Referenced file could not be downloaded.") @@ -159,7 +163,6 @@ def search(self, query: str, result_format: str = "xml") -> list[Any]: while True: response = self._search_request(query, search_id=search_id) xml, search_id = self._retrieve_xml_search_id(response) - collection = xml.find("{http://www.loc.gov/MARC21/slim}collection") records = list(collection) if collection is not None else [] @@ -174,6 +177,66 @@ def search(self, query: str, result_format: str = "xml") -> list[Any]: return recs + def write_search_results_to_file( + self, query: str = "", output_file_name: str = "tind.xml" + ) -> int: + """Search TIND and stream results to an XML file. + + :param str query: A TIND search query string. + :param str output_file_name: filename for the output XML file. + :returns int: The number of records written to the file. + """ + if not output_file_name.endswith(".xml"): + raise ValueError("output_file_name must be a string ending with .xml") + + total_hits = len(self.fetch_ids_search(query)) + if total_hits == 0: + return 0 + + recs_written = 0 + output_path = os.path.join(self.default_storage_dir, output_file_name) + try: + with open(output_path, "w", encoding="utf-8") as f: + f.write(f'\n\n') + for record in self._iter_xml_records(query): + record_xml = E.tostring(record, encoding="unicode") + f.write(record_xml.replace(_NS_DECL, "")) + f.write("\n") + recs_written += 1 + if recs_written == 0: + # We expected records but didn't receive any through pagination + raise TINDError(f"Matched {total_hits} tind ids, but API did not return any.") + f.write("\n") + except Exception: + Path(output_path).unlink(missing_ok=True) + raise + + if recs_written != total_hits: + raise TINDError(f"Expected {total_hits} records, but wrote {recs_written} to file.") + return recs_written + + def _iter_xml_records(self, query: str) -> Iterator[E.Element]: + """Yield every ```` element from all pages of a search. + + Issues the initial search request, then yields records one at a time, + and continues to issue paginated search requests until all records have been yielded. + :param str query: A TIND search query string. + :yields: An iterator of XML elements representing the search results. + """ + search_id: str = "" + + while True: + response = self._search_request(query, search_id=search_id) + xml, search_id = self._retrieve_xml_search_id(response) + collection = xml.find(f"{{{NS}}}collection") + if collection is None or len(collection) == 0: + break + + yield from collection + + if not search_id: + break + def _search_request(self, query: str, *, search_id: str | None = None) -> str: """Retrieve a page of MARC data records. @@ -201,8 +264,11 @@ def _retrieve_xml_search_id(self, response: str) -> tuple[E.Element, str]: :returns: A parsable XML element and the search ID for the next page. :rtype: tuple[xml.etree.ElementTree.Element, str] """ - E.register_namespace("", "http://www.loc.gov/MARC21/slim") - xml = E.fromstring(response) + try: + xml = E.fromstring(response) + except E.ParseError as e: + raise TINDError(f"Failed to parse xml response: {e}") from e + search_id = xml.findtext("search_id", default="") return xml, search_id