diff --git a/Gemfile.lock b/Gemfile.lock index c0e0cf4..fb0eb00 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -145,7 +145,7 @@ GEM tzinfo (2.0.6) concurrent-ruby (~> 1.0) unicode-display_width (2.6.0) - uri (1.0.3) + uri (1.0.4) zeitwerk (2.7.2) PLATFORMS diff --git a/lib/folio_api_client.rb b/lib/folio_api_client.rb index 08b9554..e1fb7e2 100644 --- a/lib/folio_api_client.rb +++ b/lib/folio_api_client.rb @@ -55,6 +55,12 @@ def with_token_refresh_attempt_when_unauthorized # one time in responde to a 401 or 403. refresh_auth_token! + # Re-run block + yield + rescue Faraday::ConnectionFailed + # If we make too many rapid requests in a row, FOLIO sometimes disconnects. + # If this happens, we'll sleep for a little while and retry the request. + sleep 5 # Re-run block yield end diff --git a/lib/folio_api_client/finders.rb b/lib/folio_api_client/finders.rb index c1fc4f8..44426bf 100644 --- a/lib/folio_api_client/finders.rb +++ b/lib/folio_api_client/finders.rb @@ -90,25 +90,49 @@ def find_source_record(instance_record_id: nil, instance_record_hrid: nil) source_record_search_results['sourceRecords'].first end - def find_source_marc_records(modified_since: nil, with_965_value: nil, &block) - query = marc_records_query(modified_since: modified_since, with_965_value: with_965_value) + # Retrieve and yield marc records, filtered by the given modified_since and with_965_value parameters. + # NOTE: This method skips staff-suppressed FOLIO records. + def find_source_marc_records(modified_since: nil, with_965_value: nil, &block) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength + # FOLIO does not allow an offset value higher than 9999, but this method needs to be able to retrieve + # result sets that have more than 9999 results, so we break big queries up into a bunch of smaller + # queries that split up the results based on the first character of the instance UUIDs. The first character + # of a UUID is a hex character (0-9 or a-f), which means that we will perform 16 different searches + # (i.e. "all of the results that have instance ids that start with a", then "all of the results that have + # instance ids that start with b", and so on). + + # Since we're splitting up the query into a bunch of different sub-queries, we need to do a + # non-prefix-filtered query first just to get the total number of results. + total_query = response = self.get( + 'search/instances', + marc_records_query(modified_since: modified_since, with_965_value: with_965_value).merge({ limit: 0 }) + ) + total_records = total_query['totalRecords'] - loop do - response = self.get('search/instances', query) - process_marc_for_instance(response['instances'], &block) if block - break if (query[:offset] + query[:limit]) >= response['totalRecords'] + with_uuid_prefixes do |uuid_prefix| + query = marc_records_query(modified_since: modified_since, with_965_value: with_965_value, + uuid_prefix: uuid_prefix) + loop do + response = self.get('search/instances', query) + process_marc_for_instance(response['instances'], total_records, &block) if block + break if (query[:offset] + query[:limit]) >= response['totalRecords'] - query[:offset] += query[:limit] + query[:offset] += query[:limit] + end end end - def process_marc_for_instance(instances, &block) + # UUIDs can start with + def with_uuid_prefixes(&block) + (('0'..'9').to_a + ('a'..'f').to_a).each(&block) + end + + def process_marc_for_instance(instances, total_records, &block) instances.each do |instance| source_record = find_source_record(instance_record_id: instance['id']) next if source_record.nil? # Occasionally, we find an instance record without a source record. Skip these. marc_content = source_record.dig('parsedRecord', 'content') - yield(marc_content) if marc_content && block + yield(marc_content, total_records) if marc_content && block end end @@ -135,7 +159,7 @@ def location_record_query(code: nil) 'Missing query field. Must supply a code.' end - def marc_records_query(modified_since: nil, with_965_value: nil) # rubocop:disable Metrics/MethodLength + def marc_records_query(modified_since: nil, with_965_value: nil, uuid_prefix: nil) # rubocop:disable Metrics/MethodLength,Metrics/CyclomaticComplexity params = { limit: 100, offset: 0 } if modified_since.nil? && with_965_value.nil? @@ -152,6 +176,14 @@ def marc_records_query(modified_since: nil, with_965_value: nil) # rubocop:disab query_parts << "metadata.updatedDate>=\"#{modified_since}\"" if modified_since query_parts << %(identifiers.value="#{with_965_value}") if with_965_value + # Only include non-staff-suppressed records because staff-suppressed records represent deleted records in FOLIO. + # Reminder: staff-suppressed records are NOT the same as discovery-suppressed records. Discovery-suppressed + # records are ones that aren't displayed to the public, and we DO want to include discovery-suppressed records + # in the results returned by this query. + query_parts << 'staffSuppress==false' + + query_parts << %(id="#{uuid_prefix}*") if uuid_prefix + params[:query] = query_parts.join(' and ') params end diff --git a/spec/folio_api_client/finders_spec.rb b/spec/folio_api_client/finders_spec.rb index 1103c07..25ac2ac 100644 --- a/spec/folio_api_client/finders_spec.rb +++ b/spec/folio_api_client/finders_spec.rb @@ -331,12 +331,31 @@ end end + describe '#with_uuid_prefixes' do + it 'yields the expected prefixes' do + expect { |b| + instance.with_uuid_prefixes(&b) + }.to yield_successive_args( + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', + 'a', 'b', 'c', 'd', 'e', 'f' + ) + end + end + describe '#find_source_marc_records' do let(:modified_since) { '2025-01-01T00:00:00Z' } let(:value_965) { '965hyacinth' } let(:marc_content) { { 'fields' => [{ '001' => '12345' }] } } - let(:instance_record) { { 'id' => 'instance-123' } } - let(:instances_response) { { 'totalRecords' => 1, 'instances' => [instance_record] } } + let(:instance_record_1) { { 'id' => 'instance-123' } } + let(:instance_record_2) { { 'id' => 'instance-456' } } + let(:instances) { [instance_record_1, instance_record_2] } + let(:total_records) { instances.length } + let(:instances_response) { { 'totalRecords' => total_records, 'instances' => instances } } + let(:uuid_prefixes) do + arr = [] + instance.with_uuid_prefixes { |uuid_prefix| arr << uuid_prefix } + arr + end before do allow(instance).to receive(:find_source_record).and_return({ 'parsedRecord' => { 'content' => marc_content } }) @@ -344,15 +363,85 @@ context 'with modified_since parameter' do before do + # Allow first query for getting total result count allow(instance).to receive(:get).with( - 'search/instances', { query: "metadata.updatedDate>=\"#{modified_since}\"", limit: 100, offset: 0 } + 'search/instances', + { query: "metadata.updatedDate>=\"#{modified_since}\" and staffSuppress==false", limit: 0, offset: 0 } ).and_return(instances_response) + + # Allow uuid-prefix-bucket queries for aggregating actual results + uuid_prefixes.each do |uuid_prefix| + allow(instance).to receive(:get).with( + 'search/instances', + { + query: %(metadata.updatedDate>="#{modified_since}" and staffSuppress==false and id="#{uuid_prefix}*"), + limit: 100, + offset: 0 + } + ).and_return(instances_response) + end end - it 'yields MARC content for each source record' do - yielded_records = [] - instance.find_source_marc_records(modified_since: modified_since) { |record| yielded_records << record } - expect(yielded_records).to eq([marc_content]) + it 'retrieves records in batches based on uuid prefix' do + # Expect the first /search/instances query to be one that just fetches the total record count + # (and has a limit of 0 because we don't need results to be returned for the record count query). + expect(instance).to receive(:get).with( + 'search/instances', + { query: "metadata.updatedDate>=\"#{modified_since}\" and staffSuppress==false", limit: 0, offset: 0 } + ) + + # Then expect queries for each uuid prefix batch + uuid_prefixes.each do |uuid_prefix| + expect(instance).to receive(:get).with( + 'search/instances', + { + query: %(metadata.updatedDate>="#{modified_since}" and staffSuppress==false and id="#{uuid_prefix}*"), + limit: 100, + offset: 0 + } + ).and_return(instances_response) + end + + instance.find_source_marc_records(modified_since: modified_since) { |_| } + end + + it 'yields MARC content for each source record' do # rubocop:disable RSpec/ExampleLength + expect { |b| + instance.find_source_marc_records(modified_since: modified_since, &b) + }.to yield_successive_args( + [marc_content, total_records], # batch 0, record 1 + [marc_content, total_records], # batch 0, record 2 + [marc_content, total_records], # batch 1, record 1 + [marc_content, total_records], # batch 1, record 2 + [marc_content, total_records], # batch 2, record 1 + [marc_content, total_records], # batch 2, record 2 + [marc_content, total_records], # batch 3, record 1 + [marc_content, total_records], # batch 3, record 2 + [marc_content, total_records], # batch 4, record 1 + [marc_content, total_records], # batch 4, record 2 + [marc_content, total_records], # batch 5, record 1 + [marc_content, total_records], # batch 5, record 2 + [marc_content, total_records], # batch 6, record 1 + [marc_content, total_records], # batch 6, record 2 + [marc_content, total_records], # batch 7, record 1 + [marc_content, total_records], # batch 7, record 2 + [marc_content, total_records], # batch 8, record 1 + [marc_content, total_records], # batch 8, record 2 + [marc_content, total_records], # batch 9, record 1 + [marc_content, total_records], # batch 9, record 2 + [marc_content, total_records], # batch a, record 1 + [marc_content, total_records], # batch a, record 2 + [marc_content, total_records], # batch b, record 1 + [marc_content, total_records], # batch b, record 2 + [marc_content, total_records], # batch c, record 1 + [marc_content, total_records], # batch c, record 2 + [marc_content, total_records], # batch d, record 1 + [marc_content, total_records], # batch d, record 2 + [marc_content, total_records], # batch e, record 1 + [marc_content, total_records], # batch e, record 2 + [marc_content, total_records], # batch f, record 1 + [marc_content, total_records] # batch f, record 2 + ) end it 'raises an exception if the given modified_since parameter is an invalid format' do @@ -364,37 +453,85 @@ context 'with with_965_value parameter' do before do + # Allow first query for getting total result count allow(instance).to receive(:get).with( - 'search/instances', { query: %(identifiers.value="#{value_965}"), limit: 100, offset: 0 } + 'search/instances', + { query: %(identifiers.value="#{value_965}" and staffSuppress==false), limit: 0, offset: 0 } ).and_return(instances_response) + + # Allow uuid-prefix-bucket queries for aggregating actual results + uuid_prefixes.each do |uuid_prefix| + allow(instance).to receive(:get).with( + 'search/instances', + { + query: %(identifiers.value="#{value_965}" and staffSuppress==false and id="#{uuid_prefix}*"), + limit: 100, + offset: 0 + } + ).and_return(instances_response) + end end it 'uses the correct query' do instance.find_source_marc_records(with_965_value: value_965) { |_| } - expect(instance).to have_received(:get).with( - 'search/instances', { query: 'identifiers.value="965hyacinth"', limit: 100, offset: 0 } - ) + + # Then expect queries for each uuid prefix batch + uuid_prefixes.each do |uuid_prefix| + expect(instance).to have_received(:get).with( + 'search/instances', + { + query: %(identifiers.value="#{value_965}" and staffSuppress==false and id="#{uuid_prefix}*"), + limit: 100, + offset: 0 + } + ) + end end end context 'with modified_since and with_965_value parameters' do before do + # Allow first query for getting total result count allow(instance).to receive(:get).with( - 'search/instances', { - query: %(metadata.updatedDate>="#{modified_since}" and identifiers.value="#{value_965}"), - limit: 100, offset: 0 + 'search/instances', + { + query: %(metadata.updatedDate>="#{modified_since}" and identifiers.value="#{value_965}" and staffSuppress==false), # rubocop:disable Layout/LineLength + limit: 0, + offset: 0 } ).and_return(instances_response) + + puts %( + multiline! + cool! + cool! + ) + + # Allow uuid-prefix-bucket queries for aggregating actual results + uuid_prefixes.each do |uuid_prefix| + allow(instance).to receive(:get).with( + 'search/instances', + { + query: %(metadata.updatedDate>="#{modified_since}" and identifiers.value="#{value_965}" and staffSuppress==false and id="#{uuid_prefix}*"), # rubocop:disable Layout/LineLength + limit: 100, + offset: 0 + } + ).and_return(instances_response) + end end it 'combines both filters' do instance.find_source_marc_records(modified_since: modified_since, with_965_value: value_965) { |_| } - expect(instance).to have_received(:get).with( - 'search/instances', { - query: %(metadata.updatedDate>="#{modified_since}" and identifiers.value="#{value_965}"), - limit: 100, offset: 0 - } - ) + + # Then expect queries for each uuid prefix batch + uuid_prefixes.each do |uuid_prefix| + expect(instance).to have_received(:get).with( + 'search/instances', { + query: %(metadata.updatedDate>="#{modified_since}" and identifiers.value="#{value_965}" and staffSuppress==false and id="#{uuid_prefix}*"), # rubocop:disable Layout/LineLength + limit: 100, offset: 0 + } + ) + end end end