From d4b190c3817bdc82ffe584a40b3b6e332373ed18 Mon Sep 17 00:00:00 2001 From: James Long Date: Mon, 7 Jul 2025 16:01:33 +0000 Subject: [PATCH 01/19] Add dataset migration --- .../20250707155320_create_statcan_datasets.rb | 17 +++++++++++++++++ db/schema.rb | 16 +++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 db/migrate/20250707155320_create_statcan_datasets.rb diff --git a/db/migrate/20250707155320_create_statcan_datasets.rb b/db/migrate/20250707155320_create_statcan_datasets.rb new file mode 100644 index 0000000..388b7ca --- /dev/null +++ b/db/migrate/20250707155320_create_statcan_datasets.rb @@ -0,0 +1,17 @@ +class CreateStatcanDatasets < ActiveRecord::Migration[8.0] + def change + create_table :statcan_datasets do |t| + t.text :statcan_url, null: false + t.string :name, null: false + t.string :sync_schedule, null: false + t.jsonb :current_data + t.timestamp :last_synced_at + + t.timestamps + end + + add_index :statcan_datasets, :statcan_url, unique: true + add_index :statcan_datasets, :name, unique: true + add_index :statcan_datasets, :last_synced_at + end +end diff --git a/db/schema.rb b/db/schema.rb index 89b6810..0fa7e64 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[8.0].define(version: 2025_06_30_162816) do +ActiveRecord::Schema[8.0].define(version: 2025_07_07_155320) do # These are extensions that must be enabled in order to support this database enable_extension "pg_catalog.plpgsql" @@ -158,6 +158,7 @@ t.string "language" t.string "url" t.jsonb "raw" + t.string "source_url" t.bigint "government_id", null: false t.datetime "created_at", null: false t.datetime "updated_at", null: false @@ -365,6 +366,19 @@ t.datetime "updated_at", null: false end + create_table "statcan_datasets", force: :cascade do |t| + t.text "statcan_url", null: false + t.string "name", null: false + t.string "sync_schedule", null: false + t.jsonb "current_data" + t.datetime "last_synced_at", precision: nil + t.datetime "created_at", null: false + t.datetime "updated_at", null: false + t.index ["last_synced_at"], name: "index_statcan_datasets_on_last_synced_at" + t.index ["name"], name: "index_statcan_datasets_on_name", unique: true + t.index ["statcan_url"], name: "index_statcan_datasets_on_statcan_url", unique: true + end + create_table "tool_calls", force: :cascade do |t| t.bigint "message_id", null: false t.string "tool_call_id" From cb81dbcb65a6b95982be8c820a905ecc42fb3349 Mon Sep 17 00:00:00 2001 From: James Long Date: Mon, 7 Jul 2025 18:02:40 +0000 Subject: [PATCH 02/19] Add Fugit gemfile (for cron expression parsing) --- Gemfile | 3 +++ Gemfile.lock | 1 + 2 files changed, 4 insertions(+) diff --git a/Gemfile b/Gemfile index a718f87..0c8c36a 100644 --- a/Gemfile +++ b/Gemfile @@ -11,6 +11,9 @@ gem "good_job", "~> 4.10", ">= 4.10.2" # Build JSON APIs with ease [https://github.com/rails/jbuilder] gem "jbuilder" +# Use Fugit for parsing/validating cron expressions +gem "fugit", "~> 1.11" + # Use Active Model has_secure_password [https://guides.rubyonrails.org/active_model_basics.html#securepassword] # gem "bcrypt", "~> 3.1.7" diff --git a/Gemfile.lock b/Gemfile.lock index 3a2d587..c981c89 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -432,6 +432,7 @@ DEPENDENCIES devise (~> 4.9) dotenv feedjira (~> 3.2) + fugit (~> 1.11) good_job (~> 4.10, >= 4.10.2) http (~> 5.3) iconv (~> 1.1) From 70a07eb3473152d8e2863df19e44df468a81b18d Mon Sep 17 00:00:00 2001 From: James Long Date: Mon, 7 Jul 2025 18:09:06 +0000 Subject: [PATCH 03/19] Add StatcanDataset model and tests --- app/models/statcan_dataset.rb | 17 ++++ test/models/statcan_dataset_test.rb | 122 ++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 app/models/statcan_dataset.rb create mode 100644 test/models/statcan_dataset_test.rb diff --git a/app/models/statcan_dataset.rb b/app/models/statcan_dataset.rb new file mode 100644 index 0000000..4ec833d --- /dev/null +++ b/app/models/statcan_dataset.rb @@ -0,0 +1,17 @@ +class StatcanDataset < ApplicationRecord + validates :statcan_url, presence: true, uniqueness: true, format: { with: URI::DEFAULT_PARSER.make_regexp } + validates :name, presence: true, uniqueness: true, format: { with: /\A[a-z0-9-]+\z/, message: "must be lowercase with hyphens only" } + validates :sync_schedule, presence: true + validate :valid_cron_expression + + private + + def valid_cron_expression + return unless sync_schedule.present? + + parsed_cron = Fugit::Cron.parse(sync_schedule) + if parsed_cron.nil? + errors.add(:sync_schedule, "must be a valid cron expression") + end + end +end diff --git a/test/models/statcan_dataset_test.rb b/test/models/statcan_dataset_test.rb new file mode 100644 index 0000000..76a90e2 --- /dev/null +++ b/test/models/statcan_dataset_test.rb @@ -0,0 +1,122 @@ +# test/models/statcan_dataset_test.rb +require "test_helper" + +class StatcanDatasetTest < ActiveSupport::TestCase + test "valid dataset" do + dataset = StatcanDataset.new( + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", + name: "demographic-incomes", + sync_schedule: "0 6 * * *" + ) + + assert dataset.valid? + end + + test "requires statcan_url" do + dataset = StatcanDataset.new( + name: "demographic-incomes", + sync_schedule: "0 6 * * *" + ) + + assert_not dataset.valid? + assert_includes dataset.errors[:statcan_url], "can't be blank" + end + + test "requires name" do + dataset = StatcanDataset.new( + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", + sync_schedule: "0 6 * * *" + ) + + assert_not dataset.valid? + assert_includes dataset.errors[:name], "can't be blank" + end + + test "requires sync_schedule" do + dataset = StatcanDataset.new( + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", + name: "demographic-incomes" + ) + + assert_not dataset.valid? + assert_includes dataset.errors[:sync_schedule], "can't be blank" + end + + test "statcan_url must be unique" do + url = "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701" + + StatcanDataset.create!( + statcan_url: url, + name: "first-dataset", + sync_schedule: "0 6 * * *" + ) + + duplicate = StatcanDataset.new( + statcan_url: url, + name: "second-dataset", + sync_schedule: "0 12 * * *" + ) + + assert_not duplicate.valid? + assert_includes duplicate.errors[:statcan_url], "has already been taken" + end + + test "name must be unique" do + name = "demographic-incomes" + + StatcanDataset.create!( + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", + name: name, + sync_schedule: "0 6 * * *" + ) + + duplicate = StatcanDataset.new( + statcan_url: "https://different-url.statcan.gc.ca/data", + name: name, + sync_schedule: "0 12 * * *" + ) + + assert_not duplicate.valid? + assert_includes duplicate.errors[:name], "has already been taken" + end + + test "name must be kebab-case" do + dataset = StatcanDataset.new( + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", + name: "Invalid Name", + sync_schedule: "0 6 * * *" + ) + + assert_not dataset.valid? + assert_includes dataset.errors[:name], "must be lowercase with hyphens only" + end + + test "sync_schedule accepts valid cron expressions" do + valid_schedules = [ "0 6 * * *", "30 14 1 * *", "0 * * * 0", "15 9 * * 1-5" ] + + valid_schedules.each do |schedule| + dataset = StatcanDataset.new( + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", + name: "test-dataset-#{schedule.hash}", + sync_schedule: schedule + ) + + assert dataset.valid?, "#{schedule} should be valid" + end + end + + test "sync_schedule rejects invalid cron expressions" do + invalid_schedules = [ "invalid", "60 25 32 13 8", "not a cron" ] + + invalid_schedules.each do |schedule| + dataset = StatcanDataset.new( + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", + name: "test-dataset-#{schedule.hash}", + sync_schedule: schedule + ) + + assert_not dataset.valid?, "#{schedule} should be invalid" + assert_includes dataset.errors[:sync_schedule], "must be a valid cron expression" + end + end +end From 5eefd9aeb7d6099eadc724adf4b90fd2ed320b9a Mon Sep 17 00:00:00 2001 From: James Long Date: Mon, 7 Jul 2025 21:26:00 +0000 Subject: [PATCH 04/19] Add CSV gem explicitly (not included in Ruby 3.4+) --- Gemfile | 8 ++++---- Gemfile.lock | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Gemfile b/Gemfile index 0c8c36a..91161b1 100644 --- a/Gemfile +++ b/Gemfile @@ -11,8 +11,6 @@ gem "good_job", "~> 4.10", ">= 4.10.2" # Build JSON APIs with ease [https://github.com/rails/jbuilder] gem "jbuilder" -# Use Fugit for parsing/validating cron expressions -gem "fugit", "~> 1.11" # Use Active Model has_secure_password [https://guides.rubyonrails.org/active_model_basics.html#securepassword] # gem "bcrypt", "~> 3.1.7" @@ -32,8 +30,6 @@ gem "rack-cors" # Search gem "ransack", "~> 4.2.1" - - group :development, :test do # See https://guides.rubyonrails.org/debugging_rails_applications.html#debugging-with-the-debug-gem gem "debug", platforms: %i[ mri windows ], require: "debug/prelude" @@ -46,6 +42,8 @@ group :development, :test do end gem "avo", ">= 3.2" +gem "csv", "~> 3.3" + gem "devise", "~> 4.9" gem "importmap-rails", "~> 2.1" @@ -59,6 +57,8 @@ gem "dotenv", groups: [ :development, :test ] gem "feedjira", "~> 3.2" +gem "fugit", "~> 1.11" + gem "http", "~> 5.3" gem "iconv", "~> 1.1" diff --git a/Gemfile.lock b/Gemfile.lock index c981c89..e33e49f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -132,6 +132,7 @@ GEM concurrent-ruby (1.3.5) connection_pool (2.5.3) crass (1.0.6) + csv (3.3.5) date (3.4.1) debug (1.11.0) irb (~> 1.10) @@ -428,6 +429,7 @@ DEPENDENCIES bootsnap brakeman commonmarker (~> 2.3) + csv (~> 3.3) debug devise (~> 4.9) dotenv From ded351139cf54bb3d98b9a66373b9df6ae5805d7 Mon Sep 17 00:00:00 2001 From: James Long Date: Mon, 7 Jul 2025 21:26:18 +0000 Subject: [PATCH 05/19] Add service to fetch and parse StatCan datasets --- app/services/statcan_fetcher.rb | 17 +++++++++++++++++ test/models/statcan_dataset_test.rb | 1 - 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 app/services/statcan_fetcher.rb diff --git a/app/services/statcan_fetcher.rb b/app/services/statcan_fetcher.rb new file mode 100644 index 0000000..887b3fa --- /dev/null +++ b/app/services/statcan_fetcher.rb @@ -0,0 +1,17 @@ +require "csv" + +class StatcanFetcher + def self.fetch(url) + response = HTTP + .timeout(connect: 10, read: 60) + .headers("User-Agent" => "BuildCanada/OutcomeTrackerAPI") + .get(url) + + csv_string = response.body.to_s + + # Remove UTF-8 Byte Order Mark (BOM) if present + csv_string = csv_string.sub(/\A\uFEFF/, "") + + CSV.parse(csv_string, headers: true, liberal_parsing: true, skip_blanks: true).map(&:to_h) + end +end diff --git a/test/models/statcan_dataset_test.rb b/test/models/statcan_dataset_test.rb index 76a90e2..9111b0a 100644 --- a/test/models/statcan_dataset_test.rb +++ b/test/models/statcan_dataset_test.rb @@ -1,4 +1,3 @@ -# test/models/statcan_dataset_test.rb require "test_helper" class StatcanDatasetTest < ActiveSupport::TestCase From 06965e06bc1f622512a9b75428fc13c50acbf044 Mon Sep 17 00:00:00 2001 From: James Long Date: Mon, 7 Jul 2025 21:53:39 +0000 Subject: [PATCH 06/19] Add job to sync datasets --- app/jobs/statcan_sync_job.rb | 13 ++++++++++++ test/jobs/statcan_sync_job_test.rb | 34 ++++++++++++++++++++++++++++++ test/test_helper.rb | 1 + 3 files changed, 48 insertions(+) create mode 100644 app/jobs/statcan_sync_job.rb create mode 100644 test/jobs/statcan_sync_job_test.rb diff --git a/app/jobs/statcan_sync_job.rb b/app/jobs/statcan_sync_job.rb new file mode 100644 index 0000000..8ffa2ac --- /dev/null +++ b/app/jobs/statcan_sync_job.rb @@ -0,0 +1,13 @@ +class StatcanSyncJob < ApplicationJob + queue_as :default + + def perform(statcan_dataset_id) + dataset = StatcanDataset.find(statcan_dataset_id) + data = StatcanFetcher.fetch(dataset.statcan_url) + + dataset.update!( + current_data: data, + last_synced_at: Time.current + ) + end +end diff --git a/test/jobs/statcan_sync_job_test.rb b/test/jobs/statcan_sync_job_test.rb new file mode 100644 index 0000000..28d1f7a --- /dev/null +++ b/test/jobs/statcan_sync_job_test.rb @@ -0,0 +1,34 @@ +require "test_helper" + +class StatcanSyncJobTest < ActiveJob::TestCase + def setup + @dataset = StatcanDataset.create!( + name: "test-dataset", + statcan_url: "https://statcan.gc.ca/test.csv", + sync_schedule: "0 0 * * *" + ) + end + + test "should update dataset with fetched data" do + parsed_data = [ { "year" => 2020, "population" => 38000000 } ] + + StatcanFetcher.stub :fetch, parsed_data do + StatcanSyncJob.perform_now(@dataset.id) + end + + @dataset.reload + assert_equal parsed_data, @dataset.current_data + assert_not_nil @dataset.last_synced_at + end + + test "should not update dataset when fetch times out" do + StatcanFetcher.stub :fetch, ->(url) { raise HTTP::TimeoutError.new("Request timed out") } do + assert_raises HTTP::TimeoutError do + StatcanSyncJob.perform_now(@dataset.id) + end + end + + @dataset.reload + assert_nil @dataset.current_data + end +end diff --git a/test/test_helper.rb b/test/test_helper.rb index 0c22470..332896f 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -1,6 +1,7 @@ ENV["RAILS_ENV"] ||= "test" require_relative "../config/environment" require "rails/test_help" +require "minitest/mock" module ActiveSupport class TestCase From 62aa3802d96298fe910645a2a917d9e2233aa5f1 Mon Sep 17 00:00:00 2001 From: James Long Date: Mon, 7 Jul 2025 22:22:27 +0000 Subject: [PATCH 07/19] Add syncing/stale detection logic --- app/models/statcan_dataset.rb | 13 ++++++ test/models/statcan_dataset_test.rb | 64 +++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/app/models/statcan_dataset.rb b/app/models/statcan_dataset.rb index 4ec833d..2b8e9f7 100644 --- a/app/models/statcan_dataset.rb +++ b/app/models/statcan_dataset.rb @@ -4,6 +4,19 @@ class StatcanDataset < ApplicationRecord validates :sync_schedule, presence: true validate :valid_cron_expression + def self.filter_stale(datasets, current_time = Time.current) + datasets.select { |dataset| dataset.needs_sync?(current_time) } + end + + def needs_sync?(current_time = Time.current) + return true if last_synced_at.nil? + + cron = Fugit::Cron.parse(sync_schedule) + last_scheduled_time = cron.previous_time(current_time) + + last_synced_at.to_i < last_scheduled_time.seconds + end + private def valid_cron_expression diff --git a/test/models/statcan_dataset_test.rb b/test/models/statcan_dataset_test.rb index 9111b0a..f29ec9d 100644 --- a/test/models/statcan_dataset_test.rb +++ b/test/models/statcan_dataset_test.rb @@ -118,4 +118,68 @@ class StatcanDatasetTest < ActiveSupport::TestCase assert_includes dataset.errors[:sync_schedule], "must be a valid cron expression" end end + + test "needs_sync returns true when last_synced_at is nil" do + dataset = StatcanDataset.create!( + name: "test-dataset", + statcan_url: "https://statcan.gc.ca/test.csv", + sync_schedule: "0 0 * * *" + ) + + current_time = Time.parse("2025-01-02 14:00:00") + assert dataset.needs_sync?(current_time) + end + + test "needs_sync returns true when last sync was before last scheduled time" do + dataset = StatcanDataset.create!( + name: "test-dataset-old", + statcan_url: "https://statcan.gc.ca/test-old.csv", + sync_schedule: "0 0 * * *", # Daily at midnight + last_synced_at: Time.parse("2025-01-01 23:00:00") # Yesterday 11pm + ) + + current_time = Time.parse("2025-01-02 14:00:00") # 2pm next day + assert dataset.needs_sync?(current_time) + end + + test "needs_sync returns false when last sync was after last scheduled time" do + dataset = StatcanDataset.create!( + name: "test-dataset-fresh", + statcan_url: "https://statcan.gc.ca/test-fresh.csv", + sync_schedule: "0 0 * * *", # Daily at midnight + last_synced_at: Time.parse("2025-01-02 01:00:00") # 1am today + ) + + current_time = Time.parse("2025-01-02 14:00:00") # 2pm same day + assert_not dataset.needs_sync?(current_time) + end + + test "filter_stale returns datasets that need syncing" do + stale_dataset = StatcanDataset.create!( + name: "stale-one", + statcan_url: "https://statcan.gc.ca/stale1.csv", + sync_schedule: "0 0 * * *", # Daily at midnight + last_synced_at: Time.parse("2025-01-01 23:00:00") # Yesterday 11pm + ) + + fresh_dataset = StatcanDataset.create!( + name: "fresh-one", + statcan_url: "https://statcan.gc.ca/fresh.csv", + sync_schedule: "0 0 * * *", # Daily at midnight + last_synced_at: Time.parse("2025-01-02 01:00:00") # 1am today + ) + + current_time = Time.parse("2025-01-02 14:00:00") # 2pm same day + all_datasets = [ stale_dataset, fresh_dataset ] + stale_datasets = StatcanDataset.filter_stale(all_datasets, current_time) + + assert_includes stale_datasets, stale_dataset + assert_not_includes stale_datasets, fresh_dataset + assert_equal 1, stale_datasets.length +end + + test "filter_stale works with empty collection" do + stale_datasets = StatcanDataset.filter_stale([]) + assert_empty stale_datasets + end end From 1b249955af652dde5dde9761d4a227137a0cc1a5 Mon Sep 17 00:00:00 2001 From: James Long Date: Mon, 7 Jul 2025 22:42:58 +0000 Subject: [PATCH 08/19] Add cron job to schedule StatCan dataset syncs --- app/jobs/statcan_cron_job.rb | 14 ++++++++ config/environments/test.rb | 2 +- test/jobs/statcan_cron_job_test.rb | 57 ++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 app/jobs/statcan_cron_job.rb create mode 100644 test/jobs/statcan_cron_job_test.rb diff --git a/app/jobs/statcan_cron_job.rb b/app/jobs/statcan_cron_job.rb new file mode 100644 index 0000000..3cf400d --- /dev/null +++ b/app/jobs/statcan_cron_job.rb @@ -0,0 +1,14 @@ +class StatcanCronJob < ApplicationJob + queue_as :default + + def perform(current_time = Time.current) + datasets = StatcanDataset.select(:id, :sync_schedule, :last_synced_at) + stale_datasets = StatcanDataset.filter_stale(datasets, current_time) + + stale_datasets.each do |dataset| + StatcanSyncJob.perform_later(dataset.id) + end + + Rails.logger.info "Enqueued #{stale_datasets.count} Statcan sync jobs" + end +end diff --git a/config/environments/test.rb b/config/environments/test.rb index 30cc5cd..5f4c96a 100644 --- a/config/environments/test.rb +++ b/config/environments/test.rb @@ -52,5 +52,5 @@ config.action_controller.raise_on_missing_callback_actions = true # Use inline adapter for Active Job in tests for faster execution - config.active_job.queue_adapter = :inline + config.active_job.queue_adapter = :test end diff --git a/test/jobs/statcan_cron_job_test.rb b/test/jobs/statcan_cron_job_test.rb new file mode 100644 index 0000000..d10f53f --- /dev/null +++ b/test/jobs/statcan_cron_job_test.rb @@ -0,0 +1,57 @@ +require "test_helper" + +class StatcanCronJobTest < ActiveJob::TestCase + test "should enqueue sync jobs for stale datasets only" do + current_time = Time.parse("2025-01-02 14:00:00") # 2pm + + # Create a stale dataset (never synced) + stale_dataset1 = StatcanDataset.create!( + name: "stale-never-synced", + statcan_url: "https://statcan.gc.ca/stale1.csv", + sync_schedule: "0 0 * * *", + last_synced_at: nil + ) + + # Create another stale dataset (old sync) + stale_dataset2 = StatcanDataset.create!( + name: "stale-old-sync", + statcan_url: "https://statcan.gc.ca/stale2.csv", + sync_schedule: "0 0 * * *", + last_synced_at: Time.parse("2025-01-01 23:00:00") # Yesterday 11pm + ) + + # Create a fresh dataset (recent sync) + _fresh_dataset = StatcanDataset.create!( + name: "fresh-dataset", + statcan_url: "https://statcan.gc.ca/fresh.csv", + sync_schedule: "0 0 * * *", + last_synced_at: Time.parse("2025-01-02 01:00:00") # 1am today + ) + + # Track enqueued jobs + assert_enqueued_jobs 2, only: StatcanSyncJob do + StatcanCronJob.perform_now(current_time) + end + + # Verify the correct jobs were enqueued + assert_enqueued_with(job: StatcanSyncJob, args: [ stale_dataset1.id ]) + assert_enqueued_with(job: StatcanSyncJob, args: [ stale_dataset2.id ]) + end + + test "should not enqueue jobs when no datasets need syncing" do + current_time = Time.parse("2025-01-02 14:00:00") + + # Create only fresh datasets + StatcanDataset.create!( + name: "fresh-dataset-1", + statcan_url: "https://statcan.gc.ca/fresh1.csv", + sync_schedule: "0 0 * * *", + last_synced_at: Time.parse("2025-01-02 01:00:00") # 1am today + ) + + # Should not enqueue any jobs + assert_enqueued_jobs 0, only: StatcanSyncJob do + StatcanCronJob.perform_now(current_time) + end + end +end From 915e0a7389cf7ea917c342050b00c05635f5eaf2 Mon Sep 17 00:00:00 2001 From: James Long Date: Mon, 7 Jul 2025 23:32:21 +0000 Subject: [PATCH 09/19] Schedule statcan cron job to run every hour --- config/initializers/good_job.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/config/initializers/good_job.rb b/config/initializers/good_job.rb index 322bed7..01cee83 100644 --- a/config/initializers/good_job.rb +++ b/config/initializers/good_job.rb @@ -16,6 +16,10 @@ class: "FeedRefresherJob", # name of the job class as a String; must reference an Active Job job class description: "Refreshed feeds and creates new entries", # optional description that appears in Dashboard, enabled_by_default: -> { Rails.env.production? } # Only enable in production, otherwise can be enabled manually through Dashboard + }, + statcan_sync: { + cron: "0 * * * *", # Every hour + class: "StatcanCronJob" } } From 0abca463c8ea09bde15290a7e4c911d2abd85cf2 Mon Sep 17 00:00:00 2001 From: James Long Date: Mon, 7 Jul 2025 23:51:30 +0000 Subject: [PATCH 10/19] Add StatcanDatasets setup task --- lib/tasks/statcan.rake | 66 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 lib/tasks/statcan.rake diff --git a/lib/tasks/statcan.rake b/lib/tasks/statcan.rake new file mode 100644 index 0000000..79f8831 --- /dev/null +++ b/lib/tasks/statcan.rake @@ -0,0 +1,66 @@ +namespace :statcan do + desc "Setup Statcan datasets" + task setup_datasets: :environment do + statcan_datasets = [ + { + name: "balance-sheets", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=1010001501&latestN=0&startDate=19901001&endDate=&csvLocale=en&selectedMembers=%5B%5B1%5D%2C%5B2%5D%2C%5B%5D%5D&checkedLevels=2D1%2C2D2%2C2D3", + sync_schedule: "23 6 * * *" # Daily at 6:23 AM + }, + { + name: "demographic-incomes-non-permanent-residents", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=1110009101&latestN=2&startDate=&endDate=&csvLocale=en&selectedMembers=%5B%5B1%5D%2C%5B%5D%2C%5B%5D%2C%5B%5D%2C%5B5%5D%5D&checkedLevels=1D1%2C1D2%2C1D3%2C2D1%2C3D1", + sync_schedule: "23 8 * * *" # Daily at 8:23 AM + }, + { + name: "gdp", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=3610010401&latestN=0&startDate=19610101&endDate=&csvLocale=en&selectedMembers=%5B%5B1%5D%2C%5B1%5D%2C%5B1%5D%2C%5B%5D%5D&checkedLevels=3D1%2C3D2%2C3D3%2C3D4", + sync_schedule: "23 6 * * *" # Daily at 6:23 AM + }, + { + name: "housing-starts", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData.action?pid=3410015101&latestN=0&startDate=19880101&endDate=&csvLocale=en&selectedMembers=%5B%5B%5D%2C%5B1%5D%2C%5B%5D%5D&checkedLevels=0D1%2C2D1%2C2D2", + sync_schedule: "23 6 * * *" # Daily at 6:23 AM + }, + { + name: "labour-productivity", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=3610020701&latestN=0&startDate=19801001&endDate=&csvLocale=en&selectedMembers=%5B%5B1%5D%2C%5B5%5D%2C%5B1%2C2%2C3%2C4%2C5%2C6%2C7%2C8%2C9%2C10%2C11%2C13%2C14%2C15%2C16%2C17%2C18%2C19%2C20%2C21%5D%5D&checkedLevels=", + sync_schedule: "23 9 * * *" # Daily at 9:23 AM + }, + { + name: "non-permanent-residents", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData.action?pid=1710012101&latestN=0&startDate=20210101&endDate=&csvLocale=en&selectedMembers=%5B%5B%5D%2C%5B%5D%5D&checkedLevels=0D1%2C1D1%2C1D2%2C1D3", + sync_schedule: "23 7 * * *" # Daily at 7:23 AM + }, + { + name: "population", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=1710000901&latestN=0&startDate=19000101&endDate=&csvLocale=en&selectedMembers=%5B%5B1%2C2%2C3%2C4%2C5%2C6%2C7%2C8%2C9%2C10%2C11%2C12%2C14%2C15%5D%5D&checkedLevels=", + sync_schedule: "23 6 * * *" # Daily at 6:23 AM + }, + { + name: "primary-energy-production", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=2510007901&latestN=5&startDate=&endDate=&csvLocale=en&selectedMembers=%5B%5B%5D%2C%5B%5D%2C%5B%5D%5D&checkedLevels=0D1%2C1D1%2C1D2%2C1D3%2C2D1", + sync_schedule: "23 10 * * *" # Daily at 10:23 AM + } + ] + + puts "Setting up StatcanDatasets..." + + statcan_datasets.each do |dataset_attrs| + dataset = StatcanDataset.find_or_create_by(name: dataset_attrs[:name]) do |d| + d.statcan_url = dataset_attrs[:statcan_url] + d.sync_schedule = dataset_attrs[:sync_schedule] + end + + if dataset.persisted? + if dataset.previously_new_record? + puts "✓ #{dataset.name} - created" + else + puts "✓ #{dataset.name} - already exists" + end + else + puts "✗ #{dataset.name} - failed to create: #{dataset.errors.full_messages.join(', ')}" + end + end + end +end From 0c937b386137ef0e529f2d893442cda46b82491f Mon Sep 17 00:00:00 2001 From: James Long Date: Tue, 8 Jul 2025 16:14:35 +0000 Subject: [PATCH 11/19] Add GET route for StatCan datasets --- .../statcan_datasets_controller.rb | 6 ++++ config/routes.rb | 1 + .../statcan_datasets_controller_test.rb | 28 +++++++++++++++++++ 3 files changed, 35 insertions(+) create mode 100644 app/controllers/statcan_datasets_controller.rb create mode 100644 test/controllers/statcan_datasets_controller_test.rb diff --git a/app/controllers/statcan_datasets_controller.rb b/app/controllers/statcan_datasets_controller.rb new file mode 100644 index 0000000..429c6dc --- /dev/null +++ b/app/controllers/statcan_datasets_controller.rb @@ -0,0 +1,6 @@ +class StatcanDatasetsController < ApplicationController + def show + dataset = StatcanDataset.find(params[:id]) + render json: dataset + end +end diff --git a/config/routes.rb b/config/routes.rb index 0d5c6be..9c2b7c1 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -18,6 +18,7 @@ resources :promises, only: [ :index, :show ] resources :evidences, only: [ :index, :show ] resources :builders, only: [ :index, :show ] + resources :statcan_datasets, only: [ :show ] namespace :admin do resources :promises, only: [ :index, :show, :update, :destroy ] diff --git a/test/controllers/statcan_datasets_controller_test.rb b/test/controllers/statcan_datasets_controller_test.rb new file mode 100644 index 0000000..b1bb13f --- /dev/null +++ b/test/controllers/statcan_datasets_controller_test.rb @@ -0,0 +1,28 @@ +require "test_helper" + +class StatcanDatasetsControllerTest < ActionDispatch::IntegrationTest + test "should show dataset" do + dataset = StatcanDataset.create!( + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", + name: "demographic-incomes", + sync_schedule: "0 6 * * *", + current_data: [ { "year" => 2020, "population" => 38000000 } ], + last_synced_at: Time.parse("2025-01-01 23:00:00") # Yesterday 11pm + ) + + get statcan_dataset_url(dataset) + + assert_response :success + assert_equal "application/json; charset=utf-8", response.content_type + + json_response = JSON.parse(response.body) + assert_equal dataset.id, json_response["id"] + assert_equal dataset.name, json_response["name"] + end + + test "should return 404 for non-existent dataset" do + get statcan_dataset_url(id: 99999) + + assert_response :not_found + end +end From e99133331a9a56a31a953e5547a62a22e11b332f Mon Sep 17 00:00:00 2001 From: James Long Date: Tue, 8 Jul 2025 17:53:14 +0000 Subject: [PATCH 12/19] Add dataset fixture and simplify new tests --- .../statcan_datasets_controller_test.rb | 8 +- test/fixtures/statcan_datasets.yml | 15 +++ test/jobs/statcan_cron_job_test.rb | 5 + test/jobs/statcan_sync_job_test.rb | 27 ++--- test/models/statcan_dataset_test.rb | 103 +++++++----------- 5 files changed, 69 insertions(+), 89 deletions(-) create mode 100644 test/fixtures/statcan_datasets.yml diff --git a/test/controllers/statcan_datasets_controller_test.rb b/test/controllers/statcan_datasets_controller_test.rb index b1bb13f..5eff3f3 100644 --- a/test/controllers/statcan_datasets_controller_test.rb +++ b/test/controllers/statcan_datasets_controller_test.rb @@ -2,13 +2,7 @@ class StatcanDatasetsControllerTest < ActionDispatch::IntegrationTest test "should show dataset" do - dataset = StatcanDataset.create!( - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", - name: "demographic-incomes", - sync_schedule: "0 6 * * *", - current_data: [ { "year" => 2020, "population" => 38000000 } ], - last_synced_at: Time.parse("2025-01-01 23:00:00") # Yesterday 11pm - ) + dataset = statcan_datasets(:synced) get statcan_dataset_url(dataset) diff --git a/test/fixtures/statcan_datasets.yml b/test/fixtures/statcan_datasets.yml new file mode 100644 index 0000000..52c2644 --- /dev/null +++ b/test/fixtures/statcan_datasets.yml @@ -0,0 +1,15 @@ +unsynced: + statcan_url: "https://statcan.gc.ca/123.csv" + name: "test-dataset-unsynced" + sync_schedule: "0 0 * * *" + current_data: null + last_synced_at: null + +synced: + statcan_url: "https://statcan.gc.ca/456.csv" + name: "test-dataset-synced" + sync_schedule: "0 0 * * *" + current_data: + - year: 2020 + population: 38000000 + last_synced_at: "2024-01-15 10:30:00" \ No newline at end of file diff --git a/test/jobs/statcan_cron_job_test.rb b/test/jobs/statcan_cron_job_test.rb index d10f53f..1642860 100644 --- a/test/jobs/statcan_cron_job_test.rb +++ b/test/jobs/statcan_cron_job_test.rb @@ -1,6 +1,11 @@ require "test_helper" class StatcanCronJobTest < ActiveJob::TestCase + def setup + # Remove fixture data before each test + StatcanDataset.delete_all + end + test "should enqueue sync jobs for stale datasets only" do current_time = Time.parse("2025-01-02 14:00:00") # 2pm diff --git a/test/jobs/statcan_sync_job_test.rb b/test/jobs/statcan_sync_job_test.rb index 28d1f7a..3ca152f 100644 --- a/test/jobs/statcan_sync_job_test.rb +++ b/test/jobs/statcan_sync_job_test.rb @@ -1,34 +1,29 @@ require "test_helper" class StatcanSyncJobTest < ActiveJob::TestCase - def setup - @dataset = StatcanDataset.create!( - name: "test-dataset", - statcan_url: "https://statcan.gc.ca/test.csv", - sync_schedule: "0 0 * * *" - ) - end - test "should update dataset with fetched data" do - parsed_data = [ { "year" => 2020, "population" => 38000000 } ] + dataset = statcan_datasets(:unsynced) + parsed_data = [ { "population" => 1000000, "year" => 2023 } ] StatcanFetcher.stub :fetch, parsed_data do - StatcanSyncJob.perform_now(@dataset.id) + StatcanSyncJob.perform_now(dataset.id) end - @dataset.reload - assert_equal parsed_data, @dataset.current_data - assert_not_nil @dataset.last_synced_at + dataset.reload + assert_equal parsed_data, dataset.current_data + assert_not_nil dataset.last_synced_at end test "should not update dataset when fetch times out" do + dataset = statcan_datasets(:unsynced) + StatcanFetcher.stub :fetch, ->(url) { raise HTTP::TimeoutError.new("Request timed out") } do assert_raises HTTP::TimeoutError do - StatcanSyncJob.perform_now(@dataset.id) + StatcanSyncJob.perform_now(dataset.id) end end - @dataset.reload - assert_nil @dataset.current_data + dataset.reload + assert_nil dataset.current_data end end diff --git a/test/models/statcan_dataset_test.rb b/test/models/statcan_dataset_test.rb index f29ec9d..105722c 100644 --- a/test/models/statcan_dataset_test.rb +++ b/test/models/statcan_dataset_test.rb @@ -1,41 +1,40 @@ require "test_helper" class StatcanDatasetTest < ActiveSupport::TestCase + def self.valid_attributes + { + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", + name: "demographic-incomes", + sync_schedule: "0 6 * * *" + } + end + + test "valid dataset" do - dataset = StatcanDataset.new( - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", - name: "demographic-incomes", - sync_schedule: "0 6 * * *" - ) + dataset = StatcanDataset.new(self.class.valid_attributes) assert dataset.valid? end test "requires statcan_url" do - dataset = StatcanDataset.new( - name: "demographic-incomes", - sync_schedule: "0 6 * * *" - ) + attributes = self.class.valid_attributes.except(:statcan_url) + dataset = StatcanDataset.new(attributes) assert_not dataset.valid? assert_includes dataset.errors[:statcan_url], "can't be blank" end test "requires name" do - dataset = StatcanDataset.new( - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", - sync_schedule: "0 6 * * *" - ) + attributes = self.class.valid_attributes.except(:name) + dataset = StatcanDataset.new(attributes) assert_not dataset.valid? assert_includes dataset.errors[:name], "can't be blank" end test "requires sync_schedule" do - dataset = StatcanDataset.new( - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", - name: "demographic-incomes" - ) + attributes = self.class.valid_attributes.except(:sync_schedule) + dataset = StatcanDataset.new(attributes) assert_not dataset.valid? assert_includes dataset.errors[:sync_schedule], "can't be blank" @@ -80,11 +79,8 @@ class StatcanDatasetTest < ActiveSupport::TestCase end test "name must be kebab-case" do - dataset = StatcanDataset.new( - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", - name: "Invalid Name", - sync_schedule: "0 6 * * *" - ) + attributes = self.class.valid_attributes.merge(name: "InvalidName") + dataset = StatcanDataset.new(attributes) assert_not dataset.valid? assert_includes dataset.errors[:name], "must be lowercase with hyphens only" @@ -94,11 +90,8 @@ class StatcanDatasetTest < ActiveSupport::TestCase valid_schedules = [ "0 6 * * *", "30 14 1 * *", "0 * * * 0", "15 9 * * 1-5" ] valid_schedules.each do |schedule| - dataset = StatcanDataset.new( - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", - name: "test-dataset-#{schedule.hash}", - sync_schedule: schedule - ) + attributes = self.class.valid_attributes.merge(sync_schedule: schedule, name: "test-dataset-#{schedule.hash}") + dataset = StatcanDataset.new(attributes) assert dataset.valid?, "#{schedule} should be valid" end @@ -108,11 +101,8 @@ class StatcanDatasetTest < ActiveSupport::TestCase invalid_schedules = [ "invalid", "60 25 32 13 8", "not a cron" ] invalid_schedules.each do |schedule| - dataset = StatcanDataset.new( - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", - name: "test-dataset-#{schedule.hash}", - sync_schedule: schedule - ) + attributes = self.class.valid_attributes.merge(name: "test-dataset-#{schedule.hash}", sync_schedule: schedule) + dataset = StatcanDataset.new(attributes) assert_not dataset.valid?, "#{schedule} should be invalid" assert_includes dataset.errors[:sync_schedule], "must be a valid cron expression" @@ -120,54 +110,35 @@ class StatcanDatasetTest < ActiveSupport::TestCase end test "needs_sync returns true when last_synced_at is nil" do - dataset = StatcanDataset.create!( - name: "test-dataset", - statcan_url: "https://statcan.gc.ca/test.csv", - sync_schedule: "0 0 * * *" - ) - + attributes = self.class.valid_attributes.merge(last_synced_at: nil) + dataset = StatcanDataset.new(attributes) current_time = Time.parse("2025-01-02 14:00:00") + assert dataset.needs_sync?(current_time) end test "needs_sync returns true when last sync was before last scheduled time" do - dataset = StatcanDataset.create!( - name: "test-dataset-old", - statcan_url: "https://statcan.gc.ca/test-old.csv", - sync_schedule: "0 0 * * *", # Daily at midnight - last_synced_at: Time.parse("2025-01-01 23:00:00") # Yesterday 11pm - ) + attributes = self.class.valid_attributes.merge(sync_schedule: "0 0 * * *", last_synced_at: Time.parse("2025-01-01 23:00:00")) + dataset = StatcanDataset.new(attributes) + current_time = Time.parse("2025-01-02 14:00:00") # 2pm next day - current_time = Time.parse("2025-01-02 14:00:00") # 2pm next day - assert dataset.needs_sync?(current_time) + assert dataset.needs_sync?(current_time) end test "needs_sync returns false when last sync was after last scheduled time" do - dataset = StatcanDataset.create!( - name: "test-dataset-fresh", - statcan_url: "https://statcan.gc.ca/test-fresh.csv", - sync_schedule: "0 0 * * *", # Daily at midnight - last_synced_at: Time.parse("2025-01-02 01:00:00") # 1am today - ) - + attributes = self.class.valid_attributes.merge(sync_schedule: "0 0 * * *", last_synced_at: Time.parse("2025-01-02 01:00:00")) + dataset = StatcanDataset.new(attributes) current_time = Time.parse("2025-01-02 14:00:00") # 2pm same day + assert_not dataset.needs_sync?(current_time) end test "filter_stale returns datasets that need syncing" do - stale_dataset = StatcanDataset.create!( - name: "stale-one", - statcan_url: "https://statcan.gc.ca/stale1.csv", - sync_schedule: "0 0 * * *", # Daily at midnight - last_synced_at: Time.parse("2025-01-01 23:00:00") # Yesterday 11pm - ) - - fresh_dataset = StatcanDataset.create!( - name: "fresh-one", - statcan_url: "https://statcan.gc.ca/fresh.csv", - sync_schedule: "0 0 * * *", # Daily at midnight - last_synced_at: Time.parse("2025-01-02 01:00:00") # 1am today - ) + stale_attributes = self.class.valid_attributes.merge(last_synced_at: nil) + stale_dataset = StatcanDataset.new(stale_attributes) + + fresh_attributes = self.class.valid_attributes.merge(sync_schedule: "0 0 * * *", last_synced_at: Time.parse("2025-01-02 01:00:00")) + fresh_dataset = StatcanDataset.new(fresh_attributes) current_time = Time.parse("2025-01-02 14:00:00") # 2pm same day all_datasets = [ stale_dataset, fresh_dataset ] From 4210e517e5a14ffb41d31ea1f0ff28d2c5bcf680 Mon Sep 17 00:00:00 2001 From: James Long Date: Tue, 8 Jul 2025 22:53:07 +0000 Subject: [PATCH 13/19] Add Avo resource for StatCan datasets --- app/avo/resources/statcan_dataset.rb | 16 ++++++++++++++++ .../avo/statcan_datasets_controller.rb | 4 ++++ 2 files changed, 20 insertions(+) create mode 100644 app/avo/resources/statcan_dataset.rb create mode 100644 app/controllers/avo/statcan_datasets_controller.rb diff --git a/app/avo/resources/statcan_dataset.rb b/app/avo/resources/statcan_dataset.rb new file mode 100644 index 0000000..7c25498 --- /dev/null +++ b/app/avo/resources/statcan_dataset.rb @@ -0,0 +1,16 @@ +class Avo::Resources::StatcanDataset < Avo::BaseResource + # self.includes = [] + # self.attachments = [] + # self.search = { + # query: -> { query.ransack(id_eq: params[:q], m: "or").result(distinct: false) } + # } + + def fields + field :id, as: :id + field :statcan_url, as: :text + field :name, as: :text + field :sync_schedule, as: :text + field :current_data, as: :code + field :last_synced_at, as: :date_time + end +end diff --git a/app/controllers/avo/statcan_datasets_controller.rb b/app/controllers/avo/statcan_datasets_controller.rb new file mode 100644 index 0000000..4b0b7a2 --- /dev/null +++ b/app/controllers/avo/statcan_datasets_controller.rb @@ -0,0 +1,4 @@ +# This controller has been generated to enable Rails' resource routes. +# More information on https://docs.avohq.io/3.0/controllers.html +class Avo::StatcanDatasetsController < Avo::ResourcesController +end From 32e6671741e1e2eed6d511a2306ba2813fe316bb Mon Sep 17 00:00:00 2001 From: James Long Date: Wed, 9 Jul 2025 16:38:33 +0000 Subject: [PATCH 14/19] Review feedback: Add StatcanDatasets seed data --- db/seeds/canada.rb | 2 ++ db/seeds/statcan_datasets.rb | 61 +++++++++++++++++++++++++++++++++ lib/tasks/statcan.rake | 66 ------------------------------------ 3 files changed, 63 insertions(+), 66 deletions(-) create mode 100644 db/seeds/statcan_datasets.rb delete mode 100644 lib/tasks/statcan.rake diff --git a/db/seeds/canada.rb b/db/seeds/canada.rb index 722e6f9..697e832 100644 --- a/db/seeds/canada.rb +++ b/db/seeds/canada.rb @@ -637,4 +637,6 @@ puts "Seeding Evidences..." +require_relative 'statcan_datasets' + puts "Done seeding" diff --git a/db/seeds/statcan_datasets.rb b/db/seeds/statcan_datasets.rb new file mode 100644 index 0000000..f4d1f57 --- /dev/null +++ b/db/seeds/statcan_datasets.rb @@ -0,0 +1,61 @@ +puts "Seeding StatcanDatasets..." + +statcan_datasets = [ + { + name: "balance-sheets", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=1010001501&latestN=0&startDate=19901001&endDate=&csvLocale=en&selectedMembers=%5B%5B1%5D%2C%5B2%5D%2C%5B%5D%5D&checkedLevels=2D1%2C2D2%2C2D3", + sync_schedule: "23 6 * * *" # Daily at 6:23 AM + }, + { + name: "demographic-incomes-non-permanent-residents", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=1110009101&latestN=2&startDate=&endDate=&csvLocale=en&selectedMembers=%5B%5B1%5D%2C%5B%5D%2C%5B%5D%2C%5B%5D%2C%5B5%5D%5D&checkedLevels=1D1%2C1D2%2C1D3%2C2D1%2C3D1", + sync_schedule: "23 8 * * *" # Daily at 8:23 AM + }, + { + name: "gdp", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=3610010401&latestN=0&startDate=19610101&endDate=&csvLocale=en&selectedMembers=%5B%5B1%5D%2C%5B1%5D%2C%5B1%5D%2C%5B%5D%5D&checkedLevels=3D1%2C3D2%2C3D3%2C3D4", + sync_schedule: "23 6 * * *" # Daily at 6:23 AM + }, + { + name: "housing-starts", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData.action?pid=3410015101&latestN=0&startDate=19880101&endDate=&csvLocale=en&selectedMembers=%5B%5B%5D%2C%5B1%5D%2C%5B%5D%5D&checkedLevels=0D1%2C2D1%2C2D2", + sync_schedule: "23 6 * * *" # Daily at 6:23 AM + }, + { + name: "labour-productivity", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=3610020701&latestN=0&startDate=19801001&endDate=&csvLocale=en&selectedMembers=%5B%5B1%5D%2C%5B5%5D%2C%5B1%2C2%2C3%2C4%2C5%2C6%2C7%2C8%2C9%2C10%2C11%2C13%2C14%2C15%2C16%2C17%2C18%2C19%2C20%2C21%5D%5D&checkedLevels=", + sync_schedule: "23 9 * * *" # Daily at 9:23 AM + }, + { + name: "non-permanent-residents", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData.action?pid=1710012101&latestN=0&startDate=20210101&endDate=&csvLocale=en&selectedMembers=%5B%5B%5D%2C%5B%5D%5D&checkedLevels=0D1%2C1D1%2C1D2%2C1D3", + sync_schedule: "23 7 * * *" # Daily at 7:23 AM + }, + { + name: "population", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=1710000901&latestN=0&startDate=19000101&endDate=&csvLocale=en&selectedMembers=%5B%5B1%2C2%2C3%2C4%2C5%2C6%2C7%2C8%2C9%2C10%2C11%2C12%2C14%2C15%5D%5D&checkedLevels=", + sync_schedule: "23 6 * * *" # Daily at 6:23 AM + }, + { + name: "primary-energy-production", + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=2510007901&latestN=5&startDate=&endDate=&csvLocale=en&selectedMembers=%5B%5B%5D%2C%5B%5D%2C%5B%5D%5D&checkedLevels=0D1%2C1D1%2C1D2%2C1D3%2C2D1", + sync_schedule: "23 10 * * *" # Daily at 10:23 AM + } +] + +statcan_datasets.each do |dataset_attrs| + dataset = StatcanDataset.find_or_create_by(name: dataset_attrs[:name]) do |d| + d.statcan_url = dataset_attrs[:statcan_url] + d.sync_schedule = dataset_attrs[:sync_schedule] + end + + if dataset.persisted? + if dataset.previously_new_record? + puts "✓ #{dataset.name} - created" + else + puts "✓ #{dataset.name} - already exists" + end + else + puts "✗ #{dataset.name} - failed to create: #{dataset.errors.full_messages.join(', ')}" + end +end diff --git a/lib/tasks/statcan.rake b/lib/tasks/statcan.rake deleted file mode 100644 index 79f8831..0000000 --- a/lib/tasks/statcan.rake +++ /dev/null @@ -1,66 +0,0 @@ -namespace :statcan do - desc "Setup Statcan datasets" - task setup_datasets: :environment do - statcan_datasets = [ - { - name: "balance-sheets", - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=1010001501&latestN=0&startDate=19901001&endDate=&csvLocale=en&selectedMembers=%5B%5B1%5D%2C%5B2%5D%2C%5B%5D%5D&checkedLevels=2D1%2C2D2%2C2D3", - sync_schedule: "23 6 * * *" # Daily at 6:23 AM - }, - { - name: "demographic-incomes-non-permanent-residents", - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=1110009101&latestN=2&startDate=&endDate=&csvLocale=en&selectedMembers=%5B%5B1%5D%2C%5B%5D%2C%5B%5D%2C%5B%5D%2C%5B5%5D%5D&checkedLevels=1D1%2C1D2%2C1D3%2C2D1%2C3D1", - sync_schedule: "23 8 * * *" # Daily at 8:23 AM - }, - { - name: "gdp", - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=3610010401&latestN=0&startDate=19610101&endDate=&csvLocale=en&selectedMembers=%5B%5B1%5D%2C%5B1%5D%2C%5B1%5D%2C%5B%5D%5D&checkedLevels=3D1%2C3D2%2C3D3%2C3D4", - sync_schedule: "23 6 * * *" # Daily at 6:23 AM - }, - { - name: "housing-starts", - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData.action?pid=3410015101&latestN=0&startDate=19880101&endDate=&csvLocale=en&selectedMembers=%5B%5B%5D%2C%5B1%5D%2C%5B%5D%5D&checkedLevels=0D1%2C2D1%2C2D2", - sync_schedule: "23 6 * * *" # Daily at 6:23 AM - }, - { - name: "labour-productivity", - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=3610020701&latestN=0&startDate=19801001&endDate=&csvLocale=en&selectedMembers=%5B%5B1%5D%2C%5B5%5D%2C%5B1%2C2%2C3%2C4%2C5%2C6%2C7%2C8%2C9%2C10%2C11%2C13%2C14%2C15%2C16%2C17%2C18%2C19%2C20%2C21%5D%5D&checkedLevels=", - sync_schedule: "23 9 * * *" # Daily at 9:23 AM - }, - { - name: "non-permanent-residents", - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData.action?pid=1710012101&latestN=0&startDate=20210101&endDate=&csvLocale=en&selectedMembers=%5B%5B%5D%2C%5B%5D%5D&checkedLevels=0D1%2C1D1%2C1D2%2C1D3", - sync_schedule: "23 7 * * *" # Daily at 7:23 AM - }, - { - name: "population", - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=1710000901&latestN=0&startDate=19000101&endDate=&csvLocale=en&selectedMembers=%5B%5B1%2C2%2C3%2C4%2C5%2C6%2C7%2C8%2C9%2C10%2C11%2C12%2C14%2C15%5D%5D&checkedLevels=", - sync_schedule: "23 6 * * *" # Daily at 6:23 AM - }, - { - name: "primary-energy-production", - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/dtl!downloadDbLoadingData-nonTraduit.action?pid=2510007901&latestN=5&startDate=&endDate=&csvLocale=en&selectedMembers=%5B%5B%5D%2C%5B%5D%2C%5B%5D%5D&checkedLevels=0D1%2C1D1%2C1D2%2C1D3%2C2D1", - sync_schedule: "23 10 * * *" # Daily at 10:23 AM - } - ] - - puts "Setting up StatcanDatasets..." - - statcan_datasets.each do |dataset_attrs| - dataset = StatcanDataset.find_or_create_by(name: dataset_attrs[:name]) do |d| - d.statcan_url = dataset_attrs[:statcan_url] - d.sync_schedule = dataset_attrs[:sync_schedule] - end - - if dataset.persisted? - if dataset.previously_new_record? - puts "✓ #{dataset.name} - created" - else - puts "✓ #{dataset.name} - already exists" - end - else - puts "✗ #{dataset.name} - failed to create: #{dataset.errors.full_messages.join(', ')}" - end - end - end -end From 329e46ba850602ae9453f30a109eb1b92833529a Mon Sep 17 00:00:00 2001 From: James Long Date: Wed, 9 Jul 2025 16:46:29 +0000 Subject: [PATCH 15/19] Review feedback: Enqueue jobs with global ids --- app/jobs/statcan_cron_job.rb | 5 +---- app/jobs/statcan_sync_job.rb | 7 +++---- test/jobs/statcan_cron_job_test.rb | 4 ++-- test/jobs/statcan_sync_job_test.rb | 4 ++-- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/app/jobs/statcan_cron_job.rb b/app/jobs/statcan_cron_job.rb index 3cf400d..48bfdc4 100644 --- a/app/jobs/statcan_cron_job.rb +++ b/app/jobs/statcan_cron_job.rb @@ -4,10 +4,7 @@ class StatcanCronJob < ApplicationJob def perform(current_time = Time.current) datasets = StatcanDataset.select(:id, :sync_schedule, :last_synced_at) stale_datasets = StatcanDataset.filter_stale(datasets, current_time) - - stale_datasets.each do |dataset| - StatcanSyncJob.perform_later(dataset.id) - end + stale_datasets.each(&StatcanSyncJob.method(:perform_later)) Rails.logger.info "Enqueued #{stale_datasets.count} Statcan sync jobs" end diff --git a/app/jobs/statcan_sync_job.rb b/app/jobs/statcan_sync_job.rb index 8ffa2ac..7e1874d 100644 --- a/app/jobs/statcan_sync_job.rb +++ b/app/jobs/statcan_sync_job.rb @@ -1,11 +1,10 @@ class StatcanSyncJob < ApplicationJob queue_as :default - def perform(statcan_dataset_id) - dataset = StatcanDataset.find(statcan_dataset_id) - data = StatcanFetcher.fetch(dataset.statcan_url) + def perform(statcan_dataset) + data = StatcanFetcher.fetch(statcan_dataset.statcan_url) - dataset.update!( + statcan_dataset.update!( current_data: data, last_synced_at: Time.current ) diff --git a/test/jobs/statcan_cron_job_test.rb b/test/jobs/statcan_cron_job_test.rb index 1642860..23d06d6 100644 --- a/test/jobs/statcan_cron_job_test.rb +++ b/test/jobs/statcan_cron_job_test.rb @@ -39,8 +39,8 @@ def setup end # Verify the correct jobs were enqueued - assert_enqueued_with(job: StatcanSyncJob, args: [ stale_dataset1.id ]) - assert_enqueued_with(job: StatcanSyncJob, args: [ stale_dataset2.id ]) + assert_enqueued_with(job: StatcanSyncJob, args: [ stale_dataset1 ]) + assert_enqueued_with(job: StatcanSyncJob, args: [ stale_dataset2 ]) end test "should not enqueue jobs when no datasets need syncing" do diff --git a/test/jobs/statcan_sync_job_test.rb b/test/jobs/statcan_sync_job_test.rb index 3ca152f..ec2b9fb 100644 --- a/test/jobs/statcan_sync_job_test.rb +++ b/test/jobs/statcan_sync_job_test.rb @@ -6,7 +6,7 @@ class StatcanSyncJobTest < ActiveJob::TestCase parsed_data = [ { "population" => 1000000, "year" => 2023 } ] StatcanFetcher.stub :fetch, parsed_data do - StatcanSyncJob.perform_now(dataset.id) + StatcanSyncJob.perform_now(dataset) end dataset.reload @@ -19,7 +19,7 @@ class StatcanSyncJobTest < ActiveJob::TestCase StatcanFetcher.stub :fetch, ->(url) { raise HTTP::TimeoutError.new("Request timed out") } do assert_raises HTTP::TimeoutError do - StatcanSyncJob.perform_now(dataset.id) + StatcanSyncJob.perform_now(dataset) end end From 7eb00ab1e2b9370f280185bdfa89f6e30282937c Mon Sep 17 00:00:00 2001 From: James Long Date: Wed, 9 Jul 2025 16:58:15 +0000 Subject: [PATCH 16/19] Review feedback: Move sync logic from job -> model --- app/jobs/statcan_sync_job.rb | 7 +------ app/models/statcan_dataset.rb | 5 +++++ test/jobs/statcan_sync_job_test.rb | 29 ----------------------------- test/models/statcan_dataset_test.rb | 26 ++++++++++++++++++++++++++ 4 files changed, 32 insertions(+), 35 deletions(-) delete mode 100644 test/jobs/statcan_sync_job_test.rb diff --git a/app/jobs/statcan_sync_job.rb b/app/jobs/statcan_sync_job.rb index 7e1874d..1fe719b 100644 --- a/app/jobs/statcan_sync_job.rb +++ b/app/jobs/statcan_sync_job.rb @@ -2,11 +2,6 @@ class StatcanSyncJob < ApplicationJob queue_as :default def perform(statcan_dataset) - data = StatcanFetcher.fetch(statcan_dataset.statcan_url) - - statcan_dataset.update!( - current_data: data, - last_synced_at: Time.current - ) + statcan_dataset.sync! end end diff --git a/app/models/statcan_dataset.rb b/app/models/statcan_dataset.rb index 2b8e9f7..c095152 100644 --- a/app/models/statcan_dataset.rb +++ b/app/models/statcan_dataset.rb @@ -17,6 +17,11 @@ def needs_sync?(current_time = Time.current) last_synced_at.to_i < last_scheduled_time.seconds end + def sync! + data = StatcanFetcher.fetch(statcan_url) + update!(current_data: data, last_synced_at: Time.current) + end + private def valid_cron_expression diff --git a/test/jobs/statcan_sync_job_test.rb b/test/jobs/statcan_sync_job_test.rb deleted file mode 100644 index ec2b9fb..0000000 --- a/test/jobs/statcan_sync_job_test.rb +++ /dev/null @@ -1,29 +0,0 @@ -require "test_helper" - -class StatcanSyncJobTest < ActiveJob::TestCase - test "should update dataset with fetched data" do - dataset = statcan_datasets(:unsynced) - parsed_data = [ { "population" => 1000000, "year" => 2023 } ] - - StatcanFetcher.stub :fetch, parsed_data do - StatcanSyncJob.perform_now(dataset) - end - - dataset.reload - assert_equal parsed_data, dataset.current_data - assert_not_nil dataset.last_synced_at - end - - test "should not update dataset when fetch times out" do - dataset = statcan_datasets(:unsynced) - - StatcanFetcher.stub :fetch, ->(url) { raise HTTP::TimeoutError.new("Request timed out") } do - assert_raises HTTP::TimeoutError do - StatcanSyncJob.perform_now(dataset) - end - end - - dataset.reload - assert_nil dataset.current_data - end -end diff --git a/test/models/statcan_dataset_test.rb b/test/models/statcan_dataset_test.rb index 105722c..d8788ef 100644 --- a/test/models/statcan_dataset_test.rb +++ b/test/models/statcan_dataset_test.rb @@ -153,4 +153,30 @@ def self.valid_attributes stale_datasets = StatcanDataset.filter_stale([]) assert_empty stale_datasets end + + test "sync! should update dataset with fetched data" do + dataset = statcan_datasets(:unsynced) + parsed_data = [ { "population" => 1000000, "year" => 2023 } ] + + StatcanFetcher.stub :fetch, parsed_data do + dataset.sync! + end + + dataset.reload + assert_equal parsed_data, dataset.current_data + assert_not_nil dataset.last_synced_at + end + + test "sync! should not update dataset when fetch times out" do + dataset = statcan_datasets(:unsynced) + + StatcanFetcher.stub :fetch, ->(url) { raise HTTP::TimeoutError.new("Request timed out") } do + assert_raises HTTP::TimeoutError do + dataset.sync! + end + end + + dataset.reload + assert_nil dataset.current_data + end end From c97200ea7feb389b91a42f88c0503f1475b313a0 Mon Sep 17 00:00:00 2001 From: James Long Date: Wed, 9 Jul 2025 18:08:20 +0000 Subject: [PATCH 17/19] Fix test indentation (for consistency) --- test/models/statcan_dataset_test.rb | 258 ++++++++++++++-------------- 1 file changed, 129 insertions(+), 129 deletions(-) diff --git a/test/models/statcan_dataset_test.rb b/test/models/statcan_dataset_test.rb index d8788ef..f278f0a 100644 --- a/test/models/statcan_dataset_test.rb +++ b/test/models/statcan_dataset_test.rb @@ -10,173 +10,173 @@ def self.valid_attributes end - test "valid dataset" do - dataset = StatcanDataset.new(self.class.valid_attributes) + test "valid dataset" do + dataset = StatcanDataset.new(self.class.valid_attributes) - assert dataset.valid? - end - - test "requires statcan_url" do - attributes = self.class.valid_attributes.except(:statcan_url) - dataset = StatcanDataset.new(attributes) + assert dataset.valid? + end - assert_not dataset.valid? - assert_includes dataset.errors[:statcan_url], "can't be blank" - end + test "requires statcan_url" do + attributes = self.class.valid_attributes.except(:statcan_url) + dataset = StatcanDataset.new(attributes) - test "requires name" do - attributes = self.class.valid_attributes.except(:name) - dataset = StatcanDataset.new(attributes) + assert_not dataset.valid? + assert_includes dataset.errors[:statcan_url], "can't be blank" + end - assert_not dataset.valid? - assert_includes dataset.errors[:name], "can't be blank" - end + test "requires name" do + attributes = self.class.valid_attributes.except(:name) + dataset = StatcanDataset.new(attributes) - test "requires sync_schedule" do - attributes = self.class.valid_attributes.except(:sync_schedule) - dataset = StatcanDataset.new(attributes) - - assert_not dataset.valid? - assert_includes dataset.errors[:sync_schedule], "can't be blank" - end + assert_not dataset.valid? + assert_includes dataset.errors[:name], "can't be blank" + end - test "statcan_url must be unique" do - url = "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701" + test "requires sync_schedule" do + attributes = self.class.valid_attributes.except(:sync_schedule) + dataset = StatcanDataset.new(attributes) - StatcanDataset.create!( - statcan_url: url, - name: "first-dataset", - sync_schedule: "0 6 * * *" - ) + assert_not dataset.valid? + assert_includes dataset.errors[:sync_schedule], "can't be blank" + end - duplicate = StatcanDataset.new( - statcan_url: url, - name: "second-dataset", - sync_schedule: "0 12 * * *" - ) + test "statcan_url must be unique" do + url = "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701" - assert_not duplicate.valid? - assert_includes duplicate.errors[:statcan_url], "has already been taken" - end + StatcanDataset.create!( + statcan_url: url, + name: "first-dataset", + sync_schedule: "0 6 * * *" + ) - test "name must be unique" do - name = "demographic-incomes" + duplicate = StatcanDataset.new( + statcan_url: url, + name: "second-dataset", + sync_schedule: "0 12 * * *" + ) - StatcanDataset.create!( - statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", - name: name, - sync_schedule: "0 6 * * *" - ) + assert_not duplicate.valid? + assert_includes duplicate.errors[:statcan_url], "has already been taken" + end - duplicate = StatcanDataset.new( - statcan_url: "https://different-url.statcan.gc.ca/data", - name: name, - sync_schedule: "0 12 * * *" - ) + test "name must be unique" do + name = "demographic-incomes" - assert_not duplicate.valid? - assert_includes duplicate.errors[:name], "has already been taken" - end + StatcanDataset.create!( + statcan_url: "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410028701", + name: name, + sync_schedule: "0 6 * * *" + ) - test "name must be kebab-case" do - attributes = self.class.valid_attributes.merge(name: "InvalidName") - dataset = StatcanDataset.new(attributes) + duplicate = StatcanDataset.new( + statcan_url: "https://different-url.statcan.gc.ca/data", + name: name, + sync_schedule: "0 12 * * *" + ) - assert_not dataset.valid? - assert_includes dataset.errors[:name], "must be lowercase with hyphens only" - end + assert_not duplicate.valid? + assert_includes duplicate.errors[:name], "has already been taken" + end - test "sync_schedule accepts valid cron expressions" do - valid_schedules = [ "0 6 * * *", "30 14 1 * *", "0 * * * 0", "15 9 * * 1-5" ] + test "name must be kebab-case" do + attributes = self.class.valid_attributes.merge(name: "InvalidName") + dataset = StatcanDataset.new(attributes) - valid_schedules.each do |schedule| - attributes = self.class.valid_attributes.merge(sync_schedule: schedule, name: "test-dataset-#{schedule.hash}") - dataset = StatcanDataset.new(attributes) + assert_not dataset.valid? + assert_includes dataset.errors[:name], "must be lowercase with hyphens only" + end - assert dataset.valid?, "#{schedule} should be valid" - end - end + test "sync_schedule accepts valid cron expressions" do + valid_schedules = [ "0 6 * * *", "30 14 1 * *", "0 * * * 0", "15 9 * * 1-5" ] - test "sync_schedule rejects invalid cron expressions" do - invalid_schedules = [ "invalid", "60 25 32 13 8", "not a cron" ] + valid_schedules.each do |schedule| + attributes = self.class.valid_attributes.merge(sync_schedule: schedule, name: "test-dataset-#{schedule.hash}") + dataset = StatcanDataset.new(attributes) - invalid_schedules.each do |schedule| - attributes = self.class.valid_attributes.merge(name: "test-dataset-#{schedule.hash}", sync_schedule: schedule) - dataset = StatcanDataset.new(attributes) + assert dataset.valid?, "#{schedule} should be valid" + end + end - assert_not dataset.valid?, "#{schedule} should be invalid" - assert_includes dataset.errors[:sync_schedule], "must be a valid cron expression" - end - end + test "sync_schedule rejects invalid cron expressions" do + invalid_schedules = [ "invalid", "60 25 32 13 8", "not a cron" ] - test "needs_sync returns true when last_synced_at is nil" do - attributes = self.class.valid_attributes.merge(last_synced_at: nil) - dataset = StatcanDataset.new(attributes) - current_time = Time.parse("2025-01-02 14:00:00") + invalid_schedules.each do |schedule| + attributes = self.class.valid_attributes.merge(name: "test-dataset-#{schedule.hash}", sync_schedule: schedule) + dataset = StatcanDataset.new(attributes) - assert dataset.needs_sync?(current_time) - end + assert_not dataset.valid?, "#{schedule} should be invalid" + assert_includes dataset.errors[:sync_schedule], "must be a valid cron expression" + end + end - test "needs_sync returns true when last sync was before last scheduled time" do - attributes = self.class.valid_attributes.merge(sync_schedule: "0 0 * * *", last_synced_at: Time.parse("2025-01-01 23:00:00")) - dataset = StatcanDataset.new(attributes) - current_time = Time.parse("2025-01-02 14:00:00") # 2pm next day + test "needs_sync returns true when last_synced_at is nil" do + attributes = self.class.valid_attributes.merge(last_synced_at: nil) + dataset = StatcanDataset.new(attributes) + current_time = Time.parse("2025-01-02 14:00:00") - assert dataset.needs_sync?(current_time) - end + assert dataset.needs_sync?(current_time) + end - test "needs_sync returns false when last sync was after last scheduled time" do - attributes = self.class.valid_attributes.merge(sync_schedule: "0 0 * * *", last_synced_at: Time.parse("2025-01-02 01:00:00")) + test "needs_sync returns true when last sync was before last scheduled time" do + attributes = self.class.valid_attributes.merge(sync_schedule: "0 0 * * *", last_synced_at: Time.parse("2025-01-01 23:00:00")) dataset = StatcanDataset.new(attributes) - current_time = Time.parse("2025-01-02 14:00:00") # 2pm same day + current_time = Time.parse("2025-01-02 14:00:00") # 2pm next day - assert_not dataset.needs_sync?(current_time) - end + assert dataset.needs_sync?(current_time) + end - test "filter_stale returns datasets that need syncing" do - stale_attributes = self.class.valid_attributes.merge(last_synced_at: nil) - stale_dataset = StatcanDataset.new(stale_attributes) + test "needs_sync returns false when last sync was after last scheduled time" do + attributes = self.class.valid_attributes.merge(sync_schedule: "0 0 * * *", last_synced_at: Time.parse("2025-01-02 01:00:00")) + dataset = StatcanDataset.new(attributes) + current_time = Time.parse("2025-01-02 14:00:00") # 2pm same day - fresh_attributes = self.class.valid_attributes.merge(sync_schedule: "0 0 * * *", last_synced_at: Time.parse("2025-01-02 01:00:00")) - fresh_dataset = StatcanDataset.new(fresh_attributes) + assert_not dataset.needs_sync?(current_time) + end - current_time = Time.parse("2025-01-02 14:00:00") # 2pm same day - all_datasets = [ stale_dataset, fresh_dataset ] - stale_datasets = StatcanDataset.filter_stale(all_datasets, current_time) + test "filter_stale returns datasets that need syncing" do + stale_attributes = self.class.valid_attributes.merge(last_synced_at: nil) + stale_dataset = StatcanDataset.new(stale_attributes) - assert_includes stale_datasets, stale_dataset - assert_not_includes stale_datasets, fresh_dataset - assert_equal 1, stale_datasets.length -end + fresh_attributes = self.class.valid_attributes.merge(sync_schedule: "0 0 * * *", last_synced_at: Time.parse("2025-01-02 01:00:00")) + fresh_dataset = StatcanDataset.new(fresh_attributes) - test "filter_stale works with empty collection" do - stale_datasets = StatcanDataset.filter_stale([]) - assert_empty stale_datasets + current_time = Time.parse("2025-01-02 14:00:00") # 2pm same day + all_datasets = [ stale_dataset, fresh_dataset ] + stale_datasets = StatcanDataset.filter_stale(all_datasets, current_time) + + assert_includes stale_datasets, stale_dataset + assert_not_includes stale_datasets, fresh_dataset + assert_equal 1, stale_datasets.length end - test "sync! should update dataset with fetched data" do - dataset = statcan_datasets(:unsynced) - parsed_data = [ { "population" => 1000000, "year" => 2023 } ] + test "filter_stale works with empty collection" do + stale_datasets = StatcanDataset.filter_stale([]) + assert_empty stale_datasets + end - StatcanFetcher.stub :fetch, parsed_data do - dataset.sync! - end + test "sync! should update dataset with fetched data" do + dataset = statcan_datasets(:unsynced) + parsed_data = [ { "population" => 1000000, "year" => 2023 } ] - dataset.reload - assert_equal parsed_data, dataset.current_data - assert_not_nil dataset.last_synced_at - end + StatcanFetcher.stub :fetch, parsed_data do + dataset.sync! + end - test "sync! should not update dataset when fetch times out" do - dataset = statcan_datasets(:unsynced) + dataset.reload + assert_equal parsed_data, dataset.current_data + assert_not_nil dataset.last_synced_at + end - StatcanFetcher.stub :fetch, ->(url) { raise HTTP::TimeoutError.new("Request timed out") } do - assert_raises HTTP::TimeoutError do - dataset.sync! - end - end + test "sync! should not update dataset when fetch times out" do + dataset = statcan_datasets(:unsynced) - dataset.reload - assert_nil dataset.current_data - end + StatcanFetcher.stub :fetch, ->(url) { raise HTTP::TimeoutError.new("Request timed out") } do + assert_raises HTTP::TimeoutError do + dataset.sync! + end + end + + dataset.reload + assert_nil dataset.current_data + end end From f8a8f11d8e17e9eb4560e0d9b2f4aa105a29e2f3 Mon Sep 17 00:00:00 2001 From: James Long Date: Thu, 10 Jul 2025 00:00:49 +0000 Subject: [PATCH 18/19] Fix: Don't store dataset data if not successfully fetched --- app/services/statcan_fetcher.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/app/services/statcan_fetcher.rb b/app/services/statcan_fetcher.rb index 887b3fa..6ab47e2 100644 --- a/app/services/statcan_fetcher.rb +++ b/app/services/statcan_fetcher.rb @@ -7,6 +7,10 @@ def self.fetch(url) .headers("User-Agent" => "BuildCanada/OutcomeTrackerAPI") .get(url) + unless response.status.success? + raise "HTTP Error: #{response.status} - #{response.status.reason}" + end + csv_string = response.body.to_s # Remove UTF-8 Byte Order Mark (BOM) if present From 09af215ea445b6c2f43b69dcbbf478e1c743a918 Mon Sep 17 00:00:00 2001 From: James Long Date: Thu, 10 Jul 2025 00:03:41 +0000 Subject: [PATCH 19/19] Undo whitespace changes to gemfile --- Gemfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Gemfile b/Gemfile index c217aba..f957415 100644 --- a/Gemfile +++ b/Gemfile @@ -11,7 +11,6 @@ gem "good_job", "~> 4.11" # Build JSON APIs with ease [https://github.com/rails/jbuilder] gem "jbuilder" - # Use Active Model has_secure_password [https://guides.rubyonrails.org/active_model_basics.html#securepassword] # gem "bcrypt", "~> 3.1.7" @@ -30,6 +29,8 @@ gem "rack-cors" # Search gem "ransack", "~> 4.3.0" + + group :development, :test do # See https://guides.rubyonrails.org/debugging_rails_applications.html#debugging-with-the-debug-gem gem "debug", platforms: %i[ mri windows ], require: "debug/prelude"