From d0d1d42f2e58474b3268dd8d41abe35991bd0756 Mon Sep 17 00:00:00 2001 From: xrendan Date: Thu, 19 Jun 2025 12:47:05 -0600 Subject: [PATCH] Refactor html handling, extract activities in a job --- Gemfile | 2 + Gemfile.lock | 1 + app/jobs/entry_activity_extractor_job.rb | 7 +++ app/jobs/entry_data_fetcher_job.rb | 2 +- app/models/entry.rb | 62 ++++++++++++------- app/models/feed.rb | 1 + ...50619173204_add_index_fields_to_entries.rb | 6 ++ db/schema.rb | 6 +- lib/defuddle.rb | 24 +++++++ lib/html_extractor.rb | 39 ++++++++++++ 10 files changed, 124 insertions(+), 26 deletions(-) create mode 100644 app/jobs/entry_activity_extractor_job.rb create mode 100644 db/migrate/20250619173204_add_index_fields_to_entries.rb create mode 100644 lib/defuddle.rb create mode 100644 lib/html_extractor.rb diff --git a/Gemfile b/Gemfile index d0e6388..e4abf42 100644 --- a/Gemfile +++ b/Gemfile @@ -58,3 +58,5 @@ gem "http", "~> 5.3" gem "iconv", "~> 1.1" gem "structify", "~> 0.3.4" + +gem "nokogiri", "~> 1.18" diff --git a/Gemfile.lock b/Gemfile.lock index 1801729..7e60a75 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -416,6 +416,7 @@ DEPENDENCIES iconv (~> 1.1) importmap-rails (~> 2.1) jbuilder + nokogiri (~> 1.18) pg (~> 1.1) propshaft (~> 1.1) puma (>= 5.0) diff --git a/app/jobs/entry_activity_extractor_job.rb b/app/jobs/entry_activity_extractor_job.rb new file mode 100644 index 0000000..7c38518 --- /dev/null +++ b/app/jobs/entry_activity_extractor_job.rb @@ -0,0 +1,7 @@ +class EntryActivityExtractorJob < ApplicationJob + queue_as :default + + def perform(entry) + entry.extract_activities!(inline: true) + end +end diff --git a/app/jobs/entry_data_fetcher_job.rb b/app/jobs/entry_data_fetcher_job.rb index 41582da..7d3eb61 100644 --- a/app/jobs/entry_data_fetcher_job.rb +++ b/app/jobs/entry_data_fetcher_job.rb @@ -2,7 +2,7 @@ class EntryDataFetcherJob < ApplicationJob queue_as :default def perform(entry) - entry.fetch_data!(in_background: false) + entry.fetch_data!(inline: true) # Do something later end end diff --git a/app/models/entry.rb b/app/models/entry.rb index 84dbe15..a41d6df 100644 --- a/app/models/entry.rb +++ b/app/models/entry.rb @@ -3,56 +3,70 @@ class Entry < ApplicationRecord belongs_to :feed belongs_to :government + belongs_to :parent, class_name: "Entry", inverse_of: :children, optional: true has_many :activity_extractors, as: :record + has_many :children, class_name: "Entry", foreign_key: "parent_id" + after_commit :fetch_data!, on: [ :create ] validates :url, presence: true, uniqueness: true - def fetch_data!(in_background: true) - if in_background + def fetch_data!(inline: false) + unless inline return EntryDataFetcherJob.perform_later(self) end # Fetch data from external source r = HTTP.get(url) - html = r.body.to_s - - ic = Iconv.new("UTF-8//IGNORE", "UTF-8") - self.raw_html = ic.iconv(html + " ")[0..-2] - - # Call defuddle to parse the HTML - temp_file = Tempfile.new("entry_html", encoding: "utf-8") - temp_file.write(raw_html) - - md_json, err, status = Open3.capture3("defuddle", "parse", temp_file.path, "-m", "-j") - md_html, err, status = Open3.capture3("defuddle", "parse", temp_file.path, "-j") + if r.status >= 300 or r.status < 200 + Rails.logger.error("Error fetching data for entry #{id}: #{r.status}") + raise HTTP::Error.new("Failed to fetch #{url} Status code #{r.status}") + end + self.raw_html = Defuddle.prepare_html(r.body.to_s) - # replace anything before the first { deffudle returns errors and is dumb here. - md_json = "{" + md_json.split("{", 2).last - html_json = "{" + md_html.split("{", 2).last + self.parsed_markdown, self.parsed_html = Defuddle.defuddle(raw_html) + self.scraped_at = Time.now - self.parsed_markdown = JSON.parse(md_json)["content"] - self.parsed_html = JSON.parse(html_json)["content"] + self.is_index = document_relative_links.any? + self.save! - temp_file.close - self.save - create_subentries! + if is_index + create_subentries! if parent.nil? # only create subentries if this is the top-level entry + else + extract_activities! + end rescue => e - puts parsed_markdown + Rails.logger.error("Error fetching data for entry #{id}: #{e.message}") raise e end def create_subentries! - # Some data sources (like the canada gazette have an RSS feed that is just an index of all the entries, so we need to fetch the actual entries from the feed) + # get any document relative links from the html, this is somewhat specific to Canada Gazette, but we'll handle + # other edge cases when they come up. + document_relative_links.each do |relative_link, link, text| + Entry.find_or_create_by!(government: government, feed: feed, url: link) do |rec| + rec.title = text.gsub(/\s+/, " ").strip # remove any extra whitespace and trim + rec.published_at = published_at + rec.parent = self + end + end end - def extract_activities! + def document_relative_links + HtmlExtractor.extract_links_with_text(raw_html, url, include: [ :document_relative ]) + end + + def extract_activities!(inline: false) + unless inline + return EntryActivityExtractorJob.perform_later(self) + end extractor = ActivityExtractor.create!(record: self) extractor.extract_activities! self.activities_extracted_at = Time.now + self.save! end def format_for_llm diff --git a/app/models/feed.rb b/app/models/feed.rb index 93ba723..2dcef30 100644 --- a/app/models/feed.rb +++ b/app/models/feed.rb @@ -1,6 +1,7 @@ class Feed < ApplicationRecord belongs_to :government + has_many :entries, dependent: :destroy def refresh! # Implement the logic to refresh the feed data diff --git a/db/migrate/20250619173204_add_index_fields_to_entries.rb b/db/migrate/20250619173204_add_index_fields_to_entries.rb new file mode 100644 index 0000000..92b9f13 --- /dev/null +++ b/db/migrate/20250619173204_add_index_fields_to_entries.rb @@ -0,0 +1,6 @@ +class AddIndexFieldsToEntries < ActiveRecord::Migration[8.0] + def change + add_column :entries, :is_index, :boolean + add_reference :entries, :parent, null: true, foreign_key: { to_table: :entries, primary_key: :id } + end +end diff --git a/db/schema.rb b/db/schema.rb index b6b62cf..63c87c9 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[8.0].define(version: 2025_06_19_005535) do +ActiveRecord::Schema[8.0].define(version: 2025_06_19_173204) do # These are extensions that must be enabled in order to support this database enable_extension "pg_catalog.plpgsql" @@ -94,8 +94,11 @@ t.datetime "created_at", null: false t.datetime "updated_at", null: false t.datetime "activities_extracted_at", precision: nil + t.boolean "is_index" + t.bigint "parent_id" t.index ["feed_id"], name: "index_entries_on_feed_id" t.index ["government_id"], name: "index_entries_on_government_id" + t.index ["parent_id"], name: "index_entries_on_parent_id" end create_table "evidences", force: :cascade do |t| @@ -364,6 +367,7 @@ add_foreign_key "activities", "entries" add_foreign_key "activities", "governments" add_foreign_key "departments", "governments" + add_foreign_key "entries", "entries", column: "parent_id" add_foreign_key "entries", "feeds" add_foreign_key "entries", "governments" add_foreign_key "evidences", "activities" diff --git a/lib/defuddle.rb b/lib/defuddle.rb new file mode 100644 index 0000000..d70fb93 --- /dev/null +++ b/lib/defuddle.rb @@ -0,0 +1,24 @@ +module Defuddle + def self.defuddle(html) + temp_file = Tempfile.new("entry_html", encoding: "utf-8") + temp_file.write(html) + + md_json, err, status = Open3.capture3("defuddle", "parse", temp_file.path, "-m", "-j") + md_html, err, status = Open3.capture3("defuddle", "parse", temp_file.path, "-j") + + # replace anything before the first { deffudle returns errors and is dumb here. + md_json = "{" + md_json.split("{", 2).last + html_json = "{" + md_html.split("{", 2).last + + return JSON.parse(md_json)["content"], JSON.parse(html_json)["content"] + ensure + temp_file.close + temp_file.unlink + end + + def self.prepare_html(html) + ic = Iconv.new("UTF-8//IGNORE", "UTF-8") + + ic.iconv(html + " ")[0..-2] + end +end diff --git a/lib/html_extractor.rb b/lib/html_extractor.rb new file mode 100644 index 0000000..19dae65 --- /dev/null +++ b/lib/html_extractor.rb @@ -0,0 +1,39 @@ +require "nokogiri" +require "uri" + + +module HtmlExtractor + def self.extract_links_with_text(html, source_url, include: [ :hashes, :absolute, :document_relative, :root_relative ]) + include_set = include.to_set + + doc = Nokogiri::HTML(html) + base_uri = URI.parse(source_url) + base_dir = base_uri.path.end_with?("/") ? base_uri : base_uri.dup.tap { |u| u.path = File.dirname(base_uri.path) + "/" } + + doc.css("a[href]").map do |a_tag| + href = a_tag["href"] + next if href.nil? || href.empty? + + type = + case href + when /\Ahttps?:\/\// + :absolute + when /\A\/(?!\/)/ + :root_relative + when /\A#/ + :hashes + else + :document_relative + end + + next unless include_set.include?(type) + + begin + absolute_url = URI.join(base_dir.to_s, href).to_s + [ href, absolute_url, a_tag.text.strip ] + rescue URI::InvalidURIError + nil + end + end.compact + end +end