Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,5 @@ gem "http", "~> 5.3"
gem "iconv", "~> 1.1"

gem "structify", "~> 0.3.4"

gem "nokogiri", "~> 1.18"
1 change: 1 addition & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,7 @@ DEPENDENCIES
iconv (~> 1.1)
importmap-rails (~> 2.1)
jbuilder
nokogiri (~> 1.18)
pg (~> 1.1)
propshaft (~> 1.1)
puma (>= 5.0)
Expand Down
7 changes: 7 additions & 0 deletions app/jobs/entry_activity_extractor_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Background job that runs activity extraction for a single Entry.
# Enqueued by Entry#extract_activities! when that method is called
# without inline: true.
class EntryActivityExtractorJob < ApplicationJob
  queue_as :default

  # Re-enters the model with inline: true so the extraction work happens
  # synchronously inside this job instead of re-enqueueing forever.
  def perform(entry) = entry.extract_activities!(inline: true)
end
2 changes: 1 addition & 1 deletion app/jobs/entry_data_fetcher_job.rb
Original file line number Diff line number Diff line change
# Background job that fetches and parses remote content for a single Entry.
class EntryDataFetcherJob < ApplicationJob
  queue_as :default

  # Calls back into the model with inline: true so the fetch runs
  # synchronously here rather than enqueueing another job.
  def perform(entry)
    entry.fetch_data!(inline: true)
  end
end
62 changes: 38 additions & 24 deletions app/models/entry.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,56 +3,70 @@ class Entry < ApplicationRecord

belongs_to :feed
belongs_to :government
# Self-referential association: a sub-entry scraped out of an index page
# points back at the index entry it came from (see create_subentries!).
belongs_to :parent, class_name: "Entry", inverse_of: :children, optional: true
has_many :activity_extractors, as: :record
has_many :children, class_name: "Entry", foreign_key: "parent_id"


# Kick off a background fetch as soon as the record is created.
after_commit :fetch_data!, on: [ :create ]

validates :url, presence: true, uniqueness: true


# Downloads this entry's URL, parses it with defuddle, and stores the results.
#
# inline: false (the default) just enqueues EntryDataFetcherJob and returns;
# inline: true does the work synchronously (the job calls back with this flag).
#
# Side effects when run inline: sets raw_html, parsed_markdown, parsed_html,
# scraped_at and is_index, then saves. Index pages spawn child entries via
# create_subentries!; leaf pages kick off extract_activities!.
#
# Raises HTTP::Error on a non-2xx response; logs and re-raises anything else.
def fetch_data!(inline: false)
  return EntryDataFetcherJob.perform_later(self) unless inline

  # Fetch data from the external source.
  r = HTTP.get(url)
  # NOTE: use || here — `or` binds looser than assignment/comparison and is a
  # well-known precedence trap in conditions.
  if r.status >= 300 || r.status < 200
    Rails.logger.error("Error fetching data for entry #{id}: #{r.status}")
    raise HTTP::Error.new("Failed to fetch #{url} Status code #{r.status}")
  end

  self.raw_html = Defuddle.prepare_html(r.body.to_s)
  self.parsed_markdown, self.parsed_html = Defuddle.defuddle(raw_html)
  self.scraped_at = Time.now

  # An entry whose page contains document-relative links is treated as an
  # index of other documents rather than a document itself.
  self.is_index = document_relative_links.any?
  self.save!

  if is_index
    create_subentries! if parent.nil? # only create subentries if this is the top-level entry
  else
    extract_activities!
  end
rescue => e
  Rails.logger.error("Error fetching data for entry #{id}: #{e.message}")
  # Bare raise re-raises the current exception with its original backtrace
  # (raise e would reset it). The debug `puts parsed_markdown` is removed.
  raise
end

# Some sources (e.g. the Canada Gazette) publish an RSS feed that is just an
# index of all the real entries, so each document linked from this page gets
# its own Entry record. Handling is somewhat Canada Gazette-specific; other
# edge cases will be dealt with as they come up.
def create_subentries!
  document_relative_links.each do |_relative_href, absolute_url, link_text|
    Entry.find_or_create_by!(government: government, feed: feed, url: absolute_url) do |child|
      child.title = link_text.gsub(/\s+/, " ").strip # collapse runs of whitespace and trim
      child.published_at = published_at
      child.parent = self
    end
  end
end

# All links in raw_html that are relative to the current document, resolved
# against this entry's URL. Returns [href, absolute_url, text] triples
# (see HtmlExtractor.extract_links_with_text).
def document_relative_links
  HtmlExtractor.extract_links_with_text(raw_html, url, include: %i[document_relative])
end

# Runs activity extraction for this entry via an ActivityExtractor record.
#
# inline: false (the default) enqueues EntryActivityExtractorJob and returns;
# inline: true creates the extractor, runs it now, and stamps
# activities_extracted_at.
def extract_activities!(inline: false)
  return EntryActivityExtractorJob.perform_later(self) unless inline

  extractor = ActivityExtractor.create!(record: self)
  extractor.extract_activities!
  self.activities_extracted_at = Time.now
  save!
end

def format_for_llm
Expand Down
1 change: 1 addition & 0 deletions app/models/feed.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
class Feed < ApplicationRecord
belongs_to :government

has_many :entries, dependent: :destroy

def refresh!
# Implement the logic to refresh the feed data
Expand Down
6 changes: 6 additions & 0 deletions db/migrate/20250619173204_add_index_fields_to_entries.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Adds the fields Entry uses to model index pages and their sub-entries.
class AddIndexFieldsToEntries < ActiveRecord::Migration[8.0]
def change
# True when the entry's page is an index of other documents; nullable,
# populated at scrape time.
add_column :entries, :is_index, :boolean
# Self-referential link from a scraped sub-entry back to its parent
# index entry (entries.parent_id -> entries.id).
add_reference :entries, :parent, null: true, foreign_key: { to_table: :entries, primary_key: :id }
end
end
6 changes: 5 additions & 1 deletion db/schema.rb

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions lib/defuddle.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Thin wrapper around the external `defuddle` CLI used to extract readable
# content from raw HTML.
module Defuddle
  # Parses +html+ with defuddle and returns [markdown_content, html_content].
  #
  # Shells out twice over the same temp file: once with -m for markdown, once
  # without for HTML; both runs emit JSON with a "content" key.
  # Raises RuntimeError when a defuddle invocation exits non-zero.
  def self.defuddle(html)
    temp_file = Tempfile.new("entry_html", encoding: "utf-8")
    temp_file.write(html)
    # Flush before handing the path to a subprocess — otherwise buffered
    # bytes may not be on disk yet and defuddle sees a truncated file.
    temp_file.flush

    md_out, md_err, md_status = Open3.capture3("defuddle", "parse", temp_file.path, "-m", "-j")
    html_out, html_err, html_status = Open3.capture3("defuddle", "parse", temp_file.path, "-j")

    # Previously the exit status was captured but never checked, which let
    # failures surface later as confusing JSON parse errors.
    raise "defuddle markdown parse failed: #{md_err}" unless md_status.success?
    raise "defuddle html parse failed: #{html_err}" unless html_status.success?

    # defuddle prints noise before the JSON payload, so drop everything
    # ahead of the first "{".
    md_json = "{" + md_out.split("{", 2).last
    html_json = "{" + html_out.split("{", 2).last

    return JSON.parse(md_json)["content"], JSON.parse(html_json)["content"]
  ensure
    # Nil-safe: Tempfile.new itself may have raised before assignment.
    temp_file&.close
    temp_file&.unlink
  end

  # Re-encodes +html+ as UTF-8, dropping invalid byte sequences.
  # The trailing-space append/strip forces Iconv's //IGNORE conversion to
  # flush its final state before we slice the padding back off.
  def self.prepare_html(html)
    ic = Iconv.new("UTF-8//IGNORE", "UTF-8")

    ic.iconv(html + " ")[0..-2]
  end
end
39 changes: 39 additions & 0 deletions lib/html_extractor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
require "nokogiri"
require "uri"


# Utilities for pulling links out of scraped HTML.
module HtmlExtractor
  # Extracts every <a href> from +html+ and returns an array of
  # [href, absolute_url, text] triples, where absolute_url is href resolved
  # against +source_url+.
  #
  # include: restricts which link categories are returned:
  #   :absolute          "http(s)://..." and protocol-relative "//host/..."
  #   :root_relative     "/path"
  #   :hashes            "#fragment"
  #   :document_relative anything else ("page.html", "sub/dir/")
  #
  # Links with a non-http scheme (mailto:, javascript:, tel:, ...) and hrefs
  # that cannot be parsed as URIs are dropped.
  def self.extract_links_with_text(html, source_url, include: [ :hashes, :absolute, :document_relative, :root_relative ])
    wanted = include.to_set
    doc = Nokogiri::HTML(html)

    doc.css("a[href]").filter_map do |a_tag|
      href = a_tag["href"]
      next if href.nil? || href.empty?
      # Skip non-http schemes entirely — they are not document links.
      next if href.match?(/\A[a-z][a-z0-9+.\-]*:/i) && !href.match?(%r{\Ahttps?://}i)

      type =
        case href
        when %r{\Ahttps?://}i, %r{\A//} then :absolute # incl. protocol-relative
        when %r{\A/} then :root_relative
        when /\A#/ then :hashes
        else :document_relative
        end

      next unless wanted.include?(type)

      begin
        # URI.join performs RFC 3986 reference resolution against the page
        # URL itself, so "doc.html" next to "http://h/a/b.html" resolves to
        # "http://h/a/doc.html". This replaces the previous hand-rolled
        # base-directory computation, which crashed on base URLs with an
        # empty path and produced "//" doubled slashes for top-level pages.
        [ href, URI.join(source_url, href).to_s, a_tag.text.strip ]
      rescue URI::InvalidURIError
        nil # unparseable href — drop it
      end
    end
  end
end