Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,5 @@ gem "http", "~> 5.3"
gem "iconv", "~> 1.1"

gem "structify", "~> 0.3.4"

gem "nokogiri", "~> 1.18"
1 change: 1 addition & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,7 @@ DEPENDENCIES
iconv (~> 1.1)
importmap-rails (~> 2.1)
jbuilder
nokogiri (~> 1.18)
pg (~> 1.1)
propshaft (~> 1.1)
puma (>= 5.0)
Expand Down
7 changes: 7 additions & 0 deletions app/jobs/entry_activity_extractor_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Background job that runs activity extraction for a single Entry.
# Enqueued by Entry#extract_activities! when that method is called
# without inline: true.
class EntryActivityExtractorJob < ApplicationJob
  queue_as :default

  # Re-enters the model with inline: true so the extraction work happens
  # synchronously inside this job instead of re-enqueueing forever.
  def perform(entry) = entry.extract_activities!(inline: true)
end
2 changes: 1 addition & 1 deletion app/jobs/entry_data_fetcher_job.rb
Original file line number Diff line number Diff line change
# Background job that fetches and parses remote content for a single Entry.
class EntryDataFetcherJob < ApplicationJob
  queue_as :default

  # Calls back into the model with inline: true so the fetch runs
  # synchronously here rather than enqueueing another job.
  def perform(entry)
    entry.fetch_data!(inline: true)
  end
end
62 changes: 38 additions & 24 deletions app/models/entry.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,56 +3,70 @@ class Entry < ApplicationRecord

belongs_to :feed
belongs_to :government
# Self-referential association: a sub-entry scraped out of an index page
# points back at the index entry it came from (see create_subentries!).
belongs_to :parent, class_name: "Entry", inverse_of: :children, optional: true
has_many :activity_extractors, as: :record
has_many :children, class_name: "Entry", foreign_key: "parent_id"


# Kick off a background fetch as soon as the record is created.
after_commit :fetch_data!, on: [ :create ]

validates :url, presence: true, uniqueness: true


# Downloads this entry's URL, parses it with defuddle, and stores the results.
#
# inline: false (the default) just enqueues EntryDataFetcherJob and returns;
# inline: true does the work synchronously (the job calls back with this flag).
#
# Side effects when run inline: sets raw_html, parsed_markdown, parsed_html,
# scraped_at and is_index, then saves. Index pages spawn child entries via
# create_subentries!; leaf pages kick off extract_activities!.
#
# Raises HTTP::Error on a non-2xx response; logs and re-raises anything else.
def fetch_data!(inline: false)
  return EntryDataFetcherJob.perform_later(self) unless inline

  # Fetch data from the external source.
  r = HTTP.get(url)
  # NOTE: use || here — `or` binds looser than assignment/comparison and is a
  # well-known precedence trap in conditions.
  if r.status >= 300 || r.status < 200
    Rails.logger.error("Error fetching data for entry #{id}: #{r.status}")
    raise HTTP::Error.new("Failed to fetch #{url} Status code #{r.status}")
  end

  self.raw_html = Defuddle.prepare_html(r.body.to_s)
  self.parsed_markdown, self.parsed_html = Defuddle.defuddle(raw_html)
  self.scraped_at = Time.now

  # An entry whose page contains document-relative links is treated as an
  # index of other documents rather than a document itself.
  self.is_index = document_relative_links.any?
  self.save!

  if is_index
    create_subentries! if parent.nil? # only create subentries if this is the top-level entry
  else
    extract_activities!
  end
rescue => e
  Rails.logger.error("Error fetching data for entry #{id}: #{e.message}")
  # Bare raise re-raises the current exception with its original backtrace
  # (raise e would reset it). The debug `puts parsed_markdown` is removed.
  raise
end

# Some sources (e.g. the Canada Gazette) publish an RSS feed that is just an
# index of all the real entries, so each document linked from this page gets
# its own Entry record. Handling is somewhat Canada Gazette-specific; other
# edge cases will be dealt with as they come up.
def create_subentries!
  document_relative_links.each do |_relative_href, absolute_url, link_text|
    Entry.find_or_create_by!(government: government, feed: feed, url: absolute_url) do |child|
      child.title = link_text.gsub(/\s+/, " ").strip # collapse runs of whitespace and trim
      child.published_at = published_at
      child.parent = self
    end
  end
end

# All links in raw_html that are relative to the current document, resolved
# against this entry's URL. Returns [href, absolute_url, text] triples
# (see HtmlExtractor.extract_links_with_text).
def document_relative_links
  HtmlExtractor.extract_links_with_text(raw_html, url, include: %i[document_relative])
end

# Runs activity extraction for this entry via an ActivityExtractor record.
#
# inline: false (the default) enqueues EntryActivityExtractorJob and returns;
# inline: true creates the extractor, runs it now, and stamps
# activities_extracted_at.
def extract_activities!(inline: false)
  return EntryActivityExtractorJob.perform_later(self) unless inline

  extractor = ActivityExtractor.create!(record: self)
  extractor.extract_activities!
  self.activities_extracted_at = Time.now
  save!
end

def format_for_llm
Expand Down
1 change: 1 addition & 0 deletions app/models/feed.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
class Feed < ApplicationRecord
belongs_to :government

has_many :entries, dependent: :destroy

def refresh!
# Implement the logic to refresh the feed data
Expand Down
6 changes: 6 additions & 0 deletions db/migrate/20250619173204_add_index_fields_to_entries.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Adds the fields Entry uses to model index pages and their sub-entries.
class AddIndexFieldsToEntries < ActiveRecord::Migration[8.0]
def change
# True when the entry's page is an index of other documents; nullable,
# populated at scrape time.
add_column :entries, :is_index, :boolean
# Self-referential link from a scraped sub-entry back to its parent
# index entry (entries.parent_id -> entries.id).
add_reference :entries, :parent, null: true, foreign_key: { to_table: :entries, primary_key: :id }
end
end
6 changes: 5 additions & 1 deletion db/schema.rb

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions lib/defuddle.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Thin wrapper around the external `defuddle` CLI used to extract readable
# content from raw HTML.
module Defuddle
  # Parses +html+ with defuddle and returns [markdown_content, html_content].
  #
  # Shells out twice over the same temp file: once with -m for markdown, once
  # without for HTML; both runs emit JSON with a "content" key.
  # Raises RuntimeError when a defuddle invocation exits non-zero.
  def self.defuddle(html)
    temp_file = Tempfile.new("entry_html", encoding: "utf-8")
    temp_file.write(html)
    # Flush before handing the path to a subprocess — otherwise buffered
    # bytes may not be on disk yet and defuddle sees a truncated file.
    temp_file.flush

    md_out, md_err, md_status = Open3.capture3("defuddle", "parse", temp_file.path, "-m", "-j")
    html_out, html_err, html_status = Open3.capture3("defuddle", "parse", temp_file.path, "-j")

    # Previously the exit status was captured but never checked, which let
    # failures surface later as confusing JSON parse errors.
    raise "defuddle markdown parse failed: #{md_err}" unless md_status.success?
    raise "defuddle html parse failed: #{html_err}" unless html_status.success?

    # defuddle prints noise before the JSON payload, so drop everything
    # ahead of the first "{".
    md_json = "{" + md_out.split("{", 2).last
    html_json = "{" + html_out.split("{", 2).last

    return JSON.parse(md_json)["content"], JSON.parse(html_json)["content"]
  ensure
    # Nil-safe: Tempfile.new itself may have raised before assignment.
    temp_file&.close
    temp_file&.unlink
  end

  # Re-encodes +html+ as UTF-8, dropping invalid byte sequences.
  # The trailing-space append/strip forces Iconv's //IGNORE conversion to
  # flush its final state before we slice the padding back off.
  def self.prepare_html(html)
    ic = Iconv.new("UTF-8//IGNORE", "UTF-8")

    ic.iconv(html + " ")[0..-2]
  end
end
39 changes: 39 additions & 0 deletions lib/html_extractor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
require "nokogiri"
require "uri"


# Utilities for pulling links out of scraped HTML.
module HtmlExtractor
  # Extracts every <a href> from +html+ and returns an array of
  # [href, absolute_url, text] triples, where absolute_url is href resolved
  # against +source_url+.
  #
  # include: restricts which link categories are returned:
  #   :absolute          "http(s)://..." and protocol-relative "//host/..."
  #   :root_relative     "/path"
  #   :hashes            "#fragment"
  #   :document_relative anything else ("page.html", "sub/dir/")
  #
  # Links with a non-http scheme (mailto:, javascript:, tel:, ...) and hrefs
  # that cannot be parsed as URIs are dropped.
  def self.extract_links_with_text(html, source_url, include: [ :hashes, :absolute, :document_relative, :root_relative ])
    wanted = include.to_set
    doc = Nokogiri::HTML(html)

    doc.css("a[href]").filter_map do |a_tag|
      href = a_tag["href"]
      next if href.nil? || href.empty?
      # Skip non-http schemes entirely — they are not document links.
      next if href.match?(/\A[a-z][a-z0-9+.\-]*:/i) && !href.match?(%r{\Ahttps?://}i)

      type =
        case href
        when %r{\Ahttps?://}i, %r{\A//} then :absolute # incl. protocol-relative
        when %r{\A/} then :root_relative
        when /\A#/ then :hashes
        else :document_relative
        end

      next unless wanted.include?(type)

      begin
        # URI.join performs RFC 3986 reference resolution against the page
        # URL itself, so "doc.html" next to "http://h/a/b.html" resolves to
        # "http://h/a/doc.html". This replaces the previous hand-rolled
        # base-directory computation, which crashed on base URLs with an
        # empty path and produced "//" doubled slashes for top-level pages.
        [ href, URI.join(source_url, href).to_s, a_tag.text.strip ]
      rescue URI::InvalidURIError
        nil # unparseable href — drop it
      end
    end
  end
end