From 2cab1c7b09f64cae8128fa33645137fe55daf075 Mon Sep 17 00:00:00 2001 From: David Roetzel Date: Fri, 21 Jun 2024 14:51:10 +0200 Subject: [PATCH] Improve encoding detection for link cards (#30780) --- app/lib/link_details_extractor.rb | 15 ++++++++++----- .../fixtures/requests/low_confidence_latin1.txt | 17 +++++++++++++++++ spec/services/fetch_link_card_service_spec.rb | 9 +++++++++ 3 files changed, 36 insertions(+), 5 deletions(-) create mode 100644 spec/fixtures/requests/low_confidence_latin1.txt diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb index 2e49d3fb4f..dbfdd33fcc 100644 --- a/app/lib/link_details_extractor.rb +++ b/app/lib/link_details_extractor.rb @@ -269,16 +269,21 @@ class LinkDetailsExtractor end def document - @document ||= Nokogiri::HTML(@html, nil, encoding) + @document ||= detect_encoding_and_parse_document end - def encoding - @encoding ||= begin - guess = detector.detect(@html, @html_charset) - guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil + def detect_encoding_and_parse_document + [detect_encoding, nil, @html_charset, 'UTF-8'].uniq.each do |encoding| + document = Nokogiri::HTML(@html, nil, encoding) + return document if document.to_s.valid_encoding? end end + def detect_encoding + guess = detector.detect(@html, @html_charset) + guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil + end + def detector @detector ||= CharlockHolmes::EncodingDetector.new.tap do |detector| detector.strip_tags = true diff --git a/spec/fixtures/requests/low_confidence_latin1.txt b/spec/fixtures/requests/low_confidence_latin1.txt new file mode 100644 index 0000000000..39c3e23d64 --- /dev/null +++ b/spec/fixtures/requests/low_confidence_latin1.txt @@ -0,0 +1,17 @@ +HTTP/1.1 200 OK +server: nginx +date: Thu, 13 Jun 2024 14:33:13 GMT +content-type: text/html; charset=ISO-8859-1 +content-length: 158 +accept-ranges: bytes + + + + + + Tofu á l'orange + + +

Tofu á l'orange

+ + diff --git a/spec/services/fetch_link_card_service_spec.rb b/spec/services/fetch_link_card_service_spec.rb index 63ebc3b978..239f84fde9 100644 --- a/spec/services/fetch_link_card_service_spec.rb +++ b/spec/services/fetch_link_card_service_spec.rb @@ -26,6 +26,7 @@ RSpec.describe FetchLinkCardService do stub_request(:get, 'http://example.com/sjis_with_wrong_charset').to_return(request_fixture('sjis_with_wrong_charset.txt')) stub_request(:get, 'http://example.com/koi8-r').to_return(request_fixture('koi8-r.txt')) stub_request(:get, 'http://example.com/windows-1251').to_return(request_fixture('windows-1251.txt')) + stub_request(:get, 'http://example.com/low_confidence_latin1').to_return(request_fixture('low_confidence_latin1.txt')) Rails.cache.write('oembed_endpoint:example.com', oembed_cache) if oembed_cache @@ -148,6 +149,14 @@ RSpec.describe FetchLinkCardService do end end + context 'with a URL of a page in ISO-8859-1 encoding, that charlock_holmes cannot detect' do + let(:status) { Fabricate(:status, text: 'Check out http://example.com/low_confidence_latin1') } + + it 'decodes the HTML' do + expect(status.preview_card.title).to eq("Tofu á l'orange") + end + end + context 'with a Japanese path URL' do let(:status) { Fabricate(:status, text: 'テストhttp://example.com/日本語') }