Skip to content

Commit

Permalink
Remove GRPC
Browse files Browse the repository at this point in the history
  • Loading branch information
WillNigel23 committed Jan 28, 2025
1 parent c24efa8 commit 2a62429
Show file tree
Hide file tree
Showing 8 changed files with 36 additions and 280 deletions.
19 changes: 10 additions & 9 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,26 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Install dependencies
run: |
sudo apt-get -qq update
sudo apt-get install -y ghostscript poppler-utils graphviz pandoc texlive-full libmagickwand-dev imagemagick
- name: Install Ruby and gems
uses: ruby/setup-ruby@v1
with:
ruby-version: 2.7.3
bundler-cache: false
- name: Install dependencies
run: |
sudo apt-get -qq update
sudo apt-get install -y ghostscript poppler-utils graphviz pandoc texlive-full
bundle install --jobs 4 --retry 3
bundler-cache: true
- name: Bundle install
run: bundle install --jobs 4 --retry 3
- name: Create db
run: bundle exec rake db:create
- name: Migrate
run: bundle exec rake db:migrate
- name: Load fixtures
run: bundle exec rake db:fixtures:load FIXTURES_PATH=spec/fixtures
# - name: Run RuboCop
# run: bundle exec rubocop
# continue-on-error: true
- name: Run RuboCop
run: bundle exec rubocop
continue-on-error: true
- name: Run tests
uses: coactions/setup-xvfb@v1
with:
Expand Down
1 change: 0 additions & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ gem 'diffy'
gem 'edtf'
gem 'edtf-humanize'
gem 'terser'
gem 'google-cloud-vision', '~> 1.4'

gem 'interactor-rails', '~> 2.0'

Expand Down
50 changes: 0 additions & 50 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -241,57 +241,14 @@ GEM
faraday-multipart (1.0.4)
multipart-post (~> 2)
faraday-net_http (3.0.2)
faraday-retry (2.2.1)
faraday (~> 2.0)
ffi (1.16.3)
flamegraph (0.9.5)
forty_facets (0.2.1)
friendly_id (5.5.1)
activerecord (>= 4.0.0)
gapic-common (0.21.1)
faraday (>= 1.9, < 3.a)
faraday-retry (>= 1.0, < 3.a)
google-protobuf (~> 3.18)
googleapis-common-protos (>= 1.4.0, < 2.a)
googleapis-common-protos-types (>= 1.11.0, < 2.a)
googleauth (~> 1.9)
grpc (~> 1.59)
globalid (1.2.1)
activesupport (>= 6.1)
google-cloud-core (1.7.1)
google-cloud-env (>= 1.0, < 3.a)
google-cloud-errors (~> 1.0)
google-cloud-env (2.1.1)
faraday (>= 1.0, < 3.a)
google-cloud-errors (1.4.0)
google-cloud-vision (1.5.1)
google-cloud-core (~> 1.6)
google-cloud-vision-v1 (>= 0.13, < 2.a)
google-cloud-vision-v1p3beta1 (>= 0.12, < 2.a)
google-cloud-vision-v1 (1.0.1)
gapic-common (>= 0.21.1, < 2.a)
google-cloud-errors (~> 1.0)
google-cloud-vision-v1p3beta1 (0.13.1)
gapic-common (>= 0.21.1, < 2.a)
google-cloud-errors (~> 1.0)
google-protobuf (3.25.5)
googleapis-common-protos (1.6.0)
google-protobuf (>= 3.18, < 5.a)
googleapis-common-protos-types (~> 1.7)
grpc (~> 1.41)
googleapis-common-protos-types (1.16.0)
google-protobuf (>= 3.18, < 5.a)
googleauth (1.11.2)
faraday (>= 1.0, < 3.a)
google-cloud-env (~> 2.1)
jwt (>= 1.4, < 3.0)
multi_json (~> 1.11)
os (>= 0.9, < 2.0)
signet (>= 0.16, < 2.a)
gravatar_image_tag (1.2.0)
grpc (1.65.2)
google-protobuf (>= 3.25, < 5.0)
googleapis-common-protos-types (~> 1.0)
gyoku (1.4.0)
builder (>= 2.1.2)
rexml (~> 3.0)
Expand Down Expand Up @@ -455,7 +412,6 @@ GEM
sanitize
open3 (0.2.1)
orm_adapter (0.5.0)
os (1.1.4)
parser (3.3.0.5)
ast (~> 2.4.1)
racc
Expand Down Expand Up @@ -611,11 +567,6 @@ GEM
shoulda-context (2.0.0)
shoulda-matchers (4.5.1)
activesupport (>= 4.2.0)
signet (0.19.0)
addressable (~> 2.8)
faraday (>= 0.17.5, < 3.a)
jwt (>= 1.5, < 3.0)
multi_json (~> 1.10)
simplecov (0.22.0)
docile (~> 1.1)
simplecov-html (~> 0.11)
Expand Down Expand Up @@ -746,7 +697,6 @@ DEPENDENCIES
flamegraph
forty_facets
friendly_id
google-cloud-vision (~> 1.4)
gravatar_image_tag
http_accept_language
httparty
Expand Down
24 changes: 0 additions & 24 deletions app/models/page.rb
Original file line number Diff line number Diff line change
Expand Up @@ -628,25 +628,6 @@ def alto_xml=(xml)
File.write(alto_path, xml)
end


# Reports whether a GCV JSON file has been stored for this page.
#
# @return [Boolean] true when a file exists at +gcv_json_path+
def has_gcv_json?
  # File.exist? is the supported spelling; the File.exists? alias is
  # deprecated and no longer available from Ruby 3.2 onward.
  File.exist?(gcv_json_path)
end

# Reads the stored GCV JSON for this page.
#
# @return [String] contents of the file at +gcv_json_path+, or an empty
#   string when +has_gcv_json?+ is false
def gcv_json
  return "" unless has_gcv_json?

  File.read(gcv_json_path)
end

# Writes GCV JSON for this page to +gcv_json_path+, creating the containing
# directory first when it does not exist yet.
#
# @param json the content to write to the file
def gcv_json=(json)
  target_dir = File.dirname(gcv_json_path)
  FileUtils.mkdir_p(target_dir) unless Dir.exist?(target_dir)
  File.write(gcv_json_path, json)
end


def image_url_for_download
if sc_canvas
self.sc_canvas.sc_resource_id
Expand All @@ -669,11 +650,6 @@ def image_url_for_download
end
end

# Location of this page's GCV JSON file: <Rails.root>/public/text/<work_id>/<id>_gcv.json.
# This needs to stay public so the OCR transformer can get at it.
def gcv_json_path
  file_name = "#{id}_gcv.json"
  File.join(Rails.root, 'public', 'text', work_id.to_s, file_name)
end

private
def ai_plaintext_path
File.join(Rails.root, 'public', 'text', self.work_id.to_s, "#{self.id}_ai_plaintext.txt")
Expand Down
6 changes: 1 addition & 5 deletions config/initializers/01fromthepage.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#IDP_SSO_TARGET_URL = 'https://capriza.github.io/samling/samling.html' #easy test for saml without a saml server
#the below isn't a reference to the cert file, but the actual cert. See https://github.com/omniauth/omniauth-saml for other options, like fingerprint.
#the initializer/devise.rb file is where this is used, and if you want to use fingerprint rather than cert, you can modify that file
IDP_CERT = ENV['IDP_CERT']
IDP_CERT = ENV['IDP_CERT']

# ReCAPTCHA Settings
RECAPTCHA_SITE_KEY = ENV['RECAPTCHA_SITE_KEY']
Expand All @@ -35,7 +35,3 @@

ENABLE_TRANSKRIBUS=true
TRANSKRIBUS_ACCESS_TOKEN=ENV['TRANSKRIBUS_ACCESS_TOKEN']

GCV_ENABLED = true
GCV_CREDENTIAL_FILE='/home/benwbrum/dev/products/fromthepage/integration/gcv/fromthepage-e2932d0557ba.json'
OCR_TRANSFORM_COMMAND='docker run --rm -i ubma/ocr-fileformat ocr-transform gcv hocr | docker run --rm -i ubma/ocr-fileformat ocr-transform hocr alto4.0'
77 changes: 0 additions & 77 deletions lib/google/cloud_vision_page_processor.rb

This file was deleted.

60 changes: 25 additions & 35 deletions lib/tasks/ai_plaintext_generator.rake
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
require 'alto_transformer'
require 'google/cloud_vision_page_processor'
require 'openai/text_normalizer'
require 'diff_tools'
namespace :fromthepage do
Expand All @@ -19,11 +18,16 @@ namespace :fromthepage do
end

work.pages.each do |page|
plaintext = plaintext_for_page(page, diff_level)
if !plaintext.blank?
# save the plaintext
page.ai_plaintext = plaintext
page.save!
# Only pages that already have ALTO XML can produce AI plaintext.
if page.has_alto?
  # Read the ALTO XML and convert it exactly once; the same result is both
  # checked for blankness and saved, so the stored text is always the text
  # that passed the check and the conversion work is not repeated.
  raw_alto = page.alto_xml
  plaintext = generate_plaintext(raw_alto, diff_level)
  unless plaintext.blank?
    # save the plaintext
    page.ai_plaintext = plaintext
    page.save!
  end
end
end
end
Expand All @@ -43,41 +47,27 @@ namespace :fromthepage do
end

work.pages.each do |page|
plaintext = plaintext_for_page(page, diff_level)
# save the plaintext without creating derivatives
if !plaintext.blank?
page.update_column(:source_text, plaintext)
# Pages without ALTO XML are left untouched.
if page.has_alto?
  # Convert the page's ALTO XML to plaintext, using the same method as when
  # we ingest XML files; any additional processing belongs in generate_plaintext.
  plaintext = generate_plaintext(page.alto_xml, diff_level)
  # Save the plaintext without creating derivatives; blank results are skipped.
  page.update_column(:source_text, plaintext) unless plaintext.blank?
end

end
end

# Picks the best available OCR source for a page and converts it to plaintext.
#
# @param page an object responding to has_gcv_json?, gcv_json, has_alto? and alto_xml
# @param diff_level passed through unchanged to postprocess_plaintext
# @return the converted plaintext, or nil when the page has neither GCV JSON nor ALTO XML
def plaintext_for_page(page, diff_level)
  plaintext =
    if page.has_gcv_json?
      # Google CloudVision generates its own plaintext that is better than its
      # ALTO XML, so GCV JSON takes precedence when both are present.
      Google::CloudVision::PageProcessor.plaintext_from_gcv_json(page.gcv_json)
    elsif page.has_alto?
      # Fall back to ALTO XML, converted with the same method used when we
      # ingest XML files.
      AltoTransformer.plaintext_from_alto_xml(page.alto_xml)
    end

  # do any additional processing here
  # NOTE(review): the return value of postprocess_plaintext is discarded, so it
  # only affects the result if it mutates plaintext in place — confirm intent.
  postprocess_plaintext(plaintext, diff_level)

  plaintext
end

def postprocess_plaintext(plaintext, diff_level)
def generate_plaintext(raw_alto, diff_level)
# convert the alto to plaintext, using the same method as when we ingest XML files
plaintext = AltoTransformer.plaintext_from_alto_xml(raw_alto)
# some pages are blank, so they will have no word characters in the plaintext
# we want to skip those pages
return nil if plaintext.blank? || !plaintext.match(/\w/m)
return nil if !plaintext.match(/\w/m)

# do any additional processing here
if diff_level != :none
Expand Down
Loading

0 comments on commit 2a62429

Please sign in to comment.