diff --git a/app/models/publication.rb b/app/models/publication.rb index fdbecd4b4..708bf9824 100644 --- a/app/models/publication.rb +++ b/app/models/publication.rb @@ -31,6 +31,8 @@ class Publication < ApplicationRecord self.publication_type = pub_hash[:type] if pub_hash[:type].present? self.year = pub_hash[:year] if pub_hash[:year].present? self.wos_uid ||= web_of_science_source_record.uid if web_of_science_source_record.present? + # NOTE: PubHashValidator already validates the presence and value of newly generated provenance, but old bad data may still exist + self.provenance = pub_hash[:provenance].to_s.downcase # could be nil or CAPS in old bad data end has_one :batch_uploaded_source_record, dependent: :destroy @@ -270,7 +272,9 @@ def authoritative_doi_source? private - # @return [String] might be empty, won't be nil + # @return [String] provenance from the pub_hash, downcased since some older data is in varying cases; might be empty, won't be nil + # @note obscures the ActiveRecord field/attribute getter for provenance; once we are sure all previous records have been backfilled + # with the data:add_provenance rake task, we can get rid of this method def provenance pub_hash[:provenance].to_s.downcase end diff --git a/db/migrate/20220311182838_add_provenance_field.rb b/db/migrate/20220311182838_add_provenance_field.rb new file mode 100644 index 000000000..1cbe62292 --- /dev/null +++ b/db/migrate/20220311182838_add_provenance_field.rb @@ -0,0 +1,6 @@ +class AddProvenanceField < ActiveRecord::Migration[6.1] + def change + add_column :publications, :provenance, :string + add_index :publications, :provenance + end +end diff --git a/db/schema.rb b/db/schema.rb index f8591eef5..660d19c4e 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system.
-ActiveRecord::Schema.define(version: 2022_01_11_185021) do +ActiveRecord::Schema.define(version: 2022_03_11_182838) do create_table "author_identities", id: :integer, charset: "utf8", collation: "utf8_unicode_ci", force: :cascade do |t| t.integer "author_id", null: false @@ -140,9 +140,11 @@ t.string "issn" t.string "publication_type" t.string "wos_uid" + t.string "provenance" t.index ["issn"], name: "index_publications_on_issn" t.index ["pages"], name: "index_publications_on_pages" t.index ["pmid"], name: "index_publications_on_pmid" + t.index ["provenance"], name: "index_publications_on_provenance" t.index ["sciencewire_id"], name: "index_publications_on_sciencewire_id" t.index ["title"], name: "index_publications_on_title", length: 255 t.index ["updated_at"], name: "index_publications_on_updated_at" diff --git a/lib/tasks/data.rake b/lib/tasks/data.rake new file mode 100644 index 000000000..435a839ff --- /dev/null +++ b/lib/tasks/data.rake @@ -0,0 +1,21 @@ +# frozen_string_literal: true + +namespace :data do + desc 'Backfill provenance into AR column for all records' + # A new field was added to the publication table to allow for querying on publication provenance (already stored in pub_hash). + # This task goes through all publications whose provenance column is still NULL and sets it from the pub_hash, normalized exactly as the model does on save (to_s.downcase) + # After this task completes, we can remove the `Publication#provenance` method + # RAILS_ENV=production bundle exec rake data:add_provenance # backfills the provenance column for every publication where it is still NULL + # rubocop:disable Rails/SkipsModelValidations + task add_provenance: :environment do |_t, _args| + num_pubs = Publication.where(provenance: nil).count + puts "Started at #{Time.zone.now}" + puts "Found #{num_pubs} with missing provenance."
+ Publication.where(provenance: nil).find_each.with_index do |pub, i| + puts "#{i + 1} of #{num_pubs}" + pub.update_column('provenance', pub.pub_hash[:provenance].to_s.downcase) # skip callbacks and timestamp updates; old data may be nil or CAPS, so store '' or the lower-cased value + end + puts "Finished at #{Time.zone.now}" + end + # rubocop:enable Rails/SkipsModelValidations +end