70 changes: 39 additions & 31 deletions dspace2hydra.rb
@@ -9,10 +9,12 @@
 require_relative 'lib/bag'
 require_relative 'lib/hydra_endpoint'
 require_relative 'lib/work'
+require_relative 'lib/extract'
 require_relative 'mapping/mapping'
 
 include Loggable
 include Timeable
+include Extract
 
 started_at = DateTime.now
 
@@ -37,6 +39,7 @@
 opts.on('-j', '--cached_json PATH', 'Post the json file directly to the server.') { |v| options['cached_json'] = v }
 opts.on('-p', '--parent ID', 'Use this ID as the parent work.') { |v| options['parent_id'] = v }
 opts.on('-s', '--skip-children INDEXES', 'Comma-delimited list of children indexes to skip. (0,1,3)') { |v| options['skip_children'] = v }
+opts.on('-e', '--extract_csv PATH', 'Extract metadata from all bags into a CSV file in the given folder.') { |v| options['extract_csv'] = v }
 opts.on('-f', '--extract_files PATH', 'Extract content files to a folder.') { |v| options['extract_files'] = v }
 opts.on('-h', '--help', 'Display this screen') do
   puts opts
@@ -120,41 +123,46 @@ def item_log_path(bag, started_at)

 @logger.info('DSpace2Hydra started processing bags.')
 
-# Process all of the bags individually
-bags.each do |bag|
-  item_id = "ITEM@#{bag.item.item_id}"
-
-  if CONFIG['extract_files']
-    bag.files_for_upload.each do |f|
-      @logger.info(f.metadata_full_path)
-      begin
-        FileUtils.copy f.full_path, File.join(CONFIG['extract_files'], "#{bag.item.item_id}_#{f.name}")
-      rescue StandardError => e
-        @logger.fatal("#{e.message}")
-      end
-    end
-  else
-    begin
-      bag_start = DateTime.now
-      start_logging_to(item_log_path(bag, started_at), item_id: item_id)
-      @logger.info('Started')
-      server = HydraEndpoint::Server.new(CONFIG['hydra_endpoint'], type_config, started_at)
-
-      # We've decided that if a work has 2+ files, then it should be a Parent work with each file being a
-      # child.
-      if bag.files_for_upload.count > 1
-        work = Work::MigrationStrategy::ParentWithChildren.new(bag, server, CONFIG, type_config)
-      else
-        work = Work::MigrationStrategy::SingleWork.new(bag, server, CONFIG, type_config)
-      end
-      work.process_bag
-
-      @logger.info("Finished in #{time_since(bag_start)}")
-    rescue StandardError => e
-      @logger.fatal("#{e.message} : #{e.backtrace.join("\n\t")}")
-    ensure
-      stop_logging_to(item_log_path(bag, started_at), item_id: item_id)
-    end
-  end
-end
+if CONFIG['extract_csv']
+  records = extract_records_metadata(bags)
+  build_csv(records)
+else
+  # Process all of the bags individually
+  bags.each do |bag|
+    item_id = "ITEM@#{bag.item.item_id}"
+
+    if CONFIG['extract_files']
+      bag.files_for_upload.each do |f|
+        @logger.info(f.metadata_full_path)
+        begin
+          FileUtils.copy f.full_path, File.join(CONFIG['extract_files'], "#{bag.item.item_id}_#{f.name}")
+        rescue StandardError => e
+          @logger.fatal("#{e.message}")
+        end
+      end
+    else
+      begin
+        bag_start = DateTime.now
+        start_logging_to(item_log_path(bag, started_at), item_id: item_id)
+        @logger.info('Started')
+        server = HydraEndpoint::Server.new(CONFIG['hydra_endpoint'], type_config, started_at)
+
+        # We've decided that if a work has 2+ files, then it should be a Parent work with each file being a
+        # child.
+        if bag.files_for_upload.count > 1
+          work = Work::MigrationStrategy::ParentWithChildren.new(bag, server, CONFIG, type_config)
+        else
+          work = Work::MigrationStrategy::SingleWork.new(bag, server, CONFIG, type_config)
+        end
+        work.process_bag
+
+        @logger.info("Finished in #{time_since(bag_start)}")
+      rescue StandardError => e
+        @logger.fatal("#{e.message} : #{e.backtrace.join("\n\t")}")
+      ensure
+        stop_logging_to(item_log_path(bag, started_at), item_id: item_id)
+      end
+    end
+  end
+end
 @logger.info("DSpace2Hydra finished processing #{bags.count} bags in #{time_since(started_at)}")
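Taken together, the change gives the script three mutually exclusive modes: extract all metadata to a CSV, extract content files, or migrate bags to the Hydra endpoint. A minimal sketch of that control flow, with `CONFIG` and `bags` stubbed as hypothetical stand-ins for the parsed options and loaded bags:

```ruby
# Sketch only: CONFIG and bags here are invented stand-ins, not the script's real objects.
CONFIG = { 'extract_csv' => nil, 'extract_files' => './files_out' }
bags = %w[bag_0001 bag_0002]

if CONFIG['extract_csv']
  # One CSV for the whole run: extract_records_metadata(bags), then build_csv
  puts "extract metadata from all #{bags.count} bags into #{CONFIG['extract_csv']}/output.csv"
else
  bags.each do |bag|
    if CONFIG['extract_files']
      puts "copy #{bag}'s content files into #{CONFIG['extract_files']}"
    else
      puts "migrate #{bag} to the Hydra endpoint"
    end
  end
end
```

Note that `--extract_csv` short-circuits everything else: when it is set, no bags are migrated and `--extract_files` is never consulted.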
6 changes: 6 additions & 0 deletions lib/extract.rb
@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+require 'csv'
+require 'nokogiri'
+require_relative 'loggable'
+require_relative 'extract/metadata'
+require_relative 'extract/csv'
39 changes: 39 additions & 0 deletions lib/extract/csv.rb
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+module Extract
+  include Loggable
+
+  attr_reader :records
+
+  # Using the extracted records' metadata, find the distinct keys, set them as the
+  # headers, then iterate over each record hash and write it out as a row
+  def build_csv(records)
+    # Find the distinct keys across all metadata records
+    distinct_keys = records.map(&:keys).flatten.uniq
+    output_path = File.join(CONFIG['extract_csv'], 'output.csv')
+
+    @logger.info('Building CSV file.')
+    CSV.open(output_path, 'wb', headers: true) do |csv|
+      csv << distinct_keys
+
+      # Iterate over each record, sorted by item id
+      records.sort_by { |r| r[:id] }.each do |rec|
+        row = []
+
+        # For each field, look for values; if there are none, still output a blank cell.
+        # Multiple values in the same field are separated with '; '.
+        distinct_keys.each do |key|
+          row_values = if rec[key].is_a?(Array)
+                         rec[key].join('; ')
+                       else
+                         rec[key].to_s
+                       end
+
+          row << row_values
+        end
+
+        csv << row
+      end
+    end
+    @logger.info("CSV file complete: #{output_path}")
+  end
+end
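To make the row-building concrete, here is a self-contained sketch of the CSV this produces, with hand-made record hashes standing in for `extract_records_metadata` output (the ids, filenames, and fields are invented):

```ruby
require 'csv'

# Invented records mimicking the extractor's output shape
records = [
  { id: ['1957/1001'], filename: ['1957/1001_thesis.pdf'],
    title: ['First Title'], contributor_advisor: ['Advisor One', 'Advisor Two'] },
  { id: ['1957/1002'], filename: ['1957/1002_article.pdf'], title: ['Second Title'] }
]

# Headers are the union of keys across records; multi-valued fields are
# joined with '; ', and fields a record lacks come out blank.
distinct_keys = records.map(&:keys).flatten.uniq
csv = CSV.generate(headers: true) do |out|
  out << distinct_keys
  records.sort_by { |r| r[:id] }.each do |rec|
    out << distinct_keys.map { |key| Array(rec[key]).join('; ') }
  end
end
puts csv
# id,filename,title,contributor_advisor
# 1957/1001,1957/1001_thesis.pdf,First Title,Advisor One; Advisor Two
# 1957/1002,1957/1002_article.pdf,Second Title,
```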
50 changes: 50 additions & 0 deletions lib/extract/metadata.rb
@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+module Extract
+  include Loggable
+
+  attr_reader :bags
+
+  # For a group of bags, extract each item's XML metadata into a Hash, then add it to an Array of all records
+  def extract_records_metadata(bags)
+    records = []
+
+    # Build an array of item hashes from each bag's record metadata
+    bags.each do |bag|
+      xml = bag.item.metadata_xml
+      item_elements_with_qualifiers = []
+
+      # Find all elements (with any qualifiers) in the metadata
+      xml.xpath('//metadata/value').each do |element|
+        item_elements_with_qualifiers << "#{element['element']}_#{element['qualifier']}"
+      end
+
+      # Set the id and filename values directly; filename matches the 'extract_files' filename output
+      item_hash = {}
+      item_hash[:id] = [bag.item.item_id]
+      item_hash[:filename] = bag.files_for_upload.map { |f| "#{bag.item.item_id}_#{f.name}" }
+
+      # Iterate over the unique elements in the metadata to pull out all of their values
+      item_elements_with_qualifiers.uniq.each do |name|
+        values = []
+        element_name, element_qualifier = name.split('_', 2)
+
+        if element_qualifier.nil? || element_qualifier.empty?
+          # Match only unqualified values, so values that also carry a qualifier are not double-counted here
+          xml.xpath("//metadata/value[@element='#{element_name}'][not(@qualifier) or @qualifier='']").each do |e|
+            values << e.text
+          end
+        else
+          xml.xpath("//metadata/value[@element='#{element_name}'][@qualifier='#{element_qualifier}']").each do |e|
+            values << e.text
+          end
+        end
+
+        item_hash[name.chomp('_').to_sym] = values
+      end
+
+      records << item_hash
+    end
+
+    records
+  end
+end
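And as a quick check on the XPath assumptions, a self-contained sketch of the element/qualifier grouping against an invented fragment shaped like the `//metadata/value` nodes above (requires the nokogiri gem; in the script the real document comes from `bag.item.metadata_xml`):

```ruby
require 'nokogiri'

# Invented fragment in the shape the extractor's XPath expects
xml = Nokogiri::XML(<<~XML)
  <metadata>
    <value element="title" qualifier="">First Title</value>
    <value element="contributor" qualifier="advisor">Advisor One</value>
    <value element="contributor" qualifier="advisor">Advisor Two</value>
  </metadata>
XML

# Group each value's text under its element_qualifier key, as the extractor does
record = Hash.new { |h, k| h[k] = [] }
xml.xpath('//metadata/value').each do |v|
  key = "#{v['element']}_#{v['qualifier']}".chomp('_').to_sym
  record[key] << v.text
end

p record
# {:title=>["First Title"], :contributor_advisor=>["Advisor One", "Advisor Two"]}
```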