From 8dcfb5e706e49d2f9cd44a5f33578c5cc2a4f1c4 Mon Sep 17 00:00:00 2001
From: Ryan Wick
Date: Wed, 12 Jul 2017 12:45:10 -0700
Subject: [PATCH] Added 'extract_csv' option

This option complements 'extract_files' and extracts all of the metadata
from each bag's XML into a combined CSV. Filenames are added directly so
that they match the ones used by 'extract_files'.
---
 dspace2hydra.rb         | 71 +++++++++++++++++++++++------------------
 lib/extract.rb          |  5 +++
 lib/extract/csv.rb      | 39 ++++++++++++++++++++++
 lib/extract/metadata.rb | 50 +++++++++++++++++++++++++++++
 4 files changed, 134 insertions(+), 31 deletions(-)
 create mode 100644 lib/extract.rb
 create mode 100644 lib/extract/csv.rb
 create mode 100644 lib/extract/metadata.rb

diff --git a/dspace2hydra.rb b/dspace2hydra.rb
index b37fdec..9b21e64 100644
--- a/dspace2hydra.rb
+++ b/dspace2hydra.rb
@@ -9,10 +9,12 @@
 require_relative 'lib/bag'
 require_relative 'lib/hydra_endpoint'
 require_relative 'lib/work'
+require_relative 'lib/extract'
 require_relative 'mapping/mapping'
 
 include Loggable
 include Timeable
+include Extract
 
 started_at = DateTime.now
 
@@ -37,6 +39,7 @@
   opts.on('-j', '--cached_json PATH', 'Post the json file directly to the server.') { |v| options['cached_json'] = v }
   opts.on('-p', '--parent ID', 'Use this ID as the parent work.') { |v| options['parent_id'] = v }
   opts.on('-s', '--skip-children INDEXES', 'Comma-delimited list of children indexes to skip. (0,1,3)') { |v| options['skip_children'] = v }
+  opts.on('-e', '--extract_csv PATH', 'Extract metadata from bags into a CSV, to a folder.') { |v| options['extract_csv'] = v }
   opts.on('-f', '--extract_files PATH', 'Extract content files to a folder.') { |v| options['extract_files'] = v }
   opts.on('-h', '--help', 'Display this screen') do
     puts opts
@@ -120,41 +123,47 @@ def item_log_path(bag, started_at)
 @logger.info('DSpace2Hydra started processing bags.')
 
-# Process all of the bags individually
-bags.each do |bag|
-  item_id = "ITEM@#{bag.item.item_id}"
-
-  if CONFIG['extract_files']
-    bag.files_for_upload.each do |f|
-      @logger.info(f.metadata_full_path)
+if CONFIG['extract_csv']
+  records = Array.new(0)
+  records = extract_records_metadata(bags)
+  build_csv(records)
+else
+  # Process all of the bags individually
+  bags.each do |bag|
+    item_id = "ITEM@#{bag.item.item_id}"
+
+    if CONFIG['extract_files']
+      bag.files_for_upload.each do |f|
+        @logger.info(f.metadata_full_path)
+        begin
+          FileUtils.copy f.full_path, File.join(CONFIG['extract_files'], "#{bag.item.item_id}_#{f.name}")
+        rescue StandardError => e
+          @logger.fatal("#{e.message}")
+        end
+      end
+    else
       begin
-        FileUtils.copy f.full_path, File.join(CONFIG['extract_files'], "#{bag.item.item_id}_#{f.name}")
+        bag_start = DateTime.now
+        start_logging_to(item_log_path(bag, started_at), item_id: item_id)
+        @logger.info('Started')
+        server = HydraEndpoint::Server.new(CONFIG['hydra_endpoint'], type_config, started_at)
+
+        # We've decided that if a work has 2+ files, then it should be a Parent work with each file being a
+        # child.
+        if bag.files_for_upload.count > 1
+          work = Work::MigrationStrategy::ParentWithChildren.new(bag, server, CONFIG, type_config)
+        else
+          work = Work::MigrationStrategy::SingleWork.new(bag, server, CONFIG, type_config)
+        end
+        work.process_bag
+
+        @logger.info("Finished in #{time_since(bag_start)}")
       rescue StandardError => e
-        @logger.fatal("#{e.message}")
+        @logger.fatal("#{e.message} : #{e.backtrace.join("\n\t")}")
+      ensure
+        stop_logging_to(item_log_path(bag, started_at), item_id: item_id)
       end
     end
-  else
-    begin
-      bag_start = DateTime.now
-      start_logging_to(item_log_path(bag, started_at), item_id: item_id)
-      @logger.info('Started')
-      server = HydraEndpoint::Server.new(CONFIG['hydra_endpoint'], type_config, started_at)
-
-      # We've decided that if a work has 2+ files, then it should be a Parent work with each file being a
-      # child.
-      if bag.files_for_upload.count > 1
-        work = Work::MigrationStrategy::ParentWithChildren.new(bag, server, CONFIG, type_config)
-      else
-        work = Work::MigrationStrategy::SingleWork.new(bag, server, CONFIG, type_config)
-      end
-      work.process_bag
-
-      @logger.info("Finished in #{time_since(bag_start)}")
-    rescue StandardError => e
-      @logger.fatal("#{e.message} : #{e.backtrace.join("\n\t")}")
-    ensure
-      stop_logging_to(item_log_path(bag, started_at), item_id: item_id)
-    end
   end
 end
 
 @logger.info("DSpace2Hydra finished processing #{bags.count} bags in #{time_since(started_at)}")
diff --git a/lib/extract.rb b/lib/extract.rb
new file mode 100644
index 0000000..e3d924c
--- /dev/null
+++ b/lib/extract.rb
@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+require 'nokogiri'
+require_relative 'loggable'
+require_relative 'extract/metadata'
+require_relative 'extract/csv'
diff --git a/lib/extract/csv.rb b/lib/extract/csv.rb
new file mode 100644
index 0000000..1050cb9
--- /dev/null
+++ b/lib/extract/csv.rb
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+module Extract
+  include Loggable
+
+  attr_reader :records
+
+  # Using the extracted record metadata, set the distinct keys as CSV headers, then iterate over each record hash and save a row
+  def build_csv(records)
+    # Find distinct keys across all metadata records
+    distinct_keys = records.map(&:keys).flatten.uniq
+
+    @logger.info("Building CSV file.")
+    CSV.open(CONFIG['extract_csv'] + "/output.csv", "wb", {headers: true}) do |csv|
+      csv << distinct_keys
+
+      # Iterate over each record
+      records.sort_by { |r| r[:id] }.each do |rec|
+        row = []
+
+        # For each field, look for values; if there are none, still output an empty cell.
+        # For multiple values in the same field, separate them with ';'
+        distinct_keys.each do |key|
+          row_values = String.new("")
+
+          if rec[key.to_sym].kind_of?(Array)
+            row_values << rec[key.to_sym].join('; ')
+          else
+            row_values << rec[key.to_sym].to_s
+          end
+
+          row << row_values
+        end
+
+        csv << row
+      end
+    end
+    @logger.info("CSV file complete: " + CONFIG['extract_csv'] + "/output.csv")
+  end
+end
diff --git a/lib/extract/metadata.rb b/lib/extract/metadata.rb
new file mode 100644
index 0000000..5bb09d1
--- /dev/null
+++ b/lib/extract/metadata.rb
@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+module Extract
+  include Loggable
+
+  attr_reader :bags
+
+  # For a group of bags, extract each item's XML metadata into a Hash, then add it to an Array of all records
+  def extract_records_metadata(bags)
+    records = Array.new(0)
+
+    # Build an array of item hashes from record metadata
+    bags.each do |bag|
+      xml = bag.item.metadata_xml
+      item_elements_with_qualifiers = Array.new(0)
+
+      # Find all elements (with any qualifiers) in metadata
xml.xpath("//metadata/value").each do |element| + item_elements_with_qualifiers << element.xpath("@element").text + '_' + element.xpath("@qualifier").text + end + + # Set id and filename values directly, filename matches 'extract_files' filename output + item_hash = Hash.new + item_hash[:id] = [bag.item.item_id] + item_hash[:filename] = bag.files_for_upload.map { |f| "#{bag.item.item_id}_#{f.name}" } + + # Iterate over unique elements in metadata to pull out all values + item_elements_with_qualifiers.uniq.each do |name| + values = Array.new (0) + element_name = name.split('_').first + element_qualifier = (name.split('_').last == name.split('_').first) ? nil : name.split('_').last + + if element_qualifier.nil? + xml.xpath("//metadata/value[@element='#{element_name}']").each do |e| + values << e.text + end + else + xml.xpath("//metadata/value[@element='#{element_name}'][@qualifier='#{element_qualifier}']").each do |e| + values << e.text + end + end + + item_hash[name.chomp('_').to_sym] = values + end + + records.push (item_hash) + end + + return records + end +end