Skip to content

Instantly share code, notes, and snippets.

@eightBitter
Created February 2, 2023 15:10
Show Gist options
  • Select an option

  • Save eightBitter/5e144db251e5f094633f57a7a49ed37f to your computer and use it in GitHub Desktop.

Select an option

Save eightBitter/5e144db251e5f094633f57a7a49ed37f to your computer and use it in GitHub Desktop.
Example data reformatting using Polars (the `polars-df` Ruby gem).
require_relative './reformatter'

# Reads a properties CSV for the given entity, reformats it via
# reformat_dataset (left joining any extra dataframes on node_id),
# writes the result, and reports completion.
#
# Locals go out of scope when the method returns, which releases the
# intermediate dataframes for garbage collection (replacing the manual
# `x = nil` memory-cleaning lines of the previous version).
#
# @param entity [String] entity name used to build the input/output paths
# @param dataframes_to_join [Array<Polars::DataFrame>,Polars::DataFrame]
#   optional. additional DataFrame(s) to left join onto the reformatted data
# @return [void]
def reformat_entity(entity, dataframes_to_join = [])
  props = Polars.read_csv(
    "data/archivemanager/#{entity}_properties_escaped.csv",
    dtypes: { 'node_id' => :str }
  )
  reformatted = reformat_dataset(props, dataframes_to_join)
  reformatted.write_csv("data/archivemanager/reformatted_#{entity}_properties.csv")
  puts "AM HGARC #{entity} props reformat complete"
end

### AM HGARC object props
# Aspect and hierarchy data are left joined onto the reformatted object props.
object_aspects = Polars.read_csv('data/archivemanager/object_aspects.csv', dtypes: { 'node_id' => :str })
object_hierarchy = Polars.read_csv('data/archivemanager/object_hierarchy.csv', dtypes: { 'node_id' => :str })
reformat_entity('object', [object_aspects, object_hierarchy])
# Cleaning memory
object_aspects = nil
object_hierarchy = nil

### AM HGARC agent props
reformat_entity('agent')

### AM HGARC subject props
reformat_entity('subject')

### AM HGARC note props
# The note association table links notes to objects; its note_id column is
# renamed to node_id so the left join key matches the reformatted data.
object_notes_assoc = Polars.read_csv('data/archivemanager/object_notes_assoc.csv', dtypes: { 'object_id' => :str, 'note_id' => :str })
object_notes_assoc = object_notes_assoc.rename({ 'note_id' => 'node_id' })
reformat_entity('note', object_notes_assoc)
# Cleaning memory
object_notes_assoc = nil
require 'polars'
# Takes the rows for a given node_id and reformats them into a single hash
# mapping each property name to its value.
#
# @param node [Polars::DataFrame] a DataFrame filtered to all rows for a given node_id
# @param property_names [Array<String>] an array of property names for the whole dataset.
#   required so every column exists when inserting a new DataFrame into an existing DataFrame
# @return [Hash{String=>Array}] a hash where each key is a property name and each
#   value is a single-element array holding that property's value ("" if absent)
def reformat_props(node, property_names)
  # Find the value from the first value column that is not the literal
  # string "NULL". Assumes there is 0 or 1 values per row.
  # Column 3 (serializable_value) is deliberately skipped.
  grab_value = lambda do |row|
    [0, 1, 2, 4].each do |idx|
      return row[idx] if row[idx] != "NULL"
    end
    nil
  end
  # Start with every property mapped to [""] so the resulting DataFrame
  # always carries the full column set even when a property is absent.
  output = property_names.to_h { |name| [name, [""]] }
  target_row = {}
  # de-duplicate data that is repeated verbatim
  node.rows.uniq.each do |row|
    # row[6] is the property name; call grab_value once and skip rows
    # that carry no value.
    value = grab_value.call(row)
    target_row[row[6]] = value unless value.nil?
  end
  target_row['node_id'] = node['node_id'].unique.to_a.first
  # wrap each value in an array, which is expected when inserting a new
  # Polars::DataFrame later; transform_values avoids mutating the hash
  # while iterating it
  output.merge!(target_row.transform_values { |v| [v] })
  output
end
# Takes a Polars::DataFrame and reformats properties by grouping related rows
# on node_id. This expects a very specific node structure for a specific project.
#
# @param dataframe [Polars::DataFrame] the main Polars DataFrame to reformat
# @param dataframes_to_join [Array<Polars::DataFrame>,Polars::DataFrame] optional. additional DataFrame(s) to left join onto the result.
#   Joins on node_id. This is project-specific behavior.
# @return [Polars::DataFrame] returns the processed Polars DataFrame
def reformat_dataset(dataframe, dataframes_to_join = [])
  # Unique property names become the columns of the target dataframe.
  # Computed once and reused (previously this column scan ran twice).
  property_names = dataframe['property_name'].to_a.uniq.flatten
  # Hash of property name => :str, needed to create the columns with the
  # correct data types in the target dataframe. node_id is always present.
  property_name_index = property_names.to_h { |name| [name, :str] }.merge('node_id' => :str)
  target_df = Polars::DataFrame.new({ columns: property_name_index })
  # Iterate over the unique node_ids, collapsing each node's rows into a
  # single row appended to the target dataframe.
  dataframe['node_id'].to_a.uniq.flatten.each do |node_id|
    # Filter by node_id
    node = dataframe.filter(Polars.col('node_id') == node_id)
    node_df = Polars::DataFrame.new(reformat_props(node, property_names))
    target_df.extend(node_df)
  end
  # left joins each of `dataframes_to_join` onto the reformatted result.
  # [x].flatten accepts either a single DataFrame or an array of them.
  # (block param renamed so it no longer shadows the `dataframe` parameter)
  [dataframes_to_join].flatten.each do |join_df|
    target_df = target_df.join(join_df, on: 'node_id', how: 'left')
  end
  target_df
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment