Skip to content

Instantly share code, notes, and snippets.

@eightBitter
Created February 2, 2023 15:10
Show Gist options
  • Select an option

  • Save eightBitter/5e144db251e5f094633f57a7a49ed37f to your computer and use it in GitHub Desktop.

Select an option

Save eightBitter/5e144db251e5f094633f57a7a49ed37f to your computer and use it in GitHub Desktop.
Example data reformatting using Polars (the `polars-df` Ruby gem).
require_relative './reformatter'

# Reads a properties CSV for the given entity, reformats it via
# reformat_dataset (left joining any extra dataframes on node_id),
# writes the result, and reports completion.
#
# Locals go out of scope when the method returns, which releases the
# intermediate dataframes for garbage collection (replacing the manual
# `x = nil` memory-cleaning lines of the previous version).
#
# @param entity [String] entity name used to build the input/output paths
# @param dataframes_to_join [Array<Polars::DataFrame>,Polars::DataFrame]
#   optional. additional DataFrame(s) to left join onto the reformatted data
# @return [void]
def reformat_entity(entity, dataframes_to_join = [])
  props = Polars.read_csv(
    "data/archivemanager/#{entity}_properties_escaped.csv",
    dtypes: { 'node_id' => :str }
  )
  reformatted = reformat_dataset(props, dataframes_to_join)
  reformatted.write_csv("data/archivemanager/reformatted_#{entity}_properties.csv")
  puts "AM HGARC #{entity} props reformat complete"
end

### AM HGARC object props
# Aspect and hierarchy data are left joined onto the reformatted object props.
object_aspects = Polars.read_csv('data/archivemanager/object_aspects.csv', dtypes: { 'node_id' => :str })
object_hierarchy = Polars.read_csv('data/archivemanager/object_hierarchy.csv', dtypes: { 'node_id' => :str })
reformat_entity('object', [object_aspects, object_hierarchy])
# Cleaning memory
object_aspects = nil
object_hierarchy = nil

### AM HGARC agent props
reformat_entity('agent')

### AM HGARC subject props
reformat_entity('subject')

### AM HGARC note props
# The note association table links notes to objects; its note_id column is
# renamed to node_id so the left join key matches the reformatted data.
object_notes_assoc = Polars.read_csv('data/archivemanager/object_notes_assoc.csv', dtypes: { 'object_id' => :str, 'note_id' => :str })
object_notes_assoc = object_notes_assoc.rename({ 'note_id' => 'node_id' })
reformat_entity('note', object_notes_assoc)
# Cleaning memory
object_notes_assoc = nil
require 'polars'
# Takes the rows for a given node_id and reformats them into a single hash
# mapping each property name to its value.
#
# @param node [Polars::DataFrame] a DataFrame filtered to all rows for a given node_id
# @param property_names [Array<String>] an array of property names for the whole dataset.
#   required so every column exists when inserting a new DataFrame into an existing DataFrame
# @return [Hash{String=>Array}] a hash where each key is a property name and each
#   value is a single-element array holding that property's value ("" if absent)
def reformat_props(node, property_names)
  # Find the value from the first value column that is not the literal
  # string "NULL". Assumes there is 0 or 1 values per row.
  # Column 3 (serializable_value) is deliberately skipped.
  grab_value = lambda do |row|
    [0, 1, 2, 4].each do |idx|
      return row[idx] if row[idx] != "NULL"
    end
    nil
  end
  # Start with every property mapped to [""] so the resulting DataFrame
  # always carries the full column set even when a property is absent.
  output = property_names.to_h { |name| [name, [""]] }
  target_row = {}
  # de-duplicate data that is repeated verbatim
  node.rows.uniq.each do |row|
    # row[6] is the property name; call grab_value once and skip rows
    # that carry no value.
    value = grab_value.call(row)
    target_row[row[6]] = value unless value.nil?
  end
  target_row['node_id'] = node['node_id'].unique.to_a.first
  # wrap each value in an array, which is expected when inserting a new
  # Polars::DataFrame later; transform_values avoids mutating the hash
  # while iterating it
  output.merge!(target_row.transform_values { |v| [v] })
  output
end
# Takes a Polars::DataFrame and reformats properties by grouping related rows
# on node_id. This expects a very specific node structure for a specific project.
#
# @param dataframe [Polars::DataFrame] the main Polars DataFrame to reformat
# @param dataframes_to_join [Array<Polars::DataFrame>,Polars::DataFrame] optional. additional DataFrame(s) to left join onto the result.
#   Joins on node_id. This is project-specific behavior.
# @return [Polars::DataFrame] returns the processed Polars DataFrame
def reformat_dataset(dataframe, dataframes_to_join = [])
  # Unique property names become the columns of the target dataframe.
  # Computed once and reused (previously this column scan ran twice).
  property_names = dataframe['property_name'].to_a.uniq.flatten
  # Hash of property name => :str, needed to create the columns with the
  # correct data types in the target dataframe. node_id is always present.
  property_name_index = property_names.to_h { |name| [name, :str] }.merge('node_id' => :str)
  target_df = Polars::DataFrame.new({ columns: property_name_index })
  # Iterate over the unique node_ids, collapsing each node's rows into a
  # single row appended to the target dataframe.
  dataframe['node_id'].to_a.uniq.flatten.each do |node_id|
    # Filter by node_id
    node = dataframe.filter(Polars.col('node_id') == node_id)
    node_df = Polars::DataFrame.new(reformat_props(node, property_names))
    target_df.extend(node_df)
  end
  # left joins each of `dataframes_to_join` onto the reformatted result.
  # [x].flatten accepts either a single DataFrame or an array of them.
  # (block param renamed so it no longer shadows the `dataframe` parameter)
  [dataframes_to_join].flatten.each do |join_df|
    target_df = target_df.join(join_df, on: 'node_id', how: 'left')
  end
  target_df
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment