Created
February 2, 2023 15:10
-
-
Save eightBitter/5e144db251e5f094633f57a7a49ed37f to your computer and use it in GitHub Desktop.
example data reformatting using Polars
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require_relative './reformatter' | |
| ### AM HGARC object props | |
| # Read in props data to dataframe | |
| archivemanager_object_props = Polars.read_csv('data/archivemanager/object_properties_escaped.csv',dtypes:{'node_id'=>:str}) | |
| # Read aspects data to dataframe | |
| archivemanager_object_aspects = Polars.read_csv('data/archivemanager/object_aspects.csv',dtypes:{'node_id'=>:str}) | |
| # Read hierarchy data to dataframe | |
| archivemanager_object_hierarchy = Polars.read_csv('data/archivemanager/object_hierarchy.csv',dtypes:{'node_id'=>:str}) | |
| # Reformat dataframe | |
| archivemanager_object_reformatted = reformat_dataset(archivemanager_object_props,[archivemanager_object_aspects,archivemanager_object_hierarchy]) | |
| # Write reformatted dataframe to csv | |
| archivemanager_object_reformatted.write_csv('data/archivemanager/reformatted_object_properties.csv') | |
| # Cleaning memory | |
| archivemanager_object_props = nil | |
| archivemanager_object_aspects = nil | |
| archivemanager_object_hierarchy = nil | |
| archivemanager_object_reformatted = nil | |
| puts 'AM HGARC object props reformat complete' | |
| ### AM HGARC agent props | |
| archivemanager_agent_props = Polars.read_csv('data/archivemanager/agent_properties_escaped.csv',dtypes:{'node_id'=>:str}) | |
| archivemanager_agent_reformatted = reformat_dataset(archivemanager_agent_props) | |
| archivemanager_agent_reformatted.write_csv('data/archivemanager/reformatted_agent_properties.csv') | |
| archivemanager_agent_props = nil | |
| archivemanager_agent_reformatted = nil | |
| puts 'AM HGARC agent props reformat complete' | |
| ### AM HGARC subject props | |
| archivemanager_subject_props = Polars.read_csv('data/archivemanager/subject_properties_escaped.csv',dtypes:{'node_id'=>:str}) | |
| archivemanager_subject_reformatted = reformat_dataset(archivemanager_subject_props) | |
| archivemanager_subject_reformatted.write_csv('data/archivemanager/reformatted_subject_properties.csv') | |
| archivemanager_subject_props = nil | |
| archivemanager_subject_reformatted = nil | |
| puts 'AM HGARC subject props reformat complete' | |
| ### AM HGARC note props | |
| archivemanager_note_props = Polars.read_csv('data/archivemanager/note_properties_escaped.csv',dtypes:{'node_id'=>:str}) | |
| archivemanager_object_notes_assoc = Polars.read_csv('data/archivemanager/object_notes_assoc.csv',dtypes:{'object_id'=>:str,'note_id'=>:str}) | |
| archivemanager_object_notes_assoc = archivemanager_object_notes_assoc.rename({'note_id'=>'node_id'}) | |
| archivemanager_note_reformatted = reformat_dataset(archivemanager_note_props,archivemanager_object_notes_assoc) | |
| archivemanager_note_reformatted.write_csv('data/archivemanager/reformatted_note_properties.csv') | |
| archivemanager_note_props = nil | |
| archivemanager_object_notes_assoc = nil | |
| archivemanager_note_reformatted = nil | |
| puts 'AM HGARC note props reformat complete' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require 'polars' | |
| # Takes a Polars::Expression for a given node_id and reformats into a single | |
| # hash the property names and their values | |
| # | |
| # @param node [Polars::Expression] a Polars Expression that contains all the rows for a given node_id | |
| # @param property_names [Array<String>] an array of property names for the whole dataset. | |
| # required for inserting a new DataFrame into an existing DataFrame | |
| # @return [Hash] returns a hash where each key is each property name and the values their values | |
| def reformat_props(node,property_names) | |
| # find the value from the appropriate column | |
| # assumes there is 0 or 1 values | |
| grab_value = ->(row){ | |
| if row[0] != "NULL" | |
| return row[0] | |
| elsif row[1] != "NULL" | |
| return row[1] | |
| elsif row[2] != "NULL" | |
| return row[2] | |
| # skipping row[3] which is serializable_value | |
| elsif row[4] != "NULL" | |
| return row[4] | |
| else | |
| return nil | |
| end | |
| } | |
| target_row = [] | |
| output = property_names.map{|name| {name=>[""]}}.reduce(Hash.new,:merge) | |
| # de-duplicate data that is repeated | |
| node.rows.uniq.each do |row| | |
| # inserts a hash where the key is the property name and the value is the property value | |
| # pulled from the appropriate column using grab_value lambda | |
| target_row << {row[6]=>grab_value.call(row)} unless grab_value.call(row) == nil | |
| end | |
| # target_row << {'node_id'=>node.rows.first[5]} | |
| target_row << {'node_id'=>node['node_id'].unique.to_a.first} | |
| # convert to hash | |
| target_row = target_row.reduce(Hash.new, :merge) | |
| # wrap string values in array, which is expected with inserting a new Polars::DataFrame later | |
| target_row = target_row.each{|k,v| target_row[k] = [v]} | |
| # merges the main hash (target_row) into the output | |
| # this creates a single hash that contains all property names | |
| # with the appropriate ones filled out | |
| output.merge!(target_row) | |
| return output | |
| end | |
| # Takes a Polars::DataFrame and reformats properties by grouping related rows | |
| # on node_id. This expects a very specific node structure for a specific project. | |
| # | |
| # @param dataframe [Polars::DataFrame] the main Polars DataFrame to reformat | |
| # @param dataframes_to_join [Array<Polars::DataFrame>,DataFrame] optional. additional DataFrame(s) to left join onto `dataframe`. | |
| # Joins on node_id. This is project-specific behavior. | |
| # @return [Polars::DataFrame] returns the process Polars DataFrame | |
| def reformat_dataset(dataframe,dataframes_to_join=[]) | |
| # Create a creates a hash of unique property names where the key is the property name and the value is :str | |
| # This is needed to create the columns with correct data types in the target dataframe | |
| property_name_index = dataframe['property_name'].to_a.uniq.flatten.map{|name| {name=>:str}}.reduce(Hash.new, :merge).merge({'node_id'=>:str}) | |
| property_names = dataframe['property_name'].to_a.uniq.flatten | |
| target_df = Polars::DataFrame.new({columns:property_name_index}) | |
| # Create a unique array of the node_ids in dataframe so we can iterate | |
| node_ids = dataframe['node_id'].to_a.uniq.flatten | |
| # Iterate over the unique node_ids | |
| node_ids.each do |node_id| | |
| # Filter by node_id | |
| node = dataframe.filter(Polars.col('node_id') == node_id) | |
| node_df = Polars::DataFrame.new(reformat_props(node,property_names)) | |
| target_df.extend(node_df) | |
| end | |
| # left joins each of `dataframes_to_join` onto `dataframe` | |
| [dataframes_to_join].flatten.each {|dataframe| target_df = target_df.join(dataframe,on:'node_id',how:'left')} | |
| # target_df = target_df.join(object_aspects,on:'node_id',how:'left').join(object_hierarchy,on:'node_id',how:'left') | |
| return target_df | |
| end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment