Skip to content

Instantly share code, notes, and snippets.

@asemic-horizon
Created December 18, 2024 13:56
Show Gist options
  • Select an option

  • Save asemic-horizon/f974019dd735ca04ecc6d769a47c03d9 to your computer and use it in GitHub Desktop.

Select an option

Save asemic-horizon/f974019dd735ca04ecc6d769a47c03d9 to your computer and use it in GitHub Desktop.
This is a script to concatenate a mass of CSV files into a single Parquet file using some of my "commonsense" rules. It's only lightly modified from ChatGPT output, but I still like to have it at hand any time I switch computers.
#!/usr/bin/env python3
import pandas as pd
import sys
import os
from pathlib import Path
from argparse import ArgumentParser
from rich.console import Console
from rich.table import Table
# Shared Rich console used by every function for styled terminal output.
console = Console()
# Function to guess separator
def guess_separator(file_path):
    """Guess the field separator of a delimited text file.

    Reads only the first line and picks whichever candidate separator
    (comma, tab, or semicolon) occurs most often in it. Ties are broken
    in that priority order, which preserves the original comma-first
    behavior when counts are equal.

    Parameters
    ----------
    file_path : str or Path
        Path of the text file to inspect.

    Returns
    -------
    str
        The guessed single-character separator.

    Exits the process with status 1 (after printing an error) when none
    of the candidate separators appears in the first line.
    """
    with open(file_path, 'r') as f:
        first_line = f.readline()
    # Count occurrences instead of a plain membership test: the old
    # "',' in line" check claimed a tab-separated header as CSV whenever
    # a field merely contained a comma.
    counts = {sep: first_line.count(sep) for sep in (',', '\t', ';')}
    # max() keeps dict insertion order on ties, so ',' still wins draws.
    best = max(counts, key=counts.get)
    if counts[best] > 0:
        return best
    console.print(f"[red]Error:[/red] Could not determine separator for {file_path}")
    sys.exit(1)
# Function to validate column names
def validate_column_names(files, separator):
    """Ensure every file shares the column set of the first file.

    Parameters
    ----------
    files : list of str
        Paths of the CSV files to compare; the first is the reference.
    separator : callable
        Maps a file path to its field separator (e.g. ``guess_separator``).

    Only the header row of each file is read (``nrows=0``), so large
    files stay cheap to check. On the first mismatch an error is printed
    and the process exits with status 1. Column *order* is ignored.
    """
    def header_columns(path):
        # nrows=0 parses just the header line.
        return frozenset(pd.read_csv(path, sep=separator(path), nrows=0).columns)

    expected = header_columns(files[0])
    for file in files[1:]:
        if header_columns(file) != expected:
            console.print(f"[red]Error:[/red] Column names do not match for {file}. Exiting.")
            sys.exit(1)
# Function to generate output filename
def generate_output_filename(files):
    """Derive a ``.parquet`` filename from the input files' shared prefix.

    The stems of all inputs are reduced to their common prefix; each
    stem's remainder (truncated at the first ``-``) is then joined with
    ``+`` and appended. E.g. ``run1.csv`` + ``run2.csv`` ->
    ``run1+2.parquet``. Stems identical to the prefix contribute nothing.
    """
    stems = [Path(f).stem for f in files]
    shared = os.path.commonprefix(stems)
    tails = (stem[len(shared):].split('-')[0] for stem in stems if len(stem) > len(shared))
    return f"{shared}{'+'.join(tails)}.parquet"
# Main script logic
def main():
    """Concatenate the CSV files named on the command line into one Parquet file.

    Validates that all files exist and share a column set, concatenates
    them in command-line order, writes the result next to the inputs, and
    prints a per-file row-count summary.
    """
    parser = ArgumentParser(description="Concatenate CSV files into a single output file.")
    parser.add_argument("files", metavar="file", type=str, nargs="+", help="CSV files to concatenate")
    args = parser.parse_args()
    files = args.files

    # Fail fast if any input is missing.
    for file in files:
        if not os.path.isfile(file):
            console.print(f"[red]Error:[/red] File {file} not found. Exiting.")
            sys.exit(1)

    # Sniff each file's separator exactly once and reuse it below.
    separators = {file: guess_separator(file) for file in files}

    # Look separators up in the dict rather than passing guess_separator,
    # which re-opened and re-sniffed every file a second time.
    validate_column_names(files, separators.__getitem__)

    # Load everything and concatenate; row order follows the CLI order.
    dataframes = [pd.read_csv(file, sep=separators[file]) for file in files]
    combined_df = pd.concat(dataframes, ignore_index=True)

    output_file = generate_output_filename(files)
    # Parquet is columnar and compressed -- far more efficient than CSV.
    combined_df.to_parquet(output_file, engine='pyarrow', index=False)

    # Pretty-print filenames and row counts.
    table = Table(title="Files and Row Counts", show_header=True, header_style="bold magenta")
    table.add_column("File Name", justify="left")
    table.add_column("Row Count", justify="right")
    for file, df in zip(files, dataframes):
        table.add_row(Path(file).name, str(len(df)))
    console.print(table)

    console.print(f"\n[green]Consolidated file saved as:[/green] {output_file}")
    console.print("To open in pandas, use:")
    console.print(f"[cyan]pd.read_parquet('{output_file}')[/cyan]")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment