This is a script to concatenate a mass of CSV files into a single Parquet file using some of my "commonsense" rules: the field separator is guessed from the first line of each file, every file must share the same column names, and the output name is built from the common filename prefix plus each file's unique remainder (a worked example of that rule follows below). It's only lightly modified from ChatGPT output, but I still like to have it at hand any time I switch computers.
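For instance, here is what the naming rule produces for a pair of hypothetical input names; this sketch just replays the logic of generate_output_filename in the script:

import os
from pathlib import Path

files = ["data-jan.csv", "data-feb.csv"]      # hypothetical inputs
base_names = [Path(f).stem for f in files]    # ['data-jan', 'data-feb']
prefix = os.path.commonprefix(base_names)     # 'data-'
# keep what follows the shared prefix, truncated at the first '-'
parts = [n[len(prefix):].split('-')[0] for n in base_names]
print(f"{prefix}{'+'.join(parts)}.parquet")   # data-jan+feb.parquet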
#!/usr/bin/env python3
import os
import sys
from argparse import ArgumentParser
from pathlib import Path

import pandas as pd
from rich.console import Console
from rich.table import Table

console = Console()


def guess_separator(file_path):
    """Guess the field separator from the first line of the file."""
    with open(file_path, 'r') as f:
        first_line = f.readline()
    if ',' in first_line:
        return ','
    elif '\t' in first_line:
        return '\t'
    elif ';' in first_line:
        return ';'
    else:
        console.print(f"[red]Error:[/red] Could not determine separator for {file_path}")
        sys.exit(1)


def validate_column_names(files, separators):
    """Exit if any file's columns differ from those of the first file."""
    reference_columns = set(pd.read_csv(files[0], sep=separators[files[0]], nrows=0).columns)
    for file in files[1:]:
        current_columns = set(pd.read_csv(file, sep=separators[file], nrows=0).columns)
        if reference_columns != current_columns:
            console.print(f"[red]Error:[/red] Column names do not match for {file}. Exiting.")
            sys.exit(1)


def generate_output_filename(files):
    """Build the output name from the common prefix plus each file's unique remainder."""
    base_names = [Path(file).stem for file in files]
    prefix = os.path.commonprefix(base_names)
    stripped_names = [name[len(prefix):].split('-')[0] for name in base_names if len(name) > len(prefix)]
    unique_parts = "+".join(stripped_names)
    return f"{prefix}{unique_parts}.parquet"


def main():
    parser = ArgumentParser(description="Concatenate CSV files into a single output file.")
    parser.add_argument("files", metavar="file", type=str, nargs="+", help="CSV files to concatenate")
    args = parser.parse_args()
    files = args.files

    # Check that all files exist
    for file in files:
        if not os.path.isfile(file):
            console.print(f"[red]Error:[/red] File {file} not found. Exiting.")
            sys.exit(1)

    # Guess the separator for each file
    separators = {file: guess_separator(file) for file in files}

    # Validate column names, reusing the separators guessed above
    # (instead of re-guessing, and re-opening, each file a second time)
    validate_column_names(files, separators)

    # Concatenate files
    dataframes = [pd.read_csv(file, sep=separators[file]) for file in files]
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Generate output filename
    output_file = generate_output_filename(files)

    # Export to Parquet for efficiency
    combined_df.to_parquet(output_file, engine='pyarrow', index=False)

    # Pretty-print filenames and row counts
    table = Table(title="Files and Row Counts", show_header=True, header_style="bold magenta")
    table.add_column("File Name", justify="left")
    table.add_column("Row Count", justify="right")
    for file, df in zip(files, dataframes):
        table.add_row(Path(file).name, str(len(df)))
    console.print(table)

    console.print(f"\n[green]Consolidated file saved as:[/green] {output_file}")
    console.print("To open in pandas, use:")
    console.print(f"[cyan]pd.read_parquet('{output_file}')[/cyan]")


if __name__ == "__main__":
    main()
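A minimal round-trip check, assuming the gist was saved under the hypothetical name concat_csvs.py and run as python concat_csvs.py a.csv b.csv (which, per the naming rule above, writes a+b.parquet):

import pandas as pd

# read back the combined file and confirm it looks right
df = pd.read_parquet("a+b.parquet")
print(df.shape)               # rows should equal the sum of the inputs' row counts
print(df.columns.tolist())    # the shared column set the script validated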