Created
July 29, 2025 12:26
-
-
Save nielsdejong/d7b3c36097fefeebf0b0f0248a07e2fd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import os | |
| import io # Import the io module to handle byte streams | |
| from langchain_core.documents import Document # Still useful for metadata, but not for LLM inference | |
| from neo4j import GraphDatabase | |
| from google.colab import files # Import the files module for uploading | |
# --- Configuration ---
# Connection settings may be overridden through environment variables so that
# real credentials never have to be pasted into the script; the literals below
# are only the fallbacks used when a variable is not set.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://46b4811c.databases.neo4j.io")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "...")
# Display name used in log messages only; the uploaded file may have any name.
CONCEPTUAL_CSV_NAME = "yacht_rock.csv"
# --- Script Execution ---
# Connect to Neo4j
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
try:
    # Fail fast on a wrong URI or wrong credentials instead of at the first query.
    driver.verify_connectivity()
except Exception:
    # Release the driver's resources before propagating the connection error;
    # nothing further down the script would get the chance to close it.
    driver.close()
    raise
print("Connected to Neo4j.")
print("LLM and LLMGraphTransformer components are removed for direct CSV parsing.")
# Load CSV
print(f"Please upload any CSV file. It will be processed as '{CONCEPTUAL_CSV_NAME}'.")
# Opens a file selection dialog in the browser; the result is indexed by
# file name below and each value is wrapped as a byte stream.
uploaded = files.upload()
if not uploaded:
    raise FileNotFoundError("No file was uploaded. Please upload a CSV file.")

# Only one file is expected, so the first uploaded name is the one processed.
actual_uploaded_filename = next(iter(uploaded))

# Parse the uploaded bytes straight into a DataFrame.
csv_stream = io.BytesIO(uploaded[actual_uploaded_filename])
df = pd.read_csv(csv_stream)

# --- Delete the uploaded file from Colab's temporary storage ---
# Best effort: a failed delete is reported but does not stop the script.
try:
    os.remove(actual_uploaded_filename)
except OSError as e:
    print(f"Error deleting file '{actual_uploaded_filename}': {e}")
else:
    print(f"'{actual_uploaded_filename}' deleted from Colab's temporary storage.")
# --- End file deletion ---

# Both columns are read for every row later on, so reject the file up front.
missing_columns = {"Track URI", "Instruments"} - set(df.columns)
if missing_columns:
    raise ValueError("CSV must contain 'Track URI' and 'Instruments' columns.")
print(f"Loaded CSV from uploaded file: {actual_uploaded_filename} (processed as '{CONCEPTUAL_CSV_NAME}')")
def parse_instruments(raw_value):
    """Turn one raw 'Instruments' cell into a list of instrument names.

    Square brackets are removed, then the text is split on commas. When that
    yields at most one item and the text contains a space, the text is split
    on spaces instead. Note that this fallback also splits a single
    multi-word instrument into separate words.

    Args:
        raw_value: The cell value as read by pandas (text, or NaN/None when
            the cell is empty).

    Returns:
        The instrument names in first-seen order without duplicates; an empty
        list when the cell is missing or contains no usable text.
    """
    # A missing cell arrives as NaN/None; str() would turn it into the literal
    # text "nan" and an Instrument node named "nan" would be created.
    if pd.isna(raw_value):
        return []

    text = str(raw_value).replace('[', '').replace(']', '')
    names = [part.strip() for part in text.split(',') if part.strip()]

    # Zero or one item means the cell was probably not comma-separated.
    if len(names) <= 1 and ' ' in text:
        names = [part.strip() for part in text.split(' ') if part.strip()]

    # Drop repeats while keeping the original order.
    return list(dict.fromkeys(names))


# MERGE (rather than CREATE) on the relationship keeps re-runs of the script
# from piling up duplicate FEATURES_INSTRUMENT edges. The aggregate in RETURN
# always yields exactly one row, which tells us whether any Song matched.
# NOTE(review): tracks are matched by the `Song` label and `trackUri` property;
# confirm this is how the track nodes were originally loaded.
LINK_INSTRUMENTS_QUERY = """
UNWIND $instruments AS instrument
MATCH (t:Song {trackUri: $track_id})
MERGE (i:Instrument {name: instrument})
MERGE (t)-[:FEATURES_INSTRUMENT]->(i)
RETURN count(DISTINCT t) AS matched_songs
"""

try:
    with driver.session() as session:
        print("Skipping deletion of old 'Track' and 'Instrument' nodes in Neo4j.")
        # Index the property used by the Instrument MERGE lookup.
        session.run("CREATE INDEX IF NOT EXISTS FOR (i:Instrument) ON (i.name)")

        # Iterate through the DataFrame rows
        for _, row in df.iterrows():
            track_id = row["Track URI"]
            raw_instruments = row["Instruments"]
            instruments = parse_instruments(raw_instruments)

            # Nothing usable in this row: report it and move on.
            if not instruments:
                print(f"Warning: No valid instruments found for Track '{track_id}' from input: '{raw_instruments}'. Skipping relationship creation.")
                continue

            record = session.run(
                LINK_INSTRUMENTS_QUERY,
                track_id=track_id,
                instruments=instruments,
            ).single()

            # Without a matching Song the query writes nothing, so do not
            # claim that anything was linked.
            if record is None or record["matched_songs"] == 0:
                print(f"Warning: No 'Song' node with trackUri '{track_id}' was found. Nothing was linked.")
                continue

            print(f"Linked '{instruments}' to Track '{track_id}'.")
finally:
    # Close the driver even when a row fails part-way through the import.
    driver.close()
print("\nScript finished. Check Neo4j.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment