Skip to content

Instantly share code, notes, and snippets.

@nielsdejong
Created July 29, 2025 12:26
Show Gist options
  • Select an option

  • Save nielsdejong/d7b3c36097fefeebf0b0f0248a07e2fd to your computer and use it in GitHub Desktop.

Select an option

Save nielsdejong/d7b3c36097fefeebf0b0f0248a07e2fd to your computer and use it in GitHub Desktop.
import pandas as pd
import os
import io # Import the io module to handle byte streams
from langchain_core.documents import Document # Still useful for metadata, but not for LLM inference
from neo4j import GraphDatabase
from google.colab import files # Import the files module for uploading
# --- Configuration ---
# Connection settings for the target Neo4j database.
NEO4J_URI = "neo4j+s://46b4811c.databases.neo4j.io"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "..."
# Display name for the dataset; in this script it only appears in console
# messages, the uploaded file itself may have any name.
CONCEPTUAL_CSV_NAME = "yacht_rock.csv"

# --- Script Execution ---
# Open the driver and check the connection up front, before any CSV work.
neo4j_credentials = (NEO4J_USERNAME, NEO4J_PASSWORD)
driver = GraphDatabase.driver(NEO4J_URI, auth=neo4j_credentials)
driver.verify_connectivity()
print("Connected to Neo4j.")
print("LLM and LLMGraphTransformer components are removed for direct CSV parsing.")
# Load CSV
# Ask for a file through Colab's upload widget. The "processed as" name is
# only a label used in the console messages of this step.
print(f"Please upload any CSV file. It will be processed as '{CONCEPTUAL_CSV_NAME}'.")
uploaded = files.upload()  # maps each uploaded file name to its raw bytes
if not uploaded:
    raise FileNotFoundError("No file was uploaded. Please upload a CSV file.")

# Only the first uploaded file is used; any additional uploads are ignored.
actual_uploaded_filename = next(iter(uploaded))
csv_bytes = uploaded[actual_uploaded_filename]
# Parse straight from the in-memory bytes rather than from a file on disk.
df = pd.read_csv(io.BytesIO(csv_bytes))

# --- Delete the uploaded file from Colab's temporary storage ---
# The DataFrame was built from the in-memory bytes, so the file of the same
# name is not needed any more. A failed removal is reported, not fatal.
try:
    os.remove(actual_uploaded_filename)
    print(f"'{actual_uploaded_filename}' deleted from Colab's temporary storage.")
except OSError as removal_error:
    print(f"Error deleting file '{actual_uploaded_filename}': {removal_error}")
# --- End file deletion ---

# Both columns are read by the import loop, so refuse to continue without them.
if not all(column in df.columns for column in ("Track URI", "Instruments")):
    raise ValueError("CSV must contain 'Track URI' and 'Instruments' columns.")
print(f"Loaded CSV from uploaded file: {actual_uploaded_filename} (processed as '{CONCEPTUAL_CSV_NAME}')")
def _parse_instruments(raw_value):
    """Turn one raw 'Instruments' cell into a list of instrument names.

    Square brackets are removed, then the text is split on commas. If that
    yields at most one item and the text contains a space, the text is split
    on single spaces instead (the cell is assumed to be space-separated).
    Empty items are dropped.

    A missing cell (None / NaN, which is what pandas produces for an empty
    CSV field) returns an empty list, so it is skipped by the caller instead
    of being stored as an instrument literally named "nan".
    """
    if pd.isna(raw_value):
        return []
    text = str(raw_value).replace("[", "").replace("]", "")
    by_comma = [part.strip() for part in text.split(",") if part.strip()]
    if len(by_comma) <= 1 and " " in text:
        return [part.strip() for part in text.split(" ") if part.strip()]
    return by_comma


# Tracks are matched by the :Song label and its trackUri property; this script
# never creates them, so they must already exist in the database.
# Both MERGE clauses keep the import idempotent: re-running the script (old
# nodes are deliberately not deleted) or repeating an instrument within a row
# does not create duplicate nodes or duplicate relationships.
# count(*) is 0 when no :Song matched, which lets the caller report it.
_LINK_INSTRUMENTS_QUERY = """
UNWIND $instruments AS instrument
MATCH (t:Song {trackUri: $track_id})
MERGE (i:Instrument {name: instrument})
MERGE (t)-[:FEATURES_INSTRUMENT]->(i)
RETURN count(*) AS links
"""

try:
    with driver.session() as session:
        print("Skipping deletion of old 'Track' and 'Instrument' nodes in Neo4j.")
        # Index on the property that the MERGE on :Instrument looks up.
        session.run("CREATE INDEX IF NOT EXISTS FOR (i:Instrument) ON (i.name)")

        # Walk the two needed columns in lockstep, one CSV row at a time.
        for track_id, raw_instruments in zip(df["Track URI"], df["Instruments"]):
            instruments = _parse_instruments(raw_instruments)

            # Nothing usable in this cell: report it and move on to the next row.
            if not instruments:
                print(f"Warning: No valid instruments found for Track '{track_id}' from input: '{raw_instruments}'. Skipping relationship creation.")
                continue

            record = session.run(
                _LINK_INSTRUMENTS_QUERY,
                track_id=track_id,
                instruments=instruments,
            ).single()

            # Without a matching :Song the query writes nothing, so do not
            # claim success in that case.
            if record is None or record["links"] == 0:
                print(f"Warning: No Song node with trackUri '{track_id}' found. No instruments were linked.")
                continue
            print(f"Linked '{instruments}' to Track '{track_id}'.")
finally:
    # Release the driver's connections even if the import fails part-way.
    driver.close()
print("\nScript finished. Check Neo4j.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment