guenp · November 28, 2024 13:42
diff --git a/README.md b/README.md
diff --git a/bird_migration.ipynb b/bird_migration.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3a0f5c76-0aeb-40ca-83b1-eaa1cc0d41dd",
   "metadata": {},
   "source": [
    "## 0. Load eBird data into MotherDuck"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6285fb23-e1cd-4009-aeed-9bcf4b918356",
   "metadata": {},
   "source": [
    "This took approximately 1 hour to run (not including downloading the `.tar` file from eBird).\n",
    "\n",
    "This includes a simple data cleaning step that casts the `\"OBSERVATION COUNT\"` column to an integer."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8fdba394-e4bd-4237-b977-43eb739b768a",
   "metadata": {},
   "source": [
    "```bash\n",
    "$ tar -xf data/ebd_relSep-2024.tar\n",
    "$ duckdb \"md:ebird\"\n",
    "```\n",
    "\n",
    "```sql\n",
    "CREATE TABLE ebd AS\n",
    "SELECT\n",
    "    \"GLOBAL UNIQUE IDENTIFIER\",\n",
    "    \"COMMON NAME\",\n",
    "    \"SCIENTIFIC NAME\",\n",
    "    \"SUBSPECIES COMMON NAME\",\n",
    "    \"SUBSPECIES SCIENTIFIC NAME\",\n",
    "    CAST(CASE WHEN \"OBSERVATION COUNT\" = 'X' THEN 0 ELSE CAST(\"OBSERVATION COUNT\" AS INT) END AS INT) AS \"OBSERVATION COUNT\",\n",
    "    \"COUNTRY\",\n",
    "    \"STATE\",\n",
    "    \"COUNTY\",\n",
    "    \"LOCALITY\",\n",
    "    \"LATITUDE\",\n",
    "    \"LONGITUDE\",\n",
    "    \"OBSERVATION DATE\",\n",
    "FROM read_csv('data/ebd_relSep-2024.txt.gz');\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c30a4d5-3ee3-4794-8bdf-7ceb478d597f",
   "metadata": {},
   "source": [
    "## 1. Connect to MotherDuck and explore the `ebd` table in the `md:ebird` database"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "712ae9bb-4756-4a60-8430-ad36f29a3d73",
   "metadata": {},
   "source": [
    "Load `MOTHERDUCK_TOKEN` environment variable (get this from https://app.motherduck.com/settings) from .env file:\n",
    "\n",
    "```\n",
    "MOTHERDUCK_TOKEN=<paste token here>\n",
    "```\n",
    "\n",
    "You can also export it to the environment variables before you start Jupyter."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4bc2dcd6-a5c3-471d-9d44-f6748405464c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from dotenv import load_dotenv\n",
    "load_dotenv();"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76a05395-6a14-46b0-8fb9-2be0104cf646",
   "metadata": {},
   "source": [
    "Connect to MotherDuck"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f99723dc-9d1a-4b68-9c1e-fcb75014dcb4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import duckdb\n",
    "con = duckdb.connect(\"md:ebird\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "988392e8-1723-434a-8a87-5c4e1dc1feb8",
   "metadata": {},
   "source": [
    "Add ConnectionWrapper with convenience functions and get all tables in the database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "364f54aa-fdba-4b95-b498-733deb08442d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tools import ConnectionWrapper\n",
    "con = ConnectionWrapper(con)\n",
    "con.tables()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bfad8041-0d4e-4813-94f2-d4d34478f6d0",
   "metadata": {},
   "source": [
    "Get a high level description of the table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d196b78f-6a03-45cf-855e-b9b46b090112",
   "metadata": {},
   "outputs": [],
   "source": [
    "con.describe(\"ebd\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ced4483d-3d00-41ed-96fd-17ba5a638136",
   "metadata": {},
   "source": [
    "Count the number of rows in the table with the **DuckDB relational API**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40688763-0e78-4578-9964-c1c06fdb44f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "con.table(\"ebd\").count(\"*\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a04cbc92-36ab-46ae-b369-4cea6d2340ae",
   "metadata": {},
   "source": [
    "Get a preview of the table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf075096-2b4a-4ca7-a6ef-f4b543712671",
   "metadata": {},
   "outputs": [],
   "source": [
    "con.table(\"ebd\").select(\"COMMON NAME\", \"OBSERVATION DATE\", \"OBSERVATION COUNT\", \"COUNTRY\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "798e0b50-dd01-4453-9420-073af67a0d04",
   "metadata": {},
   "source": [
    "## 2. Get the top 5 bird species observed in Washington State"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7cd803ed-79b7-4fb9-a096-d5f4562b5ba4",
   "metadata": {},
   "source": [
    "Use the PySpark API to filter, group by, aggregate, order and limit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4b5f540-5eeb-41e0-9c44-2d46df015416",
   "metadata": {},
   "outputs": [],
   "source": [
    "from duckdb.experimental.spark.sql import SparkSession\n",
    "from duckdb.experimental.spark.sql.functions import avg, col, count, sum, lit, year\n",
    "from duckdb.experimental.spark.sql.dataframe import DataFrame\n",
    "\n",
    "# create a SparkSession\n",
    "spark = SparkSession.builder.appName(\"DuckMigration\").getOrCreate()\n",
    "\n",
    "# Load the data into DuckDB Spark API\n",
    "df = DataFrame(con.table(\"ebd\"), spark)\n",
    "\n",
    "result = (\n",
    "    df\n",
    "    .filter(\n",
    "        (year(col(\"OBSERVATION DATE\")) == 2023) & \n",
    "        (col(\"COUNTRY\") == \"United States\") & \n",
    "        (col(\"STATE\") == \"Washington\")\n",
    "    )\n",
    "    .groupBy(\"COMMON NAME\")\n",
    "    .agg(sum(\"OBSERVATION COUNT\").alias(\"total_count\"))\n",
    "    .orderBy(col(\"total_count\").desc())\n",
    "    .limit(5)\n",
    "    .select(\"COMMON NAME\", \"total_count\")\n",
    ")\n",
    "\n",
    "# show the result\n",
    "result.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "acb3e141-d911-459a-b2f5-487fbac806df",
   "metadata": {},
   "source": [
    "Alternatively, use `prompt_query` or `prompt_sql` to generate SQL queries using natural language"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4201c813-3e1c-4a59-ac1e-ff18cc00955d",
   "metadata": {},
   "outputs": [],
   "source": [
    "con.prompt_query(\"Get the top 5 common names and total observation count of birds in Washington State in 2023\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d3cd4c5-8d3b-4abf-8222-efd966f566b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = con.prompt_sql(\"Get the top 5 common names and total observation count of birds in Washington State in 2023\")\n",
    "print(query)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "902c69ff-581d-46a3-86e9-8ab9cb0e31b9",
   "metadata": {},
   "source": [
    "## 3. Get all observations of Snow Geese in 2023, sorted by month"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47b77f15-e0fa-4af6-9a82-099970076e5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get all Snow Goose observations from 2023 sorted by month\n",
    "from duckdb.experimental.spark.sql import SparkSession\n",
    "from duckdb.experimental.spark.sql.functions import avg, col, count, sum, lit, month, year\n",
    "from duckdb.experimental.spark.sql.dataframe import DataFrame\n",
    "\n",
    "common_name = \"Snow Goose\"\n",
    "spark = SparkSession.builder.appName(common_name).getOrCreate()\n",
    "df = DataFrame(con.table(\"ebd\"), spark)\n",
    "\n",
    "# Get all Snow Goose observations from 2023\n",
    "result = (\n",
    "    df\n",
    "    .filter((year(col(\"OBSERVATION DATE\")) == 2023) & (col(\"COMMON NAME\") == common_name))\n",
    "    .orderBy(col(\"OBSERVATION DATE\").asc())\n",
    "    .select(\n",
    "        col(\"COMMON NAME\").alias(\"name\"),\n",
    "        col(\"OBSERVATION DATE\").alias(\"obs_dt\"),\n",
    "        month(\"OBSERVATION DATE\").alias(\"month\"),\n",
    "        col(\"OBSERVATION COUNT\").alias(\"count\"),\n",
    "        col(\"LATITUDE\").alias(\"lat\"),\n",
    "        col(\"LONGITUDE\").alias(\"lng\"),\n",
    "        col(\"COUNTRY\").alias(\"country\")\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18eff6d3-a46b-4a54-be1b-f7f56c0b6f47",
   "metadata": {},
   "source": [
    "Store the results in a temporary in-memory table."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4729439c-3d56-4102-9bb8-58991cc79eb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "con.to_temp_table(result.relation, table_name=\"bird_obs\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a2ea29a9-115b-471d-9aff-63bdfc3bae15",
   "metadata": {},
   "source": [
    "Inspect the table."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8fccfe66-1116-45ce-8943-ada2bba73057",
   "metadata": {},
   "outputs": [],
   "source": [
    "con.table(\"bird_obs\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d4baf36-212e-4139-b001-d6026d723eb8",
   "metadata": {},
   "outputs": [],
   "source": [
    "con.table(\"bird_obs\").count(\"*\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c1e3832c-ad6d-47b7-b743-e77e58707270",
   "metadata": {},
   "source": [
    "## 4. Prepare the results and visualize them"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e4f3a0ab-d6cd-4aaf-9d14-ff274f361637",
   "metadata": {},
   "source": [
    "Use H3 community extension to downsample the data for plotting and export to Pandas dataframe.\n",
    "\n",
    "This code will run all locally on my temporary table!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "168a384b-1558-426e-9347-6ef807a6dc5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "con.install_extension(\"h3\", repository=\"community\")\n",
    "con.load_extension(\"h3\")\n",
    "\n",
    "rel = (\n",
    "    con\n",
    "    .table(\"bird_obs\")\n",
    "    .select(\"\"\"\n",
    "        *,\n",
    "        h3_latlng_to_cell(lat::double, lng::double, 6) AS cell_id,\n",
    "        h3_cell_to_lat(cell_id) AS cell_lat,\n",
    "        h3_cell_to_lng(cell_id) AS cell_lng\n",
    "    \"\"\")\n",
    "    .order(\"obs_dt ASC\")\n",
    ")\n",
    "\n",
    "# Export to Pandas\n",
    "df = rel.df()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "346a447d-80a6-4b14-b1ca-0f2df2a48e9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e0e5a246-549d-415e-a62b-2b07f18ecaf8",
   "metadata": {},
   "source": [
    "Plot on heat map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0225b249-6d4b-4de6-850e-8fefb172ff09",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.express as px\n",
    "fig = px.density_map(df, lat='cell_lat', lon='cell_lng', z='count', animation_frame='month', radius=4,\n",
    "                        center=dict(lat=39.5, lon=-96.5), zoom=3, range_color=[0,10000],\n",
    "                        map_style=\"open-street-map\", height=800, title=df.name.unique()[0])\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0cd7230f-7025-4b5e-b9ba-0863f972b8e3",
   "metadata": {},
   "source": [
    "## 5. Get the bird that traveled the furthest distance"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1464c625-9877-4f71-9fa4-70341db04831",
   "metadata": {},
   "source": [
    "Get the birds that traveled the longest distance: spend the winter in Mexico and the summer in Canada"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8e7b94b-f2e1-4640-872f-be2e18b0cd83",
   "metadata": {},
   "outputs": [],
   "source": [
    "con.prompt_query(\n",
    "    \"What is the top 5 of bird species (by common name) that were most frequently observed in Mexico \"\n",
    "    \"during the Winter months and in Canada during the Summer months?\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e8ce4432-2adb-45af-b1e2-14cd47d1aefb",
   "metadata": {},
   "source": [
    "Get all bird observations from 2023 sorted by month"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca19c7b6-ad52-4102-a65e-b75e2585eb83",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tools import get_observations\n",
    "\n",
    "common_name = \"Semipalmated Sandpiper\"\n",
    "df = get_observations(con, common_name, 2023)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27096166-a413-4039-81b2-ec1253e494fc",
   "metadata": {},
   "source": [
    "Plot on heat map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52ff4341-c12f-4abc-8a80-09f423349447",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.express as px\n",
    "fig = px.density_map(df, lat='cell_lat', lon='cell_lng', z='count', animation_frame='month', radius=4,\n",
    "                        center=dict(lat=19.5, lon=-96.5), zoom=1, range_color=[0,800],\n",
    "                        map_style=\"open-street-map\", height=800, title=df.name.unique()[0])\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5e779bcc-26be-4b8a-a5d2-c15b31ca3036",
   "metadata": {},
   "source": [
    "## 6. Use gpt-4o model to find the most observed species in the list that are ducks"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cbb92b2b-86e5-4afe-ab96-e5ee1fc89405",
   "metadata": {},
   "source": [
    "We can use gpt-4o and utilize the data in that model to get insights into our data!\n",
    "\n",
    "For instance, it has some basic knowledge about duck species."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4b57c82-84e1-4dfa-8edd-d7876a984f5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(con.prompt(\"What is the scientific name for duck?\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3343232e-4ad6-45bd-8b55-aa53dc43b22f",
   "metadata": {},
   "source": [
    "Let's use the SQL generator to create a text prompt for gpt-4o that contains some of the data from the eBird database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf5a3b90-f77a-4dfb-9a4c-77e627c69713",
   "metadata": {},
   "outputs": [],
   "source": [
    "top_observations = con.prompt_sql(\"\"\"\n",
    "Give me the top 100 common names and observation year for Washington State in 2023 by total number of observations.\n",
    "Then aggregate them into a single column in the form \"Column name: value\" and aggregate all rows.\n",
    "\n",
    "Then prepend this text:\n",
    "    User asks the following question: Which top 10 duck species were observed most frequently?\n",
    "    Please answer the question, given the additional information.\n",
    "    Here is the additional information:\n",
    "\n",
    "Add newlines to every new sentence. The result column should be called \"text\".\n",
    "\"\"\").replace(\"\\\\n\", \"\\n\").replace(\";\", \"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac8179f9-bf56-439e-a4fd-3730b8a035d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(con.sql(top_observations).fetchone()[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e55121f3-f160-4931-bf1e-ece7adc71243",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(con.sql(f\"\"\"\n",
    "WITH prompt_text AS (\n",
    "    {top_observations}\n",
    ")\n",
    "\n",
    "SELECT prompt(text) FROM prompt_text\n",
    "\"\"\").fetchone()[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf2644b6-c0a3-4390-8f16-43bb1e335b76",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tools import get_observations\n",
    "\n",
    "df = get_observations(con, \"American Wigeon\", 2023)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "56f29eb5-5db0-489c-93dd-1c614657849b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-11-08T07:21:40.645490Z",
     "iopub.status.busy": "2024-11-08T07:21:40.645039Z",
     "iopub.status.idle": "2024-11-08T07:21:40.653664Z",
     "shell.execute_reply": "2024-11-08T07:21:40.651681Z",
     "shell.execute_reply.started": "2024-11-08T07:21:40.645458Z"
    }
   },
   "source": [
    "Plot on heat map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c66a1f6e-8707-457e-879b-4bea40baf385",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.express as px\n",
    "fig = px.density_map(df, lat='cell_lat', lon='cell_lng', z='count', animation_frame='month', radius=4,\n",
    "                        center=dict(lat=19.5, lon=-96.5), zoom=1, range_color=[0,1000],\n",
    "                        map_style=\"open-street-map\", height=800, title=df.name.unique()[0])\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5cbc14d9-423e-4b81-8a36-c7145c265b81",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
diff --git a/pyproject.toml b/pyproject.toml
 [tool.poetry]
 name = "pydata-demo"
 version = "0.1.0"
 description = ""
 authors = ["Guen Prawiroatmodjo <[email protected]>"]
 readme = "README.md"

 [tool.poetry.dependencies]
 python = "^3.12"
 duckdb = "^1.0.0, <=1.1.2"
 jupyter = "^1.1.1"
 python-dotenv = "^1.0.1"
 jupyterlab-execute-time = "^3.2.0"
 plotly = "^5.24.1"
 plotly-express = "^0.4.1"


 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
diff --git a/tools.py b/tools.py
 from typing import Optional
 from duckdb import DuckDBPyConnection, DuckDBPyRelation


 class ConnectionWrapper():
    """Connection wrapper for easier querying"""
    def __init__(self, con: DuckDBPyConnection):
        self.con = con
        (self.database, ) = self.con.sql("select current_database()").fetchone()

    def attach(self, database: str, name: str):
        return self.con.sql(f"ATTACH '{database}' AS {name}")

    def describe(self, table_name: str):
        return self.con.sql(f"DESCRIBE {table_name}")

    def summarize(self, table_name: str):
        return self.con.sql(f"SUMMARIZE {table_name}")

    def tables(self):
        return self.con.sql(f"SELECT database, schema, name FROM (SHOW ALL TABLES) WHERE database IN ('memdb', '{self.database}')")

    def table(self, table_name: str) -> DuckDBPyRelation:
        if "." in table_name:
            return self.con.sql(f"FROM {table_name}")
        return self.con.table(table_name)
    
    def to_table(self, relation: DuckDBPyRelation, table_name: str, replace: bool = False, temp: bool = False):
        or_replace = " OR REPLACE " if replace is True else " "
        temp_tbl = "TEMP "
        self.con.sql(f"CREATE{or_replace}{temp_tbl}TABLE {table_name} AS ({relation.sql_query()})")

    def to_temp_table(self, relation: DuckDBPyRelation, table_name: str):
        self.to_table(relation, table_name, replace=True, temp=True)

    def prompt_query(self, prompt: str):
        return self.con.sql(f"PRAGMA prompt_query('{prompt}')")
    
    def prompt_sql(self, prompt: str):
        (query,) = self.con.sql(f"CALL prompt_sql('{prompt}')").fetchone()
        return query
    
    def prompt(self, prompt: str):
        return self.con.sql(f"SELECT prompt('{prompt}')").fetchone()[0]

    def __getattr__(self, name: str):
        if hasattr(self.con, name):
            return getattr(self.con, name)


 def get_observations(con: ConnectionWrapper, common_name: str, year: int):
    from duckdb.experimental.spark.sql import SparkSession
    from duckdb.experimental.spark.sql.functions import col, month
    from duckdb.experimental.spark.sql.functions import year as apply_year
    from duckdb.experimental.spark.sql.dataframe import DataFrame

    spark = SparkSession.builder.appName(common_name).getOrCreate()
    df = DataFrame(con.table("ebd"), spark)

    result = (
        df
        .filter((apply_year(col("OBSERVATION DATE")) == year) & (col("COMMON NAME") == common_name))
        .orderBy(col("OBSERVATION DATE").asc())
        .select(
            col("COMMON NAME").alias("name"),
            col("OBSERVATION DATE").alias("obs_dt"),
            month("OBSERVATION DATE").alias("month"),
            col("OBSERVATION COUNT").alias("count"),
            col("LATITUDE").alias("lat"),
            col("LONGITUDE").alias("lng"),
            col("COUNTRY").alias("country")
        )
    )

    con.to_temp_table(result.relation, table_name="bird_obs")

    # Use H3 to downsample the data for plotting
    con.install_extension("h3", repository="community")
    con.load_extension("h3")

    rel = (
        con
        .table("bird_obs")
        .select("""
            *,
            h3_latlng_to_cell(lat::double, lng::double, 6) AS cell_id,
            h3_cell_to_lat(cell_id) AS cell_lat,
            h3_cell_to_lng(cell_id) AS cell_lng
        """)
    )

    df = DataFrame(rel, spark)

    result = (
        df
        .orderBy(
            col("obs_dt").asc()
        )  # Order by the week number
    )

    # Export to Pandas
    df_result = result.toPandas()

    return df_result
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "3a0f5c76-0aeb-40ca-83b1-eaa1cc0d41dd",
	"metadata": {},
	"source": [
	"## 0. Load eBird data into MotherDuck"
	]
	},
	{
	"cell_type": "markdown",
	"id": "6285fb23-e1cd-4009-aeed-9bcf4b918356",
	"metadata": {},
	"source": [
	"This took approximately 1 hour to run (not including downloading the `.tar` file from eBird).\n",
	"\n",
	"This includes a simple data cleaning step that casts the `\"OBSERVATION COUNT\"` column to an integer."
	]
	},
	{
	"cell_type": "markdown",
	"id": "8fdba394-e4bd-4237-b977-43eb739b768a",
	"metadata": {},
	"source": [
	"```bash\n",
	"$ tar -xf data/ebd_relSep-2024.tar\n",
	"$ duckdb \"md:ebird\"\n",
	"```\n",
	"\n",
	"```sql\n",
	"CREATE TABLE ebd AS\n",
	"SELECT\n",
	" \"GLOBAL UNIQUE IDENTIFIER\",\n",
	" \"COMMON NAME\",\n",
	" \"SCIENTIFIC NAME\",\n",
	" \"SUBSPECIES COMMON NAME\",\n",
	" \"SUBSPECIES SCIENTIFIC NAME\",\n",
	" CAST(CASE WHEN \"OBSERVATION COUNT\" = 'X' THEN 0 ELSE CAST(\"OBSERVATION COUNT\" AS INT) END AS INT) AS \"OBSERVATION COUNT\",\n",
	" \"COUNTRY\",\n",
	" \"STATE\",\n",
	" \"COUNTY\",\n",
	" \"LOCALITY\",\n",
	" \"LATITUDE\",\n",
	" \"LONGITUDE\",\n",
	" \"OBSERVATION DATE\",\n",
	"FROM read_csv('data/ebd_relSep-2024.txt.gz');\n",
	"```"
	]
	},
	{
	"cell_type": "markdown",
	"id": "0c30a4d5-3ee3-4794-8bdf-7ceb478d597f",
	"metadata": {},
	"source": [
	"## 1. Connect to MotherDuck and explore the `ebd` table in the `md:ebird` database"
	]
	},
	{
	"cell_type": "markdown",
	"id": "712ae9bb-4756-4a60-8430-ad36f29a3d73",
	"metadata": {},
	"source": [
	"Load `MOTHERDUCK_TOKEN` environment variable (get this from https://app.motherduck.com/settings) from .env file:\n",
	"\n",
	"```\n",
	"MOTHERDUCK_TOKEN=<paste token here>\n",
	"```\n",
	"\n",
	"You can also export it to the environment variables before you start Jupyter."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "4bc2dcd6-a5c3-471d-9d44-f6748405464c",
	"metadata": {},
	"outputs": [],
	"source": [
	"from dotenv import load_dotenv\n",
	"load_dotenv();"
	]
	},
	{
	"cell_type": "markdown",
	"id": "76a05395-6a14-46b0-8fb9-2be0104cf646",
	"metadata": {},
	"source": [
	"Connect to MotherDuck"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "f99723dc-9d1a-4b68-9c1e-fcb75014dcb4",
	"metadata": {},
	"outputs": [],
	"source": [
	"import duckdb\n",
	"con = duckdb.connect(\"md:ebird\")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "988392e8-1723-434a-8a87-5c4e1dc1feb8",
	"metadata": {},
	"source": [
	"Add ConnectionWrapper with convenience functions and get all tables in the database"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "364f54aa-fdba-4b95-b498-733deb08442d",
	"metadata": {},
	"outputs": [],
	"source": [
	"from tools import ConnectionWrapper\n",
	"con = ConnectionWrapper(con)\n",
	"con.tables()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "bfad8041-0d4e-4813-94f2-d4d34478f6d0",
	"metadata": {},
	"source": [
	"Get a high level description of the table"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "d196b78f-6a03-45cf-855e-b9b46b090112",
	"metadata": {},
	"outputs": [],
	"source": [
	"con.describe(\"ebd\")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "ced4483d-3d00-41ed-96fd-17ba5a638136",
	"metadata": {},
	"source": [
	"Count the number of rows in the table with the DuckDB relational API"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "40688763-0e78-4578-9964-c1c06fdb44f0",
	"metadata": {},
	"outputs": [],
	"source": [
	"con.table(\"ebd\").count(\"*\")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "a04cbc92-36ab-46ae-b369-4cea6d2340ae",
	"metadata": {},
	"source": [
	"Get a preview of the table"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "cf075096-2b4a-4ca7-a6ef-f4b543712671",
	"metadata": {},
	"outputs": [],
	"source": [
	"con.table(\"ebd\").select(\"COMMON NAME\", \"OBSERVATION DATE\", \"OBSERVATION COUNT\", \"COUNTRY\")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "798e0b50-dd01-4453-9420-073af67a0d04",
	"metadata": {},
	"source": [
	"## 2. Get the top 5 bird species observed in Washington State"
	]
	},
	{
	"cell_type": "markdown",
	"id": "7cd803ed-79b7-4fb9-a096-d5f4562b5ba4",
	"metadata": {},
	"source": [
	"Use the PySpark API to filter, group by, aggregate, order and limit"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "f4b5f540-5eeb-41e0-9c44-2d46df015416",
	"metadata": {},
	"outputs": [],
	"source": [
	"from duckdb.experimental.spark.sql import SparkSession\n",
	"from duckdb.experimental.spark.sql.functions import avg, col, count, sum, lit, year\n",
	"from duckdb.experimental.spark.sql.dataframe import DataFrame\n",
	"\n",
	"# create a SparkSession\n",
	"spark = SparkSession.builder.appName(\"DuckMigration\").getOrCreate()\n",
	"\n",
	"# Load the data into DuckDB Spark API\n",
	"df = DataFrame(con.table(\"ebd\"), spark)\n",
	"\n",
	"result = (\n",
	" df\n",
	" .filter(\n",
	" (year(col(\"OBSERVATION DATE\")) == 2023) & \n",
	" (col(\"COUNTRY\") == \"United States\") & \n",
	" (col(\"STATE\") == \"Washington\")\n",
	" )\n",
	" .groupBy(\"COMMON NAME\")\n",
	" .agg(sum(\"OBSERVATION COUNT\").alias(\"total_count\"))\n",
	" .orderBy(col(\"total_count\").desc())\n",
	" .limit(5)\n",
	" .select(\"COMMON NAME\", \"total_count\")\n",
	")\n",
	"\n",
	"# show the result\n",
	"result.show()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "acb3e141-d911-459a-b2f5-487fbac806df",
	"metadata": {},
	"source": [
	"Alternatively, use `prompt_query` or `prompt_sql` to generate SQL queries using natural language"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "4201c813-3e1c-4a59-ac1e-ff18cc00955d",
	"metadata": {},
	"outputs": [],
	"source": [
	"con.prompt_query(\"Get the top 5 common names and total observation count of birds in Washington State in 2023\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "2d3cd4c5-8d3b-4abf-8222-efd966f566b6",
	"metadata": {},
	"outputs": [],
	"source": [
	"query = con.prompt_sql(\"Get the top 5 common names and total observation count of birds in Washington State in 2023\")\n",
	"print(query)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "902c69ff-581d-46a3-86e9-8ab9cb0e31b9",
	"metadata": {},
	"source": [
	"## 3. Get all observations of Snow Geese in 2023, sorted by month"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "47b77f15-e0fa-4af6-9a82-099970076e5f",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Get all Snow Goose observations from 2023 sorted by month\n",
	"from duckdb.experimental.spark.sql import SparkSession\n",
	"from duckdb.experimental.spark.sql.functions import avg, col, count, sum, lit, month, year\n",
	"from duckdb.experimental.spark.sql.dataframe import DataFrame\n",
	"\n",
	"common_name = \"Snow Goose\"\n",
	"spark = SparkSession.builder.appName(common_name).getOrCreate()\n",
	"df = DataFrame(con.table(\"ebd\"), spark)\n",
	"\n",
	"# Get all Snow Goose observations from 2023\n",
	"result = (\n",
	" df\n",
	" .filter((year(col(\"OBSERVATION DATE\")) == 2023) & (col(\"COMMON NAME\") == common_name))\n",
	" .orderBy(col(\"OBSERVATION DATE\").asc())\n",
	" .select(\n",
	" col(\"COMMON NAME\").alias(\"name\"),\n",
	" col(\"OBSERVATION DATE\").alias(\"obs_dt\"),\n",
	" month(\"OBSERVATION DATE\").alias(\"month\"),\n",
	" col(\"OBSERVATION COUNT\").alias(\"count\"),\n",
	" col(\"LATITUDE\").alias(\"lat\"),\n",
	" col(\"LONGITUDE\").alias(\"lng\"),\n",
	" col(\"COUNTRY\").alias(\"country\")\n",
	" )\n",
	")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "18eff6d3-a46b-4a54-be1b-f7f56c0b6f47",
	"metadata": {},
	"source": [
	"Store the results in a temporary in-memory table."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "4729439c-3d56-4102-9bb8-58991cc79eb2",
	"metadata": {},
	"outputs": [],
	"source": [
	"con.to_temp_table(result.relation, table_name=\"bird_obs\")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "a2ea29a9-115b-471d-9aff-63bdfc3bae15",
	"metadata": {},
	"source": [
	"Inspect the table."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "8fccfe66-1116-45ce-8943-ada2bba73057",
	"metadata": {},
	"outputs": [],
	"source": [
	"con.table(\"bird_obs\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "9d4baf36-212e-4139-b001-d6026d723eb8",
	"metadata": {},
	"outputs": [],
	"source": [
	"con.table(\"bird_obs\").count(\"*\")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "c1e3832c-ad6d-47b7-b743-e77e58707270",
	"metadata": {},
	"source": [
	"## 4. Prepare the results and visualize them"
	]
	},
	{
	"cell_type": "markdown",
	"id": "e4f3a0ab-d6cd-4aaf-9d14-ff274f361637",
	"metadata": {},
	"source": [
	"Use H3 community extension to downsample the data for plotting and export to Pandas dataframe.\n",
	"\n",
	"This code will run all locally on my temporary table!"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "168a384b-1558-426e-9347-6ef807a6dc5c",
	"metadata": {},
	"outputs": [],
	"source": [
	"con.install_extension(\"h3\", repository=\"community\")\n",
	"con.load_extension(\"h3\")\n",
	"\n",
	"rel = (\n",
	" con\n",
	" .table(\"bird_obs\")\n",
	" .select(\"\"\"\n",
	" *,\n",
	" h3_latlng_to_cell(lat::double, lng::double, 6) AS cell_id,\n",
	" h3_cell_to_lat(cell_id) AS cell_lat,\n",
	" h3_cell_to_lng(cell_id) AS cell_lng\n",
	" \"\"\")\n",
	" .order(\"obs_dt ASC\")\n",
	")\n",
	"\n",
	"# Export to Pandas\n",
	"df = rel.df()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "346a447d-80a6-4b14-b1ca-0f2df2a48e9b",
	"metadata": {},
	"outputs": [],
	"source": [
	"df.info()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "e0e5a246-549d-415e-a62b-2b07f18ecaf8",
	"metadata": {},
	"source": [
	"Plot on heat map"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "0225b249-6d4b-4de6-850e-8fefb172ff09",
	"metadata": {},
	"outputs": [],
	"source": [
	"import plotly.express as px\n",
	"fig = px.density_map(df, lat='cell_lat', lon='cell_lng', z='count', animation_frame='month', radius=4,\n",
	" center=dict(lat=39.5, lon=-96.5), zoom=3, range_color=[0,10000],\n",
	" map_style=\"open-street-map\", height=800, title=df.name.unique()[0])\n",
	"fig.show()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "0cd7230f-7025-4b5e-b9ba-0863f972b8e3",
	"metadata": {},
	"source": [
	"## 5. Get the bird that traveled the furthest distance"
	]
	},
	{
	"cell_type": "markdown",
	"id": "1464c625-9877-4f71-9fa4-70341db04831",
	"metadata": {},
	"source": [
	"Get the birds that traveled the longest distance: spend the winter in Mexico and the summer in Canada"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "c8e7b94b-f2e1-4640-872f-be2e18b0cd83",
	"metadata": {},
	"outputs": [],
	"source": [
	"con.prompt_query(\n",
	" \"What is the top 5 of bird species (by common name) that were most frequently observed in Mexico \"\n",
	" \"during the Winter months and in Canada during the Summer months?\"\n",
	")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "e8ce4432-2adb-45af-b1e2-14cd47d1aefb",
	"metadata": {},
	"source": [
	"Get all bird observations from 2023 sorted by month"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "ca19c7b6-ad52-4102-a65e-b75e2585eb83",
	"metadata": {},
	"outputs": [],
	"source": [
	"from tools import get_observations\n",
	"\n",
	"common_name = \"Semipalmated Sandpiper\"\n",
	"df = get_observations(con, common_name, 2023)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "27096166-a413-4039-81b2-ec1253e494fc",
	"metadata": {},
	"source": [
	"Plot on heat map"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "52ff4341-c12f-4abc-8a80-09f423349447",
	"metadata": {},
	"outputs": [],
	"source": [
	"import plotly.express as px\n",
	"fig = px.density_map(df, lat='cell_lat', lon='cell_lng', z='count', animation_frame='month', radius=4,\n",
	" center=dict(lat=19.5, lon=-96.5), zoom=1, range_color=[0,800],\n",
	" map_style=\"open-street-map\", height=800, title=df.name.unique()[0])\n",
	"fig.show()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "5e779bcc-26be-4b8a-a5d2-c15b31ca3036",
	"metadata": {},
	"source": [
	"## 6. Use gpt-4o model to find the most observed species in the list that are ducks"
	]
	},
	{
	"cell_type": "markdown",
	"id": "cbb92b2b-86e5-4afe-ab96-e5ee1fc89405",
	"metadata": {},
	"source": [
	"We can use gpt-4o and utilize the data in that model to get insights into our data!\n",
	"\n",
	"For instance, it has some basic knowledge about duck species."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "f4b57c82-84e1-4dfa-8edd-d7876a984f5e",
	"metadata": {},
	"outputs": [],
	"source": [
	"print(con.prompt(\"What is the scientific name for duck?\"))"
	]
	},
	{
	"cell_type": "markdown",
	"id": "3343232e-4ad6-45bd-8b55-aa53dc43b22f",
	"metadata": {},
	"source": [
	"Let's use the SQL generator to create a text prompt for gpt-4o that contains some of the data from the eBird database."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "bf5a3b90-f77a-4dfb-9a4c-77e627c69713",
	"metadata": {},
	"outputs": [],
	"source": [
	"top_observations = con.prompt_sql(\"\"\"\n",
	"Give me the top 100 common names and observation year for Washington State in 2023 by total number of observations.\n",
	"Then aggregate them into a single column in the form \"Column name: value\" and aggregate all rows.\n",
	"\n",
	"Then prepend this text:\n",
	" User asks the following question: Which top 10 duck species were observed most frequently?\n",
	" Please answer the question, given the additional information.\n",
	" Here is the additional information:\n",
	"\n",
	"Add newlines to every new sentence. The result column should be called \"text\".\n",
	"\"\"\").replace(\"\\\\n\", \"\\n\").replace(\";\", \"\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "ac8179f9-bf56-439e-a4fd-3730b8a035d0",
	"metadata": {},
	"outputs": [],
	"source": [
	"print(con.sql(top_observations).fetchone()[0])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "e55121f3-f160-4931-bf1e-ece7adc71243",
	"metadata": {},
	"outputs": [],
	"source": [
	"print(con.sql(f\"\"\"\n",
	"WITH prompt_text AS (\n",
	" {top_observations}\n",
	")\n",
	"\n",
	"SELECT prompt(text) FROM prompt_text\n",
	"\"\"\").fetchone()[0])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "bf2644b6-c0a3-4390-8f16-43bb1e335b76",
	"metadata": {},
	"outputs": [],
	"source": [
	"from tools import get_observations\n",
	"\n",
	"df = get_observations(con, \"American Wigeon\", 2023)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "56f29eb5-5db0-489c-93dd-1c614657849b",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2024-11-08T07:21:40.645490Z",
	"iopub.status.busy": "2024-11-08T07:21:40.645039Z",
	"iopub.status.idle": "2024-11-08T07:21:40.653664Z",
	"shell.execute_reply": "2024-11-08T07:21:40.651681Z",
	"shell.execute_reply.started": "2024-11-08T07:21:40.645458Z"
	}
	},
	"source": [
	"Plot on heat map"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "c66a1f6e-8707-457e-879b-4bea40baf385",
	"metadata": {},
	"outputs": [],
	"source": [
	"import plotly.express as px\n",
	"fig = px.density_map(df, lat='cell_lat', lon='cell_lng', z='count', animation_frame='month', radius=4,\n",
	" center=dict(lat=19.5, lon=-96.5), zoom=1, range_color=[0,1000],\n",
	" map_style=\"open-street-map\", height=800, title=df.name.unique()[0])\n",
	"fig.show()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "5cbc14d9-423e-4b81-8a36-c7145c265b81",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.12.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
	[tool.poetry]
	name = "pydata-demo"
	version = "0.1.0"
	description = ""
	authors = ["Guen Prawiroatmodjo <[email protected]>"]
	readme = "README.md"

	[tool.poetry.dependencies]
	python = "^3.12"
	duckdb = "^1.0.0, <=1.1.2"
	jupyter = "^1.1.1"
	python-dotenv = "^1.0.1"
	jupyterlab-execute-time = "^3.2.0"
	plotly = "^5.24.1"
	plotly-express = "^0.4.1"


	[build-system]
	requires = ["poetry-core"]
	build-backend = "poetry.core.masonry.api"
	from typing import Optional
	from duckdb import DuckDBPyConnection, DuckDBPyRelation


	class ConnectionWrapper():
	"""Connection wrapper for easier querying"""
	def __init__(self, con: DuckDBPyConnection):
	self.con = con
	(self.database, ) = self.con.sql("select current_database()").fetchone()

	def attach(self, database: str, name: str):
	return self.con.sql(f"ATTACH '{database}' AS {name}")

	def describe(self, table_name: str):
	return self.con.sql(f"DESCRIBE {table_name}")

	def summarize(self, table_name: str):
	return self.con.sql(f"SUMMARIZE {table_name}")

	def tables(self):
	return self.con.sql(f"SELECT database, schema, name FROM (SHOW ALL TABLES) WHERE database IN ('memdb', '{self.database}')")

	def table(self, table_name: str) -> DuckDBPyRelation:
	if "." in table_name:
	return self.con.sql(f"FROM {table_name}")
	return self.con.table(table_name)

	def to_table(self, relation: DuckDBPyRelation, table_name: str, replace: bool = False, temp: bool = False):
	or_replace = " OR REPLACE " if replace is True else " "
	temp_tbl = "TEMP "
	self.con.sql(f"CREATE{or_replace}{temp_tbl}TABLE {table_name} AS ({relation.sql_query()})")

	def to_temp_table(self, relation: DuckDBPyRelation, table_name: str):
	self.to_table(relation, table_name, replace=True, temp=True)

	def prompt_query(self, prompt: str):
	return self.con.sql(f"PRAGMA prompt_query('{prompt}')")

	def prompt_sql(self, prompt: str):
	(query,) = self.con.sql(f"CALL prompt_sql('{prompt}')").fetchone()
	return query

	def prompt(self, prompt: str):
	return self.con.sql(f"SELECT prompt('{prompt}')").fetchone()[0]

	def __getattr__(self, name: str):
	if hasattr(self.con, name):
	return getattr(self.con, name)


	def get_observations(con: ConnectionWrapper, common_name: str, year: int):
	from duckdb.experimental.spark.sql import SparkSession
	from duckdb.experimental.spark.sql.functions import col, month
	from duckdb.experimental.spark.sql.functions import year as apply_year
	from duckdb.experimental.spark.sql.dataframe import DataFrame

	spark = SparkSession.builder.appName(common_name).getOrCreate()
	df = DataFrame(con.table("ebd"), spark)

	result = (
	df
	.filter((apply_year(col("OBSERVATION DATE")) == year) & (col("COMMON NAME") == common_name))
	.orderBy(col("OBSERVATION DATE").asc())
	.select(
	col("COMMON NAME").alias("name"),
	col("OBSERVATION DATE").alias("obs_dt"),
	month("OBSERVATION DATE").alias("month"),
	col("OBSERVATION COUNT").alias("count"),
	col("LATITUDE").alias("lat"),
	col("LONGITUDE").alias("lng"),
	col("COUNTRY").alias("country")
	)
	)

	con.to_temp_table(result.relation, table_name="bird_obs")

	# Use H3 to downsample the data for plotting
	con.install_extension("h3", repository="community")
	con.load_extension("h3")

	rel = (
	con
	.table("bird_obs")
	.select("""
	*,
	h3_latlng_to_cell(lat::double, lng::double, 6) AS cell_id,
	h3_cell_to_lat(cell_id) AS cell_lat,
	h3_cell_to_lng(cell_id) AS cell_lng
	""")
	)

	df = DataFrame(rel, spark)

	result = (
	df
	.orderBy(
	col("obs_dt").asc()
	) # Order by the week number
	)

	# Export to Pandas
	df_result = result.toPandas()

	return df_result