vschmidt94 · March 3, 2021 16:54
diff --git a/partition_on_sentinel_value.ipynb b/partition_on_sentinel_value.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Partition Pandas dataframe based on sentinel value\n",
    "\n",
    "First create some sample data - simulate what is retrieved from database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>timestamp</th>\n",
       "      <th>motor_rpm</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>2021-03-03 15:52:28.312731</td>\n",
       "      <td>370</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2021-03-03 15:53:28.312731</td>\n",
       "      <td>290</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>2021-03-03 15:54:28.312731</td>\n",
       "      <td>130</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>2021-03-03 15:55:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>2021-03-03 15:56:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>995</td>\n",
       "      <td>2021-03-04 08:27:28.312731</td>\n",
       "      <td>610</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>996</td>\n",
       "      <td>2021-03-04 08:28:28.312731</td>\n",
       "      <td>260</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>997</td>\n",
       "      <td>2021-03-04 08:29:28.312731</td>\n",
       "      <td>210</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>998</td>\n",
       "      <td>2021-03-04 08:30:28.312731</td>\n",
       "      <td>1260</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>999</td>\n",
       "      <td>2021-03-04 08:31:28.312731</td>\n",
       "      <td>1040</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     timestamp  motor_rpm\n",
       "0   2021-03-03 15:52:28.312731        370\n",
       "1   2021-03-03 15:53:28.312731        290\n",
       "2   2021-03-03 15:54:28.312731        130\n",
       "3   2021-03-03 15:55:28.312731          0\n",
       "4   2021-03-03 15:56:28.312731          0\n",
       "..                         ...        ...\n",
       "995 2021-03-04 08:27:28.312731        610\n",
       "996 2021-03-04 08:28:28.312731        260\n",
       "997 2021-03-04 08:29:28.312731        210\n",
       "998 2021-03-04 08:30:28.312731       1260\n",
       "999 2021-03-04 08:31:28.312731       1040\n",
       "\n",
       "[1000 rows x 2 columns]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import random, datetime\n",
    "import pandas\n",
    "\n",
    "d = {'timestamp': [], 'motor_rpm': []}\n",
    "now = datetime.datetime.utcnow()\n",
    "for i in range(1000):\n",
    "    rpm = random.randrange(100,1500, 10)\n",
    "    d['timestamp'].append(now)\n",
    "    d['motor_rpm'].append(rpm)\n",
    "    now += datetime.timedelta(minutes=1)\n",
    "    \n",
    "# Insert 5 \"events\" where motor rpm goes to zero\n",
    "# An \"event\" is where we see 1 or more contiguous readings of zero rpm.\n",
    "# i.e., if the motor stops - it could be down for a minute or several minutes\n",
    "zero_rpm_indices = [3,4,\n",
    "                   299,\n",
    "                   466, 467, 468, 469,\n",
    "                   700, 701,\n",
    "                   950, 951, 952, 953, 954]\n",
    "for idx in zero_rpm_indices:\n",
    "    d['motor_rpm'][idx] = 0\n",
    "\n",
    "df = pandas.DataFrame.from_dict(d)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Partitioning\n",
    "\n",
    "Now that we have a reasonable facsimile of the sensor data, the problem is identifying partition boundaries.\n",
    "\n",
    "The 0 values in `motor_rpm` are our sentinels - but note we may have a continguous series of the sentinel value/\n",
    "\n",
    "For my purposes, I define a partition as:\n",
    "  1. Starting at the first non-zero RPM value following a 0 rpm record. In the case the first record is non-zero RPM, that will also begin a partition\n",
    "  2. Ending at the last occurrence of a zero RPM value before a non-zero RPM value. In all cases, the last record ends the last partition."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>timestamp</th>\n",
       "      <th>motor_rpm</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>2021-03-03 15:55:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>2021-03-03 15:56:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>299</td>\n",
       "      <td>2021-03-03 20:51:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>466</td>\n",
       "      <td>2021-03-03 23:38:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>467</td>\n",
       "      <td>2021-03-03 23:39:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>468</td>\n",
       "      <td>2021-03-03 23:40:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>469</td>\n",
       "      <td>2021-03-03 23:41:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>700</td>\n",
       "      <td>2021-03-04 03:32:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>701</td>\n",
       "      <td>2021-03-04 03:33:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>950</td>\n",
       "      <td>2021-03-04 07:42:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>951</td>\n",
       "      <td>2021-03-04 07:43:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>952</td>\n",
       "      <td>2021-03-04 07:44:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>953</td>\n",
       "      <td>2021-03-04 07:45:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>954</td>\n",
       "      <td>2021-03-04 07:46:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     timestamp  motor_rpm\n",
       "3   2021-03-03 15:55:28.312731          0\n",
       "4   2021-03-03 15:56:28.312731          0\n",
       "299 2021-03-03 20:51:28.312731          0\n",
       "466 2021-03-03 23:38:28.312731          0\n",
       "467 2021-03-03 23:39:28.312731          0\n",
       "468 2021-03-03 23:40:28.312731          0\n",
       "469 2021-03-03 23:41:28.312731          0\n",
       "700 2021-03-04 03:32:28.312731          0\n",
       "701 2021-03-04 03:33:28.312731          0\n",
       "950 2021-03-04 07:42:28.312731          0\n",
       "951 2021-03-04 07:43:28.312731          0\n",
       "952 2021-03-04 07:44:28.312731          0\n",
       "953 2021-03-04 07:45:28.312731          0\n",
       "954 2021-03-04 07:46:28.312731          0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Step 1: Find the zero-rpm events\n",
    "zero_rpm = df.loc[df['motor_rpm'] == 0]\n",
    "zero_rpm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3      1\n",
       "4      1\n",
       "299    2\n",
       "466    3\n",
       "467    3\n",
       "468    3\n",
       "469    3\n",
       "700    4\n",
       "701    4\n",
       "950    5\n",
       "951    5\n",
       "952    5\n",
       "953    5\n",
       "954    5\n",
       "dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Step 2: Create a map of indexes to zero-rpm records\n",
    "# i.e., we'll see what partition each zero-rpm event 'belongs' to.\n",
    "m = zero_rpm.index.to_series().diff().ne(1).cumsum()\n",
    "m"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, 4), (5, 299), (300, 469), (470, 701), (702, 954), (955, 999)]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Step 3: Walk the map elements and use that to definepart partition start, stop tuples\n",
    "def partition(df):\n",
    "    partitions = []\n",
    "    current_event = 1\n",
    "    partition_start_idx = 0\n",
    "    partition_end_idx = 0\n",
    "    for idx, event in m.items():\n",
    "        if event != current_event:\n",
    "            partitions.append((partition_start_idx, partition_end_idx))\n",
    "            partition_start_idx = partition_end_idx + 1\n",
    "            current_event = event\n",
    "        partition_end_idx = idx\n",
    "    \n",
    "    # The last event falls outside the loop\n",
    "    partitions.append((partition_start_idx, partition_end_idx))\n",
    "\n",
    "    # Finally, capture all remaining records (if any) after the last event.\n",
    "    if partition_end_idx < len(df.index) - 1:\n",
    "        partitions.append((partition_end_idx + 1, len(df.index) - 1))\n",
    "    \n",
    "    return partitions\n",
    "\n",
    "partition(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check the edge cases\n",
    "\n",
    "That appears to work, but let's check edge cases where our data has zero-rpm event at beginning and/or end of the dataframe as well."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>timestamp</th>\n",
       "      <th>motor_rpm</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>2021-03-03 15:52:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2021-03-03 15:53:28.312731</td>\n",
       "      <td>290</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>2021-03-03 15:54:28.312731</td>\n",
       "      <td>130</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>2021-03-03 15:55:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>2021-03-03 15:56:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>995</td>\n",
       "      <td>2021-03-04 08:27:28.312731</td>\n",
       "      <td>610</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>996</td>\n",
       "      <td>2021-03-04 08:28:28.312731</td>\n",
       "      <td>260</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>997</td>\n",
       "      <td>2021-03-04 08:29:28.312731</td>\n",
       "      <td>210</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>998</td>\n",
       "      <td>2021-03-04 08:30:28.312731</td>\n",
       "      <td>1260</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>999</td>\n",
       "      <td>2021-03-04 08:31:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     timestamp  motor_rpm\n",
       "0   2021-03-03 15:52:28.312731          0\n",
       "1   2021-03-03 15:53:28.312731        290\n",
       "2   2021-03-03 15:54:28.312731        130\n",
       "3   2021-03-03 15:55:28.312731          0\n",
       "4   2021-03-03 15:56:28.312731          0\n",
       "..                         ...        ...\n",
       "995 2021-03-04 08:27:28.312731        610\n",
       "996 2021-03-04 08:28:28.312731        260\n",
       "997 2021-03-04 08:29:28.312731        210\n",
       "998 2021-03-04 08:30:28.312731       1260\n",
       "999 2021-03-04 08:31:28.312731          0\n",
       "\n",
       "[1000 rows x 2 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.at[0, 'motor_rpm'] = 0\n",
    "df.at[len(df.index) - 1, 'motor_rpm'] = 0\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, 4), (5, 299), (300, 469), (470, 701), (702, 954), (955, 999)]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "partitions = partition(df)\n",
    "partitions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Next steps\n",
    "\n",
    "Now, I can analyze specific partitions as needed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>timestamp</th>\n",
       "      <th>motor_rpm</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>2021-03-03 15:52:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2021-03-03 15:53:28.312731</td>\n",
       "      <td>290</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>2021-03-03 15:54:28.312731</td>\n",
       "      <td>130</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>2021-03-03 15:55:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>2021-03-03 15:56:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>995</td>\n",
       "      <td>2021-03-04 08:27:28.312731</td>\n",
       "      <td>610</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>996</td>\n",
       "      <td>2021-03-04 08:28:28.312731</td>\n",
       "      <td>260</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>997</td>\n",
       "      <td>2021-03-04 08:29:28.312731</td>\n",
       "      <td>210</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>998</td>\n",
       "      <td>2021-03-04 08:30:28.312731</td>\n",
       "      <td>1260</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>999</td>\n",
       "      <td>2021-03-04 08:31:28.312731</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     timestamp  motor_rpm\n",
       "0   2021-03-03 15:52:28.312731          0\n",
       "1   2021-03-03 15:53:28.312731        290\n",
       "2   2021-03-03 15:54:28.312731        130\n",
       "3   2021-03-03 15:55:28.312731          0\n",
       "4   2021-03-03 15:56:28.312731          0\n",
       "..                         ...        ...\n",
       "995 2021-03-04 08:27:28.312731        610\n",
       "996 2021-03-04 08:28:28.312731        260\n",
       "997 2021-03-04 08:29:28.312731        210\n",
       "998 2021-03-04 08:30:28.312731       1260\n",
       "999 2021-03-04 08:31:28.312731          0\n",
       "\n",
       "[1000 rows x 2 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Parition 0: df[0:5]\n",
      "        motor_rpm\n",
      "count    5.000000\n",
      "mean    84.000000\n",
      "std    128.179562\n",
      "min      0.000000\n",
      "25%      0.000000\n",
      "50%      0.000000\n",
      "75%    130.000000\n",
      "max    290.000000\n",
      "\n",
      "Parition 1: df[5:300]\n",
      "         motor_rpm\n",
      "count   295.000000\n",
      "mean    815.457627\n",
      "std     387.093272\n",
      "min       0.000000\n",
      "25%     500.000000\n",
      "50%     810.000000\n",
      "75%    1170.000000\n",
      "max    1490.000000\n",
      "\n",
      "Parition 2: df[300:470]\n",
      "         motor_rpm\n",
      "count   170.000000\n",
      "mean    747.058824\n",
      "std     418.921759\n",
      "min       0.000000\n",
      "25%     410.000000\n",
      "50%     710.000000\n",
      "75%    1070.000000\n",
      "max    1490.000000\n",
      "\n",
      "Parition 3: df[470:702]\n",
      "         motor_rpm\n",
      "count   232.000000\n",
      "mean    778.879310\n",
      "std     420.860326\n",
      "min       0.000000\n",
      "25%     430.000000\n",
      "50%     770.000000\n",
      "75%    1150.000000\n",
      "max    1490.000000\n",
      "\n",
      "Parition 4: df[702:955]\n",
      "         motor_rpm\n",
      "count   253.000000\n",
      "mean    789.960474\n",
      "std     427.263810\n",
      "min       0.000000\n",
      "25%     410.000000\n",
      "50%     850.000000\n",
      "75%    1170.000000\n",
      "max    1480.000000\n",
      "\n",
      "Parition 5: df[955:1000]\n",
      "         motor_rpm\n",
      "count    45.000000\n",
      "mean    732.444444\n",
      "std     439.633916\n",
      "min       0.000000\n",
      "25%     340.000000\n",
      "50%     630.000000\n",
      "75%    1040.000000\n",
      "max    1490.000000\n"
     ]
    }
   ],
   "source": [
    "for i, (start, stop) in enumerate(partitions):\n",
    "    print(f\"\\nParition {i}: df[{start}:{stop + 1}]\")\n",
    "    part_df = df[start:stop + 1]\n",
    "    print(part_df.describe())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Partition Pandas dataframe based on sentinel value\n",
	"\n",
	"First create some sample data - simulate what is retrieved from database"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>timestamp</th>\n",
	" <th>motor_rpm</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td>0</td>\n",
	" <td>2021-03-03 15:52:28.312731</td>\n",
	" <td>370</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>1</td>\n",
	" <td>2021-03-03 15:53:28.312731</td>\n",
	" <td>290</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>2</td>\n",
	" <td>2021-03-03 15:54:28.312731</td>\n",
	" <td>130</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>3</td>\n",
	" <td>2021-03-03 15:55:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>4</td>\n",
	" <td>2021-03-03 15:56:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>995</td>\n",
	" <td>2021-03-04 08:27:28.312731</td>\n",
	" <td>610</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>996</td>\n",
	" <td>2021-03-04 08:28:28.312731</td>\n",
	" <td>260</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>997</td>\n",
	" <td>2021-03-04 08:29:28.312731</td>\n",
	" <td>210</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>998</td>\n",
	" <td>2021-03-04 08:30:28.312731</td>\n",
	" <td>1260</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>999</td>\n",
	" <td>2021-03-04 08:31:28.312731</td>\n",
	" <td>1040</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>1000 rows × 2 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" timestamp motor_rpm\n",
	"0 2021-03-03 15:52:28.312731 370\n",
	"1 2021-03-03 15:53:28.312731 290\n",
	"2 2021-03-03 15:54:28.312731 130\n",
	"3 2021-03-03 15:55:28.312731 0\n",
	"4 2021-03-03 15:56:28.312731 0\n",
	".. ... ...\n",
	"995 2021-03-04 08:27:28.312731 610\n",
	"996 2021-03-04 08:28:28.312731 260\n",
	"997 2021-03-04 08:29:28.312731 210\n",
	"998 2021-03-04 08:30:28.312731 1260\n",
	"999 2021-03-04 08:31:28.312731 1040\n",
	"\n",
	"[1000 rows x 2 columns]"
	]
	},
	"execution_count": 1,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import random, datetime\n",
	"import pandas\n",
	"\n",
	"d = {'timestamp': [], 'motor_rpm': []}\n",
	"now = datetime.datetime.utcnow()\n",
	"for i in range(1000):\n",
	" rpm = random.randrange(100,1500, 10)\n",
	" d['timestamp'].append(now)\n",
	" d['motor_rpm'].append(rpm)\n",
	" now += datetime.timedelta(minutes=1)\n",
	" \n",
	"# Insert 5 \"events\" where motor rpm goes to zero\n",
	"# An \"event\" is where we see 1 or more contiguous readings of zero rpm.\n",
	"# i.e., if the motor stops - it could be down for a minute or several minutes\n",
	"zero_rpm_indices = [3,4,\n",
	" 299,\n",
	" 466, 467, 468, 469,\n",
	" 700, 701,\n",
	" 950, 951, 952, 953, 954]\n",
	"for idx in zero_rpm_indices:\n",
	" d['motor_rpm'][idx] = 0\n",
	"\n",
	"df = pandas.DataFrame.from_dict(d)\n",
	"df"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Partitioning\n",
	"\n",
	"Now that we have a reasonable facsimile of the sensor data, the problem is identifying partition boundaries.\n",
	"\n",
	"The 0 values in `motor_rpm` are our sentinels - but note we may have a continguous series of the sentinel value/\n",
	"\n",
	"For my purposes, I define a partition as:\n",
	" 1. Starting at the first non-zero RPM value following a 0 rpm record. In the case the first record is non-zero RPM, that will also begin a partition\n",
	" 2. Ending at the last occurrence of a zero RPM value before a non-zero RPM value. In all cases, the last record ends the last partition."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>timestamp</th>\n",
	" <th>motor_rpm</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td>3</td>\n",
	" <td>2021-03-03 15:55:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>4</td>\n",
	" <td>2021-03-03 15:56:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>299</td>\n",
	" <td>2021-03-03 20:51:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>466</td>\n",
	" <td>2021-03-03 23:38:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>467</td>\n",
	" <td>2021-03-03 23:39:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>468</td>\n",
	" <td>2021-03-03 23:40:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>469</td>\n",
	" <td>2021-03-03 23:41:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>700</td>\n",
	" <td>2021-03-04 03:32:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>701</td>\n",
	" <td>2021-03-04 03:33:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>950</td>\n",
	" <td>2021-03-04 07:42:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>951</td>\n",
	" <td>2021-03-04 07:43:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>952</td>\n",
	" <td>2021-03-04 07:44:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>953</td>\n",
	" <td>2021-03-04 07:45:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>954</td>\n",
	" <td>2021-03-04 07:46:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" timestamp motor_rpm\n",
	"3 2021-03-03 15:55:28.312731 0\n",
	"4 2021-03-03 15:56:28.312731 0\n",
	"299 2021-03-03 20:51:28.312731 0\n",
	"466 2021-03-03 23:38:28.312731 0\n",
	"467 2021-03-03 23:39:28.312731 0\n",
	"468 2021-03-03 23:40:28.312731 0\n",
	"469 2021-03-03 23:41:28.312731 0\n",
	"700 2021-03-04 03:32:28.312731 0\n",
	"701 2021-03-04 03:33:28.312731 0\n",
	"950 2021-03-04 07:42:28.312731 0\n",
	"951 2021-03-04 07:43:28.312731 0\n",
	"952 2021-03-04 07:44:28.312731 0\n",
	"953 2021-03-04 07:45:28.312731 0\n",
	"954 2021-03-04 07:46:28.312731 0"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Step 1: Find the zero-rpm events\n",
	"zero_rpm = df.loc[df['motor_rpm'] == 0]\n",
	"zero_rpm"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"3 1\n",
	"4 1\n",
	"299 2\n",
	"466 3\n",
	"467 3\n",
	"468 3\n",
	"469 3\n",
	"700 4\n",
	"701 4\n",
	"950 5\n",
	"951 5\n",
	"952 5\n",
	"953 5\n",
	"954 5\n",
	"dtype: int64"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Step 2: Create a map of indexes to zero-rpm records\n",
	"# i.e., we'll see what partition each zero-rpm event 'belongs' to.\n",
	"m = zero_rpm.index.to_series().diff().ne(1).cumsum()\n",
	"m"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(0, 4), (5, 299), (300, 469), (470, 701), (702, 954), (955, 999)]"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Step 3: Walk the map elements and use that to definepart partition start, stop tuples\n",
	"def partition(df):\n",
	" partitions = []\n",
	" current_event = 1\n",
	" partition_start_idx = 0\n",
	" partition_end_idx = 0\n",
	" for idx, event in m.items():\n",
	" if event != current_event:\n",
	" partitions.append((partition_start_idx, partition_end_idx))\n",
	" partition_start_idx = partition_end_idx + 1\n",
	" current_event = event\n",
	" partition_end_idx = idx\n",
	" \n",
	" # The last event falls outside the loop\n",
	" partitions.append((partition_start_idx, partition_end_idx))\n",
	"\n",
	" # Finally, capture all remaining records (if any) after the last event.\n",
	" if partition_end_idx < len(df.index) - 1:\n",
	" partitions.append((partition_end_idx + 1, len(df.index) - 1))\n",
	" \n",
	" return partitions\n",
	"\n",
	"partition(df)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Check the edge cases\n",
	"\n",
	"That appears to work, but let's check edge cases where our data has zero-rpm event at beginning and/or end of the dataframe as well."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>timestamp</th>\n",
	" <th>motor_rpm</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td>0</td>\n",
	" <td>2021-03-03 15:52:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>1</td>\n",
	" <td>2021-03-03 15:53:28.312731</td>\n",
	" <td>290</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>2</td>\n",
	" <td>2021-03-03 15:54:28.312731</td>\n",
	" <td>130</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>3</td>\n",
	" <td>2021-03-03 15:55:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>4</td>\n",
	" <td>2021-03-03 15:56:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>995</td>\n",
	" <td>2021-03-04 08:27:28.312731</td>\n",
	" <td>610</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>996</td>\n",
	" <td>2021-03-04 08:28:28.312731</td>\n",
	" <td>260</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>997</td>\n",
	" <td>2021-03-04 08:29:28.312731</td>\n",
	" <td>210</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>998</td>\n",
	" <td>2021-03-04 08:30:28.312731</td>\n",
	" <td>1260</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>999</td>\n",
	" <td>2021-03-04 08:31:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>1000 rows × 2 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" timestamp motor_rpm\n",
	"0 2021-03-03 15:52:28.312731 0\n",
	"1 2021-03-03 15:53:28.312731 290\n",
	"2 2021-03-03 15:54:28.312731 130\n",
	"3 2021-03-03 15:55:28.312731 0\n",
	"4 2021-03-03 15:56:28.312731 0\n",
	".. ... ...\n",
	"995 2021-03-04 08:27:28.312731 610\n",
	"996 2021-03-04 08:28:28.312731 260\n",
	"997 2021-03-04 08:29:28.312731 210\n",
	"998 2021-03-04 08:30:28.312731 1260\n",
	"999 2021-03-04 08:31:28.312731 0\n",
	"\n",
	"[1000 rows x 2 columns]"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df.at[0, 'motor_rpm'] = 0\n",
	"df.at[len(df.index) - 1, 'motor_rpm'] = 0\n",
	"df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(0, 4), (5, 299), (300, 469), (470, 701), (702, 954), (955, 999)]"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"partitions = partition(df)\n",
	"partitions"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Next steps\n",
	"\n",
	"Now, I can analyze specific partitions as needed"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>timestamp</th>\n",
	" <th>motor_rpm</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <td>0</td>\n",
	" <td>2021-03-03 15:52:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>1</td>\n",
	" <td>2021-03-03 15:53:28.312731</td>\n",
	" <td>290</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>2</td>\n",
	" <td>2021-03-03 15:54:28.312731</td>\n",
	" <td>130</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>3</td>\n",
	" <td>2021-03-03 15:55:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>4</td>\n",
	" <td>2021-03-03 15:56:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" <td>...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>995</td>\n",
	" <td>2021-03-04 08:27:28.312731</td>\n",
	" <td>610</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>996</td>\n",
	" <td>2021-03-04 08:28:28.312731</td>\n",
	" <td>260</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>997</td>\n",
	" <td>2021-03-04 08:29:28.312731</td>\n",
	" <td>210</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>998</td>\n",
	" <td>2021-03-04 08:30:28.312731</td>\n",
	" <td>1260</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <td>999</td>\n",
	" <td>2021-03-04 08:31:28.312731</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>1000 rows × 2 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" timestamp motor_rpm\n",
	"0 2021-03-03 15:52:28.312731 0\n",
	"1 2021-03-03 15:53:28.312731 290\n",
	"2 2021-03-03 15:54:28.312731 130\n",
	"3 2021-03-03 15:55:28.312731 0\n",
	"4 2021-03-03 15:56:28.312731 0\n",
	".. ... ...\n",
	"995 2021-03-04 08:27:28.312731 610\n",
	"996 2021-03-04 08:28:28.312731 260\n",
	"997 2021-03-04 08:29:28.312731 210\n",
	"998 2021-03-04 08:30:28.312731 1260\n",
	"999 2021-03-04 08:31:28.312731 0\n",
	"\n",
	"[1000 rows x 2 columns]"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"\n",
	"Parition 0: df[0:5]\n",
	" motor_rpm\n",
	"count 5.000000\n",
	"mean 84.000000\n",
	"std 128.179562\n",
	"min 0.000000\n",
	"25% 0.000000\n",
	"50% 0.000000\n",
	"75% 130.000000\n",
	"max 290.000000\n",
	"\n",
	"Parition 1: df[5:300]\n",
	" motor_rpm\n",
	"count 295.000000\n",
	"mean 815.457627\n",
	"std 387.093272\n",
	"min 0.000000\n",
	"25% 500.000000\n",
	"50% 810.000000\n",
	"75% 1170.000000\n",
	"max 1490.000000\n",
	"\n",
	"Parition 2: df[300:470]\n",
	" motor_rpm\n",
	"count 170.000000\n",
	"mean 747.058824\n",
	"std 418.921759\n",
	"min 0.000000\n",
	"25% 410.000000\n",
	"50% 710.000000\n",
	"75% 1070.000000\n",
	"max 1490.000000\n",
	"\n",
	"Parition 3: df[470:702]\n",
	" motor_rpm\n",
	"count 232.000000\n",
	"mean 778.879310\n",
	"std 420.860326\n",
	"min 0.000000\n",
	"25% 430.000000\n",
	"50% 770.000000\n",
	"75% 1150.000000\n",
	"max 1490.000000\n",
	"\n",
	"Parition 4: df[702:955]\n",
	" motor_rpm\n",
	"count 253.000000\n",
	"mean 789.960474\n",
	"std 427.263810\n",
	"min 0.000000\n",
	"25% 410.000000\n",
	"50% 850.000000\n",
	"75% 1170.000000\n",
	"max 1480.000000\n",
	"\n",
	"Parition 5: df[955:1000]\n",
	" motor_rpm\n",
	"count 45.000000\n",
	"mean 732.444444\n",
	"std 439.633916\n",
	"min 0.000000\n",
	"25% 340.000000\n",
	"50% 630.000000\n",
	"75% 1040.000000\n",
	"max 1490.000000\n"
	]
	}
	],
	"source": [
	"for i, (start, stop) in enumerate(partitions):\n",
	" print(f\"\\nParition {i}: df[{start}:{stop + 1}]\")\n",
	" part_df = df[start:stop + 1]\n",
	" print(part_df.describe())"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.4"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
No results found