Skip to content

Instantly share code, notes, and snippets.

@SqrtMinusOne
Last active February 10, 2020 20:33
Show Gist options
  • Select an option

  • Save SqrtMinusOne/38b06e29e59e3625e6e40277d5c52217 to your computer and use it in GitHub Desktop.

Select an option

Save SqrtMinusOne/38b06e29e59e3625e6e40277d5c52217 to your computer and use it in GitHub Desktop.
Visualization of VK messaging activity
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Visualization of my VK messaging activity\n",
"## Upload messages to MongoDB\n",
"Message format:\n",
"```\n",
"\"%username%\", \"%date%\", \"%message%\"\n",
"\n",
"```\n",
"\n",
"Date format:\n",
"```\n",
"yyyy-mm-dd HH:MM:SS\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from dateutil import parser\n",
"from IPython.display import display\n",
"from pymongo import MongoClient\n",
"from tqdm import tqdm\n",
"\n",
"import glob\n",
"import json\n",
"import numpy as np\n",
"import pandas as pd\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"FILES_DIR = './_files'\n",
"AUTHOR = 'Павел Корытов'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"client = MongoClient('localhost', 27017)\n",
"db = client.vk\n",
"db['messages'].delete_many({})\n",
"pass"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def check(line):\n",
" return not re.match(r'^\\t', line) and not re.search('\"\"', line)\n",
"\n",
"def parse_file(filename):\n",
" content = []\n",
" with open(filename) as file:\n",
" for line in file.readlines():\n",
" if check(line):\n",
" try:\n",
" parts = [part.strip() for part in line.split('\"', 5)]\n",
" try:\n",
" pd.Timestamp(parts[3])\n",
" except ValueError:\n",
" content[-1][2] += line\n",
" continue\n",
" if parts[5][-1] == '\"':\n",
" parts[5] = parts[5][:-1]\n",
" content.append([parts[1], parts[3], parts[5]])\n",
" except IndexError:\n",
" if line[-1] == '\"':\n",
" line = line[:-1]\n",
" content[-1][2] += line\n",
" \n",
" df = pd.DataFrame(content)\n",
" df.columns = 'Sender', 'Date', 'Message'\n",
" members = df['Sender'].unique()\n",
" if len(members) == 2:\n",
" interlocutor = members[np.where(members != AUTHOR)[0][0]]\n",
" else:\n",
" interlocutor = re.search('\\/([^\\/])*\\.txt', filename).group(0)[1:-4]\n",
" df['Recipient'] = df['Sender'].apply(\n",
" lambda sender: AUTHOR if sender == interlocutor else interlocutor\n",
" )\n",
" df['Date'] = pd.to_datetime(df['Date'])\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def input_category(df):\n",
" out = [\n",
" {\n",
" \"start\": df['Date'][0],\n",
" \"type\": input(f\"{df['Date'][0]}:\")\n",
" }\n",
" ]\n",
" while True:\n",
" next_ = input('Next: ')\n",
" if next_:\n",
" out.append({\n",
" \"start\": parser.parse(next_),\n",
" \"type\": input(f\"{parser.parse(next_)}: \")\n",
" })\n",
" else:\n",
" break\n",
" return out"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"if not db['files'].find_one({\"filename\": f\"{FILES_DIR}/DELETED.txt\"}):\n",
" db['files'].insert_one({\n",
" \"filename\": f\"{FILES_DIR}/DELETED.txt\",\n",
" \"category\": [\n",
" {\n",
" \"start\": parser.parse('1970-01-01'),\n",
" \"type\": \"Misc\"\n",
" }\n",
" ],\n",
" \"users\": ['DELETED', AUTHOR]\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/77 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"./_files/Александр Шевченко.txt\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"2011-10-05 14:33:49: School\n",
"Next: \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 77/77 [00:10<00:00, 7.33it/s]\n"
]
}
],
"source": [
"for filename in tqdm(glob.glob(f\"{FILES_DIR}/*.txt\")):\n",
" df = parse_file(filename)\n",
" file_ref = db['files'].find_one({ \"filename\": filename })\n",
" if not file_ref:\n",
" print(filename)\n",
" data = {\n",
" \"filename\": filename,\n",
" \"category\": input_category(df),\n",
" \"users\": list(df[\"Sender\"].unique())\n",
" }\n",
" db['files'].insert_one(data)\n",
" file_ref = db['files'].find_one({ \"filename\": filename })\n",
" df['file'] = file_ref['_id']\n",
" db['messages'].insert_many(df.to_dict(orient='records'))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Date_1'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"db['messages'].create_index([(\"Date\", 1)])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment