Last active
February 10, 2020 20:33
-
-
Save SqrtMinusOne/38b06e29e59e3625e6e40277d5c52217 to your computer and use it in GitHub Desktop.
Visualization of VK messaging activity
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# My VK messaging activity visualization\n", | |
| "## Upload stuff to mongo\n", | |
| "Message format:\n", | |
| "```\n", | |
| "\"%username%\", \"%date%\", \"%message%\"\n", | |
| "\n", | |
| "```\n", | |
| "\n", | |
| "Date format:\n", | |
| "```\n", | |
| "yyyy-mm-dd HH:MM:ss \n", | |
| "```" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import glob\n", | |
| "import json\n", | |
| "import os\n", | |
| "import re\n", | |
| "\n", | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "from dateutil import parser\n", | |
| "from IPython.display import display\n", | |
| "from pymongo import MongoClient\n", | |
| "from tqdm import tqdm" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Directory that is globbed for the exported conversation files (*.txt)\n", | |
| "FILES_DIR = './_files'\n", | |
| "# Sender name of the account owner; compared against the Sender column when parsing\n", | |
| "AUTHOR = 'Павел Корытов'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Connect to the local MongoDB instance and select the `vk` database\n", | |
| "client = MongoClient(host='localhost', port=27017)\n", | |
| "db = client['vk']\n", | |
| "\n", | |
| "# Empty the messages collection before the import cell fills it again;\n", | |
| "# the trailing semicolon keeps the cell from displaying the result object\n", | |
| "db.messages.delete_many({});" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def check(line):\n", | |
| "    \"\"\"Tell whether a raw line of an export file should be parsed.\n", | |
| "\n", | |
| "    A line is rejected when it starts with a tab character or contains two\n", | |
| "    consecutive double quotes anywhere; every other line is accepted.\n", | |
| "    \"\"\"\n", | |
| "    starts_with_tab = line.startswith('\\t')\n", | |
| "    has_doubled_quote = '\"\"' in line\n", | |
| "    return not (starts_with_tab or has_doubled_quote)\n", | |
| "\n", | |
| "def _append_continuation(content, line):\n", | |
| "    \"\"\"Attach a continuation line to the text of the last parsed message.\"\"\"\n", | |
| "    # A stray line before the first message has nothing to attach to\n", | |
| "    if not content:\n", | |
| "        return\n", | |
| "    text = line.rstrip('\\r\\n')\n", | |
| "    # The closing quote of a multi-line message sits just before the newline\n", | |
| "    if text.endswith('\"'):\n", | |
| "        text = text[:-1]\n", | |
| "    # The first line of a message is stored stripped, so restore the line break\n", | |
| "    content[-1][2] += '\\n' + text\n", | |
| "\n", | |
| "\n", | |
| "def parse_file(filename):\n", | |
| "    \"\"\"Parse one exported conversation file into a DataFrame of messages.\n", | |
| "\n", | |
| "    A line that passes ``check``, splits into six quote-separated parts and\n", | |
| "    carries a parsable timestamp starts a new message; any other line that\n", | |
| "    passes ``check`` is appended to the previous message.\n", | |
| "\n", | |
| "    Parameters\n", | |
| "    ----------\n", | |
| "    filename : str\n", | |
| "        Path of the export file to read.\n", | |
| "\n", | |
| "    Returns\n", | |
| "    -------\n", | |
| "    pandas.DataFrame\n", | |
| "        Columns ``Sender``, ``Date`` (converted with ``pd.to_datetime``),\n", | |
| "        ``Message`` and ``Recipient``. The frame is empty, with the same\n", | |
| "        columns, when the file contains no message.\n", | |
| "    \"\"\"\n", | |
| "    content = []\n", | |
| "    # Do not depend on the platform default encoding for the Cyrillic text.\n", | |
| "    # NOTE(review): assumes the exports are UTF-8 - confirm against the export tool\n", | |
| "    with open(filename, encoding='utf-8') as file:\n", | |
| "        for line in file:\n", | |
| "            if not check(line):\n", | |
| "                continue\n", | |
| "            parts = [part.strip() for part in line.split('\"', 5)]\n", | |
| "            # Fewer than six parts: not a full sender/date/message header line\n", | |
| "            if len(parts) < 6:\n", | |
| "                _append_continuation(content, line)\n", | |
| "                continue\n", | |
| "            try:\n", | |
| "                pd.Timestamp(parts[3])\n", | |
| "            except ValueError:\n", | |
| "                # Quoted text inside a message body, not a real header line\n", | |
| "                _append_continuation(content, line)\n", | |
| "                continue\n", | |
| "            message = parts[5]\n", | |
| "            if message.endswith('\"'):\n", | |
| "                message = message[:-1]\n", | |
| "            content.append([parts[1], parts[3], message])\n", | |
| "\n", | |
| "    # Passing the columns up front keeps an empty file from raising\n", | |
| "    df = pd.DataFrame(content, columns=['Sender', 'Date', 'Message'])\n", | |
| "    members = df['Sender'].unique()\n", | |
| "    if len(members) == 2:\n", | |
| "        # Two-person chat: the interlocutor is the first sender that is not AUTHOR\n", | |
| "        interlocutor = members[np.where(members != AUTHOR)[0][0]]\n", | |
| "    else:\n", | |
| "        # Otherwise fall back to the file name without directory and extension\n", | |
| "        interlocutor = os.path.splitext(os.path.basename(filename))[0]\n", | |
| "    df['Recipient'] = df['Sender'].apply(\n", | |
| "        lambda sender: AUTHOR if sender == interlocutor else interlocutor\n", | |
| "    )\n", | |
| "    df['Date'] = pd.to_datetime(df['Date'])\n", | |
| "    return df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def input_category(df):\n", | |
| "    \"\"\"Interactively collect the category periods of one conversation.\n", | |
| "\n", | |
| "    The first period starts at the date of the first row of ``df``. Further\n", | |
| "    start dates are then requested until an empty line is entered.\n", | |
| "\n", | |
| "    Parameters\n", | |
| "    ----------\n", | |
| "    df : pandas.DataFrame\n", | |
| "        Frame with a ``Date`` column and at least one row.\n", | |
| "\n", | |
| "    Returns\n", | |
| "    -------\n", | |
| "    list of dict\n", | |
| "        One ``{\"start\": ..., \"type\": ...}`` entry per period, in input order.\n", | |
| "    \"\"\"\n", | |
| "    # Positional access: do not rely on the index containing the label 0\n", | |
| "    first_date = df['Date'].iloc[0]\n", | |
| "    out = [\n", | |
| "        {\n", | |
| "            \"start\": first_date,\n", | |
| "            \"type\": input(f\"{first_date}:\")\n", | |
| "        }\n", | |
| "    ]\n", | |
| "    while True:\n", | |
| "        next_ = input('Next: ')\n", | |
| "        if not next_:\n", | |
| "            break\n", | |
| "        try:\n", | |
| "            start = parser.parse(next_)\n", | |
| "        except (ValueError, OverflowError) as error:\n", | |
| "            # A mistyped date must not discard the periods entered so far\n", | |
| "            print(f\"Cannot parse {next_!r} as a date ({error}), try again\")\n", | |
| "            continue\n", | |
| "        out.append({\n", | |
| "            \"start\": start,\n", | |
| "            \"type\": input(f\"{start}: \")\n", | |
| "        })\n", | |
| "    return out" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Register the DELETED.txt file entry (category \"Misc\" from 1970-01-01)\n", | |
| "# unless a record with that filename is already stored\n", | |
| "deleted_filename = f\"{FILES_DIR}/DELETED.txt\"\n", | |
| "if db['files'].find_one({\"filename\": deleted_filename}) is None:\n", | |
| "    deleted_record = {\n", | |
| "        \"filename\": deleted_filename,\n", | |
| "        \"category\": [{\"start\": parser.parse('1970-01-01'), \"type\": \"Misc\"}],\n", | |
| "        \"users\": ['DELETED', AUTHOR],\n", | |
| "    }\n", | |
| "    db['files'].insert_one(deleted_record)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| " 0%| | 0/77 [00:00<?, ?it/s]" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "./_files/Александр Шевченко.txt\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdin", | |
| "output_type": "stream", | |
| "text": [ | |
| "2011-10-05 14:33:49: School\n", | |
| "Next: \n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "100%|██████████| 77/77 [00:10<00:00, 7.33it/s]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# glob returns paths in arbitrary order; sorting keeps the prompt order reproducible\n", | |
| "for filename in tqdm(sorted(glob.glob(f\"{FILES_DIR}/*.txt\"))):\n", | |
| "    df = parse_file(filename)\n", | |
| "    # Nothing to store: insert_many rejects an empty document list and\n", | |
| "    # input_category needs a first message to start from\n", | |
| "    if df.empty:\n", | |
| "        continue\n", | |
| "    file_ref = db['files'].find_one({ \"filename\": filename })\n", | |
| "    if file_ref:\n", | |
| "        file_id = file_ref['_id']\n", | |
| "    else:\n", | |
| "        # Unknown file: ask for its categories and register it\n", | |
| "        print(filename)\n", | |
| "        data = {\n", | |
| "            \"filename\": filename,\n", | |
| "            \"category\": input_category(df),\n", | |
| "            \"users\": list(df[\"Sender\"].unique())\n", | |
| "        }\n", | |
| "        # insert_one reports the generated _id, so no second query is needed\n", | |
| "        file_id = db['files'].insert_one(data).inserted_id\n", | |
| "    # Link every message to the record of the file it came from\n", | |
| "    df['file'] = file_id\n", | |
| "    db['messages'].insert_many(df.to_dict(orient='records'))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'Date_1'" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Index the messages on Date (direction 1 = ascending); the call returns the index name\n", | |
| "db['messages'].create_index([(\"Date\", 1)])" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment