Created
June 23, 2016 21:18
-
-
Save bhtucker/927fc24e3f48c0224b90b2a72fb30b72 to your computer and use it in GitHub Desktop.
PySpark Notebook for Spark Implementation of https://gist.github.com/bhtucker/5dccc1fa96d8030a752cb9f76cbaf558
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<pyspark.context.SparkContext at 0x10dc61690>" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sc" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "ok\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(\"ok\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "keyed_rdd = sc.parallelize([\n", | |
| " ('a', 2),\n", | |
| " ('a', 2),\n", | |
| " ('a', 2),\n", | |
| " ('b', 2), \n", | |
| " ('b', 2)\n", | |
| " ])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('a', [2, 2, 2]), ('b', [2, 2])]" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "keyed_rdd.reduceByKey(lambda a, b: a + b).collect()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "keyed_rdd = sc.parallelize([\n", | |
| " ('a', [2]),\n", | |
| " ('a', [2]),\n", | |
| " ('a', [2]),\n", | |
| " ('b', [2]), \n", | |
| " ('b', [2])\n", | |
| " ])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "10" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "reduce(lambda a, b: a + b, range(5))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "10" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sum(range(5))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "('a', [2], 'a', [2], 'a', [2], 'b', [2], 'b', [2])" | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "keyed_rdd.reduce(lambda a, b: a + b)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "phrase_rdd = sc.parallelize([\n", | |
| " 'one whole sentence',\n", | |
| " 'another sentence entirely'\n", | |
| " ])\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "2" | |
| ] | |
| }, | |
| "execution_count": 19, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "phrase_rdd.count()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[['one', 'whole', 'sentence'], ['another', 'sentence', 'entirely']]" | |
| ] | |
| }, | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "phrase_rdd.map(lambda v: v.split()).collect()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[3, 3]" | |
| ] | |
| }, | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "phrase_rdd.map(lambda v: v.split()).map(len).collect()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['one', 'whole', 'sentence', 'another', 'sentence', 'entirely']" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "phrase_rdd.flatMap(lambda v: v.split()).collect()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "6" | |
| ] | |
| }, | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "phrase_rdd.flatMap(lambda v: v.split()).count()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('whole', 1), ('entirely', 1), ('another', 1), ('one', 1), ('sentence', 2)]" | |
| ] | |
| }, | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "word_rdd = phrase_rdd.flatMap(lambda v: v.split())\n", | |
| "\n", | |
| "word_keyed_rdd = word_rdd.map(lambda v: (v, 1))\n", | |
| "\n", | |
| "word_keyed_rdd.reduceByKey(lambda a, b: a + b).collect()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "defaultdict(int,\n", | |
| " {'another': 1, 'entirely': 1, 'one': 1, 'sentence': 2, 'whole': 1})" | |
| ] | |
| }, | |
| "execution_count": 28, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "word_rdd.countByValue()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "puzzle_rdd = sc.parallelize(\n", | |
| " [(0, 0)]\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 51, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import operator\n", | |
| "\n", | |
| "def map_to_children(row, delta_values=[3, 5], delta_ops=[operator.add, operator.sub], current_level=None):\n", | |
| " value, level = row\n", | |
| " if current_level is not None and level != current_level:\n", | |
| " return [row]\n", | |
| " return [\n", | |
| " (op(value, delta), level + 1)\n", | |
| " for op in delta_ops\n", | |
| " for delta in delta_values\n", | |
| " if 0 <= op(value, delta) <= 8\n", | |
| " ]\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(1, 3), (1, 5)]" | |
| ] | |
| }, | |
| "execution_count": 31, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "map_to_children((0, 0))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 63, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[(0, 0), (3, 1), (5, 1)]\n", | |
| "[(0, 0), (2, 2), (3, 1), (5, 1), (6, 2), (8, 2)]\n", | |
| "[(0, 0), (1, 3), (2, 2), (3, 1), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
| "[(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
| "[(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
| "[(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from functools import partial\n", | |
| "\n", | |
| "puzzle_rdd = sc.parallelize(\n", | |
| " [(0, 0)]\n", | |
| ")\n", | |
| "\n", | |
| "prior_count = -10\n", | |
| "next_count = -5\n", | |
| "level = 0\n", | |
| "\n", | |
| "while prior_count != next_count:\n", | |
| " # expand rows at the current level into their children, then merge duplicate values keeping the lowest level\n", | |
| " prior_level = puzzle_rdd\n", | |
| " prior_count = prior_level.count()\n", | |
| " next_level = puzzle_rdd.flatMap(lambda v: map_to_children(v, current_level=level)).cache()\n", | |
| " next_count = next_level.count()\n", | |
| " puzzle_rdd = prior_level.union(next_level)\n", | |
| " puzzle_rdd = puzzle_rdd.reduceByKey(lambda a, b: min(a, b))\n", | |
| " level += 1\n", | |
| " print(puzzle_rdd.collect())\n", | |
| "\n", | |
| "# .flatMap(map_to_children).flatMap(map_to_children).flatMap(map_to_children).collect()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 61, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(1, 1)" | |
| ] | |
| }, | |
| "execution_count": 61, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "prior_count, next_count" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 33, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[[(1, 3), (1, 5)]]" | |
| ] | |
| }, | |
| "execution_count": 33, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Row reduction via only getting children for 'frontier' level:\n", | |
| "\n", | |
| "# 3\n", | |
| "# [(0, 0), (3, 1), (5, 1)]\n", | |
| "# 10\n", | |
| "# [(0, 0), (2, 2), (3, 1), (5, 1), (6, 2), (8, 2)]\n", | |
| "# 15\n", | |
| "# [(0, 0), (1, 3), (2, 2), (3, 1), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
| "# 18\n", | |
| "# [(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
| "# 19\n", | |
| "# [(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
| "\n", | |
| "\n", | |
| "# 3\n", | |
| "# [(0, 0), (3, 1), (5, 1)]\n", | |
| "# 11\n", | |
| "# [(0, 0), (2, 2), (3, 1), (5, 1), (6, 2), (8, 2)]\n", | |
| "# 20\n", | |
| "# [(0, 0), (1, 3), (2, 2), (3, 1), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
| "# 26\n", | |
| "# [(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]\n", | |
| "# 29\n", | |
| "# [(0, 0), (1, 3), (2, 2), (3, 1), (4, 4), (5, 1), (6, 2), (7, 3), (8, 2)]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 47, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from functools import partial\n", | |
| "def addx(a, x=4):\n", | |
| " return a + x\n", | |
| "\n", | |
| "pseudopartial = lambda v: addx(v, x=8)\n", | |
| "aa = pseudopartial(2)\n", | |
| "realpartial = partial(addx, x=8)\n", | |
| "bb = realpartial(2)\n", | |
| "assert aa == bb" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 2", | |
| "language": "python", | |
| "name": "python2" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.10" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment