Created
April 21, 2015 12:44
-
-
Save xccds/6dfd67737f53aa40f50a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### 用spark进行数据挖掘\n", | |
| "\n", | |
| "- 本例使用spark的python接口,对titanic数据做了一个完整的尝试\n", | |
| "- 首先用计算质数的例子说明:即使在单机上,spark也能利用多核并行处理来提高计算效率\n", | |
| "- 之后读入数据集,并对数据进行预处理\n", | |
| " - 步骤1:对名字进行了处理,用正则取出四种常见title\n", | |
| " - 步骤2:基于title,对年龄进行了缺失值处理\n", | |
| " - 步骤3:将类别变量均转为0-1变量\n", | |
| "- 数据合并整理成spark.mllib需要的格式\n", | |
| "- 使用逻辑回归(线性分类模型)建模,并计算测试集上的错误率\n", | |
| "- 本例代码参考了《machine learning with spark》一书" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from pyspark import SparkContext\n", | |
| "sc = SparkContext( 'local[4]')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "- 算质数的例子" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def isprime(n):\n", | |
| " \"\"\"\n", | |
| " check if integer n is a prime\n", | |
| " \"\"\"\n", | |
| " # make sure n is a positive integer\n", | |
| " n = abs(int(n))\n", | |
| " # 0 and 1 are not primes\n", | |
| " if n < 2:\n", | |
| " return False\n", | |
| " # 2 is the only even prime number\n", | |
| " if n == 2:\n", | |
| " return True\n", | |
| " # all other even numbers are not primes\n", | |
| " if not n & 1:\n", | |
| " return False\n", | |
| " # range starts with 3 and only needs to go up to the square root of n\n", | |
| " # for all odd numbers\n", | |
| " for x in range(3, int(n**0.5)+1, 2):\n", | |
| " if n % x == 0:\n", | |
| " return False\n", | |
| " return True" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "78498\n", | |
| "78498\n", | |
| "78498\n", | |
| "78498\n", | |
| "1 loops, best of 3: 4.81 s per loop\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%timeit\n", | |
| "import numpy as np\n", | |
| "nums = xrange(1000000)\n", | |
| "print np.sum([1 for x in nums if isprime(x)])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "78498\n", | |
| "78498\n", | |
| "78498\n", | |
| "78498\n", | |
| "1 loops, best of 3: 2.71 s per loop\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%timeit\n", | |
| "nums = sc.parallelize(xrange(1000000))\n", | |
| "print nums.filter(isprime).count()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "- titanic例子,先读入变量名" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "vname = !head -1 titanic.csv\n", | |
| "vname = vname[0].split(',')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "u'0,3,\"Braund, Mr. Owen Harris\",male,22,1,0,A/5 21171,7.25,,S'" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "#!sed 1d titanic.csv > titanic_noheader.csv\n", | |
| "raw = sc.textFile('titanic_noheader.csv')\n", | |
| "raw.first() # 原始数据" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "- 数据预处理" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# 处理title\n", | |
| "def extract_name(x):\n", | |
| " import re\n", | |
| " return re.search(\"\\\"(.*)\\\"\", x).group(1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[u'Braund, Mr. Owen Harris',\n", | |
| " u'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',\n", | |
| " u'Heikkinen, Miss. Laina',\n", | |
| " u'Futrelle, Mrs. Jacques Heath (Lily May Peel)']" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "names = raw.map(extract_name)\n", | |
| "names.take(4)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import re\n", | |
| "title = names.map(lambda x: re.search(r\", (.*?)\\. \", x).group(1))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(u'Mr', 517),\n", | |
| " (u'Miss', 182),\n", | |
| " (u'Mrs', 125),\n", | |
| " (u'Master', 40),\n", | |
| " (u'Dr', 7),\n", | |
| " (u'Rev', 6),\n", | |
| " (u'Major', 2),\n", | |
| " (u'Mlle', 2),\n", | |
| " (u'Col', 2),\n", | |
| " (u'Sir', 1),\n", | |
| " (u'the Countess', 1),\n", | |
| " (u'Don', 1),\n", | |
| " (u'Capt', 1),\n", | |
| " (u'Lady', 1),\n", | |
| " (u'Jonkheer', 1),\n", | |
| " (u'Ms', 1),\n", | |
| " (u'Mme', 1)]" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sorted(title.countByValue().iteritems(),key=lambda (k,v): v,reverse=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[u'Mr', u'Miss', u'Mrs', u'Master']" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "top_title = [x[0] for x in sorted(title.countByValue().iteritems(),key=lambda (k,v): v,reverse=True)[:4]]\n", | |
| "top_title" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def assign_title(x):\n", | |
| " if x in top_title: return x\n", | |
| " else: return u'other'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[u'Mr', u'Mrs', u'Miss', u'Mrs']" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "title_less = title.map(assign_title)\n", | |
| "title_less.take(4)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# 处理其它数据\n", | |
| "def split_rest(x):\n", | |
| " import re\n", | |
| " rec = re.sub(\"\\\"(.*)\\\",\", '', x)\n", | |
| " return rec.split(',')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[u'0', u'3', u'male', u'22', u'1', u'0', u'A/5 21171', u'7.25', u'', u'S']" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df = raw.map(split_rest)\n", | |
| "df.first()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
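| "# 'name' was split out and handled separately above, so drop it from the header list to keep vname aligned with the columns of df\n", | |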
| "vname.remove('name')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "0th variable:survived distinct value: 2\n", | |
| "1th variable:pclass distinct value: 3\n", | |
| "2th variable:sex distinct value: 2\n", | |
| "3th variable:age distinct value: 89\n", | |
| "4th variable:sibsp distinct value: 7\n", | |
| "5th variable:parch distinct value: 7\n", | |
| "6th variable:ticket distinct value: 681\n", | |
| "7th variable:fare distinct value: 248\n", | |
| "8th variable:cabin distinct value: 148\n", | |
| "9th variable:embarked distinct value: 4\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# 取值个数\n", | |
| "m = len(df.first())\n", | |
| "for i in range(m):\n", | |
| " print '%dth variable:%s distinct value: %s' %(i, vname[i],df.map(lambda row: row[i]).distinct().count())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "0th variable:survived miss value: 0\n", | |
| "1th variable:pclass miss value: 0\n", | |
| "2th variable:sex miss value: 0\n", | |
| "3th variable:age miss value: 177\n", | |
| "4th variable:sibsp miss value: 0\n", | |
| "5th variable:parch miss value: 0\n", | |
| "6th variable:ticket miss value: 0\n", | |
| "7th variable:fare miss value: 0\n", | |
| "8th variable:cabin miss value: 687\n", | |
| "9th variable:embarked miss value: 2\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# 缺失个数\n", | |
| "for i in range(m):\n", | |
| " print '%dth variable:%s miss value: %s' %(i, vname[i],df.map(lambda row: row[i]=='').sum())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# 处理年龄缺失\n", | |
| "age = df.map(lambda x: x[3])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "title_age = title.zip(age)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "title_age = title_age.mapValues(lambda x: float(x) if x!='' else -1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import numpy as np" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def miss_mean(data):\n", | |
| " res = [x for x in data if x!=-1]\n", | |
| " return np.mean(res)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "age_dict = dict(title_age.groupByKey().map(lambda (k,v): (k, miss_mean(v.data))).collect())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{u'Capt': 70.0,\n", | |
| " u'Col': 58.0,\n", | |
| " u'Don': 40.0,\n", | |
| " u'Dr': 42.0,\n", | |
| " u'Jonkheer': 38.0,\n", | |
| " u'Lady': 48.0,\n", | |
| " u'Major': 48.5,\n", | |
| " u'Master': 4.5741666666666667,\n", | |
| " u'Miss': 21.773972602739725,\n", | |
| " u'Mlle': 24.0,\n", | |
| " u'Mme': 24.0,\n", | |
| " u'Mr': 32.368090452261306,\n", | |
| " u'Mrs': 35.898148148148145,\n", | |
| " u'Ms': 28.0,\n", | |
| " u'Rev': 43.166666666666664,\n", | |
| " u'Sir': 49.0,\n", | |
| " u'the Countess': 33.0}" | |
| ] | |
| }, | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "age_dict" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def age_func((title,age)):\n", | |
| " if age== -1: res = (title, age_dict[title])\n", | |
| " else: res = (title, age)\n", | |
| " return res" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[22.0, 38.0, 26.0, 35.0]" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "title_age = title_age.map(age_func)\n", | |
| "age_imputed = title_age.values()\n", | |
| "age_imputed.take(4)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "defaultdict(<type 'int'>, {u'Q': 77, u'': 2, u'S': 644, u'C': 168})" | |
| ] | |
| }, | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# 处理 embarked缺失\n", | |
| "df.map(lambda record: record[9]).countByValue()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def embarked_func(record):\n", | |
| " if record[9]=='' : return u'S' \n", | |
| " else: return record[9]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "embarked= df.map(embarked_func)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# 将四个类别变量转为0-1二元变量" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{u'Master': 1, u'Miss': 0, u'Mr': 3, u'Mrs': 4, u'other': 2}" | |
| ] | |
| }, | |
| "execution_count": 29, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "title_dict = title_less.distinct().zipWithIndex().collectAsMap()\n", | |
| "title_dict" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 30, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def create_vector(term, term_dict):\n", | |
| " #from scipy import sparse as sp\n", | |
| " num_terms = len(term_dict)\n", | |
| " #x = sp.csc_matrix((1, num_terms))\n", | |
| " x = [0]*num_terms\n", | |
| " idx = term_dict[term]\n", | |
| " x[idx] = 1\n", | |
| " return x" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[0, 1, 0, 0, 0]" | |
| ] | |
| }, | |
| "execution_count": 31, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "create_vector(u'Master',title_dict)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 32, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[[0, 0, 0, 1, 0], [0, 0, 0, 0, 1], [1, 0, 0, 0, 0], [0, 0, 0, 0, 1]]" | |
| ] | |
| }, | |
| "execution_count": 32, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "title_ind = title_less.map(lambda x: create_vector(x,title_dict))\n", | |
| "title_ind.take(4)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 33, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{u'1': 0, u'2': 2, u'3': 1}" | |
| ] | |
| }, | |
| "execution_count": 33, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "pclass_dict = df.map(lambda x: x[1]).distinct().zipWithIndex().collectAsMap()\n", | |
| "pclass_dict" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 34, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[[0, 1, 0], [1, 0, 0], [0, 1, 0], [1, 0, 0]]" | |
| ] | |
| }, | |
| "execution_count": 34, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "pclass_ind = df.map(lambda x: create_vector(x[1],pclass_dict))\n", | |
| "pclass_ind.take(4)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 35, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{u'C': 2, u'Q': 0, u'S': 1}" | |
| ] | |
| }, | |
| "execution_count": 35, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "embarked_dict = embarked.distinct().zipWithIndex().collectAsMap()\n", | |
| "embarked_dict" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 36, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[[0, 1, 0], [0, 0, 1], [0, 1, 0], [0, 1, 0]]" | |
| ] | |
| }, | |
| "execution_count": 36, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "embarked_ind = embarked.map(lambda x: create_vector(x,embarked_dict))\n", | |
| "embarked_ind.take(4)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 37, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "gender_ind = df.map(lambda x: 1 if x[2]==u'male' else 0)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 38, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(0, [0, 1, 0, 7.25]),\n", | |
| " (1, [1, 1, 0, 71.2833]),\n", | |
| " (2, [1, 0, 0, 7.925]),\n", | |
| " (3, [1, 1, 0, 53.1])]" | |
| ] | |
| }, | |
| "execution_count": 38, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# 合并数据\n", | |
| "restdf = df.map(lambda x: [int(x[0]),int(x[4]), int(x[5]), float(x[7])]).zipWithIndex().map(lambda (v,k): (k,v))\n", | |
| "restdf.take(4)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 39, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(0, [0, 0, 0, 1, 0]),\n", | |
| " (1, [0, 0, 0, 0, 1]),\n", | |
| " (2, [1, 0, 0, 0, 0]),\n", | |
| " (3, [0, 0, 0, 0, 1])]" | |
| ] | |
| }, | |
| "execution_count": 39, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "title_ind = title_ind.zipWithIndex().map(lambda (v,k): (k,v))\n", | |
| "title_ind.take(4)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 40, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(0, [0, 1, 0]), (1, [1, 0, 0]), (2, [0, 1, 0]), (3, [1, 0, 0])]" | |
| ] | |
| }, | |
| "execution_count": 40, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "pclass_ind = pclass_ind.zipWithIndex().map(lambda (v,k): (k,v))\n", | |
| "pclass_ind.take(4)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(0, [0, 1, 0]), (1, [0, 0, 1]), (2, [0, 1, 0]), (3, [0, 1, 0])]" | |
| ] | |
| }, | |
| "execution_count": 41, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "embarked_ind = embarked_ind.zipWithIndex().map(lambda (v,k): (k,v))\n", | |
| "embarked_ind.take(4)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 42, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(0, [1]), (1, [0]), (2, [0]), (3, [0])]" | |
| ] | |
| }, | |
| "execution_count": 42, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "gender_ind = gender_ind.zipWithIndex().map(lambda (v,k): (k,[v]))\n", | |
| "gender_ind.take(4)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 43, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(0, [22.0]), (1, [38.0]), (2, [26.0]), (3, [35.0])]" | |
| ] | |
| }, | |
| "execution_count": 43, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "age_imputed = age_imputed.zipWithIndex().map(lambda (v,k): (k,[v]))\n", | |
| "age_imputed.take(4)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 44, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "finaldf = restdf.union(embarked_ind).reduceByKey(lambda x,y: x + y)\n", | |
| "finaldf = finaldf.union(age_imputed).reduceByKey(lambda x,y: x + y)\n", | |
| "finaldf = finaldf.union(gender_ind).reduceByKey(lambda x,y: x + y)\n", | |
| "finaldf = finaldf.union(title_ind).reduceByKey(lambda x,y: x + y)\n", | |
| "finaldf = finaldf.union(pclass_ind).reduceByKey(lambda x,y: x + y)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 45, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(0, [0, 1, 0, 7.25, 0, 1, 0, 22.0, 1, 0, 0, 0, 1, 0, 0, 1, 0]),\n", | |
| " (384,\n", | |
| " [0, 0, 0, 7.8958, 0, 1, 0, 32.368090452261306, 1, 0, 0, 0, 1, 0, 0, 1, 0]),\n", | |
| " (132, [0, 1, 0, 14.5, 0, 1, 0, 47.0, 0, 0, 0, 0, 0, 1, 0, 1, 0]),\n", | |
| " (588, [0, 0, 0, 8.05, 0, 1, 0, 22.0, 1, 0, 0, 0, 1, 0, 0, 1, 0])]" | |
| ] | |
| }, | |
| "execution_count": 45, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "finaldf.take(4)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 46, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# 准备建模需要格式\n", | |
| "from pyspark.mllib.classification import LogisticRegressionWithSGD\n", | |
| "from pyspark.mllib.regression import LabeledPoint\n", | |
| "def parsePoint(line):\n", | |
| " features = line[1][1:]\n", | |
| " target = line[1][0]\n", | |
| " return LabeledPoint(target, features)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 47, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "modeldata = finaldf.map(parsePoint)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 48, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "LabeledPoint(0.0, [1.0,0.0,7.25,0.0,1.0,0.0,22.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0])" | |
| ] | |
| }, | |
| "execution_count": 48, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "modeldata.first()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 49, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# 数据切分\n", | |
| "train, test = modeldata.randomSplit([0.75,0.25])\n", | |
| "# 建模\n", | |
| "model = LogisticRegressionWithSGD.train(train,iterations =1000,regType='l2')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 50, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Test Error = 0.308056872038\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# 评估\n", | |
| "labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))\n", | |
| "testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(test.count())\n", | |
| "print(\"Test Error = \" + str(testErr))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 2", | |
| "language": "python", | |
| "name": "python2" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.9" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment