xccds · April 21, 2015 12:44
diff --git a/spark.ipynb b/spark.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 用spark进行数据挖掘\n",
    "\n",
    "- 本例使用spark的python接口，对titanic数据做了一个完整的尝试\n",
    "- 首先用计算质数的例子说明：即使在单机上，spark也能利用多核并行处理来提高计算效率\n",
    "- 之后读入数据集，并对数据进行预处理\n",
    "    - 步骤1：对名字进行处理，用正则提取title，并保留四种最常见的title（其余归为other）\n",
    "    - 步骤2：基于title，对年龄进行了缺失值处理\n",
    "    - 步骤3：将类别变量均转为0-1变量\n",
    "- 数据合并整理成spark.mllib需要的格式\n",
    "- 使用逻辑回归（线性模型）建模，并在测试集上计算错误率\n",
    "- 本例代码参考了《machine learning with spark》一书"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from pyspark import  SparkContext\n",
    "sc = SparkContext( 'local[4]')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 算质数的例子"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def isprime(n):\n",
    "    \"\"\"\n",
    "    check if integer n is a prime\n",
    "    \"\"\"\n",
    "    # make sure n is a positive integer\n",
    "    n = abs(int(n))\n",
    "    # 0 and 1 are not primes\n",
    "    if n < 2:\n",
    "        return False\n",
    "    # 2 is the only even prime number\n",
    "    if n == 2:\n",
    "        return True\n",
    "    # all other even numbers are not primes\n",
    "    if not n & 1:\n",
    "        return False\n",
    "    # range starts with 3 and only needs to go up the square root of n\n",
    "    # for all odd numbers\n",
    "    for x in range(3, int(n**0.5)+1, 2):\n",
    "        if n % x == 0:\n",
    "            return False\n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "78498\n",
      "78498\n",
      "78498\n",
      "78498\n",
      "1 loops, best of 3: 4.81 s per loop\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "import numpy as np\n",
    "nums = xrange(1000000)\n",
    "print np.sum([1 for x in nums if isprime(x)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "78498\n",
      "78498\n",
      "78498\n",
      "78498\n",
      "1 loops, best of 3: 2.71 s per loop\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "nums = sc.parallelize(xrange(1000000))\n",
    "print nums.filter(isprime).count()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- titanic例子，先读入变量名"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "vname = !head -1 titanic.csv\n",
    "vname = vname[0].split(',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "u'0,3,\"Braund, Mr. Owen Harris\",male,22,1,0,A/5 21171,7.25,,S'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#!sed 1d titanic.csv > titanic_noheader.csv\n",
    "raw = sc.textFile('titanic_noheader.csv')\n",
    "raw.first() # 原始数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 数据预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 处理title\n",
    "def extract_name(x):\n",
    "    import re\n",
    "    return re.search(\"\\\"(.*)\\\"\", x).group(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'Braund, Mr. Owen Harris',\n",
       " u'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',\n",
       " u'Heikkinen, Miss. Laina',\n",
       " u'Futrelle, Mrs. Jacques Heath (Lily May Peel)']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "names = raw.map(extract_name)\n",
    "names.take(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import re\n",
    "title = names.map(lambda x: re.search(r\", (.*?)\\. \", x).group(1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(u'Mr', 517),\n",
       " (u'Miss', 182),\n",
       " (u'Mrs', 125),\n",
       " (u'Master', 40),\n",
       " (u'Dr', 7),\n",
       " (u'Rev', 6),\n",
       " (u'Major', 2),\n",
       " (u'Mlle', 2),\n",
       " (u'Col', 2),\n",
       " (u'Sir', 1),\n",
       " (u'the Countess', 1),\n",
       " (u'Don', 1),\n",
       " (u'Capt', 1),\n",
       " (u'Lady', 1),\n",
       " (u'Jonkheer', 1),\n",
       " (u'Ms', 1),\n",
       " (u'Mme', 1)]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted(title.countByValue().iteritems(),key=lambda (k,v): v,reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'Mr', u'Miss', u'Mrs', u'Master']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "top_title = [x[0] for x in sorted(title.countByValue().iteritems(),key=lambda (k,v): v,reverse=True)[:4]]\n",
    "top_title"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def assign_title(x):\n",
    "    if x in top_title: return x\n",
    "    else: return u'other'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'Mr', u'Mrs', u'Miss', u'Mrs']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "title_less = title.map(assign_title)\n",
    "title_less.take(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 处理其它数据\n",
    "def split_rest(x):\n",
    "    import re\n",
    "    rec = re.sub(\"\\\"(.*)\\\",\", '', x)\n",
    "    return rec.split(',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'0', u'3', u'male', u'22', u'1', u'0', u'A/5 21171', u'7.25', u'', u'S']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = raw.map(split_rest)\n",
    "df.first()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 观察数据\n",
    "vname.remove('name')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0th variable:survived  distinct value: 2\n",
      "1th variable:pclass  distinct value: 3\n",
      "2th variable:sex  distinct value: 2\n",
      "3th variable:age  distinct value: 89\n",
      "4th variable:sibsp  distinct value: 7\n",
      "5th variable:parch  distinct value: 7\n",
      "6th variable:ticket  distinct value: 681\n",
      "7th variable:fare  distinct value: 248\n",
      "8th variable:cabin  distinct value: 148\n",
      "9th variable:embarked  distinct value: 4\n"
     ]
    }
   ],
   "source": [
    "# 取值个数\n",
    "m = len(df.first())\n",
    "for i in range(m):\n",
    "    print '%dth variable:%s  distinct value: %s' %(i, vname[i],df.map(lambda row: row[i]).distinct().count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0th variable:survived  miss value: 0\n",
      "1th variable:pclass  miss value: 0\n",
      "2th variable:sex  miss value: 0\n",
      "3th variable:age  miss value: 177\n",
      "4th variable:sibsp  miss value: 0\n",
      "5th variable:parch  miss value: 0\n",
      "6th variable:ticket  miss value: 0\n",
      "7th variable:fare  miss value: 0\n",
      "8th variable:cabin  miss value: 687\n",
      "9th variable:embarked  miss value: 2\n"
     ]
    }
   ],
   "source": [
    "# 缺失个数\n",
    "for i in range(m):\n",
    "    print '%dth variable:%s  miss value: %s' %(i, vname[i],df.map(lambda row: row[i]=='').sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 处理年龄缺失\n",
    "age  = df.map(lambda x: x[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "title_age = title.zip(age)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "title_age = title_age.mapValues(lambda x: float(x) if x!='' else -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def miss_mean(data):\n",
    "    res = [x for x in data if x!=-1]\n",
    "    return np.mean(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Mean of the known ages for every title. miss_mean only iterates its\n",
    "# argument, so the grouped values are passed through directly instead of\n",
    "# reaching into the undocumented `.data` attribute of the grouped iterable.\n",
    "age_dict = dict(title_age.groupByKey().mapValues(miss_mean).collect())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{u'Capt': 70.0,\n",
       " u'Col': 58.0,\n",
       " u'Don': 40.0,\n",
       " u'Dr': 42.0,\n",
       " u'Jonkheer': 38.0,\n",
       " u'Lady': 48.0,\n",
       " u'Major': 48.5,\n",
       " u'Master': 4.5741666666666667,\n",
       " u'Miss': 21.773972602739725,\n",
       " u'Mlle': 24.0,\n",
       " u'Mme': 24.0,\n",
       " u'Mr': 32.368090452261306,\n",
       " u'Mrs': 35.898148148148145,\n",
       " u'Ms': 28.0,\n",
       " u'Rev': 43.166666666666664,\n",
       " u'Sir': 49.0,\n",
       " u'the Countess': 33.0}"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "age_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def age_func((title,age)):\n",
    "    if  age== -1: res = (title, age_dict[title])\n",
    "    else: res = (title, age)\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[22.0, 38.0, 26.0, 35.0]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "title_age = title_age.map(age_func)\n",
    "age_imputed = title_age.values()\n",
    "age_imputed.take(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defaultdict(<type 'int'>, {u'Q': 77, u'': 2, u'S': 644, u'C': 168})"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 处理 embarked缺失\n",
    "df.map(lambda record: record[9]).countByValue()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def embarked_func(record):\n",
    "    if record[9]=='' : return u'S' \n",
    "    else: return record[9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "embarked= df.map(embarked_func)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 将四个类别变量转为0-1二元变量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{u'Master': 1, u'Miss': 0, u'Mr': 3, u'Mrs': 4, u'other': 2}"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "title_dict = title_less.distinct().zipWithIndex().collectAsMap()\n",
    "title_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def create_vector(term, term_dict):\n",
    "    #from scipy import sparse as sp\n",
    "    num_terms = len(term_dict)\n",
    "    #x = sp.csc_matrix((1, num_terms))\n",
    "    x = [0]*num_terms\n",
    "    idx = term_dict[term]\n",
    "    x[idx] = 1\n",
    "    return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 1, 0, 0, 0]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "create_vector(u'Master',title_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[0, 0, 0, 1, 0], [0, 0, 0, 0, 1], [1, 0, 0, 0, 0], [0, 0, 0, 0, 1]]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "title_ind = title_less.map(lambda x: create_vector(x,title_dict))\n",
    "title_ind.take(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{u'1': 0, u'2': 2, u'3': 1}"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pclass_dict = df.map(lambda x: x[1]).distinct().zipWithIndex().collectAsMap()\n",
    "pclass_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[0, 1, 0], [1, 0, 0], [0, 1, 0], [1, 0, 0]]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pclass_ind = df.map(lambda x: create_vector(x[1],pclass_dict))\n",
    "pclass_ind.take(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{u'C': 2, u'Q': 0, u'S': 1}"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embarked_dict = embarked.distinct().zipWithIndex().collectAsMap()\n",
    "embarked_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[0, 1, 0], [0, 0, 1], [0, 1, 0], [0, 1, 0]]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embarked_ind = embarked.map(lambda x: create_vector(x,embarked_dict))\n",
    "embarked_ind.take(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "gender_ind = df.map(lambda x: 1 if x[2]==u'male' else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, [0, 1, 0, 7.25]),\n",
       " (1, [1, 1, 0, 71.2833]),\n",
       " (2, [1, 0, 0, 7.925]),\n",
       " (3, [1, 1, 0, 53.1])]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 合并数据\n",
    "restdf = df.map(lambda x: [int(x[0]),int(x[4]), int(x[5]), float(x[7])]).zipWithIndex().map(lambda (v,k): (k,v))\n",
    "restdf.take(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, [0, 0, 0, 1, 0]),\n",
       " (1, [0, 0, 0, 0, 1]),\n",
       " (2, [1, 0, 0, 0, 0]),\n",
       " (3, [0, 0, 0, 0, 1])]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "title_ind = title_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
    "title_ind.take(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, [0, 1, 0]), (1, [1, 0, 0]), (2, [0, 1, 0]), (3, [1, 0, 0])]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pclass_ind = pclass_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
    "pclass_ind.take(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, [0, 1, 0]), (1, [0, 0, 1]), (2, [0, 1, 0]), (3, [0, 1, 0])]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embarked_ind = embarked_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
    "embarked_ind.take(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, [1]), (1, [0]), (2, [0]), (3, [0])]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gender_ind = gender_ind.zipWithIndex().map(lambda (v,k): (k,[v]))\n",
    "gender_ind.take(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, [22.0]), (1, [38.0]), (2, [26.0]), (3, [35.0])]"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "age_imputed = age_imputed.zipWithIndex().map(lambda (v,k): (k,[v]))\n",
    "age_imputed.take(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Join on the row index so the feature blocks are always concatenated in the\n",
    "# same order: rest, embarked, age, gender, title, pclass. The previous\n",
    "# union + reduceByKey(x + y) relied on list concatenation, which is not\n",
    "# commutative, so the order of the blocks (and the label at position 0 that\n",
    "# parsePoint reads) was not guaranteed.\n",
    "def append_features(left, right):\n",
    "    \"\"\"Concatenate the value lists of two (row_index, list) RDDs, left first.\"\"\"\n",
    "    return left.join(right).mapValues(lambda pair: pair[0] + pair[1])\n",
    "\n",
    "finaldf = append_features(restdf, embarked_ind)\n",
    "finaldf = append_features(finaldf, age_imputed)\n",
    "finaldf = append_features(finaldf, gender_ind)\n",
    "finaldf = append_features(finaldf, title_ind)\n",
    "finaldf = append_features(finaldf, pclass_ind)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0, [0, 1, 0, 7.25, 0, 1, 0, 22.0, 1, 0, 0, 0, 1, 0, 0, 1, 0]),\n",
       " (384,\n",
       "  [0, 0, 0, 7.8958, 0, 1, 0, 32.368090452261306, 1, 0, 0, 0, 1, 0, 0, 1, 0]),\n",
       " (132, [0, 1, 0, 14.5, 0, 1, 0, 47.0, 0, 0, 0, 0, 0, 1, 0, 1, 0]),\n",
       " (588, [0, 0, 0, 8.05, 0, 1, 0, 22.0, 1, 0, 0, 0, 1, 0, 0, 1, 0])]"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "finaldf.take(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 准备建模需要格式\n",
    "from pyspark.mllib.classification import LogisticRegressionWithSGD\n",
    "from pyspark.mllib.regression import LabeledPoint\n",
    "def parsePoint(line):\n",
    "    features = line[1][1:]\n",
    "    target = line[1][0]\n",
    "    return LabeledPoint(target, features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "modeldata = finaldf.map(parsePoint)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LabeledPoint(0.0, [1.0,0.0,7.25,0.0,1.0,0.0,22.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0])"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "modeldata.first()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Split the data 75% / 25%; the fixed seed makes the train/test partition\n",
    "# (and therefore the reported error) reproducible across runs.\n",
    "train, test = modeldata.randomSplit([0.75,0.25], seed=42)\n",
    "# Fit an L2-regularised logistic regression with SGD on the training split\n",
    "model = LogisticRegressionWithSGD.train(train,iterations =1000,regType='l2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Error = 0.308056872038\n"
     ]
    }
   ],
   "source": [
    "# Evaluate: misclassification rate on the held-out test split. The value is\n",
    "# computed from `test`, so it is reported as a test error, not a training error.\n",
    "labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))\n",
    "testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(test.count())\n",
    "print(\"Test Error = \" + str(testErr))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 用spark进行数据挖掘\n",
	"\n",
	"- 本例使用spark的python接口，对titanic数据做了一个完整的尝试\n",
	"- 首先用计算质数的例子说明：即使在单机上，spark也能利用多核并行处理来提高计算效率\n",
	"- 之后读入数据集，并对数据进行预处理\n",
	"    - 步骤1：对名字进行处理，用正则提取title，并保留四种最常见的title（其余归为other）\n",
	"    - 步骤2：基于title，对年龄进行了缺失值处理\n",
	"    - 步骤3：将类别变量均转为0-1变量\n",
	"- 数据合并整理成spark.mllib需要的格式\n",
	"- 使用逻辑回归（线性模型）建模，并在测试集上计算错误率\n",
	"- 本例代码参考了《machine learning with spark》一书"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from pyspark import SparkContext\n",
	"sc = SparkContext( 'local[4]')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"- 算质数的例子"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def isprime(n):\n",
	"    \"\"\"\n",
	"    check if integer n is a prime\n",
	"    \"\"\"\n",
	"    # make sure n is a positive integer\n",
	"    n = abs(int(n))\n",
	"    # 0 and 1 are not primes\n",
	"    if n < 2:\n",
	"        return False\n",
	"    # 2 is the only even prime number\n",
	"    if n == 2:\n",
	"        return True\n",
	"    # all other even numbers are not primes\n",
	"    if not n & 1:\n",
	"        return False\n",
	"    # range starts with 3 and only needs to go up the square root of n\n",
	"    # for all odd numbers\n",
	"    for x in range(3, int(n**0.5)+1, 2):\n",
	"        if n % x == 0:\n",
	"            return False\n",
	"    return True"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"78498\n",
	"78498\n",
	"78498\n",
	"78498\n",
	"1 loops, best of 3: 4.81 s per loop\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"import numpy as np\n",
	"nums = xrange(1000000)\n",
	"print np.sum([1 for x in nums if isprime(x)])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"78498\n",
	"78498\n",
	"78498\n",
	"78498\n",
	"1 loops, best of 3: 2.71 s per loop\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"nums = sc.parallelize(xrange(1000000))\n",
	"print nums.filter(isprime).count()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"- titanic例子，先读入变量名"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"vname = !head -1 titanic.csv\n",
	"vname = vname[0].split(',')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"u'0,3,\"Braund, Mr. Owen Harris\",male,22,1,0,A/5 21171,7.25,,S'"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"#!sed 1d titanic.csv > titanic_noheader.csv\n",
	"raw = sc.textFile('titanic_noheader.csv')\n",
	"raw.first() # 原始数据"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"- 数据预处理"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# 处理title\n",
	"def extract_name(x):\n",
	" import re\n",
	" return re.search(\"\\\"(.*)\\\"\", x).group(1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[u'Braund, Mr. Owen Harris',\n",
	" u'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',\n",
	" u'Heikkinen, Miss. Laina',\n",
	" u'Futrelle, Mrs. Jacques Heath (Lily May Peel)']"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"names = raw.map(extract_name)\n",
	"names.take(4)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import re\n",
	"title = names.map(lambda x: re.search(r\", (.*?)\\. \", x).group(1))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(u'Mr', 517),\n",
	" (u'Miss', 182),\n",
	" (u'Mrs', 125),\n",
	" (u'Master', 40),\n",
	" (u'Dr', 7),\n",
	" (u'Rev', 6),\n",
	" (u'Major', 2),\n",
	" (u'Mlle', 2),\n",
	" (u'Col', 2),\n",
	" (u'Sir', 1),\n",
	" (u'the Countess', 1),\n",
	" (u'Don', 1),\n",
	" (u'Capt', 1),\n",
	" (u'Lady', 1),\n",
	" (u'Jonkheer', 1),\n",
	" (u'Ms', 1),\n",
	" (u'Mme', 1)]"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sorted(title.countByValue().iteritems(),key=lambda (k,v): v,reverse=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[u'Mr', u'Miss', u'Mrs', u'Master']"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"top_title = [x[0] for x in sorted(title.countByValue().iteritems(),key=lambda (k,v): v,reverse=True)[:4]]\n",
	"top_title"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def assign_title(x):\n",
	" if x in top_title: return x\n",
	" else: return u'other'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[u'Mr', u'Mrs', u'Miss', u'Mrs']"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"title_less = title.map(assign_title)\n",
	"title_less.take(4)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# 处理其它数据\n",
	"def split_rest(x):\n",
	" import re\n",
	" rec = re.sub(\"\\\"(.*)\\\",\", '', x)\n",
	" return rec.split(',')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[u'0', u'3', u'male', u'22', u'1', u'0', u'A/5 21171', u'7.25', u'', u'S']"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df = raw.map(split_rest)\n",
	"df.first()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# 观察数据\n",
	"vname.remove('name')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0th variable:survived distinct value: 2\n",
	"1th variable:pclass distinct value: 3\n",
	"2th variable:sex distinct value: 2\n",
	"3th variable:age distinct value: 89\n",
	"4th variable:sibsp distinct value: 7\n",
	"5th variable:parch distinct value: 7\n",
	"6th variable:ticket distinct value: 681\n",
	"7th variable:fare distinct value: 248\n",
	"8th variable:cabin distinct value: 148\n",
	"9th variable:embarked distinct value: 4\n"
	]
	}
	],
	"source": [
	"# 取值个数\n",
	"m = len(df.first())\n",
	"for i in range(m):\n",
	" print '%dth variable:%s distinct value: %s' %(i, vname[i],df.map(lambda row: row[i]).distinct().count())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0th variable:survived miss value: 0\n",
	"1th variable:pclass miss value: 0\n",
	"2th variable:sex miss value: 0\n",
	"3th variable:age miss value: 177\n",
	"4th variable:sibsp miss value: 0\n",
	"5th variable:parch miss value: 0\n",
	"6th variable:ticket miss value: 0\n",
	"7th variable:fare miss value: 0\n",
	"8th variable:cabin miss value: 687\n",
	"9th variable:embarked miss value: 2\n"
	]
	}
	],
	"source": [
	"# 缺失个数\n",
	"for i in range(m):\n",
	" print '%dth variable:%s miss value: %s' %(i, vname[i],df.map(lambda row: row[i]=='').sum())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# 处理年龄缺失\n",
	"age = df.map(lambda x: x[3])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"title_age = title.zip(age)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"title_age = title_age.mapValues(lambda x: float(x) if x!='' else -1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def miss_mean(data):\n",
	" res = [x for x in data if x!=-1]\n",
	" return np.mean(res)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"age_dict = dict(title_age.groupByKey().map(lambda (k,v): (k, miss_mean(v.data))).collect())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{u'Capt': 70.0,\n",
	" u'Col': 58.0,\n",
	" u'Don': 40.0,\n",
	" u'Dr': 42.0,\n",
	" u'Jonkheer': 38.0,\n",
	" u'Lady': 48.0,\n",
	" u'Major': 48.5,\n",
	" u'Master': 4.5741666666666667,\n",
	" u'Miss': 21.773972602739725,\n",
	" u'Mlle': 24.0,\n",
	" u'Mme': 24.0,\n",
	" u'Mr': 32.368090452261306,\n",
	" u'Mrs': 35.898148148148145,\n",
	" u'Ms': 28.0,\n",
	" u'Rev': 43.166666666666664,\n",
	" u'Sir': 49.0,\n",
	" u'the Countess': 33.0}"
	]
	},
	"execution_count": 22,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"age_dict"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def age_func((title,age)):\n",
	" if age== -1: res = (title, age_dict[title])\n",
	" else: res = (title, age)\n",
	" return res"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[22.0, 38.0, 26.0, 35.0]"
	]
	},
	"execution_count": 24,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"title_age = title_age.map(age_func)\n",
	"age_imputed = title_age.values()\n",
	"age_imputed.take(4)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"defaultdict(<type 'int'>, {u'Q': 77, u'': 2, u'S': 644, u'C': 168})"
	]
	},
	"execution_count": 25,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# 处理 embarked缺失\n",
	"df.map(lambda record: record[9]).countByValue()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def embarked_func(record):\n",
	" if record[9]=='' : return u'S' \n",
	" else: return record[9]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"embarked= df.map(embarked_func)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# 将四个类别变量转为0-1二元变量"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{u'Master': 1, u'Miss': 0, u'Mr': 3, u'Mrs': 4, u'other': 2}"
	]
	},
	"execution_count": 29,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"title_dict = title_less.distinct().zipWithIndex().collectAsMap()\n",
	"title_dict"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def create_vector(term, term_dict):\n",
	" #from scipy import sparse as sp\n",
	" num_terms = len(term_dict)\n",
	" #x = sp.csc_matrix((1, num_terms))\n",
	" x = [0]*num_terms\n",
	" idx = term_dict[term]\n",
	" x[idx] = 1\n",
	" return x"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[0, 1, 0, 0, 0]"
	]
	},
	"execution_count": 31,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"create_vector(u'Master',title_dict)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[[0, 0, 0, 1, 0], [0, 0, 0, 0, 1], [1, 0, 0, 0, 0], [0, 0, 0, 0, 1]]"
	]
	},
	"execution_count": 32,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"title_ind = title_less.map(lambda x: create_vector(x,title_dict))\n",
	"title_ind.take(4)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{u'1': 0, u'2': 2, u'3': 1}"
	]
	},
	"execution_count": 33,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pclass_dict = df.map(lambda x: x[1]).distinct().zipWithIndex().collectAsMap()\n",
	"pclass_dict"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[[0, 1, 0], [1, 0, 0], [0, 1, 0], [1, 0, 0]]"
	]
	},
	"execution_count": 34,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pclass_ind = df.map(lambda x: create_vector(x[1],pclass_dict))\n",
	"pclass_ind.take(4)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 35,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{u'C': 2, u'Q': 0, u'S': 1}"
	]
	},
	"execution_count": 35,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"embarked_dict = embarked.distinct().zipWithIndex().collectAsMap()\n",
	"embarked_dict"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[[0, 1, 0], [0, 0, 1], [0, 1, 0], [0, 1, 0]]"
	]
	},
	"execution_count": 36,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"embarked_ind = embarked.map(lambda x: create_vector(x,embarked_dict))\n",
	"embarked_ind.take(4)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"gender_ind = df.map(lambda x: 1 if x[2]==u'male' else 0)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 38,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(0, [0, 1, 0, 7.25]),\n",
	" (1, [1, 1, 0, 71.2833]),\n",
	" (2, [1, 0, 0, 7.925]),\n",
	" (3, [1, 1, 0, 53.1])]"
	]
	},
	"execution_count": 38,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# 合并数据\n",
	"restdf = df.map(lambda x: [int(x[0]),int(x[4]), int(x[5]), float(x[7])]).zipWithIndex().map(lambda (v,k): (k,v))\n",
	"restdf.take(4)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 39,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(0, [0, 0, 0, 1, 0]),\n",
	" (1, [0, 0, 0, 0, 1]),\n",
	" (2, [1, 0, 0, 0, 0]),\n",
	" (3, [0, 0, 0, 0, 1])]"
	]
	},
	"execution_count": 39,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"title_ind = title_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
	"title_ind.take(4)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 40,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(0, [0, 1, 0]), (1, [1, 0, 0]), (2, [0, 1, 0]), (3, [1, 0, 0])]"
	]
	},
	"execution_count": 40,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pclass_ind = pclass_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
	"pclass_ind.take(4)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 41,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(0, [0, 1, 0]), (1, [0, 0, 1]), (2, [0, 1, 0]), (3, [0, 1, 0])]"
	]
	},
	"execution_count": 41,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"embarked_ind = embarked_ind.zipWithIndex().map(lambda (v,k): (k,v))\n",
	"embarked_ind.take(4)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 42,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(0, [1]), (1, [0]), (2, [0]), (3, [0])]"
	]
	},
	"execution_count": 42,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"gender_ind = gender_ind.zipWithIndex().map(lambda (v,k): (k,[v]))\n",
	"gender_ind.take(4)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 43,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(0, [22.0]), (1, [38.0]), (2, [26.0]), (3, [35.0])]"
	]
	},
	"execution_count": 43,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"age_imputed = age_imputed.zipWithIndex().map(lambda (v,k): (k,[v]))\n",
	"age_imputed.take(4)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 44,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"finaldf = restdf.union(embarked_ind).reduceByKey(lambda x,y: x + y)\n",
	"finaldf = finaldf.union(age_imputed).reduceByKey(lambda x,y: x + y)\n",
	"finaldf = finaldf.union(gender_ind).reduceByKey(lambda x,y: x + y)\n",
	"finaldf = finaldf.union(title_ind).reduceByKey(lambda x,y: x + y)\n",
	"finaldf = finaldf.union(pclass_ind).reduceByKey(lambda x,y: x + y)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 45,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(0, [0, 1, 0, 7.25, 0, 1, 0, 22.0, 1, 0, 0, 0, 1, 0, 0, 1, 0]),\n",
	" (384,\n",
	" [0, 0, 0, 7.8958, 0, 1, 0, 32.368090452261306, 1, 0, 0, 0, 1, 0, 0, 1, 0]),\n",
	" (132, [0, 1, 0, 14.5, 0, 1, 0, 47.0, 0, 0, 0, 0, 0, 1, 0, 1, 0]),\n",
	" (588, [0, 0, 0, 8.05, 0, 1, 0, 22.0, 1, 0, 0, 0, 1, 0, 0, 1, 0])]"
	]
	},
	"execution_count": 45,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"finaldf.take(4)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": 46,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# 准备建模需要格式\n",
	"from pyspark.mllib.classification import LogisticRegressionWithSGD\n",
	"from pyspark.mllib.regression import LabeledPoint\n",
	"def parsePoint(line):\n",
	" features = line[1][1:]\n",
	" target = line[1][0]\n",
	" return LabeledPoint(target, features)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 47,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"modeldata = finaldf.map(parsePoint)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 48,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"LabeledPoint(0.0, [1.0,0.0,7.25,0.0,1.0,0.0,22.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0])"
	]
	},
	"execution_count": 48,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"modeldata.first()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 49,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# 数据切分\n",
	"train, test = modeldata.randomSplit([0.75,0.25])\n",
	"# 建模\n",
	"model = LogisticRegressionWithSGD.train(train,iterations =1000,regType='l2')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 50,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Training Error = 0.308056872038\n"
	]
	}
	],
	"source": [
	"# 评估\n",
	"labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))\n",
	"testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(test.count())\n",
	"print(\"Training Error = \" + str(testErr))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}
No results found