Created
September 8, 2018 15:06
-
-
Save ClebsonDantasUchoa/efbbcb3e9e742ba08a06175b17ec9dc6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Predizer se um candidato do ENEM fez a prova como treineiro a partir das suas notas na prova" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Importação das bibliotecas" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "%matplotlib inline\n", | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "from sklearn import linear_model\n", | |
| "from sklearn import metrics\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "import seaborn as sns\n", | |
| "from sklearn import tree" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Leitura dos dados" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "(13730, 167)\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>Unnamed: 0</th>\n", | |
| " <th>NU_INSCRICAO</th>\n", | |
| " <th>NU_ANO</th>\n", | |
| " <th>CO_MUNICIPIO_RESIDENCIA</th>\n", | |
| " <th>NO_MUNICIPIO_RESIDENCIA</th>\n", | |
| " <th>CO_UF_RESIDENCIA</th>\n", | |
| " <th>SG_UF_RESIDENCIA</th>\n", | |
| " <th>NU_IDADE</th>\n", | |
| " <th>TP_SEXO</th>\n", | |
| " <th>TP_ESTADO_CIVIL</th>\n", | |
| " <th>...</th>\n", | |
| " <th>Q041</th>\n", | |
| " <th>Q042</th>\n", | |
| " <th>Q043</th>\n", | |
| " <th>Q044</th>\n", | |
| " <th>Q045</th>\n", | |
| " <th>Q046</th>\n", | |
| " <th>Q047</th>\n", | |
| " <th>Q048</th>\n", | |
| " <th>Q049</th>\n", | |
| " <th>Q050</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>ed50e8aaa58e7a806c337585efee9ca41f1eb1ad</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>4314902</td>\n", | |
| " <td>Porto Alegre</td>\n", | |
| " <td>43</td>\n", | |
| " <td>RS</td>\n", | |
| " <td>24</td>\n", | |
| " <td>M</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>...</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>D</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2</td>\n", | |
| " <td>2c3acac4b33ec2b195d77e7c04a2d75727fad723</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>2304707</td>\n", | |
| " <td>Granja</td>\n", | |
| " <td>23</td>\n", | |
| " <td>CE</td>\n", | |
| " <td>17</td>\n", | |
| " <td>F</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>3</td>\n", | |
| " <td>f4545f8ccb9ff5c8aad7d32951b3f251a26e6568</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>2304400</td>\n", | |
| " <td>Fortaleza</td>\n", | |
| " <td>23</td>\n", | |
| " <td>CE</td>\n", | |
| " <td>21</td>\n", | |
| " <td>F</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>4</td>\n", | |
| " <td>3d6ec248fef899c414e77f82d5c6d2bffbeaf7fe</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>3304557</td>\n", | |
| " <td>Rio de Janeiro</td>\n", | |
| " <td>33</td>\n", | |
| " <td>RJ</td>\n", | |
| " <td>25</td>\n", | |
| " <td>F</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>...</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>D</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>5</td>\n", | |
| " <td>bf896ac8d3ecadd6dba1dfbf50110afcbf5d3268</td>\n", | |
| " <td>2016</td>\n", | |
| " <td>1302603</td>\n", | |
| " <td>Manaus</td>\n", | |
| " <td>13</td>\n", | |
| " <td>AM</td>\n", | |
| " <td>28</td>\n", | |
| " <td>M</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>5 rows × 167 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " Unnamed: 0 NU_INSCRICAO NU_ANO \\\n", | |
| "0 1 ed50e8aaa58e7a806c337585efee9ca41f1eb1ad 2016 \n", | |
| "1 2 2c3acac4b33ec2b195d77e7c04a2d75727fad723 2016 \n", | |
| "2 3 f4545f8ccb9ff5c8aad7d32951b3f251a26e6568 2016 \n", | |
| "3 4 3d6ec248fef899c414e77f82d5c6d2bffbeaf7fe 2016 \n", | |
| "4 5 bf896ac8d3ecadd6dba1dfbf50110afcbf5d3268 2016 \n", | |
| "\n", | |
| " CO_MUNICIPIO_RESIDENCIA NO_MUNICIPIO_RESIDENCIA CO_UF_RESIDENCIA \\\n", | |
| "0 4314902 Porto Alegre 43 \n", | |
| "1 2304707 Granja 23 \n", | |
| "2 2304400 Fortaleza 23 \n", | |
| "3 3304557 Rio de Janeiro 33 \n", | |
| "4 1302603 Manaus 13 \n", | |
| "\n", | |
| " SG_UF_RESIDENCIA NU_IDADE TP_SEXO TP_ESTADO_CIVIL ... Q041 Q042 Q043 \\\n", | |
| "0 RS 24 M 0.0 ... 5.0 A A \n", | |
| "1 CE 17 F 0.0 ... NaN A A \n", | |
| "2 CE 21 F 0.0 ... NaN A A \n", | |
| "3 RJ 25 F 0.0 ... 5.0 C A \n", | |
| "4 AM 28 M 0.0 ... NaN A A \n", | |
| "\n", | |
| " Q044 Q045 Q046 Q047 Q048 Q049 Q050 \n", | |
| "0 A A A A A B D \n", | |
| "1 C A B A A C A \n", | |
| "2 A A C A A B A \n", | |
| "3 A A A D A A A \n", | |
| "4 A A A A A A A \n", | |
| "\n", | |
| "[5 rows x 167 columns]" | |
| ] | |
| }, | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "data = pd.read_csv(\"enem.csv\")\n", | |
| "print(data.shape)\n", | |
| "data.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Quantidade de treineiros" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "0\n", | |
| "-------------------\n", | |
| "IN_TREINEIRO\n", | |
| "0 11947\n", | |
| "1 1783\n", | |
| "dtype: int64\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(data['IN_TREINEIRO'].isnull().sum())\n", | |
| "print('-------------------')\n", | |
| "print(data.groupby('IN_TREINEIRO').size())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Filtrando dados importantes" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "(13730, 6)\n", | |
| "---------------\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>NU_NOTA_CN</th>\n", | |
| " <th>NU_NOTA_CH</th>\n", | |
| " <th>NU_NOTA_LC</th>\n", | |
| " <th>NU_NOTA_REDACAO</th>\n", | |
| " <th>NU_NOTA_MT</th>\n", | |
| " <th>IN_TREINEIRO</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>436.3</td>\n", | |
| " <td>495.4</td>\n", | |
| " <td>581.2</td>\n", | |
| " <td>520.0</td>\n", | |
| " <td>399.4</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>474.5</td>\n", | |
| " <td>544.1</td>\n", | |
| " <td>599.0</td>\n", | |
| " <td>580.0</td>\n", | |
| " <td>459.8</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " NU_NOTA_CN NU_NOTA_CH NU_NOTA_LC NU_NOTA_REDACAO NU_NOTA_MT \\\n", | |
| "0 436.3 495.4 581.2 520.0 399.4 \n", | |
| "1 474.5 544.1 599.0 580.0 459.8 \n", | |
| "2 NaN NaN NaN NaN NaN \n", | |
| "3 NaN NaN NaN NaN NaN \n", | |
| "4 NaN NaN NaN NaN NaN \n", | |
| "\n", | |
| " IN_TREINEIRO \n", | |
| "0 0 \n", | |
| "1 0 \n", | |
| "2 0 \n", | |
| "3 0 \n", | |
| "4 0 " | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Keep only the five score columns plus the IN_TREINEIRO flag used as the target\n", | |
| "dados = data[[\n", | |
| "    'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC',\n", | |
| "    'NU_NOTA_REDACAO', 'NU_NOTA_MT', 'IN_TREINEIRO',\n", | |
| "]].copy()\n", | |
| "print(dados.shape)\n", | |
| "print(\"---------------\")\n", | |
| "dados.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Visualização dos dados" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>NU_NOTA_CN</th>\n", | |
| " <th>NU_NOTA_CH</th>\n", | |
| " <th>NU_NOTA_LC</th>\n", | |
| " <th>NU_NOTA_REDACAO</th>\n", | |
| " <th>NU_NOTA_MT</th>\n", | |
| " <th>IN_TREINEIRO</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>count</th>\n", | |
| " <td>10341.000000</td>\n", | |
| " <td>10341.000000</td>\n", | |
| " <td>10133.000000</td>\n", | |
| " <td>10133.000000</td>\n", | |
| " <td>10133.000000</td>\n", | |
| " <td>13730.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>mean</th>\n", | |
| " <td>473.495155</td>\n", | |
| " <td>529.661087</td>\n", | |
| " <td>516.472841</td>\n", | |
| " <td>529.048258</td>\n", | |
| " <td>482.497928</td>\n", | |
| " <td>0.129862</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>std</th>\n", | |
| " <td>71.093674</td>\n", | |
| " <td>73.726344</td>\n", | |
| " <td>68.688190</td>\n", | |
| " <td>154.294758</td>\n", | |
| " <td>99.826323</td>\n", | |
| " <td>0.336163</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>min</th>\n", | |
| " <td>0.000000</td>\n", | |
| " <td>0.000000</td>\n", | |
| " <td>0.000000</td>\n", | |
| " <td>0.000000</td>\n", | |
| " <td>0.000000</td>\n", | |
| " <td>0.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>25%</th>\n", | |
| " <td>419.900000</td>\n", | |
| " <td>480.400000</td>\n", | |
| " <td>468.100000</td>\n", | |
| " <td>440.000000</td>\n", | |
| " <td>408.900000</td>\n", | |
| " <td>0.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>50%</th>\n", | |
| " <td>459.800000</td>\n", | |
| " <td>532.000000</td>\n", | |
| " <td>520.900000</td>\n", | |
| " <td>540.000000</td>\n", | |
| " <td>461.200000</td>\n", | |
| " <td>0.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>75%</th>\n", | |
| " <td>514.500000</td>\n", | |
| " <td>581.200000</td>\n", | |
| " <td>564.900000</td>\n", | |
| " <td>600.000000</td>\n", | |
| " <td>537.600000</td>\n", | |
| " <td>0.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>max</th>\n", | |
| " <td>806.400000</td>\n", | |
| " <td>807.000000</td>\n", | |
| " <td>763.600000</td>\n", | |
| " <td>1000.000000</td>\n", | |
| " <td>952.000000</td>\n", | |
| " <td>1.000000</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " NU_NOTA_CN NU_NOTA_CH NU_NOTA_LC NU_NOTA_REDACAO \\\n", | |
| "count 10341.000000 10341.000000 10133.000000 10133.000000 \n", | |
| "mean 473.495155 529.661087 516.472841 529.048258 \n", | |
| "std 71.093674 73.726344 68.688190 154.294758 \n", | |
| "min 0.000000 0.000000 0.000000 0.000000 \n", | |
| "25% 419.900000 480.400000 468.100000 440.000000 \n", | |
| "50% 459.800000 532.000000 520.900000 540.000000 \n", | |
| "75% 514.500000 581.200000 564.900000 600.000000 \n", | |
| "max 806.400000 807.000000 763.600000 1000.000000 \n", | |
| "\n", | |
| " NU_NOTA_MT IN_TREINEIRO \n", | |
| "count 10133.000000 13730.000000 \n", | |
| "mean 482.497928 0.129862 \n", | |
| "std 99.826323 0.336163 \n", | |
| "min 0.000000 0.000000 \n", | |
| "25% 408.900000 0.000000 \n", | |
| "50% 461.200000 0.000000 \n", | |
| "75% 537.600000 0.000000 \n", | |
| "max 952.000000 1.000000 " | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dados.describe()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<Figure size 432x288 with 2 Axes>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "corr = dados.corr()\n", | |
| "sns.heatmap(corr, xticklabels=dados.columns, yticklabels=dados.columns, cmap='RdBu');" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<Figure size 1080x1080 with 6 Axes>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "dados.hist(figsize=(15,15), bins=30);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Removendo linhas com dados faltantes" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "(10097, 6)\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>NU_NOTA_CN</th>\n", | |
| " <th>NU_NOTA_CH</th>\n", | |
| " <th>NU_NOTA_LC</th>\n", | |
| " <th>NU_NOTA_REDACAO</th>\n", | |
| " <th>NU_NOTA_MT</th>\n", | |
| " <th>IN_TREINEIRO</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>436.3</td>\n", | |
| " <td>495.4</td>\n", | |
| " <td>581.2</td>\n", | |
| " <td>520.0</td>\n", | |
| " <td>399.4</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>474.5</td>\n", | |
| " <td>544.1</td>\n", | |
| " <td>599.0</td>\n", | |
| " <td>580.0</td>\n", | |
| " <td>459.8</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>439.7</td>\n", | |
| " <td>583.2</td>\n", | |
| " <td>410.9</td>\n", | |
| " <td>620.0</td>\n", | |
| " <td>364.5</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>420.1</td>\n", | |
| " <td>604.2</td>\n", | |
| " <td>484.5</td>\n", | |
| " <td>560.0</td>\n", | |
| " <td>529.2</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>619.6</td>\n", | |
| " <td>625.8</td>\n", | |
| " <td>611.2</td>\n", | |
| " <td>620.0</td>\n", | |
| " <td>566.7</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " NU_NOTA_CN NU_NOTA_CH NU_NOTA_LC NU_NOTA_REDACAO NU_NOTA_MT \\\n", | |
| "0 436.3 495.4 581.2 520.0 399.4 \n", | |
| "1 474.5 544.1 599.0 580.0 459.8 \n", | |
| "5 439.7 583.2 410.9 620.0 364.5 \n", | |
| "6 420.1 604.2 484.5 560.0 529.2 \n", | |
| "7 619.6 625.8 611.2 620.0 566.7 \n", | |
| "\n", | |
| " IN_TREINEIRO \n", | |
| "0 0 \n", | |
| "1 0 \n", | |
| "5 0 \n", | |
| "6 0 \n", | |
| "7 0 " | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dados = dados.dropna()\n", | |
| "print(dados.shape)\n", | |
| "dados.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Separando em dados de treino e dados de teste\n", | |
| "### Usaremos 75% dos valores para treino e 25% para teste" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "(7572, 5) (2525, 5) (7572,) (2525,)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Shuffle the rows reproducibly, then split 75% / 25% using the actual row count\n", | |
| "values = dados.values.copy()  # private copy so the in-place shuffle never touches dados\n", | |
| "np.random.seed(1)\n", | |
| "np.random.shuffle(values)\n", | |
| "n_train = int(len(values) * 0.75)  # derived from the data instead of a hard-coded 10097\n", | |
| "# columns 0-4 are the features, column 5 (IN_TREINEIRO) is the label\n", | |
| "x_train, y_train = values[:n_train, 0:5], values[:n_train, 5]\n", | |
| "x_test, y_test = values[n_train:, 0:5], values[n_train:, 5]\n", | |
| "print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Criação dos modelos de predição\n", | |
| "### Criação do modelo de Regressão Logística" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0.8598019801980198" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "lr = linear_model.LogisticRegression()\n", | |
| "lr.fit(x_train, y_train)\n", | |
| "lr_pred = lr.predict(x_test)\n", | |
| "metrics.accuracy_score(y_test, lr_pred)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Criação do modelo de árvore de classificação" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0.8598019801980198" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "clf = tree.DecisionTreeClassifier()\n", | |
| "clf.fit(x_train, y_train)\n", | |
| "# Predict with the fitted decision tree itself, not with the logistic regression model lr\n", | |
| "clf_pred = clf.predict(x_test)\n", | |
| "metrics.accuracy_score(y_test, clf_pred)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Geração dos gráficos de comparação entre os modelos" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "predicoes = pd.DataFrame()\n", | |
| "# lr_pred comes from LogisticRegression, so label it as logistic (not linear) regression\n", | |
| "predicoes['Regressao Logistica'] = lr_pred\n", | |
| "predicoes['Arvore de decisao'] = clf_pred" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " precision recall f1-score support\n", | |
| "\n", | |
| " 0.0 0.86 1.00 0.92 2172\n", | |
| " 1.0 0.00 0.00 0.00 353\n", | |
| "\n", | |
| "avg / total 0.74 0.86 0.80 2525\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(metrics.classification_report(y_test, lr_pred))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " precision recall f1-score support\n", | |
| "\n", | |
| " 0.0 0.86 1.00 0.92 2172\n", | |
| " 1.0 0.00 0.00 0.00 353\n", | |
| "\n", | |
| "avg / total 0.74 0.86 0.80 2525\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(metrics.classification_report(y_test, clf_pred))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<Figure size 432x288 with 1 Axes>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "plt.plot(lr_pred, clf_pred);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<Figure size 432x288 with 1 Axes>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "predicoes.plot(kind='bar', grid=True);" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.5" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment