Created
September 6, 2018 01:58
-
-
Save ClebsonDantasUchoa/f28beebcbcb91f2a79c191ba39f83470 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "%matplotlib inline\n", | |
| "import pandas as pd\n", | |
| "from sklearn import linear_model\n", | |
| "from sklearn import linear_model\n", | |
| "from sklearn import metrics" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "data = pd.read_csv(\"train.csv\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(13730, 167)" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "data.shape" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "dados = pd.DataFrame()\n", | |
| "dados['Q047'] = data['Q047'] #Type of school where high school was completed; alphanumeric, needs preprocessing\n", | |
| "dados['Q046'] = data['Q046'] #Has the candidate already finished high school?; alphanumeric, needs preprocessing\n", | |
| "dados['Q006'] = data['Q006'] #Monthly family income; alphanumeric, needs preprocessing\n", | |
| "dados['Q005'] = data['Q005'] #Number of people living in the household\n", | |
| "dados['IN_TREINEIRO'] = data['IN_TREINEIRO']\n", | |
| "#dados['TP_ENSINO'] = data['TP_ENSINO']\n", | |
| "dados['TP_ESCOLA'] = data['TP_ESCOLA']\n", | |
| "dados['TP_ST_CONCLUSAO'] = data['TP_ST_CONCLUSAO']\n", | |
| "dados['TP_COR_RACA'] = data['TP_COR_RACA']\n", | |
| "dados['NU_IDADE'] = data['NU_IDADE']\n", | |
| "dados['CO_UF_RESIDENCIA'] = data['CO_UF_RESIDENCIA']\n", | |
| "dados['NU_NOTA_CN'] = data['NU_NOTA_CN']\n", | |
| "dados['NU_NOTA_CH'] = data['NU_NOTA_CH']\n", | |
| "dados['NU_NOTA_LC'] = data['NU_NOTA_LC']\n", | |
| "dados['NU_NOTA_REDACAO'] = data['NU_NOTA_REDACAO']\n", | |
| "dados['NU_NOTA_MT'] = data['NU_NOTA_MT']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(13730, 15)" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dados.shape" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "Q047 0\n", | |
| "Q046 0\n", | |
| "Q006 0\n", | |
| "Q005 0\n", | |
| "IN_TREINEIRO 0\n", | |
| "TP_ESCOLA 0\n", | |
| "TP_ST_CONCLUSAO 0\n", | |
| "TP_COR_RACA 0\n", | |
| "NU_IDADE 0\n", | |
| "CO_UF_RESIDENCIA 0\n", | |
| "dtype: int64" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dados.isnull().sum().head(10)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "Q047 0\n", | |
| "Q046 0\n", | |
| "Q006 0\n", | |
| "Q005 0\n", | |
| "IN_TREINEIRO 0\n", | |
| "TP_ESCOLA 0\n", | |
| "TP_ST_CONCLUSAO 0\n", | |
| "TP_COR_RACA 0\n", | |
| "NU_IDADE 0\n", | |
| "CO_UF_RESIDENCIA 0\n", | |
| "NU_NOTA_CN 0\n", | |
| "NU_NOTA_CH 0\n", | |
| "NU_NOTA_LC 0\n", | |
| "NU_NOTA_REDACAO 0\n", | |
| "NU_NOTA_MT 0\n", | |
| "dtype: int64" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dados = dados.dropna()\n", | |
| "dados.isnull().sum()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(10097, 15)" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dados.shape" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "matematica = pd.DataFrame()\n", | |
| "matematica['NU_NOTA_MT'] = dados['NU_NOTA_MT']\n", | |
| "dados = dados.drop('NU_NOTA_MT', axis=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "dados = pd.get_dummies(dados, drop_first = True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(10097, 34)" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dados.shape" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "NU_NOTA_MT 0\n", | |
| "dtype: int64" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "matematica.isnull().sum()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "dadosValues = dados.values\n", | |
| "matematicaValues = matematica.values" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "dados_treino = dadosValues[:822, :]\n", | |
| "dados_teste = dadosValues[822: , :]\n", | |
| "matematica_treino = matematicaValues[ :822]\n", | |
| "matematica_teste = matematicaValues[ 822: ]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "modelo = linear_model.LinearRegression()\n", | |
| "modelo.fit(dados_treino, matematica_treino)\n", | |
| "predicao = modelo.predict(dados_teste)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.5" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment