Last active
January 8, 2019 00:56
-
-
Save sremy/1e3a82ea287c9877980c3cd965d81d95 to your computer and use it in GitHub Desktop.
Multiple Linear Regression
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Multiple Linear Regression | |
| # https://www.superdatascience.com/machine-learning/ | |
| # Importing the libraries | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import pandas as pd | |
# Importing the dataset
# All columns except the last are predictors; column 4 is the target.
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

# Encoding categorical data
# OneHotEncoder(categorical_features=...) was removed from scikit-learn (0.22);
# ColumnTransformer is the supported way to one-hot encode a single column.
# The encoder accepts the raw category values directly, so the former
# LabelEncoder pass over column 3 is no longer required.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    transformers=[('categorical', onehotencoder, [3])],
    remainder='passthrough',  # keep the other columns, placed after the dummies
    sparse_threshold=0,       # always return a dense array
)
# The predictors come out of the DataFrame as a mixed-type array, so cast the
# stacked result to float before it is handed to the numeric models.
X = np.asarray(column_transformer.fit_transform(X), dtype=float)

# Avoiding the Dummy Variable Trap
# Drop the first dummy column so the remaining dummies are not collinear
# with an intercept.
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
# sklearn.cross_validation was removed (0.20); train_test_split now lives in
# sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Feature Scaling
# Left disabled: the snippet is kept inside a bare string literal, so it is
# never executed.
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and training chain.
regressor = LinearRegression().fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)
# Building the optimal model using Backward Elimination
# The array-based OLS class is exposed by statsmodels.api; current releases of
# statsmodels.formula.api only provide the lowercase formula interface.
import statsmodels.api as sm

# Prepend a column of ones so OLS estimates an intercept. The row count is
# taken from X instead of being hard-coded to 50.
X = np.append(arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1)

# Step 1: fit on every column (0 is the intercept).
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

# Step 2: refit without column 2.
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

# Step 3: refit without column 1.
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

# Step 4: refit without column 4.
X_opt = X[:, [0, 3, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

# Step 5: refit without column 5, leaving the intercept and column 3.
X_opt = X[:, [0, 3]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Multiple Linear Regression | |
| # https://www.superdatascience.com/machine-learning/ | |
| # Importing the libraries | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import pandas as pd | |
# Importing the dataset
# The first four columns are predictors; the last column is the target.
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, 0: 4].values
y = dataset.iloc[:, -1].values

# Encoding categorical data
# OneHotEncoder(categorical_features=...) was removed from scikit-learn (0.22);
# ColumnTransformer is the supported way to one-hot encode a single column.
# The encoder accepts the raw category values directly, so the former
# LabelEncoder pass over column 3 is no longer required.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    transformers=[('categorical', onehotencoder, [3])],
    remainder='passthrough',  # keep the other columns, placed after the dummies
    sparse_threshold=0,       # always return a dense array
)
# The predictors come out of the DataFrame as a mixed-type array, so cast the
# stacked result to float before it is handed to the numeric models.
X = np.asarray(column_transformer.fit_transform(X), dtype=float)

# Avoiding the Dummy Variable Trap
# Drop the first dummy column so the remaining dummies are not collinear
# with an intercept.
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
# sklearn.cross_validation was removed (0.20); train_test_split now lives in
# sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Feature Scaling
# Left disabled: the snippet is kept inside a bare string literal, so it is
# never executed.
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and training chain.
regressor = LinearRegression().fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)
# Building the optimal model
# The array-based OLS class is exposed by statsmodels.api; current releases of
# statsmodels.formula.api only provide the lowercase formula interface.
import statsmodels.api as sm

# Prepend a column of ones so OLS estimates an intercept. The row count is
# taken from X instead of being hard-coded to 50.
X = np.append(arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1)
def backwardElimination_WithPValues(x, signifiance_level):
    """Remove predictors one at a time until no p-value exceeds the threshold.

    An OLS model of the module-level target ``y`` on ``x`` is refitted after
    every removal, and the column with the largest p-value is dropped as long
    as that p-value is above ``signifiance_level``.

    Args:
        x: 2-D array of predictors, one column per predictor.
        signifiance_level: p-value threshold; a predictor whose p-value is
            above it is a candidate for removal.

    Returns:
        The predictor array restricted to the columns that were kept.
    """
    # OLS is exposed by statsmodels.api, not statsmodels.formula.api.
    import statsmodels.api as sm

    numVars = len(x[0])
    # At most one column can be removed per fit, so numVars passes suffice.
    for _ in range(numVars):
        regressor_OLS = sm.OLS(y, x).fit()
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))
        if pvalues[worst] <= signifiance_level:
            # Every remaining predictor is significant; refitting again
            # would give the same result.
            break
        # Delete exactly one column per fit: after a deletion the fitted
        # p-values no longer line up with the columns of x.
        x = np.delete(x, worst, 1)
    return x
def backwardElimination_WithPValuesAndRSqared(x, SL):
    """Backward elimination guarded by adjusted R-squared.

    An OLS model of the module-level target ``y`` on ``x`` is refitted after
    every removal. The column with the largest p-value is dropped while that
    p-value is above ``SL``, unless dropping it fails to raise the adjusted
    R-squared; in that case the summary of the model before the removal is
    printed and the predictors are returned as they were before it.

    Args:
        x: 2-D array of predictors, one column per predictor.
        SL: p-value threshold; a predictor whose p-value is above it is a
            candidate for removal.

    Returns:
        The predictor array restricted to the columns that were kept.
    """
    # OLS is exposed by statsmodels.api, not statsmodels.formula.api.
    import statsmodels.api as sm

    numVars = len(x[0])
    # At most one column can be removed per fit, so numVars passes suffice.
    for _ in range(numVars):
        regressor_OLS = sm.OLS(y, x).fit()
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))
        if pvalues[worst] <= SL:
            # Every remaining predictor is significant.
            break

        adjR_before = float(regressor_OLS.rsquared_adj)
        # Build the reduced matrix separately so x itself is the rollback
        # state. This replaces a fixed (50, 6) integer scratch buffer, which
        # truncated float data and did not restore the original columns.
        reduced = np.delete(x, worst, 1)
        adjR_after = float(sm.OLS(y, reduced).fit().rsquared_adj)
        if adjR_before >= adjR_after:
            # The removal did not improve the fit: keep the previous model.
            print(regressor_OLS.summary())
            return x
        x = reduced
    return x
# Automatic backward elimination at the 5% significance level.
SL = 0.05
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
X_Modeled = backwardElimination_WithPValues(X_opt, SL)

# Manual backward elimination: refit OLS on successively smaller column
# subsets (column 0 is the intercept) and build each model's summary.
for _columns in (
    [0, 1, 2, 3, 4, 5],
    [0, 1, 3, 4, 5],
    [0, 3, 5],
    [0, 3],
):
    X_reduced = X[:, _columns]
    regressor_OLS = sm.OLS(endog=y, exog=X_reduced).fit()
    regressor_OLS.summary()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment