Created
October 20, 2017 13:59
-
-
Save vrajesh26/6dccca983aeb2414696f7a6cb357055a to your computer and use it in GitHub Desktop.
To predict which individuals might leave an organisation based on patterns and use key variables that influence churn based on IBM HR Analytics employee attrition data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| employee<-read.csv("D:/WA_Fn-UseC_-HR-Employee-Attrition.csv") | |
| View(employee) | |
| str(employee) | |
| dim(employee) | |
| colnames(employee)[1]="Age" | |
| library("caTools") | |
| set.seed(12345) | |
| emp <- sample.split(employee$Attrition,SplitRatio = 0.75) | |
| emp_train <- subset(employee,emp==TRUE) | |
| emp_test <- subset(employee,emp==FALSE) | |
| View(emp_train) | |
| summary(is.na(emp_train)) | |
| colSums(is.na(employee)) | |
| library("ggplot2") | |
| ggplot(data=emp_train,mapping=aes(Attrition,fill=Attrition))+geom_bar() | |
| prop.table(table(emp_train$Attrition)) | |
| ggplot(data=emp_train,mapping=aes(x=Age,fill=Attrition))+geom_histogram() | |
| ggplot(data=emp_train,mapping=aes(BusinessTravel,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(x=DailyRate,fill=Attrition))+geom_histogram() | |
| ggplot(data=emp_train,mapping=aes(Department,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(x=DistanceFromHome,fill=Attrition))+geom_histogram() | |
| ggplot(data=emp_train,mapping=aes(Education,fill=Attrition))+geom_bar() | |
| tt<-table(Train1$Attrition,Train1$BusinessTravel) | |
| chisq.test(tt) | |
| ggplot(data=emp_train,mapping=aes(EducationField,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(Gender,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(EnvironmentSatisfaction,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(HourlyRate,fill=Attrition))+geom_histogram() | |
| ggplot(data=emp_train,mapping=aes(JobInvolvement,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(JobLevel,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(JobRole,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(JobSatisfaction,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(MaritalStatus,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(MonthlyIncome,fill=Attrition))+geom_histogram() | |
| ggplot(data=emp_train,mapping=aes(MonthlyRate,fill=Attrition))+geom_histogram() | |
| ggplot(data=emp_train,mapping=aes(NumCompaniesWorked,fill=Attrition))+geom_histogram() | |
| ggplot(data=emp_train,mapping=aes(OverTime,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(PercentSalaryHike,fill=Attrition))+geom_histogram() | |
| ggplot(data=emp_train,mapping=aes(PerformanceRating,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(RelationshipSatisfaction,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(StockOptionLevel,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(TotalWorkingYears,fill=Attrition))+geom_histogram() | |
| ggplot(data=emp_train,mapping=aes(TrainingTimesLastYear,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(WorkLifeBalance,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(YearsAtCompany,fill=Attrition))+geom_histogram() | |
| ggplot(data=emp_train,mapping=aes(YearsInCurrentRole,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(YearsSinceLastPromotion,fill=Attrition))+geom_bar() | |
| ggplot(data=emp_train,mapping=aes(YearsWithCurrManager,fill=Attrition))+geom_bar() | |
| summary(emp_train$DistanceFromHome) | |
| library(ROSE) | |
| abcd <- ROSE(Attrition ~ ., data = emp_train, seed = 1,p=0.45)$data | |
| table(abcd$Attrition) | |
| View(emp_train) | |
| as.integer(emp_train1$Age) | |
| #Feature Engg | |
| emp_train$TenurePerJob<-ifelse(emp_train$NumCompaniesWorked!=0, emp_train$TotalWorkingYears/emp_train$NumCompaniesWorked,0) | |
| emp_train$YearWithoutChange <- emp_train$YearsInCurrentRole - emp_train$YearsSinceLastPromotion | |
| emp_train$YearsWithoutChange2 <- emp_train$TotalWorkingYears - emp_train$YearsSinceLastPromotion | |
| ggplot(data=emp_train,mapping=aes(TenurePerJob,fill=Attrition))+geom_histogram() | |
| ggplot(data=emp_train,mapping=aes(YearWithoutChange,fill=Attrition))+geom_histogram() | |
| ggplot(data=emp_train,mapping=aes(YearsWithoutChange2,fill=Attrition))+geom_histogram() | |
| Med_HR <- median(emp_train[emp_train$Department == 'Human Resources',]$MonthlyIncome) | |
| Med_RnD <- median(emp_train[emp_train$Department == 'Research & Development',]$MonthlyIncome) | |
| Med_Sales <- median(emp_train[emp_train$Department == 'Sales',]$MonthlyIncome) | |
| emp_train$CompaRatioDep <- ifelse(emp_train$Department == 'Human Resources', | |
| emp_train$MonthlyIncome/Med_HR, | |
| ifelse(emp_train$Department=='Research & Development',emp_train$MonthlyIncome/Med_RnD,emp_train$MonthlyIncome/Med_Sales)) | |
| ggplot(data=emp_train,mapping=aes(CompaRatioDep,fill=Attrition))+geom_histogram() | |
| ##### | |
| emp_test$TenurePerJob<-ifelse(emp_test$NumCompaniesWorked!=0, emp_test$TotalWorkingYears/emp_test$NumCompaniesWorked,0) | |
| emp_test$YearWithoutChange <- emp_test$YearsInCurrentRole - emp_test$YearsSinceLastPromotion | |
| emp_test$YearsWithoutChange2 <- emp_test$TotalWorkingYears - emp_test$YearsSinceLastPromotion | |
| emp_test$CompaRatioDep <- ifelse(emp_test$Department == 'Human Resources',emp_test$MonthlyIncome/Med_HR,ifelse(emp_test$Department=='Research & Development',emp_test$MonthlyIncome/Med_RnD,emp_test$MonthlyIncome/Med_Sales)) | |
| #Binning | |
| emp_train$AgeGroup <- with(emp_train,ifelse(Age>55,8,ifelse(Age>50,7,ifelse(Age>45,6,ifelse(Age>40,5,ifelse(Age>35,4,ifelse(Age>30,3,ifelse(Age>25,2,1)))))))) | |
| emp_train$DistanceGroup <- with(emp_train,ifelse(DistanceFromHome>25,6,ifelse(DistanceFromHome>20,5,ifelse(DistanceFromHome>15,4,ifelse(DistanceFromHome>10,3,ifelse(DistanceFromHome>5,2,1)))))) | |
| emp_train$YearsWithManagerGroup <- with(emp_train,ifelse(YearsWithCurrManager>15,5,ifelse(YearsWithCurrManager>10,4,ifelse(YearsWithCurrManager>5,3,ifelse(YearsWithCurrManager>2,2,1))))) | |
| #emp_train$YearsWithManagerGroup <- with(emp_train,ifelse(YearsWithCurrManager>15,5,ifelse(YearsWithCurrManager>10,4,ifelse(YearsWithCurrManager>5,3,ifelse(YearsWithCurrManager>2,2,1))))) | |
| emp_train$TenureGroup <- with(emp_train,ifelse(TenurePerJob>35,9,ifelse(TenurePerJob>30,8,ifelse(TenurePerJob>25,7,ifelse(TenurePerJob>20,6,ifelse(TenurePerJob>15,5,ifelse(TenurePerJob>10,4,ifelse(TenurePerJob>5,3,ifelse(TenurePerJob>2,2,1))))))))) | |
| emp_train$Change2Group <- with(emp_train,ifelse(YearsWithoutChange2>10,3,ifelse(YearsWithoutChange2>5,2,1))) | |
| emp_train$Change1Group <- with(emp_train,ifelse(YearWithoutChange>2.5,3,ifelse(YearWithoutChange>-2.5,2,1))) | |
| emp_train$WorkYearGroup <- with(emp_train,ifelse(TotalWorkingYears>35,9,ifelse(TotalWorkingYears>30,8,ifelse(TotalWorkingYears>25,7,ifelse(TotalWorkingYears>20,6,ifelse(TotalWorkingYears>15,5,ifelse(TotalWorkingYears>10,4,ifelse(TotalWorkingYears>5,3,ifelse(TotalWorkingYears>2,2,1))))))))) | |
| emp_train$NumCompGroup <- with(emp_train,ifelse(NumCompaniesWorked>4,3,ifelse(NumCompaniesWorked>2,2,1))) | |
| #Testing | |
| #emp_test$tenureperjob<-ifelse(emp_test$NumCompaniesWorked!=0, emp_test$TotalWorkingYears/emp_test$NumCompaniesWorked,0) | |
| #emp_test$YearWithoutChange <- emp_test$YearsInCurrentRole - emp_test$YearsSinceLastPromotion | |
| #emp_test$YearsWithoutChange2 <- emp_test$TotalWorkingYears - emp_test$YearsSinceLastPromotion | |
| emp_test$AgeGroup <- with(emp_test,ifelse(Age>55,8,ifelse(Age>50,7,ifelse(Age>45,6,ifelse(Age>40,5,ifelse(Age>35,4,ifelse(Age>30,3,ifelse(Age>25,2,1)))))))) | |
| emp_test$DistanceGroup <- with(emp_test,ifelse(DistanceFromHome>25,6,ifelse(DistanceFromHome>20,5,ifelse(DistanceFromHome>15,4,ifelse(DistanceFromHome>10,3,ifelse(DistanceFromHome>5,2,1)))))) | |
| emp_test$YearsWithManagerGroup <- with(emp_test,ifelse(YearsWithCurrManager>15,5,ifelse(YearsWithCurrManager>10,4,ifelse(YearsWithCurrManager>5,3,ifelse(YearsWithCurrManager>2,2,1))))) | |
| #emp_test$YearsWithManagerGroup <- with(emp_test,ifelse(YearsWithCurrManager>15,5,ifelse(YearsWithCurrManager>10,4,ifelse(YearsWithCurrManager>5,3,ifelse(YearsWithCurrManager>2,2,1))))) | |
| emp_test$TenureGroup <- with(emp_test,ifelse(TenurePerJob>35,9,ifelse(TenurePerJob>30,8,ifelse(TenurePerJob>25,7,ifelse(TenurePerJob>20,6,ifelse(TenurePerJob>15,5,ifelse(TenurePerJob>10,4,ifelse(TenurePerJob>5,3,ifelse(TenurePerJob>2,2,1))))))))) | |
| emp_test$Change2Group <- with(emp_test,ifelse(YearsWithoutChange2>10,3,ifelse(YearsWithoutChange2>5,2,1))) | |
| emp_test$Change1Group <- with(emp_test,ifelse(YearWithoutChange>2.5,3,ifelse(YearWithoutChange>-2.5,2,1))) | |
| emp_test$WorkYearGroup <- with(emp_test,ifelse(TotalWorkingYears>35,9,ifelse(TotalWorkingYears>30,8,ifelse(TotalWorkingYears>25,7,ifelse(TotalWorkingYears>20,6,ifelse(TotalWorkingYears>15,5,ifelse(TotalWorkingYears>10,4,ifelse(TotalWorkingYears>5,3,ifelse(TotalWorkingYears>2,2,1))))))))) | |
| emp_test$NumCompGroup <- with(emp_test,ifelse(NumCompaniesWorked>4,3,ifelse(NumCompaniesWorked>2,2,1))) | |
| colnames(emp_train) | |
| colnames(emp_test) | |
| ## | |
| install.packages("corrplot") | |
| library(corrplot) | |
| library(psych) | |
| str(emp_train) | |
| for(i in 1:ncol(emp_train)){ | |
| emp_train[,i]<- as.integer(emp_train[,i]) | |
| } | |
| corrplot(cor(emp_train)) | |
| ## | |
| #Correlation | |
| cor(emp_train[,c(1,4,6,7,10,11,13,14,15,17,19,20,21,24,25,26,28:38)]) | |
| Train <- emp_train[,c(3,5,7,8,12,14,15,16,17,18,21,23,24,26,28,29,30,31,40:47,2)] | |
| Test=emp_test[,c(2,3,5,7,8,12,14,15,16,17,18,21,23,24,26,28,29,30,31,40:47)] | |
| #Test=Test[,-1] | |
| Train$BusinessTravel <- as.integer(Train$BusinessTravel) | |
| Train$Department <- as.integer(Train$Department) | |
| Train$Gender <- as.integer(Train$Gender) | |
| Train$MaritalStatus <- as.integer(Train$MaritalStatus) | |
| Train$OverTime <- as.integer(Train$OverTime) | |
| Train$JobRole <- as.integer(Train$JobRole) | |
| Train$EducationField <- as.integer(Train$EducationField) | |
| Test$BusinessTravel <- as.integer(Test$BusinessTravel) | |
| Test$Department <- as.integer(Test$Department) | |
| Test$Gender <- as.integer(Test$Gender) | |
| Test$MaritalStatus <- as.integer(Test$MaritalStatus) | |
| Test$OverTime <- as.integer(Test$OverTime) | |
| Test$JobRole <- as.integer(Test$JobRole) | |
| Test$EducationField <- as.integer(Test$EducationField) | |
| Train1 <- Train | |
| Test1<-Test | |
| Test1=Test1[,-1] | |
| for(i in 1:ncol(Train1)){ | |
| Train1[,i] <- as.factor(Train1[,i]) | |
| } | |
| for(i in 1:ncol(Test1)){ | |
| Test1[,i] <- as.factor(Test1[,i]) | |
| } | |
| fit_rpart <- train(Attrition ~.,Train1,method = 'rpart', trControl = trainControl(method = 'cv',number = 3)) | |
| predict(fit_rpart,newdata=Test1) | |
| set.seed(123) | |
| fit_rf <- train(Attrition ~.,Train1,method = 'rf', trControl = trainControl(method = 'repeatedcv',number = 3)) | |
| pred_rf<-predict(fit_rf,newdata=Test1) | |
| library("xgboost") | |
| xgbGrid <- expand.grid(nrounds = 300, | |
| max_depth = 1, | |
| eta = 0.3, | |
| gamma = 0.01, | |
| colsample_bytree = .7, | |
| min_child_weight = 1, | |
| subsample = 0.9) | |
| set.seed(12) | |
| fit_xgb <- train(Attrition ~.,Train1,method = 'xgbTree',tuneGrid = xgbGrid,trControl = trainControl(method = 'repeatedcv',number = 3,classProbs = TRUE)) | |
| pred_xgb<-predict(fit_xgb,newdata=Test1) | |
| confusionMatrix(pred_rf,Test$Attrition) | |
| confusionMatrix(pred_xgb,Test$Attrition) | |
| varImp(fit_rf) | |
| awb<-data.frame(Emp_No=emp_test$EmployeeNumber,Employee_churn=pred_xgb) | |
| View(awb) | |
| write.csv(awb,file = "Employee churn.csv") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment