library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(nnet)
df <- read.csv("C:/Users/frann/Downloads/Semester 5/Supervised Learning/Project/employee_promotion.csv")
df[df==""]<-NA #make blank values NA first in order to remove them
sum(is.na(df)) #There are 9093 missing values
## [1] 9093
c<-colSums(is.na(df))
View(c)
df<-na.omit(df) #removes all NA values
#Binary logistic regression
df<- select(df, -employee_id)
#split dataset into training and test
set.seed(42)
df_train<-df%>%sample_frac(.8)
df_test<-df%>%setdiff(df_train)
df_model<-glm(is_promoted~., family=binomial("logit"), data=df)
summary(df_model)
##
## Call:
## glm(formula = is_promoted ~ ., family = binomial("logit"), data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7319 -0.3945 -0.2706 -0.1760 3.4420
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -24.987976 0.483662 -51.664 < 2e-16 ***
## departmentFinance 5.886840 0.162595 36.205 < 2e-16 ***
## departmentHR 7.995435 0.210487 37.985 < 2e-16 ***
## departmentLegal 5.461325 0.231412 23.600 < 2e-16 ***
## departmentOperations 6.078879 0.138917 43.759 < 2e-16 ***
## departmentProcurement 3.716943 0.104449 35.586 < 2e-16 ***
## departmentR&D -0.576220 0.152930 -3.768 0.000165 ***
## departmentSales & Marketing 8.476310 0.182056 46.559 < 2e-16 ***
## departmentTechnology 1.481909 0.076979 19.251 < 2e-16 ***
## regionregion_10 -0.033918 0.244489 -0.139 0.889665
## regionregion_11 -0.348618 0.226556 -1.539 0.123860
## regionregion_12 -0.793024 0.345762 -2.294 0.021816 *
## regionregion_13 0.045602 0.193507 0.236 0.813697
## regionregion_14 -0.082375 0.234916 -0.351 0.725844
## regionregion_15 0.029014 0.194344 0.149 0.881322
## regionregion_16 -0.178835 0.216588 -0.826 0.408980
## regionregion_17 0.475867 0.217754 2.185 0.028864 *
## regionregion_18 -10.251740 106.447508 -0.096 0.923276
## regionregion_19 -0.143300 0.238804 -0.600 0.548457
## regionregion_2 0.202512 0.179665 1.127 0.259674
## regionregion_20 -0.368833 0.250970 -1.470 0.141662
## regionregion_21 -0.286065 0.322243 -0.888 0.374686
## regionregion_22 0.505042 0.180343 2.800 0.005103 **
## regionregion_23 0.369730 0.205290 1.801 0.071701 .
## regionregion_24 -0.388005 0.310057 -1.251 0.210789
## regionregion_25 0.602300 0.214190 2.812 0.004924 **
## regionregion_26 -0.094985 0.201513 -0.471 0.637383
## regionregion_27 0.071358 0.207352 0.344 0.730740
## regionregion_28 0.307260 0.204428 1.503 0.132833
## regionregion_29 -0.490291 0.251071 -1.953 0.050843 .
## regionregion_3 0.266655 0.279978 0.952 0.340886
## regionregion_30 -0.038334 0.253940 -0.151 0.880010
## regionregion_31 -0.269143 0.210604 -1.278 0.201264
## regionregion_32 -0.519077 0.261258 -1.987 0.046940 *
## regionregion_33 -0.485774 0.375519 -1.294 0.195801
## regionregion_34 -1.004135 0.465531 -2.157 0.031008 *
## regionregion_4 0.668660 0.192678 3.470 0.000520 ***
## regionregion_5 -0.569068 0.272437 -2.089 0.036725 *
## regionregion_6 -0.516632 0.290743 -1.777 0.075579 .
## regionregion_7 0.380528 0.182466 2.085 0.037027 *
## regionregion_8 0.065448 0.244454 0.268 0.788905
## regionregion_9 -1.086523 0.428900 -2.533 0.011300 *
## educationBelow Secondary -0.364018 0.207072 -1.758 0.078759 .
## educationMaster's & above 0.192801 0.045253 4.261 2.04e-05 ***
## genderm 0.010760 0.043613 0.247 0.805130
## recruitment_channelreferred -0.097063 0.118333 -0.820 0.412074
## recruitment_channelsourcing 0.025679 0.039140 0.656 0.511780
## no_of_trainings -0.183348 0.037004 -4.955 7.24e-07 ***
## age -0.029807 0.003849 -7.745 9.56e-15 ***
## previous_year_rating 0.500918 0.017853 28.059 < 2e-16 ***
## length_of_service 0.008413 0.006162 1.365 0.172157
## awards_won 1.812311 0.086665 20.912 < 2e-16 ***
## avg_training_score 0.254306 0.004954 51.330 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27583 on 46379 degrees of freedom
## Residual deviance: 20172 on 46327 degrees of freedom
## AIC: 20278
##
## Number of Fisher Scoring iterations: 12
#Train accuracy
prob_in_df_train<-predict(df_model, type="response", newdata=df_train)
pred_in_df_train<-ifelse(prob_in_df_train>.5, 1,0)
#Train confusion table
confusion_in_df_train<-table(prediction=pred_in_df_train, actual=df_train$is_promoted)
confusion_in_df_train
## actual
## prediction 0 1
## 0 33783 2258
## 1 62 1001
#33783 employees were predicted correctly as not recommended for promotion. 62 were predicted incorrectly as recommended for promotion when they actually were not. 1001 employees were predicted correctly as recommended for promotion. 2258 were predicted incorrectly as not recommended for promotion when they actually were recommended.
accuracy_in_df_train<- sum(diag(confusion_in_df_train)/sum(confusion_in_df_train))
accuracy_in_df_train
## [1] 0.937473
#93.75% accuracy
#Test accuracy
prob_in_df_test<-predict(df_model, type="response", newdata=df_test)
pred_in_df_test<-ifelse(prob_in_df_test>.5, 1,0)
#Test confusion table
confusion_in_df_test<-table(prediction=pred_in_df_test, actual=df_test$is_promoted)
confusion_in_df_test
## actual
## prediction 0 1
## 0 8416 570
## 1 10 242
#8416 employees were predicted correctly as not recommended for promotion. 10 were predicted incorrectly as recommended for promotion when they actually were not. 242 employees were predicted correctly as recommended for promotion. 570 were predicted incorrectly as not recommended for promotion when they actually were recommended.
accuracy_in_df_test<- sum(diag(confusion_in_df_test)/sum(confusion_in_df_test))
accuracy_in_df_test
## [1] 0.9372158
#93.72% accuracy, so the training set is only a tiny bit more accurate. Algorithm learns very well. Test and training accuracy are on the same scale so there is no overfitting problem.
s<- ifelse(pred_in_df_test>.5, 1,0)
s<- factor(pred_in_df_test, levels=c(1,0), labels= c("Yes", "No"))
v<- factor(df_test$is_promoted, levels=c(1,0), labels= c("Yes", "No"))
confusionMatrix(s,v)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 242 10
## No 570 8416
##
## Accuracy : 0.9372
## 95% CI : (0.9321, 0.9421)
## No Information Rate : 0.9121
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4312
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.29803
## Specificity : 0.99881
## Pos Pred Value : 0.96032
## Neg Pred Value : 0.93657
## Prevalence : 0.08790
## Detection Rate : 0.02620
## Detection Prevalence : 0.02728
## Balanced Accuracy : 0.64842
##
## 'Positive' Class : Yes
##
#Kappa value is .4312
#In general kappas > 0.75 are excellent, 0.40-0.75 are fair to good. For this classification model, the kappa is .4372 so this is fair. Even though this is not excellent, it is not the only factor that plays a role into determining if a model if good to use.
#The test p-value is very small, which means the model is significant.
#Overall this is an acceptable and good prediction model