library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(nnet)
df <- read.csv("C:/Users/frann/Downloads/Semester 5/Supervised Learning/Project/employee_promotion.csv")
df[df==""]<-NA #make blank values NA first in order to remove them
sum(is.na(df)) #There are 9093 missing values
## [1] 9093
c<-colSums(is.na(df))
View(c) 
df<-na.omit(df) #removes all NA values
#Binary logistic regression
df<- select(df, -employee_id)

#split dataset into training and test
set.seed(42)

df_train<-df%>%sample_frac(.8)
df_test<-df%>%setdiff(df_train)

df_model<-glm(is_promoted~., family=binomial("logit"), data=df)
summary(df_model)
## 
## Call:
## glm(formula = is_promoted ~ ., family = binomial("logit"), data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7319  -0.3945  -0.2706  -0.1760   3.4420  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -24.987976   0.483662 -51.664  < 2e-16 ***
## departmentFinance             5.886840   0.162595  36.205  < 2e-16 ***
## departmentHR                  7.995435   0.210487  37.985  < 2e-16 ***
## departmentLegal               5.461325   0.231412  23.600  < 2e-16 ***
## departmentOperations          6.078879   0.138917  43.759  < 2e-16 ***
## departmentProcurement         3.716943   0.104449  35.586  < 2e-16 ***
## departmentR&D                -0.576220   0.152930  -3.768 0.000165 ***
## departmentSales & Marketing   8.476310   0.182056  46.559  < 2e-16 ***
## departmentTechnology          1.481909   0.076979  19.251  < 2e-16 ***
## regionregion_10              -0.033918   0.244489  -0.139 0.889665    
## regionregion_11              -0.348618   0.226556  -1.539 0.123860    
## regionregion_12              -0.793024   0.345762  -2.294 0.021816 *  
## regionregion_13               0.045602   0.193507   0.236 0.813697    
## regionregion_14              -0.082375   0.234916  -0.351 0.725844    
## regionregion_15               0.029014   0.194344   0.149 0.881322    
## regionregion_16              -0.178835   0.216588  -0.826 0.408980    
## regionregion_17               0.475867   0.217754   2.185 0.028864 *  
## regionregion_18             -10.251740 106.447508  -0.096 0.923276    
## regionregion_19              -0.143300   0.238804  -0.600 0.548457    
## regionregion_2                0.202512   0.179665   1.127 0.259674    
## regionregion_20              -0.368833   0.250970  -1.470 0.141662    
## regionregion_21              -0.286065   0.322243  -0.888 0.374686    
## regionregion_22               0.505042   0.180343   2.800 0.005103 ** 
## regionregion_23               0.369730   0.205290   1.801 0.071701 .  
## regionregion_24              -0.388005   0.310057  -1.251 0.210789    
## regionregion_25               0.602300   0.214190   2.812 0.004924 ** 
## regionregion_26              -0.094985   0.201513  -0.471 0.637383    
## regionregion_27               0.071358   0.207352   0.344 0.730740    
## regionregion_28               0.307260   0.204428   1.503 0.132833    
## regionregion_29              -0.490291   0.251071  -1.953 0.050843 .  
## regionregion_3                0.266655   0.279978   0.952 0.340886    
## regionregion_30              -0.038334   0.253940  -0.151 0.880010    
## regionregion_31              -0.269143   0.210604  -1.278 0.201264    
## regionregion_32              -0.519077   0.261258  -1.987 0.046940 *  
## regionregion_33              -0.485774   0.375519  -1.294 0.195801    
## regionregion_34              -1.004135   0.465531  -2.157 0.031008 *  
## regionregion_4                0.668660   0.192678   3.470 0.000520 ***
## regionregion_5               -0.569068   0.272437  -2.089 0.036725 *  
## regionregion_6               -0.516632   0.290743  -1.777 0.075579 .  
## regionregion_7                0.380528   0.182466   2.085 0.037027 *  
## regionregion_8                0.065448   0.244454   0.268 0.788905    
## regionregion_9               -1.086523   0.428900  -2.533 0.011300 *  
## educationBelow Secondary     -0.364018   0.207072  -1.758 0.078759 .  
## educationMaster's & above     0.192801   0.045253   4.261 2.04e-05 ***
## genderm                       0.010760   0.043613   0.247 0.805130    
## recruitment_channelreferred  -0.097063   0.118333  -0.820 0.412074    
## recruitment_channelsourcing   0.025679   0.039140   0.656 0.511780    
## no_of_trainings              -0.183348   0.037004  -4.955 7.24e-07 ***
## age                          -0.029807   0.003849  -7.745 9.56e-15 ***
## previous_year_rating          0.500918   0.017853  28.059  < 2e-16 ***
## length_of_service             0.008413   0.006162   1.365 0.172157    
## awards_won                    1.812311   0.086665  20.912  < 2e-16 ***
## avg_training_score            0.254306   0.004954  51.330  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27583  on 46379  degrees of freedom
## Residual deviance: 20172  on 46327  degrees of freedom
## AIC: 20278
## 
## Number of Fisher Scoring iterations: 12
#Train accuracy
prob_in_df_train<-predict(df_model, type="response", newdata=df_train)
pred_in_df_train<-ifelse(prob_in_df_train>.5, 1,0)

#Train confusion table
confusion_in_df_train<-table(prediction=pred_in_df_train, actual=df_train$is_promoted)
confusion_in_df_train
##           actual
## prediction     0     1
##          0 33783  2258
##          1    62  1001
#33783 employees were predicted correctly as not recommended for promotion. 62 were predicted incorrectly as recommended for promotion when they actually were not. 1001 employees were predicted correctly as recommended for promotion. 2258 were predicted incorrectly as not recommended for promotion when they actually were recommended.

accuracy_in_df_train<- sum(diag(confusion_in_df_train)/sum(confusion_in_df_train))
accuracy_in_df_train
## [1] 0.937473
#93.75% accuracy


#Test accuracy
prob_in_df_test<-predict(df_model, type="response", newdata=df_test)
pred_in_df_test<-ifelse(prob_in_df_test>.5, 1,0)

#Test confusion table
confusion_in_df_test<-table(prediction=pred_in_df_test, actual=df_test$is_promoted)
confusion_in_df_test
##           actual
## prediction    0    1
##          0 8416  570
##          1   10  242
#8416 employees were predicted correctly as not recommended for promotion. 10 were predicted incorrectly as recommended for promotion when they actually were not. 242 employees were predicted correctly as recommended for promotion. 570 were predicted incorrectly as not recommended for promotion when they actually were recommended.

accuracy_in_df_test<- sum(diag(confusion_in_df_test)/sum(confusion_in_df_test))
accuracy_in_df_test
## [1] 0.9372158
#93.72% accuracy, so the training set is only a tiny bit more accurate. Algorithm learns very well. Test and training accuracy are on the same scale so there is no overfitting problem. 

s<- ifelse(pred_in_df_test>.5, 1,0)
s<- factor(pred_in_df_test, levels=c(1,0), labels= c("Yes", "No"))
v<- factor(df_test$is_promoted, levels=c(1,0), labels= c("Yes", "No"))
confusionMatrix(s,v) 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  Yes   No
##        Yes  242   10
##        No   570 8416
##                                           
##                Accuracy : 0.9372          
##                  95% CI : (0.9321, 0.9421)
##     No Information Rate : 0.9121          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4312          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.29803         
##             Specificity : 0.99881         
##          Pos Pred Value : 0.96032         
##          Neg Pred Value : 0.93657         
##              Prevalence : 0.08790         
##          Detection Rate : 0.02620         
##    Detection Prevalence : 0.02728         
##       Balanced Accuracy : 0.64842         
##                                           
##        'Positive' Class : Yes             
## 
#Kappa value is .4312
#In general kappas > 0.75 are excellent, 0.40-0.75 are fair to good. For this classification model, the kappa is .4372 so this is fair. Even though this is not excellent, it is not the only factor that plays a role into determining if a model if good to use.
#The test p-value is very small, which means the model is significant.
#Overall this is an acceptable and good prediction model