Exploratory Data Analysis (EDA) from raw data
Data preprocessing to generate clean dataset
Interpreting the factors that affect employees' decisions
Predict the probability that a candidate will look for a new job rather than work for the company
This dataset was collected from Kaggle. A company that runs Big Data courses would like to identify which candidates will work for the company after training and which will look for new employment in the Data Science field. This analysis can help the company save cost and time when planning training and categorizing candidates.
Tan Wee Kiat (S2033309) - Introduction and objective 4
Most Sarmin Sultana (S2035295) - Objective 2
Bhavya Sree Geda (S2020579) - Objective 1
Joycelyn Teo Sze Ling (17035423) - Objective 3
library(stats)
library(tidyverse)
library(dplyr)
library(stringr)
library(mice)
library(VIM)
library(ggplot2)
library(caret)
library(ggalluvial)
library(ggfittext)
library(janitor)
library(naniar)
library(scales)
library(randomForest)
library(missForest)
| Feature | Description |
|---|---|
| enrollee_id | Unique ID for candidate |
| city | City code |
| city_development_index | Development index of the city (scaled) |
| gender | Gender of candidate |
| relevent_experience | Relevant experience of candidate |
| enrolled_university | Type of University course enrolled if any |
| education_level | Education level of candidate |
| major_discipline | Education major discipline of candidate |
| experience | Candidate total experience in years |
| company_size | No of employees in current employer’s company |
| company_type | Type of current employer |
| last_new_job | Difference in years between previous job and current job |
| training_hours | training hours completed |
| target | 0 – Not looking for job change 1 – Looking for a job change |
#Exploring Data
#Importing Data
# Load the training set. Blank CSV fields are parsed as NA via `na.strings`
# instead of piping the whole data frame through `na_if("")`, which
# dplyr >= 1.1.0 no longer accepts for a data frame.
train_df <- read.csv("Data/aug_train.csv", na.strings = c("", "NA"))
# Open the data viewer only when someone is at the console; it has no use in a
# non-interactive (knitted / batch) run.
if (interactive()) {
  View(train_df)
}
#Gender
#About 90% of the candidates with a recorded gender are men (69% of all rows once missing genders are counted); the remainder are women and others.
library(stats)
library(tidyverse)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(ggalluvial)
library(ggfittext)
library(janitor)
library(naniar)
library(scales)
library(randomForest)
library(missForest)
library(caret)
# Bar chart of candidate counts per gender, with each count printed above its bar.
train_df %>%
  ggplot(aes(x = gender)) +
  geom_bar(fill = "blue") +
  geom_text(aes(label = ..count..), stat = "count", vjust = -1) +
  labs(title = "Gender Distribution", x = "Gender") +
  theme(plot.title = element_text(hjust = 0.5))
# Share of all rows (rows with a missing gender stay in the denominator) that are "Male".
train_df %>%
  summarise(n = sum(gender == "Male", na.rm = TRUE) / n())
## n
## 1 0.6901034
#Relevent Experience
#Over 70% of all candidates have relevant experience.
library(tidyverse)
# Bar chart of candidates with and without relevant experience.
# The original chain was missing a `+` after xlab(), so ylab("Count") ran as a
# separate statement (it only printed a labels object) and never reached the
# plot. It is now part of the chain, and the "Relvent" label typo is corrected.
ggplot(train_df, aes(x = relevent_experience)) +
  geom_bar(fill = "blue") +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Relevant Experience") +
  xlab("Relevant experience") +
  ylab("Count")
# Share of all rows whose value is "Has relevent experience" (dataset spelling).
count(filter(train_df, train_df$relevent_experience == "Has relevent experience")) / count(dplyr::select(train_df, relevent_experience))
## n
## 1 0.7199081
#In-Depth Relevent Experience
#Among candidates who attended college, those with relevant experience outnumber those without. Among candidates who did not go to college, those without relevant experience outnumber those with it.
# Relevant-experience counts, drawn as one panel per education level.
train_df %>%
  ggplot(aes(x = relevent_experience)) +
  geom_bar(fill = "blue") +
  facet_wrap(~education_level) +
  labs(
    title = "Relevent experience by education level",
    x = "Experience",
    y = "Count"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  )
#Enrolled University
#About 72% of the candidates are not currently enrolled in a university course, presumably because most of them have already graduated. Full-time courses (about 20%) and part-time courses (about 6%) account for most of the rest; the remaining rows have no enrollment value recorded.
# Enrollment-status counts, coloured and split into panels by relevant experience.
train_df %>%
  ggplot(aes(x = enrolled_university, fill = relevent_experience)) +
  geom_bar() +
  facet_wrap(~relevent_experience) +
  geom_text(aes(label = ..count..), stat = "count", vjust = -1) +
  labs(title = "Enrolled University", x = "Enrolled University", y = "Count") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  )
# Share of all rows per enrollment status (missing values stay in the denominator).
train_df %>%
  summarise(n = sum(enrolled_university == "no_enrollment", na.rm = TRUE) / n())
## n
## 1 0.7212131
train_df %>%
  summarise(n = sum(enrolled_university == "Full time course", na.rm = TRUE) / n())
## n
## 1 0.1961061
train_df %>%
  summarise(n = sum(enrolled_university == "Part time course", na.rm = TRUE) / n())
## n
## 1 0.06253262
#College Education
# Education-level counts, stacked by relevant experience.
train_df %>%
  ggplot(aes(x = education_level, fill = relevent_experience)) +
  geom_bar() +
  geom_text(aes(label = ..count..), stat = "count", vjust = -1) +
  labs(title = "Education Background", x = "Education Level", y = "Count") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  )
# Share of all rows per education level (missing values stay in the denominator).
train_df %>%
  summarise(n = sum(education_level == "Graduate", na.rm = TRUE) / n())
## n
## 1 0.6053868
train_df %>%
  summarise(n = sum(education_level == "Masters", na.rm = TRUE) / n())
## n
## 1 0.2276334
train_df %>%
  summarise(n = sum(education_level == "Phd", na.rm = TRUE) / n())
## n
## 1 0.02160977
#College Major
# Major-discipline counts, stacked by relevant experience, one panel per gender.
train_df %>%
  ggplot(aes(x = major_discipline, fill = relevent_experience)) +
  geom_bar() +
  facet_wrap(~gender) +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    vjust = -1,
    check_overlap = TRUE
  ) +
  labs(title = "College major", x = "Major", y = "Count") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  )
# Share of all rows per major (missing values stay in the denominator).
train_df %>%
  summarise(n = sum(major_discipline == "STEM", na.rm = TRUE) / n())
## n
## 1 0.7564464
train_df %>%
  summarise(n = sum(major_discipline == "Humanities", na.rm = TRUE) / n())
## n
## 1 0.03492014
train_df %>%
  summarise(n = sum(major_discipline == "Business Degree", na.rm = TRUE) / n())
## n
## 1 0.01706859
#Company Size
# Counts of candidates per company-size band of their current employer.
train_df %>%
  ggplot(aes(x = company_size)) +
  geom_bar(fill = "blue") +
  geom_text(aes(label = ..count..), stat = "count", vjust = -1) +
  labs(title = "Company size", x = "Size", y = "Count") +
  theme(plot.title = element_text(hjust = 0.5))
# Share of all rows per company-size band (missing values stay in the denominator).
train_df %>%
  summarise(n = sum(company_size == "50-99", na.rm = TRUE) / n())
## n
## 1 0.1609249
train_df %>%
  summarise(n = sum(company_size == "100-500", na.rm = TRUE) / n())
## n
## 1 0.1341998
train_df %>%
  summarise(n = sum(company_size == "5000-9999", na.rm = TRUE) / n())
## n
## 1 0.0293872
train_df %>%
  summarise(n = sum(company_size == "10000+", na.rm = TRUE) / n())
## n
## 1 0.1053868
#Experience
# Counts of candidates per value of total experience.
train_df %>%
  ggplot(aes(x = experience)) +
  geom_bar(fill = "blue") +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    vjust = -1,
    check_overlap = TRUE
  ) +
  labs(title = "Experience", x = "Experience", y = "Count") +
  theme(plot.title = element_text(hjust = 0.5))
#In-depth Experience:
# The same counts, split into one panel per relevant-experience category.
train_df %>%
  ggplot(aes(x = experience)) +
  geom_bar(fill = "blue") +
  facet_wrap(~relevent_experience) +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    vjust = -1,
    check_overlap = TRUE
  ) +
  labs(title = "Experience by relevent experience", x = "Experience", y = "Count") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  )
#Company Type
# Counts of candidates per type of current employer.
train_df %>%
  ggplot(aes(x = company_type)) +
  geom_bar(fill = "blue") +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    vjust = -1,
    check_overlap = TRUE
  ) +
  labs(title = "Company Type", x = "Type", y = "Count") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  )
# Share of all rows per company type (missing values stay in the denominator).
train_df %>%
  summarise(n = sum(company_type == "Pvt Ltd", na.rm = TRUE) / n())
## n
## 1 0.512423
train_df %>%
  summarise(n = sum(company_type == "Funded Startup", na.rm = TRUE) / n())
## n
## 1 0.05224971
train_df %>%
  summarise(n = sum(company_type == "Public Sector", na.rm = TRUE) / n())
## n
## 1 0.04984863
#In-depth Company Type
# Company-type counts, one panel per company-size band.
train_df %>%
  ggplot(aes(x = company_type)) +
  geom_bar(fill = "blue") +
  facet_wrap(~company_size) +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    vjust = -1,
    check_overlap = TRUE
  ) +
  labs(title = "Company Type", x = "Type", y = "Count") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  )
#Last new job
# Counts of candidates per value of last_new_job.
train_df %>%
  ggplot(aes(x = last_new_job)) +
  geom_bar(fill = "blue") +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    vjust = -1,
    check_overlap = TRUE
  ) +
  labs(title = "Time gap", x = "Last job", y = "Count") +
  theme(plot.title = element_text(hjust = 0.5))
# Share of all rows with last_new_job equal to 1 (missing values stay in the denominator).
train_df %>%
  summarise(n = sum(last_new_job == 1, na.rm = TRUE) / n())
## n
## 1 0.419668
# Share of all rows with last_new_job equal to ">4".
train_df %>%
  summarise(n = sum(last_new_job == ">4", na.rm = TRUE) / n())
## n
## 1 0.1717298
#City Development index
# Row counts per city development index value, stacked by relevant experience.
train_df %>%
  ggplot(aes(x = city_development_index, fill = relevent_experience)) +
  geom_bar() +
  labs(title = "City development index", x = "City development", y = "Count") +
  theme(plot.title = element_text(hjust = 0.5))
#Training hours
# Density of training hours, filled by relevant-experience category.
ggplot(train_df, aes(x = training_hours, fill = relevent_experience)) +
  geom_density()
# The same density, one panel per relevant-experience category, with the x-axis
# limited to 0-60 hours by xlim(). The title typo ("hotrs") is corrected.
ggplot(train_df, aes(x = training_hours, fill = relevent_experience)) +
  geom_density() +
  facet_wrap(~relevent_experience) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Training hours by relevent experience") +
  xlim(0, 60)
#Target
#Gender: same distribution
# Gender counts, one panel per target value.
# The x-axis shows gender (target is the facet variable), so it is labelled
# "Gender" instead of the original "Target".
ggplot(train_df, aes(x = gender, fill = gender)) +
  geom_bar() +
  facet_wrap(~target) +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -1,
    check_overlap = TRUE
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Target by gender") +
  xlab("Gender") +
  ylab("Count")
#Relevent experience : same distribution
# Relevant-experience counts, one panel per target value.
# The x-axis shows the experience category (target is the facet variable), so
# it is labelled accordingly instead of "Target".
ggplot(train_df, aes(x = relevent_experience, fill = relevent_experience)) +
  geom_bar() +
  facet_wrap(~target) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -1,
    check_overlap = TRUE
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Target by relevent experience") +
  xlab("Relevant experience") +
  ylab("Count")
#City development index
# Density of the city development index by relevant experience, one panel per
# target value. The axes are labelled for what they show: the index on x
# (originally "Target") and a density on y (originally "Count").
ggplot(train_df, aes(x = city_development_index, fill = relevent_experience)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~target) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Target by city development index") +
  xlab("City development index") +
  ylab("Density")
#Enrolled university: same distribution
# Enrollment-status counts, one panel per target value.
# The x-axis shows enrollment status (target is the facet variable), so it is
# labelled accordingly instead of "Target".
ggplot(train_df, aes(x = enrolled_university, fill = enrolled_university)) +
  geom_bar() +
  facet_wrap(~target) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -1,
    check_overlap = TRUE
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Target by enrolled university") +
  xlab("Enrolled university") +
  ylab("Count")
#Education level : same distribution
# Education-level counts, one panel per target value.
# The x-axis shows education level (target is the facet variable), so it is
# labelled accordingly instead of "Target".
ggplot(train_df, aes(x = education_level, fill = education_level)) +
  geom_bar() +
  facet_wrap(~target) +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -1,
    check_overlap = TRUE
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ggtitle("Target by education level") +
  xlab("Education level") +
  ylab("Count")
#Major Discipline : Same distribution
# Major-discipline counts, one panel per target value.
# The x-axis shows the major (target is the facet variable), so it is labelled
# accordingly instead of "Target".
ggplot(train_df, aes(x = major_discipline, fill = major_discipline)) +
  geom_bar() +
  facet_wrap(~target) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -1,
    check_overlap = TRUE
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Target by Major discipline") +
  xlab("Major discipline") +
  ylab("Count")
#Experience
# Experience counts, one panel per target value.
# The x-axis shows experience (target is the facet variable), so it is
# labelled accordingly instead of "Target".
ggplot(train_df, aes(x = experience)) +
  geom_bar(fill = "blue") +
  facet_wrap(~target) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -1,
    check_overlap = TRUE
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Target by experience") +
  xlab("Experience") +
  ylab("Count")
#Company Size : Same distibution
# Company-size counts, one panel per target value.
# The x-axis shows the company-size band (target is the facet variable), so it
# is labelled accordingly instead of "Target".
ggplot(train_df, aes(x = company_size)) +
  geom_bar(fill = "blue") +
  facet_wrap(~target) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -1,
    check_overlap = TRUE
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Target by company size") +
  xlab("Company size") +
  ylab("Count")
#Company Type : same distribution
# Company-type counts, one panel per target value.
# The title was copy-pasted from the company-size plot and the x-axis was
# labelled "Target"; both now describe company type, which is what x maps to.
ggplot(train_df, aes(x = company_type, fill = company_type)) +
  geom_bar() +
  facet_wrap(~target) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -1,
    check_overlap = TRUE
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Target by company type") +
  xlab("Company type") +
  ylab("Count")
#Last_new_job
# last_new_job counts, one panel per target value.
# The x-axis shows last_new_job (target is the facet variable), so it is
# labelled accordingly instead of "Target".
ggplot(train_df, aes(x = last_new_job, fill = last_new_job)) +
  geom_bar() +
  facet_wrap(~target) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -1,
    check_overlap = TRUE
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Target by last new job") +
  xlab("Last new job") +
  ylab("Count")
#Traning hours : same distribution
# Density of training hours by relevant experience, one panel per target value.
# geom_density() draws a density, so the y-axis is labelled "Density" rather
# than the original "Count".
ggplot(train_df, aes(x = training_hours, fill = relevent_experience)) +
  geom_density(alpha = 0.7) +
  facet_wrap(~target) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Target by training hours") +
  xlab("Training hours") +
  ylab("Density")
#Convert to factor
# Fix the "relevent" spelling in the column name and its values, then turn
# every character column, the target and city_development_index into factors.
train_df <- train_df %>%
  rename(relevant_experience = relevent_experience) %>%
  mutate(
    relevant_experience = str_replace(relevant_experience, "relevent", "relevant")
  ) %>%
  mutate(across(where(is.character), as.factor)) %>%
  mutate(
    target = as.factor(target),
    city_development_index = as.factor(city_development_index)
  )
#removing enrollee_id, city and gender as predictors by creating new dataframe of aug_train_for_Im
# Build the modelling frame without enrollee_id, city and gender.
# `dplyr::select` is spelled out, as elsewhere in this file, so a masking
# `select` from another attached package cannot be picked up by accident.
aug_train_for_Im <- train_df %>%
  dplyr::select(-enrollee_id, -city, -gender) %>%
  mutate(
    # city_development_index was converted to a factor earlier in this script.
    # as.numeric() on a factor returns the internal level codes (1, 2, 3, ...),
    # not the index values, so convert through character to get the real
    # values back. The level codes follow the sorted order of the values, so
    # the ordering seen by the tree models should be unchanged.
    city_development_index = as.numeric(as.character(city_development_index)),
    target = as.factor(target)
  ) %>%
  as.data.frame()
### training data imputation, using maxiter = 2 and ntree = 20
# Impute the missing values with missForest (2 iterations, 20 trees per forest)
# and keep only the imputed data frame, which missForest returns as `ximp`.
set.seed(1967)
TrainDataImputed <- missForest(
  xmis = aug_train_for_Im,
  maxiter = 2,
  ntree = 20
)$ximp
## missForest iteration 1 in progress...done!
## missForest iteration 2 in progress...done!
#split training data so to create a validation dataframe using createDataPartition function from caret package
# Hold out 25% of the imputed rows for validation; createDataPartition samples
# within the levels of `target`.
set.seed(1967)
PartIndex <- createDataPartition(
  y = TrainDataImputed$target,
  times = 1,
  p = 0.75,
  list = FALSE
)
# Rows selected by the partition train the model; the rest validate it.
aug_validation <- TrainDataImputed[-PartIndex, ]
New_aug_Train <- TrainDataImputed[PartIndex, ]
# Training data randomForest model
# Search for the mtry value with the lowest out-of-bag error.
# Predictors and response are picked by column name instead of the hard-coded
# position 11, so the call keeps working if columns are added, removed or
# reordered upstream.
set.seed(1967)
tune_aug_train <- tuneRF(
  x = New_aug_Train[, setdiff(names(New_aug_Train), "target"), drop = FALSE],
  y = New_aug_Train[["target"]],
  stepFactor = 0.5,
  plot = TRUE,
  ntreeTry = 200,
  trace = TRUE,
  improve = 0.05
)
## mtry = 3 OOB error = 20.17%
## Searching left ...
## mtry = 6 OOB error = 20.52%
## -0.01759834 0.05
## Searching right ...
## mtry = 1 OOB error = 21.98%
## -0.08971705 0.05
# Fit the random forest on the training partition: 200 trees and 3 candidate
# variables per split (the mtry with the lowest OOB error in the tuning output
# above), with importance = TRUE so variable-importance measures are computed.
set.seed(1967)
rf <- randomForest(
target ~ .,
data = New_aug_Train,
ntree = 200,
mtry = 3,
importance = TRUE
)
# Print the fitted model summary (OOB error estimate and confusion matrix).
rf
##
## Call:
## randomForest(formula = target ~ ., data = New_aug_Train, ntree = 200, mtry = 3, importance = TRUE)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 20%
## Confusion matrix:
## 0 1 class.error
## 0 9854 932 0.08640831
## 1 1942 1641 0.54200391
# Plot the fitted forest's error rates (plot method for randomForest objects).
plot(rf)
# Plot the five most important predictors. `TRUE` is spelled out because `T`
# is an ordinary variable that can be reassigned.
varImpPlot(rf,
  sort = TRUE,
  n.var = 5,
  main = "Top 5 - Variable Importance"
)
# Print the full importance table.
importance(rf)
## 0 1 MeanDecreaseAccuracy
## city_development_index 71.075530 108.747569 125.168509
## relevant_experience 23.250360 19.936988 30.310253
## enrolled_university 23.202339 15.153798 28.020608
## education_level 29.923590 14.673108 35.225661
## major_discipline 1.444982 6.805779 4.290806
## experience 26.470620 16.744178 39.815191
## company_size 16.638853 56.353496 46.303143
## company_type 14.763053 46.706295 40.999175
## last_new_job 20.617637 19.480677 28.196528
## training_hours 6.164753 14.555578 14.127658
## MeanDecreaseGini
## city_development_index 1185.0660
## relevant_experience 107.2122
## enrolled_university 164.8414
## education_level 212.9720
## major_discipline 136.9308
## experience 809.0286
## company_size 505.6898
## company_type 261.6418
## last_new_job 370.2241
## training_hours 976.6072
#As we see, the city_development_index is the most important predictor among all.
#randomForest predictions
# Predict on the same rows the forest was trained on and cross-tabulate the
# predictions against the actual target.
predicted_aug_train <- predict(object = rf, newdata = New_aug_Train)
table(predicted = predicted_aug_train, actual = New_aug_Train$target)
## actual
## predicted 0 1
## 0 10743 256
## 1 43 3327
#creating confusionMatrix
confusionMatrix(data = predicted_aug_train, reference = New_aug_Train$target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 10743 256
## 1 43 3327
##
## Accuracy : 0.9792
## 95% CI : (0.9767, 0.9815)
## No Information Rate : 0.7506
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9433
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9960
## Specificity : 0.9286
## Pos Pred Value : 0.9767
## Neg Pred Value : 0.9872
## Prevalence : 0.7506
## Detection Rate : 0.7477
## Detection Prevalence : 0.7655
## Balanced Accuracy : 0.9623
##
## 'Positive' Class : 0
##
#This high accuracy is expected, since the model has already seen the training data. Let's see what we get with the validation data.
#randomForest prediction imputed validation data
# Predict on the held-out validation rows and cross-tabulate the predictions
# against the actual target.
predicted_aug_valid <- predict(object = rf, newdata = aug_validation)
table(predicted = predicted_aug_valid, actual = aug_validation$target)
## actual
## predicted 0 1
## 0 3283 666
## 1 312 528
# Keep the caret confusion-matrix object so its statistics can be reused below.
ConMatrix <- confusionMatrix(
  data = predicted_aug_valid,
  reference = aug_validation$target
)
print(ConMatrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3283 666
## 1 312 528
##
## Accuracy : 0.7958
## 95% CI : (0.7841, 0.8071)
## No Information Rate : 0.7507
## P-Value [Acc > NIR] : 9.55e-14
##
## Kappa : 0.3945
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9132
## Specificity : 0.4422
## Pos Pred Value : 0.8313
## Neg Pred Value : 0.6286
## Prevalence : 0.7507
## Detection Rate : 0.6855
## Detection Prevalence : 0.8246
## Balanced Accuracy : 0.6777
##
## 'Positive' Class : 0
##
#Finding out the accuracy
# Validation accuracy, rounded to four decimals; "Accuracy" is the first
# element of the `overall` statistics vector.
Acc <- round(ConMatrix$overall["Accuracy"], digits = 4)
Acc
## Accuracy
## 0.7958
#An accuracy of 0.7958 is good but not very convincing: it is only modestly above the no-information rate of 0.7507.
# Load the test set and a fresh copy of the training set. Blank CSV fields are
# parsed as NA via `na.strings` instead of piping the whole data frame through
# `na_if("")`, which dplyr >= 1.1.0 no longer accepts for a data frame.
test_df <- read.csv("Data/aug_test.csv", na.strings = c("", "NA"))
train_df <- read.csv("Data/aug_train.csv", na.strings = c("", "NA"))
# Preview the first rows of the test set.
head(test_df)
## enrollee_id city city_development_index gender relevent_experience
## 1 32403 city_41 0.827 Male Has relevent experience
## 2 9858 city_103 0.920 Female Has relevent experience
## 3 31806 city_21 0.624 Male No relevent experience
## 4 27385 city_13 0.827 Male Has relevent experience
## 5 27724 city_103 0.920 Male Has relevent experience
## 6 217 city_23 0.899 Male No relevent experience
## enrolled_university education_level major_discipline experience company_size
## 1 Full time course Graduate STEM 9 <10
## 2 no_enrollment Graduate STEM 5 <NA>
## 3 no_enrollment High School <NA> <1 <NA>
## 4 no_enrollment Masters STEM 11 10/49
## 5 no_enrollment Graduate STEM >20 10000+
## 6 Part time course Masters STEM 10 <NA>
## company_type last_new_job training_hours
## 1 <NA> 1 21
## 2 Pvt Ltd 1 98
## 3 Pvt Ltd never 15
## 4 Pvt Ltd 1 39
## 5 Pvt Ltd >4 72
## 6 <NA> 2 12
# Preview the first rows of the training set.
head(train_df)
## enrollee_id city city_development_index gender relevent_experience
## 1 8949 city_103 0.920 Male Has relevent experience
## 2 29725 city_40 0.776 Male No relevent experience
## 3 11561 city_21 0.624 <NA> No relevent experience
## 4 33241 city_115 0.789 <NA> No relevent experience
## 5 666 city_162 0.767 Male Has relevent experience
## 6 21651 city_176 0.764 <NA> Has relevent experience
## enrolled_university education_level major_discipline experience company_size
## 1 no_enrollment Graduate STEM >20 <NA>
## 2 no_enrollment Graduate STEM 15 50-99
## 3 Full time course Graduate STEM 5 <NA>
## 4 <NA> Graduate Business Degree <1 <NA>
## 5 no_enrollment Masters STEM >20 50-99
## 6 Part time course Graduate STEM 11 <NA>
## company_type last_new_job training_hours target
## 1 <NA> 1 36 1
## 2 Pvt Ltd >4 47 0
## 3 <NA> never 83 0
## 4 Pvt Ltd never 52 1
## 5 Funded Startup 4 8 0
## 6 <NA> 1 24 1
# Dimensions (rows, columns) of the test set.
dim(test_df)
## [1] 2129 13
# Dimensions (rows, columns) of the training set; it has one more column than
# the test set (the `target` column, per the previews above).
dim(train_df)
## [1] 19158 14
# Column-wise overview of the test set: each column's type and first values.
glimpse(test_df)
## Rows: 2,129
## Columns: 13
## $ enrollee_id <int> 32403, 9858, 31806, 27385, 27724, 217, 21465, 2~
## $ city <chr> "city_41", "city_103", "city_21", "city_13", "c~
## $ city_development_index <dbl> 0.827, 0.920, 0.624, 0.827, 0.920, 0.899, 0.624~
## $ gender <chr> "Male", "Female", "Male", "Male", "Male", "Male~
## $ relevent_experience <chr> "Has relevent experience", "Has relevent experi~
## $ enrolled_university <chr> "Full time course", "no_enrollment", "no_enroll~
## $ education_level <chr> "Graduate", "Graduate", "High School", "Masters~
## $ major_discipline <chr> "STEM", "STEM", NA, "STEM", "STEM", "STEM", "ST~
## $ experience <chr> "9", "5", "<1", "11", ">20", "10", "<1", ">20",~
## $ company_size <chr> "<10", NA, NA, "10/49", "10000+", NA, "100-500"~
## $ company_type <chr> NA, "Pvt Ltd", "Pvt Ltd", "Pvt Ltd", "Pvt Ltd",~
## $ last_new_job <chr> "1", "1", "never", "1", ">4", "2", "1", ">4", "~
## $ training_hours <int> 21, 98, 15, 39, 72, 12, 11, 81, 2, 4, 196, 51, ~
# Column-wise overview of the training set: each column's type and first values.
glimpse(train_df)
## Rows: 19,158
## Columns: 14
## $ enrollee_id <int> 8949, 29725, 11561, 33241, 666, 21651, 28806, 4~
## $ city <chr> "city_103", "city_40", "city_21", "city_115", "~
## $ city_development_index <dbl> 0.920, 0.776, 0.624, 0.789, 0.767, 0.764, 0.920~
## $ gender <chr> "Male", "Male", NA, NA, "Male", NA, "Male", "Ma~
## $ relevent_experience <chr> "Has relevent experience", "No relevent experie~
## $ enrolled_university <chr> "no_enrollment", "no_enrollment", "Full time co~
## $ education_level <chr> "Graduate", "Graduate", "Graduate", "Graduate",~
## $ major_discipline <chr> "STEM", "STEM", "STEM", "Business Degree", "STE~
## $ experience <chr> ">20", "15", "5", "<1", ">20", "11", "5", "13",~
## $ company_size <chr> NA, "50-99", NA, NA, "50-99", NA, "50-99", "<10~
## $ company_type <chr> NA, "Pvt Ltd", NA, "Pvt Ltd", "Funded Startup",~
## $ last_new_job <chr> "1", ">4", "never", "never", "4", "1", "1", ">4~
## $ training_hours <int> 36, 47, 83, 52, 8, 24, 24, 18, 46, 123, 32, 108~
## $ target <dbl> 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,~
# Correct the "relevent" misspelling in the column name and in its values,
# for both data sets.
test_df <- test_df %>%
  rename(relevant_experience = relevent_experience) %>%
  mutate(
    relevant_experience = str_replace_all(relevant_experience, "relevent", "relevant")
  )
train_df <- train_df %>%
  rename(relevant_experience = relevent_experience) %>%
  mutate(
    relevant_experience = str_replace_all(relevant_experience, "relevent", "relevant")
  )
# Recode the text categories of last_new_job ("never" -> 0, ">4" -> 5). The
# column is still character here; missing values stay missing.
new_test_df <- test_df %>%
  mutate(
    last_new_job = case_when(
      last_new_job == "never" ~ "0",
      last_new_job == ">4" ~ "5",
      TRUE ~ last_new_job
    )
  )
new_train_df <- train_df %>%
  mutate(
    last_new_job = case_when(
      last_new_job == "never" ~ "0",
      last_new_job == ">4" ~ "5",
      TRUE ~ last_new_job
    )
  )
These values are recoded because we want to convert the data type of last_new_job to numeric.
# Make last_new_job numeric, recode the open-ended experience categories
# ("<1" -> 0.5, ">20" -> 21; still character at this point) and rewrite the
# "10/49" company-size label as "10-49". Missing values stay missing.
new_test_df <- new_test_df %>%
  mutate(
    last_new_job = as.numeric(last_new_job),
    experience = case_when(
      experience == "<1" ~ "0.5",
      experience == ">20" ~ "21",
      TRUE ~ experience
    ),
    company_size = case_when(
      company_size == "10/49" ~ "10-49",
      TRUE ~ company_size
    )
  )
new_train_df <- new_train_df %>%
  mutate(
    last_new_job = as.numeric(last_new_job),
    experience = case_when(
      experience == "<1" ~ "0.5",
      experience == ">20" ~ "21",
      TRUE ~ experience
    ),
    company_size = case_when(
      company_size == "10/49" ~ "10-49",
      TRUE ~ company_size
    )
  )
# Count non-missing (FALSE) and missing (TRUE) cells across the whole test set.
table(is.na(new_test_df))
##
## FALSE TRUE
## 25473 2204
# The same cell-level count for the training set.
table(is.na(new_train_df))
##
## FALSE TRUE
## 247479 20733
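More than 50% of the rows in the training data contain at least one missing value (only 8,955 of the 19,158 rows are complete), although missing cells make up under 8% of all cells. Missing data can have a big impact on predictive modeling, so we will perform a mice imputation for all missing values. For that purpose we change the character data type to the factor data type.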
# Number of missing values in each column of the training data.
# vapply() pins the result to exactly one integer per column, whereas the
# return type of sapply() depends on its input.
vapply(new_train_df, function(column) sum(is.na(column)), integer(1))
## enrollee_id city city_development_index
## 0 0 0
## gender relevant_experience enrolled_university
## 4508 0 386
## education_level major_discipline experience
## 460 2813 65
## company_size company_type last_new_job
## 5938 6140 423
## training_hours target
## 0 0
Columns with the most missing values:
Company Type : 6140
Company Size : 5938
Gender : 4508
Major Discipline : 2813
# Convert experience to numeric and the listed categorical columns to factors,
# in both data sets. city and relevant_experience are left as character.
new_test_df <- new_test_df %>%
  mutate(
    experience = as.numeric(experience),
    across(
      all_of(c(
        "gender", "enrolled_university", "education_level",
        "company_size", "company_type", "major_discipline"
      )),
      as.factor
    )
  )
new_train_df <- new_train_df %>%
  mutate(
    experience = as.numeric(experience),
    across(
      all_of(c(
        "gender", "enrolled_university", "education_level",
        "company_size", "company_type", "major_discipline"
      )),
      as.factor
    )
  )
# Show the resulting column types of the training data.
str(new_train_df)
## 'data.frame': 19158 obs. of 14 variables:
## $ enrollee_id : int 8949 29725 11561 33241 666 21651 28806 402 27107 699 ...
## $ city : chr "city_103" "city_40" "city_21" "city_115" ...
## $ city_development_index: num 0.92 0.776 0.624 0.789 0.767 0.764 0.92 0.762 0.92 0.92 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 2 2 NA NA 2 NA 2 2 2 NA ...
## $ relevant_experience : chr "Has relevant experience" "No relevant experience" "No relevant experience" "No relevant experience" ...
## $ enrolled_university : Factor w/ 3 levels "Full time course",..: 2 2 1 NA 2 3 2 2 2 2 ...
## $ education_level : Factor w/ 5 levels "Graduate","High School",..: 1 1 1 1 3 1 2 1 1 1 ...
## $ major_discipline : Factor w/ 6 levels "Arts","Business Degree",..: 6 6 6 2 6 6 NA 6 6 6 ...
## $ experience : num 21 15 5 0.5 21 11 5 13 7 17 ...
## $ company_size : Factor w/ 8 levels "<10","10-49",..: NA 6 NA NA 6 NA 6 1 6 5 ...
## $ company_type : Factor w/ 6 levels "Early Stage Startup",..: NA 6 NA 6 2 NA 2 6 6 6 ...
## $ last_new_job : num 1 5 0 0 4 1 1 5 1 5 ...
## $ training_hours : int 36 47 83 52 8 24 24 18 46 123 ...
## $ target : num 1 0 0 1 0 1 0 1 1 0 ...
# Gender counts in the test set, with each count printed above its bar.
new_test_df %>%
  ggplot(aes(x = gender)) +
  geom_bar(fill = "blue") +
  geom_text(aes(label = ..count..), stat = "count", vjust = -1) +
  labs(title = "Gender Distribution", x = "Gender") +
  theme(plot.title = element_text(hjust = 0.5))
# The same chart for the training set.
new_train_df %>%
  ggplot(aes(x = gender)) +
  geom_bar(fill = "blue") +
  geom_text(aes(label = ..count..), stat = "count", vjust = -1) +
  labs(title = "Gender Distribution", x = "Gender") +
  theme(plot.title = element_text(hjust = 0.5))
Even if every missing gender value were assigned to Female, there would still be about twice as many Male candidates.
# Education-level counts in the test set, one panel per gender.
# The x-axis shows education level (gender is the facet and fill variable), so
# it is labelled "Education level" instead of the original "Gender".
ggplot(new_test_df, aes(x = education_level, fill = gender)) +
  geom_bar() +
  facet_wrap(~gender) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
  ggtitle("Education Level with Gender before Imputation") +
  xlab("Education level") +
  ylab("Count")
# The same chart for the training set, with the same axis-label correction.
ggplot(new_train_df, aes(x = education_level, fill = gender)) +
  geom_bar() +
  facet_wrap(~gender) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
  ggtitle("Education Level with Gender before Imputation") +
  xlab("Education level") +
  ylab("Count")
# Gender counts in the test set, one panel per relevant-experience category.
new_test_df %>%
  ggplot(aes(x = gender, fill = relevant_experience)) +
  geom_bar() +
  facet_wrap(~relevant_experience) +
  geom_text(aes(label = ..count..), stat = "count", vjust = -1) +
  labs(
    title = "Relevant experience with gender before Imputation",
    x = "Gender",
    y = "Count"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  )
# The same chart for the training set.
new_train_df %>%
  ggplot(aes(x = gender, fill = relevant_experience)) +
  geom_bar() +
  facet_wrap(~relevant_experience) +
  geom_text(aes(label = ..count..), stat = "count", vjust = -1) +
  labs(
    title = "Relevant experience with gender before Imputation",
    x = "Gender",
    y = "Count"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  )
The mice package has a function called md.pattern(). It returns a table of the missing-value patterns across the variables in a data set: each row is one pattern (1 = observed, 0 = missing) together with the number of rows that show it.
# Tabulate the missing-data patterns of the training set (mice::md.pattern).
md.pattern(new_train_df)
## enrollee_id city city_development_index relevant_experience training_hours
## 8955 1 1 1 1 1
## 462 1 1 1 1 1
## 283 1 1 1 1 1
## 2777 1 1 1 1 1
## 2224 1 1 1 1 1
## 158 1 1 1 1 1
## 98 1 1 1 1 1
## 835 1 1 1 1 1
## 660 1 1 1 1 1
## 52 1 1 1 1 1
## 115 1 1 1 1 1
## 847 1 1 1 1 1
## 177 1 1 1 1 1
## 16 1 1 1 1 1
## 26 1 1 1 1 1
## 329 1 1 1 1 1
## 74 1 1 1 1 1
## 11 1 1 1 1 1
## 13 1 1 1 1 1
## 111 1 1 1 1 1
## 45 1 1 1 1 1
## 5 1 1 1 1 1
## 9 1 1 1 1 1
## 63 1 1 1 1 1
## 22 1 1 1 1 1
## 3 1 1 1 1 1
## 4 1 1 1 1 1
## 21 1 1 1 1 1
## 115 1 1 1 1 1
## 27 1 1 1 1 1
## 7 1 1 1 1 1
## 79 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 9 1 1 1 1 1
## 14 1 1 1 1 1
## 3 1 1 1 1 1
## 40 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 4 1 1 1 1 1
## 3 1 1 1 1 1
## 12 1 1 1 1 1
## 62 1 1 1 1 1
## 9 1 1 1 1 1
## 5 1 1 1 1 1
## 53 1 1 1 1 1
## 44 1 1 1 1 1
## 9 1 1 1 1 1
## 5 1 1 1 1 1
## 28 1 1 1 1 1
## 5 1 1 1 1 1
## 2 1 1 1 1 1
## 26 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 16 1 1 1 1 1
## 3 1 1 1 1 1
## 2 1 1 1 1 1
## 2 1 1 1 1 1
## 35 1 1 1 1 1
## 3 1 1 1 1 1
## 4 1 1 1 1 1
## 34 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 2 1 1 1 1 1
## 4 1 1 1 1 1
## 5 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 12 1 1 1 1 1
## 5 1 1 1 1 1
## 3 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 11 1 1 1 1 1
## 3 1 1 1 1 1
## 1 1 1 1 1 1
## 4 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 6 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 5 1 1 1 1 1
## 5 1 1 1 1 1
## 1 1 1 1 1 1
## 4 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 0 0 0 0 0
## target experience enrolled_university last_new_job education_level
## 8955 1 1 1 1 1
## 462 1 1 1 1 1
## 283 1 1 1 1 1
## 2777 1 1 1 1 1
## 2224 1 1 1 1 1
## 158 1 1 1 1 1
## 98 1 1 1 1 1
## 835 1 1 1 1 1
## 660 1 1 1 1 1
## 52 1 1 1 1 1
## 115 1 1 1 1 1
## 847 1 1 1 1 1
## 177 1 1 1 1 1
## 16 1 1 1 1 1
## 26 1 1 1 1 1
## 329 1 1 1 1 1
## 74 1 1 1 1 0
## 11 1 1 1 1 0
## 13 1 1 1 1 0
## 111 1 1 1 1 0
## 45 1 1 1 1 0
## 5 1 1 1 1 0
## 9 1 1 1 1 0
## 63 1 1 1 1 0
## 22 1 1 1 0 1
## 3 1 1 1 0 1
## 4 1 1 1 0 1
## 21 1 1 1 0 1
## 115 1 1 1 0 1
## 27 1 1 1 0 1
## 7 1 1 1 0 1
## 79 1 1 1 0 1
## 1 1 1 1 0 1
## 1 1 1 1 0 1
## 9 1 1 1 0 1
## 14 1 1 1 0 1
## 3 1 1 1 0 1
## 40 1 1 1 0 1
## 1 1 1 1 0 0
## 1 1 1 1 0 0
## 4 1 1 1 0 0
## 3 1 1 1 0 0
## 12 1 1 1 0 0
## 62 1 1 0 1 1
## 9 1 1 0 1 1
## 5 1 1 0 1 1
## 53 1 1 0 1 1
## 44 1 1 0 1 1
## 9 1 1 0 1 1
## 5 1 1 0 1 1
## 28 1 1 0 1 1
## 5 1 1 0 1 1
## 2 1 1 0 1 1
## 26 1 1 0 1 1
## 1 1 1 0 1 1
## 1 1 1 0 1 1
## 16 1 1 0 1 1
## 3 1 1 0 1 0
## 2 1 1 0 1 0
## 2 1 1 0 1 0
## 35 1 1 0 1 0
## 3 1 1 0 1 0
## 4 1 1 0 1 0
## 34 1 1 0 1 0
## 1 1 1 0 0 1
## 1 1 1 0 0 1
## 1 1 1 0 0 1
## 1 1 1 0 0 1
## 2 1 1 0 0 1
## 2 1 1 0 0 1
## 4 1 1 0 0 1
## 5 1 1 0 0 1
## 1 1 1 0 0 0
## 2 1 1 0 0 0
## 12 1 1 0 0 0
## 5 1 0 1 1 1
## 3 1 0 1 1 1
## 1 1 0 1 1 1
## 1 1 0 1 1 1
## 11 1 0 1 1 1
## 3 1 0 1 1 1
## 1 1 0 1 1 1
## 4 1 0 1 1 1
## 1 1 0 1 1 1
## 1 1 0 1 1 0
## 6 1 0 1 1 0
## 1 1 0 1 0 1
## 1 1 0 1 0 1
## 5 1 0 1 0 1
## 5 1 0 1 0 1
## 1 1 0 1 0 1
## 4 1 0 1 0 1
## 1 1 0 1 0 1
## 2 1 0 1 0 1
## 1 1 0 1 0 1
## 1 1 0 1 0 1
## 1 1 0 1 0 0
## 1 1 0 0 1 1
## 1 1 0 0 1 1
## 1 1 0 0 1 0
## 1 1 0 0 1 0
## 1 1 0 0 0 1
## 0 65 386 423 460
## major_discipline gender company_size company_type
## 8955 1 1 1 1 0
## 462 1 1 1 0 1
## 283 1 1 0 1 1
## 2777 1 1 0 0 2
## 2224 1 0 1 1 1
## 158 1 0 1 0 2
## 98 1 0 0 1 2
## 835 1 0 0 0 3
## 660 0 1 1 1 1
## 52 0 1 1 0 2
## 115 0 1 0 1 2
## 847 0 1 0 0 3
## 177 0 0 1 1 2
## 16 0 0 1 0 3
## 26 0 0 0 1 3
## 329 0 0 0 0 4
## 74 0 1 1 1 2
## 11 0 1 1 0 3
## 13 0 1 0 1 3
## 111 0 1 0 0 4
## 45 0 0 1 1 3
## 5 0 0 1 0 4
## 9 0 0 0 1 4
## 63 0 0 0 0 5
## 22 1 1 1 1 1
## 3 1 1 1 0 2
## 4 1 1 0 1 2
## 21 1 1 0 0 3
## 115 1 0 1 1 2
## 27 1 0 1 0 3
## 7 1 0 0 1 3
## 79 1 0 0 0 4
## 1 0 1 1 1 2
## 1 0 1 1 0 3
## 9 0 1 0 0 4
## 14 0 0 1 1 3
## 3 0 0 1 0 4
## 40 0 0 0 0 5
## 1 0 1 1 1 3
## 1 0 1 0 0 5
## 4 0 0 1 1 4
## 3 0 0 1 0 5
## 12 0 0 0 0 6
## 62 1 1 1 1 1
## 9 1 1 1 0 2
## 5 1 1 0 1 2
## 53 1 1 0 0 3
## 44 1 0 1 1 2
## 9 1 0 1 0 3
## 5 1 0 0 1 3
## 28 1 0 0 0 4
## 5 0 1 1 1 2
## 2 0 1 0 1 3
## 26 0 1 0 0 4
## 1 0 0 1 1 3
## 1 0 0 0 1 4
## 16 0 0 0 0 5
## 3 0 1 1 1 3
## 2 0 1 1 0 4
## 2 0 1 0 1 4
## 35 0 1 0 0 5
## 3 0 0 1 1 4
## 4 0 0 0 1 5
## 34 0 0 0 0 6
## 1 1 1 1 1 2
## 1 1 1 1 0 3
## 1 1 1 0 1 3
## 1 1 1 0 0 4
## 2 1 0 1 1 3
## 2 1 0 1 0 4
## 4 1 0 0 0 5
## 5 0 0 0 0 6
## 1 0 1 1 1 4
## 2 0 1 0 0 6
## 12 0 0 0 0 7
## 5 1 1 1 1 1
## 3 1 1 1 0 2
## 1 1 1 0 1 2
## 1 1 1 0 0 3
## 11 1 0 1 1 2
## 3 1 0 1 0 3
## 1 1 0 0 1 3
## 4 1 0 0 0 4
## 1 0 0 0 0 5
## 1 0 0 1 0 5
## 6 0 0 0 0 6
## 1 1 1 1 1 2
## 1 1 1 1 0 3
## 5 1 0 1 1 3
## 5 1 0 1 0 4
## 1 1 0 0 1 4
## 4 1 0 0 0 5
## 1 0 1 0 0 5
## 2 0 0 1 1 4
## 1 0 0 1 0 5
## 1 0 0 0 0 6
## 1 0 1 0 0 6
## 1 1 0 1 1 3
## 1 0 0 1 1 4
## 1 0 1 1 0 5
## 1 0 1 0 0 6
## 1 1 0 1 0 5
## 2813 4508 5938 6140 20733
# Multiply impute the first 13 columns of the test set: 3 imputations, fixed seed.
impute <- mice(
  data = new_test_df[, 1:13],
  m = 3,
  seed = 123
)
##
## iter imp variable
## 1 1 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 1 2 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 1 3 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 2 1 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 2 2 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 2 3 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 3 1 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 3 2 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 3 3 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 4 1 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 4 2 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 4 3 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 5 1 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 5 2 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 5 3 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## Warning: Number of logged events: 17
# Multiply impute the training set (all columns): 3 imputations, fixed seed.
impute1 <- mice(
  data = new_train_df,
  m = 3,
  seed = 123
)
##
## iter imp variable
## 1 1 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 1 2 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 1 3 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 2 1 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 2 2 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 2 3 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 3 1 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 3 2 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 3 3 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 4 1 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 4 2 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 4 3 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 5 1 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 5 2 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## 5 3 gender enrolled_university education_level major_discipline experience company_size company_type last_new_job
## Warning: Number of logged events: 17
A simple impute() function just fills in missing values with a user-defined statistic (mean, median, max). The mice package instead fits a model for each incomplete variable; the methods it used for imputation are:
# Print the test-set imputation object: the imputation method per column,
# the predictor matrix and the logged events.
# NOTE(review): the printed predictor matrix uses enrollee_id as a predictor;
# an ID column presumably carries no signal - consider excluding it.
print(impute)
## Class: mids
## Number of multiple imputations: 3
## Imputation methods:
## enrollee_id city city_development_index
## "" "" ""
## gender relevant_experience enrolled_university
## "polyreg" "" "polyreg"
## education_level major_discipline experience
## "polyreg" "polyreg" "pmm"
## company_size company_type last_new_job
## "polyreg" "polyreg" "pmm"
## training_hours
## ""
## PredictorMatrix:
## enrollee_id city city_development_index gender
## enrollee_id 0 0 1 1
## city 1 0 1 1
## city_development_index 1 0 0 1
## gender 1 0 1 0
## relevant_experience 1 0 1 1
## enrolled_university 1 0 1 1
## relevant_experience enrolled_university education_level
## enrollee_id 0 1 1
## city 0 1 1
## city_development_index 0 1 1
## gender 0 1 1
## relevant_experience 0 1 1
## enrolled_university 0 0 1
## major_discipline experience company_size company_type
## enrollee_id 1 1 1 1
## city 1 1 1 1
## city_development_index 1 1 1 1
## gender 1 1 1 1
## relevant_experience 1 1 1 1
## enrolled_university 1 1 1 1
## last_new_job training_hours
## enrollee_id 1 1
## city 1 1
## city_development_index 1 1
## gender 1 1
## relevant_experience 1 1
## enrolled_university 1 1
## Number of logged events: 17
## it im dep meth
## 1 0 0 constant
## 2 0 0 constant
## 3 1 1 major_discipline polyreg
## 4 1 2 major_discipline polyreg
## 5 1 3 major_discipline polyreg
## 6 2 1 major_discipline polyreg
## out
## 1 city
## 2 relevant_experience
## 3 education_levelHigh School, education_levelPrimary School
## 4 education_levelHigh School, education_levelPrimary School
## 5 education_levelHigh School, education_levelPrimary School
## 6 education_levelHigh School, education_levelPrimary School
# Inspect the values imputed for gender in the test set: one column per
# imputation (m = 3); rows presumably are the observations where gender was missing.
impute$imp$gender
## 1 2 3
## 7 Male Male Male
## 14 Male Male Male
## 15 Male Female Male
## 16 Male Female Male
## 19 Male Male Male
## 21 Female Male Male
## 22 Male Male Female
## 29 Male Male Male
## 30 Male Male Male
## 31 Male Male Other
## 33 Male Female Male
## 36 Male Male Male
## 39 Male Male Male
## 54 Male Male Female
## 56 Other Male Male
## 57 Male Male Male
## 63 Male Female Male
## 64 Male Male Male
## 74 Male Male Female
## 83 Male Male Male
## 86 Male Female Male
## 90 Male Male Male
## 99 Male Male Male
## 100 Male Male Male
## 101 Male Male Male
## 108 Male Male Male
## 113 Male Male Male
## 114 Male Male Male
## 115 Male Female Female
## 118 Male Male Male
## 121 Male Male Male
## 126 Male Male Male
## 127 Male Male Male
## 128 Male Male Male
## 130 Male Male Male
## 131 Male Female Male
## 137 Male Male Male
## 138 Male Male Female
## 145 Male Male Male
## 149 Male Other Male
## 150 Male Male Male
## 158 Male Male Male
## 160 Male Male Male
## 161 Male Male Male
## 173 Male Male Male
## 175 Female Male Male
## 186 Male Male Male
## 190 Male Male Male
## 193 Male Male Male
## 198 Male Male Other
## 201 Male Female Male
## 205 Male Male Male
## 217 Male Male Male
## 219 Male Male Male
## 221 Male Male Male
## 226 Female Female Male
## 227 Male Male Male
## 231 Male Male Male
## 233 Male Male Male
## 240 Male Male Male
## 242 Male Female Male
## 247 Male Male Male
## 251 Male Male Male
## 254 Male Male Male
## 256 Male Male Male
## 259 Male Male Male
## 263 Male Male Male
## 270 Male Female Female
## 273 Male Male Male
## 290 Male Male Male
## 292 Male Male Male
## 294 Male Female Male
## 295 Male Male Male
## 296 Male Male Female
## 297 Male Male Male
## 304 Male Male Male
## 305 Male Male Male
## 314 Male Male Male
## 316 Female Male Male
## 322 Male Male Male
## 325 Male Male Male
## 326 Male Male Male
## 327 Female Male Male
## 329 Male Male Male
## 331 Male Male Female
## 332 Male Male Male
## 334 Male Male Male
## 336 Male Male Male
## 349 Male Male Male
## 350 Male Male Male
## 353 Male Male Male
## 356 Male Male Female
## 358 Male Male Male
## 367 Male Male Male
## 369 Male Male Male
## 374 Male Male Male
## 376 Male Female Male
## 378 Female Male Male
## 387 Male Female Male
## 399 Male Female Male
## 405 Male Male Male
## 414 Male Male Male
## 431 Other Female Male
## 437 Male Female Male
## 441 Male Male Female
## 450 Male Male Male
## 458 Male Male Male
## 461 Male Male Male
## 466 Male Male Male
## 467 Male Male Male
## 473 Male Male Female
## 480 Male Male Male
## 487 Male Male Male
## 491 Male Male Male
## 499 Male Male Male
## 510 Male Male Male
## 511 Male Male Male
## 534 Male Male Male
## 539 Male Male Male
## 550 Male Other Other
## 556 Female Male Male
## 558 Male Male Male
## 559 Male Male Male
## 569 Male Male Male
## 572 Male Male Male
## 576 Male Male Male
## 582 Male Male Male
## 589 Female Male Male
## 594 Female Male Male
## 596 Male Male Male
## 597 Other Male Male
## 598 Male Male Male
## 599 Female Other Male
## 600 Male Male Male
## 603 Female Male Male
## 606 Male Male Other
## 619 Male Other Male
## 623 Male Male Female
## 639 Male Male Male
## 640 Male Male Male
## 641 Male Male Female
## 649 Male Male Male
## 661 Male Male Male
## 662 Male Male Male
## 664 Male Male Male
## 665 Male Male Male
## 667 Male Male Male
## 668 Female Male Female
## 669 Male Male Male
## 671 Male Male Male
## 672 Male Male Male
## 677 Male Male Male
## 678 Male Male Male
## 680 Male Male Male
## 687 Male Male Male
## 690 Male Other Female
## 694 Male Male Male
## 695 Male Male Male
## 696 Male Male Male
## 701 Male Male Male
## 704 Male Male Male
## 707 Male Male Male
## 713 Male Female Male
## 714 Male Male Male
## 729 Male Male Male
## 732 Male Male Male
## 745 Male Male Male
## 746 Male Female Other
## 755 Male Male Male
## 756 Male Male Male
## 759 Male Male Male
## 762 Male Male Female
## 765 Other Male Male
## 771 Male Female Female
## 773 Male Male Male
## 786 Other Female Male
## 789 Female Male Male
## 790 Female Male Male
## 794 Male Male Female
## 798 Male Male Male
## 805 Male Male Male
## 809 Male Male Male
## 810 Male Male Male
## 811 Male Male Male
## 816 Male Male Male
## 817 Male Male Male
## 818 Male Male Male
## 821 Male Male Male
## 822 Male Male Female
## 823 Male Male Male
## 826 Female Female Male
## 828 Male Male Female
## 830 Male Male Male
## 835 Male Male Male
## 836 Male Male Male
## 845 Male Male Male
## 851 Female Female Male
## 852 Male Male Male
## 854 Male Male Male
## 859 Male Male Male
## 876 Female Male Male
## 880 Male Male Male
## 884 Male Male Male
## 895 Male Male Male
## 902 Male Male Male
## 909 Female Male Male
## 910 Male Male Male
## 911 Male Male Male
## 917 Male Male Male
## 919 Male Male Male
## 921 Male Male Male
## 939 Male Male Male
## 946 Male Male Male
## 948 Male Male Male
## 952 Male Male Male
## 962 Male Male Male
## 964 Male Male Male
## 970 Male Male Male
## 974 Male Male Male
## 975 Male Male Female
## 977 Male Male Male
## 981 Male Male Male
## 983 Male Male Male
## 984 Male Male Male
## 987 Male Male Male
## 988 Male Male Male
## 992 Female Male Male
## 993 Male Male Male
## 994 Male Male Male
## 995 Male Male Female
## 996 Male Female Male
## 997 Male Male Male
## 1006 Female Male Male
## 1007 Male Male Male
## 1008 Female Male Male
## 1030 Female Male Male
## 1036 Male Male Male
## 1048 Male Male Male
## 1050 Male Male Male
## 1053 Female Male Male
## 1054 Male Male Female
## 1061 Male Female Male
## 1068 Male Male Female
## 1070 Male Other Male
## 1076 Male Male Male
## 1081 Male Male Male
## 1088 Male Male Male
## 1090 Male Male Male
## 1092 Male Male Male
## 1095 Male Male Male
## 1096 Male Male Male
## 1099 Male Male Male
## 1101 Male Male Male
## 1103 Male Male Male
## 1105 Male Male Male
## 1107 Male Male Male
## 1111 Male Male Male
## 1113 Male Male Male
## 1115 Male Female Male
## 1120 Male Male Male
## 1125 Male Male Male
## 1140 Male Male Male
## 1142 Male Male Male
## 1147 Female Female Male
## 1149 Male Male Female
## 1160 Male Male Male
## 1162 Male Male Male
## 1175 Male Male Male
## 1180 Female Male Male
## 1191 Male Male Other
## 1196 Male Male Male
## 1200 Male Male Male
## 1201 Male Male Male
## 1204 Female Male Male
## 1217 Male Male Male
## 1220 Male Male Male
## 1224 Male Male Male
## 1230 Male Male Male
## 1232 Male Male Male
## 1235 Male Male Male
## 1240 Male Male Male
## 1246 Male Male Male
## 1247 Male Male Male
## 1248 Female Male Male
## 1255 Male Male Male
## 1256 Male Female Male
## 1262 Other Male Other
## 1264 Male Male Male
## 1270 Male Male Male
## 1273 Male Male Male
## 1277 Male Male Male
## 1281 Male Male Male
## 1284 Male Male Male
## 1286 Male Male Male
## 1292 Female Female Male
## 1293 Female Male Male
## 1300 Male Female Male
## 1307 Female Male Male
## 1308 Male Male Male
## 1317 Male Male Male
## 1326 Male Male Male
## 1327 Male Male Male
## 1330 Male Male Male
## 1336 Male Female Male
## 1340 Male Male Male
## 1345 Male Male Male
## 1350 Male Male Male
## 1364 Male Male Male
## 1365 Male Male Male
## 1366 Male Male Male
## 1367 Female Male Male
## 1370 Male Other Female
## 1373 Male Male Female
## 1379 Female Male Female
## 1383 Male Female Male
## 1385 Male Male Male
## 1386 Male Male Male
## 1387 Male Male Male
## 1395 Male Male Female
## 1397 Male Male Male
## 1402 Male Male Male
## 1409 Male Male Male
## 1411 Male Male Male
## 1418 Male Male Female
## 1428 Male Male Female
## 1438 Male Male Male
## 1450 Male Female Male
## 1451 Male Female Female
## 1452 Male Male Female
## 1453 Male Female Male
## 1459 Male Male Male
## 1464 Male Male Male
## 1466 Male Male Male
## 1468 Male Other Male
## 1469 Male Female Male
## 1477 Male Male Male
## 1478 Male Male Male
## 1487 Male Male Male
## 1488 Male Male Male
## 1489 Female Male Male
## 1505 Male Male Other
## 1506 Male Male Male
## 1507 Male Male Male
## 1508 Male Male Male
## 1511 Male Male Male
## 1519 Male Male Male
## 1522 Male Male Male
## 1523 Male Male Male
## 1527 Male Male Female
## 1528 Male Female Female
## 1529 Male Male Male
## 1530 Male Male Male
## 1539 Female Male Male
## 1542 Male Male Other
## 1543 Male Male Male
## 1546 Male Male Male
## 1556 Male Male Male
## 1567 Male Male Male
## 1573 Male Male Male
## 1578 Female Male Female
## 1583 Male Male Male
## 1586 Male Male Male
## 1588 Male Female Female
## 1591 Male Male Male
## 1592 Male Male Male
## 1598 Female Male Male
## 1602 Male Male Male
## 1612 Male Male Male
## 1614 Male Male Male
## 1623 Male Other Male
## 1629 Male Male Male
## 1631 Male Male Male
## 1633 Male Male Male
## 1640 Male Male Male
## 1641 Male Male Male
## 1644 Male Female Female
## 1645 Male Male Male
## 1646 Male Male Male
## 1647 Male Female Male
## 1649 Male Male Male
## 1657 Male Male Male
## 1659 Male Male Male
## 1660 Female Male Male
## 1667 Male Male Male
## 1672 Male Female Female
## 1676 Male Male Male
## 1679 Female Male Male
## 1680 Male Female Other
## 1683 Male Male Male
## 1688 Male Male Male
## 1692 Male Male Male
## 1703 Male Male Male
## 1713 Male Male Male
## 1715 Female Male Female
## 1717 Male Male Male
## 1718 Female Female Male
## 1721 Female Male Male
## 1723 Other Male Male
## 1725 Male Male Male
## 1726 Male Female Male
## 1727 Male Male Male
## 1730 Male Female Male
## 1731 Male Male Male
## 1735 Male Male Male
## 1744 Male Male Male
## 1750 Male Male Male
## 1756 Male Male Male
## 1763 Male Male Male
## 1764 Male Male Male
## 1769 Male Other Male
## 1770 Male Male Male
## 1773 Male Male Male
## 1776 Male Male Female
## 1778 Female Male Male
## 1779 Male Male Male
## 1781 Male Male Male
## 1785 Male Male Male
## 1793 Male Male Male
## 1797 Male Male Male
## 1799 Male Male Male
## 1800 Male Male Male
## 1807 Male Male Male
## 1810 Male Male Male
## 1814 Male Male Male
## 1815 Male Male Male
## 1817 Male Male Male
## 1828 Male Male Male
## 1832 Male Male Male
## 1833 Male Male Male
## 1840 Male Male Female
## 1856 Male Male Male
## 1857 Male Male Male
## 1860 Male Male Male
## 1866 Male Male Male
## 1867 Male Male Male
## 1869 Male Male Male
## 1872 Female Male Male
## 1873 Male Male Male
## 1885 Male Male Male
## 1886 Male Male Male
## 1887 Male Male Female
## 1893 Other Male Male
## 1902 Male Male Male
## 1903 Male Male Male
## 1911 Male Male Male
## 1912 Male Other Male
## 1913 Male Male Male
## 1914 Male Male Male
## 1915 Male Other Male
## 1923 Male Male Male
## 1927 Male Male Male
## 1928 Female Male Male
## 1932 Male Male Male
## 1933 Female Male Male
## 1934 Male Male Male
## 1935 Female Male Male
## 1936 Male Male Female
## 1939 Male Male Male
## 1946 Male Male Male
## 1948 Male Male Male
## 1950 Male Male Male
## 1955 Female Male Male
## 1961 Male Male Other
## 1962 Male Male Female
## 1971 Male Female Male
## 1973 Male Male Male
## 1978 Male Male Male
## 1979 Male Male Male
## 1983 Female Male Male
## 1984 Male Female Female
## 1986 Male Female Male
## 1987 Male Male Female
## 1988 Male Male Male
## 1992 Male Female Male
## 1995 Male Male Male
## 1997 Female Male Male
## 1998 Male Male Male
## 2000 Female Male Male
## 2002 Male Male Male
## 2008 Male Female Male
## 2021 Male Male Male
## 2031 Male Male Male
## 2033 Male Male Male
## 2042 Male Male Male
## 2043 Male Male Female
## 2052 Male Female Male
## 2055 Male Male Male
## 2056 Male Male Male
## 2063 Male Male Male
## 2064 Male Male Male
## 2066 Female Male Male
## 2068 Male Male Male
## 2073 Male Male Female
## 2083 Male Male Male
## 2084 Male Male Male
## 2087 Male Male Male
## 2091 Female Male Male
## 2093 Male Male Female
## 2094 Male Male Female
## 2096 Female Male Male
## 2103 Male Male Other
## 2104 Male Male Male
## 2108 Male Female Female
## 2109 Male Male Male
## 2111 Male Male Male
## 2115 Male Male Male
## 2120 Male Male Male
## 2123 Male Male Male
We choose the first imputed dataset, as its values look more reasonable than the others.
# Extract the first of the three completed datasets from each imputation object.
# complete() is namespace-qualified because tidyr (attached through tidyverse)
# also exports a complete(); an unqualified call depends on package attach order.
final_test_df <- mice::complete(impute, action = 1)
final_train_df <- mice::complete(impute1, action = 1)
# Preview the first rows of the completed test set.
head(final_test_df)
## enrollee_id city city_development_index gender relevant_experience
## 1 32403 city_41 0.827 Male Has relevant experience
## 2 9858 city_103 0.920 Female Has relevant experience
## 3 31806 city_21 0.624 Male No relevant experience
## 4 27385 city_13 0.827 Male Has relevant experience
## 5 27724 city_103 0.920 Male Has relevant experience
## 6 217 city_23 0.899 Male No relevant experience
## enrolled_university education_level major_discipline experience company_size
## 1 Full time course Graduate STEM 9.0 <10
## 2 no_enrollment Graduate STEM 5.0 5000-9999
## 3 no_enrollment High School STEM 0.5 100-500
## 4 no_enrollment Masters STEM 11.0 10-49
## 5 no_enrollment Graduate STEM 21.0 10000+
## 6 Part time course Masters STEM 10.0 10000+
## company_type last_new_job training_hours
## 1 Pvt Ltd 1 21
## 2 Pvt Ltd 1 98
## 3 Pvt Ltd 0 15
## 4 Pvt Ltd 1 39
## 5 Pvt Ltd 5 72
## 6 Pvt Ltd 2 12
# Show the last rows of the completed training set (it keeps the target column).
tail(final_train_df)
## enrollee_id city city_development_index gender
## 19153 29754 city_103 0.920 Female
## 19154 7386 city_173 0.878 Male
## 19155 31398 city_103 0.920 Male
## 19156 24576 city_103 0.920 Male
## 19157 5756 city_65 0.802 Male
## 19158 23834 city_67 0.855 Male
## relevant_experience enrolled_university education_level
## 19153 Has relevant experience no_enrollment Graduate
## 19154 No relevant experience no_enrollment Graduate
## 19155 Has relevant experience no_enrollment Graduate
## 19156 Has relevant experience no_enrollment Graduate
## 19157 Has relevant experience no_enrollment High School
## 19158 No relevant experience no_enrollment Primary School
## major_discipline experience company_size company_type last_new_job
## 19153 Humanities 7.0 10-49 Funded Startup 1
## 19154 Humanities 14.0 100-500 Pvt Ltd 1
## 19155 STEM 14.0 1000-4999 Pvt Ltd 4
## 19156 STEM 21.0 50-99 Pvt Ltd 4
## 19157 Business Degree 0.5 500-999 Pvt Ltd 2
## 19158 STEM 2.0 10-49 Pvt Ltd 1
## training_hours target
## 19153 25 0
## 19154 42 1
## 19155 52 1
## 19156 44 0
## 19157 97 0
## 19158 127 0
# Tabulate missing vs. non-missing cells over the whole completed test set.
table(is.na(final_test_df))
##
## FALSE
## 27677
# Count missing values per column of the completed training set.
# vapply() pins the result to one integer per column, unlike sapply(),
# whose return type depends on its input.
vapply(final_train_df, function(column) sum(is.na(column)), integer(1))
## enrollee_id city city_development_index
## 0 0 0
## gender relevant_experience enrolled_university
## 0 0 0
## education_level major_discipline experience
## 0 0 0
## company_size company_type last_new_job
## 0 0 0
## training_hours target
## 0 0
So there are no missing values in either of the final datasets.
# Re-run the missing-data pattern on the completed test set as a final check.
md.pattern(final_test_df)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## enrollee_id city city_development_index gender relevant_experience
## 2129 1 1 1 1 1
## 0 0 0 0 0
## enrolled_university education_level major_discipline experience
## 2129 1 1 1 1
## 0 0 0 0
## company_size company_type last_new_job training_hours
## 2129 1 1 1 1 0
## 0 0 0 0 0
# Gender counts in the completed test set, with the count printed above each bar.
# after_stat(count) replaces the deprecated ..count.. notation.
ggplot(final_test_df, aes(x = gender)) +
  geom_bar(fill = "blue") +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
  ggtitle("Gender Distribution") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Gender")
# Same chart for the completed training set.
ggplot(final_train_df, aes(x = gender)) +
  geom_bar(fill = "blue") +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
  ggtitle("Gender Distribution") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Gender")
# Education level counts in the completed test set, one facet per gender.
# The x axis shows education_level, so it is labelled accordingly
# (it was previously labelled "Gender").
# after_stat(count) replaces the deprecated ..count.. notation.
ggplot(final_test_df, aes(x = education_level, fill = gender)) +
  geom_bar() +
  facet_wrap(~gender) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
  ggtitle("Education Level with Gender after Imputation") +
  xlab("Education Level") +
  ylab("Count")
# Same chart for the completed training set.
ggplot(final_train_df, aes(x = education_level, fill = gender)) +
  geom_bar() +
  facet_wrap(~gender) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
  ggtitle("Education Level with Gender after Imputation") +
  xlab("Education Level") +
  ylab("Count")
# Gender counts in the completed test set, one facet per relevant-experience group.
# after_stat(count) replaces the deprecated ..count.. notation.
ggplot(final_test_df, aes(x = gender, fill = relevant_experience)) +
  geom_bar() +
  facet_wrap(~relevant_experience) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
  ggtitle("Relevant experience with gender after Imputation") +
  xlab("Gender") +
  ylab("Count")
# Same chart for the completed training set.
ggplot(final_train_df, aes(x = gender, fill = relevant_experience)) +
  geom_bar() +
  facet_wrap(~relevant_experience) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
  ggtitle("Relevant experience with gender after Imputation") +
  xlab("Gender") +
  ylab("Count")
The dataset was already divided into test and train sets. It has many NAs and a few misspelled and misformatted values. I corrected the common misspellings and mutated a few variables to make the dataset easier to work with. To get rid of the missing values, I used the mice package for imputation; for that purpose, I converted the character variables to factors. Since there are 3 imputed datasets, I selected the first one using the complete() function.
# Attach the plotting packages before first use. library() stops with an error
# when a package is missing, whereas require() only returns FALSE.
library(ggplot2)
library(scales)
library(tidyverse)

# Read the raw training data, turning blank fields into NA at read time.
# (Piping the whole data frame into na_if("") fails on dplyr >= 1.1, where
# na_if() requires the replacement value to match the type of its input.)
train <- read.csv("Data/aug_train.csv", na.strings = c("", "NA"))

# Open the data viewer only in an interactive session; it is not available
# when the document is rendered non-interactively.
if (interactive()) {
  View(train)
}
# Recruitment ----
# Recruitment refers to the process of identifying, attracting, interviewing,
# selecting, hiring and onboarding employees.
# This study focuses on the preliminary stage of the recruitment process.
# Tangible factors are assessed and analysed.
# A few important factors in an employee's decision making are considered here.

# Bar chart of the share of rows falling in each category of one column,
# with the percentage printed above each bar.
#
# data:   data frame holding the column
# column: name of the column to plot, given as a string
# fill:   fill colour of the bars
# title:  plot title
# Returns the ggplot object; a top-level call prints it.
# after_stat() replaces the deprecated ..count.. notation.
plot_share <- function(data, column, fill, title) {
  ggplot(data, aes(x = .data[[column]])) +
    geom_bar(aes(y = after_stat(count / sum(count))), fill = fill) +
    geom_text(
      aes(
        y = after_stat(count / sum(count)),
        label = after_stat(scales::percent(count / sum(count)))
      ),
      stat = "count",
      vjust = -0.25
    ) +
    scale_y_continuous(labels = scales::percent) +
    labs(title = title, y = "Percent", x = " ")
}

#1 Gender
plot_share(train, "gender", "light blue", "Factor 1 : Gender Distribution")
#2 Relevent Experience
plot_share(train, "relevent_experience", "light yellow", "Factor 2 : Relevent Experience")
#3 Enrolled University
plot_share(train, "enrolled_university", "brown", "Factor 3 : Enrolled University")
#4 Education Level
plot_share(train, "education_level", "light pink", "Factor 4 : Education Level")
#5 Major Discipline
plot_share(train, "major_discipline", "brown", "Factor 5 : Major Discipline")
# load package ----
library(tidyverse)
library(ggplot2)
library(caret)
library(dplyr)
# read data ----
train_df <- read.csv(file = "Data/final_train_df2.csv")
test_df <- read.csv(file = "Data/final_test_df2.csv")

# Change target class from integer to factor
train_df$target <- factor(train_df$target)

# Fix the RNG seed so the 80/20 split below is the same on every run
# (same seed value as the model fits later in the script).
set.seed(7)
# Row indices of an 80% sample of the training data, drawn on the target column
validation_index <- createDataPartition(train_df$target, p = 0.80, list = FALSE)
# Hold out the remaining 20% of the rows for validation
validation <- train_df[-validation_index, ]
# Keep the sampled 80% for training and cross-validating the models
train_df <- train_df[validation_index, ]

# Run algorithms using 10-fold cross validation, compared on accuracy
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"
To test which algorithm performs best, four different algorithms are used:

+ Linear Discriminant Analysis (LDA)
+ Classification and Regression Trees (CART)
+ k-Nearest Neighbors (kNN)
+ Random Forest (RF)

The same set.seed number is used before each fit to ensure that every algorithm is evaluated on exactly the same data splits.
# Fit one caret model on train_df with the shared metric and resampling control.
# The seed is reset before every fit, as in the comparison described above.
fit_with_seed <- function(algorithm) {
  set.seed(7)
  train(target ~ ., data = train_df, method = algorithm, metric = metric, trControl = control)
}

# a) linear algorithm: LDA
fit.lda <- fit_with_seed("lda")
# b) nonlinear algorithms: CART and kNN
fit.cart <- fit_with_seed("rpart")
fit.knn <- fit_with_seed("knn")
# c) advanced algorithm: Random Forest
fit.rf <- fit_with_seed("rf")

# Collect the resampling results of all four models and summarize them
results <- resamples(list(lda = fit.lda, cart = fit.cart, knn = fit.knn, rf = fit.rf))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, cart, knn, rf
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.7671233 0.7762728 0.7791192 0.7803602 0.7831773 0.7964775 2
## cart 0.7651663 0.7772703 0.7787206 0.7824754 0.7849217 0.8049576 0
## knn 0.7186684 0.7246329 0.7279843 0.7267567 0.7296460 0.7332029 0
## rf 0.7625571 0.7706228 0.7745517 0.7762769 0.7814386 0.7932159 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.30805294 0.34807543 0.35199960 0.356271887 0.3653595 0.39819873 2
## cart 0.29469500 0.33947991 0.35226704 0.357971015 0.3747790 0.41592707 0
## knn -0.01265567 0.00384776 0.01047221 0.008539166 0.0159108 0.02514956 0
## rf 0.26963118 0.30962881 0.32413087 0.324114364 0.3444705 0.36838719 0
# compare accuracy of models
# Dot plot of the resampled metrics for the four fitted models.
dotplot(results)
# summarize Best Model
# NOTE(review): in the resampling summary printed above, CART has a slightly
# higher mean accuracy than LDA and LDA reports 2 NA resamples - confirm that
# LDA is the intended "best" model.
print(fit.lda)
## Linear Discriminant Analysis
##
## 15327 samples
## 14 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 13794, 13794, 13794, 13795, 13794, 13795, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7803602 0.3562719
# estimate skill of LDA on the validation dataset
# na.action = na.pass presumably keeps rows with missing values in the
# prediction input instead of dropping them - TODO confirm.
predictions <- predict(fit.lda, validation, na.action = na.pass)
# Cross-tabulate predicted against actual labels of the held-out rows.
# NOTE(review): the printed 'Positive' class is 0; if candidates looking for a
# job change (1) are the class of interest, confirm whether positive = "1"
# should be passed so sensitivity/specificity refer to that class.
confusionMatrix(predictions, validation$target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2579 542
## 1 297 413
##
## Accuracy : 0.781
## 95% CI : (0.7676, 0.794)
## No Information Rate : 0.7507
## P-Value [Acc > NIR] : 6.145e-06
##
## Kappa : 0.36
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8967
## Specificity : 0.4325
## Pos Pred Value : 0.8263
## Neg Pred Value : 0.5817
## Prevalence : 0.7507
## Detection Rate : 0.6732
## Detection Prevalence : 0.8147
## Balanced Accuracy : 0.6646
##
## 'Positive' Class : 0
##
# LDA prediction on test data
# type = "raw" returns the predicted class label for each test row.
prdtest <- predict(fit.lda, test_df, type = "raw")
# Attach the predicted label to the test set as its target column.
test_df$target <- prdtest
# Number of test candidates predicted in each class.
summary(prdtest)
## 0 1
## 1714 415
# No confusion matrix is computed for the test set: its target column holds
# the predictions themselves, so comparing prdtest with test_df$target would
# always report 100% accuracy. The accuracy estimate for this model is the one
# obtained on the held-out validation data.