============================================================================================================
About: This document is also available at http://rpubs.com/sherloconan/570366
Continued on: https://rpubs.com/sherloconan/625258
Data source: https://www.kaggle.com/sherloconan/anly-53053b
Reference: https://www.aeaweb.org/articles?id=10.1257/0002828042002561  

1. Exploratory Data Analysis

 

Step 1: Collecting the data

labor <- read.csv("~/Documents/HU/ANLY 530-53-B/Project Description/lakisha_aer.csv")
table(labor$firstname, labor$race) %>% kable() %>% kable_styling(font_size=11, full_width=F)
b w
Aisha 180 0
Allison 0 232
Anne 0 242
Brad 0 63
Brendan 0 65
Brett 0 59
Carrie 0 168
Darnell 42 0
Ebony 208 0
Emily 0 227
Geoffrey 0 59
Greg 0 51
Hakim 55 0
Jamal 61 0
Jay 0 67
Jermaine 52 0
Jill 0 203
Kareem 64 0
Keisha 183 0
Kenya 196 0
Kristen 0 213
Lakisha 200 0
Latonya 230 0
Latoya 226 0
Laurie 0 195
Leroy 64 0
Matthew 0 67
Meredith 0 187
Neil 0 76
Rasheed 67 0
Sarah 0 193
Tamika 256 0
Tanisha 207 0
Todd 0 68
Tremayne 69 0
Tyrone 75 0
vars <- c("call","h","sex","race","education","ofjobs","yearsexp","honors","volunteer","military","empholes","workinschool","email","computerskills","specialskills")
data <- labor[vars]

 

Step 2: Histogram and descriptive analysis

ggplot(data,aes(yearsexp))+geom_histogram(binwidth=1)+labs(x="Years",y="Count")+ggtitle("Number of years of work experience on the resume")+theme_classic()

prop.table(table(data$race, data$call)) %>% kable(digits=4) %>% kable_styling(full_width=F)
0 1
b 0.4678 0.0322
w 0.4517 0.0483
backup <- data
data[] <- lapply(data, as.factor)
Hmisc::describe(data)
## data 
## 
##  15  Variables      4870  Observations
## --------------------------------------------------------------------------------
## call 
##        n  missing distinct 
##     4870        0        2 
##                     
## Value         0    1
## Frequency  4478  392
## Proportion 0.92 0.08
## --------------------------------------------------------------------------------
## h 
##        n  missing distinct 
##     4870        0        2 
##                       
## Value          0     1
## Frequency   2424  2446
## Proportion 0.498 0.502
## --------------------------------------------------------------------------------
## sex 
##        n  missing distinct 
##     4870        0        2 
##                       
## Value          f     m
## Frequency   3746  1124
## Proportion 0.769 0.231
## --------------------------------------------------------------------------------
## race 
##        n  missing distinct 
##     4870        0        2 
##                     
## Value         b    w
## Frequency  2435 2435
## Proportion  0.5  0.5
## --------------------------------------------------------------------------------
## education 
##        n  missing distinct 
##     4870        0        5 
## 
## lowest : 0 1 2 3 4, highest: 0 1 2 3 4
##                                         
## Value          0     1     2     3     4
## Frequency     46    40   274  1006  3504
## Proportion 0.009 0.008 0.056 0.207 0.720
## --------------------------------------------------------------------------------
## ofjobs 
##        n  missing distinct 
##     4870        0        7 
## 
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##                                                     
## Value          1     2     3     4     5     6     7
## Frequency    110   704  1429  1611   533   464    19
## Proportion 0.023 0.145 0.293 0.331 0.109 0.095 0.004
## --------------------------------------------------------------------------------
## yearsexp 
##        n  missing distinct 
##     4870        0       26 
## 
## lowest : 1  2  3  4  5 , highest: 22 23 25 26 44
## --------------------------------------------------------------------------------
## honors 
##        n  missing distinct 
##     4870        0        2 
##                       
## Value          0     1
## Frequency   4613   257
## Proportion 0.947 0.053
## --------------------------------------------------------------------------------
## volunteer 
##        n  missing distinct 
##     4870        0        2 
##                       
## Value          0     1
## Frequency   2866  2004
## Proportion 0.589 0.411
## --------------------------------------------------------------------------------
## military 
##        n  missing distinct 
##     4870        0        2 
##                       
## Value          0     1
## Frequency   4397   473
## Proportion 0.903 0.097
## --------------------------------------------------------------------------------
## empholes 
##        n  missing distinct 
##     4870        0        2 
##                       
## Value          0     1
## Frequency   2688  2182
## Proportion 0.552 0.448
## --------------------------------------------------------------------------------
## workinschool 
##        n  missing distinct 
##     4870        0        2 
##                     
## Value         0    1
## Frequency  2145 2725
## Proportion 0.44 0.56
## --------------------------------------------------------------------------------
## email 
##        n  missing distinct 
##     4870        0        2 
##                       
## Value          0     1
## Frequency   2536  2334
## Proportion 0.521 0.479
## --------------------------------------------------------------------------------
## computerskills 
##        n  missing distinct 
##     4870        0        2 
##                       
## Value          0     1
## Frequency    874  3996
## Proportion 0.179 0.821
## --------------------------------------------------------------------------------
## specialskills 
##        n  missing distinct 
##     4870        0        2 
##                       
## Value          0     1
## Frequency   3269  1601
## Proportion 0.671 0.329
## --------------------------------------------------------------------------------

 

Step 3: Chi-squared Test of Independence

#REMINDER: "for" loop is not recommended in R
pairs <- c()
for (i in 1:(ncol(backup)-2)) {
  for (j in (i+1):(ncol(backup)-1)) {
    if (chisq.test(table(backup[,i+1],backup[,j+1]))$p.value<0.05) {
      pairs <- c(pairs, c(i,j))
    }
  }
} #O(n²)
#pairs of variables that are rejected by Chi-squared Test of Independence
#such variables in a pair have a significant association
table(pairs)
## pairs
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 
## 11 10  1 12 12 12  8 10 11 12 12 12 11 10
#REMINDER: declaration with dimensions will speed up the execution in R
chiTable <- data.frame(matrix(nrow=14,ncol=14))
for (i in 1:14) {
  for (j in 1:14) {
    if (chisq.test(table(backup[,i+1],backup[,j+1]))$p.value<0.05) {
      chiTable[i,j] <- "⬤"
    } else {
      chiTable[i,j] <- "◯"
    }
  }
}
colnames(chiTable) <- c(1:14)
chiTable$Variable <- c(1:14)
chiTable <- chiTable[,c(15,1:14)]
#full results of Chi-squared Test of Independence: ⬤ reject null hypothesis; ◯ fail to reject null hypothesis
chiTable %>% kable() %>% kable_styling(full_width=F)
Variable 1 2 3 4 5 6 7 8 9 10 11 12 13 14
1
2
3
4
5
6
7
8
9
10
11
12
13
14

 

 

2. Data Wrangling

 

Randomization in sampling

data <- backup
data$call <- as.factor(data$call)
set.seed(530)
training <- sample(nrow(data), as.integer(nrow(data)*0.75)) #75%, 3652 of 4870
dataTraining <- data[training,]
dataTest <- data[-training,]
prop.table(table(dataTraining$call)) %>% kable(digits=4) %>% kable_styling(full_width=F)
Var1 Freq
0 0.92
1 0.08
prop.table(table(dataTest$call)) %>% kable(digits=4) %>% kable_styling(full_width=F)
Var1 Freq
0 0.9179
1 0.0821

 

 

3. Decision Tree C5.0 (unbalanced)

 

Step 1: Training a model on the data

modelDT <- C5.0(x=dataTraining[-1], y=dataTraining$call)
trainPredDT <- predict(modelDT, dataTraining)
CrossTable(dataTraining$call, trainPredDT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  3652 
## 
##  
##                   | trainPredDT 
## dataTraining$call |         0 | Row Total | 
## ------------------|-----------|-----------|
##                 0 |      3360 |      3360 | 
##                   |     0.920 |           | 
## ------------------|-----------|-----------|
##                 1 |       292 |       292 | 
##                   |     0.080 |           | 
## ------------------|-----------|-----------|
##      Column Total |      3652 |      3652 | 
## ------------------|-----------|-----------|
## 
## 

 

Step 2: Evaluating model performance

testPredDT <- predict(modelDT, dataTest)
CrossTable(dataTest$call, testPredDT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1218 
## 
##  
##               | testPredDT 
## dataTest$call |         0 | Row Total | 
## --------------|-----------|-----------|
##             0 |      1118 |      1118 | 
##               |     0.918 |           | 
## --------------|-----------|-----------|
##             1 |       100 |       100 | 
##               |     0.082 |           | 
## --------------|-----------|-----------|
##  Column Total |      1218 |      1218 | 
## --------------|-----------|-----------|
## 
## 

 

 

4. Random Forest (unbalanced)

 

Step 1: Training a model on the data

modelRF <- randomForest(call~., data=dataTraining, importance=T)
trainPredRF <- predict(modelRF, dataTraining)
CrossTable(dataTraining$call, trainPredRF, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  3652 
## 
##  
##              | Predicted 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |      3360 |         0 |      3360 | 
##              |     0.920 |     0.000 |           | 
## -------------|-----------|-----------|-----------|
##            1 |       284 |         8 |       292 | 
##              |     0.078 |     0.002 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      3644 |         8 |      3652 | 
## -------------|-----------|-----------|-----------|
## 
## 

 

Step 2: Evaluating model performance

testPredRF <- predict(modelRF, dataTest)
CrossTable(dataTest$call, testPredRF, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1218 
## 
##  
##               | testPredRF 
## dataTest$call |         0 | Row Total | 
## --------------|-----------|-----------|
##             0 |      1118 |      1118 | 
##               |     0.918 |           | 
## --------------|-----------|-----------|
##             1 |       100 |       100 | 
##               |     0.082 |           | 
## --------------|-----------|-----------|
##  Column Total |      1218 |      1218 | 
## --------------|-----------|-----------|
## 
## 
importance(modelRF, type=1)
##                MeanDecreaseAccuracy
## h                        14.9811850
## sex                      15.2506583
## race                      0.7955825
## education                18.2286908
## ofjobs                   29.5982489
## yearsexp                 32.5244040
## honors                   18.1259418
## volunteer                14.8749497
## military                 10.1074748
## empholes                 22.3303431
## workinschool             17.1157557
## email                    16.4070666
## computerskills           13.6910030
## specialskills            21.7068915

 

 

5. Decision Tree rpart (unbalanced)

 

Step 1: Training a model on the data

modelRT <- rpart(call~., data=dataTraining)
trainPredRT <- predict(modelRT, dataTraining, type="class")
CrossTable(dataTraining$call, trainPredRT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  3652 
## 
##  
##                   | trainPredRT 
## dataTraining$call |         0 | Row Total | 
## ------------------|-----------|-----------|
##                 0 |      3360 |      3360 | 
##                   |     0.920 |           | 
## ------------------|-----------|-----------|
##                 1 |       292 |       292 | 
##                   |     0.080 |           | 
## ------------------|-----------|-----------|
##      Column Total |      3652 |      3652 | 
## ------------------|-----------|-----------|
## 
## 
rpart.plot(modelRT, digits=4, type=1)

 

Step 2: Evaluating model performance

testPredRT <- predict(modelRT, dataTest, type="class")
CrossTable(dataTest$call, testPredRT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1218 
## 
##  
##               | testPredRT 
## dataTest$call |         0 | Row Total | 
## --------------|-----------|-----------|
##             0 |      1118 |      1118 | 
##               |     0.918 |           | 
## --------------|-----------|-----------|
##             1 |       100 |       100 | 
##               |     0.082 |           | 
## --------------|-----------|-----------|
##  Column Total |      1218 |      1218 | 
## --------------|-----------|-----------|
## 
## 

 

 

6. Naïve Bayes Classifier

 

Step 1: Training a model on the data

modelNB <- naive_bayes(call~., data=dataTraining)
trainPredNB <- predict(modelNB, dataTraining, type="class")
CrossTable(dataTraining$call, trainPredNB, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  3652 
## 
##  
##              | Predicted 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |      3199 |       161 |      3360 | 
##              |     0.876 |     0.044 |           | 
## -------------|-----------|-----------|-----------|
##            1 |       262 |        30 |       292 | 
##              |     0.072 |     0.008 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      3461 |       191 |      3652 | 
## -------------|-----------|-----------|-----------|
## 
## 

 

Step 2: Evaluating model performance

testPredNB <- predict(modelNB, dataTest, type="class")
CrossTable(dataTest$call, testPredNB, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1218 
## 
##  
##              | Predicted 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |      1063 |        55 |      1118 | 
##              |     0.873 |     0.045 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        88 |        12 |       100 | 
##              |     0.072 |     0.010 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      1151 |        67 |      1218 | 
## -------------|-----------|-----------|-----------|
## 
## 

 

 

7. Support Vector Machine (unbalanced)

 

Step 1: Training a model on the data

tryKernel <- function(train, test, kern) {
  model <- ksvm(call~., data=train, kernel=kern)
  trainAccuracy <- round((1-model@error)*100,2)
  pred <- predict(model, test)
  table <- table(test$call==pred)
  testAccuracy <- round((table[2]/(table[1]+table[2]))*100,2)
  names(testAccuracy) <- NULL
  return(c(trainAccuracy, testAccuracy))
}

modelSVM <- ksvm(call~., data=dataTraining)
trainPredSVM <- predict(modelSVM, dataTraining)
CrossTable(dataTraining$call, trainPredSVM, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  3652 
## 
##  
##                   | trainPredSVM 
## dataTraining$call |         0 | Row Total | 
## ------------------|-----------|-----------|
##                 0 |      3360 |      3360 | 
##                   |     0.920 |           | 
## ------------------|-----------|-----------|
##                 1 |       292 |       292 | 
##                   |     0.080 |           | 
## ------------------|-----------|-----------|
##      Column Total |      3652 |      3652 | 
## ------------------|-----------|-----------|
## 
## 

 

Step 2: Evaluating model performance

result <- data.frame("Accuracy"=c("Training (%)", "Test (%)"),
                     "Linear"=tryKernel(dataTraining,dataTest,kern="vanilladot"),
                     "Polynomial"=tryKernel(dataTraining,dataTest,kern="polydot"),
                     "RBF"=tryKernel(dataTraining,dataTest,kern="rbfdot"))
##  Setting default kernel parameters  
##  Setting default kernel parameters
result %>% kable() %>% kable_styling(full_width=F)
Accuracy Linear Polynomial RBF
Training (%) 92.00 92.00 92.00
Test (%) 91.79 91.79 91.79
testPredSVM <- predict(modelSVM, dataTest)
CrossTable(dataTest$call, testPredSVM, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1218 
## 
##  
##               | testPredSVM 
## dataTest$call |         0 | Row Total | 
## --------------|-----------|-----------|
##             0 |      1118 |      1118 | 
##               |     0.918 |           | 
## --------------|-----------|-----------|
##             1 |       100 |       100 | 
##               |     0.082 |           | 
## --------------|-----------|-----------|
##  Column Total |      1218 |      1218 | 
## --------------|-----------|-----------|
## 
## 

 

 

8. k-Nearest Neighbors (unbalanced)

 

Evaluating model performance
The data preparation for KNN often involves three tasks:
(1) Fix all NA or "" values
(2) Convert all factors into a set of booleans, one for each level in the factor
(3) Normalize the values of each variable to the range 0:1 so that no variable’s range has an unduly large impact on the distance measurement.

#testPredKNN <- knn(train=dataTraining[-1], test=dataTest[-1], cl=dataTraining[,1], k=60)
#CrossTable(dataTest$call, testPredKNN, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))