============================================================================================================
About: This document is also available at http://rpubs.com/sherloconan/570366
Continued on: https://rpubs.com/sherloconan/625258
Data source: https://www.kaggle.com/sherloconan/anly-53053b
Reference: https://www.aeaweb.org/articles?id=10.1257/0002828042002561
Step 1: Collecting the data
labor <- read.csv("~/Documents/HU/ANLY 530-53-B/Project Description/lakisha_aer.csv")
table(labor$firstname, labor$race) %>% kable() %>% kable_styling(font_size=11, full_width=F)
| b | w | |
|---|---|---|
| Aisha | 180 | 0 |
| Allison | 0 | 232 |
| Anne | 0 | 242 |
| Brad | 0 | 63 |
| Brendan | 0 | 65 |
| Brett | 0 | 59 |
| Carrie | 0 | 168 |
| Darnell | 42 | 0 |
| Ebony | 208 | 0 |
| Emily | 0 | 227 |
| Geoffrey | 0 | 59 |
| Greg | 0 | 51 |
| Hakim | 55 | 0 |
| Jamal | 61 | 0 |
| Jay | 0 | 67 |
| Jermaine | 52 | 0 |
| Jill | 0 | 203 |
| Kareem | 64 | 0 |
| Keisha | 183 | 0 |
| Kenya | 196 | 0 |
| Kristen | 0 | 213 |
| Lakisha | 200 | 0 |
| Latonya | 230 | 0 |
| Latoya | 226 | 0 |
| Laurie | 0 | 195 |
| Leroy | 64 | 0 |
| Matthew | 0 | 67 |
| Meredith | 0 | 187 |
| Neil | 0 | 76 |
| Rasheed | 67 | 0 |
| Sarah | 0 | 193 |
| Tamika | 256 | 0 |
| Tanisha | 207 | 0 |
| Todd | 0 | 68 |
| Tremayne | 69 | 0 |
| Tyrone | 75 | 0 |
vars <- c("call","h","sex","race","education","ofjobs","yearsexp","honors","volunteer","military","empholes","workinschool","email","computerskills","specialskills")
data <- labor[vars]
Step 2: Histogram and descriptive analysis
ggplot(data,aes(yearsexp))+geom_histogram(binwidth=1)+labs(x="Years",y="Count")+ggtitle("Number of years of work experience on the resume")+theme_classic()
prop.table(table(data$race, data$call)) %>% kable(digits=4) %>% kable_styling(full_width=F)
| 0 | 1 | |
|---|---|---|
| b | 0.4678 | 0.0322 |
| w | 0.4517 | 0.0483 |
backup <- data
data[] <- lapply(data, as.factor)
Hmisc::describe(data)
## data
##
## 15 Variables 4870 Observations
## --------------------------------------------------------------------------------
## call
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 4478 392
## Proportion 0.92 0.08
## --------------------------------------------------------------------------------
## h
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 2424 2446
## Proportion 0.498 0.502
## --------------------------------------------------------------------------------
## sex
## n missing distinct
## 4870 0 2
##
## Value f m
## Frequency 3746 1124
## Proportion 0.769 0.231
## --------------------------------------------------------------------------------
## race
## n missing distinct
## 4870 0 2
##
## Value b w
## Frequency 2435 2435
## Proportion 0.5 0.5
## --------------------------------------------------------------------------------
## education
## n missing distinct
## 4870 0 5
##
## lowest : 0 1 2 3 4, highest: 0 1 2 3 4
##
## Value 0 1 2 3 4
## Frequency 46 40 274 1006 3504
## Proportion 0.009 0.008 0.056 0.207 0.720
## --------------------------------------------------------------------------------
## ofjobs
## n missing distinct
## 4870 0 7
##
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##
## Value 1 2 3 4 5 6 7
## Frequency 110 704 1429 1611 533 464 19
## Proportion 0.023 0.145 0.293 0.331 0.109 0.095 0.004
## --------------------------------------------------------------------------------
## yearsexp
## n missing distinct
## 4870 0 26
##
## lowest : 1 2 3 4 5 , highest: 22 23 25 26 44
## --------------------------------------------------------------------------------
## honors
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 4613 257
## Proportion 0.947 0.053
## --------------------------------------------------------------------------------
## volunteer
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 2866 2004
## Proportion 0.589 0.411
## --------------------------------------------------------------------------------
## military
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 4397 473
## Proportion 0.903 0.097
## --------------------------------------------------------------------------------
## empholes
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 2688 2182
## Proportion 0.552 0.448
## --------------------------------------------------------------------------------
## workinschool
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 2145 2725
## Proportion 0.44 0.56
## --------------------------------------------------------------------------------
## email
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 2536 2334
## Proportion 0.521 0.479
## --------------------------------------------------------------------------------
## computerskills
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 874 3996
## Proportion 0.179 0.821
## --------------------------------------------------------------------------------
## specialskills
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 3269 1601
## Proportion 0.671 0.329
## --------------------------------------------------------------------------------
Step 3: Chi-squared Test of Independence
#REMINDER: "for" loop is not recommended in R
pairs <- c()
for (i in 1:(ncol(backup)-2)) {
for (j in (i+1):(ncol(backup)-1)) {
if (chisq.test(table(backup[,i+1],backup[,j+1]))$p.value<0.05) {
pairs <- c(pairs, c(i,j))
}
}
} #O(n²)
#pairs of variables that are rejected by Chi-squared Test of Independence
#such variables in a pair have a significant association
table(pairs)
## pairs
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 11 10 1 12 12 12 8 10 11 12 12 12 11 10
#REMINDER: declaration with dimensions will speed up the execution in R
chiTable <- data.frame(matrix(nrow=14,ncol=14))
for (i in 1:14) {
for (j in 1:14) {
if (chisq.test(table(backup[,i+1],backup[,j+1]))$p.value<0.05) {
chiTable[i,j] <- "⬤"
} else {
chiTable[i,j] <- "◯"
}
}
}
colnames(chiTable) <- c(1:14)
chiTable$Variable <- c(1:14)
chiTable <- chiTable[,c(15,1:14)]
#full results of Chi-squared Test of Independence: ⬤ reject null hypothesis; ◯ fail to reject null hypothesis
chiTable %>% kable() %>% kable_styling(full_width=F)
| Variable | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | ⬤ | ◯ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 2 | ◯ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 3 | ◯ | ◯ | ⬤ | ◯ | ◯ | ◯ | ◯ | ◯ | ◯ | ◯ | ◯ | ◯ | ⬤ | ◯ |
| 4 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 5 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 6 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 7 | ⬤ | ◯ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ◯ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ |
| 8 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ |
| 9 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 10 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 11 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 12 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 13 | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ |
| 14 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ |
Randomization in sampling
data <- backup
data$call <- as.factor(data$call)
set.seed(530)
training <- sample(nrow(data), as.integer(nrow(data)*0.75)) #75%, 3652 of 4870
dataTraining <- data[training,]
dataTest <- data[-training,]
prop.table(table(dataTraining$call)) %>% kable(digits=4) %>% kable_styling(full_width=F)
| Var1 | Freq |
|---|---|
| 0 | 0.92 |
| 1 | 0.08 |
prop.table(table(dataTest$call)) %>% kable(digits=4) %>% kable_styling(full_width=F)
| Var1 | Freq |
|---|---|
| 0 | 0.9179 |
| 1 | 0.0821 |
Step 1: Training a model on the data
modelDT <- C5.0(x=dataTraining[-1], y=dataTraining$call)
trainPredDT <- predict(modelDT, dataTraining)
CrossTable(dataTraining$call, trainPredDT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 3652
##
##
## | trainPredDT
## dataTraining$call | 0 | Row Total |
## ------------------|-----------|-----------|
## 0 | 3360 | 3360 |
## | 0.920 | |
## ------------------|-----------|-----------|
## 1 | 292 | 292 |
## | 0.080 | |
## ------------------|-----------|-----------|
## Column Total | 3652 | 3652 |
## ------------------|-----------|-----------|
##
##
Step 2: Evaluating model performance
testPredDT <- predict(modelDT, dataTest)
CrossTable(dataTest$call, testPredDT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1218
##
##
## | testPredDT
## dataTest$call | 0 | Row Total |
## --------------|-----------|-----------|
## 0 | 1118 | 1118 |
## | 0.918 | |
## --------------|-----------|-----------|
## 1 | 100 | 100 |
## | 0.082 | |
## --------------|-----------|-----------|
## Column Total | 1218 | 1218 |
## --------------|-----------|-----------|
##
##
Step 1: Training a model on the data
modelRF <- randomForest(call~., data=dataTraining, importance=T)
trainPredRF <- predict(modelRF, dataTraining)
CrossTable(dataTraining$call, trainPredRF, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 3652
##
##
## | Predicted
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 3360 | 0 | 3360 |
## | 0.920 | 0.000 | |
## -------------|-----------|-----------|-----------|
## 1 | 284 | 8 | 292 |
## | 0.078 | 0.002 | |
## -------------|-----------|-----------|-----------|
## Column Total | 3644 | 8 | 3652 |
## -------------|-----------|-----------|-----------|
##
##
Step 2: Evaluating model performance
testPredRF <- predict(modelRF, dataTest)
CrossTable(dataTest$call, testPredRF, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1218
##
##
## | testPredRF
## dataTest$call | 0 | Row Total |
## --------------|-----------|-----------|
## 0 | 1118 | 1118 |
## | 0.918 | |
## --------------|-----------|-----------|
## 1 | 100 | 100 |
## | 0.082 | |
## --------------|-----------|-----------|
## Column Total | 1218 | 1218 |
## --------------|-----------|-----------|
##
##
importance(modelRF, type=1)
## MeanDecreaseAccuracy
## h 14.9811850
## sex 15.2506583
## race 0.7955825
## education 18.2286908
## ofjobs 29.5982489
## yearsexp 32.5244040
## honors 18.1259418
## volunteer 14.8749497
## military 10.1074748
## empholes 22.3303431
## workinschool 17.1157557
## email 16.4070666
## computerskills 13.6910030
## specialskills 21.7068915
Step 1: Training a model on the data
modelRT <- rpart(call~., data=dataTraining)
trainPredRT <- predict(modelRT, dataTraining, type="class")
CrossTable(dataTraining$call, trainPredRT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 3652
##
##
## | trainPredRT
## dataTraining$call | 0 | Row Total |
## ------------------|-----------|-----------|
## 0 | 3360 | 3360 |
## | 0.920 | |
## ------------------|-----------|-----------|
## 1 | 292 | 292 |
## | 0.080 | |
## ------------------|-----------|-----------|
## Column Total | 3652 | 3652 |
## ------------------|-----------|-----------|
##
##
rpart.plot(modelRT, digits=4, type=1)
Step 2: Evaluating model performance
testPredRT <- predict(modelRT, dataTest, type="class")
CrossTable(dataTest$call, testPredRT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1218
##
##
## | testPredRT
## dataTest$call | 0 | Row Total |
## --------------|-----------|-----------|
## 0 | 1118 | 1118 |
## | 0.918 | |
## --------------|-----------|-----------|
## 1 | 100 | 100 |
## | 0.082 | |
## --------------|-----------|-----------|
## Column Total | 1218 | 1218 |
## --------------|-----------|-----------|
##
##
Step 1: Training a model on the data
modelNB <- naive_bayes(call~., data=dataTraining)
trainPredNB <- predict(modelNB, dataTraining, type="class")
CrossTable(dataTraining$call, trainPredNB, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 3652
##
##
## | Predicted
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 3199 | 161 | 3360 |
## | 0.876 | 0.044 | |
## -------------|-----------|-----------|-----------|
## 1 | 262 | 30 | 292 |
## | 0.072 | 0.008 | |
## -------------|-----------|-----------|-----------|
## Column Total | 3461 | 191 | 3652 |
## -------------|-----------|-----------|-----------|
##
##
Step 2: Evaluating model performance
testPredNB <- predict(modelNB, dataTest, type="class")
CrossTable(dataTest$call, testPredNB, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1218
##
##
## | Predicted
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 1063 | 55 | 1118 |
## | 0.873 | 0.045 | |
## -------------|-----------|-----------|-----------|
## 1 | 88 | 12 | 100 |
## | 0.072 | 0.010 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1151 | 67 | 1218 |
## -------------|-----------|-----------|-----------|
##
##
Step 1: Training a model on the data
tryKernel <- function(train, test, kern) {
model <- ksvm(call~., data=train, kernel=kern)
trainAccuracy <- round((1-model@error)*100,2)
pred <- predict(model, test)
table <- table(test$call==pred)
testAccuracy <- round((table[2]/(table[1]+table[2]))*100,2)
names(testAccuracy) <- NULL
return(c(trainAccuracy, testAccuracy))
}
modelSVM <- ksvm(call~., data=dataTraining)
trainPredSVM <- predict(modelSVM, dataTraining)
CrossTable(dataTraining$call, trainPredSVM, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 3652
##
##
## | trainPredSVM
## dataTraining$call | 0 | Row Total |
## ------------------|-----------|-----------|
## 0 | 3360 | 3360 |
## | 0.920 | |
## ------------------|-----------|-----------|
## 1 | 292 | 292 |
## | 0.080 | |
## ------------------|-----------|-----------|
## Column Total | 3652 | 3652 |
## ------------------|-----------|-----------|
##
##
Step 2: Evaluating model performance
result <- data.frame("Accuracy"=c("Training (%)", "Test (%)"),
"Linear"=tryKernel(dataTraining,dataTest,kern="vanilladot"),
"Polynomial"=tryKernel(dataTraining,dataTest,kern="polydot"),
"RBF"=tryKernel(dataTraining,dataTest,kern="rbfdot"))
## Setting default kernel parameters
## Setting default kernel parameters
result %>% kable() %>% kable_styling(full_width=F)
| Accuracy | Linear | Polynomial | RBF |
|---|---|---|---|
| Training (%) | 92.00 | 92.00 | 92.00 |
| Test (%) | 91.79 | 91.79 | 91.79 |
testPredSVM <- predict(modelSVM, dataTest)
CrossTable(dataTest$call, testPredSVM, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1218
##
##
## | testPredSVM
## dataTest$call | 0 | Row Total |
## --------------|-----------|-----------|
## 0 | 1118 | 1118 |
## | 0.918 | |
## --------------|-----------|-----------|
## 1 | 100 | 100 |
## | 0.082 | |
## --------------|-----------|-----------|
## Column Total | 1218 | 1218 |
## --------------|-----------|-----------|
##
##
Evaluating model performance
The data preparation for KNN often involves three tasks:
(1) Fix all NA or "" values
(2) Convert all factors into a set of booleans, one for each level in the factor
(3) Normalize the values of each variable to the range 0:1 so that no variable’s range has an unduly large impact on the distance measurement.
#testPredKNN <- knn(train=dataTraining[-1], test=dataTest[-1], cl=dataTraining[,1], k=60)
#CrossTable(dataTest$call, testPredKNN, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))