============================================================================================================
Data source: https://www.aeaweb.org/articles?id=10.1257/0002828042002561
| Decision Tree (C5.0) | Random Forest | Decision Tree (rpart) | Naïve Bayes Classifier | Linear SVM | Polynomial SVM | Radial Basis SVM | k-Nearest Neighbors | |
|---|---|---|---|---|---|---|---|---|
| Training | 91.41% | 92.63% | 90.19% | 79.45% | 79.88% | 79.93% | 88.29% | 78.37% |
| Test | 90.22% | 91.41% | 90.73% | 77.64% | 77.64% | 77.47% | 86.48% | 76.79% |
Step 1: Collecting the data
rawdata <- read.csv("~/Documents/HU/ANLY 525-50-B/525 paper/lakisha_aer.csv")
table(rawdata$firstname, rawdata$race) %>% kable() %>% kable_styling(font_size=11, full_width=F)
| b | w | |
|---|---|---|
| Aisha | 180 | 0 |
| Allison | 0 | 232 |
| Anne | 0 | 242 |
| Brad | 0 | 63 |
| Brendan | 0 | 65 |
| Brett | 0 | 59 |
| Carrie | 0 | 168 |
| Darnell | 42 | 0 |
| Ebony | 208 | 0 |
| Emily | 0 | 227 |
| Geoffrey | 0 | 59 |
| Greg | 0 | 51 |
| Hakim | 55 | 0 |
| Jamal | 61 | 0 |
| Jay | 0 | 67 |
| Jermaine | 52 | 0 |
| Jill | 0 | 203 |
| Kareem | 64 | 0 |
| Keisha | 183 | 0 |
| Kenya | 196 | 0 |
| Kristen | 0 | 213 |
| Lakisha | 200 | 0 |
| Latonya | 230 | 0 |
| Latoya | 226 | 0 |
| Laurie | 0 | 195 |
| Leroy | 64 | 0 |
| Matthew | 0 | 67 |
| Meredith | 0 | 187 |
| Neil | 0 | 76 |
| Rasheed | 67 | 0 |
| Sarah | 0 | 193 |
| Tamika | 256 | 0 |
| Tanisha | 207 | 0 |
| Todd | 0 | 68 |
| Tremayne | 69 | 0 |
| Tyrone | 75 | 0 |
vars <- c("call","h","sex","race","city","education","ofjobs","yearsexp","honors","volunteer","military","empholes","workinschool","email","computerskills","specialskills")
data <- rawdata[vars]
data[vars[-c(3:8)]] <- lapply(data[vars[-c(3:8)]], factor)
Step 2: Bar charts, histogram and descriptive analysis
ggplot(data,aes(education))+geom_bar()+labs(x="Education Level",y="Count")+ggtitle("Fig.1. Distribution of education on the resume")+theme_classic()
ggplot(data,aes(ofjobs))+geom_bar()+labs(x="Number of Jobs",y="Count")+ggtitle("Fig.2. Distribution of jobs on the resume")+theme_classic()
ggplot(data,aes(yearsexp))+geom_histogram(binwidth=1)+labs(x="Number of Years",y="Count")+ggtitle("Fig.3. Distribution of work experience on the resume")+theme_classic()
length(unique(rawdata$adid)) #the number of employment ads
## [1] 1323
#table(rawdata$id,rawdata$race,rawdata$h)
length(unique(rawdata$id))
## [1] 289
#table(rawdata$ad,rawdata$race,rawdata$h)
length(unique(rawdata$ad))
## [1] 303
rawdata[rawdata$id==1,vars] %>% kable() %>% kable_styling(font_size=10, full_width=F)
| call | h | sex | race | city | education | ofjobs | yearsexp | honors | volunteer | military | empholes | workinschool | computerskills | specialskills | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 179 | 0 | 1 | m | w | b | 4 | 6 | 18 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 189 | 0 | 0 | f | b | b | 4 | 5 | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 193 | 0 | 0 | m | w | b | 4 | 4 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 198 | 0 | 1 | f | b | b | 4 | 4 | 14 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 |
prop.table(table(data$race, data$call)) %>% kable(digits=4) %>% kable_styling(full_width=F)
| 0 | 1 | |
|---|---|---|
| b | 0.4678 | 0.0322 |
| w | 0.4517 | 0.0483 |
Hmisc::describe(data)
## data
##
## 16 Variables 4870 Observations
## --------------------------------------------------------------------------------
## call
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 4478 392
## Proportion 0.92 0.08
## --------------------------------------------------------------------------------
## h
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 2424 2446
## Proportion 0.498 0.502
## --------------------------------------------------------------------------------
## sex
## n missing distinct
## 4870 0 2
##
## Value f m
## Frequency 3746 1124
## Proportion 0.769 0.231
## --------------------------------------------------------------------------------
## race
## n missing distinct
## 4870 0 2
##
## Value b w
## Frequency 2435 2435
## Proportion 0.5 0.5
## --------------------------------------------------------------------------------
## city
## n missing distinct
## 4870 0 2
##
## Value b c
## Frequency 2166 2704
## Proportion 0.445 0.555
## --------------------------------------------------------------------------------
## education
## n missing distinct Info Mean Gmd
## 4870 0 5 0.619 3.618 0.5941
##
## lowest : 0 1 2 3 4, highest: 0 1 2 3 4
##
## Value 0 1 2 3 4
## Frequency 46 40 274 1006 3504
## Proportion 0.009 0.008 0.056 0.207 0.720
## --------------------------------------------------------------------------------
## ofjobs
## n missing distinct Info Mean Gmd
## 4870 0 7 0.933 3.661 1.336
##
## lowest : 1 2 3 4 5, highest: 3 4 5 6 7
##
## Value 1 2 3 4 5 6 7
## Frequency 110 704 1429 1611 533 464 19
## Proportion 0.023 0.145 0.293 0.331 0.109 0.095 0.004
## --------------------------------------------------------------------------------
## yearsexp
## n missing distinct Info Mean Gmd .05 .10
## 4870 0 26 0.989 7.843 5.09 2 3
## .25 .50 .75 .90 .95
## 5 6 9 14 19
##
## lowest : 1 2 3 4 5, highest: 22 23 25 26 44
## --------------------------------------------------------------------------------
## honors
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 4613 257
## Proportion 0.947 0.053
## --------------------------------------------------------------------------------
## volunteer
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 2866 2004
## Proportion 0.589 0.411
## --------------------------------------------------------------------------------
## military
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 4397 473
## Proportion 0.903 0.097
## --------------------------------------------------------------------------------
## empholes
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 2688 2182
## Proportion 0.552 0.448
## --------------------------------------------------------------------------------
## workinschool
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 2145 2725
## Proportion 0.44 0.56
## --------------------------------------------------------------------------------
## email
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 2536 2334
## Proportion 0.521 0.479
## --------------------------------------------------------------------------------
## computerskills
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 874 3996
## Proportion 0.179 0.821
## --------------------------------------------------------------------------------
## specialskills
## n missing distinct
## 4870 0 2
##
## Value 0 1
## Frequency 3269 1601
## Proportion 0.671 0.329
## --------------------------------------------------------------------------------
Step 3: Chi-squared Test of Independence
N <- ncol(data)-1 #for the explanatory variables
chiTable <- data.frame(matrix(nrow=N,ncol=N))
for (i in 1:N) {
for (j in 1:N) {
if (chisq.test(table(data[,i+1],data[,j+1]))$p.value<0.05) {
chiTable[i,j] <- "⬤"
} else {
chiTable[i,j] <- "◯"
}
}
}
colnames(chiTable) <- c(1:N)
chiTable$Variable <- c(1:N)
chiTable <- chiTable[,c(N+1,1:N)]
#full results of Chi-squared Test of Independence: ⬤ reject null hypothesis; ◯ fail to reject null hypothesis
chiTable %>% kable() %>% kable_styling(full_width=F)
| Variable | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | ⬤ | ◯ | ◯ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 2 | ◯ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 3 | ◯ | ◯ | ⬤ | ◯ | ◯ | ◯ | ◯ | ◯ | ◯ | ◯ | ◯ | ◯ | ◯ | ⬤ | ◯ |
| 4 | ◯ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 5 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 6 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 7 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 8 | ⬤ | ◯ | ◯ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ◯ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ |
| 9 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ |
| 10 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 11 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 12 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 13 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 14 | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ |
| 15 | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ⬤ |
Randomization in sampling
Unbalanced classification problems cause problems to many machine learning algorithms. These problems are characterized by the uneven proportion of the outcome/dependent/target/response variable classes.
The SMOTE (Synthetic Minority Oversampling Technique) function oversamples the rare event by using bootstrapping and k-nearest neighbor to synthetically create additional observations of that event.
The arguments perc.over and perc.under respectively control the amount of over-sampling of the minority class and under-sampling of the majority classes.
rawdata$call <- as.factor(rawdata$call)
set.seed(525)
data <- SMOTE(call~.,rawdata,perc.over=500,perc.under=120)
data <- data[vars[-2]]
rownames(data) <- c(1:nrow(data))
training <- sample(nrow(data), as.integer(nrow(data)*0.75))
dataTraining <- data[training,]
dataTest <- data[-training,]
prop.table(table(dataTraining$call)) %>% kable(digits=4) %>% kable_styling(full_width=F)
| Var1 | Freq |
|---|---|
| 0 | 0.5014 |
| 1 | 0.4986 |
prop.table(table(dataTest$call)) %>% kable(digits=4) %>% kable_styling(full_width=F)
| Var1 | Freq |
|---|---|
| 0 | 0.4957 |
| 1 | 0.5043 |
Inquiry about bootstrapping
nrow(na.omit(rawdata))
## [1] 447
nrow(distinct(rawdata))
## [1] 4870
nrow(distinct(data))
## [1] 3008
#minority x2
set.seed(525)
data1 <- SMOTE(call~.,rawdata,perc.over=100)
#minority x2'
set.seed(525)
data2 <- SMOTE(call~.,data,perc.over=100)
#minority x6
set.seed(525)
data3 <- SMOTE(call~.,rawdata,perc.over=500,perc.under=120)
#minority x6'
set.seed(525)
data4 <- SMOTE(call~.,data,perc.over=500,perc.under=120)
nrow(data1)
## [1] 1568
nrow(distinct(data1))
## [1] 1496
nrow(distinct(data2))
## [1] 4713
table(distinct(data1)$call)
##
## 0 1
## 712 784
table(distinct(data2)$call)
##
## 0 1
## 2767 1946
nrow(data3)
## [1] 4704
nrow(distinct(data3))
## [1] 4176
nrow(distinct(data4))
## [1] 12926
table(distinct(data3)$call)
##
## 0 1
## 1827 2349
table(distinct(data4)$call)
##
## 0 1
## 10708 2218
Step 1: Training a model on the data
modelDT <- C5.0(x=dataTraining[-1], y=dataTraining$call)
trainPredDT <- predict(modelDT, dataTraining)
CrossTable(dataTraining$call, trainPredDT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 3528
##
##
## | Predicted
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 1741 | 28 | 1769 |
## | 0.493 | 0.008 | |
## -------------|-----------|-----------|-----------|
## 1 | 275 | 1484 | 1759 |
## | 0.078 | 0.421 | |
## -------------|-----------|-----------|-----------|
## Column Total | 2016 | 1512 | 3528 |
## -------------|-----------|-----------|-----------|
##
##
Step 2: Evaluating model performance
testPredDT <- predict(modelDT, dataTest)
CrossTable(dataTest$call, testPredDT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1176
##
##
## | Predicted
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 562 | 21 | 583 |
## | 0.478 | 0.018 | |
## -------------|-----------|-----------|-----------|
## 1 | 94 | 499 | 593 |
## | 0.080 | 0.424 | |
## -------------|-----------|-----------|-----------|
## Column Total | 656 | 520 | 1176 |
## -------------|-----------|-----------|-----------|
##
##
C5imp(modelDT)
## Overall
## computerskills 100.00
## honors 61.11
## volunteer 44.44
## workinschool 37.05
## empholes 36.14
## specialskills 3.34
## sex 1.70
## race 0.00
## city 0.00
## education 0.00
## ofjobs 0.00
## yearsexp 0.00
## military 0.00
## email 0.00
C5imp(modelDT, metric="splits")
## Overall
## honors 30
## specialskills 20
## computerskills 10
## empholes 10
## sex 10
## volunteer 10
## workinschool 10
## race 0
## city 0
## education 0
## ofjobs 0
## yearsexp 0
## military 0
## email 0
Step 1: Training a model on the data
set.seed(525)
modelRF <- randomForest(call~., data=dataTraining, importance=T)
trainPredRF <- predict(modelRF, dataTraining)
CrossTable(dataTraining$call, trainPredRF, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 3528
##
##
## | Predicted
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 1765 | 4 | 1769 |
## | 0.500 | 0.001 | |
## -------------|-----------|-----------|-----------|
## 1 | 256 | 1503 | 1759 |
## | 0.073 | 0.426 | |
## -------------|-----------|-----------|-----------|
## Column Total | 2021 | 1507 | 3528 |
## -------------|-----------|-----------|-----------|
##
##
Step 2: Evaluating model performance
testPredRF <- predict(modelRF, dataTest)
CrossTable(dataTest$call, testPredRF, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1176
##
##
## | Predicted
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 577 | 6 | 583 |
## | 0.491 | 0.005 | |
## -------------|-----------|-----------|-----------|
## 1 | 95 | 498 | 593 |
## | 0.081 | 0.423 | |
## -------------|-----------|-----------|-----------|
## Column Total | 672 | 504 | 1176 |
## -------------|-----------|-----------|-----------|
##
##
importance(modelRF, type=1)
## MeanDecreaseAccuracy
## sex 22.13874
## race 15.69424
## city 25.79569
## education 21.95828
## ofjobs 26.04194
## yearsexp 32.90873
## honors 35.43818
## volunteer 29.83858
## military 18.32547
## empholes 30.67743
## workinschool 32.40884
## email 25.09637
## computerskills 42.97400
## specialskills 29.85340
#getTree(modelRF,labelVar=T)
Step 1: Training a model on the data
modelRT <- rpart(call~., data=dataTraining)
trainPredRT <- predict(modelRT, dataTraining, type="class")
CrossTable(dataTraining$call, trainPredRT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 3528
##
##
## | Predicted
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 1697 | 72 | 1769 |
## | 0.481 | 0.020 | |
## -------------|-----------|-----------|-----------|
## 1 | 274 | 1485 | 1759 |
## | 0.078 | 0.421 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1971 | 1557 | 3528 |
## -------------|-----------|-----------|-----------|
##
##
rpart.plot(modelRT, digits=4, type=1)
Step 2: Evaluating model performance
testPredRT <- predict(modelRT, dataTest, type="class")
CrossTable(dataTest$call, testPredRT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1176
##
##
## | Predicted
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 565 | 18 | 583 |
## | 0.480 | 0.015 | |
## -------------|-----------|-----------|-----------|
## 1 | 91 | 502 | 593 |
## | 0.077 | 0.427 | |
## -------------|-----------|-----------|-----------|
## Column Total | 656 | 520 | 1176 |
## -------------|-----------|-----------|-----------|
##
##
modelRT$variable.importance
## computerskills volunteer honors email ofjobs
## 614.20139 404.29854 260.87263 238.43622 230.38539
## workinschool empholes specialskills education yearsexp
## 214.74672 178.67514 177.61838 124.56380 49.23174
Step 1: Training a model on the data
modelNB <- naive_bayes(call~., data=dataTraining)
trainPredNB <- predict(modelNB, dataTraining, type="class")
CrossTable(dataTraining$call, trainPredNB, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 3528
##
##
## | Predicted
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 1494 | 275 | 1769 |
## | 0.423 | 0.078 | |
## -------------|-----------|-----------|-----------|
## 1 | 450 | 1309 | 1759 |
## | 0.128 | 0.371 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1944 | 1584 | 3528 |
## -------------|-----------|-----------|-----------|
##
##
Step 2: Evaluating model performance
testPredNB <- predict(modelNB, dataTest, type="class")
CrossTable(dataTest$call, testPredNB, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1176
##
##
## | Predicted
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 481 | 102 | 583 |
## | 0.409 | 0.087 | |
## -------------|-----------|-----------|-----------|
## 1 | 161 | 432 | 593 |
## | 0.137 | 0.367 | |
## -------------|-----------|-----------|-----------|
## Column Total | 642 | 534 | 1176 |
## -------------|-----------|-----------|-----------|
##
##
Step 1: Training a model on the data
tryKernel <- function(train, test, kern) {
set.seed(525)
model <- ksvm(call~., data=train, kernel=kern)
trainAccuracy <- round((1-model@error)*100,2)
pred <- predict(model, test)
table <- table(test$call==pred)
testAccuracy <- round((table[2]/(table[1]+table[2]))*100,2)
names(testAccuracy) <- NULL
return(c(trainAccuracy, testAccuracy))
}
modelSVM <- ksvm(call~., data=dataTraining)
trainPredSVM <- predict(modelSVM, dataTraining)
CrossTable(dataTraining$call, trainPredSVM, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 3528
##
##
## | Predicted
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 1737 | 32 | 1769 |
## | 0.492 | 0.009 | |
## -------------|-----------|-----------|-----------|
## 1 | 380 | 1379 | 1759 |
## | 0.108 | 0.391 | |
## -------------|-----------|-----------|-----------|
## Column Total | 2117 | 1411 | 3528 |
## -------------|-----------|-----------|-----------|
##
##
Step 2: Evaluating model performance
result <- data.frame("Accuracy"=c("Training (%)", "Test (%)"),
"Linear"=tryKernel(dataTraining,dataTest,kern="vanilladot"),
"Polynomial"=tryKernel(dataTraining,dataTest,kern="polydot"),
"RBF"=tryKernel(dataTraining,dataTest,kern="rbfdot"))
## Setting default kernel parameters
## Setting default kernel parameters
result %>% kable() %>% kable_styling(full_width=F)
| Accuracy | Linear | Polynomial | RBF |
|---|---|---|---|
| Training (%) | 79.88 | 79.93 | 88.29 |
| Test (%) | 77.64 | 77.47 | 86.48 |
testPredSVM <- predict(modelSVM, dataTest)
CrossTable(dataTest$call, testPredSVM, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1176
##
##
## | Predicted
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 561 | 22 | 583 |
## | 0.477 | 0.019 | |
## -------------|-----------|-----------|-----------|
## 1 | 137 | 456 | 593 |
## | 0.116 | 0.388 | |
## -------------|-----------|-----------|-----------|
## Column Total | 698 | 478 | 1176 |
## -------------|-----------|-----------|-----------|
##
##
Evaluating model performance
dataTraining[,-c(6:8)] <- lapply(dataTraining[,-c(6:8)], as.numeric)
dataTest[,-c(6:8)] <- lapply(dataTest[,-c(6:8)], as.numeric)
set.seed(525)
trainPredKNN <- knn(train=dataTraining[-1], test=dataTraining[-1], cl=dataTraining[,1], k=as.integer(sqrt(nrow(dataTraining))))
CrossTable(dataTraining$call, trainPredKNN, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (KNN)"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 3528
##
##
## | Predicted (KNN)
## Actual | 1 | 2 | Row Total |
## -------------|-----------|-----------|-----------|
## 1 | 1433 | 336 | 1769 |
## | 0.406 | 0.095 | |
## -------------|-----------|-----------|-----------|
## 2 | 427 | 1332 | 1759 |
## | 0.121 | 0.378 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1860 | 1668 | 3528 |
## -------------|-----------|-----------|-----------|
##
##
testPredKNN <- knn(train=dataTraining[-1], test=dataTest[-1], cl=dataTraining[,1], k=as.integer(sqrt(nrow(dataTraining))))
CrossTable(dataTest$call, testPredKNN, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (KNN)"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1176
##
##
## | Predicted (KNN)
## Actual | 1 | 2 | Row Total |
## -------------|-----------|-----------|-----------|
## 1 | 467 | 116 | 583 |
## | 0.397 | 0.099 | |
## -------------|-----------|-----------|-----------|
## 2 | 157 | 436 | 593 |
## | 0.134 | 0.371 | |
## -------------|-----------|-----------|-----------|
## Column Total | 624 | 552 | 1176 |
## -------------|-----------|-----------|-----------|
##
##