============================================================================================================
About: This document is also available at http://rpubs.com/sherloconan/570470
Data source: https://www.kaggle.com/sherloconan/anly-53053b
| Decision Tree (C5.0) | Random Forest | Decision Tree (rpart) | Naïve Bayes Classifier | Linear SVM | Polynomial SVM | Radial Basis SVM | k-Nearest Neighbors | |
|---|---|---|---|---|---|---|---|---|
| Training | 75.91% | 93.66% | 76.96% | 76.11% | 75.91% | 75.91% | 75.91% | 76.01% |
| Test | 75.57% | 75.26% | 76.17% | 75.57% | 75.57% | 75.57% | 75.57% | 76.02% |
Step 1: Collecting the data
rawdata <- read.csv("~/Documents/HU/ANLY 530-53-B/Project Description/thornton_hiv_got.csv") #*
vars <- c("got", "any", "tinc", "distvct", "male", "age", "hiv2004", "site", "mar", "tb", "thinktreat") #*
data <- rawdata[vars]
colnames(data)[1] <- "target"
data <- na.omit(data)
Step 2: Histogram and descriptive analysis
f1 <- ggplot(data,aes(tinc))+geom_histogram(binwidth=10)+labs(x="Total value of the incentive (kwacha)",y="Count")+ggtitle("Fig.1 Distribution")+theme_classic() #*
f2 <- ggplot(data,aes(distvct))+geom_histogram(binwidth=0.2)+labs(x="Distance (km)",y="Count")+ggtitle("Fig.2 Distribution")+theme_classic() #*
f3 <- ggplot(data,aes(age))+geom_histogram(binwidth=1)+labs(x="Age",y="Count")+ggtitle("Fig.3 Distribution")+theme_classic() #*
f4 <- ggplot(aes(tinc, target), data=aggregate(target~tinc, data=data, mean))+ylim(0,1)+geom_point()+labs(x="Total value of the incentive (kwacha)",y="Mean response")+ggtitle("Fig.4 Scatter plot")+theme_classic() #*
gridExtra::grid.arrange(f1,f2,f3,f4, nrow=2) #*
prop.table(table(data$target)) %>% kable(digits=4) %>% kable_styling(full_width=F)
| Var1 | Freq |
|---|---|
| 0 | 0.315 |
| 1 | 0.685 |
backup <- data
data[c("target", "any", "male", "hiv2004", "site", "mar", "tb", "thinktreat")] <- lapply(data[c("target", "any", "male", "hiv2004", "site", "mar", "tb", "thinktreat")], as.factor) #*
Hmisc::describe(data)
## data
##
## 11 Variables 2651 Observations
## --------------------------------------------------------------------------------
## target
## n missing distinct
## 2651 0 2
##
## Value 0 1
## Frequency 835 1816
## Proportion 0.315 0.685
## --------------------------------------------------------------------------------
## any
## n missing distinct
## 2651 0 2
##
## Value 0 1
## Frequency 592 2059
## Proportion 0.223 0.777
## --------------------------------------------------------------------------------
## tinc
## n missing distinct Info Mean Gmd .05 .10
## 2651 0 27 0.979 105.6 106 0 0
## .25 .50 .75 .90 .95
## 20 100 200 250 300
##
## lowest : 0 10 20 30 40, highest: 230 240 250 260 300
## --------------------------------------------------------------------------------
## distvct
## n missing distinct Info Mean Gmd .05 .10
## 2651 0 2004 1 2.019 1.422 0.4062 0.5982
## .25 .50 .75 .90 .95
## 1.0288 1.6911 2.8034 3.9795 4.5979
##
## lowest : 0.00000000 0.03288736 0.03637984 0.03829136 0.03986998
## highest: 5.13910870 5.14017820 5.18828110 5.18977120 5.19155880
## --------------------------------------------------------------------------------
## male
## n missing distinct
## 2651 0 2
##
## Value 0 1
## Frequency 1411 1240
## Proportion 0.532 0.468
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 2651 0 67 0.999 33.32 15.38 16 17
## .25 .50 .75 .90 .95
## 22 32 43 52 57
##
## lowest : 11 12 13 14 15, highest: 73 74 75 76 80
## --------------------------------------------------------------------------------
## hiv2004
## n missing distinct
## 2651 0 3
##
## Value -1 0 1
## Frequency 12 2469 170
## Proportion 0.005 0.931 0.064
## --------------------------------------------------------------------------------
## site
## n missing distinct
## 2651 0 3
##
## Value 1 2 3
## Frequency 705 1002 944
## Proportion 0.266 0.378 0.356
## --------------------------------------------------------------------------------
## mar
## n missing distinct
## 2651 0 2
##
## Value 0 1
## Frequency 770 1881
## Proportion 0.29 0.71
## --------------------------------------------------------------------------------
## tb
## n missing distinct
## 2651 0 2
##
## Value 0 1
## Frequency 2168 483
## Proportion 0.818 0.182
## --------------------------------------------------------------------------------
## thinktreat
## n missing distinct
## 2651 0 2
##
## Value 0 1
## Frequency 1747 904
## Proportion 0.659 0.341
## --------------------------------------------------------------------------------
Step 3: Chi-squared Test of Independence
#REMINDER: "for" loop is not recommended in R
N <- ncol(backup)-1
pairs <- c()
for (i in 1:(ncol(backup)-2)) {
for (j in (i+1):(ncol(backup)-1)) {
if (chisq.test(table(backup[,i+1],backup[,j+1]))$p.value<0.05) {
pairs <- c(pairs, c(i,j))
}
}
} #O(n²)
#pairs of variables that are rejected by Chi-squared Test of Independence
#such variables in a pair have a significant association
table(pairs)
## pairs
## 1 2 3 4 5 6 7 8 9 10
## 3 4 2 3 4 3 8 5 5 3
#REMINDER: declaration with dimensions will speed up the execution in R
chiTable <- data.frame(matrix(nrow=N,ncol=N))
for (i in 1:N) {
for (j in 1:N) {
if (chisq.test(table(backup[,i+1],backup[,j+1]))$p.value<0.05) {
chiTable[i,j] <- "⬤"
} else {
chiTable[i,j] <- "◯"
}
}
}
colnames(chiTable) <- c(1:N)
chiTable$Variable <- c(1:N)
chiTable <- chiTable[,c(N+1,1:N)]
#full results of Chi-squared Test of Independence: ⬤ reject null hypothesis; ◯ fail to reject null hypothesis
chiTable %>% kable() %>% kable_styling(full_width=F)
| Variable | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | ⬤ | ⬤ | ◯ | ◯ | ◯ | ◯ | ⬤ | ◯ | ⬤ | ◯ |
| 2 | ⬤ | ⬤ | ◯ | ◯ | ◯ | ◯ | ⬤ | ◯ | ⬤ | ⬤ |
| 3 | ◯ | ◯ | ⬤ | ◯ | ◯ | ◯ | ⬤ | ⬤ | ◯ | ◯ |
| 4 | ◯ | ◯ | ◯ | ⬤ | ⬤ | ◯ | ◯ | ⬤ | ⬤ | ◯ |
| 5 | ◯ | ◯ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ◯ |
| 6 | ◯ | ◯ | ◯ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ◯ |
| 7 | ⬤ | ⬤ | ⬤ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ |
| 8 | ◯ | ◯ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ⬤ | ◯ | ◯ |
| 9 | ⬤ | ⬤ | ◯ | ⬤ | ◯ | ◯ | ⬤ | ◯ | ⬤ | ⬤ |
| 10 | ◯ | ⬤ | ◯ | ◯ | ◯ | ◯ | ⬤ | ◯ | ⬤ | ⬤ |
Randomization in sampling
data[,c(3,4,6)] <- scale(data[,c(3,4,6)]) #*
set.seed(530)
training <- sample(nrow(data), as.integer(nrow(data)*0.75))
dataTraining <- data[training,]
dataTest <- data[-training,]
prop.table(table(dataTraining$target)) %>% kable(digits=4) %>% kable_styling(full_width=F)
| Var1 | Freq |
|---|---|
| 0 | 0.3169 |
| 1 | 0.6831 |
prop.table(table(dataTest$target)) %>% kable(digits=4) %>% kable_styling(full_width=F)
| Var1 | Freq |
|---|---|
| 0 | 0.3092 |
| 1 | 0.6908 |
Step 1: Training a model on the data
modelDT <- C5.0(x=dataTraining[-1], y=dataTraining$target)
trainPredDT <- predict(modelDT, dataTraining)
CrossTable(dataTraining$target, trainPredDT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (DT)"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1988
##
##
## | Predicted (DT)
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 300 | 330 | 630 |
## | 0.151 | 0.166 | |
## -------------|-----------|-----------|-----------|
## 1 | 149 | 1209 | 1358 |
## | 0.075 | 0.608 | |
## -------------|-----------|-----------|-----------|
## Column Total | 449 | 1539 | 1988 |
## -------------|-----------|-----------|-----------|
##
##
Step 2: Evaluating model performance
testPredDT <- predict(modelDT, dataTest)
CrossTable(dataTest$target, testPredDT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (DT)"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 663
##
##
## | Predicted (DT)
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 93 | 112 | 205 |
## | 0.140 | 0.169 | |
## -------------|-----------|-----------|-----------|
## 1 | 50 | 408 | 458 |
## | 0.075 | 0.615 | |
## -------------|-----------|-----------|-----------|
## Column Total | 143 | 520 | 663 |
## -------------|-----------|-----------|-----------|
##
##
Step 1: Training a model on the data
modelRF <- randomForest(target~., data=dataTraining, importance=T)
trainPredRF <- predict(modelRF, dataTraining)
CrossTable(dataTraining$target, trainPredRF, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (RF)"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1988
##
##
## | Predicted (RF)
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 561 | 69 | 630 |
## | 0.282 | 0.035 | |
## -------------|-----------|-----------|-----------|
## 1 | 57 | 1301 | 1358 |
## | 0.029 | 0.654 | |
## -------------|-----------|-----------|-----------|
## Column Total | 618 | 1370 | 1988 |
## -------------|-----------|-----------|-----------|
##
##
Step 2: Evaluating model performance
testPredRF <- predict(modelRF, dataTest)
CrossTable(dataTest$target, testPredRF, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (RF)"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 663
##
##
## | Predicted (RF)
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 86 | 119 | 205 |
## | 0.130 | 0.179 | |
## -------------|-----------|-----------|-----------|
## 1 | 45 | 413 | 458 |
## | 0.068 | 0.623 | |
## -------------|-----------|-----------|-----------|
## Column Total | 131 | 532 | 663 |
## -------------|-----------|-----------|-----------|
##
##
Step 1: Training a model on the data
modelRT <- rpart(target~., data=dataTraining)
trainPredRT <- predict(modelRT, dataTraining, type="class")
CrossTable(dataTraining$target, trainPredRT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (RT)"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1988
##
##
## | Predicted (RT)
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 265 | 365 | 630 |
## | 0.133 | 0.184 | |
## -------------|-----------|-----------|-----------|
## 1 | 93 | 1265 | 1358 |
## | 0.047 | 0.636 | |
## -------------|-----------|-----------|-----------|
## Column Total | 358 | 1630 | 1988 |
## -------------|-----------|-----------|-----------|
##
##
rpart.plot(modelRT, digits=4, type=1)
Step 2: Evaluating model performance
testPredRT <- predict(modelRT, dataTest, type="class")
CrossTable(dataTest$target, testPredRT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (RT)"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 663
##
##
## | Predicted (RT)
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 72 | 133 | 205 |
## | 0.109 | 0.201 | |
## -------------|-----------|-----------|-----------|
## 1 | 25 | 433 | 458 |
## | 0.038 | 0.653 | |
## -------------|-----------|-----------|-----------|
## Column Total | 97 | 566 | 663 |
## -------------|-----------|-----------|-----------|
##
##
Step 1: Training a model on the data
modelNB <- naive_bayes(target~., data=dataTraining)
trainPredNB <- predict(modelNB, dataTraining, type="class")
CrossTable(dataTraining$target, trainPredNB, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (NB)"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1988
##
##
## | Predicted (NB)
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 307 | 323 | 630 |
## | 0.154 | 0.162 | |
## -------------|-----------|-----------|-----------|
## 1 | 152 | 1206 | 1358 |
## | 0.076 | 0.607 | |
## -------------|-----------|-----------|-----------|
## Column Total | 459 | 1529 | 1988 |
## -------------|-----------|-----------|-----------|
##
##
Step 2: Evaluating model performance
testPredNB <- predict(modelNB, dataTest, type="class")
CrossTable(dataTest$target, testPredNB, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (NB)"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 663
##
##
## | Predicted (NB)
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 94 | 111 | 205 |
## | 0.142 | 0.167 | |
## -------------|-----------|-----------|-----------|
## 1 | 51 | 407 | 458 |
## | 0.077 | 0.614 | |
## -------------|-----------|-----------|-----------|
## Column Total | 145 | 518 | 663 |
## -------------|-----------|-----------|-----------|
##
##
Step 1: Training a model on the data
tryKernel <- function(train, test, kern) {
model <- ksvm(target~., data=train, kernel=kern)
trainAccuracy <- round((1-model@error)*100,2)
pred <- predict(model, test)
table <- table(test$target==pred)
testAccuracy <- round((table[2]/(table[1]+table[2]))*100,2)
names(testAccuracy) <- NULL
return(c(trainAccuracy, testAccuracy))
}
modelSVM <- ksvm(target~., data=dataTraining, kernel="rbfdot")
trainPredSVM <- predict(modelSVM, dataTraining)
CrossTable(dataTraining$target, trainPredSVM, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (SVM)"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1988
##
##
## | Predicted (SVM)
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 300 | 330 | 630 |
## | 0.151 | 0.166 | |
## -------------|-----------|-----------|-----------|
## 1 | 149 | 1209 | 1358 |
## | 0.075 | 0.608 | |
## -------------|-----------|-----------|-----------|
## Column Total | 449 | 1539 | 1988 |
## -------------|-----------|-----------|-----------|
##
##
Step 2: Evaluating model performance
result <- data.frame("Accuracy"=c("Training (%)", "Test (%)"),
"Linear"=tryKernel(dataTraining,dataTest,kern="vanilladot"),
"Polynomial"=tryKernel(dataTraining,dataTest,kern="polydot"),
"RBF"=tryKernel(dataTraining,dataTest,kern="rbfdot"))
## Setting default kernel parameters
## Setting default kernel parameters
result %>% kable() %>% kable_styling(full_width=F)
| Accuracy | Linear | Polynomial | RBF |
|---|---|---|---|
| Training (%) | 75.91 | 75.91 | 75.91 |
| Test (%) | 75.57 | 75.57 | 75.57 |
testPredSVM <- predict(modelSVM, dataTest)
CrossTable(dataTest$target, testPredSVM, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (SVM)"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 663
##
##
## | Predicted (SVM)
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 93 | 112 | 205 |
## | 0.140 | 0.169 | |
## -------------|-----------|-----------|-----------|
## 1 | 50 | 408 | 458 |
## | 0.075 | 0.615 | |
## -------------|-----------|-----------|-----------|
## Column Total | 143 | 520 | 663 |
## -------------|-----------|-----------|-----------|
##
##
Evaluating model performance
trainPredKNN <- knn(train=dataTraining[-1], test=dataTraining[-1], cl=dataTraining[,1], k=as.integer(sqrt(nrow(dataTraining))))
CrossTable(dataTraining$target, trainPredKNN, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (KNN)"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1988
##
##
## | Predicted (KNN)
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 263 | 367 | 630 |
## | 0.132 | 0.185 | |
## -------------|-----------|-----------|-----------|
## 1 | 110 | 1248 | 1358 |
## | 0.055 | 0.628 | |
## -------------|-----------|-----------|-----------|
## Column Total | 373 | 1615 | 1988 |
## -------------|-----------|-----------|-----------|
##
##
testPredKNN <- knn(train=dataTraining[-1], test=dataTest[-1], cl=dataTraining[,1], k=as.integer(sqrt(nrow(dataTraining))))
CrossTable(dataTest$target, testPredKNN, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (KNN)"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 663
##
##
## | Predicted (KNN)
## Actual | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 79 | 126 | 205 |
## | 0.119 | 0.190 | |
## -------------|-----------|-----------|-----------|
## 1 | 33 | 425 | 458 |
## | 0.050 | 0.641 | |
## -------------|-----------|-----------|-----------|
## Column Total | 112 | 551 | 663 |
## -------------|-----------|-----------|-----------|
##
##
Model 1: Decision Tree (C5.0)
C5imp(modelDT)
## Overall
## any 100
## tinc 0
## distvct 0
## male 0
## age 0
## hiv2004 0
## site 0
## mar 0
## tb 0
## thinktreat 0
C5imp(modelDT, metric="splits")
## Overall
## any 100
## tinc 0
## distvct 0
## male 0
## age 0
## hiv2004 0
## site 0
## mar 0
## tb 0
## thinktreat 0
Model 2: Random Forest
importance(modelRF, type=1)
## MeanDecreaseAccuracy
## any 22.8513718
## tinc 39.2694252
## distvct 16.6968812
## male 0.6593123
## age 5.4931463
## hiv2004 2.0147891
## site 22.1241627
## mar 6.7307767
## tb 9.6326492
## thinktreat 5.6722017
Model 3: Decision Tree (rpart)
modelRT$variable.importance
## any tinc distvct site age
## 143.115562 143.115562 10.779899 8.532266 0.161306
dmCommunities <- cluster::daisy(data[-1])
set.seed(530)
data$cluster <- as.factor(kmeans(dmCommunities,2)$cluster)
DT::datatable(data,filter="top")
table(data$cluster)
##
## 1 2
## 980 1671
table(data$target)
##
## 0 1
## 835 1816
table(data$cluster, data$target)
##
## 0 1
## 1 479 501
## 2 356 1315