============================================================================================================
About: This document is also available at http://rpubs.com/sherloconan/570470
Data source: https://www.kaggle.com/sherloconan/anly-53053b

 

Summary of classifier accuracy on the training and test sets:

Model                      Training     Test
Decision Tree (C5.0)         75.91%   75.57%
Random Forest                93.66%   75.26%
Decision Tree (rpart)        76.96%   76.17%
Naïve Bayes Classifier       76.11%   75.57%
Linear SVM                   75.91%   75.57%
Polynomial SVM               75.91%   75.57%
Radial Basis SVM             75.91%   75.57%
k-Nearest Neighbors          76.01%   76.02%
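
Each accuracy above is the diagonal of the corresponding confusion matrix divided by the total observation count; Random Forest's high training figure reflects evaluation on the same data the trees were grown on. For example, from the C5.0 tables in Section 3:

(300 + 1209) / 1988  # training: 0.7591 -> 75.91%
(93 + 408) / 663     # test:     0.7557 -> 75.57%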

 

1. Exploratory Data Analysis

 

Step 1: Collecting the data

# Packages used throughout this report
library(ggplot2); library(knitr); library(kableExtra); library(gmodels)
library(C50); library(rpart); library(rpart.plot); library(randomForest)
library(naivebayes); library(kernlab); library(class)

rawdata <- read.csv("~/Documents/HU/ANLY 530-53-B/Project Description/thornton_hiv_got.csv") #*
vars <- c("got", "any", "tinc", "distvct", "male", "age", "hiv2004", "site", "mar", "tb", "thinktreat") #*
data <- rawdata[vars]
colnames(data)[1] <- "target"  # rename the outcome ("got") to "target"
data <- na.omit(data)          # keep complete cases only
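
Before dropping incomplete rows, it is worth checking how much each variable contributes to the missingness. A minimal check, reusing rawdata and vars from above:

colSums(is.na(rawdata[vars]))        # per-variable NA counts
mean(!complete.cases(rawdata[vars])) # fraction of rows removed by na.omit()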

 

Step 2: Histogram and descriptive analysis

f1 <- ggplot(data,aes(tinc))+geom_histogram(binwidth=10)+labs(x="Total value of the incentive (kwacha)",y="Count")+ggtitle("Fig.1 Distribution")+theme_classic() #*
f2 <- ggplot(data,aes(distvct))+geom_histogram(binwidth=0.2)+labs(x="Distance (km)",y="Count")+ggtitle("Fig.2 Distribution")+theme_classic() #*
f3 <- ggplot(data,aes(age))+geom_histogram(binwidth=1)+labs(x="Age",y="Count")+ggtitle("Fig.3 Distribution")+theme_classic() #*
f4 <- ggplot(aes(tinc, target), data=aggregate(target~tinc, data=data, mean))+ylim(0,1)+geom_point()+labs(x="Total value of the incentive (kwacha)",y="Mean response")+ggtitle("Fig.4 Scatter plot")+theme_classic() #*
gridExtra::grid.arrange(f1,f2,f3,f4, nrow=2) #*

prop.table(table(data$target)) %>% kable(digits=4) %>% kable_styling(full_width=F)
Var1    Freq
   0   0.315
   1   0.685
backup <- data
data[c("target", "any", "male", "hiv2004", "site", "mar", "tb", "thinktreat")] <- lapply(data[c("target", "any", "male", "hiv2004", "site", "mar", "tb", "thinktreat")], as.factor) #*
Hmisc::describe(data)
## data 
## 
##  11  Variables      2651  Observations
## --------------------------------------------------------------------------------
## target 
##        n  missing distinct 
##     2651        0        2 
##                       
## Value          0     1
## Frequency    835  1816
## Proportion 0.315 0.685
## --------------------------------------------------------------------------------
## any 
##        n  missing distinct 
##     2651        0        2 
##                       
## Value          0     1
## Frequency    592  2059
## Proportion 0.223 0.777
## --------------------------------------------------------------------------------
## tinc 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     2651        0       27    0.979    105.6      106        0        0 
##      .25      .50      .75      .90      .95 
##       20      100      200      250      300 
## 
## lowest :   0  10  20  30  40, highest: 230 240 250 260 300
## --------------------------------------------------------------------------------
## distvct 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     2651        0     2004        1    2.019    1.422   0.4062   0.5982 
##      .25      .50      .75      .90      .95 
##   1.0288   1.6911   2.8034   3.9795   4.5979 
## 
## lowest : 0.00000000 0.03288736 0.03637984 0.03829136 0.03986998
## highest: 5.13910870 5.14017820 5.18828110 5.18977120 5.19155880
## --------------------------------------------------------------------------------
## male 
##        n  missing distinct 
##     2651        0        2 
##                       
## Value          0     1
## Frequency   1411  1240
## Proportion 0.532 0.468
## --------------------------------------------------------------------------------
## age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     2651        0       67    0.999    33.32    15.38       16       17 
##      .25      .50      .75      .90      .95 
##       22       32       43       52       57 
## 
## lowest : 11 12 13 14 15, highest: 73 74 75 76 80
## --------------------------------------------------------------------------------
## hiv2004 
##        n  missing distinct 
##     2651        0        3 
##                             
## Value         -1     0     1
## Frequency     12  2469   170
## Proportion 0.005 0.931 0.064
## --------------------------------------------------------------------------------
## site 
##        n  missing distinct 
##     2651        0        3 
##                             
## Value          1     2     3
## Frequency    705  1002   944
## Proportion 0.266 0.378 0.356
## --------------------------------------------------------------------------------
## mar 
##        n  missing distinct 
##     2651        0        2 
##                     
## Value         0    1
## Frequency   770 1881
## Proportion 0.29 0.71
## --------------------------------------------------------------------------------
## tb 
##        n  missing distinct 
##     2651        0        2 
##                       
## Value          0     1
## Frequency   2168   483
## Proportion 0.818 0.182
## --------------------------------------------------------------------------------
## thinktreat 
##        n  missing distinct 
##     2651        0        2 
##                       
## Value          0     1
## Frequency   1747   904
## Proportion 0.659 0.341
## --------------------------------------------------------------------------------

 

Step 3: Chi-squared Test of Independence

#NOTE: nested "for" loops can be slow in R; a vectorized alternative is sketched after the output below
N <- ncol(backup)-1
pairs <- c()
for (i in 1:(ncol(backup)-2)) {
  for (j in (i+1):(ncol(backup)-1)) {
    if (chisq.test(table(backup[,i+1],backup[,j+1]))$p.value<0.05) {
      pairs <- c(pairs, c(i,j))
    }
  }
} #O(n²)
#variable pairs for which the Chi-squared Test of Independence rejects the null (p < 0.05),
#i.e., pairs with a statistically significant association
table(pairs)
## pairs
##  1  2  3  4  5  6  7  8  9 10 
##  3  4  2  3  4  3  8  5  5  3
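
As a sketch of the vectorized alternative mentioned above (reusing backup; column 1 is the target, so the predictors sit in columns 2 through 11), combn() enumerates the pairs directly:

combos <- combn(2:ncol(backup), 2)  # all 45 predictor column pairs
sig <- apply(combos, 2, function(ix)
  chisq.test(table(backup[, ix[1]], backup[, ix[2]]))$p.value < 0.05)
table(c(combos[, sig] - 1))  # same counts as table(pairs) above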
#NOTE: pre-allocating with known dimensions speeds up execution in R
chiTable <- data.frame(matrix(nrow=N,ncol=N))
for (i in 1:N) {
  for (j in 1:N) {
    if (chisq.test(table(backup[,i+1],backup[,j+1]))$p.value<0.05) {
      chiTable[i,j] <- "⬤"
    } else {
      chiTable[i,j] <- "◯"
    }
  }
}
colnames(chiTable) <- c(1:N)
chiTable$Variable <- c(1:N)
chiTable <- chiTable[,c(N+1,1:N)]
#full results of Chi-squared Test of Independence: ⬤ reject null hypothesis; ◯ fail to reject null hypothesis
chiTable %>% kable() %>% kable_styling(full_width=F)
(10 × 10 table of ⬤/◯ cells, one row and column per variable; see the rendered table at the RPubs link above.)

 

 

2. Data Wrangling

 

Randomization in sampling

data[,c(3,4,6)] <- scale(data[,c(3,4,6)]) #* standardize the numeric variables (tinc, distvct, age)

set.seed(530)
training <- sample(nrow(data), as.integer(nrow(data)*0.75))  # 75/25 train/test split
dataTraining <- data[training,]
dataTest <- data[-training,]
prop.table(table(dataTraining$target)) %>% kable(digits=4) %>% kable_styling(full_width=F)
Var1     Freq
   0   0.3169
   1   0.6831
prop.table(table(dataTest$target)) %>% kable(digits=4) %>% kable_styling(full_width=F)
Var1     Freq
   0   0.3092
   1   0.6908
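
Both partitions closely match the full-sample class balance (31.5% / 68.5%), so simple random sampling is adequate here. If an exactly stratified split were preferred, caret provides one; a minimal sketch, assuming the caret package is installed:

library(caret)
set.seed(530)
idx <- createDataPartition(data$target, p=0.75, list=FALSE)  # stratified on target
strataTraining <- data[idx,]
strataTest <- data[-idx,]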

 

 

3. Decision Tree (C5.0) Model

 

Step 1: Training a model on the data

modelDT <- C5.0(x=dataTraining[-1], y=dataTraining$target)
trainPredDT <- predict(modelDT, dataTraining)
CrossTable(dataTraining$target, trainPredDT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (DT)"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1988 
## 
##  
##              | Predicted (DT) 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |       300 |       330 |       630 | 
##              |     0.151 |     0.166 |           | 
## -------------|-----------|-----------|-----------|
##            1 |       149 |      1209 |      1358 | 
##              |     0.075 |     0.608 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       449 |      1539 |      1988 | 
## -------------|-----------|-----------|-----------|
## 
## 

 

Step 2: Evaluating model performance

testPredDT <- predict(modelDT, dataTest)
CrossTable(dataTest$target, testPredDT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (DT)"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  663 
## 
##  
##              | Predicted (DT) 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        93 |       112 |       205 | 
##              |     0.140 |     0.169 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        50 |       408 |       458 | 
##              |     0.075 |     0.615 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       143 |       520 |       663 | 
## -------------|-----------|-----------|-----------|
## 
## 

 

 

4. Random Forest Model

 

Step 1: Training a model on the data

modelRF <- randomForest(target~., data=dataTraining, importance=T)
trainPredRF <- predict(modelRF, dataTraining)
CrossTable(dataTraining$target, trainPredRF, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (RF)"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1988 
## 
##  
##              | Predicted (RF) 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |       561 |        69 |       630 | 
##              |     0.282 |     0.035 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        57 |      1301 |      1358 | 
##              |     0.029 |     0.654 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       618 |      1370 |      1988 | 
## -------------|-----------|-----------|-----------|
## 
## 

 

Step 2: Evaluating model performance

testPredRF <- predict(modelRF, dataTest)
CrossTable(dataTest$target, testPredRF, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (RF)"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  663 
## 
##  
##              | Predicted (RF) 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        86 |       119 |       205 | 
##              |     0.130 |     0.179 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        45 |       413 |       458 | 
##              |     0.068 |     0.623 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       131 |       532 |       663 | 
## -------------|-----------|-----------|-----------|
## 
## 

 

 

5. Decision Tree (rpart) Model

 

Step 1: Training a model on the data

modelRT <- rpart(target~., data=dataTraining)
trainPredRT <- predict(modelRT, dataTraining, type="class")
CrossTable(dataTraining$target, trainPredRT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (RT)"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1988 
## 
##  
##              | Predicted (RT) 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |       265 |       365 |       630 | 
##              |     0.133 |     0.184 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        93 |      1265 |      1358 | 
##              |     0.047 |     0.636 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       358 |      1630 |      1988 | 
## -------------|-----------|-----------|-----------|
## 
## 
rpart.plot(modelRT, digits=4, type=1)

 

Step 2: Evaluating model performance

testPredRT <- predict(modelRT, dataTest, type="class")
CrossTable(dataTest$target, testPredRT, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (RT)"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  663 
## 
##  
##              | Predicted (RT) 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        72 |       133 |       205 | 
##              |     0.109 |     0.201 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        25 |       433 |       458 | 
##              |     0.038 |     0.653 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        97 |       566 |       663 | 
## -------------|-----------|-----------|-----------|
## 
## 

 

 

6. Naïve Bayes Classifier

 

Step 1: Training a model on the data

modelNB <- naive_bayes(target~., data=dataTraining)
trainPredNB <- predict(modelNB, dataTraining, type="class")
CrossTable(dataTraining$target, trainPredNB, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (NB)"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1988 
## 
##  
##              | Predicted (NB) 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |       307 |       323 |       630 | 
##              |     0.154 |     0.162 |           | 
## -------------|-----------|-----------|-----------|
##            1 |       152 |      1206 |      1358 | 
##              |     0.076 |     0.607 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       459 |      1529 |      1988 | 
## -------------|-----------|-----------|-----------|
## 
## 

 

Step 2: Evaluating model performance

testPredNB <- predict(modelNB, dataTest, type="class")
CrossTable(dataTest$target, testPredNB, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (NB)"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  663 
## 
##  
##              | Predicted (NB) 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        94 |       111 |       205 | 
##              |     0.142 |     0.167 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        51 |       407 |       458 | 
##              |     0.077 |     0.614 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       145 |       518 |       663 | 
## -------------|-----------|-----------|-----------|
## 
## 

 

 

7. Support Vector Machine

 

Step 1: Training a model on the data

tryKernel <- function(train, test, kern) {
  model <- ksvm(target~., data=train, kernel=kern)
  trainAccuracy <- round((1-model@error)*100, 2)        # ksvm stores training error in @error
  pred <- predict(model, test)
  testAccuracy <- round(mean(test$target==pred)*100, 2) # robust even if all predictions agree
  return(c(trainAccuracy, testAccuracy))
}

modelSVM <- ksvm(target~., data=dataTraining, kernel="rbfdot")
trainPredSVM <- predict(modelSVM, dataTraining)
CrossTable(dataTraining$target, trainPredSVM, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (SVM)"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1988 
## 
##  
##              | Predicted (SVM) 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |       300 |       330 |       630 | 
##              |     0.151 |     0.166 |           | 
## -------------|-----------|-----------|-----------|
##            1 |       149 |      1209 |      1358 | 
##              |     0.075 |     0.608 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       449 |      1539 |      1988 | 
## -------------|-----------|-----------|-----------|
## 
## 

 

Step 2: Evaluating model performance

result <- data.frame("Accuracy"=c("Training (%)", "Test (%)"),
                     "Linear"=tryKernel(dataTraining,dataTest,kern="vanilladot"),
                     "Polynomial"=tryKernel(dataTraining,dataTest,kern="polydot"),
                     "RBF"=tryKernel(dataTraining,dataTest,kern="rbfdot"))
##  Setting default kernel parameters  
##  Setting default kernel parameters
result %>% kable() %>% kable_styling(full_width=F)
Accuracy       Linear   Polynomial      RBF
Training (%)    75.91        75.91    75.91
Test (%)        75.57        75.57    75.57
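
All three kernels land on essentially the same accuracy, suggesting the decision boundary is driven by a dominant predictor rather than by kernel choice (see Section 9). A cross-validated error estimate can confirm the result is not an artifact of this particular split; a sketch using kernlab's built-in cross argument:

modelCV <- ksvm(target~., data=dataTraining, kernel="rbfdot", cross=5)
cross(modelCV)  # 5-fold cross-validation error on the training set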
testPredSVM <- predict(modelSVM, dataTest)
CrossTable(dataTest$target, testPredSVM, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (SVM)"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  663 
## 
##  
##              | Predicted (SVM) 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        93 |       112 |       205 | 
##              |     0.140 |     0.169 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        50 |       408 |       458 | 
##              |     0.075 |     0.615 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       143 |       520 |       663 | 
## -------------|-----------|-----------|-----------|
## 
## 

 

 

8. k-Nearest Neighbors Model

 

Evaluating model performance

trainPredKNN <- knn(train=dataTraining[-1], test=dataTraining[-1], cl=dataTraining[,1], k=as.integer(sqrt(nrow(dataTraining)))) # k = floor(sqrt(n)), a common rule of thumb
CrossTable(dataTraining$target, trainPredKNN, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (KNN)"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1988 
## 
##  
##              | Predicted (KNN) 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |       263 |       367 |       630 | 
##              |     0.132 |     0.185 |           | 
## -------------|-----------|-----------|-----------|
##            1 |       110 |      1248 |      1358 | 
##              |     0.055 |     0.628 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       373 |      1615 |      1988 | 
## -------------|-----------|-----------|-----------|
## 
## 
testPredKNN <- knn(train=dataTraining[-1], test=dataTest[-1], cl=dataTraining[,1], k=as.integer(sqrt(nrow(dataTraining))))
CrossTable(dataTest$target, testPredKNN, prop.chisq=F, prop.c=F, prop.r=F, dnn=c("Actual", "Predicted (KNN)"))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  663 
## 
##  
##              | Predicted (KNN) 
##       Actual |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        79 |       126 |       205 | 
##              |     0.119 |     0.190 |           | 
## -------------|-----------|-----------|-----------|
##            1 |        33 |       425 |       458 | 
##              |     0.050 |     0.641 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       112 |       551 |       663 | 
## -------------|-----------|-----------|-----------|
## 
## 
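
Here k = as.integer(sqrt(1988)) = 44. Accuracy is often flat across a broad band of k; a quick sensitivity scan (a sketch reusing the objects above; the k values are illustrative):

for (k in c(5, 15, 25, 45, 65)) {
  pred <- knn(train=dataTraining[-1], test=dataTest[-1], cl=dataTraining[,1], k=k)
  cat("k =", k, "test accuracy =", round(mean(pred==dataTest$target)*100, 2), "%\n")
}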

 

 

9. Feature Importance

 

Model 1: Decision Tree (C5.0)

C5imp(modelDT)
##            Overall
## any            100
## tinc             0
## distvct          0
## male             0
## age              0
## hiv2004          0
## site             0
## mar              0
## tb               0
## thinktreat       0
C5imp(modelDT, metric="splits")
##            Overall
## any            100
## tinc             0
## distvct          0
## male             0
## age              0
## hiv2004          0
## site             0
## mar              0
## tb               0
## thinktreat       0
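
The default metric ("usage") reports the percentage of training cases that pass through a split on each predictor, while metric="splits" reports the percentage of splits that use it. Both put any at 100% and every other variable at 0 because the fitted tree evidently splits only on any.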

 

Model 2: Random Forest

importance(modelRF, type=1)
##            MeanDecreaseAccuracy
## any                  22.8513718
## tinc                 39.2694252
## distvct              16.6968812
## male                  0.6593123
## age                   5.4931463
## hiv2004               2.0147891
## site                 22.1241627
## mar                   6.7307767
## tb                    9.6326492
## thinktreat            5.6722017
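
tinc, any, and site dominate the permutation importance. To see these scores graphically (a minimal sketch; varImpPlot ships with randomForest):

varImpPlot(modelRF, type=1, main="Mean decrease in accuracy")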

 

Model 3: Decision Tree (rpart)

modelRT$variable.importance
##        any       tinc    distvct       site        age 
## 143.115562 143.115562  10.779899   8.532266   0.161306
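
any and tinc receive identical importance, most likely because any is simply an indicator for a positive incentive (tinc > 0) and rpart's variable.importance also credits surrogate splits, so each variable can stand in for the other at the same nodes.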

 

 

10. Clustering

 

dmCommunities <- cluster::daisy(data[-1])  # Gower dissimilarity over the mixed-type predictors
set.seed(530)
data$cluster <- as.factor(kmeans(dmCommunities,2)$cluster)  # k-means applied to the rows of the dissimilarity matrix
DT::datatable(data,filter="top")
table(data$cluster)
## 
##    1    2 
##  980 1671
table(data$target)
## 
##    0    1 
##  835 1816
table(data$cluster, data$target)
##    
##        0    1
##   1  479  501
##   2  356 1315
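
Cluster 2 aligns fairly well with target = 1. The raw agreement rate, treating cluster 1 as target 0 (computed from the table above):

tab <- table(data$cluster, data$target)
sum(diag(tab)) / sum(tab)  # (479 + 1315) / 2651 ≈ 0.677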