Introductory R Class for AMREC

This class is a continuation of our last session, an introduction to R for AMREC. We will be using R Markdown in this training.

Below are the knitr options:
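The original setup chunk is not echoed in this output; a minimal sketch of typical options, assuming standard defaults for a class document:

knitr::opts_chunk$set(echo = TRUE,      # show the code in the knitted output
                      message = FALSE,  # hide package start-up messages
                      warning = FALSE,  # hide warnings
                      comment = "##")   # prefix used for printed results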

Markdown supports LaTeX code. For example, the logistic regression model can be written as:

  1. \(\frac{p}{1-p} = e^{\beta_0 + \beta_1x_1 + \beta_2x_2 + \dots + \beta_px_p}\), or
  2. \(\ln\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1x_1 + \beta_2x_2 + \dots + \beta_px_p\), or
  3. \(p = \frac{1}{1 + e^{-(\beta_0 + \beta_1x_1 + \beta_2x_2 + \dots + \beta_px_p)}}\)

We can see that (2) above is very similar to OLS (ordinary least squares, or simply linear regression), except that for OLS, \(y = \beta_0 + \beta_1x_1 + \beta_2x_2 + \dots + \beta_px_p\).
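As a quick numeric illustration of (2) and (3), base R's plogis() maps a linear predictor (the log-odds) to a probability; the coefficient values below are made up purely for demonstration:

# Hypothetical coefficients, for illustration only
b0 <- -1.5; b1 <- 0.8; x1 <- 2
log_odds <- b0 + b1 * x1            # Equation (2): the linear predictor is the log-odds
c(odds = exp(log_odds),             # Equation (1): the odds
  probability = plogis(log_odds))   # Equation (3): 1 / (1 + exp(-log_odds))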

A) Setting Working Directory

getwd()    # Check path to current directory
## [1] "C:/Users/User/Desktop/R Class for Amrec"
setwd("C:\\Users\\User\\Desktop\\R Class for Amrec")
dir()      # Print items of current directory
## [1] "R-Class-Continuation from Word.pdf" "R-Class-Continuation.docx"         
## [3] "R-Class-Continuation.html"          "R-Class-Continuation.pdf"          
## [5] "R-Class-Continuation.Rmd"           "R-Class-Continuation.tex.bak"      
## [7] "R Class for Amrec.Rproj"            "rsconnect"                         
## [9] "STIData.dta"

B) Loading Packages

Set the chunk options message = FALSE and warning = FALSE to hide the start-up messages from the package-loading process (echo = TRUE still shows the code itself).

library(gmodels)   # Descriptive Stats (Cross-tabulations)
library(haven)     # Loading data from several formats
library(magrittr)  # Piping functions " %>% ", " %$% ", and " %<>% "
library(tidyverse) # Advanced data management and visualization
library(rstatix)   # Advanced data analysis (Descriptive and Inferential Stats)
library(jmv)       # Advanced data analysis (descriptives() function)
library(caret)     # Confusion matrix
library(ROCR)      # Model evaluation
library(rpart)     # Engine for decision trees
library(rpart.plot)         # Visualize decision trees
library(randomForest)       # Engine for random forests
library(rsample)   # For splitting data
library(neuralnet) # ANN
library(nnet)      # Single-hidden-layer neural networks
library(broom)     # Extracting neat results from models

# tinytex::install_tinytex() # For knitting to PDF if you don't have LaTeX 

C) Load Data

STIData <- read_dta("STIData.dta")

View(STIData) # Viewing data

D) Clean the Variables for Use

Data <- STIData %>% select(CaseStatus, A1Age, 
                           A2Occupation, A5MaritalStatus, 
                           N12UseCondom, AlcoholUse, 
                           Weight, Height, Sex)

Data <- Data %>% rename(STI_Status = CaseStatus,
                        Age = A1Age,
                        Occupation = A2Occupation,
                        Marital_Status = A5MaritalStatus,
                        Condom_Use = N12UseCondom,
                        Alcohol_Use = AlcoholUse)

Occup <- c("1 unemployed" = "Unemployed",
           "2 informal" = "Formal",
           "3 formal" = "Formal",
           "4 student" = "Unemployed")

M_Status <- c("1 single" = "Single",
              "2 married" = "Married",
              "3 co-habiting" = "Married",
              "4 divorcee" = "Single",
              "5 widowed" = "Single")

C_Use <- c("1 yes" = "Yes",
           "2 no" = "No",
           "2 No" = "No")

Data <- within(Data, {
  STI_Status <- ifelse(STI_Status == 3, 1, STI_Status)
  STI_Status <- ifelse(STI_Status == 2, 0, STI_Status)
  STI_Status <- factor(STI_Status, 
                       levels = 0:1,
                       labels = c("Negative", "Positive"))
  Occupation <- as.factor(recode(Occupation, !!!Occup))
  Marital_Status <- as.factor(recode(Marital_Status, !!!M_Status))
  Condom_Use <- as.factor(recode(Condom_Use, !!!C_Use))
  Alcohol_Use <- ifelse(Alcohol_Use == 1, "Yes", "No")
  Sex <- as.factor(Sex)})
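A quick, optional sanity check (not part of the original output) confirms that the recoded factor levels came out as intended:

str(Data)                      # Variable types after recoding
lapply(Data[c("STI_Status", "Occupation", "Marital_Status",
              "Condom_Use", "Alcohol_Use", "Sex")], table)   # Level counts per variable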

E) Recap of Descriptives and Inferentials

# Descriptives

Data %>% select (Age, Height, Weight) %>% 
  descriptives(hist = TRUE,     # The function "descriptives()" is from package "jmv"
               violin = TRUE, 
               dot = TRUE, dotType = "jitter", 
               qq = FALSE,
               n = TRUE, 
               missing = TRUE, 
               mean = TRUE, 
               median = TRUE,
               mode = TRUE, 
               sum = TRUE, 
               sd = TRUE, 
               variance = TRUE,
               range = FALSE, 
               min = TRUE, 
               max = TRUE, 
               se = FALSE,
               skew = TRUE, kurt = TRUE)
## 
##  DESCRIPTIVES
## 
##  Descriptives                                                   
##  -------------------------------------------------------------- 
##                           Age          Height       Weight      
##  -------------------------------------------------------------- 
##    N                            227          226          227   
##    Missing                        0            1            0   
##    Mean                    28.03965     160.9336     53.75110   
##    Median                  26.00000     160.0000     51.00000   
##    Mode                    23.00000     157.0000     49.00000   
##    Sum                     6365.000     36371.00     12201.50   
##    Standard deviation      7.184251     8.013323     11.68474   
##    Variance                51.61347     64.21335     136.5331   
##    Minimum                 16.00000     138.0000     32.80000   
##    Maximum                 63.00000     191.0000     100.0000   
##    Skewness                1.597835    0.3187386     1.064346   
##    Std. error skewness    0.1615177    0.1618700    0.1615177   
##    Kurtosis                4.374053    0.7074604     1.550776   
##    Std. error kurtosis    0.3216650    0.3223608    0.3216650   
##  --------------------------------------------------------------
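The rstatix package loaded above gives the same kind of summary in a tidy data frame, if preferred; a minimal sketch:

Data %>% 
  select(Age, Height, Weight) %>% 
  get_summary_stats(type = "full")   # rstatix: n, min, max, median, mean, sd, se, etc.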

# Inferentials 
summary(MLRM <- Data %$% lm(Weight ~ Height + Age))
## 
## Call:
## lm(formula = Weight ~ Height + Age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.076  -8.258  -1.455   6.377  42.889 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -32.15102   15.18101  -2.118   0.0353 *  
## Height        0.50561    0.09161   5.519 9.43e-08 ***
## Age           0.16154    0.10203   1.583   0.1148    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11 on 223 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.126,  Adjusted R-squared:  0.1181 
## F-statistic: 16.07 on 2 and 223 DF,  p-value: 3.015e-07
aov(MLRM)
## Call:
##    aov(formula = MLRM)
## 
## Terms:
##                    Height       Age Residuals
## Sum of Squares   3584.136   303.167 26967.617
## Deg. of Freedom         1         1       223
## 
## Residual standard error: 10.99686
## Estimated effects may be unbalanced
## 1 observation deleted due to missingness
glance(MLRM)   # This function is in the broom package, used to check R^2
## # A tibble: 1 x 11
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <int>  <dbl> <dbl> <dbl>
## 1     0.126         0.118  11.0      16.1 3.01e-7     3  -861. 1730. 1744.
## # ... with 2 more variables: deviance <dbl>, df.residual <int>
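broom's tidy() complements glance() by returning the coefficient table itself as a data frame; a minimal sketch with confidence intervals:

tidy(MLRM, conf.int = TRUE)   # Term-level estimates, standard errors, and 95% CIs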

G) Classification Problems

Split the data into training and validation sets. A common split is 70:30 for training and validation respectively. Here we use the function sample() to assign each row at random with those probabilities.

set.seed(1)
Index <- sample(2, nrow(Data), 
                replace = TRUE, 
                prob = c(0.7, 0.3))
Training <- Data[Index == 1, ]   # Training Set
Validation <- Data[Index == 2, ] # Validation Set
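Because sample() assigns rows at random, the realised split only approximates 70:30. The rsample package loaded above offers an alternative with exact proportions; a minimal sketch (Training2 and Validation2 are just illustrative names, so the sets above are not overwritten):

set.seed(1)
Split <- initial_split(Data, prop = 0.7)   # rsample; stratified splits are also possible via `strata =`
Training2   <- training(Split)
Validation2 <- testing(Split)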

Gi) Logistic Regression

A) Training Logistic Regression

MLLR <- glm(STI_Status ~ Age + Occupation + 
              Sex + Marital_Status + 
              Condom_Use + Alcohol_Use, 
            data = Training, 
            family = binomial(link = "logit"))
summary(MLLR)
## 
## Call:
## glm(formula = STI_Status ~ Age + Occupation + Sex + Marital_Status + 
##     Condom_Use + Alcohol_Use, family = binomial(link = "logit"), 
##     data = Training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9329  -1.0663   0.4801   1.0696   1.9118  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           0.69821    0.87590   0.797 0.425372    
## Age                  -0.05310    0.02805  -1.893 0.058389 .  
## OccupationUnemployed  0.78196    0.42677   1.832 0.066910 .  
## SexMale               0.62774    0.43343   1.448 0.147526    
## Marital_StatusSingle  0.11697    0.43887   0.267 0.789845    
## Condom_UseYes        -1.75679    0.60535  -2.902 0.003707 ** 
## Alcohol_UseYes        1.49454    0.43791   3.413 0.000643 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 216.16  on 155  degrees of freedom
## Residual deviance: 189.66  on 149  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 203.66
## 
## Number of Fisher Scoring iterations: 4
glance(MLLR)
## # A tibble: 1 x 7
##   null.deviance df.null logLik   AIC   BIC deviance df.residual
##           <dbl>   <int>  <dbl> <dbl> <dbl>    <dbl>       <int>
## 1          216.     155  -94.8  204.  225.     190.         149
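Because equation (1) at the top of the document expresses the model on the odds scale, exponentiating the coefficients turns them into odds ratios; a minimal sketch using broom:

tidy(MLLR, exponentiate = TRUE, conf.int = TRUE)   # Odds ratios with 95% CIs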

B) Validating Logistic Regression

Pred_Prob <- predict(MLLR,
                     type = "response",  # Predict probabilities of outcome
                     data = Validation)  # NB: predict.glm() expects `newdata =`; `data =` is
                                         # ignored, so these are fitted probabilities for the
                                         # Training set (see the sketch at the end of this subsection)
summary(Pred_Prob)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1483  0.3821  0.5129  0.5128  0.6216  0.9240
hist(Pred_Prob, 
     col = "yellow", 
     main = "Histogram of Probablities of Status")

Pred_Outcome <- ifelse(Pred_Prob <= 0.5376091, 0, 1) # Dichotomize at a chosen probability cut-off (the best cut-off is derived in section D below)
Pred_Outcome <- factor(Pred_Outcome, 
                       levels = 0:1, 
                       labels = c("Negative", "Positive"))
table(Pred_Outcome)
## Pred_Outcome
## Negative Positive 
##       88       68
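For completeness, a hedged sketch of scoring the held-out Validation set directly (note the newdata argument; Val_Prob and Val_Pred are illustrative names, and rows with missing predictors give NA and are left out of the table):

Val_Prob <- predict(MLLR, newdata = Validation, type = "response")
Val_Pred <- factor(ifelse(Val_Prob > 0.5, "Positive", "Negative"),
                   levels = c("Negative", "Positive"))
table(Predicted = Val_Pred, Observed = Validation$STI_Status)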

C) Evaluating Logistic Regression

confusionMatrix(Pred_Outcome, 
                Training$STI_Status[-1]) # The model dropped 1 observation for missingness, so drop
                                         # one observed value to equalize lengths (a quick fix;
                                         # matching rows explicitly would be safer)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Negative Positive
##   Negative       51       37
##   Positive       24       44
##                                          
##                Accuracy : 0.609          
##                  95% CI : (0.5277, 0.686)
##     No Information Rate : 0.5192         
##     P-Value [Acc > NIR] : 0.01495        
##                                          
##                   Kappa : 0.2218         
##                                          
##  Mcnemar's Test P-Value : 0.12443        
##                                          
##             Sensitivity : 0.6800         
##             Specificity : 0.5432         
##          Pos Pred Value : 0.5795         
##          Neg Pred Value : 0.6471         
##              Prevalence : 0.4808         
##          Detection Rate : 0.3269         
##    Detection Prevalence : 0.5641         
##       Balanced Accuracy : 0.6116         
##                                          
##        'Positive' Class : Negative       
## 
Training$STI_Status
##   [1] Negative Positive Negative Negative Positive Negative Positive Negative
##   [9] Positive Negative Positive Positive Negative Positive Negative Positive
##  [17] Negative Positive Negative Positive Positive Negative Positive Negative
##  [25] Positive Positive Positive Positive Positive Positive Negative Negative
##  [33] Positive Positive Positive Positive Negative Positive Negative Positive
##  [41] Negative Positive Negative Negative Positive Negative Positive Negative
##  [49] Positive Positive Positive Negative Positive Positive Negative Positive
##  [57] Positive Negative Negative Negative Positive Negative Positive Negative
##  [65] Positive Positive Negative Negative Positive Negative Positive Positive
##  [73] Negative Positive Negative Negative Positive Negative Positive Negative
##  [81] Negative Positive Negative Negative Positive Negative Negative Positive
##  [89] Negative Positive Negative Positive Negative Positive Negative Negative
##  [97] Positive Negative Negative Positive Negative Positive Negative Negative
## [105] Positive Positive Positive Negative Positive Negative Positive Negative
## [113] Positive Negative Positive Negative Positive Positive Negative Positive
## [121] Negative Negative Positive Negative Positive Positive Positive Positive
## [129] Negative Negative Negative Negative Negative Positive Positive Negative
## [137] Positive Positive Positive Negative Positive Negative Positive Negative
## [145] Positive Negative Positive Negative Negative Positive Negative Positive
## [153] Negative Positive Negative Negative Positive
## Levels: Negative Positive
pred <- ROCR::prediction(Pred_Prob, 
                         Training$STI_Status[-1])
eval <- performance(pred, "acc")

plot(eval)

D) Best Cutoff for Logistic Regression Model

max <- which.max(slot(eval, "y.values")[[1]]) 
# str(eval); unlist(eval@y.values); max(unlist(eval@y.values))
acc <- slot(eval, "y.values")[[1]][max]
cut <- slot(eval, "x.values")[[1]][max]
print(c(Accuracy = acc, Cutoff = cut))
##  Accuracy Cutoff.63 
## 0.6153846 0.5288155
plot(eval); abline(v = cut, 
                   h = acc, 
                   col = "red", 
                   lty = 3)

E) Receiver Operating Characteristic Curve for Logistic Regression Model

roc <- performance(pred, "tpr", "fpr")
plot(roc, 
     colorize = TRUE, 
     main = "ROC Curve - Logistic Classifier",
     ylab = "Sesnsitivity",
     xlab = "1-Specificity"); abline(a = 0, 
                                     b = 1, 
                                     col = "red", 
                                     lty = 3)

auc <- performance(pred, "auc")
auc <- round(unlist(slot(auc, "y.values")), 4)

legend(0.6, 0.4, auc, title = "AUC", cex = 0.8)  # Area Under Curve

Gii) Decision Trees

A) Training Decision Tree Model

D_Tree <- rpart(STI_Status ~ Age + Occupation + Sex + 
                  Marital_Status + Condom_Use + Alcohol_Use, 
                data = Training, 
                method = "class",
                cp = 0.000001)  # Tiny complexity parameter so the tree is grown very deep
plot(D_Tree)
text(D_Tree, use.n = TRUE)

rpart.plot(D_Tree, col = "red")

B) Validating Decision Tree Model

Pred_DT <- predict(D_Tree, 
                   type = "class", 
                   data = Validation)  # NB: predict.rpart() expects `newdata =`; `data =` is
                                       # ignored, so these are training-set predictions
table(Pred_DT)
## Pred_DT
## Negative Positive 
##       86       71
confusionMatrix(Pred_DT, Training$STI_Status)   
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Negative Positive
##   Negative       59       27
##   Positive       17       54
##                                           
##                Accuracy : 0.7197          
##                  95% CI : (0.6426, 0.7884)
##     No Information Rate : 0.5159          
##     P-Value [Acc > NIR] : 1.509e-07       
##                                           
##                   Kappa : 0.4412          
##                                           
##  Mcnemar's Test P-Value : 0.1748          
##                                           
##             Sensitivity : 0.7763          
##             Specificity : 0.6667          
##          Pos Pred Value : 0.6860          
##          Neg Pred Value : 0.7606          
##              Prevalence : 0.4841          
##          Detection Rate : 0.3758          
##    Detection Prevalence : 0.5478          
##       Balanced Accuracy : 0.7215          
##                                           
##        'Positive' Class : Negative        
## 
Pred_DT <- predict(D_Tree, type = "vector", data = Validation)
pred_dt <- ROCR::prediction(Pred_DT, Training$STI_Status)
eval_dt <- performance(pred_dt, "acc")

# plot(eval_dt)

# ROC Curve
roc <- performance(pred_dt, "tpr", "fpr")
plot(roc, 
     colorize = TRUE, 
     main = "ROC Curve - Decision Tree Classifier",
     ylab = "Sesnsitivity",
     xlab = "1-Specificity"); abline(a = 0, 
                                     b = 1, 
                                     col = "red", 
                                     lty = 3)

C) Best Cutoff for Decision Tree Model

max_dt <- which.max(slot(eval_dt, "y.values")[[1]]) 
# str(eval); unlist(eval@y.values); max(unlist(eval@y.values))
acc_dt <- slot(eval_dt, "y.values")[[1]][max_dt]
cut_dt <- slot(eval_dt, "x.values")[[1]][max_dt]
print(c(Accuracy = acc_dt, Cutoff = cut_dt))
##   Accuracy Cutoff.157 
##  0.7197452  2.0000000
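The cut-off of 2 simply reflects that type = "vector" returns the predicted class codes (1 = Negative, 2 = Positive) rather than probabilities. A hedged sketch using class probabilities instead, which yields a smoother ROC and an interpretable cut-off:

Prob_DT <- predict(D_Tree, type = "prob")[, "Positive"]             # P(Positive) for the training rows
pred_dt_p <- ROCR::prediction(Prob_DT, Training$STI_Status)
round(unlist(slot(performance(pred_dt_p, "auc"), "y.values")), 4)   # AUC based on probabilities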

D) Area Under Curve for Decision Tree Model

auc_dt <- performance(pred_dt, "auc")
auc_dt <- round(unlist(slot(auc_dt, "y.values")), 4)

plot(roc, 
     colorize = TRUE, 
     main = "ROC Curve - Decision Tree Classifier",
     ylab = "Sesnsitivity",
     xlab = "1-Specificity"); abline(a = 0, 
                                     b = 1, 
                                     col = "red", 
                                     lty = 3)


legend(0.6, 0.4, auc_dt, title = "AUC", cex = 0.8)  # Area Under Curve for the decision tree

printcp(D_Tree)
## 
## Classification tree:
## rpart(formula = STI_Status ~ Age + Occupation + Sex + Marital_Status + 
##     Condom_Use + Alcohol_Use, data = Training, method = "class", 
##     cp = 1e-06)
## 
## Variables actually used in tree construction:
## [1] Age            Alcohol_Use    Condom_Use     Marital_Status Occupation    
## 
## Root node error: 76/157 = 0.48408
## 
## n= 157 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.171053      0   1.00000 1.09211 0.082298
## 2 0.078947      1   0.82895 1.07895 0.082352
## 3 0.052632      2   0.75000 0.93421 0.082057
## 4 0.019737      3   0.69737 0.81579 0.080592
## 5 0.000001      9   0.57895 0.78947 0.080112
plotcp(D_Tree)

Hyperparameter Tuning

rpart.control(minsplit = 20, minbucket = round(20/3), maxdepth = 30)  # Default control values;
# minbucket defaults to round(minsplit/3). Pass a control object via rpart(..., control = ...)

- printcp(fit): display the cp table
- plotcp(fit): plot cross-validation results
- rsq.rpart(fit): plot approximate R-squared and relative error for different splits (2 plots; labels are only appropriate for the "anova" method)
- print(fit): print results
- summary(fit): detailed results, including surrogate splits
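A hedged sketch of pruning D_Tree back to the cp value with the lowest cross-validated error (xerror) in the table above (best_cp and D_Tree_Pruned are illustrative names):

best_cp <- D_Tree$cptable[which.min(D_Tree$cptable[, "xerror"]), "CP"]
D_Tree_Pruned <- prune(D_Tree, cp = best_cp)               # rpart::prune
rpart.plot(D_Tree_Pruned, main = "Pruned Decision Tree")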

Giii) Random Forest

A) Training RF Model

R_Forest <- randomForest(STI_Status ~ ., 
                         data = Training,
                         importance = TRUE,
                         na.action = na.omit, 
                         ntree = 500)
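Since importance = TRUE was requested, the fitted forest also stores variable-importance measures, which can be inspected with a quick sketch:

importance(R_Forest)                                             # Mean decrease in accuracy and Gini
varImpPlot(R_Forest, main = "Variable Importance - Random Forest")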

B) Validating RF Model

pred_rf <- predict(R_Forest, 
                   data = Validation,   # NB: predict.randomForest() expects `newdata =`; `data =`
                   type = "class")      # is ignored, so these are the out-of-bag predictions
                                        # for the Training set

table(pred_rf)
## pred_rf
## Negative Positive 
##       76       80
confusionMatrix(pred_rf, Training$STI_Status[-1])  
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Negative Positive
##   Negative       44       32
##   Positive       31       49
##                                           
##                Accuracy : 0.5962          
##                  95% CI : (0.5147, 0.6739)
##     No Information Rate : 0.5192          
##     P-Value [Acc > NIR] : 0.03232         
##                                           
##                   Kappa : 0.1915          
##                                           
##  Mcnemar's Test P-Value : 1.00000         
##                                           
##             Sensitivity : 0.5867          
##             Specificity : 0.6049          
##          Pos Pred Value : 0.5789          
##          Neg Pred Value : 0.6125          
##              Prevalence : 0.4808          
##          Detection Rate : 0.2821          
##    Detection Prevalence : 0.4872          
##       Balanced Accuracy : 0.5958          
##                                           
##        'Positive' Class : Negative        
## 

Hyperparameter Tuning (more on this later)
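As a preview, a hedged sketch of tuning mtry (the number of variables tried at each split) with randomForest::tuneRF on the complete cases; Train_CC is just an illustrative name:

Train_CC <- na.omit(Training)
Train_CC$Alcohol_Use <- factor(Train_CC$Alcohol_Use)   # tuneRF needs factors, not character columns
set.seed(1)
tuneRF(x = Train_CC[, setdiff(names(Train_CC), "STI_Status")],
       y = Train_CC$STI_Status,
       ntreeTry = 500, stepFactor = 1.5, improve = 0.01)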

Giv) Artificial Neural Network

A) Training Artificial Neural Network Model

Training1 <- sapply(Training, as.numeric) %>% as.data.frame()   # Factors become their numeric codes
Training1 <- Training1 %>% select(everything(), -Alcohol_Use)   # Alcohol_Use is character, so as.numeric() made it all NA; drop it

ANN <- neuralnet(STI_Status ~ Age + Occupation + 
                   Sex + Marital_Status + Condom_Use, 
                 data = Training1,
                 linear.output = FALSE, 
                 act.fct = "logistic", # act.fct can be a user defined function
                 hidden = c(3, 2))   # Two hidden layers, with 3 and 2 nodes respectively

plot(ANN)

# Setting linear.output = TRUE would skip the activation function on the output layer (used for regression)

source('https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r')

plot.nnet(ANN)
## Loading required package: scales
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
## Loading required package: reshape
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
## The following objects are masked from 'package:tidyr':
## 
##     expand, smiths

B) Validating Artificial Neural Network Model

Validation1 <- sapply(Validation, as.numeric) %>% as.data.frame()   # Factors become their numeric codes
Validation1 <- Validation1 %>% 
  select(everything(), -Alcohol_Use, -STI_Status)  # Drop the all-NA Alcohol_Use and the outcome

pred_ann <- compute(ANN, Validation1)
# pred_ann$net.result

# summary(pred_ann$net.result)

pred_ann_out <- as.data.frame(ifelse(pred_ann$net.result > 1.5195, 1, 0))
# NB: with linear.output = FALSE the network's outputs lie in (0, 1), so a cut-off above 1
# classifies every case as Negative (see the alternative sketch at the end of this section)
pred_ann_out <- pred_ann_out$V1
pred_ann_out <- factor(pred_ann_out, 
                       levels = 0:1, 
                       labels = c("Negative", "Positive"))

table(pred_ann_out, Validation$STI_Status)
##             
## pred_ann_out Negative Positive
##     Negative       37       33
##     Positive        0        0
confusionMatrix(pred_ann_out, Validation$STI_Status)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Negative Positive
##   Negative       37       33
##   Positive        0        0
##                                           
##                Accuracy : 0.5286          
##                  95% CI : (0.4055, 0.6491)
##     No Information Rate : 0.5286          
##     P-Value [Acc > NIR] : 0.5485          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 2.54e-08        
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.5286          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.5286          
##          Detection Rate : 0.5286          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Negative        
## 
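Given the cut-off issue noted above, a hedged alternative is to recode the outcome to 0/1 before training so the bounded logistic output can be thresholded at 0.5 (ANN2, pred2, and pred2_out are illustrative names; convergence is not guaranteed for every seed, and the compute() behaviour is assumed to match the call above):

Training1$STI_Status <- Training1$STI_Status - 1    # Recode the numeric outcome from 1/2 to 0/1
set.seed(1)
ANN2 <- neuralnet(STI_Status ~ Age + Occupation + Sex + Marital_Status + Condom_Use,
                  data = Training1,
                  linear.output = FALSE,
                  act.fct = "logistic",
                  hidden = c(3, 2))
pred2 <- compute(ANN2, Validation1)$net.result
pred2_out <- factor(ifelse(pred2 > 0.5, "Positive", "Negative"),
                    levels = c("Negative", "Positive"))
confusionMatrix(pred2_out, Validation$STI_Status)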

THE END