This class continues our previous session, Introduction to R for Amrec. We will be using R Markdown throughout this training.
Below are the knitr options used for this document.
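The setup chunk itself is not echoed in this output; a minimal sketch of typical options (an assumption, not necessarily the exact chunk used) would be:
# Hypothetical setup chunk: show code, suppress package-loading messages and warnings
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)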
Markdown supports LaTeX code. For example, the logistic regression function is defined as

\[
p = \frac{1}{1 + e^{-(\beta_0 + \beta_1x_1 + \beta_2x_2 + \dots + \beta_px_p)}} \tag{1}
\]

or equivalently, on the log-odds scale,

\[
\log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1x_1 + \beta_2x_2 + \dots + \beta_px_p \tag{2}
\]

We can see in (2) above that it is very similar to OLS (ordinary least squares, or simply linear regression), except that for OLS the outcome is modelled directly: \(y = \beta_0 + \beta_1x_1 + \beta_2x_2 + \dots + \beta_px_p\).
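To connect the formula to code, a small illustrative sketch (not part of the original material) plots the logistic function that maps the linear predictor onto a probability:
eta <- seq(-6, 6, by = 0.1) # linear predictor: b0 + b1*x1 + ... + bp*xp
p <- plogis(eta)            # the logistic function, 1 / (1 + exp(-eta))
plot(eta, p, type = "l",
     xlab = "Linear predictor", ylab = "Probability",
     main = "The logistic function")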
getwd() # Check the path to the current working directory
## [1] "C:/Users/User/Desktop/R Class for Amrec"
setwd("C:\\Users\\User\\Desktop\\R Class for Amrec")
dir() # List the contents of the current working directory
## [1] "R-Class-Continuation from Word.pdf" "R-Class-Continuation.docx"
## [3] "R-Class-Continuation.html" "R-Class-Continuation.pdf"
## [5] "R-Class-Continuation.Rmd" "R-Class-Continuation.tex.bak"
## [7] "R Class for Amrec.Rproj" "rsconnect"
## [9] "STIData.dta"
Set the chunk options message = FALSE and warning = FALSE to hide the messages produced by the package-loading process (echo = TRUE only controls whether the code itself is displayed).
library(gmodels) # Descriptive Stats (Cross-tabulations)
library(haven) # Loading data from several formats
library(magrittr) # Piping functions " %>% ", " %$% ", and " %<>% "
library(tidyverse) # Advanced data management and visualization
library(rstatix) # Advanced data analysis (Descriptive and Inferential Stats)
library(jmv) # Advanced data analysis (descriptives() function)
library(caret) # Confusion matrix
library(ROCR) # Model evaluation
library(rpart) # Engine for decision trees
library(rpart.plot) # Visualize decision trees
library(randomForest) # Engine for random forests
library(rsample) # For splitting data
library(neuralnet) # ANN
library(nnet) # Feed-forward neural networks (single hidden layer)
library(broom) # Extracting neat results from models
# tinytex::install_tinytex() # For knitting to PDF if you don't have LaTeX
STIData <- read_dta("STIData.dta")
View(STIData) # Viewing data
Data <- STIData %>% select(CaseStatus, A1Age,
A2Occupation, A5MaritalStatus,
N12UseCondom, AlcoholUse,
Weight, Height, Sex)
Data <- Data %>% rename(STI_Status = CaseStatus,
Age = A1Age,
Occupation = A2Occupation,
Marital_Status = A5MaritalStatus,
Condom_Use = N12UseCondom,
Alcohol_Use = AlcoholUse)
Occup <- c("1 unemployed" = "Unemployed",
"2 informal" = "Formal",
"3 formal" = "Formal",
"4 student" = "Unemployed")
M_Status <- c("1 single" = "Single",
"2 married" = "Married",
"3 co-habiting" = "Married",
"4 divorcee" = "Single",
"5 widowed" = "Single")
C_Use <- c("1 yes" = "Yes",
"2 no" = "No",
"2 No" = "No")
Data <- within(Data, {
STI_Status <- ifelse(STI_Status == 3, 1, STI_Status)
STI_Status <- ifelse(STI_Status == 2, 0, STI_Status)
STI_Status <- factor(STI_Status,
levels = 0:1,
labels = c("Negative", "Positive"))
Occupation <- as.factor(recode(Occupation, !!!Occup))
Marital_Status <- as.factor(recode(Marital_Status, !!!M_Status))
Condom_Use <- as.factor(recode(Condom_Use, !!!C_Use))
Alcohol_Use <- ifelse(Alcohol_Use == 1, "Yes", "No")
Sex <- as.factor(Sex)})
# Descriptives
Data %>% select (Age, Height, Weight) %>%
descriptives(hist = TRUE, # The function "descriptives()" is from package "jmv"
violin = TRUE,
dot = TRUE, dotType = "jitter",
qq = FALSE,
n = TRUE,
missing = TRUE,
mean = TRUE,
median = TRUE,
mode = TRUE,
sum = TRUE,
sd = TRUE,
variance = TRUE,
range = FALSE,
min = TRUE,
max = TRUE,
se = FALSE,
skew = TRUE, kurt = TRUE)
##
## DESCRIPTIVES
##
## Descriptives
## --------------------------------------------------------------
## Age Height Weight
## --------------------------------------------------------------
## N 227 226 227
## Missing 0 1 0
## Mean 28.03965 160.9336 53.75110
## Median 26.00000 160.0000 51.00000
## Mode 23.00000 157.0000 49.00000
## Sum 6365.000 36371.00 12201.50
## Standard deviation 7.184251 8.013323 11.68474
## Variance 51.61347 64.21335 136.5331
## Minimum 16.00000 138.0000 32.80000
## Maximum 63.00000 191.0000 100.0000
## Skewness 1.597835 0.3187386 1.064346
## Std. error skewness 0.1615177 0.1618700 0.1615177
## Kurtosis 4.374053 0.7074604 1.550776
## Std. error kurtosis 0.3216650 0.3223608 0.3216650
## --------------------------------------------------------------
# Inferentials
summary(MLRM <- Data %$% lm(Weight ~ Height + Age))
##
## Call:
## lm(formula = Weight ~ Height + Age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.076 -8.258 -1.455 6.377 42.889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -32.15102 15.18101 -2.118 0.0353 *
## Height 0.50561 0.09161 5.519 9.43e-08 ***
## Age 0.16154 0.10203 1.583 0.1148
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11 on 223 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.126, Adjusted R-squared: 0.1181
## F-statistic: 16.07 on 2 and 223 DF, p-value: 3.015e-07
aov(MLRM)
## Call:
## aov(formula = MLRM)
##
## Terms:
## Height Age Residuals
## Sum of Squares 3584.136 303.167 26967.617
## Deg. of Freedom 1 1 223
##
## Residual standard error: 10.99686
## Estimated effects may be unbalanced
## 1 observation deleted due to missingness
glance(MLRM) # This function is in the broom package, used to check R^2
## # A tibble: 1 x 11
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.126 0.118 11.0 16.1 3.01e-7 3 -861. 1730. 1744.
## # ... with 2 more variables: deviance <dbl>, df.residual <int>
Next, we split the data into training and validation sets. A commonly used split is 70:30 for training and validation, respectively. Here we use the function sample(); an alternative with the rsample package is sketched after the split.
set.seed(1)
Index <- sample(2, nrow(Data),
replace = TRUE,
prob = c(0.7, 0.3))
Training <- Data[Index == 1, ] # Training Set
Validation <- Data[Index == 2, ] # Validation Set
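Since the rsample package was loaded above for splitting, an equivalent 70:30 split could also be sketched as follows (Training_r and Validation_r are illustrative names and are not used later):
set.seed(1)
Split <- initial_split(Data, prop = 0.7) # rsample: reserve 70% for training
Training_r <- training(Split)            # training portion
Validation_r <- testing(Split)           # validation portion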
MLLR <- glm(STI_Status ~ Age + Occupation +
Sex + Marital_Status +
Condom_Use + Alcohol_Use,
data = Training,
family = binomial(link = "logit"))
summary(MLLR)
##
## Call:
## glm(formula = STI_Status ~ Age + Occupation + Sex + Marital_Status +
## Condom_Use + Alcohol_Use, family = binomial(link = "logit"),
## data = Training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9329 -1.0663 0.4801 1.0696 1.9118
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.69821 0.87590 0.797 0.425372
## Age -0.05310 0.02805 -1.893 0.058389 .
## OccupationUnemployed 0.78196 0.42677 1.832 0.066910 .
## SexMale 0.62774 0.43343 1.448 0.147526
## Marital_StatusSingle 0.11697 0.43887 0.267 0.789845
## Condom_UseYes -1.75679 0.60535 -2.902 0.003707 **
## Alcohol_UseYes 1.49454 0.43791 3.413 0.000643 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 216.16 on 155 degrees of freedom
## Residual deviance: 189.66 on 149 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 203.66
##
## Number of Fisher Scoring iterations: 4
glance(MLLR)
## # A tibble: 1 x 7
## null.deviance df.null logLik AIC BIC deviance df.residual
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 216. 155 -94.8 204. 225. 190. 149
Pred_Prob <- predict(MLLR,
type = "response", # Predict probabilities of the outcome
data = Validation) # Note: predict.glm() has no `data` argument, so this is silently ignored and
# the probabilities returned are fitted values for the Training data.
# Use `newdata = Validation` to score the validation set instead.
summary(Pred_Prob)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1483 0.3821 0.5129 0.5128 0.6216 0.9240
hist(Pred_Prob,
col = "yellow",
main = "Histogram of Probablities of Status")
Pred_Outcome <- ifelse(Pred_Prob <= 0.5376091, 0, 1) # Dichotomize at a chosen probability cutoff (the ROC analysis below finds an accuracy-optimal one)
Pred_Outcome <- factor(Pred_Outcome,
levels = 0:1,
labels = c("Negative", "Positive"))
table(Pred_Outcome)
## Pred_Outcome
## Negative Positive
## 88 68
confusionMatrix(Pred_Outcome,
Training$STI_Status[-1]) # glm() dropped one row due to missingness, so drop one element
# to equalize the lengths (aligning on the model's complete cases would be more precise)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Negative Positive
## Negative 51 37
## Positive 24 44
##
## Accuracy : 0.609
## 95% CI : (0.5277, 0.686)
## No Information Rate : 0.5192
## P-Value [Acc > NIR] : 0.01495
##
## Kappa : 0.2218
##
## Mcnemar's Test P-Value : 0.12443
##
## Sensitivity : 0.6800
## Specificity : 0.5432
## Pos Pred Value : 0.5795
## Neg Pred Value : 0.6471
## Prevalence : 0.4808
## Detection Rate : 0.3269
## Detection Prevalence : 0.5641
## Balanced Accuracy : 0.6116
##
## 'Positive' Class : Negative
##
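Because Pred_Prob was computed on the training data (see the note at the predict() call), here is a minimal sketch of scoring the held-out set instead, using newdata = and a plain 0.5 cutoff; Val, Val_Prob, and Val_Pred are illustrative names not used elsewhere:
Val <- na.omit(Validation) # drop rows with any missing values for a clean comparison
Val_Prob <- predict(MLLR, newdata = Val, type = "response") # out-of-sample probabilities
Val_Pred <- factor(ifelse(Val_Prob > 0.5, "Positive", "Negative"),
                   levels = c("Negative", "Positive"))
confusionMatrix(Val_Pred, Val$STI_Status) # validation-set performance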
Training$STI_Status
## [1] Negative Positive Negative Negative Positive Negative Positive Negative
## [9] Positive Negative Positive Positive Negative Positive Negative Positive
## [17] Negative Positive Negative Positive Positive Negative Positive Negative
## [25] Positive Positive Positive Positive Positive Positive Negative Negative
## [33] Positive Positive Positive Positive Negative Positive Negative Positive
## [41] Negative Positive Negative Negative Positive Negative Positive Negative
## [49] Positive Positive Positive Negative Positive Positive Negative Positive
## [57] Positive Negative Negative Negative Positive Negative Positive Negative
## [65] Positive Positive Negative Negative Positive Negative Positive Positive
## [73] Negative Positive Negative Negative Positive Negative Positive Negative
## [81] Negative Positive Negative Negative Positive Negative Negative Positive
## [89] Negative Positive Negative Positive Negative Positive Negative Negative
## [97] Positive Negative Negative Positive Negative Positive Negative Negative
## [105] Positive Positive Positive Negative Positive Negative Positive Negative
## [113] Positive Negative Positive Negative Positive Positive Negative Positive
## [121] Negative Negative Positive Negative Positive Positive Positive Positive
## [129] Negative Negative Negative Negative Negative Positive Positive Negative
## [137] Positive Positive Positive Negative Positive Negative Positive Negative
## [145] Positive Negative Positive Negative Negative Positive Negative Positive
## [153] Negative Positive Negative Negative Positive
## Levels: Negative Positive
pred <- ROCR::prediction(Pred_Prob,
Training$STI_Status[-1])
eval <- performance(pred, "acc")
plot(eval)
max <- which.max(slot(eval, "y.values")[[1]])
# str(eval); unlist(eval@y.values); max(unlist(eval@y.values))
acc <- slot(eval, "y.values")[[1]][max]
cut <- slot(eval, "x.values")[[1]][max]
print(c(Accuracy = acc, Cutoff = cut))
## Accuracy Cutoff.63
## 0.6153846 0.5288155
plot(eval); abline(v = cut,
h = acc,
col = "red",
lty = 3)
roc <- performance(pred, "tpr", "fpr")
plot(roc,
colorize = TRUE,
main = "ROC Curve - Logistic Classifier",
ylab = "Sesnsitivity",
xlab = "1-Specificity"); abline(a = 0,
b = 1,
col = "red",
lty = 3)
auc <- performance(pred, "auc")
auc <- round(unlist(slot(auc, "y.values")), 4)
legend(0.6, 0.4, auc, title = "AUC", cex = 0.8) # Area Under Curve
D_Tree = rpart(STI_Status ~ Age + Occupation + Sex +
Marital_Status + Condom_Use + Alcohol_Use,
data = Training,
method = "class",
cp = 0.000001)
plot(D_Tree)
text(D_Tree, use.n = TRUE)
rpart.plot(D_Tree, col = "red")
Pred_DT <- predict(D_Tree,
type = "class",
data = Validation) # Note: predict.rpart() takes `newdata =`, not `data =`; as written these
# are the tree's predicted classes for the Training data.
table(Pred_DT)
## Pred_DT
## Negative Positive
## 86 71
confusionMatrix(Pred_DT, Training$STI_Status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Negative Positive
## Negative 59 27
## Positive 17 54
##
## Accuracy : 0.7197
## 95% CI : (0.6426, 0.7884)
## No Information Rate : 0.5159
## P-Value [Acc > NIR] : 1.509e-07
##
## Kappa : 0.4412
##
## Mcnemar's Test P-Value : 0.1748
##
## Sensitivity : 0.7763
## Specificity : 0.6667
## Pos Pred Value : 0.6860
## Neg Pred Value : 0.7606
## Prevalence : 0.4841
## Detection Rate : 0.3758
## Detection Prevalence : 0.5478
## Balanced Accuracy : 0.7215
##
## 'Positive' Class : Negative
##
Pred_DT <- predict(D_Tree, type = "vector", data = Validation)
pred_dt <- ROCR::prediction(Pred_DT, Training$STI_Status)
eval_dt <- performance(pred_dt, "acc")
# plot(eval_dt)
# ROC Curve
roc <- performance(pred_dt, "tpr", "fpr")
plot(roc,
colorize = TRUE,
main = "ROC Curve - Decision Tree Classifier",
ylab = "Sesnsitivity",
xlab = "1-Specificity"); abline(a = 0,
b = 1,
col = "red",
lty = 3)
max_dt <- which.max(slot(eval_dt, "y.values")[[1]])
# str(eval); unlist(eval@y.values); max(unlist(eval@y.values))
acc_dt <- slot(eval_dt, "y.values")[[1]][max_dt]
cut_dt <- slot(eval_dt, "x.values")[[1]][max_dt]
print(c(Accuracy = acc_dt, Cutoff = cut_dt))
## Accuracy Cutoff.157
## 0.7197452 2.0000000
auc_dt <- performance(pred_dt, "auc")
auc_dt <- round(unlist(slot(auc_dt, "y.values")), 4)
plot(roc,
colorize = TRUE,
main = "ROC Curve - Decision Tree Classifier",
ylab = "Sesnsitivity",
xlab = "1-Specificity"); abline(a = 0,
b = 1,
col = "red",
lty = 3)
legend(0.6, 0.4, auc_dt, title = "AUC", cex = 0.8) # Use the decision-tree AUC computed above
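Because type = "vector" returns class codes (1/2), the ROC curve above has a single bend. A sketch using the tree's class probabilities instead (still evaluated on the training data, as above) would be:
DT_Prob <- predict(D_Tree, type = "prob")[, "Positive"] # fitted P(Positive) on the training data
pred_dt_prob <- ROCR::prediction(DT_Prob, Training$STI_Status)
roc_dt_prob <- performance(pred_dt_prob, "tpr", "fpr")
plot(roc_dt_prob, colorize = TRUE,
     main = "ROC Curve - Decision Tree (class probabilities)",
     ylab = "Sensitivity", xlab = "1-Specificity")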
printcp(D_Tree)
##
## Classification tree:
## rpart(formula = STI_Status ~ Age + Occupation + Sex + Marital_Status +
## Condom_Use + Alcohol_Use, data = Training, method = "class",
## cp = 1e-06)
##
## Variables actually used in tree construction:
## [1] Age Alcohol_Use Condom_Use Marital_Status Occupation
##
## Root node error: 76/157 = 0.48408
##
## n= 157
##
## CP nsplit rel error xerror xstd
## 1 0.171053 0 1.00000 1.09211 0.082298
## 2 0.078947 1 0.82895 1.07895 0.082352
## 3 0.052632 2 0.75000 0.93421 0.082057
## 4 0.019737 3 0.69737 0.81579 0.080592
## 5 0.000001 9 0.57895 0.78947 0.080112
plotcp(D_Tree)
Hyperparameter tuning
The tree's growth is governed by rpart.control(); the values below are the defaults:

rpart.control(minsplit = 20, minbucket = round(20/3), maxdepth = 30) # minbucket defaults to round(minsplit/3)

Useful functions for examining a fitted tree (fit):

printcp(fit): display the cp (complexity parameter) table
plotcp(fit): plot the cross-validation results
rsq.rpart(fit): plot the approximate R-squared and relative error for different splits (2 plots; labels are only appropriate for the "anova" method)
print(fit): print the results
summary(fit): detailed results, including surrogate splits
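As an example of using the cp table for tuning, here is a minimal sketch (illustrative only, not a step from the original class) that prunes D_Tree at the cp value with the lowest cross-validated error:
# Pick the cp with the smallest cross-validated error (xerror) and prune back to it
best_cp <- D_Tree$cptable[which.min(D_Tree$cptable[, "xerror"]), "CP"]
D_Tree_Pruned <- prune(D_Tree, cp = best_cp)
rpart.plot(D_Tree_Pruned, col = "red")
printcp(D_Tree_Pruned)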
R_Forest <- randomForest(STI_Status ~ .,
data = Training,
importance = TRUE,
na.action = na.omit,
ntree = 500)
pred_rf <- predict(R_Forest,
data = Validation,
type = "class") # Note: without `newdata = Validation`, predict.randomForest() returns
# the out-of-bag predictions for the Training data, which is what is evaluated below.
table(pred_rf)
## pred_rf
## Negative Positive
## 76 80
confusionMatrix(pred_rf, Training$STI_Status[-1])
## Confusion Matrix and Statistics
##
## Reference
## Prediction Negative Positive
## Negative 44 32
## Positive 31 49
##
## Accuracy : 0.5962
## 95% CI : (0.5147, 0.6739)
## No Information Rate : 0.5192
## P-Value [Acc > NIR] : 0.03232
##
## Kappa : 0.1915
##
## Mcnemar's Test P-Value : 1.00000
##
## Sensitivity : 0.5867
## Specificity : 0.6049
## Pos Pred Value : 0.5789
## Neg Pred Value : 0.6125
## Prevalence : 0.4808
## Detection Rate : 0.2821
## Detection Prevalence : 0.4872
## Balanced Accuracy : 0.5958
##
## 'Positive' Class : Negative
##
Hyperparameter tuning (to be covered in a later session).
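As a preview, a minimal sketch (illustrative only, assuming complete cases; Train_cc is a hypothetical name) uses randomForest::tuneRF() to search over mtry, the number of predictors tried at each split:
Train_cc <- na.omit(Training)                        # tuneRF() needs complete cases
Train_cc$Alcohol_Use <- factor(Train_cc$Alcohol_Use) # character predictors must be factors here
set.seed(1)
tuneRF(x = Train_cc[, setdiff(names(Train_cc), "STI_Status")],
       y = Train_cc$STI_Status,
       ntreeTry = 500,   # trees grown for each mtry tried
       stepFactor = 1.5, # factor by which mtry is inflated/deflated at each step
       improve = 0.01,   # minimum OOB-error improvement required to keep searching
       trace = TRUE, plot = TRUE)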
Training1 <- sapply(Training, as.numeric) %>% as.data.frame() # Coerce factors to their numeric codes for neuralnet()
Training1 <- Training1 %>% select(-Alcohol_Use) # Alcohol_Use is character, so as.numeric() turned it into NAs
ANN <- neuralnet(STI_Status ~ Age + Occupation +
Sex + Marital_Status + Condom_Use,
data = Training1,
linear.output = FALSE,
act.fct = "logistic", # act.fct can be a user defined function
hidden = c(3, 2)) # Two hidden layers, with 3 and 2 nodes respectively
plot(ANN)
# Setting linear.output = TRUE would skip the activation function on the output layer
source('https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r')
plot.nnet(ANN)
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Loading required package: reshape
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
Validation1 <- sapply(Validation, as.numeric) %>% as.data.frame()
Validation1 <- Validation1 %>%
select(-Alcohol_Use, -STI_Status) # Drop the all-NA Alcohol_Use column and the response
pred_ann <- compute(ANN, Validation1)
# pred_ann$net.result
# summary(pred_ann$net.result)
pred_ann_out <- as.data.frame(ifelse(pred_ann$net.result > 1.5195, 1, 0))
# Note: STI_Status was coerced to 1/2, but with linear.output = FALSE the logistic output lies in
# (0, 1), so a cutoff above 1 labels every observation 0 ("Negative"), as the table below shows.
pred_ann_out <- pred_ann_out$V1
pred_ann_out <- factor(pred_ann_out,
levels = 0:1,
labels = c("Negative", "Positive"))
table(pred_ann_out, Validation$STI_Status)
##
## pred_ann_out Negative Positive
## Negative 37 33
## Positive 0 0
confusionMatrix(pred_ann_out, Validation$STI_Status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Negative Positive
## Negative 37 33
## Positive 0 0
##
## Accuracy : 0.5286
## 95% CI : (0.4055, 0.6491)
## No Information Rate : 0.5286
## P-Value [Acc > NIR] : 0.5485
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 2.54e-08
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.5286
## Neg Pred Value : NaN
## Prevalence : 0.5286
## Detection Rate : 0.5286
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Negative
##
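Following the note at the cutoff above, one possible refinement (a sketch under the assumption that recoding the response to 0/1 and thresholding at 0.5 is acceptable; Training2, ANN2, and pred2 are illustrative names) is:
# Recode the response to 0/1 so the logistic output can be thresholded at 0.5
Training2 <- Training1
Training2$STI_Status <- Training2$STI_Status - 1 # 1/2 -> 0/1
set.seed(1)
ANN2 <- neuralnet(STI_Status ~ Age + Occupation + Sex + Marital_Status + Condom_Use,
                  data = Training2,
                  linear.output = FALSE, act.fct = "logistic",
                  hidden = c(3, 2))
pred2 <- compute(ANN2, Validation1) # score the validation covariates, as above
pred2_out <- factor(ifelse(pred2$net.result[, 1] > 0.5, 1, 0),
                    levels = 0:1, labels = c("Negative", "Positive"))
confusionMatrix(pred2_out, Validation$STI_Status)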