Load required

install.packages("ResourceSelection")

## Installing package into 'C:/Users/H P/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'ResourceSelection' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\H P\AppData\Local\Temp\RtmpaIHoTF\downloaded_packages

install.packages("caret")

## Installing package into 'C:/Users/H P/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'caret' successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package 'caret'

## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\H P\AppData\Local\R\win-library\4.4\00LOCK\caret\libs\x64\caret.dll to
## C:\Users\H P\AppData\Local\R\win-library\4.4\caret\libs\x64\caret.dll:
## Permission denied

## Warning: restored 'caret'

## 
## The downloaded binary packages are in
##  C:\Users\H P\AppData\Local\Temp\RtmpaIHoTF\downloaded_packages

install.packages("tidyverse")

## Installing package into 'C:/Users/H P/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\H P\AppData\Local\Temp\RtmpaIHoTF\downloaded_packages

library(ResourceSelection)

## Warning: package 'ResourceSelection' was built under R version 4.4.2

## ResourceSelection 0.3-6   2023-06-27

library(caret)

## Warning: package 'caret' was built under R version 4.4.2

## Loading required package: ggplot2

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 4.4.2

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.2

## Warning: package 'dplyr' was built under R version 4.4.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

set.seed(123)
n <- 1000
x1 <- rnorm(n)
x2 <- rnorm(n)
z <- 1 + 2*x1 + 3*x2
prob <- 1 / (1 + exp(-z))
outcome <- rbinom(n, 1, prob) 
data <- data.frame(outcome = outcome, x1 = x1, x2 = x2) |>
  mutate(outcome = if_else(outcome == 1, "yes", "no")) |>
  mutate(outcome = factor(outcome))
# Perform binary logistic regression
model <- glm(outcome ~ x1 + x2, data = data, family = binomial)
# Summary of the model
summary(model)

## 
## Call:
## glm(formula = outcome ~ x1 + x2, family = binomial, data = data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.1877     0.1291   9.199   <2e-16 ***
## x1            2.1243     0.1768  12.018   <2e-16 ***
## x2            3.4635     0.2395  14.460   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1329.1  on 999  degrees of freedom
## Residual deviance:  515.4  on 997  degrees of freedom
## AIC: 521.4
## 
## Number of Fisher Scoring iterations: 7

# Hosmer-Lemeshow goodness of fit test
hl_test <- hoslem.test(model$y, 
                       fitted(model))
print(hl_test)

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  model$y, fitted(model)
## X-squared = 3.4942, df = 8, p-value = 0.8996

# Calculate predictions
predictions <- predict(model, type = "response")
predicted_classes <- ifelse(predictions > 0.5, "yes", "no")

# Create confusion matrix
conf_matrix <- confusionMatrix(factor(predicted_classes), 
                               factor(data$outcome))
print(conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  320  46
##        yes  61 573
##                                           
##                Accuracy : 0.893           
##                  95% CI : (0.8722, 0.9115)
##     No Information Rate : 0.619           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7714          
##                                           
##  Mcnemar's Test P-Value : 0.1759          
##                                           
##             Sensitivity : 0.8399          
##             Specificity : 0.9257          
##          Pos Pred Value : 0.8743          
##          Neg Pred Value : 0.9038          
##              Prevalence : 0.3810          
##          Detection Rate : 0.3200          
##    Detection Prevalence : 0.3660          
##       Balanced Accuracy : 0.8828          
##                                           
##        'Positive' Class : no              
##

# Calculate additional metrics
accuracy <- conf_matrix$overall["Accuracy"]
sensitivity <- conf_matrix$byClass["Sensitivity"]
specificity <- conf_matrix$byClass["Specificity"]

cat("Accuracy:", accuracy, "\n")

## Accuracy: 0.893

cat("Sensitivity:", sensitivity, "\n")

## Sensitivity: 0.839895

cat("Specificity:", specificity, "\n")

## Specificity: 0.9256866

# Plot ROC curve
library(pROC)

## Warning: package 'pROC' was built under R version 4.4.2

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

roc_curve <- roc(data$outcome, predictions)

## Setting levels: control = no, case = yes

## Setting direction: controls < cases

plot(roc_curve, main = "ROC Curve")

auc <- auc(roc_curve)
cat("AUC:", auc, "\n")

## AUC: 0.9563601

Goodness of Fit

Dr Omar Bin Nazmi

2024-11-15

Load required