GLM AND LOGISTIC REGRESSION
ALY6015: INTERMEDIATE ANALYTICS
NORTHEASTERN UNIVERSITY
Student: Jayakumar Moris Udayakumar
Instructor: Prof. Zhi (Richard) He
Date: March 17, 2024

Loading Dataset
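
The College dataset ships with the ISLR package. Below is a minimal setup sketch, assuming the packages used throughout this report are already installed:

#loading required libraries
library(ISLR)          #College dataset
library(summarytools)  #descriptive statistics
library(knitr)         #kable tables
library(kableExtra)    #table styling
library(caret)         #data partitioning and confusion matrices
library(ggplot2)       #plotting
library(gridExtra)     #arranging plots side by side
library(pROC)          #ROC curves and AUC
library(e1071)         #support vector machines

#attaching the data: 777 colleges, 18 variables
data(College)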

Descriptive Statistics

#descriptive statistics
desc_sum <- summarytools::descr(College)

#rendering the descriptive statistics table
kable(round(desc_sum, digits = 2), table.attr = "style='width:40%'",align="c", format="html", digits = 2) %>%
  kable_styling(bootstrap_options="bordered", latex_options = "striped", font_size = NULL)
Accept Apps Books Enroll Expend F.Undergrad Grad.Rate Outstate P.Undergrad perc.alumni Personal PhD Room.Board S.F.Ratio Terminal Top10perc Top25perc
Mean 2018.80 3001.64 549.38 779.97 9660.17 3699.91 65.46 10440.67 855.30 22.74 1340.64 72.66 4357.53 14.09 79.70 27.56 55.80
Std.Dev 2451.11 3870.20 165.11 929.18 5221.77 4850.42 17.18 4023.02 1522.43 12.39 677.07 16.33 1096.70 3.96 14.72 17.64 19.80
Min 72.00 81.00 96.00 35.00 3186.00 139.00 10.00 2340.00 1.00 0.00 250.00 8.00 1780.00 2.50 24.00 1.00 9.00
Q1 604.00 776.00 470.00 242.00 6751.00 992.00 53.00 7320.00 95.00 13.00 850.00 62.00 3597.00 11.50 71.00 15.00 41.00
Median 1110.00 1558.00 500.00 434.00 8377.00 1707.00 65.00 9990.00 353.00 21.00 1200.00 75.00 4200.00 13.60 82.00 23.00 54.00
Q3 2424.00 3624.00 600.00 902.00 10830.00 4005.00 78.00 12925.00 967.00 31.00 1700.00 85.00 5050.00 16.50 92.00 35.00 69.00
Max 26330.00 48094.00 2340.00 6392.00 56233.00 31643.00 118.00 21700.00 21836.00 64.00 6800.00 103.00 8124.00 39.80 100.00 96.00 100.00
MAD 1008.17 1463.33 148.26 354.34 2730.95 1441.09 17.79 4121.63 449.23 13.34 593.04 17.79 1005.20 3.41 14.83 13.34 20.76
IQR 1820.00 2848.00 130.00 660.00 4079.00 3013.00 25.00 5605.00 872.00 18.00 850.00 23.00 1453.00 5.00 21.00 20.00 28.00
CV 1.21 1.29 0.30 1.19 0.54 1.31 0.26 0.39 1.78 0.54 0.51 0.22 0.25 0.28 0.18 0.64 0.35
Skewness 3.40 3.71 3.47 2.68 3.45 2.60 -0.11 0.51 5.67 0.60 1.74 -0.77 0.48 0.66 -0.81 1.41 0.26
SE.Skewness 0.09 0.09 0.09 0.09 0.09 0.09 0.09 0.09 0.09 0.09 0.09 0.09 0.09 0.09 0.09 0.09 0.09
Kurtosis 18.75 26.52 28.06 8.74 18.59 7.61 -0.22 -0.43 54.52 -0.11 7.04 0.54 -0.20 2.52 0.22 2.17 -0.57
N.Valid 777.00 777.00 777.00 777.00 777.00 777.00 777.00 777.00 777.00 777.00 777.00 777.00 777.00 777.00 777.00 777.00 777.00
Pct.Valid 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00

Exploratory Data Analysis

#EDA
#Histogram
hist(College$Outstate, xlab = "Outstate", ylab = "Frequency", main = "Histogram - Outstate", col = "skyblue")

#mean and median
mean_outstate = round(mean(College$Outstate), 2)
median_outstate = round(median(College$Outstate), 2)

#Adding mean and median lines
abline(v = mean_outstate, col = "blue", lty = 2)
abline(v = median_outstate, col = "red", lty = 2)

#Adding labels for the mean and median lines (values already rounded above)
text(x = mean_outstate, y = 25, labels = mean_outstate, srt = 90, cex = 0.8, adj = c(0.001, 1))
text(x = median_outstate, y = 25, labels = median_outstate, srt = 90, cex = 0.8, adj = c(0.001, 0))

#boxplots
#Comparing Accept and Enroll across private and public colleges

private_col_acc <- ggplot(data = College, aes(x = Private, y = Accept, fill = Private)) +
  geom_boxplot() +
  guides(fill = "none")

private_col_enrol <- ggplot(data = College, aes(x = Private, y = Enroll, fill = Private)) +
  geom_boxplot() +
  guides(fill = "none")

grid.arrange(private_col_acc, private_col_enrol, nrow = 1)


Partitioning the Dataset - Train and Test

#Partitioning the dataset into Train and Test
set.seed(20353)
Train_car_index <- createDataPartition(College$Private, p = 0.75, list = FALSE, times = 1)
sample_train_car = College[Train_car_index,]
sample_test_car = College[-Train_car_index,]
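
createDataPartition samples within each level of Private, so the class balance should carry over into both splits. A quick check, as a sketch:

#verifying that the private/public proportions survive the split
prop.table(table(College$Private))
prop.table(table(sample_train_car$Private))
prop.table(table(sample_test_car$Private))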

head(sample_train_car)
##                              Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University     Yes 1660   1232    721        23        52
## Adelphi University               Yes 2186   1924    512        16        29
## Adrian College                   Yes 1428   1097    336        22        50
## Agnes Scott College              Yes  417    349    137        60        89
## Alaska Pacific University        Yes  193    146     55        16        44
## Albertson College                Yes  587    479    158        38        62
##                              F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University        2885         537     7440       3300   450
## Adelphi University                  2683        1227    12280       6450   750
## Adrian College                      1036          99    11250       3750   400
## Agnes Scott College                  510          63    12960       5450   450
## Alaska Pacific University            249         869     7560       4120   800
## Albertson College                    678          41    13500       3335   500
##                              Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University     2200  70       78      18.1          12   7041
## Adelphi University               1500  29       30      12.2          16  10527
## Adrian College                   1165  53       66      12.9          30   8735
## Agnes Scott College               875  92       97       7.7          37  19016
## Alaska Pacific University        1500  76       72      11.9           2  10922
## Albertson College                 675  67       73       9.4          11   9727
##                              Grad.Rate
## Abilene Christian University        60
## Adelphi University                  56
## Adrian College                      54
## Agnes Scott College                 59
## Alaska Pacific University           15
## Albertson College                   55

Generalized Linear Model - Training - All Features as Predictors

#glm model 1
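#Private is a factor with levels No/Yes, so the model estimates the log-odds of Yes (private)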
glm_col_model1 = glm(Private ~., data = sample_train_car, family = binomial(link = "logit"))
summary(glm_col_model1)
## 
## Call:
## glm(formula = Private ~ ., family = binomial(link = "logit"), 
##     data = sample_train_car)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6709  -0.0438   0.0446   0.1817   3.4481  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.591e+00  2.237e+00  -0.711   0.4770    
## Apps        -6.642e-04  2.654e-04  -2.502   0.0123 *  
## Accept      -6.048e-05  5.117e-04  -0.118   0.9059    
## Enroll       1.811e-03  9.060e-04   1.999   0.0456 *  
## Top10perc   -5.908e-04  3.168e-02  -0.019   0.9851    
## Top25perc    1.256e-02  2.064e-02   0.609   0.5428    
## F.Undergrad -2.711e-04  1.399e-04  -1.937   0.0527 .  
## P.Undergrad -3.462e-04  3.084e-04  -1.123   0.2616    
## Outstate     6.327e-04  1.228e-04   5.154 2.56e-07 ***
## Room.Board   5.248e-04  2.983e-04   1.759   0.0785 .  
## Books        2.401e-03  1.608e-03   1.494   0.1353    
## Personal    -4.186e-04  3.236e-04  -1.293   0.1958    
## PhD         -5.183e-02  2.759e-02  -1.878   0.0603 .  
## Terminal    -4.226e-02  2.771e-02  -1.525   0.1272    
## S.F.Ratio   -6.777e-02  7.651e-02  -0.886   0.3757    
## perc.alumni  5.463e-02  2.447e-02   2.232   0.0256 *  
## Expend       2.341e-04  1.496e-04   1.564   0.1177    
## Grad.Rate    1.497e-02  1.313e-02   1.140   0.2542    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 683.22  on 582  degrees of freedom
## Residual deviance: 186.31  on 565  degrees of freedom
## AIC: 222.31
## 
## Number of Fisher Scoring iterations: 8
#to show regression coef in log-odds
coef(glm_col_model1)
##   (Intercept)          Apps        Accept        Enroll     Top10perc 
## -1.591177e+00 -6.642083e-04 -6.048014e-05  1.811327e-03 -5.908029e-04 
##     Top25perc   F.Undergrad   P.Undergrad      Outstate    Room.Board 
##  1.256172e-02 -2.710636e-04 -3.461744e-04  6.327025e-04  5.248531e-04 
##         Books      Personal           PhD      Terminal     S.F.Ratio 
##  2.401092e-03 -4.185936e-04 -5.183406e-02 -4.226127e-02 -6.777422e-02 
##   perc.alumni        Expend     Grad.Rate 
##  5.462570e-02  2.340785e-04  1.497043e-02
#to show regression coef in odds
exp(coef(glm_col_model1))
## (Intercept)        Apps      Accept      Enroll   Top10perc   Top25perc 
##   0.2036857   0.9993360   0.9999395   1.0018130   0.9994094   1.0126410 
## F.Undergrad P.Undergrad    Outstate  Room.Board       Books    Personal 
##   0.9997290   0.9996539   1.0006329   1.0005250   1.0024040   0.9995815 
##         PhD    Terminal   S.F.Ratio perc.alumni      Expend   Grad.Rate 
##   0.9494864   0.9586193   0.9344714   1.0561452   1.0002341   1.0150830

Generalized Linear Model - Training - Three Predictors

glm_col_model2 = glm(Private ~ Accept + Enroll + Grad.Rate, data = sample_train_car, family = binomial(link = "logit"))
summary(glm_col_model2)
## 
## Call:
## glm(formula = Private ~ Accept + Enroll + Grad.Rate, family = binomial(link = "logit"), 
##     data = sample_train_car)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4656  -0.0886   0.3369   0.5270   4.8536  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.9977861  0.4689462  -2.128   0.0334 *  
## Accept       0.0003047  0.0001483   2.055   0.0399 *  
## Enroll      -0.0031306  0.0004847  -6.459 1.06e-10 ***
## Grad.Rate    0.0612451  0.0080246   7.632 2.31e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 683.22  on 582  degrees of freedom
## Residual deviance: 403.90  on 579  degrees of freedom
## AIC: 411.9
## 
## Number of Fisher Scoring iterations: 6
#to show regression coef in log-odds
coef(glm_col_model2)
##   (Intercept)        Accept        Enroll     Grad.Rate 
## -0.9977860925  0.0003046799 -0.0031305910  0.0612451012
#to show regression coef in odds
exp(coef(glm_col_model2))
## (Intercept)      Accept      Enroll   Grad.Rate 
##   0.3686948   1.0003047   0.9968743   1.0631595
#For every one-unit increase in Enroll, the odds of a college being private change by a factor of 0.997, i.e., decrease by about 0.3%, holding Accept and Grad.Rate constant
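
Because Enroll is measured in single students, the per-unit odds ratio looks deceptively small; rescaling to a larger increment makes the effect easier to read. A short sketch using the fitted coefficient above:

#odds ratio for a 100-student increase in Enroll
exp(coef(glm_col_model2)["Enroll"] * 100)
#about 0.731, i.e., the odds of being private fall by roughly 27% per 100 additional enrollments

Note also that the full model's AIC (222.31) is far lower than this three-predictor model's (411.9), so the reduced model trades considerable fit for simplicity.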

Confusion Matrix - Training dataset

#Training set predictions
prob.train = predict(glm_col_model2, newdata = sample_train_car, type = "response")
predicted.classes.min.train <- as.factor(ifelse(prob.train >= 0.5, "Yes", "No"))

#Confusion Matrix - training dataset
#caret::confusionMatrix expects the predictions first, then the reference (actual) labels
conf_matrix_tr = confusionMatrix(predicted.classes.min.train, sample_train_car$Private, positive = 'Yes')

# Print the confusion matrix
print(conf_matrix_tr)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No   97  62
##        Yes  19 405
##                                           
##                Accuracy : 0.8611          
##                  95% CI : (0.8303, 0.8881)
##     No Information Rate : 0.801           
##     P-Value [Acc > NIR] : 9.817e-05       
##                                           
##                   Kappa : 0.6174          
##                                           
##  Mcnemar's Test P-Value : 3.061e-06       
##                                           
##             Sensitivity : 0.8672          
##             Specificity : 0.8362          
##          Pos Pred Value : 0.9552          
##          Neg Pred Value : 0.6101          
##              Prevalence : 0.8010          
##          Detection Rate : 0.6947          
##    Detection Prevalence : 0.7273          
##       Balanced Accuracy : 0.8517          
##                                           
##        'Positive' Class : Yes             
## 
#Metrics Table - training dataset
# Calculate metrics
accuracy <- conf_matrix_tr$overall['Accuracy']
precision <- conf_matrix_tr$byClass['Pos Pred Value']
recall <- conf_matrix_tr$byClass['Sensitivity']      
specificity <- conf_matrix_tr$byClass['Specificity']
f1_score <- 2 * (precision * recall) / (precision + recall) 
beta <- 2  #beta = 2 weights recall twice as much as precision
f2_score <- ((1 + beta^2) * precision * recall) / ((beta^2 * precision) + recall)

# Create a data frame to store the metrics
metrics_df <- data.frame(
  Metric = c("Accuracy", "Precision", "Recall (Sensitivity)", "Specificity", "F1 Score", "F2 Score"),
  Value = c(accuracy, precision, recall, specificity, f1_score, f2_score)
)

# Print the formatted table
kable(metrics_df, format = "html", digits = 3) %>%
  kable_styling(full_width = FALSE, bootstrap_options = "basic", table.envir = "table", latex_options = "basic", position = "center")
Metric Value
Accuracy 0.861
Precision 0.955
Recall (Sensitivity) 0.867
Specificity 0.836
F1 Score 0.909
F2 Score 0.884

Confusion Matrix - Test dataset

#Confusion Matrix - test dataset
#Test set predictions
prob.test = predict(glm_col_model2, newdata = sample_test_car, type = "response")
predicted.classes.min.test <- as.factor(ifelse(prob.test >= 0.5, "Yes", "No"))

#Confusion Matrix - test dataset
conf_matrix_te = confusionMatrix(predicted.classes.min.test, sample_test_car$Private, positive = 'Yes')

# Print the confusion matrix
print(conf_matrix_te)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No   33  20
##        Yes   4 137
##                                           
##                Accuracy : 0.8763          
##                  95% CI : (0.8215, 0.9191)
##     No Information Rate : 0.8093          
##     P-Value [Acc > NIR] : 0.008633        
##                                           
##                   Kappa : 0.6561          
##                                           
##  Mcnemar's Test P-Value : 0.002200        
##                                           
##             Sensitivity : 0.8726          
##             Specificity : 0.8919          
##          Pos Pred Value : 0.9716          
##          Neg Pred Value : 0.6226          
##              Prevalence : 0.8093          
##          Detection Rate : 0.7062          
##    Detection Prevalence : 0.7268          
##       Balanced Accuracy : 0.8823          
##                                           
##        'Positive' Class : Yes             
## 
#Metrics Table - test dataset
# Calculate metrics
accuracy_test <- conf_matrix_te$overall['Accuracy']
precision_test <- conf_matrix_te$byClass['Pos Pred Value']
recall_test <- conf_matrix_te$byClass['Sensitivity']      
specificity_test <- conf_matrix_te$byClass['Specificity']
f1_score_test <- 2 * (precision_test * recall_test) / (precision_test + recall_test) 
beta <- 2  #beta = 2 weights recall twice as much as precision
f2_score_test <- ((1 + beta^2) * precision_test * recall_test) / ((beta^2 * precision_test) + recall_test)

# Create a data frame to store the metrics
metrics_df_test <- data.frame(
  Metric = c("Accuracy", "Precision", "Recall (Sensitivity)", "Specificity", "F1 Score", "F2 Score"),
  Value = c(accuracy_test, precision_test, recall_test, specificity_test, f1_score_test, f2_score_test)
)

# Print the formatted table
kable(metrics_df_test, format = "html", digits = 3) %>%
  kable_styling(full_width = FALSE, bootstrap_options = "basic", table.envir = "table", latex_options = "basic", position = "center")
Metric Value
Accuracy 0.876
Precision 0.972
Recall (Sensitivity) 0.873
Specificity 0.892
F1 Score 0.919
F2 Score 0.891

GLM (Test) - Receiver Operating Characteristic (ROC) Curve

# Calculate ROC curve
# an ROC curve needs a continuous score, so use the predicted probabilities rather than the 0/1 class labels
roc_curve <- roc(sample_test_car$Private, prob.test)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
# Create ROC plot with AUC
ggroc(roc_curve) +
  ggtitle("ROC Curve - Test Dataset") +
  geom_abline(slope = 1, intercept = 1, linetype = "dashed", color = "red") +
  theme_minimal() +
  annotate("text", x = 0.125, y = 0.125, label = paste("AUC =", round(auc(roc_curve), 2)), size = 3)

Support Vector Machine (SVM) - Test

# Train the SVM model
svm_model_train <- svm(Private ~ Accept + Enroll + Grad.Rate, data = sample_train_car, kernel = "radial")

# Summary of the SVM model
summary(svm_model_train)
## 
## Call:
## svm(formula = Private ~ Accept + Enroll + Grad.Rate, data = sample_train_car, 
##     kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  186
## 
##  ( 92 94 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  No Yes
# Predicting on the test dataset
predict_svm_test <- predict(svm_model_train, newdata = sample_test_car)

# Create confusion matrix
conf_matrix_svm_test <- table(predict_svm_test, sample_test_car$Private)

# Print confusion matrix
print(conf_matrix_svm_test)
##                 
## predict_svm_test  No Yes
##              No   37   6
##              Yes  16 135
# Calculate metrics
TP <- conf_matrix_svm_test[2, 2]
TN <- conf_matrix_svm_test[1, 1]
FP <- conf_matrix_svm_test[2, 1]
FN <- conf_matrix_svm_test[1, 2]

Accuracy_svm <- (TP + TN) / sum(conf_matrix_svm_test)
Precision_svm <- TP / (TP + FP)
Recall_svm <- TP / (TP + FN)
Specificity_svm <- TN / (TN + FP)
F1_score_svm <- 2 * Precision_svm * Recall_svm / (Precision_svm + Recall_svm)
F2_score_svm <- (1 + 2^2) * Precision_svm * Recall_svm / ((2^2 * Precision_svm) + Recall_svm)

# Create a data frame for metrics
metrics_df_Svm <- data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "Specificity", "F1 Score", "F2 Score"),
  Value = c(Accuracy_svm, Precision_svm, Recall_svm, Specificity_svm, F1_score_svm, F2_score_svm)
)

# Print metrics table
print(metrics_df_Svm)
##        Metric     Value
## 1    Accuracy 0.8865979
## 2   Precision 0.8940397
## 3      Recall 0.9574468
## 4 Specificity 0.6981132
## 5    F1 Score 0.9246575
## 6    F2 Score 0.9440559
# Calculate ROC curve
# hard class labels give only a single operating point; SVM decision values provide the continuous score an ROC needs
svm_scores <- attr(predict(svm_model_train, newdata = sample_test_car, decision.values = TRUE), "decision.values")
roc_curve_svm <- roc(sample_test_car$Private, as.numeric(svm_scores))
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
# Create ROC plot with AUC
ggroc(roc_curve_svm) +
  ggtitle("ROC Curve - Test Dataset") +
  geom_abline(slope = 1, intercept = 1, linetype = "dashed", color = "red") +
  theme_minimal() +
  annotate("text", x = 0.125, y = 0.125, label = paste("AUC =", round(auc(roc_curve_svm), 2)), size = 3)