Loading Dataset
Descriptive Statistics
#descriptive statistics
desc_sum <- summarytools::descr(College)
#plotting graph
kable(round(desc_sum, digits = 2), table.attr = "style='width:40%'",align="c", format="html", digits = 2) %>%
kable_styling(bootstrap_options="bordered", latex_options = "striped", font_size = NULL)
| Accept | Apps | Books | Enroll | Expend | F.Undergrad | Grad.Rate | Outstate | P.Undergrad | perc.alumni | Personal | PhD | Room.Board | S.F.Ratio | Terminal | Top10perc | Top25perc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Mean | 2018.80 | 3001.64 | 549.38 | 779.97 | 9660.17 | 3699.91 | 65.46 | 10440.67 | 855.30 | 22.74 | 1340.64 | 72.66 | 4357.53 | 14.09 | 79.70 | 27.56 | 55.80 |
| Std.Dev | 2451.11 | 3870.20 | 165.11 | 929.18 | 5221.77 | 4850.42 | 17.18 | 4023.02 | 1522.43 | 12.39 | 677.07 | 16.33 | 1096.70 | 3.96 | 14.72 | 17.64 | 19.80 |
| Min | 72.00 | 81.00 | 96.00 | 35.00 | 3186.00 | 139.00 | 10.00 | 2340.00 | 1.00 | 0.00 | 250.00 | 8.00 | 1780.00 | 2.50 | 24.00 | 1.00 | 9.00 |
| Q1 | 604.00 | 776.00 | 470.00 | 242.00 | 6751.00 | 992.00 | 53.00 | 7320.00 | 95.00 | 13.00 | 850.00 | 62.00 | 3597.00 | 11.50 | 71.00 | 15.00 | 41.00 |
| Median | 1110.00 | 1558.00 | 500.00 | 434.00 | 8377.00 | 1707.00 | 65.00 | 9990.00 | 353.00 | 21.00 | 1200.00 | 75.00 | 4200.00 | 13.60 | 82.00 | 23.00 | 54.00 |
| Q3 | 2424.00 | 3624.00 | 600.00 | 902.00 | 10830.00 | 4005.00 | 78.00 | 12925.00 | 967.00 | 31.00 | 1700.00 | 85.00 | 5050.00 | 16.50 | 92.00 | 35.00 | 69.00 |
| Max | 26330.00 | 48094.00 | 2340.00 | 6392.00 | 56233.00 | 31643.00 | 118.00 | 21700.00 | 21836.00 | 64.00 | 6800.00 | 103.00 | 8124.00 | 39.80 | 100.00 | 96.00 | 100.00 |
| MAD | 1008.17 | 1463.33 | 148.26 | 354.34 | 2730.95 | 1441.09 | 17.79 | 4121.63 | 449.23 | 13.34 | 593.04 | 17.79 | 1005.20 | 3.41 | 14.83 | 13.34 | 20.76 |
| IQR | 1820.00 | 2848.00 | 130.00 | 660.00 | 4079.00 | 3013.00 | 25.00 | 5605.00 | 872.00 | 18.00 | 850.00 | 23.00 | 1453.00 | 5.00 | 21.00 | 20.00 | 28.00 |
| CV | 1.21 | 1.29 | 0.30 | 1.19 | 0.54 | 1.31 | 0.26 | 0.39 | 1.78 | 0.54 | 0.51 | 0.22 | 0.25 | 0.28 | 0.18 | 0.64 | 0.35 |
| Skewness | 3.40 | 3.71 | 3.47 | 2.68 | 3.45 | 2.60 | -0.11 | 0.51 | 5.67 | 0.60 | 1.74 | -0.77 | 0.48 | 0.66 | -0.81 | 1.41 | 0.26 |
| SE.Skewness | 0.09 | 0.09 | 0.09 | 0.09 | 0.09 | 0.09 | 0.09 | 0.09 | 0.09 | 0.09 | 0.09 | 0.09 | 0.09 | 0.09 | 0.09 | 0.09 | 0.09 |
| Kurtosis | 18.75 | 26.52 | 28.06 | 8.74 | 18.59 | 7.61 | -0.22 | -0.43 | 54.52 | -0.11 | 7.04 | 0.54 | -0.20 | 2.52 | 0.22 | 2.17 | -0.57 |
| N.Valid | 777.00 | 777.00 | 777.00 | 777.00 | 777.00 | 777.00 | 777.00 | 777.00 | 777.00 | 777.00 | 777.00 | 777.00 | 777.00 | 777.00 | 777.00 | 777.00 | 777.00 |
| Pct.Valid | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 | 100.00 |
Exploratory Data Analysis
#EDA
#Histogram
hist(College$Outstate, xlab = "Outstate", ylab = "Frequency", main = "Histogram - Outstate", col = "skyblue")
#mean and median
mean_outstate = round(mean(College$Outstate), 2)
median_outstate = round(median(College$Outstate), 2)
#Adding mean and median lines
abline(v = mean_outstate, col = "blue", lty = 2)
abline(v = median_outstate, col = "red", lty = 2)
#Adding labels for mean and median lines
text(x=mean_outstate, y=25, round(mean_outstate,2), srt=90, cex=0.8, adj = c(0.001,1))
text(x=median_outstate, y=25, round(median_outstate,2), srt=90, cex=0.8, adj = c(0.001,0))
#boxplot
# Assuming you have a data frame named College with a column Grad.Rate
# Create a boxplot of Grad.Rate with custom formatting
# Assuming you have a data frame named College with a column Grad.Rate
private_col_acc <- ggplot(data = College, aes(x = Private, y = Accept, fill = Private)) +
geom_boxplot() +
guides(fill = "none")
private_col_enrol <- ggplot(data = College, aes(x = Private, y = Enroll, fill = Private)) +
geom_boxplot() +
guides(fill = "none")
grid.arrange(private_col_acc, private_col_enrol, nrow = 1)
Splitting and Partitioning
Dataset - Train and Test
#Partitioning the dataset into Train and Test
set.seed(20353)
Train_car_index <- createDataPartition(College$Private, p = 0.75, list = FALSE, times = 1)
sample_train_car = College[Train_car_index,]
sample_test_car = College[-Train_car_index,]
head(sample_train_car)
## Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University Yes 1660 1232 721 23 52
## Adelphi University Yes 2186 1924 512 16 29
## Adrian College Yes 1428 1097 336 22 50
## Agnes Scott College Yes 417 349 137 60 89
## Alaska Pacific University Yes 193 146 55 16 44
## Albertson College Yes 587 479 158 38 62
## F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University 2885 537 7440 3300 450
## Adelphi University 2683 1227 12280 6450 750
## Adrian College 1036 99 11250 3750 400
## Agnes Scott College 510 63 12960 5450 450
## Alaska Pacific University 249 869 7560 4120 800
## Albertson College 678 41 13500 3335 500
## Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 2200 70 78 18.1 12 7041
## Adelphi University 1500 29 30 12.2 16 10527
## Adrian College 1165 53 66 12.9 30 8735
## Agnes Scott College 875 92 97 7.7 37 19016
## Alaska Pacific University 1500 76 72 11.9 2 10922
## Albertson College 675 67 73 9.4 11 9727
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Adrian College 54
## Agnes Scott College 59
## Alaska Pacific University 15
## Albertson College 55
Generalized Linear Model -
Training - All features as Predictors
#glm model 1
glm_col_model1 = glm(Private ~., data = sample_train_car, family = binomial(link = "logit"))
summary(glm_col_model1)
##
## Call:
## glm(formula = Private ~ ., family = binomial(link = "logit"),
## data = sample_train_car)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6709 -0.0438 0.0446 0.1817 3.4481
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.591e+00 2.237e+00 -0.711 0.4770
## Apps -6.642e-04 2.654e-04 -2.502 0.0123 *
## Accept -6.048e-05 5.117e-04 -0.118 0.9059
## Enroll 1.811e-03 9.060e-04 1.999 0.0456 *
## Top10perc -5.908e-04 3.168e-02 -0.019 0.9851
## Top25perc 1.256e-02 2.064e-02 0.609 0.5428
## F.Undergrad -2.711e-04 1.399e-04 -1.937 0.0527 .
## P.Undergrad -3.462e-04 3.084e-04 -1.123 0.2616
## Outstate 6.327e-04 1.228e-04 5.154 2.56e-07 ***
## Room.Board 5.248e-04 2.983e-04 1.759 0.0785 .
## Books 2.401e-03 1.608e-03 1.494 0.1353
## Personal -4.186e-04 3.236e-04 -1.293 0.1958
## PhD -5.183e-02 2.759e-02 -1.878 0.0603 .
## Terminal -4.226e-02 2.771e-02 -1.525 0.1272
## S.F.Ratio -6.777e-02 7.651e-02 -0.886 0.3757
## perc.alumni 5.463e-02 2.447e-02 2.232 0.0256 *
## Expend 2.341e-04 1.496e-04 1.564 0.1177
## Grad.Rate 1.497e-02 1.313e-02 1.140 0.2542
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 683.22 on 582 degrees of freedom
## Residual deviance: 186.31 on 565 degrees of freedom
## AIC: 222.31
##
## Number of Fisher Scoring iterations: 8
#to show regression coef in log-odds
coef(glm_col_model1)
## (Intercept) Apps Accept Enroll Top10perc
## -1.591177e+00 -6.642083e-04 -6.048014e-05 1.811327e-03 -5.908029e-04
## Top25perc F.Undergrad P.Undergrad Outstate Room.Board
## 1.256172e-02 -2.710636e-04 -3.461744e-04 6.327025e-04 5.248531e-04
## Books Personal PhD Terminal S.F.Ratio
## 2.401092e-03 -4.185936e-04 -5.183406e-02 -4.226127e-02 -6.777422e-02
## perc.alumni Expend Grad.Rate
## 5.462570e-02 2.340785e-04 1.497043e-02
#to show regression coef in odds
exp(coef(glm_col_model1))
## (Intercept) Apps Accept Enroll Top10perc Top25perc
## 0.2036857 0.9993360 0.9999395 1.0018130 0.9994094 1.0126410
## F.Undergrad P.Undergrad Outstate Room.Board Books Personal
## 0.9997290 0.9996539 1.0006329 1.0005250 1.0024040 0.9995815
## PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
## 0.9494864 0.9586193 0.9344714 1.0561452 1.0002341 1.0150830
Generalized Linear Model -
Training - Three Predictors
glm_col_model2 = glm(Private ~ Accept + Enroll + Grad.Rate, data = sample_train_car, family = binomial(link = "logit"))
summary(glm_col_model2)
##
## Call:
## glm(formula = Private ~ Accept + Enroll + Grad.Rate, family = binomial(link = "logit"),
## data = sample_train_car)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4656 -0.0886 0.3369 0.5270 4.8536
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.9977861 0.4689462 -2.128 0.0334 *
## Accept 0.0003047 0.0001483 2.055 0.0399 *
## Enroll -0.0031306 0.0004847 -6.459 1.06e-10 ***
## Grad.Rate 0.0612451 0.0080246 7.632 2.31e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 683.22 on 582 degrees of freedom
## Residual deviance: 403.90 on 579 degrees of freedom
## AIC: 411.9
##
## Number of Fisher Scoring iterations: 6
#to show regression coef in log-odds
coef(glm_col_model2)
## (Intercept) Accept Enroll Grad.Rate
## -0.9977860925 0.0003046799 -0.0031305910 0.0612451012
#to show regression coef in odds
exp(coef(glm_col_model2))
## (Intercept) Accept Enroll Grad.Rate
## 0.3686948 1.0003047 0.9968743 1.0631595
#The odds of Private increase by a factor of 0.997 for every 1 unit increase in Enroll
Confusion Matrix - Training
dataset
#Training set predictions
prob.train = predict(glm_col_model2, newdata = sample_train_car, type = "response")
predicted.classes.min.train <- as.factor(ifelse(prob.train >= 0.5, "Yes", "No"))
#Confusion Matrix - training dataset
conf_matrix_tr = confusionMatrix(sample_train_car$Private, predicted.classes.min.train, positive = 'Yes')
# Print the confusion matrix
print(conf_matrix_tr)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 97 62
## Yes 19 405
##
## Accuracy : 0.8611
## 95% CI : (0.8303, 0.8881)
## No Information Rate : 0.801
## P-Value [Acc > NIR] : 9.817e-05
##
## Kappa : 0.6174
##
## Mcnemar's Test P-Value : 3.061e-06
##
## Sensitivity : 0.8672
## Specificity : 0.8362
## Pos Pred Value : 0.9552
## Neg Pred Value : 0.6101
## Prevalence : 0.8010
## Detection Rate : 0.6947
## Detection Prevalence : 0.7273
## Balanced Accuracy : 0.8517
##
## 'Positive' Class : Yes
##
#Metrics Table - training dataset
# Calculate metrics
accuracy <- conf_matrix_tr$overall['Accuracy']
precision <- conf_matrix_tr$byClass['Pos Pred Value']
recall <- conf_matrix_tr$byClass['Sensitivity']
specificity <- conf_matrix_tr$byClass['Specificity']
f1_score <- 2 * (precision * recall) / (precision + recall)
beta <- 2
f2_score <- ((1 + beta^2) * precision * recall) / ((beta^2 * precision) + recall)
# Create a data frame to store the metrics
metrics_df <- data.frame(
Metric = c("Accuracy", "Precision", "Recall (Sensitivity)", "Specificity", "F1 Score", "F2 Score"),
Value = c(accuracy, precision, recall, specificity, f1_score, f2_score)
)
# Print the formatted table
kable(metrics_df, format = "html", digits = 3) %>%
kable_styling(full_width = FALSE, bootstrap_options = "basic", table.envir = "table", latex_options = "basic", position = "center")
| Metric | Value |
|---|---|
| Accuracy | 0.861 |
| Precision | 0.955 |
| Recall (Sensitivity) | 0.867 |
| Specificity | 0.836 |
| F1 Score | 0.909 |
| F2 Score | 0.884 |
Confusion Matrix - Test dataset
#Confusion Matrix - test dataset
#Test set predictions
prob.test = predict(glm_col_model2, newdata = sample_test_car, type = "response")
predicted.classes.min.test <- as.factor(ifelse(prob.test >= 0.5, "Yes", "No"))
#Confusion Matrix - training dataset
conf_matrix_te = confusionMatrix(sample_test_car$Private, predicted.classes.min.test, positive = 'Yes')
# Print the confusion matrix
print(conf_matrix_te)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 33 20
## Yes 4 137
##
## Accuracy : 0.8763
## 95% CI : (0.8215, 0.9191)
## No Information Rate : 0.8093
## P-Value [Acc > NIR] : 0.008633
##
## Kappa : 0.6561
##
## Mcnemar's Test P-Value : 0.002200
##
## Sensitivity : 0.8726
## Specificity : 0.8919
## Pos Pred Value : 0.9716
## Neg Pred Value : 0.6226
## Prevalence : 0.8093
## Detection Rate : 0.7062
## Detection Prevalence : 0.7268
## Balanced Accuracy : 0.8823
##
## 'Positive' Class : Yes
##
#Metrics Table - training dataset
# Calculate metrics
accuracy_test <- conf_matrix_te$overall['Accuracy']
precision_test <- conf_matrix_te$byClass['Pos Pred Value']
recall_test <- conf_matrix_te$byClass['Sensitivity']
specificity_test <- conf_matrix_te$byClass['Specificity']
f1_score_test <- 2 * (precision_test * recall_test) / (precision_test + recall_test)
beta <- 2
f2_score_test <- ((1 + beta^2) * precision_test * recall_test) / ((beta^2 * precision_test) + recall_test)
# Create a data frame to store the metrics
metrics_df_test <- data.frame(
Metric = c("Accuracy", "Precision", "Recall (Sensitivity)", "Specificity", "F1 Score", "F2 Score"),
Value = c(accuracy_test, precision_test, recall_test, specificity_test, f1_score_test, f2_score_test)
)
# Print the formatted table
kable(metrics_df_test, format = "html", digits = 3) %>%
kable_styling(full_width = FALSE, bootstrap_options = "basic", table.envir = "table", latex_options = "basic", position = "center")
| Metric | Value |
|---|---|
| Accuracy | 0.876 |
| Precision | 0.972 |
| Recall (Sensitivity) | 0.873 |
| Specificity | 0.892 |
| F1 Score | 0.919 |
| F2 Score | 0.891 |
GLM (Test) - Receiver Operator
Characteristics Curve (ROC)
# Calculate ROC curve
roc_curve <- roc(as.factor(sample_test_car$Private), as.numeric(predicted.classes.min.test == "Yes"))
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
# Create ROC plot with AUC
ggroc(roc_curve) +
ggtitle("ROC Curve - Test Dataset") +
geom_abline(slope = 1, intercept = 1, linetype = "dashed", color = "red") +
theme_minimal() +
annotate("text", x = 0.125, y = 0.125, label = paste("AUC =", round(auc(roc_curve), 2)), size = 3)
Support Vecor Machine (SVM) -
Test
# Train the SVM model
svm_model_train <- svm(Private ~ Accept + Enroll + Grad.Rate, data = sample_train_car, kernel = "radial")
# Summary of the SVM model
summary(svm_model_train)
##
## Call:
## svm(formula = Private ~ Accept + Enroll + Grad.Rate, data = sample_train_car,
## kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 186
##
## ( 92 94 )
##
##
## Number of Classes: 2
##
## Levels:
## No Yes
# Assuming 'sample_test_car' is your test dataset
predict_svm_test <- predict(svm_model_train, newdata = sample_test_car)
# Create confusion matrix
conf_matrix_svm_test <- table(predict_svm_test, sample_test_car$Private)
# Print confusion matrix
print(conf_matrix_svm_test)
##
## predict_svm_test No Yes
## No 37 6
## Yes 16 135
# Calculate metrics
TP <- conf_matrix_svm_test[2, 2]
TN <- conf_matrix_svm_test[1, 1]
FP <- conf_matrix_svm_test[2, 1]
FN <- conf_matrix_svm_test[1, 2]
Accuracy_svm <- (TP + TN) / sum(conf_matrix_svm_test)
Precision_svm <- TP / (TP + FP)
Recall_svm <- TP / (TP + FN)
Specificity_svm <- TN / (TN + FP)
F1_score_svm <- 2 * Precision_svm * Recall_svm / (Precision_svm + Recall_svm)
F2_score_svm <- (1 + 2^2) * Precision_svm * Recall_svm / ((2^2 * Precision_svm) + Recall_svm)
# Create a data frame for metrics
metrics_df_Svm <- data.frame(
Metric = c("Accuracy", "Precision", "Recall", "Specificity", "F1 Score", "F2 Score"),
Value = c(Accuracy_svm, Precision_svm, Recall_svm, Specificity_svm, F1_score_svm, F2_score_svm)
)
# Print metrics table
print(metrics_df)
## Metric Value
## 1 Accuracy 0.8610635
## 2 Precision 0.9551887
## 3 Recall (Sensitivity) 0.8672377
## 4 Specificity 0.8362069
## 5 F1 Score 0.9090909
## 6 F2 Score 0.8835079
# Calculate ROC curve
roc_curve_svm <- roc(as.factor(sample_test_car$Private), as.numeric(predict_svm_test == "Yes"))
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
# Create ROC plot with AUC
ggroc(roc_curve_svm) +
ggtitle("ROC Curve - Test Dataset") +
geom_abline(slope = 1, intercept = 1, linetype = "dashed", color = "red") +
theme_minimal() +
annotate("text", x = 0.125, y = 0.125, label = paste("AUC =", round(auc(roc_curve_svm), 2)), size = 3)