Question 1

#a
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.1
## Loading required package: lattice
# Load data
admission.train <- read.csv("C:/Users/trank/OneDrive/Documents/admission_train.csv")
admission.test  <- read.csv("C:/Users/trank/OneDrive/Documents/admission_test.csv")

# Clean missing data
admission1 <- na.omit(admission.train)

# Recode and clean
admission <- admission1 %>%
  mutate(
    admit_status = case_when(
      admit == 0 ~ 0,
      admit == 1 ~ 1
    )
  ) %>%
  dplyr::select(-admit)

# Convert categorical variables to factors
cat_vars <- c("admit_status", "prestige")
admission[cat_vars] <- lapply(admission[cat_vars], as.factor)

# Partition data (75% training, 25% testing)
set.seed(4061)
train_index <- createDataPartition(admission$admit_status, p = 0.75, list = FALSE)

training <- admission[train_index, ]
testing  <- admission[-train_index, ]

# Check class balance
table(training$admit_status)
## 
##   0   1 
## 153  71
table(testing$admit_status)
## 
##  0  1 
## 51 23
ctrl <- trainControl(method="LOOCV")
set.seed(4061)
knn_fit <- train(admit_status~ gre + gpa + prestige, data=training, method="knn",
preProcess=c("center", "scale"), trControl=trainControl(method="LOOCV"), tuneLength=50)
knn_pred <- predict(knn_fit, newdata=testing)
(knn_tab <- table(True=testing$admit_status, Predict=knn_pred))
##     Predict
## True  0  1
##    0 43  8
##    1 16  7
(knn_accuracy <- mean(knn_pred==testing$admit_status))
## [1] 0.6756757
print(knn_fit)
## k-Nearest Neighbors 
## 
## 224 samples
##   3 predictor
##   2 classes: '0', '1' 
## 
## Pre-processing: centered (5), scaled (5) 
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 223, 223, 223, 223, 223, 223, ... 
## Resampling results across tuning parameters:
## 
##   k    Accuracy   Kappa       
##     5  0.6785714   0.162877608
##     7  0.7008929   0.189982729
##     9  0.7187500   0.251591006
##    11  0.7187500   0.245024609
##    13  0.7321429   0.284116331
##    15  0.7232143   0.247099642
##    17  0.7008929   0.175384615
##    19  0.7142857   0.194154019
##    21  0.7187500   0.195622435
##    23  0.7098214   0.177772758
##    25  0.7008929   0.167886449
##    27  0.6964286   0.151609669
##    29  0.7008929   0.167886449
##    31  0.7053571   0.161239079
##    33  0.6964286   0.127705876
##    35  0.6875000   0.102050166
##    37  0.6875000   0.093536825
##    39  0.6875000   0.093536825
##    41  0.6875000   0.084860511
##    43  0.6830357   0.067323481
##    45  0.6785714   0.058713669
##    47  0.6919643   0.084793937
##    49  0.6785714   0.049616971
##    51  0.6607143  -0.012971558
##    53  0.6607143  -0.012971558
##    55  0.6741071   0.012560386
##    57  0.6785714   0.021240442
##    59  0.6741071   0.002683581
##    61  0.6785714   0.011401250
##    63  0.6741071   0.002683581
##    65  0.6785714   0.011401250
##    67  0.6830357   0.030007319
##    69  0.6785714   0.021240442
##    71  0.6785714   0.011401250
##    73  0.6785714   0.011401250
##    75  0.6696429  -0.005947324
##    77  0.6785714   0.021240442
##    79  0.6785714   0.011401250
##    81  0.6875000   0.038862327
##    83  0.6830357   0.020206999
##    85  0.6785714  -0.008882772
##    87  0.6785714  -0.008882772
##    89  0.6830357   0.000000000
##    91  0.6830357   0.000000000
##    93  0.6830357   0.000000000
##    95  0.6830357   0.000000000
##    97  0.6830357   0.000000000
##    99  0.6830357   0.000000000
##   101  0.6830357   0.000000000
##   103  0.6830357   0.000000000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 13.
missclassacc <- 1 - knn_accuracy
#b
logit_fit <- glm(admit_status ~ gre + gpa + prestige,
                 data = training,
                 family = binomial)

# Summary of the model
summary(logit_fit)
## 
## Call:
## glm(formula = admit_status ~ gre + gpa + prestige, family = binomial, 
##     data = training)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.075820   1.595816  -3.807  0.00014 ***
## gre          0.004018   0.001426   2.818  0.00483 ** 
## gpa          1.070827   0.454504   2.356  0.01847 *  
## prestige2   -0.558905   0.433646  -1.289  0.19745    
## prestige3   -1.192752   0.466469  -2.557  0.01056 *  
## prestige4   -1.214588   0.569310  -2.133  0.03289 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 279.80  on 223  degrees of freedom
## Residual deviance: 246.83  on 218  degrees of freedom
## AIC: 258.83
## 
## Number of Fisher Scoring iterations: 4
logit_prob <- predict(logit_fit, newdata = testing, type = "response")
# Predicted class (threshold = 0.5)
logit_pred <- factor(ifelse(logit_prob >= 0.5, "1", "0"),levels = c("0", "1"))
logit_tab <- table(True=testing$admit_status, Predict=logit_pred)
(logit_accuracy <- mean(logit_pred == testing$admit_status))
## [1] 0.6891892
misclass_rate <- 1 - logit_accuracy

# Load PRROC package
library(PRROC)
## Warning: package 'PRROC' was built under R version 4.5.2
## Loading required package: rlang
## Warning: package 'rlang' was built under R version 4.5.1
# Convert factor labels to numeric (0 and 1)
test_labels <- as.numeric(as.character(testing$admit_status))

# Compute ROC curve using predicted probabilities
logit_prroc <- roc.curve(
  scores.class0 = logit_prob[test_labels == 1],  # predicted probs for admitted students
  scores.class1 = logit_prob[test_labels == 0],  # predicted probs for not admitted students
  curve = TRUE
)

# Plot ROC curve
plot(logit_prroc, cex.lab = 1.5, cex.axis = 1.35)

# Print AUC value
logit_prroc$auc
## [1] 0.6820119
#c
library(caret)
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.5.2
# complexity parameters
tree_grid <- expand.grid(cp = seq(0, 0.1, by = 0.001))

# Cross-validation to find the best cp
set.seed(4061)
rp_model <- train(
  admit_status ~ .,                      
  data = training,                      
  method = "rpart",
  trControl = trainControl(method = "cv", number = 5),
  tuneGrid = tree_grid
)
rp_fit <- rpart(admit_status~., data=training, cp=rp_model$bestTune)
rpart.plot(rp_fit)

rp_pred <- predict(rp_fit, newdata = testing, type = "class")
rp_prob <- predict(rp_fit, newdata = testing, type = "prob")[, "1"]
(rp_accuracy <- mean(rp_pred == testing$admit_status))
## [1] 0.7162162
missclassacc <- 1 - rp_accuracy
print(missclassacc)
## [1] 0.2837838
#d
library(caret)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
training$admit_status <- factor(training$admit_status, levels = c(0, 1))
testing$admit_status  <- factor(testing$admit_status, levels = c(0, 1))

set.seed(4061)

# 3. Train Random Forest with 5-fold CV

rf_fit <- train(
  admit_status ~ ., 
  data = training, 
  method = "rf",
  trControl = trainControl(method = "cv", number = 5),
  tuneLength = 10
)
## note: only 4 unique complexity parameters in default grid. Truncating the grid to 4 .
plot(rf_fit, main = "Random Forest: Accuracy vs mtry")

best_mtry <- rf_fit$bestTune
best_mtry
# 5. Predict on test set

rf_pred <- predict(rf_fit, newdata = testing)

rf_pred_num <- as.numeric(as.character(rf_pred))
testY_num   <- as.numeric(as.character(testing$admit_status))

# 7. Confusion matrix

(rf_tab <- table(True = testY_num, Predict = rf_pred_num))
##     Predict
## True  0  1
##    0 42  9
##    1 18  5
# 8. Accuracy and misclassification rate

(rf_accuracy <- mean(rf_pred_num == testY_num))
## [1] 0.6351351
(rf_misclass <- 1 - rf_accuracy)
## [1] 0.3648649
# 9. Variable importance plot

varImpPlot(rf_fit$finalModel, main = "Random Forest: Top Predictors")

# 10. Detailed confusion matrix with stats

confusionMatrix(rf_pred, testing$admit_status)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 42 18
##          1  9  5
##                                          
##                Accuracy : 0.6351         
##                  95% CI : (0.5151, 0.744)
##     No Information Rate : 0.6892         
##     P-Value [Acc > NIR] : 0.8701         
##                                          
##                   Kappa : 0.0458         
##                                          
##  Mcnemar's Test P-Value : 0.1237         
##                                          
##             Sensitivity : 0.8235         
##             Specificity : 0.2174         
##          Pos Pred Value : 0.7000         
##          Neg Pred Value : 0.3571         
##              Prevalence : 0.6892         
##          Detection Rate : 0.5676         
##    Detection Prevalence : 0.8108         
##       Balanced Accuracy : 0.5205         
##                                          
##        'Positive' Class : 0              
## 
#e
library(caret)
library(lattice)

training$admit_status <- factor(training$admit_status, levels = c(0, 1))
testing$admit_status  <- factor(testing$admit_status, levels = c(0, 1))

set.seed(4061)

# 3. Number of observations and predictors

n <- nrow(training)
p <- ncol(training) - 1

grid <- expand.grid(
  sigma = 2^seq(log2(0.01), log2(10), length.out = 9),
  C     = 2^seq(log2(1/n), log2(10*n), length.out = 9)
)

# 5. Train SVM

svm_fit <- train(
  admit_status ~ ., 
  data = training, 
  method = "svmRadial",
  trControl = trainControl(method = "cv", number = 5),
  tuneGrid = grid,
  preProcess = c("center", "scale", "nzv")
)

plot(svm_fit, main = "SVM Radial Kernel: Accuracy vs Sigma and C")

# Best parameters

best_para <- svm_fit$bestTune
best_para
# 7. Predict on test set

svm_pred <- predict(svm_fit, newdata = testing)
svm_pred_num <- as.numeric(as.character(svm_pred))
testY_num    <- as.numeric(as.character(testing$admit_status))


# 9. Confusion matrix

(svm_tab <- table(True = testY_num, Predict = svm_pred_num))
##     Predict
## True  0  1
##    0 41 10
##    1 16  7
# 10. Accuracy and misclassification rate

(svm_accuracy <- mean(svm_pred_num == testY_num))
## [1] 0.6486486
(svm_misclass <- 1 - svm_accuracy)
## [1] 0.3513514

Question 2

#a
# Load packages
library(car)
## Warning: package 'car' was built under R version 4.5.1
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.1
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Load the Salaries dataset
data("Salaries")

# View the first few rows
head(Salaries)
# Fit a multiple linear regression model
salary_lm <- lm(salary ~ ., data = Salaries)

# Summarize the model
summary(salary_lm)
## 
## Call:
## lm(formula = salary ~ ., data = Salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65248 -13211  -1775  10384  99592 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    65955.2     4588.6  14.374  < 2e-16 ***
## rankAssocProf  12907.6     4145.3   3.114  0.00198 ** 
## rankProf       45066.0     4237.5  10.635  < 2e-16 ***
## disciplineB    14417.6     2342.9   6.154 1.88e-09 ***
## yrs.since.phd    535.1      241.0   2.220  0.02698 *  
## yrs.service     -489.5      211.9  -2.310  0.02143 *  
## sexMale         4783.5     3858.7   1.240  0.21584    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared:  0.4547, Adjusted R-squared:  0.4463 
## F-statistic:  54.2 on 6 and 390 DF,  p-value: < 2.2e-16
#b
library(car)
library(rpart)
library(rpart.plot)

# Load the Salaries dataset
data("Salaries")

# View the data
head(Salaries)
# Fit a regression tree predicting salary
tree_fit <- rpart(
  salary ~ rank + discipline + yrs.since.phd + yrs.service + sex,
  data = Salaries,
  method = "anova"  
)

# Display summary
summary(tree_fit)
## Call:
## rpart(formula = salary ~ rank + discipline + yrs.since.phd + 
##     yrs.service + sex, data = Salaries, method = "anova")
##   n= 397 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.37878848      0 1.0000000 1.0052789 0.07486510
## 2 0.03308338      1 0.6212115 0.6259663 0.05576504
## 3 0.03005491      2 0.5881281 0.6250104 0.05760150
## 4 0.01803279      3 0.5580732 0.6020002 0.05718459
## 5 0.01606402      5 0.5220077 0.6275410 0.05789950
## 6 0.01545334      6 0.5059436 0.6210851 0.05803549
## 7 0.01000000      7 0.4904903 0.5951021 0.05878917
## 
## Variable importance
##          rank yrs.since.phd   yrs.service    discipline           sex 
##            37            32            26             5             1 
## 
## Node number 1: 397 observations,    complexity param=0.3787885
##   mean=113706.5, MSE=9.15115e+08 
##   left son=2 (131 obs) right son=3 (266 obs)
##   Primary splits:
##       rank          splits as  LLR,      improve=0.37878850, (0 missing)
##       yrs.since.phd < 12.5 to the left,  improve=0.27504760, (0 missing)
##       yrs.service   < 8.5  to the left,  improve=0.20355660, (0 missing)
##       discipline    splits as  LR,       improve=0.02436220, (0 missing)
##       sex           splits as  LR,       improve=0.01921278, (0 missing)
##   Surrogate splits:
##       yrs.since.phd < 11.5 to the left,  agree=0.909, adj=0.725, (0 split)
##       yrs.service   < 8.5  to the left,  agree=0.859, adj=0.573, (0 split)
##       sex           splits as  LR,       agree=0.678, adj=0.023, (0 split)
## 
## Node number 2: 131 observations,    complexity param=0.01606402
##   mean=87176.21, MSE=1.685529e+08 
##   left son=4 (50 obs) right son=5 (81 obs)
##   Primary splits:
##       discipline    splits as  LR,       improve=0.26430960, (0 missing)
##       rank          splits as  LR-,      improve=0.25441840, (0 missing)
##       yrs.since.phd < 8.5  to the left,  improve=0.16784740, (0 missing)
##       yrs.service   < 5.5  to the left,  improve=0.15902320, (0 missing)
##       sex           splits as  LR,       improve=0.01945016, (0 missing)
##   Surrogate splits:
##       yrs.since.phd < 24   to the right, agree=0.641, adj=0.06, (0 split)
##       yrs.service   < 14   to the right, agree=0.634, adj=0.04, (0 split)
## 
## Node number 3: 266 observations,    complexity param=0.03308338
##   mean=126772.1, MSE=7.654365e+08 
##   left son=6 (131 obs) right son=7 (135 obs)
##   Primary splits:
##       discipline    splits as  LR,       improve=0.059031700, (0 missing)
##       yrs.since.phd < 45.5 to the right, improve=0.037865850, (0 missing)
##       yrs.service   < 33.5 to the right, improve=0.011894720, (0 missing)
##       sex           splits as  LR,       improve=0.002188808, (0 missing)
##   Surrogate splits:
##       yrs.since.phd < 26.5 to the right, agree=0.632, adj=0.252, (0 split)
##       yrs.service   < 25.5 to the right, agree=0.590, adj=0.168, (0 split)
## 
## Node number 4: 50 observations
##   mean=78680.84, MSE=9.21205e+07 
## 
## Node number 5: 81 observations,    complexity param=0.01545334
##   mean=92420.26, MSE=1.436832e+08 
##   left son=10 (43 obs) right son=11 (38 obs)
##   Primary splits:
##       rank          splits as  LR-,      improve=4.823893e-01, (0 missing)
##       yrs.service   < 5.5  to the left,  improve=4.365599e-01, (0 missing)
##       yrs.since.phd < 8.5  to the left,  improve=3.449514e-01, (0 missing)
##       sex           splits as  RL,       improve=7.989063e-06, (0 missing)
##   Surrogate splits:
##       yrs.service   < 5.5  to the left,  agree=0.975, adj=0.947, (0 split)
##       yrs.since.phd < 8.5  to the left,  agree=0.914, adj=0.816, (0 split)
##       sex           splits as  RL,       agree=0.543, adj=0.026, (0 split)
## 
## Node number 6: 131 observations,    complexity param=0.03005491
##   mean=119948.3, MSE=7.758025e+08 
##   left son=12 (7 obs) right son=13 (124 obs)
##   Primary splits:
##       yrs.since.phd < 46.5 to the right, improve=0.107438300, (0 missing)
##       yrs.service   < 43.5 to the right, improve=0.053101880, (0 missing)
##       sex           splits as  LR,       improve=0.008922571, (0 missing)
##   Surrogate splits:
##       yrs.service < 47   to the right, agree=0.977, adj=0.571, (0 split)
## 
## Node number 7: 135 observations
##   mean=133393.8, MSE=6.663464e+08 
## 
## Node number 10: 43 observations
##   mean=84593.91, MSE=4.99113e+07 
## 
## Node number 11: 38 observations
##   mean=101276.4, MSE=1.020512e+08 
## 
## Node number 12: 7 observations
##   mean=81523, MSE=2.082952e+08 
## 
## Node number 13: 124 observations,    complexity param=0.01803279
##   mean=122117.4, MSE=7.197831e+08 
##   left son=26 (42 obs) right son=27 (82 obs)
##   Primary splits:
##       yrs.since.phd < 26   to the left,  improve=0.07094925, (0 missing)
##       yrs.service   < 41.5 to the left,  improve=0.01967016, (0 missing)
##       sex           splits as  LR,       improve=0.01493644, (0 missing)
##   Surrogate splits:
##       yrs.service < 18.5 to the left,  agree=0.831, adj=0.5, (0 split)
## 
## Node number 26: 42 observations
##   mean=112132.2, MSE=3.168359e+08 
## 
## Node number 27: 82 observations,    complexity param=0.01803279
##   mean=127231.8, MSE=8.489457e+08 
##   left son=54 (70 obs) right son=55 (12 obs)
##   Primary splits:
##       yrs.service   < 18.5 to the right, improve=0.097254140, (0 missing)
##       yrs.since.phd < 33.5 to the right, improve=0.006780819, (0 missing)
## 
## Node number 54: 70 observations
##   mean=123469.7, MSE=6.367022e+08 
## 
## Node number 55: 12 observations
##   mean=149177.7, MSE=1.522849e+09
# Simple base plot
plot(tree_fit)
text(tree_fit, use.n = TRUE, pretty = 0)

# Better tree
rpart.plot(tree_fit, type = 2, extra = 101, fallen.leaves = TRUE, 
           main = "Regression Tree for Professor Salaries")

# complexity parameter table
printcp(tree_fit)
## 
## Regression tree:
## rpart(formula = salary ~ rank + discipline + yrs.since.phd + 
##     yrs.service + sex, data = Salaries, method = "anova")
## 
## Variables actually used in tree construction:
## [1] discipline    rank          yrs.service   yrs.since.phd
## 
## Root node error: 3.633e+11/397 = 915114969
## 
## n= 397 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.378788      0   1.00000 1.00528 0.074865
## 2 0.033083      1   0.62121 0.62597 0.055765
## 3 0.030055      2   0.58813 0.62501 0.057602
## 4 0.018033      3   0.55807 0.60200 0.057185
## 5 0.016064      5   0.52201 0.62754 0.057900
## 6 0.015453      6   0.50594 0.62109 0.058035
## 7 0.010000      7   0.49049 0.59510 0.058789
# Plot cross-validation results
plotcp(tree_fit, main = "Complexity Parameter (CP) Plot")

# Best CP
best_cp <- tree_fit$cptable[which.min(tree_fit$cptable[,"xerror"]), "CP"]

# Prune the tree
pruned_tree <- prune(tree_fit, cp = best_cp)

# Plot the pruned tree
rpart.plot(pruned_tree, type = 2, extra = 101, fallen.leaves = TRUE, 
           main = "Pruned Regression Tree for Professor Salaries")

# Question 5

# Data from Q5
mu0 <- c(3, 6)
mu1 <- c(5, 8)
Spooled <- matrix(c(1,1,1,2), 2, 2)
x <- c(2, 7)

# LDA direction vector: Sigma^{-1}(mu0 - mu1)
e <- solve(Spooled) %*% matrix(mu0 - mu1, 2, 1)

# normalization
c <- t(e) %*% Spooled %*% e
a <- e / sqrt(c[1,1])

# project x and find distances to class means
d0 <- abs(t(a) %*% (matrix(x, 2, 1) - mu0))
d1 <- abs(t(a) %*% (matrix(x, 2, 1) - mu1))

c(d0, d1)  # smaller distance ⇒ closer class
## [1] 1 3

Question 6

mu0 <- c(3, 6)
mu1 <- c(5, 8)
mu2 <- c(5, 4)
x <- c(2, 7)

sigma0 <- sigma1 <- sigma2 <- matrix(c(2, 1.67, 1.67, 2.33), 2, 2)

#Class 0 vs Class 1
e01 <- solve(sigma0) %*% matrix(mu0 - mu1, 2, 1)
c01 <- t(e01) %*% sigma0 %*% e01
a01 <- e01 / sqrt(c01[1,1])
d0 <- abs(t(a01) %*% (matrix(x,2,1) - mu0))
d1 <- abs(t(a01) %*% (matrix(x,2,1) - mu1))
cat("d0 =", d0, " d1 =", d1, "\n")
## d0 = 0.2424643  d1 = 1.69725
#Class 0 vs Class 2
e02 <- solve(sigma0) %*% matrix(mu0 - mu2, 2, 1)
c02 <- t(e02) %*% sigma0 %*% e02
a02 <- e02 / sqrt(c02[1,1])
d02 <- abs(t(a02) %*% (matrix(x,2,1) - mu0))
d22 <- abs(t(a02) %*% (matrix(x,2,1) - mu2))
cat("d0 =", d02, " d2 =", d22, "\n")
## d0 = 2.024646  d2 = 6.073939
#Class 1 vs Class 2
e12 <- solve(sigma1) %*% matrix(mu1 - mu2, 2, 1)
c12 <- t(e12) %*% sigma1 %*% e12
a12 <- e12 / sqrt(c12[1,1])
d12 <- abs(t(a12) %*% (matrix(x,2,1) - mu1))
d22_2 <- abs(t(a12) %*% (matrix(x,2,1) - mu2))
cat("d1 =", d12, " d2 =", d22_2, "\n")
## d1 = 1.555976  d2 = 5.691462
# Assign to class 0