#a
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.1
## Loading required package: lattice
# Load data
admission.train <- read.csv("C:/Users/trank/OneDrive/Documents/admission_train.csv")
admission.test <- read.csv("C:/Users/trank/OneDrive/Documents/admission_test.csv")
# Clean missing data
admission1 <- na.omit(admission.train)
# Recode and clean
admission <- admission1 %>%
mutate(
admit_status = case_when(
admit == 0 ~ 0,
admit == 1 ~ 1
)
) %>%
dplyr::select(-admit)
# Convert categorical variables to factors
cat_vars <- c("admit_status", "prestige")
admission[cat_vars] <- lapply(admission[cat_vars], as.factor)
# Partition data (75% training, 25% testing)
set.seed(4061)
train_index <- createDataPartition(admission$admit_status, p = 0.75, list = FALSE)
training <- admission[train_index, ]
testing <- admission[-train_index, ]
# Check class balance
table(training$admit_status)
##
## 0 1
## 153 71
table(testing$admit_status)
##
## 0 1
## 51 23
ctrl <- trainControl(method="LOOCV")
set.seed(4061)
knn_fit <- train(admit_status~ gre + gpa + prestige, data=training, method="knn",
preProcess=c("center", "scale"), trControl=trainControl(method="LOOCV"), tuneLength=50)
knn_pred <- predict(knn_fit, newdata=testing)
(knn_tab <- table(True=testing$admit_status, Predict=knn_pred))
## Predict
## True 0 1
## 0 43 8
## 1 16 7
(knn_accuracy <- mean(knn_pred==testing$admit_status))
## [1] 0.6756757
print(knn_fit)
## k-Nearest Neighbors
##
## 224 samples
## 3 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered (5), scaled (5)
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 223, 223, 223, 223, 223, 223, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.6785714 0.162877608
## 7 0.7008929 0.189982729
## 9 0.7187500 0.251591006
## 11 0.7187500 0.245024609
## 13 0.7321429 0.284116331
## 15 0.7232143 0.247099642
## 17 0.7008929 0.175384615
## 19 0.7142857 0.194154019
## 21 0.7187500 0.195622435
## 23 0.7098214 0.177772758
## 25 0.7008929 0.167886449
## 27 0.6964286 0.151609669
## 29 0.7008929 0.167886449
## 31 0.7053571 0.161239079
## 33 0.6964286 0.127705876
## 35 0.6875000 0.102050166
## 37 0.6875000 0.093536825
## 39 0.6875000 0.093536825
## 41 0.6875000 0.084860511
## 43 0.6830357 0.067323481
## 45 0.6785714 0.058713669
## 47 0.6919643 0.084793937
## 49 0.6785714 0.049616971
## 51 0.6607143 -0.012971558
## 53 0.6607143 -0.012971558
## 55 0.6741071 0.012560386
## 57 0.6785714 0.021240442
## 59 0.6741071 0.002683581
## 61 0.6785714 0.011401250
## 63 0.6741071 0.002683581
## 65 0.6785714 0.011401250
## 67 0.6830357 0.030007319
## 69 0.6785714 0.021240442
## 71 0.6785714 0.011401250
## 73 0.6785714 0.011401250
## 75 0.6696429 -0.005947324
## 77 0.6785714 0.021240442
## 79 0.6785714 0.011401250
## 81 0.6875000 0.038862327
## 83 0.6830357 0.020206999
## 85 0.6785714 -0.008882772
## 87 0.6785714 -0.008882772
## 89 0.6830357 0.000000000
## 91 0.6830357 0.000000000
## 93 0.6830357 0.000000000
## 95 0.6830357 0.000000000
## 97 0.6830357 0.000000000
## 99 0.6830357 0.000000000
## 101 0.6830357 0.000000000
## 103 0.6830357 0.000000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 13.
missclassacc <- 1 - knn_accuracy
#b
logit_fit <- glm(admit_status ~ gre + gpa + prestige,
data = training,
family = binomial)
# Summary of the model
summary(logit_fit)
##
## Call:
## glm(formula = admit_status ~ gre + gpa + prestige, family = binomial,
## data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.075820 1.595816 -3.807 0.00014 ***
## gre 0.004018 0.001426 2.818 0.00483 **
## gpa 1.070827 0.454504 2.356 0.01847 *
## prestige2 -0.558905 0.433646 -1.289 0.19745
## prestige3 -1.192752 0.466469 -2.557 0.01056 *
## prestige4 -1.214588 0.569310 -2.133 0.03289 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 279.80 on 223 degrees of freedom
## Residual deviance: 246.83 on 218 degrees of freedom
## AIC: 258.83
##
## Number of Fisher Scoring iterations: 4
logit_prob <- predict(logit_fit, newdata = testing, type = "response")
# Predicted class (threshold = 0.5)
logit_pred <- factor(ifelse(logit_prob >= 0.5, "1", "0"),levels = c("0", "1"))
logit_tab <- table(True=testing$admit_status, Predict=logit_pred)
(logit_accuracy <- mean(logit_pred == testing$admit_status))
## [1] 0.6891892
misclass_rate <- 1 - logit_accuracy
# Load PRROC package
library(PRROC)
## Warning: package 'PRROC' was built under R version 4.5.2
## Loading required package: rlang
## Warning: package 'rlang' was built under R version 4.5.1
# Convert factor labels to numeric (0 and 1)
test_labels <- as.numeric(as.character(testing$admit_status))
# Compute ROC curve using predicted probabilities
logit_prroc <- roc.curve(
scores.class0 = logit_prob[test_labels == 1], # predicted probs for admitted students
scores.class1 = logit_prob[test_labels == 0], # predicted probs for not admitted students
curve = TRUE
)
# Plot ROC curve
plot(logit_prroc, cex.lab = 1.5, cex.axis = 1.35)
# Print AUC value
logit_prroc$auc
## [1] 0.6820119
#c
library(caret)
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.5.2
# complexity parameters
tree_grid <- expand.grid(cp = seq(0, 0.1, by = 0.001))
# Cross-validation to find the best cp
set.seed(4061)
rp_model <- train(
admit_status ~ .,
data = training,
method = "rpart",
trControl = trainControl(method = "cv", number = 5),
tuneGrid = tree_grid
)
rp_fit <- rpart(admit_status~., data=training, cp=rp_model$bestTune)
rpart.plot(rp_fit)
rp_pred <- predict(rp_fit, newdata = testing, type = "class")
rp_prob <- predict(rp_fit, newdata = testing, type = "prob")[, "1"]
(rp_accuracy <- mean(rp_pred == testing$admit_status))
## [1] 0.7162162
missclassacc <- 1 - rp_accuracy
print(missclassacc)
## [1] 0.2837838
#d
library(caret)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
training$admit_status <- factor(training$admit_status, levels = c(0, 1))
testing$admit_status <- factor(testing$admit_status, levels = c(0, 1))
set.seed(4061)
# 3. Train Random Forest with 5-fold CV
rf_fit <- train(
admit_status ~ .,
data = training,
method = "rf",
trControl = trainControl(method = "cv", number = 5),
tuneLength = 10
)
## note: only 4 unique complexity parameters in default grid. Truncating the grid to 4 .
plot(rf_fit, main = "Random Forest: Accuracy vs mtry")
best_mtry <- rf_fit$bestTune
best_mtry
# 5. Predict on test set
rf_pred <- predict(rf_fit, newdata = testing)
rf_pred_num <- as.numeric(as.character(rf_pred))
testY_num <- as.numeric(as.character(testing$admit_status))
# 7. Confusion matrix
(rf_tab <- table(True = testY_num, Predict = rf_pred_num))
## Predict
## True 0 1
## 0 42 9
## 1 18 5
# 8. Accuracy and misclassification rate
(rf_accuracy <- mean(rf_pred_num == testY_num))
## [1] 0.6351351
(rf_misclass <- 1 - rf_accuracy)
## [1] 0.3648649
# 9. Variable importance plot
varImpPlot(rf_fit$finalModel, main = "Random Forest: Top Predictors")
# 10. Detailed confusion matrix with stats
confusionMatrix(rf_pred, testing$admit_status)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 42 18
## 1 9 5
##
## Accuracy : 0.6351
## 95% CI : (0.5151, 0.744)
## No Information Rate : 0.6892
## P-Value [Acc > NIR] : 0.8701
##
## Kappa : 0.0458
##
## Mcnemar's Test P-Value : 0.1237
##
## Sensitivity : 0.8235
## Specificity : 0.2174
## Pos Pred Value : 0.7000
## Neg Pred Value : 0.3571
## Prevalence : 0.6892
## Detection Rate : 0.5676
## Detection Prevalence : 0.8108
## Balanced Accuracy : 0.5205
##
## 'Positive' Class : 0
##
#e
library(caret)
library(lattice)
training$admit_status <- factor(training$admit_status, levels = c(0, 1))
testing$admit_status <- factor(testing$admit_status, levels = c(0, 1))
set.seed(4061)
# 3. Number of observations and predictors
n <- nrow(training)
p <- ncol(training) - 1
grid <- expand.grid(
sigma = 2^seq(log2(0.01), log2(10), length.out = 9),
C = 2^seq(log2(1/n), log2(10*n), length.out = 9)
)
# 5. Train SVM
svm_fit <- train(
admit_status ~ .,
data = training,
method = "svmRadial",
trControl = trainControl(method = "cv", number = 5),
tuneGrid = grid,
preProcess = c("center", "scale", "nzv")
)
plot(svm_fit, main = "SVM Radial Kernel: Accuracy vs Sigma and C")
# Best parameters
best_para <- svm_fit$bestTune
best_para
# 7. Predict on test set
svm_pred <- predict(svm_fit, newdata = testing)
svm_pred_num <- as.numeric(as.character(svm_pred))
testY_num <- as.numeric(as.character(testing$admit_status))
# 9. Confusion matrix
(svm_tab <- table(True = testY_num, Predict = svm_pred_num))
## Predict
## True 0 1
## 0 41 10
## 1 16 7
# 10. Accuracy and misclassification rate
(svm_accuracy <- mean(svm_pred_num == testY_num))
## [1] 0.6486486
(svm_misclass <- 1 - svm_accuracy)
## [1] 0.3513514
#a
# Load packages
library(car)
## Warning: package 'car' was built under R version 4.5.1
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.1
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Load the Salaries dataset
data("Salaries")
# View the first few rows
head(Salaries)
# Fit a multiple linear regression model
salary_lm <- lm(salary ~ ., data = Salaries)
# Summarize the model
summary(salary_lm)
##
## Call:
## lm(formula = salary ~ ., data = Salaries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65248 -13211 -1775 10384 99592
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65955.2 4588.6 14.374 < 2e-16 ***
## rankAssocProf 12907.6 4145.3 3.114 0.00198 **
## rankProf 45066.0 4237.5 10.635 < 2e-16 ***
## disciplineB 14417.6 2342.9 6.154 1.88e-09 ***
## yrs.since.phd 535.1 241.0 2.220 0.02698 *
## yrs.service -489.5 211.9 -2.310 0.02143 *
## sexMale 4783.5 3858.7 1.240 0.21584
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared: 0.4547, Adjusted R-squared: 0.4463
## F-statistic: 54.2 on 6 and 390 DF, p-value: < 2.2e-16
#b
library(car)
library(rpart)
library(rpart.plot)
# Load the Salaries dataset
data("Salaries")
# View the data
head(Salaries)
# Fit a regression tree predicting salary
tree_fit <- rpart(
salary ~ rank + discipline + yrs.since.phd + yrs.service + sex,
data = Salaries,
method = "anova"
)
# Display summary
summary(tree_fit)
## Call:
## rpart(formula = salary ~ rank + discipline + yrs.since.phd +
## yrs.service + sex, data = Salaries, method = "anova")
## n= 397
##
## CP nsplit rel error xerror xstd
## 1 0.37878848 0 1.0000000 1.0052789 0.07486510
## 2 0.03308338 1 0.6212115 0.6259663 0.05576504
## 3 0.03005491 2 0.5881281 0.6250104 0.05760150
## 4 0.01803279 3 0.5580732 0.6020002 0.05718459
## 5 0.01606402 5 0.5220077 0.6275410 0.05789950
## 6 0.01545334 6 0.5059436 0.6210851 0.05803549
## 7 0.01000000 7 0.4904903 0.5951021 0.05878917
##
## Variable importance
## rank yrs.since.phd yrs.service discipline sex
## 37 32 26 5 1
##
## Node number 1: 397 observations, complexity param=0.3787885
## mean=113706.5, MSE=9.15115e+08
## left son=2 (131 obs) right son=3 (266 obs)
## Primary splits:
## rank splits as LLR, improve=0.37878850, (0 missing)
## yrs.since.phd < 12.5 to the left, improve=0.27504760, (0 missing)
## yrs.service < 8.5 to the left, improve=0.20355660, (0 missing)
## discipline splits as LR, improve=0.02436220, (0 missing)
## sex splits as LR, improve=0.01921278, (0 missing)
## Surrogate splits:
## yrs.since.phd < 11.5 to the left, agree=0.909, adj=0.725, (0 split)
## yrs.service < 8.5 to the left, agree=0.859, adj=0.573, (0 split)
## sex splits as LR, agree=0.678, adj=0.023, (0 split)
##
## Node number 2: 131 observations, complexity param=0.01606402
## mean=87176.21, MSE=1.685529e+08
## left son=4 (50 obs) right son=5 (81 obs)
## Primary splits:
## discipline splits as LR, improve=0.26430960, (0 missing)
## rank splits as LR-, improve=0.25441840, (0 missing)
## yrs.since.phd < 8.5 to the left, improve=0.16784740, (0 missing)
## yrs.service < 5.5 to the left, improve=0.15902320, (0 missing)
## sex splits as LR, improve=0.01945016, (0 missing)
## Surrogate splits:
## yrs.since.phd < 24 to the right, agree=0.641, adj=0.06, (0 split)
## yrs.service < 14 to the right, agree=0.634, adj=0.04, (0 split)
##
## Node number 3: 266 observations, complexity param=0.03308338
## mean=126772.1, MSE=7.654365e+08
## left son=6 (131 obs) right son=7 (135 obs)
## Primary splits:
## discipline splits as LR, improve=0.059031700, (0 missing)
## yrs.since.phd < 45.5 to the right, improve=0.037865850, (0 missing)
## yrs.service < 33.5 to the right, improve=0.011894720, (0 missing)
## sex splits as LR, improve=0.002188808, (0 missing)
## Surrogate splits:
## yrs.since.phd < 26.5 to the right, agree=0.632, adj=0.252, (0 split)
## yrs.service < 25.5 to the right, agree=0.590, adj=0.168, (0 split)
##
## Node number 4: 50 observations
## mean=78680.84, MSE=9.21205e+07
##
## Node number 5: 81 observations, complexity param=0.01545334
## mean=92420.26, MSE=1.436832e+08
## left son=10 (43 obs) right son=11 (38 obs)
## Primary splits:
## rank splits as LR-, improve=4.823893e-01, (0 missing)
## yrs.service < 5.5 to the left, improve=4.365599e-01, (0 missing)
## yrs.since.phd < 8.5 to the left, improve=3.449514e-01, (0 missing)
## sex splits as RL, improve=7.989063e-06, (0 missing)
## Surrogate splits:
## yrs.service < 5.5 to the left, agree=0.975, adj=0.947, (0 split)
## yrs.since.phd < 8.5 to the left, agree=0.914, adj=0.816, (0 split)
## sex splits as RL, agree=0.543, adj=0.026, (0 split)
##
## Node number 6: 131 observations, complexity param=0.03005491
## mean=119948.3, MSE=7.758025e+08
## left son=12 (7 obs) right son=13 (124 obs)
## Primary splits:
## yrs.since.phd < 46.5 to the right, improve=0.107438300, (0 missing)
## yrs.service < 43.5 to the right, improve=0.053101880, (0 missing)
## sex splits as LR, improve=0.008922571, (0 missing)
## Surrogate splits:
## yrs.service < 47 to the right, agree=0.977, adj=0.571, (0 split)
##
## Node number 7: 135 observations
## mean=133393.8, MSE=6.663464e+08
##
## Node number 10: 43 observations
## mean=84593.91, MSE=4.99113e+07
##
## Node number 11: 38 observations
## mean=101276.4, MSE=1.020512e+08
##
## Node number 12: 7 observations
## mean=81523, MSE=2.082952e+08
##
## Node number 13: 124 observations, complexity param=0.01803279
## mean=122117.4, MSE=7.197831e+08
## left son=26 (42 obs) right son=27 (82 obs)
## Primary splits:
## yrs.since.phd < 26 to the left, improve=0.07094925, (0 missing)
## yrs.service < 41.5 to the left, improve=0.01967016, (0 missing)
## sex splits as LR, improve=0.01493644, (0 missing)
## Surrogate splits:
## yrs.service < 18.5 to the left, agree=0.831, adj=0.5, (0 split)
##
## Node number 26: 42 observations
## mean=112132.2, MSE=3.168359e+08
##
## Node number 27: 82 observations, complexity param=0.01803279
## mean=127231.8, MSE=8.489457e+08
## left son=54 (70 obs) right son=55 (12 obs)
## Primary splits:
## yrs.service < 18.5 to the right, improve=0.097254140, (0 missing)
## yrs.since.phd < 33.5 to the right, improve=0.006780819, (0 missing)
##
## Node number 54: 70 observations
## mean=123469.7, MSE=6.367022e+08
##
## Node number 55: 12 observations
## mean=149177.7, MSE=1.522849e+09
# Simple base plot
plot(tree_fit)
text(tree_fit, use.n = TRUE, pretty = 0)
# Better tree
rpart.plot(tree_fit, type = 2, extra = 101, fallen.leaves = TRUE,
main = "Regression Tree for Professor Salaries")
# complexity parameter table
printcp(tree_fit)
##
## Regression tree:
## rpart(formula = salary ~ rank + discipline + yrs.since.phd +
## yrs.service + sex, data = Salaries, method = "anova")
##
## Variables actually used in tree construction:
## [1] discipline rank yrs.service yrs.since.phd
##
## Root node error: 3.633e+11/397 = 915114969
##
## n= 397
##
## CP nsplit rel error xerror xstd
## 1 0.378788 0 1.00000 1.00528 0.074865
## 2 0.033083 1 0.62121 0.62597 0.055765
## 3 0.030055 2 0.58813 0.62501 0.057602
## 4 0.018033 3 0.55807 0.60200 0.057185
## 5 0.016064 5 0.52201 0.62754 0.057900
## 6 0.015453 6 0.50594 0.62109 0.058035
## 7 0.010000 7 0.49049 0.59510 0.058789
# Plot cross-validation results
plotcp(tree_fit, main = "Complexity Parameter (CP) Plot")
# Best CP
best_cp <- tree_fit$cptable[which.min(tree_fit$cptable[,"xerror"]), "CP"]
# Prune the tree
pruned_tree <- prune(tree_fit, cp = best_cp)
# Plot the pruned tree
rpart.plot(pruned_tree, type = 2, extra = 101, fallen.leaves = TRUE,
main = "Pruned Regression Tree for Professor Salaries")
# Question 5
# Data from Q5
mu0 <- c(3, 6)
mu1 <- c(5, 8)
Spooled <- matrix(c(1,1,1,2), 2, 2)
x <- c(2, 7)
# LDA direction vector: Sigma^{-1}(mu0 - mu1)
e <- solve(Spooled) %*% matrix(mu0 - mu1, 2, 1)
# normalization
c <- t(e) %*% Spooled %*% e
a <- e / sqrt(c[1,1])
# project x and find distances to class means
d0 <- abs(t(a) %*% (matrix(x, 2, 1) - mu0))
d1 <- abs(t(a) %*% (matrix(x, 2, 1) - mu1))
c(d0, d1) # smaller distance ⇒ closer class
## [1] 1 3
mu0 <- c(3, 6)
mu1 <- c(5, 8)
mu2 <- c(5, 4)
x <- c(2, 7)
sigma0 <- sigma1 <- sigma2 <- matrix(c(2, 1.67, 1.67, 2.33), 2, 2)
#Class 0 vs Class 1
e01 <- solve(sigma0) %*% matrix(mu0 - mu1, 2, 1)
c01 <- t(e01) %*% sigma0 %*% e01
a01 <- e01 / sqrt(c01[1,1])
d0 <- abs(t(a01) %*% (matrix(x,2,1) - mu0))
d1 <- abs(t(a01) %*% (matrix(x,2,1) - mu1))
cat("d0 =", d0, " d1 =", d1, "\n")
## d0 = 0.2424643 d1 = 1.69725
#Class 0 vs Class 2
e02 <- solve(sigma0) %*% matrix(mu0 - mu2, 2, 1)
c02 <- t(e02) %*% sigma0 %*% e02
a02 <- e02 / sqrt(c02[1,1])
d02 <- abs(t(a02) %*% (matrix(x,2,1) - mu0))
d22 <- abs(t(a02) %*% (matrix(x,2,1) - mu2))
cat("d0 =", d02, " d2 =", d22, "\n")
## d0 = 2.024646 d2 = 6.073939
#Class 1 vs Class 2
e12 <- solve(sigma1) %*% matrix(mu1 - mu2, 2, 1)
c12 <- t(e12) %*% sigma1 %*% e12
a12 <- e12 / sqrt(c12[1,1])
d12 <- abs(t(a12) %*% (matrix(x,2,1) - mu1))
d22_2 <- abs(t(a12) %*% (matrix(x,2,1) - mu2))
cat("d1 =", d12, " d2 =", d22_2, "\n")
## d1 = 1.555976 d2 = 5.691462
# Assign to class 0