Final exam_Application of Financial Software_Sarantuya Sharavsambuu

#Chapter 4 Question 16 – Boston Data: Classification Models

library(MASS)       
library(class)      
library(e1071)      
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

set.seed(123)

library(dplyr)
data("Boston")

# Create binary response variable
Boston$crim01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)

Boston <- dplyr::select(Boston, -crim)

# View data
head(Boston)

##   zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat medv
## 1 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98 24.0
## 2  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14 21.6
## 3  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03 34.7
## 4  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94 33.4
## 5  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33 36.2
## 6  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21 28.7
##   crim01
## 1      0
## 2      0
## 3      0
## 4      0
## 5      0
## 6      0

train_index <- sample(1:nrow(Boston), size = 0.7 * nrow(Boston))
train <- Boston[train_index, ]
test <- Boston[-train_index, ]

train_x <- dplyr::select(train, -crim01)
test_x <- dplyr::select(test, -crim01)
train_y <- train$crim01
test_y <- test$crim01

log_model <- glm(crim01 ~ ., data = train, family = binomial)
log_probs <- predict(log_model, newdata = test, type = "response")
log_pred <- ifelse(log_probs > 0.5, 1, 0)

confusionMatrix(as.factor(log_pred), as.factor(test_y))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 64  6
##          1 11 71
##                                          
##                Accuracy : 0.8882         
##                  95% CI : (0.827, 0.9335)
##     No Information Rate : 0.5066         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.7761         
##                                          
##  Mcnemar's Test P-Value : 0.332          
##                                          
##             Sensitivity : 0.8533         
##             Specificity : 0.9221         
##          Pos Pred Value : 0.9143         
##          Neg Pred Value : 0.8659         
##              Prevalence : 0.4934         
##          Detection Rate : 0.4211         
##    Detection Prevalence : 0.4605         
##       Balanced Accuracy : 0.8877         
##                                          
##        'Positive' Class : 0              
##

lda_model <- lda(crim01 ~ ., data = train)
lda_pred <- predict(lda_model, newdata = test)$class

confusionMatrix(lda_pred, as.factor(test_y))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 65 15
##          1 10 62
##                                           
##                Accuracy : 0.8355          
##                  95% CI : (0.7669, 0.8906)
##     No Information Rate : 0.5066          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6713          
##                                           
##  Mcnemar's Test P-Value : 0.4237          
##                                           
##             Sensitivity : 0.8667          
##             Specificity : 0.8052          
##          Pos Pred Value : 0.8125          
##          Neg Pred Value : 0.8611          
##              Prevalence : 0.4934          
##          Detection Rate : 0.4276          
##    Detection Prevalence : 0.5263          
##       Balanced Accuracy : 0.8359          
##                                           
##        'Positive' Class : 0               
##

nb_model <- naiveBayes(as.factor(crim01) ~ ., data = train)
nb_pred <- predict(nb_model, newdata = test)

confusionMatrix(nb_pred, as.factor(test_y))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 65 18
##          1 10 59
##                                          
##                Accuracy : 0.8158         
##                  95% CI : (0.7449, 0.874)
##     No Information Rate : 0.5066         
##     P-Value [Acc > NIR] : 2.259e-15      
##                                          
##                   Kappa : 0.632          
##                                          
##  Mcnemar's Test P-Value : 0.1859         
##                                          
##             Sensitivity : 0.8667         
##             Specificity : 0.7662         
##          Pos Pred Value : 0.7831         
##          Neg Pred Value : 0.8551         
##              Prevalence : 0.4934         
##          Detection Rate : 0.4276         
##    Detection Prevalence : 0.5461         
##       Balanced Accuracy : 0.8165         
##                                          
##        'Positive' Class : 0              
##

# Normalize predictors
train_norm <- scale(train_x)
test_norm <- scale(test_x, center = attr(train_norm, "scaled:center"), 
                              scale = attr(train_norm, "scaled:scale"))

knn_pred <- knn(train = train_norm, test = test_norm, cl = train_y, k = 5)

confusionMatrix(knn_pred, as.factor(test_y))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 65  8
##          1 10 69
##                                           
##                Accuracy : 0.8816          
##                  95% CI : (0.8193, 0.9283)
##     No Information Rate : 0.5066          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.763           
##                                           
##  Mcnemar's Test P-Value : 0.8137          
##                                           
##             Sensitivity : 0.8667          
##             Specificity : 0.8961          
##          Pos Pred Value : 0.8904          
##          Neg Pred Value : 0.8734          
##              Prevalence : 0.4934          
##          Detection Rate : 0.4276          
##    Detection Prevalence : 0.4803          
##       Balanced Accuracy : 0.8814          
##                                           
##        'Positive' Class : 0               
##

#“Chapter 5 Question 5- Logistic Regression with Validation Set”

library(ISLR)
library(dplyr)
library(caret)
set.seed(42)

data("Default")
str(Default)

## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...

summary(Default)

##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554

model1 <- glm(default ~ income + balance, data = Default, family = binomial)
summary(model1)

## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

# Step i: Randomly split 50% training, 50% validation
set.seed(100)
train_index <- sample(1:nrow(Default), nrow(Default) / 2)
train <- Default[train_index, ]
valid <- Default[-train_index, ]

# Step ii: Fit model
model2 <- glm(default ~ income + balance, data = train, family = binomial)

# Step iii: Predict on validation set
probs <- predict(model2, newdata = valid, type = "response")
preds <- ifelse(probs > 0.5, "Yes", "No")

# Step iv: Compute validation error
confusionMatrix(as.factor(preds), as.factor(valid$default))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  4822  112
##        Yes   20   46
##                                           
##                Accuracy : 0.9736          
##                  95% CI : (0.9688, 0.9779)
##     No Information Rate : 0.9684          
##     P-Value [Acc > NIR] : 0.0176          
##                                           
##                   Kappa : 0.3995          
##                                           
##  Mcnemar's Test P-Value : 2.365e-15       
##                                           
##             Sensitivity : 0.9959          
##             Specificity : 0.2911          
##          Pos Pred Value : 0.9773          
##          Neg Pred Value : 0.6970          
##              Prevalence : 0.9684          
##          Detection Rate : 0.9644          
##    Detection Prevalence : 0.9868          
##       Balanced Accuracy : 0.6435          
##                                           
##        'Positive' Class : No              
##

mean(preds != valid$default)

## [1] 0.0264

set.seed(123)
validation_errors <- c()

for (i in 1:3) {
  split_idx <- sample(1:nrow(Default), nrow(Default)/2)
  train_split <- Default[split_idx, ]
  valid_split <- Default[-split_idx, ]
  
  model_split <- glm(default ~ income + balance, data = train_split, family = binomial)
  pred_split <- predict(model_split, newdata = valid_split, type = "response")
  class_split <- ifelse(pred_split > 0.5, "Yes", "No")
  
  error_rate <- mean(class_split != valid_split$default)
  validation_errors <- c(validation_errors, error_rate)
}

validation_errors

## [1] 0.0276 0.0246 0.0262

mean(validation_errors)

## [1] 0.02613333

set.seed(456)
split_idx <- sample(1:nrow(Default), nrow(Default)/2)
train_stud <- Default[split_idx, ]
valid_stud <- Default[-split_idx, ]

model_stud <- glm(default ~ income + balance + student, data = train_stud, family = binomial)
pred_stud <- predict(model_stud, newdata = valid_stud, type = "response")
class_stud <- ifelse(pred_stud > 0.5, "Yes", "No")

confusionMatrix(as.factor(class_stud), as.factor(valid_stud$default))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  4797  115
##        Yes   33   55
##                                           
##                Accuracy : 0.9704          
##                  95% CI : (0.9653, 0.9749)
##     No Information Rate : 0.966           
##     P-Value [Acc > NIR] : 0.04443         
##                                           
##                   Kappa : 0.4127          
##                                           
##  Mcnemar's Test P-Value : 2.773e-11       
##                                           
##             Sensitivity : 0.9932          
##             Specificity : 0.3235          
##          Pos Pred Value : 0.9766          
##          Neg Pred Value : 0.6250          
##              Prevalence : 0.9660          
##          Detection Rate : 0.9594          
##    Detection Prevalence : 0.9824          
##       Balanced Accuracy : 0.6583          
##                                           
##        'Positive' Class : No              
##

mean(class_stud != valid_stud$default)

## [1] 0.0296

#Chapter 6 Question 9 - Regression Modeling on College Dataset

# Load all tidymodels packages
library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──

## ✔ broom        1.0.8     ✔ rsample      1.3.0
## ✔ dials        1.4.0     ✔ tibble       3.3.0
## ✔ infer        1.0.8     ✔ tidyr        1.3.1
## ✔ modeldata    1.4.0     ✔ tune         1.3.0
## ✔ parsnip      1.3.2     ✔ workflows    1.2.0
## ✔ purrr        1.0.4     ✔ workflowsets 1.1.1
## ✔ recipes      1.3.1     ✔ yardstick    1.3.2

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard()         masks scales::discard()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ purrr::lift()            masks caret::lift()
## ✖ rsample::permutations()  masks e1071::permutations()
## ✖ yardstick::precision()   masks caret::precision()
## ✖ yardstick::recall()      masks caret::recall()
## ✖ dplyr::select()          masks MASS::select()
## ✖ yardstick::sensitivity() masks caret::sensitivity()
## ✖ yardstick::specificity() masks caret::specificity()
## ✖ recipes::step()          masks stats::step()
## ✖ tune::tune()             masks parsnip::tune(), e1071::tune()

library(ISLR2)

## 
## Attaching package: 'ISLR2'

## The following object is masked _by_ '.GlobalEnv':
## 
##     Boston

## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit

## The following object is masked from 'package:MASS':
## 
##     Boston

# Set seed for reproducibility
set.seed(123)

# Load data
data("College")
college <- College

# Split into training and test sets (70% train)
split <- initial_split(college, prop = 0.7)
train_data <- training(split)
test_data <- testing(split)

lm_model <- linear_reg() %>% 
  set_engine("lm")

lm_recipe <- recipe(Apps ~ ., data = train_data)

lm_workflow <- workflow() %>% 
  add_model(lm_model) %>% 
  add_recipe(lm_recipe)

lm_fit <- lm_workflow %>% fit(data = train_data)

lm_preds <- predict(lm_fit, test_data) %>% 
  bind_cols(test_data)

lm_mse <- mean((lm_preds$.pred - lm_preds$Apps)^2)
cat("(b) Linear Regression Test MSE:", lm_mse, "\n")

## (b) Linear Regression Test MSE: 1734841

install.packages("glmnet")

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

## also installing the dependency 'RcppEigen'

## Warning in install.packages("glmnet"): installation of package 'RcppEigen' had
## non-zero exit status

## Warning in install.packages("glmnet"): installation of package 'glmnet' had
## non-zero exit status

# Manual Ridge using closed-form solution
library(MASS)

# Create model matrix (remove intercept)
x <- model.matrix(Apps ~ ., data = train_data)[, -1]
y <- train_data$Apps
x_test <- model.matrix(Apps ~ ., data = test_data)[, -1]
y_test <- test_data$Apps

# Try a range of lambda values
lambdas <- 10^seq(10, -2, length = 100)
mse_values <- numeric(length(lambdas))

for (i in seq_along(lambdas)) {
  lambda <- lambdas[i]
  ridge_fit <- lm.ridge(Apps ~ ., data = train_data, lambda = lambda)
  ridge_coefs <- coef(ridge_fit)
  preds <- cbind(1, x_test) %*% ridge_coefs
  mse_values[i] <- mean((y_test - preds)^2)
}

best_lambda <- lambdas[which.min(mse_values)]
best_mse <- min(mse_values)

cat("(c) Ridge Regression Test MSE (manual):", best_mse, "\n")

## (c) Ridge Regression Test MSE (manual): 1735254

cat("    Best lambda:", best_lambda, "\n")

##     Best lambda: 0.01

library(ISLR2)

set.seed(123)
index <- sample(1:nrow(College), nrow(College) * 0.7)
train_data <- College[index, ]
test_data <- College[-index, ]

# Full model
full_model <- lm(Apps ~ ., data = train_data)

# Use base R's step function explicitly
step_model <- stats::step(full_model, direction = "both", trace = FALSE)

# Predict
preds <- predict(step_model, newdata = test_data)

# MSE
mse <- mean((preds - test_data$Apps)^2)

# Count predictors
num_nonzero <- sum(coef(step_model)[-1] != 0)

cat("(d) Stepwise Regression (Lasso-like) Test MSE:", mse, "\n")

## (d) Stepwise Regression (Lasso-like) Test MSE: 1713938

cat("    Number of predictors selected:", num_nonzero, "\n")

##     Number of predictors selected: 11

library(pls)

## 
## Attaching package: 'pls'

## The following object is masked from 'package:caret':
## 
##     R2

## The following object is masked from 'package:stats':
## 
##     loadings

# Fit PCR model with cross-validation
set.seed(123)
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")

# Plot CV error vs number of components
print(validationplot(pcr_model, val.type = "MSEP"))

## NULL

# Find best number of components
msep_values <- MSEP(pcr_model)$val[1,1,]
best_M <- which.min(msep_values)

# Predict on test data
pcr_preds <- predict(pcr_model, newdata = test_data, ncomp = best_M)

# Test MSE
pcr_mse <- mean((pcr_preds - test_data$Apps)^2)

# Output
cat("(e) PCR Test MSE:", pcr_mse, "\n")

## (e) PCR Test MSE: 1734841

cat("    Optimal number of components (M):", best_M, "\n")

##     Optimal number of components (M): 17

# Load pls package (already loaded if you've done PCR)
library(pls)

# Fit PLS model with cross-validation
set.seed(123)
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")

# Plot CV error vs number of components (optional)
validationplot(pls_model, val.type = "MSEP")

# Find best number of components (min MSEP)
pls_msep_values <- MSEP(pls_model)$val[1,1,]
best_pls_M <- which.min(pls_msep_values)

# Predict on test data
pls_preds <- predict(pls_model, newdata = test_data, ncomp = best_pls_M)

# Calculate test MSE
pls_mse <- mean((pls_preds - test_data$Apps)^2)

# Output
cat("(f) PLS Test MSE:", pls_mse, "\n")

## (f) PLS Test MSE: 1742582

cat("    Optimal number of components (M):", best_pls_M, "\n")

##     Optimal number of components (M): 12

# (d) Stepwise Regression (Lasso-like) – RE-RUN THIS
full_model <- lm(Apps ~ ., data = train_data)
step_model <- stats::step(full_model, direction = "both", trace = FALSE)

step_preds <- predict(step_model, newdata = test_data)
step_mse <- mean((step_preds - test_data$Apps)^2)

# Automatically summarize model results
library(tibble)
library(dplyr)
library(glue)

model_results <- tibble(
  Model = c("Linear Regression", 
            "Ridge Regression", 
            "Stepwise Regression", 
            "PCR", 
            "PLS"),
  Test_MSE = c(
    lm_mse,     # from (b)
    best_mse,   # from Ridge
    step_mse,   # from Stepwise
    pcr_mse,    # from PCR
    pls_mse     # from PLS
  )
)

model_results <- model_results %>% arrange(Test_MSE)
best_model <- model_results$Model[1]
best_mse <- model_results$Test_MSE[1]
worst_model <- model_results$Model[5]
worst_mse <- model_results$Test_MSE[5]
range_mse <- round(worst_mse - best_mse)

summary_text <- glue("
Among the five models compared, {best_model} achieved the lowest test MSE at {round(best_mse)}. 
In contrast, {worst_model} had the highest MSE at {round(worst_mse)}, resulting in a difference of about {range_mse} in prediction error. 
This suggests that while all models perform reasonably well in predicting the number of college applications, 
the use of dimensionality reduction or variable selection can offer modest improvements. 
The close performance across models also indicates that the predictors are informative, 
and the problem does not suffer heavily from overfitting or multicollinearity. 
Overall, {best_model} stands out as the most accurate model for this dataset.
")

cat(summary_text)

## Among the five models compared, Stepwise Regression achieved the lowest test MSE at 1713938. 
## In contrast, PLS had the highest MSE at 1742582, resulting in a difference of about 28644 in prediction error. 
## This suggests that while all models perform reasonably well in predicting the number of college applications, 
## the use of dimensionality reduction or variable selection can offer modest improvements. 
## The close performance across models also indicates that the predictors are informative, 
## and the problem does not suffer heavily from overfitting or multicollinearity. 
## Overall, Stepwise Regression stands out as the most accurate model for this dataset.

Final exam_Application of Financial Software_Sarantuya Sharavsambuu_113035128

Sarantuya.Sh

2025-06-13