# Load necessary libraries and data
library(MASS)
library(e1071) # for naiveBayes
library(class) # for knn
data(Boston)

# Create binary response variable (1 if crime rate above median, 0 otherwise)
Boston$high_crime <- as.factor(ifelse(Boston$crim > median(Boston$crim), 1, 0))

# Remove original crim variable
Boston$crim <- NULL

# Split data into training and test sets (70/30 split)
set.seed(123)
train_index <- sample(1:nrow(Boston), size = 0.7*nrow(Boston))
train <- Boston[train_index, ]
test <- Boston[-train_index, ]
# Full model
logit_full <- glm(high_crime ~ ., data = train, family = binomial)
summary(logit_full)
## 
## Call:
## glm(formula = high_crime ~ ., family = binomial, data = train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -28.064366   8.944009  -3.138  0.00170 ** 
## zn           -0.093211   0.042125  -2.213  0.02692 *  
## indus        -0.044137   0.058791  -0.751  0.45281    
## chas          0.247705   0.823661   0.301  0.76362    
## nox          42.153836   9.034180   4.666 3.07e-06 ***
## rm            0.033207   0.882360   0.038  0.96998    
## age           0.028651   0.014189   2.019  0.04346 *  
## dis           0.657709   0.258111   2.548  0.01083 *  
## rad           0.532624   0.183178   2.908  0.00364 ** 
## tax          -0.007883   0.003367  -2.341  0.01923 *  
## ptratio       0.448510   0.161183   2.783  0.00539 ** 
## black        -0.034710   0.014439  -2.404  0.01622 *  
## lstat         0.103425   0.061650   1.678  0.09342 .  
## medv          0.220912   0.090027   2.454  0.01413 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 490.74  on 353  degrees of freedom
## Residual deviance: 145.64  on 340  degrees of freedom
## AIC: 173.64
## 
## Number of Fisher Scoring iterations: 9
# Reduced model (selecting significant predictors)
logit_reduced <- glm(high_crime ~ zn + nox + dis + rad + tax + ptratio + lstat + medv, 
                     data = train, family = binomial)
summary(logit_reduced)
## 
## Call:
## glm(formula = high_crime ~ zn + nox + dis + rad + tax + ptratio + 
##     lstat + medv, family = binomial, data = train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -36.050825   6.729471  -5.357 8.45e-08 ***
## zn           -0.094244   0.040971  -2.300 0.021433 *  
## nox          42.082114   7.565215   5.563 2.66e-08 ***
## dis           0.513892   0.226847   2.265 0.023490 *  
## rad           0.555490   0.167010   3.326 0.000881 ***
## tax          -0.007009   0.002799  -2.504 0.012276 *  
## ptratio       0.286214   0.125586   2.279 0.022666 *  
## lstat         0.148771   0.050608   2.940 0.003285 ** 
## medv          0.183874   0.055169   3.333 0.000859 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 490.74  on 353  degrees of freedom
## Residual deviance: 161.25  on 345  degrees of freedom
## AIC: 179.25
## 
## Number of Fisher Scoring iterations: 9
# Predictions and accuracy
pred_prob <- predict(logit_reduced, newdata = test, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
mean(pred_class == test$high_crime) # Accuracy
## [1] 0.9144737
lda_model <- lda(high_crime ~ zn + nox + dis + rad + tax + ptratio + lstat + medv, data = train)
lda_pred <- predict(lda_model, newdata = test)
mean(lda_pred$class == test$high_crime)
## [1] 0.8618421
nb_model <- naiveBayes(high_crime ~ zn + nox + dis + rad + tax + ptratio + lstat + medv, data = train)
nb_pred <- predict(nb_model, newdata = test)
mean(nb_pred == test$high_crime)
## [1] 0.7960526
# Prepare data (standardize continuous variables)
train_scale <- scale(train[, -which(names(train) == "high_crime")])
test_scale <- scale(test[, -which(names(test) == "high_crime")])

# Find optimal k
k_values <- 1:20
accuracies <- sapply(k_values, function(k) {
  knn_pred <- knn(train_scale, test_scale, train$high_crime, k = k)
  mean(knn_pred == test$high_crime)
})
best_k <- k_values[which.max(accuracies)]

# Final KNN model
knn_pred <- knn(train_scale, test_scale, train$high_crime, k = best_k)
mean(knn_pred == test$high_crime)
## [1] 0.9144737
#Chapter 5 Question 5: Default Prediction with Validation Sets
library(ISLR2)
## 
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
## 
##     Boston
## The following object is masked from 'package:MASS':
## 
##     Boston
data(Default)

# a) Logistic regression model
set.seed(123)
logit_default <- glm(default ~ income + balance, data = Default, family = binomial)

# b) Validation set approach
val_set_error <- function() {
  train_index <- sample(1:nrow(Default), 0.7*nrow(Default))
  train <- Default[train_index, ]
  valid <- Default[-train_index, ]
  
  model <- glm(default ~ income + balance, data = train, family = binomial)
  pred_prob <- predict(model, newdata = valid, type = "response")
  pred_class <- ifelse(pred_prob > 0.5, "Yes", "No")
  
  mean(pred_class != valid$default)
}

# c) Repeat 3 times
set.seed(123)
errors <- replicate(3, val_set_error())
mean(errors) # Average error around 2.6%
## [1] 0.02666667
# d) Include student dummy
val_set_error_student <- function() {
  train_index <- sample(1:nrow(Default), 0.7*nrow(Default))
  train <- Default[train_index, ]
  valid <- Default[-train_index, ]
  
  model <- glm(default ~ income + balance + student, data = train, family = binomial)
  pred_prob <- predict(model, newdata = valid, type = "response")
  pred_class <- ifelse(pred_prob > 0.5, "Yes", "No")
  
  mean(pred_class != valid$default)
}

set.seed(123)
errors_student <- replicate(3, val_set_error_student())
mean(errors_student) # Slightly higher error (~2.7%)
## [1] 0.02711111
data(College)
#Chapter 6 Question 9: College Applications Prediction
# a) Split data
set.seed(123)
train_index <- sample(1:nrow(College), 0.7*nrow(College))
train <- College[train_index, ]
test <- College[-train_index, ]

# b) Linear regression
lm_fit <- lm(Apps ~ ., data = train)
lm_pred <- predict(lm_fit, newdata = test)
lm_error <- mean((lm_pred - test$Apps)^2) # MSE

# c) Ridge regression
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-9
x <- model.matrix(Apps ~ ., data = train)[,-1]
y <- train$Apps
cv_ridge <- cv.glmnet(x, y, alpha = 0)
ridge_pred <- predict(cv_ridge, newx = model.matrix(Apps ~ ., data = test)[,-1])
ridge_error <- mean((ridge_pred - test$Apps)^2)

# d) Lasso
cv_lasso <- cv.glmnet(x, y, alpha = 1)
lasso_pred <- predict(cv_lasso, newx = model.matrix(Apps ~ ., data = test)[,-1])
lasso_error <- mean((lasso_pred - test$Apps)^2)
coef(cv_lasso) # Non-zero coefficients
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                        s0
## (Intercept) -579.86250244
## PrivateYes  -175.69186792
## Accept         1.20869497
## Enroll         0.06116026
## Top10perc     25.19187735
## Top25perc      .         
## F.Undergrad    0.03318350
## P.Undergrad    .         
## Outstate       .         
## Room.Board     .         
## Books          .         
## Personal       .         
## PhD            .         
## Terminal       .         
## S.F.Ratio      .         
## perc.alumni    .         
## Expend         0.04311230
## Grad.Rate      .
# e) PCR
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
pcr_fit <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pcr_fit, val.type = "MSEP")

pcr_pred <- predict(pcr_fit, newdata = test, ncomp = 10) # Choose optimal components
pcr_error <- mean((pcr_pred - test$Apps)^2)

# f) PLS
pls_fit <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
pls_pred <- predict(pls_fit, newdata = test, ncomp = 5) # Choose optimal components
pls_error <- mean((pls_pred - test$Apps)^2)

# g) Compare results
errors <- c(LM = lm_error, Ridge = ridge_error, Lasso = lasso_error, 
            PCR = pcr_error, PLS = pls_error)
round(errors)
##      LM   Ridge   Lasso     PCR     PLS 
## 1734841 3522433 1874269 3962512 2469701
# Simulate credit data
set.seed(123)
n <- 1000
# Simulate credit data
set.seed(123)
n <- 1000

# First create the predictor variables
income <- rnorm(n, 70000, 15000)
balance <- rnorm(n, 5000, 1500)
late_payments <- rpois(n, 0.5)

# Then calculate default probabilities using the predictors
default_probs <- plogis(-5 + 0.00002*income + 0.001*balance + 0.5*late_payments)

# Simulate credit data
set.seed(123)
n <- 1000

# Create predictor variables first
income <- rnorm(n, 70000, 15000)
balance <- rnorm(n, 5000, 1500)
late_payments <- rpois(n, 0.5)

# Calculate default probabilities
default_probs <- plogis(-5 + 0.00002*income + 0.001*balance + 0.5*late_payments)

# Simulate loan data
set.seed(100)
n <- 500
loan_data <- data.frame(
  income = rnorm(n, 50000, 15000),
  loan_amount = rnorm(n, 15000, 5000),
  credit_score = rnorm(n, 650, 50),
  default = rbinom(n, 1, 0.2)
)

# Logistic model
glm_fit <- glm(default ~ income + loan_amount + credit_score, data=loan_data, family=binomial)
summary(glm_fit)
## 
## Call:
## glm(formula = default ~ income + loan_amount + credit_score, 
##     family = binomial, data = loan_data)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -2.437e+00  1.569e+00  -1.553    0.120
## income        1.395e-07  7.595e-06   0.018    0.985
## loan_amount  -4.075e-06  2.166e-05  -0.188    0.851
## credit_score  1.602e-03  2.302e-03   0.696    0.487
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 486.22  on 499  degrees of freedom
## Residual deviance: 485.71  on 496  degrees of freedom
## AIC: 493.71
## 
## Number of Fisher Scoring iterations: 4