final exam AOF

"CHAPTER 4"

## [1] "CHAPTER 4"

library(MASS)
data("Boston")
Boston$crim_above_median <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
set.seed(123)
library(caTools)
split <- sample.split(Boston$crim_above_median, SplitRatio = 0.7)
train <- subset(Boston, split == TRUE)
test <- subset(Boston, split == FALSE)
logistic_model <- glm(crim_above_median ~ ., data = train, family = binomial)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(logistic_model)

## 
## Call:
## glm(formula = crim_above_median ~ ., family = binomial, data = train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.315e+03  1.581e+05  -0.008    0.993
## crim         6.665e+02  1.726e+04   0.039    0.969
## zn           1.249e+00  1.170e+02   0.011    0.991
## indus        8.851e-01  5.064e+02   0.002    0.999
## chas        -3.317e+00  1.364e+04   0.000    1.000
## nox          3.191e+01  7.608e+04   0.000    1.000
## rm          -1.782e+01  3.197e+03  -0.006    0.996
## age          3.317e-01  1.055e+02   0.003    0.997
## dis         -1.472e+01  2.631e+03  -0.006    0.996
## rad          1.160e+01  1.402e+03   0.008    0.993
## tax         -3.735e-01  2.555e+01  -0.015    0.988
## ptratio      1.262e+01  1.530e+03   0.008    0.993
## black        2.615e+00  3.093e+02   0.008    0.993
## lstat        1.458e-01  2.383e+02   0.001    1.000
## medv         2.255e+00  5.640e+02   0.004    0.997
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4.9075e+02  on 353  degrees of freedom
## Residual deviance: 8.6635e-06  on 339  degrees of freedom
## AIC: 30
## 
## Number of Fisher Scoring iterations: 25

# Predict on the test set
logistic_preds <- predict(logistic_model, newdata = test, type = "response")
logistic_pred_class <- ifelse(logistic_preds > 0.5, 1, 0)
library(MASS)
lda_model <- lda(crim_above_median ~ ., data = train)
lda_preds <- predict(lda_model, newdata = test)
lda_pred_class <- lda_preds$class
library(e1071)
nb_model <- naiveBayes(crim_above_median ~ ., data = train)
nb_preds <- predict(nb_model, newdata = test)
library(class)
train_X <- train[, -which(names(train) == "crim_above_median")]
test_X <- test[, -which(names(test) == "crim_above_median")]
train_y <- train$crim_above_median

# Scale the predictors
train_X_scaled <- scale(train_X)
test_X_scaled <- scale(test_X)

knn_preds <- knn(train = train_X_scaled, test = test_X_scaled, cl = train_y, k = 5)
# Confusion matrix and accuracy for Logistic Regression
logistic_cm <- table(test$crim_above_median, logistic_pred_class)
logistic_accuracy <- sum(diag(logistic_cm)) / sum(logistic_cm)
print(paste("Logistic Regression Accuracy:", logistic_accuracy))

## [1] "Logistic Regression Accuracy: 0.980263157894737"

# Confusion matrix and accuracy for LDA
lda_cm <- table(test$crim_above_median, lda_pred_class)
lda_accuracy <- sum(diag(lda_cm)) / sum(lda_cm)
print(paste("LDA Accuracy:", lda_accuracy))

## [1] "LDA Accuracy: 0.881578947368421"

# Confusion matrix and accuracy for Naive Bayes
nb_cm <- table(test$crim_above_median, nb_preds)
nb_accuracy <- sum(diag(nb_cm)) / sum(nb_cm)
print(paste("Naive Bayes Accuracy:", nb_accuracy))

## [1] "Naive Bayes Accuracy: 0.947368421052632"

# Confusion matrix and accuracy for KNN
knn_cm <- table(test$crim_above_median, knn_preds)
knn_accuracy <- sum(diag(knn_cm)) / sum(knn_cm)
print(paste("KNN Accuracy:", knn_accuracy))

## [1] "KNN Accuracy: 0.901315789473684"

"CHAPTER 5"

## [1] "CHAPTER 5"

library(ISLR)
data("Default")
set.seed(123)
logistic_model <- glm(default ~ income + balance, data = Default, family = binomial)
summary(logistic_model)

## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

set.seed(123)
library(caTools)
split <- sample.split(Default$default, SplitRatio = 0.7)
train <- subset(Default, split == TRUE)
test <- subset(Default, split == FALSE)

# Fit the model on the training set
logistic_model_train <- glm(default ~ income + balance, data = train, family = binomial)

# Predict on the validation set
test_probs <- predict(logistic_model_train, newdata = test, type = "response")
test_preds <- ifelse(test_probs > 0.5, "Yes", "No")

# Calculate the validation set error
test_actual <- test$default
conf_matrix <- table(test_actual, test_preds)
validation_error <- 1 - sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Validation Error:", validation_error))

## [1] "Validation Error: 0.0246666666666666"

# Function to perform the validation set approach
validation_set_approach <- function(seed) {
  set.seed(seed)
  split <- sample.split(Default$default, SplitRatio = 0.7)
  train <- subset(Default, split == TRUE)
  test <- subset(Default, split == FALSE)
  
  logistic_model_train <- glm(default ~ income + balance, data = train, family = binomial)
  test_probs <- predict(logistic_model_train, newdata = test, type = "response")
  test_preds <- ifelse(test_probs > 0.5, "Yes", "No")
  test_actual <- test$default
  conf_matrix <- table(test_actual, test_preds)
  validation_error <- 1 - sum(diag(conf_matrix)) / sum(conf_matrix)
  return(validation_error)
}

# Perform the process three times with different seeds
set.seed(123)
seeds <- sample(1:1000, 3)
errors <- sapply(seeds, validation_set_approach)
print(errors)

## [1] 0.02600000 0.02566667 0.02866667

mean_error <- mean(errors)
print(paste("Mean Validation Error:", mean_error))

## [1] "Mean Validation Error: 0.0267777777777778"

# Add the dummy variable for student
set.seed(123)
split <- sample.split(Default$default, SplitRatio = 0.7)
train <- subset(Default, split == TRUE)
test <- subset(Default, split == FALSE)

logistic_model_student <- glm(default ~ income + balance + student, data = train, family = binomial)
test_probs_student <- predict(logistic_model_student, newdata = test, type = "response")
test_preds_student <- ifelse(test_probs_student > 0.5, "Yes", "No")

conf_matrix_student <- table(test$default, test_preds_student)
validation_error_student <- 1 - sum(diag(conf_matrix_student)) / sum(conf_matrix_student)
print(paste("Validation Error with Student Variable:", validation_error_student))

## [1] "Validation Error with Student Variable: 0.0243333333333333"

"CHAPTER 6"

## [1] "CHAPTER 6"

library(ISLR)
data("College")
set.seed(123)
library(caTools)
split <- sample.split(College$Apps, SplitRatio = 0.7)
train <- subset(College, split == TRUE)
test <- subset(College, split == FALSE)
linear_model <- lm(Apps ~ ., data = train)
test_preds_linear <- predict(linear_model, newdata = test)
test_error_linear <- mean((test$Apps - test_preds_linear)^2)
print(paste("Test Error for Linear Model:", test_error_linear))

## [1] "Test Error for Linear Model: 1304671.1253516"

library(glmnet)

## Loading required package: Matrix

## Loaded glmnet 4.1-8

x_train <- model.matrix(Apps ~ ., train)[, -1]
y_train <- train$Apps
x_test <- model.matrix(Apps ~ ., test)[, -1]
y_test <- test$Apps

# Perform cross-validation to choose the best lambda
set.seed(123)
cv_ridge <- cv.glmnet(x_train, y_train, alpha = 0)
best_lambda_ridge <- cv_ridge$lambda.min

# Fit the ridge regression model
ridge_model <- glmnet(x_train, y_train, alpha = 0, lambda = best_lambda_ridge)
test_preds_ridge <- predict(ridge_model, s = best_lambda_ridge, newx = x_test)
test_error_ridge <- mean((y_test - test_preds_ridge)^2)
print(paste("Test Error for Ridge Regression:", test_error_ridge))

## [1] "Test Error for Ridge Regression: 1313120.09915936"

# Perform cross-validation to choose the best lambda
set.seed(123)
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)
best_lambda_lasso <- cv_lasso$lambda.min

# Fit the lasso model
lasso_model <- glmnet(x_train, y_train, alpha = 1, lambda = best_lambda_lasso)
test_preds_lasso <- predict(lasso_model, s = best_lambda_lasso, newx = x_test)
test_error_lasso <- mean((y_test - test_preds_lasso)^2)

# Number of non-zero coefficients
num_nonzero_coef <- sum(coef(lasso_model, s = best_lambda_lasso) != 0)

print(paste("Test Error for Lasso:", test_error_lasso))

## [1] "Test Error for Lasso: 1311182.68289331"

print(paste("Number of Non-zero Coefficients in Lasso:", num_nonzero_coef))

## [1] "Number of Non-zero Coefficients in Lasso: 13"

library(pls)

## 
## Attaching package: 'pls'

## The following object is masked from 'package:stats':
## 
##     loadings

set.seed(123)
pcr_model <- pcr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pcr_model, val.type = "MSEP")

# Select the number of components
optimal_M <- which.min(pcr_model$validation$PRESS)
test_preds_pcr <- predict(pcr_model, newdata = test, ncomp = optimal_M)
test_error_pcr <- mean((test$Apps - test_preds_pcr)^2)
print(paste("Test Error for PCR:", test_error_pcr))

## [1] "Test Error for PCR: 1304671.1253516"

print(paste("Optimal Number of Components for PCR:", optimal_M))

## [1] "Optimal Number of Components for PCR: 17"

set.seed(123)
pls_model <- plsr(Apps ~ ., data = train, scale = TRUE, validation = "CV")
validationplot(pls_model, val.type = "MSEP")

# Select the number of components
optimal_M_pls <- which.min(pls_model$validation$PRESS)
test_preds_pls <- predict(pls_model, newdata = test, ncomp = optimal_M_pls)
test_error_pls <- mean((test$Apps - test_preds_pls)^2)
print(paste("Test Error for PLS:", test_error_pls))

## [1] "Test Error for PLS: 1304653.05383128"

print(paste("Optimal Number of Components for PLS:", optimal_M_pls))

## [1] "Optimal Number of Components for PLS: 16"

# Summary of results
results <- data.frame(
  Method = c("Linear Model", "Ridge Regression", "Lasso", "PCR", "PLS"),
  TestError = c(test_error_linear, test_error_ridge, test_error_lasso, test_error_pcr, test_error_pls)
)
print(results)

##             Method TestError
## 1     Linear Model   1304671
## 2 Ridge Regression   1313120
## 3            Lasso   1311183
## 4              PCR   1304671
## 5              PLS   1304653

# Load necessary libraries
library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──

## ✔ broom        1.0.6      ✔ recipes      1.0.10
## ✔ dials        1.2.1      ✔ rsample      1.2.1 
## ✔ dplyr        1.1.4      ✔ tibble       3.2.1 
## ✔ ggplot2      3.5.1      ✔ tidyr        1.3.1 
## ✔ infer        1.0.7      ✔ tune         1.2.1 
## ✔ modeldata    1.3.0      ✔ workflows    1.1.4 
## ✔ parsnip      1.2.1      ✔ workflowsets 1.1.0 
## ✔ purrr        1.0.2      ✔ yardstick    1.3.1

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard()        masks scales::discard()
## ✖ tidyr::expand()         masks Matrix::expand()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ tidyr::pack()           masks Matrix::pack()
## ✖ rsample::permutations() masks e1071::permutations()
## ✖ dplyr::select()         masks MASS::select()
## ✖ recipes::step()         masks stats::step()
## ✖ tune::tune()            masks parsnip::tune(), e1071::tune()
## ✖ tidyr::unpack()         masks Matrix::unpack()
## ✖ recipes::update()       masks Matrix::update(), stats::update()
## • Use tidymodels_prefer() to resolve common conflicts.

library(ggplot2)
library(ISLR2)

## 
## Attaching package: 'ISLR2'

## The following object is masked _by_ '.GlobalEnv':
## 
##     Boston

## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit

## The following object is masked from 'package:MASS':
## 
##     Boston

library(MASS)
# Load the Auto dataset
data(auto)

## Warning in data(auto): data set 'auto' not found

# (a) Perform simple linear regression
model <- lm(mpg ~ horsepower, data = Auto)

# Summary of the model
summary(model)

## 
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5710  -3.2592  -0.3435   2.7630  16.9240 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.935861   0.717499   55.66   <2e-16 ***
## horsepower  -0.157845   0.006446  -24.49   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.6049 
## F-statistic: 599.7 on 1 and 390 DF,  p-value: < 2.2e-16

# (b) Plot the response and predictor
ggplot(Auto, aes(x = horsepower, y = mpg)) +
  geom_point() +
  geom_abline(intercept = coef(model)[1], slope = coef(model)[2], color = "red") +
  labs(x = "Horsepower", y = "MPG") +
  theme_minimal()

final exam AOF

uyanga

2024-06-13