Untitled

#CHAPTER4
library(MASS)
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

# Load the Boston data set
data("Boston")

# Create the response variable
median_crime <- median(Boston$crim)
Boston$HighCrime <- ifelse(Boston$crim > median_crime, 1, 0)
Boston$HighCrime <- factor(Boston$HighCrime)

# Summary of the new variable
summary(Boston$HighCrime)

##   0   1 
## 253 253

# Fit a logistic regression model
logistic_model <- glm(HighCrime ~ . - crim, data = Boston, family = binomial)

# Summary of the logistic regression model
summary(logistic_model)

## 
## Call:
## glm(formula = HighCrime ~ . - crim, family = binomial, data = Boston)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -34.103704   6.530014  -5.223 1.76e-07 ***
## zn           -0.079918   0.033731  -2.369  0.01782 *  
## indus        -0.059389   0.043722  -1.358  0.17436    
## chas          0.785327   0.728930   1.077  0.28132    
## nox          48.523782   7.396497   6.560 5.37e-11 ***
## rm           -0.425596   0.701104  -0.607  0.54383    
## age           0.022172   0.012221   1.814  0.06963 .  
## dis           0.691400   0.218308   3.167  0.00154 ** 
## rad           0.656465   0.152452   4.306 1.66e-05 ***
## tax          -0.006412   0.002689  -2.385  0.01709 *  
## ptratio       0.368716   0.122136   3.019  0.00254 ** 
## black        -0.013524   0.006536  -2.069  0.03853 *  
## lstat         0.043862   0.048981   0.895  0.37052    
## medv          0.167130   0.066940   2.497  0.01254 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 701.46  on 505  degrees of freedom
## Residual deviance: 211.93  on 492  degrees of freedom
## AIC: 239.93
## 
## Number of Fisher Scoring iterations: 9

# Fit an LDA model
lda_model <- lda(HighCrime ~ . - crim, data = Boston)

# Summary of the LDA model
lda_model

## Call:
## lda(HighCrime ~ . - crim, data = Boston)
## 
## Prior probabilities of groups:
##   0   1 
## 0.5 0.5 
## 
## Group means:
##          zn     indus       chas       nox       rm      age      dis       rad
## 0 21.525692  7.002292 0.05138340 0.4709711 6.394395 51.31028 5.091596  4.158103
## 1  1.201581 15.271265 0.08695652 0.6384190 6.174874 85.83953 2.498489 14.940711
##        tax  ptratio    black     lstat     medv
## 0 305.7431 17.90711 388.7061  9.419486 24.94941
## 1 510.7312 19.00395 324.6420 15.886640 20.11621
## 
## Coefficients of linear discriminants:
##                   LD1
## zn      -0.0054345920
## indus    0.0123186828
## chas    -0.0627520897
## nox      8.1340353679
## rm       0.0893872928
## age      0.0112885889
## dis      0.0407823970
## rad      0.0726344900
## tax     -0.0008619171
## ptratio  0.0501188217
## black   -0.0010241413
## lstat    0.0149784417
## medv     0.0377731894

# Load the naivebayes library
library(naivebayes)

## naivebayes 1.0.0 loaded

## For more information please visit:

## https://majkamichal.github.io/naivebayes/

# Fit a naive Bayes model
naive_bayes_model <- naive_bayes(HighCrime ~ . - crim, data = Boston)

# Summary of the naive Bayes model
naive_bayes_model

## 
## ================================= Naive Bayes ==================================
## 
## Call:
## naive_bayes.formula(formula = HighCrime ~ . - crim, data = Boston)
## 
## -------------------------------------------------------------------------------- 
##  
## Laplace smoothing: 0
## 
## -------------------------------------------------------------------------------- 
##  
## A priori probabilities: 
## 
##   0   1 
## 0.5 0.5 
## 
## -------------------------------------------------------------------------------- 
##  
## Tables: 
## 
## -------------------------------------------------------------------------------- 
## :: crim (Gaussian) 
## -------------------------------------------------------------------------------- 
##       
## crim             0           1
##   mean  0.09557150  7.13147561
##   sd    0.06281773 11.10912294
## 
## -------------------------------------------------------------------------------- 
## :: zn (Gaussian) 
## -------------------------------------------------------------------------------- 
##       
## zn             0         1
##   mean 21.525692  1.201581
##   sd   29.319808  4.798611
## 
## -------------------------------------------------------------------------------- 
## :: indus (Gaussian) 
## -------------------------------------------------------------------------------- 
##       
## indus          0         1
##   mean  7.002292 15.271265
##   sd    5.514454  5.439010
## 
## -------------------------------------------------------------------------------- 
## :: chas (Gaussian) 
## -------------------------------------------------------------------------------- 
##       
## chas            0          1
##   mean 0.05138340 0.08695652
##   sd   0.22121612 0.28232985
## 
## -------------------------------------------------------------------------------- 
## :: nox (Gaussian) 
## -------------------------------------------------------------------------------- 
##       
## nox             0          1
##   mean 0.47097115 0.63841897
##   sd   0.05559789 0.09870365
## 
## --------------------------------------------------------------------------------
## 
## # ... and 9 more tables
## 
## --------------------------------------------------------------------------------

# Preparing data for KNN
library(class)
set.seed(123)
train_index <- createDataPartition(Boston$HighCrime, p = 0.7, list = FALSE)
train_data <- Boston[train_index, ]
test_data <- Boston[-train_index, ]

train_X <- as.matrix(train_data[, -c(1, 15)])
test_X <- as.matrix(test_data[, -c(1, 15)])
train_y <- train_data$HighCrime
test_y <- test_data$HighCrime

# Fit a KNN model
knn_pred <- knn(train = train_X, test = test_X, cl = train_y, k = 5)

# Summary of the KNN model
confusionMatrix(knn_pred, test_y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 69  4
##          1  6 71
##                                           
##                Accuracy : 0.9333          
##                  95% CI : (0.8808, 0.9676)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8667          
##                                           
##  Mcnemar's Test P-Value : 0.7518          
##                                           
##             Sensitivity : 0.9200          
##             Specificity : 0.9467          
##          Pos Pred Value : 0.9452          
##          Neg Pred Value : 0.9221          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4600          
##    Detection Prevalence : 0.4867          
##       Balanced Accuracy : 0.9333          
##                                           
##        'Positive' Class : 0               
##

#CHAPTER5 
# Load the ISLR library
library(ISLR)

# Load the Default data set
data("Default")

# Fit the logistic regression model
logistic_model <- glm(default ~ income + balance, data = Default, family = binomial)

# Summary of the logistic regression model
summary(logistic_model)

## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

# Set random seed for reproducibility
set.seed(42)

# Split the data into training and validation sets
train_index <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data <- Default[train_index, ]
validation_data <- Default[-train_index, ]

# Fit the logistic regression model on the training set
logistic_model <- glm(default ~ income + balance, data = train_data, family = binomial)

# Make predictions on the validation set
pred_probs <- predict(logistic_model, validation_data, type = "response")
pred_default <- ifelse(pred_probs > 0.5, "Yes", "No")

# Compute the validation set error
conf_matrix <- table(pred_default, validation_data$default)
conf_matrix

##             
## pred_default   No  Yes
##          No  2887   69
##          Yes   13   30

validation_error <- mean(pred_default != validation_data$default)
validation_error

## [1] 0.02734245

# Function to compute validation error
compute_validation_error <- function(seed) {
  set.seed(seed)
  train_index <- createDataPartition(Default$default, p = 0.7, list = FALSE)
  train_data <- Default[train_index, ]
  validation_data <- Default[-train_index, ]
  logistic_model <- glm(default ~ income + balance, data = train_data, family = binomial)
  pred_probs <- predict(logistic_model, validation_data, type = "response")
  pred_default <- ifelse(pred_probs > 0.5, "Yes", "No")
  validation_error <- mean(pred_default != validation_data$default)
  return(validation_error)
}

# Compute validation errors for three different splits
validation_errors <- sapply(1:3, compute_validation_error)
validation_errors

## [1] 0.02600867 0.02767589 0.02534178

# Fit logistic regression model with student variable
logistic_model_student <- glm(default ~ income + balance + student, data = train_data, family = binomial)

# Make predictions on the validation set
pred_probs_student <- predict(logistic_model_student, validation_data, type = "response")
pred_default_student <- ifelse(pred_probs_student > 0.5, "Yes", "No")
logreg_model_student <- glm(default ~ income + balance + student, data = Default, family = binomial)
summary(logreg_model_student)

## 
## Call:
## glm(formula = default ~ income + balance + student, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.087e+01  4.923e-01 -22.080  < 2e-16 ***
## income       3.033e-06  8.203e-06   0.370  0.71152    
## balance      5.737e-03  2.319e-04  24.738  < 2e-16 ***
## studentYes  -6.468e-01  2.363e-01  -2.738  0.00619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.5  on 9996  degrees of freedom
## AIC: 1579.5
## 
## Number of Fisher Scoring iterations: 8

validation_error_student <- mean(pred_default_student != validation_data$default)
validation_error_student

## [1] 0.02834278

#CHAPTER6
 # Load necessary libraries
library(ISLR)
library(caret)
library(glmnet)

## Loading required package: Matrix

## Loaded glmnet 4.1-8

library(pls)

## 
## Attaching package: 'pls'

## The following object is masked from 'package:caret':
## 
##     R2

## The following object is masked from 'package:stats':
## 
##     loadings

# Load the College data set
data("College")

# Set random seed for reproducibility
set.seed(123)

# Split the data into training and test sets
train_index <- createDataPartition(College$Apps, p = 0.7, list = FALSE)
train_data <- College[train_index, ]
test_data <- College[-train_index, ]

# Linear Regression Model
linear_model <- lm(Apps ~ ., data = train_data)
pred_linear <- predict(linear_model, test_data)
test_error_linear <- mean((pred_linear - test_data$Apps)^2)

# Ridge Regression Model
train_X <- model.matrix(Apps ~ ., train_data)[, -1]
test_X <- model.matrix(Apps ~ ., test_data)[, -1]
train_y <- train_data$Apps

cv_ridge <- cv.glmnet(train_X, train_y, alpha = 0)
ridge_model <- glmnet(train_X, train_y, alpha = 0, lambda = cv_ridge$lambda.min)
pred_ridge <- predict(ridge_model, s = cv_ridge$lambda.min, newx = test_X)
test_error_ridge <- mean((pred_ridge - test_data$Apps)^2)

# Lasso Model
cv_lasso <- cv.glmnet(train_X, train_y, alpha = 1)
lasso_model <- glmnet(train_X, train_y, alpha = 1, lambda = cv_lasso$lambda.min)
pred_lasso <- predict(lasso_model, s = cv_lasso$lambda.min, newx = test_X)
test_error_lasso <- mean((pred_lasso - test_data$Apps)^2)
num_nonzero_coef <- sum(coef(lasso_model) != 0)

# Principal Components Regression (PCR) Model
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")

# Determine the optimal number of components
validationplot(pcr_model, val.type = "MSEP")

optimal_M <- which.min(RMSEP(pcr_model)$val[1, , ])

# Ensure optimal_M is within valid bounds
optimal_M <- max(1, min(optimal_M, ncol(train_X)))

# Predict on the test set
pred_pcr <- predict(pcr_model, newdata = test_data, ncomp = optimal_M)
test_error_pcr <- mean((pred_pcr - test_data$Apps)^2)

# Partial Least Squares (PLS) Model
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")

# Determine the optimal number of components
validationplot(pls_model, val.type = "MSEP")

optimal_M_pls <- which.min(RMSEP(pls_model)$val[1, , ])

# Ensure optimal_M_pls is within valid bounds
optimal_M_pls <- max(1, min(optimal_M_pls, ncol(train_X)))

# Predict on the test set
pred_pls <- predict(pls_model, newdata = test_data, ncomp = optimal_M_pls)
test_error_pls <- mean((pred_pls - test_data$Apps)^2)

# Summarize the results
test_errors <- data.frame(
  Model = c("Linear Regression", "Ridge Regression", "Lasso", "PCR", "PLS"),
  TestError = c(test_error_linear, test_error_ridge, test_error_lasso, test_error_pcr, test_error_pls)
)

print(test_errors)

##               Model TestError
## 1 Linear Regression   1882074
## 2  Ridge Regression   3270111
## 3             Lasso   1946216
## 4               PCR   1882074
## 5               PLS   1885003

# Comment on the results
cat("The test errors for the different models show how accurately each model can predict the number of college applications received. Generally, lower test errors indicate better model performance. By examining these test errors, we can conclude that the model with the lowest test error provides the most accurate predictions")

## The test errors for the different models show how accurately each model can predict the number of college applications received. Generally, lower test errors indicate better model performance. By examining these test errors, we can conclude that the model with the lowest test error provides the most accurate predictions

# Specific comments on models
if (min(test_errors$TestError) == test_error_linear) {
  cat("The Linear Regression model has the lowest test error, indicating it is the best model for this data set.")
} else if (min(test_errors$TestError) == test_error_ridge) {
  cat("The Ridge Regression model has the lowest test error, indicating it is the best model for this data set.")
} else if (min(test_errors$TestError) == test_error_lasso) {
  cat("The Lasso model has the lowest test error, indicating it is the best model for this data set. Additionally, the Lasso model's number of non-zero coefficients is ", num_nonzero_coef, ", which suggests the model selected a subset of the predictors for a more parsimonious model.")
} else if (min(test_errors$TestError) == test_error_pcr) {
  cat("The PCR model has the lowest test error, indicating it is the best model for this data set. The optimal number of components selected was ", optimal_M,"" )
} else if (min(test_errors$TestError) == test_error_pls) {
  cat("The PLS model has the lowest test error, indicating it is the best model for this data set. The optimal number of components selected was ", optimal_M_pls, "")
}

## The PCR model has the lowest test error, indicating it is the best model for this data set. The optimal number of components selected was  17

Untitled

2024-06-13