# Q1 (Chapter 4, Question 16)
library(MASS)
library(dplyr)
data("Boston")
median_crime_rate <- median(Boston$crim)
Boston$crim_binary <- ifelse(Boston$crim > median_crime_rate, 1, 0)
Boston <- Boston %>% select(-crim)
head(Boston)
##   zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat medv
## 1 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98 24.0
## 2  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14 21.6
## 3  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03 34.7
## 4  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94 33.4
## 5  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33 36.2
## 6  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21 28.7
##   crim_binary
## 1           0
## 2           0
## 3           0
## 4           0
## 5           0
## 6           0
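# Quick sanity check: splitting at the median should give a roughly 50/50
# class balance between the two response levels.
table(Boston$crim_binary)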
set.seed(42)
train_indices <- sample(seq_len(nrow(Boston)), size = 0.7 * nrow(Boston))
train_data <- Boston[train_indices, ]
test_data <- Boston[-train_indices, ]
X_train <- train_data %>% select(-crim_binary)
y_train <- train_data$crim_binary
X_test <- test_data %>% select(-crim_binary)
y_test <- test_data$crim_binary
logreg_model <- glm(crim_binary ~ ., data = train_data, family = binomial)
logreg_preds <- predict(logreg_model, newdata = test_data, type = "response")
logreg_preds_binary <- ifelse(logreg_preds > 0.5, 1, 0)
library(caret)
logreg_results <- confusionMatrix(as.factor(logreg_preds_binary), as.factor(y_test))
print("Logistic Regression:")
## [1] "Logistic Regression:"
print(logreg_results)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 76  6
##          1  2 68
##                                          
##                Accuracy : 0.9474         
##                  95% CI : (0.8989, 0.977)
##     No Information Rate : 0.5132         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.8945         
##                                          
##  Mcnemar's Test P-Value : 0.2888         
##                                          
##             Sensitivity : 0.9744         
##             Specificity : 0.9189         
##          Pos Pred Value : 0.9268         
##          Neg Pred Value : 0.9714         
##              Prevalence : 0.5132         
##          Detection Rate : 0.5000         
##    Detection Prevalence : 0.5395         
##       Balanced Accuracy : 0.9466         
##                                          
##        'Positive' Class : 0              
## 
lda_model <- lda(crim_binary ~ ., data = train_data)
lda_preds <- predict(lda_model, newdata = test_data)$class
lda_results <- confusionMatrix(lda_preds, as.factor(y_test))
print("Linear Discriminant Analysis:")
## [1] "Linear Discriminant Analysis:"
print(lda_results)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 76 19
##          1  2 55
##                                           
##                Accuracy : 0.8618          
##                  95% CI : (0.7966, 0.9124)
##     No Information Rate : 0.5132          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7219          
##                                           
##  Mcnemar's Test P-Value : 0.0004803       
##                                           
##             Sensitivity : 0.9744          
##             Specificity : 0.7432          
##          Pos Pred Value : 0.8000          
##          Neg Pred Value : 0.9649          
##              Prevalence : 0.5132          
##          Detection Rate : 0.5000          
##    Detection Prevalence : 0.6250          
##       Balanced Accuracy : 0.8588          
##                                           
##        'Positive' Class : 0               
## 
library(e1071)
nb_model <- naiveBayes(crim_binary ~ ., data = train_data)
nb_preds <- predict(nb_model, newdata = test_data)
nb_results <- confusionMatrix(nb_preds, as.factor(y_test))
print("Naive Bayes:")
## [1] "Naive Bayes:"
print(nb_results)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 74 16
##          1  4 58
##                                           
##                Accuracy : 0.8684          
##                  95% CI : (0.8041, 0.9177)
##     No Information Rate : 0.5132          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.7356          
##                                           
##  Mcnemar's Test P-Value : 0.01391         
##                                           
##             Sensitivity : 0.9487          
##             Specificity : 0.7838          
##          Pos Pred Value : 0.8222          
##          Neg Pred Value : 0.9355          
##              Prevalence : 0.5132          
##          Detection Rate : 0.4868          
##    Detection Prevalence : 0.5921          
##       Balanced Accuracy : 0.8663          
##                                           
##        'Positive' Class : 0               
## 
library(class)
train_data_scaled <- scale(train_data %>% select(-crim_binary))
test_data_scaled <- scale(test_data %>% select(-crim_binary),
                          center = attr(train_data_scaled, "scaled:center"),
                          scale = attr(train_data_scaled, "scaled:scale"))
k <- 5  # baseline choice; a sweep over several k values is sketched after the results below
knn_preds <- knn(train = train_data_scaled, test = test_data_scaled, cl = y_train, k = k)
knn_results <- confusionMatrix(knn_preds, as.factor(y_test))
print(paste("K-Nearest Neighbors with k =", k, ":"))
## [1] "K-Nearest Neighbors with k = 5 :"
print(knn_results)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 77  8
##          1  1 66
##                                           
##                Accuracy : 0.9408          
##                  95% CI : (0.8906, 0.9726)
##     No Information Rate : 0.5132          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8812          
##                                           
##  Mcnemar's Test P-Value : 0.0455          
##                                           
##             Sensitivity : 0.9872          
##             Specificity : 0.8919          
##          Pos Pred Value : 0.9059          
##          Neg Pred Value : 0.9851          
##              Prevalence : 0.5132          
##          Detection Rate : 0.5066          
##    Detection Prevalence : 0.5592          
##       Balanced Accuracy : 0.9395          
##                                           
##        'Positive' Class : 0               
## 
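# A minimal sketch of the k sweep suggested above: compare test accuracy for
# several candidate values of k, reusing the scaled matrices already built.
for (k_try in c(1, 3, 5, 7, 9, 15)) {
  preds_k <- knn(train = train_data_scaled, test = test_data_scaled,
                 cl = y_train, k = k_try)
  cat("k =", k_try, "-> test accuracy =", round(mean(preds_k == y_test), 4), "\n")
}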
# Q2 (Chapter 5, Question 5)
library(ISLR)
library(caret)
set.seed(42)
logreg_model <- glm(default ~ income + balance, data = Default, family = binomial)
summary(logreg_model)
## 
## Call:
## glm(formula = default ~ income + balance, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8
train_indices <- createDataPartition(Default$default, p = 0.7, list = FALSE)
train_data <- Default[train_indices, ]
validation_data <- Default[-train_indices, ]
logreg_model_train <- glm(default ~ income + balance, data = train_data, family = binomial)
logreg_probs <- predict(logreg_model_train, newdata = validation_data, type = "response")
# Classify an individual as "Yes" (default) if the posterior probability exceeds 0.5
logreg_preds <- ifelse(logreg_probs > 0.5, "Yes", "No")
conf_matrix <- table(logreg_preds, validation_data$default)
conf_matrix
##             
## logreg_preds   No  Yes
##          No  2887   69
##          Yes   13   30
validation_error <- mean(logreg_preds != validation_data$default)
validation_error
## [1] 0.02734245
validation_error_logreg <- function(seed) {
  set.seed(seed)
  train_indices <- createDataPartition(Default$default, p = 0.7, list = FALSE)
  train_data <- Default[train_indices, ]
  validation_data <- Default[-train_indices, ]
  logreg_model_train <- glm(default ~ income + balance, data = train_data, family = binomial)
  logreg_probs <- predict(logreg_model_train, newdata = validation_data, type = "response")
  logreg_preds <- ifelse(logreg_probs > 0.5, "Yes", "No")
  validation_error <- mean(logreg_preds != validation_data$default)
  return(validation_error)
}
seeds <- c(1, 2, 3)
errors <- sapply(seeds, validation_error_logreg)
errors
## [1] 0.02600867 0.02767589 0.02534178
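# The three seeds above give only a rough picture of the variability; a
# minimal sketch that averages the validation error over many random splits:
errors_many <- sapply(1:50, validation_error_logreg)
c(mean = mean(errors_many), sd = sd(errors_many))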
logreg_model_student <- glm(default ~ income + balance + student, data = Default, family = binomial)
summary(logreg_model_student)
## 
## Call:
## glm(formula = default ~ income + balance + student, family = binomial, 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.087e+01  4.923e-01 -22.080  < 2e-16 ***
## income       3.033e-06  8.203e-06   0.370  0.71152    
## balance      5.737e-03  2.319e-04  24.738  < 2e-16 ***
## studentYes  -6.468e-01  2.363e-01  -2.738  0.00619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.5  on 9996  degrees of freedom
## AIC: 1579.5
## 
## Number of Fisher Scoring iterations: 8
validation_error_logreg_student <- function(seed) {
  set.seed(seed)
  train_indices <- createDataPartition(Default$default, p = 0.7, list = FALSE)
  train_data <- Default[train_indices, ]
  validation_data <- Default[-train_indices, ]
  
  logreg_model_train <- glm(default ~ income + balance + student, data = train_data, family = binomial)
  logreg_probs <- predict(logreg_model_train, newdata = validation_data, type = "response")
  logreg_preds <- ifelse(logreg_probs > 0.5, "Yes", "No")
  
  validation_error <- mean(logreg_preds != validation_data$default)
  return(validation_error)
}
errors_student <- sapply(seeds, validation_error_logreg_student)
errors
## [1] 0.02600867 0.02767589 0.02534178
errors_student
## [1] 0.02667556 0.02767589 0.02534178
mean_error <- mean(errors)
mean_error_student <- mean(errors_student)
mean_error
## [1] 0.02634211
mean_error_student
## [1] 0.02656441
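# Comparing the averaged errors (about 0.0263 without vs. 0.0266 with the
# student dummy), adding the student variable does not appear to reduce the
# validation-set error on these splits.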
# Q3 (Chapter 6, Question 9)
library(ISLR)
library(glmnet)
library(pls)
library(caret)
data("College")
set.seed(42)
train_indices <- createDataPartition(College$Apps, p = 0.7, list = FALSE)
train_data <- College[train_indices, ]
test_data <- College[-train_indices, ]
# Fit a linear model using least squares
lm_model <- lm(Apps ~ ., data = train_data)
lm_preds <- predict(lm_model, newdata = test_data)
lm_mse <- mean((lm_preds - test_data$Apps)^2)
lm_mse
## [1] 1390507
x_train <- model.matrix(Apps ~ ., train_data)[, -1]
y_train <- train_data$Apps
x_test <- model.matrix(Apps ~ ., test_data)[, -1]
y_test <- test_data$Apps

# Fit a ridge regression model with cross-validation
ridge_cv <- cv.glmnet(x_train, y_train, alpha = 0)
best_lambda_ridge <- ridge_cv$lambda.min
ridge_model <- glmnet(x_train, y_train, alpha = 0, lambda = best_lambda_ridge)
ridge_preds <- predict(ridge_model, s = best_lambda_ridge, newx = x_test)
ridge_mse <- mean((ridge_preds - y_test)^2)
ridge_mse
## [1] 1315282
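# Optional: visualize the cross-validation curve used to pick lambda
# (log(lambda) on the x-axis, cross-validated MSE on the y-axis).
plot(ridge_cv)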
# Fit a lasso model with cross-validation
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1)
best_lambda_lasso <- lasso_cv$lambda.min
lasso_model <- glmnet(x_train, y_train, alpha = 1, lambda = best_lambda_lasso)
lasso_preds <- predict(lasso_model, s = best_lambda_lasso, newx = x_test)
lasso_mse <- mean((lasso_preds - y_test)^2)
lasso_mse
## [1] 1389547
lasso_coef <- predict(lasso_model, type = "coefficients", s = best_lambda_lasso)
nonzero_coef_count <- sum(lasso_coef != 0)
nonzero_coef_count
## [1] 18
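# Note: the count above includes the intercept. College has 17 predictors, so
# 18 nonzero coefficients means the lasso drops nothing at this lambda. A
# minimal sketch to list the retained coefficients by name:
lasso_coef_mat <- as.matrix(lasso_coef)
rownames(lasso_coef_mat)[lasso_coef_mat[, 1] != 0]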
# Fit a PCR model with cross-validation
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
validationplot(pcr_model, val.type = "MSEP")

optimal_M <- which.min(pcr_model$validation$PRESS)
pcr_preds <- predict(pcr_model, newdata = test_data, ncomp = optimal_M)
pcr_mse <- mean((pcr_preds - y_test)^2)
pcr_mse
## [1] 1390507
optimal_M
## [1] 17
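# With M = 17, PCR uses all principal components, which reproduces the
# least-squares fit exactly; hence the test error is identical to lm_mse.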
# Fit a PLS model with cross-validation
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
validationplot(pls_model, val.type = "MSEP")

optimal_M_pls <- which.min(pls_model$validation$PRESS)
pls_preds <- predict(pls_model, newdata = test_data, ncomp = optimal_M_pls)
pls_mse <- mean((pls_preds - y_test)^2)
pls_mse
## [1] 1386457
optimal_M_pls
## [1] 13
# Compare the test errors from all models
results <- data.frame(
  Model = c("Linear Regression", "Ridge Regression", "Lasso", "PCR", "PLS"),
  Test_Error = c(lm_mse, ridge_mse, lasso_mse, pcr_mse, pls_mse)
)
print(results)
##               Model Test_Error
## 1 Linear Regression    1390507
## 2  Ridge Regression    1315282
## 3             Lasso    1389547
## 4               PCR    1390507
## 5               PLS    1386457
nonzero_coef_count
## [1] 18
# Print the summary
cat("Linear Regression Test Error:", lm_mse, "\n")
## Linear Regression Test Error: 1390507
cat("Ridge Regression Test Error:", ridge_mse, "with lambda =", best_lambda_ridge, "\n")
## Ridge Regression Test Error: 1315282 with lambda = 372.4627
cat("Lasso Regression Test Error:", lasso_mse, "with lambda =", best_lambda_lasso, "\n")
## Lasso Regression Test Error: 1389547 with lambda = 1.987721
cat("Number of non-zero coefficients in Lasso:", nonzero_coef_count, "\n")
## Number of non-zero coefficients in Lasso: 18
cat("PCR Test Error:", pcr_mse, "with", optimal_M, "components\n")
## PCR Test Error: 1390507 with 17 components
cat("PLS Test Error:", pls_mse, "with", optimal_M_pls, "components\n")
## PLS Test Error: 1386457 with 13 components
cat("\nComment on the results:\n")
## 
## Comment on the results:
cat("The test errors for the different models are quite similar, suggesting that all methods provide comparable predictions of the number of applications received. The Lasso model has the advantage of performing feature selection, reducing the number of predictors. Ridge regression, PCR, and PLS also perform well, with PCR and PLS having the flexibility to select the optimal number of components through cross-validation.")
## The test errors for the different models are quite similar, suggesting that all methods provide comparable predictions of the number of applications received. The Lasso model has the advantage of performing feature selection, reducing the number of predictors. Ridge regression, PCR, and PLS also perform well, with PCR and PLS having the flexibility to select the optimal number of components through cross-validation.
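# A minimal sketch of the same comparison on a scale-free footing: test-set
# R-squared (1 - RSS/TSS on the held-out data) for each model.
test_r2 <- function(pred, y) 1 - sum((y - pred)^2) / sum((y - mean(y))^2)
sapply(list(lm = lm_preds, ridge = as.numeric(ridge_preds),
            lasso = as.numeric(lasso_preds), pcr = as.numeric(pcr_preds),
            pls = as.numeric(pls_preds)),
       test_r2, y = y_test)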
# Question 4
library(quantmod)
library(tidyverse)
library(caret)
getSymbols("AAPL", src = "yahoo", from = "2020-01-01", to = "2023-01-01")
## [1] "AAPL"
# Extract the closing prices
apple_data <- Cl(AAPL)
# Convert to a data frame
apple_df <- data.frame(Date = index(apple_data), Close = as.numeric(apple_data))
head(apple_df)
##         Date   Close
## 1 2020-01-02 75.0875
## 2 2020-01-03 74.3575
## 3 2020-01-06 74.9500
## 4 2020-01-07 74.5975
## 5 2020-01-08 75.7975
## 6 2020-01-09 77.4075
# Create lagged features
apple_df <- apple_df %>%
  mutate(Lag1 = lag(Close, 1),
         Lag2 = lag(Close, 2),
         Lag3 = lag(Close, 3)) %>%
  na.omit()
# Split the data into training and test sets
set.seed(123)
trainIndex <- createDataPartition(apple_df$Close, p = 0.8, 
                                  list = FALSE, 
                                  times = 1)
train_data <- apple_df[ trainIndex,]
test_data <- apple_df[-trainIndex,]
# Train a linear regression model
model <- train(Close ~ Lag1 + Lag2 + Lag3, data = train_data, method = "lm")
summary(model)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7216  -1.4333   0.0471   1.5856  11.5270 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.59721    0.50943   1.172    0.242    
## Lag1         0.90262    0.04233  21.322   <2e-16 ***
## Lag2         0.06866    0.05696   1.205    0.228    
## Lag3         0.02470    0.04131   0.598    0.550    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.831 on 601 degrees of freedom
## Multiple R-squared:  0.9913, Adjusted R-squared:  0.9913 
## F-statistic: 2.282e+04 on 3 and 601 DF,  p-value: < 2.2e-16
# Make predictions on the test set
predictions <- predict(model, newdata = test_data)
# Evaluate the model
results <- data.frame(Actual = test_data$Close, Predicted = predictions)
# Calculate performance metrics
RMSE <- sqrt(mean((results$Actual - results$Predicted)^2))
MAE <- mean(abs(results$Actual - results$Predicted))
cat("Root Mean Squared Error: ", RMSE, "\n")
## Root Mean Squared Error:  2.627207
cat("Mean Absolute Error: ", MAE, "\n")
## Mean Absolute Error:  1.977508
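# Context for these errors: a naive persistence forecast (predict today's
# close with yesterday's, i.e. Lag1 alone) is the standard baseline for
# price series, and the fitted coefficients suggest the model is close to it.
naive_rmse <- sqrt(mean((test_data$Close - test_data$Lag1)^2))
cat("Naive (Lag1) persistence RMSE:", naive_rmse, "\n")
# Caveat: createDataPartition splits rows at random rather than
# chronologically, so test days are interleaved with training days; for a
# time series, a chronological split gives a more honest error estimate.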
# Plot the actual vs predicted stock prices
ggplot(results, aes(x = Actual, y = Predicted)) +
  geom_point(color = 'blue') +
  geom_abline(slope = 1, intercept = 0, color = 'red') +
  ggtitle("Actual vs Predicted Stock Prices") +
  xlab("Actual Prices") +
  ylab("Predicted Prices")