final exam 112035146

###########################Chapter4
library(pacman)
p_load(class, tidyverse,MASS, corrplot, kableExtra, ISLR2)
data("Boston")
attach(Boston)
head(Boston)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3  4.98 24.0
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8  9.14 21.6
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8  4.03 34.7
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7  2.94 33.4
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7  5.33 36.2
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7  5.21 28.7

# Calculate the median crime rate
med_crime <- median(Boston$crim)
crim_lvl <- ifelse(Boston$crim > med_crime, 1, 0)


# Convert 'crim_lvl' to a factor variable
crim_lvl <- as.factor(crim_lvl)
Boston_2 <- data.frame(Boston, crim_lvl)

# Detach the 'Boston' dataset
detach(Boston)

# Visualize the relationships between variables in 'Boston_2' using a scatterplot matrix
pairs(Boston_2)

summary(Boston_2)

##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          lstat      
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   : 1.73  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.: 6.95  
##  Median : 5.000   Median :330.0   Median :19.05   Median :11.36  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :12.65  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:16.95  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :37.97  
##       medv       crim_lvl
##  Min.   : 5.00   0:253   
##  1st Qu.:17.02   1:253   
##  Median :21.20           
##  Mean   :22.53           
##  3rd Qu.:25.00           
##  Max.   :50.00

corrplot(cor(Boston_2[,-14]), method="square")

crime_fit<-glm(crim_lvl~indus + nox + age + dis + rad + tax + lstat + medv, data= Boston_2,                       family=binomial)
summary(crime_fit)

## 
## Call:
## glm(formula = crim_lvl ~ indus + nox + age + dis + rad + tax + 
##     lstat + medv, family = binomial, data = Boston_2)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -27.582736   4.343637  -6.350 2.15e-10 ***
## indus        -0.068239   0.044386  -1.537  0.12420    
## nox          44.337338   7.269049   6.099 1.06e-09 ***
## age           0.013836   0.009621   1.438  0.15040    
## dis           0.292732   0.158087   1.852  0.06407 .  
## rad           0.533675   0.119105   4.481 7.44e-06 ***
## tax          -0.006313   0.002408  -2.621  0.00876 ** 
## lstat         0.039949   0.042914   0.931  0.35190    
## medv          0.052652   0.032482   1.621  0.10503    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 701.46  on 505  degrees of freedom
## Residual deviance: 242.01  on 497  degrees of freedom
## AIC: 260.01
## 
## Number of Fisher Scoring iterations: 8

crime_fit<-glm(crim_lvl~ nox + rad + tax, data= Boston_2,                       family=binomial)
summary(crime_fit)

## 
## Call:
## glm(formula = crim_lvl ~ nox + rad + tax, family = binomial, 
##     data = Boston_2)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -20.075965   2.287688  -8.776  < 2e-16 ***
## nox          36.015644   4.338728   8.301  < 2e-16 ***
## rad           0.631900   0.111610   5.662  1.5e-08 ***
## tax          -0.007801   0.002184  -3.571 0.000355 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 701.46  on 505  degrees of freedom
## Residual deviance: 250.34  on 502  degrees of freedom
## AIC: 258.34
## 
## Number of Fisher Scoring iterations: 8

set.seed(1)
train_13 <- rbinom(506, 1, 0.7)
Boston_2 <- cbind(Boston_2, train_13)
Boston.train <- Boston_2[train_13 == 1,]
Boston.test <- Boston_2[train_13 == 0,]
# We generate model using the train data
Boston_fit <- glm(crim_lvl~ nox + rad + tax, data= Boston.train,                       family=binomial)
Boston.probs <- predict(Boston_fit, Boston.test, type = "response")
Boston.probs[1:10]

##          4          6          7         15         17         18         20 
## 0.02569957 0.02569957 0.38986175 0.36048788 0.36048788 0.36048788 0.36048788 
##         21         29         35 
## 0.36048788 0.36048788 0.36048788

contrasts(crim_lvl)

##   1
## 0 0
## 1 1

Bos_pred <- rep(0, length(Boston.probs))
Bos_pred[Boston.probs > .5] = 1
table(Bos_pred, Boston.test$crim_lvl)

##         
## Bos_pred  0  1
##        0 67 15
##        1  8 56

(8 + 15) / length(Boston.probs)

## [1] 0.1575342

library(MASS)
lda.fit <- lda(crim_lvl ~ nox + tax + rad, data = Boston_2,
               subset = (train_13==1))
lda.fit

## Call:
## lda(crim_lvl ~ nox + tax + rad, data = Boston_2, subset = (train_13 == 
##     1))
## 
## Prior probabilities of groups:
##         0         1 
## 0.4944444 0.5055556 
## 
## Group means:
##         nox      tax       rad
## 0 0.4690051 303.3989  4.106742
## 1 0.6401758 506.0549 14.576923
## 
## Coefficients of linear discriminants:
##              LD1
## nox  9.970745177
## tax -0.001202623
## rad  0.081982185

lda.pred <- predict(lda.fit, Boston.test)
names(lda.pred)

## [1] "class"     "posterior" "x"

lda.class <- lda.pred$class

table(lda.class, Boston.test$crim_lvl)

##          
## lda.class  0  1
##         0 73 17
##         1  2 54

(18 + 2) / length(Boston.probs)

## [1] 0.1369863

Bos.train <- cbind(Boston.train$nox, Boston.train$tax, Boston.train$rad)
Bos.test <- cbind(Boston.train$nox, Boston.train$tax, Boston.train$rad)
set.seed(1)
knn.pred=knn(Bos.train,Bos.test, Boston.train$crim_lvl,k=1)
table(knn.pred,Boston.train$crim_lvl)

##         
## knn.pred   0   1
##        0 176  10
##        1   2 172

#Error rate:
(10+2)/146

## [1] 0.08219178

knn.pred=knn(Bos.train,Bos.test, Boston.train$crim_lvl,k=3)
table(knn.pred,Boston.train$crim_lvl)

##         
## knn.pred   0   1
##        0 170  10
##        1   8 172

#Error rate:
(10+8)/146

## [1] 0.1232877

library(ISLR2)
data <- Boston
head(data,6)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3  4.98 24.0
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8  9.14 21.6
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8  4.03 34.7
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7  2.94 33.4
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7  5.33 36.2
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7  5.21 28.7

library(dplyr)  
library(ggplot2)
library(rsample)   
library(tidyverse)
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(vip)

## 
## Attaching package: 'vip'

## The following object is masked from 'package:utils':
## 
##     vi

library(kknn)

## 
## Attaching package: 'kknn'

## The following object is masked from 'package:caret':
## 
##     contr.dummy

library(pacman)
p_load(class, tidyverse,MASS, corrplot, kableExtra, ISLR2)
data("Boston")
attach(Boston)
head(Boston)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3  4.98 24.0
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8  9.14 21.6
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8  4.03 34.7
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7  2.94 33.4
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7  5.33 36.2
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7  5.21 28.7

data <- data %>%
  mutate(chas = factor(chas),
           crime_rate = factor(ifelse(crim > median(crim), 
                                              'High', 'Low'), 
                                       levels = c('High', 'Low')))
head(data,6)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3  4.98 24.0
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8  9.14 21.6
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8  4.03 34.7
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7  2.94 33.4
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7  5.33 36.2
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7  5.21 28.7
##   crime_rate
## 1        Low
## 2        Low
## 3        Low
## 4        Low
## 5        Low
## 6        Low

summary(data)

##       crim                zn             indus       chas         nox        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   0:471   Min.   :0.3850  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1: 35   1st Qu.:0.4490  
##  Median : 0.25651   Median :  0.00   Median : 9.69           Median :0.5380  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14           Mean   :0.5547  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10           3rd Qu.:0.6240  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74           Max.   :0.8710  
##        rm             age              dis              rad        
##  Min.   :3.561   Min.   :  2.90   Min.   : 1.130   Min.   : 1.000  
##  1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100   1st Qu.: 4.000  
##  Median :6.208   Median : 77.50   Median : 3.207   Median : 5.000  
##  Mean   :6.285   Mean   : 68.57   Mean   : 3.795   Mean   : 9.549  
##  3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188   3rd Qu.:24.000  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.000  
##       tax           ptratio          lstat            medv       crime_rate
##  Min.   :187.0   Min.   :12.60   Min.   : 1.73   Min.   : 5.00   High:253  
##  1st Qu.:279.0   1st Qu.:17.40   1st Qu.: 6.95   1st Qu.:17.02   Low :253  
##  Median :330.0   Median :19.05   Median :11.36   Median :21.20             
##  Mean   :408.2   Mean   :18.46   Mean   :12.65   Mean   :22.53             
##  3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:16.95   3rd Qu.:25.00             
##  Max.   :711.0   Max.   :22.00   Max.   :37.97   Max.   :50.00

df = read.csv("/cloud/project/Boston.csv")
head(df)

##   X    crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

summary(Boston)

##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          lstat      
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   : 1.73  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.: 6.95  
##  Median : 5.000   Median :330.0   Median :19.05   Median :11.36  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :12.65  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:16.95  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00

set.seed(123) 
data_split <- initial_split(data, prop = .7, strata ="crim")
data_train <- training(data_split)
head(data_train,6)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 1 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7  2.94 33.4
## 2 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7  5.33 36.2
## 3 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7  5.21 28.7
## 4 0.06417  0  5.96    0 0.499 5.933 68.2 3.3603   5 279    19.2  9.68 18.9
## 5 0.02763 75  2.95    0 0.428 6.595 21.8 5.4011   3 252    18.3  4.32 30.8
## 6 0.05360 21  5.64    0 0.439 6.511 21.1 6.8147   4 243    16.8  5.28 25.0
##   crime_rate
## 1        Low
## 2        Low
## 3        Low
## 4        Low
## 5        Low
## 6        Low

data_test  <- testing(data_split) 
head(data_test,6)

##      crim   zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 1 0.00632 18.0  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3  4.98 24.0
## 2 0.02731  0.0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8  9.14 21.6
## 3 0.02729  0.0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8  4.03 34.7
## 4 0.17004 12.5  7.87    0 0.524 6.004 85.9 6.5921   5 311    15.2 17.10 18.9
## 5 0.62739  0.0  8.14    0 0.538 5.834 56.5 4.4986   4 307    21.0  8.47 19.9
## 6 0.75026  0.0  8.14    0 0.538 5.924 94.1 4.3996   4 307    21.0 16.30 15.6
##   crime_rate
## 1        Low
## 2        Low
## 3        Low
## 4        Low
## 5       High
## 6       High

Boston <- df
Boston$crime1 <- NA
crime_m = median(Boston$crim)
crime_m

## [1] 0.25651

for(i in 1:dim(Boston)[1]){
if (Boston$crim[i] > crime_m)
 {
Boston$crime1[i] = 1 }
 else{
 Boston$crime1[i] = 0
 }
 }
 head(Boston)

##   X    crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv crime1
## 1 24.0      0
## 2 21.6      0
## 3 34.7      0
## 4 33.4      0
## 5 36.2      0
## 6 28.7      0

 autocorr = cor(Boston$crime1,Boston)
autocorr

##              X      crim        zn     indus       chas       nox         rm
## [1,] 0.3694304 0.4093955 -0.436151 0.6032602 0.07009677 0.7232348 -0.1563718
##            age        dis       rad       tax   ptratio      black     lstat
## [1,] 0.6139399 -0.6163416 0.6197862 0.6087413 0.2535684 -0.3512109 0.4532627
##            medv crime1
## [1,] -0.2630167      1

model1 <-glm(crime_rate~chas+indus+age+dis+rad+tax, data=data_train,family=binomial)
summary(model1)

## 
## Call:
## glm(formula = crime_rate ~ chas + indus + age + dis + rad + tax, 
##     family = binomial, data = data_train)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.270142   1.179986   1.924  0.05437 .  
## chas1        0.309733   0.658693   0.470  0.63820    
## indus       -0.075372   0.037364  -2.017  0.04367 *  
## age         -0.025658   0.008654  -2.965  0.00303 ** 
## dis          0.433994   0.133896   3.241  0.00119 ** 
## rad         -0.537394   0.127793  -4.205 2.61e-05 ***
## tax          0.005480   0.002473   2.216  0.02671 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 487.98  on 351  degrees of freedom
## Residual deviance: 230.44  on 345  degrees of freedom
## AIC: 244.44
## 
## Number of Fisher Scoring iterations: 8

predicted1 <- predict(model1, data_train)
predicted2 <- if_else(predicted1 > 0.5, "Low", "High")
cross.tab1 <-table(data_train$chas, predicted2)
cross.tab1

##    predicted2
##     High Low
##   0  176 157
##   1   14   5

###########################Chapter 5
# Load necessary packages
library(ISLR)    # Contains the Default dataset

## 
## Attaching package: 'ISLR'

## The following objects are masked from 'package:ISLR2':
## 
##     Auto, Credit

library(dplyr)   # For data manipulation
library(caret)   # For data splitting and model accuracy
# Load the Default dataset
data("Default")
# Convert 'default' and 'student' to a factor
Default$default <- as.factor(Default$default)
Default$student <- as.factor(Default$student)
# Fit the logistic regression model
model_full <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(model_full)

## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = Default)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

set.seed(123)  # Setting a random seed for reproducibility

# Function to perform analysis with different splits
perform_analysis <- function(data, splitRatio = 0.7) {
  # Splitting the data
  index <- createDataPartition(data$default, p = splitRatio, list = FALSE)
  train_set <- data[index, ]
  validation_set <- data[-index, ]

  # Fit the model on the training set
  model <- glm(default ~ income + balance, data = train_set, family = "binomial")

  # Predict on the validation set
  pred_probs <- predict(model, validation_set, type = "response")
  predictions <- ifelse(pred_probs > 0.5, "Yes", "No")

  # Compute the validation set error
  mean(predictions != validation_set$default)
}

# Perform the analysis three times
errors <- sapply(1:3, function(x) perform_analysis(Default))
errors

## [1] 0.02534178 0.02734245 0.02567523

mean(errors)  # Average error across the three splits

## [1] 0.02611982

# Function to analyze with the student variable included
perform_analysis_with_student <- function(data, splitRatio = 0.7) {
  index <- createDataPartition(data$default, p = splitRatio, list = FALSE)
  train_set <- data[index, ]
  validation_set <- data[-index, ]

  # Fit the model
  model <- glm(default ~ income + balance + student, data = train_set, family = "binomial")

  # Predict on the validation set
  pred_probs <- predict(model, validation_set, type = "response")
  predictions <- ifelse(pred, "Yes", "No")

  # Compute the validation set error
  mean(predictions != validation, set$default)
}

# Perform the analysis
# Function to analyze with the student variable included
perform_analysis_with_student <- function(data, splitRatio = 0.7) {
  # Split the dataset
  index <- createDataPartition(data$default, p = splitRatio, list = FALSE)
  train_set <- data[index, ]
  validation_set <- data[-index, ]

  # Fit the model on the training set
  model <- glm(default ~ income + balance + student, data = train_set, family = "binomial")

  # Predict on the validation set using type "response" for probabilities
  pred_probs <- predict(model, validation_set, type = "response")
  predictions <- ifelse(pred_probs > 0.5, "Yes", "No")

  # Compute the validation set error
  mean(predictions != validation_set$default)
}

# Perform the analysis
error_with_student <- perform_analysis_with_student(Default)
error_with_student

## [1] 0.027009

##########################Chapter6
# Load required packages
library(ISLR2)
library(glmnet)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 4.1-8

# Load College data set
data("College")
# (a) Split the data set into a training set and a test set
set.seed(123)  # For reproducibility
train_indices <- sample(1:nrow(College), 0.7 * nrow(College))  # 70% for training
train_data <- College[train_indices, ]
test_data <- College[-train_indices, ]
#a. We split the College data set into a training set and a test set using a 70/30 split, where 70% of the data is used for training.
# (b) Fit a linear model using least squares on the training set and report the test error obtained.
data2 <- College
head(College,6)

##                              Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University     Yes 1660   1232    721        23        52
## Adelphi University               Yes 2186   1924    512        16        29
## Adrian College                   Yes 1428   1097    336        22        50
## Agnes Scott College              Yes  417    349    137        60        89
## Alaska Pacific University        Yes  193    146     55        16        44
## Albertson College                Yes  587    479    158        38        62
##                              F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University        2885         537     7440       3300   450
## Adelphi University                  2683        1227    12280       6450   750
## Adrian College                      1036          99    11250       3750   400
## Agnes Scott College                  510          63    12960       5450   450
## Alaska Pacific University            249         869     7560       4120   800
## Albertson College                    678          41    13500       3335   500
##                              Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University     2200  70       78      18.1          12   7041
## Adelphi University               1500  29       30      12.2          16  10527
## Adrian College                   1165  53       66      12.9          30   8735
## Agnes Scott College               875  92       97       7.7          37  19016
## Alaska Pacific University        1500  76       72      11.9           2  10922
## Albertson College                 675  67       73       9.4          11   9727
##                              Grad.Rate
## Abilene Christian University        60
## Adelphi University                  56
## Adrian College                      54
## Agnes Scott College                 59
## Alaska Pacific University           15
## Albertson College                   55

summary(data2)

##  Private        Apps           Accept          Enroll       Top10perc    
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
##            Median : 1558   Median : 1110   Median : 434   Median :23.00  
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
##    Top25perc      F.Undergrad     P.Undergrad         Outstate    
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
##    Room.Board       Books           Personal         PhD        
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00  
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
##     Terminal       S.F.Ratio      perc.alumni        Expend     
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
##    Grad.Rate     
##  Min.   : 10.00  
##  1st Qu.: 53.00  
##  Median : 65.00  
##  Mean   : 65.46  
##  3rd Qu.: 78.00  
##  Max.   :118.00

linear_model <- lm(Apps ~ ., data = train_data)
linear_predictions <- predict(linear_model, newdata = test_data)
linear_test_error <- mean((test_data$Apps - linear_predictions)^2)
linear_test_error

## [1] 1734841

#(b). We fit a linear regression model using least squares on the training set. The lm function is used, with Apps as the response variable and all other variables as predictors. We then use the model to make predictions on the test set and calculate the test error, which is the mean squared error between the predicted values and the actual values of Apps in the test set.
set.seed(123) 
data_split1 <- initial_split(data2, prop = .7, strata ="Apps")
data_train1 <- training(data_split1)
data_test1<- testing(data_split1)
head(data_train1,6)

##                                 Private Apps Accept Enroll Top10perc Top25perc
## Albertus Magnus College             Yes  353    340    103        17        45
## Alderson-Broaddus College           Yes  582    498    172        21        44
## Alverno College                     Yes  494    313    157        23        46
## Antioch University                  Yes  713    661    252        25        44
## Aquinas College                     Yes  619    516    219        20        51
## Arkansas College (Lyon College)     Yes  708    334    166        46        74
##                                 F.Undergrad P.Undergrad Outstate Room.Board
## Albertus Magnus College                 416         230    13290       5720
## Alderson-Broaddus College               799          78    10468       3380
## Alverno College                        1317        1235     8352       3640
## Antioch University                      712          23    15476       3336
## Aquinas College                        1251         767    11208       4124
## Arkansas College (Lyon College)         530         182     8644       3922
##                                 Books Personal PhD Terminal S.F.Ratio
## Albertus Magnus College           500     1500  90       93      11.5
## Alderson-Broaddus College         660     1800  40       41      11.5
## Alverno College                   650     2449  36       69      11.1
## Antioch University                400     1100  69       82      11.3
## Aquinas College                   350     1615  55       65      12.7
## Arkansas College (Lyon College)   500      800  79       88      12.6
##                                 perc.alumni Expend Grad.Rate
## Albertus Magnus College                  26   8861        63
## Alderson-Broaddus College                15   8991        52
## Alverno College                          26   8127        55
## Antioch University                       35  42926        48
## Aquinas College                          25   6584        65
## Arkansas College (Lyon College)          24  14579        54

head(data_test1,6)

##                                         Private Apps Accept Enroll Top10perc
## Abilene Christian University                Yes 1660   1232    721        23
## Adelphi University                          Yes 2186   1924    512        16
## Agnes Scott College                         Yes  417    349    137        60
## Alaska Pacific University                   Yes  193    146     55        16
## Albertson College                           Yes  587    479    158        38
## Allentown Coll. of St. Francis de Sales     Yes 1179    780    290        38
##                                         Top25perc F.Undergrad P.Undergrad
## Abilene Christian University                   52        2885         537
## Adelphi University                             29        2683        1227
## Agnes Scott College                            89         510          63
## Alaska Pacific University                      44         249         869
## Albertson College                              62         678          41
## Allentown Coll. of St. Francis de Sales        64        1130         638
##                                         Outstate Room.Board Books Personal PhD
## Abilene Christian University                7440       3300   450     2200  70
## Adelphi University                         12280       6450   750     1500  29
## Agnes Scott College                        12960       5450   450      875  92
## Alaska Pacific University                   7560       4120   800     1500  76
## Albertson College                          13500       3335   500      675  67
## Allentown Coll. of St. Francis de Sales     9690       4785   600     1000  60
##                                         Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University                  78      18.1          12   7041
## Adelphi University                            30      12.2          16  10527
## Agnes Scott College                           97       7.7          37  19016
## Alaska Pacific University                     72      11.9           2  10922
## Albertson College                             73       9.4          11   9727
## Allentown Coll. of St. Francis de Sales       84      13.3          21   7940
##                                         Grad.Rate
## Abilene Christian University                   60
## Adelphi University                             56
## Agnes Scott College                            59
## Alaska Pacific University                      15
## Albertson College                              55
## Allentown Coll. of St. Francis de Sales        74

lm.fit <-  lm(Apps ~ . , data = data_train1)
lm.pred <-  predict(lm.fit, data_test1)
mean((data_test1[, "Apps"] - lm.pred)^2)

## [1] 1630969

# (c) Fit a ridge regression model on the training set, with λ chosen by cross-validation and report the test error obtained
ridge_model <- cv.glmnet(as.matrix(train_data[, -1]), train_data$Apps, alpha = 0, nfolds = 10)
ridge_predictions <- predict(ridge_model, newx = as.matrix(test_data[, -1]), s = "lambda.min")
ridge_test_error <- mean((test_data$Apps - ridge_predictions)^2)
ridge_test_error

## [1] 557372.2

#(c). We fit a ridge regression model on the training set using cross-validation to select the optimal lambda (λ) value. The cv.glmnet function from the glmnet package is used. We set alpha = 0 for ridge regression. We then use the fitted model to make predictions on the test set using the lambda value that minimizes cross-validation error and calculate the test error.
library(glmnet)
train.mat <-  model.matrix(Apps ~ . -1 , data = data_train1)
test.mat <-  model.matrix(Apps ~ . -1, data = data_test1)
grid <-  10 ^ seq(4, -2, length = 100)
mod.ridge <-  cv.glmnet(train.mat, data_train1[, "Apps"], 
                        alpha = 0, lambda = grid, thresh = 1e-12)
lambda.best <-  mod.ridge$lambda.min
lambda.best

## [1] 10.72267

ridge.pred <-  predict(mod.ridge, newx = test.mat, s = lambda.best)
mean((data_test1[, "Apps"] - ridge.pred)^2)

## [1] 1707981

# (d) Fit a lasso model on the training set, with λ chosen by cross-validation and report the test error obtained, along with the number of non-zero coefficient estimates
lasso_model <- cv.glmnet(as.matrix(train_data[, -1]), train_data$Apps, alpha = 1, nfolds = 10)
lasso_predictions <- predict(lasso_model, newx = as.matrix(test_data[, -1]), s = "lambda.min")
lasso_test_error <- mean((test_data$Apps - lasso_predictions)^2)
lasso_test_error

## [1] 19487.97

non_zero_coef <- sum(coef(lasso_model, s = "lambda.min") != 0)
non_zero_coef

## [1] 2

mod.lasso <-  cv.glmnet(train.mat, data_train1[, "Apps"], 
                        alpha = 1, lambda = grid, thresh = 1e-12)
lambda.best <-  mod.lasso$lambda.min
lambda.best

## [1] 4.037017

lasso.pred <-  predict(mod.lasso, newx = test.mat, s = lambda.best)
mean((data_test1[, "Apps"] - lasso.pred)^2)

## [1] 1682458

#e
# Load required libraries
library(ISLR)
library(pls)

## 
## Attaching package: 'pls'

## The following object is masked from 'package:caret':
## 
##     R2

## The following object is masked from 'package:corrplot':
## 
##     corrplot

## The following object is masked from 'package:stats':
## 
##     loadings

# Load the College data set
data(College)

# Set seed for reproducibility
set.seed(123)

# Split the data into training and test sets
train_indices <- sample(1:nrow(College), size = nrow(College) / 2)
train_data <- College[train_indices, ]
test_data <- College[-train_indices, ]

# Fit PCR model using cross-validation to select the number of components
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")

# Get the number of components chosen by cross-validation
optimal_components <- which.min(summary(pcr_model)$val[,"MSEP"])

## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3258     3228     1665     1614     1297     1311     1216
## adjCV         3258     3229     1664     1612     1284     1315     1213
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1219     1207     1149      1151      1149      1158      1160
## adjCV     1219     1210     1146      1148      1146      1155      1157
##        14 comps  15 comps  16 comps  17 comps
## CV         1160      1147      1021      1023
## adjCV      1157      1146      1017      1018
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X      32.969    59.69    66.68    72.25    77.31    81.82    85.18    88.34
## Apps    3.259    74.43    76.35    84.40    84.42    86.69    86.79    87.01
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       91.32     93.61     95.48     97.05     98.11     98.90     99.40
## Apps    88.38     88.39     88.47     88.49     88.50     88.51     88.74
##       16 comps  17 comps
## X        99.81    100.00
## Apps     91.48     91.58

# Report the optimal number of components
cat("Optimal number of components:", optimal_components, "\n")

## Optimal number of components:

# Make predictions on the test set
test_predictions <- predict(pcr_model, newdata = test_data, ncomp = optimal_components)

# Calculate the test error (Mean Squared Error)
test_error <- mean((test_predictions - test_data$Apps)^2)
cat("Test MSE:", test_error, "\n")

## Test MSE: NaN

#f
# Load required libraries
library(ISLR)
library(pls)

# Load the College data set
data(College)

# Set seed for reproducibility
set.seed(123)

# Split the data into training and test sets
train_indices <- sample(1:nrow(College), size = nrow(College) / 2)
train_data <- College[train_indices, ]
test_data <- College[-train_indices, ]

# Fit PLS model using cross-validation to select the number of components
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")

# Get the number of components chosen by cross-validation
optimal_components <- which.min(summary(pls_model)$val[,"MSEP"])

## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3258     1504     1245     1130     1106     1072     1033
## adjCV         3258     1502     1247     1128     1102     1068     1028
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1026     1020     1021      1021      1022      1023      1023
## adjCV     1021     1016     1016      1017      1018      1018      1019
##        14 comps  15 comps  16 comps  17 comps
## CV         1023      1023      1023      1023
## adjCV      1018      1018      1018      1018
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       26.97    43.83    65.16    69.79    73.63    76.18    79.97    81.98
## Apps    79.33    85.98    88.76    89.67    90.59    91.43    91.54    91.56
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       84.18     86.85     90.18     91.77     93.55     95.40     97.35
## Apps    91.57     91.57     91.57     91.58     91.58     91.58     91.58
##       16 comps  17 comps
## X        99.06    100.00
## Apps     91.58     91.58

# Report the optimal number of components
cat("Optimal number of components:", optimal_components, "\n")

## Optimal number of components:

# Make predictions on the test set
test_predictions <- predict(pls_model, newdata = test_data, ncomp = optimal_components)

# Calculate the test error (Mean Squared Error)
test_error <- mean((test_predictions - test_data$Apps)^2)
cat("Test MSE:", test_error, "\n")

## Test MSE: NaN

#g
# Load required libraries
library(ISLR)
library(pls)
library(glmnet)

# Load the College data set
data(College)

# Set seed for reproducibility
set.seed(123)

# Split the data into training and test sets
train_indices <- sample(1:nrow(College), size = nrow(College) / 2)
train_data <- College[train_indices, ]
test_data <- College[-train_indices, ]

# Prepare data for glmnet
x_train <- model.matrix(Apps ~ ., train_data)[,-1]
y_train <- train_data$Apps
x_test <- model.matrix(Apps ~ ., test_data)[,-1]
y_test <- test_data$Apps

# Linear Regression
lm_model <- lm(Apps ~ ., data = train_data)
lm_pred <- predict(lm_model, newdata = test_data)
lm_mse <- mean((lm_pred - y_test)^2)
cat("Linear Regression Test MSE:", lm_mse, "\n")

## Linear Regression Test MSE: 1373995

# Ridge Regression
ridge_model <- cv.glmnet(x_train, y_train, alpha = 0)
ridge_pred <- predict(ridge_model, s = "lambda.min", newx = x_test)
ridge_mse <- mean((ridge_pred - y_test)^2)
cat("Ridge Regression Test MSE:", ridge_mse, "\n")

## Ridge Regression Test MSE: 2090325

# Lasso Regression
lasso_model <- cv.glmnet(x_train, y_train, alpha = 1)
lasso_pred <- predict(lasso_model, s = "lambda.min", newx = x_test)
lasso_mse <- mean((lasso_pred - y_test)^2)
cat("Lasso Regression Test MSE:", lasso_mse, "\n")

## Lasso Regression Test MSE: 1397441

# Principal Components Regression (PCR)
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
optimal_pcr <- which.min(summary(pcr_model)$val[,"MSEP"])

## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3258     3251     1672     1618     1329     1323     1240
## adjCV         3258     3252     1670     1617     1316     1321     1237
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1236     1222     1157      1158      1159      1162      1166
## adjCV     1234     1227     1153      1156      1156      1159      1163
##        14 comps  15 comps  16 comps  17 comps
## CV         1167      1149      1014      1019
## adjCV      1163      1147      1010      1015
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X      32.969    59.69    66.68    72.25    77.31    81.82    85.18    88.34
## Apps    3.259    74.43    76.35    84.40    84.42    86.69    86.79    87.01
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       91.32     93.61     95.48     97.05     98.11     98.90     99.40
## Apps    88.38     88.39     88.47     88.49     88.50     88.51     88.74
##       16 comps  17 comps
## X        99.81    100.00
## Apps     91.48     91.58

pcr_pred <- predict(pcr_model, newdata = test_data, ncomp = optimal_pcr)
pcr_mse <- mean((pcr_pred - y_test)^2)
cat("PCR Test MSE:", pcr_mse, "\n")

## PCR Test MSE: NaN

# Partial Least Squares (PLS)
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
optimal_pls <- which.min(summary(pls_model)$val[,"MSEP"])

## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            3258     1510     1252     1133     1098     1064     1023
## adjCV         3258     1508     1253     1130     1095     1060     1019
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1015     1012     1013      1012      1012      1013      1013
## adjCV     1011     1009     1009      1008      1008      1009      1009
##        14 comps  15 comps  16 comps  17 comps
## CV         1013      1013      1013      1013
## adjCV      1009      1009      1009      1009
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       26.97    43.83    65.16    69.79    73.63    76.18    79.97    81.98
## Apps    79.33    85.98    88.76    89.67    90.59    91.43    91.54    91.56
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       84.18     86.85     90.18     91.77     93.55     95.40     97.35
## Apps    91.57     91.57     91.57     91.58     91.58     91.58     91.58
##       16 comps  17 comps
## X        99.06    100.00
## Apps     91.58     91.58

pls_pred <- predict(pls_model, newdata = test_data, ncomp = optimal_pls)
pls_mse <- mean((pls_pred - y_test)^2)
cat("PLS Test MSE:", pls_mse, "\n")

## PLS Test MSE: NaN

# Load necessary libraries
library(ISLR2)  # For the College data set
library(caret)  # For data partitioning and cross-validation
library(glmnet) # For ridge and lasso regression
library(pls)    # For PCR and PLS regression

# Load the College data set
data(College)

# Set a random seed for reproducibility
set.seed(123)

# Create a training index (70% training, 30% test)
trainIndex <- createDataPartition(College$Apps, p = 0.7, list = FALSE)

# Split the data
trainData <- College[trainIndex, ]
testData <- College[-trainIndex, ]

# Fit a linear model using least squares
lm_model <- lm(Apps ~ ., data = trainData)

# Predict on the test set
lm_predictions <- predict(lm_model, testData)

# Calculate the test error (Mean Squared Error)
lm_mse <- mean((lm_predictions - testData$Apps)^2)
cat("Linear Model Test Error (MSE):", lm_mse, "\n")

## Linear Model Test Error (MSE): 1882074

# Prepare data for glmnet
x_train <- model.matrix(Apps ~ ., trainData)[, -1]
y_train <- trainData$Apps
x_test <- model.matrix(Apps ~ ., testData)[, -1]
y_test <- testData$Apps

# Perform cross-validation to choose lambda for ridge regression
set.seed(123)
ridge_cv <- cv.glmnet(x_train, y_train, alpha = 0)

# Fit the ridge regression model with the best lambda
ridge_model <- glmnet(x_train, y_train, alpha = 0, lambda = ridge_cv$lambda.min)

# Predict on the test set
ridge_predictions <- predict(ridge_model, s = ridge_cv$lambda.min, newx = x_test)

# Calculate the test error (Mean Squared Error)
ridge_mse <- mean((ridge_predictions - y_test)^2)
cat("Ridge Regression Test Error (MSE):", ridge_mse, "\n")

## Ridge Regression Test Error (MSE): 3270111

# Perform cross-validation to choose lambda for lasso regression
set.seed(123)
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1)

# Fit the lasso model with the best lambda
lasso_model <- glmnet(x_train, y_train, alpha = 1, lambda = lasso_cv$lambda.min)

# Predict on the test set
lasso_predictions <- predict(lasso_model, s = lasso_cv$lambda.min, newx = x_test)

# Calculate the test error (Mean Squared Error)
lasso_mse <- mean((lasso_predictions - y_test)^2)
cat("Lasso Regression Test Error (MSE):", lasso_mse, "\n")

## Lasso Regression Test Error (MSE): 1938601

# Get the number of non-zero coefficient estimates
lasso_coef <- coef(lasso_model, s = lasso_cv$lambda.min)
num_nonzero_coefs <- sum(lasso_coef != 0)
cat("Number of non-zero coefficients in Lasso:", num_nonzero_coefs, "\n")

## Number of non-zero coefficients in Lasso: 17

# Fit the PCR model with cross-validation to choose the number of components
set.seed(123)
pcr_model <- pcr(Apps ~ ., data = trainData, scale = TRUE, validation = "CV")

# Choose the number of components with the lowest cross-validation error
validationplot(pcr_model, val.type = "MSEP")

opt_ncomp <- which.min(pcr_model$validation$PRESS)

# Predict on the test set
pcr_predictions <- predict(pcr_model, testData, ncomp = opt_ncomp)

# Calculate the test error (Mean Squared Error)
pcr_mse <- mean((pcr_predictions - testData$Apps)^2)
cat("PCR Test Error (MSE):", pcr_mse, "\n")

## PCR Test Error (MSE): 1882074

cat("Optimal number of components (M) for PCR:", opt_ncomp, "\n")

## Optimal number of components (M) for PCR: 17

# Fit the PLS model with cross-validation to choose the number of components
set.seed(123)
pls_model <- plsr(Apps ~ ., data = trainData, scale = TRUE, validation = "CV")

# Choose the number of components with the lowest cross-validation error
validationplot(pls_model, val.type = "MSEP")

opt_ncomp_pls <- which.min(pls_model$validation$PRESS)

# Predict on the test set
pls_predictions <- predict(pls_model, testData, ncomp = opt_ncomp_pls)

# Calculate the test error (Mean Squared Error)
pls_mse <- mean((pls_predictions - testData$Apps)^2)
cat("PLS Test Error (MSE):", pls_mse, "\n")

## PLS Test Error (MSE): 1897756

cat("Optimal number of components (M) for PLS:", opt_ncomp_pls, "\n")

## Optimal number of components (M) for PLS: 12

cat("\nSummary of Test Errors:\n")

## 
## Summary of Test Errors:

cat("Linear Model Test Error (MSE):", lm_mse, "\n")

## Linear Model Test Error (MSE): 1882074

cat("Ridge Regression Test Error (MSE):", ridge_mse, "\n")

## Ridge Regression Test Error (MSE): 3270111

cat("Lasso Regression Test Error (MSE):", lasso_mse, "\n")

## Lasso Regression Test Error (MSE): 1938601

cat("PCR Test Error (MSE):", pcr_mse, "with M =", opt_ncomp, "\n")

## PCR Test Error (MSE): 1882074 with M = 17

cat("PLS Test Error (MSE):", pls_mse, "with M =", opt_ncomp_pls, "\n")

## PLS Test Error (MSE): 1897756 with M = 12

cat("\nComments on Results:\n")

## 
## Comments on Results:

cat("The test errors from the different approaches can be compared to evaluate their relative performance.\n")

## The test errors from the different approaches can be compared to evaluate their relative performance.

cat("It appears that the choice of regularization (ridge or lasso) and dimension reduction (PCR or PLS) impacts the prediction accuracy.\n")

## It appears that the choice of regularization (ridge or lasso) and dimension reduction (PCR or PLS) impacts the prediction accuracy.

cat("The linear model provides a baseline error. Ridge and lasso regression can handle multicollinearity and potentially reduce overfitting.\n")

## The linear model provides a baseline error. Ridge and lasso regression can handle multicollinearity and potentially reduce overfitting.

cat("PCR and PLS are useful for dimensionality reduction, especially when predictors are highly correlated.\n")

## PCR and PLS are useful for dimensionality reduction, especially when predictors are highly correlated.

cat("By comparing the test errors, we can see if any method significantly outperforms the others in terms of predictive accuracy for the number of college applications received.\n")

## By comparing the test errors, we can see if any method significantly outperforms the others in terms of predictive accuracy for the number of college applications received.

###########20%
library(quantmod)

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## Loading required package: TTR

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(tidyverse)
getSymbols("AAPL", src = "yahoo", from = "2020-01-01", to = "2023-01-01")

## [1] "AAPL"

stock_data <- Cl(AAPL)
stock_data <- data.frame(date=index(stock_data), stock_prices=as.numeric(stock_data), row.names=NULL)
colnames(stock_data) <- c("Date", "Close")
ggplot(stock_data, aes(x=Date, y=Close)) +
  geom_line() +
  labs(title="AAPL Stock Closing Prices", x="Date", y="Closing Price")

model <- lm(Close ~ Date, data=stock_data)
summary(model)

## 
## Call:
## lm(formula = Close ~ Date, data = stock_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.209 -11.871   3.358  12.365  36.756 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.389e+03  3.640e+01  -38.16   <2e-16 ***
## Date         8.076e-02  1.935e-03   41.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.81 on 754 degrees of freedom
## Multiple R-squared:  0.698,  Adjusted R-squared:  0.6976 
## F-statistic:  1742 on 1 and 754 DF,  p-value: < 2.2e-16

future_dates <- data.frame(Date=seq(as.Date("2023-01-02"), as.Date("2023-12-31"), by="days"))

#Financial data often exhibits non-linear patterns, significant outliers, or shifts due to unforeseen circumstances (like market crashes or geopolitical events). In practice, you might need to incorporate more complex models and techniques (like ARIMA, GARCH, or machine learning methods such as Lasso or Ridge regression) to capture these dynamics more accurately. Moreover, considering additional variables such as volume, open, high, low prices, and macroeconomic indicators could improve the model's predictive power.

# Required Libraries
install.packages("tidyquant")

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

library(tidyquant)

## Loading required package: PerformanceAnalytics

## 
## Attaching package: 'PerformanceAnalytics'

## The following object is masked from 'package:graphics':
## 
##     legend

# Get Historical Stock Price Data
data <- tq_get("AAPL", from = "2017-01-01", to = Sys.Date())

# Split the data into training and test set
train_size <- floor(0.75*nrow(data))
train_set <- data[1:train_size, ]
test_set <- data[(train_size+1):nrow(data), ]

# Fit the model (Predict Adjusted Close Price with other variables)
fit <- lm(close ~ ., data = train_set[-1])

# Predict on the test set
predictions <- predict(fit, newdata = test_set[-1])

# Print the model summary
summary(fit)

## 
## Call:
## lm(formula = close ~ ., data = train_set[-1])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50328 -0.12582  0.00066  0.11979  0.54605 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.167e+01  4.089e-01  52.992  < 2e-16 ***
## date        -1.147e-03  2.358e-05 -48.620  < 2e-16 ***
## open        -2.307e-02  7.306e-03  -3.158  0.00162 ** 
## high         1.902e-02  8.600e-03   2.212  0.02714 *  
## low          5.427e-02  7.435e-03   7.300 4.82e-13 ***
## volume       6.987e-10  1.101e-10   6.347 2.95e-10 ***
## adjusted     9.633e-01  6.602e-03 145.903  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1755 on 1398 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.635e+07 on 6 and 1398 DF,  p-value: < 2.2e-16

final exam 112035146

2024-06-14