###########################Chapter4
library(pacman)
p_load(class, tidyverse,MASS, corrplot, kableExtra, ISLR2)
data("Boston")
attach(Boston)
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
# Calculate the median crime rate
med_crime <- median(Boston$crim)
crim_lvl <- ifelse(Boston$crim > med_crime, 1, 0)
# Convert 'crim_lvl' to a factor variable
crim_lvl <- as.factor(crim_lvl)
Boston_2 <- data.frame(Boston, crim_lvl)
# Detach the 'Boston' dataset
detach(Boston)
# Visualize the relationships between variables in 'Boston_2' using a scatterplot matrix
pairs(Boston_2)

summary(Boston_2)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv crim_lvl
## Min. : 5.00 0:253
## 1st Qu.:17.02 1:253
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
corrplot(cor(Boston_2[,-14]), method="square")

crime_fit<-glm(crim_lvl~indus + nox + age + dis + rad + tax + lstat + medv, data= Boston_2, family=binomial)
summary(crime_fit)
##
## Call:
## glm(formula = crim_lvl ~ indus + nox + age + dis + rad + tax +
## lstat + medv, family = binomial, data = Boston_2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -27.582736 4.343637 -6.350 2.15e-10 ***
## indus -0.068239 0.044386 -1.537 0.12420
## nox 44.337338 7.269049 6.099 1.06e-09 ***
## age 0.013836 0.009621 1.438 0.15040
## dis 0.292732 0.158087 1.852 0.06407 .
## rad 0.533675 0.119105 4.481 7.44e-06 ***
## tax -0.006313 0.002408 -2.621 0.00876 **
## lstat 0.039949 0.042914 0.931 0.35190
## medv 0.052652 0.032482 1.621 0.10503
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 701.46 on 505 degrees of freedom
## Residual deviance: 242.01 on 497 degrees of freedom
## AIC: 260.01
##
## Number of Fisher Scoring iterations: 8
crime_fit<-glm(crim_lvl~ nox + rad + tax, data= Boston_2, family=binomial)
summary(crime_fit)
##
## Call:
## glm(formula = crim_lvl ~ nox + rad + tax, family = binomial,
## data = Boston_2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -20.075965 2.287688 -8.776 < 2e-16 ***
## nox 36.015644 4.338728 8.301 < 2e-16 ***
## rad 0.631900 0.111610 5.662 1.5e-08 ***
## tax -0.007801 0.002184 -3.571 0.000355 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 701.46 on 505 degrees of freedom
## Residual deviance: 250.34 on 502 degrees of freedom
## AIC: 258.34
##
## Number of Fisher Scoring iterations: 8
set.seed(1)
train_13 <- rbinom(506, 1, 0.7)
Boston_2 <- cbind(Boston_2, train_13)
Boston.train <- Boston_2[train_13 == 1,]
Boston.test <- Boston_2[train_13 == 0,]
# We generate model using the train data
Boston_fit <- glm(crim_lvl~ nox + rad + tax, data= Boston.train, family=binomial)
Boston.probs <- predict(Boston_fit, Boston.test, type = "response")
Boston.probs[1:10]
## 4 6 7 15 17 18 20
## 0.02569957 0.02569957 0.38986175 0.36048788 0.36048788 0.36048788 0.36048788
## 21 29 35
## 0.36048788 0.36048788 0.36048788
contrasts(crim_lvl)
## 1
## 0 0
## 1 1
Bos_pred <- rep(0, length(Boston.probs))
Bos_pred[Boston.probs > .5] = 1
table(Bos_pred, Boston.test$crim_lvl)
##
## Bos_pred 0 1
## 0 67 15
## 1 8 56
(8 + 15) / length(Boston.probs)
## [1] 0.1575342
library(MASS)
lda.fit <- lda(crim_lvl ~ nox + tax + rad, data = Boston_2,
subset = (train_13==1))
lda.fit
## Call:
## lda(crim_lvl ~ nox + tax + rad, data = Boston_2, subset = (train_13 ==
## 1))
##
## Prior probabilities of groups:
## 0 1
## 0.4944444 0.5055556
##
## Group means:
## nox tax rad
## 0 0.4690051 303.3989 4.106742
## 1 0.6401758 506.0549 14.576923
##
## Coefficients of linear discriminants:
## LD1
## nox 9.970745177
## tax -0.001202623
## rad 0.081982185
lda.pred <- predict(lda.fit, Boston.test)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class <- lda.pred$class
table(lda.class, Boston.test$crim_lvl)
##
## lda.class 0 1
## 0 73 17
## 1 2 54
(18 + 2) / length(Boston.probs)
## [1] 0.1369863
Bos.train <- cbind(Boston.train$nox, Boston.train$tax, Boston.train$rad)
Bos.test <- cbind(Boston.train$nox, Boston.train$tax, Boston.train$rad)
set.seed(1)
knn.pred=knn(Bos.train,Bos.test, Boston.train$crim_lvl,k=1)
table(knn.pred,Boston.train$crim_lvl)
##
## knn.pred 0 1
## 0 176 10
## 1 2 172
#Error rate:
(10+2)/146
## [1] 0.08219178
knn.pred=knn(Bos.train,Bos.test, Boston.train$crim_lvl,k=3)
table(knn.pred,Boston.train$crim_lvl)
##
## knn.pred 0 1
## 0 170 10
## 1 8 172
#Error rate:
(10+8)/146
## [1] 0.1232877
library(ISLR2)
data <- Boston
head(data,6)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
library(dplyr)
library(ggplot2)
library(rsample)
library(tidyverse)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(kknn)
##
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
##
## contr.dummy
library(pacman)
p_load(class, tidyverse,MASS, corrplot, kableExtra, ISLR2)
data("Boston")
attach(Boston)
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
data <- data %>%
mutate(chas = factor(chas),
crime_rate = factor(ifelse(crim > median(crim),
'High', 'Low'),
levels = c('High', 'Low')))
head(data,6)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
## crime_rate
## 1 Low
## 2 Low
## 3 Low
## 4 Low
## 5 Low
## 6 Low
summary(data)
## crim zn indus chas nox
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 0:471 Min. :0.3850
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1: 35 1st Qu.:0.4490
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.5380
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.5547
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.6240
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :0.8710
## rm age dis rad
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000
## 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000
## Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000
## Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549
## 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000
## tax ptratio lstat medv crime_rate
## Min. :187.0 Min. :12.60 Min. : 1.73 Min. : 5.00 High:253
## 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95 1st Qu.:17.02 Low :253
## Median :330.0 Median :19.05 Median :11.36 Median :21.20
## Mean :408.2 Mean :18.46 Mean :12.65 Mean :22.53
## 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :711.0 Max. :22.00 Max. :37.97 Max. :50.00
df = read.csv("/cloud/project/Boston.csv")
head(df)
## X crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
set.seed(123)
data_split <- initial_split(data, prop = .7, strata ="crim")
data_train <- training(data_split)
head(data_train,6)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 2 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 3 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
## 4 0.06417 0 5.96 0 0.499 5.933 68.2 3.3603 5 279 19.2 9.68 18.9
## 5 0.02763 75 2.95 0 0.428 6.595 21.8 5.4011 3 252 18.3 4.32 30.8
## 6 0.05360 21 5.64 0 0.439 6.511 21.1 6.8147 4 243 16.8 5.28 25.0
## crime_rate
## 1 Low
## 2 Low
## 3 Low
## 4 Low
## 5 Low
## 6 Low
data_test <- testing(data_split)
head(data_test,6)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311 15.2 17.10 18.9
## 5 0.62739 0.0 8.14 0 0.538 5.834 56.5 4.4986 4 307 21.0 8.47 19.9
## 6 0.75026 0.0 8.14 0 0.538 5.924 94.1 4.3996 4 307 21.0 16.30 15.6
## crime_rate
## 1 Low
## 2 Low
## 3 Low
## 4 Low
## 5 High
## 6 High
Boston <- df
Boston$crime1 <- NA
crime_m = median(Boston$crim)
crime_m
## [1] 0.25651
for(i in 1:dim(Boston)[1]){
if (Boston$crim[i] > crime_m)
{
Boston$crime1[i] = 1 }
else{
Boston$crime1[i] = 0
}
}
head(Boston)
## X crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv crime1
## 1 24.0 0
## 2 21.6 0
## 3 34.7 0
## 4 33.4 0
## 5 36.2 0
## 6 28.7 0
autocorr = cor(Boston$crime1,Boston)
autocorr
## X crim zn indus chas nox rm
## [1,] 0.3694304 0.4093955 -0.436151 0.6032602 0.07009677 0.7232348 -0.1563718
## age dis rad tax ptratio black lstat
## [1,] 0.6139399 -0.6163416 0.6197862 0.6087413 0.2535684 -0.3512109 0.4532627
## medv crime1
## [1,] -0.2630167 1
model1 <-glm(crime_rate~chas+indus+age+dis+rad+tax, data=data_train,family=binomial)
summary(model1)
##
## Call:
## glm(formula = crime_rate ~ chas + indus + age + dis + rad + tax,
## family = binomial, data = data_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.270142 1.179986 1.924 0.05437 .
## chas1 0.309733 0.658693 0.470 0.63820
## indus -0.075372 0.037364 -2.017 0.04367 *
## age -0.025658 0.008654 -2.965 0.00303 **
## dis 0.433994 0.133896 3.241 0.00119 **
## rad -0.537394 0.127793 -4.205 2.61e-05 ***
## tax 0.005480 0.002473 2.216 0.02671 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 487.98 on 351 degrees of freedom
## Residual deviance: 230.44 on 345 degrees of freedom
## AIC: 244.44
##
## Number of Fisher Scoring iterations: 8
predicted1 <- predict(model1, data_train)
predicted2 <- if_else(predicted1 > 0.5, "Low", "High")
cross.tab1 <-table(data_train$chas, predicted2)
cross.tab1
## predicted2
## High Low
## 0 176 157
## 1 14 5
###########################Chapter 5
# Load necessary packages
library(ISLR) # Contains the Default dataset
##
## Attaching package: 'ISLR'
## The following objects are masked from 'package:ISLR2':
##
## Auto, Credit
library(dplyr) # For data manipulation
library(caret) # For data splitting and model accuracy
# Load the Default dataset
data("Default")
# Convert 'default' and 'student' to a factor
Default$default <- as.factor(Default$default)
Default$student <- as.factor(Default$student)
# Fit the logistic regression model
model_full <- glm(default ~ income + balance, data = Default, family = "binomial")
summary(model_full)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(123) # Setting a random seed for reproducibility
# Function to perform analysis with different splits
perform_analysis <- function(data, splitRatio = 0.7) {
# Splitting the data
index <- createDataPartition(data$default, p = splitRatio, list = FALSE)
train_set <- data[index, ]
validation_set <- data[-index, ]
# Fit the model on the training set
model <- glm(default ~ income + balance, data = train_set, family = "binomial")
# Predict on the validation set
pred_probs <- predict(model, validation_set, type = "response")
predictions <- ifelse(pred_probs > 0.5, "Yes", "No")
# Compute the validation set error
mean(predictions != validation_set$default)
}
# Perform the analysis three times
errors <- sapply(1:3, function(x) perform_analysis(Default))
errors
## [1] 0.02534178 0.02734245 0.02567523
mean(errors) # Average error across the three splits
## [1] 0.02611982
# Function to analyze with the student variable included
perform_analysis_with_student <- function(data, splitRatio = 0.7) {
index <- createDataPartition(data$default, p = splitRatio, list = FALSE)
train_set <- data[index, ]
validation_set <- data[-index, ]
# Fit the model
model <- glm(default ~ income + balance + student, data = train_set, family = "binomial")
# Predict on the validation set
pred_probs <- predict(model, validation_set, type = "response")
predictions <- ifelse(pred, "Yes", "No")
# Compute the validation set error
mean(predictions != validation, set$default)
}
# Perform the analysis
# Function to analyze with the student variable included
perform_analysis_with_student <- function(data, splitRatio = 0.7) {
# Split the dataset
index <- createDataPartition(data$default, p = splitRatio, list = FALSE)
train_set <- data[index, ]
validation_set <- data[-index, ]
# Fit the model on the training set
model <- glm(default ~ income + balance + student, data = train_set, family = "binomial")
# Predict on the validation set using type "response" for probabilities
pred_probs <- predict(model, validation_set, type = "response")
predictions <- ifelse(pred_probs > 0.5, "Yes", "No")
# Compute the validation set error
mean(predictions != validation_set$default)
}
# Perform the analysis
error_with_student <- perform_analysis_with_student(Default)
error_with_student
## [1] 0.027009
##########################Chapter6
# Load required packages
library(ISLR2)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
# Load College data set
data("College")
# (a) Split the data set into a training set and a test set
set.seed(123) # For reproducibility
train_indices <- sample(1:nrow(College), 0.7 * nrow(College)) # 70% for training
train_data <- College[train_indices, ]
test_data <- College[-train_indices, ]
#a. We split the College data set into a training set and a test set using a 70/30 split, where 70% of the data is used for training.
# (b) Fit a linear model using least squares on the training set and report the test error obtained.
data2 <- College
head(College,6)
## Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University Yes 1660 1232 721 23 52
## Adelphi University Yes 2186 1924 512 16 29
## Adrian College Yes 1428 1097 336 22 50
## Agnes Scott College Yes 417 349 137 60 89
## Alaska Pacific University Yes 193 146 55 16 44
## Albertson College Yes 587 479 158 38 62
## F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University 2885 537 7440 3300 450
## Adelphi University 2683 1227 12280 6450 750
## Adrian College 1036 99 11250 3750 400
## Agnes Scott College 510 63 12960 5450 450
## Alaska Pacific University 249 869 7560 4120 800
## Albertson College 678 41 13500 3335 500
## Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 2200 70 78 18.1 12 7041
## Adelphi University 1500 29 30 12.2 16 10527
## Adrian College 1165 53 66 12.9 30 8735
## Agnes Scott College 875 92 97 7.7 37 19016
## Alaska Pacific University 1500 76 72 11.9 2 10922
## Albertson College 675 67 73 9.4 11 9727
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Adrian College 54
## Agnes Scott College 59
## Alaska Pacific University 15
## Albertson College 55
summary(data2)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
linear_model <- lm(Apps ~ ., data = train_data)
linear_predictions <- predict(linear_model, newdata = test_data)
linear_test_error <- mean((test_data$Apps - linear_predictions)^2)
linear_test_error
## [1] 1734841
#(b). We fit a linear regression model using least squares on the training set. The lm function is used, with Apps as the response variable and all other variables as predictors. We then use the model to make predictions on the test set and calculate the test error, which is the mean squared error between the predicted values and the actual values of Apps in the test set.
set.seed(123)
data_split1 <- initial_split(data2, prop = .7, strata ="Apps")
data_train1 <- training(data_split1)
data_test1<- testing(data_split1)
head(data_train1,6)
## Private Apps Accept Enroll Top10perc Top25perc
## Albertus Magnus College Yes 353 340 103 17 45
## Alderson-Broaddus College Yes 582 498 172 21 44
## Alverno College Yes 494 313 157 23 46
## Antioch University Yes 713 661 252 25 44
## Aquinas College Yes 619 516 219 20 51
## Arkansas College (Lyon College) Yes 708 334 166 46 74
## F.Undergrad P.Undergrad Outstate Room.Board
## Albertus Magnus College 416 230 13290 5720
## Alderson-Broaddus College 799 78 10468 3380
## Alverno College 1317 1235 8352 3640
## Antioch University 712 23 15476 3336
## Aquinas College 1251 767 11208 4124
## Arkansas College (Lyon College) 530 182 8644 3922
## Books Personal PhD Terminal S.F.Ratio
## Albertus Magnus College 500 1500 90 93 11.5
## Alderson-Broaddus College 660 1800 40 41 11.5
## Alverno College 650 2449 36 69 11.1
## Antioch University 400 1100 69 82 11.3
## Aquinas College 350 1615 55 65 12.7
## Arkansas College (Lyon College) 500 800 79 88 12.6
## perc.alumni Expend Grad.Rate
## Albertus Magnus College 26 8861 63
## Alderson-Broaddus College 15 8991 52
## Alverno College 26 8127 55
## Antioch University 35 42926 48
## Aquinas College 25 6584 65
## Arkansas College (Lyon College) 24 14579 54
head(data_test1,6)
## Private Apps Accept Enroll Top10perc
## Abilene Christian University Yes 1660 1232 721 23
## Adelphi University Yes 2186 1924 512 16
## Agnes Scott College Yes 417 349 137 60
## Alaska Pacific University Yes 193 146 55 16
## Albertson College Yes 587 479 158 38
## Allentown Coll. of St. Francis de Sales Yes 1179 780 290 38
## Top25perc F.Undergrad P.Undergrad
## Abilene Christian University 52 2885 537
## Adelphi University 29 2683 1227
## Agnes Scott College 89 510 63
## Alaska Pacific University 44 249 869
## Albertson College 62 678 41
## Allentown Coll. of St. Francis de Sales 64 1130 638
## Outstate Room.Board Books Personal PhD
## Abilene Christian University 7440 3300 450 2200 70
## Adelphi University 12280 6450 750 1500 29
## Agnes Scott College 12960 5450 450 875 92
## Alaska Pacific University 7560 4120 800 1500 76
## Albertson College 13500 3335 500 675 67
## Allentown Coll. of St. Francis de Sales 9690 4785 600 1000 60
## Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University 78 18.1 12 7041
## Adelphi University 30 12.2 16 10527
## Agnes Scott College 97 7.7 37 19016
## Alaska Pacific University 72 11.9 2 10922
## Albertson College 73 9.4 11 9727
## Allentown Coll. of St. Francis de Sales 84 13.3 21 7940
## Grad.Rate
## Abilene Christian University 60
## Adelphi University 56
## Agnes Scott College 59
## Alaska Pacific University 15
## Albertson College 55
## Allentown Coll. of St. Francis de Sales 74
lm.fit <- lm(Apps ~ . , data = data_train1)
lm.pred <- predict(lm.fit, data_test1)
mean((data_test1[, "Apps"] - lm.pred)^2)
## [1] 1630969
# (c) Fit a ridge regression model on the training set, with λ chosen by cross-validation and report the test error obtained
ridge_model <- cv.glmnet(as.matrix(train_data[, -1]), train_data$Apps, alpha = 0, nfolds = 10)
ridge_predictions <- predict(ridge_model, newx = as.matrix(test_data[, -1]), s = "lambda.min")
ridge_test_error <- mean((test_data$Apps - ridge_predictions)^2)
ridge_test_error
## [1] 557372.2
#(c). We fit a ridge regression model on the training set using cross-validation to select the optimal lambda (λ) value. The cv.glmnet function from the glmnet package is used. We set alpha = 0 for ridge regression. We then use the fitted model to make predictions on the test set using the lambda value that minimizes cross-validation error and calculate the test error.
library(glmnet)
train.mat <- model.matrix(Apps ~ . -1 , data = data_train1)
test.mat <- model.matrix(Apps ~ . -1, data = data_test1)
grid <- 10 ^ seq(4, -2, length = 100)
mod.ridge <- cv.glmnet(train.mat, data_train1[, "Apps"],
alpha = 0, lambda = grid, thresh = 1e-12)
lambda.best <- mod.ridge$lambda.min
lambda.best
## [1] 10.72267
ridge.pred <- predict(mod.ridge, newx = test.mat, s = lambda.best)
mean((data_test1[, "Apps"] - ridge.pred)^2)
## [1] 1707981
# (d) Fit a lasso model on the training set, with λ chosen by cross-validation and report the test error obtained, along with the number of non-zero coefficient estimates
lasso_model <- cv.glmnet(as.matrix(train_data[, -1]), train_data$Apps, alpha = 1, nfolds = 10)
lasso_predictions <- predict(lasso_model, newx = as.matrix(test_data[, -1]), s = "lambda.min")
lasso_test_error <- mean((test_data$Apps - lasso_predictions)^2)
lasso_test_error
## [1] 19487.97
non_zero_coef <- sum(coef(lasso_model, s = "lambda.min") != 0)
non_zero_coef
## [1] 2
mod.lasso <- cv.glmnet(train.mat, data_train1[, "Apps"],
alpha = 1, lambda = grid, thresh = 1e-12)
lambda.best <- mod.lasso$lambda.min
lambda.best
## [1] 4.037017
lasso.pred <- predict(mod.lasso, newx = test.mat, s = lambda.best)
mean((data_test1[, "Apps"] - lasso.pred)^2)
## [1] 1682458
#e
# Load required libraries
library(ISLR)
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:corrplot':
##
## corrplot
## The following object is masked from 'package:stats':
##
## loadings
# Load the College data set
data(College)
# Set seed for reproducibility
set.seed(123)
# Split the data into training and test sets
train_indices <- sample(1:nrow(College), size = nrow(College) / 2)
train_data <- College[train_indices, ]
test_data <- College[-train_indices, ]
# Fit PCR model using cross-validation to select the number of components
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
# Get the number of components chosen by cross-validation
optimal_components <- which.min(summary(pcr_model)$val[,"MSEP"])
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3258 3228 1665 1614 1297 1311 1216
## adjCV 3258 3229 1664 1612 1284 1315 1213
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1219 1207 1149 1151 1149 1158 1160
## adjCV 1219 1210 1146 1148 1146 1155 1157
## 14 comps 15 comps 16 comps 17 comps
## CV 1160 1147 1021 1023
## adjCV 1157 1146 1017 1018
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 32.969 59.69 66.68 72.25 77.31 81.82 85.18 88.34
## Apps 3.259 74.43 76.35 84.40 84.42 86.69 86.79 87.01
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 91.32 93.61 95.48 97.05 98.11 98.90 99.40
## Apps 88.38 88.39 88.47 88.49 88.50 88.51 88.74
## 16 comps 17 comps
## X 99.81 100.00
## Apps 91.48 91.58
# Report the optimal number of components
cat("Optimal number of components:", optimal_components, "\n")
## Optimal number of components:
# Make predictions on the test set
test_predictions <- predict(pcr_model, newdata = test_data, ncomp = optimal_components)
# Calculate the test error (Mean Squared Error)
test_error <- mean((test_predictions - test_data$Apps)^2)
cat("Test MSE:", test_error, "\n")
## Test MSE: NaN
#f
# Load required libraries
library(ISLR)
library(pls)
# Load the College data set
data(College)
# Set seed for reproducibility
set.seed(123)
# Split the data into training and test sets
train_indices <- sample(1:nrow(College), size = nrow(College) / 2)
train_data <- College[train_indices, ]
test_data <- College[-train_indices, ]
# Fit PLS model using cross-validation to select the number of components
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
# Get the number of components chosen by cross-validation
optimal_components <- which.min(summary(pls_model)$val[,"MSEP"])
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3258 1504 1245 1130 1106 1072 1033
## adjCV 3258 1502 1247 1128 1102 1068 1028
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1026 1020 1021 1021 1022 1023 1023
## adjCV 1021 1016 1016 1017 1018 1018 1019
## 14 comps 15 comps 16 comps 17 comps
## CV 1023 1023 1023 1023
## adjCV 1018 1018 1018 1018
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 26.97 43.83 65.16 69.79 73.63 76.18 79.97 81.98
## Apps 79.33 85.98 88.76 89.67 90.59 91.43 91.54 91.56
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 84.18 86.85 90.18 91.77 93.55 95.40 97.35
## Apps 91.57 91.57 91.57 91.58 91.58 91.58 91.58
## 16 comps 17 comps
## X 99.06 100.00
## Apps 91.58 91.58
# Report the optimal number of components
cat("Optimal number of components:", optimal_components, "\n")
## Optimal number of components:
# Make predictions on the test set
test_predictions <- predict(pls_model, newdata = test_data, ncomp = optimal_components)
# Calculate the test error (Mean Squared Error)
test_error <- mean((test_predictions - test_data$Apps)^2)
cat("Test MSE:", test_error, "\n")
## Test MSE: NaN
#g
# Load required libraries
library(ISLR)
library(pls)
library(glmnet)
# Load the College data set
data(College)
# Set seed for reproducibility
set.seed(123)
# Split the data into training and test sets
train_indices <- sample(1:nrow(College), size = nrow(College) / 2)
train_data <- College[train_indices, ]
test_data <- College[-train_indices, ]
# Prepare data for glmnet
x_train <- model.matrix(Apps ~ ., train_data)[,-1]
y_train <- train_data$Apps
x_test <- model.matrix(Apps ~ ., test_data)[,-1]
y_test <- test_data$Apps
# Linear Regression
lm_model <- lm(Apps ~ ., data = train_data)
lm_pred <- predict(lm_model, newdata = test_data)
lm_mse <- mean((lm_pred - y_test)^2)
cat("Linear Regression Test MSE:", lm_mse, "\n")
## Linear Regression Test MSE: 1373995
# Ridge Regression
ridge_model <- cv.glmnet(x_train, y_train, alpha = 0)
ridge_pred <- predict(ridge_model, s = "lambda.min", newx = x_test)
ridge_mse <- mean((ridge_pred - y_test)^2)
cat("Ridge Regression Test MSE:", ridge_mse, "\n")
## Ridge Regression Test MSE: 2090325
# Lasso Regression
lasso_model <- cv.glmnet(x_train, y_train, alpha = 1)
lasso_pred <- predict(lasso_model, s = "lambda.min", newx = x_test)
lasso_mse <- mean((lasso_pred - y_test)^2)
cat("Lasso Regression Test MSE:", lasso_mse, "\n")
## Lasso Regression Test MSE: 1397441
# Principal Components Regression (PCR)
pcr_model <- pcr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
optimal_pcr <- which.min(summary(pcr_model)$val[,"MSEP"])
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3258 3251 1672 1618 1329 1323 1240
## adjCV 3258 3252 1670 1617 1316 1321 1237
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1236 1222 1157 1158 1159 1162 1166
## adjCV 1234 1227 1153 1156 1156 1159 1163
## 14 comps 15 comps 16 comps 17 comps
## CV 1167 1149 1014 1019
## adjCV 1163 1147 1010 1015
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 32.969 59.69 66.68 72.25 77.31 81.82 85.18 88.34
## Apps 3.259 74.43 76.35 84.40 84.42 86.69 86.79 87.01
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 91.32 93.61 95.48 97.05 98.11 98.90 99.40
## Apps 88.38 88.39 88.47 88.49 88.50 88.51 88.74
## 16 comps 17 comps
## X 99.81 100.00
## Apps 91.48 91.58
pcr_pred <- predict(pcr_model, newdata = test_data, ncomp = optimal_pcr)
pcr_mse <- mean((pcr_pred - y_test)^2)
cat("PCR Test MSE:", pcr_mse, "\n")
## PCR Test MSE: NaN
# Partial Least Squares (PLS)
pls_model <- plsr(Apps ~ ., data = train_data, scale = TRUE, validation = "CV")
optimal_pls <- which.min(summary(pls_model)$val[,"MSEP"])
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 3258 1510 1252 1133 1098 1064 1023
## adjCV 3258 1508 1253 1130 1095 1060 1019
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1015 1012 1013 1012 1012 1013 1013
## adjCV 1011 1009 1009 1008 1008 1009 1009
## 14 comps 15 comps 16 comps 17 comps
## CV 1013 1013 1013 1013
## adjCV 1009 1009 1009 1009
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 26.97 43.83 65.16 69.79 73.63 76.18 79.97 81.98
## Apps 79.33 85.98 88.76 89.67 90.59 91.43 91.54 91.56
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 84.18 86.85 90.18 91.77 93.55 95.40 97.35
## Apps 91.57 91.57 91.57 91.58 91.58 91.58 91.58
## 16 comps 17 comps
## X 99.06 100.00
## Apps 91.58 91.58
pls_pred <- predict(pls_model, newdata = test_data, ncomp = optimal_pls)
pls_mse <- mean((pls_pred - y_test)^2)
cat("PLS Test MSE:", pls_mse, "\n")
## PLS Test MSE: NaN
# Load necessary libraries
library(ISLR2) # For the College data set
library(caret) # For data partitioning and cross-validation
library(glmnet) # For ridge and lasso regression
library(pls) # For PCR and PLS regression
# Load the College data set
data(College)
# Set a random seed for reproducibility
set.seed(123)
# Create a training index (70% training, 30% test)
trainIndex <- createDataPartition(College$Apps, p = 0.7, list = FALSE)
# Split the data
trainData <- College[trainIndex, ]
testData <- College[-trainIndex, ]
# Fit a linear model using least squares
lm_model <- lm(Apps ~ ., data = trainData)
# Predict on the test set
lm_predictions <- predict(lm_model, testData)
# Calculate the test error (Mean Squared Error)
lm_mse <- mean((lm_predictions - testData$Apps)^2)
cat("Linear Model Test Error (MSE):", lm_mse, "\n")
## Linear Model Test Error (MSE): 1882074
# Prepare data for glmnet
x_train <- model.matrix(Apps ~ ., trainData)[, -1]
y_train <- trainData$Apps
x_test <- model.matrix(Apps ~ ., testData)[, -1]
y_test <- testData$Apps
# Perform cross-validation to choose lambda for ridge regression
set.seed(123)
ridge_cv <- cv.glmnet(x_train, y_train, alpha = 0)
# Fit the ridge regression model with the best lambda
ridge_model <- glmnet(x_train, y_train, alpha = 0, lambda = ridge_cv$lambda.min)
# Predict on the test set
ridge_predictions <- predict(ridge_model, s = ridge_cv$lambda.min, newx = x_test)
# Calculate the test error (Mean Squared Error)
ridge_mse <- mean((ridge_predictions - y_test)^2)
cat("Ridge Regression Test Error (MSE):", ridge_mse, "\n")
## Ridge Regression Test Error (MSE): 3270111
# Perform cross-validation to choose lambda for lasso regression
set.seed(123)
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1)
# Fit the lasso model with the best lambda
lasso_model <- glmnet(x_train, y_train, alpha = 1, lambda = lasso_cv$lambda.min)
# Predict on the test set
lasso_predictions <- predict(lasso_model, s = lasso_cv$lambda.min, newx = x_test)
# Calculate the test error (Mean Squared Error)
lasso_mse <- mean((lasso_predictions - y_test)^2)
cat("Lasso Regression Test Error (MSE):", lasso_mse, "\n")
## Lasso Regression Test Error (MSE): 1938601
# Get the number of non-zero coefficient estimates
lasso_coef <- coef(lasso_model, s = lasso_cv$lambda.min)
num_nonzero_coefs <- sum(lasso_coef != 0)
cat("Number of non-zero coefficients in Lasso:", num_nonzero_coefs, "\n")
## Number of non-zero coefficients in Lasso: 17
# Fit the PCR model with cross-validation to choose the number of components
set.seed(123)
pcr_model <- pcr(Apps ~ ., data = trainData, scale = TRUE, validation = "CV")
# Choose the number of components with the lowest cross-validation error
validationplot(pcr_model, val.type = "MSEP")

opt_ncomp <- which.min(pcr_model$validation$PRESS)
# Predict on the test set
pcr_predictions <- predict(pcr_model, testData, ncomp = opt_ncomp)
# Calculate the test error (Mean Squared Error)
pcr_mse <- mean((pcr_predictions - testData$Apps)^2)
cat("PCR Test Error (MSE):", pcr_mse, "\n")
## PCR Test Error (MSE): 1882074
cat("Optimal number of components (M) for PCR:", opt_ncomp, "\n")
## Optimal number of components (M) for PCR: 17
# Fit the PLS model with cross-validation to choose the number of components
set.seed(123)
pls_model <- plsr(Apps ~ ., data = trainData, scale = TRUE, validation = "CV")
# Choose the number of components with the lowest cross-validation error
validationplot(pls_model, val.type = "MSEP")

opt_ncomp_pls <- which.min(pls_model$validation$PRESS)
# Predict on the test set
pls_predictions <- predict(pls_model, testData, ncomp = opt_ncomp_pls)
# Calculate the test error (Mean Squared Error)
pls_mse <- mean((pls_predictions - testData$Apps)^2)
cat("PLS Test Error (MSE):", pls_mse, "\n")
## PLS Test Error (MSE): 1897756
cat("Optimal number of components (M) for PLS:", opt_ncomp_pls, "\n")
## Optimal number of components (M) for PLS: 12
cat("\nSummary of Test Errors:\n")
##
## Summary of Test Errors:
cat("Linear Model Test Error (MSE):", lm_mse, "\n")
## Linear Model Test Error (MSE): 1882074
cat("Ridge Regression Test Error (MSE):", ridge_mse, "\n")
## Ridge Regression Test Error (MSE): 3270111
cat("Lasso Regression Test Error (MSE):", lasso_mse, "\n")
## Lasso Regression Test Error (MSE): 1938601
cat("PCR Test Error (MSE):", pcr_mse, "with M =", opt_ncomp, "\n")
## PCR Test Error (MSE): 1882074 with M = 17
cat("PLS Test Error (MSE):", pls_mse, "with M =", opt_ncomp_pls, "\n")
## PLS Test Error (MSE): 1897756 with M = 12
cat("\nComments on Results:\n")
##
## Comments on Results:
cat("The test errors from the different approaches can be compared to evaluate their relative performance.\n")
## The test errors from the different approaches can be compared to evaluate their relative performance.
cat("It appears that the choice of regularization (ridge or lasso) and dimension reduction (PCR or PLS) impacts the prediction accuracy.\n")
## It appears that the choice of regularization (ridge or lasso) and dimension reduction (PCR or PLS) impacts the prediction accuracy.
cat("The linear model provides a baseline error. Ridge and lasso regression can handle multicollinearity and potentially reduce overfitting.\n")
## The linear model provides a baseline error. Ridge and lasso regression can handle multicollinearity and potentially reduce overfitting.
cat("PCR and PLS are useful for dimensionality reduction, especially when predictors are highly correlated.\n")
## PCR and PLS are useful for dimensionality reduction, especially when predictors are highly correlated.
cat("By comparing the test errors, we can see if any method significantly outperforms the others in terms of predictive accuracy for the number of college applications received.\n")
## By comparing the test errors, we can see if any method significantly outperforms the others in terms of predictive accuracy for the number of college applications received.
###########20%
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tidyverse)
getSymbols("AAPL", src = "yahoo", from = "2020-01-01", to = "2023-01-01")
## [1] "AAPL"
stock_data <- Cl(AAPL)
stock_data <- data.frame(date=index(stock_data), stock_prices=as.numeric(stock_data), row.names=NULL)
colnames(stock_data) <- c("Date", "Close")
ggplot(stock_data, aes(x=Date, y=Close)) +
geom_line() +
labs(title="AAPL Stock Closing Prices", x="Date", y="Closing Price")

model <- lm(Close ~ Date, data=stock_data)
summary(model)
##
## Call:
## lm(formula = Close ~ Date, data = stock_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.209 -11.871 3.358 12.365 36.756
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.389e+03 3.640e+01 -38.16 <2e-16 ***
## Date 8.076e-02 1.935e-03 41.74 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.81 on 754 degrees of freedom
## Multiple R-squared: 0.698, Adjusted R-squared: 0.6976
## F-statistic: 1742 on 1 and 754 DF, p-value: < 2.2e-16
future_dates <- data.frame(Date=seq(as.Date("2023-01-02"), as.Date("2023-12-31"), by="days"))
#Financial data often exhibits non-linear patterns, significant outliers, or shifts due to unforeseen circumstances (like market crashes or geopolitical events). In practice, you might need to incorporate more complex models and techniques (like ARIMA, GARCH, or machine learning methods such as Lasso or Ridge regression) to capture these dynamics more accurately. Moreover, considering additional variables such as volume, open, high, low prices, and macroeconomic indicators could improve the model's predictive power.
# Required Libraries
install.packages("tidyquant")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyquant)
## Loading required package: PerformanceAnalytics
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
# Get Historical Stock Price Data
data <- tq_get("AAPL", from = "2017-01-01", to = Sys.Date())
# Split the data into training and test set
train_size <- floor(0.75*nrow(data))
train_set <- data[1:train_size, ]
test_set <- data[(train_size+1):nrow(data), ]
# Fit the model (Predict Adjusted Close Price with other variables)
fit <- lm(close ~ ., data = train_set[-1])
# Predict on the test set
predictions <- predict(fit, newdata = test_set[-1])
# Print the model summary
summary(fit)
##
## Call:
## lm(formula = close ~ ., data = train_set[-1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50328 -0.12582 0.00066 0.11979 0.54605
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.167e+01 4.089e-01 52.992 < 2e-16 ***
## date -1.147e-03 2.358e-05 -48.620 < 2e-16 ***
## open -2.307e-02 7.306e-03 -3.158 0.00162 **
## high 1.902e-02 8.600e-03 2.212 0.02714 *
## low 5.427e-02 7.435e-03 7.300 4.82e-13 ***
## volume 6.987e-10 1.101e-10 6.347 2.95e-10 ***
## adjusted 9.633e-01 6.602e-03 145.903 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1755 on 1398 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.635e+07 on 6 and 1398 DF, p-value: < 2.2e-16