library(MASS)
library(e1071)
library(class)
data(Boston)
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
# Create a binary response variable indicating whether the crime rate is above or below the median
Boston$high_crime <- ifelse(Boston$crim > median(Boston$crim), "High", "Low")
# Convert to factor
Boston$high_crime <- factor(Boston$high_crime, levels = c("Low", "High"))
By applying the LR model we can see essential details such as standard errors, z-values, and p-values. These details are significant and influence the predictor of crime occurence.
logit_model <- glm(high_crime ~ ., data = Boston, family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logit_model)
##
## Call:
## glm(formula = high_crime ~ ., family = binomial, data = Boston)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.437e+01 1.202e+05 0.000 1.000
## crim 1.083e+03 1.773e+04 0.061 0.951
## zn 2.194e+00 5.856e+01 0.037 0.970
## indus -2.510e+00 9.002e+02 -0.003 0.998
## chas 4.489e+00 1.014e+04 0.000 1.000
## nox -2.585e+02 1.458e+05 -0.002 0.999
## rm -3.953e+01 1.653e+03 -0.024 0.981
## age 3.437e-01 5.798e+01 0.006 0.995
## dis -1.742e+01 2.146e+03 -0.008 0.994
## rad -5.933e+00 2.642e+03 -0.002 0.998
## tax 1.639e-01 1.078e+02 0.002 0.999
## ptratio 5.525e+00 3.640e+03 0.002 0.999
## black 3.266e-02 1.208e+01 0.003 0.998
## lstat -1.687e+00 3.560e+02 -0.005 0.996
## medv 2.358e+00 5.382e+02 0.004 0.997
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7.0146e+02 on 505 degrees of freedom
## Residual deviance: 2.8371e-05 on 491 degrees of freedom
## AIC: 30
##
## Number of Fisher Scoring iterations: 25
Linear Discriminant Analysis (LDA) is used when we need to classify observations into predefined classes based on multiple input features, particularly when the assumptions of normally distributed predictors and equal class covariances hold. It is effective for problems where the relationships between features and class labels are approximately linear. Through applying LDA to these dataset, we get valuable insights on covariances matrices and means, that allow us to better evaluate for crime rates.
# Fit LDA model using all predictors
lda_model <- lda(high_crime ~ ., data = Boston)
lda_model
## Call:
## lda(high_crime ~ ., data = Boston)
##
## Prior probabilities of groups:
## Low High
## 0.5 0.5
##
## Group means:
## crim zn indus chas nox rm age
## Low 0.0955715 21.525692 7.002292 0.05138340 0.4709711 6.394395 51.31028
## High 7.1314756 1.201581 15.271265 0.08695652 0.6384190 6.174874 85.83953
## dis rad tax ptratio black lstat medv
## Low 5.091596 4.158103 305.7431 17.90711 388.7061 9.419486 24.94941
## High 2.498489 14.940711 510.7312 19.00395 324.6420 15.886640 20.11621
##
## Coefficients of linear discriminants:
## LD1
## crim 0.0046376592
## zn -0.0056431194
## indus 0.0126159626
## chas -0.0592836851
## nox 8.1826206579
## rm 0.0874007870
## age 0.0112829040
## dis 0.0453643651
## rad 0.0699133176
## tax -0.0008444666
## ptratio 0.0513806507
## black -0.0009892799
## lstat 0.0143945059
## medv 0.0386990631
plot(lda_model)
## Naive Bayes We use for classification tasks where we need to predict
the category or class of a given set of features. This theorem allow us
to assess independence between the predictor.
crime_rate_above_median <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
naive_bayes_model <- naiveBayes(as.factor(crime_rate_above_median) ~ ., data = Boston)
naive_bayes_model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.5 0.5
##
## Conditional probabilities:
## crim
## Y [,1] [,2]
## 0 0.0955715 0.06281773
## 1 7.1314756 11.10912294
##
## zn
## Y [,1] [,2]
## 0 21.525692 29.319808
## 1 1.201581 4.798611
##
## indus
## Y [,1] [,2]
## 0 7.002292 5.514454
## 1 15.271265 5.439010
##
## chas
## Y [,1] [,2]
## 0 0.05138340 0.2212161
## 1 0.08695652 0.2823299
##
## nox
## Y [,1] [,2]
## 0 0.4709711 0.05559789
## 1 0.6384190 0.09870365
##
## rm
## Y [,1] [,2]
## 0 6.394395 0.5556856
## 1 6.174874 0.8101381
##
## age
## Y [,1] [,2]
## 0 51.31028 25.88190
## 1 85.83953 17.87423
##
## dis
## Y [,1] [,2]
## 0 5.091596 2.081304
## 1 2.498489 1.085521
##
## rad
## Y [,1] [,2]
## 0 4.158103 1.659121
## 1 14.940711 9.529843
##
## tax
## Y [,1] [,2]
## 0 305.7431 87.4837
## 1 510.7312 167.8553
##
## ptratio
## Y [,1] [,2]
## 0 17.90711 1.811216
## 1 19.00395 2.346947
##
## black
## Y [,1] [,2]
## 0 388.7061 22.83774
## 1 324.6420 118.83084
##
## lstat
## Y [,1] [,2]
## 0 9.419486 4.923497
## 1 15.886640 7.546922
##
## medv
## Y [,1] [,2]
## 0 24.94941 7.232047
## 1 20.11621 10.270362
##
## high_crime
## Y Low High
## 0 1 0
## 1 0 1
Through the use and training of the KNN model we can evaluate the accuracy of how well the model predicts crimes. Based on the KNN model we can attest that the model is a valuable and accurate predictor for crime rate.
# Split the dataset into train and test sets
set.seed(123) # for reproducibility
train_index <- sample(1:nrow(Boston), 0.7*nrow(Boston)) # 70% train, 30% test
train_data <- Boston[train_index, ]
test_data <- Boston[-train_index, ]
# Fit KNN model using selected predictors
k <- 5 # value of k for KNN
knn_model <- knn(train = train_data[, c("age", "rad", "tax")],
test = test_data[, c("age", "rad", "tax")],
cl = train_data$high_crime,
k = k)
# Print the accuracy of the KNN model
accuracy <- mean(knn_model == test_data$high_crime)*100
formatted_mean_accuracy <- sprintf("%.2f", accuracy)
cat("Mean Accuracy:", formatted_mean_accuracy,"% \n")
## Mean Accuracy: 91.45 %
library(ISLR)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
##
## Boston
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
## The following object is masked from 'package:MASS':
##
## Boston
attach(Default)
summary(Default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
dim(Default)
## [1] 10000 4
logR = glm(default ~ balance + income, data=Default, family='binomial')
summary(logR)
##
## Call:
## glm(formula = default ~ balance + income, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
default.num = sample(dim(Default)[1],dim(Default)[1]*.75)
default.train = subset(Default[default.num,])
val.default = subset(Default[-default.num,])
logr.default = glm(default ~ balance + income, family = binomial, data = default.train)
summary(logr.default)
##
## Call:
## glm(formula = default ~ balance + income, family = binomial,
## data = default.train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.178e+01 5.152e-01 -22.868 < 2e-16 ***
## balance 5.846e-03 2.719e-04 21.502 < 2e-16 ***
## income 1.863e-05 5.825e-06 3.199 0.00138 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2192.2 on 7499 degrees of freedom
## Residual deviance: 1141.8 on 7497 degrees of freedom
## AIC: 1147.8
##
## Number of Fisher Scoring iterations: 8
log.prob.default = predict(logr.default, val.default, type = "response")
log.pred.default = rep("No", length(log.prob.default))
log.pred.default[log.prob.default > 0.5] ="Yes"
table(log.pred.default, val.default$default)
##
## log.pred.default No Yes
## No 2404 61
## Yes 13 22
mean(log.pred.default != val.default$default)
## [1] 0.0296
With a .4 validate and a .65 train the errorrate is of .028
default.num1 = sample(dim(Default)[1],dim(Default)[1]*.65)
default.train1 = subset(Default[default.num1,])
val.default1 = subset(Default[-default.num1,])
logr.default1 = glm(default ~ balance + income, family = binomial, data = default.train1)
log.prob.default1 = predict(logr.default1, val.default1, type = "response")
log.pred.default1 = rep("No", length(log.prob.default1))
log.pred.default1[log.prob.default1 > 0.4] ="Yes"
table(log.pred.default1, val.default1$default)
##
## log.pred.default1 No Yes
## No 3354 69
## Yes 30 47
mean(log.pred.default1 != val.default1$default)
## [1] 0.02828571
With a .6 validate and a .8 train, the error train is .029
default.num2 = sample(dim(Default)[1],dim(Default)[1]*.8)
default.train2 = subset(Default[default.num2,])
val.default2 = subset(Default[-default.num2,])
logr.default2 = glm(default ~ balance + income, family = binomial, data = default.train2)
log.prob.default2 = predict(logr.default2, val.default2, type = "response")
log.pred.default2 = rep("No", length(log.prob.default2))
log.pred.default2[log.prob.default2 > 0.6] ="Yes"
table(log.pred.default2, val.default2$default)
##
## log.pred.default2 No Yes
## No 1924 57
## Yes 4 15
mean(log.pred.default2 != val.default2$default)
## [1] 0.0305
With a .4 validate and a .4 train, the error rate is .027
default.num3 = sample(dim(Default)[1],dim(Default)[1]*.4)
default.train3 = subset(Default[default.num3,])
val.default3 = subset(Default[-default.num3,])
logr.default3 = glm(default ~ balance + income, family = binomial, data = default.train3)
log.prob.default3 = predict(logr.default3, val.default3, type = "response")
log.pred.default3 = rep("No", length(log.prob.default3))
log.pred.default3[log.prob.default3 > 0.4] ="Yes"
table(log.pred.default3, val.default3$default)
##
## log.pred.default3 No Yes
## No 5748 115
## Yes 57 80
mean(log.pred.default3 != val.default3$default)
## [1] 0.02866667
Conclusion is that the second split has the highest error rate with .029, and the third split has the lowest error rate with . 027.
When comparing to the third split of the previous question (which contained the lowest error rate), by including a dummy variable we were able to establish a significant reduction in the test error rate. Reducing the error rate from .027 to .024.
student1 = ifelse(student == "Yes", 1, 0)
Default1 = data.frame(Default, student1)
def.logr.num = sample(dim(Default1)[1],dim(Default1)[1]*.65)
def.logr.train = subset(Default1[def.logr.num,])
def.logr.val = subset(Default1[-def.logr.num,])
logr.d = glm(default ~ income + balance + student1, data = Default1, family = binomial)
logr.prob.def = predict(logr.d, def.logr.val, type = "response")
logr.pred.def = rep("No", length(logr.prob.def))
logr.pred.def[logr.prob.def > 0.6] ="Yes"
table(logr.pred.def, def.logr.val$default)
##
## logr.pred.def No Yes
## No 3391 80
## Yes 6 23
mean(logr.pred.def != def.logr.val$default)
## [1] 0.02457143
install.packages("glmnet")
## Installing package into '/Users/HectorR/Library/R/arm64/4.3/library'
## (as 'lib' is unspecified)
##
## The downloaded binary packages are in
## /var/folders/xs/w6cgj6yj08v91w3_v94fs_1m0000gp/T//RtmpI0upDZ/downloaded_packages
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
data("College")
attach(College)
set.seed(2)
train <- sample(1:nrow(College), nrow(College)/2)
test <- -train
y.test <- Apps[test]
lm.fit <- lm(Apps ~., data=College, subset = train)
lm.pred <- predict(lm.fit, College[test,])
lm.error <- mean((lm.pred - y.test)^2)
lm.error
## [1] 1093608
The test MSE is 1093608. ## (c) Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.
x <- model.matrix(Apps~., data=College)[, -1]
x.train <- x[train,]
y <- College$Apps
y.train <- y[train]
set.seed(7)
ridge.fit <- glmnet(x.train,y.train,alpha=0)
cv.ridge.fit <- cv.glmnet(x.train,y.train,alpha=0)
plot(cv.ridge.fit)
bestlam <- cv.ridge.fit$lambda.min
bestlam
## [1] 424.2704
The best lambda chosen by cross validation is 424.2704
ridge.pred <- predict(ridge.fit, s=bestlam, newx=x[test,])
ridge.error <- mean((ridge.pred-y.test)^2)
ridge.error
## [1] 1138030
The test MSE is 1138030, and is higher than linear regression. ## (d) Fit a lasso model on the training set, with λ chosen by cross-validation. Report the test error obtained, along with the number of non-zero coefficient estimates.
set.seed(2)
lasso.fit <- glmnet(x.train,y.train,alpha=1)
cv.lasso.fit<- cv.glmnet(x.train,y.train,alpha=1)
bestlam.lasso <- cv.lasso.fit$lambda.min
bestlam.lasso
## [1] 15.97351
The best Lambda value chosen by cross validation for the Lasso Model is 15.97351.
lasso.pred <- predict(lasso.fit, s=bestlam.lasso, newx=x[test,])
lasso.error <- mean((lasso.pred-y.test)^2)
lasso.error
## [1] 1045922
lasso.c <- predict(lasso.fit,type="coefficients", s=bestlam)[1:17,]
length(lasso.c[lasso.c != 0])
## [1] 3
The test MSE for Lasso Regression is 1045922.As for the non-zero coefficient estimates, there is 3. ## (e) Fit a PCR model on the training set, with M chosen by cross-validation. Report the test error obtained, along with the value of M selected by cross-validation.
install.packages("pls")
## Installing package into '/Users/HectorR/Library/R/arm64/4.3/library'
## (as 'lib' is unspecified)
##
## The downloaded binary packages are in
## /var/folders/xs/w6cgj6yj08v91w3_v94fs_1m0000gp/T//RtmpI0upDZ/downloaded_packages
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(1)
pcr.fit <- pcr(Apps~., data=College, subset=train, scale=TRUE, validation ="CV")
summary(pcr.fit)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4440 4463 2362 2385 2092 1835 1832
## adjCV 4440 4465 2359 2384 2066 1824 1823
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1829 1821 1753 1764 1771 1772 1784
## adjCV 1818 1813 1746 1757 1764 1764 1777
## 14 comps 15 comps 16 comps 17 comps
## CV 1778 1747 1401 1338
## adjCV 1773 1701 1385 1322
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 31.4816 57.40 64.84 70.54 75.80 80.23 84.04 87.64
## Apps 0.1398 72.45 72.50 80.45 84.74 84.77 85.06 85.16
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 90.87 93.29 95.33 96.98 98.05 98.75 99.37
## Apps 85.93 86.16 86.21 86.32 86.40 86.61 92.49
## 16 comps 17 comps
## X 99.85 100.00
## Apps 93.65 94.32
validationplot(pcr.fit, val.type = "MSEP")
pcr.pred <- predict(pcr.fit,x[test,], ncomp=17)
pcr.error <- mean((pcr.pred-y.test)^2)
pcr.error
## [1] 1093608
From the graph we can observe the lowest cross validation of occurs when all 17 components are included. The test MSE is 1093608.
set.seed(1)
pls.fit <- plsr(Apps~., data=College, subset=train, scale=TRUE, validation="CV")
summary(pls.fit)
## Data: X dimension: 388 17
## Y dimension: 388 1
## Fit method: kernelpls
## Number of components considered: 17
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 4440 2153 1701 1711 1650 1498 1413
## adjCV 4440 2147 1677 1704 1615 1470 1392
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 1378 1358 1356 1342 1338 1337 1337
## adjCV 1361 1342 1339 1326 1322 1321 1321
## 14 comps 15 comps 16 comps 17 comps
## CV 1338 1338 1338 1338
## adjCV 1322 1322 1322 1322
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 25.76 31.99 61.88 65.00 68.68 73.82 78.33 81.38
## Apps 77.92 87.80 88.38 92.06 93.59 93.92 94.01 94.09
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 83.24 85.59 87.76 90.49 92.69 94.27 97.03
## Apps 94.20 94.27 94.30 94.31 94.32 94.32 94.32
## 16 comps 17 comps
## X 99.21 100.00
## Apps 94.32 94.32
validationplot(pls.fit, val.type = "MSEP")
pls.pred <- predict(pls.fit,x[test,],ncomp=12)
pls.error <- mean((pls.pred-y.test)^2)
pls.error
## [1] 1085346
By fitting a PLS model we can see that the lowest cross validation error occurs when 12 components are included. The test MSE for the PLS is 1085346, which lower than all the previous MSE tested. by the other models.
##(g) Comment on the results obtained. How accurately can we predict the number of college applications received? Is there much difference among the test errors resulting from these five approaches?
errors <- c(lm.error, ridge.error, lasso.error, pcr.error, pls.error)
names(errors) <- c("lm", "ridge", "lasso", "pcr", "pls")
barplot(sort(errors))
print(sort(errors))
## lasso pls pcr lm ridge
## 1045922 1085346 1093608 1093608 1138030
As displayed, the Lasso model provides with the lowest test MSE. Then the Lasso model , is followed by PLS, then PCR, LM, and lastly we have Ridge with the highest test MSE. All models can be used to predict the number of college applications, however ridge would be significantly much less inaccurate by comparison to the other models (especially Lasso Model).
install.packages("quantmod")
## Installing package into '/Users/HectorR/Library/R/arm64/4.3/library'
## (as 'lib' is unspecified)
##
## The downloaded binary packages are in
## /var/folders/xs/w6cgj6yj08v91w3_v94fs_1m0000gp/T//RtmpI0upDZ/downloaded_packages
install.packages("glmnet")
## Installing package into '/Users/HectorR/Library/R/arm64/4.3/library'
## (as 'lib' is unspecified)
##
## The downloaded binary packages are in
## /var/folders/xs/w6cgj6yj08v91w3_v94fs_1m0000gp/T//RtmpI0upDZ/downloaded_packages
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(glmnet)
getSymbols("AAPL", src = "yahoo", from = "2020-01-01", to = "2023-01-01")
## [1] "AAPL"
head(AAPL)
## AAPL.Open AAPL.High AAPL.Low AAPL.Close AAPL.Volume AAPL.Adjusted
## 2020-01-02 74.0600 75.1500 73.7975 75.0875 135480400 72.96044
## 2020-01-03 74.2875 75.1450 74.1250 74.3575 146322800 72.25112
## 2020-01-06 73.4475 74.9900 73.1875 74.9500 118387200 72.82686
## 2020-01-07 74.9600 75.2250 74.3700 74.5975 108872000 72.48434
## 2020-01-08 74.2900 76.1100 74.2900 75.7975 132079200 73.65034
## 2020-01-09 76.8100 77.6075 76.5500 77.4075 170108400 75.21473
With the data set loaded, we’ll proceed to calculate daily returns. This will be our targeted value.
# Calculate daily returns
AAPL$Return <- dailyReturn(AAPL$AAPL.Adjusted)
# Lag the returns to create a prediction target
AAPL$LagReturn <- lag(AAPL$Return, -1)
# Remove NA values
AAPL <- na.omit(AAPL)
# Split the data into training and testing sets
set.seed(123)
train_index <- sample(seq_len(nrow(AAPL)), size = 0.7 * nrow(AAPL))
train_data <- AAPL[train_index, ]
test_data <- AAPL[-train_index, ]
# Extract the predictor and response variables
x_train <- as.matrix(train_data[, c("AAPL.Open", "AAPL.High", "AAPL.Low", "AAPL.Close", "AAPL.Volume")])
y_train <- train_data$LagReturn
x_test <- as.matrix(test_data[, c("AAPL.Open", "AAPL.High", "AAPL.Low", "AAPL.Close", "AAPL.Volume")])
y_test <- test_data$LagReturn
# Fit a linear regression model
linear_model <- lm(LagReturn ~ AAPL.Open + AAPL.High + AAPL.Low + AAPL.Close + AAPL.Volume, data = train_data)
# Predict on the test set
linear_predictions <- predict(linear_model, newdata = test_data)
# Evaluate the model
linear_mse <- mean((linear_predictions - y_test)^2)
print(paste("Linear Regression MSE:", linear_mse))
## [1] "Linear Regression MSE: 0.000557104812905552"
# Fit a Lasso regression model using cross-validation to find the best lambda
lasso_model <- cv.glmnet(x_train, y_train, alpha = 1)
# Predict on the test set
lasso_predictions <- predict(lasso_model, s = lasso_model$lambda.min, newx = x_test)
# Evaluate the model
lasso_mse <- mean((lasso_predictions - y_test)^2)
print(paste("Lasso Regression MSE:", lasso_mse))
## [1] "Lasso Regression MSE: 0.000556797136308034"
By comparing the Mean Squared Error (MSE) of both models, we can determine which model performs better on the test set. Generally, Lasso regression is useful when we expect some predictors to be irrelevant because it can perform variable selection by shrinking the coefficients of less important variables to zero.In this case we would try to implement Lasso Model as it would present a slightly lower MSE for our predictor.