10. This question should be answered using the Weekly data set, which is part of the ISLR package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1,089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.
library(ISLR)
require(MASS)
## Loading required package: MASS
require(class)
## Loading required package: class
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
plot(Today~Lag1,col="darkred",data = Weekly)
simplelm=lm(Today~Lag1,data = Weekly)
abline(simplelm,lwd=3,col="darkgreen")
pairs(Weekly)
logmod=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,family = "binomial",data = Weekly)
summary(logmod)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Of the six predictors, only Lag2 is statistically significant (p ≈ 0.03). Its positive coefficient indicates that a larger return two weeks ago is associated with slightly higher odds that the market goes Up this week.
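To put the Lag2 estimate on a more interpretable scale, we can exponentiate it into an odds ratio; this is a small additional check, not part of the original output:
exp(coef(logmod)["Lag2"])          # each 1-unit increase in Lag2 multiplies the odds of an Up week by about exp(0.058) ~ 1.06
confint.default(logmod)["Lag2", ]  # Wald confidence interval for the Lag2 coefficient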
probs=predict(logmod,type="response")
preds=rep("down",1089)
preds[probs>0.5]="up"
table(preds,Weekly$Direction)
##
## preds Down Up
## down 54 48
## up 430 557
Based on the confusion matrix, the model predicts Up for the vast majority of weeks: it correctly classifies 557 of the Up weeks, but it also produces 430 false positives, catching only 54 of the 484 Down weeks (about 11% of them).
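The overall training error rate can be computed directly from the same table (a quick sketch; working from the table avoids the mismatch between the lower-case "down"/"up" labels in preds and the "Down"/"Up" levels of Direction):
conf = table(preds, Weekly$Direction)
sum(diag(conf)) / sum(conf)      # overall fraction correct: (54 + 557)/1089, roughly 56%
1 - sum(diag(conf)) / sum(conf)  # overall training error rate, roughly 44%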
hist(probs,breaks=100,col="darkred")
abline(v=mean(probs),lwd=2)
plot(probs,col=ifelse(Weekly$Direction=="Down","red","green"),pch=16)
abline(h=0.5,lwd=3)
In the plot most of the fitted probabilities sit above the 0.5 line, so with a 0.5 cutoff the model would predict Up for the large majority of weeks.
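That impression can be quantified directly (a small sketch):
mean(probs > 0.5)                      # fraction of weeks the 0.5 cutoff labels "up"
table(Weekly$Direction) / nrow(Weekly) # observed Down/Up proportions for comparison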
training.data=Weekly[Weekly$Year<2009,]
test.data=Weekly[Weekly$Year>2008,]
simpglm=glm(Direction~Lag2,data = training.data,family = "binomial")
summary(simpglm)
##
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = training.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
testprobs=predict(simpglm,type="response",newdata=test.data)
testdirs=Weekly$Direction[Weekly$Year>2008]
plot(testprobs,col=ifelse(Weekly$Direction[Weekly$Year>2008]=="Down","red","green"),pch=16)
abline(h=0.5,lwd=3)
testpreds=rep("Down",104)
testpreds[testprobs>0.5]="up"
mean(probs) # mean of the fitted probabilities from the full-data model above, not the test-set predictions
## [1] 0.5555556
table(testpreds,testdirs)
## testdirs
## testpreds Down Up
## Down 9 5
## up 34 56
The test error rate is (34 + 5)/104 = 37.5%, so the model is correct on about 62.5% of the held-out weeks.
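The same figure can be computed from the confusion matrix instead of by hand (a sketch; the diagonal lines up because the row labels sort as "Down", "up"):
conf.test = table(testpreds, testdirs)
1 - sum(diag(conf.test)) / sum(conf.test)  # test error rate: (34 + 5)/104 = 0.375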
lda.fit=lda(Direction~Lag2,data=training.data)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = training.data)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
plot(lda.fit)
lda.pred=predict(lda.fit,newdata=test.data,type="response")
lda.class=lda.pred$class
table(lda.class,test.data$Direction)
##
## lda.class Down Up
## Down 9 5
## Up 34 56
The LDA model gives the same confusion matrix as the logistic regression: it predicts Up for most weeks, and its test error rate is again 37.5%.
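A quick check of that error rate:
mean(lda.class != test.data$Direction)  # (34 + 5)/104 = 0.375, matching the logistic regression above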
qda.fit=qda(Direction~Lag2,data=training.data)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = training.data)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.pred=predict(qda.fit,newdata=test.data,type="response")
qda.class=qda.pred$class
table(qda.class,test.data$Direction)
##
## qda.class Down Up
## Down 0 0
## Up 43 61
This model is the worst so far: it classifies every week as Up. Its test error rate is 43/104 ≈ 41.35%.
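And the corresponding check for QDA:
mean(qda.class != test.data$Direction)  # 43/104, roughly 41.3%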
set.seed(1)
train.X=cbind(training.data$Lag2)
test.X=cbind(test.data$Lag2)
train.Y=cbind(training.data$Direction) # cbind() coerces the Direction factor to 1/2 (1 = Down, 2 = Up), which is why the knn tables below label the classes 1 and 2
knn.pred=knn(train.X,test.X,train.Y,k=1)
table(knn.pred,test.data$Direction)
##
## knn.pred Down Up
## 1 21 30
## 2 22 31
knn3.pred=knn(train.X,test.X,train.Y,k=3)
table(knn3.pred,test.data$Direction)
##
## knn3.pred Down Up
## 1 16 19
## 2 27 42
With k = 1 the test error is (30 + 22)/104 = 50%; raising k to 3 lowers it to (19 + 27)/104 ≈ 44.2%, a modest improvement.
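Instead of trying only k = 1 and k = 3, we can scan a range of k values, as is done for the Auto data later in this document; a sketch (note that because train.Y was built with cbind(), the classes are coded 1 = Down, 2 = Up):
knn.errors = rep(NA, 30)
for (k in 1:30) {
  set.seed(1)
  pred.k = knn(train.X, test.X, train.Y, k = k)
  # pred.k is a factor with levels "1"/"2"; compare it to Direction on the same 1/2 coding
  knn.errors[k] = mean(as.integer(pred.k) != as.integer(test.data$Direction))
}
which.min(knn.errors)  # k with the lowest test error
min(knn.errors)        # the corresponding error rate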
qda.fit2 = qda(Direction~Lag1 + Lag2 + Lag4, data= training.data)
qda.fit2
## Call:
## qda(Direction ~ Lag1 + Lag2 + Lag4, data = training.data)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag1 Lag2 Lag4
## Down 0.289444444 -0.03568254 0.15925624
## Up -0.009213235 0.26036581 0.09220956
qda.pred2 = predict(qda.fit2, newdata=test.data, type="response")
qda.class2 = qda.pred2$class
table(qda.class2, test.data$Direction)
##
## qda.class2 Down Up
## Down 9 20
## Up 34 41
lda.fit2 = lda(Direction~Lag1 + Lag2 + Lag4, data= training.data)
lda.fit2
## Call:
## lda(Direction ~ Lag1 + Lag2 + Lag4, data = training.data)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag1 Lag2 Lag4
## Down 0.289444444 -0.03568254 0.15925624
## Up -0.009213235 0.26036581 0.09220956
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.2984478
## Lag2 0.2960224
## Lag4 -0.1113485
lda.pred2 = predict(lda.fit2, newdata=test.data, type="response")
lda.class2 = lda.pred2$class
table(lda.class2, test.data$Direction)
##
## lda.class2 Down Up
## Down 9 7
## Up 34 54
Adding Lag1 and Lag4 does not help: the expanded QDA model misclassifies (20 + 34)/104 ≈ 51.9% of the test weeks and the expanded LDA model (7 + 34)/104 ≈ 39.4%, so neither improves on the Lag2-only models above.
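For completeness, a direct check of those two error rates:
mean(qda.class2 != test.data$Direction)  # (20 + 34)/104, roughly 51.9%
mean(lda.class2 != test.data$Direction)  # (7 + 34)/104, roughly 39.4%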
11. In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.
library(ISLR)
data("Auto")
mpg01<-rep(0,length(Auto$mpg))
mpg01[Auto$mpg>median(Auto$mpg)]<-1
Auto<-data.frame(Auto,mpg01)
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
## mpg01
## Min. :0.0
## 1st Qu.:0.0
## Median :0.5
## Mean :0.5
## 3rd Qu.:1.0
## Max. :1.0
##
pairs(Auto[,-9])
par(mfrow=c(2,3))
boxplot(cylinders~mpg01,data = Auto,main="cylinders vs mpg01")
boxplot(displacement ~ mpg01, data = Auto, main = "Displacement vs mpg01")
boxplot(horsepower ~ mpg01, data = Auto, main = "Horsepower vs mpg01")
boxplot(weight ~ mpg01, data = Auto, main = "Weight vs mpg01")
boxplot(acceleration ~ mpg01, data = Auto, main = "Acceleration vs mpg01")
boxplot(year ~ mpg01, data = Auto, main = "Year vs mpg01")
The boxplots show a clear association between mpg01 and cylinders, weight, displacement, and horsepower. The scatterplot matrix is harder to read because mpg01 is binary.
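A numeric complement to the boxplots is the correlation of each variable with mpg01; a small sketch (dropping the name factor in column 9):
cor(Auto[, -9])[, "mpg01"]  # cylinders, displacement, horsepower and weight show the strongest (negative) correlations with mpg01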
set.seed(123)
train <- sample(1:dim(Auto)[1], dim(Auto)[1]*.7, rep=FALSE)
test <- -train
training_data<- Auto[train, ]
testing_data= Auto[test, ]
mpg01.test <- mpg01[test]
These commands split the Auto data into a training set containing 70% of the observations and a test set containing the remaining 30%.
lda_model <- lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = training_data)
lda_model
## Call:
## lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = training_data)
##
## Prior probabilities of groups:
## 0 1
## 0.4963504 0.5036496
##
## Group means:
## cylinders weight displacement horsepower
## 0 6.786765 3641.022 275.2941 130.96324
## 1 4.188406 2314.000 114.5290 78.00725
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.3974647924
## weight -0.0009670704
## displacement -0.0029615583
## horsepower 0.0049004106
lda_pred = predict(lda_model, testing_data)
names(lda_pred)
## [1] "class" "posterior" "x"
pred.lda <- predict(lda_model, testing_data)
table(pred.lda$class, mpg01.test)
## mpg01.test
## 0 1
## 0 50 3
## 1 10 55
mean(pred.lda$class != mpg01.test)
## [1] 0.1101695
From the output, the LDA test error is 11.02%.
qda_model = qda(mpg01 ~ cylinders + horsepower + weight + acceleration, data=training_data)
qda_model
## Call:
## qda(mpg01 ~ cylinders + horsepower + weight + acceleration, data = training_data)
##
## Prior probabilities of groups:
## 0 1
## 0.4963504 0.5036496
##
## Group means:
## cylinders horsepower weight acceleration
## 0 6.786765 130.96324 3641.022 14.55588
## 1 4.188406 78.00725 2314.000 16.55072
qda.class=predict(qda_model, testing_data)$class
table(qda.class, testing_data$mpg01)
##
## qda.class 0 1
## 0 53 4
## 1 7 54
mean(qda.class != testing_data$mpg01)
## [1] 0.09322034
From the output, the QDA test error with these predictors is 9.32%.
glm_model <- glm(mpg01 ~ cylinders + weight + displacement + horsepower, data = training_data, family = binomial)
summary(glm_model)
##
## Call:
## glm(formula = mpg01 ~ cylinders + weight + displacement + horsepower,
## family = binomial, data = training_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.44120 -0.17870 0.08712 0.31147 3.05303
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.8103006 2.0819718 5.673 1.41e-08 ***
## cylinders 0.1869071 0.3972245 0.471 0.63797
## weight -0.0020251 0.0008573 -2.362 0.01817 *
## displacement -0.0164493 0.0095899 -1.715 0.08629 .
## horsepower -0.0443408 0.0172072 -2.577 0.00997 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 379.83 on 273 degrees of freedom
## Residual deviance: 138.27 on 269 degrees of freedom
## AIC: 148.27
##
## Number of Fisher Scoring iterations: 7
probs <- predict(glm_model, testing_data, type = "response")
pred.glm <- rep(0, length(probs))
pred.glm[probs > 0.5] <- 1
table(pred.glm, mpg01.test)
## mpg01.test
## pred.glm 0 1
## 0 53 6
## 1 7 52
mean(pred.glm != mpg01.test)
## [1] 0.1101695
The logistic regression gives a test error of 11.02% as well, matching the LDA model above.
str(Auto)
## 'data.frame': 392 obs. of 10 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## $ mpg01 : num 0 0 0 0 0 0 0 0 0 0 ...
data = scale(Auto[,-c(9,10)])
set.seed(1234)
train <- sample(1:dim(Auto)[1], 392*.7, rep=FALSE)
test <- -train
training_data = data[train,c("cylinders","horsepower","weight","acceleration")]
testing_data = data[test, c("cylinders", "horsepower","weight","acceleration")]
train.mpg01 = Auto$mpg01[train]
test.mpg01= Auto$mpg01[test]
library(class)
set.seed(1234)
knn_pred_y = knn(training_data, testing_data, train.mpg01, k = 1)
table(knn_pred_y, test.mpg01)
## test.mpg01
## knn_pred_y 0 1
## 0 57 5
## 1 7 49
mean(knn_pred_y != test.mpg01)
## [1] 0.1016949
With k = 1 the test error is 10.17%. To find the k that gives the lowest test error, we can loop over a range of values:
knn_pred_y = NULL
error_rate = NULL
for(i in 1:dim(testing_data)[1]){
set.seed(1234)
knn_pred_y = knn(training_data,testing_data,train.mpg01,k=i)
error_rate[i] = mean(test.mpg01 != knn_pred_y)}
min_error_rate = min(error_rate)
print(min_error_rate)
## [1] 0.09322034
K = which(error_rate == min_error_rate)
print(K)
## [1] 4
The lowest test error found is 9.32%, achieved at k = 4.
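Plotting the whole error curve makes it easier to see how sensitive KNN is to the choice of k (a sketch using the error_rate vector computed above):
plot(error_rate, type = "b", xlab = "k", ylab = "test error rate",
     main = "KNN test error vs k (Auto)")
abline(v = K, lty = 2)  # mark the best k found above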
13. Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median.
library(MASS)
data("Boston")
crim01 <- rep(0, length(Boston$crim))
crim01[Boston$crim > median(Boston$crim)] <- 1
Boston <- data.frame(Boston, crim01)
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv crim01
## Min. : 1.73 Min. : 5.00 Min. :0.0
## 1st Qu.: 6.95 1st Qu.:17.02 1st Qu.:0.0
## Median :11.36 Median :21.20 Median :0.5
## Mean :12.65 Mean :22.53 Mean :0.5
## 3rd Qu.:16.95 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :37.97 Max. :50.00 Max. :1.0
set.seed(1234)
train <- sample(1:dim(Boston)[1], dim(Boston)[1]*.7, rep=FALSE)
test <- -train
Boston.train <- Boston[train, ]
Boston.test <- Boston[test, ]
crim01.test <- crim01[test]
fit.glm1 <- glm(crim01 ~ . - crim01 - crim, data = Boston, family = binomial)
fit.glm1
##
## Call: glm(formula = crim01 ~ . - crim01 - crim, family = binomial,
## data = Boston)
##
## Coefficients:
## (Intercept) zn indus chas nox rm
## -34.103704 -0.079918 -0.059389 0.785327 48.523782 -0.425596
## age dis rad tax ptratio black
## 0.022172 0.691400 0.656465 -0.006412 0.368716 -0.013524
## lstat medv
## 0.043862 0.167130
##
## Degrees of Freedom: 505 Total (i.e. Null); 492 Residual
## Null Deviance: 701.5
## Residual Deviance: 211.9 AIC: 239.9
fit.glm <- glm(crim01 ~ nox + indus + age + rad, data = Boston, family = binomial)
probs <- predict(fit.glm, Boston.test, type = "response")
pred.glm <- rep(0, length(probs))
pred.glm[probs > 0.5] <- 1
table(pred.glm, crim01.test)
## crim01.test
## pred.glm 0 1
## 0 68 18
## 1 7 59
mean(pred.glm != crim01.test)
## [1] 0.1644737
For the logistic regression the test error is 16.45%. (Note that fit.glm was fit on the full Boston data rather than only Boston.train, so this test error is likely somewhat optimistic.)
fit.lda <- lda(crim01 ~ nox + indus + age + rad , data = Boston)
pred.lda <- predict(fit.lda, Boston.test)
table(pred.lda$class, crim01.test)
## crim01.test
## 0 1
## 0 72 25
## 1 3 52
mean(pred.lda$class != crim01.test)
## [1] 0.1842105
For the LDA model the test error is 18.42% (again, the model was fit on the full Boston data).
data = scale(Boston[,-c(1,15)])
set.seed(1234)
train <- sample(1:dim(Boston)[1], dim(Boston)[1]*.7, rep=FALSE)
test <- -train
training_data = data[train, c("nox" , "indus" , "age" , "rad")]
testing_data = data[test, c("nox" , "indus" , "age" , "rad")]
train.crime01 = Boston$crim01[train]
test.crime01= Boston$crim01[test]
library(class)
set.seed(1234)
knn_pred_y = knn(training_data, testing_data, train.crime01, k = 1)
table(knn_pred_y, test.crime01)
## test.crime01
## knn_pred_y 0 1
## 0 67 6
## 1 8 71
mean(knn_pred_y != test.crime01)
## [1] 0.09210526
knn_pred_y = NULL
error_rate = NULL
for(i in 1:dim(testing_data)[1]){
set.seed(1234)
knn_pred_y = knn(training_data,testing_data,train.crime01,k=i)
error_rate[i] = mean(test.crime01 != knn_pred_y)}
min_error_rate = min(error_rate)
print(min_error_rate)
## [1] 0.06578947
K = which(error_rate == min_error_rate)
print(K)
## [1] 4
With k = 1 the test error is 9.21%; scanning over k, the lowest test error found is 6.58%, at k = 4.
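As with the Auto data, the full error curve can be plotted to see how the Boston KNN error varies with k (a sketch):
plot(error_rate, type = "b", xlab = "k", ylab = "test error rate",
     main = "KNN test error vs k (Boston)")
abline(v = K, lty = 2)              # best k found above
abline(h = min_error_rate, lty = 3) # lowest test error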