library(ISLR)
weekly = Weekly
attach(weekly)
summary(weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
cor(weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
pairs(weekly)
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data=Weekly, family="binomial")
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lag2 is the only predictor that appears to be satistically significant.
glm.probs = predict(glm.fit, type="response")
glm.pred=rep("Down",1089)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction)
## Direction
## glm.pred Down Up
## Down 54 48
## Up 430 557
(557+54)/1089
## [1] 0.5610652
There are a predominance of Up prediction. The model predicts well the Up direction, but it predict poorly the Down direction.
trainset = (Weekly$Year<=2008)
testset = Weekly[!trainset,]
dim(testset)
## [1] 104 9
glm.fit.d = glm(Direction ~ Lag2, data=Weekly, subset=trainset, family="binomial")
glm.probs.d = predict(glm.fit.d, type="response", newdata=testset)
glm.preds.d=rep("Down",104)
glm.preds.d[glm.probs.d>.5] = "Up"
(9+56)/104
## [1] 0.625
library(MASS)
lda.fit.e = lda(Direction ~ Lag2, data=Weekly, subset=trainset)
lda.preds.e = predict(lda.fit.e, newdata=testset)
table(lda.preds.e$class, testset$Direction)
##
## Down Up
## Down 9 5
## Up 34 56
(56+9)/104
## [1] 0.625
qda.fit.f = qda(Direction ~ Lag2, data=Weekly, subset=trainset)
qda.preds.f = predict(qda.fit.f, newdata=testset)
table(qda.preds.f$class,testset$Direction)
##
## Down Up
## Down 0 0
## Up 43 61
library(class)
set.seed(1)
train.g = Weekly[trainset, c("Lag2", "Direction")]
knn.pred = knn(train=data.frame(train.g$Lag2), test=data.frame(testset$Lag2), cl=train.g$Direction, k=1)
table(knn.pred, testset$Direction)
##
## knn.pred Down Up
## Down 21 30
## Up 22 31
52/104
## [1] 0.5
Which of these methods appears to provide the best results on this data? The Logistic Regression and LDA models appear to provide the best results on this data.
Experiment with different combinations of predictors, includ- ing possible transformations and interactions, for each of the methods. Report the variables, method, and associated confu- sion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.
set.seed(1)
results <- data.frame(k=1:50, acc=NA)
for(i in 1:50){
knn.pred = knn(train=data.frame(train.g$Lag2), test=data.frame(testset$Lag2), cl=train.g$Direction, k=i)
cm <- table(testset$Direction, knn.pred)
acc <- (cm["Down", "Down"] + cm["Up", "Up"])/sum(cm)
results$acc[i] <- acc
}
plot(x=results$k, y=results$acc, type="l", xlab="K", ylab="accuracy", ylim=c(.4,.65))
The K doesn’t seem to affect the accuracy values. ## 11. In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set. a. Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median. You can compute the median using the median() function. Note you may find it helpful to use the data.frame() function to create a single data set containing both mpg01 and the other Auto variables.
auto=Auto
attach(auto)
dim(auto)
## [1] 392 9
auto$mpg01 = 0
auto$mpg01[mpg>median(mpg)] = 1
par(mfrow=c(2,3))
for(i in names(Auto)){
# excluding the own mpgs variables and others categorical variables
if( grepl(i, pattern="^mpg|cylinders|origin|name")){ next}
boxplot(eval(parse(text=i)) ~ auto$mpg01, ylab=i, col=c("red", "blue"))
}
the quantitave variables affect the mpg value.
attach(auto)
## The following objects are masked from auto (pos = 3):
##
## acceleration, cylinders, displacement, horsepower, mpg, name,
## origin, weight, year
colors = c("red", "yellow", "green", "violet", "orange", "blue", "pink", "cyan")
par(mfrow=c(1,2))
for(i in c("cylinders", "origin")){
aux <- table(eval(parse(text=i)), auto$mpg01)
cols <- colors[1:nrow(aux)]
barplot(aux, xlab="mpg01", ylab=i, beside=T, legend=rownames(aux), col=cols)}
Categorical variables such as cylinders and origin also show relation with mpg01. The more cylinders, the higher the mpg of the cars are. Cars of lower mpg are originally from America.
set.seed(1)
rows = sample(x=nrow(auto), size=.75*nrow(auto))
trainset = auto[rows, ]
testset = auto[-rows, ]
library(MASS)
lda.fit = lda(mpg01 ~ displacement+horsepower+weight+acceleration+year+cylinders+origin, data=trainset)
lda.pred = predict(lda.fit, testset)
table(lda.pred$class,testset$mpg01)
##
## 0 1
## 0 40 0
## 1 13 45
1 - (41+52)/98
## [1] 0.05102041
qda.fit = qda(mpg01 ~ displacement+horsepower+weight+acceleration+year+cylinders+origin, data=trainset)
qda.pred = predict(qda.fit, testset)
table(qda.pred$class,testset$mpg01)
##
## 0 1
## 0 44 4
## 1 9 41
1 - (43+51)/98
## [1] 0.04081633
sel.variables = which(names(trainset)%in%c("mpg01", "displacement", "horsepower", "weight", "acceleration", "year", "cylinders", "origin"))
set.seed(1)
accuracies = data.frame("k"=1:10, acc=NA)
for(k in 1:10){
knn.pred = knn(train=trainset[, sel.variables], test=testset[, sel.variables], cl=trainset$mpg01, k=k)
# test-error
accuracies$acc[k]= round(sum(knn.pred!=testset$mpg01)/nrow(testset)*100,2)
}
accuracies
## k acc
## 1 1 14.29
## 2 2 16.33
## 3 3 12.24
## 4 4 12.24
## 5 5 13.27
## 6 6 13.27
## 7 7 13.27
## 8 8 16.33
## 9 9 15.31
## 10 10 15.31
The k=7 was the best response, outperformed all others.
power = function (){
print(2^3)}
power()
## [1] 8
power2 = function (x,a){
print(x^a)
}
power2(2,3)
## [1] 8
power2(10,3)
## [1] 1000
power2(8,17)
## [1] 2.2518e+15
y = power2(131,3)
## [1] 2248091
power3 = function(x,a){
return(x^a)
}
power3(2,3)
## [1] 8
x = c(1:10)
y=power3(x,2)
par(mfrow=c(2,2))
plot(x = x, y = y, xlab="x", ylab="x²")
plot(x,y,log="x", xlab="log(x) scale", ylab="x²")
plot(x,y,log="y", xlab="x", ylab="log(x²) scale")
plot(x,y,log="xy", xlab="log(x) scale", ylab="log(x²) scale")
plotpower = function(x,a){
return(plot(x=x,y=x^a))
}
plotpower(1:10,3)
rm(list=ls())
?Boston
## starting httpd help server ... done
Boston$crim01 = ifelse(Boston$crim>median(Boston$crim),1,0)
attach(Boston)
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv crim01
## Min. : 1.73 Min. : 5.00 Min. :0.0
## 1st Qu.: 6.95 1st Qu.:17.02 1st Qu.:0.0
## Median :11.36 Median :21.20 Median :0.5
## Mean :12.65 Mean :22.53 Mean :0.5
## 3rd Qu.:16.95 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :37.97 Max. :50.00 Max. :1.0
par(mfrow=c(2,3))
for(i in names(Boston)){
# excluding the own crim variables and others categorical variables
if( grepl(i, pattern="crim|chas")){ next}
boxplot(eval(parse(text=i)) ~ crim01, ylab=i, col=c("red", "blue"))}
colors = c("red", "yellow", "green", "violet", "orange", "blue", "pink", "cyan")
aux <- table(chas, Boston$crim01)
cols <- colors[1:nrow(aux)]
barplot(aux, xlab="crim01", ylab="chas", beside=T, legend=rownames(aux), col=cols)
Selecting the relevant variables:
vars = c("zn", "indus", "nox", "age", "dis", "rad", "tax", "ptratio", "black", "lstat", "medv", "crim01")
rows = sample(x=nrow(Boston), size=.75*nrow(Boston))
trainset = Boston[rows, vars]
testset = Boston[-rows, vars]
# LOGISTIC REGRESSION
lr.fit <- glm(as.factor(crim01) ~ ., data=trainset, family="binomial")
lr.probs <- predict(lr.fit, testset, type="response")
lr.pred <- ifelse(lr.probs>.5, "1","0")
test.err.lr <- mean(lr.pred!=testset$crim01)
# LINEAR DISCRIMINANT ANALYSIS
lda.fit <- lda(crim01 ~ ., data=trainset)
lda.pred <- predict(lda.fit, testset)
test.err.lda <- mean(lda.pred$class!=testset$crim01)
# QUADRATIC DISCRIMINANT ANALYSIS
qda.fit <- qda(crim01 ~ ., data=trainset)
qda.pred <- predict(qda.fit, testset)
test.err.qda <- mean(qda.pred$class!=testset$crim01)
# KNN-1
knn.pred <- knn(train=trainset, test=testset, cl=trainset$crim01, k=1)
test.err.knn_1 <- mean(knn.pred!=testset$crim01)
# KNN-CV
err.knn_cv <- rep(NA,9)
for(i in 2:10){
knn.pred <- knn(train=trainset, test=testset, cl=trainset$crim01, k=i)
err.knn_cv[i-1] <- mean(knn.pred!=testset$crim01)
}
test.err_knn_CV <- min(err.knn_cv)
round1 = data.frame("method"=c("LR", "LDA", "QDA", "KNN-1", "KNN-CV"), test.err=c(test.err.lr, test.err.lda, test.err.qda, test.err.knn_1, test.err_knn_CV))
round1
## method test.err
## 1 LR 0.08661417
## 2 LDA 0.17322835
## 3 QDA 0.11023622
## 4 KNN-1 0.07874016
## 5 KNN-CV 0.05511811
Both KNN methods outperforms the others, maybe it’s related to the form of the data, which can be more non-linear and either differs more from a gaussian shape. The logistic regression performs better than LDA and QDA, that enhances the assumption of a non Gaussian distribution from the data. And as QDA performs better than LDA, i can imagine that the non-linear decision boundary helps this decision. So the non-parametric method presents the best results.
Doing a second round of modelling, this time choosing only the predictors which seemed more relevants by the logistic regression coefficients. Cheking the p-values:
coefs <- summary(lr.fit)$coefficients
coefs[order(coefs[,"Pr(>|z|)"], decreasing=F),]
## Estimate Std. Error z value Pr(>|z|)
## nox 43.719285025 7.927829508 5.514660 3.494547e-08
## (Intercept) -32.787324533 7.072007992 -4.636211 3.548529e-06
## rad 0.647111515 0.172924181 3.742169 1.824387e-04
## medv 0.131066963 0.048318950 2.712537 6.677023e-03
## ptratio 0.304806188 0.128066295 2.380066 1.730955e-02
## dis 0.562015386 0.240847366 2.333492 1.962234e-02
## tax -0.006671769 0.003278188 -2.035200 4.183073e-02
## zn -0.074954690 0.037014164 -2.025027 4.286457e-02
## black -0.010142406 0.005903889 -1.717920 8.581129e-02
## lstat 0.073419938 0.049647733 1.478818 1.391891e-01
## age 0.015149974 0.011281310 1.342927 1.792957e-01
## indus -0.048746984 0.047600050 -1.024095 3.057903e-01
I choose nox, rad, ptratio, black and medv.
vars <- c("nox", "rad", "ptratio", "black", "medv", "dis", "crim01")
trainset = Boston[rows, vars]
testset = Boston[-rows, vars]
# LOGISTIC REGRESSION
lr.fit <- glm(as.factor(crim01) ~ ., data=trainset, family="binomial")
lr.probs <- predict(lr.fit, testset, type="response")
lr.pred <- ifelse(lr.probs>.5, "1","0")
test.err.lr <- mean(lr.pred!=testset$crim01)
# LINEAR DISCRIMINANT ANALYSIS
lda.fit <- lda(crim01 ~ ., data=trainset)
lda.pred <- predict(lda.fit, testset)
test.err.lda <- mean(lda.pred$class!=testset$crim01)
# QUADRATIC DISCRIMINANT ANALYSIS
qda.fit <- qda(crim01 ~ ., data=trainset)
qda.pred <- predict(qda.fit, testset)
test.err.qda <- mean(qda.pred$class!=testset$crim01)
# KNN-1
knn.pred <- knn(train=trainset, test=testset, cl=trainset$crim01, k=1)
test.err.knn_1 <- mean(knn.pred!=testset$crim01)
# KNN-CV
err.knn_cv <- rep(NA,9)
for(i in 2:10){
knn.pred <- knn(train=trainset, test=testset, cl=trainset$crim01, k=i)
err.knn_cv[i-1] <- mean(knn.pred!=testset$crim01)
}
test.err_knn_CV <- min(err.knn_cv)
round2 = data.frame("method"=c("LR", "LDA", "QDA", "KNN-1", "KNN-CV"), test.err=c(test.err.lr, test.err.lda, test.err.qda, test.err.knn_1, test.err_knn_CV))
round2
## method test.err
## 1 LR 0.1259843
## 2 LDA 0.1811024
## 3 QDA 0.1574803
## 4 KNN-1 0.1338583
## 5 KNN-CV 0.1102362
On round 2, the general performance was worse for all approachs, so probably there are relevent information in the excluded variables.
Now, i try again, using the most 6 variable that seemed, in my observation from the graphs shown before, more associated with crime index. They are zn, indus, nox, dis, rad and tax.
vars <- c("zn","indus", "nox", "dis", "rad", "tax", "crim01")
trainset = Boston[rows, vars]
testset = Boston[-rows, vars]
# LOGISTIC REGRESSION
lr.fit <- glm(as.factor(crim01) ~ ., data=trainset, family="binomial")
lr.probs <- predict(lr.fit, testset, type="response")
lr.pred <- ifelse(lr.probs>.5, "1","0")
test.err.lr <- mean(lr.pred!=testset$crim01)
# LINEAR DISCRIMINANT ANALYSIS
lda.fit <- lda(crim01 ~ ., data=trainset)
lda.pred <- predict(lda.fit, testset)
test.err.lda <- mean(lda.pred$class!=testset$crim01)
# QUADRATIC DISCRIMINANT ANALYSIS
qda.fit <- qda(crim01 ~ ., data=trainset)
qda.pred <- predict(qda.fit, testset)
test.err.qda <- mean(qda.pred$class!=testset$crim01)
# KNN-1
knn.pred <- knn(train=trainset, test=testset, cl=trainset$crim01, k=1)
test.err.knn_1 <- mean(knn.pred!=testset$crim01)
# KNN-CV
err.knn_cv <- rep(NA,9)
for(i in 2:10){
knn.pred <- knn(train=trainset, test=testset, cl=trainset$crim01, k=i)
err.knn_cv[i-1] <- mean(knn.pred!=testset$crim01)
}
test.err_knn_CV <- min(err.knn_cv)
round3 = data.frame("method"=c("LR", "LDA", "QDA", "KNN-1", "KNN-CV"), test.err=c(test.err.lr, test.err.lda, test.err.qda, test.err.knn_1, test.err_knn_CV))
round3
## method test.err
## 1 LR 0.14960630
## 2 LDA 0.18897638
## 3 QDA 0.06299213
## 4 KNN-1 0.00000000
## 5 KNN-CV 0.00000000
When I eliminate some variables, it helped for the non-linear approachs did better models. Seeing the three rounds on the graph bellow.
performances <- rbind(cbind(round="round_1", round1), cbind(round="round_2", round2), cbind(round="round_3", round3))
library(reshape2)
dcast(data=performances, method ~ round, value.var="test.err")
## method round_1 round_2 round_3
## 1 KNN-1 0.07874016 0.1338583 0.00000000
## 2 KNN-CV 0.05511811 0.1102362 0.00000000
## 3 LDA 0.17322835 0.1811024 0.18897638
## 4 LR 0.08661417 0.1259843 0.14960630
## 5 QDA 0.11023622 0.1574803 0.06299213
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'auto':
##
## mpg
ggplot(data=performances, aes(x=method,y=test.err)) + geom_bar(stat="identity", aes(fill=method)) + coord_flip() + facet_grid(round ~ .)