library(ISLR)
weekly = Weekly
attach(weekly)
summary(weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202
## Median : 0.2380 Median : 0.2340 Median :1.00268
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821
## Today Direction
## Min. :-18.1950 Down:484
## 1st Qu.: -1.1540 Up :605
## Median : 0.2410
## Mean : 0.1499
## 3rd Qu.: 1.4050
## Max. : 12.0260
cor(weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
pairs(weekly)
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data=Weekly, family="binomial")
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lag2 is the only predictor that appears to be statistically significant.
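This can be read directly off the fitted model by filtering the coefficient table on its p-value column (a minimal sketch, reusing glm.fit from above):
# extract the coefficient matrix and keep only the rows with p < 0.05
coefs <- summary(glm.fit)$coefficients
coefs[coefs[, "Pr(>|z|)"] < 0.05, , drop=FALSE]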
glm.probs = predict(glm.fit, type="response")
glm.pred=rep("Down",1089)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction)
## Direction
## glm.pred Down Up
## Down 54 48
## Up 430 557
(557+54)/1089
## [1] 0.5610652
There is a strong predominance of Up predictions: the model calls Up for 987 of the 1089 weeks. It identifies the Up direction well, but it predicts the Down direction poorly.
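This can be quantified from the confusion matrix: of the 605 true Up weeks, 557 are predicted Up, while of the 484 true Down weeks only 54 are predicted Down (a minimal sketch, reusing the objects above):
cm <- table(glm.pred, Direction)
cm["Up", "Up"] / sum(cm[, "Up"])       # share of true Up weeks predicted correctly
cm["Down", "Down"] / sum(cm[, "Down"]) # share of true Down weeks predicted correctly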
trainset = (Weekly$Year<=2008)
testset = Weekly[!trainset,]
dim(testset)
## [1] 104 9
glm.fit.d = glm(Direction ~ Lag2, data=Weekly, subset=trainset, family="binomial")
glm.probs.d = predict(glm.fit.d, type="response", newdata=testset)
glm.preds.d=rep("Down",104)
glm.preds.d[glm.probs.d>.5] = "Up"
table(glm.preds.d, testset$Direction)
##
## glm.preds.d Down Up
## Down 9 5
## Up 34 56
(9+56)/104
## [1] 0.625
library(MASS)
lda.fit.e = lda(Direction ~ Lag2, data=Weekly, subset=trainset)
lda.preds.e = predict(lda.fit.e, newdata=testset)
table(lda.preds.e$class, testset$Direction)
##
## Down Up
## Down 9 5
## Up 34 56
(56+9)/104
## [1] 0.625
qda.fit.f = qda(Direction ~ Lag2, data=Weekly, subset=trainset)
qda.preds.f = predict(qda.fit.f, newdata=testset)
table(qda.preds.f$class,testset$Direction)
##
## Down Up
## Down 0 0
## Up 43 61
61/104
## [1] 0.5865385
library(class)
set.seed(1)
train.g = Weekly[trainset, c("Lag2", "Direction")]
knn.pred = knn(train=data.frame(train.g$Lag2), test=data.frame(testset$Lag2), cl=train.g$Direction, k=1)
table(knn.pred, testset$Direction)
##
## knn.pred Down Up
## Down 21 30
## Up 22 31
52/104
## [1] 0.5
The Logistic Regression and LDA models appear to provide the best results on this data.
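For reference, the test accuracies computed in parts (d) through (g) can be collected in one place (values taken directly from the confusion matrices above):
test.acc <- c(logistic = 65/104, lda = 65/104, qda = 61/104, knn1 = 52/104)
sort(test.acc, decreasing = TRUE)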
set.seed(1)
results <- data.frame(k=1:50, acc=NA)
for(i in 1:50){
knn.pred = knn(train=data.frame(train.g$Lag2), test=data.frame(testset$Lag2), cl=train.g$Direction, k=i)
cm <- table(testset$Direction, knn.pred)
acc <- (cm["Down", "Down"] + cm["Up", "Up"])/sum(cm)
results$acc[i] <- acc
}
plot(x=results$k, y=results$acc, type="l", xlab="K", ylab="accuracy", ylim=c(.4,.65))
Varying K does not seem to have much effect on the test accuracy.
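Rather than eyeballing the curve, the best K and its accuracy can be read straight from the results data frame (a minimal sketch, reusing the loop output above):
results$k[which.max(results$acc)]   # K with the highest test accuracy
max(results$acc)                    # that accuracy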
auto=Auto
attach(auto)
dim(auto)
## [1] 392 9
auto$mpg01 = 0
auto$mpg01[mpg>median(mpg)] = 1
par(mfrow=c(2,3))
for(i in names(Auto)){
# excluding the mpg variables themselves and the other categorical variables
if( grepl(i, pattern="^mpg|cylinders|origin|name")){ next}
boxplot(eval(parse(text=i)) ~ auto$mpg01, ylab=i, col=c("red", "blue"))
}
From the boxplots we can see that all of the quantitative variables show some association with mpg01; displacement, horsepower and weight separate the two classes especially clearly.
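To back the boxplots with numbers, the group medians of the continuous predictors can be compared across the two mpg01 classes (a small sketch, reusing the auto data frame with the mpg01 column added above):
aggregate(cbind(displacement, horsepower, weight, acceleration, year) ~ mpg01,
          data = auto, FUN = median)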
attach(auto)
## The following objects are masked from auto (pos = 3):
##
## acceleration, cylinders, displacement, horsepower, mpg, name,
## origin, weight, year
colors = c("red", "yellow", "green", "violet", "orange", "blue", "pink", "cyan")
par(mfrow=c(1,2))
for(i in c("cylinders", "origin")){
aux <- table(eval(parse(text=i)), auto$mpg01)
cols <- colors[1:nrow(aux)]
barplot(aux, xlab="mpg01", ylab=i, beside=T, legend=rownames(aux), col=cols)}
Categorical variables such as cylinders and origin also show a relation with mpg01. Cars with fewer cylinders (mostly four) tend to sit above the median mpg, while six- and eight-cylinder cars mostly fall below it. Low-mpg cars are predominantly of American origin (origin = 1).
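The same relationship can be expressed as row proportions, i.e. the share of above-median-mpg cars within each level of cylinders and origin (a minimal sketch, reusing auto):
round(prop.table(table(auto$cylinders, auto$mpg01), margin = 1), 2)
round(prop.table(table(auto$origin,    auto$mpg01), margin = 1), 2)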
# splitting the train and test set into 75% and 25%
set.seed(1)
rows = sample(x=nrow(auto), size=.75*nrow(auto))
trainset = auto[rows, ]
testset = auto[-rows, ]
library(MASS)
lda.fit = lda(mpg01 ~ displacement+horsepower+weight+acceleration+year+cylinders+origin, data=trainset)
lda.pred = predict(lda.fit, testset)
table(lda.pred$class,testset$mpg01)
##
## 0 1
## 0 41 0
## 1 5 52
1 - (41+52)/98
## [1] 0.05102041
qda.fit = qda(mpg01 ~ displacement+horsepower+weight+acceleration+year+cylinders+origin, data=trainset)
qda.pred = predict(qda.fit, testset)
table(qda.pred$class,testset$mpg01)
##
## 0 1
## 0 43 1
## 1 3 51
1 - (43+51)/98
## [1] 0.04081633
lr.fit = glm(as.factor(mpg01) ~ displacement+horsepower+weight+acceleration+year+cylinders+origin, data=trainset, family="binomial")
lr.probs = predict(lr.fit, testset, type="response")
lr.pred = ifelse(lr.probs>0.5, "1", "0")
table(lr.pred,testset$mpg01)
##
## lr.pred 0 1
## 0 41 1
## 1 5 51
1-(51+41)/98
## [1] 0.06122449
sel.variables = which(names(trainset)%in%c("mpg01", "displacement", "horsepower", "weight", "acceleration", "year", "cylinders", "origin"))
set.seed(1)
accuracies = data.frame("k"=1:10, acc=NA)
for(k in 1:10){
knn.pred = knn(train=trainset[, sel.variables], test=testset[, sel.variables], cl=trainset$mpg01, k=k)
# test error (%): the column is named acc but stores the misclassification rate
accuracies$acc[k]= round(sum(knn.pred!=testset$mpg01)/nrow(testset)*100,2)
}
accuracies
## k acc
## 1 1 16.33
## 2 2 20.41
## 3 3 14.29
## 4 4 14.29
## 5 5 13.27
## 6 6 13.27
## 7 7 12.24
## 8 8 14.29
## 9 9 14.29
## 10 10 13.27
K = 7 gave the lowest test error, outperforming all the other values tried.
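One caveat about the loop above: sel.variables also matches mpg01, so the response itself ends up in the KNN feature matrix, which flatters the reported error rates. A hedged sketch of a corrected call (shown for k = 7), dropping the label from the features:
feat <- setdiff(names(trainset)[sel.variables], "mpg01")   # drop the response from the features
knn.pred <- knn(train=trainset[, feat], test=testset[, feat], cl=trainset$mpg01, k=7)
round(mean(knn.pred != testset$mpg01)*100, 2)   # test error (%) without the leaked label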
power = function (){
print(2^3)}
power()
## [1] 8
power2 = function (x,a){
print(x^a)
}
power2(2,3)
## [1] 8
power2(10,3)
## [1] 1000
power2(8,17)
## [1] 2.2518e+15
y = power2(131,3)
## [1] 2248091
power3 = function(x,a){
return(x^a)
}
power3(2,3)
## [1] 8
x = c(1:10)
y=power3(x,2)
par(mfrow=c(2,2))
plot(x = x, y = y, xlab="x", ylab="x²")
plot(x,y,log="x", xlab="log(x) scale", ylab="x²")
plot(x,y,log="y", xlab="x", ylab="log(x²) scale")
plot(x,y,log="xy", xlab="log(x) scale", ylab="log(x²) scale")
(f) Create a function, PlotPower(), that allows you to create a plot of x against x^a for a fixed a and for a range of values of x. For instance, if you call PlotPower(1:10, 3), then a plot should be created with an x-axis taking on values 1, 2, …, 10, and a y-axis taking on values 1^3, 2^3, …, 10^3.
plotpower = function(x, a){
plot(x = x, y = x^a, xlab = "x", ylab = paste0("x^", a))
}
plotpower(1:10,3)
rm(list=ls())
?Boston
Boston$crim01 = ifelse(Boston$crim>median(Boston$crim),1,0)
attach(Boston)
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv crim01
## Min. : 1.73 Min. : 5.00 Min. :0.0
## 1st Qu.: 6.95 1st Qu.:17.02 1st Qu.:0.0
## Median :11.36 Median :21.20 Median :0.5
## Mean :12.65 Mean :22.53 Mean :0.5
## 3rd Qu.:16.95 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :37.97 Max. :50.00 Max. :1.0
par(mfrow=c(2,3))
for(i in names(Boston)){
# excluding the crim variables themselves and the categorical variable chas
if( grepl(i, pattern="crim|chas")){ next}
boxplot(eval(parse(text=i)) ~ crim01, ylab=i, col=c("red", "blue"))}
colors = c("red", "yellow", "green", "violet", "orange", "blue", "pink", "cyan")
aux <- table(chas, Boston$crim01)
cols <- colors[1:nrow(aux)]
barplot(aux, xlab="crim01", ylab="chas", beside=T, legend=rownames(aux), col=cols)
Selecting the relevant variables:
vars = c("zn", "indus", "nox", "age", "dis", "rad", "tax", "ptratio", "black", "lstat", "medv", "crim01")
rows = sample(x=nrow(Boston), size=.75*nrow(Boston))
trainset = Boston[rows, vars]
testset = Boston[-rows, vars]
# LOGISTIC REGRESSION
lr.fit <- glm(as.factor(crim01) ~ ., data=trainset, family="binomial")
lr.probs <- predict(lr.fit, testset, type="response")
lr.pred <- ifelse(lr.probs>.5, "1","0")
test.err.lr <- mean(lr.pred!=testset$crim01)
# LINEAR DISCRIMINANT ANALYSIS
lda.fit <- lda(crim01 ~ ., data=trainset)
lda.pred <- predict(lda.fit, testset)
test.err.lda <- mean(lda.pred$class!=testset$crim01)
# QUADRATIC DISCRIMINANT ANALYSIS
qda.fit <- qda(crim01 ~ ., data=trainset)
qda.pred <- predict(qda.fit, testset)
test.err.qda <- mean(qda.pred$class!=testset$crim01)
# KNN-1
knn.pred <- knn(train=trainset, test=testset, cl=trainset$crim01, k=1)
test.err.knn_1 <- mean(knn.pred!=testset$crim01)
# KNN-CV: k = 2..10, keeping the best k by test error (not a true cross-validation)
err.knn_cv <- rep(NA,9)
for(i in 2:10){
knn.pred <- knn(train=trainset, test=testset, cl=trainset$crim01, k=i)
err.knn_cv[i-1] <- mean(knn.pred!=testset$crim01)
}
test.err_knn_CV <- min(err.knn_cv)
round1 = data.frame("method"=c("LR", "LDA", "QDA", "KNN-1", "KNN-CV"), test.err=c(test.err.lr, test.err.lda, test.err.qda, test.err.knn_1, test.err_knn_CV))
round1
## method test.err
## 1 LR 0.11811024
## 2 LDA 0.18110236
## 3 QDA 0.11023622
## 4 KNN-1 0.08661417
## 5 KNN-CV 0.10236220
Both KNN variants outperform the other methods, which may reflect a class boundary that is non-linear and predictors that are far from Gaussian. Logistic regression and QDA both beat LDA, and QDA also edges out logistic regression, which suggests that relaxing the linear (and Gaussian) assumptions helps on this data. Overall the non-parametric method presents the best results.
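One caveat, as in the Auto exercise: trainset and testset still contain the crim01 column when passed to knn(), so the label leaks into the KNN features and its test errors are probably optimistic. A hedged sketch of the corrected round-1 call:
feat <- setdiff(vars, "crim01")   # keep crim01 only as the class label
knn.pred <- knn(train=trainset[, feat], test=testset[, feat], cl=trainset$crim01, k=1)
mean(knn.pred != testset$crim01)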
Now for a second round of modelling, this time keeping only the predictors that seemed most relevant according to the logistic regression coefficients. Checking the p-values:
coefs <- summary(lr.fit)$coefficients
coefs[order(coefs[,"Pr(>|z|)"], decreasing=F),]
## Estimate Std. Error z value Pr(>|z|)
## nox 48.647494313 9.073527230 5.361476 8.254490e-08
## (Intercept) -39.700417822 7.788609547 -5.097241 3.446400e-07
## rad 0.715059648 0.181626779 3.936973 8.251603e-05
## medv 0.174308395 0.053113289 3.281823 1.031385e-03
## ptratio 0.368144523 0.128580944 2.863134 4.194726e-03
## dis 0.649936278 0.253869932 2.560115 1.046375e-02
## zn -0.088654645 0.046837253 -1.892823 5.838137e-02
## lstat 0.094009867 0.057031445 1.648387 9.927337e-02
## black -0.008586495 0.005838517 -1.470664 1.413821e-01
## tax -0.004130847 0.002990376 -1.381381 1.671620e-01
## indus -0.057402207 0.052405886 -1.095339 2.733682e-01
## age 0.013533748 0.012817471 1.055883 2.910217e-01
I chose nox, rad, ptratio, black, medv and dis.
vars <- c("nox", "rad", "ptratio", "black", "medv", "dis", "crim01")
trainset = Boston[rows, vars]
testset = Boston[-rows, vars]
# LOGISTIC REGRESSION
lr.fit <- glm(as.factor(crim01) ~ ., data=trainset, family="binomial")
lr.probs <- predict(lr.fit, testset, type="response")
lr.pred <- ifelse(lr.probs>.5, "1","0")
test.err.lr <- mean(lr.pred!=testset$crim01)
# LINEAR DISCRIMINANT ANALYSIS
lda.fit <- lda(crim01 ~ ., data=trainset)
lda.pred <- predict(lda.fit, testset)
test.err.lda <- mean(lda.pred$class!=testset$crim01)
# QUADRATIC DISCRIMINANT ANALYSIS
qda.fit <- qda(crim01 ~ ., data=trainset)
qda.pred <- predict(qda.fit, testset)
test.err.qda <- mean(qda.pred$class!=testset$crim01)
# KNN-1
knn.pred <- knn(train=trainset, test=testset, cl=trainset$crim01, k=1)
test.err.knn_1 <- mean(knn.pred!=testset$crim01)
# KNN-CV
err.knn_cv <- rep(NA,9)
for(i in 2:10){
knn.pred <- knn(train=trainset, test=testset, cl=trainset$crim01, k=i)
err.knn_cv[i-1] <- mean(knn.pred!=testset$crim01)
}
test.err_knn_CV <- min(err.knn_cv)
round2 = data.frame("method"=c("LR", "LDA", "QDA", "KNN-1", "KNN-CV"), test.err=c(test.err.lr, test.err.lda, test.err.qda, test.err.knn_1, test.err_knn_CV))
round2
## method test.err
## 1 LR 0.1496063
## 2 LDA 0.1811024
## 3 QDA 0.1811024
## 4 KNN-1 0.1023622
## 5 KNN-CV 0.1338583
In round 2, performance was worse (or at best unchanged) for every approach, so the excluded variables probably carry relevant information.
Now I try again, using the six variables that, from the plots shown earlier, seemed most associated with the crime indicator: zn, indus, nox, dis, rad and tax.
vars <- c("zn","indus", "nox", "dis", "rad", "tax", "crim01")
trainset = Boston[rows, vars]
testset = Boston[-rows, vars]
# LOGISTIC REGRESSION
lr.fit <- glm(as.factor(crim01) ~ ., data=trainset, family="binomial")
lr.probs <- predict(lr.fit, testset, type="response")
lr.pred <- ifelse(lr.probs>.5, "1","0")
test.err.lr <- mean(lr.pred!=testset$crim01)
# LINEAR DISCRIMINANT ANALYSIS
lda.fit <- lda(crim01 ~ ., data=trainset)
lda.pred <- predict(lda.fit, testset)
test.err.lda <- mean(lda.pred$class!=testset$crim01)
# QUADRATIC DISCRIMINANT ANALYSIS
qda.fit <- qda(crim01 ~ ., data=trainset)
qda.pred <- predict(qda.fit, testset)
test.err.qda <- mean(qda.pred$class!=testset$crim01)
# KNN-1
knn.pred <- knn(train=trainset, test=testset, cl=trainset$crim01, k=1)
test.err.knn_1 <- mean(knn.pred!=testset$crim01)
# KNN-CV
err.knn_cv <- rep(NA,9)
for(i in 2:10){
knn.pred <- knn(train=trainset, test=testset, cl=trainset$crim01, k=i)
err.knn_cv[i-1] <- mean(knn.pred!=testset$crim01)
}
test.err_knn_CV <- min(err.knn_cv)
round3 = data.frame("method"=c("LR", "LDA", "QDA", "KNN-1", "KNN-CV"), test.err=c(test.err.lr, test.err.lda, test.err.qda, test.err.knn_1, test.err_knn_CV))
round3
## method test.err
## 1 LR 0.15748031
## 2 LDA 0.18897638
## 3 QDA 0.08661417
## 4 KNN-1 0.01574803
## 5 KNN-CV 0.02362205
Restricting to these variables helped the non-linear approaches (QDA and especially KNN), while the linear methods got slightly worse. The three rounds are compared in the table and plot below.
performances <- rbind(cbind(round="round_1", round1), cbind(round="round_2", round2), cbind(round="round_3", round3))
library(reshape2)
dcast(data=performances, method ~ round, value.var="test.err")
## method round_1 round_2 round_3
## 1 KNN-1 0.08661417 0.1023622 0.01574803
## 2 KNN-CV 0.10236220 0.1338583 0.02362205
## 3 LDA 0.18110236 0.1811024 0.18897638
## 4 LR 0.11811024 0.1496063 0.15748031
## 5 QDA 0.11023622 0.1811024 0.08661417
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'auto':
##
## mpg
##
## The following object is masked from 'auto':
##
## mpg
ggplot(data=performances, aes(x=method,y=test.err)) + geom_bar(stat="identity", aes(fill=method)) + coord_flip() + facet_grid(round ~ .)