library(ISLR)
library(MASS)
str(Weekly)
'data.frame': 1089 obs. of 9 variables:
$ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
$ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
$ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
$ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
$ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
$ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
$ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
$ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
$ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
summary(Weekly)
Year Lag1 Lag2 Lag3 Lag4
Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580 1st Qu.: -1.1580
Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410 Median : 0.2380
Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472 Mean : 0.1458
3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090 3rd Qu.: 1.4090
Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
Lag5 Volume Today Direction
Min. :-18.1950 Min. :0.08747 Min. :-18.1950 Down:484
1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540 Up :605
Median : 0.2340 Median :1.00268 Median : 0.2410
Mean : 0.1399 Mean :1.57462 Mean : 0.1499
3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
Max. : 12.0260 Max. :9.32821 Max. : 12.0260
plot(Weekly)
cor(Weekly[,-9])
Year Lag1 Lag2 Lag3 Lag4 Lag5
Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923 -0.030519101
Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876 -0.008183096
Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535 -0.072499482
Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865 0.060657175
Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000 -0.075675027
Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027 1.000000000
Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617 -0.058517414
Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873 0.011012698
Volume Today
Year 0.84194162 -0.032459894
Lag1 -0.06495131 -0.075031842
Lag2 -0.08551314 0.059166717
Lag3 -0.06928771 -0.071243639
Lag4 -0.06107462 -0.007825873
Lag5 -0.05851741 0.011012698
Volume 1.00000000 -0.033077783
Today -0.03307778 1.000000000
attach(Weekly)
plot(Volume)
plot(Year)
Based on the plots and the correlation matrix above, the only clearly related pair is Volume and Year (correlation 0.84): trading volume grows over time.
fit.glm <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly,
family = binomial)
summary(fit.glm)
Call:
glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
Volume, family = binomial, data = Weekly)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.6949 -1.2565 0.9913 1.0849 1.4579
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.26686 0.08593 3.106 0.0019 **
Lag1 -0.04127 0.02641 -1.563 0.1181
Lag2 0.05844 0.02686 2.175 0.0296 *
Lag3 -0.01606 0.02666 -0.602 0.5469
Lag4 -0.02779 0.02646 -1.050 0.2937
Lag5 -0.01447 0.02638 -0.549 0.5833
Volume -0.02274 0.03690 -0.616 0.5377
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1496.2 on 1088 degrees of freedom
Residual deviance: 1486.4 on 1082 degrees of freedom
AIC: 1500.4
Number of Fisher Scoring iterations: 4
Of the predictors used, the only statistically significant one is Lag2, whose p-value (0.0296) is below 0.05.
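These p-values can be read directly off the fitted model; a minimal sketch (column 4 of the coefficient table holds Pr(>|z|)):
# Extract the p-values from the coefficient table of the logistic fit
pvals <- summary(fit.glm)$coefficients[, 4]
pvals[pvals < 0.05]  # only the intercept and Lag2 fall below 0.05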
probs <- predict(fit.glm, type = "response")
pred.glm <- rep("Down", length(probs))
pred.glm[probs > 0.5] <- "Up"
table(pred.glm, Direction)
Direction
pred.glm Down Up
Down 54 48
Up 430 557
mean(pred.glm == Weekly$Direction)
[1] 0.5610652
c((557 / (557 + 48)) * 100, (54 / (430 + 54)) * 100)
[1] 92.06612 11.15702
When the market goes up, the logistic regression is correct 92.1% of the time; when the market goes down, it is correct only 11.2% of the time.
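These two percentages come straight from the confusion matrix; a small helper (hypothetical, not part of the original analysis) makes the arithmetic explicit:
# Hypothetical helper: per-class accuracy from a confusion matrix whose
# rows are predictions and whose columns are the true Direction
class.rates <- function(tab) {
  c(up = tab["Up", "Up"] / sum(tab[, "Up"]),
    down = tab["Down", "Down"] / sum(tab[, "Down"]))
}
class.rates(table(pred.glm, Direction)) * 100  # ~92.1 (Up) and ~11.2 (Down)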
train <- Weekly[, "Year"] <= 2008
fit.glm2 <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
summary(fit.glm2)
Call:
glm(formula = Direction ~ Lag2, family = binomial, data = Weekly,
subset = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.536 -1.264 1.021 1.091 1.368
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.20326 0.06428 3.162 0.00157 **
Lag2 0.05810 0.02870 2.024 0.04298 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1354.7 on 984 degrees of freedom
Residual deviance: 1350.5 on 983 degrees of freedom
AIC: 1354.5
Number of Fisher Scoring iterations: 4
Weekly.2009_2010 <- Weekly[!train, ]
Direction.2009_2010 <- Direction[!train]
glm.probs <- predict(fit.glm2, Weekly.2009_2010, type = "response")
pred.glm2 <- rep("Down", length(glm.probs))
pred.glm2[glm.probs > 0.5] <- "Up"
table(pred.glm2, Direction.2009_2010)
Direction.2009_2010
pred.glm2 Down Up
Down 9 5
Up 34 56
c(((9+56)/104)*100, (56/(56+5))*100)
[1] 62.50000 91.80328
On the held-out 2009-2010 data, 62.5% of the predictions are correct, and when the market goes up the model is right 91.8% of the time.
fit.lda <- lda(Direction ~ Lag2, data = Weekly, subset = train)
fit.lda
Call:
lda(Direction ~ Lag2, data = Weekly, subset = train)
Prior probabilities of groups:
Down Up
0.4477157 0.5522843
Group means:
Lag2
Down -0.03568254
Up 0.26036581
Coefficients of linear discriminants:
LD1
Lag2 0.4414162
pred.lda <- predict(fit.lda, Weekly.2009_2010)
table(pred.lda$class, Direction.2009_2010)
Direction.2009_2010
Down Up
Down 9 5
Up 34 56
fit.qda <- qda(Direction ~ Lag2, data = Weekly, subset = train)
fit.qda
Call:
qda(Direction ~ Lag2, data = Weekly, subset = train)
Prior probabilities of groups:
Down Up
0.4477157 0.5522843
Group means:
Lag2
Down -0.03568254
Up 0.26036581
pred.qda <- predict(fit.qda, Weekly.2009_2010)
table(pred.qda$class, Direction.2009_2010)
Direction.2009_2010
Down Up
Down 0 0
Up 43 61
library(class)
train.X <- as.matrix(Lag2[train])
test.X <- as.matrix(Lag2[!train])
train.Direction <- Direction[train]
set.seed(1)
pred.knn <- knn(train.X, test.X, train.Direction, k = 1)
table(pred.knn, Direction.2009_2010)
Direction.2009_2010
pred.knn Down Up
Down 21 30
Up 22 31
Comparing the test error rates, logistic regression and LDA have the lowest (and identical) error rates, followed by QDA and then KNN with k = 1. Note that QDA predicts Up for every week in the test set.
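A compact way to make this comparison is to compute each classifier's test accuracy from the predictions already in hand; a minimal sketch reusing the objects above:
# Test accuracy of each classifier on the 2009-2010 hold-out set
c(logistic = mean(pred.glm2 == Direction.2009_2010),
  lda = mean(pred.lda$class == Direction.2009_2010),
  qda = mean(pred.qda$class == Direction.2009_2010),
  knn1 = mean(pred.knn == Direction.2009_2010))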
# Logistic regression with the Lag2:Lag1 interaction
fit.glm3 <- glm(Direction ~ Lag2:Lag1, data = Weekly, family = binomial, subset = train)
probs3 <- predict(fit.glm3, Weekly.2009_2010, type = "response")
pred.glm3 <- rep("Down", length(probs3))
pred.glm3[probs3 > 0.5] <- "Up"
table(pred.glm3, Direction.2009_2010)
Direction.2009_2010
pred.glm3 Down Up
Down 1 1
Up 42 60
mean(pred.glm3 == Direction.2009_2010)
[1] 0.5865385
# LDA with the Lag2:Lag1 interaction
fit.lda2 <- lda(Direction ~ Lag2:Lag1, data = Weekly, subset = train)
pred.lda2 <- predict(fit.lda2, Weekly.2009_2010)
mean(pred.lda2$class == Direction.2009_2010)
[1] 0.5769231
# QDA with sqrt(abs(Lag2)) as an extra predictor
fit.qda2 <- qda(Direction ~ Lag2 + sqrt(abs(Lag2)), data = Weekly, subset = train)
pred.qda2 <- predict(fit.qda2, Weekly.2009_2010)
table(pred.qda2$class, Direction.2009_2010)
Direction.2009_2010
Down Up
Down 12 13
Up 31 48
mean(pred.qda2$class == Direction.2009_2010)
[1] 0.5769231
# KNN with k = 100
pred.knn2 <- knn(train.X, test.X, train.Direction, k = 100)
table(pred.knn2, Direction.2009_2010)
Direction.2009_2010
pred.knn2 Down Up
Down 9 12
Up 34 49
mean(pred.knn2 == Direction.2009_2010)
[1] 0.5576923
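The code that creates mpg01 does not appear in the transcript; a minimal reconstruction, assuming the usual median split from the exercise statement:
# Binary response: 1 if a car's mpg is above the median, 0 otherwise;
# append it to Auto and attach so columns are visible by name
mpg01 <- rep(0, length(Auto$mpg))
mpg01[Auto$mpg > median(Auto$mpg)] <- 1
Auto <- data.frame(Auto, mpg01)
attach(Auto)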
table(mpg01)
mpg01
0 1
196 196
cor(Auto[, -9])
                    mpg  cylinders displacement horsepower     weight acceleration
mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442    0.4233285
cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273   -0.5046834
displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944   -0.5438005
horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377   -0.6891955
weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000   -0.4168392
acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392    1.0000000
year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199    0.2903161
origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054    0.2127458
mpg01         0.8369392 -0.7591939   -0.7534766 -0.6670526 -0.7577566    0.3468215
                   year     origin      mpg01
mpg           0.5805410  0.5652088  0.8369392
cylinders    -0.3456474 -0.5689316 -0.7591939
displacement -0.3698552 -0.6145351 -0.7534766
horsepower   -0.4163615 -0.4551715 -0.6670526
weight       -0.3091199 -0.5850054 -0.7577566
acceleration  0.2903161  0.2127458  0.3468215
year          1.0000000  0.1815277  0.4299042
origin        0.1815277  1.0000000  0.5136984
mpg01         0.4299042  0.5136984  1.0000000
par(mfrow = c(2, 3))
boxplot(acceleration ~ mpg01, data = Auto, main = "acceleration ~ mpg01")
boxplot(weight ~ mpg01, data = Auto, main = "weight ~ mpg01")
boxplot(horsepower ~ mpg01, data = Auto, main = "horsepower ~ mpg01")
boxplot(displacement ~ mpg01, data = Auto, main = "displacement ~ mpg01")
boxplot(cylinders ~ mpg01, data = Auto, main = "cylinders ~ mpg01")
boxplot(year ~ mpg01, data = Auto, main = "year ~ mpg01")
The boxplots show an association between mpg01 and cylinders, weight, displacement, and horsepower.
train <- (year %% 2 == 0)
Auto.train <- Auto[train, ]
Auto.test <- Auto[!train, ]
mpg01.test <- mpg01[!train]
fit.lda <- lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto,
subset = train)
fit.lda
Call:
lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto,
subset = train)
Prior probabilities of groups:
0 1
0.4571429 0.5428571
Group means:
cylinders weight displacement horsepower
0 6.812500 3604.823 271.7396 133.14583
1 4.070175 2314.763 111.6623 77.92105
Coefficients of linear discriminants:
LD1
cylinders -0.6741402638
weight -0.0011465750
displacement 0.0004481325
horsepower 0.0059035377
pred.lda <- predict(fit.lda, Auto.test)
table(pred.lda$class, mpg01.test)
mpg01.test
0 1
0 86 9
1 14 73
(mean(pred.lda$class != mpg01.test)*100)
[1] 12.63736
The test error rate is 12.6%.
fit.qda <- qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto,
subset = train)
fit.qda
Call:
qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto,
subset = train)
Prior probabilities of groups:
0 1
0.4571429 0.5428571
Group means:
cylinders weight displacement horsepower
0 6.812500 3604.823 271.7396 133.14583
1 4.070175 2314.763 111.6623 77.92105
pred.qda <- predict(fit.qda, Auto.test)
table(pred.qda$class, mpg01.test)
mpg01.test
0 1
0 89 13
1 11 69
(mean(pred.qda$class != mpg01.test)*100)
[1] 13.18681
The test error rate is 13.2%.
fit.glm <- glm(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto, family = binomial, subset = train)
summary(fit.glm)
Call:
glm(formula = mpg01 ~ cylinders + weight + displacement + horsepower,
family = binomial, data = Auto, subset = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.48027 -0.03413 0.10583 0.29634 2.57584
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 17.658730 3.409012 5.180 2.22e-07 ***
cylinders -1.028032 0.653607 -1.573 0.1158
weight -0.002922 0.001137 -2.569 0.0102 *
displacement 0.002462 0.015030 0.164 0.8699
horsepower -0.050611 0.025209 -2.008 0.0447 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 289.58 on 209 degrees of freedom
Residual deviance: 83.24 on 205 degrees of freedom
AIC: 93.24
Number of Fisher Scoring iterations: 7
probs <- predict(fit.glm, Auto.test, type = "response")
pred.glm <- rep(0, length(probs))
pred.glm[probs > 0.5] <- 1
table(pred.glm, mpg01.test)
mpg01.test
pred.glm 0 1
0 89 11
1 11 71
(mean(pred.glm != mpg01.test)*100)
[1] 12.08791
The test error rate is 12.1%.
train.X <- cbind(cylinders, weight, displacement, horsepower)[train, ]
test.X <- cbind(cylinders, weight, displacement, horsepower)[!train, ]
train.mpg01 <- mpg01[train]
set.seed(1)
pred.knn <- knn(train.X, test.X, train.mpg01, k = 1)
table(pred.knn, mpg01.test)
mpg01.test
pred.knn 0 1
0 83 11
1 17 71
mean(pred.knn != mpg01.test)*100
[1] 15.38462
pred.knn <- knn(train.X, test.X, train.mpg01, k = 10)
table(pred.knn, mpg01.test)
mpg01.test
pred.knn 0 1
0 77 7
1 23 75
mean(pred.knn != mpg01.test)*100
[1] 16.48352
pred.knn <- knn(train.X, test.X, train.mpg01, k = 100)
table(pred.knn, mpg01.test)
mpg01.test
pred.knn 0 1
0 81 7
1 19 75
mean(pred.knn != mpg01.test)*100
[1] 14.28571
Of the three values tried, k = 100 gives the lowest test error rate (14.3%).
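Rather than trying k = 1, 10, and 100 by hand, one could scan a grid; a minimal sketch reusing train.X, test.X, train.mpg01, and mpg01.test from above:
# Test error rate (%) for each k in a small grid
set.seed(1)
ks <- c(1, 5, 10, 25, 50, 100)
errs <- sapply(ks, function(k) {
  pred <- knn(train.X, test.X, train.mpg01, k = k)
  mean(pred != mpg01.test) * 100
})
data.frame(k = ks, test.error.pct = errs)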
Power <- function() {
2^3
}
print(Power())
[1] 8
Power2 <- function(x, a) {
x^a
}
Power2(3, 8)
[1] 6561
Power2(10, 3)
[1] 1000
Power2(8, 17)
[1] 2.2518e+15
Power2(131, 3)
[1] 2248091
Power3 <- function(x , a) {
result <- x^a
return(result)
}
x <- 1:10
plot(x, Power3(x, 2), log = "xy", xlab = "Log of x", ylab = "Log of x^2", main = "Log of x^2 vs Log of x")
PlotPower <- function(x, a) {
plot(x, Power3(x, a))
}
PlotPower(1:10, 3)
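PlotPower could also forward extra graphical parameters to plot(), which makes it easy to reuse the log-scale idea from the Power3 plot; a minimal variant (an extension, not part of the original exercise):
# Variant that passes extra arguments (e.g. log = "xy") through to plot()
PlotPower <- function(x, a, ...) {
  plot(x, x^a, xlab = "x", ylab = paste0("x^", a), ...)
}
PlotPower(1:10, 3, log = "xy")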
library(MASS)
attach(Boston)
crim01 <- rep(0, length(crim))
crim01[crim > median(crim)] <- 1
Boston <- data.frame(Boston, crim01)
train <- 1:(length(crim) / 2)
test <- (length(crim) / 2 + 1):length(crim)
Boston.train <- Boston[train, ]
Boston.test <- Boston[test, ]
crim01.test <- crim01[test]
fit.glm <- glm(crim01 ~ . - crim01 - crim, data = Boston, family = binomial, subset = train)
Warning message:
glm.fit: fitted probabilities numerically 0 or 1 occurred
probs <- predict(fit.glm, Boston.test, type = "response")
pred.glm <- rep(0, length(probs))
pred.glm[probs > 0.5] <- 1
table(pred.glm, crim01.test)
crim01.test
pred.glm 0 1
0 68 24
1 22 139
mean(pred.glm != crim01.test)*100
[1] 18.18182
fit.glm <- glm(crim01 ~ . - crim01 - crim - chas - nox, data = Boston, family = binomial, subset = train)
Warning message:
glm.fit: fitted probabilities numerically 0 or 1 occurred
probs <- predict(fit.glm, Boston.test, type = "response")
pred.glm <- rep(0, length(probs))
pred.glm[probs > 0.5] <- 1
table(pred.glm, crim01.test)
crim01.test
pred.glm 0 1
0 78 28
1 12 135
mean(pred.glm != crim01.test)*100
[1] 15.81028
fit.lda <- lda(crim01 ~ . - crim01 - crim, data = Boston, subset = train)
pred.lda <- predict(fit.lda, Boston.test)
table(pred.lda$class, crim01.test)
crim01.test
0 1
0 80 24
1 10 139
mean(pred.lda$class != crim01.test)*100
[1] 13.43874
fit.lda <- lda(crim01 ~ . - crim01 - crim - chas - nox, data = Boston, subset = train)
pred.lda <- predict(fit.lda, Boston.test)
table(pred.lda$class, crim01.test)
crim01.test
0 1
0 82 30
1 8 133
mean(pred.lda$class != crim01.test)*100
[1] 15.01976
train.X <- cbind(zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, lstat, medv)[train, ]
test.X <- cbind(zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, lstat, medv)[test, ]
train.crim01 <- crim01[train]
set.seed(1)
pred.knn <- knn(train.X, test.X, train.crim01, k = 1)
table(pred.knn, crim01.test)
crim01.test
pred.knn 0 1
0 85 111
1 5 52
mean(pred.knn != crim01.test)*100
[1] 45.8498
pred.knn <- knn(train.X, test.X, train.crim01, k = 10)
table(pred.knn, crim01.test)
crim01.test
pred.knn 0 1
0 83 23
1 7 140
mean(pred.knn != crim01.test)*100
[1] 11.85771
pred.knn <- knn(train.X, test.X, train.crim01, k = 100)
table(pred.knn, crim01.test)
crim01.test
pred.knn 0 1
0 86 120
1 4 43
mean(pred.knn != crim01.test)*100
[1] 49.01186
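One caveat on the KNN fits above: the Boston predictors sit on very different scales (tax in the hundreds, nox below 1), and KNN is distance-based, so standardizing the predictors first is common practice. A minimal sketch (an assumption; the analysis above used the raw predictors):
# Standardize all predictors so no single variable dominates the
# Euclidean distance, then refit KNN with k = 10
X <- scale(cbind(zn, indus, chas, nox, rm, age, dis, rad, tax,
  ptratio, black, lstat, medv))
set.seed(1)
pred.knn.std <- knn(X[train, ], X[test, ], train.crim01, k = 10)
mean(pred.knn.std != crim01.test) * 100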