x <- c(1, 3, 2, 5)
x
## [1] 1 3 2 5
x = c(1, 6, 2)
x
## [1] 1 6 2
y = c(1, 4, 3)
length(x)
## [1] 3
length(y)
## [1] 3
x + y
## [1] 2 10 5
ls()
## [1] "x" "y"
rm(x, y)
ls()
## character(0)
rm(list = ls())
### ?matrix
x <- matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2)
x
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
x <- matrix(c(1, 2, 3, 4), 2, 2)
matrix(c(1, 2, 3, 4), 2, 2, byrow = TRUE)
## [,1] [,2]
## [1,] 1 2
## [2,] 3 4
sqrt(x)
## [,1] [,2]
## [1,] 1.000000 1.732051
## [2,] 1.414214 2.000000
x^2
## [,1] [,2]
## [1,] 1 9
## [2,] 4 16
x <- rnorm(50)
y <- x + rnorm(50, mean = 50, sd = .1)
cor(x, y)
## [1] 0.9953654
set.seed(1303)
rnorm(50)
## [1] -1.1439763145 1.3421293656 2.1853904757 0.5363925179 0.0631929665
## [6] 0.5022344825 -0.0004167247 0.5658198405 -0.5725226890 -1.1102250073
## [11] -0.0486871234 -0.6956562176 0.8289174803 0.2066528551 -0.2356745091
## [16] -0.5563104914 -0.3647543571 0.8623550343 -0.6307715354 0.3136021252
## [21] -0.9314953177 0.8238676185 0.5233707021 0.7069214120 0.4202043256
## [26] -0.2690521547 -1.5103172999 -0.6902124766 -0.1434719524 -1.0135274099
## [31] 1.5732737361 0.0127465055 0.8726470499 0.4220661905 -0.0188157917
## [36] 2.6157489689 -0.6931401748 -0.2663217810 -0.7206364412 1.3677342065
## [41] 0.2640073322 0.6321868074 -1.3306509858 0.0268888182 1.0406363208
## [46] 1.3120237985 -0.0300020767 -0.2500257125 0.0234144857 1.6598706557
set.seed(3)
y <- rnorm(100)
mean(y)
## [1] 0.01103557
var(y)
## [1] 0.7328675
sqrt(var(y))
## [1] 0.8560768
sd(y)
## [1] 0.8560768
x <- rnorm(100)
y <- rnorm(100)
plot(x, y)
plot(x, y, xlab = "this is the x-axis",
ylab = "this is the y-axis",
main = "Plot of X vs Y")
pdf("Figure.pdf")
plot(x, y, col = "green")
dev.off()
## png
## 2
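### Side note (illustrative, not part of the original transcript): other
### graphics devices follow the same open-plot-close pattern, e.g. jpeg().
jpeg("Figure.jpeg")         # open a JPEG device instead of a PDF one
plot(x, y, col = "green")   # the plot is written to the file, not the screen
dev.off()                   # close the device to finish writing the file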
x <- seq(1, 10)
x
## [1] 1 2 3 4 5 6 7 8 9 10
x <- 1:10
x
## [1] 1 2 3 4 5 6 7 8 9 10
x <- seq(-pi, pi, length = 50)
###
y <- x
f <- outer(x, y, function(x, y) cos(y) / (1 + x^2))
contour(x, y, f)
contour(x, y, f, nlevels = 45, add = T)
fa <- (f - t(f)) / 2
contour(x, y, fa, nlevels = 15)
###
image(x, y, fa)
persp(x, y, fa)
persp(x, y, fa, theta = 30)
persp(x, y, fa, theta = 30, phi = 20)
persp(x, y, fa, theta = 30, phi = 70)
persp(x, y, fa, theta = 30, phi = 40)
## Indexing Data
A <- matrix(1:16, 4, 4)
A
## [,1] [,2] [,3] [,4]
## [1,] 1 5 9 13
## [2,] 2 6 10 14
## [3,] 3 7 11 15
## [4,] 4 8 12 16
A[2, 3]
## [1] 10
A[c(1, 3), c(2, 4)]
## [,1] [,2]
## [1,] 5 13
## [2,] 7 15
A[1:3, 2:4]
## [,1] [,2] [,3]
## [1,] 5 9 13
## [2,] 6 10 14
## [3,] 7 11 15
A[1:2, ]
## [,1] [,2] [,3] [,4]
## [1,] 1 5 9 13
## [2,] 2 6 10 14
A[, 1:2]
## [,1] [,2]
## [1,] 1 5
## [2,] 2 6
## [3,] 3 7
## [4,] 4 8
A[1, ]
## [1] 1 5 9 13
A[-c(1, 3), ]
## [,1] [,2] [,3] [,4]
## [1,] 2 6 10 14
## [2,] 4 8 12 16
A[-c(1, 3), -c(1, 3, 4)]
## [1] 6 8
dim(A)
## [1] 4 4
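### Side note (illustrative sketch): logical vectors index a matrix in the
### same way as the integer and negative indices used above.
A[A[, 1] > 2, ]   # rows of A whose first-column entry exceeds 2
A[A > 8]          # all entries of A larger than 8, returned as a vector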
Auto <- read.table("D:/Masters/AST 533 Statistical Computing III/Data/Auto.data")
# View(Auto)
head(Auto)
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 mpg cylinders displacement horsepower weight acceleration year origin
## 2 18.0 8 307.0 130.0 3504. 12.0 70 1
## 3 15.0 8 350.0 165.0 3693. 11.5 70 1
## 4 18.0 8 318.0 150.0 3436. 11.0 70 1
## 5 16.0 8 304.0 150.0 3433. 12.0 70 1
## 6 17.0 8 302.0 140.0 3449. 10.5 70 1
## V9
## 1 name
## 2 chevrolet chevelle malibu
## 3 buick skylark 320
## 4 plymouth satellite
## 5 amc rebel sst
## 6 ford torino
Auto <- read.table("D:/Masters/AST 533 Statistical Computing III/Data/Auto.data", header = T, na.strings = "?", stringsAsFactors = T)
# View(Auto)
Auto <- read.csv("D:/Masters/AST 533 Statistical Computing III/Data/Auto.csv", na.strings = "?", stringsAsFactors = T)
# View(Auto)
dim(Auto)
## [1] 397 9
Auto[1:4, ]
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
Auto <- na.omit(Auto)
dim(Auto)
## [1] 392 9
names(Auto)
## [1] "mpg" "cylinders" "displacement" "horsepower" "weight"
## [6] "acceleration" "year" "origin" "name"
plot(cylinders, mpg)  # errors: 'cylinders' is not found until Auto is attached below
plot(Auto$cylinders, Auto$mpg)
attach(Auto)
plot(cylinders, mpg)
cylinders <- as.factor(cylinders)
plot(cylinders, mpg)
plot(cylinders, mpg, col = "red")
plot(cylinders, mpg, col = "red", varwidth = T)
plot(cylinders, mpg, col = "red", varwidth = T,
horizontal = T)
plot(cylinders, mpg, col = "red", varwidth = T,
xlab = "cylinders", ylab = "MPG")
hist(mpg)
hist(mpg, col = 2)
hist(mpg, col = 2, breaks = 15)
pairs(Auto)
pairs(~ mpg + displacement + horsepower + weight + acceleration,
data = Auto)
plot(horsepower, mpg)
identify(horsepower, mpg, name)
## integer(0)
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
summary(mpg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.00 17.00 22.75 23.45 29.00 46.60
## Libraries
library(MASS)
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.3
##
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
##
## Auto
## The following object is masked from 'package:MASS':
##
## Boston
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
#lm.fit <- lm(medv ~ lstat)
lm.fit <- lm(medv ~ lstat, data = Boston)
attach(Boston)
lm.fit <- lm(medv ~ lstat)
lm.fit
##
## Call:
## lm(formula = medv ~ lstat)
##
## Coefficients:
## (Intercept) lstat
## 34.55 -0.95
summary(lm.fit)
##
## Call:
## lm(formula = medv ~ lstat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## lstat -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
names(lm.fit)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
coef(lm.fit)
## (Intercept) lstat
## 34.5538409 -0.9500494
lm.fit$coefficients
## (Intercept) lstat
## 34.5538409 -0.9500494
# confidence interval
confint(lm.fit)
## 2.5 % 97.5 %
## (Intercept) 33.448457 35.6592247
## lstat -1.026148 -0.8739505
predict(lm.fit, data.frame(lstat = (c(5, 10, 15))),
interval = "confidence")
## fit lwr upr
## 1 29.80359 29.00741 30.59978
## 2 25.05335 24.47413 25.63256
## 3 20.30310 19.73159 20.87461
predict(object = lm.fit,newdata = data.frame(lstat=c(100,200,300)),interval = "confidence")
## fit lwr upr
## 1 -60.45109 -67.12023 -53.78196
## 2 -155.45603 -169.72325 -141.18881
## 3 -250.46097 -272.33447 -228.58746
predict(lm.fit, data.frame(lstat = (c(5, 10, 15))),
interval = "prediction")
## fit lwr upr
## 1 29.80359 17.565675 42.04151
## 2 25.05335 12.827626 37.27907
## 3 20.30310 8.077742 32.52846
plot(lstat, medv)
abline(lm.fit)
abline(lm.fit, lwd = 3)
abline(lm.fit, lwd = 3, col = "red")
plot(lstat, medv, col = "red")
plot(lstat, medv, pch = 20)
plot(lstat, medv, pch = "+")
plot(1:20, 1:20, pch = 1:20)
par(mfrow = c(2, 2))
plot(lm.fit)
head(predict(lm.fit) )
## 1 2 3 4 5 6
## 29.82260 25.87039 30.72514 31.76070 29.49008 29.60408
head(lm.fit$fitted.values) # same result
## 1 2 3 4 5 6
## 29.82260 25.87039 30.72514 31.76070 29.49008 29.60408
head(residuals(lm.fit))
## 1 2 3 4 5 6
## -5.8225951 -4.2703898 3.9748580 1.6393042 6.7099222 -0.9040837
head(lm.fit$residuals) # same result
## 1 2 3 4 5 6
## -5.8225951 -4.2703898 3.9748580 1.6393042 6.7099222 -0.9040837
plot(predict(lm.fit), residuals(lm.fit))
plot(predict(lm.fit), rstudent(lm.fit))
plot(hatvalues(lm.fit))
which.max(hatvalues(lm.fit))
## 375
## 375
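### Side note (a common rule of thumb, not part of the lab output): the
### average leverage is (p + 1)/n, and points with two to three times that
### value are often flagged as high-leverage observations.
mean(hatvalues(lm.fit))                               # equals (p + 1)/n here
sum(hatvalues(lm.fit) > 2 * mean(hatvalues(lm.fit)))  # count of flagged points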
lm.fit <- lm(medv ~ lstat + age, data = Boston)
summary(lm.fit)
##
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.981 -3.978 -1.283 1.968 23.158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.22276 0.73085 45.458 < 2e-16 ***
## lstat -1.03207 0.04819 -21.416 < 2e-16 ***
## age 0.03454 0.01223 2.826 0.00491 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared: 0.5513, Adjusted R-squared: 0.5495
## F-statistic: 309 on 2 and 503 DF, p-value: < 2.2e-16
lm.fit <- lm(medv ~ ., data = Boston)
summary(lm.fit)
##
## Call:
## lm(formula = medv ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1304 -2.7673 -0.5814 1.9414 26.2526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.617270 4.936039 8.431 3.79e-16 ***
## crim -0.121389 0.033000 -3.678 0.000261 ***
## zn 0.046963 0.013879 3.384 0.000772 ***
## indus 0.013468 0.062145 0.217 0.828520
## chas 2.839993 0.870007 3.264 0.001173 **
## nox -18.758022 3.851355 -4.870 1.50e-06 ***
## rm 3.658119 0.420246 8.705 < 2e-16 ***
## age 0.003611 0.013329 0.271 0.786595
## dis -1.490754 0.201623 -7.394 6.17e-13 ***
## rad 0.289405 0.066908 4.325 1.84e-05 ***
## tax -0.012682 0.003801 -3.337 0.000912 ***
## ptratio -0.937533 0.132206 -7.091 4.63e-12 ***
## lstat -0.552019 0.050659 -10.897 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.798 on 493 degrees of freedom
## Multiple R-squared: 0.7343, Adjusted R-squared: 0.7278
## F-statistic: 113.5 on 12 and 493 DF, p-value: < 2.2e-16
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.2
vif(lm.fit) #Variance Inflation Factors
## crim zn indus chas nox rm age dis
## 1.767486 2.298459 3.987181 1.071168 4.369093 1.912532 3.088232 3.954037
## rad tax ptratio lstat
## 7.445301 9.002158 1.797060 2.870777
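### Side note (a common heuristic): VIF values above roughly 5 or 10 are
### usually taken to signal problematic collinearity.
vifs <- vif(lm.fit)
vifs[vifs > 5]   # by the cutoff of 5, rad and tax are flagged here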
lm.fit1 <- lm(medv ~ . - age, data = Boston)
summary(lm.fit1)
##
## Call:
## lm(formula = medv ~ . - age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1851 -2.7330 -0.6116 1.8555 26.3838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.525128 4.919684 8.441 3.52e-16 ***
## crim -0.121426 0.032969 -3.683 0.000256 ***
## zn 0.046512 0.013766 3.379 0.000785 ***
## indus 0.013451 0.062086 0.217 0.828577
## chas 2.852773 0.867912 3.287 0.001085 **
## nox -18.485070 3.713714 -4.978 8.91e-07 ***
## rm 3.681070 0.411230 8.951 < 2e-16 ***
## dis -1.506777 0.192570 -7.825 3.12e-14 ***
## rad 0.287940 0.066627 4.322 1.87e-05 ***
## tax -0.012653 0.003796 -3.333 0.000923 ***
## ptratio -0.934649 0.131653 -7.099 4.39e-12 ***
## lstat -0.547409 0.047669 -11.483 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.794 on 494 degrees of freedom
## Multiple R-squared: 0.7343, Adjusted R-squared: 0.7284
## F-statistic: 124.1 on 11 and 494 DF, p-value: < 2.2e-16
lm.fit1 <- update(lm.fit, ~ . - age)
summary(lm.fit1)
##
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + nox + rm + dis +
## rad + tax + ptratio + lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1851 -2.7330 -0.6116 1.8555 26.3838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.525128 4.919684 8.441 3.52e-16 ***
## crim -0.121426 0.032969 -3.683 0.000256 ***
## zn 0.046512 0.013766 3.379 0.000785 ***
## indus 0.013451 0.062086 0.217 0.828577
## chas 2.852773 0.867912 3.287 0.001085 **
## nox -18.485070 3.713714 -4.978 8.91e-07 ***
## rm 3.681070 0.411230 8.951 < 2e-16 ***
## dis -1.506777 0.192570 -7.825 3.12e-14 ***
## rad 0.287940 0.066627 4.322 1.87e-05 ***
## tax -0.012653 0.003796 -3.333 0.000923 ***
## ptratio -0.934649 0.131653 -7.099 4.39e-12 ***
## lstat -0.547409 0.047669 -11.483 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.794 on 494 degrees of freedom
## Multiple R-squared: 0.7343, Adjusted R-squared: 0.7284
## F-statistic: 124.1 on 11 and 494 DF, p-value: < 2.2e-16
## Interaction Terms
summary(lm(medv ~ lstat * age, data = Boston))
##
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.806 -4.045 -1.333 2.085 27.552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.0885359 1.4698355 24.553 < 2e-16 ***
## lstat -1.3921168 0.1674555 -8.313 8.78e-16 ***
## age -0.0007209 0.0198792 -0.036 0.9711
## lstat:age 0.0041560 0.0018518 2.244 0.0252 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared: 0.5557, Adjusted R-squared: 0.5531
## F-statistic: 209.3 on 3 and 502 DF, p-value: < 2.2e-16
## Non-linear Transformations of the Predictors
lm.fit2 <- lm(medv ~ lstat + I(lstat^2))
summary(lm.fit2)
##
## Call:
## lm(formula = medv ~ lstat + I(lstat^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2834 -3.8313 -0.5295 2.3095 25.4148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.862007 0.872084 49.15 <2e-16 ***
## lstat -2.332821 0.123803 -18.84 <2e-16 ***
## I(lstat^2) 0.043547 0.003745 11.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared: 0.6407, Adjusted R-squared: 0.6393
## F-statistic: 448.5 on 2 and 503 DF, p-value: < 2.2e-16
lm.fit <- lm(medv ~ lstat)
anova(lm.fit, lm.fit2)
## Analysis of Variance Table
##
## Model 1: medv ~ lstat
## Model 2: medv ~ lstat + I(lstat^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 504 19472
## 2 503 15347 1 4125.1 135.2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2, 2))
plot(lm.fit2)
lm.fit5 <- lm(medv ~ poly(lstat, 5))
summary(lm.fit5)
##
## Call:
## lm(formula = medv ~ poly(lstat, 5))
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5433 -3.1039 -0.7052 2.0844 27.1153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.5328 0.2318 97.197 < 2e-16 ***
## poly(lstat, 5)1 -152.4595 5.2148 -29.236 < 2e-16 ***
## poly(lstat, 5)2 64.2272 5.2148 12.316 < 2e-16 ***
## poly(lstat, 5)3 -27.0511 5.2148 -5.187 3.10e-07 ***
## poly(lstat, 5)4 25.4517 5.2148 4.881 1.42e-06 ***
## poly(lstat, 5)5 -19.2524 5.2148 -3.692 0.000247 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.215 on 500 degrees of freedom
## Multiple R-squared: 0.6817, Adjusted R-squared: 0.6785
## F-statistic: 214.2 on 5 and 500 DF, p-value: < 2.2e-16
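### Side note (illustrative): poly() uses orthogonal polynomials by default,
### so its coefficients differ from those of I(lstat^2) above even though the
### fitted curve is the same. raw = TRUE reports coefficients of lstat,
### lstat^2, ... directly.
summary(lm(medv ~ poly(lstat, 2, raw = TRUE)))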
summary(lm(medv ~ log(rm), data = Boston))
##
## Call:
## lm(formula = medv ~ log(rm), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.487 -2.875 -0.104 2.837 39.816
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -76.488 5.028 -15.21 <2e-16 ***
## log(rm) 54.055 2.739 19.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.915 on 504 degrees of freedom
## Multiple R-squared: 0.4358, Adjusted R-squared: 0.4347
## F-statistic: 389.3 on 1 and 504 DF, p-value: < 2.2e-16
library(ISLR2)
## Qualitative Predictors
head(Carseats)
## Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
## 1 9.50 138 73 11 276 120 Bad 42 17
## 2 11.22 111 48 16 260 83 Good 65 10
## 3 10.06 113 35 10 269 80 Medium 59 12
## 4 7.40 117 100 4 466 97 Medium 55 14
## 5 4.15 141 64 3 340 128 Bad 38 13
## 6 10.81 124 113 13 501 72 Bad 78 16
## Urban US
## 1 Yes Yes
## 2 Yes Yes
## 3 Yes Yes
## 4 Yes Yes
## 5 Yes No
## 6 No Yes
lm.fit <- lm(Sales ~ . + Income:Advertising + Price:Age,
data = Carseats)
summary(lm.fit)
##
## Call:
## lm(formula = Sales ~ . + Income:Advertising + Price:Age, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9208 -0.7503 0.0177 0.6754 3.3413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5755654 1.0087470 6.519 2.22e-10 ***
## CompPrice 0.0929371 0.0041183 22.567 < 2e-16 ***
## Income 0.0108940 0.0026044 4.183 3.57e-05 ***
## Advertising 0.0702462 0.0226091 3.107 0.002030 **
## Population 0.0001592 0.0003679 0.433 0.665330
## Price -0.1008064 0.0074399 -13.549 < 2e-16 ***
## ShelveLocGood 4.8486762 0.1528378 31.724 < 2e-16 ***
## ShelveLocMedium 1.9532620 0.1257682 15.531 < 2e-16 ***
## Age -0.0579466 0.0159506 -3.633 0.000318 ***
## Education -0.0208525 0.0196131 -1.063 0.288361
## UrbanYes 0.1401597 0.1124019 1.247 0.213171
## USYes -0.1575571 0.1489234 -1.058 0.290729
## Income:Advertising 0.0007510 0.0002784 2.698 0.007290 **
## Price:Age 0.0001068 0.0001333 0.801 0.423812
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.011 on 386 degrees of freedom
## Multiple R-squared: 0.8761, Adjusted R-squared: 0.8719
## F-statistic: 210 on 13 and 386 DF, p-value: < 2.2e-16
attach(Carseats)
contrasts(ShelveLoc)
## Good Medium
## Bad 0 0
## Good 1 0
## Medium 0 1
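### Side note (illustrative sketch): the baseline level of a factor can be
### changed with relevel(), which alters the dummy coding shown above.
ShelveLoc2 <- relevel(ShelveLoc, ref = "Medium")   # make "Medium" the baseline
contrasts(ShelveLoc2)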
## Writing Functions
#LoadLibraries
#LoadLibraries()
LoadLibraries <- function() {
library(ISLR2)
library(MASS)
print("The libraries have been loaded.")
}
LoadLibraries
## function ()
## {
## library(ISLR2)
## library(MASS)
## print("The libraries have been loaded.")
## }
LoadLibraries()
## [1] "The libraries have been loaded."
library(ISLR2)
names(Smarket)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
dim(Smarket)
## [1] 1250 9
summary(Smarket)
## Year Lag1 Lag2 Lag3
## Min. :2001 Min. :-4.922000 Min. :-4.922000 Min. :-4.922000
## 1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500 1st Qu.:-0.640000
## Median :2003 Median : 0.039000 Median : 0.039000 Median : 0.038500
## Mean :2003 Mean : 0.003834 Mean : 0.003919 Mean : 0.001716
## 3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.596750
## Max. :2005 Max. : 5.733000 Max. : 5.733000 Max. : 5.733000
## Lag4 Lag5 Volume Today
## Min. :-4.922000 Min. :-4.92200 Min. :0.3561 Min. :-4.922000
## 1st Qu.:-0.640000 1st Qu.:-0.64000 1st Qu.:1.2574 1st Qu.:-0.639500
## Median : 0.038500 Median : 0.03850 Median :1.4229 Median : 0.038500
## Mean : 0.001636 Mean : 0.00561 Mean :1.4783 Mean : 0.003138
## 3rd Qu.: 0.596750 3rd Qu.: 0.59700 3rd Qu.:1.6417 3rd Qu.: 0.596750
## Max. : 5.733000 Max. : 5.73300 Max. :3.1525 Max. : 5.733000
## Direction
## Down:602
## Up :648
##
##
##
##
pairs(Smarket)
str(Smarket)
## 'data.frame': 1250 obs. of 9 variables:
## $ Year : num 2001 2001 2001 2001 2001 ...
## $ Lag1 : num 0.381 0.959 1.032 -0.623 0.614 ...
## $ Lag2 : num -0.192 0.381 0.959 1.032 -0.623 ...
## $ Lag3 : num -2.624 -0.192 0.381 0.959 1.032 ...
## $ Lag4 : num -1.055 -2.624 -0.192 0.381 0.959 ...
## $ Lag5 : num 5.01 -1.055 -2.624 -0.192 0.381 ...
## $ Volume : num 1.19 1.3 1.41 1.28 1.21 ...
## $ Today : num 0.959 1.032 -0.623 0.614 0.213 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
# cor(Smarket) #Error in cor(Smarket) : 'x' must be numeric
cor(Smarket[, -9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 0.029699649 0.030596422 0.033194581 0.035688718
## Lag1 0.02969965 1.000000000 -0.026294328 -0.010803402 -0.002985911
## Lag2 0.03059642 -0.026294328 1.000000000 -0.025896670 -0.010853533
## Lag3 0.03319458 -0.010803402 -0.025896670 1.000000000 -0.024051036
## Lag4 0.03568872 -0.002985911 -0.010853533 -0.024051036 1.000000000
## Lag5 0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## Volume 0.53900647 0.040909908 -0.043383215 -0.041823686 -0.048414246
## Today 0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
## Lag5 Volume Today
## Year 0.029787995 0.53900647 0.030095229
## Lag1 -0.005674606 0.04090991 -0.026155045
## Lag2 -0.003557949 -0.04338321 -0.010250033
## Lag3 -0.018808338 -0.04182369 -0.002447647
## Lag4 -0.027083641 -0.04841425 -0.006899527
## Lag5 1.000000000 -0.02200231 -0.034860083
## Volume -0.022002315 1.00000000 0.014591823
## Today -0.034860083 0.01459182 1.000000000
attach(Smarket)
plot(Volume)
plot(Smarket$Volume)
glm.fits <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Smarket, family = binomial)
summary(glm.fits)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Smarket)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000 0.240736 -0.523 0.601
## Lag1 -0.073074 0.050167 -1.457 0.145
## Lag2 -0.042301 0.050086 -0.845 0.398
## Lag3 0.011085 0.049939 0.222 0.824
## Lag4 0.009359 0.049974 0.187 0.851
## Lag5 0.010313 0.049511 0.208 0.835
## Volume 0.135441 0.158360 0.855 0.392
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1727.6 on 1243 degrees of freedom
## AIC: 1741.6
##
## Number of Fisher Scoring iterations: 3
coef(glm.fits)
## (Intercept) Lag1 Lag2 Lag3 Lag4 Lag5
## -0.126000257 -0.073073746 -0.042301344 0.011085108 0.009358938 0.010313068
## Volume
## 0.135440659
summary(glm.fits)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000257 0.24073574 -0.5233966 0.6006983
## Lag1 -0.073073746 0.05016739 -1.4565986 0.1452272
## Lag2 -0.042301344 0.05008605 -0.8445733 0.3983491
## Lag3 0.011085108 0.04993854 0.2219750 0.8243333
## Lag4 0.009358938 0.04997413 0.1872757 0.8514445
## Lag5 0.010313068 0.04951146 0.2082966 0.8349974
## Volume 0.135440659 0.15835970 0.8552723 0.3924004
summary(glm.fits)$coef[, 4] ## p-value
## (Intercept) Lag1 Lag2 Lag3 Lag4 Lag5
## 0.6006983 0.1452272 0.3983491 0.8243333 0.8514445 0.8349974
## Volume
## 0.3924004
glm.probs <- predict(glm.fits, type = "response")
glm.probs[1:10]
## 1 2 3 4 5 6 7 8
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565 0.4926509 0.5092292
## 9 10
## 0.5176135 0.4888378
contrasts(Direction)
## Up
## Down 0
## Up 1
glm.pred <- rep("Down", 1250)
glm.pred[glm.probs > .5] = "Up"
table(glm.pred, Direction)
## Direction
## glm.pred Down Up
## Down 145 141
## Up 457 507
(507 + 145) / 1250 ## accuracy
## [1] 0.5216
mean(glm.pred == Direction)
## [1] 0.5216
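### Note (added remark): 52.16% is the *training* accuracy, which is
### optimistic; the held-out 2005 observations used below give a more honest
### estimate of the test error.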
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Smarket)
## Setting levels: control = Down, case = Up
## Setting direction: controls > cases
## Setting levels: control = Down, case = Up
## Setting direction: controls > cases
## Setting levels: control = Down, case = Up
## Setting direction: controls < cases
## Setting levels: control = Down, case = Up
## Setting direction: controls > cases
## Setting levels: control = Down, case = Up
## Setting direction: controls < cases
## Setting levels: control = Down, case = Up
## Setting direction: controls < cases
## $Lag1
##
## Call:
## roc.formula(formula = Direction ~ Lag1, data = Smarket)
##
## Data: Lag1 in 602 controls (Direction Down) > 648 cases (Direction Up).
## Area under the curve: 0.5348
##
## $Lag2
##
## Call:
## roc.formula(formula = Direction ~ Lag2, data = Smarket)
##
## Data: Lag2 in 602 controls (Direction Down) > 648 cases (Direction Up).
## Area under the curve: 0.5243
##
## $Lag3
##
## Call:
## roc.formula(formula = Direction ~ Lag3, data = Smarket)
##
## Data: Lag3 in 602 controls (Direction Down) < 648 cases (Direction Up).
## Area under the curve: 0.4992
##
## $Lag4
##
## Call:
## roc.formula(formula = Direction ~ Lag4, data = Smarket)
##
## Data: Lag4 in 602 controls (Direction Down) > 648 cases (Direction Up).
## Area under the curve: 0.4997
##
## $Lag5
##
## Call:
## roc.formula(formula = Direction ~ Lag5, data = Smarket)
##
## Data: Lag5 in 602 controls (Direction Down) < 648 cases (Direction Up).
## Area under the curve: 0.5053
##
## $Volume
##
## Call:
## roc.formula(formula = Direction ~ Volume, data = Smarket)
##
## Data: Volume in 602 controls (Direction Down) < 648 cases (Direction Up).
## Area under the curve: 0.5177
A <- roc(Direction, glm.probs)
## Setting levels: control = Down, case = Up
## Setting direction: controls < cases
plot(roc(Direction, glm.probs))
## Setting levels: control = Down, case = Up
## Setting direction: controls < cases
plot(A)  # same ROC curve as above, drawn from the stored object
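### Side note (illustrative): the area under the fitted ROC curve can be read
### off the pROC object directly.
auc(A)    # pROC::auc() extracts the AUC from a roc object
A$auc     # the same value, stored inside the object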
glm_fit <- glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, family = "binomial", data = Smarket)
summary(glm_fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Smarket)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000 0.240736 -0.523 0.601
## Lag1 -0.073074 0.050167 -1.457 0.145
## Lag2 -0.042301 0.050086 -0.845 0.398
## Lag3 0.011085 0.049939 0.222 0.824
## Lag4 0.009359 0.049974 0.187 0.851
## Lag5 0.010313 0.049511 0.208 0.835
## Volume 0.135441 0.158360 0.855 0.392
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1727.6 on 1243 degrees of freedom
## AIC: 1741.6
##
## Number of Fisher Scoring iterations: 3
glm_pro <- predict(object = glm_fit, type = "response")  # "res" also works via partial matching, but is unclear
contrasts(Direction)
## Up
## Down 0
## Up 1
glm_pred <- rep("Down", 1250)
glm_pred[glm_pro > 0.5] <- "Up"
table(glm_pred, Direction)
## Direction
## glm_pred Down Up
## Down 145 141
## Up 457 507
train <- (Year < 2005)
Smarket.2005 <- Smarket[!train, ]
dim(Smarket.2005)
## [1] 252 9
Direction.2005 <- Direction[!train]
glm.fits <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Smarket, family = binomial, subset = train)
glm.probs <- predict(glm.fits, Smarket.2005,
type = "response")
glm.pred <- rep("Down", 252)
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 77 97
## Up 34 44
mean(glm.pred == Direction.2005)
## [1] 0.4801587
mean(glm.pred != Direction.2005)
## [1] 0.5198413
tr <- Smarket[Year < 2005, ]
test <- Smarket[!(Year < 2005), ]
gl.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
              data = tr, family = binomial)
prob <- predict(gl.fit, test, type = "response")
dim(test)
## [1] 252 9
pre <- rep("Down", 252)
pre[prob > 0.5] <- "Up"
table(pre, test$Direction)
##
## pre Down Up
## Down 77 97
## Up 34 44
glm.fits <- glm(Direction ~ Lag1 + Lag2, data = Smarket,
family = binomial, subset = train)
glm.probs <- predict(glm.fits, Smarket.2005,
type = "response")
glm.pred <- rep("Down", 252)
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 35 35
## Up 76 106
mean(glm.pred == Direction.2005)
## [1] 0.5595238
106 / (106 + 76)
## [1] 0.5824176
predict(glm.fits, newdata = data.frame(Lag1 = c(1.2, 1.5), Lag2 = c(1.1, -0.8)), type = "response")
## 1 2
## 0.4791462 0.4960939
tr <- Smarket[Year < 2005, ]
test <- Smarket[!(Year < 2005), ]
gl.fit <- glm(Direction ~ Lag1 + Lag2,
              data = tr, family = binomial)
prob <- predict(gl.fit, test, type = "response")
dim(test)
## [1] 252 9
pre <- rep("Down", 252)
pre[prob > 0.5] <- "Up"
table(pre, test$Direction)
##
## pre Down Up
## Down 35 35
## Up 76 106
mean(pre == test$Direction)
## [1] 0.5595238
library(MASS)
lda.fit <- lda(Direction ~ Lag1 + Lag2, data = Smarket,
subset = train)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
plot(lda.fit)
lda.pred <- predict(lda.fit, Smarket.2005)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class <- lda.pred$class
table(lda.class, Direction.2005)
## Direction.2005
## lda.class Down Up
## Down 35 35
## Up 76 106
mean(lda.class == Direction.2005)
## [1] 0.5595238
sum(lda.pred$posterior[, 1] >= .5)
## [1] 70
sum(lda.pred$posterior[, 1] < .5)
## [1] 182
lda.pred$posterior[1:20, 1]
## 999 1000 1001 1002 1003 1004 1005 1006
## 0.4901792 0.4792185 0.4668185 0.4740011 0.4927877 0.4938562 0.4951016 0.4872861
## 1007 1008 1009 1010 1011 1012 1013 1014
## 0.4907013 0.4844026 0.4906963 0.5119988 0.4895152 0.4706761 0.4744593 0.4799583
## 1015 1016 1017 1018
## 0.4935775 0.5030894 0.4978806 0.4886331
lda.class[1:20]
## [1] Up Up Up Up Up Up Up Up Up Up Up Down Up Up Up
## [16] Up Up Down Up Up
## Levels: Down Up
sum(lda.pred$posterior[, 1] > .9)
## [1] 0
tr <- Smarket[Year < 2005, ]
test <- Smarket[!(Year < 2005), ]
lda_fit <- lda(Direction ~ Lag1 + Lag2, data = tr)
pre <- predict(lda_fit, test)$class  # predict.lda ignores a 'type' argument
table(pre, test$Direction)
##
## pre Down Up
## Down 35 35
## Up 76 106
mean(pre == test$Direction)
## [1] 0.5595238
qda.fit <- qda(Direction ~ Lag1 + Lag2, data = Smarket,
subset = train)
qda.fit
## Call:
## qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
qda.class <- predict(qda.fit, Smarket.2005)$class
table(qda.class, Direction.2005)
## Direction.2005
## qda.class Down Up
## Down 30 20
## Up 81 121
mean(qda.class == Direction.2005)
## [1] 0.5992063
tr <- Smarket[Year < 2005, ]
test <- Smarket[!(Year < 2005), ]
qda_fit <- qda(Direction ~ Lag1 + Lag2, data = tr)
pre <- predict(qda_fit, test)$class  # predict.qda likewise ignores 'type'
table(pre, test$Direction)
##
## pre Down Up
## Down 30 20
## Up 81 121
mean(pre == test$Direction)
## [1] 0.5992063
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
nb.fit <- naiveBayes(Direction ~ Lag1 + Lag2, data = Smarket,
subset = train)
nb.fit
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.491984 0.508016
##
## Conditional probabilities:
## Lag1
## Y [,1] [,2]
## Down 0.04279022 1.227446
## Up -0.03954635 1.231668
##
## Lag2
## Y [,1] [,2]
## Down 0.03389409 1.239191
## Up -0.03132544 1.220765
nb.class <- predict(nb.fit, Smarket.2005)
table(nb.class, Direction.2005)
## Direction.2005
## nb.class Down Up
## Down 28 20
## Up 83 121
mean(nb.class == Direction.2005)
## [1] 0.5912698
nb.preds <- predict(nb.fit, Smarket.2005, type = "raw")
nb.preds[1:5, ]
## Down Up
## [1,] 0.4873164 0.5126836
## [2,] 0.4762492 0.5237508
## [3,] 0.4653377 0.5346623
## [4,] 0.4748652 0.5251348
## [5,] 0.4901890 0.5098110
mean(Lag1[train][Direction[train] == "Down"])
## [1] 0.04279022
sd(Lag1[train][Direction[train] == "Down"])
## [1] 1.227446
## 2nd way
mean(tr$Lag1[tr$Direction == "Down"])
## [1] 0.04279022
sd(tr$Lag1[tr$Direction == "Down"])
## [1] 1.227446
tr <- Smarket[Year < 2005, ]
test <- Smarket[!(Year < 2005), ]
naiveBayes_fit <- naiveBayes(Direction ~ Lag1 + Lag2, data = tr)
prob <- predict(naiveBayes_fit, test, type = "raw")
pre <- ifelse(prob[, 1] < 0.5, "Up", "Down")  # prob[, 1] is P(Down), so < 0.5 means predict "Up"
table(pre, test$Direction)
##
## pre Down Up
## Down 28 20
## Up 83 121
mean(pre == test$Direction)
## [1] 0.5912698
library(class)
train.X <- cbind(Lag1, Lag2)[train, ]
test.X <- cbind(Lag1, Lag2)[!train, ]
train.Direction <- Direction[train]
set.seed(1)
knn.pred <- knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.2005)
## Direction.2005
## knn.pred Down Up
## Down 43 58
## Up 68 83
(83 + 43) / 252
## [1] 0.5
knn.pred <- knn(train.X, test.X, train.Direction, k = 3)
table(knn.pred, Direction.2005)
## Direction.2005
## knn.pred Down Up
## Down 48 54
## Up 63 87
mean(knn.pred == Direction.2005)
## [1] 0.5357143
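### Side note (illustrative sketch): the choice of k can be screened with a
### simple loop over candidate values, refitting knn() each time.
set.seed(1)
acc <- sapply(1:5, function(k) {
  mean(knn(train.X, test.X, train.Direction, k = k) == Direction.2005)
})
acc   # test-set accuracy for k = 1, ..., 5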
library(class)
train<- Smarket[Year<2005,]
test<- Smarket[!Year<2005,]
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
train1 <- train %>%
  select(Lag1, Lag2)
test1 <- test %>%
  select(Lag1, Lag2)
class <- train$Direction  # naming this 'class' masks base::class(), harmless here
knn.pred <- knn(train = train1, test = test1, cl = class, k = 1)
table(knn.pred, test$Direction)
##
## knn.pred Down Up
## Down 43 58
## Up 68 83
mean(knn.pred == test$Direction)
## [1] 0.5
set.seed(1)
knn.pred3 <- knn(train = train1, test = test1, cl = class, k = 3)
table(knn.pred3, test$Direction)
##
## knn.pred3 Down Up
## Down 48 55
## Up 63 86
mean(knn.pred3 == test$Direction)
## [1] 0.531746
dim(Caravan)
## [1] 5822 86
attach(Caravan)
summary(Purchase)
## No Yes
## 5474 348
348 / 5822
## [1] 0.05977327
standardized.X <- scale(Caravan[, -86])
var(Caravan[, 1])
## [1] 165.0378
var(Caravan[, 2])
## [1] 0.1647078
var(standardized.X[, 1])
## [1] 1
var(standardized.X[, 2])
## [1] 1
test <- 1:1000
train.X <- standardized.X[-test, ]
test.X <- standardized.X[test, ]
train.Y <- Purchase[-test]
test.Y <- Purchase[test]
set.seed(1)
knn.pred <- knn(train.X, test.X, train.Y, k = 1)
mean(test.Y == knn.pred)
## [1] 0.882
mean(test.Y != knn.pred)
## [1] 0.118
mean(test.Y != "No")
## [1] 0.059
table(knn.pred, test.Y)
## test.Y
## knn.pred No Yes
## No 873 50
## Yes 68 9
9 / (68 + 9)
## [1] 0.1168831
knn.pred <- knn(train.X, test.X, train.Y, k = 3)
table(knn.pred, test.Y)
## test.Y
## knn.pred No Yes
## No 920 54
## Yes 21 5
5 / 26
## [1] 0.1923077
knn.pred <- knn(train.X, test.X, train.Y, k = 5)
table(knn.pred, test.Y)
## test.Y
## knn.pred No Yes
## No 930 55
## Yes 11 4
4 / 15
## [1] 0.2666667
glm.fits <- glm(Purchase ~ ., data = Caravan,
family = binomial, subset = -test)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs <- predict(glm.fits, Caravan[test, ],type = "response")
glm.pred <- rep("No", 1000)
glm.pred[glm.probs > .5] <- "Yes"
table(glm.pred, test.Y)
## test.Y
## glm.pred No Yes
## No 934 59
## Yes 7 0
glm.pred <- rep("No", 1000)
glm.pred[glm.probs > .25] <- "Yes"
table(glm.pred, test.Y)
## test.Y
## glm.pred No Yes
## No 919 48
## Yes 22 11
11 / (22 + 11)
## [1] 0.3333333
attach(Bikeshare)
dim(Bikeshare)
## [1] 8645 15
names(Bikeshare)
## [1] "season" "mnth" "day" "hr" "holiday"
## [6] "weekday" "workingday" "weathersit" "temp" "atemp"
## [11] "hum" "windspeed" "casual" "registered" "bikers"
mod.lm <- lm(
bikers ~ mnth + hr + workingday + temp + weathersit,
data = Bikeshare
)
summary(mod.lm)
##
## Call:
## lm(formula = bikers ~ mnth + hr + workingday + temp + weathersit,
## data = Bikeshare)
##
## Residuals:
## Min 1Q Median 3Q Max
## -299.00 -45.70 -6.23 41.08 425.29
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -68.632 5.307 -12.932 < 2e-16 ***
## mnthFeb 6.845 4.287 1.597 0.110398
## mnthMarch 16.551 4.301 3.848 0.000120 ***
## mnthApril 41.425 4.972 8.331 < 2e-16 ***
## mnthMay 72.557 5.641 12.862 < 2e-16 ***
## mnthJune 67.819 6.544 10.364 < 2e-16 ***
## mnthJuly 45.324 7.081 6.401 1.63e-10 ***
## mnthAug 53.243 6.640 8.019 1.21e-15 ***
## mnthSept 66.678 5.925 11.254 < 2e-16 ***
## mnthOct 75.834 4.950 15.319 < 2e-16 ***
## mnthNov 60.310 4.610 13.083 < 2e-16 ***
## mnthDec 46.458 4.271 10.878 < 2e-16 ***
## hr1 -14.579 5.699 -2.558 0.010536 *
## hr2 -21.579 5.733 -3.764 0.000168 ***
## hr3 -31.141 5.778 -5.389 7.26e-08 ***
## hr4 -36.908 5.802 -6.361 2.11e-10 ***
## hr5 -24.135 5.737 -4.207 2.61e-05 ***
## hr6 20.600 5.704 3.612 0.000306 ***
## hr7 120.093 5.693 21.095 < 2e-16 ***
## hr8 223.662 5.690 39.310 < 2e-16 ***
## hr9 120.582 5.693 21.182 < 2e-16 ***
## hr10 83.801 5.705 14.689 < 2e-16 ***
## hr11 105.423 5.722 18.424 < 2e-16 ***
## hr12 137.284 5.740 23.916 < 2e-16 ***
## hr13 136.036 5.760 23.617 < 2e-16 ***
## hr14 126.636 5.776 21.923 < 2e-16 ***
## hr15 132.087 5.780 22.852 < 2e-16 ***
## hr16 178.521 5.772 30.927 < 2e-16 ***
## hr17 296.267 5.749 51.537 < 2e-16 ***
## hr18 269.441 5.736 46.976 < 2e-16 ***
## hr19 186.256 5.714 32.596 < 2e-16 ***
## hr20 125.549 5.704 22.012 < 2e-16 ***
## hr21 87.554 5.693 15.378 < 2e-16 ***
## hr22 59.123 5.689 10.392 < 2e-16 ***
## hr23 26.838 5.688 4.719 2.41e-06 ***
## workingday 1.270 1.784 0.711 0.476810
## temp 157.209 10.261 15.321 < 2e-16 ***
## weathersitcloudy/misty -12.890 1.964 -6.562 5.60e-11 ***
## weathersitlight rain/snow -66.494 2.965 -22.425 < 2e-16 ***
## weathersitheavy rain/snow -109.745 76.667 -1.431 0.152341
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 76.5 on 8605 degrees of freedom
## Multiple R-squared: 0.6745, Adjusted R-squared: 0.6731
## F-statistic: 457.3 on 39 and 8605 DF, p-value: < 2.2e-16
contrasts(Bikeshare$hr) = contr.sum(24)
contrasts(Bikeshare$mnth) = contr.sum(12)
mod.lm2 <- lm(
bikers ~ mnth + hr + workingday + temp + weathersit,
data = Bikeshare
)
summary(mod.lm2)
##
## Call:
## lm(formula = bikers ~ mnth + hr + workingday + temp + weathersit,
## data = Bikeshare)
##
## Residuals:
## Min 1Q Median 3Q Max
## -299.00 -45.70 -6.23 41.08 425.29
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.5974 5.1322 14.340 < 2e-16 ***
## mnth1 -46.0871 4.0855 -11.281 < 2e-16 ***
## mnth2 -39.2419 3.5391 -11.088 < 2e-16 ***
## mnth3 -29.5357 3.1552 -9.361 < 2e-16 ***
## mnth4 -4.6622 2.7406 -1.701 0.08895 .
## mnth5 26.4700 2.8508 9.285 < 2e-16 ***
## mnth6 21.7317 3.4651 6.272 3.75e-10 ***
## mnth7 -0.7626 3.9084 -0.195 0.84530
## mnth8 7.1560 3.5347 2.024 0.04295 *
## mnth9 20.5912 3.0456 6.761 1.46e-11 ***
## mnth10 29.7472 2.6995 11.019 < 2e-16 ***
## mnth11 14.2229 2.8604 4.972 6.74e-07 ***
## hr1 -96.1420 3.9554 -24.307 < 2e-16 ***
## hr2 -110.7213 3.9662 -27.916 < 2e-16 ***
## hr3 -117.7212 4.0165 -29.310 < 2e-16 ***
## hr4 -127.2828 4.0808 -31.191 < 2e-16 ***
## hr5 -133.0495 4.1168 -32.319 < 2e-16 ***
## hr6 -120.2775 4.0370 -29.794 < 2e-16 ***
## hr7 -75.5424 3.9916 -18.925 < 2e-16 ***
## hr8 23.9511 3.9686 6.035 1.65e-09 ***
## hr9 127.5199 3.9500 32.284 < 2e-16 ***
## hr10 24.4399 3.9360 6.209 5.57e-10 ***
## hr11 -12.3407 3.9361 -3.135 0.00172 **
## hr12 9.2814 3.9447 2.353 0.01865 *
## hr13 41.1417 3.9571 10.397 < 2e-16 ***
## hr14 39.8939 3.9750 10.036 < 2e-16 ***
## hr15 30.4940 3.9910 7.641 2.39e-14 ***
## hr16 35.9445 3.9949 8.998 < 2e-16 ***
## hr17 82.3786 3.9883 20.655 < 2e-16 ***
## hr18 200.1249 3.9638 50.488 < 2e-16 ***
## hr19 173.2989 3.9561 43.806 < 2e-16 ***
## hr20 90.1138 3.9400 22.872 < 2e-16 ***
## hr21 29.4071 3.9362 7.471 8.74e-14 ***
## hr22 -8.5883 3.9332 -2.184 0.02902 *
## hr23 -37.0194 3.9344 -9.409 < 2e-16 ***
## workingday 1.2696 1.7845 0.711 0.47681
## temp 157.2094 10.2612 15.321 < 2e-16 ***
## weathersitcloudy/misty -12.8903 1.9643 -6.562 5.60e-11 ***
## weathersitlight rain/snow -66.4944 2.9652 -22.425 < 2e-16 ***
## weathersitheavy rain/snow -109.7446 76.6674 -1.431 0.15234
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 76.5 on 8605 degrees of freedom
## Multiple R-squared: 0.6745, Adjusted R-squared: 0.6731
## F-statistic: 457.3 on 39 and 8605 DF, p-value: < 2.2e-16
sum((predict(mod.lm) - predict(mod.lm2))^2)
## [1] 1.586608e-18
all.equal(predict(mod.lm), predict(mod.lm2))
## [1] TRUE
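### With sum-to-zero (contr.sum) coding the level coefficients add to zero, so
### the omitted twelfth month and twenty-fourth hour are recovered below as
### minus the sum of the reported coefficients.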
coef.months <- c(coef(mod.lm2)[2:12],
-sum(coef(mod.lm2)[2:12]))
plot(coef.months, xlab = "Month", ylab = "Coefficient",
xaxt = "n", col = "blue", pch = 19, type = "o")
axis(side = 1, at = 1:12, labels = c("J", "F", "M", "A",
"M", "J", "J", "A", "S", "O", "N", "D"))
coef.hours <- c(coef(mod.lm2)[13:35],
-sum(coef(mod.lm2)[13:35]))
plot(coef.hours, xlab = "Hour", ylab = "Coefficient",
col = "blue", pch = 19, type = "o")
plot(coef.months, type = "o", xaxt = "n")
axis(side = 1, at = 1:12, labels = c("J", "F", "M", "A", "M", "J", "J", "A", "S", "O", "N", "D"))
mod.pois <- glm(
bikers ~ mnth + hr + workingday + temp + weathersit,
data = Bikeshare, family = poisson
)
summary(mod.pois)
##
## Call:
## glm(formula = bikers ~ mnth + hr + workingday + temp + weathersit,
## family = poisson, data = Bikeshare)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.118245 0.006021 683.964 < 2e-16 ***
## mnth1 -0.670170 0.005907 -113.445 < 2e-16 ***
## mnth2 -0.444124 0.004860 -91.379 < 2e-16 ***
## mnth3 -0.293733 0.004144 -70.886 < 2e-16 ***
## mnth4 0.021523 0.003125 6.888 5.66e-12 ***
## mnth5 0.240471 0.002916 82.462 < 2e-16 ***
## mnth6 0.223235 0.003554 62.818 < 2e-16 ***
## mnth7 0.103617 0.004125 25.121 < 2e-16 ***
## mnth8 0.151171 0.003662 41.281 < 2e-16 ***
## mnth9 0.233493 0.003102 75.281 < 2e-16 ***
## mnth10 0.267573 0.002785 96.091 < 2e-16 ***
## mnth11 0.150264 0.003180 47.248 < 2e-16 ***
## hr1 -0.754386 0.007879 -95.744 < 2e-16 ***
## hr2 -1.225979 0.009953 -123.173 < 2e-16 ***
## hr3 -1.563147 0.011869 -131.702 < 2e-16 ***
## hr4 -2.198304 0.016424 -133.846 < 2e-16 ***
## hr5 -2.830484 0.022538 -125.586 < 2e-16 ***
## hr6 -1.814657 0.013464 -134.775 < 2e-16 ***
## hr7 -0.429888 0.006896 -62.341 < 2e-16 ***
## hr8 0.575181 0.004406 130.544 < 2e-16 ***
## hr9 1.076927 0.003563 302.220 < 2e-16 ***
## hr10 0.581769 0.004286 135.727 < 2e-16 ***
## hr11 0.336852 0.004720 71.372 < 2e-16 ***
## hr12 0.494121 0.004392 112.494 < 2e-16 ***
## hr13 0.679642 0.004069 167.040 < 2e-16 ***
## hr14 0.673565 0.004089 164.722 < 2e-16 ***
## hr15 0.624910 0.004178 149.570 < 2e-16 ***
## hr16 0.653763 0.004132 158.205 < 2e-16 ***
## hr17 0.874301 0.003784 231.040 < 2e-16 ***
## hr18 1.294635 0.003254 397.848 < 2e-16 ***
## hr19 1.212281 0.003321 365.084 < 2e-16 ***
## hr20 0.914022 0.003700 247.065 < 2e-16 ***
## hr21 0.616201 0.004191 147.045 < 2e-16 ***
## hr22 0.364181 0.004659 78.173 < 2e-16 ***
## hr23 0.117493 0.005225 22.488 < 2e-16 ***
## workingday 0.014665 0.001955 7.502 6.27e-14 ***
## temp 0.785292 0.011475 68.434 < 2e-16 ***
## weathersitcloudy/misty -0.075231 0.002179 -34.528 < 2e-16 ***
## weathersitlight rain/snow -0.575800 0.004058 -141.905 < 2e-16 ***
## weathersitheavy rain/snow -0.926287 0.166782 -5.554 2.79e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1052921 on 8644 degrees of freedom
## Residual deviance: 228041 on 8605 degrees of freedom
## AIC: 281159
##
## Number of Fisher Scoring iterations: 5
coef.mnth <- c(coef(mod.pois)[2:12],
-sum(coef(mod.pois)[2:12]))
plot(coef.mnth, xlab = "Month", ylab = "Coefficient",
xaxt = "n", col = "blue", pch = 19, type = "o")
axis(side = 1, at = 1:12, labels = c("J", "F", "M", "A", "M", "J", "J", "A", "S", "O", "N", "D"))
coef.hours <- c(coef(mod.pois)[13:35],
-sum(coef(mod.pois)[13:35]))
plot(coef.hours, xlab = "Hour", ylab = "Coefficient",
col = "blue", pch = 19, type = "o")
plot(predict(mod.lm2), predict(mod.pois, type = "response"))
abline(0, 1, col = 2, lwd = 3)
#### my try
mod.pois1 <- glm(
bikers ~ mnth + hr + workingday + temp + weathersit,
data = Bikeshare, family = poisson
)
summary(mod.pois1)
##
## Call:
## glm(formula = bikers ~ mnth + hr + workingday + temp + weathersit,
## family = poisson, data = Bikeshare)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.118245 0.006021 683.964 < 2e-16 ***
## mnth1 -0.670170 0.005907 -113.445 < 2e-16 ***
## mnth2 -0.444124 0.004860 -91.379 < 2e-16 ***
## mnth3 -0.293733 0.004144 -70.886 < 2e-16 ***
## mnth4 0.021523 0.003125 6.888 5.66e-12 ***
## mnth5 0.240471 0.002916 82.462 < 2e-16 ***
## mnth6 0.223235 0.003554 62.818 < 2e-16 ***
## mnth7 0.103617 0.004125 25.121 < 2e-16 ***
## mnth8 0.151171 0.003662 41.281 < 2e-16 ***
## mnth9 0.233493 0.003102 75.281 < 2e-16 ***
## mnth10 0.267573 0.002785 96.091 < 2e-16 ***
## mnth11 0.150264 0.003180 47.248 < 2e-16 ***
## hr1 -0.754386 0.007879 -95.744 < 2e-16 ***
## hr2 -1.225979 0.009953 -123.173 < 2e-16 ***
## hr3 -1.563147 0.011869 -131.702 < 2e-16 ***
## hr4 -2.198304 0.016424 -133.846 < 2e-16 ***
## hr5 -2.830484 0.022538 -125.586 < 2e-16 ***
## hr6 -1.814657 0.013464 -134.775 < 2e-16 ***
## hr7 -0.429888 0.006896 -62.341 < 2e-16 ***
## hr8 0.575181 0.004406 130.544 < 2e-16 ***
## hr9 1.076927 0.003563 302.220 < 2e-16 ***
## hr10 0.581769 0.004286 135.727 < 2e-16 ***
## hr11 0.336852 0.004720 71.372 < 2e-16 ***
## hr12 0.494121 0.004392 112.494 < 2e-16 ***
## hr13 0.679642 0.004069 167.040 < 2e-16 ***
## hr14 0.673565 0.004089 164.722 < 2e-16 ***
## hr15 0.624910 0.004178 149.570 < 2e-16 ***
## hr16 0.653763 0.004132 158.205 < 2e-16 ***
## hr17 0.874301 0.003784 231.040 < 2e-16 ***
## hr18 1.294635 0.003254 397.848 < 2e-16 ***
## hr19 1.212281 0.003321 365.084 < 2e-16 ***
## hr20 0.914022 0.003700 247.065 < 2e-16 ***
## hr21 0.616201 0.004191 147.045 < 2e-16 ***
## hr22 0.364181 0.004659 78.173 < 2e-16 ***
## hr23 0.117493 0.005225 22.488 < 2e-16 ***
## workingday 0.014665 0.001955 7.502 6.27e-14 ***
## temp 0.785292 0.011475 68.434 < 2e-16 ***
## weathersitcloudy/misty -0.075231 0.002179 -34.528 < 2e-16 ***
## weathersitlight rain/snow -0.575800 0.004058 -141.905 < 2e-16 ***
## weathersitheavy rain/snow -0.926287 0.166782 -5.554 2.79e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1052921 on 8644 degrees of freedom
## Residual deviance: 228041 on 8605 degrees of freedom
## AIC: 281159
##
## Number of Fisher Scoring iterations: 5
coef <- c(mod.pois1$coefficients[2:12], -sum(mod.pois1$coefficients[2:12]))
coef
## mnth1 mnth2 mnth3 mnth4 mnth5 mnth6
## -0.67016993 -0.44412382 -0.29373313 0.02152319 0.24047087 0.22323523
## mnth7 mnth8 mnth9 mnth10 mnth11
## 0.10361658 0.15117137 0.23349303 0.26757318 0.15026355 0.01667987
plot(coef,type="o")
contrasts(Bikeshare$mnth)=contr.sum(12)
coef<-c(mod.pois1$coefficients[2:12],-sum(mod.pois1$coefficients[2:12]))
coef
## mnth1 mnth2 mnth3 mnth4 mnth5 mnth6
## -0.67016993 -0.44412382 -0.29373313 0.02152319 0.24047087 0.22323523
## mnth7 mnth8 mnth9 mnth10 mnth11
## 0.10361658 0.15117137 0.23349303 0.26757318 0.15026355 0.01667987
plot(coef, type = "o")
library(ISLR2)
set.seed(1)
train <- sample(392, 196)
lm.fit <- lm(mpg ~ horsepower, data = Auto, subset = train)
attach(Auto)
## The following object is masked _by_ .GlobalEnv:
##
## cylinders
## The following object is masked from package:lubridate:
##
## origin
## The following object is masked from package:ggplot2:
##
## mpg
## The following objects are masked from Auto (pos = 25):
##
## acceleration, cylinders, displacement, horsepower, mpg, name,
## origin, weight, year
mean((mpg - predict(lm.fit, Auto))[-train]^2)
## [1] 23.26601
lm.fit2 <- lm(mpg ~ poly(horsepower, 2), data = Auto,
subset = train)
mean((mpg - predict(lm.fit2, Auto))[-train]^2)
## [1] 18.71646
lm.fit3 <- lm(mpg ~ poly(horsepower, 3), data = Auto,
subset = train)
mean((mpg - predict(lm.fit3, Auto))[-train]^2)
## [1] 18.79401
set.seed(2)
train <- sample(392, 196)
lm.fit <- lm(mpg ~ horsepower, subset = train)
mean((mpg - predict(lm.fit, Auto))[-train]^2)
## [1] 25.72651
lm.fit2 <- lm(mpg ~ poly(horsepower, 2), data = Auto,
subset = train)
mean((mpg - predict(lm.fit2, Auto))[-train]^2)
## [1] 20.43036
lm.fit3 <- lm(mpg ~ poly(horsepower, 3), data = Auto,
subset = train)
mean((mpg - predict(lm.fit3, Auto))[-train]^2)
## [1] 20.38533
glm.fit <- glm(mpg ~ horsepower, data = Auto)
coef(glm.fit)
## (Intercept) horsepower
## 39.9358610 -0.1578447
lm.fit <- lm(mpg ~ horsepower, data = Auto)
coef(lm.fit)
## (Intercept) horsepower
## 39.9358610 -0.1578447
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:car':
##
## logit
glm.fit <- glm(mpg ~ horsepower, data = Auto)
cv.err <- cv.glm(Auto, glm.fit)
cv.err$delta
## [1] 24.23151 24.23114
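### The two delta values are the raw leave-one-out CV estimate of the test MSE
### and a bias-corrected version of it; for LOOCV they are nearly identical.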
cv.error <- rep(0, 10)
for (i in 1:10) {
glm.fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
cv.error[i] <- cv.glm(Auto, glm.fit)$delta[1]
}
cv.error
## [1] 24.23151 19.24821 19.33498 19.42443 19.03321 18.97864 18.83305 18.96115
## [9] 19.06863 19.49093
set.seed(17)
cv.error.10 <- rep(0, 10)
for (i in 1:10) {
glm.fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
cv.error.10[i] <- cv.glm(Auto, glm.fit, K = 10)$delta[1]
}
cv.error.10
## [1] 24.27207 19.26909 19.34805 19.29496 19.03198 18.89781 19.12061 19.14666
## [9] 18.87013 20.95520
error <- c()
for (i in 1:10) {
  glm_fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
  error[i] <- cv.glm(data = Auto, glmfit = glm_fit)$delta[1]
}
error
## [1] 24.23151 19.24821 19.33498 19.42443 19.03321 18.97864 18.83305 18.96115
## [9] 19.06863 19.49093
error10 <- c()
for (i in 1:10) {
  glm_fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
  error10[i] <- cv.glm(data = Auto, glmfit = glm_fit, K = 10)$delta[1]
}
error10
## [1] 24.30935 19.28566 19.47768 19.45928 19.11895 18.96636 19.14254 19.19909
## [9] 19.14876 19.96767
### Estimating the Accuracy of a Statistic of Interest
alpha.fn <- function(data, index) {
X <- data$X[index]
Y <- data$Y[index]
(var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y))
}
alpha.fn(Portfolio, 1:100)
## [1] 0.5758321
set.seed(7)
alpha.fn(Portfolio, sample(100, 100, replace = T))
## [1] 0.5385326
boot(Portfolio, alpha.fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Portfolio, statistic = alpha.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 0.5758321 0.0007959475 0.08969074
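### Side note (illustrative sketch): boot() automates what a manual bootstrap
### loop would do -- resample the rows, recompute alpha, and take the SD.
set.seed(7)
alphas <- replicate(1000,
                    alpha.fn(Portfolio, sample(100, 100, replace = TRUE)))
sd(alphas)   # should be close to the std. error reported by boot() above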
### Estimating the Accuracy of a Linear Regression Model
boot.fn <- function(data, index)
coef(lm(mpg ~ horsepower, data = data, subset = index))
boot.fn(Auto, 1:392)
## (Intercept) horsepower
## 39.9358610 -0.1578447
set.seed(1)
boot.fn(Auto, sample(392, 392, replace = T))
## (Intercept) horsepower
## 40.3404517 -0.1634868
boot.fn(Auto, sample(392, 392, replace = T))
## (Intercept) horsepower
## 40.1186906 -0.1577063
boot(Auto, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Auto, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 39.9358610 0.0544513229 0.841289790
## t2* -0.1578447 -0.0006170901 0.007343073
summary(lm(mpg ~ horsepower, data = Auto))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.9358610 0.717498656 55.65984 1.220362e-187
## horsepower -0.1578447 0.006445501 -24.48914 7.031989e-81
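### The bootstrap standard errors differ slightly from summary()'s
### formula-based ones because the formulas assume the linear model is correct
### and the error variance is constant, while the bootstrap does not.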
boot.fn <- function(data, index)
coef(
lm(mpg ~ horsepower + I(horsepower^2),
data = data, subset = index)
)
set.seed(1)
boot(Auto, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Auto, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 56.900099702 3.511640e-02 2.0300222526
## t2* -0.466189630 -7.080834e-04 0.0324241984
## t3* 0.001230536 2.840324e-06 0.0001172164
summary(
lm(mpg ~ horsepower + I(horsepower^2), data = Auto)
)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.900099702 1.8004268063 31.60367 1.740911e-109
## horsepower -0.466189630 0.0311246171 -14.97816 2.289429e-40
## I(horsepower^2) 0.001230536 0.0001220759 10.08009 2.196340e-21
boot_fun <- function(data, index) {
  coefficients(lm(mpg ~ horsepower, data = data, subset = index))
}
boot(Auto, statistic = boot_fun, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Auto, statistic = boot_fun, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 39.9358610 0.0250819701 0.855907634
## t2* -0.1578447 -0.0002002252 0.007393336
coefficients(lm(mpg ~ horsepower, data = Auto))
## (Intercept) horsepower
## 39.9358610 -0.1578447
library(ISLR2)
#View(Hitters)
names(Hitters)
## [1] "AtBat" "Hits" "HmRun" "Runs" "RBI" "Walks"
## [7] "Years" "CAtBat" "CHits" "CHmRun" "CRuns" "CRBI"
## [13] "CWalks" "League" "Division" "PutOuts" "Assists" "Errors"
## [19] "Salary" "NewLeague"
dim(Hitters)
## [1] 322 20
sum(is.na(Hitters$Salary))
## [1] 59
Hitters <- na.omit(Hitters)
dim(Hitters)
## [1] 263 20
sum(is.na(Hitters))
## [1] 0
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.3
regfit.full <- regsubsets(Salary ~ ., Hitters)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " "
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
regfit.full <- regsubsets(Salary ~ ., data = Hitters,
nvmax = 19)
reg.summary <- summary(regfit.full)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
reg.summary$rsq
## [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146 0.5141227
## [8] 0.5285569 0.5346124 0.5404950 0.5426153 0.5436302 0.5444570 0.5452164
## [15] 0.5454692 0.5457656 0.5459518 0.5460945 0.5461159
par(mfrow = c(2, 2))
plot(reg.summary$rss, xlab = "Number of Variables",
ylab = "RSS", type = "l")
plot(reg.summary$adjr2, xlab = "Number of Variables",
ylab = "Adjusted RSq", type = "l")
plot(reg.summary$cp, type = "l")
plot(reg.summary$bic, type = "l")
par(mfrow=c(2,2))
which.max(reg.summary$adjr2)
## [1] 11
plot(reg.summary$adjr2, xlab = "Number of Variables",
ylab = "Adjusted RSq", type = "l")
points(11, reg.summary$adjr2[11], col = "red", cex = 2,
pch = 20)
plot(reg.summary$cp, xlab = "Number of Variables",
ylab = "Cp", type = "l")
which.min(reg.summary$cp)
## [1] 10
points(10, reg.summary$cp[10], col = "red", cex = 2,
pch = 20)
which.min(reg.summary$bic)
## [1] 6
plot(reg.summary$bic, xlab = "Number of Variables",
ylab = "BIC", type = "l")
points(6, reg.summary$bic[6], col = "red", cex = 2,
pch = 20)
plot(regfit.full, scale = "r2")
plot(regfit.full, scale = "adjr2")
plot(regfit.full, scale = "Cp")
plot(regfit.full, scale = "bic")
coef(regfit.full, 6)
## (Intercept) AtBat Hits Walks CRBI DivisionW
## 91.5117981 -1.8685892 7.6043976 3.6976468 0.6430169 -122.9515338
## PutOuts
## 0.2643076
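### Side note (illustrative sketch): regsubsets() has no predict() method, so
### predictions must be built by hand from a model matrix and the chosen
### coefficients.
X <- model.matrix(Salary ~ ., data = Hitters)
coef6 <- coef(regfit.full, 6)
pred6 <- X[, names(coef6)] %*% coef6   # fitted values for the 6-variable model
head(pred6)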
regfit.fwd <- regsubsets(Salary ~ ., data = Hitters,
nvmax = 19, method = "forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
regfit.bwd <- regsubsets(Salary ~ ., data = Hitters,
nvmax = 19, method = "backward")
summary(regfit.bwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "backward")
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: backward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*" " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " "
## 4 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " "*" " "
## 5 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " " " "*" " " " " " "
## 5 ( 1 ) " " " " " " "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
### The best seven-variable models chosen by exhaustive, forward and backward selection all differ:
coef(regfit.full, 7)
## (Intercept) Hits Walks CAtBat CHits CHmRun
## 79.4509472 1.2833513 3.2274264 -0.3752350 1.4957073 1.4420538
## DivisionW PutOuts
## -129.9866432 0.2366813
coef(regfit.fwd, 7)
## (Intercept) AtBat Hits Walks CRBI CWalks
## 109.7873062 -1.9588851 7.4498772 4.9131401 0.8537622 -0.3053070
## DivisionW PutOuts
## -127.1223928 0.2533404
coef(regfit.bwd, 7)
## (Intercept) AtBat Hits Walks CRuns CWalks
## 105.6487488 -1.9762838 6.7574914 6.0558691 1.1293095 -0.7163346
## DivisionW PutOuts
## -116.1692169 0.3028847
set.seed(1)
train <- sample(c(TRUE, FALSE), nrow(Hitters),
replace = TRUE)
test <- (!train)
# Now, we apply regsubsets() to the training set in order to perform best subset selection.
regfit.best <- regsubsets(Salary ~ .,
data = Hitters[train, ], nvmax = 19)
## model.matrix creates a design (or model) matrix, e.g., by expanding factors to a set of dummy variables (depending on the contrasts) and expanding interactions similarly.
test.mat <- model.matrix(Salary ~ ., data = Hitters[test, ])
val.errors <- rep(NA, 19)
for (i in 1:19) {
coefi <- coef(regfit.best, id = i)
pred <- test.mat[, names(coefi)] %*% coefi
val.errors[i] <- mean((Hitters$Salary[test] - pred)^2)
}
val.errors
## [1] 164377.3 144405.5 152175.7 145198.4 137902.1 139175.7 126849.0 136191.4
## [9] 132889.6 135434.9 136963.3 140694.9 140690.9 141951.2 141508.2 142164.4
## [17] 141767.4 142339.6 142238.2
which.min(val.errors)
## [1] 7
coef(regfit.best, 7)
## (Intercept) AtBat Hits Walks CRuns CWalks
## 67.1085369 -2.1462987 7.0149547 8.0716640 1.2425113 -0.8337844
## DivisionW PutOuts
## -118.4364998 0.2526925
predict.regsubsets <- function(object, newdata, id, ...) {
# recover the formula used in the original call to regsubsets()
form <- as.formula(object$call[[2]])
# build the design matrix for the new data
mat <- model.matrix(form, newdata)
# coefficients of the best model with 'id' predictors
coefi <- coef(object, id = id)
xvars <- names(coefi)
# multiply the matching columns by the coefficients
mat[, xvars] %*% coefi
}
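# regsubsets() ships no predict() method of its own, which is why we
# write one; predict() now dispatches to it for any "regsubsets"
# object. A quick illustrative call (a sketch; output omitted):
head(predict(regfit.best, Hitters[test, ], id = 7))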
### library(leaps)
regfit.best <- regsubsets(Salary ~ ., data = Hitters,
nvmax = 19)
coef(regfit.best, 7)
## (Intercept) Hits Walks CAtBat CHits CHmRun
## 79.4509472 1.2833513 3.2274264 -0.3752350 1.4957073 1.4420538
## DivisionW PutOuts
## -129.9866432 0.2366813
k <- 10
n <- nrow(Hitters)
set.seed(1)
folds <- sample(rep(1:k, length = n))
cv.errors <- matrix(NA, k, 19,
dimnames = list(NULL, paste(1:19)))
### predict() in the loop below dispatches to the predict.regsubsets() method defined above:
for (j in 1:k) {
best.fit <- regsubsets(Salary ~ .,
data = Hitters[folds != j, ],
nvmax = 19)
for (i in 1:19) {
pred <- predict(best.fit, Hitters[folds == j, ], id = i)
cv.errors[j, i] <-
mean((Hitters$Salary[folds == j] - pred)^2)
}
}
###
mean.cv.errors <- apply(cv.errors, 2, mean)
mean.cv.errors
## 1 2 3 4 5 6 7 8
## 143439.8 126817.0 134214.2 131782.9 130765.6 120382.9 121443.1 114363.7
## 9 10 11 12 13 14 15 16
## 115163.1 109366.0 112738.5 113616.5 115557.6 115853.3 115630.6 116050.0
## 17 18 19
## 116117.0 116419.3 116299.1
par(mfrow = c(1, 1))
plot(mean.cv.errors, type = "b")
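# The CV errors above are minimized at the 10-variable model
# (109366.0); a programmatic check (returns 10):
which.min(mean.cv.errors)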
reg.best <- regsubsets(Salary ~ ., data = Hitters,
nvmax = 19)
coef(reg.best, 10)
## (Intercept) AtBat Hits Walks CAtBat CRuns
## 162.5354420 -2.1686501 6.9180175 5.7732246 -0.1300798 1.4082490
## CRBI CWalks DivisionW PutOuts Assists
## 0.7743122 -0.8308264 -112.3800575 0.2973726 0.2831680
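# glmnet() requires a numeric predictor matrix rather than a formula;
# model.matrix() builds one, expanding factors such as League into
# dummy variables, and [, -1] drops the intercept column.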
x <- model.matrix(Salary ~ ., Hitters)[, -1]
y <- Hitters$Salary
### Ridge Regression
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.2
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
grid <- 10^seq(10, -2, length = 100)
ridge.mod <- glmnet(x, y, alpha = 0, lambda = grid)
dim(coef(ridge.mod))
## [1] 20 100
### With a large lambda (the 50th grid value, 11497.57), the coefficients are heavily shrunk:
ridge.mod$lambda[50]
## [1] 11497.57
coef(ridge.mod)[, 50]
## (Intercept) AtBat Hits HmRun Runs
## 407.356050200 0.036957182 0.138180344 0.524629976 0.230701523
## RBI Walks Years CAtBat CHits
## 0.239841459 0.289618741 1.107702929 0.003131815 0.011653637
## CHmRun CRuns CRBI CWalks LeagueN
## 0.087545670 0.023379882 0.024138320 0.025015421 0.085028114
## DivisionW PutOuts Assists Errors NewLeagueN
## -6.215440973 0.016482577 0.002612988 -0.020502690 0.301433531
sqrt(sum(coef(ridge.mod)[-1, 50]^2))
## [1] 6.360612
### With a smaller lambda (705.48), the coefficients and their l2 norm are much larger:
ridge.mod$lambda[60]
## [1] 705.4802
coef(ridge.mod)[, 60]
## (Intercept) AtBat Hits HmRun Runs RBI
## 54.32519950 0.11211115 0.65622409 1.17980910 0.93769713 0.84718546
## Walks Years CAtBat CHits CHmRun CRuns
## 1.31987948 2.59640425 0.01083413 0.04674557 0.33777318 0.09355528
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## 0.09780402 0.07189612 13.68370191 -54.65877750 0.11852289 0.01606037
## Errors NewLeagueN
## -0.70358655 8.61181213
sqrt(sum(coef(ridge.mod)[-1, 60]^2))
## [1] 57.11001
###
predict(ridge.mod, s = 50, type = "coefficients")[1:20, ]
## (Intercept) AtBat Hits HmRun Runs
## 4.876610e+01 -3.580999e-01 1.969359e+00 -1.278248e+00 1.145892e+00
## RBI Walks Years CAtBat CHits
## 8.038292e-01 2.716186e+00 -6.218319e+00 5.447837e-03 1.064895e-01
## CHmRun CRuns CRBI CWalks LeagueN
## 6.244860e-01 2.214985e-01 2.186914e-01 -1.500245e-01 4.592589e+01
## DivisionW PutOuts Assists Errors NewLeagueN
## -1.182011e+02 2.502322e-01 1.215665e-01 -3.278600e+00 -9.496680e+00
set.seed(1)
train <- sample(1:nrow(x), nrow(x) / 2)
test <- (-train)
y.test <- y[test]
ridge.mod <- glmnet(x[train, ], y[train], alpha = 0,
lambda = grid, thresh = 1e-12)
ridge.pred <- predict(ridge.mod, s = 4, newx = x[test, ])
mean((ridge.pred - y.test)^2)
## [1] 142199.2
mean((mean(y[train]) - y.test)^2)
## [1] 224669.9
### A huge lambda shrinks the fit to essentially the null (intercept-only) model:
ridge.pred <- predict(ridge.mod, s = 1e10, newx = x[test, ])
mean((ridge.pred - y.test)^2)
## [1] 224669.8
### lambda = 0 reproduces least squares; exact = T refits rather than interpolating on the grid:
ridge.pred <- predict(ridge.mod, s = 0, newx = x[test, ],
exact = T, x = x[train, ], y = y[train])
mean((ridge.pred - y.test)^2)
## [1] 168588.6
lm(y ~ x, subset = train)
##
## Call:
## lm(formula = y ~ x, subset = train)
##
## Coefficients:
## (Intercept) xAtBat xHits xHmRun xRuns xRBI
## 274.0145 -0.3521 -1.6377 5.8145 1.5424 1.1243
## xWalks xYears xCAtBat xCHits xCHmRun xCRuns
## 3.7287 -16.3773 -0.6412 3.1632 3.4008 -0.9739
## xCRBI xCWalks xLeagueN xDivisionW xPutOuts xAssists
## -0.6005 0.3379 119.1486 -144.0831 0.1976 0.6804
## xErrors xNewLeagueN
## -4.7128 -71.0951
predict(ridge.mod, s = 0, exact = T, type = "coefficients",
x = x[train, ], y = y[train])[1:20, ]
## (Intercept) AtBat Hits HmRun Runs RBI
## 274.0200994 -0.3521900 -1.6371383 5.8146692 1.5423361 1.1241837
## Walks Years CAtBat CHits CHmRun CRuns
## 3.7288406 -16.3795195 -0.6411235 3.1629444 3.4005281 -0.9739405
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## -0.6003976 0.3378422 119.1434637 -144.0853061 0.1976300 0.6804200
## Errors NewLeagueN
## -4.7127879 -71.0898914
###
set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 0)
plot(cv.out)
bestlam <- cv.out$lambda.min
bestlam
## [1] 326.0828
###
ridge.pred <- predict(ridge.mod, s = bestlam,
newx = x[test, ])
mean((ridge.pred - y.test)^2)
## [1] 139856.6
out <- glmnet(x, y, alpha = 0)
predict(out, type = "coefficients", s = bestlam)[1:20, ]
## (Intercept) AtBat Hits HmRun Runs RBI
## 15.44383120 0.07715547 0.85911582 0.60103106 1.06369007 0.87936105
## Walks Years CAtBat CHits CHmRun CRuns
## 1.62444617 1.35254778 0.01134999 0.05746654 0.40680157 0.11456224
## CRBI CWalks LeagueN DivisionW PutOuts Assists
## 0.12116504 0.05299202 22.09143197 -79.04032656 0.16619903 0.02941950
## Errors NewLeagueN
## -1.36092945 9.12487765
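# Ridge shrinks coefficients toward zero but never to exactly zero,
# so all 19 predictors are retained. A quick check (a sketch):
sum(predict(out, type = "coefficients", s = bestlam)[-1, ] == 0)
## [1] 0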
lasso.mod <- glmnet(x[train, ], y[train], alpha = 1,
lambda = grid)
plot(lasso.mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1)
plot(cv.out)
bestlam <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod, s = bestlam,
newx = x[test, ])
mean((lasso.pred - y.test)^2)
## [1] 143673.6
###
out <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = "coefficients",
s = bestlam)[1:20, ]
lasso.coef
## (Intercept) AtBat Hits HmRun Runs
## 1.27479059 -0.05497143 2.18034583 0.00000000 0.00000000
## RBI Walks Years CAtBat CHits
## 0.00000000 2.29192406 -0.33806109 0.00000000 0.00000000
## CHmRun CRuns CRBI CWalks LeagueN
## 0.02825013 0.21628385 0.41712537 0.00000000 20.28615023
## DivisionW PutOuts Assists Errors NewLeagueN
## -116.16755870 0.23752385 0.00000000 -0.85629148 0.00000000
lasso.coef[lasso.coef != 0]
## (Intercept) AtBat Hits Walks Years
## 1.27479059 -0.05497143 2.18034583 2.29192406 -0.33806109
## CHmRun CRuns CRBI LeagueN DivisionW
## 0.02825013 0.21628385 0.41712537 20.28615023 -116.16755870
## PutOuts Errors
## 0.23752385 -0.85629148
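# The lasso performs variable selection: 8 of the 19 coefficient
# estimates are exactly zero, leaving a model with 11 predictors:
sum(lasso.coef != 0) - 1
## [1] 11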
library(pls)
## Warning: package 'pls' was built under R version 4.4.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(2)
pcr.fit <- pcr(Salary ~ ., data = Hitters, scale = TRUE,
validation = "CV")
summary(pcr.fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 19
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 452 351.9 353.2 355.0 352.8 348.4 343.6
## adjCV 452 351.6 352.7 354.4 352.1 347.6 342.7
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 345.5 347.7 349.6 351.4 352.1 353.5 358.2
## adjCV 344.7 346.7 348.5 350.1 350.7 352.0 356.5
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 349.7 349.4 339.9 341.6 339.2 339.6
## adjCV 348.0 347.7 338.2 339.7 337.2 337.6
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 38.31 60.16 70.84 79.03 84.29 88.63 92.26 94.96
## Salary 40.63 41.58 42.17 43.22 44.90 46.48 46.69 46.75
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 96.28 97.26 97.98 98.65 99.15 99.47 99.75
## Salary 46.86 47.76 47.82 47.85 48.10 50.40 50.55
## 16 comps 17 comps 18 comps 19 comps
## X 99.89 99.97 99.99 100.00
## Salary 53.01 53.85 54.61 54.61
validationplot(pcr.fit, val.type = "MSEP")
set.seed(1)
pcr.fit <- pcr(Salary ~ ., data = Hitters, subset = train,
scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")
pcr.pred <- predict(pcr.fit, x[test, ], ncomp = 5)
mean((pcr.pred - y.test)^2)
## [1] 142811.8
###
pcr.fit <- pcr(y ~ x, scale = TRUE, ncomp = 5)
summary(pcr.fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 5
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps
## X 38.31 60.16 70.84 79.03 84.29
## y 40.63 41.58 42.17 43.22 44.90
set.seed(1)
pls.fit <- plsr(Salary ~ ., data = Hitters, subset = train,
scale = TRUE, validation = "CV")
summary(pls.fit)
## Data: X dimension: 131 19
## Y dimension: 131 1
## Fit method: kernelpls
## Number of components considered: 19
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 428.3 325.5 329.9 328.8 339.0 338.9 340.1
## adjCV 428.3 325.0 328.2 327.2 336.6 336.1 336.6
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 339.0 347.1 346.4 343.4 341.5 345.4 356.4
## adjCV 336.2 343.4 342.8 340.2 338.3 341.8 351.1
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 348.4 349.1 350.0 344.2 344.5 345.0
## adjCV 344.2 345.0 345.9 340.4 340.6 341.1
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 39.13 48.80 60.09 75.07 78.58 81.12 88.21 90.71
## Salary 46.36 50.72 52.23 53.03 54.07 54.77 55.05 55.66
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 93.17 96.05 97.08 97.61 97.97 98.70 99.12
## Salary 55.95 56.12 56.47 56.68 57.37 57.76 58.08
## 16 comps 17 comps 18 comps 19 comps
## X 99.61 99.70 99.95 100.00
## Salary 58.17 58.49 58.56 58.62
validationplot(pls.fit, val.type = "MSEP")
pls.pred <- predict(pls.fit, x[test, ], ncomp = 1)
mean((pls.pred - y.test)^2)
## [1] 151995.3
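# The PLS test MSE (151995.3) is comparable to, though slightly higher
# than, those obtained above with ridge regression (139856.6), the
# lasso (143673.6) and PCR (142811.8).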
pls.fit <- plsr(Salary ~ ., data = Hitters, scale = TRUE,
ncomp = 1)
summary(pls.fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: kernelpls
## Number of components considered: 1
## TRAINING: % variance explained
## 1 comps
## X 38.08
## Salary 43.05
library(ISLR2)
attach(Wage)
## The following object is masked from Auto (pos = 8):
##
## year
## The following object is masked from Boston:
##
## age
## The following object is masked from Auto (pos = 31):
##
## year
## Polynomial Regression and Step Functions
fit <- lm(wage ~ poly(age, 4), data = Wage)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.70361 0.7287409 153.283015 0.000000e+00
## poly(age, 4)1 447.06785 39.9147851 11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3 125.52169 39.9147851 3.144742 1.678622e-03
## poly(age, 4)4 -77.91118 39.9147851 -1.951938 5.103865e-02
fit2 <- lm(wage ~ poly(age, 4, raw = T), data = Wage)
coef(summary(fit2))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
## poly(age, 4, raw = T)1 2.124552e+01 5.886748e+00 3.609042 0.0003123618
## poly(age, 4, raw = T)2 -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
## poly(age, 4, raw = T)3 6.810688e-03 3.065931e-03 2.221409 0.0263977518
## poly(age, 4, raw = T)4 -3.203830e-05 1.641359e-05 -1.951938 0.0510386498
### An equivalent fit, protecting the powers with the wrapper I():
fit2a <- lm(wage ~ age + I(age^2) + I(age^3) + I(age^4),
data = Wage)
coef(fit2a)
## (Intercept) age I(age^2) I(age^3) I(age^4)
## -1.841542e+02 2.124552e+01 -5.638593e-01 6.810688e-03 -3.203830e-05
### Another equivalent fit, building the polynomial basis with cbind():
fit2b <- lm(wage ~ cbind(age, age^2, age^3, age^4),
data = Wage)
coef(fit2b)
## (Intercept) cbind(age, age^2, age^3, age^4)age
## -1.841542e+02 2.124552e+01
## cbind(age, age^2, age^3, age^4) cbind(age, age^2, age^3, age^4)
## -5.638593e-01 6.810688e-03
## cbind(age, age^2, age^3, age^4)
## -3.203830e-05
###
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims[2])
preds <- predict(fit, newdata = list(age = age.grid),
se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit,
preds$fit - 2 * preds$se.fit)
###
par(mfrow = c(1, 2), mar = c(4.5, 4.5, 1, 1),
oma = c(0, 0, 4, 0))
plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey")
title("Degree-4 Polynomial", outer = T)
lines(age.grid, preds$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)
###
preds2 <- predict(fit2, newdata = list(age = age.grid),
se = TRUE)
max(abs(preds$fit - preds2$fit))
## [1] 6.842527e-11
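# The orthogonal and raw bases span the same column space, so all four
# degree-4 fits give essentially identical fitted values. A further
# check (a sketch; the result is again near zero):
max(abs(fitted(fit) - fitted(fit2a)))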
fit.1 <- lm(wage ~ age, data = Wage)
fit.2 <- lm(wage ~ poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ poly(age, 3), data = Wage)
fit.4 <- lm(wage ~ poly(age, 4), data = Wage)
fit.5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef(summary(fit.5))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.70361 0.7287647 153.2780243 0.000000e+00
## poly(age, 5)1 447.06785 39.9160847 11.2001930 1.491111e-28
## poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
## poly(age, 5)3 125.52169 39.9160847 3.1446392 1.679213e-03
## poly(age, 5)4 -77.91118 39.9160847 -1.9518743 5.104623e-02
## poly(age, 5)5 -35.81289 39.9160847 -0.8972045 3.696820e-01
### The t-statistics in fit.5 square to the anova() F-statistics; e.g. for the quadratic term:
(-11.983)^2
## [1] 143.5923
fit.1 <- lm(wage ~ education + age, data = Wage)
fit.2 <- lm(wage ~ education + poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ education + poly(age, 3), data = Wage)
anova(fit.1, fit.2, fit.3)
## Analysis of Variance Table
##
## Model 1: wage ~ education + age
## Model 2: wage ~ education + poly(age, 2)
## Model 3: wage ~ education + poly(age, 3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2994 3867992
## 2 2993 3725395 1 142597 114.6969 <2e-16 ***
## 3 2992 3719809 1 5587 4.4936 0.0341 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit <- glm(I(wage > 250) ~ poly(age, 4), data = Wage,
family = binomial)
preds <- predict(fit, newdata = list(age = age.grid), se = T)
pfit <- exp(preds$fit) / (1 + exp(preds$fit))
se.bands.logit <- cbind(preds$fit + 2 * preds$se.fit,
preds$fit - 2 * preds$se.fit)
se.bands <- exp(se.bands.logit) / (1 + exp(se.bands.logit))
### type = "response" returns probabilities directly, but its standard-error bands can fall outside [0, 1], hence the logit-scale calculation above:
preds <- predict(fit, newdata = list(age = age.grid),
type = "response", se = T)
plot(age, I(wage > 250), xlim = agelims, type = "n",
ylim = c(0, .2))
points(jitter(age), I((wage > 250) / 5), cex = .5, pch = "|", col = "darkgrey")
lines(age.grid, pfit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)
###
table(cut(age, 4))
##
## (17.9,33.5] (33.5,49] (49,64.5] (64.5,80.1]
## 750 1399 779 72
fit <- lm(wage ~ cut(age, 4), data = Wage)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 94.158392 1.476069 63.789970 0.000000e+00
## cut(age, 4)(33.5,49] 24.053491 1.829431 13.148074 1.982315e-38
## cut(age, 4)(49,64.5] 23.664559 2.067958 11.443444 1.040750e-29
## cut(age, 4)(64.5,80.1] 7.640592 4.987424 1.531972 1.256350e-01
library(splines)
fit <- lm(wage ~ bs(age, knots = c(25, 40, 60)), data = Wage)
pred <- predict(fit, newdata = list(age = age.grid), se = T)
plot(age, wage, col = "gray")
lines(age.grid, pred$fit, lwd = 2)
lines(age.grid, pred$fit + 2 * pred$se, lty = "dashed")
lines(age.grid, pred$fit - 2 * pred$se, lty = "dashed")
dim(bs(age, knots = c(25, 40, 60)))
## [1] 3000 6
dim(bs(age, df = 6))
## [1] 3000 6
attr(bs(age, df = 6), "knots")
## [1] 33.75 42.00 51.00
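# With df = 6, bs() places the three knots automatically at the 25th,
# 50th and 75th percentiles of age (33.75, 42 and 51 above).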
### A natural spline via ns(), with four degrees of freedom:
fit2 <- lm(wage ~ ns(age, df = 4), data = Wage)
pred2 <- predict(fit2, newdata = list(age = age.grid),
se = T)
plot(age, wage, col = "gray")
lines(age.grid, pred2$fit, col = "red", lwd = 2)
###
plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey")
title("Smoothing Spline")
fit <- smooth.spline(age, wage, df = 16)
fit2 <- smooth.spline(age, wage, cv = TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with
## non-unique 'x' values seems doubtful
fit$df
## [1] 16.00237
fit2$df
## [1] 6.794596
lines(fit, col = "red", lwd = 2)
lines(fit2, col = "blue", lwd = 2)
legend("topright", legend = c("16 DF", "6.8 DF"),
col = c("red", "blue"), lty = 1, lwd = 2, cex = .8)
###
plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey")
title("Local Regression")
fit <- loess(wage ~ age, span = .2, data = Wage)
fit2 <- loess(wage ~ age, span = .5, data = Wage)
lines(age.grid, predict(fit, data.frame(age = age.grid)),
col = "red", lwd = 2)
lines(age.grid, predict(fit2, data.frame(age = age.grid)),
col = "blue", lwd = 2)
legend("topright", legend = c("Span = 0.2", "Span = 0.5"),
col = c("red", "blue"), lty = 1, lwd = 2, cex = .8)
gam1 <- lm(wage ~ ns(year, 4) + ns(age, 5) + education,
data = Wage)
###
library(gam)
## Warning: package 'gam' was built under R version 4.4.3
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.4.2
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.22-6
gam.m3 <- gam(wage ~ s(year, 4) + s(age, 5) + education,
data = Wage)
par(mfrow = c(1, 3))
plot(gam.m3, se = TRUE, col = "blue")
plot.Gam(gam1, se = TRUE, col = "red")
gam.m1 <- gam(wage ~ s(age, 5) + education, data = Wage)
gam.m2 <- gam(wage ~ year + s(age, 5) + education,
data = Wage)
anova(gam.m1, gam.m2, gam.m3, test = "F")
## Analysis of Deviance Table
##
## Model 1: wage ~ s(age, 5) + education
## Model 2: wage ~ year + s(age, 5) + education
## Model 3: wage ~ s(year, 4) + s(age, 5) + education
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 2990 3711731
## 2 2989 3693842 1 17889.2 14.4771 0.0001447 ***
## 3 2986 3689770 3 4071.1 1.0982 0.3485661
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gam.m3)
##
## Call: gam(formula = wage ~ s(year, 4) + s(age, 5) + education, data = Wage)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -119.43 -19.70 -3.33 14.17 213.48
##
## (Dispersion Parameter for gaussian family taken to be 1235.69)
##
## Null Deviance: 5222086 on 2999 degrees of freedom
## Residual Deviance: 3689770 on 2986 degrees of freedom
## AIC: 29887.75
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(year, 4) 1 27162 27162 21.981 2.877e-06 ***
## s(age, 5) 1 195338 195338 158.081 < 2.2e-16 ***
## education 4 1069726 267432 216.423 < 2.2e-16 ***
## Residuals 2986 3689770 1236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(year, 4) 3 1.086 0.3537
## s(age, 5) 4 32.380 <2e-16 ***
## education
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
preds <- predict(gam.m2, newdata = Wage)
###
gam.lo <- gam(
wage ~ s(year, df = 4) + lo(age, span = 0.7) + education,
data = Wage
)
plot(gam.lo, se = TRUE, col = "green")
# install.packages("interp")
library(interp)
## Warning: package 'interp' was built under R version 4.4.3
##
## Attaching package: 'interp'
## The following object is masked from 'package:MASS':
##
## area
library(gam)
gam.lo.i <- gam(wage ~ lo(year, age, span = 0.5) + education,
data = Wage)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## liv too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : lv
## too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## liv too small. (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : lv
## too small. (Discovered by lowesd)
library(akima)
## Warning: package 'akima' was built under R version 4.4.3
##
## Attaching package: 'akima'
## The following objects are masked from 'package:interp':
##
## aspline, bicubic, bicubic.grid, bilinear, bilinear.grid,
## franke.data, franke.fn, interp, interp2xyz, interpp
plot(gam.lo.i)
###
gam.lr <- gam(
I(wage > 250) ~ year + s(age, df = 5) + education,
family = binomial, data = Wage
)
par(mfrow = c(1, 3))
plot(gam.lr, se = T, col = "green")
table(education, I(wage > 250))
##
## education FALSE TRUE
## 1. < HS Grad 268 0
## 2. HS Grad 966 5
## 3. Some College 643 7
## 4. College Grad 663 22
## 5. Advanced Degree 381 45
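# The table shows no high earners (wage > 250) in the "< HS Grad"
# category, so we refit the logistic GAM excluding that group: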
gam.lr.s <- gam(
I(wage > 250) ~ year + s(age, df = 5) + education,
family = binomial, data = Wage,
subset = (education != "1. < HS Grad")
)
plot(gam.lr.s, se = T, col = "green")
library(tree)
## Warning: package 'tree' was built under R version 4.4.3
library(ISLR2)
attach(Carseats)
## The following objects are masked from Carseats (pos = 32):
##
## Advertising, Age, CompPrice, Education, Income, Population, Price,
## Sales, ShelveLoc, Urban, US
High <- factor(ifelse(Sales <= 8, "No", "Yes"))
Carseats <- data.frame(Carseats, High)
tree.carseats <- tree(High ~ . - Sales, Carseats)
tree.carseats
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 400 541.500 No ( 0.59000 0.41000 )
## 2) ShelveLoc: Bad,Medium 315 390.600 No ( 0.68889 0.31111 )
## 4) Price < 92.5 46 56.530 Yes ( 0.30435 0.69565 )
## 8) Income < 57 10 12.220 No ( 0.70000 0.30000 )
## 16) CompPrice < 110.5 5 0.000 No ( 1.00000 0.00000 ) *
## 17) CompPrice > 110.5 5 6.730 Yes ( 0.40000 0.60000 ) *
## 9) Income > 57 36 35.470 Yes ( 0.19444 0.80556 )
## 18) Population < 207.5 16 21.170 Yes ( 0.37500 0.62500 ) *
## 19) Population > 207.5 20 7.941 Yes ( 0.05000 0.95000 ) *
## 5) Price > 92.5 269 299.800 No ( 0.75465 0.24535 )
## 10) Advertising < 13.5 224 213.200 No ( 0.81696 0.18304 )
## 20) CompPrice < 124.5 96 44.890 No ( 0.93750 0.06250 )
## 40) Price < 106.5 38 33.150 No ( 0.84211 0.15789 )
## 80) Population < 177 12 16.300 No ( 0.58333 0.41667 )
## 160) Income < 60.5 6 0.000 No ( 1.00000 0.00000 ) *
## 161) Income > 60.5 6 5.407 Yes ( 0.16667 0.83333 ) *
## 81) Population > 177 26 8.477 No ( 0.96154 0.03846 ) *
## 41) Price > 106.5 58 0.000 No ( 1.00000 0.00000 ) *
## 21) CompPrice > 124.5 128 150.200 No ( 0.72656 0.27344 )
## 42) Price < 122.5 51 70.680 Yes ( 0.49020 0.50980 )
## 84) ShelveLoc: Bad 11 6.702 No ( 0.90909 0.09091 ) *
## 85) ShelveLoc: Medium 40 52.930 Yes ( 0.37500 0.62500 )
## 170) Price < 109.5 16 7.481 Yes ( 0.06250 0.93750 ) *
## 171) Price > 109.5 24 32.600 No ( 0.58333 0.41667 )
## 342) Age < 49.5 13 16.050 Yes ( 0.30769 0.69231 ) *
## 343) Age > 49.5 11 6.702 No ( 0.90909 0.09091 ) *
## 43) Price > 122.5 77 55.540 No ( 0.88312 0.11688 )
## 86) CompPrice < 147.5 58 17.400 No ( 0.96552 0.03448 ) *
## 87) CompPrice > 147.5 19 25.010 No ( 0.63158 0.36842 )
## 174) Price < 147 12 16.300 Yes ( 0.41667 0.58333 )
## 348) CompPrice < 152.5 7 5.742 Yes ( 0.14286 0.85714 ) *
## 349) CompPrice > 152.5 5 5.004 No ( 0.80000 0.20000 ) *
## 175) Price > 147 7 0.000 No ( 1.00000 0.00000 ) *
## 11) Advertising > 13.5 45 61.830 Yes ( 0.44444 0.55556 )
## 22) Age < 54.5 25 25.020 Yes ( 0.20000 0.80000 )
## 44) CompPrice < 130.5 14 18.250 Yes ( 0.35714 0.64286 )
## 88) Income < 100 9 12.370 No ( 0.55556 0.44444 ) *
## 89) Income > 100 5 0.000 Yes ( 0.00000 1.00000 ) *
## 45) CompPrice > 130.5 11 0.000 Yes ( 0.00000 1.00000 ) *
## 23) Age > 54.5 20 22.490 No ( 0.75000 0.25000 )
## 46) CompPrice < 122.5 10 0.000 No ( 1.00000 0.00000 ) *
## 47) CompPrice > 122.5 10 13.860 No ( 0.50000 0.50000 )
## 94) Price < 125 5 0.000 Yes ( 0.00000 1.00000 ) *
## 95) Price > 125 5 0.000 No ( 1.00000 0.00000 ) *
## 3) ShelveLoc: Good 85 90.330 Yes ( 0.22353 0.77647 )
## 6) Price < 135 68 49.260 Yes ( 0.11765 0.88235 )
## 12) US: No 17 22.070 Yes ( 0.35294 0.64706 )
## 24) Price < 109 8 0.000 Yes ( 0.00000 1.00000 ) *
## 25) Price > 109 9 11.460 No ( 0.66667 0.33333 ) *
## 13) US: Yes 51 16.880 Yes ( 0.03922 0.96078 ) *
## 7) Price > 135 17 22.070 No ( 0.64706 0.35294 )
## 14) Income < 46 6 0.000 No ( 1.00000 0.00000 ) *
## 15) Income > 46 11 15.160 Yes ( 0.45455 0.54545 ) *
summary(tree.carseats)
##
## Classification tree:
## tree(formula = High ~ . - Sales, data = Carseats)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Income" "CompPrice" "Population"
## [6] "Advertising" "Age" "US"
## Number of terminal nodes: 27
## Residual mean deviance: 0.4575 = 170.7 / 373
## Misclassification error rate: 0.09 = 36 / 400
plot(tree.carseats)
text(tree.carseats, pretty = 0)
set.seed(2)
train <- sample(1:nrow(Carseats), 200)
Carseats.test <- Carseats[-train, ]
High.test <- High[-train]
tree.carseats <- tree(High ~ . - Sales, Carseats,
subset = train)
tree.pred <- predict(tree.carseats, Carseats.test,
type = "class")
table(tree.pred, High.test)
## High.test
## tree.pred No Yes
## No 104 33
## Yes 13 50
(104 + 50) / 200
## [1] 0.77
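# Equivalently, the test accuracy can be computed directly (a sketch):
mean(tree.pred == High.test)
## [1] 0.77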
###
set.seed(7)
cv.carseats <- cv.tree(tree.carseats, FUN = prune.misclass)
names(cv.carseats)
## [1] "size" "dev" "k" "method"
cv.carseats
## $size
## [1] 21 19 14 9 8 5 3 2 1
##
## $dev
## [1] 75 75 75 74 82 83 83 85 82
##
## $k
## [1] -Inf 0.0 1.0 1.4 2.0 3.0 4.0 9.0 18.0
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
### Despite its name, dev here is the number of cross-validation misclassification errors:
par(mfrow = c(1, 2))
plot(cv.carseats$size, cv.carseats$dev, type = "b")
plot(cv.carseats$k, cv.carseats$dev, type = "b")
### The CV error is minimized at size 9 (dev = 74), so we prune to nine terminal nodes:
prune.carseats <- prune.misclass(tree.carseats, best = 9)
plot(prune.carseats)
text(prune.carseats, pretty = 0)
###
tree.pred <- predict(prune.carseats, Carseats.test,
type = "class")
table(tree.pred, High.test)
## High.test
## tree.pred No Yes
## No 97 25
## Yes 20 58
(97 + 58) / 200
## [1] 0.775
### For comparison, pruning to a larger tree with 14 terminal nodes yields slightly lower test accuracy:
prune.carseats <- prune.misclass(tree.carseats, best = 14)
plot(prune.carseats)
text(prune.carseats, pretty = 0)
tree.pred <- predict(prune.carseats, Carseats.test,
type = "class")
table(tree.pred, High.test)
## High.test
## tree.pred No Yes
## No 102 31
## Yes 15 52
(102 + 52) / 200
## [1] 0.77
set.seed(1)
train <- sample(1:nrow(Boston), nrow(Boston) / 2)
tree.boston <- tree(medv ~ ., Boston, subset = train)
summary(tree.boston)
##
## Regression tree:
## tree(formula = medv ~ ., data = Boston, subset = train)
## Variables actually used in tree construction:
## [1] "rm" "lstat" "crim" "age"
## Number of terminal nodes: 7
## Residual mean deviance: 10.38 = 2555 / 246
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -10.1800 -1.7770 -0.1775 0.0000 1.9230 16.5800
###
plot(tree.boston)
text(tree.boston, pretty = 0)
###
cv.boston <- cv.tree(tree.boston)
cv.boston
## $size
## [1] 7 6 5 4 3 2 1
##
## $dev
## [1] 4380.849 4544.815 5601.055 6171.917 6919.608 10419.472 19630.870
##
## $k
## [1] -Inf 203.9641 637.2707 796.1207 1106.4931 3424.7810 10724.5951
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.boston$size, cv.boston$dev, type = "b")
### Prune to five terminal nodes (although CV favours the unpruned tree here):
prune.boston <- prune.tree(tree.boston, best = 5)
plot(prune.boston)
text(prune.boston, pretty = 0)
### In keeping with the CV results, use the unpruned tree to predict on the test set:
yhat <- predict(tree.boston, newdata = Boston[-train, ])
boston.test <- Boston[-train, "medv"]
plot(yhat, boston.test)
abline(0, 1)
mean((yhat - boston.test)^2)
## [1] 35.28688
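# The square root of the test MSE is about 5.94, so this tree's
# predictions are, on average, within roughly $5,940 of the true
# median home value (medv is recorded in thousands of dollars):
sqrt(mean((yhat - boston.test)^2))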
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
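# Boston in ISLR2 has 12 predictors, so mtry = 12 considers all of
# them at every split: this is bagging rather than a random forest.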
set.seed(1)
bag.boston <- randomForest(medv ~ ., data = Boston,
subset = train, mtry = 12, importance = TRUE)
bag.boston
##
## Call:
## randomForest(formula = medv ~ ., data = Boston, mtry = 12, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 12
##
## Mean of squared residuals: 11.40162
## % Var explained: 85.17
###
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
plot(yhat.bag, boston.test)
abline(0, 1)
mean((yhat.bag - boston.test)^2)
## [1] 23.41916
### Bagging again, with a smaller ensemble of 25 trees:
bag.boston <- randomForest(medv ~ ., data = Boston,
subset = train, mtry = 12, ntree = 25)
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
mean((yhat.bag - boston.test)^2)
## [1] 25.75055
### A random forest: mtry = 6, rather than all 12 predictors, at each split:
set.seed(1)
rf.boston <- randomForest(medv ~ ., data = Boston,
subset = train, mtry = 6, importance = TRUE)
yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
## [1] 20.06644
### importance() reports the mean increase in MSE when a variable is permuted (%IncMSE) and the total decrease in node impurity from splits on it (IncNodePurity):
importance(rf.boston)
## %IncMSE IncNodePurity
## crim 19.435587 1070.42307
## zn 3.091630 82.19257
## indus 6.140529 590.09536
## chas 1.370310 36.70356
## nox 13.263466 859.97091
## rm 35.094741 8270.33906
## age 15.144821 634.31220
## dis 9.163776 684.87953
## rad 4.793720 83.18719
## tax 4.410714 292.20949
## ptratio 8.612780 902.20190
## lstat 28.725343 5813.04833
###
varImpPlot(rf.boston)
library(gbm)
## Warning: package 'gbm' was built under R version 4.4.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
set.seed(1)
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
distribution = "gaussian", n.trees = 5000,
interaction.depth = 4)
summary(boost.boston)
## var rel.inf
## rm rm 44.48249588
## lstat lstat 32.70281223
## crim crim 4.85109954
## dis dis 4.48693083
## nox nox 3.75222394
## age age 3.19769210
## ptratio ptratio 2.81354826
## tax tax 1.54417603
## indus indus 1.03384666
## rad rad 0.87625748
## zn zn 0.16220479
## chas chas 0.09671228
### Partial dependence plots for the two most important variables, rm and lstat:
plot(boost.boston, i = "rm")
plot(boost.boston, i = "lstat")
###
yhat.boost <- predict(boost.boston,
newdata = Boston[-train, ], n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 18.39057
### Boosting again with shrinkage parameter 0.2:
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
distribution = "gaussian", n.trees = 5000,
interaction.depth = 4, shrinkage = 0.2, verbose = F)
yhat.boost <- predict(boost.boston,
newdata = Boston[-train, ], n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 16.54778
library(BART)
## Warning: package 'BART' was built under R version 4.4.3
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
##
## aml
x <- Boston[, 1:12]
y <- Boston[, "medv"]
xtrain <- x[train, ]
ytrain <- y[train]
xtest <- x[-train, ]
ytest <- y[-train]
set.seed(1)
bartfit <- gbart(xtrain, ytrain, x.test = xtest)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 253, 12, 253
## y1,yn: 0.213439, -5.486561
## x1,x[n*p]: 0.109590, 20.080000
## xp1,xp[np*p]: 0.027310, 7.880000
## *****Number of Trees: 200
## *****Number of Cut Points: 100 ... 100
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.795495,3,3.71636,21.7866
## *****sigma: 4.367914
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,12,0
## *****printevery: 100
##
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 3s
## trcnt,tecnt: 1000,1000
###
yhat.bart <- bartfit$yhat.test.mean
mean((ytest - yhat.bart)^2)
## [1] 15.94718
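# Collecting the Boston test MSEs reported above (values transcribed
# from the outputs), BART attains the lowest error on this split:
c(tree = 35.29, bagging = 23.42, rf = 20.07, boosting = 16.55,
bart = 15.95)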
### How many times each variable appears in the collection of trees, on average:
ord <- order(bartfit$varcount.mean, decreasing = T)
bartfit$varcount.mean[ord]
## nox lstat tax rad rm indus chas ptratio age zn
## 22.952 21.329 21.250 20.781 19.890 19.825 19.051 18.976 18.274 15.952
## dis crim
## 14.457 11.007
set.seed(1)
x <- matrix(rnorm(20 * 2), ncol = 2)
y <- c(rep(-1, 10), rep(1, 10))
x[y == 1, ] <- x[y == 1, ] + 1
plot(x, col = (3 - y))
dat <- data.frame(x = x, y = as.factor(y))
library(e1071)
svmfit <- svm(y ~ ., data = dat, kernel = "linear",
cost = 10, scale = FALSE)
plot(svmfit, dat)
svmfit$index
## [1] 1 2 5 7 14 16 17
summary(svmfit)
##
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 10, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 10
##
## Number of Support Vectors: 7
##
## ( 4 3 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
svmfit <- svm(y ~ ., data = dat, kernel = "linear",
cost = 0.1, scale = FALSE)
plot(svmfit, dat)
svmfit$index
## [1] 1 2 3 4 5 7 9 10 12 13 14 15 16 17 18 20
set.seed(1)
tune.out <- tune(svm, y ~ ., data = dat, kernel = "linear",
ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0.05
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.55 0.4377975
## 2 1e-02 0.55 0.4377975
## 3 1e-01 0.05 0.1581139
## 4 1e+00 0.15 0.2415229
## 5 5e+00 0.15 0.2415229
## 6 1e+01 0.15 0.2415229
## 7 1e+02 0.15 0.2415229
###
bestmod <- tune.out$best.model
summary(bestmod)
##
## Call:
## best.tune(METHOD = svm, train.x = y ~ ., data = dat, ranges = list(cost = c(0.001,
## 0.01, 0.1, 1, 5, 10, 100)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.1
##
## Number of Support Vectors: 16
##
## ( 8 8 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
xtest <- matrix(rnorm(20 * 2), ncol = 2)
ytest <- sample(c(-1, 1), 20, rep = TRUE)
xtest[ytest == 1, ] <- xtest[ytest == 1, ] + 1
testdat <- data.frame(x = xtest, y = as.factor(ytest))
ypred <- predict(bestmod, testdat)
table(predict = ypred, truth = testdat$y)
## truth
## predict -1 1
## -1 9 1
## 1 2 8
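# With the CV-chosen cost of 0.1, 17 of the 20 test observations are
# correctly classified:
(9 + 8) / 20
## [1] 0.85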
### With a smaller cost of 0.01, more test observations are misclassified:
svmfit <- svm(y ~ ., data = dat, kernel = "linear",
cost = .01, scale = FALSE)
ypred <- predict(svmfit, testdat)
table(predict = ypred, truth = testdat$y)
## truth
## predict -1 1
## -1 11 6
## 1 0 3
### Now shift the classes further apart so they are barely linearly separable:
x[y == 1, ] <- x[y == 1, ] + 0.5
plot(x, col = (y + 5) / 2, pch = 19)
###
dat <- data.frame(x = x, y = as.factor(y))
svmfit <- svm(y ~ ., data = dat, kernel = "linear",
cost = 1e5)
summary(svmfit)
##
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 1e+05)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1e+05
##
## Number of Support Vectors: 3
##
## ( 1 2 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
plot(svmfit, dat)
###
svmfit <- svm(y ~ ., data = dat, kernel = "linear", cost = 1)
summary(svmfit)
##
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 7
##
## ( 4 3 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
plot(svmfit, dat)
set.seed(1)
x <- matrix(rnorm(200 * 2), ncol = 2)
x[1:100, ] <- x[1:100, ] + 2
x[101:150, ] <- x[101:150, ] - 2
y <- c(rep(1, 150), rep(2, 50))
dat <- data.frame(x = x, y = as.factor(y))
plot(x, col = y)
###
train <- sample(200, 100)
svmfit <- svm(y ~ ., data = dat[train, ], kernel = "radial",
gamma = 1, cost = 1)
plot(svmfit, dat[train, ])
summary(svmfit)
##
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "radial", gamma = 1,
## cost = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 31
##
## ( 16 15 )
##
##
## Number of Classes: 2
##
## Levels:
## 1 2
### Increasing cost reduces training errors, at the price of a more irregular boundary and possible overfitting:
svmfit <- svm(y ~ ., data = dat[train, ], kernel = "radial",
gamma = 1, cost = 1e5)
plot(svmfit, dat[train, ])
###
set.seed(1)
tune.out <- tune(svm, y ~ ., data = dat[train, ],
kernel = "radial",
ranges = list(
cost = c(0.1, 1, 10, 100, 1000),
gamma = c(0.5, 1, 2, 3, 4)
)
)
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 1 0.5
##
## - best performance: 0.07
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-01 0.5 0.26 0.15776213
## 2 1e+00 0.5 0.07 0.08232726
## 3 1e+01 0.5 0.07 0.08232726
## 4 1e+02 0.5 0.14 0.15055453
## 5 1e+03 0.5 0.11 0.07378648
## 6 1e-01 1.0 0.22 0.16193277
## 7 1e+00 1.0 0.07 0.08232726
## 8 1e+01 1.0 0.09 0.07378648
## 9 1e+02 1.0 0.12 0.12292726
## 10 1e+03 1.0 0.11 0.11005049
## 11 1e-01 2.0 0.27 0.15670212
## 12 1e+00 2.0 0.07 0.08232726
## 13 1e+01 2.0 0.11 0.07378648
## 14 1e+02 2.0 0.12 0.13165612
## 15 1e+03 2.0 0.16 0.13498971
## 16 1e-01 3.0 0.27 0.15670212
## 17 1e+00 3.0 0.07 0.08232726
## 18 1e+01 3.0 0.08 0.07888106
## 19 1e+02 3.0 0.13 0.14181365
## 20 1e+03 3.0 0.15 0.13540064
## 21 1e-01 4.0 0.27 0.15670212
## 22 1e+00 4.0 0.07 0.08232726
## 23 1e+01 4.0 0.09 0.07378648
## 24 1e+02 4.0 0.13 0.14181365
## 25 1e+03 4.0 0.15 0.13540064
###
p <- predict(tune.out$best.model, newdata = dat[-train, ])
table(true = dat[-train, "y"], pred = p)
## pred
## true 1 2
## 1 67 10
## 2 2 21
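# 12% of the test observations are misclassified by this radial SVM:
(10 + 2) / 100
## [1] 0.12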
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.4.3
rocplot <- function(pred, truth, ...) {
# ROCR: create a prediction object, then compute the ROC curve
# (true positive rate vs false positive rate) and plot it
predob <- prediction(pred, truth)
perf <- performance(predob, "tpr", "fpr")
plot(perf, ...)
}
###
svmfit.opt <- svm(y ~ ., data = dat[train, ],
kernel = "radial", gamma = 2, cost = 1,
decision.values = T)
fitted <- attributes(
predict(svmfit.opt, dat[train, ], decision.values = TRUE)
)$decision.values
### The decision values are negated so that larger values correspond to the second class, as ROCR expects:
par(mfrow = c(1, 2))
rocplot(-fitted, dat[train, "y"], main = "Training Data")
###
svmfit.flex <- svm(y ~ ., data = dat[train, ],
kernel = "radial", gamma = 50, cost = 1,
decision.values = T)
fitted <- attributes(
predict(svmfit.flex, dat[train, ], decision.values = T)
)$decision.values
rocplot(-fitted, dat[train, "y"], add = T, col = "red")
###
fitted <- attributes(
predict(svmfit.opt, dat[-train, ], decision.values = T)
)$decision.values
rocplot(-fitted, dat[-train, "y"], main = "Test Data")
fitted <- attributes(
predict(svmfit.flex, dat[-train, ], decision.values = T)
)$decision.values
rocplot(-fitted, dat[-train, "y"], add = T, col = "red")
dat <- data.frame(xx = x[train, ], yy = as.factor(y[train]))
###
svmfit.opt <- svm(yy ~ ., data = dat, kernel = "radial", gamma = 2, cost = 1, probability = T)
probs <- attributes(predict(svmfit.opt, dat, probability = T))$probabilities[, 1]
###
par(mfrow = c(1, 2))
rocplot(probs, dat$yy, main = "Training Data")
###
svmfit.flex <- svm(yy ~ ., data = dat, kernel = "radial", gamma = 50, cost = 1, probability = T)
probs <- attributes(predict(svmfit.flex, dat, probability = T))$probabilities[, 1]
rocplot(probs, dat$yy, add = T, col = "red")
###
dat.test <- data.frame(xx = x[-train, ], yy = as.factor(y[-train]))
probs <- attributes(predict(svmfit.opt, dat.test, probability = T))$probabilities[, 1]
rocplot(probs, dat.test$yy, main = "Test Data")
probs <- attributes(predict(svmfit.flex, dat.test, probability = T))$probabilities[, 1]
rocplot(probs, dat.test$yy, add = T, col = "red")
set.seed(1)
x <- rbind(x, matrix(rnorm(50 * 2), ncol = 2))
y <- c(y, rep(0, 50))
x[y == 0, 2] <- x[y == 0, 2] + 2
dat <- data.frame(x = x, y = as.factor(y))
par(mfrow = c(1, 1))
plot(x, col = (y + 1))
###
svmfit <- svm(y ~ ., data = dat, kernel = "radial",
cost = 10, gamma = 1)
plot(svmfit, dat)
library(ISLR2)
names(Khan)
## [1] "xtrain" "xtest" "ytrain" "ytest"
dim(Khan$xtrain)
## [1] 63 2308
dim(Khan$xtest)
## [1] 20 2308
length(Khan$ytrain)
## [1] 63
length(Khan$ytest)
## [1] 20
table(Khan$ytrain)
##
## 1 2 3 4
## 8 23 12 20
table(Khan$ytest)
##
## 1 2 3 4
## 3 6 6 5
###
dat <- data.frame(
x = Khan$xtrain,
y = as.factor(Khan$ytrain)
)
out <- svm(y ~ ., data = dat, kernel = "linear",
cost = 10)
summary(out)
##
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 10)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 10
##
## Number of Support Vectors: 58
##
## ( 20 20 11 7 )
##
##
## Number of Classes: 4
##
## Levels:
## 1 2 3 4
table(out$fitted, dat$y)
##
## 1 2 3 4
## 1 8 0 0 0
## 2 0 23 0 0
## 3 0 0 12 0
## 4 0 0 0 20
###
dat.te <- data.frame(
x = Khan$xtest,
y = as.factor(Khan$ytest))
pred.te <- predict(out, newdata = dat.te)
table(pred.te, dat.te$y)
##
## pred.te 1 2 3 4
## 1 3 0 0 0
## 2 0 6 2 0
## 3 0 0 4 0
## 4 0 0 0 5