Chapter 02 (Introduction to R)

Basic Commands

x <- c(1, 3, 2, 5)
x
## [1] 1 3 2 5
x = c(1, 6, 2)
x
## [1] 1 6 2
y = c(1, 4, 3)
 
length(x)
## [1] 3
length(y)
## [1] 3
x + y
## [1]  2 10  5
ls()
## [1] "x" "y"
rm(x, y)
ls()
## character(0)
rm(list = ls())
###  ?matrix
  
x <- matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2)
x
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
x <- matrix(c(1, 2, 3, 4), 2, 2)
  
matrix(c(1, 2, 3, 4), 2, 2, byrow = TRUE)
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4
sqrt(x)
##          [,1]     [,2]
## [1,] 1.000000 1.732051
## [2,] 1.414214 2.000000
x^2
##      [,1] [,2]
## [1,]    1    9
## [2,]    4   16
x <- rnorm(50)
y <- x + rnorm(50, mean = 50, sd = .1)
cor(x, y)
## [1] 0.9953654
set.seed(1303)
rnorm(50)
##  [1] -1.1439763145  1.3421293656  2.1853904757  0.5363925179  0.0631929665
##  [6]  0.5022344825 -0.0004167247  0.5658198405 -0.5725226890 -1.1102250073
## [11] -0.0486871234 -0.6956562176  0.8289174803  0.2066528551 -0.2356745091
## [16] -0.5563104914 -0.3647543571  0.8623550343 -0.6307715354  0.3136021252
## [21] -0.9314953177  0.8238676185  0.5233707021  0.7069214120  0.4202043256
## [26] -0.2690521547 -1.5103172999 -0.6902124766 -0.1434719524 -1.0135274099
## [31]  1.5732737361  0.0127465055  0.8726470499  0.4220661905 -0.0188157917
## [36]  2.6157489689 -0.6931401748 -0.2663217810 -0.7206364412  1.3677342065
## [41]  0.2640073322  0.6321868074 -1.3306509858  0.0268888182  1.0406363208
## [46]  1.3120237985 -0.0300020767 -0.2500257125  0.0234144857  1.6598706557
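
Because the generator is deterministic given the seed, re-running with the same seed reproduces the draws exactly; a quick check (draws is just a scratch name):

set.seed(1303)
draws <- rnorm(5)
set.seed(1303)
identical(draws, rnorm(5))
## [1] TRUE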
set.seed(3)
y <- rnorm(100)
mean(y)
## [1] 0.01103557
var(y)
## [1] 0.7328675
sqrt(var(y))
## [1] 0.8560768
sd(y)
## [1] 0.8560768

Graphics

x <- rnorm(100)
y <- rnorm(100)
plot(x, y)

plot(x, y, xlab = "this is the x-axis",
    ylab = "this is the y-axis",
    main = "Plot of X vs Y")

pdf("Figure.pdf")
plot(x, y, col = "green")
dev.off()
## png 
##   2
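
Other graphics devices work the same way; a minimal sketch that writes a JPEG instead (file name arbitrary):

jpeg("Figure.jpeg")
plot(x, y, col = "green")
dev.off()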
x <- seq(1, 10)
x
##  [1]  1  2  3  4  5  6  7  8  9 10
x <- 1:10
x
##  [1]  1  2  3  4  5  6  7  8  9 10
x <- seq(-pi, pi, length = 50)
###
y <- x
f <- outer(x, y, function(x, y) cos(y) / (1 + x^2))
contour(x, y, f)
contour(x, y, f, nlevels = 45, add = T)

fa <- (f - t(f)) / 2
contour(x, y, fa, nlevels = 15)

###
image(x, y, fa)

persp(x, y, fa)

persp(x, y, fa, theta = 30)

persp(x, y, fa, theta = 30, phi = 20)

persp(x, y, fa, theta = 30, phi = 70)

persp(x, y, fa, theta = 30, phi = 40)

Indexing Data


A <- matrix(1:16, 4, 4)
A
##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
## [3,]    3    7   11   15
## [4,]    4    8   12   16
A[2, 3]
## [1] 10
A[c(1, 3), c(2, 4)]
##      [,1] [,2]
## [1,]    5   13
## [2,]    7   15
A[1:3, 2:4]
##      [,1] [,2] [,3]
## [1,]    5    9   13
## [2,]    6   10   14
## [3,]    7   11   15
A[1:2, ]
##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
A[, 1:2]
##      [,1] [,2]
## [1,]    1    5
## [2,]    2    6
## [3,]    3    7
## [4,]    4    8
A[1, ]
## [1]  1  5  9 13
A[-c(1, 3), ]
##      [,1] [,2] [,3] [,4]
## [1,]    2    6   10   14
## [2,]    4    8   12   16
A[-c(1, 3), -c(1, 3, 4)]
## [1] 6 8
dim(A)
## [1] 4 4

Loading Data

Auto <- read.table("D:/Masters/AST 533  Statistical Computing III/Data/Auto.data")
# View(Auto)
head(Auto)
##     V1        V2           V3         V4     V5           V6   V7     V8
## 1  mpg cylinders displacement horsepower weight acceleration year origin
## 2 18.0         8        307.0      130.0  3504.         12.0   70      1
## 3 15.0         8        350.0      165.0  3693.         11.5   70      1
## 4 18.0         8        318.0      150.0  3436.         11.0   70      1
## 5 16.0         8        304.0      150.0  3433.         12.0   70      1
## 6 17.0         8        302.0      140.0  3449.         10.5   70      1
##                          V9
## 1                      name
## 2 chevrolet chevelle malibu
## 3         buick skylark 320
## 4        plymouth satellite
## 5             amc rebel sst
## 6               ford torino
Auto <- read.table("D:/Masters/AST 533  Statistical Computing III/Data/Auto.data", header = T, na.strings = "?", stringsAsFactors = T)
# View(Auto)
  
Auto <- read.csv("D:/Masters/AST 533  Statistical Computing III/Data/Auto.csv", na.strings = "?", stringsAsFactors = T)
# View(Auto)
dim(Auto)
## [1] 397   9
Auto[1:4, ]
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
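
Before dropping rows, it can help to see where the missing values sit; a quick check on the Auto just read (the "?" entries are all in horsepower):

sum(is.na(Auto))
colSums(is.na(Auto))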
Auto <- na.omit(Auto)
dim(Auto)
## [1] 392   9
names(Auto)
## [1] "mpg"          "cylinders"    "displacement" "horsepower"   "weight"      
## [6] "acceleration" "year"         "origin"       "name"

Additional Graphical and Numerical Summaries

# plot(cylinders, mpg) # errors at this point: cylinders is not yet attached or qualified
plot(Auto$cylinders, Auto$mpg)

attach(Auto)
plot(cylinders, mpg)

cylinders <- as.factor(cylinders)
  
plot(cylinders, mpg)

plot(cylinders, mpg, col = "red")

plot(cylinders, mpg, col = "red", varwidth = T)

plot(cylinders, mpg, col = "red", varwidth = T,
    horizontal = T)

plot(cylinders, mpg, col = "red", varwidth = T,
    xlab = "cylinders", ylab = "MPG")

hist(mpg)

hist(mpg, col = 2)

hist(mpg, col = 2, breaks = 15)

pairs(Auto)

pairs(~ mpg + displacement + horsepower + weight + acceleration,
    data = Auto)

plot(horsepower, mpg)
identify(horsepower, mpg, name)

## integer(0)
summary(Auto)
##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                                                
##   acceleration        year           origin                      name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
##  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
##                                                  (Other)           :365
summary(mpg)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    9.00   17.00   22.75   23.45   29.00   46.60

Chapter 03 (Linear Regression)

Simple Linear Regression

## Libraries
library(MASS)
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.3
## 
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
## 
##     Auto
## The following object is masked from 'package:MASS':
## 
##     Boston
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio lstat medv
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3  4.98 24.0
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8  9.14 21.6
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8  4.03 34.7
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7  2.94 33.4
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7  5.33 36.2
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7  5.21 28.7
# lm.fit <- lm(medv ~ lstat) # errors: medv is not found without data = Boston or attach()
  
lm.fit <- lm(medv ~ lstat, data = Boston)
attach(Boston)
lm.fit <- lm(medv ~ lstat)
lm.fit
## 
## Call:
## lm(formula = medv ~ lstat)
## 
## Coefficients:
## (Intercept)        lstat  
##       34.55        -0.95
summary(lm.fit)
## 
## Call:
## lm(formula = medv ~ lstat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41   <2e-16 ***
## lstat       -0.95005    0.03873  -24.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16
names(lm.fit)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
coef(lm.fit)
## (Intercept)       lstat 
##  34.5538409  -0.9500494
lm.fit$coefficients
## (Intercept)       lstat 
##  34.5538409  -0.9500494
# confidence interval
confint(lm.fit)
##                 2.5 %     97.5 %
## (Intercept) 33.448457 35.6592247
## lstat       -1.026148 -0.8739505
predict(lm.fit, data.frame(lstat = (c(5, 10, 15))),
    interval = "confidence")
##        fit      lwr      upr
## 1 29.80359 29.00741 30.59978
## 2 25.05335 24.47413 25.63256
## 3 20.30310 19.73159 20.87461
# lstat values of 100-300 lie far beyond the observed range (max about 38), so these are extrapolations
predict(object = lm.fit, newdata = data.frame(lstat = c(100, 200, 300)),
    interval = "confidence")
##          fit        lwr        upr
## 1  -60.45109  -67.12023  -53.78196
## 2 -155.45603 -169.72325 -141.18881
## 3 -250.46097 -272.33447 -228.58746
predict(lm.fit, data.frame(lstat = (c(5, 10, 15))),
    interval = "prediction")
##        fit       lwr      upr
## 1 29.80359 17.565675 42.04151
## 2 25.05335 12.827626 37.27907
## 3 20.30310  8.077742 32.52846
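
Both intervals are centered at the same fitted values, but the prediction intervals are much wider because they also account for the irreducible error of an individual new observation.
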
plot(lstat, medv)
abline(lm.fit)
  
abline(lm.fit, lwd = 3)
abline(lm.fit, lwd = 3, col = "red")

plot(lstat, medv, col = "red")

plot(lstat, medv, pch = 20)

plot(lstat, medv, pch = "+")

plot(1:20, 1:20, pch = 1:20)

par(mfrow = c(2, 2))
plot(lm.fit)
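
plot() on an lm object draws all four diagnostic panels; the which argument selects a single one, e.g. the residuals-versus-fitted plot:

par(mfrow = c(1, 1))
plot(lm.fit, which = 1)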

head(predict(lm.fit) )
##        1        2        3        4        5        6 
## 29.82260 25.87039 30.72514 31.76070 29.49008 29.60408
head(lm.fit$fitted.values) # same result
##        1        2        3        4        5        6 
## 29.82260 25.87039 30.72514 31.76070 29.49008 29.60408
head(residuals(lm.fit))
##          1          2          3          4          5          6 
## -5.8225951 -4.2703898  3.9748580  1.6393042  6.7099222 -0.9040837
head(lm.fit$residuals) # same result
##          1          2          3          4          5          6 
## -5.8225951 -4.2703898  3.9748580  1.6393042  6.7099222 -0.9040837
plot(predict(lm.fit), residuals(lm.fit))
plot(predict(lm.fit), rstudent(lm.fit))
  
plot(hatvalues(lm.fit))
which.max(hatvalues(lm.fit))
## 375 
## 375
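
Observation 375 has the largest leverage. As a reference point, the average leverage always equals (p + 1)/n, so values far above it deserve attention; a quick check:

mean(hatvalues(lm.fit))  # always (p + 1)/n
(1 + 1) / nrow(Boston)   # p = 1 predictor, n = 506 observations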

Multiple Linear Regression

lm.fit <- lm(medv ~ lstat + age, data = Boston)
summary(lm.fit)
## 
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.981  -3.978  -1.283   1.968  23.158 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.22276    0.73085  45.458  < 2e-16 ***
## lstat       -1.03207    0.04819 -21.416  < 2e-16 ***
## age          0.03454    0.01223   2.826  0.00491 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared:  0.5513, Adjusted R-squared:  0.5495 
## F-statistic:   309 on 2 and 503 DF,  p-value: < 2.2e-16
lm.fit <- lm(medv ~ ., data = Boston)
summary(lm.fit)
## 
## Call:
## lm(formula = medv ~ ., data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.1304  -2.7673  -0.5814   1.9414  26.2526 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  41.617270   4.936039   8.431 3.79e-16 ***
## crim         -0.121389   0.033000  -3.678 0.000261 ***
## zn            0.046963   0.013879   3.384 0.000772 ***
## indus         0.013468   0.062145   0.217 0.828520    
## chas          2.839993   0.870007   3.264 0.001173 ** 
## nox         -18.758022   3.851355  -4.870 1.50e-06 ***
## rm            3.658119   0.420246   8.705  < 2e-16 ***
## age           0.003611   0.013329   0.271 0.786595    
## dis          -1.490754   0.201623  -7.394 6.17e-13 ***
## rad           0.289405   0.066908   4.325 1.84e-05 ***
## tax          -0.012682   0.003801  -3.337 0.000912 ***
## ptratio      -0.937533   0.132206  -7.091 4.63e-12 ***
## lstat        -0.552019   0.050659 -10.897  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.798 on 493 degrees of freedom
## Multiple R-squared:  0.7343, Adjusted R-squared:  0.7278 
## F-statistic: 113.5 on 12 and 493 DF,  p-value: < 2.2e-16
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.2
vif(lm.fit) #Variance Inflation Factors
##     crim       zn    indus     chas      nox       rm      age      dis 
## 1.767486 2.298459 3.987181 1.071168 4.369093 1.912532 3.088232 3.954037 
##      rad      tax  ptratio    lstat 
## 7.445301 9.002158 1.797060 2.870777
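
By a common rule of thumb, VIF values above 5 or 10 indicate problematic collinearity; here rad and tax come closest. One way to flag them:

vif(lm.fit)[vif(lm.fit) > 5]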
lm.fit1 <- lm(medv ~ . - age, data = Boston)
summary(lm.fit1)
## 
## Call:
## lm(formula = medv ~ . - age, data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.1851  -2.7330  -0.6116   1.8555  26.3838 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  41.525128   4.919684   8.441 3.52e-16 ***
## crim         -0.121426   0.032969  -3.683 0.000256 ***
## zn            0.046512   0.013766   3.379 0.000785 ***
## indus         0.013451   0.062086   0.217 0.828577    
## chas          2.852773   0.867912   3.287 0.001085 ** 
## nox         -18.485070   3.713714  -4.978 8.91e-07 ***
## rm            3.681070   0.411230   8.951  < 2e-16 ***
## dis          -1.506777   0.192570  -7.825 3.12e-14 ***
## rad           0.287940   0.066627   4.322 1.87e-05 ***
## tax          -0.012653   0.003796  -3.333 0.000923 ***
## ptratio      -0.934649   0.131653  -7.099 4.39e-12 ***
## lstat        -0.547409   0.047669 -11.483  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.794 on 494 degrees of freedom
## Multiple R-squared:  0.7343, Adjusted R-squared:  0.7284 
## F-statistic: 124.1 on 11 and 494 DF,  p-value: < 2.2e-16
lm.fit1 <- update(lm.fit, ~ . - age)
summary(lm.fit1)
## 
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + nox + rm + dis + 
##     rad + tax + ptratio + lstat, data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.1851  -2.7330  -0.6116   1.8555  26.3838 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  41.525128   4.919684   8.441 3.52e-16 ***
## crim         -0.121426   0.032969  -3.683 0.000256 ***
## zn            0.046512   0.013766   3.379 0.000785 ***
## indus         0.013451   0.062086   0.217 0.828577    
## chas          2.852773   0.867912   3.287 0.001085 ** 
## nox         -18.485070   3.713714  -4.978 8.91e-07 ***
## rm            3.681070   0.411230   8.951  < 2e-16 ***
## dis          -1.506777   0.192570  -7.825 3.12e-14 ***
## rad           0.287940   0.066627   4.322 1.87e-05 ***
## tax          -0.012653   0.003796  -3.333 0.000923 ***
## ptratio      -0.934649   0.131653  -7.099 4.39e-12 ***
## lstat        -0.547409   0.047669 -11.483  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.794 on 494 degrees of freedom
## Multiple R-squared:  0.7343, Adjusted R-squared:  0.7284 
## F-statistic: 124.1 on 11 and 494 DF,  p-value: < 2.2e-16

Interaction Terms

The formula lstat * age expands to lstat + age + lstat:age, so both main effects are included automatically along with the interaction.

summary(lm(medv ~ lstat * age, data = Boston))
## 
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.806  -4.045  -1.333   2.085  27.552 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.0885359  1.4698355  24.553  < 2e-16 ***
## lstat       -1.3921168  0.1674555  -8.313 8.78e-16 ***
## age         -0.0007209  0.0198792  -0.036   0.9711    
## lstat:age    0.0041560  0.0018518   2.244   0.0252 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared:  0.5557, Adjusted R-squared:  0.5531 
## F-statistic: 209.3 on 3 and 502 DF,  p-value: < 2.2e-16

Non-linear Transformations of the Predictors

lm.fit2 <- lm(medv ~ lstat + I(lstat^2)) # I() is needed because ^ has a special meaning inside formulas
summary(lm.fit2)
## 
## Call:
## lm(formula = medv ~ lstat + I(lstat^2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2834  -3.8313  -0.5295   2.3095  25.4148 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 42.862007   0.872084   49.15   <2e-16 ***
## lstat       -2.332821   0.123803  -18.84   <2e-16 ***
## I(lstat^2)   0.043547   0.003745   11.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared:  0.6407, Adjusted R-squared:  0.6393 
## F-statistic: 448.5 on 2 and 503 DF,  p-value: < 2.2e-16
lm.fit <- lm(medv ~ lstat)
anova(lm.fit, lm.fit2)
## Analysis of Variance Table
## 
## Model 1: medv ~ lstat
## Model 2: medv ~ lstat + I(lstat^2)
##   Res.Df   RSS Df Sum of Sq     F    Pr(>F)    
## 1    504 19472                                 
## 2    503 15347  1    4125.1 135.2 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
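
As a consistency check, when two nested models differ by a single term, the partial F statistic equals that term's squared t statistic:

11.63^2  # t value of I(lstat^2) from the summary above
## [1] 135.2569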
par(mfrow = c(2, 2))
plot(lm.fit2)

lm.fit5 <- lm(medv ~ poly(lstat, 5))
summary(lm.fit5)
## 
## Call:
## lm(formula = medv ~ poly(lstat, 5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5433  -3.1039  -0.7052   2.0844  27.1153 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       22.5328     0.2318  97.197  < 2e-16 ***
## poly(lstat, 5)1 -152.4595     5.2148 -29.236  < 2e-16 ***
## poly(lstat, 5)2   64.2272     5.2148  12.316  < 2e-16 ***
## poly(lstat, 5)3  -27.0511     5.2148  -5.187 3.10e-07 ***
## poly(lstat, 5)4   25.4517     5.2148   4.881 1.42e-06 ***
## poly(lstat, 5)5  -19.2524     5.2148  -3.692 0.000247 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.215 on 500 degrees of freedom
## Multiple R-squared:  0.6817, Adjusted R-squared:  0.6785 
## F-statistic: 214.2 on 5 and 500 DF,  p-value: < 2.2e-16
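
poly() uses orthogonal polynomials by default; raw = TRUE re-expresses the same model in raw powers, changing the coefficients but not the fit. A quick check (lm.fit5.raw is a scratch name):

lm.fit5.raw <- lm(medv ~ poly(lstat, 5, raw = TRUE))
all.equal(fitted(lm.fit5), fitted(lm.fit5.raw))
## [1] TRUE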
summary(lm(medv ~ log(rm), data = Boston))
## 
## Call:
## lm(formula = medv ~ log(rm), data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.487  -2.875  -0.104   2.837  39.816 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -76.488      5.028  -15.21   <2e-16 ***
## log(rm)       54.055      2.739   19.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.915 on 504 degrees of freedom
## Multiple R-squared:  0.4358, Adjusted R-squared:  0.4347 
## F-statistic: 389.3 on 1 and 504 DF,  p-value: < 2.2e-16

Qualitative Predictors

library(ISLR2)
head(Carseats)
##   Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
## 1  9.50       138     73          11        276   120       Bad  42        17
## 2 11.22       111     48          16        260    83      Good  65        10
## 3 10.06       113     35          10        269    80    Medium  59        12
## 4  7.40       117    100           4        466    97    Medium  55        14
## 5  4.15       141     64           3        340   128       Bad  38        13
## 6 10.81       124    113          13        501    72       Bad  78        16
##   Urban  US
## 1   Yes Yes
## 2   Yes Yes
## 3   Yes Yes
## 4   Yes Yes
## 5   Yes  No
## 6    No Yes
lm.fit <- lm(Sales ~ . + Income:Advertising + Price:Age, 
    data = Carseats)
summary(lm.fit)
## 
## Call:
## lm(formula = Sales ~ . + Income:Advertising + Price:Age, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9208 -0.7503  0.0177  0.6754  3.3413 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.5755654  1.0087470   6.519 2.22e-10 ***
## CompPrice           0.0929371  0.0041183  22.567  < 2e-16 ***
## Income              0.0108940  0.0026044   4.183 3.57e-05 ***
## Advertising         0.0702462  0.0226091   3.107 0.002030 ** 
## Population          0.0001592  0.0003679   0.433 0.665330    
## Price              -0.1008064  0.0074399 -13.549  < 2e-16 ***
## ShelveLocGood       4.8486762  0.1528378  31.724  < 2e-16 ***
## ShelveLocMedium     1.9532620  0.1257682  15.531  < 2e-16 ***
## Age                -0.0579466  0.0159506  -3.633 0.000318 ***
## Education          -0.0208525  0.0196131  -1.063 0.288361    
## UrbanYes            0.1401597  0.1124019   1.247 0.213171    
## USYes              -0.1575571  0.1489234  -1.058 0.290729    
## Income:Advertising  0.0007510  0.0002784   2.698 0.007290 ** 
## Price:Age           0.0001068  0.0001333   0.801 0.423812    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.011 on 386 degrees of freedom
## Multiple R-squared:  0.8761, Adjusted R-squared:  0.8719 
## F-statistic:   210 on 13 and 386 DF,  p-value: < 2.2e-16
attach(Carseats)
contrasts(ShelveLoc)
##        Good Medium
## Bad       0      0
## Good      1      0
## Medium    0      1
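
So R codes ShelveLoc with Bad as the baseline: ShelveLocGood and ShelveLocMedium in the regression output measure the sales effect of a good or medium shelving location relative to a bad one.
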
Writing Functions

# Calling LoadLibraries before it is defined would error with
# "object 'LoadLibraries' not found":
# LoadLibraries
# LoadLibraries()

LoadLibraries <- function() {
 library(ISLR2)
 library(MASS)
 print("The libraries have been loaded.")
}
  
LoadLibraries
## function () 
## {
##     library(ISLR2)
##     library(MASS)
##     print("The libraries have been loaded.")
## }
LoadLibraries()
## [1] "The libraries have been loaded."

Chapter 04 (Classification Methods)

The Stock Market Data

library(ISLR2)
names(Smarket)
## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"
dim(Smarket)
## [1] 1250    9
summary(Smarket)
##       Year           Lag1                Lag2                Lag3          
##  Min.   :2001   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.922000  
##  1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500   1st Qu.:-0.640000  
##  Median :2003   Median : 0.039000   Median : 0.039000   Median : 0.038500  
##  Mean   :2003   Mean   : 0.003834   Mean   : 0.003919   Mean   : 0.001716  
##  3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.596750  
##  Max.   :2005   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.733000  
##       Lag4                Lag5              Volume           Today          
##  Min.   :-4.922000   Min.   :-4.92200   Min.   :0.3561   Min.   :-4.922000  
##  1st Qu.:-0.640000   1st Qu.:-0.64000   1st Qu.:1.2574   1st Qu.:-0.639500  
##  Median : 0.038500   Median : 0.03850   Median :1.4229   Median : 0.038500  
##  Mean   : 0.001636   Mean   : 0.00561   Mean   :1.4783   Mean   : 0.003138  
##  3rd Qu.: 0.596750   3rd Qu.: 0.59700   3rd Qu.:1.6417   3rd Qu.: 0.596750  
##  Max.   : 5.733000   Max.   : 5.73300   Max.   :3.1525   Max.   : 5.733000  
##  Direction 
##  Down:602  
##  Up  :648  
##            
##            
##            
## 
pairs(Smarket)

str(Smarket)
## 'data.frame':    1250 obs. of  9 variables:
##  $ Year     : num  2001 2001 2001 2001 2001 ...
##  $ Lag1     : num  0.381 0.959 1.032 -0.623 0.614 ...
##  $ Lag2     : num  -0.192 0.381 0.959 1.032 -0.623 ...
##  $ Lag3     : num  -2.624 -0.192 0.381 0.959 1.032 ...
##  $ Lag4     : num  -1.055 -2.624 -0.192 0.381 0.959 ...
##  $ Lag5     : num  5.01 -1.055 -2.624 -0.192 0.381 ...
##  $ Volume   : num  1.19 1.3 1.41 1.28 1.21 ...
##  $ Today    : num  0.959 1.032 -0.623 0.614 0.213 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
# cor(Smarket) #Error in cor(Smarket) : 'x' must be numeric
cor(Smarket[, -9])
##              Year         Lag1         Lag2         Lag3         Lag4
## Year   1.00000000  0.029699649  0.030596422  0.033194581  0.035688718
## Lag1   0.02969965  1.000000000 -0.026294328 -0.010803402 -0.002985911
## Lag2   0.03059642 -0.026294328  1.000000000 -0.025896670 -0.010853533
## Lag3   0.03319458 -0.010803402 -0.025896670  1.000000000 -0.024051036
## Lag4   0.03568872 -0.002985911 -0.010853533 -0.024051036  1.000000000
## Lag5   0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## Volume 0.53900647  0.040909908 -0.043383215 -0.041823686 -0.048414246
## Today  0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
##                Lag5      Volume        Today
## Year    0.029787995  0.53900647  0.030095229
## Lag1   -0.005674606  0.04090991 -0.026155045
## Lag2   -0.003557949 -0.04338321 -0.010250033
## Lag3   -0.018808338 -0.04182369 -0.002447647
## Lag4   -0.027083641 -0.04841425 -0.006899527
## Lag5    1.000000000 -0.02200231 -0.034860083
## Volume -0.022002315  1.00000000  0.014591823
## Today  -0.034860083  0.01459182  1.000000000
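
The lag variables are essentially uncorrelated with Today; the only substantial correlation is between Year and Volume (0.54), and the plot below shows trading volume increasing over time.
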
attach(Smarket)
plot(Volume)

plot(Smarket$Volume)

Logistic Regression

glm.fits <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
    data = Smarket, family = binomial)
summary(glm.fits)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Smarket)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## Lag1        -0.073074   0.050167  -1.457    0.145
## Lag2        -0.042301   0.050086  -0.845    0.398
## Lag3         0.011085   0.049939   0.222    0.824
## Lag4         0.009359   0.049974   0.187    0.851
## Lag5         0.010313   0.049511   0.208    0.835
## Volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3
coef(glm.fits)
##  (Intercept)         Lag1         Lag2         Lag3         Lag4         Lag5 
## -0.126000257 -0.073073746 -0.042301344  0.011085108  0.009358938  0.010313068 
##       Volume 
##  0.135440659
summary(glm.fits)$coef
##                 Estimate Std. Error    z value  Pr(>|z|)
## (Intercept) -0.126000257 0.24073574 -0.5233966 0.6006983
## Lag1        -0.073073746 0.05016739 -1.4565986 0.1452272
## Lag2        -0.042301344 0.05008605 -0.8445733 0.3983491
## Lag3         0.011085108 0.04993854  0.2219750 0.8243333
## Lag4         0.009358938 0.04997413  0.1872757 0.8514445
## Lag5         0.010313068 0.04951146  0.2082966 0.8349974
## Volume       0.135440659 0.15835970  0.8552723 0.3924004
summary(glm.fits)$coef[, 4] ## p-value
## (Intercept)        Lag1        Lag2        Lag3        Lag4        Lag5 
##   0.6006983   0.1452272   0.3983491   0.8243333   0.8514445   0.8349974 
##      Volume 
##   0.3924004
glm.probs <- predict(glm.fits, type = "response")
glm.probs[1:10]
##         1         2         3         4         5         6         7         8 
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565 0.4926509 0.5092292 
##         9        10 
## 0.5176135 0.4888378
contrasts(Direction)
##      Up
## Down  0
## Up    1
glm.pred <- rep("Down", 1250)
glm.pred[glm.probs > .5] <- "Up"

table(glm.pred, Direction)
##         Direction
## glm.pred Down  Up
##     Down  145 141
##     Up    457 507
(507 + 145) / 1250 ## accuracy
## [1] 0.5216
mean(glm.pred == Direction)
## [1] 0.5216
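
Note that 52.2% is a training accuracy, computed on the same data used to fit the model, so it is, if anything, optimistic; the train/test split further below gives an honest assessment.
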
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.2
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Smarket)
## Setting levels: control = Down, case = Up
## Setting direction: controls > cases
## Setting levels: control = Down, case = Up
## Setting direction: controls > cases
## Setting levels: control = Down, case = Up
## Setting direction: controls < cases
## Setting levels: control = Down, case = Up
## Setting direction: controls > cases
## Setting levels: control = Down, case = Up
## Setting direction: controls < cases
## Setting levels: control = Down, case = Up
## Setting direction: controls < cases
## $Lag1
## 
## Call:
## roc.formula(formula = Direction ~ Lag1, data = Smarket)
## 
## Data: Lag1 in 602 controls (Direction Down) > 648 cases (Direction Up).
## Area under the curve: 0.5348
## 
## $Lag2
## 
## Call:
## roc.formula(formula = Direction ~ Lag2, data = Smarket)
## 
## Data: Lag2 in 602 controls (Direction Down) > 648 cases (Direction Up).
## Area under the curve: 0.5243
## 
## $Lag3
## 
## Call:
## roc.formula(formula = Direction ~ Lag3, data = Smarket)
## 
## Data: Lag3 in 602 controls (Direction Down) < 648 cases (Direction Up).
## Area under the curve: 0.4992
## 
## $Lag4
## 
## Call:
## roc.formula(formula = Direction ~ Lag4, data = Smarket)
## 
## Data: Lag4 in 602 controls (Direction Down) > 648 cases (Direction Up).
## Area under the curve: 0.4997
## 
## $Lag5
## 
## Call:
## roc.formula(formula = Direction ~ Lag5, data = Smarket)
## 
## Data: Lag5 in 602 controls (Direction Down) < 648 cases (Direction Up).
## Area under the curve: 0.5053
## 
## $Volume
## 
## Call:
## roc.formula(formula = Direction ~ Volume, data = Smarket)
## 
## Data: Volume in 602 controls (Direction Down) < 648 cases (Direction Up).
## Area under the curve: 0.5177
A <- roc(Direction, glm.probs)
## Setting levels: control = Down, case = Up
## Setting direction: controls < cases
plot(roc(Direction, glm.probs))
## Setting levels: control = Down, case = Up
## Setting direction: controls < cases

plot(A)
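
Note that the formula interface roc(Direction ~ Lag1 + ... + Volume) fits one univariate ROC curve per right-hand-side variable, which is why a list of six results is printed above; roc(Direction, glm.probs) instead evaluates the combined logistic fit. Its AUC can be read off directly:

auc(A)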
glm_fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
    family = "binomial", data = Smarket)
summary(glm_fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = "binomial", data = Smarket)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## Lag1        -0.073074   0.050167  -1.457    0.145
## Lag2        -0.042301   0.050086  -0.845    0.398
## Lag3         0.011085   0.049939   0.222    0.824
## Lag4         0.009359   0.049974   0.187    0.851
## Lag5         0.010313   0.049511   0.208    0.835
## Volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3
glm_pro <- predict(object = glm_fit, type = "response")
contrasts(Direction)
##      Up
## Down  0
## Up    1
glm_pred <- rep("Down", 1250)
glm_pred[glm_pro > 0.5] <- "Up"
table(glm_pred, Direction)
##         Direction
## glm_pred Down  Up
##     Down  145 141
##     Up    457 507
train <- (Year < 2005)
Smarket.2005 <- Smarket[!train, ]
dim(Smarket.2005)
## [1] 252   9
Direction.2005 <- Direction[!train]

glm.fits <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
    data = Smarket, family = binomial, subset = train)



glm.probs <- predict(glm.fits, Smarket.2005,
    type = "response")

glm.pred <- rep("Down", 252)
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Direction.2005)
##         Direction.2005
## glm.pred Down Up
##     Down   77 97
##     Up     34 44
mean(glm.pred == Direction.2005)
## [1] 0.4801587
mean(glm.pred != Direction.2005)
## [1] 0.5198413
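
A 52% test error rate is worse than guessing at random. The refit below drops everything except Lag1 and Lag2, the lags that looked most promising in the full fit.
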

2nd way

tr <- Smarket[Year < 2005, ]
test <- Smarket[!(Year < 2005), ]
gl.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
    data = tr, family = binomial)

prob <- predict(gl.fit, test, type = "response")

dim(test)
## [1] 252   9
pre <- rep("Down", 252)
pre[prob > 0.5] <- "Up"

table(pre, test$Direction)
##       
## pre    Down Up
##   Down   77 97
##   Up     34 44
glm.fits <- glm(Direction ~ Lag1 + Lag2, data = Smarket,
    family = binomial, subset = train)

glm.probs <- predict(glm.fits, Smarket.2005,
    type = "response")
glm.pred <- rep("Down", 252)
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Direction.2005)
##         Direction.2005
## glm.pred Down  Up
##     Down   35  35
##     Up     76 106
mean(glm.pred == Direction.2005)
## [1] 0.5595238
106 / (106 + 76) ## accuracy on days predicted to rise
## [1] 0.5824176
predict(glm.fits, newdata = data.frame(Lag1 = c(1.2, 1.5), Lag2 = c(1.1, -0.8)),
    type = "response")
##         1         2 
## 0.4791462 0.4960939

2nd way

tr <- Smarket[Year < 2005, ]
test <- Smarket[!(Year < 2005), ]
gl.fit <- glm(Direction ~ Lag1 + Lag2,
    data = tr, family = binomial)

prob <- predict(gl.fit, test, type = "response")

dim(test)
## [1] 252   9
pre <- rep("Down", 252)
pre[prob > 0.5] <- "Up"

table(pre, test$Direction)
##       
## pre    Down  Up
##   Down   35  35
##   Up     76 106
mean(pre == test$Direction)
## [1] 0.5595238

Linear Discriminant Analysis

library(MASS)
lda.fit <- lda(Direction ~ Lag1 + Lag2, data = Smarket,
    subset = train)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
## 
## Coefficients of linear discriminants:
##             LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
plot(lda.fit)

lda.pred <- predict(lda.fit, Smarket.2005)
names(lda.pred)
## [1] "class"     "posterior" "x"
lda.class <- lda.pred$class
table(lda.class, Direction.2005)
##          Direction.2005
## lda.class Down  Up
##      Down   35  35
##      Up     76 106
mean(lda.class == Direction.2005)
## [1] 0.5595238
sum(lda.pred$posterior[, 1] >= .5)
## [1] 70
sum(lda.pred$posterior[, 1] < .5)
## [1] 182
lda.pred$posterior[1:20, 1]
##       999      1000      1001      1002      1003      1004      1005      1006 
## 0.4901792 0.4792185 0.4668185 0.4740011 0.4927877 0.4938562 0.4951016 0.4872861 
##      1007      1008      1009      1010      1011      1012      1013      1014 
## 0.4907013 0.4844026 0.4906963 0.5119988 0.4895152 0.4706761 0.4744593 0.4799583 
##      1015      1016      1017      1018 
## 0.4935775 0.5030894 0.4978806 0.4886331
lda.class[1:20]
##  [1] Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Down Up   Up   Up  
## [16] Up   Up   Down Up   Up  
## Levels: Down Up
sum(lda.pred$posterior[, 1] > .9)
## [1] 0
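
So no day in 2005 reaches even a 90% posterior probability of a market decrease; LDA is never very confident here.
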

2nd way

tr <- Smarket[Year < 2005, ]
test <- Smarket[!(Year < 2005), ]
lda_fit <- lda(Direction ~ Lag1 + Lag2, data = tr)

# predict() for lda objects has no "type" argument; it returns class, posterior and x
pre <- predict(lda_fit, test)$class
table(pre, test$Direction)
##       
## pre    Down  Up
##   Down   35  35
##   Up     76 106
mean(pre == test$Direction)
## [1] 0.5595238

Quadratic Discriminant Analysis

qda.fit <- qda(Direction ~ Lag1 + Lag2, data = Smarket,
    subset = train)
qda.fit
## Call:
## qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
qda.class <- predict(qda.fit, Smarket.2005)$class
table(qda.class, Direction.2005)
##          Direction.2005
## qda.class Down  Up
##      Down   30  20
##      Up     81 121
mean(qda.class == Direction.2005)
## [1] 0.5992063
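
QDA's roughly 60% accuracy on the held-out 2005 data beats both logistic regression and LDA, suggesting the quadratic boundary captures the relationship somewhat better.
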

2nd way

tr <- Smarket[Year < 2005, ]
test <- Smarket[!(Year < 2005), ]
qda_fit <- qda(Direction ~ Lag1 + Lag2, data = tr)

pre <- predict(qda_fit, test)$class
table(pre, test$Direction)
##       
## pre    Down  Up
##   Down   30  20
##   Up     81 121
mean(pre == test$Direction)
## [1] 0.5992063

Naive Bayes

library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
nb.fit <- naiveBayes(Direction ~ Lag1 + Lag2, data = Smarket,
    subset = train)
nb.fit
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##     Down       Up 
## 0.491984 0.508016 
## 
## Conditional probabilities:
##       Lag1
## Y             [,1]     [,2]
##   Down  0.04279022 1.227446
##   Up   -0.03954635 1.231668
## 
##       Lag2
## Y             [,1]     [,2]
##   Down  0.03389409 1.239191
##   Up   -0.03132544 1.220765
nb.class <- predict(nb.fit, Smarket.2005)
table(nb.class, Direction.2005)
##         Direction.2005
## nb.class Down  Up
##     Down   28  20
##     Up     83 121
mean(nb.class == Direction.2005)
## [1] 0.5912698
nb.preds <- predict(nb.fit, Smarket.2005, type = "raw")
nb.preds[1:5, ]
##           Down        Up
## [1,] 0.4873164 0.5126836
## [2,] 0.4762492 0.5237508
## [3,] 0.4653377 0.5346623
## [4,] 0.4748652 0.5251348
## [5,] 0.4901890 0.5098110
mean(Lag1[train][Direction[train] == "Down"])
## [1] 0.04279022
sd(Lag1[train][Direction[train] == "Down"])
## [1] 1.227446
## 2nd way
mean(tr$Lag1[tr$Direction == "Down"])
## [1] 0.04279022
sd(tr$Lag1[tr$Direction == "Down"])
## [1] 1.227446

2nd way

tr <- Smarket[Year < 2005, ]
test <- Smarket[!(Year < 2005), ]
naiveBayes_fit <- naiveBayes(Direction ~ Lag1 + Lag2, data = tr)

prob <- predict(naiveBayes_fit, test, type = "raw")

# prob[, 1] is P(Down), so predict "Up" whenever it falls below 0.5
pre <- ifelse(prob[, 1] < 0.5, "Up", "Down")

table(pre, test$Direction)
##       
## pre    Down  Up
##   Down   28  20
##   Up     83 121
mean(pre == test$Direction)
## [1] 0.5912698

K-Nearest Neighbors

library(class)
train.X <- cbind(Lag1, Lag2)[train, ]
test.X <- cbind(Lag1, Lag2)[!train, ]
train.Direction <- Direction[train]

set.seed(1)
knn.pred <- knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.2005)
##         Direction.2005
## knn.pred Down Up
##     Down   43 58
##     Up     68 83
(83 + 43) / 252 ## overall accuracy for k = 1
## [1] 0.5
knn.pred <- knn(train.X, test.X, train.Direction, k = 3)
table(knn.pred, Direction.2005)
##         Direction.2005
## knn.pred Down Up
##     Down   48 54
##     Up     63 87
mean(knn.pred == Direction.2005)
## [1] 0.5357143

2nd way

library(class)
train <- Smarket[Year < 2005, ] # note: train is now a data frame, replacing the logical index used above
test <- Smarket[!(Year < 2005), ]

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
train1 <- train %>%
  select(Lag1, Lag2)
test1 <- test %>%
  select(Lag1, Lag2)

class <- train$Direction # a plain vector named class; the class() function is unaffected

knn.pred <- knn(train = train1, test = test1, cl = class, k = 1)
table(knn.pred, test$Direction)
##         
## knn.pred Down Up
##     Down   43 58
##     Up     68 83
mean(knn.pred == test$Direction)
## [1] 0.5
set.seed(1)
knn.pred3 <- knn(train = train1, test = test1, cl = class, k = 3)
table(knn.pred3, test$Direction)
##          
## knn.pred3 Down Up
##      Down   48 55
##      Up     63 86
mean(knn.pred3 == test$Direction)
## [1] 0.531746
dim(Caravan)
## [1] 5822   86
attach(Caravan)
summary(Purchase)
##   No  Yes 
## 5474  348
348 / 5822
## [1] 0.05977327
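
Only about 6% of customers buy the insurance, so a classifier that always predicts No would already be 94% accurate; the interesting quantity is the success rate among customers predicted to buy. The baseline, for reference:

mean(Purchase == "No")
## [1] 0.9402267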
standardized.X <- scale(Caravan[, -86])
var(Caravan[, 1])
## [1] 165.0378
var(Caravan[, 2])
## [1] 0.1647078
var(standardized.X[, 1])
## [1] 1
var(standardized.X[, 2])
## [1] 1
test <- 1:1000
train.X <- standardized.X[-test, ]
test.X <- standardized.X[test, ]
train.Y <- Purchase[-test]
test.Y <- Purchase[test]

set.seed(1)
knn.pred <- knn(train.X, test.X, train.Y, k = 1)
mean(test.Y == knn.pred)
## [1] 0.882
mean(test.Y != knn.pred)
## [1] 0.118
mean(test.Y != "No")
## [1] 0.059
table(knn.pred, test.Y)
##         test.Y
## knn.pred  No Yes
##      No  873  50
##      Yes  68   9
9 / (68 + 9) ## success rate among customers predicted to buy (k = 1)
## [1] 0.1168831
knn.pred <- knn(train.X, test.X, train.Y, k = 3)
table(knn.pred, test.Y)
##         test.Y
## knn.pred  No Yes
##      No  920  54
##      Yes  21   5
5 / 26 ## k = 3: 5 of 26 predicted buyers actually buy
## [1] 0.1923077
knn.pred <- knn(train.X, test.X, train.Y, k = 5)
table(knn.pred, test.Y)
##         test.Y
## knn.pred  No Yes
##      No  930  55
##      Yes  11   4
4 / 15 ## k = 5: 4 of 15 predicted buyers actually buy
## [1] 0.2666667
glm.fits <- glm(Purchase ~ ., data = Caravan,
    family = binomial, subset = -test)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs <- predict(glm.fits, Caravan[test, ],type = "response")

glm.pred <- rep("No", 1000)
glm.pred[glm.probs > .5] <- "Yes"
table(glm.pred, test.Y)
##         test.Y
## glm.pred  No Yes
##      No  934  59
##      Yes   7   0
glm.pred <- rep("No", 1000)
glm.pred[glm.probs > .25] <- "Yes"
table(glm.pred, test.Y)
##         test.Y
## glm.pred  No Yes
##      No  919  48
##      Yes  22  11
11 / (22 + 11) ## success rate among predicted buyers at the 0.25 cutoff
## [1] 0.3333333
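
With the 0.25 cutoff, 33 customers are flagged and 11 of them actually buy, a success rate of about 33%, over five times what random targeting (about 6%) would achieve.
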

Poisson Regression

attach(Bikeshare)
dim(Bikeshare)
## [1] 8645   15
names(Bikeshare)
##  [1] "season"     "mnth"       "day"        "hr"         "holiday"   
##  [6] "weekday"    "workingday" "weathersit" "temp"       "atemp"     
## [11] "hum"        "windspeed"  "casual"     "registered" "bikers"
mod.lm <- lm(
    bikers ~ mnth + hr + workingday + temp + weathersit,
    data = Bikeshare
  )
summary(mod.lm)
## 
## Call:
## lm(formula = bikers ~ mnth + hr + workingday + temp + weathersit, 
##     data = Bikeshare)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -299.00  -45.70   -6.23   41.08  425.29 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -68.632      5.307 -12.932  < 2e-16 ***
## mnthFeb                      6.845      4.287   1.597 0.110398    
## mnthMarch                   16.551      4.301   3.848 0.000120 ***
## mnthApril                   41.425      4.972   8.331  < 2e-16 ***
## mnthMay                     72.557      5.641  12.862  < 2e-16 ***
## mnthJune                    67.819      6.544  10.364  < 2e-16 ***
## mnthJuly                    45.324      7.081   6.401 1.63e-10 ***
## mnthAug                     53.243      6.640   8.019 1.21e-15 ***
## mnthSept                    66.678      5.925  11.254  < 2e-16 ***
## mnthOct                     75.834      4.950  15.319  < 2e-16 ***
## mnthNov                     60.310      4.610  13.083  < 2e-16 ***
## mnthDec                     46.458      4.271  10.878  < 2e-16 ***
## hr1                        -14.579      5.699  -2.558 0.010536 *  
## hr2                        -21.579      5.733  -3.764 0.000168 ***
## hr3                        -31.141      5.778  -5.389 7.26e-08 ***
## hr4                        -36.908      5.802  -6.361 2.11e-10 ***
## hr5                        -24.135      5.737  -4.207 2.61e-05 ***
## hr6                         20.600      5.704   3.612 0.000306 ***
## hr7                        120.093      5.693  21.095  < 2e-16 ***
## hr8                        223.662      5.690  39.310  < 2e-16 ***
## hr9                        120.582      5.693  21.182  < 2e-16 ***
## hr10                        83.801      5.705  14.689  < 2e-16 ***
## hr11                       105.423      5.722  18.424  < 2e-16 ***
## hr12                       137.284      5.740  23.916  < 2e-16 ***
## hr13                       136.036      5.760  23.617  < 2e-16 ***
## hr14                       126.636      5.776  21.923  < 2e-16 ***
## hr15                       132.087      5.780  22.852  < 2e-16 ***
## hr16                       178.521      5.772  30.927  < 2e-16 ***
## hr17                       296.267      5.749  51.537  < 2e-16 ***
## hr18                       269.441      5.736  46.976  < 2e-16 ***
## hr19                       186.256      5.714  32.596  < 2e-16 ***
## hr20                       125.549      5.704  22.012  < 2e-16 ***
## hr21                        87.554      5.693  15.378  < 2e-16 ***
## hr22                        59.123      5.689  10.392  < 2e-16 ***
## hr23                        26.838      5.688   4.719 2.41e-06 ***
## workingday                   1.270      1.784   0.711 0.476810    
## temp                       157.209     10.261  15.321  < 2e-16 ***
## weathersitcloudy/misty     -12.890      1.964  -6.562 5.60e-11 ***
## weathersitlight rain/snow  -66.494      2.965 -22.425  < 2e-16 ***
## weathersitheavy rain/snow -109.745     76.667  -1.431 0.152341    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 76.5 on 8605 degrees of freedom
## Multiple R-squared:  0.6745, Adjusted R-squared:  0.6731 
## F-statistic: 457.3 on 39 and 8605 DF,  p-value: < 2.2e-16
contrasts(Bikeshare$hr) <- contr.sum(24)
contrasts(Bikeshare$mnth) <- contr.sum(12)
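
contr.sum switches these factors to sum-to-zero coding: the last level gets no column of its own, and its implicit coefficient equals minus the sum of the others, which is why the plots below append -sum(coef(...)) as the final point. For a three-level factor the coding matrix looks like this:

contr.sum(3)
##   [,1] [,2]
## 1    1    0
## 2    0    1
## 3   -1   -1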
mod.lm2 <- lm(
    bikers ~ mnth + hr + workingday + temp + weathersit,
    data = Bikeshare
  )
summary(mod.lm2)
## 
## Call:
## lm(formula = bikers ~ mnth + hr + workingday + temp + weathersit, 
##     data = Bikeshare)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -299.00  -45.70   -6.23   41.08  425.29 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 73.5974     5.1322  14.340  < 2e-16 ***
## mnth1                      -46.0871     4.0855 -11.281  < 2e-16 ***
## mnth2                      -39.2419     3.5391 -11.088  < 2e-16 ***
## mnth3                      -29.5357     3.1552  -9.361  < 2e-16 ***
## mnth4                       -4.6622     2.7406  -1.701  0.08895 .  
## mnth5                       26.4700     2.8508   9.285  < 2e-16 ***
## mnth6                       21.7317     3.4651   6.272 3.75e-10 ***
## mnth7                       -0.7626     3.9084  -0.195  0.84530    
## mnth8                        7.1560     3.5347   2.024  0.04295 *  
## mnth9                       20.5912     3.0456   6.761 1.46e-11 ***
## mnth10                      29.7472     2.6995  11.019  < 2e-16 ***
## mnth11                      14.2229     2.8604   4.972 6.74e-07 ***
## hr1                        -96.1420     3.9554 -24.307  < 2e-16 ***
## hr2                       -110.7213     3.9662 -27.916  < 2e-16 ***
## hr3                       -117.7212     4.0165 -29.310  < 2e-16 ***
## hr4                       -127.2828     4.0808 -31.191  < 2e-16 ***
## hr5                       -133.0495     4.1168 -32.319  < 2e-16 ***
## hr6                       -120.2775     4.0370 -29.794  < 2e-16 ***
## hr7                        -75.5424     3.9916 -18.925  < 2e-16 ***
## hr8                         23.9511     3.9686   6.035 1.65e-09 ***
## hr9                        127.5199     3.9500  32.284  < 2e-16 ***
## hr10                        24.4399     3.9360   6.209 5.57e-10 ***
## hr11                       -12.3407     3.9361  -3.135  0.00172 ** 
## hr12                         9.2814     3.9447   2.353  0.01865 *  
## hr13                        41.1417     3.9571  10.397  < 2e-16 ***
## hr14                        39.8939     3.9750  10.036  < 2e-16 ***
## hr15                        30.4940     3.9910   7.641 2.39e-14 ***
## hr16                        35.9445     3.9949   8.998  < 2e-16 ***
## hr17                        82.3786     3.9883  20.655  < 2e-16 ***
## hr18                       200.1249     3.9638  50.488  < 2e-16 ***
## hr19                       173.2989     3.9561  43.806  < 2e-16 ***
## hr20                        90.1138     3.9400  22.872  < 2e-16 ***
## hr21                        29.4071     3.9362   7.471 8.74e-14 ***
## hr22                        -8.5883     3.9332  -2.184  0.02902 *  
## hr23                       -37.0194     3.9344  -9.409  < 2e-16 ***
## workingday                   1.2696     1.7845   0.711  0.47681    
## temp                       157.2094    10.2612  15.321  < 2e-16 ***
## weathersitcloudy/misty     -12.8903     1.9643  -6.562 5.60e-11 ***
## weathersitlight rain/snow  -66.4944     2.9652 -22.425  < 2e-16 ***
## weathersitheavy rain/snow -109.7446    76.6674  -1.431  0.15234    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 76.5 on 8605 degrees of freedom
## Multiple R-squared:  0.6745, Adjusted R-squared:  0.6731 
## F-statistic: 457.3 on 39 and 8605 DF,  p-value: < 2.2e-16
sum((predict(mod.lm) - predict(mod.lm2))^2)
## [1] 1.586608e-18
all.equal(predict(mod.lm), predict(mod.lm2))
## [1] TRUE
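
So the choice of contrasts changes only how the coefficients are parameterized; the fitted values are identical.
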
coef.months <- c(coef(mod.lm2)[2:12],
    -sum(coef(mod.lm2)[2:12]))

plot(coef.months, xlab = "Month", ylab = "Coefficient",
    xaxt = "n", col = "blue", pch = 19, type = "o")
axis(side = 1, at = 1:12, labels = c("J", "F", "M", "A",
    "M", "J", "J", "A", "S", "O", "N", "D"))

coef.hours <- c(coef(mod.lm2)[13:35],
    -sum(coef(mod.lm2)[13:35]))
plot(coef.hours, xlab = "Hour", ylab = "Coefficient",
    col = "blue", pch = 19, type = "o")

plot(coef.months, type = "o", xaxt = "n")
axis(side = 1, at = 1:12, labels = c("J", "F", "M", "A", "M", "J", "J", "A", "S", "O", "N", "D"))

mod.pois <- glm(
    bikers ~ mnth + hr + workingday + temp + weathersit,
    data = Bikeshare, family = poisson
  )
summary(mod.pois)
## 
## Call:
## glm(formula = bikers ~ mnth + hr + workingday + temp + weathersit, 
##     family = poisson, data = Bikeshare)
## 
## Coefficients:
##                            Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)                4.118245   0.006021  683.964  < 2e-16 ***
## mnth1                     -0.670170   0.005907 -113.445  < 2e-16 ***
## mnth2                     -0.444124   0.004860  -91.379  < 2e-16 ***
## mnth3                     -0.293733   0.004144  -70.886  < 2e-16 ***
## mnth4                      0.021523   0.003125    6.888 5.66e-12 ***
## mnth5                      0.240471   0.002916   82.462  < 2e-16 ***
## mnth6                      0.223235   0.003554   62.818  < 2e-16 ***
## mnth7                      0.103617   0.004125   25.121  < 2e-16 ***
## mnth8                      0.151171   0.003662   41.281  < 2e-16 ***
## mnth9                      0.233493   0.003102   75.281  < 2e-16 ***
## mnth10                     0.267573   0.002785   96.091  < 2e-16 ***
## mnth11                     0.150264   0.003180   47.248  < 2e-16 ***
## hr1                       -0.754386   0.007879  -95.744  < 2e-16 ***
## hr2                       -1.225979   0.009953 -123.173  < 2e-16 ***
## hr3                       -1.563147   0.011869 -131.702  < 2e-16 ***
## hr4                       -2.198304   0.016424 -133.846  < 2e-16 ***
## hr5                       -2.830484   0.022538 -125.586  < 2e-16 ***
## hr6                       -1.814657   0.013464 -134.775  < 2e-16 ***
## hr7                       -0.429888   0.006896  -62.341  < 2e-16 ***
## hr8                        0.575181   0.004406  130.544  < 2e-16 ***
## hr9                        1.076927   0.003563  302.220  < 2e-16 ***
## hr10                       0.581769   0.004286  135.727  < 2e-16 ***
## hr11                       0.336852   0.004720   71.372  < 2e-16 ***
## hr12                       0.494121   0.004392  112.494  < 2e-16 ***
## hr13                       0.679642   0.004069  167.040  < 2e-16 ***
## hr14                       0.673565   0.004089  164.722  < 2e-16 ***
## hr15                       0.624910   0.004178  149.570  < 2e-16 ***
## hr16                       0.653763   0.004132  158.205  < 2e-16 ***
## hr17                       0.874301   0.003784  231.040  < 2e-16 ***
## hr18                       1.294635   0.003254  397.848  < 2e-16 ***
## hr19                       1.212281   0.003321  365.084  < 2e-16 ***
## hr20                       0.914022   0.003700  247.065  < 2e-16 ***
## hr21                       0.616201   0.004191  147.045  < 2e-16 ***
## hr22                       0.364181   0.004659   78.173  < 2e-16 ***
## hr23                       0.117493   0.005225   22.488  < 2e-16 ***
## workingday                 0.014665   0.001955    7.502 6.27e-14 ***
## temp                       0.785292   0.011475   68.434  < 2e-16 ***
## weathersitcloudy/misty    -0.075231   0.002179  -34.528  < 2e-16 ***
## weathersitlight rain/snow -0.575800   0.004058 -141.905  < 2e-16 ***
## weathersitheavy rain/snow -0.926287   0.166782   -5.554 2.79e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1052921  on 8644  degrees of freedom
## Residual deviance:  228041  on 8605  degrees of freedom
## AIC: 281159
## 
## Number of Fisher Scoring iterations: 5
coef.mnth <- c(coef(mod.pois)[2:12],
    -sum(coef(mod.pois)[2:12]))
plot(coef.mnth, xlab = "Month", ylab = "Coefficient",
     xaxt = "n", col = "blue", pch = 19, type = "o")
axis(side = 1, at = 1:12, labels = c("J", "F", "M", "A", "M", "J", "J", "A", "S", "O", "N", "D"))

coef.hours <- c(coef(mod.pois)[13:35],
     -sum(coef(mod.pois)[13:35]))
plot(coef.hours, xlab = "Hour", ylab = "Coefficient",
    col = "blue", pch = 19, type = "o")

plot(predict(mod.lm2), predict(mod.pois, type = "response"))
abline(0, 1, col = 2, lwd = 3)
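
The two sets of predictions track each other closely, but the Poisson fit can never produce a negative predicted count, unlike the linear model.
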

#### my try

mod.pois1 <- glm(
    bikers ~ mnth + hr + workingday + temp + weathersit,
    data = Bikeshare, family = poisson
  )
summary(mod.pois1)
## 
## Call:
## glm(formula = bikers ~ mnth + hr + workingday + temp + weathersit, 
##     family = poisson, data = Bikeshare)
## 
## Coefficients:
##                            Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)                4.118245   0.006021  683.964  < 2e-16 ***
## mnth1                     -0.670170   0.005907 -113.445  < 2e-16 ***
## mnth2                     -0.444124   0.004860  -91.379  < 2e-16 ***
## mnth3                     -0.293733   0.004144  -70.886  < 2e-16 ***
## mnth4                      0.021523   0.003125    6.888 5.66e-12 ***
## mnth5                      0.240471   0.002916   82.462  < 2e-16 ***
## mnth6                      0.223235   0.003554   62.818  < 2e-16 ***
## mnth7                      0.103617   0.004125   25.121  < 2e-16 ***
## mnth8                      0.151171   0.003662   41.281  < 2e-16 ***
## mnth9                      0.233493   0.003102   75.281  < 2e-16 ***
## mnth10                     0.267573   0.002785   96.091  < 2e-16 ***
## mnth11                     0.150264   0.003180   47.248  < 2e-16 ***
## hr1                       -0.754386   0.007879  -95.744  < 2e-16 ***
## hr2                       -1.225979   0.009953 -123.173  < 2e-16 ***
## hr3                       -1.563147   0.011869 -131.702  < 2e-16 ***
## hr4                       -2.198304   0.016424 -133.846  < 2e-16 ***
## hr5                       -2.830484   0.022538 -125.586  < 2e-16 ***
## hr6                       -1.814657   0.013464 -134.775  < 2e-16 ***
## hr7                       -0.429888   0.006896  -62.341  < 2e-16 ***
## hr8                        0.575181   0.004406  130.544  < 2e-16 ***
## hr9                        1.076927   0.003563  302.220  < 2e-16 ***
## hr10                       0.581769   0.004286  135.727  < 2e-16 ***
## hr11                       0.336852   0.004720   71.372  < 2e-16 ***
## hr12                       0.494121   0.004392  112.494  < 2e-16 ***
## hr13                       0.679642   0.004069  167.040  < 2e-16 ***
## hr14                       0.673565   0.004089  164.722  < 2e-16 ***
## hr15                       0.624910   0.004178  149.570  < 2e-16 ***
## hr16                       0.653763   0.004132  158.205  < 2e-16 ***
## hr17                       0.874301   0.003784  231.040  < 2e-16 ***
## hr18                       1.294635   0.003254  397.848  < 2e-16 ***
## hr19                       1.212281   0.003321  365.084  < 2e-16 ***
## hr20                       0.914022   0.003700  247.065  < 2e-16 ***
## hr21                       0.616201   0.004191  147.045  < 2e-16 ***
## hr22                       0.364181   0.004659   78.173  < 2e-16 ***
## hr23                       0.117493   0.005225   22.488  < 2e-16 ***
## workingday                 0.014665   0.001955    7.502 6.27e-14 ***
## temp                       0.785292   0.011475   68.434  < 2e-16 ***
## weathersitcloudy/misty    -0.075231   0.002179  -34.528  < 2e-16 ***
## weathersitlight rain/snow -0.575800   0.004058 -141.905  < 2e-16 ***
## weathersitheavy rain/snow -0.926287   0.166782   -5.554 2.79e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1052921  on 8644  degrees of freedom
## Residual deviance:  228041  on 8605  degrees of freedom
## AIC: 281159
## 
## Number of Fisher Scoring iterations: 5
coef <- c(mod.pois1$coefficients[2:12], -sum(mod.pois1$coefficients[2:12]))
coef
##       mnth1       mnth2       mnth3       mnth4       mnth5       mnth6 
## -0.67016993 -0.44412382 -0.29373313  0.02152319  0.24047087  0.22323523 
##       mnth7       mnth8       mnth9      mnth10      mnth11             
##  0.10361658  0.15117137  0.23349303  0.26757318  0.15026355  0.01667987
plot(coef, type = "o")

### Note: mnth already carries sum-to-zero contrasts at this point (the
### mnth1-mnth11 coefficients above are contr.sum effects), so this
### assignment is redundant; changing contrasts would in any case only
### affect models fit afterwards, not the already-fitted mod.pois1.
contrasts(Bikeshare$mnth) <- contr.sum(12)

coef <- c(mod.pois1$coefficients[2:12], -sum(mod.pois1$coefficients[2:12]))
coef
##       mnth1       mnth2       mnth3       mnth4       mnth5       mnth6 
## -0.67016993 -0.44412382 -0.29373313  0.02152319  0.24047087  0.22323523 
##       mnth7       mnth8       mnth9      mnth10      mnth11             
##  0.10361658  0.15117137  0.23349303  0.26757318  0.15026355  0.01667987
plot(coef, type = "o")
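### Quick check (a sketch, not part of the original run): mod.pois1 refits
### the same specification as mod.pois, so the two sets of fitted values
### should coincide along the 45-degree line.
plot(predict(mod.pois, type = "response"),
     predict(mod.pois1, type = "response"))
abline(0, 1, col = 2)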

Chapter 05 (Cross-Validation and the Bootstrap)

The Validation Set Approach

library(ISLR2)
set.seed(1)
train <- sample(392, 196)

lm.fit <- lm(mpg ~ horsepower, data = Auto, subset = train)

attach(Auto)
## The following object is masked _by_ .GlobalEnv:
## 
##     cylinders
## The following object is masked from package:lubridate:
## 
##     origin
## The following object is masked from package:ggplot2:
## 
##     mpg
## The following objects are masked from Auto (pos = 25):
## 
##     acceleration, cylinders, displacement, horsepower, mpg, name,
##     origin, weight, year
mean((mpg - predict(lm.fit, Auto))[-train]^2)
## [1] 23.26601
lm.fit2 <- lm(mpg ~ poly(horsepower, 2), data = Auto, 
    subset = train)
mean((mpg - predict(lm.fit2, Auto))[-train]^2)
## [1] 18.71646
lm.fit3 <- lm(mpg ~ poly(horsepower, 3), data = Auto, 
    subset = train)
mean((mpg - predict(lm.fit3, Auto))[-train]^2)
## [1] 18.79401
set.seed(2)
train <- sample(392, 196)
lm.fit <- lm(mpg ~ horsepower, data = Auto, subset = train)
mean((mpg - predict(lm.fit, Auto))[-train]^2)
## [1] 25.72651
lm.fit2 <- lm(mpg ~ poly(horsepower, 2), data = Auto, 
    subset = train)
mean((mpg - predict(lm.fit2, Auto))[-train]^2)
## [1] 20.43036
lm.fit3 <- lm(mpg ~ poly(horsepower, 3), data = Auto, 
    subset = train)
mean((mpg - predict(lm.fit3, Auto))[-train]^2)
## [1] 20.38533
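### A compact alternative (a sketch, assuming Auto and train as above):
### compute the validation-set MSE for degrees 1-3 in one loop instead of
### three copy-pasted fits.
val.mse <- sapply(1:3, function(d) {
  fit <- lm(mpg ~ poly(horsepower, d), data = Auto, subset = train)
  mean((Auto$mpg - predict(fit, Auto))[-train]^2)
})
val.mse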

Leave-One-Out Cross-Validation

glm.fit <- glm(mpg ~ horsepower, data = Auto)
coef(glm.fit)
## (Intercept)  horsepower 
##  39.9358610  -0.1578447
lm.fit <- lm(mpg ~ horsepower, data = Auto)
coef(lm.fit)
## (Intercept)  horsepower 
##  39.9358610  -0.1578447
library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:car':
## 
##     logit
glm.fit <- glm(mpg ~ horsepower, data = Auto)
cv.err <- cv.glm(Auto, glm.fit)
cv.err$delta
## [1] 24.23151 24.23114
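### The two delta values are the raw cross-validation estimate of prediction
### error and a bias-corrected version; for LOOCV they are nearly identical.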
cv.error <- rep(0, 10)
for (i in 1:10) {
  glm.fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
  cv.error[i] <- cv.glm(Auto, glm.fit)$delta[1]
}
cv.error
##  [1] 24.23151 19.24821 19.33498 19.42443 19.03321 18.97864 18.83305 18.96115
##  [9] 19.06863 19.49093

k-Fold Cross-Validation

set.seed(17)
cv.error.10 <- rep(0, 10)
for (i in 1:10) {
  glm.fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
  cv.error.10[i] <- cv.glm(Auto, glm.fit, K = 10)$delta[1]
}
cv.error.10
##  [1] 24.27207 19.26909 19.34805 19.29496 19.03198 18.89781 19.12061 19.14666
##  [9] 18.87013 20.95520
error <- c()
for (i in 1:10) {
  glm_fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
  error[i] <- cv.glm(data = Auto, glmfit = glm_fit)$delta[1]
}
error
##  [1] 24.23151 19.24821 19.33498 19.42443 19.03321 18.97864 18.83305 18.96115
##  [9] 19.06863 19.49093
error10 <- c()
for (i in 1:10) {
  glm_fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
  error10[i] <- cv.glm(data = Auto, glmfit = glm_fit, K = 10)$delta[1]
}
error10
##  [1] 24.30935 19.28566 19.47768 19.45928 19.11895 18.96636 19.14254 19.19909
##  [9] 19.14876 19.96767
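### These 10-fold estimates differ slightly from cv.error.10 above because
### the fold assignment is random and no seed was set before this run.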

The Bootstrap

### Estimating the Accuracy of a Statistic of Interest


alpha.fn <- function(data, index) {
  X <- data$X[index]
  Y <- data$Y[index]
  (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y))
}

alpha.fn(Portfolio, 1:100)
## [1] 0.5758321
set.seed(7)
alpha.fn(Portfolio, sample(100, 100, replace = T))
## [1] 0.5385326
boot(Portfolio, alpha.fn, R = 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Portfolio, statistic = alpha.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##      original       bias    std. error
## t1* 0.5758321 0.0007959475  0.08969074
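### What boot() automates, done by hand (a minimal sketch): resample row
### indices with replacement, recompute alpha each time, and take the
### standard deviation of the replicates as the bootstrap standard error.
set.seed(7)
alpha.boot <- replicate(1000,
    alpha.fn(Portfolio, sample(100, 100, replace = TRUE)))
sd(alpha.boot)  # should be close to the 0.0897 std. error reported by boot()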
### Estimating the Accuracy of a Linear Regression Model


boot.fn <- function(data, index)
  coef(lm(mpg ~ horsepower, data = data, subset = index))
boot.fn(Auto, 1:392)
## (Intercept)  horsepower 
##  39.9358610  -0.1578447
set.seed(1)
boot.fn(Auto, sample(392, 392, replace = T))
## (Intercept)  horsepower 
##  40.3404517  -0.1634868
boot.fn(Auto, sample(392, 392, replace = T))
## (Intercept)  horsepower 
##  40.1186906  -0.1577063
boot(Auto, boot.fn, 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Auto, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##       original        bias    std. error
## t1* 39.9358610  0.0544513229 0.841289790
## t2* -0.1578447 -0.0006170901 0.007343073
summary(lm(mpg ~ horsepower, data = Auto))$coef
##               Estimate  Std. Error   t value      Pr(>|t|)
## (Intercept) 39.9358610 0.717498656  55.65984 1.220362e-187
## horsepower  -0.1578447 0.006445501 -24.48914  7.031989e-81
boot.fn <- function(data, index)
  coef(
      lm(mpg ~ horsepower + I(horsepower^2), 
        data = data, subset = index)
    )
set.seed(1)
boot(Auto, boot.fn, 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Auto, statistic = boot.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##         original        bias     std. error
## t1* 56.900099702  3.511640e-02 2.0300222526
## t2* -0.466189630 -7.080834e-04 0.0324241984
## t3*  0.001230536  2.840324e-06 0.0001172164
summary(
    lm(mpg ~ horsepower + I(horsepower^2), data = Auto)
  )$coef
##                     Estimate   Std. Error   t value      Pr(>|t|)
## (Intercept)     56.900099702 1.8004268063  31.60367 1.740911e-109
## horsepower      -0.466189630 0.0311246171 -14.97816  2.289429e-40
## I(horsepower^2)  0.001230536 0.0001220759  10.08009  2.196340e-21
boot_fun <- function(data, index) {
  coefficients(lm(mpg ~ horsepower, data = data, subset = index))
}

boot(Auto, statistic = boot_fun, R = 1000)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Auto, statistic = boot_fun, R = 1000)
## 
## 
## Bootstrap Statistics :
##       original        bias    std. error
## t1* 39.9358610  0.0250819701 0.855907634
## t2* -0.1578447 -0.0002002252 0.007393336
coefficients(lm(mpg ~ horsepower, data = Auto))
## (Intercept)  horsepower 
##  39.9358610  -0.1578447

Chapter 06 (Model Selection)

Subset Selection Methods

Best Subset Selection

library(ISLR2)
#View(Hitters)
names(Hitters)
##  [1] "AtBat"     "Hits"      "HmRun"     "Runs"      "RBI"       "Walks"    
##  [7] "Years"     "CAtBat"    "CHits"     "CHmRun"    "CRuns"     "CRBI"     
## [13] "CWalks"    "League"    "Division"  "PutOuts"   "Assists"   "Errors"   
## [19] "Salary"    "NewLeague"
dim(Hitters)
## [1] 322  20
sum(is.na(Hitters$Salary))
## [1] 59
Hitters <- na.omit(Hitters)
dim(Hitters)
## [1] 263  20
sum(is.na(Hitters))
## [1] 0
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.3
regfit.full <- regsubsets(Salary ~ ., Hitters)
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters)
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1  ( 1 ) " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 2  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 3  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 4  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 5  ( 1 ) "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 6  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 7  ( 1 ) " "   "*"  " "   " "  " " "*"   " "   "*"    "*"   "*"    " "   " " 
## 8  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   "*"    "*"   " " 
##          CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 ) " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 ) "*"    " "     "*"       "*"     " "     " "    " "
regfit.full <- regsubsets(Salary ~ ., data = Hitters,
    nvmax = 19)
reg.summary <- summary(regfit.full)

names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
reg.summary$rsq
##  [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146 0.5141227
##  [8] 0.5285569 0.5346124 0.5404950 0.5426153 0.5436302 0.5444570 0.5452164
## [15] 0.5454692 0.5457656 0.5459518 0.5460945 0.5461159
par(mfrow = c(2, 2))
plot(reg.summary$rss, xlab = "Number of Variables",
    ylab = "RSS", type = "l")
plot(reg.summary$adjr2, xlab = "Number of Variables",
    ylab = "Adjusted RSq", type = "l")
plot(reg.summary$cp, type = "l")
plot(reg.summary$bic, type = "l")

par(mfrow=c(2,2))
which.max(reg.summary$adjr2)
## [1] 11
plot(reg.summary$adjr2, xlab = "Number of Variables",
    ylab = "Adjusted RSq", type = "l")
points(11, reg.summary$adjr2[11], col = "red", cex = 2, 
    pch = 20)

plot(reg.summary$cp, xlab = "Number of Variables",
    ylab = "Cp", type = "l")
which.min(reg.summary$cp)
## [1] 10
points(10, reg.summary$cp[10], col = "red", cex = 2,
    pch = 20)
which.min(reg.summary$bic)
## [1] 6
plot(reg.summary$bic, xlab = "Number of Variables",
    ylab = "BIC", type = "l")
points(6, reg.summary$bic[6], col = "red", cex = 2,
    pch = 20)


plot(regfit.full, scale = "r2")

plot(regfit.full, scale = "adjr2")
plot(regfit.full, scale = "Cp")
plot(regfit.full, scale = "bic")

coef(regfit.full, 6)
##  (Intercept)        AtBat         Hits        Walks         CRBI    DivisionW 
##   91.5117981   -1.8685892    7.6043976    3.6976468    0.6430169 -122.9515338 
##      PutOuts 
##    0.2643076

Forward and Backward Stepwise Selection

regfit.fwd <- regsubsets(Salary ~ ., data = Hitters,
    nvmax = 19, method = "forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
##           AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1  ( 1 )  " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 2  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 3  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 4  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 5  ( 1 )  "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 6  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 7  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 8  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   "*" 
## 9  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 10  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 11  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 12  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 13  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 14  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 15  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    "*"   " "    "*"   "*" 
## 16  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
## 17  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
## 18  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   " "    "*"   "*" 
## 19  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   "*"    "*"   "*" 
##           CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 9  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 10  ( 1 ) "*"    " "     "*"       "*"     "*"     " "    " "       
## 11  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
## 12  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
## 13  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 14  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 15  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 16  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 17  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 18  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 19  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"
regfit.bwd <- regsubsets(Salary ~ ., data = Hitters,
    nvmax = 19, method = "backward")
summary(regfit.bwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "backward")
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: backward
##           AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1  ( 1 )  " "   " "  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
## 2  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
## 3  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
## 4  ( 1 )  "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
## 5  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   " " 
## 6  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   " " 
## 7  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   " " 
## 8  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   "*" 
## 9  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 10  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 11  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 12  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 13  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 14  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 15  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    "*"   " "    "*"   "*" 
## 16  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
## 17  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
## 18  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   " "    "*"   "*" 
## 19  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   "*"    "*"   "*" 
##           CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
## 5  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
## 6  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 9  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 10  ( 1 ) "*"    " "     "*"       "*"     "*"     " "    " "       
## 11  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
## 12  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
## 13  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 14  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 15  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 16  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 17  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 18  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 19  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"
###
coef(regfit.full, 7)
##  (Intercept)         Hits        Walks       CAtBat        CHits       CHmRun 
##   79.4509472    1.2833513    3.2274264   -0.3752350    1.4957073    1.4420538 
##    DivisionW      PutOuts 
## -129.9866432    0.2366813
coef(regfit.fwd, 7)
##  (Intercept)        AtBat         Hits        Walks         CRBI       CWalks 
##  109.7873062   -1.9588851    7.4498772    4.9131401    0.8537622   -0.3053070 
##    DivisionW      PutOuts 
## -127.1223928    0.2533404
coef(regfit.bwd, 7)
##  (Intercept)        AtBat         Hits        Walks        CRuns       CWalks 
##  105.6487488   -1.9762838    6.7574914    6.0558691    1.1293095   -0.7163346 
##    DivisionW      PutOuts 
## -116.1692169    0.3028847

Choosing Among Models Using the Validation-Set Approach and Cross-Validation

set.seed(1)
train <- sample(c(TRUE, FALSE), nrow(Hitters),
    replace = TRUE)
test <- (!train)

# Now, we apply regsubsets() to the training set in order to perform best subset selection.

regfit.best <- regsubsets(Salary ~ .,
    data = Hitters[train, ], nvmax = 19)

## model.matrix() creates a design (model) matrix, e.g. by expanding factors
## to a set of dummy variables (depending on the contrasts) and expanding
## interactions similarly.
test.mat <- model.matrix(Salary ~ ., data = Hitters[test, ])

val.errors <- rep(NA, 19)
for (i in 1:19) {
 coefi <- coef(regfit.best, id = i)
 pred <- test.mat[, names(coefi)] %*% coefi
 val.errors[i] <- mean((Hitters$Salary[test] - pred)^2)
}

val.errors
##  [1] 164377.3 144405.5 152175.7 145198.4 137902.1 139175.7 126849.0 136191.4
##  [9] 132889.6 135434.9 136963.3 140694.9 140690.9 141951.2 141508.2 142164.4
## [17] 141767.4 142339.6 142238.2
which.min(val.errors)
## [1] 7
coef(regfit.best, 7)
##  (Intercept)        AtBat         Hits        Walks        CRuns       CWalks 
##   67.1085369   -2.1462987    7.0149547    8.0716640    1.2425113   -0.8337844 
##    DivisionW      PutOuts 
## -118.4364998    0.2526925
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  mat[, xvars] %*% coefi
 }
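### Sanity check (a sketch): the new predict method should reproduce the
### validation errors computed manually with test.mat above. regfit.best
### here is still the training-set fit used for val.errors.
val.errors2 <- sapply(1:19, function(i) {
  pred <- predict(regfit.best, Hitters[test, ], id = i)
  mean((Hitters$Salary[test] - pred)^2)
})
all.equal(val.errors, val.errors2)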
 
### library(leaps)
 
regfit.best <- regsubsets(Salary ~ ., data = Hitters,
    nvmax = 19)
coef(regfit.best, 7)
##  (Intercept)         Hits        Walks       CAtBat        CHits       CHmRun 
##   79.4509472    1.2833513    3.2274264   -0.3752350    1.4957073    1.4420538 
##    DivisionW      PutOuts 
## -129.9866432    0.2366813
k <- 10
n <- nrow(Hitters)
set.seed(1)
folds <- sample(rep(1:k, length = n))
cv.errors <- matrix(NA, k, 19,
    dimnames = list(NULL, paste(1:19)))
###
for (j in 1:k) {
  best.fit <- regsubsets(Salary ~ .,
       data = Hitters[folds != j, ],
       nvmax = 19)
  for (i in 1:19) {
    pred <- predict(best.fit, Hitters[folds == j, ], id = i)
    cv.errors[j, i] <-
         mean((Hitters$Salary[folds == j] - pred)^2)
   }
 }
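### Note: predict() inside the loop dispatches to the predict.regsubsets()
### method defined above.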
###
mean.cv.errors <- apply(cv.errors, 2, mean)
mean.cv.errors
##        1        2        3        4        5        6        7        8 
## 143439.8 126817.0 134214.2 131782.9 130765.6 120382.9 121443.1 114363.7 
##        9       10       11       12       13       14       15       16 
## 115163.1 109366.0 112738.5 113616.5 115557.6 115853.3 115630.6 116050.0 
##       17       18       19 
## 116117.0 116419.3 116299.1
par(mfrow = c(1, 1))
plot(mean.cv.errors, type = "b")

reg.best <- regsubsets(Salary ~ ., data = Hitters,
    nvmax = 19)
coef(reg.best, 10)
##  (Intercept)        AtBat         Hits        Walks       CAtBat        CRuns 
##  162.5354420   -2.1686501    6.9180175    5.7732246   -0.1300798    1.4082490 
##         CRBI       CWalks    DivisionW      PutOuts      Assists 
##    0.7743122   -0.8308264 -112.3800575    0.2973726    0.2831680

Ridge Regression and the Lasso

x <- model.matrix(Salary ~ ., Hitters)[, -1]
y <- Hitters$Salary

### Ridge Regression


library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.2
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
grid <- 10^seq(10, -2, length = 100)
ridge.mod <- glmnet(x, y, alpha = 0, lambda = grid)

dim(coef(ridge.mod))
## [1]  20 100
###
ridge.mod$lambda[50]
## [1] 11497.57
coef(ridge.mod)[, 50]
##   (Intercept)         AtBat          Hits         HmRun          Runs 
## 407.356050200   0.036957182   0.138180344   0.524629976   0.230701523 
##           RBI         Walks         Years        CAtBat         CHits 
##   0.239841459   0.289618741   1.107702929   0.003131815   0.011653637 
##        CHmRun         CRuns          CRBI        CWalks       LeagueN 
##   0.087545670   0.023379882   0.024138320   0.025015421   0.085028114 
##     DivisionW       PutOuts       Assists        Errors    NewLeagueN 
##  -6.215440973   0.016482577   0.002612988  -0.020502690   0.301433531
sqrt(sum(coef(ridge.mod)[-1, 50]^2))
## [1] 6.360612
###
ridge.mod$lambda[60]
## [1] 705.4802
coef(ridge.mod)[, 60]
##  (Intercept)        AtBat         Hits        HmRun         Runs          RBI 
##  54.32519950   0.11211115   0.65622409   1.17980910   0.93769713   0.84718546 
##        Walks        Years       CAtBat        CHits       CHmRun        CRuns 
##   1.31987948   2.59640425   0.01083413   0.04674557   0.33777318   0.09355528 
##         CRBI       CWalks      LeagueN    DivisionW      PutOuts      Assists 
##   0.09780402   0.07189612  13.68370191 -54.65877750   0.11852289   0.01606037 
##       Errors   NewLeagueN 
##  -0.70358655   8.61181213
sqrt(sum(coef(ridge.mod)[-1, 60]^2))
## [1] 57.11001
###
predict(ridge.mod, s = 50, type = "coefficients")[1:20, ]
##   (Intercept)         AtBat          Hits         HmRun          Runs 
##  4.876610e+01 -3.580999e-01  1.969359e+00 -1.278248e+00  1.145892e+00 
##           RBI         Walks         Years        CAtBat         CHits 
##  8.038292e-01  2.716186e+00 -6.218319e+00  5.447837e-03  1.064895e-01 
##        CHmRun         CRuns          CRBI        CWalks       LeagueN 
##  6.244860e-01  2.214985e-01  2.186914e-01 -1.500245e-01  4.592589e+01 
##     DivisionW       PutOuts       Assists        Errors    NewLeagueN 
## -1.182011e+02  2.502322e-01  1.215665e-01 -3.278600e+00 -9.496680e+00
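### For s values not on the fitted lambda grid, predict() linearly
### interpolates between neighboring lambdas unless exact = TRUE is
### supplied (as is done below for s = 0).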
set.seed(1)
train <- sample(1:nrow(x), nrow(x) / 2)
test <- (-train)
y.test <- y[test]

ridge.mod <- glmnet(x[train, ], y[train], alpha = 0,lambda = grid, thresh = 1e-12)

ridge.pred <- predict(ridge.mod, s = 4, newx = x[test, ])
mean((ridge.pred - y.test)^2)
## [1] 142199.2
mean((mean(y[train]) - y.test)^2)
## [1] 224669.9
###
ridge.pred <- predict(ridge.mod, s = 1e10, newx = x[test, ])
mean((ridge.pred - y.test)^2)
## [1] 224669.8
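### A huge lambda shrinks all slopes to essentially zero, so the test MSE
### matches the intercept-only benchmark computed just above.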
###
ridge.pred <- predict(ridge.mod, s = 0, newx = x[test, ],exact = T, x = x[train, ], y = y[train])

mean((ridge.pred - y.test)^2)
## [1] 168588.6
lm(y ~ x, subset = train)
## 
## Call:
## lm(formula = y ~ x, subset = train)
## 
## Coefficients:
## (Intercept)       xAtBat        xHits       xHmRun        xRuns         xRBI  
##    274.0145      -0.3521      -1.6377       5.8145       1.5424       1.1243  
##      xWalks       xYears      xCAtBat       xCHits      xCHmRun       xCRuns  
##      3.7287     -16.3773      -0.6412       3.1632       3.4008      -0.9739  
##       xCRBI      xCWalks     xLeagueN   xDivisionW     xPutOuts     xAssists  
##     -0.6005       0.3379     119.1486    -144.0831       0.1976       0.6804  
##     xErrors  xNewLeagueN  
##     -4.7128     -71.0951
predict(ridge.mod, s = 0, exact = T, type = "coefficients",
    x = x[train, ], y = y[train])[1:20, ]
##  (Intercept)        AtBat         Hits        HmRun         Runs          RBI 
##  274.0200994   -0.3521900   -1.6371383    5.8146692    1.5423361    1.1241837 
##        Walks        Years       CAtBat        CHits       CHmRun        CRuns 
##    3.7288406  -16.3795195   -0.6411235    3.1629444    3.4005281   -0.9739405 
##         CRBI       CWalks      LeagueN    DivisionW      PutOuts      Assists 
##   -0.6003976    0.3378422  119.1434637 -144.0853061    0.1976300    0.6804200 
##       Errors   NewLeagueN 
##   -4.7127879  -71.0898914
###
set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 0)
plot(cv.out)

bestlam <- cv.out$lambda.min
bestlam
## [1] 326.0828
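### cv.glmnet also reports lambda.1se, the largest lambda whose CV error is
### within one standard error of the minimum -- a common, more conservative
### choice than lambda.min.
cv.out$lambda.1se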
###
ridge.pred <- predict(ridge.mod, s = bestlam,
    newx = x[test, ])
mean((ridge.pred - y.test)^2)
## [1] 139856.6
out <- glmnet(x, y, alpha = 0)
predict(out, type = "coefficients", s = bestlam)[1:20, ]
##  (Intercept)        AtBat         Hits        HmRun         Runs          RBI 
##  15.44383120   0.07715547   0.85911582   0.60103106   1.06369007   0.87936105 
##        Walks        Years       CAtBat        CHits       CHmRun        CRuns 
##   1.62444617   1.35254778   0.01134999   0.05746654   0.40680157   0.11456224 
##         CRBI       CWalks      LeagueN    DivisionW      PutOuts      Assists 
##   0.12116504   0.05299202  22.09143197 -79.04032656   0.16619903   0.02941950 
##       Errors   NewLeagueN 
##  -1.36092945   9.12487765

The Lasso

lasso.mod <- glmnet(x[train, ], y[train], alpha = 1,
    lambda = grid)
plot(lasso.mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1)
plot(cv.out)

bestlam <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod, s = bestlam,
    newx = x[test, ])
mean((lasso.pred - y.test)^2)
## [1] 143673.6
###
out <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out, type = "coefficients",
    s = bestlam)[1:20, ]
lasso.coef
##   (Intercept)         AtBat          Hits         HmRun          Runs 
##    1.27479059   -0.05497143    2.18034583    0.00000000    0.00000000 
##           RBI         Walks         Years        CAtBat         CHits 
##    0.00000000    2.29192406   -0.33806109    0.00000000    0.00000000 
##        CHmRun         CRuns          CRBI        CWalks       LeagueN 
##    0.02825013    0.21628385    0.41712537    0.00000000   20.28615023 
##     DivisionW       PutOuts       Assists        Errors    NewLeagueN 
## -116.16755870    0.23752385    0.00000000   -0.85629148    0.00000000
lasso.coef[lasso.coef != 0]
##   (Intercept)         AtBat          Hits         Walks         Years 
##    1.27479059   -0.05497143    2.18034583    2.29192406   -0.33806109 
##        CHmRun         CRuns          CRBI       LeagueN     DivisionW 
##    0.02825013    0.21628385    0.41712537   20.28615023 -116.16755870 
##       PutOuts        Errors 
##    0.23752385   -0.85629148
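### A quick tally (sketch) of how many variables the lasso retains at
### bestlam: counting the nonzero slopes printed above gives 11.
sum(lasso.coef[-1] != 0)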

PCR and PLS Regression

Principal Components Regression

library(pls)
## Warning: package 'pls' was built under R version 4.4.3
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
set.seed(2)
pcr.fit <- pcr(Salary ~ ., data = Hitters, scale = TRUE,
    validation = "CV")

summary(pcr.fit)
## Data:    X dimension: 263 19 
##  Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 19
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV             452    351.9    353.2    355.0    352.8    348.4    343.6
## adjCV          452    351.6    352.7    354.4    352.1    347.6    342.7
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       345.5    347.7    349.6     351.4     352.1     353.5     358.2
## adjCV    344.7    346.7    348.5     350.1     350.7     352.0     356.5
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps
## CV        349.7     349.4     339.9     341.6     339.2     339.6
## adjCV     348.0     347.7     338.2     339.7     337.2     337.6
## 
## TRAINING: % variance explained
##         1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X         38.31    60.16    70.84    79.03    84.29    88.63    92.26    94.96
## Salary    40.63    41.58    42.17    43.22    44.90    46.48    46.69    46.75
##         9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X         96.28     97.26     97.98     98.65     99.15     99.47     99.75
## Salary    46.86     47.76     47.82     47.85     48.10     50.40     50.55
##         16 comps  17 comps  18 comps  19 comps
## X          99.89     99.97     99.99    100.00
## Salary     53.01     53.85     54.61     54.61
validationplot(pcr.fit, val.type = "MSEP")
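### A sketch for reading the CV-optimal number of components off the fit
### object rather than the plot; MSEP() is from the pls package, and the
### first position of the returned array is the intercept-only model.
cv.msep <- MSEP(pcr.fit, estimate = "CV")$val[1, 1, ]
which.min(cv.msep) - 1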

set.seed(1)
pcr.fit <- pcr(Salary ~ ., data = Hitters, subset = train,
    scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")

pcr.pred <- predict(pcr.fit, x[test, ], ncomp = 5)
mean((pcr.pred - y.test)^2)
## [1] 142811.8
###
pcr.fit <- pcr(y ~ x, scale = TRUE, ncomp = 5)
summary(pcr.fit)
## Data:    X dimension: 263 19 
##  Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 5
## TRAINING: % variance explained
##    1 comps  2 comps  3 comps  4 comps  5 comps
## X    38.31    60.16    70.84    79.03    84.29
## y    40.63    41.58    42.17    43.22    44.90

Partial Least Squares

set.seed(1)
pls.fit <- plsr(Salary ~ ., data = Hitters, subset = train, scale = TRUE, validation = "CV")
summary(pls.fit)
## Data:    X dimension: 131 19 
##  Y dimension: 131 1
## Fit method: kernelpls
## Number of components considered: 19
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV           428.3    325.5    329.9    328.8    339.0    338.9    340.1
## adjCV        428.3    325.0    328.2    327.2    336.6    336.1    336.6
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV       339.0    347.1    346.4     343.4     341.5     345.4     356.4
## adjCV    336.2    343.4    342.8     340.2     338.3     341.8     351.1
##        14 comps  15 comps  16 comps  17 comps  18 comps  19 comps
## CV        348.4     349.1     350.0     344.2     344.5     345.0
## adjCV     344.2     345.0     345.9     340.4     340.6     341.1
## 
## TRAINING: % variance explained
##         1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X         39.13    48.80    60.09    75.07    78.58    81.12    88.21    90.71
## Salary    46.36    50.72    52.23    53.03    54.07    54.77    55.05    55.66
##         9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X         93.17     96.05     97.08     97.61     97.97     98.70     99.12
## Salary    55.95     56.12     56.47     56.68     57.37     57.76     58.08
##         16 comps  17 comps  18 comps  19 comps
## X          99.61     99.70     99.95    100.00
## Salary     58.17     58.49     58.56     58.62
validationplot(pls.fit, val.type = "MSEP")

pls.pred <- predict(pls.fit, x[test, ], ncomp = 1)
mean((pls.pred - y.test)^2)
## [1] 151995.3
pls.fit <- plsr(Salary ~ ., data = Hitters, scale = TRUE,
    ncomp = 1)
summary(pls.fit)
## Data:    X dimension: 263 19 
##  Y dimension: 263 1
## Fit method: kernelpls
## Number of components considered: 1
## TRAINING: % variance explained
##         1 comps
## X         38.08
## Salary    43.05

Chapter 07 (Non-linear Modeling)

Polynomial Regression and Step Functions

library(ISLR2)
attach(Wage)
## The following object is masked from Auto (pos = 8):
## 
##     year
## The following object is masked from Boston:
## 
##     age
## The following object is masked from Auto (pos = 31):
## 
##     year

fit <- lm(wage ~ poly(age, 4), data = Wage)
coef(summary(fit))
##                 Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287409 153.283015 0.000000e+00
## poly(age, 4)1  447.06785 39.9147851  11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3  125.52169 39.9147851   3.144742 1.678622e-03
## poly(age, 4)4  -77.91118 39.9147851  -1.951938 5.103865e-02
fit2 <- lm(wage ~ poly(age, 4, raw = T), data = Wage)
coef(summary(fit2))
##                             Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)            -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
## poly(age, 4, raw = T)1  2.124552e+01 5.886748e+00  3.609042 0.0003123618
## poly(age, 4, raw = T)2 -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
## poly(age, 4, raw = T)3  6.810688e-03 3.065931e-03  2.221409 0.0263977518
## poly(age, 4, raw = T)4 -3.203830e-05 1.641359e-05 -1.951938 0.0510386498
### alternative way
fit2a <- lm(wage ~ age + I(age^2) + I(age^3) + I(age^4),
    data = Wage)
coef(fit2a)
##   (Intercept)           age      I(age^2)      I(age^3)      I(age^4) 
## -1.841542e+02  2.124552e+01 -5.638593e-01  6.810688e-03 -3.203830e-05
### alternative way 2
fit2b <- lm(wage ~ cbind(age, age^2, age^3, age^4),
    data = Wage)
coef(fit2b)
##                        (Intercept) cbind(age, age^2, age^3, age^4)age 
##                      -1.841542e+02                       2.124552e+01 
##    cbind(age, age^2, age^3, age^4)    cbind(age, age^2, age^3, age^4) 
##                      -5.638593e-01                       6.810688e-03 
##    cbind(age, age^2, age^3, age^4) 
##                      -3.203830e-05
###
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims[2])
preds <- predict(fit, newdata = list(age = age.grid),
    se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit,
    preds$fit - 2 * preds$se.fit)
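### se.bands is fit +/- 2 standard errors, i.e. approximate 95% pointwise
### confidence bands for the fitted curve.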


###
par(mfrow = c(1, 2), mar = c(4.5, 4.5, 1, 1),
    oma = c(0, 0, 4, 0))
plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey")
title("Degree-4 Polynomial", outer = T)
lines(age.grid, preds$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)


###
preds2 <- predict(fit2, newdata = list(age = age.grid),
    se = TRUE)
max(abs(preds$fit - preds2$fit))
## [1] 6.842527e-11

fit.1 <- lm(wage ~ age, data = Wage)
fit.2 <- lm(wage ~ poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ poly(age, 3), data = Wage)
fit.4 <- lm(wage ~ poly(age, 4), data = Wage)
fit.5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef(summary(fit.5))
##                 Estimate Std. Error     t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287647 153.2780243 0.000000e+00
## poly(age, 5)1  447.06785 39.9160847  11.2001930 1.491111e-28
## poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
## poly(age, 5)3  125.52169 39.9160847   3.1446392 1.679213e-03
## poly(age, 5)4  -77.91118 39.9160847  -1.9518743 5.104623e-02
## poly(age, 5)5  -35.81289 39.9160847  -0.8972045 3.696820e-01
### The square of each t-statistic in fit.5 equals the corresponding ANOVA
### F-statistic; for the quadratic term:
(-11.983)^2
## [1] 143.5923
fit.1 <- lm(wage ~ education + age, data = Wage)
fit.2 <- lm(wage ~ education + poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ education + poly(age, 3), data = Wage)
anova(fit.1, fit.2, fit.3)
## Analysis of Variance Table
## 
## Model 1: wage ~ education + age
## Model 2: wage ~ education + poly(age, 2)
## Model 3: wage ~ education + poly(age, 3)
##   Res.Df     RSS Df Sum of Sq        F Pr(>F)    
## 1   2994 3867992                                 
## 2   2993 3725395  1    142597 114.6969 <2e-16 ***
## 3   2992 3719809  1      5587   4.4936 0.0341 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit <- glm(I(wage > 250) ~ poly(age, 4), data = Wage,
    family = binomial)

preds <- predict(fit, newdata = list(age = age.grid), se = T)
pfit <- exp(preds$fit) / (1 + exp(preds$fit))

se.bands.logit <- cbind(preds$fit + 2 * preds$se.fit,
    preds$fit - 2 * preds$se.fit)
se.bands <- exp(se.bands.logit) / (1 + exp(se.bands.logit))


###
preds <- predict(fit, newdata = list(age = age.grid),
    type = "response", se = T)



plot(age, I(wage > 250), xlim = agelims, type = "n",
    ylim = c(0, .2))
points(jitter(age), I((wage > 250) / 5), cex = .5, pch = "|", col = "darkgrey")
lines(age.grid, pfit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)

###
table(cut(age, 4))
## 
## (17.9,33.5]   (33.5,49]   (49,64.5] (64.5,80.1] 
##         750        1399         779          72
fit <- lm(wage ~ cut(age, 4), data = Wage)
coef(summary(fit))
##                         Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)            94.158392   1.476069 63.789970 0.000000e+00
## cut(age, 4)(33.5,49]   24.053491   1.829431 13.148074 1.982315e-38
## cut(age, 4)(49,64.5]   23.664559   2.067958 11.443444 1.040750e-29
## cut(age, 4)(64.5,80.1]  7.640592   4.987424  1.531972 1.256350e-01
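### cut() can also take explicit breakpoints instead of a number of
### equal-width bins (a sketch with arbitrary cutpoints covering the
### observed age range):
fit.cut <- lm(wage ~ cut(age, breaks = c(17, 30, 45, 60, 81)), data = Wage)
coef(summary(fit.cut))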

Splines

library(splines)
fit <- lm(wage ~ bs(age, knots = c(25, 40, 60)), data = Wage)
pred <- predict(fit, newdata = list(age = age.grid), se = T)
plot(age, wage, col = "gray")
lines(age.grid, pred$fit, lwd = 2)
lines(age.grid, pred$fit + 2 * pred$se, lty = "dashed")
lines(age.grid, pred$fit - 2 * pred$se, lty = "dashed")

dim(bs(age, knots = c(25, 40, 60)))
## [1] 3000    6
dim(bs(age, df = 6))
## [1] 3000    6
attr(bs(age, df = 6), "knots")
## [1] 33.75 42.00 51.00
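### With df = 6 and the default cubic degree, bs() places df - degree = 3
### interior knots at the quantiles of age shown above (the 25th, 50th,
### and 75th percentiles).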
###
fit2 <- lm(wage ~ ns(age, df = 4), data = Wage)
pred2 <- predict(fit2, newdata = list(age = age.grid),
     se = T)
plot(age, wage, col = "gray")
lines(age.grid, pred2$fit, col = "red", lwd = 2)

###
plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey")
title("Smoothing Spline")
fit <- smooth.spline(age, wage, df = 16)
fit2 <- smooth.spline(age, wage, cv = TRUE)
## Warning in smooth.spline(age, wage, cv = TRUE): cross-validation with
## non-unique 'x' values seems doubtful
fit$df
## [1] 16.00237
fit2$df
## [1] 6.794596
lines(fit, col = "red", lwd = 2)
lines(fit2, col = "blue", lwd = 2)
legend("topright", legend = c("16 DF", "6.8 DF"),
    col = c("red", "blue"), lty = 1, lwd = 2, cex = .8)

###
plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey")
title("Local Regression")
fit <- loess(wage ~ age, span = .2, data = Wage)
fit2 <- loess(wage ~ age, span = .5, data = Wage)
lines(age.grid, predict(fit, data.frame(age = age.grid)),
    col = "red", lwd = 2)
lines(age.grid, predict(fit2, data.frame(age = age.grid)),
    col = "blue", lwd = 2)
legend("topright", legend = c("Span = 0.2", "Span = 0.5"),
    col = c("red", "blue"), lty = 1, lwd = 2, cex = .8)

GAMs

gam1 <- lm(wage ~ ns(year, 4) + ns(age, 5) + education,
    data = Wage)
###
library(gam)
## Warning: package 'gam' was built under R version 4.4.3
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.4.2
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.22-6
gam.m3 <- gam(wage ~ s(year, 4) + s(age, 5) + education,
    data = Wage)

par(mfrow = c(1, 3))
plot(gam.m3, se = TRUE, col = "blue")

plot.Gam(gam1, se = TRUE, col = "red")

gam.m1 <- gam(wage ~ s(age, 5) + education, data = Wage)
gam.m2 <- gam(wage ~ year + s(age, 5) + education,
    data = Wage)
anova(gam.m1, gam.m2, gam.m3, test = "F")
## Analysis of Deviance Table
## 
## Model 1: wage ~ s(age, 5) + education
## Model 2: wage ~ year + s(age, 5) + education
## Model 3: wage ~ s(year, 4) + s(age, 5) + education
##   Resid. Df Resid. Dev Df Deviance       F    Pr(>F)    
## 1      2990    3711731                                  
## 2      2989    3693842  1  17889.2 14.4771 0.0001447 ***
## 3      2986    3689770  3   4071.1  1.0982 0.3485661    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gam.m3)
## 
## Call: gam(formula = wage ~ s(year, 4) + s(age, 5) + education, data = Wage)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -119.43  -19.70   -3.33   14.17  213.48 
## 
## (Dispersion Parameter for gaussian family taken to be 1235.69)
## 
##     Null Deviance: 5222086 on 2999 degrees of freedom
## Residual Deviance: 3689770 on 2986 degrees of freedom
## AIC: 29887.75 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##              Df  Sum Sq Mean Sq F value    Pr(>F)    
## s(year, 4)    1   27162   27162  21.981 2.877e-06 ***
## s(age, 5)     1  195338  195338 158.081 < 2.2e-16 ***
## education     4 1069726  267432 216.423 < 2.2e-16 ***
## Residuals  2986 3689770    1236                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##             Npar Df Npar F  Pr(F)    
## (Intercept)                          
## s(year, 4)        3  1.086 0.3537    
## s(age, 5)         4 32.380 <2e-16 ***
## education                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
preds <- predict(gam.m2, newdata = Wage)
###

gam.lo <- gam(
    wage ~ s(year, df = 4) + lo(age, span = 0.7) + education,
    data = Wage
  )
plot(gam.lo, se = TRUE, col = "green")

#install.packages("interp")
library(interp)
## Warning: package 'interp' was built under R version 4.4.3
## 
## Attaching package: 'interp'
## The following object is masked from 'package:MASS':
## 
##     area
library(gam)
gam.lo.i <- gam(wage ~ lo(year, age, span = 0.5) + education,
    data = Wage)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## liv too small.  (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : lv
## too small.  (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, :
## liv too small.  (Discovered by lowesd)
## Warning in lo.wam(x, z, wz, fit$smooth, which, fit$smooth.frame, bf.maxit, : lv
## too small.  (Discovered by lowesd)
library(akima)
## Warning: package 'akima' was built under R version 4.4.3
## 
## Attaching package: 'akima'
## The following objects are masked from 'package:interp':
## 
##     aspline, bicubic, bicubic.grid, bilinear, bilinear.grid,
##     franke.data, franke.fn, interp, interp2xyz, interpp
plot(gam.lo.i)

###
gam.lr <- gam(
    I(wage > 250) ~ year + s(age, df = 5) + education,
    family = binomial, data = Wage
  )
par(mfrow = c(1, 3))
plot(gam.lr, se = T, col = "green")

table(education, I(wage > 250))
##                     
## education            FALSE TRUE
##   1. < HS Grad         268    0
##   2. HS Grad           966    5
##   3. Some College      643    7
##   4. College Grad      663   22
##   5. Advanced Degree   381   45
gam.lr.s <- gam(
    I(wage > 250) ~ year + s(age, df = 5) + education,
    family = binomial, data = Wage,
    subset = (education != "1. < HS Grad")
  )
plot(gam.lr.s, se = T, col = "green")

Chapter 08 (Fitting Classification Trees)

library(tree)
## Warning: package 'tree' was built under R version 4.4.3
library(ISLR2)
attach(Carseats)
## The following objects are masked from Carseats (pos = 32):
## 
##     Advertising, Age, CompPrice, Education, Income, Population, Price,
##     Sales, ShelveLoc, Urban, US
High <- factor(ifelse(Sales <= 8, "No", "Yes"))

Carseats <- data.frame(Carseats, High)

tree.carseats <- tree(High ~ . - Sales, Carseats)
tree.carseats
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 400 541.500 No ( 0.59000 0.41000 )  
##     2) ShelveLoc: Bad,Medium 315 390.600 No ( 0.68889 0.31111 )  
##       4) Price < 92.5 46  56.530 Yes ( 0.30435 0.69565 )  
##         8) Income < 57 10  12.220 No ( 0.70000 0.30000 )  
##          16) CompPrice < 110.5 5   0.000 No ( 1.00000 0.00000 ) *
##          17) CompPrice > 110.5 5   6.730 Yes ( 0.40000 0.60000 ) *
##         9) Income > 57 36  35.470 Yes ( 0.19444 0.80556 )  
##          18) Population < 207.5 16  21.170 Yes ( 0.37500 0.62500 ) *
##          19) Population > 207.5 20   7.941 Yes ( 0.05000 0.95000 ) *
##       5) Price > 92.5 269 299.800 No ( 0.75465 0.24535 )  
##        10) Advertising < 13.5 224 213.200 No ( 0.81696 0.18304 )  
##          20) CompPrice < 124.5 96  44.890 No ( 0.93750 0.06250 )  
##            40) Price < 106.5 38  33.150 No ( 0.84211 0.15789 )  
##              80) Population < 177 12  16.300 No ( 0.58333 0.41667 )  
##               160) Income < 60.5 6   0.000 No ( 1.00000 0.00000 ) *
##               161) Income > 60.5 6   5.407 Yes ( 0.16667 0.83333 ) *
##              81) Population > 177 26   8.477 No ( 0.96154 0.03846 ) *
##            41) Price > 106.5 58   0.000 No ( 1.00000 0.00000 ) *
##          21) CompPrice > 124.5 128 150.200 No ( 0.72656 0.27344 )  
##            42) Price < 122.5 51  70.680 Yes ( 0.49020 0.50980 )  
##              84) ShelveLoc: Bad 11   6.702 No ( 0.90909 0.09091 ) *
##              85) ShelveLoc: Medium 40  52.930 Yes ( 0.37500 0.62500 )  
##               170) Price < 109.5 16   7.481 Yes ( 0.06250 0.93750 ) *
##               171) Price > 109.5 24  32.600 No ( 0.58333 0.41667 )  
##                 342) Age < 49.5 13  16.050 Yes ( 0.30769 0.69231 ) *
##                 343) Age > 49.5 11   6.702 No ( 0.90909 0.09091 ) *
##            43) Price > 122.5 77  55.540 No ( 0.88312 0.11688 )  
##              86) CompPrice < 147.5 58  17.400 No ( 0.96552 0.03448 ) *
##              87) CompPrice > 147.5 19  25.010 No ( 0.63158 0.36842 )  
##               174) Price < 147 12  16.300 Yes ( 0.41667 0.58333 )  
##                 348) CompPrice < 152.5 7   5.742 Yes ( 0.14286 0.85714 ) *
##                 349) CompPrice > 152.5 5   5.004 No ( 0.80000 0.20000 ) *
##               175) Price > 147 7   0.000 No ( 1.00000 0.00000 ) *
##        11) Advertising > 13.5 45  61.830 Yes ( 0.44444 0.55556 )  
##          22) Age < 54.5 25  25.020 Yes ( 0.20000 0.80000 )  
##            44) CompPrice < 130.5 14  18.250 Yes ( 0.35714 0.64286 )  
##              88) Income < 100 9  12.370 No ( 0.55556 0.44444 ) *
##              89) Income > 100 5   0.000 Yes ( 0.00000 1.00000 ) *
##            45) CompPrice > 130.5 11   0.000 Yes ( 0.00000 1.00000 ) *
##          23) Age > 54.5 20  22.490 No ( 0.75000 0.25000 )  
##            46) CompPrice < 122.5 10   0.000 No ( 1.00000 0.00000 ) *
##            47) CompPrice > 122.5 10  13.860 No ( 0.50000 0.50000 )  
##              94) Price < 125 5   0.000 Yes ( 0.00000 1.00000 ) *
##              95) Price > 125 5   0.000 No ( 1.00000 0.00000 ) *
##     3) ShelveLoc: Good 85  90.330 Yes ( 0.22353 0.77647 )  
##       6) Price < 135 68  49.260 Yes ( 0.11765 0.88235 )  
##        12) US: No 17  22.070 Yes ( 0.35294 0.64706 )  
##          24) Price < 109 8   0.000 Yes ( 0.00000 1.00000 ) *
##          25) Price > 109 9  11.460 No ( 0.66667 0.33333 ) *
##        13) US: Yes 51  16.880 Yes ( 0.03922 0.96078 ) *
##       7) Price > 135 17  22.070 No ( 0.64706 0.35294 )  
##        14) Income < 46 6   0.000 No ( 1.00000 0.00000 ) *
##        15) Income > 46 11  15.160 Yes ( 0.45455 0.54545 ) *
summary(tree.carseats)
## 
## Classification tree:
## tree(formula = High ~ . - Sales, data = Carseats)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Income"      "CompPrice"   "Population" 
## [6] "Advertising" "Age"         "US"         
## Number of terminal nodes:  27 
## Residual mean deviance:  0.4575 = 170.7 / 373 
## Misclassification error rate: 0.09 = 36 / 400
plot(tree.carseats)
text(tree.carseats, pretty = 0)

set.seed(2)
train <- sample(1:nrow(Carseats), 200)

Carseats.test <- Carseats[-train, ]
High.test <- High[-train]

tree.carseats <- tree(High ~ . - Sales, Carseats,
    subset = train)
tree.pred <- predict(tree.carseats, Carseats.test,
    type = "class")
table(tree.pred, High.test)
##          High.test
## tree.pred  No Yes
##       No  104  33
##       Yes  13  50
(104 + 50) / 200
## [1] 0.77
###
set.seed(7)
cv.carseats <- cv.tree(tree.carseats, FUN = prune.misclass)
names(cv.carseats)
## [1] "size"   "dev"    "k"      "method"
cv.carseats
## $size
## [1] 21 19 14  9  8  5  3  2  1
## 
## $dev
## [1] 75 75 75 74 82 83 83 85 82
## 
## $k
## [1] -Inf  0.0  1.0  1.4  2.0  3.0  4.0  9.0 18.0
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
###
par(mfrow = c(1, 2))
plot(cv.carseats$size, cv.carseats$dev, type = "b")
plot(cv.carseats$k, cv.carseats$dev, type = "b")

### From the plot (and cv.carseats$dev), the CV error is minimized at size 9.
prune.carseats <- prune.misclass(tree.carseats, best = 9)
plot(prune.carseats)
text(prune.carseats, pretty = 0)
###
tree.pred <- predict(prune.carseats, Carseats.test,
    type = "class")
table(tree.pred, High.test)
##          High.test
## tree.pred No Yes
##       No  97  25
##       Yes 20  58
(97 + 58) / 200
## [1] 0.775
### For comparison, prune to a larger tree with 14 terminal nodes.
prune.carseats <- prune.misclass(tree.carseats, best = 14)
plot(prune.carseats)
text(prune.carseats, pretty = 0)

tree.pred <- predict(prune.carseats, Carseats.test,
    type = "class")
table(tree.pred, High.test)
##          High.test
## tree.pred  No Yes
##       No  102  31
##       Yes  15  52
(102 + 52) / 200
## [1] 0.77
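### Pruning to 9 terminal nodes gave the best test accuracy here (0.775,
### vs 0.77 for the full tree) along with a more interpretable tree; the
### larger 14-node tree falls back to 0.77.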

Fitting Regression Trees

set.seed(1)
train <- sample(1:nrow(Boston), nrow(Boston) / 2)
tree.boston <- tree(medv ~ ., Boston, subset = train)
summary(tree.boston)
## 
## Regression tree:
## tree(formula = medv ~ ., data = Boston, subset = train)
## Variables actually used in tree construction:
## [1] "rm"    "lstat" "crim"  "age"  
## Number of terminal nodes:  7 
## Residual mean deviance:  10.38 = 2555 / 246 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -10.1800  -1.7770  -0.1775   0.0000   1.9230  16.5800
###
plot(tree.boston)
text(tree.boston, pretty = 0)

###
cv.boston <- cv.tree(tree.boston)
cv.boston
## $size
## [1] 7 6 5 4 3 2 1
## 
## $dev
## [1]  4380.849  4544.815  5601.055  6171.917  6919.608 10419.472 19630.870
## 
## $k
## [1]       -Inf   203.9641   637.2707   796.1207  1106.4931  3424.7810 10724.5951
## 
## $method
## [1] "deviance"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
plot(cv.boston$size, cv.boston$dev, type = "b")

###
prune.boston <- prune.tree(tree.boston, best = 5)
plot(prune.boston)
text(prune.boston, pretty = 0)

###
yhat <- predict(tree.boston, newdata = Boston[-train, ])
boston.test <- Boston[-train, "medv"]
plot(yhat, boston.test)
abline(0, 1)

mean((yhat - boston.test)^2)
## [1] 35.28688
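### The square root of this test MSE is about 5.94; since medv is measured
### in thousands of dollars, typical predictions are within roughly $5,940
### of the true median home value.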

Bagging and Random Forests

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
set.seed(1)
bag.boston <- randomForest(medv ~ ., data = Boston,
    subset = train, mtry = 12, importance = TRUE)
bag.boston
## 
## Call:
##  randomForest(formula = medv ~ ., data = Boston, mtry = 12, importance = TRUE,      subset = train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 12
## 
##           Mean of squared residuals: 11.40162
##                     % Var explained: 85.17
###
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
plot(yhat.bag, boston.test)
abline(0, 1)

mean((yhat.bag - boston.test)^2)
## [1] 23.41916
###
bag.boston <- randomForest(medv ~ ., data = Boston,
    subset = train, mtry = 12, ntree = 25)
yhat.bag <- predict(bag.boston, newdata = Boston[-train, ])
mean((yhat.bag - boston.test)^2)
## [1] 25.75055
###
set.seed(1)
rf.boston <- randomForest(medv ~ ., data = Boston,
    subset = train, mtry = 6, importance = TRUE)
yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
mean((yhat.rf - boston.test)^2)
## [1] 20.06644
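### A small sketch (not part of the original run) tuning mtry by looping
### over a few candidate values and recording the test MSE; results vary
### with the random seed, and mtry = 12 corresponds to bagging.
mtry.mse <- sapply(c(4, 6, 8, 12), function(m) {
  set.seed(1)
  fit <- randomForest(medv ~ ., data = Boston, subset = train, mtry = m)
  mean((predict(fit, newdata = Boston[-train, ]) - boston.test)^2)
})
mtry.mse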
###
importance(rf.boston)
##           %IncMSE IncNodePurity
## crim    19.435587    1070.42307
## zn       3.091630      82.19257
## indus    6.140529     590.09536
## chas     1.370310      36.70356
## nox     13.263466     859.97091
## rm      35.094741    8270.33906
## age     15.144821     634.31220
## dis      9.163776     684.87953
## rad      4.793720      83.18719
## tax      4.410714     292.20949
## ptratio  8.612780     902.20190
## lstat   28.725343    5813.04833
###
varImpPlot(rf.boston)
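Above, mtry = 12 corresponds to bagging and mtry = 6 was picked by hand. A quick grid over mtry (a sketch, assuming the same seed and train/test split as above) shows how the test MSE responds:

### hypothetical tuning loop, not part of the original lab
for (m in c(4, 6, 8, 12)) {
  set.seed(1)
  fit <- randomForest(medv ~ ., data = Boston,
      subset = train, mtry = m)
  pred <- predict(fit, newdata = Boston[-train, ])
  cat("mtry =", m, "test MSE =",
      round(mean((pred - boston.test)^2), 2), "\n")
}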

Boosting

library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
set.seed(1)
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
    distribution = "gaussian", n.trees = 5000,
    interaction.depth = 4)
summary(boost.boston)

##             var     rel.inf
## rm           rm 44.48249588
## lstat     lstat 32.70281223
## crim       crim  4.85109954
## dis         dis  4.48693083
## nox         nox  3.75222394
## age         age  3.19769210
## ptratio ptratio  2.81354826
## tax         tax  1.54417603
## indus     indus  1.03384666
## rad         rad  0.87625748
## zn           zn  0.16220479
## chas       chas  0.09671228
###
plot(boost.boston, i = "rm")

plot(boost.boston, i = "lstat")

###
yhat.boost <- predict(boost.boston,
    newdata = Boston[-train, ], n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 18.39057
###
boost.boston <- gbm(medv ~ ., data = Boston[train, ],
    distribution = "gaussian", n.trees = 5000,
    interaction.depth = 4, shrinkage = 0.2, verbose = F)
yhat.boost <- predict(boost.boston,
    newdata = Boston[-train, ], n.trees = 5000)
mean((yhat.boost - boston.test)^2)
## [1] 16.54778
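Rather than fixing n.trees at 5000, gbm can choose the number of boosting iterations by cross-validation (a sketch using gbm's built-in cv.folds and gbm.perf; results vary with the fold assignment):

boost.cv <- gbm(medv ~ ., data = Boston[train, ],
    distribution = "gaussian", n.trees = 5000,
    interaction.depth = 4, shrinkage = 0.2, cv.folds = 5)
best.iter <- gbm.perf(boost.cv, method = "cv")
yhat.cv <- predict(boost.cv, newdata = Boston[-train, ],
    n.trees = best.iter)
mean((yhat.cv - boston.test)^2)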

Bayesian Additive Regression Trees

library(BART)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
## 
##     aml
x <- Boston[, 1:12]
y <- Boston[, "medv"]
xtrain <- x[train, ]
ytrain <- y[train]
xtest <- x[-train, ]
ytest <- y[-train]
set.seed(1)
bartfit <- gbart(xtrain, ytrain, x.test = xtest)
## *****Calling gbart: type=1
## *****Data:
## data:n,p,np: 253, 12, 253
## y1,yn: 0.213439, -5.486561
## x1,x[n*p]: 0.109590, 20.080000
## xp1,xp[np*p]: 0.027310, 7.880000
## *****Number of Trees: 200
## *****Number of Cut Points: 100 ... 100
## *****burn,nd,thin: 100,1000,1
## *****Prior:beta,alpha,tau,nu,lambda,offset: 2,0.95,0.795495,3,3.71636,21.7866
## *****sigma: 4.367914
## *****w (weights): 1.000000 ... 1.000000
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,12,0
## *****printevery: 100
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 3s
## trcnt,tecnt: 1000,1000
###
yhat.bart <- bartfit$yhat.test.mean
mean((ytest - yhat.bart)^2)
## [1] 15.94718
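Because BART is Bayesian, pointwise uncertainty comes at no extra cost: gbart stores the posterior draws in yhat.test (one row per MCMC iteration), so posterior intervals for the test predictions are a one-liner (a small addition to the lab):

### 95% posterior intervals for the first five test observations
ci <- apply(bartfit$yhat.test, 2, quantile,
    probs = c(0.025, 0.975))
ci[, 1:5]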
###
ord <- order(bartfit$varcount.mean, decreasing = T)
bartfit$varcount.mean[ord]
##     nox   lstat     tax     rad      rm   indus    chas ptratio     age      zn 
##  22.952  21.329  21.250  20.781  19.890  19.825  19.051  18.976  18.274  15.952 
##     dis    crim 
##  14.457  11.007

Chapter 09 (Support Vector Machines)

Support Vector Classifier

set.seed(1)
x <- matrix(rnorm(20 * 2), ncol = 2)
y <- c(rep(-1, 10), rep(1, 10))
x[y == 1, ] <- x[y == 1, ] + 1
plot(x, col = (3 - y))

dat <- data.frame(x = x, y = as.factor(y))
library(e1071)
svmfit <- svm(y ~ ., data = dat, kernel = "linear", 
    cost = 10, scale = FALSE)
plot(svmfit, dat)

svmfit$index
## [1]  1  2  5  7 14 16 17
summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 10, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  10 
## 
## Number of Support Vectors:  7
## 
##  ( 4 3 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
svmfit <- svm(y ~ ., data = dat, kernel = "linear", 
    cost = 0.1, scale = FALSE)
plot(svmfit, dat)

svmfit$index
##  [1]  1  2  3  4  5  7  9 10 12 13 14 15 16 17 18 20
set.seed(1)
tune.out <- tune(svm, y ~ ., data = dat, kernel = "linear", 
    ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.1
## 
## - best performance: 0.05 
## 
## - Detailed performance results:
##    cost error dispersion
## 1 1e-03  0.55  0.4377975
## 2 1e-02  0.55  0.4377975
## 3 1e-01  0.05  0.1581139
## 4 1e+00  0.15  0.2415229
## 5 5e+00  0.15  0.2415229
## 6 1e+01  0.15  0.2415229
## 7 1e+02  0.15  0.2415229
###
bestmod <- tune.out$best.model
summary(bestmod)
## 
## Call:
## best.tune(METHOD = svm, train.x = y ~ ., data = dat, ranges = list(cost = c(0.001, 
##     0.01, 0.1, 1, 5, 10, 100)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.1 
## 
## Number of Support Vectors:  16
## 
##  ( 8 8 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
xtest <- matrix(rnorm(20 * 2), ncol = 2)
ytest <- sample(c(-1, 1), 20, rep = TRUE)
xtest[ytest == 1, ] <- xtest[ytest == 1, ] + 1
testdat <- data.frame(x = xtest, y = as.factor(ytest))


ypred <- predict(bestmod, testdat)
table(predict = ypred, truth = testdat$y)
##        truth
## predict -1 1
##      -1  9 1
##      1   2 8
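With cost = 0.1, 17 of the 20 test observations are classified correctly, as the accuracy confirms:

mean(ypred == testdat$y)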
###
svmfit <- svm(y ~ ., data = dat, kernel = "linear", 
    cost = .01, scale = FALSE)
ypred <- predict(svmfit, testdat)
table(predict = ypred, truth = testdat$y)
##        truth
## predict -1  1
##      -1 11  6
##      1   0  3
###
x[y == 1, ] <- x[y == 1, ] + 0.5
plot(x, col = (y + 5) / 2, pch = 19)

###
dat <- data.frame(x = x, y = as.factor(y))
svmfit <- svm(y ~ ., data = dat, kernel = "linear", 
    cost = 1e5)
summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 1e+05)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1e+05 
## 
## Number of Support Vectors:  3
## 
##  ( 1 2 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
plot(svmfit, dat)

###
svmfit <- svm(y ~ ., data = dat, kernel = "linear", cost = 1)
summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  7
## 
##  ( 4 3 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -1 1
plot(svmfit, dat)

Support Vector Machine

set.seed(1)
x <- matrix(rnorm(200 * 2), ncol = 2)
x[1:100, ] <- x[1:100, ] + 2
x[101:150, ] <- x[101:150, ] - 2
y <- c(rep(1, 150), rep(2, 50))
dat <- data.frame(x = x, y = as.factor(y))
plot(x, col = y)

###
train <- sample(200, 100)
svmfit <- svm(y ~ ., data = dat[train, ], kernel = "radial",  
    gamma = 1, cost = 1)
plot(svmfit, dat[train, ])

summary(svmfit)
## 
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "radial", gamma = 1, 
##     cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  31
## 
##  ( 16 15 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  1 2
###
svmfit <- svm(y ~ ., data = dat[train, ], kernel = "radial", 
    gamma = 1, cost = 1e5)
plot(svmfit, dat[train, ])

###
set.seed(1)
tune.out <- tune(svm, y ~ ., data = dat[train, ], 
    kernel = "radial", 
    ranges = list(
      cost = c(0.1, 1, 10, 100, 1000),
      gamma = c(0.5, 1, 2, 3, 4)
    )
  )
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##     1   0.5
## 
## - best performance: 0.07 
## 
## - Detailed performance results:
##     cost gamma error dispersion
## 1  1e-01   0.5  0.26 0.15776213
## 2  1e+00   0.5  0.07 0.08232726
## 3  1e+01   0.5  0.07 0.08232726
## 4  1e+02   0.5  0.14 0.15055453
## 5  1e+03   0.5  0.11 0.07378648
## 6  1e-01   1.0  0.22 0.16193277
## 7  1e+00   1.0  0.07 0.08232726
## 8  1e+01   1.0  0.09 0.07378648
## 9  1e+02   1.0  0.12 0.12292726
## 10 1e+03   1.0  0.11 0.11005049
## 11 1e-01   2.0  0.27 0.15670212
## 12 1e+00   2.0  0.07 0.08232726
## 13 1e+01   2.0  0.11 0.07378648
## 14 1e+02   2.0  0.12 0.13165612
## 15 1e+03   2.0  0.16 0.13498971
## 16 1e-01   3.0  0.27 0.15670212
## 17 1e+00   3.0  0.07 0.08232726
## 18 1e+01   3.0  0.08 0.07888106
## 19 1e+02   3.0  0.13 0.14181365
## 20 1e+03   3.0  0.15 0.13540064
## 21 1e-01   4.0  0.27 0.15670212
## 22 1e+00   4.0  0.07 0.08232726
## 23 1e+01   4.0  0.09 0.07378648
## 24 1e+02   4.0  0.13 0.14181365
## 25 1e+03   4.0  0.15 0.13540064
###
p <- predict(tune.out$best.model, newdata = dat[-train, ])
table(true = dat[-train, "y"], pred = p)
##     pred
## true  1  2
##    1 67 10
##    2  2 21
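So 12% of the test observations are misclassified by this radial-kernel SVM. Equivalently:

### test error rate, (10 + 2) / 100
mean(p != dat[-train, "y"])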

ROC Curves

library(ROCR)
rocplot <- function(pred, truth, ...) {
  # build a ROCR prediction object, extract the true-positive /
  # false-positive trade-off, and plot the resulting ROC curve
  predob <- prediction(pred, truth)
  perf <- performance(predob, "tpr", "fpr")
  plot(perf, ...)
}
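ROCR can also report the area under the curve. A small helper in the same spirit as rocplot (an addition, not part of the original lab):

aucval <- function(pred, truth) {
  # area under the ROC curve via ROCR's "auc" measure
  performance(prediction(pred, truth), "auc")@y.values[[1]]
}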


###
svmfit.opt <- svm(y ~ ., data = dat[train, ], 
    kernel = "radial", gamma = 2, cost = 1, 
    decision.values = T)

fitted <- attributes(
    predict(svmfit.opt, dat[train, ], decision.values = TRUE)
  )$decision.values

###
par(mfrow = c(1, 2))
rocplot(-fitted, dat[train, "y"], main = "Training Data")
###
svmfit.flex <- svm(y ~ ., data = dat[train, ], 
    kernel = "radial", gamma = 50, cost = 1, 
    decision.values = T)

fitted <- attributes(
    predict(svmfit.flex, dat[train, ], decision.values = T)
  )$decision.values
rocplot(-fitted, dat[train, "y"], add = T, col = "red")



###
fitted <- attributes(
    predict(svmfit.opt, dat[-train, ], decision.values = T)
  )$decision.values
rocplot(-fitted, dat[-train, "y"], main = "Test Data")


fitted <- attributes(
    predict(svmfit.flex, dat[-train, ], decision.values = T)
  )$decision.values
rocplot(-fitted, dat[-train, "y"], add = T, col = "red")

dat <- data.frame(xx = x[train, ], yy = as.factor(y[train]))
###
svmfit.opt <- svm(yy ~ ., data = dat, kernel = "radial", 
    gamma = 2, cost = 1, probability = T)
probs <- attributes(
    predict(svmfit.opt, dat, probability = T)
  )$probabilities[, 1]


###
par(mfrow = c(1, 2))
rocplot(probs, dat$yy, main = "Training Data")


###
svmfit.flex <- svm(yy ~ ., data = dat, kernel = "radial", 
    gamma = 50, cost = 1, probability = T)
probs <- attributes(
    predict(svmfit.flex, dat, probability = T)
  )$probabilities[, 1]
rocplot(probs, dat$yy, add = T, col = "red")


###
dat.test <- data.frame(xx = x[-train, ], yy = as.factor(y[-train]))
probs <- attributes(
    predict(svmfit.opt, dat.test, probability = T)
  )$probabilities[, 1]
rocplot(probs, dat.test$yy, main = "Test Data")
probs <- attributes(
    predict(svmfit.flex, dat.test, probability = T)
  )$probabilities[, 1]
rocplot(probs, dat.test$yy, add = T, col = "red")

SVM with Multiple Classes

set.seed(1)
x <- rbind(x, matrix(rnorm(50 * 2), ncol = 2))
y <- c(y, rep(0, 50))
x[y == 0, 2] <- x[y == 0, 2] + 2
dat <- data.frame(x = x, y = as.factor(y))

par(mfrow = c(1, 1))
plot(x, col = (y + 1))

###
svmfit <- svm(y ~ ., data = dat, kernel = "radial", 
    cost = 10, gamma = 1)
plot(svmfit, dat)
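With more than two classes, svm() performs one-versus-one classification, fitting all pairwise classifiers and predicting by majority vote. The fitted training labels can be tabulated against the truth:

table(predict(svmfit, dat), dat$y)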

Application to Gene Expression Data

library(ISLR2)
names(Khan)
## [1] "xtrain" "xtest"  "ytrain" "ytest"
dim(Khan$xtrain)
## [1]   63 2308
dim(Khan$xtest)
## [1]   20 2308
length(Khan$ytrain)
## [1] 63
length(Khan$ytest)
## [1] 20
table(Khan$ytrain)
## 
##  1  2  3  4 
##  8 23 12 20
table(Khan$ytest)
## 
## 1 2 3 4 
## 3 6 6 5
###
dat <- data.frame(
    x = Khan$xtrain, 
    y = as.factor(Khan$ytrain)
  )
out <- svm(y ~ ., data = dat, kernel = "linear", 
    cost = 10)
summary(out)
## 
## Call:
## svm(formula = y ~ ., data = dat, kernel = "linear", cost = 10)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  10 
## 
## Number of Support Vectors:  58
## 
##  ( 20 20 11 7 )
## 
## 
## Number of Classes:  4 
## 
## Levels: 
##  1 2 3 4
table(out$fitted, dat$y)
##    
##      1  2  3  4
##   1  8  0  0  0
##   2  0 23  0  0
##   3  0  0 12  0
##   4  0  0  0 20
###
dat.te <- data.frame(
    x = Khan$xtest, 
    y = as.factor(Khan$ytest))
pred.te <- predict(out, newdata = dat.te)
table(pred.te, dat.te$y)
##        
## pred.te 1 2 3 4
##       1 3 0 0 0
##       2 0 6 2 0
##       3 0 0 4 0
##       4 0 0 0 5
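The linear SVM misclassifies only two of the twenty test observations, both class-3 cases predicted as class 2. Equivalently:

### 2 / 20 = 10% test error rate
mean(pred.te != dat.te$y)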