ISLR II

1 Chapter 3

1.1 Data

data("Boston", package = "ISLR2")
names(Boston)
 [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
 [8] "dis"     "rad"     "tax"     "ptratio" "lstat"   "medv"   

1.2 Simple Linear Regression

plot(medv ~ lstat, data = Boston)

fit1 <- lm(medv ~ lstat, data = Boston)
summary(fit1)

Call:
lm(formula = medv ~ lstat, data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.168  -3.990  -1.318   2.034  24.500 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 34.55384    0.56263   61.41   <2e-16 ***
lstat       -0.95005    0.03873  -24.53   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.216 on 504 degrees of freedom
Multiple R-squared:  0.5441,    Adjusted R-squared:  0.5432 
F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16
names(fit1)
 [1] "coefficients"  "residuals"     "effects"       "rank"         
 [5] "fitted.values" "assign"        "qr"            "df.residual"  
 [9] "xlevels"       "call"          "terms"         "model"        
plot(medv ~ lstat, data = Boston)
abline(fit1, col = "brown")

confint(fit1, level = 0.99)
                0.5 %     99.5 %
(Intercept) 33.099101 36.0085810
lstat       -1.050199 -0.8498995
predict(fit1, data.frame(lstat=c(5, 10, 15)), interval = "confidence")
       fit      lwr      upr
1 29.80359 29.00741 30.59978
2 25.05335 24.47413 25.63256
3 20.30310 19.73159 20.87461

1.3 Multiple Linear Regression

fit2 <- lm(medv ~ lstat + age, data = Boston)
summary(fit2)

Call:
lm(formula = medv ~ lstat + age, data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.981  -3.978  -1.283   1.968  23.158 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 33.22276    0.73085  45.458  < 2e-16 ***
lstat       -1.03207    0.04819 -21.416  < 2e-16 ***
age          0.03454    0.01223   2.826  0.00491 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.173 on 503 degrees of freedom
Multiple R-squared:  0.5513,    Adjusted R-squared:  0.5495 
F-statistic:   309 on 2 and 503 DF,  p-value: < 2.2e-16
fit3 <- lm(medv ~ ., data = Boston)
summary(fit3)

Call:
lm(formula = medv ~ ., data = Boston)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.1304  -2.7673  -0.5814   1.9414  26.2526 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  41.617270   4.936039   8.431 3.79e-16 ***
crim         -0.121389   0.033000  -3.678 0.000261 ***
zn            0.046963   0.013879   3.384 0.000772 ***
indus         0.013468   0.062145   0.217 0.828520    
chas          2.839993   0.870007   3.264 0.001173 ** 
nox         -18.758022   3.851355  -4.870 1.50e-06 ***
rm            3.658119   0.420246   8.705  < 2e-16 ***
age           0.003611   0.013329   0.271 0.786595    
dis          -1.490754   0.201623  -7.394 6.17e-13 ***
rad           0.289405   0.066908   4.325 1.84e-05 ***
tax          -0.012682   0.003801  -3.337 0.000912 ***
ptratio      -0.937533   0.132206  -7.091 4.63e-12 ***
lstat        -0.552019   0.050659 -10.897  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.798 on 493 degrees of freedom
Multiple R-squared:  0.7343,    Adjusted R-squared:  0.7278 
F-statistic: 113.5 on 12 and 493 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(fit3)

fit4 <- update(fit3, ~ . - age - indus)
summary(fit4)

Call:
lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + 
    tax + ptratio + lstat, data = Boston)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.1814  -2.7625  -0.6243   1.8448  26.3920 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  41.451747   4.903283   8.454 3.18e-16 ***
crim         -0.121665   0.032919  -3.696 0.000244 ***
zn            0.046191   0.013673   3.378 0.000787 ***
chas          2.871873   0.862591   3.329 0.000935 ***
nox         -18.262427   3.565247  -5.122 4.33e-07 ***
rm            3.672957   0.409127   8.978  < 2e-16 ***
dis          -1.515951   0.187675  -8.078 5.08e-15 ***
rad           0.283932   0.063945   4.440 1.11e-05 ***
tax          -0.012292   0.003407  -3.608 0.000340 ***
ptratio      -0.930961   0.130423  -7.138 3.39e-12 ***
lstat        -0.546509   0.047442 -11.519  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.789 on 495 degrees of freedom
Multiple R-squared:  0.7342,    Adjusted R-squared:  0.7289 
F-statistic: 136.8 on 10 and 495 DF,  p-value: < 2.2e-16

1.4 Nonlinear Terms and Interactions

The model below includes the predictors lstat, age, and their interaction lstat:age; in an R formula, lstat * age is shorthand for all three terms.

fit5 <- lm(medv ~ lstat * age, data = Boston)
summary(fit5)

Call:
lm(formula = medv ~ lstat * age, data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.806  -4.045  -1.333   2.085  27.552 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 36.0885359  1.4698355  24.553  < 2e-16 ***
lstat       -1.3921168  0.1674555  -8.313 8.78e-16 ***
age         -0.0007209  0.0198792  -0.036   0.9711    
lstat:age    0.0041560  0.0018518   2.244   0.0252 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.149 on 502 degrees of freedom
Multiple R-squared:  0.5557,    Adjusted R-squared:  0.5531 
F-statistic: 209.3 on 3 and 502 DF,  p-value: < 2.2e-16
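
Since lstat * age expands to the two main effects plus the interaction, refitting with the expanded formula should reproduce fit5 exactly (a minimal sketch; fit5_expanded is just an illustrative name):

fit5_expanded <- lm(medv ~ lstat + age + lstat:age, data = Boston)
all.equal(coef(fit5), coef(fit5_expanded))  # should return TRUE
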
fit6 <- lm(medv ~ lstat + I(lstat^2), data = Boston)
summary(fit6)

Call:
lm(formula = medv ~ lstat + I(lstat^2), data = Boston)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.2834  -3.8313  -0.5295   2.3095  25.4148 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 42.862007   0.872084   49.15   <2e-16 ***
lstat       -2.332821   0.123803  -18.84   <2e-16 ***
I(lstat^2)   0.043547   0.003745   11.63   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.524 on 503 degrees of freedom
Multiple R-squared:  0.6407,    Adjusted R-squared:  0.6393 
F-statistic: 448.5 on 2 and 503 DF,  p-value: < 2.2e-16

fit7 <- lm(medv ~ poly(lstat, 4), data = Boston)
summary(fit7)

Call:
lm(formula = medv ~ poly(lstat, 4), data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-13.563  -3.180  -0.632   2.283  27.181 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)       22.5328     0.2347  95.995  < 2e-16 ***
poly(lstat, 4)1 -152.4595     5.2801 -28.874  < 2e-16 ***
poly(lstat, 4)2   64.2272     5.2801  12.164  < 2e-16 ***
poly(lstat, 4)3  -27.0511     5.2801  -5.123 4.29e-07 ***
poly(lstat, 4)4   25.4517     5.2801   4.820 1.90e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.28 on 501 degrees of freedom
Multiple R-squared:  0.673, Adjusted R-squared:  0.6704 
F-statistic: 257.8 on 4 and 501 DF,  p-value: < 2.2e-16
plot(medv ~ lstat, data = Boston)
points(Boston$lstat, fitted(fit7), col = "maroon", pch = 20)

1.5 Qualitative Predictors

data("Carseats", package = "ISLR2")
names(Carseats)
 [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population" 
 [6] "Price"       "ShelveLoc"   "Age"         "Education"   "Urban"      
[11] "US"         
library(dplyr)  # for sample_n()
Carseats |> 
  sample_n(10)
    Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
278  7.80       136     48          12        326   125    Medium  36        16
249  5.36       111     52           0         12   101    Medium  61        11
265  6.95       128     29           5        324   159      Good  31        15
373  7.80       121     50           0        508    98    Medium  65        11
362  8.68       131     25          10        183   104    Medium  56        15
35   2.67       115     54           0        406   128    Medium  42        17
186 10.07       130    100          11        449   107    Medium  64        10
365 10.50       122     21          16        488   131      Good  30        14
138  6.52       128     42           0        436   118    Medium  80        11
225  4.10       134     82           0        464   141    Medium  48        13
    Urban  US
278   Yes Yes
249   Yes Yes
265   Yes Yes
373    No  No
362    No Yes
35    Yes Yes
186   Yes Yes
365   Yes Yes
138   Yes  No
225    No  No

The model below adds interaction terms between Income and Advertising and between Price and Age.

q_fit1 <- lm(Sales ~ . + Income:Advertising + Age:Price, data = Carseats)
summary(q_fit1)

Call:
lm(formula = Sales ~ . + Income:Advertising + Age:Price, data = Carseats)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.9208 -0.7503  0.0177  0.6754  3.3413 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)         6.5755654  1.0087470   6.519 2.22e-10 ***
CompPrice           0.0929371  0.0041183  22.567  < 2e-16 ***
Income              0.0108940  0.0026044   4.183 3.57e-05 ***
Advertising         0.0702462  0.0226091   3.107 0.002030 ** 
Population          0.0001592  0.0003679   0.433 0.665330    
Price              -0.1008064  0.0074399 -13.549  < 2e-16 ***
ShelveLocGood       4.8486762  0.1528378  31.724  < 2e-16 ***
ShelveLocMedium     1.9532620  0.1257682  15.531  < 2e-16 ***
Age                -0.0579466  0.0159506  -3.633 0.000318 ***
Education          -0.0208525  0.0196131  -1.063 0.288361    
UrbanYes            0.1401597  0.1124019   1.247 0.213171    
USYes              -0.1575571  0.1489234  -1.058 0.290729    
Income:Advertising  0.0007510  0.0002784   2.698 0.007290 ** 
Price:Age           0.0001068  0.0001333   0.801 0.423812    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.011 on 386 degrees of freedom
Multiple R-squared:  0.8761,    Adjusted R-squared:  0.8719 
F-statistic:   210 on 13 and 386 DF,  p-value: < 2.2e-16

The contrasts() function shows how a qualitative variable is coded when it enters a linear model; here ShelveLoc is a three-level factor, coded with two dummy variables:

contrasts(Carseats$ShelveLoc)
       Good Medium
Bad       0      0
Good      1      0
Medium    0      1
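
These dummy variables appear directly in the design matrix; a minimal sketch using base R's model.matrix() shows the two columns created from ShelveLoc:

# Inspect the first few rows of the dummy coding for ShelveLoc
head(model.matrix(~ ShelveLoc, data = Carseats))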

The following function fits a simple linear regression of y on x and plots the data together with the fitted line:

regplot <- function(x, y, ...) {
  fit <- lm(y ~ x)                # simple linear regression of y on x
  plot(x, y, ...)                 # scatterplot; extra graphical arguments are passed through
  abline(fit, col = "darkblue")   # overlay the fitted regression line
}

attach(Carseats)
regplot(Price, Sales, xlab = "Price", ylab = "Sales", col = "lightblue", pch = 20)

2 Chapter 4

data("Default", package = "ISLR2")
log.reg <- glm(default ~ student, family = binomial, data = Default)
summary(log.reg)

Call:
glm(formula = default ~ student, family = binomial, data = Default)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -3.50413    0.07071  -49.55  < 2e-16 ***
studentYes   0.40489    0.11502    3.52 0.000431 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2920.6  on 9999  degrees of freedom
Residual deviance: 2908.7  on 9998  degrees of freedom
AIC: 2912.7

Number of Fisher Scoring iterations: 6
log.reg.multi <- glm(default ~ ., data = Default, family = binomial)
summary(log.reg.multi)

Call:
glm(formula = default ~ ., family = binomial, data = Default)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.087e+01  4.923e-01 -22.080  < 2e-16 ***
studentYes  -6.468e-01  2.363e-01  -2.738  0.00619 ** 
balance      5.737e-03  2.319e-04  24.738  < 2e-16 ***
income       3.033e-06  8.203e-06   0.370  0.71152    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2920.6  on 9999  degrees of freedom
Residual deviance: 1571.5  on 9996  degrees of freedom
AIC: 1579.5

Number of Fisher Scoring iterations: 8

With student as the only predictor, its coefficient is positive: overall, students are more likely to default. But once student, balance, and income are included together, the student coefficient turns negative: for a fixed balance and income, a student is less likely to default than a non-student.
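
One way to see this concretely is to compare predicted default probabilities from the two models for a student and a non-student at the same balance and income (a minimal sketch; the values 1500 and 40000 are arbitrary illustrative choices):

new_obs <- data.frame(student = c("Yes", "No"), balance = 1500, income = 40000)
predict(log.reg, new_obs, type = "response")        # marginal model: higher probability for the student
predict(log.reg.multi, new_obs, type = "response")  # full model: lower probability for the student at fixed balance/income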

2.1 Confounding

So the higher overall default rate for students is not an effect of being a student per se; student status is confounded with balance, and students tend to carry higher balances.
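
A quick check of this confounding (a minimal sketch using base R) is to compare balances by student status:

tapply(Default$balance, Default$student, mean)   # average balance for non-students vs. students
boxplot(balance ~ student, data = Default)       # the same comparison, visually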