data("Boston", package = "ISLR2")
names(Boston) [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
[8] "dis" "rad" "tax" "ptratio" "lstat" "medv"
data("Boston", package = "ISLR2")
names(Boston) [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
[8] "dis" "rad" "tax" "ptratio" "lstat" "medv"
plot(medv ~ lstat, data = Boston)fit1 <- lm(medv ~ lstat, data = Boston)
summary(fit1)
Call:
lm(formula = medv ~ lstat, data = Boston)
Residuals:
Min 1Q Median 3Q Max
-15.168 -3.990 -1.318 2.034 24.500
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 34.55384 0.56263 61.41 <2e-16 ***
lstat -0.95005 0.03873 -24.53 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.216 on 504 degrees of freedom
Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
names(fit1) [1] "coefficients" "residuals" "effects" "rank"
[5] "fitted.values" "assign" "qr" "df.residual"
[9] "xlevels" "call" "terms" "model"
plot(medv ~ lstat, data = Boston)
abline(fit1, col = "brown")confint(fit1, level = 0.99) 0.5 % 99.5 %
(Intercept) 33.099101 36.0085810
lstat -1.050199 -0.8498995
predict(fit1, data.frame(lstat=c(5, 10, 15)), interval = "confidence") fit lwr upr
1 29.80359 29.00741 30.59978
2 25.05335 24.47413 25.63256
3 20.30310 19.73159 20.87461
fit2 <- lm(medv ~ lstat + age, data = Boston)
summary(fit2)
Call:
lm(formula = medv ~ lstat + age, data = Boston)
Residuals:
Min 1Q Median 3Q Max
-15.981 -3.978 -1.283 1.968 23.158
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 33.22276 0.73085 45.458 < 2e-16 ***
lstat -1.03207 0.04819 -21.416 < 2e-16 ***
age 0.03454 0.01223 2.826 0.00491 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.173 on 503 degrees of freedom
Multiple R-squared: 0.5513, Adjusted R-squared: 0.5495
F-statistic: 309 on 2 and 503 DF, p-value: < 2.2e-16
fit3 <- lm(medv ~ ., data = Boston)
summary(fit3)
Call:
lm(formula = medv ~ ., data = Boston)
Residuals:
Min 1Q Median 3Q Max
-15.1304 -2.7673 -0.5814 1.9414 26.2526
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 41.617270 4.936039 8.431 3.79e-16 ***
crim -0.121389 0.033000 -3.678 0.000261 ***
zn 0.046963 0.013879 3.384 0.000772 ***
indus 0.013468 0.062145 0.217 0.828520
chas 2.839993 0.870007 3.264 0.001173 **
nox -18.758022 3.851355 -4.870 1.50e-06 ***
rm 3.658119 0.420246 8.705 < 2e-16 ***
age 0.003611 0.013329 0.271 0.786595
dis -1.490754 0.201623 -7.394 6.17e-13 ***
rad 0.289405 0.066908 4.325 1.84e-05 ***
tax -0.012682 0.003801 -3.337 0.000912 ***
ptratio -0.937533 0.132206 -7.091 4.63e-12 ***
lstat -0.552019 0.050659 -10.897 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.798 on 493 degrees of freedom
Multiple R-squared: 0.7343, Adjusted R-squared: 0.7278
F-statistic: 113.5 on 12 and 493 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(fit3)fit4 <- update(fit3, ~ . - age - indus)
summary(fit4)
Call:
lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad +
tax + ptratio + lstat, data = Boston)
Residuals:
Min 1Q Median 3Q Max
-15.1814 -2.7625 -0.6243 1.8448 26.3920
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 41.451747 4.903283 8.454 3.18e-16 ***
crim -0.121665 0.032919 -3.696 0.000244 ***
zn 0.046191 0.013673 3.378 0.000787 ***
chas 2.871873 0.862591 3.329 0.000935 ***
nox -18.262427 3.565247 -5.122 4.33e-07 ***
rm 3.672957 0.409127 8.978 < 2e-16 ***
dis -1.515951 0.187675 -8.078 5.08e-15 ***
rad 0.283932 0.063945 4.440 1.11e-05 ***
tax -0.012292 0.003407 -3.608 0.000340 ***
ptratio -0.930961 0.130423 -7.138 3.39e-12 ***
lstat -0.546509 0.047442 -11.519 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.789 on 495 degrees of freedom
Multiple R-squared: 0.7342, Adjusted R-squared: 0.7289
F-statistic: 136.8 on 10 and 495 DF, p-value: < 2.2e-16
Includes the predictors lstat, age, and lstat * age.
fit5 <- lm(medv ~ lstat * age, data = Boston)
summary(fit5)
Call:
lm(formula = medv ~ lstat * age, data = Boston)
Residuals:
Min 1Q Median 3Q Max
-15.806 -4.045 -1.333 2.085 27.552
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 36.0885359 1.4698355 24.553 < 2e-16 ***
lstat -1.3921168 0.1674555 -8.313 8.78e-16 ***
age -0.0007209 0.0198792 -0.036 0.9711
lstat:age 0.0041560 0.0018518 2.244 0.0252 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.149 on 502 degrees of freedom
Multiple R-squared: 0.5557, Adjusted R-squared: 0.5531
F-statistic: 209.3 on 3 and 502 DF, p-value: < 2.2e-16
fit6 <- lm(medv ~ lstat + I(lstat^2), data = Boston)
summary(fit6)
Call:
lm(formula = medv ~ lstat + I(lstat^2), data = Boston)
Residuals:
Min 1Q Median 3Q Max
-15.2834 -3.8313 -0.5295 2.3095 25.4148
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 42.862007 0.872084 49.15 <2e-16 ***
lstat -2.332821 0.123803 -18.84 <2e-16 ***
I(lstat^2) 0.043547 0.003745 11.63 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.524 on 503 degrees of freedom
Multiple R-squared: 0.6407, Adjusted R-squared: 0.6393
F-statistic: 448.5 on 2 and 503 DF, p-value: < 2.2e-16
fit7 <- lm(medv ~ poly(lstat, 4))
summary(fit7)
Call:
lm(formula = medv ~ poly(lstat, 4))
Residuals:
Min 1Q Median 3Q Max
-13.563 -3.180 -0.632 2.283 27.181
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 22.5328 0.2347 95.995 < 2e-16 ***
poly(lstat, 4)1 -152.4595 5.2801 -28.874 < 2e-16 ***
poly(lstat, 4)2 64.2272 5.2801 12.164 < 2e-16 ***
poly(lstat, 4)3 -27.0511 5.2801 -5.123 4.29e-07 ***
poly(lstat, 4)4 25.4517 5.2801 4.820 1.90e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.28 on 501 degrees of freedom
Multiple R-squared: 0.673, Adjusted R-squared: 0.6704
F-statistic: 257.8 on 4 and 501 DF, p-value: < 2.2e-16
plot(medv ~ lstat)
points(lstat, fitted(fit7), col = "maroon", pch = 20)names(Carseats) [1] "Sales" "CompPrice" "Income" "Advertising" "Population"
[6] "Price" "ShelveLoc" "Age" "Education" "Urban"
[11] "US"
Carseats |>
sample_n(10) Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
278 7.80 136 48 12 326 125 Medium 36 16
249 5.36 111 52 0 12 101 Medium 61 11
265 6.95 128 29 5 324 159 Good 31 15
373 7.80 121 50 0 508 98 Medium 65 11
362 8.68 131 25 10 183 104 Medium 56 15
35 2.67 115 54 0 406 128 Medium 42 17
186 10.07 130 100 11 449 107 Medium 64 10
365 10.50 122 21 16 488 131 Good 30 14
138 6.52 128 42 0 436 118 Medium 80 11
225 4.10 134 82 0 464 141 Medium 48 13
Urban US
278 Yes Yes
249 Yes Yes
265 Yes Yes
373 No No
362 No Yes
35 Yes Yes
186 Yes Yes
365 Yes Yes
138 Yes No
225 No No
The below model adds in interaction terms between Income and Advertising, and Age and Price.
q_fit1 <- lm(Sales ~ . + Income:Advertising + Age:Price, data = Carseats)
summary(q_fit1)
Call:
lm(formula = Sales ~ . + Income:Advertising + Age:Price, data = Carseats)
Residuals:
Min 1Q Median 3Q Max
-2.9208 -0.7503 0.0177 0.6754 3.3413
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.5755654 1.0087470 6.519 2.22e-10 ***
CompPrice 0.0929371 0.0041183 22.567 < 2e-16 ***
Income 0.0108940 0.0026044 4.183 3.57e-05 ***
Advertising 0.0702462 0.0226091 3.107 0.002030 **
Population 0.0001592 0.0003679 0.433 0.665330
Price -0.1008064 0.0074399 -13.549 < 2e-16 ***
ShelveLocGood 4.8486762 0.1528378 31.724 < 2e-16 ***
ShelveLocMedium 1.9532620 0.1257682 15.531 < 2e-16 ***
Age -0.0579466 0.0159506 -3.633 0.000318 ***
Education -0.0208525 0.0196131 -1.063 0.288361
UrbanYes 0.1401597 0.1124019 1.247 0.213171
USYes -0.1575571 0.1489234 -1.058 0.290729
Income:Advertising 0.0007510 0.0002784 2.698 0.007290 **
Price:Age 0.0001068 0.0001333 0.801 0.423812
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.011 on 386 degrees of freedom
Multiple R-squared: 0.8761, Adjusted R-squared: 0.8719
F-statistic: 210 on 13 and 386 DF, p-value: < 2.2e-16
The contrasts function returns how a qualitative variable will be coded when put in a Linear Model (in this case, it is a three level factor),
contrasts(Carseats$ShelveLoc) Good Medium
Bad 0 0
Good 1 0
Medium 0 1
The following function fits a regression model and makes a plot,
regplot = function(x,y, ...) {
fit = lm(y ~ x)
plot(x,y, ...)
abline(fit, col = "darkblue")
}
attach(Carseats)
regplot(Price, Sales, xlab = "Price", ylab = "Sales", col = "lightblue", pch = 20)log.reg <- glm(default ~ student, family = binomial, data = Default)
summary(log.reg)
Call:
glm(formula = default ~ student, family = binomial, data = Default)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.50413 0.07071 -49.55 < 2e-16 ***
studentYes 0.40489 0.11502 3.52 0.000431 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2920.6 on 9999 degrees of freedom
Residual deviance: 2908.7 on 9998 degrees of freedom
AIC: 2912.7
Number of Fisher Scoring iterations: 6
log.reg.multi <- glm(default ~ ., data = Default, family = binomial)
summary(log.reg.multi)
Call:
glm(formula = default ~ ., family = binomial, data = Default)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.087e+01 4.923e-01 -22.080 < 2e-16 ***
studentYes -6.468e-01 2.363e-01 -2.738 0.00619 **
balance 5.737e-03 2.319e-04 24.738 < 2e-16 ***
income 3.033e-06 8.203e-06 0.370 0.71152
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2920.6 on 9999 degrees of freedom
Residual deviance: 1571.5 on 9996 degrees of freedom
AIC: 1579.5
Number of Fisher Scoring iterations: 8
So, when considering only student as a predictor, it has a positive slope which means, knowing whether someone is student or not aids in predicting credit card default. But when using student, balance, and income, it has a negative slope, which means it harms in predicting the default status.
So it is not the fault of being a student, rather it is confounded with balance.