library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
(a). correlation coefficient
correlation_matrix <- cor(mtcars$mpg, mtcars[, -1])
abs_corr <- abs(correlation_matrix)
#absolute values of the correlations between mpg and each other variable
print(abs_corr)
## cyl disp hp drat wt qsec vs
## [1,] 0.852162 0.8475514 0.7761684 0.6811719 0.8676594 0.418684 0.6640389
## am gear carb
## [1,] 0.5998324 0.4802848 0.5509251
sorted_indices <- order(abs_corr, decreasing = TRUE) # order() returns only the indices
# sort the vector based on the indices
sorted_vector <- abs_corr[sorted_indices]
print(sorted_vector) # this returns only the values; the names are dropped (see the sketch below)
## [1] 0.8676594 0.8521620 0.8475514 0.7761684 0.6811719 0.6640389 0.5998324
## [8] 0.5509251 0.4802848 0.4186840
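To keep the variable names attached, one option (a minimal sketch) is to drop the one-row matrix to a named vector and sort that directly, since sort() preserves names:

named_corr <- abs_corr[1, ] # a one-row matrix indexed this way becomes a named vector
sort(named_corr, decreasing = TRUE) # same values, with names (wt, cyl, ...) attached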
ANSWER: The two features most strongly correlated with mpg are wt (weight in 1000 lbs), with correlation coefficient 0.8676594, and cyl (number of cylinders), with correlation coefficient 0.8521620.
(b). two simple linear regressions
#strongest feature
fit <- lm(mpg~wt, data=mtcars)
summary(fit)
##
## Call:
## lm(formula = mpg ~ wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5432 -2.3647 -0.1252 1.4096 6.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.2851 1.8776 19.858 < 2e-16 ***
## wt -5.3445 0.5591 -9.559 1.29e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446
## F-statistic: 91.38 on 1 and 30 DF, p-value: 1.294e-10
# second strongest feature
fit2 <- lm(mpg~cyl, data=mtcars)
summary(fit2)
##
## Call:
## lm(formula = mpg ~ cyl, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9814 -2.1185 0.2217 1.0717 7.5186
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.8846 2.0738 18.27 < 2e-16 ***
## cyl -2.8758 0.3224 -8.92 6.11e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.206 on 30 degrees of freedom
## Multiple R-squared: 0.7262, Adjusted R-squared: 0.7171
## F-statistic: 79.56 on 1 and 30 DF, p-value: 6.113e-10
Linear regression formulas: mpg hat = 37.2851 - 5.3445*wt (model #1); mpg hat = 37.8846 - 2.8758*cyl (model #2)
R^2: 0.7528 for model #1; 0.7262 for model #2
Conclusion: The two models' R^2 values and coefficient p-values are quite similar. If I had to choose one, it would be the wt model (model #1), because its slightly greater R^2 means a higher proportion of the sample variability in the response is explained by the explanatory variable than in model #2.
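As a quick check, both R^2 values can be read programmatically from the summary objects (a small sketch using the fits above):

summary(fit)$r.squared # 0.7528 for mpg ~ wt
summary(fit2)$r.squared # 0.7262 for mpg ~ cyl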
(c). multiple linear regression
fit_all <- lm(mpg~., data=mtcars)
summary(fit_all)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
The only feature close to significance in the full model is wt (weight in 1000 lbs), with p = 0.0633, significant at the 10% level but not at 5%; the R^2 for the model is 0.869.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
fit0 <- glm(mpg~1, data=mtcars)
AIC <- stepAIC(fit0, direction = "both", scope=list(upper=fit_all, lower=fit0))
## Start: AIC=208.76
## mpg ~ 1
##
## Df Deviance AIC
## + wt 1 278.32 166.03
## + cyl 1 308.33 169.31
## + disp 1 317.16 170.21
## + hp 1 447.67 181.24
## + drat 1 603.57 190.80
## + vs 1 629.52 192.15
## + am 1 720.90 196.48
## + carb 1 784.27 199.18
## + gear 1 866.30 202.36
## + qsec 1 928.66 204.59
## <none> 1126.05 208.76
##
## Step: AIC=166.03
## mpg ~ wt
##
## Df Deviance AIC
## + cyl 1 191.17 156.01
## + hp 1 195.05 156.65
## + qsec 1 195.46 156.72
## + vs 1 224.09 161.09
## + carb 1 233.72 162.44
## + disp 1 246.68 164.17
## <none> 278.32 166.03
## + drat 1 269.24 166.97
## + gear 1 277.19 167.90
## + am 1 278.32 168.03
## - wt 1 1126.05 208.76
##
## Step: AIC=156.01
## mpg ~ wt + cyl
##
## Df Deviance AIC
## + hp 1 176.62 155.48
## + carb 1 177.40 155.62
## <none> 191.17 156.01
## + qsec 1 180.60 156.19
## + gear 1 188.14 157.50
## + disp 1 188.49 157.56
## + vs 1 190.47 157.89
## + am 1 191.05 157.99
## + drat 1 191.17 158.01
## - cyl 1 278.32 166.03
## - wt 1 308.33 169.31
##
## Step: AIC=155.48
## mpg ~ wt + cyl + hp
##
## Df Deviance AIC
## <none> 176.62 155.48
## - hp 1 191.17 156.01
## + am 1 170.00 156.25
## + disp 1 170.44 156.34
## - cyl 1 195.05 156.65
## + carb 1 174.10 157.02
## + drat 1 174.38 157.07
## + qsec 1 175.22 157.22
## + gear 1 175.76 157.32
## + vs 1 176.56 157.47
## - wt 1 291.98 169.56
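Note that stepAIC() returns the selected model itself (here a gaussian glm, equivalent to the lm refit below), so it can also be inspected directly:

summary(AIC) # selected formula: mpg ~ wt + cyl + hp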
#fit MLR using best subset of features mpg ~ wt + cyl + hp
AIC_best_model <- lm(mpg ~ wt + cyl + hp, data=mtcars)
summary(AIC_best_model)
##
## Call:
## lm(formula = mpg ~ wt + cyl + hp, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9290 -1.5598 -0.5311 1.1850 5.8986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.75179 1.78686 21.687 < 2e-16 ***
## wt -3.16697 0.74058 -4.276 0.000199 ***
## cyl -0.94162 0.55092 -1.709 0.098480 .
## hp -0.01804 0.01188 -1.519 0.140015
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.512 on 28 degrees of freedom
## Multiple R-squared: 0.8431, Adjusted R-squared: 0.8263
## F-statistic: 50.17 on 3 and 28 DF, p-value: 2.184e-11
Regression formula: mpg hat = 38.75179 - 3.16697*wt - 0.94162*cyl - 0.01804*hp. R^2 = 0.8431.
(a). Yes. wt and cyl were included in the previous models; they were the strongest and second-strongest features (the highest absolute correlations between mpg and the corresponding explanatory variable).
(b). The coefficients of cyl and wt differ from the previous problems because additional features are now in the model: each coefficient is interpreted as the effect of that variable holding all the other predictors fixed, so its estimated value changes when the set of other predictors changes.
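The shift is easy to see side by side (a small sketch using the fits from parts (a)-(c)):

# slope of wt, and of cyl, alone vs. with the other predictors in the model
c(SLR = coef(fit)["wt"], MLR = coef(fit_all)["wt"]) # -5.34 vs. -3.72
c(SLR = coef(fit2)["cyl"], MLR = coef(fit_all)["cyl"]) # -2.88 vs. -0.11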
(a). Statement III is correct. Because the GPA:Level interaction coefficient is negative, for college graduates (X5 = 1) the interaction term pulls starting salary down as GPA grows. Specifically, once GPA is high enough, high school graduates earn more, on average, than college graduates.
(b). E(Y) = 50 + 20(4.0) + 0.07(110) + 35(1) + 0.01(110)(4.0) - 10(4.0)(1) = 137.1. Therefore, the expected starting salary for the given set of predictor values is approximately $137,100.
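The arithmetic can be checked directly in R:

50 + 20*4.0 + 0.07*110 + 35*1 + 0.01*110*4.0 - 10*4.0*1 # 137.1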
(c). False. The size of a coefficient does NOT by itself provide evidence for or against an interaction effect. A small coefficient means the estimated interaction effect is small, but not that it is insignificant; significance is judged from the coefficient's p-value, not its magnitude.
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following object is masked from 'package:MASS':
##
## Boston
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
ques8_fit <- lm(mpg~horsepower, data=Auto)
summary(ques8_fit)
##
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
Comment on the output: there is a strong, statistically significant negative relationship between horsepower and mpg. The slope is -0.157845 (each extra unit of horsepower is associated with about 0.16 fewer mpg), with p < 2.2e-16, and R^2 = 0.6059, so horsepower alone explains about 60.6% of the variability in mpg.
predict(ques8_fit, new=data.frame(horsepower=98),interval="predict", se.fit=TRUE)
## $fit
## fit lwr upr
## 1 24.46708 14.8094 34.12476
##
## $se.fit
## [1] 0.2512623
##
## $df
## [1] 390
##
## $residual.scale
## [1] 4.905757
predict(ques8_fit, new=data.frame(horsepower=98),interval="confidence", se.fit=TRUE)
## $fit
## fit lwr upr
## 1 24.46708 23.97308 24.96108
##
## $se.fit
## [1] 0.2512623
##
## $df
## [1] 390
##
## $residual.scale
## [1] 4.905757
ANSWER: The predicted mpg is 24.46708 when horsepower is 98. Associated confidence interval: (23.97308, 24.96108). Associated prediction interval: (14.8094, 34.12476). The prediction interval is wider because it accounts for the variability of an individual car's mpg around the mean response, not just the uncertainty in the mean itself.
(b). Plot response and predictor
#scatterplot of horsepower against mpg
plot(Auto$horsepower, Auto$mpg, xlab = "Horsepower", ylab = "MPG", main = "Scatterplot of Horsepower vs. MPG")
#add the least squares regression line to the plot using abline()
abline(ques8_fit, col = "purple")
(c). Use the plot() function to produce diagnostic plots of the least squares regression fit. Comment on any problems you see with the fit.
plot(ques8_fit) # draws the four diagnostic plots
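plot.lm() draws the four panels one at a time by default; a small base-R sketch to view them together:

par(mfrow = c(2, 2)) # 2x2 grid so all four diagnostic panels share one figure
plot(ques8_fit)
par(mfrow = c(1, 1)) # reset the plotting layout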
Comment: the residuals vs. fitted plot shows clear curvature, indicating that the straight-line fit misses non-linearity in the mpg-horsepower relationship, and the spread of the residuals appears to grow with the fitted values, hinting at non-constant variance. This motivates the variable transformations tried later in question 9(f).
(a). Produce a scatterplot matrix which includes all of the variables in the data set.
pairs(Auto)
(b). Compute the matrix of correlations between the variables using the function cor(). You will need to exclude the name variable, which is qualitative.
Auto_numeric <- Auto[, sapply(Auto, is.numeric)] #using sapply, only want numeric variables in Auto
#get correlation matrix
cor_matrix <- cor(Auto_numeric)
print(cor_matrix)
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
(c). Use the lm() function to perform a multiple linear regression with mpg as the response and all other variables except name as the predictors. Use the summary() function to print the results. Comment on the output. For instance: i. Is there a relationship between the predictors and the response? ii. Which predictors appear to have a statistically significant relationship to the response? iii. What does the coefficient for the year variable suggest?
ques9_MLR <- lm(mpg~.-name, data=Auto)
summary(ques9_MLR)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
Interpretation:
The overall F-test p-value is < 2.2e-16, which is significant, meaning at least one predictor in this multiple linear regression is related to the response. The R^2 value is 0.8215, meaning 82.15% of the variability in the response is explained by the explanatory variables. The predictors with a statistically significant relationship to the response are displacement (p = 0.00844), weight (p < 2e-16), year (p < 2e-16) and origin (p = 4.67e-07); because these p-values fall below 0.05, mpg depends on those explanatory variables. Translating the coefficients: holding the other predictors fixed, each additional model year is associated with an increase of 0.750773 mpg, and each additional cubic inch of engine displacement with an increase of 0.019896 mpg.
plot(ques9_MLR)
Comment: the residuals vs. fitted plot still shows some curvature, so mild non-linearity remains even with all predictors included, and the residuals vs. leverage plot flags a few high-leverage observations worth a closer look.
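The high-leverage impression can be checked numerically; a small sketch using base R's hatvalues():

lev <- hatvalues(ques9_MLR) # leverage of each observation
which.max(lev) # index of the single highest-leverage point
sort(lev, decreasing = TRUE)[1:3] # the three largest leverages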
#https://rdrr.io/cran/ISLR/man/Auto.html
# You should consider all interaction terms for the numeric variables, not only the significant ones.
# Fit linear regression models with interactions
model_interactions <- lm(mpg ~ (cylinders +displacement+ horsepower+ weight +acceleration+ year) * origin, data = Auto)
# View the summary of the model to assess significance of interactions
summary(model_interactions)
##
## Call:
## lm(formula = mpg ~ (cylinders + displacement + horsepower + weight +
## acceleration + year) * origin, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2651 -1.7622 -0.1884 1.3333 12.5693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.495524 10.100557 1.039 0.299421
## cylinders -1.688144 0.667998 -2.527 0.011906 *
## displacement -0.023367 0.021376 -1.093 0.275028
## horsepower 0.040812 0.031299 1.304 0.193042
## weight -0.002016 0.001640 -1.229 0.219683
## acceleration -0.765094 0.224332 -3.411 0.000718 ***
## year 0.509361 0.109103 4.669 4.22e-06 ***
## origin -12.825675 6.101034 -2.102 0.036196 *
## cylinders:origin 0.744185 0.454583 1.637 0.102447
## displacement:origin 0.020163 0.017604 1.145 0.252773
## horsepower:origin -0.048688 0.024130 -2.018 0.044318 *
## weight:origin -0.001911 0.001211 -1.577 0.115562
## acceleration:origin 0.428624 0.140570 3.049 0.002456 **
## year:origin 0.130813 0.061983 2.110 0.035475 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.036 on 378 degrees of freedom
## Multiple R-squared: 0.8537, Adjusted R-squared: 0.8487
## F-statistic: 169.7 on 13 and 378 DF, p-value: < 2.2e-16
Yes. Treating origin as numeric, the significant interactions are horsepower:origin, acceleration:origin and year:origin, since their corresponding p-values fall below the 0.05 threshold.
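For reference, a * b in an R formula expands to a + b + a:b, so the model above contains every main effect plus each numeric-variable:origin interaction. A single interaction could be examined in isolation like this (a sketch):

one_interaction <- lm(mpg ~ horsepower * origin, data = Auto) # horsepower + origin + horsepower:origin
summary(one_interaction)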
(f). transformations of the variables
I picked horsepower first because question 8 already suggested that horsepower needs a transformation: its diagnostic plots left room for improvement.
horsepower_try <- lm(mpg~log(horsepower), data=Auto)
summary(horsepower_try)
##
## Call:
## lm(formula = mpg ~ log(horsepower), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.2299 -2.7818 -0.2322 2.6661 15.4695
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 108.6997 3.0496 35.64 <2e-16 ***
## log(horsepower) -18.5822 0.6629 -28.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.501 on 390 degrees of freedom
## Multiple R-squared: 0.6683, Adjusted R-squared: 0.6675
## F-statistic: 785.9 on 1 and 390 DF, p-value: < 2.2e-16
plot(horsepower_try)
horsepower_try2 <- lm(mpg~sqrt(horsepower), data=Auto)
summary(horsepower_try2)
##
## Call:
## lm(formula = mpg ~ sqrt(horsepower), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.9768 -3.2239 -0.2252 2.6881 16.1411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.705 1.349 43.52 <2e-16 ***
## sqrt(horsepower) -3.503 0.132 -26.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.665 on 390 degrees of freedom
## Multiple R-squared: 0.6437, Adjusted R-squared: 0.6428
## F-statistic: 704.6 on 1 and 390 DF, p-value: < 2.2e-16
plot(horsepower_try2)
horsepower_try3 <- lm(mpg~I(horsepower^2), data=Auto)
summary(horsepower_try3)
##
## Call:
## lm(formula = mpg ~ I(horsepower^2), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.529 -3.798 -1.049 3.240 18.528
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.047e+01 4.466e-01 68.22 <2e-16 ***
## I(horsepower^2) -5.665e-04 2.827e-05 -20.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.485 on 390 degrees of freedom
## Multiple R-squared: 0.5074, Adjusted R-squared: 0.5061
## F-statistic: 401.7 on 1 and 390 DF, p-value: < 2.2e-16
plot(horsepower_try3)
Interpretation: none of log(horsepower), sqrt(horsepower) or horsepower^2 fully fixes the problem: the residuals vs. fitted plots still show a pattern, although log(horsepower) gives the best fit of the three (R^2 = 0.6683 vs. 0.6059 for the untransformed model).
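Since the leftover pattern is a curve, a quadratic polynomial in horsepower (which keeps the linear term, unlike the horsepower^2-only fit above) is a natural next attempt; a hedged sketch:

horsepower_poly <- lm(mpg ~ poly(horsepower, 2), data = Auto) # orthogonal linear + quadratic terms
summary(horsepower_poly)$r.squared # should come out higher than the 0.6059 of the plain linear fit
plot(horsepower_poly) # check whether the residual curvature is gone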
Let's try another variable: weight
#first plot the original version without any transformation
weights_original <- lm(mpg~weight, data=Auto)
summary(weights_original)
##
## Call:
## lm(formula = mpg ~ weight, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.9736 -2.7556 -0.3358 2.1379 16.5194
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.216524 0.798673 57.87 <2e-16 ***
## weight -0.007647 0.000258 -29.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.333 on 390 degrees of freedom
## Multiple R-squared: 0.6926, Adjusted R-squared: 0.6918
## F-statistic: 878.8 on 1 and 390 DF, p-value: < 2.2e-16
plot(weights_original)
The residuals vs. fitted diagnostic plot again shows a fan shape, meaning a plain linear term in weight is not well suited. Now let's explore the transformations:
weight_try <- lm(mpg~log(weight), data=Auto)
summary(weight_try)
##
## Call:
## lm(formula = mpg ~ log(weight), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.4315 -2.6752 -0.2888 1.9429 16.0136
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 209.9433 6.0002 34.99 <2e-16 ***
## log(weight) -23.4317 0.7534 -31.10 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.189 on 390 degrees of freedom
## Multiple R-squared: 0.7127, Adjusted R-squared: 0.7119
## F-statistic: 967.3 on 1 and 390 DF, p-value: < 2.2e-16
plot(weight_try)
weight_try2 <- lm(mpg~sqrt(weight), data=Auto)
summary(weight_try2)
##
## Call:
## lm(formula = mpg ~ sqrt(weight), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.2402 -2.9005 -0.3708 2.0791 16.2296
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.67218 1.52649 45.64 <2e-16 ***
## sqrt(weight) -0.85560 0.02797 -30.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.239 on 390 degrees of freedom
## Multiple R-squared: 0.7058, Adjusted R-squared: 0.705
## F-statistic: 935.4 on 1 and 390 DF, p-value: < 2.2e-16
plot(weight_try2)
weight_try3 <- lm(mpg~I(weight^2), data=Auto)
summary(weight_try3)
##
## Call:
## lm(formula = mpg ~ I(weight^2), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.2813 -3.1744 -0.4708 2.2708 17.2506
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.447e+01 4.708e-01 73.22 <2e-16 ***
## I(weight^2) -1.150e-06 4.266e-08 -26.96 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.619 on 390 degrees of freedom
## Multiple R-squared: 0.6507, Adjusted R-squared: 0.6498
## F-statistic: 726.6 on 1 and 390 DF, p-value: < 2.2e-16
plot(weight_try3)
Only sqrt(weight) mitigated the issue: its residuals vs. fitted plot no longer shows a clear fan pattern, though one is still faintly detectable on close inspection. This suggests sqrt(weight) is on the right track toward the right transformation of the variable.
For both variables, squaring produced greater curvature in the residuals vs. fitted plot, so variable^2 is not a good transformation in this case.
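The four weight fits can also be ranked numerically rather than only visually, e.g. by AIC (lower is better); since all four share the same untransformed response, the comparison is fair:

AIC(weights_original, weight_try, weight_try2, weight_try3) # one row per model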
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
ques10_MLR <- lm(Sales~Price+Urban+US, data=Carseats)
summary(ques10_MLR)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
(b). Interpretation of each coefficient: Price (-0.054459): holding Urban and US fixed, each $1 increase in price is associated with a decrease of about 0.0545 in Sales, i.e. roughly 54 fewer unit sales (Sales is recorded in thousands of units). UrbanYes (-0.021916): urban stores sell about 22 fewer units on average than non-urban stores, an effect that is not statistically significant (p = 0.936). USYes (1.200573): US stores sell about 1,201 more units on average than non-US stores, holding the other predictors fixed.
(c). Write out the model in equation form. Let X1 = Price, X2 = 1 if Urban (0 if not), X3 = 1 if US (0 if not). Then E(Y) = 13.043469 - 0.054459*X1 - 0.021916*X2 + 1.200573*X3.
(d). The predictors for which we can reject the null hypothesis H0: beta_j = 0 are Price and USYes. Both have significant p-values, so we reject the null hypothesis that the coefficient is 0 (i.e. no effect on the response) and conclude that each of these variables has a significant effect on the response.
(e). smaller model fitting
ques10_nested <- lm(Sales~Price+US, data= Carseats)
summary(ques10_nested)
##
## Call:
## lm(formula = Sales ~ Price + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
(f). Neither the model in (a) nor the one in (e) fits the data especially well, since both have a low R^2 (about 0.24). In other words, although the p-values for the individual variables are statistically significant (p < 0.05), the predictors explain only a small share of the variability in Sales. Likely reasons include important predictors being left out of the model, and relationships between predictors and the response that are nonlinear, which a linear regression with these few variables cannot capture.
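Since the model in (e) is nested inside the model in (a), the two can also be compared formally with an F-test; a sketch using base R's anova() (its p-value should match the t-test for UrbanYes, about 0.94):

anova(ques10_nested, ques10_MLR) # tests whether adding Urban improves the fit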
(g). Using the model from (e), obtain 95 % confidence intervals for the coefficient(s).
# unlike predict() above, this is not for predicting individual points:
# confint() gives 95% confidence intervals for the coefficients
conf_intervals <- confint(ques10_nested, level = 0.95)
print(conf_intervals)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
(h). Is there evidence of outliers or high leverage observations in the model from (e)?
plot(ques10_nested)
- Based on the residuals vs. fitted plot, the pattern is random. There do not seem to be outliers, which would appear as points with large residuals.
- The normality assumption is not violated, since the points in the Q-Q plot lie very close to the diagonal line.
- The scale-location plot also shows a random pattern.
- Looking at the x axis of the residuals vs. leverage plot, one point has leverage above 0.04, noticeably larger than the majority of points on the left, so there is some evidence of a single high-leverage observation.
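These visual impressions can be checked numerically; a sketch using the usual rules of thumb (|studentized residual| > 3 suggests an outlier, leverage above 2(p+1)/n suggests high leverage):

stud_res <- rstudent(ques10_nested) # studentized residuals
lev <- hatvalues(ques10_nested) # leverages
which(abs(stud_res) > 3) # candidate outliers
which(lev > 2 * length(coef(ques10_nested)) / nrow(Carseats)) # high-leverage observations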
library(ISLR2)
str(Boston)
## 'data.frame': 506 obs. of 13 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
#writing each regression out by hand would be too slow --> use a for loop
#first create an empty list to store model summaries
model_summaries <- list()
predictors <- colnames(Boston)[1:12] # exclude the response variable "medv" (column 13)
for (i in predictors) {
lm_model <- lm(medv ~ get(i), data = Boston) # fit an SLR of medv on predictor i
model_summaries[[i]] <- summary(lm_model) # store the summary, keyed by the predictor's name
cat("Summary for predictor:", i, "\n") # print a labelled header
print(model_summaries[[i]])
#scatter plot
plot(Boston[, i], Boston$medv, xlab = i, ylab = "medv", main = paste("Scatterplot of", i, "vs. medv"))
abline(lm_model, col = "red")
cat("\n")
}
## Summary for predictor: crim
##
## Call:
## lm(formula = medv ~ get(i), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.957 -5.449 -2.007 2.512 29.800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.03311 0.40914 58.74 <2e-16 ***
## get(i) -0.41519 0.04389 -9.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.484 on 504 degrees of freedom
## Multiple R-squared: 0.1508, Adjusted R-squared: 0.1491
## F-statistic: 89.49 on 1 and 504 DF, p-value: < 2.2e-16
##
## Summary for predictor: zn
##
## Call:
## lm(formula = medv ~ get(i), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.918 -5.518 -1.006 2.757 29.082
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.91758 0.42474 49.248 <2e-16 ***
## get(i) 0.14214 0.01638 8.675 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.587 on 504 degrees of freedom
## Multiple R-squared: 0.1299, Adjusted R-squared: 0.1282
## F-statistic: 75.26 on 1 and 504 DF, p-value: < 2.2e-16
##
## Summary for predictor: indus
##
## Call:
## lm(formula = medv ~ get(i), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.017 -4.917 -1.457 3.180 32.943
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.75490 0.68345 43.54 <2e-16 ***
## get(i) -0.64849 0.05226 -12.41 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.057 on 504 degrees of freedom
## Multiple R-squared: 0.234, Adjusted R-squared: 0.2325
## F-statistic: 154 on 1 and 504 DF, p-value: < 2.2e-16
##
## Summary for predictor: chas
##
## Call:
## lm(formula = medv ~ get(i), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.094 -5.894 -1.417 2.856 27.906
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.0938 0.4176 52.902 < 2e-16 ***
## get(i) 6.3462 1.5880 3.996 7.39e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.064 on 504 degrees of freedom
## Multiple R-squared: 0.03072, Adjusted R-squared: 0.02879
## F-statistic: 15.97 on 1 and 504 DF, p-value: 7.391e-05
##
## Summary for predictor: nox
##
## Call:
## lm(formula = medv ~ get(i), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.691 -5.121 -2.161 2.959 31.310
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.346 1.811 22.83 <2e-16 ***
## get(i) -33.916 3.196 -10.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.323 on 504 degrees of freedom
## Multiple R-squared: 0.1826, Adjusted R-squared: 0.181
## F-statistic: 112.6 on 1 and 504 DF, p-value: < 2.2e-16
##
## Summary for predictor: rm
##
## Call:
## lm(formula = medv ~ get(i), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.346 -2.547 0.090 2.986 39.433
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -34.671 2.650 -13.08 <2e-16 ***
## get(i) 9.102 0.419 21.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.616 on 504 degrees of freedom
## Multiple R-squared: 0.4835, Adjusted R-squared: 0.4825
## F-statistic: 471.8 on 1 and 504 DF, p-value: < 2.2e-16
##
## Summary for predictor: age
##
## Call:
## lm(formula = medv ~ get(i), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.097 -5.138 -1.958 2.397 31.338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.97868 0.99911 31.006 <2e-16 ***
## get(i) -0.12316 0.01348 -9.137 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.527 on 504 degrees of freedom
## Multiple R-squared: 0.1421, Adjusted R-squared: 0.1404
## F-statistic: 83.48 on 1 and 504 DF, p-value: < 2.2e-16
##
## Summary for predictor: dis
##
## Call:
## lm(formula = medv ~ get(i), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.016 -5.556 -1.865 2.288 30.377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.3901 0.8174 22.499 < 2e-16 ***
## get(i) 1.0916 0.1884 5.795 1.21e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.914 on 504 degrees of freedom
## Multiple R-squared: 0.06246, Adjusted R-squared: 0.0606
## F-statistic: 33.58 on 1 and 504 DF, p-value: 1.207e-08
##
## Summary for predictor: rad
##
## Call:
## lm(formula = medv ~ get(i), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.770 -5.199 -1.967 3.321 33.292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.38213 0.56176 46.964 <2e-16 ***
## get(i) -0.40310 0.04349 -9.269 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.509 on 504 degrees of freedom
## Multiple R-squared: 0.1456, Adjusted R-squared: 0.1439
## F-statistic: 85.91 on 1 and 504 DF, p-value: < 2.2e-16
##
## Summary for predictor: tax
##
## Call:
## lm(formula = medv ~ get(i), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.091 -5.173 -2.085 3.158 34.058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.970654 0.948296 34.77 <2e-16 ***
## get(i) -0.025568 0.002147 -11.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.133 on 504 degrees of freedom
## Multiple R-squared: 0.2195, Adjusted R-squared: 0.218
## F-statistic: 141.8 on 1 and 504 DF, p-value: < 2.2e-16
##
## Summary for predictor: ptratio
##
## Call:
## lm(formula = medv ~ get(i), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.8342 -4.8262 -0.6426 3.1571 31.2303
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.345 3.029 20.58 <2e-16 ***
## get(i) -2.157 0.163 -13.23 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.931 on 504 degrees of freedom
## Multiple R-squared: 0.2578, Adjusted R-squared: 0.2564
## F-statistic: 175.1 on 1 and 504 DF, p-value: < 2.2e-16
##
## Summary for predictor: lstat
##
## Call:
## lm(formula = medv ~ get(i), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## get(i) -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
After running an SLR for each predictor, every predictor is significant: each p-value is well below 0.05, so medv is associated with each explanatory variable individually. However, most of the R^2 values are very low (under 0.3); only lstat exceeds 0.5 (R^2 = 0.5441), with rm next at 0.4835. This points to potential problems such as nonlinear relationships.
Based on the plots, most show a weak positive or negative association; the shapes vary, and most are hard to describe with a straight-line relationship.
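The R^2 values can be pulled from the stored summaries to see at a glance which predictors explain the most variance; a small sketch using the model_summaries list built above:

r2_values <- sapply(model_summaries, function(s) s$r.squared) # one R^2 per predictor
sort(r2_values, decreasing = TRUE) # lstat (0.544) and rm (0.484) lead the list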
(b).
ques15_MLR <- lm(medv~., data=Boston)
summary(ques15_MLR)
##
## Call:
## lm(formula = medv ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1304 -2.7673 -0.5814 1.9414 26.2526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.617270 4.936039 8.431 3.79e-16 ***
## crim -0.121389 0.033000 -3.678 0.000261 ***
## zn 0.046963 0.013879 3.384 0.000772 ***
## indus 0.013468 0.062145 0.217 0.828520
## chas 2.839993 0.870007 3.264 0.001173 **
## nox -18.758022 3.851355 -4.870 1.50e-06 ***
## rm 3.658119 0.420246 8.705 < 2e-16 ***
## age 0.003611 0.013329 0.271 0.786595
## dis -1.490754 0.201623 -7.394 6.17e-13 ***
## rad 0.289405 0.066908 4.325 1.84e-05 ***
## tax -0.012682 0.003801 -3.337 0.000912 ***
## ptratio -0.937533 0.132206 -7.091 4.63e-12 ***
## lstat -0.552019 0.050659 -10.897 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.798 on 493 degrees of freedom
## Multiple R-squared: 0.7343, Adjusted R-squared: 0.7278
## F-statistic: 113.5 on 12 and 493 DF, p-value: < 2.2e-16
(c). Comparison of (a) and (b)
The multiple linear regression in (b) has a substantially higher R^2 than the simple linear regressions in (a). This is expected: the predictors jointly explain more of the variance in medv than any single predictor can. Also, because several predictors are correlated with one another (multicollinearity) and with the response, some variance that appears attributable to one variable alone is shared across variables in the joint fit.
#Extract coefficients from SLR (a)
coefficients_SLR <- sapply(predictors, function(predictor) {
summary(lm(medv~get(predictor), data = Boston))$coef[2, 1]
})
#Extract coefficients from the MLR (b)
coefficients_MLR <- coef(ques15_MLR)[-1] # Exclude intercept
plot(coefficients_SLR, coefficients_MLR,
xlab = "SLR Coefficients",
ylab = "MLR Coefficients",
main = "Comparison of Coefficients",
pch = 19, col = "blue", ylim = c(-20, 5))
# Add labels for predictor names
text(coefficients_SLR, coefficients_MLR, labels = predictors, pos = 3)
The graph shows that most coefficients from the simple and multiple regressions cluster around 0, with one striking outlier, nox (nitrogen oxides concentration, parts per 10 million), which has a large negative coefficient, and two other variables, chas and rm, with positive coefficients. Overall the two sets of estimates broadly agree, but the simple regression coefficients tend to be larger in magnitude than the multiple regression ones: when only one variable at a time is paired with the response, it absorbs the effects of correlated variables left out of the model, so its estimated effect looks greater.
(d).
final_ques <- list()
predictors <- colnames(Boston)[-13] # exclude the response variable "medv"
# here "predictor" plays the role of "i" in the previous loop
for (predictor in predictors) {
formula <- as.formula(paste("medv ~", predictor, "+ I(", predictor, "^2) + I(", predictor, "^3)"))
# paste() concatenates the pieces into a single formula string;
# building the formula as text first lets lm() be called generically inside the loop
polynomial_model <- lm(formula, data = Boston) # fit the cubic polynomial regression
final_ques[[predictor]] <- summary(polynomial_model) # store the summary in the list
cat("Summary for predictor:", predictor, "\n")
print(final_ques[[predictor]])
cat("\n")
}
## Summary for predictor: crim
##
## Call:
## lm(formula = formula, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.983 -4.975 -1.940 2.881 33.391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.519e+01 4.355e-01 57.846 < 2e-16 ***
## crim -1.136e+00 1.444e-01 -7.868 2.24e-14 ***
## I(crim^2) 2.378e-02 6.808e-03 3.494 0.000518 ***
## I(crim^3) -1.489e-04 6.641e-05 -2.242 0.025411 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.159 on 502 degrees of freedom
## Multiple R-squared: 0.2177, Adjusted R-squared: 0.213
## F-statistic: 46.57 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## Summary for predictor: zn
##
## Call:
## lm(formula = formula, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.449 -5.549 -1.049 3.225 29.551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.4485972 0.4359536 46.905 < 2e-16 ***
## zn 0.6433652 0.1105611 5.819 1.06e-08 ***
## I(zn^2) -0.0167646 0.0038872 -4.313 1.94e-05 ***
## I(zn^3) 0.0001257 0.0000316 3.978 7.98e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.43 on 502 degrees of freedom
## Multiple R-squared: 0.1649, Adjusted R-squared: 0.1599
## F-statistic: 33.05 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## Summary for predictor: indus
##
## Call:
## lm(formula = formula, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.760 -4.725 -1.009 2.932 32.038
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.080160 1.663326 22.293 < 2e-16 ***
## indus -2.806994 0.509349 -5.511 5.71e-08 ***
## I(indus^2) 0.140462 0.041554 3.380 0.000781 ***
## I(indus^3) -0.002399 0.001011 -2.373 0.018026 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.844 on 502 degrees of freedom
## Multiple R-squared: 0.2768, Adjusted R-squared: 0.2725
## F-statistic: 64.06 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## Summary for predictor: chas
##
## Call:
## lm(formula = formula, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.094 -5.894 -1.417 2.856 27.906
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.0938 0.4176 52.902 < 2e-16 ***
## chas 6.3462 1.5880 3.996 7.39e-05 ***
## I(chas^2) NA NA NA NA
## I(chas^3) NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.064 on 504 degrees of freedom
## Multiple R-squared: 0.03072, Adjusted R-squared: 0.02879
## F-statistic: 15.97 on 1 and 504 DF, p-value: 7.391e-05
##
##
## Summary for predictor: nox
##
## Call:
## lm(formula = formula, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.104 -5.020 -2.144 2.747 32.416
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -22.49 38.52 -0.584 0.5596
## nox 315.10 195.10 1.615 0.1069
## I(nox^2) -615.83 320.48 -1.922 0.0552 .
## I(nox^3) 350.19 170.92 2.049 0.0410 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.282 on 502 degrees of freedom
## Multiple R-squared: 0.1939, Adjusted R-squared: 0.189
## F-statistic: 40.24 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## Summary for predictor: rm
##
## Call:
## lm(formula = formula, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.102 -2.674 0.569 3.011 35.911
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 241.3108 47.3275 5.099 4.85e-07 ***
## rm -109.3906 22.9690 -4.763 2.51e-06 ***
## I(rm^2) 16.4910 3.6750 4.487 8.95e-06 ***
## I(rm^3) -0.7404 0.1935 -3.827 0.000146 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.11 on 502 degrees of freedom
## Multiple R-squared: 0.5612, Adjusted R-squared: 0.5586
## F-statistic: 214 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## Summary for predictor: age
##
## Call:
## lm(formula = formula, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.443 -4.909 -2.234 2.185 32.944
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.893e+01 2.992e+00 9.668 <2e-16 ***
## age -1.224e-01 2.014e-01 -0.608 0.544
## I(age^2) 2.355e-03 3.930e-03 0.599 0.549
## I(age^3) -2.318e-05 2.279e-05 -1.017 0.310
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.472 on 502 degrees of freedom
## Multiple R-squared: 0.1566, Adjusted R-squared: 0.1515
## F-statistic: 31.06 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## Summary for predictor: dis
##
## Call:
## lm(formula = formula, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.571 -5.242 -2.037 2.397 34.769
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.03789 2.91134 2.417 0.01599 *
## dis 8.59284 2.06633 4.158 3.77e-05 ***
## I(dis^2) -1.24953 0.41235 -3.030 0.00257 **
## I(dis^3) 0.05602 0.02428 2.307 0.02146 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.727 on 502 degrees of freedom
## Multiple R-squared: 0.105, Adjusted R-squared: 0.09968
## F-statistic: 19.64 on 3 and 502 DF, p-value: 4.736e-12
##
##
## Summary for predictor: rad
##
## Call:
## lm(formula = formula, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.630 -5.151 -2.017 3.169 33.594
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.251303 2.567860 11.781 < 2e-16 ***
## rad -3.799454 1.307156 -2.907 0.003815 **
## I(rad^2) 0.616347 0.186057 3.313 0.000991 ***
## I(rad^3) -0.020086 0.005717 -3.514 0.000482 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.37 on 502 degrees of freedom
## Multiple R-squared: 0.1767, Adjusted R-squared: 0.1718
## F-statistic: 35.91 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## Summary for predictor: tax
##
## Call:
## lm(formula = formula, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.109 -4.952 -1.878 2.957 33.694
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.222e+01 1.397e+01 3.739 0.000206 ***
## tax -1.635e-01 1.133e-01 -1.443 0.149646
## I(tax^2) 3.029e-04 2.872e-04 1.055 0.292004
## I(tax^3) -2.079e-07 2.236e-07 -0.930 0.353061
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.115 on 502 degrees of freedom
## Multiple R-squared: 0.2261, Adjusted R-squared: 0.2215
## F-statistic: 48.89 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## Summary for predictor: ptratio
##
## Call:
## lm(formula = formula, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.7795 -5.0364 -0.9778 3.4766 31.1636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 312.28642 152.48693 2.048 0.0411 *
## ptratio -48.69114 26.88441 -1.811 0.0707 .
## I(ptratio^2) 2.83995 1.56413 1.816 0.0700 .
## I(ptratio^3) -0.05686 0.03005 -1.892 0.0590 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.898 on 502 degrees of freedom
## Multiple R-squared: 0.2669, Adjusted R-squared: 0.2625
## F-statistic: 60.91 on 3 and 502 DF, p-value: < 2.2e-16
##
##
## Summary for predictor: lstat
##
## Call:
## lm(formula = formula, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5441 -3.7122 -0.5145 2.4846 26.4153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.6496253 1.4347240 33.909 < 2e-16 ***
## lstat -3.8655928 0.3287861 -11.757 < 2e-16 ***
## I(lstat^2) 0.1487385 0.0212987 6.983 9.18e-12 ***
## I(lstat^3) -0.0020039 0.0003997 -5.013 7.43e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.396 on 502 degrees of freedom
## Multiple R-squared: 0.6578, Adjusted R-squared: 0.6558
## F-statistic: 321.7 on 3 and 502 DF, p-value: < 2.2e-16
I have neither given nor received unauthorized assistance on this test or assignment. - Lucy Liu