Part 2:
Q8:
setwd("C:/Users/peter/Desktop/UNC/Fall 2022/ECON 573/R Project/PS2_files")
auto_data <- read.csv("auto.csv")
auto_data$horsepower <- as.numeric(auto_data$horsepower)
## Warning: NAs introduced by coercion
auto_data <- na.omit(auto_data)
auto_data$name <- as.factor(auto_data$name)
summary(auto_data)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
lg <- lm(mpg ~ horsepower, auto_data)
summary(lg)
##
## Call:
## lm(formula = mpg ~ horsepower, data = auto_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
(a)Comment on the model: i. According to the model, we can see that the p-value is extremely small (<2e-16), indicating the it is significant and there is relationship between the predictor (horsepower) and the response (mpg). ii. According to the model, we can see that 61% variability of Y can be explained by using X by using R-Squared. iii. According to the model, we can see the coefficient for horsepower is -0.157845, indicating there is a negative relationship between horsepower and mpg. iV.
predict(lg, data.frame(horsepower = 98), interval = "confidence", level = 0.95)
## fit lwr upr
## 1 24.46708 23.97308 24.96108
predict(lg, data.frame(horsepower = 98), interval = "prediction", level = 0.95)
## fit lwr upr
## 1 24.46708 14.8094 34.12476
plot(mpg ~ horsepower, data = auto_data)
abline(lg)
(c)we can see the Residuals vs Fitted Graph indicating that the
relationship between response and predictor has curve and is not
completely linear. From the Normal Q-Q curve, we can see the data are
not fully normally distributed.
plot(lg)
Q9:
pairs(auto_data)
(b)
cor(auto_data[,1:8])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
mlg <- lm(mpg ~ cylinders + displacement + horsepower + weight + acceleration + year + origin, auto_data)
summary(mlg)
##
## Call:
## lm(formula = mpg ~ cylinders + displacement + horsepower + weight +
## acceleration + year + origin, data = auto_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
plot(mlg)
(e) Accoring to the model shown below with a few selected interaction
term, we can see that the interaction term between horsepower and
acceleration has small p-value, indicating this is statiscally
significant term.
mlg <- lm(mpg ~ cylinders + displacement + horsepower + weight + acceleration + year + origin + cylinders * displacement + cylinders * horsepower + cylinders * weight + cylinders * acceleration + cylinders * year + cylinders * origin + horsepower * acceleration, auto_data)
summary(mlg)
##
## Call:
## lm(formula = mpg ~ cylinders + displacement + horsepower + weight +
## acceleration + year + origin + cylinders * displacement +
## cylinders * horsepower + cylinders * weight + cylinders *
## acceleration + cylinders * year + cylinders * origin + horsepower *
## acceleration, data = auto_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.328 -1.505 0.060 1.370 12.450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.879e+01 1.488e+01 -2.607 0.009500 **
## cylinders 2.634e+00 2.664e+00 0.989 0.323362
## displacement 6.027e-03 2.371e-02 0.254 0.799505
## horsepower 1.692e-02 7.657e-02 0.221 0.825255
## weight -1.152e-02 2.454e-03 -4.696 3.71e-06 ***
## acceleration 1.865e-01 2.994e-01 0.623 0.533702
## year 1.385e+00 1.634e-01 8.473 5.44e-16 ***
## origin -1.749e+00 1.366e+00 -1.281 0.201116
## cylinders:displacement -1.479e-03 3.615e-03 -0.409 0.682580
## cylinders:horsepower 9.650e-03 8.328e-03 1.159 0.247278
## cylinders:weight 1.203e-03 3.695e-04 3.255 0.001236 **
## cylinders:acceleration 1.207e-01 6.306e-02 1.914 0.056379 .
## cylinders:year -1.247e-01 3.104e-02 -4.016 7.13e-05 ***
## cylinders:origin 5.784e-01 3.160e-01 1.830 0.068002 .
## horsepower:acceleration -9.349e-03 2.613e-03 -3.578 0.000392 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.822 on 377 degrees of freedom
## Multiple R-squared: 0.874, Adjusted R-squared: 0.8693
## F-statistic: 186.8 on 14 and 377 DF, p-value: < 2.2e-16
mlg2 <- lm(mpg ~ log(cylinders) + displacement + log(horsepower) + weight + acceleration + year + origin, auto_data)
mlg3 <- lm(mpg ~ cylinders + displacement + horsepower + I(weight^2) + I(acceleration^2) + year + origin, auto_data)
summary(mlg2)
##
## Call:
## lm(formula = mpg ~ log(cylinders) + displacement + log(horsepower) +
## weight + acceleration + year + origin, data = auto_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7914 -1.9401 -0.2182 1.8071 12.6684
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.3379798 8.7101473 3.483 0.000553 ***
## log(cylinders) -3.8170919 1.6129724 -2.366 0.018453 *
## displacement 0.0223897 0.0066897 3.347 0.000898 ***
## log(horsepower) -9.5694799 1.5339291 -6.239 1.17e-09 ***
## weight -0.0042423 0.0006907 -6.142 2.03e-09 ***
## acceleration -0.2813480 0.1035130 -2.718 0.006865 **
## year 0.7063620 0.0482667 14.635 < 2e-16 ***
## origin 1.4616357 0.2576401 5.673 2.76e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.167 on 384 degrees of freedom
## Multiple R-squared: 0.8383, Adjusted R-squared: 0.8353
## F-statistic: 284.3 on 7 and 384 DF, p-value: < 2.2e-16
summary(mlg3)
##
## Call:
## lm(formula = mpg ~ cylinders + displacement + horsepower + I(weight^2) +
## I(acceleration^2) + year + origin, data = auto_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0914 -2.3839 -0.1408 2.0034 13.3553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.106e+01 4.834e+00 -4.356 1.70e-05 ***
## cylinders -7.174e-01 3.480e-01 -2.061 0.0399 *
## displacement 9.773e-03 8.111e-03 1.205 0.2290
## horsepower -3.332e-02 1.437e-02 -2.319 0.0209 *
## I(weight^2) -6.296e-07 1.016e-07 -6.195 1.50e-09 ***
## I(acceleration^2) -5.836e-04 3.032e-03 -0.192 0.8475
## year 7.035e-01 5.464e-02 12.876 < 2e-16 ***
## origin 1.734e+00 2.988e-01 5.804 1.36e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.593 on 384 degrees of freedom
## Multiple R-squared: 0.7918, Adjusted R-squared: 0.788
## F-statistic: 208.7 on 7 and 384 DF, p-value: < 2.2e-16
Q13: (a)
set.seed(1)
x <- rnorm(100, 0, 1)
eps <- rnorm(100, 0, 0.25)
beta0 = -1 beta1 = 0.5
y <- -1 +0.5*x + eps
scatter_plot_xy <- plot(x,y)
(e) comment: beta0: -0.9931 beta1: 0.4866
They beta hat are very close to the actual ones.
lg_yx <- lm(y~x)
summary(lg_yx)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46921 -0.15344 -0.03487 0.13485 0.58654
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.00942 0.02425 -41.63 <2e-16 ***
## x 0.49973 0.02693 18.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2407 on 98 degrees of freedom
## Multiple R-squared: 0.7784, Adjusted R-squared: 0.7762
## F-statistic: 344.3 on 1 and 98 DF, p-value: < 2.2e-16
plot(x,y)
abline(-1, 0.5, col ="blue" )
abline(lg_yx, col = "red")
legend("topleft", legend = c("actual","model"), col = c("blue", "red"))
(g)The quadratic term does improve the model fit. As we can see in the
previous model, the adjusted R-squared is 0.7627 and R-squared is
0.7651. After adding the quadratic term, we can see that the adjusted
R-squared 0.7654 and R-squared is 0.7702.
x2 <- x^2
yxx2_lg <- lm(y ~ x+x2)
summary(yxx2_lg)
##
## Call:
## lm(formula = y ~ x + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4913 -0.1563 -0.0322 0.1451 0.5675
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.98582 0.02941 -33.516 <2e-16 ***
## x 0.50429 0.02700 18.680 <2e-16 ***
## x2 -0.02973 0.02119 -1.403 0.164
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2395 on 97 degrees of freedom
## Multiple R-squared: 0.7828, Adjusted R-squared: 0.7784
## F-statistic: 174.8 on 2 and 97 DF, p-value: < 2.2e-16
set.seed(1)
x2 <- rnorm(100, 0, 1)
eps2 <- rnorm(100, 0, 0.1)
y2 <- -1 +0.5*x2 + eps2
plot(x2,y2)
lg_yx2 <- lm(y2~x2)
summary(lg_yx2)
##
## Call:
## lm(formula = y2 ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18768 -0.06138 -0.01395 0.05394 0.23462
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.003769 0.009699 -103.5 <2e-16 ***
## x2 0.499894 0.010773 46.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09628 on 98 degrees of freedom
## Multiple R-squared: 0.9565, Adjusted R-squared: 0.956
## F-statistic: 2153 on 1 and 98 DF, p-value: < 2.2e-16
abline(lg_yx2)
(i) The model fit decrease as we increase the variance of the normal
distribution used to generate the error term (increase the noise). The
reason is that we can see the new R-squared is 0.4674 and adjusted
R-squared is 0.4619 which is higher than that from previous model with
quadratic term (R-squared: 0.7702 and Adjusted R-squared: 0.7654)
set.seed(1)
x3 <- rnorm(100, 0, 1)
eps3 <- rnorm(100, 0, 0.5)
y3 <- -1 +0.5*x3 + eps3
plot(x3,y3)
lg_yx3 <- lm(y3~x3)
summary(lg_yx3)
##
## Call:
## lm(formula = y3 ~ x3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93842 -0.30688 -0.06975 0.26970 1.17309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.01885 0.04849 -21.010 < 2e-16 ***
## x3 0.49947 0.05386 9.273 4.58e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4814 on 98 degrees of freedom
## Multiple R-squared: 0.4674, Adjusted R-squared: 0.4619
## F-statistic: 85.99 on 1 and 98 DF, p-value: 4.583e-15
abline(lg_yx3)
(j) According to the confidence interval below, we can see the 95%
confidence interval for beta0 in the original model is [-1.044637,
-0.9415372] and for beta1 in the original model is [0.432583,
0.5406738]. For noisier model, the 95% confidence interval for beta0 is
[-1.1150804, -0.9226122] and beta1 is [0.3925794, 0.6063602]. For less
noisy model, the 95% confidence interval for beta0 is [-1.0230161,
-0.9845224] and bets 1 is [0.4785159, 0.5212720]. We can see that
confidence interval is narrowest for less noisy model as the variance is
smaller, we would only need a smaller interval for it. However, the
confidence interval is widest for nosiser model as the variance is
bigger, we would need a bigger interval for it.
confint(lg_yx)
## 2.5 % 97.5 %
## (Intercept) -1.0575402 -0.9613061
## x 0.4462897 0.5531801
confint(lg_yx2)
## 2.5 % 97.5 %
## (Intercept) -1.0230161 -0.9845224
## x2 0.4785159 0.5212720
confint(lg_yx3)
## 2.5 % 97.5 %
## (Intercept) -1.1150804 -0.9226122
## x3 0.3925794 0.6063602
Q15: (a) lm1:Significant lm2:Significant lm3:Not Significant lm4:Significant lm5:Significant lm6:Significant lm7:Significant lm8:Significant lm9:Significant lm10:Significant lm11:Significant lm12:Significant
library(ISLR2)
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
lm1 <- lm(crim~zn, data = Boston)
lm2 <- lm(crim~indus, data = Boston)
lm3 <- lm(crim~chas, data = Boston)
lm4 <- lm(crim~nox, data = Boston)
lm5 <- lm(crim~rm, data = Boston)
lm6 <- lm(crim~age, data = Boston)
lm7 <- lm(crim~dis, data = Boston)
lm8 <- lm(crim~rad, data = Boston)
lm9 <- lm(crim~tax, data = Boston)
lm10 <- lm(crim~ptratio, data = Boston)
lm11 <- lm(crim~lstat, data = Boston)
lm12 <- lm(crim~medv, data = Boston)
summary(lm1)
##
## Call:
## lm(formula = crim ~ zn, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.429 -4.222 -2.620 1.250 84.523
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.45369 0.41722 10.675 < 2e-16 ***
## zn -0.07393 0.01609 -4.594 5.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.435 on 504 degrees of freedom
## Multiple R-squared: 0.04019, Adjusted R-squared: 0.03828
## F-statistic: 21.1 on 1 and 504 DF, p-value: 5.506e-06
summary(lm2)
##
## Call:
## lm(formula = crim ~ indus, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.972 -2.698 -0.736 0.712 81.813
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.06374 0.66723 -3.093 0.00209 **
## indus 0.50978 0.05102 9.991 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.866 on 504 degrees of freedom
## Multiple R-squared: 0.1653, Adjusted R-squared: 0.1637
## F-statistic: 99.82 on 1 and 504 DF, p-value: < 2.2e-16
summary(lm3)
##
## Call:
## lm(formula = crim ~ chas, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.738 -3.661 -3.435 0.018 85.232
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.7444 0.3961 9.453 <2e-16 ***
## chas -1.8928 1.5061 -1.257 0.209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.597 on 504 degrees of freedom
## Multiple R-squared: 0.003124, Adjusted R-squared: 0.001146
## F-statistic: 1.579 on 1 and 504 DF, p-value: 0.2094
summary(lm4)
##
## Call:
## lm(formula = crim ~ nox, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.371 -2.738 -0.974 0.559 81.728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13.720 1.699 -8.073 5.08e-15 ***
## nox 31.249 2.999 10.419 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.81 on 504 degrees of freedom
## Multiple R-squared: 0.1772, Adjusted R-squared: 0.1756
## F-statistic: 108.6 on 1 and 504 DF, p-value: < 2.2e-16
summary(lm5)
##
## Call:
## lm(formula = crim ~ rm, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.604 -3.952 -2.654 0.989 87.197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.482 3.365 6.088 2.27e-09 ***
## rm -2.684 0.532 -5.045 6.35e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.401 on 504 degrees of freedom
## Multiple R-squared: 0.04807, Adjusted R-squared: 0.04618
## F-statistic: 25.45 on 1 and 504 DF, p-value: 6.347e-07
summary(lm6)
##
## Call:
## lm(formula = crim ~ age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.789 -4.257 -1.230 1.527 82.849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.77791 0.94398 -4.002 7.22e-05 ***
## age 0.10779 0.01274 8.463 2.85e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.057 on 504 degrees of freedom
## Multiple R-squared: 0.1244, Adjusted R-squared: 0.1227
## F-statistic: 71.62 on 1 and 504 DF, p-value: 2.855e-16
summary(lm7)
##
## Call:
## lm(formula = crim ~ dis, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.708 -4.134 -1.527 1.516 81.674
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.4993 0.7304 13.006 <2e-16 ***
## dis -1.5509 0.1683 -9.213 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.965 on 504 degrees of freedom
## Multiple R-squared: 0.1441, Adjusted R-squared: 0.1425
## F-statistic: 84.89 on 1 and 504 DF, p-value: < 2.2e-16
summary(lm8)
##
## Call:
## lm(formula = crim ~ rad, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.164 -1.381 -0.141 0.660 76.433
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.28716 0.44348 -5.157 3.61e-07 ***
## rad 0.61791 0.03433 17.998 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.718 on 504 degrees of freedom
## Multiple R-squared: 0.3913, Adjusted R-squared: 0.39
## F-statistic: 323.9 on 1 and 504 DF, p-value: < 2.2e-16
summary(lm9)
##
## Call:
## lm(formula = crim ~ tax, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.513 -2.738 -0.194 1.065 77.696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.528369 0.815809 -10.45 <2e-16 ***
## tax 0.029742 0.001847 16.10 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.997 on 504 degrees of freedom
## Multiple R-squared: 0.3396, Adjusted R-squared: 0.3383
## F-statistic: 259.2 on 1 and 504 DF, p-value: < 2.2e-16
summary(lm10)
##
## Call:
## lm(formula = crim ~ ptratio, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.654 -3.985 -1.912 1.825 83.353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.6469 3.1473 -5.607 3.40e-08 ***
## ptratio 1.1520 0.1694 6.801 2.94e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.24 on 504 degrees of freedom
## Multiple R-squared: 0.08407, Adjusted R-squared: 0.08225
## F-statistic: 46.26 on 1 and 504 DF, p-value: 2.943e-11
summary(lm11)
##
## Call:
## lm(formula = crim ~ lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.925 -2.822 -0.664 1.079 82.862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.33054 0.69376 -4.801 2.09e-06 ***
## lstat 0.54880 0.04776 11.491 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.664 on 504 degrees of freedom
## Multiple R-squared: 0.2076, Adjusted R-squared: 0.206
## F-statistic: 132 on 1 and 504 DF, p-value: < 2.2e-16
summary(lm12)
##
## Call:
## lm(formula = crim ~ medv, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.071 -4.022 -2.343 1.298 80.957
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.79654 0.93419 12.63 <2e-16 ***
## medv -0.36316 0.03839 -9.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.934 on 504 degrees of freedom
## Multiple R-squared: 0.1508, Adjusted R-squared: 0.1491
## F-statistic: 89.49 on 1 and 504 DF, p-value: < 2.2e-16
plot(lm3)
plot(lm1)
plot(lm5)
(b) We can see that we can reject the following predictors using
p-values: zn, dis, rad, medv.
lm_all <- lm(crim~., data = Boston)
summary(lm_all)
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.534 -2.248 -0.348 1.087 73.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.7783938 7.0818258 1.946 0.052271 .
## zn 0.0457100 0.0187903 2.433 0.015344 *
## indus -0.0583501 0.0836351 -0.698 0.485709
## chas -0.8253776 1.1833963 -0.697 0.485841
## nox -9.9575865 5.2898242 -1.882 0.060370 .
## rm 0.6289107 0.6070924 1.036 0.300738
## age -0.0008483 0.0179482 -0.047 0.962323
## dis -1.0122467 0.2824676 -3.584 0.000373 ***
## rad 0.6124653 0.0875358 6.997 8.59e-12 ***
## tax -0.0037756 0.0051723 -0.730 0.465757
## ptratio -0.3040728 0.1863598 -1.632 0.103393
## lstat 0.1388006 0.0757213 1.833 0.067398 .
## medv -0.2200564 0.0598240 -3.678 0.000261 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared: 0.4493, Adjusted R-squared: 0.4359
## F-statistic: 33.52 on 12 and 493 DF, p-value: < 2.2e-16
x <- c(
coefficients(lm1)[2],
coefficients(lm2)[2],
coefficients(lm3)[2],
coefficients(lm4)[2],
coefficients(lm5)[2],
coefficients(lm6)[2],
coefficients(lm7)[2],
coefficients(lm8)[2],
coefficients(lm9)[2],
coefficients(lm10)[2],
coefficients(lm11)[2],
coefficients(lm12)[2]
)
y <- coefficients(lm_all)[2:13]
plot(x,y)
(d) lm_poly1: zn significant lm_poly2: all significant lm_poly3: all not
significant lm_poly4: all significant lm_poly5: all not significant
lm_poly6: age2 and age 3 significant lm_poly7: all significant lm_poly8:
all not significant lm_poly9: all not significant lm_poly10: all
significant lm_poly11: all not significant lm_poly12: all
significant
lm_poly1 <- lm(crim ~ zn + I(zn^2) + I(zn^3), data = Boston)
lm_poly2 <- lm(crim ~ indus + I(indus^2) + I(indus^3), data = Boston)
lm_poly3 <- lm(crim ~ chas + I(chas^2) + I(chas^3), data = Boston)
lm_poly4 <- lm(crim ~ nox + I(nox^2) + I(nox^3), data = Boston)
lm_poly5 <- lm(crim ~ rm + I(rm^2) + I(rm^3), data = Boston)
lm_poly6 <- lm(crim ~ age + I(age^2) + I(age^3), data = Boston)
lm_poly7 <- lm(crim ~ dis + I(dis^2) + I(dis^3), data = Boston)
lm_poly8 <- lm(crim ~ rad + I(rad^2) + I(rad^3), data = Boston)
lm_poly9 <- lm(crim ~ tax + I(tax^2) + I(tax^3), data = Boston)
lm_poly10 <- lm(crim ~ ptratio + I(ptratio^2) + I(ptratio^3), data = Boston)
lm_poly11 <- lm(crim ~ lstat + I(lstat^2) + I(lstat^3), data = Boston)
lm_poly12 <- lm(crim ~ medv + I(medv^2) + I(medv^3), data = Boston)
summary(lm_poly1)
##
## Call:
## lm(formula = crim ~ zn + I(zn^2) + I(zn^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.821 -4.614 -1.294 0.473 84.130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.846e+00 4.330e-01 11.192 < 2e-16 ***
## zn -3.322e-01 1.098e-01 -3.025 0.00261 **
## I(zn^2) 6.483e-03 3.861e-03 1.679 0.09375 .
## I(zn^3) -3.776e-05 3.139e-05 -1.203 0.22954
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.372 on 502 degrees of freedom
## Multiple R-squared: 0.05824, Adjusted R-squared: 0.05261
## F-statistic: 10.35 on 3 and 502 DF, p-value: 1.281e-06
summary(lm_poly2)
##
## Call:
## lm(formula = crim ~ indus + I(indus^2) + I(indus^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.278 -2.514 0.054 0.764 79.713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6625683 1.5739833 2.327 0.0204 *
## indus -1.9652129 0.4819901 -4.077 5.30e-05 ***
## I(indus^2) 0.2519373 0.0393221 6.407 3.42e-10 ***
## I(indus^3) -0.0069760 0.0009567 -7.292 1.20e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.423 on 502 degrees of freedom
## Multiple R-squared: 0.2597, Adjusted R-squared: 0.2552
## F-statistic: 58.69 on 3 and 502 DF, p-value: < 2.2e-16
summary(lm_poly3)
##
## Call:
## lm(formula = crim ~ chas + I(chas^2) + I(chas^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.738 -3.661 -3.435 0.018 85.232
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.7444 0.3961 9.453 <2e-16 ***
## chas -1.8928 1.5061 -1.257 0.209
## I(chas^2) NA NA NA NA
## I(chas^3) NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.597 on 504 degrees of freedom
## Multiple R-squared: 0.003124, Adjusted R-squared: 0.001146
## F-statistic: 1.579 on 1 and 504 DF, p-value: 0.2094
summary(lm_poly4)
##
## Call:
## lm(formula = crim ~ nox + I(nox^2) + I(nox^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.110 -2.068 -0.255 0.739 78.302
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 233.09 33.64 6.928 1.31e-11 ***
## nox -1279.37 170.40 -7.508 2.76e-13 ***
## I(nox^2) 2248.54 279.90 8.033 6.81e-15 ***
## I(nox^3) -1245.70 149.28 -8.345 6.96e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.234 on 502 degrees of freedom
## Multiple R-squared: 0.297, Adjusted R-squared: 0.2928
## F-statistic: 70.69 on 3 and 502 DF, p-value: < 2.2e-16
summary(lm_poly5)
##
## Call:
## lm(formula = crim ~ rm + I(rm^2) + I(rm^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.485 -3.468 -2.221 -0.015 87.219
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 112.6246 64.5172 1.746 0.0815 .
## rm -39.1501 31.3115 -1.250 0.2118
## I(rm^2) 4.5509 5.0099 0.908 0.3641
## I(rm^3) -0.1745 0.2637 -0.662 0.5086
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.33 on 502 degrees of freedom
## Multiple R-squared: 0.06779, Adjusted R-squared: 0.06222
## F-statistic: 12.17 on 3 and 502 DF, p-value: 1.067e-07
summary(lm_poly6)
##
## Call:
## lm(formula = crim ~ age + I(age^2) + I(age^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.762 -2.673 -0.516 0.019 82.842
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.549e+00 2.769e+00 -0.920 0.35780
## age 2.737e-01 1.864e-01 1.468 0.14266
## I(age^2) -7.230e-03 3.637e-03 -1.988 0.04738 *
## I(age^3) 5.745e-05 2.109e-05 2.724 0.00668 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.84 on 502 degrees of freedom
## Multiple R-squared: 0.1742, Adjusted R-squared: 0.1693
## F-statistic: 35.31 on 3 and 502 DF, p-value: < 2.2e-16
summary(lm_poly7)
##
## Call:
## lm(formula = crim ~ dis + I(dis^2) + I(dis^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.757 -2.588 0.031 1.267 76.378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.0476 2.4459 12.285 < 2e-16 ***
## dis -15.5543 1.7360 -8.960 < 2e-16 ***
## I(dis^2) 2.4521 0.3464 7.078 4.94e-12 ***
## I(dis^3) -0.1186 0.0204 -5.814 1.09e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.331 on 502 degrees of freedom
## Multiple R-squared: 0.2778, Adjusted R-squared: 0.2735
## F-statistic: 64.37 on 3 and 502 DF, p-value: < 2.2e-16
summary(lm_poly8)
##
## Call:
## lm(formula = crim ~ rad + I(rad^2) + I(rad^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.381 -0.412 -0.269 0.179 76.217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.605545 2.050108 -0.295 0.768
## rad 0.512736 1.043597 0.491 0.623
## I(rad^2) -0.075177 0.148543 -0.506 0.613
## I(rad^3) 0.003209 0.004564 0.703 0.482
##
## Residual standard error: 6.682 on 502 degrees of freedom
## Multiple R-squared: 0.4, Adjusted R-squared: 0.3965
## F-statistic: 111.6 on 3 and 502 DF, p-value: < 2.2e-16
summary(lm_poly9)
##
## Call:
## lm(formula = crim ~ tax + I(tax^2) + I(tax^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.273 -1.389 0.046 0.536 76.950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.918e+01 1.180e+01 1.626 0.105
## tax -1.533e-01 9.568e-02 -1.602 0.110
## I(tax^2) 3.608e-04 2.425e-04 1.488 0.137
## I(tax^3) -2.204e-07 1.889e-07 -1.167 0.244
##
## Residual standard error: 6.854 on 502 degrees of freedom
## Multiple R-squared: 0.3689, Adjusted R-squared: 0.3651
## F-statistic: 97.8 on 3 and 502 DF, p-value: < 2.2e-16
summary(lm_poly10)
##
## Call:
## lm(formula = crim ~ ptratio + I(ptratio^2) + I(ptratio^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.833 -4.146 -1.655 1.408 82.697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 477.18405 156.79498 3.043 0.00246 **
## ptratio -82.36054 27.64394 -2.979 0.00303 **
## I(ptratio^2) 4.63535 1.60832 2.882 0.00412 **
## I(ptratio^3) -0.08476 0.03090 -2.743 0.00630 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.122 on 502 degrees of freedom
## Multiple R-squared: 0.1138, Adjusted R-squared: 0.1085
## F-statistic: 21.48 on 3 and 502 DF, p-value: 4.171e-13
summary(lm_poly11)
##
## Call:
## lm(formula = crim ~ lstat + I(lstat^2) + I(lstat^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.234 -2.151 -0.486 0.066 83.353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.2009656 2.0286452 0.592 0.5541
## lstat -0.4490656 0.4648911 -0.966 0.3345
## I(lstat^2) 0.0557794 0.0301156 1.852 0.0646 .
## I(lstat^3) -0.0008574 0.0005652 -1.517 0.1299
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.629 on 502 degrees of freedom
## Multiple R-squared: 0.2179, Adjusted R-squared: 0.2133
## F-statistic: 46.63 on 3 and 502 DF, p-value: < 2.2e-16
summary(lm_poly12)
##
## Call:
## lm(formula = crim ~ medv + I(medv^2) + I(medv^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.427 -1.976 -0.437 0.439 73.655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.1655381 3.3563105 15.840 < 2e-16 ***
## medv -5.0948305 0.4338321 -11.744 < 2e-16 ***
## I(medv^2) 0.1554965 0.0171904 9.046 < 2e-16 ***
## I(medv^3) -0.0014901 0.0002038 -7.312 1.05e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.569 on 502 degrees of freedom
## Multiple R-squared: 0.4202, Adjusted R-squared: 0.4167
## F-statistic: 121.3 on 3 and 502 DF, p-value: < 2.2e-16
Part 3: # conditional vs marginal value
getwd()
## [1] "C:/Users/peter/Desktop/UNC/Fall 2022/ECON 573/R Project/PS2_files"
homes <- read.csv("home2024.csv")
par(mfrow=c(1,2)) # 1 row, 2 columns of plots
hist(homes$VALUE, col="grey", xlab="home value", main="")
plot(VALUE ~ factor(BATHS),
col=rainbow(8), data=homes[homes$BATHS<8,],
xlab="number of bathrooms", ylab="home value")
# create a var for downpayment being greater than 20% # some quick
plots. Do more to build your intuition!
homes$gt20dwn <-
factor(0.2<(homes$LPRICE-homes$AMMORT)/homes$LPRICE)
par(mfrow=c(1,2))
homes$STATE <- as.factor(homes$STATE)
plot(VALUE ~ STATE, data=homes,
col=rainbow(nlevels(homes$STATE)),
ylim=c(0,10^6), cex.axis=.65)
(1): In the graph below, we can see that without trash junk within 1/2 block, the average current market value is higher than that with trash junk. Same idea can be seen in the neighborhood with bad smells has average lower current market value than neighborhood without bad smells.
homes$EJUNK <- as.factor(homes$EJUNK)
homes$ODORA <- as.factor(homes$ODORA)
homes$HHGRAD <- as.factor(homes$HHGRAD)
homes$DWNPAY <- as.factor(homes$DWNPAY)
homes$ECOM1 <- as.factor(homes$ECOM1)
par(mfrow=c(1,2))
plot(VALUE ~ EJUNK, data=homes,
col=rainbow(nlevels(homes$STATE)),
ylim=c(0,10^6), cex.axis=.65)
plot(VALUE ~ ODORA, data=homes,
col=rainbow(nlevels(homes$STATE)),
ylim=c(0,10^6), cex.axis=.65)
pricey <- glm(log(VALUE) ~ .-AMMORT-LPRICE, data=homes)
summary(pricey)
##
## Call:
## glm(formula = log(VALUE) ~ . - AMMORT - LPRICE, data = homes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -13.2965 -0.1546 0.0583 0.2752 2.4718
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.158e+01 6.234e-02 185.814 < 2e-16 ***
## EAPTBLY -4.343e-02 2.345e-02 -1.852 0.06411 .
## ECOM1Y -2.428e-02 1.924e-02 -1.262 0.20707
## ECOM2Y -8.432e-02 4.805e-02 -1.755 0.07928 .
## EGREENY 9.284e-03 1.400e-02 0.663 0.50726
## EJUNKY -1.265e-01 5.103e-02 -2.478 0.01322 *
## ELOW1Y 2.848e-02 2.311e-02 1.232 0.21785
## ESFDY 2.965e-01 2.956e-02 10.033 < 2e-16 ***
## ETRANSY -1.471e-02 2.532e-02 -0.581 0.56108
## EABANY -1.617e-01 3.598e-02 -4.495 7.01e-06 ***
## HOWHgood 1.306e-01 2.631e-02 4.963 7.00e-07 ***
## HOWNgood 1.178e-01 2.190e-02 5.378 7.64e-08 ***
## ODORAY 9.254e-03 3.312e-02 0.279 0.77991
## STRNAY -3.542e-02 1.607e-02 -2.204 0.02751 *
## ZINC2 6.215e-07 5.538e-08 11.224 < 2e-16 ***
## PER 1.060e-02 6.260e-03 1.694 0.09027 .
## ZADULT -1.870e-02 1.087e-02 -1.720 0.08539 .
## HHGRADBach 1.302e-01 2.292e-02 5.680 1.37e-08 ***
## HHGRADGrad 1.939e-01 2.579e-02 7.518 5.86e-14 ***
## HHGRADHS Grad -6.048e-02 2.170e-02 -2.787 0.00533 **
## HHGRADNo HS -1.942e-01 3.182e-02 -6.104 1.06e-09 ***
## NUNITS -9.563e-04 5.202e-04 -1.838 0.06603 .
## INTW -4.583e-02 4.411e-03 -10.391 < 2e-16 ***
## METROurban 8.622e-02 1.807e-02 4.772 1.84e-06 ***
## STATECO -2.907e-01 2.921e-02 -9.952 < 2e-16 ***
## STATECT -3.521e-01 3.130e-02 -11.250 < 2e-16 ***
## STATEGA -6.509e-01 3.110e-02 -20.930 < 2e-16 ***
## STATEIL -8.643e-01 5.767e-02 -14.986 < 2e-16 ***
## STATEIN -7.788e-01 3.070e-02 -25.371 < 2e-16 ***
## STATELA -7.220e-01 3.688e-02 -19.579 < 2e-16 ***
## STATEMO -6.669e-01 3.343e-02 -19.947 < 2e-16 ***
## STATEOH -6.781e-01 3.271e-02 -20.730 < 2e-16 ***
## STATEOK -9.964e-01 3.280e-02 -30.375 < 2e-16 ***
## STATEPA -8.743e-01 3.389e-02 -25.798 < 2e-16 ***
## STATETX -1.048e+00 3.430e-02 -30.566 < 2e-16 ***
## STATEWA -1.236e-01 3.093e-02 -3.995 6.50e-05 ***
## BATHS 2.086e-01 1.163e-02 17.934 < 2e-16 ***
## BEDRMS 8.728e-02 1.006e-02 8.680 < 2e-16 ***
## MATBUYY -3.190e-02 1.370e-02 -2.329 0.01986 *
## DWNPAYprev home 1.131e-01 1.802e-02 6.277 3.54e-10 ***
## FRSTHOY -8.105e-02 1.727e-02 -4.694 2.70e-06 ***
## gt20dwnTRUE 4.609e-02 1.523e-02 3.025 0.00249 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.6673016)
##
## Null deviance: 14920 on 15564 degrees of freedom
## Residual deviance: 10359 on 15523 degrees of freedom
## AIC: 37919
##
## Number of Fisher Scoring iterations: 2
pvals <- summary(pricey)$coef[-1,4]
names(pvals)[pvals < 0.1]
## [1] "EAPTBLY" "ECOM2Y" "EJUNKY" "ESFDY"
## [5] "EABANY" "HOWHgood" "HOWNgood" "STRNAY"
## [9] "ZINC2" "PER" "ZADULT" "HHGRADBach"
## [13] "HHGRADGrad" "HHGRADHS Grad" "HHGRADNo HS" "NUNITS"
## [17] "INTW" "METROurban" "STATECO" "STATECT"
## [21] "STATEGA" "STATEIL" "STATEIN" "STATELA"
## [25] "STATEMO" "STATEOH" "STATEOK" "STATEPA"
## [29] "STATETX" "STATEWA" "BATHS" "BEDRMS"
## [33] "MATBUYY" "DWNPAYprev home" "FRSTHOY" "gt20dwnTRUE"
pricey2 <- glm(log(VALUE) ~ .-AMMORT-LPRICE-ECOM1-EGREEN-ELOW1-ETRANS-ODORA, data=homes)
summary(pricey2)
##
## Call:
## glm(formula = log(VALUE) ~ . - AMMORT - LPRICE - ECOM1 - EGREEN -
## ELOW1 - ETRANS - ODORA, data = homes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -13.2991 -0.1543 0.0589 0.2752 2.4757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.159e+01 6.143e-02 188.730 < 2e-16 ***
## EAPTBLY -4.433e-02 2.210e-02 -2.006 0.04486 *
## ECOM2Y -9.406e-02 4.698e-02 -2.002 0.04529 *
## EJUNKY -1.256e-01 5.086e-02 -2.470 0.01352 *
## ESFDY 2.901e-01 2.925e-02 9.917 < 2e-16 ***
## EABANY -1.627e-01 3.593e-02 -4.530 5.95e-06 ***
## HOWHgood 1.311e-01 2.630e-02 4.983 6.32e-07 ***
## HOWNgood 1.183e-01 2.186e-02 5.412 6.33e-08 ***
## STRNAY -3.817e-02 1.583e-02 -2.412 0.01588 *
## ZINC2 6.213e-07 5.534e-08 11.228 < 2e-16 ***
## PER 1.034e-02 6.252e-03 1.655 0.09803 .
## ZADULT -1.877e-02 1.087e-02 -1.727 0.08426 .
## HHGRADBach 1.309e-01 2.290e-02 5.715 1.12e-08 ***
## HHGRADGrad 1.946e-01 2.577e-02 7.551 4.55e-14 ***
## HHGRADHS Grad -6.070e-02 2.170e-02 -2.798 0.00515 **
## HHGRADNo HS -1.952e-01 3.181e-02 -6.136 8.65e-10 ***
## NUNITS -1.001e-03 5.196e-04 -1.927 0.05405 .
## INTW -4.608e-02 4.408e-03 -10.454 < 2e-16 ***
## METROurban 8.429e-02 1.793e-02 4.702 2.60e-06 ***
## STATECO -2.867e-01 2.904e-02 -9.873 < 2e-16 ***
## STATECT -3.510e-01 3.126e-02 -11.228 < 2e-16 ***
## STATEGA -6.503e-01 3.099e-02 -20.981 < 2e-16 ***
## STATEIL -8.651e-01 5.762e-02 -15.015 < 2e-16 ***
## STATEIN -7.786e-01 3.067e-02 -25.391 < 2e-16 ***
## STATELA -7.240e-01 3.677e-02 -19.689 < 2e-16 ***
## STATEMO -6.669e-01 3.342e-02 -19.954 < 2e-16 ***
## STATEOH -6.801e-01 3.264e-02 -20.838 < 2e-16 ***
## STATEOK -9.963e-01 3.278e-02 -30.391 < 2e-16 ***
## STATEPA -8.712e-01 3.383e-02 -25.754 < 2e-16 ***
## STATETX -1.049e+00 3.426e-02 -30.622 < 2e-16 ***
## STATEWA -1.210e-01 3.090e-02 -3.916 9.05e-05 ***
## BATHS 2.098e-01 1.160e-02 18.086 < 2e-16 ***
## BEDRMS 8.620e-02 9.999e-03 8.621 < 2e-16 ***
## MATBUYY -3.165e-02 1.368e-02 -2.314 0.02071 *
## DWNPAYprev home 1.137e-01 1.802e-02 6.308 2.90e-10 ***
## FRSTHOY -8.170e-02 1.726e-02 -4.734 2.22e-06 ***
## gt20dwnTRUE 4.674e-02 1.523e-02 3.069 0.00215 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.6672567)
##
## Null deviance: 14920 on 15564 degrees of freedom
## Residual deviance: 10361 on 15528 degrees of freedom
## AIC: 37913
##
## Number of Fisher Scoring iterations: 2
str(homes$gt20dwn)
## Factor w/ 2 levels "FALSE","TRUE": 2 1 1 1 1 1 1 1 1 1 ...
more_than_tt <- glm(gt20dwn ~ .-AMMORT-LPRICE,data=homes, family = binomial)
summary(more_than_tt)
##
## Call:
## glm(formula = gt20dwn ~ . - AMMORT - LPRICE, family = binomial,
## data = homes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4502 -0.8084 -0.5985 1.0693 2.4772
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.293e+00 1.831e-01 -7.065 1.61e-12 ***
## EAPTBLY 1.505e-02 7.025e-02 0.214 0.830424
## ECOM1Y -1.619e-01 5.809e-02 -2.787 0.005325 **
## ECOM2Y -3.131e-01 1.600e-01 -1.957 0.050385 .
## EGREENY -1.569e-03 3.984e-02 -0.039 0.968582
## EJUNKY -9.697e-03 1.608e-01 -0.060 0.951913
## ELOW1Y 4.635e-02 6.627e-02 0.699 0.484292
## ESFDY -2.670e-01 8.276e-02 -3.227 0.001252 **
## ETRANSY -6.270e-02 7.616e-02 -0.823 0.410416
## EABANY -8.187e-02 1.157e-01 -0.708 0.479137
## HOWHgood -1.372e-01 7.947e-02 -1.726 0.084398 .
## HOWNgood 1.597e-01 6.730e-02 2.372 0.017669 *
## ODORAY 1.041e-01 9.811e-02 1.061 0.288528
## STRNAY -9.644e-02 4.737e-02 -2.036 0.041783 *
## ZINC2 -1.277e-07 1.874e-07 -0.682 0.495530
## PER -1.253e-01 1.855e-02 -6.752 1.46e-11 ***
## ZADULT 1.944e-02 3.188e-02 0.610 0.542024
## HHGRADBach 1.797e-01 6.596e-02 2.725 0.006431 **
## HHGRADGrad 2.729e-01 7.288e-02 3.745 0.000181 ***
## HHGRADHS Grad -2.064e-02 6.376e-02 -0.324 0.746192
## HHGRADNo HS -7.246e-02 9.845e-02 -0.736 0.461720
## NUNITS 2.377e-03 1.428e-03 1.664 0.096100 .
## INTW -6.327e-02 1.372e-02 -4.613 3.98e-06 ***
## METROurban -8.000e-02 5.389e-02 -1.485 0.137672
## STATECO -2.513e-02 8.491e-02 -0.296 0.767257
## STATECT 7.870e-01 8.825e-02 8.918 < 2e-16 ***
## STATEGA -2.223e-01 9.455e-02 -2.351 0.018716 *
## STATEIL 5.870e-01 1.635e-01 3.590 0.000330 ***
## STATEIN 2.431e-01 9.352e-02 2.599 0.009336 **
## STATELA 5.932e-01 1.077e-01 5.506 3.67e-08 ***
## STATEMO 5.309e-01 9.730e-02 5.456 4.87e-08 ***
## STATEOH 7.642e-01 9.480e-02 8.061 7.59e-16 ***
## STATEOK 1.291e-01 1.027e-01 1.257 0.208850
## STATEPA 6.011e-01 1.007e-01 5.968 2.40e-09 ***
## STATETX 2.935e-01 1.073e-01 2.736 0.006221 **
## STATEWA 1.525e-01 8.819e-02 1.730 0.083717 .
## BATHS 2.445e-01 3.419e-02 7.152 8.57e-13 ***
## BEDRMS -2.086e-02 2.908e-02 -0.717 0.473120
## MATBUYY 2.587e-01 3.927e-02 6.588 4.45e-11 ***
## DWNPAYprev home 7.417e-01 4.857e-02 15.272 < 2e-16 ***
## VALUE 1.489e-06 1.452e-07 10.256 < 2e-16 ***
## FRSTHOY -3.700e-01 5.170e-02 -7.156 8.29e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 18873 on 15564 degrees of freedom
## Residual deviance: 16969 on 15523 degrees of freedom
## AIC: 17053
##
## Number of Fisher Scoring iterations: 4
more_than_tt1 <- glm(gt20dwn ~ .-AMMORT-LPRICE+FRSTHO*BATHS,data=homes, family = binomial)
summary(more_than_tt1)
##
## Call:
## glm(formula = gt20dwn ~ . - AMMORT - LPRICE + FRSTHO * BATHS,
## family = binomial, data = homes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4400 -0.8054 -0.5974 1.0654 2.4456
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.378e+00 1.851e-01 -7.444 9.76e-14 ***
## EAPTBLY 1.217e-02 7.020e-02 0.173 0.862337
## ECOM1Y -1.608e-01 5.806e-02 -2.770 0.005612 **
## ECOM2Y -3.181e-01 1.598e-01 -1.991 0.046511 *
## EGREENY -2.305e-03 3.987e-02 -0.058 0.953900
## EJUNKY -5.332e-03 1.606e-01 -0.033 0.973520
## ELOW1Y 4.950e-02 6.627e-02 0.747 0.455066
## ESFDY -2.715e-01 8.276e-02 -3.280 0.001036 **
## ETRANSY -6.147e-02 7.612e-02 -0.808 0.419333
## EABANY -9.206e-02 1.155e-01 -0.797 0.425505
## HOWHgood -1.324e-01 7.938e-02 -1.668 0.095245 .
## HOWNgood 1.630e-01 6.728e-02 2.423 0.015399 *
## ODORAY 1.022e-01 9.804e-02 1.043 0.297090
## STRNAY -9.672e-02 4.736e-02 -2.042 0.041136 *
## ZINC2 -1.479e-07 1.897e-07 -0.780 0.435530
## PER -1.266e-01 1.859e-02 -6.811 9.67e-12 ***
## ZADULT 2.195e-02 3.193e-02 0.687 0.491817
## HHGRADBach 1.818e-01 6.597e-02 2.755 0.005863 **
## HHGRADGrad 2.770e-01 7.294e-02 3.797 0.000146 ***
## HHGRADHS Grad -1.967e-02 6.374e-02 -0.309 0.757647
## HHGRADNo HS -7.767e-02 9.837e-02 -0.790 0.429774
## NUNITS 2.284e-03 1.415e-03 1.613 0.106646
## INTW -6.421e-02 1.371e-02 -4.684 2.81e-06 ***
## METROurban -8.407e-02 5.391e-02 -1.560 0.118848
## STATECO -3.523e-02 8.516e-02 -0.414 0.679103
## STATECT 7.739e-01 8.837e-02 8.758 < 2e-16 ***
## STATEGA -2.317e-01 9.489e-02 -2.441 0.014636 *
## STATEIL 5.738e-01 1.635e-01 3.509 0.000450 ***
## STATEIN 2.367e-01 9.369e-02 2.526 0.011534 *
## STATELA 5.893e-01 1.079e-01 5.464 4.66e-08 ***
## STATEMO 5.194e-01 9.749e-02 5.328 9.95e-08 ***
## STATEOH 7.505e-01 9.493e-02 7.906 2.66e-15 ***
## STATEOK 1.174e-01 1.029e-01 1.141 0.253976
## STATEPA 5.816e-01 1.009e-01 5.761 8.34e-09 ***
## STATETX 2.875e-01 1.075e-01 2.675 0.007473 **
## STATEWA 1.535e-01 8.829e-02 1.739 0.082036 .
## BATHS 2.994e-01 3.824e-02 7.829 4.92e-15 ***
## BEDRMS -2.157e-02 2.913e-02 -0.741 0.458931
## MATBUYY 2.590e-01 3.929e-02 6.592 4.33e-11 ***
## DWNPAYprev home 7.338e-01 4.868e-02 15.073 < 2e-16 ***
## VALUE 1.448e-06 1.458e-07 9.927 < 2e-16 ***
## FRSTHOY -2.137e-02 1.184e-01 -0.180 0.856799
## BATHS:FRSTHOY -2.020e-01 6.207e-02 -3.255 0.001135 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 18873 on 15564 degrees of freedom
## Residual deviance: 16958 on 15522 degrees of freedom
## AIC: 17044
##
## Number of Fisher Scoring iterations: 4
exp(-0.37)
## [1] 0.6907343
exp(-0.2445)
## [1] 0.783096
gt100 <- which(homes$VALUE>1e5)
training_data_set <- homes[gt100,]
traing_model <- glm(gt20dwn ~ .-AMMORT-LPRICE+FRSTHO*BATHS,data=training_data_set, family = binomial)
summary(traing_model)
##
## Call:
## glm(formula = gt20dwn ~ . - AMMORT - LPRICE + FRSTHO * BATHS,
## family = binomial, data = training_data_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5812 -0.8460 -0.6109 1.0974 2.5486
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.547e+00 2.154e-01 -7.185 6.73e-13 ***
## EAPTBLY 9.093e-02 8.199e-02 1.109 0.267394
## ECOM1Y -9.174e-02 6.692e-02 -1.371 0.170387
## ECOM2Y -4.134e-01 2.068e-01 -1.999 0.045572 *
## EGREENY 3.692e-03 4.391e-02 0.084 0.932996
## EJUNKY -2.308e-01 2.178e-01 -1.060 0.289331
## ELOW1Y 2.268e-02 7.398e-02 0.307 0.759134
## ESFDY -3.600e-01 9.835e-02 -3.660 0.000252 ***
## ETRANSY -1.043e-01 8.861e-02 -1.177 0.239078
## EABANY -1.885e-01 1.605e-01 -1.175 0.240187
## HOWHgood -7.675e-02 9.747e-02 -0.787 0.431007
## HOWNgood 1.959e-01 8.097e-02 2.419 0.015558 *
## ODORAY 1.275e-01 1.160e-01 1.099 0.271682
## STRNAY -1.150e-01 5.437e-02 -2.115 0.034406 *
## ZINC2 -2.971e-07 2.187e-07 -1.359 0.174198
## PER -1.271e-01 2.044e-02 -6.217 5.06e-10 ***
## ZADULT 1.180e-02 3.573e-02 0.330 0.741257
## HHGRADBach 2.070e-01 7.310e-02 2.832 0.004629 **
## HHGRADGrad 2.916e-01 7.994e-02 3.648 0.000265 ***
## HHGRADHS Grad 2.582e-02 7.252e-02 0.356 0.721806
## HHGRADNo HS -1.787e-01 1.246e-01 -1.435 0.151371
## NUNITS 1.235e-03 1.777e-03 0.695 0.487031
## INTW -6.175e-02 1.712e-02 -3.606 0.000311 ***
## METROurban -1.263e-01 6.295e-02 -2.007 0.044761 *
## STATECO 3.541e-02 8.751e-02 0.405 0.685725
## STATECT 7.801e-01 9.221e-02 8.460 < 2e-16 ***
## STATEGA -2.192e-01 9.958e-02 -2.202 0.027689 *
## STATEIL 5.666e-01 1.972e-01 2.873 0.004061 **
## STATEIN 3.507e-01 1.017e-01 3.450 0.000562 ***
## STATELA 6.946e-01 1.214e-01 5.719 1.07e-08 ***
## STATEMO 6.122e-01 1.052e-01 5.818 5.97e-09 ***
## STATEOH 7.912e-01 1.025e-01 7.721 1.15e-14 ***
## STATEOK 2.651e-01 1.219e-01 2.175 0.029649 *
## STATEPA 7.568e-01 1.170e-01 6.471 9.76e-11 ***
## STATETX 4.579e-01 1.299e-01 3.525 0.000423 ***
## STATEWA 1.765e-01 9.097e-02 1.940 0.052373 .
## BATHS 2.635e-01 4.126e-02 6.387 1.69e-10 ***
## BEDRMS -1.487e-02 3.236e-02 -0.459 0.645927
## MATBUYY 3.427e-01 4.336e-02 7.904 2.70e-15 ***
## DWNPAYprev home 7.734e-01 5.324e-02 14.527 < 2e-16 ***
## VALUE 1.741e-06 1.612e-07 10.800 < 2e-16 ***
## FRSTHOY -1.081e-01 1.425e-01 -0.759 0.448045
## BATHS:FRSTHOY -1.301e-01 7.118e-02 -1.828 0.067616 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15210 on 12143 degrees of freedom
## Residual deviance: 13617 on 12101 degrees of freedom
## AIC: 13703
##
## Number of Fisher Scoring iterations: 4
deviance <- function(y, pred, family=c("gaussian","binomial")){
family <- match.arg(family)
if(family=="gaussian"){
return( sum( (y-pred)^2 ) )
}else{
if(is.factor(y)) y <- as.numeric(y)>1
return( -2*sum( y*log(pred) + (1-y)*log(1-pred) ) )
}
}
R2 <- function(y, pred, family=c("gaussian","binomial")){
fam <- match.arg(family)
if(fam=="binomial"){
if(is.factor(y)){ y <- as.numeric(y)>1 }
}
dev <- deviance(y, pred, family=fam)
dev0 <- deviance(y, mean(y), family=fam)
return(1-dev/dev0)
}
ybar <- mean(homes$gt20dwn[-gt100]==TRUE)
D0 <- deviance(y=homes$gt20dwn, pred=ybar, family="binomial")
D0
## [1] 19530.03
r_squared<- R2(y=homes$gt20dwn, pred=ybar, family="binomial")
r_squared
## [1] -0.03483716