library(tidyverse)
library(Stat2Data)
library(leaps)
library(skimr)
data(RailsTrails)
lin = lm(Adj2007 ~ Distance, data = RailsTrails)
summary(lin)
##
## Call:
## lm(formula = Adj2007 ~ Distance, data = RailsTrails)
##
## Residuals:
## Min 1Q Median 3Q Max
## -190.55 -58.19 -17.48 25.22 444.41
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 388.204 14.052 27.626 < 2e-16 ***
## Distance -54.427 9.659 -5.635 1.56e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 92.13 on 102 degrees of freedom
## Multiple R-squared: 0.2374, Adjusted R-squared: 0.2299
## F-statistic: 31.75 on 1 and 102 DF, p-value: 1.562e-07
coef(lin)
## (Intercept) Distance
## 388.2038 -54.4272
\[ \hat{Adj2007} = 388.204*-54.427Distance \]
lin2 = lm(Adj2007 ~ Distance + SquareFeet, data = RailsTrails)
summary(lin2)
##
## Call:
## lm(formula = Adj2007 ~ Distance + SquareFeet, data = RailsTrails)
##
## Residuals:
## Min 1Q Median 3Q Max
## -138.835 -32.621 -1.903 27.369 145.504
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 109.742 20.057 5.472 3.25e-07 ***
## Distance -16.486 5.942 -2.775 0.00659 **
## SquareFeet 150.780 9.998 15.080 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.34 on 101 degrees of freedom
## Multiple R-squared: 0.7655, Adjusted R-squared: 0.7608
## F-statistic: 164.8 on 2 and 101 DF, p-value: < 2.2e-16
coef(lin2)
## (Intercept) Distance SquareFeet
## 109.74240 -16.48597 150.78013
\[ \hat{Adj2007}=109.74240 - 16.48597Distance + 150.78013SquareFeet \] - We see that the addition of SquareFeet did change our estimate of the distance relationship to Adj2007 because our \(R^2\) value for the two explanatory variables if higher (76.55%) than what is is for the \(R^2\) value with the single explanatory variable (23.74%). The estimated coefficient for SquareFeet is also positive and larger in value compared to the coefficient for Distance, therefore, making it significant to the regression. This means that both Distance and SquareFeet are less effective at explaining the variability in the selling price than they are as a combination in the multiple regression model.
confint(lin)
## 2.5 % 97.5 %
## (Intercept) 360.3317 416.07588
## Distance -73.5859 -35.26851
confint(lin2)
## 2.5 % 97.5 %
## (Intercept) 69.95460 149.530197
## Distance -28.27307 -4.698861
## SquareFeet 130.94601 170.614247
pr = 108.74240 - 16.48597*0.5 + 150.78013*1500
pr
## [1] 226270.7
\[ \hat{Adj2007}=108.74240 - 16.48597(0.5) + 150.78013(1500)=22,6270.7 \]
data(Tadpoles)
ggplot(Tadpoles) + geom_point(aes(x=Body, y=GutLength, col=Treatment))
- The scatterplot shows that for both treatments, there is a positive linear relationship, i.e., as the length in mm of the body of the tadpole increases, the GutLength also increases.
m = lm(GutLength ~ Body, data = Tadpoles)
summary(m)
##
## Call:
## lm(formula = GutLength ~ Body, data = Tadpoles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.575 -22.245 0.027 17.815 39.998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20.764 49.002 -0.424 0.675384
## Body 10.280 2.445 4.204 0.000293 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.83 on 25 degrees of freedom
## Multiple R-squared: 0.4141, Adjusted R-squared: 0.3907
## F-statistic: 17.67 on 1 and 25 DF, p-value: 0.0002931
coef(m)
## (Intercept) Body
## -20.76397 10.28000
ggplot(m, aes(x=Body, y=GutLength)) + geom_point() + stat_smooth(method=lm, se=FALSE)
\[ \hat{GutLength} = -20.764+10.28Body \]
m1 = lm(GutLength ~ Body + Treatment, data = Tadpoles)
summary(m1)
##
## Call:
## lm(formula = GutLength ~ Body + Treatment, data = Tadpoles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.462 -15.006 -2.545 19.117 41.298
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.47 53.81 0.288 0.77618
## Body 8.07 2.82 2.862 0.00859 **
## TreatmentControl 16.28 11.03 1.476 0.15286
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.26 on 24 degrees of freedom
## Multiple R-squared: 0.4629, Adjusted R-squared: 0.4181
## F-statistic: 10.34 on 2 and 24 DF, p-value: 0.0005762
coef(m1)
## (Intercept) Body TreatmentControl
## 15.471225 8.070068 16.277529
\[ \hat{GutLength} = 15.47+8.07Body+16.28TreatMent \]
m2 = lm(GutLength ~ Body + Treatment + MouthpartDamage, data = Tadpoles)
summary(m2)
##
## Call:
## lm(formula = GutLength ~ Body + Treatment + MouthpartDamage,
## data = Tadpoles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.422 -17.701 -6.771 16.338 40.877
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20.258 53.070 -0.382 0.7062
## Body 6.442 2.746 2.346 0.0280 *
## TreatmentControl 25.412 11.177 2.274 0.0326 *
## MouthpartDamage 96.839 45.839 2.113 0.0457 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.68 on 23 degrees of freedom
## Multiple R-squared: 0.5502, Adjusted R-squared: 0.4915
## F-statistic: 9.378 on 3 and 23 DF, p-value: 0.0003092
coef(m2)
## (Intercept) Body TreatmentControl MouthpartDamage
## -20.258362 6.442416 25.411531 96.839236
\[ \hat{GutLength} = -20.258362 + 6.44Body+25.411531Treatment+96.839236MouthpartDamage \] - The biologist predicted that the variable MouthpartDamage is a measure of damage to the mouth and is expected to have a positive association with GutLength because more damage means reduced food intake, which means the tadpole need to increase nutrient absorbtion The biologist question is whether GutLength will have a positive association with Treatment. This question is true because we see that there is a positive association in Treatment and MouthpartDamage from the estimated coefficients and the significance of the variables’ p-values.
data(FishEggs)
fe = lm(PctDM ~ Age, data = FishEggs)
summary(fe)
##
## Call:
## lm(formula = PctDM ~ Age, data = FishEggs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9091 -0.8471 0.3822 1.0271 2.1409
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.70206 0.86783 44.596 <2e-16 ***
## Age -0.21033 0.07313 -2.876 0.007 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.426 on 33 degrees of freedom
## Multiple R-squared: 0.2004, Adjusted R-squared: 0.1762
## F-statistic: 8.272 on 1 and 33 DF, p-value: 0.007001
coef(fe)
## (Intercept) Age
## 38.7020617 -0.2103312
\[ \hat{PctDM} = 38.7020617-0.2103312Age \] - Since the estimated for the Age coefficient is negative, we see that there is a negative association in the model. This means that for an increase in age, we expect to have a decrease in the percent dry mass in the fish eggs.
summary(fe)$r.squared
## [1] 0.2004315
plot(fe, which = 1)
- We see from the residual versus fitted plot that the simplel linear model shows a pretty symmetrically distributed data. Since the red line is not flat, we see that the model is not a perfectly linear model.
fe1 = lm(PctDM ~ Age + Month, data = FishEggs)
summary(fe1)
##
## Call:
## lm(formula = PctDM ~ Age + Month, data = FishEggs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9100 -0.5869 0.2974 0.7599 2.4380
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.51922 0.77827 50.778 < 2e-16 ***
## Age -0.22870 0.06292 -3.635 0.000965 ***
## MonthSep -1.51929 0.42342 -3.588 0.001096 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.223 on 32 degrees of freedom
## Multiple R-squared: 0.4298, Adjusted R-squared: 0.3942
## F-statistic: 12.06 on 2 and 32 DF, p-value: 0.0001248
plot(fe1, which = 1)
ggplot(FishEggs, aes(x=fitted.values(fe1), y = residuals(fe1))) + geom_point(aes(color=Month)) + geom_smooth(se=FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
fe2 = lm(PctDM ~ Age + Sept, data = FishEggs)
summary(fe2)
##
## Call:
## lm(formula = PctDM ~ Age + Sept, data = FishEggs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9100 -0.5869 0.2974 0.7599 2.4380
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.51922 0.77827 50.778 < 2e-16 ***
## Age -0.22870 0.06292 -3.635 0.000965 ***
## Sept -1.51929 0.42342 -3.588 0.001096 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.223 on 32 degrees of freedom
## Multiple R-squared: 0.4298, Adjusted R-squared: 0.3942
## F-statistic: 12.06 on 2 and 32 DF, p-value: 0.0001248
data(RailsTrails)
Skip part a
lin = lm(Adj2007 ~ Distance, data = RailsTrails)
summary(lin)
##
## Call:
## lm(formula = Adj2007 ~ Distance, data = RailsTrails)
##
## Residuals:
## Min 1Q Median 3Q Max
## -190.55 -58.19 -17.48 25.22 444.41
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 388.204 14.052 27.626 < 2e-16 ***
## Distance -54.427 9.659 -5.635 1.56e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 92.13 on 102 degrees of freedom
## Multiple R-squared: 0.2374, Adjusted R-squared: 0.2299
## F-statistic: 31.75 on 1 and 102 DF, p-value: 1.562e-07
coef(lin)
## (Intercept) Distance
## 388.2038 -54.4272
\[ \hat{Adj2007}=388.2038-54.4272Distance \] - We see that with each increase in distance, we would expect for a decrease in 54.42 for the adjusted price.
lin3 = lm(Adj2007 ~ GarageGroup + Distance, data = RailsTrails)
summary(lin3)
##
## Call:
## lm(formula = Adj2007 ~ GarageGroup + Distance, data = RailsTrails)
##
## Residuals:
## Min 1Q Median 3Q Max
## -167.88 -51.55 -21.88 36.79 427.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 365.103 17.661 20.673 <2e-16 ***
## GarageGroupyes 37.892 18.032 2.101 0.0381 *
## Distance -51.025 9.638 -5.294 7e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90.62 on 101 degrees of freedom
## Multiple R-squared: 0.2693, Adjusted R-squared: 0.2549
## F-statistic: 18.62 on 2 and 101 DF, p-value: 1.311e-07
coef(lin3)
## (Intercept) GarageGroupyes Distance
## 365.10269 37.89222 -51.02546
\[ \hat{Adj2007}=365.10269+37.89222GarageGroup-51.02546Distance \] - We see that the estimated coefficient for the variable GarageGroup is positive and the estimated coefficient for the variable Distance is negative. The p-value for the Distance variable is more significant than the p-value for the GarageGroup, therefore, Distance is a better indicator for the adjusted price than the GarageGroup variable.
lin4 = lm(Adj2007 ~ GarageGroup + Distance + Distance*GarageGroup, data = RailsTrails)
summary(lin4)
##
## Call:
## lm(formula = Adj2007 ~ GarageGroup + Distance + Distance * GarageGroup,
## data = RailsTrails)
##
## Residuals:
## Min 1Q Median 3Q Max
## -162.46 -51.65 -17.22 30.04 425.76
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 359.083 21.295 16.862 < 2e-16 ***
## GarageGroupyes 48.862 28.108 1.738 0.085222 .
## Distance -46.302 13.391 -3.458 0.000802 ***
## GarageGroupyes:Distance -9.878 19.366 -0.510 0.611125
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90.96 on 100 degrees of freedom
## Multiple R-squared: 0.2712, Adjusted R-squared: 0.2494
## F-statistic: 12.41 on 3 and 100 DF, p-value: 5.785e-07
full = lm(Adj2007 ~ Distance + GarageSpaces, data=RailsTrails)
summary(full)
##
## Call:
## lm(formula = Adj2007 ~ Distance + GarageSpaces, data = RailsTrails)
##
## Residuals:
## Min 1Q Median 3Q Max
## -159.90 -49.27 -21.54 27.87 406.70
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 356.816 16.671 21.403 < 2e-16 ***
## Distance -48.560 9.433 -5.148 1.3e-06 ***
## GarageSpaces 32.715 10.253 3.191 0.00189 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 88.24 on 101 degrees of freedom
## Multiple R-squared: 0.3072, Adjusted R-squared: 0.2935
## F-statistic: 22.39 on 2 and 101 DF, p-value: 8.915e-09
plot(full, which = 1)
plot(full, which = 2)
- We see that the p-value is not as significant as other variables, the terms involving GarageSpace does not add significance to the model of the price on distance.
data(MLB2007Standings)
skim(MLB2007Standings)
## Skim summary statistics
## n obs: 30
## n variables: 21
##
## ── Variable type:factor ─────────────────────────────────────────────────────────────────
## variable missing complete n n_unique top_counts
## League 0 30 30 2 NL: 16, AL: 14, NA: 0
## Team 0 30 30 30 Ari: 1, Atl: 1, Bal: 1, Bos: 1
## ordered
## FALSE
## FALSE
##
## ── Variable type:integer ────────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75
## Doubles 0 30 30 306.57 25.91 249 291.5 305.5 325.5
## Hits 0 30 30 1499.23 80.46 1341 1449 1502 1554.5
## HitsAllowed 0 30 30 1499.23 77.79 1340 1443.75 1500 1553
## HR 0 30 30 165.23 30.34 102 143.5 171 178.75
## Losses 0 30 30 81.03 9.23 66 74 79.5 89.75
## RBI 0 30 30 741.9 67.76 641 695.5 733 775.5
## Runs 0 30 30 777.4 69.05 673 724.25 769 810.75
## Saves 0 30 30 39.93 5.64 28 36 39.5 43.75
## SB 0 30 30 97.27 33.87 52 69.75 96 117.25
## StrikeOuts 0 30 30 1072.97 76.06 931 1016.25 1067.5 1135.5
## Triples 0 30 30 31.27 8.32 13 27 31 36
## Walks 0 30 30 535.97 64.87 410 500.25 525 569.5
## Wins 0 30 30 81.03 9.29 66 72.25 82.5 88.75
## p100 hist
## 352 ▁▂▅▇▆▆▁▆
## 1656 ▂▃▇▅▇▇▂▃
## 1649 ▂▂▃▂▇▃▂▂
## 231 ▂▃▂▃▇▂▃▁
## 96 ▅▃▇▅▂▂▇▅
## 929 ▅▇▃▇▁▃▁▁
## 968 ▃▇▅▇▁▃▁▁
## 51 ▂▁▇▇▅▇▁▃
## 200 ▇▆▆▃▅▁▁▁
## 1211 ▂▅▅▇▅▃▅▃
## 50 ▂▂▂▇▆▃▁▂
## 696 ▂▃▇▇▇▁▁▃
## 96 ▅▇▂▂▅▅▅▅
##
## ── Variable type:numeric ────────────────────────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100
## BattingAvg 0 30 30 0.27 0.012 0.25 0.26 0.27 0.28 0.29
## ERA 0 30 30 4.46 0.41 3.7 4.16 4.45 4.73 5.53
## OBP 0 30 30 0.34 0.012 0.32 0.33 0.34 0.34 0.37
## SLG 0 30 30 0.42 0.022 0.39 0.41 0.42 0.44 0.46
## WHIP 0 30 30 1.41 0.079 1.27 1.36 1.4 1.44 1.58
## WinPct 0 30 30 0.5 0.057 0.41 0.45 0.51 0.55 0.59
## hist
## ▃▃▅▇▂▆▃▃
## ▂▇▇▇▇▃▁▁
## ▆▇▆▆▅▂▁▂
## ▅▃▇▅▅▅▁▅
## ▃▃▇▅▇▂▂▂
## ▅▇▂▂▅▇▃▅
MLB2007Standings = MLB2007Standings %>%
select(-Wins, -Losses, -Team)
For part d, please use AIC instead of Mallow’s Cp.
forward = regsubsets(WinPct ~ ., data=MLB2007Standings, nbest = 1, nvmax = 4, method = "forward")
with(summary(forward), data.frame(cp, outmat))
## cp LeagueNL BattingAvg Runs Hits HR Doubles Triples RBI
## 1 ( 1 ) 104.879916
## 2 ( 1 ) 17.722949 *
## 3 ( 1 ) 9.708052 *
## 4 ( 1 ) 2.951144 * *
## SB OBP SLG ERA HitsAllowed Walks StrikeOuts Saves WHIP
## 1 ( 1 ) *
## 2 ( 1 ) *
## 3 ( 1 ) * *
## 4 ( 1 ) * *
reg = lm(WinPct ~ RBI + ERA + BattingAvg + Saves, data = MLB2007Standings)
summary(reg)
##
## Call:
## lm(formula = WinPct ~ RBI + ERA + BattingAvg + Saves, data = MLB2007Standings)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.030351 -0.014419 0.001373 0.015467 0.040924
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.508e-02 1.274e-01 0.197 0.845548
## RBI 3.154e-04 8.371e-05 3.768 0.000898 ***
## ERA -6.832e-02 1.084e-02 -6.301 1.36e-06 ***
## BattingAvg 1.535e+00 4.971e-01 3.088 0.004877 **
## Saves 3.367e-03 8.242e-04 4.085 0.000398 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01954 on 25 degrees of freedom
## Multiple R-squared: 0.8994, Adjusted R-squared: 0.8833
## F-statistic: 55.87 on 4 and 25 DF, p-value: 4.182e-12
coef(reg)
## (Intercept) RBI ERA BattingAvg Saves
## 0.0250795245 0.0003153752 -0.0683235055 1.5352161567 0.0033667706
\[ \hat{WinPct} = 0.025 - 0.068ERA + 0.0003RBI + 0.00336677Saves + 1.535BattingAvg\\ R^2 = 0.8994 \]
backward = regsubsets(WinPct ~ ., data=MLB2007Standings, nbest = 1, nvmax = 4, method = "backward")
summary(backward)
## Subset selection object
## Call: regsubsets.formula(WinPct ~ ., data = MLB2007Standings, nbest = 1,
## nvmax = 4, method = "backward")
## 17 Variables (and intercept)
## Forced in Forced out
## LeagueNL FALSE FALSE
## BattingAvg FALSE FALSE
## Runs FALSE FALSE
## Hits FALSE FALSE
## HR FALSE FALSE
## Doubles FALSE FALSE
## Triples FALSE FALSE
## RBI FALSE FALSE
## SB FALSE FALSE
## OBP FALSE FALSE
## SLG FALSE FALSE
## ERA FALSE FALSE
## HitsAllowed FALSE FALSE
## Walks FALSE FALSE
## StrikeOuts FALSE FALSE
## Saves FALSE FALSE
## WHIP FALSE FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: backward
## LeagueNL BattingAvg Runs Hits HR Doubles Triples RBI SB OBP SLG
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " "*" " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " "*" " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " "*" " " "*" " " " " " " " " " " " "
## ERA HitsAllowed Walks StrikeOuts Saves WHIP
## 1 ( 1 ) " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " "*" "*"
## 4 ( 1 ) " " " " " " " " "*" "*"
with(summary(backward), data.frame(cp, outmat))
## cp LeagueNL BattingAvg Runs Hits HR Doubles Triples RBI
## 1 ( 1 ) 112.369101
## 2 ( 1 ) 25.793061 *
## 3 ( 1 ) 5.508871 *
## 4 ( 1 ) 3.689755 * *
## SB OBP SLG ERA HitsAllowed Walks StrikeOuts Saves WHIP
## 1 ( 1 ) *
## 2 ( 1 ) *
## 3 ( 1 ) * *
## 4 ( 1 ) * *
reg1 = lm(WinPct ~ WHIP + Runs + Saves + HR, data = MLB2007Standings)
summary(reg1)
##
## Call:
## lm(formula = WinPct ~ WHIP + Runs + Saves + HR, data = MLB2007Standings)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.026796 -0.012707 -0.000922 0.010269 0.043383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.902e-01 1.035e-01 3.768 0.000896 ***
## WHIP -3.153e-01 5.490e-02 -5.743 5.53e-06 ***
## Runs 5.743e-04 6.392e-05 8.985 2.66e-09 ***
## Saves 3.940e-03 7.567e-04 5.206 2.19e-05 ***
## HR -3.059e-04 1.524e-04 -2.008 0.055613 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01986 on 25 degrees of freedom
## Multiple R-squared: 0.8961, Adjusted R-squared: 0.8795
## F-statistic: 53.93 on 4 and 25 DF, p-value: 6.193e-12
coef(reg1)
## (Intercept) WHIP Runs Saves HR
## 0.3901758173 -0.3152812101 0.0005743166 0.0039398129 -0.0003059341
\[ \hat{WinPct} = 0.39-0.315WHIP+0.00057Runs+0.0039Saves-0.0003HR\\ R^2 = 0.8961 \]
library(leaps)
best = regsubsets(WinPct ~ ., data=MLB2007Standings, nbest=2, method="exhaustive")
with(summary(best), data.frame(rsq, adjr2, cp, rss, outmat))
## rsq adjr2 cp rss LeagueNL BattingAvg
## 1 ( 1 ) 0.4262137 0.4057213 104.8799157 0.054457981
## 1 ( 2 ) 0.3933806 0.3717156 112.3691015 0.057574165
## 2 ( 1 ) 0.8170838 0.8035345 17.7229487 0.017360552
## 2 ( 2 ) 0.8070835 0.7927934 20.0040109 0.018309682
## 3 ( 1 ) 0.8793993 0.8654838 5.5088713 0.011446199
## 3 ( 2 ) 0.8696935 0.8546581 7.7227532 0.012367376
## 4 ( 1 ) 0.9115018 0.8973421 0.1863323 0.008399355 *
## 4 ( 2 ) 0.9085257 0.8938899 0.8651608 0.008681810 *
## 5 ( 1 ) 0.9198007 0.9030925 0.2933631 0.007611707 *
## 5 ( 2 ) 0.9195065 0.9027370 0.3604715 0.007639631 *
## 6 ( 1 ) 0.9281648 0.9094252 0.3855103 0.006817867
## 6 ( 2 ) 0.9278489 0.9090269 0.4575724 0.006847851
## 7 ( 1 ) 0.9346512 0.9138584 0.9059708 0.006202243
## 7 ( 2 ) 0.9342333 0.9133075 1.0012972 0.006241908 *
## 8 ( 1 ) 0.9377173 0.9139906 2.2065989 0.005911241 *
## 8 ( 2 ) 0.9375236 0.9137231 2.2507884 0.005929628 *
## Runs Hits HR Doubles Triples RBI SB OBP SLG ERA HitsAllowed Walks
## 1 ( 1 ) *
## 1 ( 2 )
## 2 ( 1 ) * *
## 2 ( 2 ) * *
## 3 ( 1 ) *
## 3 ( 2 ) *
## 4 ( 1 ) *
## 4 ( 2 ) *
## 5 ( 1 ) * *
## 5 ( 2 ) * *
## 6 ( 1 ) * * * * *
## 6 ( 2 ) * * * *
## 7 ( 1 ) * * * * * *
## 7 ( 2 ) * * * * *
## 8 ( 1 ) * * * * * *
## 8 ( 2 ) * * * * * *
## StrikeOuts Saves WHIP
## 1 ( 1 )
## 1 ( 2 ) *
## 2 ( 1 )
## 2 ( 2 )
## 3 ( 1 ) * *
## 3 ( 2 ) * *
## 4 ( 1 ) * *
## 4 ( 2 ) * *
## 5 ( 1 ) * *
## 5 ( 2 ) * *
## 6 ( 1 ) *
## 6 ( 2 ) * *
## 7 ( 1 ) *
## 7 ( 2 ) *
## 8 ( 1 ) *
## 8 ( 2 ) *
be = lm(WinPct ~ WHIP + Saves + RBI + ERA, data = MLB2007Standings)
summary(be)
##
## Call:
## lm(formula = WinPct ~ WHIP + Saves + RBI + ERA, data = MLB2007Standings)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.034128 -0.012957 -0.003701 0.010918 0.052032
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.479e-01 1.106e-01 4.049 0.000437 ***
## WHIP -2.231e-01 1.314e-01 -1.697 0.102112
## Saves 2.953e-03 8.993e-04 3.284 0.003025 **
## RBI 5.106e-04 5.984e-05 8.533 7.11e-09 ***
## ERA -2.933e-02 2.754e-02 -1.065 0.297052
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02175 on 25 degrees of freedom
## Multiple R-squared: 0.8753, Adjusted R-squared: 0.8554
## F-statistic: 43.89 on 4 and 25 DF, p-value: 5.933e-11
coef(be)
## (Intercept) WHIP Saves RBI ERA
## 0.4478757137 -0.2230633919 0.0029528773 0.0005105964 -0.0293313911
\[ \hat{WinPct} = 0.4479 - 0.223WHIP +0.00295Saves + 0.00051RBI - 0.029ERA\\ R^2 = 0.8753 \]
AIC(reg)
## [1] -144.4368
AIC(reg1)
## [1] -143.4865
AIC(be)
## [1] -138.0115