library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Fetch the OpenIntro MLB 2011 data file once and cache it locally; later runs
# reuse the cached copy instead of downloading it again.
data_file <- "mlb11.RData"
if (!file.exists(data_file)) {
  # .RData is a binary format: request a binary transfer explicitly so the
  # file cannot be damaged by line-ending translation on Windows.
  download.file(
    "http://www.openintro.org/stat/data/mlb11.RData",
    destfile = data_file,
    mode = "wb"
  )
}
# Restore the objects saved in the file (the `mlb11` data frame is used below).
load(data_file)
# Print the teams ordered by ascending number of at-bats.
arrange(mlb11, at_bats)
## team runs at_bats hits homeruns bat_avg strikeouts
## 1 San Diego Padres 593 5417 1284 91 0.237 1320
## 2 Arizona Diamondbacks 731 5421 1357 172 0.250 1249
## 3 Pittsburgh Pirates 610 5421 1325 107 0.244 1308
## 4 Seattle Mariners 556 5421 1263 109 0.233 1280
## 5 Los Angeles Dodgers 644 5436 1395 117 0.257 1087
## 6 Tampa Bay Rays 707 5436 1324 172 0.244 1193
## 7 Washington Nationals 624 5441 1319 154 0.242 1323
## 8 Milwaukee Brewers 721 5447 1422 185 0.261 1083
## 9 Oakland Athletics 645 5452 1330 114 0.244 1094
## 10 San Francisco Giants 570 5486 1327 121 0.242 1122
## 11 Minnesota Twins 619 5487 1357 103 0.247 1048
## 12 Chicago White Sox 654 5502 1387 154 0.252 989
## 13 Florida Marlins 625 5508 1358 149 0.247 1244
## 14 Cleveland Indians 704 5509 1380 154 0.250 1269
## 15 Los Angeles Angels 667 5513 1394 155 0.253 1086
## 16 New York Yankees 867 5518 1452 222 0.263 1138
## 17 Atlanta Braves 641 5528 1345 173 0.243 1260
## 18 St. Louis Cardinals 762 5532 1513 162 0.273 978
## 19 Colorado Rockies 735 5544 1429 163 0.258 1201
## 20 Chicago Cubs 654 5549 1423 148 0.256 1202
## 21 Toronto Blue Jays 743 5559 1384 186 0.249 1184
## 22 Detroit Tigers 787 5563 1540 169 0.277 1143
## 23 Philadelphia Phillies 713 5579 1409 153 0.253 1024
## 24 Baltimore Orioles 708 5585 1434 191 0.257 1120
## 25 Houston Astros 615 5598 1442 95 0.258 1164
## 26 New York Mets 718 5600 1477 108 0.264 1085
## 27 Cincinnati Reds 735 5612 1438 183 0.256 1250
## 28 Texas Rangers 855 5659 1599 210 0.283 930
## 29 Kansas City Royals 730 5672 1560 129 0.275 1006
## 30 Boston Red Sox 875 5710 1600 203 0.280 1108
## stolen_bases wins new_onbase new_slug new_obs
## 1 170 71 0.305 0.349 0.653
## 2 133 94 0.322 0.413 0.736
## 3 108 72 0.309 0.368 0.676
## 4 125 67 0.292 0.348 0.640
## 5 126 82 0.322 0.375 0.697
## 6 155 91 0.322 0.402 0.724
## 7 106 80 0.309 0.383 0.691
## 8 94 96 0.325 0.425 0.750
## 9 117 74 0.311 0.369 0.680
## 10 85 86 0.303 0.368 0.671
## 11 92 63 0.306 0.360 0.666
## 12 81 79 0.319 0.388 0.706
## 13 95 72 0.318 0.388 0.706
## 14 89 80 0.317 0.396 0.714
## 15 135 86 0.313 0.402 0.714
## 16 147 97 0.343 0.444 0.788
## 17 77 89 0.308 0.387 0.695
## 18 57 90 0.341 0.425 0.766
## 19 118 73 0.329 0.410 0.739
## 20 69 71 0.314 0.401 0.715
## 21 131 81 0.317 0.413 0.730
## 22 49 95 0.340 0.434 0.773
## 23 96 102 0.323 0.395 0.717
## 24 81 69 0.316 0.413 0.729
## 25 118 56 0.311 0.374 0.684
## 26 130 77 0.335 0.391 0.725
## 27 97 79 0.326 0.408 0.734
## 28 143 96 0.340 0.460 0.800
## 29 153 71 0.329 0.415 0.744
## 30 102 90 0.349 0.461 0.810
# Predictor (at_bats) goes on the x-axis and the response (runs) on the y-axis,
# matching the runs ~ at_bats model used in the rest of the analysis.
plot(mlb11$at_bats, mlb11$runs)
## Exercise 1
The relationship is somewhat linear.
# Pearson correlation between runs and at-bats.
with(mlb11, cor(runs, at_bats))
## [1] 0.610627
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Enhanced scatterplot from car: predictor (at_bats) first, response (runs)
# second, consistent with the other scatterplot() calls in this report.
scatterplot(mlb11$at_bats, mlb11$runs)
The relationship between the two variables is positive with moderate strength (correlation ≈ 0.61). Some points are spread away from the central regression line.
# Interactive helper that is not defined in this file (presumably restored by
# load("mlb11.RData") -- verify). It asks for two clicked points, then prints
# the resulting line and its "Sum of Squares".
plot_ss(x = mlb11$at_bats, y = mlb11$runs)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -2789.2429 0.6305
##
## Sum of Squares: 123721.9
# Same helper call with showSquares = TRUE -- presumably this also draws the
# squared residuals on the plot; confirm against the helper's definition.
plot_ss(x = mlb11$at_bats, y = mlb11$runs, showSquares = TRUE)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -2789.2429 0.6305
##
## Sum of Squares: 123721.9
For some reason, I do not have the functionality to click on points on the graph to make a line in RStudio.
# Least-squares fit of runs on at-bats, followed by the coefficient table and
# fit statistics.
m1 <- lm(formula = runs ~ at_bats, data = mlb11)
summary(m1)
##
## Call:
## lm(formula = runs ~ at_bats, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -125.58 -47.05 -16.59 54.40 176.87
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2789.2429 853.6957 -3.267 0.002871 **
## at_bats 0.6305 0.1545 4.080 0.000339 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 66.47 on 28 degrees of freedom
## Multiple R-squared: 0.3729, Adjusted R-squared: 0.3505
## F-statistic: 16.65 on 1 and 28 DF, p-value: 0.0003388
# Least-squares fit of runs on home runs, followed by its summary table.
m11 <- lm(formula = runs ~ homeruns, data = mlb11)
summary(m11)
##
## Call:
## lm(formula = runs ~ homeruns, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -91.615 -33.410 3.231 24.292 104.631
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 415.2389 41.6779 9.963 1.04e-10 ***
## homeruns 1.8345 0.2677 6.854 1.90e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.29 on 28 degrees of freedom
## Multiple R-squared: 0.6266, Adjusted R-squared: 0.6132
## F-statistic: 46.98 on 1 and 28 DF, p-value: 1.9e-07
$\hat{y} = 415.2389 + 1.8345 \times \text{homeruns}$
The relationship between y (runs) and homeruns is positive: as home runs increase, y also increases, by about 1.83 runs for each additional home run on average.
# Runs against at-bats, with the fitted m1 regression line overlaid.
plot(mlb11$runs ~ mlb11$at_bats)
abline(reg = m1)
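The manager would predict 727.6861 runs. I do not think we can calculate an exact residual for 5578 at-bats, as we do not have an observed value for this point. (The closest observation, the Philadelphia Phillies with 5579 at-bats and 713 runs, suggests the line overestimates by roughly 15 runs.)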
# Predicted runs at 5578 at-bats, using the rounded intercept and slope printed
# by summary(m1).
0.6305 * 5578 - 2789.2429
## [1] 727.6861
library(dplyr)
# Look for a team with exactly 5578 at-bats.
mlb11 %>% filter(at_bats == 5578)
## [1] team runs at_bats hits homeruns
## [6] bat_avg strikeouts stolen_bases wins new_onbase
## [11] new_slug new_obs
## <0 rows> (or 0-length row.names)
# Residuals of m1 against the predictor, with a dotted horizontal reference
# line at zero.
plot(m1$residuals ~ mlb11$at_bats)
abline(h = 0, lty = "dotted")
I cannot see a pattern in the residuals, which indicates that a linear model is a good fit.
# Distribution of the m1 residuals: histogram first, then a normal
# probability plot with its reference line.
hist(x = m1$residuals)
qqnorm(y = m1$residuals)
qqline(y = m1$residuals)
Yes, the residuals appear to be nearly normally distributed.
The constant variability condition also looks to be met.
# Batting average as predictor of runs: scatterplot, linear fit, summary.
scatterplot(x = mlb11$bat_avg, y = mlb11$runs)
mymod <- lm(formula = runs ~ bat_avg, data = mlb11)
summary(mymod)
##
## Call:
## lm(formula = runs ~ bat_avg, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -94.676 -26.303 -5.496 28.482 131.113
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -642.8 183.1 -3.511 0.00153 **
## bat_avg 5242.2 717.3 7.308 5.88e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49.23 on 28 degrees of freedom
## Multiple R-squared: 0.6561, Adjusted R-squared: 0.6438
## F-statistic: 53.41 on 1 and 28 DF, p-value: 5.877e-08
# Hits as predictor of runs: scatterplot, linear fit, summary.
scatterplot(x = mlb11$hits, y = mlb11$runs)
mymod1 <- lm(formula = runs ~ hits, data = mlb11)
summary(mymod1)
##
## Call:
## lm(formula = runs ~ hits, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -103.718 -27.179 -5.233 19.322 140.693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -375.5600 151.1806 -2.484 0.0192 *
## hits 0.7589 0.1071 7.085 1.04e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.23 on 28 degrees of freedom
## Multiple R-squared: 0.6419, Adjusted R-squared: 0.6292
## F-statistic: 50.2 on 1 and 28 DF, p-value: 1.043e-07
# Home runs as predictor of runs: scatterplot, linear fit, summary.
scatterplot(x = mlb11$homeruns, y = mlb11$runs)
mymod2 <- lm(formula = runs ~ homeruns, data = mlb11)
summary(mymod2)
##
## Call:
## lm(formula = runs ~ homeruns, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -91.615 -33.410 3.231 24.292 104.631
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 415.2389 41.6779 9.963 1.04e-10 ***
## homeruns 1.8345 0.2677 6.854 1.90e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.29 on 28 degrees of freedom
## Multiple R-squared: 0.6266, Adjusted R-squared: 0.6132
## F-statistic: 46.98 on 1 and 28 DF, p-value: 1.9e-07
# Strikeouts as predictor of runs: scatterplot, linear fit, summary.
scatterplot(x = mlb11$strikeouts, y = mlb11$runs)
mymod3 <- lm(formula = runs ~ strikeouts, data = mlb11)
summary(mymod3)
##
## Call:
## lm(formula = runs ~ strikeouts, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -132.27 -46.95 -11.92 55.14 169.76
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1054.7342 151.7890 6.949 1.49e-07 ***
## strikeouts -0.3141 0.1315 -2.389 0.0239 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 76.5 on 28 degrees of freedom
## Multiple R-squared: 0.1694, Adjusted R-squared: 0.1397
## F-statistic: 5.709 on 1 and 28 DF, p-value: 0.02386
# Stolen bases as predictor of runs: scatterplot, linear fit, summary.
scatterplot(x = mlb11$stolen_bases, y = mlb11$runs)
mymod4 <- lm(formula = runs ~ stolen_bases, data = mlb11)
summary(mymod4)
##
## Call:
## lm(formula = runs ~ stolen_bases, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -139.94 -62.87 10.01 38.54 182.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 677.3074 58.9751 11.485 4.17e-12 ***
## stolen_bases 0.1491 0.5211 0.286 0.777
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 83.82 on 28 degrees of freedom
## Multiple R-squared: 0.002914, Adjusted R-squared: -0.0327
## F-statistic: 0.08183 on 1 and 28 DF, p-value: 0.7769
# Wins as predictor of runs: scatterplot, linear fit, summary.
scatterplot(x = mlb11$wins, y = mlb11$runs)
mymod5 <- lm(formula = runs ~ wins, data = mlb11)
summary(mymod5)
##
## Call:
## lm(formula = runs ~ wins, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -145.450 -47.506 -7.482 47.346 142.186
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 342.121 89.223 3.834 0.000654 ***
## wins 4.341 1.092 3.977 0.000447 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67.1 on 28 degrees of freedom
## Multiple R-squared: 0.361, Adjusted R-squared: 0.3381
## F-statistic: 15.82 on 1 and 28 DF, p-value: 0.0004469
# new_onbase as predictor of runs: scatterplot, linear fit, summary.
scatterplot(x = mlb11$new_onbase, y = mlb11$runs)
mymod6 <- lm(formula = runs ~ new_onbase, data = mlb11)
summary(mymod6)
##
## Call:
## lm(formula = runs ~ new_onbase, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.270 -18.335 3.249 19.520 69.002
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1118.4 144.5 -7.741 1.97e-08 ***
## new_onbase 5654.3 450.5 12.552 5.12e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.61 on 28 degrees of freedom
## Multiple R-squared: 0.8491, Adjusted R-squared: 0.8437
## F-statistic: 157.6 on 1 and 28 DF, p-value: 5.116e-13
# new_slug as predictor of runs: scatterplot, linear fit, summary.
scatterplot(x = mlb11$new_slug, y = mlb11$runs)
mymod7 <- lm(formula = runs ~ new_slug, data = mlb11)
summary(mymod7)
##
## Call:
## lm(formula = runs ~ new_slug, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.41 -18.66 -0.91 16.29 52.29
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -375.80 68.71 -5.47 7.70e-06 ***
## new_slug 2681.33 171.83 15.61 2.42e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.96 on 28 degrees of freedom
## Multiple R-squared: 0.8969, Adjusted R-squared: 0.8932
## F-statistic: 243.5 on 1 and 28 DF, p-value: 2.42e-15
# new_obs as predictor of runs: scatterplot, linear fit, summary.
scatterplot(x = mlb11$new_obs, y = mlb11$runs)
mymod8 <- lm(formula = runs ~ new_obs, data = mlb11)
summary(mymod8)
##
## Call:
## lm(formula = runs ~ new_obs, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.456 -13.690 1.165 13.935 41.156
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -686.61 68.93 -9.962 1.05e-10 ***
## new_obs 1919.36 95.70 20.057 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.41 on 28 degrees of freedom
## Multiple R-squared: 0.9349, Adjusted R-squared: 0.9326
## F-statistic: 402.3 on 1 and 28 DF, p-value: < 2.2e-16
# Diagnostics for the runs ~ new_obs model: residual histogram, normal
# probability plot with reference line, and residuals against the predictor
# with a dotted zero line.
hist(x = mymod8$residuals)
qqnorm(y = mymod8$residuals)
qqline(y = mymod8$residuals)
plot(mymod8$residuals ~ mlb11$new_obs)
abline(h = 0, lty = "dotted")