Load data:
## team runs at_bats hits homeruns bat_avg strikeouts
## 1 Texas Rangers 855 5659 1599 210 0.283 930
## 2 Boston Red Sox 875 5710 1600 203 0.280 1108
## 3 Detroit Tigers 787 5563 1540 169 0.277 1143
## 4 Kansas City Royals 730 5672 1560 129 0.275 1006
## 5 St. Louis Cardinals 762 5532 1513 162 0.273 978
## 6 New York Mets 718 5600 1477 108 0.264 1085
## stolen_bases wins new_onbase new_slug new_obs
## 1 143 96 0.340 0.460 0.800
## 2 102 90 0.349 0.461 0.810
## 3 49 95 0.340 0.434 0.773
## 4 153 71 0.329 0.415 0.744
## 5 57 90 0.341 0.425 0.766
## 6 130 77 0.335 0.391 0.725
You would use a scatter plot to display the relationship between runs and at_bats. Regressing at_bats (y) on runs (x), as the code below does, gives an intercept (alpha) of 5113.35 and a slope (beta) of 0.760. The relationship between runs and at bats does look linear. The correlation coefficient is 0.61.
# Scatter plot with runs on the horizontal axis and at-bats on the vertical.
plot(mlb11$runs, mlb11$at_bats, col = "red")

# Closed-form simple linear regression of y on x:
#   slope     = cov(x, y) / var(x)
#   intercept = mean(y) - slope * mean(x)
x <- mlb11$runs
y <- mlb11$at_bats
beta <- cov(x, y) / var(x)
alpha <- mean(y) - beta * mean(x)

# Overlay the fitted line (intercept alpha, slope beta) on the scatter plot.
abline(a = alpha, b = beta, col = "blue")

# Correlation between the two variables (printed).
cor(mlb11$runs, mlb11$at_bats)
## [1] 0.610627
The plot from exercise one demonstrates a positive linear relationship that is relatively strong, as the points seem to follow an upward-sloping line. My coefficients are: (Intercept) = -6343.579 and x = 1.272. The sum of squares is 203465. After plotting the line that minimizes the sum of squared residuals, the (Intercept) was -6282.879 and x = 1.261, while the sum of squares now equals 201595.4.
# plot_ss() is defined outside this excerpt; per the output recorded below it
# reports the fitted line's coefficients and its sum of squared residuals.
plot_ss(x = mlb11$at_bats, y = mlb11$runs)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -2789.2429 0.6305
##
## Sum of Squares: 123721.9
# Same call as before, with showSquares = TRUE passed to plot_ss().
plot_ss(
  x = mlb11$at_bats,
  y = mlb11$runs,
  showSquares = TRUE
)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -2789.2429 0.6305
##
## Sum of Squares: 123721.9
# Least-squares fit of runs on at-bats; m1 is reused by the later
# regression-line and residual plots.
m1 <- lm(formula = runs ~ at_bats, data = mlb11)

# Coefficient table, residual standard error and R-squared for the fit.
summary(m1)
##
## Call:
## lm(formula = runs ~ at_bats, data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -125.58 -47.05 -16.59 54.40 176.87
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2789.2429 853.6957 -3.267 0.002871 **
## at_bats 0.6305 0.1545 4.080 0.000339 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 66.47 on 28 degrees of freedom
## Multiple R-squared: 0.3729, Adjusted R-squared: 0.3505
## F-statistic: 16.65 on 1 and 28 DF, p-value: 0.0003388
# Keep the predictor and response in x and y.
x <- mlb11$at_bats
y <- mlb11$runs

# plot_ss() is defined outside this excerpt; the columns are passed directly.
plot_ss(x = mlb11$at_bats, y = mlb11$runs)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -2789.2429 0.6305
##
## Sum of Squares: 123721.9
# Same call again, with showSquares = TRUE passed to plot_ss().
plot_ss(
  x = mlb11$at_bats,
  y = mlb11$runs,
  showSquares = TRUE
)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## -2789.2429 0.6305
##
## Sum of Squares: 123721.9
The least squares regression line is $\widehat{\text{runs}} = -2789.2429 + 0.6305 \times \text{at\_bats}$. Multiple R-squared is 0.3729 (about 37.3% of the variability in runs is explained by at-bats), and adjusted R-squared is 0.3505. The smallest sum of squares we got was 123721.9.
# Scatter plot of runs against at-bats (formula interface).
plot(mlb11$runs ~ mlb11$at_bats)

# Overlay the regression line stored in m1.
abline(reg = m1)

# Print the full data set; list() wraps it, so the output appears under [[1]].
list(mlb11)
## [[1]]
## team runs at_bats hits homeruns bat_avg strikeouts
## 1 Texas Rangers 855 5659 1599 210 0.283 930
## 2 Boston Red Sox 875 5710 1600 203 0.280 1108
## 3 Detroit Tigers 787 5563 1540 169 0.277 1143
## 4 Kansas City Royals 730 5672 1560 129 0.275 1006
## 5 St. Louis Cardinals 762 5532 1513 162 0.273 978
## 6 New York Mets 718 5600 1477 108 0.264 1085
## 7 New York Yankees 867 5518 1452 222 0.263 1138
## 8 Milwaukee Brewers 721 5447 1422 185 0.261 1083
## 9 Colorado Rockies 735 5544 1429 163 0.258 1201
## 10 Houston Astros 615 5598 1442 95 0.258 1164
## 11 Baltimore Orioles 708 5585 1434 191 0.257 1120
## 12 Los Angeles Dodgers 644 5436 1395 117 0.257 1087
## 13 Chicago Cubs 654 5549 1423 148 0.256 1202
## 14 Cincinnati Reds 735 5612 1438 183 0.256 1250
## 15 Los Angeles Angels 667 5513 1394 155 0.253 1086
## 16 Philadelphia Phillies 713 5579 1409 153 0.253 1024
## 17 Chicago White Sox 654 5502 1387 154 0.252 989
## 18 Cleveland Indians 704 5509 1380 154 0.250 1269
## 19 Arizona Diamondbacks 731 5421 1357 172 0.250 1249
## 20 Toronto Blue Jays 743 5559 1384 186 0.249 1184
## 21 Minnesota Twins 619 5487 1357 103 0.247 1048
## 22 Florida Marlins 625 5508 1358 149 0.247 1244
## 23 Pittsburgh Pirates 610 5421 1325 107 0.244 1308
## 24 Oakland Athletics 645 5452 1330 114 0.244 1094
## 25 Tampa Bay Rays 707 5436 1324 172 0.244 1193
## 26 Atlanta Braves 641 5528 1345 173 0.243 1260
## 27 Washington Nationals 624 5441 1319 154 0.242 1323
## 28 San Francisco Giants 570 5486 1327 121 0.242 1122
## 29 San Diego Padres 593 5417 1284 91 0.237 1320
## 30 Seattle Mariners 556 5421 1263 109 0.233 1280
## stolen_bases wins new_onbase new_slug new_obs
## 1 143 96 0.340 0.460 0.800
## 2 102 90 0.349 0.461 0.810
## 3 49 95 0.340 0.434 0.773
## 4 153 71 0.329 0.415 0.744
## 5 57 90 0.341 0.425 0.766
## 6 130 77 0.335 0.391 0.725
## 7 147 97 0.343 0.444 0.788
## 8 94 96 0.325 0.425 0.750
## 9 118 73 0.329 0.410 0.739
## 10 118 56 0.311 0.374 0.684
## 11 81 69 0.316 0.413 0.729
## 12 126 82 0.322 0.375 0.697
## 13 69 71 0.314 0.401 0.715
## 14 97 79 0.326 0.408 0.734
## 15 135 86 0.313 0.402 0.714
## 16 96 102 0.323 0.395 0.717
## 17 81 79 0.319 0.388 0.706
## 18 89 80 0.317 0.396 0.714
## 19 133 94 0.322 0.413 0.736
## 20 131 81 0.317 0.413 0.730
## 21 92 63 0.306 0.360 0.666
## 22 95 72 0.318 0.388 0.706
## 23 108 72 0.309 0.368 0.676
## 24 117 74 0.311 0.369 0.680
## 25 155 91 0.322 0.402 0.724
## 26 77 89 0.308 0.387 0.695
## 27 106 80 0.309 0.383 0.691
## 28 85 86 0.303 0.368 0.671
## 29 170 71 0.305 0.349 0.653
## 30 125 67 0.292 0.348 0.640
Based on the least squares regression line equation, a team with 5579 at bats would be predicted to score 728.3166 runs. Because the actual value was 713, the least squares regression line overestimates by about 15.32 runs.
# Residuals of m1 plotted against the predictor.
plot(m1$residuals ~ mlb11$at_bats)

# Dotted horizontal reference line at zero residual.
abline(h = 0, lty = "dotted")
Because the residuals are relatively evenly scattered above and below the horizontal line at 0, a linear model was an appropriate choice for describing the relationship between runs and at bats.
# Histogram of the m1 residuals.
hist(x = m1$residuals)

# Residuals plotted against their observation index.
plot(x = m1$residuals)
Yes, because the histogram is relatively symmetric and unimodal, it can be assumed that the nearly normal residuals condition is satisfied in this case.
# Normal Q-Q plot of the m1 residuals, followed by its reference line.
qqnorm(y = m1$residuals)
qqline(y = m1$residuals)
Yes, the constant variability condition appears to have been met here because the spread of the residuals does not increase or decrease around the line h = 0. The points are scattered relatively equally around the regression line (possibly change).
# Download the advertising data set; the first row holds the column names.
# `header` is spelled out in full rather than relying on partial argument
# matching of `head`.
ad <- read.csv(
  "http://foxweb.marist.edu/users/duy.nguyen2/Advertising.csv",
  header = TRUE
)

# NOTE(review): attach() is kept only so that any code relying on bare column
# names keeps working; the code visible here uses `ad$...` or `data = ad`.
# Prefer removing it once that is confirmed.
attach(ad)

# Preview the first rows.
head(ad)
## X TV Radio Newspaper Sales
## 1 1 230.1 37.8 69.2 22.1
## 2 2 44.5 39.3 45.1 10.4
## 3 3 17.2 45.9 69.3 9.3
## 4 4 151.5 41.3 58.5 18.5
## 5 5 180.8 10.8 58.4 12.9
## 6 6 8.7 48.9 75.0 7.2
# Sales against each advertising channel, one scatter plot per channel.
plot(ad$TV, ad$Sales, col = "magenta")
plot(ad$Radio, ad$Sales, col = "gold")
plot(ad$Newspaper, ad$Sales, col = "navy")
# Sales against TV spend.
plot(ad$TV, ad$Sales, col = "magenta")

# Closed-form least-squares slope and intercept of Sales on TV.
x <- ad$TV
y <- ad$Sales
beta <- cov(x, y) / var(x)
alpha <- mean(y) - beta * mean(x)

# Draw the fitted line over the scatter plot.
abline(a = alpha, b = beta, col = "maroon")
# Fit Sales on TV with lm().
m6 <- lm(formula = Sales ~ TV, data = ad)

# Print the fitted intercept and slope.
m6$coefficients
## (Intercept) TV
## 7.03259355 0.04753664
# Redraw the Sales-vs-TV scatter plot and overlay the least-squares line.
#
# The slope and intercept are computed from the data instead of being retyped
# by hand: the hand-copied slope 0.0473664 did not match the fitted
# 0.04753664, and both hand-copied values were overwritten before being used.
plot(ad$TV, ad$Sales, col = "navy")
x <- ad$TV
y <- ad$Sales
beta <- cov(x, y) / var(x)
alpha <- mean(y) - beta * mean(x)
abline(a = alpha, b = beta, col = "magenta")
# Full regression summary for the Sales ~ TV model.
summary(object = m6)
##
## Call:
## lm(formula = Sales ~ TV, data = ad)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3860 -1.9545 -0.1913 2.0671 7.2124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.032594 0.457843 15.36 <2e-16 ***
## TV 0.047537 0.002691 17.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.259 on 198 degrees of freedom
## Multiple R-squared: 0.6119, Adjusted R-squared: 0.6099
## F-statistic: 312.1 on 1 and 198 DF, p-value: < 2.2e-16
# Fit Sales on TV for residual diagnostics. The formula names the columns and
# takes them from `data`, rather than embedding `ad$...` in the formula, so
# the model can be used with predict(newdata = ...).
m2 <- lm(Sales ~ TV, data = ad)

# Residuals against observation index, then their histogram.
plot(m2$residuals)
hist(m2$residuals)
#### 5: RMSE = 3.317 (approx., test set) R-squared: 0.6292
# Fixed seed so the random train/test split is reproducible.
set.seed(100)

# Randomly assign half of the rows to training; the rest form the test set.
index <- sample(seq_len(nrow(ad)), size = .5 * nrow(ad))
train <- ad[index, ]
test <- ad[-index, ]

# Fit on the training half, then predict Sales for the held-out half.
m8 <- lm(Sales ~ TV, data = train)
predsales <- predict(m8, newdata = test)

# Test-set mean squared error. The divisor must be the number of test ROWS:
# length() of a data frame is its number of columns (5 here), which gave the
# earlier, inflated values 220.0226 (MSE) and 14.83316 (RMSE).
test_mse <- sum((test$Sales - predsales)^2) / nrow(test)
test_mse
# Expected: about 11.00 (220.0226 * 5 / 100).
sqrt(test_mse)
# Expected: about 3.317.
summary(m8)
##
## Call:
## lm(formula = Sales ~ TV, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3083 -1.5591 -0.1113 1.9412 6.8822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.019262 0.645899 10.87 <2e-16 ***
## TV 0.047304 0.003668 12.89 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.199 on 98 degrees of freedom
## Multiple R-squared: 0.6292, Adjusted R-squared: 0.6254
## F-statistic: 166.3 on 1 and 98 DF, p-value: < 2.2e-16