MATH 242: Lab 1

Load data:

##                  team runs at_bats hits homeruns bat_avg strikeouts
## 1       Texas Rangers  855    5659 1599      210   0.283        930
## 2      Boston Red Sox  875    5710 1600      203   0.280       1108
## 3      Detroit Tigers  787    5563 1540      169   0.277       1143
## 4  Kansas City Royals  730    5672 1560      129   0.275       1006
## 5 St. Louis Cardinals  762    5532 1513      162   0.273        978
## 6       New York Mets  718    5600 1477      108   0.264       1085
##   stolen_bases wins new_onbase new_slug new_obs
## 1          143   96      0.340    0.460   0.800
## 2          102   90      0.349    0.461   0.810
## 3           49   95      0.340    0.434   0.773
## 4          153   71      0.329    0.415   0.744
## 5           57   90      0.341    0.425   0.766
## 6          130   77      0.335    0.391   0.725

1:

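You would use a scatter plot to display the relationship between runs and at_bats. With runs on the x-axis and at_bats on the y-axis (as in the code below), the intercept alpha is 5113.35 and the slope beta is about 0.591 (since mean at_bats is 5523.5 and mean runs is 693.6, beta = (5523.5 - 5113.35) / 693.6). The relationship between runs and at bats does look linear. The correlation coefficient is 0.61.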

# Scatter plot with runs on the x-axis and at_bats on the y-axis.
# NOTE(review): the model fitted later (m1) uses at_bats as the predictor of
# runs; here the roles are the other way round -- confirm this is intended.
plot(mlb11$runs, mlb11$at_bats, col = "red")
x <- mlb11$runs
y <- mlb11$at_bats
# Least squares slope and intercept computed by hand from cov() and var().
beta <- cov(x, y) / var(x)
alpha <- mean(y) - beta * mean(x)
abline(a = alpha, b = beta, col = "blue")

# Correlation between the two variables.
cor(mlb11$runs, mlb11$at_bats)

## [1] 0.610627

2:

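The plot from exercise one demonstrates a positive linear relationship that is relatively strong, as the points seem to follow an upward sloping line. For my first hand-picked line, the coefficients are: (Intercept) = -6343.579 and x = 1.272, and the sum of squares is 203465. After plotting a second line to try to minimize the sum of squared residuals, the (Intercept) was -6282.879 and x = 1.261, while the sum of squares now equals 201595.4. Both values are well above the 123721.9 that plot_ss reports below for the least squares line (intercept -2789.2429, slope 0.6305), so neither hand-picked line is the true minimizer.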

# Interactive: click two points to define a line; plot_ss then prints
# coefficients and a sum of squares.
plot_ss(x = mlb11$at_bats, y = mlb11$runs)

## Click two points to make a line.                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##  -2789.2429       0.6305  
## 
## Sum of Squares:  123721.9

# Same interactive plot with showSquares = TRUE
# (presumably draws the squared residuals -- confirm against plot_ss docs).
plot_ss(x = mlb11$at_bats, y = mlb11$runs, showSquares = TRUE)

## Click two points to make a line.                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##  -2789.2429       0.6305  
## 
## Sum of Squares:  123721.9

3:

# Simple linear regression of runs on at_bats, followed by its summary table.
m1 <- lm(formula = runs ~ at_bats, data = mlb11)
summary(m1)

## 
## Call:
## lm(formula = runs ~ at_bats, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -125.58  -47.05  -16.59   54.40  176.87 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2789.2429   853.6957  -3.267 0.002871 ** 
## at_bats         0.6305     0.1545   4.080 0.000339 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66.47 on 28 degrees of freedom
## Multiple R-squared:  0.3729, Adjusted R-squared:  0.3505 
## F-statistic: 16.65 on 1 and 28 DF,  p-value: 0.0003388

# Predictor (at_bats) and response (runs) kept in x and y.
x <- mlb11$at_bats
y <- mlb11$runs
# Interactive line-picking plot for the same pair of variables.
plot_ss(x = mlb11$at_bats, y = mlb11$runs)

## Click two points to make a line.                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##  -2789.2429       0.6305  
## 
## Sum of Squares:  123721.9

# Repeat the interactive plot with showSquares = TRUE.
plot_ss(
  x = mlb11$at_bats,
  y = mlb11$runs,
  showSquares = TRUE
)

## Click two points to make a line.                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##  -2789.2429       0.6305  
## 
## Sum of Squares:  123721.9

The least squares regression line is runs = -2789.2429 + 0.6305 * at_bats. Multiple R-squared is 0.3729 (at_bats explains about 37.3% of the variability in runs), and adjusted R-squared is 0.3505. The smallest sum of squares we got was 123721.9.

4:

# Scatter plot of runs against at_bats with the fitted line from m1 overlaid.
plot(mlb11$runs ~ mlb11$at_bats)
abline(m1)

# Print the whole data set; wrapping it in list() makes it print under [[1]].
list(mlb11)

## [[1]]
##                     team runs at_bats hits homeruns bat_avg strikeouts
## 1          Texas Rangers  855    5659 1599      210   0.283        930
## 2         Boston Red Sox  875    5710 1600      203   0.280       1108
## 3         Detroit Tigers  787    5563 1540      169   0.277       1143
## 4     Kansas City Royals  730    5672 1560      129   0.275       1006
## 5    St. Louis Cardinals  762    5532 1513      162   0.273        978
## 6          New York Mets  718    5600 1477      108   0.264       1085
## 7       New York Yankees  867    5518 1452      222   0.263       1138
## 8      Milwaukee Brewers  721    5447 1422      185   0.261       1083
## 9       Colorado Rockies  735    5544 1429      163   0.258       1201
## 10        Houston Astros  615    5598 1442       95   0.258       1164
## 11     Baltimore Orioles  708    5585 1434      191   0.257       1120
## 12   Los Angeles Dodgers  644    5436 1395      117   0.257       1087
## 13          Chicago Cubs  654    5549 1423      148   0.256       1202
## 14       Cincinnati Reds  735    5612 1438      183   0.256       1250
## 15    Los Angeles Angels  667    5513 1394      155   0.253       1086
## 16 Philadelphia Phillies  713    5579 1409      153   0.253       1024
## 17     Chicago White Sox  654    5502 1387      154   0.252        989
## 18     Cleveland Indians  704    5509 1380      154   0.250       1269
## 19  Arizona Diamondbacks  731    5421 1357      172   0.250       1249
## 20     Toronto Blue Jays  743    5559 1384      186   0.249       1184
## 21       Minnesota Twins  619    5487 1357      103   0.247       1048
## 22       Florida Marlins  625    5508 1358      149   0.247       1244
## 23    Pittsburgh Pirates  610    5421 1325      107   0.244       1308
## 24     Oakland Athletics  645    5452 1330      114   0.244       1094
## 25        Tampa Bay Rays  707    5436 1324      172   0.244       1193
## 26        Atlanta Braves  641    5528 1345      173   0.243       1260
## 27  Washington Nationals  624    5441 1319      154   0.242       1323
## 28  San Francisco Giants  570    5486 1327      121   0.242       1122
## 29      San Diego Padres  593    5417 1284       91   0.237       1320
## 30      Seattle Mariners  556    5421 1263      109   0.233       1280
##    stolen_bases wins new_onbase new_slug new_obs
## 1           143   96      0.340    0.460   0.800
## 2           102   90      0.349    0.461   0.810
## 3            49   95      0.340    0.434   0.773
## 4           153   71      0.329    0.415   0.744
## 5            57   90      0.341    0.425   0.766
## 6           130   77      0.335    0.391   0.725
## 7           147   97      0.343    0.444   0.788
## 8            94   96      0.325    0.425   0.750
## 9           118   73      0.329    0.410   0.739
## 10          118   56      0.311    0.374   0.684
## 11           81   69      0.316    0.413   0.729
## 12          126   82      0.322    0.375   0.697
## 13           69   71      0.314    0.401   0.715
## 14           97   79      0.326    0.408   0.734
## 15          135   86      0.313    0.402   0.714
## 16           96  102      0.323    0.395   0.717
## 17           81   79      0.319    0.388   0.706
## 18           89   80      0.317    0.396   0.714
## 19          133   94      0.322    0.413   0.736
## 20          131   81      0.317    0.413   0.730
## 21           92   63      0.306    0.360   0.666
## 22           95   72      0.318    0.388   0.706
## 23          108   72      0.309    0.368   0.676
## 24          117   74      0.311    0.369   0.680
## 25          155   91      0.322    0.402   0.724
## 26           77   89      0.308    0.387   0.695
## 27          106   80      0.309    0.383   0.691
## 28           85   86      0.303    0.368   0.671
## 29          170   71      0.305    0.349   0.653
## 30          125   67      0.292    0.348   0.640

5:

Based on the least squares regression line equation, a team with 5579 at bats would be predicted to score 728.3166 runs. Because the actual value was 713, the least squares regression line overestimates by about 15.32 runs (residual = 713 - 728.32 = -15.32).

# Residuals of m1 against the predictor.
plot(m1$residuals ~ mlb11$at_bats)
# Horizontal reference line at zero (line type 3).
abline(h = 0, lty = 3)

6:

Because the residuals are relatively evenly scattered above and below the horizontal line at h = 0, with no apparent pattern, a linear model is appropriate for describing the relationship between runs and at bats.

# Histogram of the residuals of m1.
hist(m1$residuals)

# Residuals of m1 plotted in the order they are stored in the model object.
plot(m1$residuals)

7:

Yes. Because the histogram is relatively symmetric and unimodal, it can be assumed that the nearly normal residuals condition is satisfied in this case.

# Normal Q-Q plot of the residuals of m1, with a reference line added.
qqnorm(residuals(m1))
qqline(residuals(m1))

8:

Yes, the constant variability condition appears to have been met here because the spread of the residuals around the line h = 0 does not increase or decrease across the plot. The points are scattered relatively equally around the regression line.

# Read the advertising data and show its first rows.
# The data frame is not attach()ed: every later use refers to columns through
# ad$... or a data = argument, and attaching would put TV, Radio, Sales, ...
# on the search path where they can mask or be masked by other objects.
# header is spelled out in full rather than relying on partial matching.
ad <- read.csv(
  "http://foxweb.marist.edu/users/duy.nguyen2/Advertising.csv",
  header = TRUE
)
head(ad)

##   X    TV Radio Newspaper Sales
## 1 1 230.1  37.8      69.2  22.1
## 2 2  44.5  39.3      45.1  10.4
## 3 3  17.2  45.9      69.3   9.3
## 4 4 151.5  41.3      58.5  18.5
## 5 5 180.8  10.8      58.4  12.9
## 6 6   8.7  48.9      75.0   7.2

# Sales against each advertising channel, one scatter plot per channel.
plot(ad$TV, ad$Sales, col = "magenta")

plot(ad$Radio, ad$Sales, col = "gold")

plot(ad$Newspaper, ad$Sales, col = "navy")

On your own

1: Plotted above^

2: There seems to be a linear relationship between TV and Sales.

# Scatter plot of Sales against TV with the least squares line drawn on top.
plot(ad$TV, ad$Sales, col = "magenta")
x <- ad$TV
y <- ad$Sales
# Slope and intercept from cov() and var(), then the means.
beta <- cov(x, y) / var(x)
alpha <- mean(y) - beta * mean(x)
abline(a = alpha, b = beta, col = "maroon")

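3: H0: beta = 0; Ha: beta != 0. Since the p-value for the TV slope is below 0.05 (p < 2e-16 in the summary below), there is evidence to reject the null hypothesis and conclude that beta is not 0.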

# Regress Sales on TV and show the fitted intercept and slope.
m6 <- lm(formula = Sales ~ TV, data = ad)
coef(m6)

## (Intercept)          TV 
##  7.03259355  0.04753664

# Overlay the least squares line on the TV vs Sales scatter plot.
# Slope and intercept are computed from the data rather than typed in by hand,
# so they cannot drift from the fitted coefficients of m6 through a typo.
plot(ad$TV, ad$Sales, col = "navy")
x <- ad$TV
y <- ad$Sales
beta <- cov(x, y) / var(x)
alpha <- mean(y) - beta * mean(x)
abline(a = alpha, b = beta, col = "magenta")

# Full regression table for m6 (standard errors, t values, p-values, R-squared).
summary(m6)

## 
## Call:
## lm(formula = Sales ~ TV, data = ad)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3860 -1.9545 -0.1913  2.0671  7.2124 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.032594   0.457843   15.36   <2e-16 ***
## TV          0.047537   0.002691   17.67   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.259 on 198 degrees of freedom
## Multiple R-squared:  0.6119, Adjusted R-squared:  0.6099 
## F-statistic: 312.1 on 1 and 198 DF,  p-value: < 2.2e-16

4: The histogram of the residuals is approximately normal, and the residual scatter plot displays no pattern. Thus, the data indicate that there is a linear relationship between Sales and TV ads.

# Refit Sales on TV using the columns directly, then inspect the residuals.
m2 <- lm(formula = ad$Sales ~ ad$TV)
# Residuals in the order they are stored in the model object.
plot(m2$residuals)

# Distribution of the residuals.
hist(m2$residuals)

#### 5: RMSE=3.317 (test set) R-squared: 0.6292 (training fit)

set.seed(100)
# Random 50/50 split of the rows into a training and a test set.
index <- sample(seq_len(nrow(ad)), size = 0.5 * nrow(ad))
train <- ad[index, ]
test <- ad[-index, ]
m8 <- lm(Sales ~ TV, data = train)
predsales <- predict(m8, newdata = test)
# Test-set mean squared error. mean() averages over the test rows; dividing
# the sum by length(test) would divide by the number of columns of the data
# frame instead of the number of observations.
mse <- mean((test$Sales - predsales)^2)
round(mse, 3)

## [1] 11.001

# Root mean squared error, in the same units as Sales.
round(sqrt(mse), 3)

## [1] 3.317

summary(m8)

## 
## Call:
## lm(formula = Sales ~ TV, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3083 -1.5591 -0.1113  1.9412  6.8822 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.019262   0.645899   10.87   <2e-16 ***
## TV          0.047304   0.003668   12.89   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.199 on 98 degrees of freedom
## Multiple R-squared:  0.6292, Adjusted R-squared:  0.6254 
## F-statistic: 166.3 on 1 and 98 DF,  p-value: < 2.2e-16