#Lab 9
#Zhivko Kolevski T6

download.file("http://www.openintro.org/stat/data/mlb11.RData", destfile = "mlb11.RData")
load("mlb11.RData")
cor(mlb11$runs, mlb11$at_bats)
## [1] 0.610627
plot_ss(x = mlb11$at_bats, y = mlb11$runs, showSquares = TRUE)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##  -2789.2429       0.6305  
## 
## Sum of Squares:  123721.9
m1 <- lm(runs ~ at_bats, data = mlb11)
summary(m1)
## 
## Call:
## lm(formula = runs ~ at_bats, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -125.58  -47.05  -16.59   54.40  176.87 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2789.2429   853.6957  -3.267 0.002871 ** 
## at_bats         0.6305     0.1545   4.080 0.000339 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66.47 on 28 degrees of freedom
## Multiple R-squared:  0.3729, Adjusted R-squared:  0.3505 
## F-statistic: 16.65 on 1 and 28 DF,  p-value: 0.0003388
plot(mlb11$runs ~ mlb11$at_bats)
abline(m1)

plot(m1$residuals ~ mlb11$at_bats)
abline(h = 0, lty = 3)

#On your own

#1. Choose another traditional variable from mlb11 that you think might be a good predictor of runs. Produce a scatterplot of the two variables and fit a linear model. At a glance, does there seem to be a linear relationship?
plot(mlb11$bat_avg,mlb11$runs,xlab="Batting Average",ylab="Runs",main="Batting Average Vs Runs")
#I see a positive linear relationship.

#2. How does this relationship compare to the relationship between runs and at_bats? Use the R2 values from the two model summaries to compare. Does your variable seem to predict runs better than at_bats? How can you tell?
par(mfrow = c(2,2), oma = c(0,0,2,0))

plot(mlb11$bat_avg,mlb11$runs,xlab="Batting Average",ylab="Runs",main="Batting Average Vs Runs")
A = lm(runs ~ bat_avg, data = mlb11)
summary(A)
## 
## Call:
## lm(formula = runs ~ bat_avg, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -94.676 -26.303  -5.496  28.482 131.113 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -642.8      183.1  -3.511  0.00153 ** 
## bat_avg       5242.2      717.3   7.308 5.88e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49.23 on 28 degrees of freedom
## Multiple R-squared:  0.6561, Adjusted R-squared:  0.6438 
## F-statistic: 53.41 on 1 and 28 DF,  p-value: 5.877e-08
plot(mlb11$runs ~ mlb11$at_bats,xlab="At Bat Average",ylab="Runs",main="At Bat Average Vs Runs")
B= lm(runs ~ at_bats, data = mlb11)
summary(B)
## 
## Call:
## lm(formula = runs ~ at_bats, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -125.58  -47.05  -16.59   54.40  176.87 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2789.2429   853.6957  -3.267 0.002871 ** 
## at_bats         0.6305     0.1545   4.080 0.000339 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66.47 on 28 degrees of freedom
## Multiple R-squared:  0.3729, Adjusted R-squared:  0.3505 
## F-statistic: 16.65 on 1 and 28 DF,  p-value: 0.0003388
#Batting average has tighter clusters. If batting R^2 value is greater than the bat average it is better for prediction runs. Batting Average R^2 = 0.6561. At Bat Average R^2 = 0.3729.

#3. Now that you can summarize the linear relationship between two variables, investigate the relationships between runs and each of the other five traditional variables. Which variable best predicts runs? Support your conclusion using the graphical and numerical methods we've discussed (for the sake of conciseness, only include output for the best variable, not all five).
par(mfrow = c(2,3), oma = c(0,0,2,0))

#A
C=lm(runs~bat_avg,data=mlb11)
plot(mlb11$bat_avg,mlb11$runs,xlab="Batting Average",ylab="Runs",main="Batting Average Vs Runs")
abline(C)
summary(C)
## 
## Call:
## lm(formula = runs ~ bat_avg, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -94.676 -26.303  -5.496  28.482 131.113 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -642.8      183.1  -3.511  0.00153 ** 
## bat_avg       5242.2      717.3   7.308 5.88e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49.23 on 28 degrees of freedom
## Multiple R-squared:  0.6561, Adjusted R-squared:  0.6438 
## F-statistic: 53.41 on 1 and 28 DF,  p-value: 5.877e-08
sum(C$residuals^2)
## [1] 67849.52
#R^2 = 0.6561
#Sum of Squares = 67849.52
#B
D=lm(runs~new_obs,data=mlb11)
plot(mlb11$new_obs,mlb11$runs,xlab="new_obs",ylab="Runs",main="new_obs Vs Runs")
abline(D)     
summary(D)
## 
## Call:
## lm(formula = runs ~ new_obs, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.456 -13.690   1.165  13.935  41.156 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -686.61      68.93  -9.962 1.05e-10 ***
## new_obs      1919.36      95.70  20.057  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.41 on 28 degrees of freedom
## Multiple R-squared:  0.9349, Adjusted R-squared:  0.9326 
## F-statistic: 402.3 on 1 and 28 DF,  p-value: < 2.2e-16
sum(D$residuals^2)
## [1] 12837.65
#R^2 = 0.9349
#Sum of Squares = 12837.65
#C
E=lm(runs~new_onbase,data=mlb11)
plot(mlb11$new_onbase,mlb11$runs,xlab="new_onbase",ylab="Runs",main="new_onbase Vs Runs")
abline(E)
summary(E)
## 
## Call:
## lm(formula = runs ~ new_onbase, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.270 -18.335   3.249  19.520  69.002 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1118.4      144.5  -7.741 1.97e-08 ***
## new_onbase    5654.3      450.5  12.552 5.12e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.61 on 28 degrees of freedom
## Multiple R-squared:  0.8491, Adjusted R-squared:  0.8437 
## F-statistic: 157.6 on 1 and 28 DF,  p-value: 5.116e-13
sum(E$residuals^2)
## [1] 29768.7
#R^2 = 0.8491
#Sum of Squares = 29768.7
#D
F=lm(runs~new_slug,data=mlb11)
plot(mlb11$new_slug,mlb11$runs,xlab="new_slug",ylab="Runs",main="new_slug Vs Runs")
abline(F)
summary(F)
## 
## Call:
## lm(formula = runs ~ new_slug, data = mlb11)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -45.41 -18.66  -0.91  16.29  52.29 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -375.80      68.71   -5.47 7.70e-06 ***
## new_slug     2681.33     171.83   15.61 2.42e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.96 on 28 degrees of freedom
## Multiple R-squared:  0.8969, Adjusted R-squared:  0.8932 
## F-statistic: 243.5 on 1 and 28 DF,  p-value: 2.42e-15
sum(F$residuals^2)
## [1] 20345.54
#R^2 = 0.8969
#Sum of Squares = 20345.54
#E
G=lm(runs~at_bats,data=mlb11)
plot(mlb11$runs ~ mlb11$at_bats,xlab="At Bat Average",ylab="Runs",main="At Bat Average Vs Runs")
abline(G)
summary(G)
## 
## Call:
## lm(formula = runs ~ at_bats, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -125.58  -47.05  -16.59   54.40  176.87 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2789.2429   853.6957  -3.267 0.002871 ** 
## at_bats         0.6305     0.1545   4.080 0.000339 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66.47 on 28 degrees of freedom
## Multiple R-squared:  0.3729, Adjusted R-squared:  0.3505 
## F-statistic: 16.65 on 1 and 28 DF,  p-value: 0.0003388
sum(G$residuals^2)
## [1] 123721.9
#R^2 =  0.3729
#Sum of Squares = 123721.9
#F
D=lm(runs~new_obs,data=mlb11)
plot(mlb11$stolen_bases,mlb11$runs,xlab="stolen_bases",ylab="Runs",main="stolen_bases Vs Runs")
abline(D)     

summary(D)
## 
## Call:
## lm(formula = runs ~ new_obs, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.456 -13.690   1.165  13.935  41.156 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -686.61      68.93  -9.962 1.05e-10 ***
## new_obs      1919.36      95.70  20.057  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.41 on 28 degrees of freedom
## Multiple R-squared:  0.9349, Adjusted R-squared:  0.9326 
## F-statistic: 402.3 on 1 and 28 DF,  p-value: < 2.2e-16
sum(D$residuals^2)
## [1] 12837.65
#new_obs is the best predictor because it has the highest R^2 = 0.9349 and lowest Sum of Squares = 12837.65

#4. Now examine the three newer variables. These are the statistics used by the author of Moneyball to predict a teams success. In general, are they more or less effective at predicting runs that the old variables? Explain using appropriate graphical and numerical evidence. Of all ten variables we've analyzed, which seems to be the best predictor of runs? Using the limited (or not so limited) information you know about these baseball statistics, does your result make sense?
par(mfrow = c(2,2), oma = c(0,0,2,0))
D =lm(runs~new_obs,data=mlb11)
plot(mlb11$new_obs,mlb11$runs,xlab="new_obs",ylab="Runs",main="new_obs Vs Runs")
abline(D)     
summary(D)
## 
## Call:
## lm(formula = runs ~ new_obs, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.456 -13.690   1.165  13.935  41.156 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -686.61      68.93  -9.962 1.05e-10 ***
## new_obs      1919.36      95.70  20.057  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.41 on 28 degrees of freedom
## Multiple R-squared:  0.9349, Adjusted R-squared:  0.9326 
## F-statistic: 402.3 on 1 and 28 DF,  p-value: < 2.2e-16
sum(D$residuals^2)
## [1] 12837.65
#R^2 = 0.9349
#Sum of Squares = 12837.65

E=lm(runs~new_onbase,data=mlb11)
plot(mlb11$new_onbase,mlb11$runs,xlab="new_onbase",ylab="Runs",main="new_onbase Vs Runs")
abline(E)
summary(E)
## 
## Call:
## lm(formula = runs ~ new_onbase, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.270 -18.335   3.249  19.520  69.002 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1118.4      144.5  -7.741 1.97e-08 ***
## new_onbase    5654.3      450.5  12.552 5.12e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.61 on 28 degrees of freedom
## Multiple R-squared:  0.8491, Adjusted R-squared:  0.8437 
## F-statistic: 157.6 on 1 and 28 DF,  p-value: 5.116e-13
sum(E$residuals^2)
## [1] 29768.7
#R^2 = 0.8491
#Sum of Squares = 29768.7

F=lm(runs~new_slug,data=mlb11)
plot(mlb11$new_slug,mlb11$runs,xlab="new_slug",ylab="Runs",main="new_slug Vs Runs")
abline(F)
summary(F)
## 
## Call:
## lm(formula = runs ~ new_slug, data = mlb11)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -45.41 -18.66  -0.91  16.29  52.29 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -375.80      68.71   -5.47 7.70e-06 ***
## new_slug     2681.33     171.83   15.61 2.42e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.96 on 28 degrees of freedom
## Multiple R-squared:  0.8969, Adjusted R-squared:  0.8932 
## F-statistic: 243.5 on 1 and 28 DF,  p-value: 2.42e-15
sum(F$residuals^2)
## [1] 20345.54
#R^2 = 0.8969
#Sum of Squares = 20345.54

# These new variables predict much better The R^2 values are higher and the sum and squares values are smaller. This makes sense because this is a more improved way of predicticting.

#5. Check the model diagnostics for the regression model with the variable you decided was the best predictor for runs.
summary(D)
## 
## Call:
## lm(formula = runs ~ new_obs, data = mlb11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.456 -13.690   1.165  13.935  41.156 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -686.61      68.93  -9.962 1.05e-10 ***
## new_obs      1919.36      95.70  20.057  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.41 on 28 degrees of freedom
## Multiple R-squared:  0.9349, Adjusted R-squared:  0.9326 
## F-statistic: 402.3 on 1 and 28 DF,  p-value: < 2.2e-16
sum(D$residuals^2)
## [1] 12837.65
#R^2 = 0.9349 This is the highest R^2 value of all the variables
#Sum of Squares = 12837.65 This is thelowest SS value of all the variables

#Documentation: I recieved help from c2c Sermsipong because i got stuck on questions 3 and 4.