Baseball Data Analysis

# Load the team batting data. The path is relative to the current working
# directory, so adjust it if the CSV lives somewhere else.
baseball <- read.csv(file = "BaseballHits.csv", header = TRUE)
# Preview the first six rows
head(baseball, n = 6)
##   Team League Wins Runs Hits Doubles Triples HomeRuns RBI StolenBases
## 1  ARI     NL   64  615 1379     259      47      118 573          86
## 2  ATL     NL   79  573 1316     240      22      123 545          95
## 3  BAL     AL   96  705 1434     264      16      211 681          44
## 4  BOS     AL   71  634 1355     282      20      123 601          63
## 5  CHC     NL   73  614 1315     270      31      157 590          65
## 6  CHW     AL   73  660 1400     279      32      155 625          85
##   CaughtStealing Walks Strikeouts BattingAvg
## 1             33   398       1165      0.248
## 2             33   472       1369      0.241
## 3             20   401       1285      0.256
## 4             25   535       1337      0.244
## 5             40   442       1477      0.239
## 6             36   417       1362      0.253

Now let’s look at the distribution of hits graphically, starting with the American League.

# Histogram of season hit totals for American League teams.
# Reset the graphics device to a single plotting panel first.
par(mfrow = c(1, 1))
# The x-axis limits are derived from every team's hit total (both leagues)
# instead of being hard-coded, so no team can fall outside the axis; using
# the same expression for each league's histogram gives them a common scale.
hist(baseball$Hits[baseball$League == "AL"],
     xlab = "American League",
     xlim = range(pretty(baseball$Hits)),
     breaks = 8,
     main = "Number of Hits")

We can also look at the data for the National League

# Histogram of season hit totals for National League teams.
# The x-axis limits are derived from every team's hit total (both leagues):
# a fixed lower limit of 1300 would cut off NL teams with fewer hits (the NL
# minimum is 1199). Using the same expression for each league's histogram
# gives them a common scale.
hist(baseball$Hits[baseball$League == "NL"],
     xlim = range(pretty(baseball$Hits)),
     breaks = 8,
     xlab = "National League",
     main = "Number of Hits")

# Side-by-side boxplots of season hit totals, one box per league.
# Supplying `data =` and explicit axis labels keeps raw expressions such as
# "baseball$League" off the axes.
boxplot(Hits ~ League, data = baseball,
        main = "MLB Hits per Season",
        xlab = "League",
        ylab = "Hits")

The following code gives several statistical summaries of the data

# Extract each league's hit totals once so every statistic below is computed
# on exactly the same subset.
al_hits <- baseball$Hits[baseball$League == "AL"]
nl_hits <- baseball$Hits[baseball$League == "NL"]
# Five-number summary plus mean
summary(al_hits)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1317    1354    1400    1402    1434    1557
summary(nl_hits)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1199    1316    1371    1371    1405    1551
# Mean
mean(al_hits)
## [1] 1402.2
mean(nl_hits)
## [1] 1370.8
# Median
median(al_hits)
## [1] 1400
median(nl_hits)
## [1] 1371
# Range (minimum and maximum)
range(al_hits)
## [1] 1317 1557
range(nl_hits)
## [1] 1199 1551
# Standard deviation
sd(al_hits)
## [1] 62.55877
sd(nl_hits)
## [1] 84.07412
# Interquartile range
IQR(al_hits)
## [1] 80
IQR(nl_hits)
## [1] 89.5

Two-variable analysis

# Scatter plot of wins against hits, colour coded by league.
# Since R 4.0, read.csv() returns League as character strings, and strings
# such as "AL" are not valid colour names, so passing the column directly to
# `col` fails. Converting to integer factor codes selects palette colours
# instead: levels sort alphabetically, so AL is colour 1 (black) and NL is
# colour 2 (assuming AL and NL are the only league values).
plot(baseball$Hits, baseball$Wins,
     col = as.integer(factor(baseball$League)),
     main = "Wins vs. Hits for MLB Teams",
     xlab = "Hits",
     ylab = "Wins")

# Correlation between season hits and wins (cor() defaults to Pearson)
cor(baseball$Hits, baseball$Wins)
## [1] 0.2877065
# Fit a simple linear regression of wins on hits. The result is stored as
# `hits_fit` rather than `lm` so the variable does not mask stats::lm().
hits_fit <- lm(baseball$Wins ~ baseball$Hits)
# Print the fit summary; the coefficient table holds the intercept and slope
summary(hits_fit)
## 
## Call:
## lm(formula = baseball$Wins ~ baseball$Hits)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.094  -6.490   1.371   6.037  14.389 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)   29.63333   32.35911   0.916    0.368
## baseball$Hits  0.03705    0.02331   1.590    0.123
## 
## Residual standard error: 9.356 on 28 degrees of freedom
## Multiple R-squared:  0.08278,    Adjusted R-squared:  0.05002 
## F-statistic: 2.527 on 1 and 28 DF,  p-value: 0.1232

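The fitted model is: Wins = 0.037 × Hits + 29.63 (coefficients rounded from the summary above). Note that the slope is not statistically significant (p = 0.123) and R² is only about 0.08, so hits alone explain little of the variation in wins.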

# Redraw the Wins vs. Hits scatter plot and overlay the least-squares line.
# League is converted to integer factor codes because character values such
# as "AL" are not valid colour names; levels sort alphabetically, so AL is
# colour 1 (black) and NL is colour 2 (assuming AL and NL are the only
# league values).
plot(baseball$Hits, baseball$Wins,
     col = as.integer(factor(baseball$League)),
     main = "Wins vs. Hits for MLB Teams",
     xlab = "Hits",
     ylab = "Wins")
# Draw the line from the fitted model itself rather than from hand-rounded
# coefficients, so the overlay always matches the data being plotted.
abline(lm(baseball$Wins ~ baseball$Hits))