# Import the data. The file name is relative, so it is looked up in the
# current working directory - point R at the folder holding the CSV first.
baseball <- read.csv("BaseballHits.csv", header = TRUE)
# Since R 4.0 read.csv() keeps text columns as character. The scatter plots
# further down colour points by League, which only works with a factor, so
# convert it explicitly (levels sort alphabetically, so AL comes before NL).
baseball$League <- factor(baseball$League)
# Look at the first 6 rows
head(baseball)
## Team League Wins Runs Hits Doubles Triples HomeRuns RBI StolenBases
## 1 ARI NL 64 615 1379 259 47 118 573 86
## 2 ATL NL 79 573 1316 240 22 123 545 95
## 3 BAL AL 96 705 1434 264 16 211 681 44
## 4 BOS AL 71 634 1355 282 20 123 601 63
## 5 CHC NL 73 614 1315 270 31 157 590 65
## 6 CHW AL 73 660 1400 279 32 155 625 85
## CaughtStealing Walks Strikeouts BattingAvg
## 1 33 398 1165 0.248
## 2 33 472 1369 0.241
## 3 20 401 1285 0.256
## 4 25 535 1337 0.244
## 5 40 442 1477 0.239
## 6 36 417 1362 0.253
# Now let's look at the data graphically, starting with the American League.
# Histogram of hits for American League teams.
# Use a single plotting panel (one plot per figure).
par(mfrow = c(1, 1))
# The x-axis limits are derived from every team's hits rather than hard-coded,
# so the axis is wide enough for either league and stays valid if the data
# change.
hist(baseball$Hits[baseball$League == "AL"],
     xlim = range(pretty(baseball$Hits)),
     breaks = 8,
     xlab = "American League",
     main = "Number of Hits")
# We can also look at the same data for the National League.
# Histogram of hits for National League teams.
# NL hits go as low as 1199, so a fixed xlim of c(1300, 1600) cut off the low
# end of the distribution. Deriving the limits from all teams' hits shows
# every bar and keeps the axis valid if the data change.
hist(baseball$Hits[baseball$League == "NL"],
     xlim = range(pretty(baseball$Hits)),
     breaks = 8,
     xlab = "National League",
     main = "Number of Hits")
# Side-by-side boxplots of the number of hits per season, one box per league.
# data = keeps the formula to plain column names, and the explicit axis titles
# avoid labels built from the raw formula terms (as recent R versions do by
# default).
boxplot(Hits ~ League, data = baseball,
        main = "MLB Hits per Season",
        xlab = "League",
        ylab = "Hits")
# The following code computes several numerical summaries of hits for each league.
# Numerical summaries of hits, computed separately for each league
# Five-number summary plus the mean
with(baseball, summary(Hits[League == "AL"]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1317 1354 1400 1402 1434 1557
with(baseball, summary(Hits[League == "NL"]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1199 1316 1371 1371 1405 1551
# Mean
with(baseball, mean(Hits[League == "AL"]))
## [1] 1402.2
with(baseball, mean(Hits[League == "NL"]))
## [1] 1370.8
# Median
with(baseball, median(Hits[League == "AL"]))
## [1] 1400
with(baseball, median(Hits[League == "NL"]))
## [1] 1371
# Range (minimum and maximum)
with(baseball, range(Hits[League == "AL"]))
## [1] 1317 1557
with(baseball, range(Hits[League == "NL"]))
## [1] 1199 1551
# Standard deviation
with(baseball, sd(Hits[League == "AL"]))
## [1] 62.55877
with(baseball, sd(Hits[League == "NL"]))
## [1] 84.07412
# Interquartile range
with(baseball, IQR(Hits[League == "AL"]))
## [1] 80
with(baseball, IQR(Hits[League == "NL"]))
## [1] 89.5
# Two-variable analysis: Wins vs. Hits
# Scatter plot of Wins against Hits, colour coded by league.
# col needs colour numbers or colour names: character values such as "AL" are
# rejected as invalid colour names, so the league is converted to factor codes
# first. Levels sort alphabetically, so AL = 1 (black) and NL = 2 (the second
# colour of the active palette).
plot(baseball$Hits, baseball$Wins,
     col = as.integer(factor(baseball$League)),
     main = "Wins vs. Hits for MLB Teams",
     xlab = "Hits",
     ylab = "Wins")
# Pearson correlation between hits and wins across all teams
with(baseball, cor(Hits, Wins))
## [1] 0.2877065
# Fit a simple linear regression of Wins on Hits.
# The result is stored as wins_fit rather than lm, so the model object does
# not share its name with the lm() function. data = baseball lets the formula
# use plain column names, which also gives the slope a readable label (Hits)
# in the summary output.
wins_fit <- lm(Wins ~ Hits, data = baseball)
# Get a summary of the linear fit - the coefficient estimates and R-squared
# are the parts used here; a lot of the remaining output can be ignored
summary(wins_fit)
##
## Call:
## lm(formula = Wins ~ Hits, data = baseball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.094 -6.490 1.371 6.037 14.389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.63333 32.35911 0.916 0.368
## Hits 0.03705 0.02331 1.590 0.123
##
## Residual standard error: 9.356 on 28 degrees of freedom
## Multiple R-squared: 0.08278, Adjusted R-squared: 0.05002
## F-statistic: 2.527 on 1 and 28 DF, p-value: 0.1232
# The fitted model is: Wins = 0.037 * Hits + 29.63 (coefficients rounded)
# Redraw the Wins vs. Hits scatter plot and add the least-squares line.
# Points are colour coded by league through factor codes (AL = 1, black;
# NL = 2), because character values such as "AL" are not valid colours.
plot(baseball$Hits, baseball$Wins,
     col = as.integer(factor(baseball$League)),
     main = "Wins vs. Hits for MLB Teams",
     xlab = "Hits",
     ylab = "Wins")
# Draw the line from the fitted coefficients instead of the hand-rounded
# values 0.037 and 29.63, so it always matches the data actually loaded.
abline(lm(Wins ~ Hits, data = baseball))