# Load the ISLR Hitters data set from the Rdatasets mirror, then preview the
# first rows.
df <- read.csv(
  file = "https://vincentarelbundock.github.io/Rdatasets/csv/ISLR/Hitters.csv"
)
head(x = df)
## X AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun
## 1 -Andy Allanson 293 66 1 30 29 14 1 293 66 1
## 2 -Alan Ashby 315 81 7 24 38 39 14 3449 835 69
## 3 -Alvin Davis 479 130 18 66 72 76 3 1624 457 63
## 4 -Andre Dawson 496 141 20 65 78 37 11 5628 1575 225
## 5 -Andres Galarraga 321 87 10 39 42 30 2 396 101 12
## 6 -Alfredo Griffin 594 169 4 74 51 35 11 4408 1133 19
## CRuns CRBI CWalks League Division PutOuts Assists Errors Salary NewLeague
## 1 30 29 14 A E 446 33 20 NA A
## 2 321 414 375 N W 632 43 10 475.0 N
## 3 224 266 263 A W 880 82 14 480.0 A
## 4 828 838 354 N E 200 11 3 500.0 N
## 5 48 46 33 N E 805 40 4 91.5 N
## 6 501 336 194 A W 282 421 25 750.0 A
# Show the dimensions of the data frame and the type of every column.
str(object = df)
## 'data.frame': 322 obs. of 21 variables:
## $ X : chr "-Andy Allanson" "-Alan Ashby" "-Alvin Davis" "-Andre Dawson" ...
## $ AtBat : int 293 315 479 496 321 594 185 298 323 401 ...
## $ Hits : int 66 81 130 141 87 169 37 73 81 92 ...
## $ HmRun : int 1 7 18 20 10 4 1 0 6 17 ...
## $ Runs : int 30 24 66 65 39 74 23 24 26 49 ...
## $ RBI : int 29 38 72 78 42 51 8 24 32 66 ...
## $ Walks : int 14 39 76 37 30 35 21 7 8 65 ...
## $ Years : int 1 14 3 11 2 11 2 3 2 13 ...
## $ CAtBat : int 293 3449 1624 5628 396 4408 214 509 341 5206 ...
## $ CHits : int 66 835 457 1575 101 1133 42 108 86 1332 ...
## $ CHmRun : int 1 69 63 225 12 19 1 0 6 253 ...
## $ CRuns : int 30 321 224 828 48 501 30 41 32 784 ...
## $ CRBI : int 29 414 266 838 46 336 9 37 34 890 ...
## $ CWalks : int 14 375 263 354 33 194 24 12 8 866 ...
## $ League : chr "A" "N" "A" "N" ...
## $ Division : chr "E" "W" "W" "E" ...
## $ PutOuts : int 446 632 880 200 805 282 76 121 143 0 ...
## $ Assists : int 33 43 82 11 40 421 127 283 290 0 ...
## $ Errors : int 20 10 14 3 4 25 7 9 19 0 ...
## $ Salary : num NA 475 480 500 91.5 750 70 100 75 1100 ...
## $ NewLeague: chr "A" "N" "A" "N" ...
The data set has 322 rows and 21 columns (variables).
# Per-column summary statistics, including the count of missing values.
summary(object = df)
## X AtBat Hits HmRun
## Length:322 Min. : 16.0 Min. : 1 Min. : 0.00
## Class :character 1st Qu.:255.2 1st Qu.: 64 1st Qu.: 4.00
## Mode :character Median :379.5 Median : 96 Median : 8.00
## Mean :380.9 Mean :101 Mean :10.77
## 3rd Qu.:512.0 3rd Qu.:137 3rd Qu.:16.00
## Max. :687.0 Max. :238 Max. :40.00
##
## Runs RBI Walks Years
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 1.000
## 1st Qu.: 30.25 1st Qu.: 28.00 1st Qu.: 22.00 1st Qu.: 4.000
## Median : 48.00 Median : 44.00 Median : 35.00 Median : 6.000
## Mean : 50.91 Mean : 48.03 Mean : 38.74 Mean : 7.444
## 3rd Qu.: 69.00 3rd Qu.: 64.75 3rd Qu.: 53.00 3rd Qu.:11.000
## Max. :130.00 Max. :121.00 Max. :105.00 Max. :24.000
##
## CAtBat CHits CHmRun CRuns
## Min. : 19.0 Min. : 4.0 Min. : 0.00 Min. : 1.0
## 1st Qu.: 816.8 1st Qu.: 209.0 1st Qu.: 14.00 1st Qu.: 100.2
## Median : 1928.0 Median : 508.0 Median : 37.50 Median : 247.0
## Mean : 2648.7 Mean : 717.6 Mean : 69.49 Mean : 358.8
## 3rd Qu.: 3924.2 3rd Qu.:1059.2 3rd Qu.: 90.00 3rd Qu.: 526.2
## Max. :14053.0 Max. :4256.0 Max. :548.00 Max. :2165.0
##
## CRBI CWalks League Division
## Min. : 0.00 Min. : 0.00 Length:322 Length:322
## 1st Qu.: 88.75 1st Qu.: 67.25 Class :character Class :character
## Median : 220.50 Median : 170.50 Mode :character Mode :character
## Mean : 330.12 Mean : 260.24
## 3rd Qu.: 426.25 3rd Qu.: 339.25
## Max. :1659.00 Max. :1566.00
##
## PutOuts Assists Errors Salary
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 67.5
## 1st Qu.: 109.2 1st Qu.: 7.0 1st Qu.: 3.00 1st Qu.: 190.0
## Median : 212.0 Median : 39.5 Median : 6.00 Median : 425.0
## Mean : 288.9 Mean :106.9 Mean : 8.04 Mean : 535.9
## 3rd Qu.: 325.0 3rd Qu.:166.0 3rd Qu.:11.00 3rd Qu.: 750.0
## Max. :1378.0 Max. :492.0 Max. :32.00 Max. :2460.0
## NA's :59
## NewLeague
## Length:322
## Class :character
## Mode :character
##
##
##
##
# Replace the terse source headers with descriptive names. The names are
# assigned by position, so their order must match the column order of df.
names(df) <- c(
  "Player_Names",
  "At_Bats_1986", "Hits_1986", "HRs_1986", "Rs_1986", "RBIs_1986", "BBs_1986",
  "Years_In_MLB",
  "ABs_Career", "Hits_Career", "HRs_Career", "Rs_Career", "RBIs_Career",
  "BBs_Career",
  "League_1986", "Division_1986",
  "PutOuts_1986", "Assists_1986", "Errors_1986",
  "Salary_1987", "League_1987"
)
summary(object = df)
## Player_Names At_Bats_1986 Hits_1986 HRs_1986
## Length:322 Min. : 16.0 Min. : 1 Min. : 0.00
## Class :character 1st Qu.:255.2 1st Qu.: 64 1st Qu.: 4.00
## Mode :character Median :379.5 Median : 96 Median : 8.00
## Mean :380.9 Mean :101 Mean :10.77
## 3rd Qu.:512.0 3rd Qu.:137 3rd Qu.:16.00
## Max. :687.0 Max. :238 Max. :40.00
##
## Rs_1986 RBIs_1986 BBs_1986 Years_In_MLB
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 1.000
## 1st Qu.: 30.25 1st Qu.: 28.00 1st Qu.: 22.00 1st Qu.: 4.000
## Median : 48.00 Median : 44.00 Median : 35.00 Median : 6.000
## Mean : 50.91 Mean : 48.03 Mean : 38.74 Mean : 7.444
## 3rd Qu.: 69.00 3rd Qu.: 64.75 3rd Qu.: 53.00 3rd Qu.:11.000
## Max. :130.00 Max. :121.00 Max. :105.00 Max. :24.000
##
## ABs_Career Hits_Career HRs_Career Rs_Career
## Min. : 19.0 Min. : 4.0 Min. : 0.00 Min. : 1.0
## 1st Qu.: 816.8 1st Qu.: 209.0 1st Qu.: 14.00 1st Qu.: 100.2
## Median : 1928.0 Median : 508.0 Median : 37.50 Median : 247.0
## Mean : 2648.7 Mean : 717.6 Mean : 69.49 Mean : 358.8
## 3rd Qu.: 3924.2 3rd Qu.:1059.2 3rd Qu.: 90.00 3rd Qu.: 526.2
## Max. :14053.0 Max. :4256.0 Max. :548.00 Max. :2165.0
##
## RBIs_Career BBs_Career League_1986 Division_1986
## Min. : 0.00 Min. : 0.00 Length:322 Length:322
## 1st Qu.: 88.75 1st Qu.: 67.25 Class :character Class :character
## Median : 220.50 Median : 170.50 Mode :character Mode :character
## Mean : 330.12 Mean : 260.24
## 3rd Qu.: 426.25 3rd Qu.: 339.25
## Max. :1659.00 Max. :1566.00
##
## PutOuts_1986 Assists_1986 Errors_1986 Salary_1987
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 67.5
## 1st Qu.: 109.2 1st Qu.: 7.0 1st Qu.: 3.00 1st Qu.: 190.0
## Median : 212.0 Median : 39.5 Median : 6.00 Median : 425.0
## Mean : 288.9 Mean :106.9 Mean : 8.04 Mean : 535.9
## 3rd Qu.: 325.0 3rd Qu.:166.0 3rd Qu.:11.00 3rd Qu.: 750.0
## Max. :1378.0 Max. :492.0 Max. :32.00 Max. :2460.0
## NA's :59
## League_1987
## Length:322
## Class :character
## Mode :character
##
##
##
##
The column names were changed to be more reader-friendly. It is now easier to tell whether a variable refers to a player’s single season or his whole career.
# Veterans: players with more than 10 years in the majors. which() drops any
# row whose comparison is NA, so no all-NA rows can appear in the result.
veterans <- df[which(df$Years_In_MLB > 10), , drop = FALSE]
head(x = veterans)
## Player_Names At_Bats_1986 Hits_1986 HRs_1986 Rs_1986 RBIs_1986 BBs_1986
## 2 -Alan Ashby 315 81 7 24 38 39
## 4 -Andre Dawson 496 141 20 65 78 37
## 6 -Alfredo Griffin 594 169 4 74 51 35
## 10 -Andre Thornton 401 92 17 49 66 65
## 15 -Bill Almon 196 43 7 29 27 30
## 17 -Buddy Bell 568 158 20 89 75 73
## Years_In_MLB ABs_Career Hits_Career HRs_Career Rs_Career RBIs_Career
## 2 14 3449 835 69 321 414
## 4 11 5628 1575 225 828 838
## 6 11 4408 1133 19 501 336
## 10 13 5206 1332 253 784 890
## 15 13 3231 825 36 376 290
## 17 15 8068 2273 177 1045 993
## BBs_Career League_1986 Division_1986 PutOuts_1986 Assists_1986 Errors_1986
## 2 375 N W 632 43 10
## 4 354 N E 200 11 3
## 6 194 A W 282 421 25
## 10 866 A E 0 0 0
## 15 238 N E 80 45 8
## 17 732 N W 105 290 10
## Salary_1987 League_1987
## 2 475 N
## 4 500 N
## 6 750 A
## 10 1100 A
## 15 240 N
## 17 775 N
This subset was created by selecting players who had accumulated more than 10 years in the major leagues. Players who reach 10 years of major-league service time are statistical superheroes! Fewer than 10% of players in baseball history have played for a decade or more. In our case, the players in this subset will be considered “veterans”.
# Good hitters: players with more than 150 hits in the 1986 season.
# which() drops any row whose comparison is NA.
goodplayers <- df[which(df$Hits_1986 > 150), , drop = FALSE]
head(x = goodplayers)
## Player_Names At_Bats_1986 Hits_1986 HRs_1986 Rs_1986 RBIs_1986 BBs_1986
## 6 -Alfredo Griffin 594 169 4 74 51 35
## 11 -Alan Trammell 574 159 21 107 75 59
## 17 -Buddy Bell 568 158 20 89 75 73
## 25 -Bill Buckner 629 168 18 73 102 40
## 26 -Brett Butler 587 163 4 92 51 70
## 29 -Bill Doran 550 152 6 92 37 81
## Years_In_MLB ABs_Career Hits_Career HRs_Career Rs_Career RBIs_Career
## 6 11 4408 1133 19 501 336
## 11 10 4631 1300 90 702 504
## 17 15 8068 2273 177 1045 993
## 25 18 8424 2464 164 1008 1072
## 26 6 2695 747 17 442 198
## 29 5 2308 633 32 349 182
## BBs_Career League_1986 Division_1986 PutOuts_1986 Assists_1986 Errors_1986
## 6 194 A W 282 421 25
## 11 488 A E 238 445 22
## 17 732 N W 105 290 10
## 25 402 A E 1067 157 14
## 26 317 A E 434 9 3
## 29 308 N W 262 329 16
## Salary_1987 League_1987
## 6 750.000 A
## 11 517.143 A
## 17 775.000 N
## 25 776.667 A
## 26 765.000 A
## 29 625.000 N
Another subset I wanted to look at was players who had over 150 hits in the 1986 season. This subset showed me 52 of the best hitters in baseball that season.
# Narrow the good hitters down to those who also had more than 20 home runs
# and more than 100 RBIs in 1986, then print the whole subset.
MVPs <- goodplayers[
  which(goodplayers$HRs_1986 > 20 & goodplayers$RBIs_1986 > 100), ,
  drop = FALSE
]
MVPs
## Player_Names At_Bats_1986 Hits_1986 HRs_1986 Rs_1986 RBIs_1986 BBs_1986
## 83 -Don Mattingly 677 238 31 117 113 53
## 87 -Dave Parker 637 174 31 89 116 56
## 109 -George Bell 641 198 31 101 108 41
## 114 -Glenn Davis 574 152 31 91 101 64
## 116 -Gary Gaetti 596 171 34 91 108 52
## 137 -Jesse Barfield 589 170 40 107 108 69
## 142 -Joe Carter 663 200 29 108 121 32
## 160 -Jim Presley 616 163 27 83 107 32
## Years_In_MLB ABs_Career Hits_Career HRs_Career Rs_Career RBIs_Career
## 83 5 2223 737 93 349 401
## 87 14 6727 2024 247 978 1093
## 109 5 2129 610 92 297 319
## 114 3 985 260 53 148 173
## 116 6 2862 728 107 361 401
## 137 6 2325 634 128 371 376
## 142 4 1447 404 57 210 222
## 160 3 1437 377 65 181 227
## BBs_Career League_1986 Division_1986 PutOuts_1986 Assists_1986 Errors_1986
## 83 171 A E 1377 100 6
## 87 495 N W 278 9 9
## 109 117 A E 269 17 10
## 114 95 N W 1253 111 11
## 116 224 A W 118 334 21
## 137 238 A E 368 20 3
## 142 68 A E 241 8 6
## 160 82 A W 110 308 15
## Salary_1987 League_1987
## 83 1975.000 A
## 87 1041.667 N
## 109 1175.000 A
## 114 215.000 N
## 116 900.000 A
## 137 1237.500 A
## 142 250.000 A
## 160 200.000 A
After I looked at some of the best hitters in baseball that year, I wanted to look at the best of the best. I looked at players with more than 150 hits, more than 20 home runs, and more than 100 runs batted in. The eight players I found were some of the best players in baseball that year.
# Batting average for 1986: hits divided by at bats, rounded to three
# decimal places and stored as a new column.
df[["bAVG"]] <- round(df[["Hits_1986"]] / df[["At_Bats_1986"]], digits = 3)
head(x = df)
## Player_Names At_Bats_1986 Hits_1986 HRs_1986 Rs_1986 RBIs_1986 BBs_1986
## 1 -Andy Allanson 293 66 1 30 29 14
## 2 -Alan Ashby 315 81 7 24 38 39
## 3 -Alvin Davis 479 130 18 66 72 76
## 4 -Andre Dawson 496 141 20 65 78 37
## 5 -Andres Galarraga 321 87 10 39 42 30
## 6 -Alfredo Griffin 594 169 4 74 51 35
## Years_In_MLB ABs_Career Hits_Career HRs_Career Rs_Career RBIs_Career
## 1 1 293 66 1 30 29
## 2 14 3449 835 69 321 414
## 3 3 1624 457 63 224 266
## 4 11 5628 1575 225 828 838
## 5 2 396 101 12 48 46
## 6 11 4408 1133 19 501 336
## BBs_Career League_1986 Division_1986 PutOuts_1986 Assists_1986 Errors_1986
## 1 14 A E 446 33 20
## 2 375 N W 632 43 10
## 3 263 A W 880 82 14
## 4 354 N E 200 11 3
## 5 33 N E 805 40 4
## 6 194 A W 282 421 25
## Salary_1987 League_1987 bAVG
## 1 NA A 0.225
## 2 475.0 N 0.257
## 3 480.0 A 0.271
## 4 500.0 N 0.284
## 5 91.5 N 0.271
## 6 750.0 A 0.285
By dividing each player’s hits by his at bats, I was able to calculate the players’ batting averages. This was a fun category to look at because it is an easy way to tell how a player did just by looking at one number.
# Histogram of single-season hit totals. An explicit title and x-axis label
# replace the defaults derived from the expression text ("df$Hits_1986"), so
# the figure is labelled like the scatter plot further down.
hist(
  df$Hits_1986,
  main = "Distribution of Hits in 1986",
  xlab = "Hits 1986"
)
By looking at this histogram, I was able to see that most players had
around 100 hits in 1986, while the really good ones (very few) had over
200 hits.
# Scatter plot of 1986 batting average against 1987 salary, drawn with
# solid points (pch = 19).
with(
  df,
  plot(
    x = bAVG,
    y = Salary_1987,
    main = "Batting Average VS Salary Next Year",
    xlab = "Batting Average 1986",
    ylab = "Salary 1987",
    pch = 19
  )
)
I tried to find a correlation here between a player’s batting average
and their salary the next year. There simply wasn’t enough of a
correlation, and that is probably due to the fact that just because a
player had a high batting average, it doesn’t mean that they were
necessarily going to get a big new contract the next year.