Homework 4 - Histograms, Boxplots, and Scatterplots

With and Without ggplot2

# Load the MLB 2007 standings CSV from the Rdatasets collection.
# This dataset was selected from the list of R datasets suggested in the assignment.
URL <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/Stat2Data/MLB2007Standings.csv"
# stringsAsFactors = TRUE keeps Team and League as factors on R >= 4.0.0,
# where the default became FALSE; the str() and summary() output shown
# further down in this file reports both columns as factors.
mlb2007standings_table <- read.csv(
  URL,
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)

#Checking that it worked and showing the data starting point
# A bare object name at top level auto-prints the whole data frame
# (all 30 rows; the wide table wraps into several column groups).
mlb2007standings_table
##     X                  Team League Wins Losses WinPct BattingAvg Runs Hits
## 1   1  Arizona Diamondbacks     NL   90     72  0.556      0.250  712 1350
## 2   2        Atlanta Braves     NL   84     78  0.519      0.275  810 1562
## 3   3     Baltimore Orioles     AL   69     93  0.426      0.272  756 1529
## 4   4        Boston Red Sox     AL   96     66  0.593      0.279  867 1561
## 5   5          Chicago Cubs     NL   85     77  0.525      0.271  752 1530
## 6   6     Chicago White Sox     AL   72     90  0.444      0.246  693 1341
## 7   7       Cincinnati Reds     NL   72     90  0.444      0.267  783 1496
## 8   8     Cleveland Indians     AL   96     66  0.593      0.268  811 1504
## 9   9      Colorado Rockies     NL   90     73  0.552      0.280  860 1591
## 10 10        Detroit Tigers     AL   88     74  0.543      0.287  887 1652
## 11 11       Florida Marlins     NL   71     91  0.438      0.267  790 1504
## 12 12        Houston Astros     NL   73     89  0.451      0.260  723 1457
## 13 13    Kansas City Royals     AL   69     93  0.426      0.261  706 1447
## 14 14    Los Angeles Angels     AL   94     68  0.580      0.284  822 1578
## 15 15   Los Angeles Dodgers     NL   82     80  0.506      0.275  735 1544
## 16 16     Milwaukee Brewers     NL   83     79  0.512      0.262  801 1455
## 17 17       Minnesota Twins     AL   79     83  0.488      0.264  718 1460
## 18 18         New York Mets     NL   88     74  0.543      0.275  804 1543
## 19 19      New York Yankees     AL   94     68  0.580      0.290  968 1656
## 20 20     Oakland Athletics     AL   76     86  0.469      0.256  741 1430
## 21 21 Philadelphia Phillies     NL   89     73  0.549      0.274  892 1558
## 22 22    Pittsburgh Pirates     NL   68     94  0.420      0.263  724 1463
## 23 23      San Diego Padres     NL   89     74  0.546      0.251  741 1408
## 24 24  San Francisco Giants     NL   71     91  0.438      0.254  683 1407
## 25 25      Seattle Mariners     AL   88     74  0.543      0.287  794 1629
## 26 26   St. Louis Cardinals     NL   78     84  0.481      0.274  725 1513
## 27 27  Tampa Bay Devil Rays     AL   66     96  0.407      0.268  782 1500
## 28 28         Texas Rangers     AL   75     87  0.463      0.263  816 1460
## 29 29     Toronto Blue Jays     AL   83     79  0.512      0.259  753 1434
## 30 30  Washington Nationals     NL   73     89  0.451      0.256  673 1415
##     HR Doubles Triples RBI  SB   OBP   SLG  ERA HitsAllowed Walks
## 1  171     286      40 687 109 0.321 0.413 4.13        1446   546
## 2  176     328      27 781  64 0.339 0.435 4.11        1442   537
## 3  142     306      30 718 144 0.333 0.412 5.17        1491   696
## 4  166     352      35 829  96 0.362 0.444 3.87        1350   482
## 5  151     340      28 711  86 0.333 0.422 4.04        1340   573
## 6  190     249      20 667  78 0.318 0.404 4.77        1556   499
## 7  204     293      23 747  97 0.335 0.436 4.94        1605   482
## 8  178     305      27 784  72 0.343 0.428 4.05        1519   410
## 9  171     313      36 823 100 0.354 0.437 4.32        1497   504
## 10 177     352      50 857 103 0.345 0.458 4.57        1498   566
## 11 201     340      38 749 105 0.336 0.448 4.94        1617   661
## 12 167     293      30 700  65 0.330 0.412 4.68        1566   510
## 13 102     300      46 660  78 0.322 0.388 4.48        1547   520
## 14 123     324      23 776 139 0.345 0.417 4.23        1480   477
## 15 129     276      35 706 137 0.337 0.406 4.20        1443   518
## 16 231     310      37 774  96 0.329 0.456 4.41        1513   507
## 17 118     273      36 671 112 0.330 0.391 4.15        1505   420
## 18 177     294      27 761 200 0.342 0.432 4.26        1415   570
## 19 201     326      32 929 123 0.366 0.463 4.49        1498   578
## 20 171     295      16 711  52 0.338 0.407 4.28        1468   530
## 21 213     326      41 850 138 0.354 0.458 4.73        1555   558
## 22 148     322      31 694  68 0.325 0.411 4.93        1627   518
## 23 171     322      31 704  55 0.322 0.411 3.70        1406   474
## 24 131     267      37 641 119 0.322 0.387 4.19        1442   593
## 25 153     284      22 754  81 0.337 0.425 4.73        1578   546
## 26 141     279      13 690  56 0.337 0.405 4.65        1514   509
## 27 187     291      36 750 131 0.336 0.433 5.53        1649   568
## 28 179     298      36 768  88 0.328 0.426 4.75        1525   668
## 29 165     344      24 719  57 0.327 0.419 4.00        1383   479
## 30 123     309      31 646  69 0.325 0.390 4.58        1502   580
##    StrikeOuts Saves WHIP
## 1        1088    51 1.38
## 2        1106    36 1.36
## 3        1087    30 1.52
## 4        1149    45 1.27
## 5        1211    39 1.32
## 6        1015    42 1.43
## 7        1068    34 1.44
## 8        1047    49 1.32
## 9         967    39 1.36
## 10       1047    44 1.43
## 11       1142    40 1.58
## 12       1109    38 1.42
## 13        993    36 1.44
## 14       1156    43 1.36
## 15       1184    43 1.35
## 16       1174    49 1.40
## 17       1094    38 1.34
## 18       1134    39 1.37
## 19       1009    34 1.43
## 20       1036    36 1.38
## 21       1050    42 1.45
## 22        997    32 1.48
## 23       1136    45 1.27
## 24       1057    37 1.40
## 25       1020    43 1.48
## 26        945    34 1.41
## 27       1194    28 1.55
## 28        976    42 1.53
## 29       1067    44 1.29
## 30        931    46 1.44
# Compact structure listing: one line per column with its type and first values
utils::str(object = mlb2007standings_table)
## 'data.frame':    30 obs. of  22 variables:
##  $ X          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Team       : Factor w/ 30 levels "Arizona Diamondbacks",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ League     : Factor w/ 2 levels "AL","NL": 2 2 1 1 2 1 2 1 2 1 ...
##  $ Wins       : int  90 84 69 96 85 72 72 96 90 88 ...
##  $ Losses     : int  72 78 93 66 77 90 90 66 73 74 ...
##  $ WinPct     : num  0.556 0.519 0.426 0.593 0.525 0.444 0.444 0.593 0.552 0.543 ...
##  $ BattingAvg : num  0.25 0.275 0.272 0.279 0.271 0.246 0.267 0.268 0.28 0.287 ...
##  $ Runs       : int  712 810 756 867 752 693 783 811 860 887 ...
##  $ Hits       : int  1350 1562 1529 1561 1530 1341 1496 1504 1591 1652 ...
##  $ HR         : int  171 176 142 166 151 190 204 178 171 177 ...
##  $ Doubles    : int  286 328 306 352 340 249 293 305 313 352 ...
##  $ Triples    : int  40 27 30 35 28 20 23 27 36 50 ...
##  $ RBI        : int  687 781 718 829 711 667 747 784 823 857 ...
##  $ SB         : int  109 64 144 96 86 78 97 72 100 103 ...
##  $ OBP        : num  0.321 0.339 0.333 0.362 0.333 0.318 0.335 0.343 0.354 0.345 ...
##  $ SLG        : num  0.413 0.435 0.412 0.444 0.422 0.404 0.436 0.428 0.437 0.458 ...
##  $ ERA        : num  4.13 4.11 5.17 3.87 4.04 4.77 4.94 4.05 4.32 4.57 ...
##  $ HitsAllowed: int  1446 1442 1491 1350 1340 1556 1605 1519 1497 1498 ...
##  $ Walks      : int  546 537 696 482 573 499 482 410 504 566 ...
##  $ StrikeOuts : int  1088 1106 1087 1149 1211 1015 1068 1047 967 1047 ...
##  $ Saves      : int  51 36 30 45 39 42 34 49 39 44 ...
##  $ WHIP       : num  1.38 1.36 1.52 1.27 1.32 1.43 1.44 1.32 1.36 1.43 ...
# Per-column summaries: quantiles/mean for numeric columns, counts for factors
summary(object = mlb2007standings_table)
##        X                           Team    League       Wins      
##  Min.   : 1.00   Arizona Diamondbacks: 1   AL:14   Min.   :66.00  
##  1st Qu.: 8.25   Atlanta Braves      : 1   NL:16   1st Qu.:72.25  
##  Median :15.50   Baltimore Orioles   : 1           Median :82.50  
##  Mean   :15.50   Boston Red Sox      : 1           Mean   :81.03  
##  3rd Qu.:22.75   Chicago Cubs        : 1           3rd Qu.:88.75  
##  Max.   :30.00   Chicago White Sox   : 1           Max.   :96.00  
##                  (Other)             :24                          
##      Losses          WinPct         BattingAvg          Runs      
##  Min.   :66.00   Min.   :0.4070   Min.   :0.2460   Min.   :673.0  
##  1st Qu.:74.00   1st Qu.:0.4457   1st Qu.:0.2602   1st Qu.:724.2  
##  Median :79.50   Median :0.5090   Median :0.2675   Median :769.0  
##  Mean   :81.03   Mean   :0.4999   Mean   :0.2679   Mean   :777.4  
##  3rd Qu.:89.75   3rd Qu.:0.5453   3rd Qu.:0.2750   3rd Qu.:810.8  
##  Max.   :96.00   Max.   :0.5930   Max.   :0.2900   Max.   :968.0  
##                                                                   
##       Hits            HR           Doubles         Triples     
##  Min.   :1341   Min.   :102.0   Min.   :249.0   Min.   :13.00  
##  1st Qu.:1449   1st Qu.:143.5   1st Qu.:291.5   1st Qu.:27.00  
##  Median :1502   Median :171.0   Median :305.5   Median :31.00  
##  Mean   :1499   Mean   :165.2   Mean   :306.6   Mean   :31.27  
##  3rd Qu.:1554   3rd Qu.:178.8   3rd Qu.:325.5   3rd Qu.:36.00  
##  Max.   :1656   Max.   :231.0   Max.   :352.0   Max.   :50.00  
##                                                                
##       RBI              SB              OBP              SLG        
##  Min.   :641.0   Min.   : 52.00   Min.   :0.3180   Min.   :0.3870  
##  1st Qu.:695.5   1st Qu.: 69.75   1st Qu.:0.3272   1st Qu.:0.4080  
##  Median :733.0   Median : 96.00   Median :0.3355   Median :0.4205  
##  Mean   :741.9   Mean   : 97.27   Mean   :0.3357   Mean   :0.4225  
##  3rd Qu.:775.5   3rd Qu.:117.25   3rd Qu.:0.3412   3rd Qu.:0.4358  
##  Max.   :929.0   Max.   :200.00   Max.   :0.3660   Max.   :0.4630  
##                                                                    
##       ERA         HitsAllowed       Walks         StrikeOuts  
##  Min.   :3.700   Min.   :1340   Min.   :410.0   Min.   : 931  
##  1st Qu.:4.160   1st Qu.:1444   1st Qu.:500.2   1st Qu.:1016  
##  Median :4.445   Median :1500   Median :525.0   Median :1068  
##  Mean   :4.463   Mean   :1499   Mean   :536.0   Mean   :1073  
##  3rd Qu.:4.730   3rd Qu.:1553   3rd Qu.:569.5   3rd Qu.:1136  
##  Max.   :5.530   Max.   :1649   Max.   :696.0   Max.   :1211  
##                                                               
##      Saves            WHIP      
##  Min.   :28.00   Min.   :1.270  
##  1st Qu.:36.00   1st Qu.:1.360  
##  Median :39.50   Median :1.405  
##  Mean   :39.93   Mean   :1.407  
##  3rd Qu.:43.75   3rd Qu.:1.440  
##  Max.   :51.00   Max.   :1.580  
## 
#Nothing stands out as looking incorrect

Basic Histogram of Wins

# Base-graphics histogram of team win totals (no breaks given, so default binning)
hist(
  mlb2007standings_table[["Wins"]],
  main = "MLB 2007 Wins Histogram",
  xlab = "Team Wins",
  col = "gray",
  border = "blue"
)

#4 teams won more than 90 games and 4 teams won 70 or fewer games.
#It is also interesting that 71-75 and 86-90 were the bins with the highest frequency.
#Given that there are 162 games in a season, 
#I would have expected the bin with the highest frequency to be closer to 81 wins.

Basic Scatterplot of ERA and WHIP

# Base-graphics scatterplot: WHIP on the x-axis, ERA on the y-axis.
# The axis labels are spelled out; they are the same names the formula
# interface would use by default.
plot(
  ERA ~ WHIP,
  data = mlb2007standings_table,
  xlab = "WHIP",
  ylab = "ERA"
)

#Unsurprisingly, it looks like there is a direct relationship between ERA and WHIP.
#As walks and hits increase, I would expect earned runs to increase as well.

Basic Boxplot of WHIP

# Base-graphics boxplot of WHIP across all 30 teams (single box)
boxplot(mlb2007standings_table[["WHIP"]])

#There is an outlier.  The Florida Marlins had a WHIP of 1.58 (0.17 above the median).
#Sergio Mitre (1.48) had the lowest WHIP of their starting pitchers, and that is still very high.

ggplot Histogram of Wins

#Loading the library to create ggplot versions
library(ggplot2)

# ggplot2 histogram of wins; no bin settings are supplied, so stat_bin()
# falls back to its default and emits the message recorded below.
ggplot(mlb2007standings_table, aes(x = Wins)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#The bin size is different from the basic histogram's, and it tells a different story.
#3 teams won 88 games.  That is the only win amount with a frequency greater than 2.

ggplot Scatterplot of Wins and Runs

# Scatterplot of wins against runs scored, saved in a variable for reuse.
# League is mapped to colour in the one and only point layer, so each team
# is drawn exactly once (previously an uncoloured geom_point() layer sat
# underneath a second, coloured one).
g <- ggplot(mlb2007standings_table, aes(x = Runs, y = Wins)) +
  geom_point(aes(color = League))

# Show the combined plot, coloured by league
g

# Creating separate scatterplots by league (one panel per league)
g + facet_wrap(~League)

#Unsurprisingly, there is a direct relationship between runs scored and wins.
#As expected, it doesn't look as strong as the direct relationship between WHIP and ERA though since run prevention is not included.

ggplot Boxplots of WHIP by League

# Side-by-side boxplots of WHIP, one box per league
ggplot(data = mlb2007standings_table) +
  geom_boxplot(mapping = aes(x = League, y = WHIP))

#The National League had a lower median WHIP.  Their 75th percentile WHIP was very close to the AL median.
#Besides the Marlins (mentioned above), the NL teams were more concentrated in their WHIPs.
#The middle 50% box is smaller than the AL.