Import libraries

library(dplyr)
library(tidyr)
library('zoo') # fill the missing values with the previous non missing value
library(ggplot2)

Collect the data

This data was posted on Slack by the student: Marley Myrianthopoulos

df2 = read.csv("https://raw.githubusercontent.com/Kossi-Akplaka/Data607-data_acquisition_and_management/main/Project%202/Data2-MajorLeagueBaseballcsv.csv")
head(df2,7)
##   Team Position X2018.HRs X2019.HRs X2021.HRs X2022.HRs X2023.HRs
## 1  BAL        C         3        13        11        13        20
## 2            1B        16        12        33        22        18
## 3            2B        17        24         5        13        13
## 4            3B        24         6         9        13         7
## 5            SS         7        12        11        16         4
## 6            LF        24        13        22        16        16
## 7            CF        15        10        30        16        15

Tidy the data

First we need to fill the missing values on the first column

df2$Team[df2$Team == ""] <- NA
df2$Team <- na.locf(df2$Team)
df2
##    Team Position X2018.HRs X2019.HRs X2021.HRs X2022.HRs X2023.HRs
## 1   BAL        C         3        13        11        13        20
## 2   BAL       1B        16        12        33        22        18
## 3   BAL       2B        17        24         5        13        13
## 4   BAL       3B        24         6         9        13         7
## 5   BAL       SS         7        12        11        16         4
## 6   BAL       LF        24        13        22        16        16
## 7   BAL       CF        15        10        30        16        15
## 8   BAL       RF         8        35        18        33        28
## 9   BAL       DH        17        31        21        10        14
## 10  BOS        C         5        23         6         8         9
## 11  BOS       1B        15        19        25        12        24
## 12  BOS       2B        10         3         6        16         3
## 13  BOS       3B        23        33        23        15         6
## 14  BOS       SS        21        32        38        27        33
## 15  BOS       LF        16        13        13        11        15
## 16  BOS       CF        13        21        20         6         8
## 17  BOS       RF        32        29        31         3        13
## 18  BOS       DH        43        36        28        16        23
## 19  NYY        C        18        34        23        11        10
## 20  NYY       1B        11        21         8        32        12
## 21  NYY       2B        24        26        10        24        25
## 22  NYY       3B        27        16         9         4        21
## 23  NYY       SS        27        21        14        15        15
## 24  NYY       LF        12        13        13         8         5
## 25  NYY       CF        27        28        10        62         7
## 26  NYY       RF        27        27        39        12        37
## 27  NYY       DH        38        13        35        31        24
## 28  TBR        C        14         9        33         6        11
## 29  TBR       1B        11        19        13        11        22
## 30  TBR       2B         7        17        39         8        21
## 31  TBR       3B        10        20         7         8        17
## 32  TBR       SS         4        14        11         9        31
## 33  TBR       LF         7        21        27        20        23
## 34  TBR       CF         7        14         4         7        25
## 35  TBR       RF         9        20        10         4        20
## 36  TBR       DH        30        33        13         6        12
## 37  TOR        C        10        13         1        14         8
## 38  TOR       1B        25        22        48        32        26
## 39  TOR       2B        11        16        45         7        11
## 40  TOR       3B        18        18        29        24        20
## 41  TOR       SS        17        15         2        27        17
## 42  TOR       LF        22        20        21         5        20
## 43  TOR       CF        15        26        22        25         8
## 44  TOR       RF        25        31        32        25        21
## 45  TOR       DH        21        21        22         4        19

Now we can transform the data from wide to long

df2_long <- df2 %>% 
  pivot_longer(cols = c(X2018.HRs, X2019.HRs, X2021.HRs,
                        X2022.HRs,X2023.HRs) ,
               names_to = "Year",
               values_to = "Home Runs")

head(df2_long, 7)
## # A tibble: 7 × 4
##   Team  Position Year      `Home Runs`
##   <chr> <chr>    <chr>           <int>
## 1 BAL   C        X2018.HRs           3
## 2 BAL   C        X2019.HRs          13
## 3 BAL   C        X2021.HRs          11
## 4 BAL   C        X2022.HRs          13
## 5 BAL   C        X2023.HRs          20
## 6 BAL   1B       X2018.HRs          16
## 7 BAL   1B       X2019.HRs          12

Finally let’s change the format for the year column

df2_long$Year <- sub("^X", "", df2_long$Year)
df2_long$Year <- gsub(".HRs", "", df2_long$Year)
class(df2_long$Year)
## [1] "character"

Convert the year column to a date

df2_long$Year <- as.integer(df2_long$Year)
head(df2_long,7)
## # A tibble: 7 × 4
##   Team  Position  Year `Home Runs`
##   <chr> <chr>    <int>       <int>
## 1 BAL   C         2018           3
## 2 BAL   C         2019          13
## 3 BAL   C         2021          11
## 4 BAL   C         2022          13
## 5 BAL   C         2023          20
## 6 BAL   1B        2018          16
## 7 BAL   1B        2019          12

Analysis

Let’s create a plot showing how many home runs were scored for the position “2B” by the team “BAL” and the team “NYY” overtime.

df2_BAL <- df2_long %>% 
  filter(Team == 'BAL' & Position == '2B')
df2_NYY <- df2_long %>% 
  filter(Team == 'NYY' & Position == '2B')


ggplot() +
  geom_line(data = df2_BAL, aes(x = Year, y = `Home Runs`, color = "BAL"))+
  geom_line(data = df2_NYY, aes(x = Year, y = `Home Runs`, color = "NYY")) +
  labs(x = "Year", y = "Home Runs", title = "Home Runs Comparison")

The plot shows that NYY’s players at Position 2B score more home runs than BAL’s players.

LS0tDQp0aXRsZTogIlByb2plY3QyLWRhdGEyIg0KYXV0aG9yOiAiS29zc2kgQWtwbGFrYSINCmRhdGU6ICJgciBTeXMuRGF0ZSgpYCINCm91dHB1dDogb3BlbmludHJvOjpsYWJfcmVwb3J0DQotLS0NCg0KIyMgSW1wb3J0IGxpYnJhcmllcw0KYGBge3IgaW1wb3J0LWxpYnJhcmllcywgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0NCmxpYnJhcnkoZHBseXIpDQpsaWJyYXJ5KHRpZHlyKQ0KbGlicmFyeSgnem9vJykgIyBmaWxsIHRoZSBtaXNzaW5nIHZhbHVlcyB3aXRoIHRoZSBwcmV2aW91cyBub24gbWlzc2luZyB2YWx1ZQ0KbGlicmFyeShnZ3Bsb3QyKQ0KYGBgDQoNCiMjIENvbGxlY3QgdGhlIGRhdGENClRoaXMgZGF0YSB3YXMgcG9zdGVkIG9uIFNsYWNrIGJ5IHRoZSBzdHVkZW50OiBNYXJsZXkgTXlyaWFudGhvcG91bG9zDQpgYGB7cn0NCmRmMiA9IHJlYWQuY3N2KCJodHRwczovL3Jhdy5naXRodWJ1c2VyY29udGVudC5jb20vS29zc2ktQWtwbGFrYS9EYXRhNjA3LWRhdGFfYWNxdWlzaXRpb25fYW5kX21hbmFnZW1lbnQvbWFpbi9Qcm9qZWN0JTIwMi9EYXRhMi1NYWpvckxlYWd1ZUJhc2ViYWxsY3N2LmNzdiIpDQpoZWFkKGRmMiw3KQ0KYGBgDQojIyBUaWR5IHRoZSBkYXRhDQpGaXJzdCB3ZSBuZWVkIHRvIGZpbGwgdGhlIG1pc3NpbmcgdmFsdWVzIG9uIHRoZSBmaXJzdCBjb2x1bW4NCmBgYHtyfQ0KZGYyJFRlYW1bZGYyJFRlYW0gPT0gIiJdIDwtIE5BDQpkZjIkVGVhbSA8LSBuYS5sb2NmKGRmMiRUZWFtKQ0KZGYyDQpgYGANCk5vdyB3ZSBjYW4gdHJhbnNmb3JtIHRoZSBkYXRhIGZyb20gd2lkZSB0byBsb25nIA0KYGBge3J9DQpkZjJfbG9uZyA8LSBkZjIgJT4lIA0KICBwaXZvdF9sb25nZXIoY29scyA9IGMoWDIwMTguSFJzLCBYMjAxOS5IUnMsIFgyMDIxLkhScywNCiAgICAgICAgICAgICAgICAgICAgICAgIFgyMDIyLkhScyxYMjAyMy5IUnMpICwNCiAgICAgICAgICAgICAgIG5hbWVzX3RvID0gIlllYXIiLA0KICAgICAgICAgICAgICAgdmFsdWVzX3RvID0gIkhvbWUgUnVucyIpDQoNCmhlYWQoZGYyX2xvbmcsIDcpDQpgYGANCkZpbmFsbHkgbGV0J3MgY2hhbmdlIHRoZSBmb3JtYXQgZm9yIHRoZSB5ZWFyIGNvbHVtbg0KYGBge3J9DQpkZjJfbG9uZyRZZWFyIDwtIHN1YigiXlgiLCAiIiwgZGYyX2xvbmckWWVhcikNCmRmMl9sb25nJFllYXIgPC0gZ3N1YigiLkhScyIsICIiLCBkZjJfbG9uZyRZZWFyKQ0KY2xhc3MoZGYyX2xvbmckWWVhcikNCmBgYA0KDQpDb252ZXJ0IHRoZSB5ZWFyIGNvbHVtbiB0byBhIGRhdGUNCmBgYHtyfQ0KZGYyX2xvbmckWWVhciA8LSBhcy5pbnRlZ2VyKGRmMl9sb25nJFllYXIpDQpoZWFkKGRmMl9sb25nLDcpDQpgYGANCiMjIEFuYWx5c2lzDQoNCkxldCdzIGNyZWF0ZSBhIHBsb3Qgc2hvd2luZyBob3cgbWFueSBob21lIHJ1bnMgd2VyZSBzY29yZWQgZm9yIHRoZSBwb3NpdGlvbiAiMkIiIGJ5IHRoZSB0ZWFtICJCQUwiIGFuZCB0aGUgdGVhbSAiTllZIiBvdmVydGltZS4NCmBgYHtyfQ0KDQpkZjJfQkFMIDwtIGRmMl9sb25nICU+JSANCiAgZmlsdGVyKFRlYW0gPT0gJ0JBTCcgJiBQb3NpdGlvbiA9PSAnMkInKQ0KZGYyX05ZWSA8LSBkZjJfbG9uZyAlPiUgDQogIGZpbHRlcihUZWFtID09ICdOWVknICYgUG9zaXRpb24gPT0gJzJCJykNCg0KDQpnZ3Bsb3QoKSArDQogIGdlb21fbGluZShkYXRhID0gZGYyX0JBTCwgYWVzKHggPSBZZWFyLCB5ID0gYEhvbWUgUnVuc2AsIGNvbG9yID0gIkJBTCIpKSsNCiAgZ2VvbV9saW5lKGRhdGEgPSBkZjJfTllZLCBhZXMoeCA9IFllYXIsIHkgPSBgSG9tZSBSdW5zYCwgY29sb3IgPSAiTllZIikpICsNCiAgbGFicyh4ID0gIlllYXIiLCB5ID0gIkhvbWUgUnVucyIsIHRpdGxlID0gIkhvbWUgUnVucyBDb21wYXJpc29uIikNCiANCmBgYA0KDQpUaGUgcGxvdCBzaG93cyB0aGF0IE5ZWSdzIHBsYXllcnMgYXQgUG9zaXRpb24gMkIgc2NvcmUgbW9yZSBob21lIHJ1bnMgdGhhbiBCQUwncyBwbGF5ZXJzLg0KDQo=