Import data
# excel file
teams <- read_excel("C:/Users/deleo/OneDrive/Desktop/PSU_DAT3000_IntroToDA/00_data/myData_updated.xlsx")
teams
## # A tibble: 236 × 20
## TEAMID TEAM PAKE PAKERANK PASE PASERANK GAMES W L WINPERCENT R64
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Abil… 0.7 45 0.7 52 3 1 2 0.333 2
## 2 2 Akron -0.9 179 -1.1 187 4 0 4 0 4
## 3 3 Alab… -2.1 211 -2.9 220 10 5 5 0.5 5
## 4 4 Alba… -0.4 147 -0.3 138 3 0 3 0 3
## 5 6 Amer… -0.5 160 -0.4 150 3 0 3 0 3
## 6 8 Ariz… -1.7 206 -2.5 216 28 17 11 0.607 11
## 7 9 Ariz… -2 209 -1.9 206 5 1 4 0.2 4
## 8 10 Arka… 4.3 11 3.5 16 18 11 7 0.611 7
## 9 11 Arka… 0 76 0 78 1 0 1 0 1
## 10 12 Aubu… 0.6 53 1.4 30 11 7 4 0.636 4
## # ℹ 226 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## # CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>
Apply the following dplyr verbs to your data
Filter rows
filter(teams, GAMES > 3, W > 3)
## # A tibble: 63 × 20
## TEAMID TEAM PAKE PAKERANK PASE PASERANK GAMES W L WINPERCENT R64
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 Alab… -2.1 211 -2.9 220 10 5 5 0.5 5
## 2 8 Ariz… -1.7 206 -2.5 216 28 17 11 0.607 11
## 3 10 Arka… 4.3 11 3.5 16 18 11 7 0.611 7
## 4 12 Aubu… 0.6 53 1.4 30 11 7 4 0.636 4
## 5 14 Bayl… 0.9 41 -0.3 138 29 19 10 0.655 11
## 6 24 Butl… 7 4 8.7 4 26 17 9 0.654 9
## 7 32 Cinc… -3.6 228 -3.5 224 15 6 9 0.4 9
## 8 40 Conn… 8.6 2 10.6 1 29 23 6 0.793 9
## 9 43 Crei… 0.7 45 0.9 41 17 9 8 0.529 8
## 10 45 Dayt… 2.5 22 1.3 32 10 5 5 0.5 5
## # ℹ 53 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## # CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>
filter(teams, GAMES > 2 & WINPERCENT > .50)
## # A tibble: 47 × 20
## TEAMID TEAM PAKE PAKERANK PASE PASERANK GAMES W L WINPERCENT R64
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8 Ariz… -1.7 206 -2.5 216 28 17 11 0.607 11
## 2 10 Arka… 4.3 11 3.5 16 18 11 7 0.611 7
## 3 12 Aubu… 0.6 53 1.4 30 11 7 4 0.636 4
## 4 14 Bayl… 0.9 41 -0.3 138 29 19 10 0.655 11
## 5 24 Butl… 7 4 8.7 4 26 17 9 0.654 9
## 6 40 Conn… 8.6 2 10.6 1 29 23 6 0.793 9
## 7 43 Crei… 0.7 45 0.9 41 17 9 8 0.529 8
## 8 50 Duke 2 24 0 78 46 34 12 0.739 14
## 9 55 Flor… 4.3 11 5.9 7 28 19 9 0.679 9
## 10 56 Flor… 3.3 17 3.4 18 5 4 1 0.8 1
## # ℹ 37 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## # CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>
Arrange rows
arrange(teams, desc(GAMES), desc(WINPERCENT))
## # A tibble: 236 × 20
## TEAMID TEAM PAKE PAKERANK PASE PASERANK GAMES W L WINPERCENT R64
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 86 Kans… 4.2 13 -1.8 205 53 40 13 0.755 15
## 2 135 Nort… 12 1 9.8 2 50 39 11 0.78 13
## 3 68 Gonz… 3.1 18 3.8 14 47 32 15 0.681 15
## 4 50 Duke 2 24 0 78 46 34 12 0.739 14
## 5 115 Mich… 8 3 9.2 3 45 31 14 0.689 14
## 6 90 Kent… 6.9 6 8.5 5 43 32 11 0.744 12
## 7 228 Vill… 4.8 9 4 11 40 29 11 0.725 13
## 8 240 Wisc… 2.5 22 4 11 36 23 13 0.639 13
## 9 114 Mich… 7 4 7.6 6 35 24 11 0.686 11
## 10 102 Loui… 1.8 28 3.5 16 31 22 9 0.71 10
## # ℹ 226 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## # CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>
Select columns
select(teams, GAMES:WINPERCENT)
## # A tibble: 236 × 4
## GAMES W L WINPERCENT
## <dbl> <dbl> <dbl> <dbl>
## 1 3 1 2 0.333
## 2 4 0 4 0
## 3 10 5 5 0.5
## 4 3 0 3 0
## 5 3 0 3 0
## 6 28 17 11 0.607
## 7 5 1 4 0.2
## 8 18 11 7 0.611
## 9 1 0 1 0
## 10 11 7 4 0.636
## # ℹ 226 more rows
select(teams, W, L, WINPERCENT)
## # A tibble: 236 × 3
## W L WINPERCENT
## <dbl> <dbl> <dbl>
## 1 1 2 0.333
## 2 0 4 0
## 3 5 5 0.5
## 4 0 3 0
## 5 0 3 0
## 6 17 11 0.607
## 7 1 4 0.2
## 8 11 7 0.611
## 9 0 1 0
## 10 7 4 0.636
## # ℹ 226 more rows
select(teams, PASE, PASERANK, starts_with("W"))
## # A tibble: 236 × 4
## PASE PASERANK W WINPERCENT
## <dbl> <dbl> <dbl> <dbl>
## 1 0.7 52 1 0.333
## 2 -1.1 187 0 0
## 3 -2.9 220 5 0.5
## 4 -0.3 138 0 0
## 5 -0.4 150 0 0
## 6 -2.5 216 17 0.607
## 7 -1.9 206 1 0.2
## 8 3.5 16 11 0.611
## 9 0 78 0 0
## 10 1.4 30 7 0.636
## # ℹ 226 more rows
select(teams, W, L, everything())
## # A tibble: 236 × 20
## W L TEAMID TEAM PAKE PAKERANK PASE PASERANK GAMES WINPERCENT R64
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2 1 Abil… 0.7 45 0.7 52 3 0.333 2
## 2 0 4 2 Akron -0.9 179 -1.1 187 4 0 4
## 3 5 5 3 Alab… -2.1 211 -2.9 220 10 0.5 5
## 4 0 3 4 Alba… -0.4 147 -0.3 138 3 0 3
## 5 0 3 6 Amer… -0.5 160 -0.4 150 3 0 3
## 6 17 11 8 Ariz… -1.7 206 -2.5 216 28 0.607 11
## 7 1 4 9 Ariz… -2 209 -1.9 206 5 0.2 4
## 8 11 7 10 Arka… 4.3 11 3.5 16 18 0.611 7
## 9 0 1 11 Arka… 0 76 0 78 1 0 1
## 10 7 4 12 Aubu… 0.6 53 1.4 30 11 0.636 4
## # ℹ 226 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## # CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>
Add columns
mutate(teams,
gain = GAMES - L) %>%
# Select wins, losses, win percentage, and gain
select(W:WINPERCENT, gain)
## # A tibble: 236 × 4
## W L WINPERCENT gain
## <dbl> <dbl> <dbl> <dbl>
## 1 1 2 0.333 1
## 2 0 4 0 0
## 3 5 5 0.5 5
## 4 0 3 0 0
## 5 0 3 0 0
## 6 17 11 0.607 17
## 7 1 4 0.2 1
## 8 11 7 0.611 11
## 9 0 1 0 0
## 10 7 4 0.636 7
## # ℹ 226 more rows
# just keep gain
mutate(teams,
gain = W - L) %>%
# Select
select(gain)
## # A tibble: 236 × 1
## gain
## <dbl>
## 1 -1
## 2 -4
## 3 0
## 4 -3
## 5 -3
## 6 6
## 7 -3
## 8 4
## 9 -1
## 10 3
## # ℹ 226 more rows
# Alternative using transmute()
transmute(teams,
gain = GAMES - L)
## # A tibble: 236 × 1
## gain
## <dbl>
## 1 1
## 2 0
## 3 5
## 4 0
## 5 0
## 6 17
## 7 1
## 8 11
## 9 0
## 10 7
## # ℹ 226 more rows
# lag()
select(teams, GAMES) %>%
mutate(GAMES_lag1 = lag(GAMES))
## # A tibble: 236 × 2
## GAMES GAMES_lag1
## <dbl> <dbl>
## 1 3 NA
## 2 4 3
## 3 10 4
## 4 3 10
## 5 3 3
## 6 28 3
## 7 5 28
## 8 18 5
## 9 1 18
## 10 11 1
## # ℹ 226 more rows
#cumsum()
select(teams, WINPERCENT) %>%
mutate(WINPERCENT_cumsum = cumsum(WINPERCENT))
## # A tibble: 236 × 2
## WINPERCENT WINPERCENT_cumsum
## <dbl> <dbl>
## 1 0.333 0.333
## 2 0 0.333
## 3 0.5 0.833
## 4 0 0.833
## 5 0 0.833
## 6 0.607 1.44
## 7 0.2 1.64
## 8 0.611 2.25
## 9 0 2.25
## 10 0.636 2.89
## # ℹ 226 more rows
Summarize by groups
teams %>%
# Group by departure delay
group_by(TEAM) %>%
# Calculate average departure delay
summarize(GAMES = mean(PASE, na.rm = TRUE)) %>%
# Sort it
arrange(GAMES)
## # A tibble: 236 × 2
## TEAM GAMES
## <chr> <dbl>
## 1 Georgetown -8.5
## 2 Virginia -7.3
## 3 Vanderbilt -5
## 4 Pittsburgh -4.6
## 5 Texas -4.6
## 6 Purdue -4.4
## 7 Oklahoma St. -4.2
## 8 Illinois -4.1
## 9 New Mexico -3.8
## 10 Temple -3.6
## # ℹ 226 more rows
teams %>%
group_by(TEAM) %>%
summarize(count = n()) %>%
ungroup()
## # A tibble: 236 × 2
## TEAM count
## <chr> <int>
## 1 Abilene Christian 1
## 2 Akron 1
## 3 Alabama 1
## 4 Albany 1
## 5 American 1
## 6 Arizona 1
## 7 Arizona St. 1
## 8 Arkansas 1
## 9 Arkansas Pine Bluff 1
## 10 Auburn 1
## # ℹ 226 more rows