Import data
# excel file
data <- read_excel("../00_data/myData.xlsx")
data
## # A tibble: 236 × 20
## TEAMID TEAM PAKE PAKERANK PASE PASERANK GAMES W L WINPERCENT R64
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Abil… 0.7 45 0.7 52 3 1 2 0.333 2
## 2 2 Akron -0.9 179 -1.1 187 4 0 4 0 4
## 3 3 Alab… -2.1 211 -2.9 220 10 5 5 0.5 5
## 4 4 Alba… -0.4 147 -0.3 138 3 0 3 0 3
## 5 6 Amer… -0.5 160 -0.4 150 3 0 3 0 3
## 6 8 Ariz… -1.7 206 -2.5 216 28 17 11 0.607 11
## 7 9 Ariz… -2 209 -1.9 206 5 1 4 0.2 4
## 8 10 Arka… 4.3 11 3.5 16 18 11 7 0.611 7
## 9 11 Arka… 0 76 0 78 1 0 1 0 1
## 10 12 Aubu… 0.6 53 1.4 30 11 7 4 0.636 4
## # ℹ 226 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## # CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>
Apply the following dplyr verbs to your data
Filter rows
filter(data, GAMES > 10)
## # A tibble: 49 × 20
## TEAMID TEAM PAKE PAKERANK PASE PASERANK GAMES W L WINPERCENT R64
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8 Ariz… -1.7 206 -2.5 216 28 17 11 0.607 11
## 2 10 Arka… 4.3 11 3.5 16 18 11 7 0.611 7
## 3 12 Aubu… 0.6 53 1.4 30 11 7 4 0.636 4
## 4 14 Bayl… 0.9 41 -0.3 138 29 19 10 0.655 11
## 5 24 Butl… 7 4 8.7 4 26 17 9 0.654 9
## 6 32 Cinc… -3.6 228 -3.5 224 15 6 9 0.4 9
## 7 40 Conn… 8.6 2 10.6 1 29 23 6 0.793 9
## 8 43 Crei… 0.7 45 0.9 41 17 9 8 0.529 8
## 9 50 Duke 2 24 0 78 46 34 12 0.739 14
## 10 55 Flor… 4.3 11 5.9 7 28 19 9 0.679 9
## # ℹ 39 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## # CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>
filter(data, W > 5)
## # A tibble: 47 × 20
## TEAMID TEAM PAKE PAKERANK PASE PASERANK GAMES W L WINPERCENT R64
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8 Ariz… -1.7 206 -2.5 216 28 17 11 0.607 11
## 2 10 Arka… 4.3 11 3.5 16 18 11 7 0.611 7
## 3 12 Aubu… 0.6 53 1.4 30 11 7 4 0.636 4
## 4 14 Bayl… 0.9 41 -0.3 138 29 19 10 0.655 11
## 5 24 Butl… 7 4 8.7 4 26 17 9 0.654 9
## 6 32 Cinc… -3.6 228 -3.5 224 15 6 9 0.4 9
## 7 40 Conn… 8.6 2 10.6 1 29 23 6 0.793 9
## 8 43 Crei… 0.7 45 0.9 41 17 9 8 0.529 8
## 9 50 Duke 2 24 0 78 46 34 12 0.739 14
## 10 55 Flor… 4.3 11 5.9 7 28 19 9 0.679 9
## # ℹ 37 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## # CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>
Arrange rows
arrange(data, desc(PASERANK), desc(PAKERANK))
## # A tibble: 236 × 20
## TEAMID TEAM PAKE PAKERANK PASE PASERANK GAMES W L WINPERCENT R64
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 64 Geor… -6.2 235 -8.5 236 10 3 7 0.3 7
## 2 229 Virg… -6.7 236 -7.3 235 21 13 8 0.619 9
## 3 225 Vand… -3.4 225 -5 234 6 1 5 0.167 5
## 4 200 Texas -5.5 234 -4.6 232 22 10 12 0.455 12
## 5 162 Pitt… -3.5 227 -4.6 232 16 8 8 0.5 8
## 6 167 Purd… -4.4 232 -4.4 231 29 16 13 0.552 13
## 7 154 Okla… -4.1 229 -4.2 230 9 2 7 0.222 7
## 8 78 Illi… -3.3 224 -4.1 229 10 4 6 0.4 6
## 9 131 New … -2.7 220 -3.8 228 6 2 4 0.333 4
## 10 221 Utah… -4.4 232 -3.6 226 6 0 6 0 6
## # ℹ 226 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## # CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>
Select columns
select(data, GAMES:PAKE)
## # A tibble: 236 × 5
## GAMES PASERANK PASE PAKERANK PAKE
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 52 0.7 45 0.7
## 2 4 187 -1.1 179 -0.9
## 3 10 220 -2.9 211 -2.1
## 4 3 138 -0.3 147 -0.4
## 5 3 150 -0.4 160 -0.5
## 6 28 216 -2.5 206 -1.7
## 7 5 206 -1.9 209 -2
## 8 18 16 3.5 11 4.3
## 9 1 78 0 76 0
## 10 11 30 1.4 53 0.6
## # ℹ 226 more rows
select(data, R64, R32, E8, F4, F2)
## # A tibble: 236 × 5
## R64 R32 E8 F4 F2
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 1 0 0 0
## 2 4 0 0 0 0
## 3 5 3 0 0 0
## 4 3 0 0 0 0
## 5 3 0 0 0 0
## 6 11 7 3 0 0
## 7 4 1 0 0 0
## 8 7 6 2 0 0
## 9 1 0 0 0 0
## 10 4 4 1 1 0
## # ℹ 226 more rows
Add columns
#cumsum()
select(data, GAMES) %>%
mutate(GAMES_cumsum = cumsum(GAMES))
## # A tibble: 236 × 2
## GAMES GAMES_cumsum
## <dbl> <dbl>
## 1 3 3
## 2 4 7
## 3 10 17
## 4 3 20
## 5 3 23
## 6 28 51
## 7 5 56
## 8 18 74
## 9 1 75
## 10 11 86
## # ℹ 226 more rows
#lag()
select(data, PASERANK) %>%
mutate(PASERANK_lag1 = lag(PASERANK))
## # A tibble: 236 × 2
## PASERANK PASERANK_lag1
## <dbl> <dbl>
## 1 52 NA
## 2 187 52
## 3 220 187
## 4 138 220
## 5 150 138
## 6 216 150
## 7 206 216
## 8 16 206
## 9 78 16
## 10 30 78
## # ℹ 226 more rows
Summarize by groups
data
## # A tibble: 236 × 20
## TEAMID TEAM PAKE PAKERANK PASE PASERANK GAMES W L WINPERCENT R64
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Abil… 0.7 45 0.7 52 3 1 2 0.333 2
## 2 2 Akron -0.9 179 -1.1 187 4 0 4 0 4
## 3 3 Alab… -2.1 211 -2.9 220 10 5 5 0.5 5
## 4 4 Alba… -0.4 147 -0.3 138 3 0 3 0 3
## 5 6 Amer… -0.5 160 -0.4 150 3 0 3 0 3
## 6 8 Ariz… -1.7 206 -2.5 216 28 17 11 0.607 11
## 7 9 Ariz… -2 209 -1.9 206 5 1 4 0.2 4
## 8 10 Arka… 4.3 11 3.5 16 18 11 7 0.611 7
## 9 11 Arka… 0 76 0 78 1 0 1 0 1
## 10 12 Aubu… 0.6 53 1.4 30 11 7 4 0.636 4
## # ℹ 226 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## # CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>
summarise(data, WINPERCENT = mean(CHAMP, na.rm = TRUE))
## # A tibble: 1 × 1
## WINPERCENT
## <dbl>
## 1 0.0636
data %>%
# Group by GAMES
group_by(GAMES) %>%
# Calculate average WINPERCENT
summarise(WINPERCENT = mean(WINPERCENT, na.rm = TRUE)) %>%
# Sort it
arrange(WINPERCENT)
## # A tibble: 38 × 2
## GAMES WINPERCENT
## <dbl> <dbl>
## 1 1 0
## 2 2 0.141
## 3 3 0.195
## 4 4 0.219
## 5 6 0.244
## 6 7 0.322
## 7 10 0.362
## 8 8 0.375
## 9 13 0.385
## 10 5 0.388
## # ℹ 28 more rows