Import data

# excel file
teams <- read_excel("C:/Users/deleo/OneDrive/Desktop/PSU_DAT3000_IntroToDA/00_data/myData_updated.xlsx")
teams
## # A tibble: 236 × 20
##    TEAMID TEAM   PAKE PAKERANK  PASE PASERANK GAMES     W     L WINPERCENT   R64
##     <dbl> <chr> <dbl>    <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl>      <dbl> <dbl>
##  1      1 Abil…   0.7       45   0.7       52     3     1     2      0.333     2
##  2      2 Akron  -0.9      179  -1.1      187     4     0     4      0         4
##  3      3 Alab…  -2.1      211  -2.9      220    10     5     5      0.5       5
##  4      4 Alba…  -0.4      147  -0.3      138     3     0     3      0         3
##  5      6 Amer…  -0.5      160  -0.4      150     3     0     3      0         3
##  6      8 Ariz…  -1.7      206  -2.5      216    28    17    11      0.607    11
##  7      9 Ariz…  -2        209  -1.9      206     5     1     4      0.2       4
##  8     10 Arka…   4.3       11   3.5       16    18    11     7      0.611     7
##  9     11 Arka…   0         76   0         78     1     0     1      0         1
## 10     12 Aubu…   0.6       53   1.4       30    11     7     4      0.636     4
## # ℹ 226 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## #   CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>

Apply the following dplyr verbs to your data

Filter rows

filter(teams, GAMES > 3, W > 3)
## # A tibble: 63 × 20
##    TEAMID TEAM   PAKE PAKERANK  PASE PASERANK GAMES     W     L WINPERCENT   R64
##     <dbl> <chr> <dbl>    <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl>      <dbl> <dbl>
##  1      3 Alab…  -2.1      211  -2.9      220    10     5     5      0.5       5
##  2      8 Ariz…  -1.7      206  -2.5      216    28    17    11      0.607    11
##  3     10 Arka…   4.3       11   3.5       16    18    11     7      0.611     7
##  4     12 Aubu…   0.6       53   1.4       30    11     7     4      0.636     4
##  5     14 Bayl…   0.9       41  -0.3      138    29    19    10      0.655    11
##  6     24 Butl…   7          4   8.7        4    26    17     9      0.654     9
##  7     32 Cinc…  -3.6      228  -3.5      224    15     6     9      0.4       9
##  8     40 Conn…   8.6        2  10.6        1    29    23     6      0.793     9
##  9     43 Crei…   0.7       45   0.9       41    17     9     8      0.529     8
## 10     45 Dayt…   2.5       22   1.3       32    10     5     5      0.5       5
## # ℹ 53 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## #   CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>
filter(teams, GAMES > 2 & WINPERCENT > .50)
## # A tibble: 47 × 20
##    TEAMID TEAM   PAKE PAKERANK  PASE PASERANK GAMES     W     L WINPERCENT   R64
##     <dbl> <chr> <dbl>    <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl>      <dbl> <dbl>
##  1      8 Ariz…  -1.7      206  -2.5      216    28    17    11      0.607    11
##  2     10 Arka…   4.3       11   3.5       16    18    11     7      0.611     7
##  3     12 Aubu…   0.6       53   1.4       30    11     7     4      0.636     4
##  4     14 Bayl…   0.9       41  -0.3      138    29    19    10      0.655    11
##  5     24 Butl…   7          4   8.7        4    26    17     9      0.654     9
##  6     40 Conn…   8.6        2  10.6        1    29    23     6      0.793     9
##  7     43 Crei…   0.7       45   0.9       41    17     9     8      0.529     8
##  8     50 Duke    2         24   0         78    46    34    12      0.739    14
##  9     55 Flor…   4.3       11   5.9        7    28    19     9      0.679     9
## 10     56 Flor…   3.3       17   3.4       18     5     4     1      0.8       1
## # ℹ 37 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## #   CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>

Arrange rows

arrange(teams, desc(GAMES), desc(WINPERCENT))
## # A tibble: 236 × 20
##    TEAMID TEAM   PAKE PAKERANK  PASE PASERANK GAMES     W     L WINPERCENT   R64
##     <dbl> <chr> <dbl>    <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl>      <dbl> <dbl>
##  1     86 Kans…   4.2       13  -1.8      205    53    40    13      0.755    15
##  2    135 Nort…  12          1   9.8        2    50    39    11      0.78     13
##  3     68 Gonz…   3.1       18   3.8       14    47    32    15      0.681    15
##  4     50 Duke    2         24   0         78    46    34    12      0.739    14
##  5    115 Mich…   8          3   9.2        3    45    31    14      0.689    14
##  6     90 Kent…   6.9        6   8.5        5    43    32    11      0.744    12
##  7    228 Vill…   4.8        9   4         11    40    29    11      0.725    13
##  8    240 Wisc…   2.5       22   4         11    36    23    13      0.639    13
##  9    114 Mich…   7          4   7.6        6    35    24    11      0.686    11
## 10    102 Loui…   1.8       28   3.5       16    31    22     9      0.71     10
## # ℹ 226 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## #   CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>

Select columns

select(teams, GAMES:WINPERCENT)
## # A tibble: 236 × 4
##    GAMES     W     L WINPERCENT
##    <dbl> <dbl> <dbl>      <dbl>
##  1     3     1     2      0.333
##  2     4     0     4      0    
##  3    10     5     5      0.5  
##  4     3     0     3      0    
##  5     3     0     3      0    
##  6    28    17    11      0.607
##  7     5     1     4      0.2  
##  8    18    11     7      0.611
##  9     1     0     1      0    
## 10    11     7     4      0.636
## # ℹ 226 more rows
select(teams, W, L, WINPERCENT)
## # A tibble: 236 × 3
##        W     L WINPERCENT
##    <dbl> <dbl>      <dbl>
##  1     1     2      0.333
##  2     0     4      0    
##  3     5     5      0.5  
##  4     0     3      0    
##  5     0     3      0    
##  6    17    11      0.607
##  7     1     4      0.2  
##  8    11     7      0.611
##  9     0     1      0    
## 10     7     4      0.636
## # ℹ 226 more rows
select(teams, PASE, PASERANK, starts_with("W"))
## # A tibble: 236 × 4
##     PASE PASERANK     W WINPERCENT
##    <dbl>    <dbl> <dbl>      <dbl>
##  1   0.7       52     1      0.333
##  2  -1.1      187     0      0    
##  3  -2.9      220     5      0.5  
##  4  -0.3      138     0      0    
##  5  -0.4      150     0      0    
##  6  -2.5      216    17      0.607
##  7  -1.9      206     1      0.2  
##  8   3.5       16    11      0.611
##  9   0         78     0      0    
## 10   1.4       30     7      0.636
## # ℹ 226 more rows
select(teams, W, L, everything())
## # A tibble: 236 × 20
##        W     L TEAMID TEAM   PAKE PAKERANK  PASE PASERANK GAMES WINPERCENT   R64
##    <dbl> <dbl>  <dbl> <chr> <dbl>    <dbl> <dbl>    <dbl> <dbl>      <dbl> <dbl>
##  1     1     2      1 Abil…   0.7       45   0.7       52     3      0.333     2
##  2     0     4      2 Akron  -0.9      179  -1.1      187     4      0         4
##  3     5     5      3 Alab…  -2.1      211  -2.9      220    10      0.5       5
##  4     0     3      4 Alba…  -0.4      147  -0.3      138     3      0         3
##  5     0     3      6 Amer…  -0.5      160  -0.4      150     3      0         3
##  6    17    11      8 Ariz…  -1.7      206  -2.5      216    28      0.607    11
##  7     1     4      9 Ariz…  -2        209  -1.9      206     5      0.2       4
##  8    11     7     10 Arka…   4.3       11   3.5       16    18      0.611     7
##  9     0     1     11 Arka…   0         76   0         78     1      0         1
## 10     7     4     12 Aubu…   0.6       53   1.4       30    11      0.636     4
## # ℹ 226 more rows
## # ℹ 9 more variables: R32 <dbl>, S16 <dbl>, E8 <dbl>, F4 <dbl>, F2 <dbl>,
## #   CHAMP <dbl>, TOP2 <dbl>, F4PERCENT <dbl>, CHAMPPERCENT <dbl>

Add columns

mutate(teams,
       gain = GAMES - L) %>%
    
    # Select wins, losses, win percentage, and gain
    select(W:WINPERCENT, gain)
## # A tibble: 236 × 4
##        W     L WINPERCENT  gain
##    <dbl> <dbl>      <dbl> <dbl>
##  1     1     2      0.333     1
##  2     0     4      0         0
##  3     5     5      0.5       5
##  4     0     3      0         0
##  5     0     3      0         0
##  6    17    11      0.607    17
##  7     1     4      0.2       1
##  8    11     7      0.611    11
##  9     0     1      0         0
## 10     7     4      0.636     7
## # ℹ 226 more rows
# just keep gain
mutate(teams,
       gain = W - L) %>%
    
    # Select 
    select(gain)
## # A tibble: 236 × 1
##     gain
##    <dbl>
##  1    -1
##  2    -4
##  3     0
##  4    -3
##  5    -3
##  6     6
##  7    -3
##  8     4
##  9    -1
## 10     3
## # ℹ 226 more rows
# Alternative using transmute()
transmute(teams,
          gain = GAMES - L)
## # A tibble: 236 × 1
##     gain
##    <dbl>
##  1     1
##  2     0
##  3     5
##  4     0
##  5     0
##  6    17
##  7     1
##  8    11
##  9     0
## 10     7
## # ℹ 226 more rows
# lag()
select(teams, GAMES) %>%
    
    mutate(GAMES_lag1 = lag(GAMES))
## # A tibble: 236 × 2
##    GAMES GAMES_lag1
##    <dbl>      <dbl>
##  1     3         NA
##  2     4          3
##  3    10          4
##  4     3         10
##  5     3          3
##  6    28          3
##  7     5         28
##  8    18          5
##  9     1         18
## 10    11          1
## # ℹ 226 more rows
#cumsum()
select(teams, WINPERCENT) %>%
    
    mutate(WINPERCENT_cumsum = cumsum(WINPERCENT))
## # A tibble: 236 × 2
##    WINPERCENT WINPERCENT_cumsum
##         <dbl>             <dbl>
##  1      0.333             0.333
##  2      0                 0.333
##  3      0.5               0.833
##  4      0                 0.833
##  5      0                 0.833
##  6      0.607             1.44 
##  7      0.2               1.64 
##  8      0.611             2.25 
##  9      0                 2.25 
## 10      0.636             2.89 
## # ℹ 226 more rows

Summarize by groups

teams %>% 
    
    # Group by departure delay
    group_by(TEAM) %>%
    
    # Calculate average departure delay
    summarize(GAMES = mean(PASE, na.rm = TRUE)) %>%
    
    # Sort it
    arrange(GAMES)
## # A tibble: 236 × 2
##    TEAM         GAMES
##    <chr>        <dbl>
##  1 Georgetown    -8.5
##  2 Virginia      -7.3
##  3 Vanderbilt    -5  
##  4 Pittsburgh    -4.6
##  5 Texas         -4.6
##  6 Purdue        -4.4
##  7 Oklahoma St.  -4.2
##  8 Illinois      -4.1
##  9 New Mexico    -3.8
## 10 Temple        -3.6
## # ℹ 226 more rows
teams %>%
    group_by(TEAM) %>%
    summarize(count = n()) %>%
    ungroup()
## # A tibble: 236 × 2
##    TEAM                count
##    <chr>               <int>
##  1 Abilene Christian       1
##  2 Akron                   1
##  3 Alabama                 1
##  4 Albany                  1
##  5 American                1
##  6 Arizona                 1
##  7 Arizona St.             1
##  8 Arkansas                1
##  9 Arkansas Pine Bluff     1
## 10 Auburn                  1
## # ℹ 226 more rows