Exercise 1

transactions <- transactions_sample
transactions
## # A tibble: 75,000 × 11
##    household_id store_id basket_id   product_id quantity sales_value retail_disc
##    <chr>        <chr>    <chr>       <chr>         <dbl>       <dbl>       <dbl>
##  1 2261         309      31625220889 940996            1        3.86        0.43
##  2 2131         368      32053127496 873902            1        1.59        0.9 
##  3 511          316      32445856036 847901            1        1           0.69
##  4 400          388      31932241118 13094913          2       11.9         2.9 
##  5 918          340      32074655895 1085604           1        1.29        0   
##  6 718          324      32614612029 883203            1        2.5         0.49
##  7 868          323      32074722463 9884484           1        3.49        0   
##  8 1688         450      34850403304 1028715           1        2           1.79
##  9 467          31782    31280745102 896613            2        6.55        4.44
## 10 1947         32004    32744181707 978497            1        3.99        0   
## # ℹ 74,990 more rows
## # ℹ 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>
# Q0: Create three new variables named `regular_price`, `loyalty_price`, and
# `coupon_price` according to the logic shown above
transactions <- transactions %>%
mutate(
regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity,
loyalty_price = (sales_value + coupon_match_disc) / quantity,
coupon_price = (sales_value - coupon_disc) / quantity
) %>%

  select(regular_price, loyalty_price, coupon_price, product_id, everything())

# Q1. Identify the five households with the largest `loyalty_price` transactions. What
# is unique about the transaction with the largest `loyalty_price` value?
transactions %>%
slice_max(order_by = loyalty_price, n = 5)
## # A tibble: 5 × 14
##   regular_price loyalty_price coupon_price product_id household_id store_id
##           <dbl>         <dbl>        <dbl> <chr>      <chr>        <chr>   
## 1         Inf           Inf          Inf   13945244   2491         389     
## 2         100           100          100   12484608   1246         334     
## 3         100.0         100.0        100.0 13040176   2318         381     
## 4          88.9          88.9         88.9 15630122   1172         396     
## 5          85.0          66.1         66.1 916561     2312         442     
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## #   retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>

Q1. The transaction with the largest loyalty_price has an infinite value, this is the unique feature of the highest loyalty_price transaction.

# Q2. Now filter for only those observations where quantity was greater than 0. Now which
# household(s) have the largest `loyalty_price` transaction?
transactions %>%
filter(quantity > 0) %>%
slice_max(order_by = loyalty_price, n = 5)
## # A tibble: 5 × 14
##   regular_price loyalty_price coupon_price product_id household_id store_id
##           <dbl>         <dbl>        <dbl> <chr>      <chr>        <chr>   
## 1         100           100          100   12484608   1246         334     
## 2         100.0         100.0        100.0 13040176   2318         381     
## 3          88.9          88.9         88.9 15630122   1172         396     
## 4          85.0          66.1         66.1 916561     2312         442     
## 5          63.8          63.8         63.8 1076056    57           298     
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## #   retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>

Q2. The household with the largest loyalty_price transaction is household_id 1246, the product_id associated is 12484608.

# Q3. Using the first transaction in the result from #2, filter the `products` data based
# on the `product_id` to find out which product the largest `loyalty_price` transaction
# is associated with.
products %>%
filter(product_id == 12484608)
## # A tibble: 1 × 7
##   product_id manufacturer_id department    brand   product_category product_type
##   <chr>      <chr>           <chr>         <fct>   <chr>            <chr>       
## 1 12484608   903             MISCELLANEOUS Private COUPON/MISC ITE… ELECTRONIC …
## # ℹ 1 more variable: package_size <chr>

Q3. Using the first transaction in the result from Q2, the product with the largest loyalty_price is electronic gift cards activating.

Exercise 2

# how many products had a regular price of $1 or less
transactions %>%
filter(regular_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 2748
# how many products had a loyalty price of $1 or less
transactions %>%
filter(loyalty_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 4648
# how many products had a coupon price of $1 or less
transactions %>%
filter(coupon_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 4844

There are 2748 products with a regular price of $1 or less, there are 4648 products with a loyalty price of $1 or less, and there are 4844 products with a coupon price of $1 or less.

Exercise 3

#What proportion of baskets are over $10 in sales value?
transactions %>%
  group_by(basket_id) %>%
  summarise(basket_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
  filter(basket_sales > 10) %>%
  select(basket_id) %>%
  n_distinct() / n_distinct(transactions$basket_id) * 100
## [1] 10.65978

There are 10.66% of baskets over $10 in sales value.

#What proportion of baskets are over $20 in sales value?
transactions %>%
  group_by(basket_id) %>%
  summarise(basket_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
  filter(basket_sales > 20) %>%
  select(basket_id) %>%
  n_distinct() / n_distinct(transactions$basket_id) * 100
## [1] 2.57816

There are 2.58% of baskets over $20 in sales value.

Exercise 4

# Which stores had the largest total `sales_value`
transactions %>%
  group_by(store_id) %>%
  summarize(total_sales_value = sum(sales_value, na.rm = TRUE)) %>%
  arrange(desc(total_sales_value))
## # A tibble: 293 × 2
##    store_id total_sales_value
##    <chr>                <dbl>
##  1 367                  7713.
##  2 406                  6034.
##  3 429                  4702.
##  4 343                  4471.
##  5 361                  4061.
##  6 356                  3807.
##  7 381                  3746.
##  8 292                  3702.
##  9 31782                3684.
## 10 321                  3515.
## # ℹ 283 more rows

The stores with the largest total_sales_vale are stores 367, 406, 429, 343, and 361.

transactions %>%
  mutate(pct_loyalty_disc = 1 - (loyalty_price / regular_price)) %>%
  group_by(store_id) %>%
  summarize(avg_pct_loyalty_disc = mean(pct_loyalty_disc, na.rm = TRUE)) %>%
  arrange(desc(avg_pct_loyalty_disc))
## # A tibble: 293 × 2
##    store_id avg_pct_loyalty_disc
##    <chr>                   <dbl>
##  1 224                     0.576
##  2 62                      0.501
##  3 779                     0.475
##  4 3163                    0.433
##  5 784                     0.430
##  6 2950                    0.418
##  7 572                     0.401
##  8 65                      0.390
##  9 486                     0.388
## 10 2839                    0.373
## # ℹ 283 more rows

The stores with the largest average loyalty discount are 224, 62, 779, 3163, and 784.

Part 2

Exercise 1

library(readxl)

excel_sheets(path = "C:/Users/tsander/Downloads/mbta.xlsx")
## [1] "Sheet1"
mbta <- read_excel(path = "C:/Users/tsander/Downloads/mbta.xlsx", skip = 1, na = 'NA')
## New names:
## • `` -> `...1`

Exercise 2

#structure of mbta
glimpse(mbta)
## Rows: 11
## Columns: 60
## $ ...1      <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
## $ mode      <chr> "All Modes by Qtr", "Boat", "Bus", "Commuter Rail", "Heavy R…
## $ `2007-01` <dbl> NA, 4.000, 335.819, 142.200, 435.294, 227.231, 0.020, 4.772,…
## $ `2007-02` <dbl> NA, 3.600, 338.675, 138.500, 448.271, 240.262, -0.040, 4.417…
## $ `2007-03` <dbl> 1187.653, 40.000, 339.867, 137.700, 458.583, 241.444, 0.114,…
## $ `2007-04` <dbl> NA, 4.300, 352.162, 139.500, 472.201, 255.557, -0.002, 4.542…
## $ `2007-05` <dbl> NA, 4.900, 354.367, 139.000, 474.579, 248.262, 0.049, 4.768,…
## $ `2007-06` <dbl> 1245.959, 5.800, 350.543, 143.000, 477.032, 246.108, 0.096, …
## $ `2007-07` <dbl> NA, 6.521, 357.519, 142.391, 471.735, 243.286, -0.037, 3.936…
## $ `2007-08` <dbl> NA, 6.572, 355.479, 142.364, 461.605, 234.907, 0.004, 3.946,…
## $ `2007-09` <dbl> 1256.571, 5.469, 372.598, 143.051, 499.566, 265.748, -0.007,…
## $ `2007-10` <dbl> NA, 5.145, 368.847, 146.542, 457.741, 241.434, -0.064, 4.315…
## $ `2007-11` <dbl> NA, 3.763, 330.826, 145.089, 488.348, 250.497, -0.077, 4.081…
## $ `2007-12` <dbl> 1216.890, 2.985, 312.920, 141.585, 448.268, 233.379, -0.060,…
## $ `2008-01` <dbl> NA, 3.175, 340.324, 142.145, 472.624, 241.223, 0.048, 4.122,…
## $ `2008-02` <dbl> NA, 3.111, 352.905, 142.607, 492.100, 249.306, 0.061, 4.079,…
## $ `2008-03` <dbl> 1253.522, 3.512, 361.155, 137.453, 494.046, 253.132, 0.058, …
## $ `2008-04` <dbl> NA, 4.164, 368.189, 140.389, 513.204, 271.070, 0.060, 4.144,…
## $ `2008-05` <dbl> NA, 4.015, 363.903, 142.585, 507.952, 258.351, 0.046, 4.079,…
## $ `2008-06` <dbl> 1314.821, 5.189, 362.962, 142.057, 518.349, 266.961, 0.060, …
## $ `2008-07` <dbl> NA, 6.016, 370.921, 145.731, 512.309, 270.158, 0.069, 4.548,…
## $ `2008-08` <dbl> NA, 5.800, 361.057, 144.565, 476.990, 239.344, 0.023, 4.286,…
## $ `2008-09` <dbl> 1307.041, 4.587, 389.537, 141.907, 517.324, 258.171, 0.021, …
## $ `2008-10` <dbl> NA, 4.285, 357.974, 151.957, 523.644, 250.063, 0.054, 4.556,…
## $ `2008-11` <dbl> NA, 3.488, 345.423, 152.952, 487.115, 232.068, 0.002, 4.302,…
## $ `2008-12` <dbl> 1232.655, 3.007, 325.767, 140.810, 446.743, 205.420, -0.015,…
## $ `2009-01` <dbl> NA, 3.014, 338.532, 141.448, 461.004, 215.660, -0.034, 3.584…
## $ `2009-02` <dbl> NA, 3.196, 360.412, 143.529, 482.407, 228.737, -0.020, 3.604…
## $ `2009-03` <dbl> 1209.792, 3.330, 353.686, 142.893, 467.224, 222.844, -0.050,…
## $ `2009-04` <dbl> NA, 4.049, 359.380, 142.340, 493.152, 238.232, -0.048, 2.824…
## $ `2009-05` <dbl> NA, 4.119, 354.750, 144.225, 475.634, 224.962, -0.058, 2.735…
## $ `2009-06` <dbl> 1233.085, 4.900, 347.865, 142.006, 473.099, 226.259, -0.079,…
## $ `2009-07` <dbl> NA, 6.444, 339.477, 137.691, 470.828, 230.308, -0.094, 2.655…
## $ `2009-08` <dbl> NA, 5.903, 332.661, 139.158, 466.676, 231.783, -0.044, 2.504…
## $ `2009-09` <dbl> 1230.461, 4.696, 374.260, 139.087, 500.403, 250.922, -0.035,…
## $ `2009-10` <dbl> NA, 4.212, 385.868, 137.104, 513.406, 230.739, -0.014, 2.584…
## $ `2009-11` <dbl> NA, 3.576, 366.980, 129.343, 480.278, 214.711, -0.022, 2.721…
## $ `2009-12` <dbl> 1207.845, 3.113, 332.394, 126.066, 440.925, 194.446, -0.022,…
## $ `2010-01` <dbl> NA, 3.207, 362.226, 130.910, 464.069, 204.396, 0.004, 2.682,…
## $ `2010-02` <dbl> NA, 3.195, 361.138, 131.918, 480.121, 213.136, -0.022, 2.498…
## $ `2010-03` <dbl> 1208.857, 3.481, 373.443, 131.252, 483.397, 211.693, 0.012, …
## $ `2010-04` <dbl> NA, 4.452, 378.611, 131.722, 502.374, 227.246, 0.007, 2.639,…
## $ `2010-05` <dbl> NA, 4.415, 380.171, 128.800, 487.400, 217.805, 0.013, 2.686,…
## $ `2010-06` <dbl> 1244.409, 5.411, 363.275, 129.144, 490.263, 215.922, 0.008, …
## $ `2010-07` <dbl> NA, 6.513, 353.040, 122.935, 488.587, 218.729, 0.001, 2.547,…
## $ `2010-08` <dbl> NA, 6.269, 343.688, 129.732, 473.731, 210.530, -0.015, 2.433…
## $ `2010-09` <dbl> 1225.527, 4.699, 381.622, 132.892, 521.099, 236.368, 0.001, …
## $ `2010-10` <dbl> NA, 4.402, 384.987, 131.033, 532.403, 236.366, 0.009, 2.616,…
## $ `2010-11` <dbl> NA, 3.731, 367.955, 130.889, 502.887, 221.881, 0.022, 2.528,…
## $ `2010-12` <dbl> 1216.262, 3.156, 326.338, 121.422, 450.433, 196.211, -0.004,…
## $ `2011-01` <dbl> NA, 3.140, 334.958, 128.396, 468.418, 198.450, -0.028, 2.213…
## $ `2011-02` <dbl> NA, 3.284, 346.234, 125.463, 504.068, 219.886, 0.008, 2.570,…
## $ `2011-03` <dbl> 1223.452, 3.674, 380.399, 134.374, 516.730, 227.935, 0.050, …
## $ `2011-04` <dbl> NA, 4.251, 380.446, 134.169, 528.631, 242.280, 0.036, 2.762,…
## $ `2011-05` <dbl> NA, 4.431, 385.289, 136.140, 528.122, 225.776, 0.050, 2.776,…
## $ `2011-06` <dbl> 1302.414, 5.474, 376.317, 135.581, 529.528, 221.865, 0.054, …
## $ `2011-07` <dbl> NA, 6.581, 361.585, 132.410, 532.888, 231.010, 0.067, 2.671,…
## $ `2011-08` <dbl> NA, 6.733, 353.793, 130.616, 508.145, 220.164, 0.052, 2.655,…
## $ `2011-09` <dbl> 1290.549, 5.003, 388.271, 136.901, 550.137, 244.949, 0.043, …
## $ `2011-10` <dbl> NA, 4.484, 398.456, 128.720, 554.932, 237.768, 0.032, 2.967,…
#first 6 rows of data
head(mbta)
## # A tibble: 6 × 60
##    ...1 mode         `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
##   <dbl> <chr>            <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1     1 All Modes b…       NA       NA       1188.      NA        NA      1246. 
## 2     2 Boat                4        3.6       40        4.3       4.9       5.8
## 3     3 Bus               336.     339.       340.     352.      354.      351. 
## 4     4 Commuter Ra…      142.     138.       138.     140.      139       143  
## 5     5 Heavy Rail        435.     448.       459.     472.      475.      477. 
## 6     6 Light Rail        227.     240.       241.     256.      248.      246. 
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## #   `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## #   `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## #   `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## #   `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## #   `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## #   `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …
#Summary
summary(mbta)
##       ...1          mode              2007-01            2007-02        
##  Min.   : 1.0   Length:11          Min.   :   0.020   Min.   :  -0.040  
##  1st Qu.: 3.5   Class :character   1st Qu.:   4.804   1st Qu.:   4.563  
##  Median : 6.0   Mode  :character   Median :  77.478   Median :  75.707  
##  Mean   : 6.0                      Mean   : 233.397   Mean   : 238.324  
##  3rd Qu.: 8.5                      3rd Qu.: 308.672   3rd Qu.: 314.072  
##  Max.   :11.0                      Max.   :1166.974   Max.   :1191.639  
##                                    NA's   :1          NA's   :1         
##     2007-03            2007-04            2007-05            2007-06        
##  Min.   :   0.114   Min.   :  -0.002   Min.   :   0.049   Min.   :   0.096  
##  1st Qu.:   9.278   1st Qu.:   4.756   1st Qu.:   5.025   1st Qu.:   5.700  
##  Median : 137.700   Median :  76.472   Median :  76.240   Median : 143.000  
##  Mean   : 330.293   Mean   : 249.421   Mean   : 248.956   Mean   : 339.846  
##  3rd Qu.: 399.225   3rd Qu.: 328.011   3rd Qu.: 327.841   3rd Qu.: 413.788  
##  Max.   :1204.725   Max.   :1247.105   Max.   :1244.755   Max.   :1246.129  
##                     NA's   :1          NA's   :1                            
##     2007-07            2007-08            2007-09            2007-10        
##  Min.   :  -0.037   Min.   :   0.004   Min.   :  -0.007   Min.   :  -0.064  
##  1st Qu.:   5.570   1st Qu.:   5.624   1st Qu.:   5.539   1st Qu.:   5.310  
##  Median :  77.851   Median :  77.753   Median : 143.051   Median :  80.582  
##  Mean   : 248.787   Mean   : 244.665   Mean   : 352.554   Mean   : 248.884  
##  3rd Qu.: 328.961   3rd Qu.: 325.336   3rd Qu.: 436.082   3rd Qu.: 336.994  
##  Max.   :1243.952   Max.   :1223.323   Max.   :1310.764   Max.   :1244.453  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2007-11            2007-12            2008-01            2008-02        
##  Min.   :  -0.077   Min.   :  -0.060   Min.   :   0.048   Min.   :   0.061  
##  1st Qu.:   4.478   1st Qu.:   4.385   1st Qu.:   4.475   1st Qu.:   4.485  
##  Median :  79.356   Median : 141.585   Median :  78.023   Median :  78.389  
##  Mean   : 248.371   Mean   : 321.588   Mean   : 244.615   Mean   : 252.803  
##  3rd Qu.: 310.744   3rd Qu.: 380.594   3rd Qu.: 315.549   3rd Qu.: 327.005  
##  Max.   :1241.895   Max.   :1216.890   Max.   :1223.050   Max.   :1263.983  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2008-03            2008-04            2008-05            2008-06        
##  Min.   :   0.058   Min.   :   0.060   Min.   :   0.046   Min.   :   0.060  
##  1st Qu.:   5.170   1st Qu.:   4.689   1st Qu.:   4.629   1st Qu.:   5.742  
##  Median : 137.453   Median :  77.555   Median :  78.506   Median : 142.057  
##  Mean   : 345.604   Mean   : 264.435   Mean   : 260.323   Mean   : 359.667  
##  3rd Qu.: 427.601   3rd Qu.: 343.909   3rd Qu.: 337.515   3rd Qu.: 440.656  
##  Max.   :1274.031   Max.   :1322.146   Max.   :1301.591   Max.   :1320.728  
##                     NA's   :1          NA's   :1                            
##     2008-07            2008-08            2008-09            2008-10        
##  Min.   :   0.069   Min.   :   0.023   Min.   :   0.021   Min.   :   0.054  
##  1st Qu.:   6.019   1st Qu.:   5.887   1st Qu.:   5.691   1st Qu.:   5.087  
##  Median :  80.061   Median :  79.141   Median : 141.907   Median :  82.486  
##  Mean   : 266.027   Mean   : 250.383   Mean   : 362.099   Mean   : 262.440  
##  3rd Qu.: 345.730   3rd Qu.: 330.629   3rd Qu.: 453.430   3rd Qu.: 330.996  
##  Max.   :1330.103   Max.   :1251.905   Max.   :1338.015   Max.   :1312.172  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2008-11            2008-12            2009-01            2009-02       
##  Min.   :   0.002   Min.   :  -0.015   Min.   :  -0.034   Min.   :  -0.02  
##  1st Qu.:   4.829   1st Qu.:   4.689   1st Qu.:   4.186   1st Qu.:   4.38  
##  Median :  82.774   Median : 140.810   Median :  76.874   Median :  76.59  
##  Mean   : 248.871   Mean   : 319.882   Mean   : 236.303   Mean   : 247.65  
##  3rd Qu.: 317.084   3rd Qu.: 386.255   3rd Qu.: 307.814   3rd Qu.: 327.49  
##  Max.   :1244.354   Max.   :1232.655   Max.   :1181.534   Max.   :1238.24  
##  NA's   :1                             NA's   :1          NA's   :1        
##     2009-03            2009-04            2009-05            2009-06        
##  Min.   :  -0.050   Min.   :  -0.048   Min.   :  -0.058   Min.   :  -0.079  
##  1st Qu.:   5.003   1st Qu.:   4.720   1st Qu.:   4.763   1st Qu.:   5.845  
##  Median : 142.893   Median :  76.833   Median :  78.358   Median : 142.006  
##  Mean   : 330.142   Mean   : 251.603   Mean   : 245.116   Mean   : 333.194  
##  3rd Qu.: 410.455   3rd Qu.: 329.093   3rd Qu.: 322.303   3rd Qu.: 410.482  
##  Max.   :1210.912   Max.   :1258.037   Max.   :1225.608   Max.   :1233.085  
##                     NA's   :1          NA's   :1                            
##     2009-07            2009-08            2009-09            2009-10        
##  Min.   :  -0.094   Min.   :  -0.044   Min.   :  -0.035   Min.   :  -0.014  
##  1st Qu.:   6.298   1st Qu.:   6.033   1st Qu.:   5.693   1st Qu.:   4.883  
##  Median :  74.558   Median :  75.604   Median : 139.087   Median :  75.178  
##  Mean   : 241.006   Mean   : 239.427   Mean   : 346.687   Mean   : 258.811  
##  3rd Qu.: 312.185   3rd Qu.: 307.442   3rd Qu.: 437.332   3rd Qu.: 347.086  
##  Max.   :1205.079   Max.   :1197.158   Max.   :1291.564   Max.   :1294.064  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2009-11            2009-12            2010-01            2010-02        
##  Min.   :  -0.022   Min.   :  -0.022   Min.   :   0.004   Min.   :  -0.022  
##  1st Qu.:   4.323   1st Qu.:   4.784   1st Qu.:   4.034   1st Qu.:   4.062  
##  Median :  70.997   Median : 126.066   Median :  71.588   Median :  72.238  
##  Mean   : 243.363   Mean   : 312.962   Mean   : 237.255   Mean   : 242.244  
##  3rd Qu.: 328.913   3rd Qu.: 386.659   3rd Qu.: 322.769   3rd Qu.: 324.137  
##  Max.   :1216.824   Max.   :1207.845   Max.   :1186.271   Max.   :1211.228  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2010-03            2010-04            2010-05            2010-06        
##  Min.   :   0.012   Min.   :   0.007   Min.   :   0.013   Min.   :   0.008  
##  1st Qu.:   5.274   1st Qu.:   5.130   1st Qu.:   5.086   1st Qu.:   6.436  
##  Median : 131.252   Median :  72.370   Median :  70.785   Median : 129.144  
##  Mean   : 332.726   Mean   : 253.446   Mean   : 248.231   Mean   : 335.964  
##  3rd Qu.: 428.420   3rd Qu.: 340.770   3rd Qu.: 339.579   3rd Qu.: 426.769  
##  Max.   :1225.556   Max.   :1267.226   Max.   :1241.148   Max.   :1244.409  
##                     NA's   :1          NA's   :1                            
##     2010-07            2010-08            2010-09            2010-10        
##  Min.   :   0.001   Min.   :  -0.015   Min.   :   0.001   Min.   :   0.009  
##  1st Qu.:   6.531   1st Qu.:   6.281   1st Qu.:   5.567   1st Qu.:   5.006  
##  Median :  64.950   Median :  68.388   Median : 132.892   Median :  69.340  
##  Mean   : 241.180   Mean   : 235.947   Mean   : 346.524   Mean   : 261.255  
##  3rd Qu.: 319.462   3rd Qu.: 310.399   3rd Qu.: 451.361   3rd Qu.: 347.832  
##  Max.   :1205.901   Max.   :1179.745   Max.   :1293.117   Max.   :1306.271  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2010-11            2010-12            2011-01            2011-02        
##  Min.   :   0.022   Min.   :  -0.004   Min.   :  -0.028   Min.   :   0.008  
##  1st Qu.:   4.402   1st Qu.:   4.466   1st Qu.:   4.039   1st Qu.:   4.329  
##  Median :  69.166   Median : 121.422   Median :  69.750   Median :  68.579  
##  Mean   : 248.748   Mean   : 312.917   Mean   : 230.680   Mean   : 244.133  
##  3rd Qu.: 331.437   3rd Qu.: 388.385   3rd Qu.: 300.831   3rd Qu.: 314.647  
##  Max.   :1243.730   Max.   :1216.262   Max.   :1153.413   Max.   :1220.663  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2011-03           2011-04            2011-05            2011-06        
##  Min.   :   0.05   Min.   :   0.036   Min.   :   0.050   Min.   :   0.054  
##  1st Qu.:   6.03   1st Qu.:   5.224   1st Qu.:   5.338   1st Qu.:   6.926  
##  Median : 134.37   Median :  73.384   Median :  74.216   Median : 135.581  
##  Mean   : 345.17   Mean   : 262.660   Mean   : 260.582   Mean   : 353.331  
##  3rd Qu.: 448.56   3rd Qu.: 345.904   3rd Qu.: 345.411   3rd Qu.: 452.923  
##  Max.   :1286.66   Max.   :1313.283   Max.   :1302.884   Max.   :1302.414  
##                    NA's   :1          NA's   :1                            
##     2011-07            2011-08            2011-09            2011-10        
##  Min.   :   0.067   Min.   :   0.052   Min.   :   0.043   Min.   :   0.032  
##  1st Qu.:   6.911   1st Qu.:   7.067   1st Qu.:   6.660   1st Qu.:   5.513  
##  Median :  71.735   Median :  70.853   Median : 136.901   Median :  70.508  
##  Mean   : 257.228   Mean   : 248.259   Mean   : 362.555   Mean   : 269.648  
##  3rd Qu.: 328.941   3rd Qu.: 320.386   3rd Qu.: 469.204   3rd Qu.: 358.284  
##  Max.   :1286.107   Max.   :1241.268   Max.   :1348.754   Max.   :1348.222  
##  NA's   :1          NA's   :1                             NA's   :1
#Missing values in each column
colSums(is.na(mbta))
##    ...1    mode 2007-01 2007-02 2007-03 2007-04 2007-05 2007-06 2007-07 2007-08 
##       0       0       1       1       0       1       1       0       1       1 
## 2007-09 2007-10 2007-11 2007-12 2008-01 2008-02 2008-03 2008-04 2008-05 2008-06 
##       0       1       1       0       1       1       0       1       1       0 
## 2008-07 2008-08 2008-09 2008-10 2008-11 2008-12 2009-01 2009-02 2009-03 2009-04 
##       1       1       0       1       1       0       1       1       0       1 
## 2009-05 2009-06 2009-07 2009-08 2009-09 2009-10 2009-11 2009-12 2010-01 2010-02 
##       1       0       1       1       0       1       1       0       1       1 
## 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12 
##       0       1       1       0       1       1       0       1       1       0 
## 2011-01 2011-02 2011-03 2011-04 2011-05 2011-06 2011-07 2011-08 2011-09 2011-10 
##       1       1       0       1       1       0       1       1       0       1

There are 1 missing value in each column.

Exercise 3

mbta <- mbta %>%
  slice(-c(1,7,11)) %>% # Remove the first, seventh, and eleventh rows of mbta.
  select(-1) # Remove the first column.
  dim(mbta) # Now what is the dimensions of this new data frame?
## [1]  8 59

The new dimensions are 8 rows and 59 columns.

Exercise 4

mbta <- mbta %>%
  pivot_longer(
    cols = -mode,
    names_to = "date",
    values_to = "thou_riders",
    values_drop_na = FALSE
  ) # Pivot the rows and columns of the mbta data so that all columns are variables of the data. This should result in 3 columns - `mode`, `date`, and number of riders in thousands (`thou_riders`).
  
dim(mbta) # Now what is the dimensions of this new data frame?
## [1] 464   3

The dimensions of the new data frame are 464 rows and 3 columns.

Exercise 5

mbta <- mbta %>%
  separate(date, into = c("year", "month"), sep = "-", remove = FALSE) # Split the month column of mbta at the dash and create a new month column with only the month and a year column with only the year.

head(mbta) # View the head of this new mbta data set.
## # A tibble: 6 × 5
##   mode  date    year  month thou_riders
##   <chr> <chr>   <chr> <chr>       <dbl>
## 1 Boat  2007-01 2007  01            4  
## 2 Boat  2007-02 2007  02            3.6
## 3 Boat  2007-03 2007  03           40  
## 4 Boat  2007-04 2007  04            4.3
## 5 Boat  2007-05 2007  05            4.9
## 6 Boat  2007-06 2007  06            5.8

Exercise 6

mbta <- mbta %>%
  mutate(
    thou_riders = if_else(
      str_detect(mode, "(?i)boat") &
        thou_riders == 40,
      4,
      thou_riders
    )
  )
head(mbta)
## # A tibble: 6 × 5
##   mode  date    year  month thou_riders
##   <chr> <chr>   <chr> <chr>       <dbl>
## 1 Boat  2007-01 2007  01            4  
## 2 Boat  2007-02 2007  02            3.6
## 3 Boat  2007-03 2007  03            4  
## 4 Boat  2007-04 2007  04            4.3
## 5 Boat  2007-05 2007  05            4.9
## 6 Boat  2007-06 2007  06            5.8
#Check to confirm there are no values of 40, should return 0 rows
mbta %>%
  filter(str_detect(mode, "(?i)^boat$"), thou_riders == 40)
## # A tibble: 0 × 5
## # ℹ 5 variables: mode <chr>, date <chr>, year <chr>, month <chr>,
## #   thou_riders <dbl>

Exercise 7

Average ridership by mode

mbta %>%
  group_by(mode) %>%
  summarize(avg_ridership = mean(thou_riders, na.rm = TRUE))
## # A tibble: 8 × 2
##   mode              avg_ridership
##   <chr>                     <dbl>
## 1 Boat                       4.45
## 2 Bus                      359.  
## 3 Commuter Rail            137.  
## 4 Heavy Rail               489.  
## 5 Light Rail               233.  
## 6 Private Bus                3.35
## 7 RIDE                       6.60
## 8 Trackless Trolley         12.1

Average ridership by mode in January

mbta %>%
  filter(month == "01") %>%
  group_by(mode) %>%
  summarize(avg_ridership = mean(thou_riders, na.rm = TRUE))
## # A tibble: 8 × 2
##   mode              avg_ridership
##   <chr>                     <dbl>
## 1 Boat                       3.31
## 2 Bus                      342.  
## 3 Commuter Rail            137.  
## 4 Heavy Rail               460.  
## 5 Light Rail               217.  
## 6 Private Bus                3.47
## 7 RIDE                       5.94
## 8 Trackless Trolley         12.5

Which year had the largest ridership for the boat mode?

mbta %>%
  filter(mode == "Boat") %>%                        
  group_by(year) %>%
  summarize(total_riders = sum(thou_riders, na.rm = TRUE)) %>%
  slice_max(total_riders, n = 1)
## # A tibble: 1 × 2
##   year  total_riders
##   <chr>        <dbl>
## 1 2007          57.1

2007

Which month experiences the greatest number of passengers on the Heavy Rail mode?

mbta %>%
  filter(mode == "Heavy Rail") %>%
  group_by(month) %>%
  summarize(avg_ridership = mean(thou_riders, na.rm = TRUE)) %>%
  slice_max(avg_ridership, n = 1)
## # A tibble: 1 × 2
##   month avg_ridership
##   <chr>         <dbl>
## 1 09             518.

September