Part 1

Installing Packages

install.packages('completejourney')
## 
## The downloaded binary packages are in
##  /var/folders/k_/ws4k3wfd0mz22w3mtk9q3y000000gn/T//RtmphMrlSj/downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
transactions <- transactions_sample
transactions
## # A tibble: 75,000 × 11
##    household_id store_id basket_id   product_id quantity sales_value retail_disc
##    <chr>        <chr>    <chr>       <chr>         <dbl>       <dbl>       <dbl>
##  1 2261         309      31625220889 940996            1        3.86        0.43
##  2 2131         368      32053127496 873902            1        1.59        0.9 
##  3 511          316      32445856036 847901            1        1           0.69
##  4 400          388      31932241118 13094913          2       11.9         2.9 
##  5 918          340      32074655895 1085604           1        1.29        0   
##  6 718          324      32614612029 883203            1        2.5         0.49
##  7 868          323      32074722463 9884484           1        3.49        0   
##  8 1688         450      34850403304 1028715           1        2           1.79
##  9 467          31782    31280745102 896613            2        6.55        4.44
## 10 1947         32004    32744181707 978497            1        3.99        0   
## # ℹ 74,990 more rows
## # ℹ 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>
products
## # A tibble: 92,331 × 7
##    product_id manufacturer_id department    brand  product_category product_type
##    <chr>      <chr>           <chr>         <fct>  <chr>            <chr>       
##  1 25671      2               GROCERY       Natio… FRZN ICE         ICE - CRUSH…
##  2 26081      2               MISCELLANEOUS Natio… <NA>             <NA>        
##  3 26093      69              PASTRY        Priva… BREAD            BREAD:ITALI…
##  4 26190      69              GROCERY       Priva… FRUIT - SHELF S… APPLE SAUCE 
##  5 26355      69              GROCERY       Priva… COOKIES/CONES    SPECIALTY C…
##  6 26426      69              GROCERY       Priva… SPICES & EXTRAC… SPICES & SE…
##  7 26540      69              GROCERY       Priva… COOKIES/CONES    TRAY PACK/C…
##  8 26601      69              DRUG GM       Priva… VITAMINS         VITAMIN - M…
##  9 26636      69              PASTRY        Priva… BREAKFAST SWEETS SW GDS: SW …
## 10 26691      16              GROCERY       Priva… PNT BTR/JELLY/J… HONEY       
## # ℹ 92,321 more rows
## # ℹ 1 more variable: package_size <chr>

Exercise 1

Q0: Create New Variables

transactions <- transactions %>% 
  mutate (
    regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity,
    loyalty_price = (sales_value + coupon_match_disc) / quantity,
    coupon_price = (sales_value - coupon_disc) / quantity
  ) %>% 
  select(regular_price, loyalty_price, coupon_price, product_id, everything())

Q1. Identify households with larget loyalty_price transactions

transactions %>% 
  slice_max(order_by = loyalty_price, n = 5)
## # A tibble: 5 × 14
##   regular_price loyalty_price coupon_price product_id household_id store_id
##           <dbl>         <dbl>        <dbl> <chr>      <chr>        <chr>   
## 1         Inf           Inf          Inf   13945244   2491         389     
## 2         100           100          100   12484608   1246         334     
## 3         100.          100.         100.  13040176   2318         381     
## 4          88.9          88.9         88.9 15630122   1172         396     
## 5          85.0          66.1         66.1 916561     2312         442     
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## #   retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>

The regular_price and loyalty_price observations show infinity, indicating quantity was 0

Q2. Filter for observations > 0

transactions %>% 
  filter(quantity > 0) %>% 
  slice_max(order_by = loyalty_price, n = 5)
## # A tibble: 5 × 14
##   regular_price loyalty_price coupon_price product_id household_id store_id
##           <dbl>         <dbl>        <dbl> <chr>      <chr>        <chr>   
## 1         100           100          100   12484608   1246         334     
## 2         100.          100.         100.  13040176   2318         381     
## 3          88.9          88.9         88.9 15630122   1172         396     
## 4          85.0          66.1         66.1 916561     2312         442     
## 5          63.8          63.8         63.8 1076056    57           298     
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## #   retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>

Households with largest loyalty_price transactions are:

  • 1246
  • 2318
  • 1172
  • 2312
  • 57

Q3. Finding the product of largest transaction

products %>% 
  filter(product_id == 12484608)
## # A tibble: 1 × 7
##   product_id manufacturer_id department    brand   product_category product_type
##   <chr>      <chr>           <chr>         <fct>   <chr>            <chr>       
## 1 12484608   903             MISCELLANEOUS Private COUPON/MISC ITE… ELECTRONIC …
## # ℹ 1 more variable: package_size <chr>

The product is electronic gift card

Exercise 2

how many products had a regular price of $1 or less

transactions %>% 
  filter(regular_price <= 1) %>% 
  select(product_id) %>% 
  n_distinct()
## [1] 2748

how many products had a loyalty price of $1 or less

transactions %>% 
  filter(loyalty_price <= 1) %>% 
  select(product_id) %>% 
  n_distinct()
## [1] 4648

how many products had a coupon price of $1 or less

transactions %>% 
  filter(coupon_price <= 1) %>% 
  select(product_id) %>% 
  n_distinct()
## [1] 4844

Exercise 3

? What proportion of baskets are over $20 in sales value?

transactions %>% 
  group_by(basket_id) %>% 
  summarise(total_sales_value = sum(sales_value, na.rm = TRUE)) %>% 
  summarise(
    proportion_over_10 = mean(total_sales_value > 10),
    proportion_over_20 = mean(total_sales_value > 20)
  )
## # A tibble: 1 × 2
##   proportion_over_10 proportion_over_20
##                <dbl>              <dbl>
## 1              0.107             0.0258

Exercise 4

Which stores had the largest totalsales_value

transactions %>% 
  group_by(store_id) %>% 
  summarise(total_sales_value = sum(sales_value, na.rm = TRUE)) %>% 
  arrange(desc(total_sales_value))
## # A tibble: 293 × 2
##    store_id total_sales_value
##    <chr>                <dbl>
##  1 367                  7713.
##  2 406                  6034.
##  3 429                  4702.
##  4 343                  4471.
##  5 361                  4061.
##  6 356                  3807.
##  7 381                  3746.
##  8 292                  3702.
##  9 31782                3684.
## 10 321                  3515.
## # ℹ 283 more rows

Which stores had the largest average loyalty discount

transactions %>% 
  mutate(pct_loyalty_disc = 1 - (loyalty_price/ regular_price)) %>% 
  group_by(store_id) %>% 
  summarise(avg_pct_loyalty_disc = mean(pct_loyalty_disc, na.rm = TRUE)) %>% 
  arrange(desc(avg_pct_loyalty_disc))
## # A tibble: 293 × 2
##    store_id avg_pct_loyalty_disc
##    <chr>                   <dbl>
##  1 224                     0.576
##  2 62                      0.501
##  3 779                     0.475
##  4 3163                    0.433
##  5 784                     0.430
##  6 2950                    0.418
##  7 572                     0.401
##  8 65                      0.390
##  9 486                     0.388
## 10 2839                    0.373
## # ℹ 283 more rows

Part 2

Exercise 1

Import the data

library(readxl)
excel_sheets('/Users/shamssadin/Documents/4080 Data Mining/mbta.xlsx')
## [1] "Sheet1"
mbta <- read_excel(path = '/Users/shamssadin/Documents/4080 Data Mining/mbta.xlsx', skip = 1, na = 'NA')
## New names:
## • `` -> `...1`
mbta
## # A tibble: 11 × 60
##     ...1 mode        `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
##    <dbl> <chr>           <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
##  1     1 All Modes …     NA        NA     1188.       NA        NA      1246.   
##  2     2 Boat             4         3.6     40         4.3       4.9       5.8  
##  3     3 Bus            336.      339.     340.      352.      354.      351.   
##  4     4 Commuter R…    142.      138.     138.      140.      139       143    
##  5     5 Heavy Rail     435.      448.     459.      472.      475.      477.   
##  6     6 Light Rail     227.      240.     241.      256.      248.      246.   
##  7     7 Pct Chg / …      0.02     -0.04     0.114    -0.002     0.049     0.096
##  8     8 Private Bus      4.77      4.42     4.57      4.54      4.77      4.72 
##  9     9 RIDE             4.9       5        5.5       5.4       5.4       5.6  
## 10    10 Trackless …     12.8      12.9     13.1      13.4      13.5      13.3  
## 11    11 TOTAL         1167.     1192.    1205.     1247.     1245.     1246.   
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## #   `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## #   `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## #   `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## #   `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## #   `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## #   `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …

Exercise 2

Examine the data

dim(mbta)
## [1] 11 60
str(mbta)
## tibble [11 × 60] (S3: tbl_df/tbl/data.frame)
##  $ ...1   : num [1:11] 1 2 3 4 5 6 7 8 9 10 ...
##  $ mode   : chr [1:11] "All Modes by Qtr" "Boat" "Bus" "Commuter Rail" ...
##  $ 2007-01: num [1:11] NA 4 336 142 435 ...
##  $ 2007-02: num [1:11] NA 3.6 338.7 138.5 448.3 ...
##  $ 2007-03: num [1:11] 1188 40 340 138 459 ...
##  $ 2007-04: num [1:11] NA 4.3 352.2 139.5 472.2 ...
##  $ 2007-05: num [1:11] NA 4.9 354.4 139 474.6 ...
##  $ 2007-06: num [1:11] 1246 5.8 350.5 143 477 ...
##  $ 2007-07: num [1:11] NA 6.52 357.52 142.39 471.74 ...
##  $ 2007-08: num [1:11] NA 6.57 355.48 142.36 461.61 ...
##  $ 2007-09: num [1:11] 1256.57 5.47 372.6 143.05 499.57 ...
##  $ 2007-10: num [1:11] NA 5.14 368.85 146.54 457.74 ...
##  $ 2007-11: num [1:11] NA 3.76 330.83 145.09 488.35 ...
##  $ 2007-12: num [1:11] 1216.89 2.98 312.92 141.59 448.27 ...
##  $ 2008-01: num [1:11] NA 3.17 340.32 142.15 472.62 ...
##  $ 2008-02: num [1:11] NA 3.11 352.9 142.61 492.1 ...
##  $ 2008-03: num [1:11] 1253.52 3.51 361.15 137.45 494.05 ...
##  $ 2008-04: num [1:11] NA 4.16 368.19 140.39 513.2 ...
##  $ 2008-05: num [1:11] NA 4.01 363.9 142.59 507.95 ...
##  $ 2008-06: num [1:11] 1314.82 5.19 362.96 142.06 518.35 ...
##  $ 2008-07: num [1:11] NA 6.02 370.92 145.73 512.31 ...
##  $ 2008-08: num [1:11] NA 5.8 361.1 144.6 477 ...
##  $ 2008-09: num [1:11] 1307.04 4.59 389.54 141.91 517.32 ...
##  $ 2008-10: num [1:11] NA 4.29 357.97 151.96 523.64 ...
##  $ 2008-11: num [1:11] NA 3.49 345.42 152.95 487.12 ...
##  $ 2008-12: num [1:11] 1232.65 3.01 325.77 140.81 446.74 ...
##  $ 2009-01: num [1:11] NA 3.01 338.53 141.45 461 ...
##  $ 2009-02: num [1:11] NA 3.2 360.4 143.5 482.4 ...
##  $ 2009-03: num [1:11] 1209.79 3.33 353.69 142.89 467.22 ...
##  $ 2009-04: num [1:11] NA 4.05 359.38 142.34 493.15 ...
##  $ 2009-05: num [1:11] NA 4.12 354.75 144.22 475.63 ...
##  $ 2009-06: num [1:11] 1233.1 4.9 347.9 142 473.1 ...
##  $ 2009-07: num [1:11] NA 6.44 339.48 137.69 470.83 ...
##  $ 2009-08: num [1:11] NA 5.9 332.7 139.2 466.7 ...
##  $ 2009-09: num [1:11] 1230.5 4.7 374.3 139.1 500.4 ...
##  $ 2009-10: num [1:11] NA 4.21 385.87 137.1 513.41 ...
##  $ 2009-11: num [1:11] NA 3.58 366.98 129.34 480.28 ...
##  $ 2009-12: num [1:11] 1207.85 3.11 332.39 126.07 440.93 ...
##  $ 2010-01: num [1:11] NA 3.21 362.23 130.91 464.07 ...
##  $ 2010-02: num [1:11] NA 3.2 361.1 131.9 480.1 ...
##  $ 2010-03: num [1:11] 1208.86 3.48 373.44 131.25 483.4 ...
##  $ 2010-04: num [1:11] NA 4.45 378.61 131.72 502.37 ...
##  $ 2010-05: num [1:11] NA 4.42 380.17 128.8 487.4 ...
##  $ 2010-06: num [1:11] 1244.41 5.41 363.27 129.14 490.26 ...
##  $ 2010-07: num [1:11] NA 6.51 353.04 122.94 488.59 ...
##  $ 2010-08: num [1:11] NA 6.27 343.69 129.73 473.73 ...
##  $ 2010-09: num [1:11] 1225.5 4.7 381.6 132.9 521.1 ...
##  $ 2010-10: num [1:11] NA 4.4 385 131 532.4 ...
##  $ 2010-11: num [1:11] NA 3.73 367.95 130.89 502.89 ...
##  $ 2010-12: num [1:11] 1216.26 3.16 326.34 121.42 450.43 ...
##  $ 2011-01: num [1:11] NA 3.14 334.96 128.4 468.42 ...
##  $ 2011-02: num [1:11] NA 3.28 346.23 125.46 504.07 ...
##  $ 2011-03: num [1:11] 1223.45 3.67 380.4 134.37 516.73 ...
##  $ 2011-04: num [1:11] NA 4.25 380.45 134.17 528.63 ...
##  $ 2011-05: num [1:11] NA 4.43 385.29 136.14 528.12 ...
##  $ 2011-06: num [1:11] 1302.41 5.47 376.32 135.58 529.53 ...
##  $ 2011-07: num [1:11] NA 6.58 361.58 132.41 532.89 ...
##  $ 2011-08: num [1:11] NA 6.73 353.79 130.62 508.14 ...
##  $ 2011-09: num [1:11] 1291 5 388 137 550 ...
##  $ 2011-10: num [1:11] NA 4.48 398.46 128.72 554.93 ...
head(mbta)
## # A tibble: 6 × 60
##    ...1 mode         `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
##   <dbl> <chr>            <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1     1 All Modes b…       NA       NA       1188.      NA        NA      1246. 
## 2     2 Boat                4        3.6       40        4.3       4.9       5.8
## 3     3 Bus               336.     339.       340.     352.      354.      351. 
## 4     4 Commuter Ra…      142.     138.       138.     140.      139       143  
## 5     5 Heavy Rail        435.     448.       459.     472.      475.      477. 
## 6     6 Light Rail        227.     240.       241.     256.      248.      246. 
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## #   `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## #   `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## #   `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## #   `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## #   `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## #   `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …
summary(mbta)
##       ...1          mode              2007-01            2007-02        
##  Min.   : 1.0   Length:11          Min.   :   0.020   Min.   :  -0.040  
##  1st Qu.: 3.5   Class :character   1st Qu.:   4.804   1st Qu.:   4.563  
##  Median : 6.0   Mode  :character   Median :  77.478   Median :  75.707  
##  Mean   : 6.0                      Mean   : 233.397   Mean   : 238.324  
##  3rd Qu.: 8.5                      3rd Qu.: 308.672   3rd Qu.: 314.072  
##  Max.   :11.0                      Max.   :1166.974   Max.   :1191.639  
##                                    NA's   :1          NA's   :1         
##     2007-03            2007-04            2007-05            2007-06        
##  Min.   :   0.114   Min.   :  -0.002   Min.   :   0.049   Min.   :   0.096  
##  1st Qu.:   9.278   1st Qu.:   4.756   1st Qu.:   5.025   1st Qu.:   5.700  
##  Median : 137.700   Median :  76.472   Median :  76.240   Median : 143.000  
##  Mean   : 330.293   Mean   : 249.421   Mean   : 248.956   Mean   : 339.846  
##  3rd Qu.: 399.225   3rd Qu.: 328.011   3rd Qu.: 327.841   3rd Qu.: 413.788  
##  Max.   :1204.725   Max.   :1247.105   Max.   :1244.755   Max.   :1246.129  
##                     NA's   :1          NA's   :1                            
##     2007-07            2007-08            2007-09            2007-10        
##  Min.   :  -0.037   Min.   :   0.004   Min.   :  -0.007   Min.   :  -0.064  
##  1st Qu.:   5.570   1st Qu.:   5.624   1st Qu.:   5.539   1st Qu.:   5.310  
##  Median :  77.851   Median :  77.753   Median : 143.051   Median :  80.582  
##  Mean   : 248.787   Mean   : 244.665   Mean   : 352.554   Mean   : 248.884  
##  3rd Qu.: 328.961   3rd Qu.: 325.336   3rd Qu.: 436.082   3rd Qu.: 336.994  
##  Max.   :1243.952   Max.   :1223.323   Max.   :1310.764   Max.   :1244.453  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2007-11            2007-12            2008-01            2008-02        
##  Min.   :  -0.077   Min.   :  -0.060   Min.   :   0.048   Min.   :   0.061  
##  1st Qu.:   4.478   1st Qu.:   4.385   1st Qu.:   4.475   1st Qu.:   4.485  
##  Median :  79.356   Median : 141.585   Median :  78.023   Median :  78.389  
##  Mean   : 248.371   Mean   : 321.588   Mean   : 244.615   Mean   : 252.803  
##  3rd Qu.: 310.744   3rd Qu.: 380.594   3rd Qu.: 315.549   3rd Qu.: 327.005  
##  Max.   :1241.895   Max.   :1216.890   Max.   :1223.050   Max.   :1263.983  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2008-03            2008-04            2008-05            2008-06        
##  Min.   :   0.058   Min.   :   0.060   Min.   :   0.046   Min.   :   0.060  
##  1st Qu.:   5.170   1st Qu.:   4.689   1st Qu.:   4.629   1st Qu.:   5.742  
##  Median : 137.453   Median :  77.555   Median :  78.506   Median : 142.057  
##  Mean   : 345.604   Mean   : 264.435   Mean   : 260.323   Mean   : 359.667  
##  3rd Qu.: 427.601   3rd Qu.: 343.909   3rd Qu.: 337.515   3rd Qu.: 440.656  
##  Max.   :1274.031   Max.   :1322.146   Max.   :1301.591   Max.   :1320.728  
##                     NA's   :1          NA's   :1                            
##     2008-07            2008-08            2008-09            2008-10        
##  Min.   :   0.069   Min.   :   0.023   Min.   :   0.021   Min.   :   0.054  
##  1st Qu.:   6.019   1st Qu.:   5.887   1st Qu.:   5.691   1st Qu.:   5.087  
##  Median :  80.061   Median :  79.141   Median : 141.907   Median :  82.486  
##  Mean   : 266.027   Mean   : 250.383   Mean   : 362.099   Mean   : 262.440  
##  3rd Qu.: 345.730   3rd Qu.: 330.629   3rd Qu.: 453.430   3rd Qu.: 330.996  
##  Max.   :1330.103   Max.   :1251.905   Max.   :1338.015   Max.   :1312.172  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2008-11            2008-12            2009-01            2009-02       
##  Min.   :   0.002   Min.   :  -0.015   Min.   :  -0.034   Min.   :  -0.02  
##  1st Qu.:   4.829   1st Qu.:   4.689   1st Qu.:   4.186   1st Qu.:   4.38  
##  Median :  82.774   Median : 140.810   Median :  76.874   Median :  76.59  
##  Mean   : 248.871   Mean   : 319.882   Mean   : 236.303   Mean   : 247.65  
##  3rd Qu.: 317.084   3rd Qu.: 386.255   3rd Qu.: 307.814   3rd Qu.: 327.49  
##  Max.   :1244.354   Max.   :1232.655   Max.   :1181.534   Max.   :1238.24  
##  NA's   :1                             NA's   :1          NA's   :1        
##     2009-03            2009-04            2009-05            2009-06        
##  Min.   :  -0.050   Min.   :  -0.048   Min.   :  -0.058   Min.   :  -0.079  
##  1st Qu.:   5.003   1st Qu.:   4.720   1st Qu.:   4.763   1st Qu.:   5.845  
##  Median : 142.893   Median :  76.833   Median :  78.358   Median : 142.006  
##  Mean   : 330.142   Mean   : 251.603   Mean   : 245.116   Mean   : 333.194  
##  3rd Qu.: 410.455   3rd Qu.: 329.093   3rd Qu.: 322.303   3rd Qu.: 410.482  
##  Max.   :1210.912   Max.   :1258.037   Max.   :1225.608   Max.   :1233.085  
##                     NA's   :1          NA's   :1                            
##     2009-07            2009-08            2009-09            2009-10        
##  Min.   :  -0.094   Min.   :  -0.044   Min.   :  -0.035   Min.   :  -0.014  
##  1st Qu.:   6.298   1st Qu.:   6.033   1st Qu.:   5.693   1st Qu.:   4.883  
##  Median :  74.558   Median :  75.604   Median : 139.087   Median :  75.178  
##  Mean   : 241.006   Mean   : 239.427   Mean   : 346.687   Mean   : 258.811  
##  3rd Qu.: 312.185   3rd Qu.: 307.442   3rd Qu.: 437.332   3rd Qu.: 347.086  
##  Max.   :1205.079   Max.   :1197.158   Max.   :1291.564   Max.   :1294.064  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2009-11            2009-12            2010-01            2010-02        
##  Min.   :  -0.022   Min.   :  -0.022   Min.   :   0.004   Min.   :  -0.022  
##  1st Qu.:   4.323   1st Qu.:   4.784   1st Qu.:   4.034   1st Qu.:   4.062  
##  Median :  70.997   Median : 126.066   Median :  71.588   Median :  72.238  
##  Mean   : 243.363   Mean   : 312.962   Mean   : 237.255   Mean   : 242.244  
##  3rd Qu.: 328.913   3rd Qu.: 386.659   3rd Qu.: 322.769   3rd Qu.: 324.137  
##  Max.   :1216.824   Max.   :1207.845   Max.   :1186.271   Max.   :1211.228  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2010-03            2010-04            2010-05            2010-06        
##  Min.   :   0.012   Min.   :   0.007   Min.   :   0.013   Min.   :   0.008  
##  1st Qu.:   5.274   1st Qu.:   5.130   1st Qu.:   5.086   1st Qu.:   6.436  
##  Median : 131.252   Median :  72.370   Median :  70.785   Median : 129.144  
##  Mean   : 332.726   Mean   : 253.446   Mean   : 248.231   Mean   : 335.964  
##  3rd Qu.: 428.420   3rd Qu.: 340.770   3rd Qu.: 339.579   3rd Qu.: 426.769  
##  Max.   :1225.556   Max.   :1267.226   Max.   :1241.148   Max.   :1244.409  
##                     NA's   :1          NA's   :1                            
##     2010-07            2010-08            2010-09            2010-10        
##  Min.   :   0.001   Min.   :  -0.015   Min.   :   0.001   Min.   :   0.009  
##  1st Qu.:   6.531   1st Qu.:   6.281   1st Qu.:   5.567   1st Qu.:   5.006  
##  Median :  64.950   Median :  68.388   Median : 132.892   Median :  69.340  
##  Mean   : 241.180   Mean   : 235.947   Mean   : 346.524   Mean   : 261.255  
##  3rd Qu.: 319.462   3rd Qu.: 310.399   3rd Qu.: 451.361   3rd Qu.: 347.832  
##  Max.   :1205.901   Max.   :1179.745   Max.   :1293.117   Max.   :1306.271  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2010-11            2010-12            2011-01            2011-02        
##  Min.   :   0.022   Min.   :  -0.004   Min.   :  -0.028   Min.   :   0.008  
##  1st Qu.:   4.402   1st Qu.:   4.466   1st Qu.:   4.039   1st Qu.:   4.329  
##  Median :  69.166   Median : 121.422   Median :  69.750   Median :  68.579  
##  Mean   : 248.748   Mean   : 312.917   Mean   : 230.680   Mean   : 244.133  
##  3rd Qu.: 331.437   3rd Qu.: 388.385   3rd Qu.: 300.831   3rd Qu.: 314.647  
##  Max.   :1243.730   Max.   :1216.262   Max.   :1153.413   Max.   :1220.663  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2011-03           2011-04            2011-05            2011-06        
##  Min.   :   0.05   Min.   :   0.036   Min.   :   0.050   Min.   :   0.054  
##  1st Qu.:   6.03   1st Qu.:   5.224   1st Qu.:   5.338   1st Qu.:   6.926  
##  Median : 134.37   Median :  73.384   Median :  74.216   Median : 135.581  
##  Mean   : 345.17   Mean   : 262.660   Mean   : 260.582   Mean   : 353.331  
##  3rd Qu.: 448.56   3rd Qu.: 345.904   3rd Qu.: 345.411   3rd Qu.: 452.923  
##  Max.   :1286.66   Max.   :1313.283   Max.   :1302.884   Max.   :1302.414  
##                    NA's   :1          NA's   :1                            
##     2011-07            2011-08            2011-09            2011-10        
##  Min.   :   0.067   Min.   :   0.052   Min.   :   0.043   Min.   :   0.032  
##  1st Qu.:   6.911   1st Qu.:   7.067   1st Qu.:   6.660   1st Qu.:   5.513  
##  Median :  71.735   Median :  70.853   Median : 136.901   Median :  70.508  
##  Mean   : 257.228   Mean   : 248.259   Mean   : 362.555   Mean   : 269.648  
##  3rd Qu.: 328.941   3rd Qu.: 320.386   3rd Qu.: 469.204   3rd Qu.: 358.284  
##  Max.   :1286.107   Max.   :1241.268   Max.   :1348.754   Max.   :1348.222  
##  NA's   :1          NA's   :1                             NA's   :1
colSums(is.na(mbta))
##    ...1    mode 2007-01 2007-02 2007-03 2007-04 2007-05 2007-06 2007-07 2007-08 
##       0       0       1       1       0       1       1       0       1       1 
## 2007-09 2007-10 2007-11 2007-12 2008-01 2008-02 2008-03 2008-04 2008-05 2008-06 
##       0       1       1       0       1       1       0       1       1       0 
## 2008-07 2008-08 2008-09 2008-10 2008-11 2008-12 2009-01 2009-02 2009-03 2009-04 
##       1       1       0       1       1       0       1       1       0       1 
## 2009-05 2009-06 2009-07 2009-08 2009-09 2009-10 2009-11 2009-12 2010-01 2010-02 
##       1       0       1       1       0       1       1       0       1       1 
## 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12 
##       0       1       1       0       1       1       0       1       1       0 
## 2011-01 2011-02 2011-03 2011-04 2011-05 2011-06 2011-07 2011-08 2011-09 2011-10 
##       1       1       0       1       1       0       1       1       0       1
rowSums(is.na(mbta))
##  [1] 39  0  0  0  0  0  0  0  0  0  0

Exercise 3

Remove unnecessary rows and columns

mbta <- mbta %>% 
  slice(-c(1,7,11)) %>% 
  select(-1)

dim(mbta)
## [1]  8 59

Exercise 4

Make the data long

mbta <- mbta %>% 
  pivot_longer(2:59, names_to = 'date', values_to = 'thou_riders')
  
head(mbta)
## # A tibble: 6 × 3
##   mode  date    thou_riders
##   <chr> <chr>         <dbl>
## 1 Boat  2007-01         4  
## 2 Boat  2007-02         3.6
## 3 Boat  2007-03        40  
## 4 Boat  2007-04         4.3
## 5 Boat  2007-05         4.9
## 6 Boat  2007-06         5.8

Exercise 5

Seperate years and months from date

mbta <- mbta %>% 
  separate(date, into = c('year','month'), sep = '-')

head(mbta)
## # A tibble: 6 × 4
##   mode  year  month thou_riders
##   <chr> <chr> <chr>       <dbl>
## 1 Boat  2007  01            4  
## 2 Boat  2007  02            3.6
## 3 Boat  2007  03           40  
## 4 Boat  2007  04            4.3
## 5 Boat  2007  05            4.9
## 6 Boat  2007  06            5.8

Exercise 6

Fix entry error

mbta <- mbta %>%
  mutate(thou_riders = ifelse(thou_riders == 40.0, 4.0, thou_riders))
mbta
## # A tibble: 464 × 4
##    mode  year  month thou_riders
##    <chr> <chr> <chr>       <dbl>
##  1 Boat  2007  01           4   
##  2 Boat  2007  02           3.6 
##  3 Boat  2007  03           4   
##  4 Boat  2007  04           4.3 
##  5 Boat  2007  05           4.9 
##  6 Boat  2007  06           5.8 
##  7 Boat  2007  07           6.52
##  8 Boat  2007  08           6.57
##  9 Boat  2007  09           5.47
## 10 Boat  2007  10           5.14
## # ℹ 454 more rows

Exercise 7

Create descriptive statistics

mbta %>% 
  group_by(mode) %>% 
  summarise(avg_ridership = mean(thou_riders))
## # A tibble: 8 × 2
##   mode              avg_ridership
##   <chr>                     <dbl>
## 1 Boat                       4.45
## 2 Bus                      359.  
## 3 Commuter Rail            137.  
## 4 Heavy Rail               489.  
## 5 Light Rail               233.  
## 6 Private Bus                3.35
## 7 RIDE                       6.60
## 8 Trackless Trolley         12.1
mbta %>% 
  filter(month == '01') %>% 
  group_by(mode) %>% 
  summarise(avg_ridership = mean(thou_riders, na.rm = TRUE))
## # A tibble: 8 × 2
##   mode              avg_ridership
##   <chr>                     <dbl>
## 1 Boat                       3.31
## 2 Bus                      342.  
## 3 Commuter Rail            137.  
## 4 Heavy Rail               460.  
## 5 Light Rail               217.  
## 6 Private Bus                3.47
## 7 RIDE                       5.94
## 8 Trackless Trolley         12.5
mbta %>% 
  filter(mode == 'Boat') %>% 
  group_by(year) %>% 
  summarise(total_riders = sum(thou_riders)) %>% 
  arrange(desc(total_riders))
## # A tibble: 5 × 2
##   year  total_riders
##   <chr>        <dbl>
## 1 2007          57.1
## 2 2010          52.9
## 3 2009          50.6
## 4 2008          50.3
## 5 2011          47.1
mbta %>% 
  filter(mode == 'Heavy Rail') %>% 
  group_by(month) %>% 
  summarise(avg_total_riders = mean(thou_riders)) %>% 
  arrange(desc(avg_total_riders))
## # A tibble: 12 × 2
##    month avg_total_riders
##    <chr>            <dbl>
##  1 09                518.
##  2 10                516.
##  3 04                502.
##  4 06                498.
##  5 07                495.
##  6 05                495.
##  7 11                490.
##  8 03                484.
##  9 02                481.
## 10 08                477.
## 11 01                460.
## 12 12                447.