### Module 3 Lab


# Part 1 --------------------------------------------------------------------------

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
transactions <- transactions_sample # just using a sample of the entire data
transactions
## # A tibble: 75,000 × 11
##    household_id store_id basket_id   product_id quantity sales_value retail_disc
##    <chr>        <chr>    <chr>       <chr>         <dbl>       <dbl>       <dbl>
##  1 2261         309      31625220889 940996            1        3.86        0.43
##  2 2131         368      32053127496 873902            1        1.59        0.9 
##  3 511          316      32445856036 847901            1        1           0.69
##  4 400          388      31932241118 13094913          2       11.9         2.9 
##  5 918          340      32074655895 1085604           1        1.29        0   
##  6 718          324      32614612029 883203            1        2.5         0.49
##  7 868          323      32074722463 9884484           1        3.49        0   
##  8 1688         450      34850403304 1028715           1        2           1.79
##  9 467          31782    31280745102 896613            2        6.55        4.44
## 10 1947         32004    32744181707 978497            1        3.99        0   
## # ℹ 74,990 more rows
## # ℹ 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>

products
## # A tibble: 92,331 × 7
##    product_id manufacturer_id department    brand  product_category product_type
##    <chr>      <chr>           <chr>         <fct>  <chr>            <chr>       
##  1 25671      2               GROCERY       Natio… FRZN ICE         ICE - CRUSH…
##  2 26081      2               MISCELLANEOUS Natio… <NA>             <NA>        
##  3 26093      69              PASTRY        Priva… BREAD            BREAD:ITALI…
##  4 26190      69              GROCERY       Priva… FRUIT - SHELF S… APPLE SAUCE 
##  5 26355      69              GROCERY       Priva… COOKIES/CONES    SPECIALTY C…
##  6 26426      69              GROCERY       Priva… SPICES & EXTRAC… SPICES & SE…
##  7 26540      69              GROCERY       Priva… COOKIES/CONES    TRAY PACK/C…
##  8 26601      69              DRUG GM       Priva… VITAMINS         VITAMIN - M…
##  9 26636      69              PASTRY        Priva… BREAKFAST SWEETS SW GDS: SW …
## 10 26691      16              GROCERY       Priva… PNT BTR/JELLY/J… HONEY       
## # ℹ 92,321 more rows
## # ℹ 1 more variable: package_size <chr>



# Exercise 1 ----------------------------------------------------------------------

## regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity
## loyalty_price = (sales_value + coupon_match_disc) / quantity
## coupon_price = (sales_value - coupon_disc) / quantity

# Q0: Create three new variables named `regular_price`, `loyalty_price`, and
# `coupon_price` according to the logic shown above

transactions <- transactions %>%
  mutate(
    regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity,
    loyalty_price = (sales_value + coupon_match_disc) / quantity,
    coupon_price = (sales_value - coupon_disc) / quantity
  ) %>%
  select(regular_price, loyalty_price, coupon_price, product_id, everything())


# Q1. Identify the five households with the largest `loyalty_price` transactions. What
# is unique about the transaction with the largest `loyalty_price` value?

transactions %>%
  slice_max(order_by = loyalty_price, n = 5)
## # A tibble: 5 × 14
##   regular_price loyalty_price coupon_price product_id household_id store_id
##           <dbl>         <dbl>        <dbl> <chr>      <chr>        <chr>   
## 1         Inf           Inf          Inf   13945244   2491         389     
## 2         100           100          100   12484608   1246         334     
## 3         100.          100.         100.  13040176   2318         381     
## 4          88.9          88.9         88.9 15630122   1172         396     
## 5          85.0          66.1         66.1 916561     2312         442     
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## #   retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>
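## The transaction with the largest `loyalty_price` has a quantity of 0, so the
## per-unit price calculation divides by zero and returns Inf.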
# Q2. Now filter for only those observations where quantity is greater than 0. Which
# household(s) have the largest `loyalty_price` transaction?
transactions %>%
  filter(quantity > 0) %>%
  slice_max(order_by = loyalty_price, n = 1)
## # A tibble: 1 × 14
##   regular_price loyalty_price coupon_price product_id household_id store_id
##           <dbl>         <dbl>        <dbl> <chr>      <chr>        <chr>   
## 1           100           100          100 12484608   1246         334     
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## #   retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>
## household id = 1246

# Q3. Using the first transaction in the result from #2, filter the `products` data based
# on the `product_id` to find out which product the largest `loyalty_price` transaction
# is associated with.
products %>%
  filter(product_id == "12484608") # product_id is a character column
## # A tibble: 1 × 7
##   product_id manufacturer_id department    brand   product_category product_type
##   <chr>      <chr>           <chr>         <fct>   <chr>            <chr>       
## 1 12484608   903             MISCELLANEOUS Private COUPON/MISC ITE… ELECTRONIC …
## # ℹ 1 more variable: package_size <chr>
# Exercise 2 ----------------------------------------------------------------------

## Hint: After filtering, select the product_id column and count 
## unique products using the n_distinct() function.

# how many products had a regular price of $1 or less
transactions %>%
  filter(regular_price <= 1) %>%
  select(product_id) %>%
  n_distinct()
## [1] 2748
# how many products had a loyalty price of $1 or less
transactions %>%
  filter(loyalty_price <= 1) %>%
  select(product_id) %>%
  n_distinct()
## [1] 4648
# how many products had a coupon price of $1 or less
transactions %>%
  filter(coupon_price <= 1) %>%
  select(product_id) %>%
  n_distinct()
## [1] 4844
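# An equivalent approach (not part of the solution above) counts distinct
# products inside summarize(); shown for the regular-price case only:
transactions %>%
  filter(regular_price <= 1) %>%
  summarize(n_products = n_distinct(product_id))
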
# Exercise 3 ----------------------------------------------------------------------

## What proportion of baskets are over $10 in sales value? 
## What proportion of baskets are over $20 in sales value?
## Hint: You need to use group_by() and summarize(). 
## Depending on your approach you may or may not use mutate().
## Test your ability to write this code from scratch rather than just
## filling in the blanks :)
## _______

library(dplyr)
transactions %>%
  group_by(basket_id) %>%
  summarize(total_sales = sum(quantity * loyalty_price, na.rm = TRUE)) %>%
  summarize(proportions = mean(total_sales > 10))
## # A tibble: 1 × 1
##   proportions
##         <dbl>
## 1       0.107
transactions %>%
  group_by(basket_id) %>%
  summarize(total_sales = sum(quantity * loyalty_price, na.rm = TRUE)) %>%
  summarize(proportions = mean(total_sales > 20))
## # A tibble: 1 × 1
##   proportions
##         <dbl>
## 1      0.0258
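# Note: the code above measures basket size as quantity * loyalty_price. If
# "sales value" is taken literally, a sketch that sums sales_value per basket
# instead (results may differ slightly from the proportions shown above):
transactions %>%
  group_by(basket_id) %>%
  summarize(total_sales = sum(sales_value, na.rm = TRUE)) %>%
  summarize(
    prop_over_10 = mean(total_sales > 10),
    prop_over_20 = mean(total_sales > 20)
  )
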
# Exercise 4 ----------------------------------------------------------------------

## Which stores had the largest total sales_value
## (hint: sum(sales_value, na.rm = TRUE))?
## Which stores had the largest average loyalty discount as defined below?
## Hint: You can calculate loyalty discount as a percentage 
## of regular price using the following logic:

library(dplyr)
transactions
## # A tibble: 75,000 × 14
##    regular_price loyalty_price coupon_price product_id household_id store_id
##            <dbl>         <dbl>        <dbl> <chr>      <chr>        <chr>   
##  1          4.29          3.86         3.86 940996     2261         309     
##  2          2.49          1.59         1.59 873902     2131         368     
##  3          1.69          1            1    847901     511          316     
##  4          7.38          5.94         5.94 13094913   400          388     
##  5          1.29          1.29         1.29 1085604    918          340     
##  6          2.99          2.5          2.5  883203     718          324     
##  7          3.49          3.49         3.49 9884484    868          323     
##  8          3.79          2            2    1028715    1688         450     
##  9          5.50          3.28         3.28 896613     467          31782   
## 10          3.99          3.99         3.99 978497     1947         32004   
## # ℹ 74,990 more rows
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## #   retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>
## pct_loyalty_disc = 1 - (loyalty_price / regular_price)

# Which stores had the largest total `sales_value`
transactions %>%
  group_by(store_id) %>%
  summarize(total_sales_value = sum(sales_value, na.rm = TRUE)) %>%
  arrange(desc(total_sales_value))
## # A tibble: 293 × 2
##    store_id total_sales_value
##    <chr>                <dbl>
##  1 367                  7713.
##  2 406                  6034.
##  3 429                  4702.
##  4 343                  4471.
##  5 361                  4061.
##  6 356                  3807.
##  7 381                  3746.
##  8 292                  3702.
##  9 31782                3684.
## 10 321                  3515.
## # ℹ 283 more rows
# Which stores had the largest average loyalty discount
transactions %>%
  mutate(pct_loyalty_disc = 1 - (loyalty_price / regular_price)) %>%
  group_by(store_id) %>%
  summarize(avg_pct_loyalty_disc = mean(pct_loyalty_disc, na.rm = TRUE)) %>%
  arrange(desc(avg_pct_loyalty_disc))
## # A tibble: 293 × 2
##    store_id avg_pct_loyalty_disc
##    <chr>                   <dbl>
##  1 224                     0.576
##  2 62                      0.501
##  3 779                     0.475
##  4 3163                    0.433
##  5 784                     0.430
##  6 2950                    0.418
##  7 572                     0.401
##  8 65                      0.390
##  9 486                     0.388
## 10 2839                    0.373
## # ℹ 283 more rows
# Part 2 --------------------------------------------------------------------------


# Exercise 1 - Import the data ----------------------------------------------------


library(readxl)
excel_sheets(path = "mbta.xlsx")
## [1] "Sheet1"
mbta <- read_excel(path = "mbta.xlsx", skip = 1, na = "NA")
## New names:
## • `` -> `...1`
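# The "New names" message indicates the first column in the spreadsheet has no
# header, so read_excel() assigns it the placeholder name `...1`.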
mbta
## # A tibble: 11 × 60
##     ...1 mode        `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
##    <dbl> <chr>           <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
##  1     1 All Modes …     NA        NA     1188.       NA        NA      1246.   
##  2     2 Boat             4         3.6     40         4.3       4.9       5.8  
##  3     3 Bus            336.      339.     340.      352.      354.      351.   
##  4     4 Commuter R…    142.      138.     138.      140.      139       143    
##  5     5 Heavy Rail     435.      448.     459.      472.      475.      477.   
##  6     6 Light Rail     227.      240.     241.      256.      248.      246.   
##  7     7 Pct Chg / …      0.02     -0.04     0.114    -0.002     0.049     0.096
##  8     8 Private Bus      4.77      4.42     4.57      4.54      4.77      4.72 
##  9     9 RIDE             4.9       5        5.5       5.4       5.4       5.6  
## 10    10 Trackless …     12.8      12.9     13.1      13.4      13.5      13.3  
## 11    11 TOTAL         1167.     1192.    1205.     1247.     1245.     1246.   
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## #   `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## #   `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## #   `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## #   `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## #   `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## #   `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …
# Exercise 2 - Examine the data ---------------------------------------------------

head(mbta)
## # A tibble: 6 × 60
##    ...1 mode         `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
##   <dbl> <chr>            <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1     1 All Modes b…       NA       NA       1188.      NA        NA      1246. 
## 2     2 Boat                4        3.6       40        4.3       4.9       5.8
## 3     3 Bus               336.     339.       340.     352.      354.      351. 
## 4     4 Commuter Ra…      142.     138.       138.     140.      139       143  
## 5     5 Heavy Rail        435.     448.       459.     472.      475.      477. 
## 6     6 Light Rail        227.     240.       241.     256.      248.      246. 
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## #   `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## #   `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## #   `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## #   `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## #   `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## #   `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …
view(mbta)
head(mbta, n = 10)
## # A tibble: 10 × 60
##     ...1 mode        `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
##    <dbl> <chr>           <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
##  1     1 All Modes …     NA        NA     1188.       NA        NA      1246.   
##  2     2 Boat             4         3.6     40         4.3       4.9       5.8  
##  3     3 Bus            336.      339.     340.      352.      354.      351.   
##  4     4 Commuter R…    142.      138.     138.      140.      139       143    
##  5     5 Heavy Rail     435.      448.     459.      472.      475.      477.   
##  6     6 Light Rail     227.      240.     241.      256.      248.      246.   
##  7     7 Pct Chg / …      0.02     -0.04     0.114    -0.002     0.049     0.096
##  8     8 Private Bus      4.77      4.42     4.57      4.54      4.77      4.72 
##  9     9 RIDE             4.9       5        5.5       5.4       5.4       5.6  
## 10    10 Trackless …     12.8      12.9     13.1      13.4      13.5      13.3  
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## #   `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## #   `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## #   `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## #   `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## #   `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## #   `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …
summary(mbta)
##       ...1          mode              2007-01            2007-02        
##  Min.   : 1.0   Length:11          Min.   :   0.020   Min.   :  -0.040  
##  1st Qu.: 3.5   Class :character   1st Qu.:   4.804   1st Qu.:   4.563  
##  Median : 6.0   Mode  :character   Median :  77.478   Median :  75.707  
##  Mean   : 6.0                      Mean   : 233.397   Mean   : 238.324  
##  3rd Qu.: 8.5                      3rd Qu.: 308.672   3rd Qu.: 314.072  
##  Max.   :11.0                      Max.   :1166.974   Max.   :1191.639  
##                                    NA's   :1          NA's   :1         
##     2007-03            2007-04            2007-05            2007-06        
##  Min.   :   0.114   Min.   :  -0.002   Min.   :   0.049   Min.   :   0.096  
##  1st Qu.:   9.278   1st Qu.:   4.756   1st Qu.:   5.025   1st Qu.:   5.700  
##  Median : 137.700   Median :  76.472   Median :  76.240   Median : 143.000  
##  Mean   : 330.293   Mean   : 249.421   Mean   : 248.956   Mean   : 339.846  
##  3rd Qu.: 399.225   3rd Qu.: 328.011   3rd Qu.: 327.841   3rd Qu.: 413.788  
##  Max.   :1204.725   Max.   :1247.105   Max.   :1244.755   Max.   :1246.129  
##                     NA's   :1          NA's   :1                            
##     2007-07            2007-08            2007-09            2007-10        
##  Min.   :  -0.037   Min.   :   0.004   Min.   :  -0.007   Min.   :  -0.064  
##  1st Qu.:   5.570   1st Qu.:   5.624   1st Qu.:   5.539   1st Qu.:   5.310  
##  Median :  77.851   Median :  77.753   Median : 143.051   Median :  80.582  
##  Mean   : 248.787   Mean   : 244.665   Mean   : 352.554   Mean   : 248.884  
##  3rd Qu.: 328.961   3rd Qu.: 325.336   3rd Qu.: 436.082   3rd Qu.: 336.994  
##  Max.   :1243.952   Max.   :1223.323   Max.   :1310.764   Max.   :1244.453  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2007-11            2007-12            2008-01            2008-02        
##  Min.   :  -0.077   Min.   :  -0.060   Min.   :   0.048   Min.   :   0.061  
##  1st Qu.:   4.478   1st Qu.:   4.385   1st Qu.:   4.475   1st Qu.:   4.485  
##  Median :  79.356   Median : 141.585   Median :  78.023   Median :  78.389  
##  Mean   : 248.371   Mean   : 321.588   Mean   : 244.615   Mean   : 252.803  
##  3rd Qu.: 310.744   3rd Qu.: 380.594   3rd Qu.: 315.549   3rd Qu.: 327.005  
##  Max.   :1241.895   Max.   :1216.890   Max.   :1223.050   Max.   :1263.983  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2008-03            2008-04            2008-05            2008-06        
##  Min.   :   0.058   Min.   :   0.060   Min.   :   0.046   Min.   :   0.060  
##  1st Qu.:   5.170   1st Qu.:   4.689   1st Qu.:   4.629   1st Qu.:   5.742  
##  Median : 137.453   Median :  77.555   Median :  78.506   Median : 142.057  
##  Mean   : 345.604   Mean   : 264.435   Mean   : 260.323   Mean   : 359.667  
##  3rd Qu.: 427.601   3rd Qu.: 343.909   3rd Qu.: 337.515   3rd Qu.: 440.656  
##  Max.   :1274.031   Max.   :1322.146   Max.   :1301.591   Max.   :1320.728  
##                     NA's   :1          NA's   :1                            
##     2008-07            2008-08            2008-09            2008-10        
##  Min.   :   0.069   Min.   :   0.023   Min.   :   0.021   Min.   :   0.054  
##  1st Qu.:   6.019   1st Qu.:   5.887   1st Qu.:   5.691   1st Qu.:   5.087  
##  Median :  80.061   Median :  79.141   Median : 141.907   Median :  82.486  
##  Mean   : 266.027   Mean   : 250.383   Mean   : 362.099   Mean   : 262.440  
##  3rd Qu.: 345.730   3rd Qu.: 330.629   3rd Qu.: 453.430   3rd Qu.: 330.996  
##  Max.   :1330.103   Max.   :1251.905   Max.   :1338.015   Max.   :1312.172  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2008-11            2008-12            2009-01            2009-02       
##  Min.   :   0.002   Min.   :  -0.015   Min.   :  -0.034   Min.   :  -0.02  
##  1st Qu.:   4.829   1st Qu.:   4.689   1st Qu.:   4.186   1st Qu.:   4.38  
##  Median :  82.774   Median : 140.810   Median :  76.874   Median :  76.59  
##  Mean   : 248.871   Mean   : 319.882   Mean   : 236.303   Mean   : 247.65  
##  3rd Qu.: 317.084   3rd Qu.: 386.255   3rd Qu.: 307.814   3rd Qu.: 327.49  
##  Max.   :1244.354   Max.   :1232.655   Max.   :1181.534   Max.   :1238.24  
##  NA's   :1                             NA's   :1          NA's   :1        
##     2009-03            2009-04            2009-05            2009-06        
##  Min.   :  -0.050   Min.   :  -0.048   Min.   :  -0.058   Min.   :  -0.079  
##  1st Qu.:   5.003   1st Qu.:   4.720   1st Qu.:   4.763   1st Qu.:   5.845  
##  Median : 142.893   Median :  76.833   Median :  78.358   Median : 142.006  
##  Mean   : 330.142   Mean   : 251.603   Mean   : 245.116   Mean   : 333.194  
##  3rd Qu.: 410.455   3rd Qu.: 329.093   3rd Qu.: 322.303   3rd Qu.: 410.482  
##  Max.   :1210.912   Max.   :1258.037   Max.   :1225.608   Max.   :1233.085  
##                     NA's   :1          NA's   :1                            
##     2009-07            2009-08            2009-09            2009-10        
##  Min.   :  -0.094   Min.   :  -0.044   Min.   :  -0.035   Min.   :  -0.014  
##  1st Qu.:   6.298   1st Qu.:   6.033   1st Qu.:   5.693   1st Qu.:   4.883  
##  Median :  74.558   Median :  75.604   Median : 139.087   Median :  75.178  
##  Mean   : 241.006   Mean   : 239.427   Mean   : 346.687   Mean   : 258.811  
##  3rd Qu.: 312.185   3rd Qu.: 307.442   3rd Qu.: 437.332   3rd Qu.: 347.086  
##  Max.   :1205.079   Max.   :1197.158   Max.   :1291.564   Max.   :1294.064  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2009-11            2009-12            2010-01            2010-02        
##  Min.   :  -0.022   Min.   :  -0.022   Min.   :   0.004   Min.   :  -0.022  
##  1st Qu.:   4.323   1st Qu.:   4.784   1st Qu.:   4.034   1st Qu.:   4.062  
##  Median :  70.997   Median : 126.066   Median :  71.588   Median :  72.238  
##  Mean   : 243.363   Mean   : 312.962   Mean   : 237.255   Mean   : 242.244  
##  3rd Qu.: 328.913   3rd Qu.: 386.659   3rd Qu.: 322.769   3rd Qu.: 324.137  
##  Max.   :1216.824   Max.   :1207.845   Max.   :1186.271   Max.   :1211.228  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2010-03            2010-04            2010-05            2010-06        
##  Min.   :   0.012   Min.   :   0.007   Min.   :   0.013   Min.   :   0.008  
##  1st Qu.:   5.274   1st Qu.:   5.130   1st Qu.:   5.086   1st Qu.:   6.436  
##  Median : 131.252   Median :  72.370   Median :  70.785   Median : 129.144  
##  Mean   : 332.726   Mean   : 253.446   Mean   : 248.231   Mean   : 335.964  
##  3rd Qu.: 428.420   3rd Qu.: 340.770   3rd Qu.: 339.579   3rd Qu.: 426.769  
##  Max.   :1225.556   Max.   :1267.226   Max.   :1241.148   Max.   :1244.409  
##                     NA's   :1          NA's   :1                            
##     2010-07            2010-08            2010-09            2010-10        
##  Min.   :   0.001   Min.   :  -0.015   Min.   :   0.001   Min.   :   0.009  
##  1st Qu.:   6.531   1st Qu.:   6.281   1st Qu.:   5.567   1st Qu.:   5.006  
##  Median :  64.950   Median :  68.388   Median : 132.892   Median :  69.340  
##  Mean   : 241.180   Mean   : 235.947   Mean   : 346.524   Mean   : 261.255  
##  3rd Qu.: 319.462   3rd Qu.: 310.399   3rd Qu.: 451.361   3rd Qu.: 347.832  
##  Max.   :1205.901   Max.   :1179.745   Max.   :1293.117   Max.   :1306.271  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2010-11            2010-12            2011-01            2011-02        
##  Min.   :   0.022   Min.   :  -0.004   Min.   :  -0.028   Min.   :   0.008  
##  1st Qu.:   4.402   1st Qu.:   4.466   1st Qu.:   4.039   1st Qu.:   4.329  
##  Median :  69.166   Median : 121.422   Median :  69.750   Median :  68.579  
##  Mean   : 248.748   Mean   : 312.917   Mean   : 230.680   Mean   : 244.133  
##  3rd Qu.: 331.437   3rd Qu.: 388.385   3rd Qu.: 300.831   3rd Qu.: 314.647  
##  Max.   :1243.730   Max.   :1216.262   Max.   :1153.413   Max.   :1220.663  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2011-03           2011-04            2011-05            2011-06        
##  Min.   :   0.05   Min.   :   0.036   Min.   :   0.050   Min.   :   0.054  
##  1st Qu.:   6.03   1st Qu.:   5.224   1st Qu.:   5.338   1st Qu.:   6.926  
##  Median : 134.37   Median :  73.384   Median :  74.216   Median : 135.581  
##  Mean   : 345.17   Mean   : 262.660   Mean   : 260.582   Mean   : 353.331  
##  3rd Qu.: 448.56   3rd Qu.: 345.904   3rd Qu.: 345.411   3rd Qu.: 452.923  
##  Max.   :1286.66   Max.   :1313.283   Max.   :1302.884   Max.   :1302.414  
##                    NA's   :1          NA's   :1                            
##     2011-07            2011-08            2011-09            2011-10        
##  Min.   :   0.067   Min.   :   0.052   Min.   :   0.043   Min.   :   0.032  
##  1st Qu.:   6.911   1st Qu.:   7.067   1st Qu.:   6.660   1st Qu.:   5.513  
##  Median :  71.735   Median :  70.853   Median : 136.901   Median :  70.508  
##  Mean   : 257.228   Mean   : 248.259   Mean   : 362.555   Mean   : 269.648  
##  3rd Qu.: 328.941   3rd Qu.: 320.386   3rd Qu.: 469.204   3rd Qu.: 358.284  
##  Max.   :1286.107   Max.   :1241.268   Max.   :1348.754   Max.   :1348.222  
##  NA's   :1          NA's   :1                             NA's   :1
sum(is.na(mbta))
## [1] 39
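# To see which columns those 39 missing values fall in (not required by the
# lab), count NAs per column:
colSums(is.na(mbta))
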
# Exercise 3 - Removing unnecessary rows and columns ------------------------------

## 1. Remove the first, seventh, and eleventh rows of mbta.
## 2. Remove the first column.
## 3. What are the dimensions of this new data frame?


mbta <- mbta %>%
  slice(-c(1,7,11)) %>% # Remove the first, seventh, and eleventh rows of mbta.
  select(-1) # Remove the first column.


dim(mbta) # What are the dimensions of this new data frame?
## [1]  8 59
head(mbta)
## # A tibble: 6 × 59
##   mode     `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06` `2007-07`
##   <chr>        <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1 Boat          4         3.6      40         4.3       4.9       5.8       6.52
## 2 Bus         336.      339.      340.      352.      354.      351.      358.  
## 3 Commute…    142.      138.      138.      140.      139       143       142.  
## 4 Heavy R…    435.      448.      459.      472.      475.      477.      472.  
## 5 Light R…    227.      240.      241.      256.      248.      246.      243.  
## 6 Private…      4.77      4.42      4.57      4.54      4.77      4.72      3.94
## # ℹ 51 more variables: `2007-08` <dbl>, `2007-09` <dbl>, `2007-10` <dbl>,
## #   `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>, `2008-02` <dbl>,
## #   `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>, `2008-06` <dbl>,
## #   `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>, `2008-10` <dbl>,
## #   `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>, `2009-02` <dbl>,
## #   `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>, `2009-06` <dbl>,
## #   `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, `2009-10` <dbl>, …
# Exercise 4 - Observations are stored in columns ---------------------------------

## 1. Pivot the rows and columns of the mbta data so 
## that all columns are variables of the data. 
## This should result in 3 columns - mode, date, and 
## number of riders in thousands (thou_riders).
## 2. What are the new dimensions of this data?

library(dplyr)
library(tidyr)

mbta <- mbta %>%
  pivot_longer(
    cols = -mode,
    names_to = "date",
    values_to = "thou_riders"  
  )

dim(mbta)
## [1] 464   3
head(mbta)
## # A tibble: 6 × 3
##   mode  date    thou_riders
##   <chr> <chr>         <dbl>
## 1 Boat  2007-01         4  
## 2 Boat  2007-02         3.6
## 3 Boat  2007-03        40  
## 4 Boat  2007-04         4.3
## 5 Boat  2007-05         4.9
## 6 Boat  2007-06         5.8
view(mbta)

# Exercise 5 - Separating columns -------------------------------------------------


## Split the date column of mbta at the dash and create a new
## year column with only the year and a month column with only
## the month.

mbta <- mbta %>%
  separate(date, into = c("year", "month"), sep = "-")

head(mbta) 
## # A tibble: 6 × 4
##   mode  year  month thou_riders
##   <chr> <chr> <chr>       <dbl>
## 1 Boat  2007  01            4  
## 2 Boat  2007  02            3.6
## 3 Boat  2007  03           40  
## 4 Boat  2007  04            4.3
## 5 Boat  2007  05            4.9
## 6 Boat  2007  06            5.8
# View the head of this new mbta data set.
view(mbta)

# Exercise 6 - Dealing with entry error -------------------------------------------

incorrect <- mbta %>%
  filter(mode == "Boat", thou_riders == 40)
incorrect
## # A tibble: 1 × 4
##   mode  year  month thou_riders
##   <chr> <chr> <chr>       <dbl>
## 1 Boat  2007  03             40
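# The other Boat values all fall roughly between 3 and 7, so the 40 looks like a
# misplaced decimal point. A minimal sketch of a fix, assuming the intended value
# is 4.0, stored in a new object (`mbta_fixed` is a hypothetical name) so the
# Exercise 7 results below still match their printed output:
mbta_fixed <- mbta %>%
  mutate(thou_riders = if_else(mode == "Boat" & thou_riders == 40,
                               4, thou_riders))
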
# Exercise 7 - Performing descriptive analysis ------------------------------------

## 1. Compute the average ridership per mode.

mbta %>%
  group_by(mode) %>%
  summarize(avg_ridership = mean(thou_riders, na.rm = TRUE))
## # A tibble: 8 × 2
##   mode              avg_ridership
##   <chr>                     <dbl>
## 1 Boat                       5.07
## 2 Bus                      359.  
## 3 Commuter Rail            137.  
## 4 Heavy Rail               489.  
## 5 Light Rail               233.  
## 6 Private Bus                3.35
## 7 RIDE                       6.60
## 8 Trackless Trolley         12.1
## 2. Compute the average ridership per mode for the month of January.

mbta %>%
  filter(month == "01") %>%
  group_by(mode) %>%
  summarize(avg_ridership = mean(thou_riders, na.rm = TRUE))
## # A tibble: 8 × 2
##   mode              avg_ridership
##   <chr>                     <dbl>
## 1 Boat                       3.31
## 2 Bus                      342.  
## 3 Commuter Rail            137.  
## 4 Heavy Rail               460.  
## 5 Light Rail               217.  
## 6 Private Bus                3.47
## 7 RIDE                       5.94
## 8 Trackless Trolley         12.5
## 3. Which year had the largest total ridership for the boat mode?

mbta %>%
  filter(mode == "Boat") %>%
  group_by(year) %>%
  summarize(total_riders = sum(thou_riders, na.rm = TRUE)) %>%
  arrange(desc(total_riders)) %>%
  slice(1)
## # A tibble: 1 × 2
##   year  total_riders
##   <chr>        <dbl>
## 1 2007          93.1
## 4. On average, which month experiences the greatest number of passengers on the Heavy Rail mode?

mbta %>%
  filter(mode == "Heavy Rail") %>%
  group_by(month) %>%
  summarize(avg_ridership = mean(thou_riders, na.rm = TRUE)) %>%
  arrange(desc(avg_ridership)) %>%
  slice(1)
## # A tibble: 1 × 2
##   month avg_ridership
##   <chr>         <dbl>
## 1 09             518.
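## Month 09 (September) has the highest average Heavy Rail ridership.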