Installing packages

# Install necessary package
install.packages('tidyverse')
install.packages('completejourney')

Loading Libraries and Data

# Load necessary package
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
# Load a sample of the transactions data
transactions <- transactions_sample

Part 1

Exercise 1

Q0: Create Three New Variables

transactions <- transactions %>% 
  mutate(
    regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity,
    loyalty_price = (sales_value + coupon_match_disc) / quantity,
    coupon_price = (sales_value - coupon_disc) / quantity
  ) %>% 
  select(regular_price, loyalty_price, coupon_price, product_id, everything()) 
# View the data
head(transactions)
## # A tibble: 6 × 14
##   regular_price loyalty_price coupon_price product_id household_id store_id
##           <dbl>         <dbl>        <dbl> <chr>      <chr>        <chr>   
## 1          4.29          3.86         3.86 940996     2261         309     
## 2          2.49          1.59         1.59 873902     2131         368     
## 3          1.69          1            1    847901     511          316     
## 4          7.38          5.94         5.94 13094913   400          388     
## 5          1.29          1.29         1.29 1085604    918          340     
## 6          2.99          2.5          2.5  883203     718          324     
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## #   retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>

Q1: Top 5 loyalty_price transactions

top_loyalty_transactions <- transactions %>%
  slice_max(order_by = loyalty_price, n = 5)
top_loyalty_transactions
## # A tibble: 5 × 14
##   regular_price loyalty_price coupon_price product_id household_id store_id
##           <dbl>         <dbl>        <dbl> <chr>      <chr>        <chr>   
## 1         Inf           Inf          Inf   13945244   2491         389     
## 2         100           100          100   12484608   1246         334     
## 3         100.          100.         100.  13040176   2318         381     
## 4          88.9          88.9         88.9 15630122   1172         396     
## 5          85.0          66.1         66.1 916561     2312         442     
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## #   retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>

Q2: Filter and Identify Largest loyalty_price transaction

filtered_transactions <- transactions %>%
  filter(quantity > 0) %>%
  slice_max(order_by = loyalty_price, n = 5)
filtered_transactions
## # A tibble: 5 × 14
##   regular_price loyalty_price coupon_price product_id household_id store_id
##           <dbl>         <dbl>        <dbl> <chr>      <chr>        <chr>   
## 1         100           100          100   12484608   1246         334     
## 2         100.          100.         100.  13040176   2318         381     
## 3          88.9          88.9         88.9 15630122   1172         396     
## 4          85.0          66.1         66.1 916561     2312         442     
## 5          63.8          63.8         63.8 1076056    57           298     
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## #   retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## #   transaction_timestamp <dttm>

Q3: Identify Product Associated with Largest loyalty_price Transaction

largest_loyalty_product <- products %>%
  filter(product_id == filtered_transactions$product_id[1])
largest_loyalty_product
## # A tibble: 1 × 7
##   product_id manufacturer_id department    brand   product_category product_type
##   <chr>      <chr>           <chr>         <fct>   <chr>            <chr>       
## 1 12484608   903             MISCELLANEOUS Private COUPON/MISC ITE… ELECTRONIC …
## # ℹ 1 more variable: package_size <chr>

Exercise 2

# Count the number of unique products with a regular price of $1 or less
regular_price_count <- transactions %>%
  filter(regular_price <= 1) %>%
  select(product_id) %>%
  n_distinct()

Count the number of unique products with a loyalty price of $1 or less

loyalty_price_count <- transactions %>%
  filter(loyalty_price <= 1) %>%
  select(product_id) %>%
  n_distinct()

Count the number of unique products with a coupon price of $1 or less

coupon_price_count <- transactions %>%
  filter(coupon_price <= 1) %>%
  select(product_id) %>%
  n_distinct()

Exercise 3

What proportion of baskets are over $10 in sales value? What proportion of baskets are over $20 in sales value?

# First, group by basket_id and summarize the total sales value for each basket
basket_summary <- transactions %>%
  group_by(basket_id) %>%
  summarize(total_sales_value = sum(sales_value))

# Calculate the total number of baskets
total_baskets <- nrow(basket_summary)

# Calculate the number of baskets that are over $10 in sales value
baskets_over_10 <- basket_summary %>%
  filter(total_sales_value > 10) %>%
  nrow()

# Calculate the number of baskets that are over $20 in sales value
baskets_over_20 <- basket_summary %>%
  filter(total_sales_value > 20) %>%
  nrow()

# Calculate the proportion of baskets over $10
proportion_over_10 <- baskets_over_10 / total_baskets

# Calculate the proportion of baskets over $20
proportion_over_20 <- baskets_over_20 / total_baskets

# Print the results
print(paste("Proportion of baskets over $10:", round(proportion_over_10, 2)))
## [1] "Proportion of baskets over $10: 0.11"
print(paste("Proportion of baskets over $20:", round(proportion_over_20, 2)))
## [1] "Proportion of baskets over $20: 0.03"

Exercise 4

Which stores had the largest total sales_value?

# Identify which stores had the largest total `sales_value`.

transactions %>%
  group_by(store_id) %>%
  summarize(total_sales_value = sum(sales_value, na.rm = TRUE)) %>%
  arrange(desc(total_sales_value))
## # A tibble: 293 × 2
##    store_id total_sales_value
##    <chr>                <dbl>
##  1 367                  7713.
##  2 406                  6034.
##  3 429                  4702.
##  4 343                  4471.
##  5 361                  4061.
##  6 356                  3807.
##  7 381                  3746.
##  8 292                  3702.
##  9 31782                3684.
## 10 321                  3515.
## # ℹ 283 more rows
# Identify which stores had the largest average loyalty discount, using the logic `pct_loyalty_disc = 1 - (loyalty_price / regular_price)`.

transactions %>%
  mutate(pct_loyalty_disc = 1 - (loyalty_price / regular_price)) %>%
  group_by(store_id) %>%
  summarize(avg_pct_loyalty_disc = mean(pct_loyalty_disc, na.rm = TRUE)) %>%
  arrange(desc(avg_pct_loyalty_disc))
## # A tibble: 293 × 2
##    store_id avg_pct_loyalty_disc
##    <chr>                   <dbl>
##  1 224                     0.576
##  2 62                      0.501
##  3 779                     0.475
##  4 3163                    0.433
##  5 784                     0.430
##  6 2950                    0.418
##  7 572                     0.401
##  8 65                      0.390
##  9 486                     0.388
## 10 2839                    0.373
## # ℹ 283 more rows

Part 2 - Module 3

For this part of the lab we’ll work through the mbta.xlsx data. The Massachusetts Bay Transportation Authority (“MBTA”) manages America’s oldest subway, as well as Greater Boston’s commuter rail, ferry, and bus systems. It’s your first day on the job as the T’s data analyst and you’ve been tasked with analyzing average ridership through time. Complete the following data cleaning tasks and be sure to use proper code styling and commenting throughout your notebook.

Exercise 1 - Import the data

# Load necessary package
library(readxl)

1. What spreadsheets exist in the workbook?

sheets <- excel_sheets(path = "./mbta.xlsx")
print(sheets)
## [1] "Sheet1"

2 & 3. Import mbta.xlsx and interpret ‘NA’ as missing values

mbta <- read_excel(path = "./mbta.xlsx", 
                   sheet = 1,  # assuming you want the first sheet; change as needed
                   skip = 1,   # number of rows to skip; adjust if needed
                   na = "NA")  # interpret 'NA' as missing values
## New names:
## • `` -> `...1`

Exercise 2 - Exam the data

1. View the structure of mbta

str(mbta)
## tibble [11 × 60] (S3: tbl_df/tbl/data.frame)
##  $ ...1   : num [1:11] 1 2 3 4 5 6 7 8 9 10 ...
##  $ mode   : chr [1:11] "All Modes by Qtr" "Boat" "Bus" "Commuter Rail" ...
##  $ 2007-01: num [1:11] NA 40010 336 142 435 ...
##  $ 2007-02: num [1:11] NA 3.6 338.7 138.5 448.3 ...
##  $ 2007-03: num [1:11] 1188 40 340 138 459 ...
##  $ 2007-04: num [1:11] NA 4.3 352.2 139.5 472.2 ...
##  $ 2007-05: num [1:11] NA 4.9 354.4 139 474.6 ...
##  $ 2007-06: num [1:11] 1246 5.8 350.5 143 477 ...
##  $ 2007-07: num [1:11] NA 6.52 357.52 142.39 471.74 ...
##  $ 2007-08: num [1:11] NA 6.57 355.48 142.36 461.61 ...
##  $ 2007-09: num [1:11] 1256.57 5.47 372.6 143.05 499.57 ...
##  $ 2007-10: num [1:11] NA 5.14 368.85 146.54 457.74 ...
##  $ 2007-11: num [1:11] NA 3.76 330.83 145.09 488.35 ...
##  $ 2007-12: num [1:11] 1216.89 2.98 312.92 141.59 448.27 ...
##  $ 2008-01: num [1:11] NA 3.17 340.32 142.15 472.62 ...
##  $ 2008-02: num [1:11] NA 3.11 352.9 142.61 492.1 ...
##  $ 2008-03: num [1:11] 1253.52 3.51 361.15 137.45 494.05 ...
##  $ 2008-04: num [1:11] NA 4.16 368.19 140.39 513.2 ...
##  $ 2008-05: num [1:11] NA 4.01 363.9 142.59 507.95 ...
##  $ 2008-06: num [1:11] 1314.82 5.19 362.96 142.06 518.35 ...
##  $ 2008-07: num [1:11] NA 6.02 370.92 145.73 512.31 ...
##  $ 2008-08: num [1:11] NA 5.8 361.1 144.6 477 ...
##  $ 2008-09: num [1:11] 1307.04 4.59 389.54 141.91 517.32 ...
##  $ 2008-10: num [1:11] NA 4.29 357.97 151.96 523.64 ...
##  $ 2008-11: num [1:11] NA 3.49 345.42 152.95 487.12 ...
##  $ 2008-12: num [1:11] 1232.65 3.01 325.77 140.81 446.74 ...
##  $ 2009-01: num [1:11] NA 3.01 338.53 141.45 461 ...
##  $ 2009-02: num [1:11] NA 3.2 360.4 143.5 482.4 ...
##  $ 2009-03: num [1:11] 1209.79 3.33 353.69 142.89 467.22 ...
##  $ 2009-04: num [1:11] NA 4.05 359.38 142.34 493.15 ...
##  $ 2009-05: num [1:11] NA 4.12 354.75 144.22 475.63 ...
##  $ 2009-06: num [1:11] 1233.1 4.9 347.9 142 473.1 ...
##  $ 2009-07: num [1:11] NA 6.44 339.48 137.69 470.83 ...
##  $ 2009-08: num [1:11] NA 5.9 332.7 139.2 466.7 ...
##  $ 2009-09: num [1:11] 1230.5 4.7 374.3 139.1 500.4 ...
##  $ 2009-10: num [1:11] NA 4.21 385.87 137.1 513.41 ...
##  $ 2009-11: num [1:11] NA 3.58 366.98 129.34 480.28 ...
##  $ 2009-12: num [1:11] 1207.85 3.11 332.39 126.07 440.93 ...
##  $ 2010-01: num [1:11] NA 3.21 362.23 130.91 464.07 ...
##  $ 2010-02: num [1:11] NA 3.19 361.14 131.92 480.12 ...
##  $ 2010-03: num [1:11] 1208.86 3.48 373.44 131.25 483.4 ...
##  $ 2010-04: num [1:11] NA 4.45 378.61 131.72 502.37 ...
##  $ 2010-05: num [1:11] NA 4.42 380.17 128.8 487.4 ...
##  $ 2010-06: num [1:11] 1244.41 5.41 363.27 129.14 490.26 ...
##  $ 2010-07: num [1:11] NA 6.51 353.04 122.94 488.59 ...
##  $ 2010-08: num [1:11] NA 6.27 343.69 129.73 473.73 ...
##  $ 2010-09: num [1:11] 1225.5 4.7 381.6 132.9 521.1 ...
##  $ 2010-10: num [1:11] NA 4.4 385 131 532.4 ...
##  $ 2010-11: num [1:11] NA 3.73 367.95 130.89 502.89 ...
##  $ 2010-12: num [1:11] 1216.26 3.16 326.34 121.42 450.43 ...
##  $ 2011-01: num [1:11] NA 3.14 334.96 128.4 468.42 ...
##  $ 2011-02: num [1:11] NA 3.28 346.23 125.46 504.07 ...
##  $ 2011-03: num [1:11] 1223.45 3.67 380.4 134.37 516.73 ...
##  $ 2011-04: num [1:11] NA 4.25 380.45 134.17 528.63 ...
##  $ 2011-05: num [1:11] NA 4.43 385.29 136.14 528.12 ...
##  $ 2011-06: num [1:11] 1302.41 5.47 376.32 135.58 529.53 ...
##  $ 2011-07: num [1:11] NA 6.58 361.58 132.41 532.89 ...
##  $ 2011-08: num [1:11] NA 6.73 353.79 130.62 508.14 ...
##  $ 2011-09: num [1:11] 1291 5 388 137 550 ...
##  $ 2011-10: num [1:11] NA 4.48 398.46 128.72 554.93 ...

2. View the first 6 rows of mbta

head(mbta, 6)
## # A tibble: 6 × 60
##    ...1 mode         `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
##   <dbl> <chr>            <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1     1 All Modes b…       NA       NA       1188.      NA        NA      1246. 
## 2     2 Boat            40010        3.6       40        4.3       4.9       5.8
## 3     3 Bus               336.     339.       340.     352.      354.      351. 
## 4     4 Commuter Ra…      142.     138.       138.     140.      139       143  
## 5     5 Heavy Rail        435.     448.       459.     472.      475.      477. 
## 6     6 Light Rail        227.     240.       241.     256.      248.      246. 
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## #   `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## #   `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## #   `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## #   `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## #   `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## #   `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …

3. View a summary of mbta

summary(mbta)
##       ...1          mode              2007-01            2007-02        
##  Min.   : 1.0   Length:11          Min.   :    0.02   Min.   :  -0.040  
##  1st Qu.: 3.5   Class :character   1st Qu.:    6.86   1st Qu.:   4.563  
##  Median : 6.0   Mode  :character   Median :  184.72   Median :  75.707  
##  Mean   : 6.0                      Mean   : 4234.00   Mean   : 238.324  
##  3rd Qu.: 8.5                      3rd Qu.:  410.43   3rd Qu.: 314.072  
##  Max.   :11.0                      Max.   :40010.00   Max.   :1191.639  
##                                    NA's   :1          NA's   :1         
##     2007-03            2007-04            2007-05            2007-06        
##  Min.   :   0.114   Min.   :  -0.002   Min.   :   0.049   Min.   :   0.096  
##  1st Qu.:   9.278   1st Qu.:   4.756   1st Qu.:   5.025   1st Qu.:   5.700  
##  Median : 137.700   Median :  76.472   Median :  76.240   Median : 143.000  
##  Mean   : 330.293   Mean   : 249.421   Mean   : 248.956   Mean   : 339.846  
##  3rd Qu.: 399.225   3rd Qu.: 328.011   3rd Qu.: 327.841   3rd Qu.: 413.788  
##  Max.   :1204.725   Max.   :1247.105   Max.   :1244.755   Max.   :1246.129  
##                     NA's   :1          NA's   :1                            
##     2007-07            2007-08            2007-09            2007-10        
##  Min.   :  -0.037   Min.   :   0.004   Min.   :  -0.007   Min.   :  -0.064  
##  1st Qu.:   5.570   1st Qu.:   5.624   1st Qu.:   5.539   1st Qu.:   5.310  
##  Median :  77.851   Median :  77.753   Median : 143.051   Median :  80.582  
##  Mean   : 248.787   Mean   : 244.665   Mean   : 352.554   Mean   : 248.884  
##  3rd Qu.: 328.961   3rd Qu.: 325.336   3rd Qu.: 436.082   3rd Qu.: 336.994  
##  Max.   :1243.952   Max.   :1223.323   Max.   :1310.764   Max.   :1244.453  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2007-11            2007-12            2008-01            2008-02        
##  Min.   :  -0.077   Min.   :  -0.060   Min.   :   0.048   Min.   :   0.061  
##  1st Qu.:   4.478   1st Qu.:   4.385   1st Qu.:   4.475   1st Qu.:   4.485  
##  Median :  79.356   Median : 141.585   Median :  78.023   Median :  78.389  
##  Mean   : 248.371   Mean   : 321.588   Mean   : 244.615   Mean   : 252.803  
##  3rd Qu.: 310.744   3rd Qu.: 380.594   3rd Qu.: 315.549   3rd Qu.: 327.005  
##  Max.   :1241.895   Max.   :1216.890   Max.   :1223.050   Max.   :1263.983  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2008-03            2008-04            2008-05            2008-06        
##  Min.   :   0.058   Min.   :   0.060   Min.   :   0.046   Min.   :   0.060  
##  1st Qu.:   5.170   1st Qu.:   4.689   1st Qu.:   4.629   1st Qu.:   5.742  
##  Median : 137.453   Median :  77.555   Median :  78.506   Median : 142.057  
##  Mean   : 345.604   Mean   : 264.435   Mean   : 260.323   Mean   : 359.667  
##  3rd Qu.: 427.601   3rd Qu.: 343.909   3rd Qu.: 337.515   3rd Qu.: 440.656  
##  Max.   :1274.031   Max.   :1322.146   Max.   :1301.591   Max.   :1320.728  
##                     NA's   :1          NA's   :1                            
##     2008-07            2008-08            2008-09            2008-10        
##  Min.   :   0.069   Min.   :   0.023   Min.   :   0.021   Min.   :   0.054  
##  1st Qu.:   6.019   1st Qu.:   5.887   1st Qu.:   5.691   1st Qu.:   5.087  
##  Median :  80.061   Median :  79.141   Median : 141.907   Median :  82.486  
##  Mean   : 266.027   Mean   : 250.383   Mean   : 362.099   Mean   : 262.440  
##  3rd Qu.: 345.730   3rd Qu.: 330.629   3rd Qu.: 453.430   3rd Qu.: 330.996  
##  Max.   :1330.103   Max.   :1251.905   Max.   :1338.015   Max.   :1312.172  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2008-11            2008-12            2009-01            2009-02       
##  Min.   :   0.002   Min.   :  -0.015   Min.   :  -0.034   Min.   :  -0.02  
##  1st Qu.:   4.829   1st Qu.:   4.689   1st Qu.:   4.186   1st Qu.:   4.38  
##  Median :  82.774   Median : 140.810   Median :  76.874   Median :  76.59  
##  Mean   : 248.871   Mean   : 319.882   Mean   : 236.303   Mean   : 247.65  
##  3rd Qu.: 317.084   3rd Qu.: 386.255   3rd Qu.: 307.814   3rd Qu.: 327.49  
##  Max.   :1244.354   Max.   :1232.655   Max.   :1181.534   Max.   :1238.24  
##  NA's   :1                             NA's   :1          NA's   :1        
##     2009-03            2009-04            2009-05            2009-06        
##  Min.   :  -0.050   Min.   :  -0.048   Min.   :  -0.058   Min.   :  -0.079  
##  1st Qu.:   5.003   1st Qu.:   4.720   1st Qu.:   4.763   1st Qu.:   5.845  
##  Median : 142.893   Median :  76.833   Median :  78.358   Median : 142.006  
##  Mean   : 330.142   Mean   : 251.603   Mean   : 245.116   Mean   : 333.194  
##  3rd Qu.: 410.455   3rd Qu.: 329.093   3rd Qu.: 322.303   3rd Qu.: 410.482  
##  Max.   :1210.912   Max.   :1258.037   Max.   :1225.608   Max.   :1233.085  
##                     NA's   :1          NA's   :1                            
##     2009-07            2009-08            2009-09            2009-10        
##  Min.   :  -0.094   Min.   :  -0.044   Min.   :  -0.035   Min.   :  -0.014  
##  1st Qu.:   6.298   1st Qu.:   6.033   1st Qu.:   5.693   1st Qu.:   4.883  
##  Median :  74.558   Median :  75.604   Median : 139.087   Median :  75.178  
##  Mean   : 241.006   Mean   : 239.427   Mean   : 346.687   Mean   : 258.811  
##  3rd Qu.: 312.185   3rd Qu.: 307.442   3rd Qu.: 437.332   3rd Qu.: 347.086  
##  Max.   :1205.079   Max.   :1197.158   Max.   :1291.564   Max.   :1294.064  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2009-11            2009-12            2010-01            2010-02        
##  Min.   :  -0.022   Min.   :  -0.022   Min.   :   0.004   Min.   :  -0.022  
##  1st Qu.:   4.323   1st Qu.:   4.784   1st Qu.:   4.034   1st Qu.:   4.062  
##  Median :  70.997   Median : 126.066   Median :  71.588   Median :  72.238  
##  Mean   : 243.363   Mean   : 312.962   Mean   : 237.255   Mean   : 242.244  
##  3rd Qu.: 328.913   3rd Qu.: 386.659   3rd Qu.: 322.769   3rd Qu.: 324.137  
##  Max.   :1216.824   Max.   :1207.845   Max.   :1186.271   Max.   :1211.228  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2010-03            2010-04            2010-05            2010-06        
##  Min.   :   0.012   Min.   :   0.007   Min.   :   0.013   Min.   :   0.008  
##  1st Qu.:   5.274   1st Qu.:   5.130   1st Qu.:   5.086   1st Qu.:   6.436  
##  Median : 131.252   Median :  72.370   Median :  70.785   Median : 129.144  
##  Mean   : 332.726   Mean   : 253.446   Mean   : 248.231   Mean   : 335.964  
##  3rd Qu.: 428.420   3rd Qu.: 340.770   3rd Qu.: 339.579   3rd Qu.: 426.769  
##  Max.   :1225.556   Max.   :1267.226   Max.   :1241.148   Max.   :1244.409  
##                     NA's   :1          NA's   :1                            
##     2010-07            2010-08            2010-09            2010-10        
##  Min.   :   0.001   Min.   :  -0.015   Min.   :   0.001   Min.   :   0.009  
##  1st Qu.:   6.531   1st Qu.:   6.281   1st Qu.:   5.567   1st Qu.:   5.006  
##  Median :  64.950   Median :  68.388   Median : 132.892   Median :  69.340  
##  Mean   : 241.180   Mean   : 235.947   Mean   : 346.524   Mean   : 261.255  
##  3rd Qu.: 319.462   3rd Qu.: 310.399   3rd Qu.: 451.361   3rd Qu.: 347.832  
##  Max.   :1205.901   Max.   :1179.745   Max.   :1293.117   Max.   :1306.271  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2010-11            2010-12            2011-01            2011-02        
##  Min.   :   0.022   Min.   :  -0.004   Min.   :  -0.028   Min.   :   0.008  
##  1st Qu.:   4.402   1st Qu.:   4.466   1st Qu.:   4.039   1st Qu.:   4.329  
##  Median :  69.166   Median : 121.422   Median :  69.750   Median :  68.579  
##  Mean   : 248.748   Mean   : 312.917   Mean   : 230.680   Mean   : 244.133  
##  3rd Qu.: 331.437   3rd Qu.: 388.385   3rd Qu.: 300.831   3rd Qu.: 314.647  
##  Max.   :1243.730   Max.   :1216.262   Max.   :1153.413   Max.   :1220.663  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2011-03           2011-04            2011-05            2011-06        
##  Min.   :   0.05   Min.   :   0.036   Min.   :   0.050   Min.   :   0.054  
##  1st Qu.:   6.03   1st Qu.:   5.224   1st Qu.:   5.338   1st Qu.:   6.926  
##  Median : 134.37   Median :  73.384   Median :  74.216   Median : 135.581  
##  Mean   : 345.17   Mean   : 262.660   Mean   : 260.582   Mean   : 353.331  
##  3rd Qu.: 448.56   3rd Qu.: 345.904   3rd Qu.: 345.411   3rd Qu.: 452.923  
##  Max.   :1286.66   Max.   :1313.283   Max.   :1302.884   Max.   :1302.414  
##                    NA's   :1          NA's   :1                            
##     2011-07            2011-08            2011-09            2011-10        
##  Min.   :   0.067   Min.   :   0.052   Min.   :   0.043   Min.   :   0.032  
##  1st Qu.:   6.911   1st Qu.:   7.067   1st Qu.:   6.660   1st Qu.:   5.513  
##  Median :  71.735   Median :  70.853   Median : 136.901   Median :  70.508  
##  Mean   : 257.228   Mean   : 248.259   Mean   : 362.555   Mean   : 269.648  
##  3rd Qu.: 328.941   3rd Qu.: 320.386   3rd Qu.: 469.204   3rd Qu.: 358.284  
##  Max.   :1286.107   Max.   :1241.268   Max.   :1348.754   Max.   :1348.222  
##  NA's   :1          NA's   :1                             NA's   :1

4. Count the number of missing values in each column

sapply(mbta, function(x) sum(is.na(x)))
##    ...1    mode 2007-01 2007-02 2007-03 2007-04 2007-05 2007-06 2007-07 2007-08 
##       0       0       1       1       0       1       1       0       1       1 
## 2007-09 2007-10 2007-11 2007-12 2008-01 2008-02 2008-03 2008-04 2008-05 2008-06 
##       0       1       1       0       1       1       0       1       1       0 
## 2008-07 2008-08 2008-09 2008-10 2008-11 2008-12 2009-01 2009-02 2009-03 2009-04 
##       1       1       0       1       1       0       1       1       0       1 
## 2009-05 2009-06 2009-07 2009-08 2009-09 2009-10 2009-11 2009-12 2010-01 2010-02 
##       1       0       1       1       0       1       1       0       1       1 
## 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12 
##       0       1       1       0       1       1       0       1       1       0 
## 2011-01 2011-02 2011-03 2011-04 2011-05 2011-06 2011-07 2011-08 2011-09 2011-10 
##       1       1       0       1       1       0       1       1       0       1

Exercise 3 - Removing unnecessary rows and columns

# Load necessary package
library(dplyr)

1. Remove the first, seventh, and eleventh rows of mbta.

mbta <- mbta %>%
  slice(-c(1, 7, 11))

2. Remove the first column.

mbta <- mbta %>%
  select(-1)

3. Check the dimensions of the new data frame.

dim(mbta)
## [1]  8 59

Exercise 4 - Observations are stored in columns

# Load necessary package
library(tidyverse)

1. Pivot the data to long format

mbta <- mbta %>%
  pivot_longer(cols = -mode, names_to = "date", values_to = "thou_riders")

2. Check the dimensions of the new data frame

dim(mbta)
## [1] 464   3

Exercise 5 - Separating columns

# Load necessary package
library(tidyverse)

1. Split the month column of mbta at the dash and create a new month column with only the month and a year column with only the year.

mbta <- mbta %>%
  separate(date, into = c("year", "month"), sep = "-")

2. View the head of this new mbta data

head(mbta)
## # A tibble: 6 × 4
##   mode  year  month thou_riders
##   <chr> <chr> <chr>       <dbl>
## 1 Boat  2007  01        40010  
## 2 Boat  2007  02            3.6
## 3 Boat  2007  03           40  
## 4 Boat  2007  04            4.3
## 5 Boat  2007  05            4.9
## 6 Boat  2007  06            5.8

Exercise 6 - Dealing with entry error

1. Locate the row and column of the incorrect value

erroneous_row <- which(mbta$mode == "Boat" & mbta$thou_riders > 40000)

2. Replace the incorrect value with 4

if (length(erroneous_row) > 0) {
  mbta$thou_riders[erroneous_row] <- 4
}

3. Verify the change

filtered_data <- mbta %>% filter(mode == "Boat")
print(filtered_data)
## # A tibble: 58 × 4
##    mode  year  month thou_riders
##    <chr> <chr> <chr>       <dbl>
##  1 Boat  2007  01           4   
##  2 Boat  2007  02           3.6 
##  3 Boat  2007  03          40   
##  4 Boat  2007  04           4.3 
##  5 Boat  2007  05           4.9 
##  6 Boat  2007  06           5.8 
##  7 Boat  2007  07           6.52
##  8 Boat  2007  08           6.57
##  9 Boat  2007  09           5.47
## 10 Boat  2007  10           5.14
## # ℹ 48 more rows

Exercise 7 - Performing descriptive analysis

# Load necessary package
library(dplyr)

1. Compute the average ridership per mode

average_ridership <- mbta %>%
  group_by(mode) %>%
  summarise(avg_ridership = mean(thou_riders, na.rm = TRUE))

# Show the results
print(average_ridership)
## # A tibble: 8 × 2
##   mode              avg_ridership
##   <chr>                     <dbl>
## 1 Boat                       5.07
## 2 Bus                      359.  
## 3 Commuter Rail            137.  
## 4 Heavy Rail               489.  
## 5 Light Rail               233.  
## 6 Private Bus                3.35
## 7 RIDE                       6.60
## 8 Trackless Trolley         12.1

2. Compute the average ridership per mode for the month of January

january_avg_ridership <- mbta %>%
  filter(month == "01") %>%
  group_by(mode) %>%
  summarise(avg_ridership = mean(thou_riders, na.rm = TRUE))

# Show the results
print(january_avg_ridership)
## # A tibble: 8 × 2
##   mode              avg_ridership
##   <chr>                     <dbl>
## 1 Boat                       3.31
## 2 Bus                      342.  
## 3 Commuter Rail            137.  
## 4 Heavy Rail               460.  
## 5 Light Rail               217.  
## 6 Private Bus                3.47
## 7 RIDE                       5.94
## 8 Trackless Trolley         12.5

3. Find which year had the largest total ridership for boat mode

largest_boat_ridership_year <- mbta %>%
  filter(mode == "Boat") %>%
  group_by(year) %>%
  summarize(total_riders = sum(thou_riders, na.rm = TRUE)) %>%
  arrange(desc(total_riders)) %>%
  slice_head(n = 1)

# Show the results
print(largest_boat_ridership_year)
## # A tibble: 1 × 2
##   year  total_riders
##   <chr>        <dbl>
## 1 2007          93.1

4. Find which month has the greatest average number of passengers for Heavy Rail mode

busiest_month_heavy_rail <- mbta %>%
  filter(mode == "Heavy Rail") %>%
  group_by(month) %>%
  summarize(avg_ridership = mean(thou_riders, na.rm = TRUE)) %>%
  arrange(desc(avg_ridership)) %>%
  slice_head(n = 1)

# Show the results
print(busiest_month_heavy_rail)
## # A tibble: 1 × 2
##   month avg_ridership
##   <chr>         <dbl>
## 1 09             518.