Part 1

Load libraries

library(tidyverse)
library(completejourney)

Exercise 1

transactions <- transactions_sample

regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity loyalty_price = (sales_value + coupon_match_disc) / quantity coupon_price = (sales_value - coupon_disc) / quantity

Question 0

Create three new variables names regular_price, loyalty_price, and coupon_price according to the logic shown above

transactions <- transactions %>%
  mutate(regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity,
       loyalty_price = (sales_value + coupon_match_disc) / quantity,
       coupon_price = (sales_value - coupon_disc) / quantity
  ) %>%
  select(regular_price, loyalty_price, coupon_price, product_id, everything())

Question 1

Identify the five households with the largest loyalty_price transactions. What is unique about the transactions with the largest loyalty_price value?

transactions %>%
  slice_max(order_by = loyalty_price, n = 5)

What is unique is that their loyalty price is the same as the regular price.

Question 2

Now filter for only those observations where quantity was greater than 0. Now which household(s) have the largest loyalty_price transaction?

transactions %>%
  filter(quantity > 0) %>%
  slice_max(order_by = loyalty_price, n = 1)

Answer: 1246

Question 3

Using the first transaction in the result from #2, filter the products data based on the product_id to find out which product the largest loyalty_price transaction is associated with

products %>%
  filter(product_id == 12484608)

Answer: COUPON/MISC ITEMS

Exercise 2

How many products had a regular price of $1 or less?

transactions %>%
  filter(regular_price <= 1) %>%
  select(product_id) %>%
  n_distinct()
## [1] 2748

How many products had a loyalty price of $1 or less?

transactions %>%
  filter(loyalty_price <= 1) %>%
  select(product_id) %>%
  n_distinct()
## [1] 4648

How many products had a coupon price of $1 or less?

transactions %>%
  filter(coupon_price <= 1) %>%
  select(product_id) %>%
  n_distinct()
## [1] 4844

Exercise 3

What proportion of baskets are over $10 in sales value? What proportion of baskets are over $20 in sales value?

baskets_all <- transactions %>%
  group_by(basket_id) %>%
  summarize(total_sales_value = sum(sales_value))
  
baskets_over_10 <- baskets_all %>%
  filter(total_sales_value > 10) %>%
  nrow()

baskets_over_20 <- baskets_all %>%
  filter(total_sales_value > 20) %>%
  nrow()

all_count <- nrow(baskets_all)

prop_over_10 <- baskets_over_10 / all_count * 100
prop_over_10
## [1] 10.65978
prop_over_20 <- baskets_over_20 / all_count * 100
prop_over_20
## [1] 2.57816

Proportion of baskets over $10: 10.6597803
Proportion of baskets over $20: 2.5781597

Exercise 4

pct_loyalty_disc = 1 - (loyalty_price / regular_price)

Which stores had the largest total sales_value?

transactions %>%
  group_by(store_id) %>%
  summarize(total_sales_value = sum(sales_value, na.rm = TRUE)) %>%
  arrange(desc(total_sales_value))

Which stores had the largest average loyalty discount as defined above?

transactions %>%
  mutate(pct_loyalty_disc = 1 - (loyalty_price / regular_price)) %>%
  group_by(store_id) %>%
  summarize(avg_pct_loyalty_disc = mean(pct_loyalty_disc, na.rm = TRUE)) %>%
  arrange(desc(avg_pct_loyalty_disc))

Part 2

Exercise 1 - Import the data

library(readxl)

excel_sheets(path = 'mbta.xlsx')
## [1] "Sheet1"
mbta <- read_excel(path = 'mbta.xlsx', skip = 1, na = 'NA')

Exercise 2 - Exam the data

View the structure of mbta

str(mbta)
## tibble [11 × 60] (S3: tbl_df/tbl/data.frame)
##  $ ...1   : num [1:11] 1 2 3 4 5 6 7 8 9 10 ...
##  $ mode   : chr [1:11] "All Modes by Qtr" "Boat" "Bus" "Commuter Rail" ...
##  $ 2007-01: num [1:11] NA 4 336 142 435 ...
##  $ 2007-02: num [1:11] NA 3.6 338.7 138.5 448.3 ...
##  $ 2007-03: num [1:11] 1188 40 340 138 459 ...
##  $ 2007-04: num [1:11] NA 4.3 352.2 139.5 472.2 ...
##  $ 2007-05: num [1:11] NA 4.9 354.4 139 474.6 ...
##  $ 2007-06: num [1:11] 1246 5.8 350.5 143 477 ...
##  $ 2007-07: num [1:11] NA 6.52 357.52 142.39 471.74 ...
##  $ 2007-08: num [1:11] NA 6.57 355.48 142.36 461.61 ...
##  $ 2007-09: num [1:11] 1256.57 5.47 372.6 143.05 499.57 ...
##  $ 2007-10: num [1:11] NA 5.14 368.85 146.54 457.74 ...
##  $ 2007-11: num [1:11] NA 3.76 330.83 145.09 488.35 ...
##  $ 2007-12: num [1:11] 1216.89 2.98 312.92 141.59 448.27 ...
##  $ 2008-01: num [1:11] NA 3.17 340.32 142.15 472.62 ...
##  $ 2008-02: num [1:11] NA 3.11 352.9 142.61 492.1 ...
##  $ 2008-03: num [1:11] 1253.52 3.51 361.15 137.45 494.05 ...
##  $ 2008-04: num [1:11] NA 4.16 368.19 140.39 513.2 ...
##  $ 2008-05: num [1:11] NA 4.01 363.9 142.59 507.95 ...
##  $ 2008-06: num [1:11] 1314.82 5.19 362.96 142.06 518.35 ...
##  $ 2008-07: num [1:11] NA 6.02 370.92 145.73 512.31 ...
##  $ 2008-08: num [1:11] NA 5.8 361.1 144.6 477 ...
##  $ 2008-09: num [1:11] 1307.04 4.59 389.54 141.91 517.32 ...
##  $ 2008-10: num [1:11] NA 4.29 357.97 151.96 523.64 ...
##  $ 2008-11: num [1:11] NA 3.49 345.42 152.95 487.12 ...
##  $ 2008-12: num [1:11] 1232.65 3.01 325.77 140.81 446.74 ...
##  $ 2009-01: num [1:11] NA 3.01 338.53 141.45 461 ...
##  $ 2009-02: num [1:11] NA 3.2 360.4 143.5 482.4 ...
##  $ 2009-03: num [1:11] 1209.79 3.33 353.69 142.89 467.22 ...
##  $ 2009-04: num [1:11] NA 4.05 359.38 142.34 493.15 ...
##  $ 2009-05: num [1:11] NA 4.12 354.75 144.22 475.63 ...
##  $ 2009-06: num [1:11] 1233.1 4.9 347.9 142 473.1 ...
##  $ 2009-07: num [1:11] NA 6.44 339.48 137.69 470.83 ...
##  $ 2009-08: num [1:11] NA 5.9 332.7 139.2 466.7 ...
##  $ 2009-09: num [1:11] 1230.5 4.7 374.3 139.1 500.4 ...
##  $ 2009-10: num [1:11] NA 4.21 385.87 137.1 513.41 ...
##  $ 2009-11: num [1:11] NA 3.58 366.98 129.34 480.28 ...
##  $ 2009-12: num [1:11] 1207.85 3.11 332.39 126.07 440.93 ...
##  $ 2010-01: num [1:11] NA 3.21 362.23 130.91 464.07 ...
##  $ 2010-02: num [1:11] NA 3.19 361.14 131.92 480.12 ...
##  $ 2010-03: num [1:11] 1208.86 3.48 373.44 131.25 483.4 ...
##  $ 2010-04: num [1:11] NA 4.45 378.61 131.72 502.37 ...
##  $ 2010-05: num [1:11] NA 4.42 380.17 128.8 487.4 ...
##  $ 2010-06: num [1:11] 1244.41 5.41 363.27 129.14 490.26 ...
##  $ 2010-07: num [1:11] NA 6.51 353.04 122.94 488.59 ...
##  $ 2010-08: num [1:11] NA 6.27 343.69 129.73 473.73 ...
##  $ 2010-09: num [1:11] 1225.5 4.7 381.6 132.9 521.1 ...
##  $ 2010-10: num [1:11] NA 4.4 385 131 532.4 ...
##  $ 2010-11: num [1:11] NA 3.73 367.95 130.89 502.89 ...
##  $ 2010-12: num [1:11] 1216.26 3.16 326.34 121.42 450.43 ...
##  $ 2011-01: num [1:11] NA 3.14 334.96 128.4 468.42 ...
##  $ 2011-02: num [1:11] NA 3.28 346.23 125.46 504.07 ...
##  $ 2011-03: num [1:11] 1223.45 3.67 380.4 134.37 516.73 ...
##  $ 2011-04: num [1:11] NA 4.25 380.45 134.17 528.63 ...
##  $ 2011-05: num [1:11] NA 4.43 385.29 136.14 528.12 ...
##  $ 2011-06: num [1:11] 1302.41 5.47 376.32 135.58 529.53 ...
##  $ 2011-07: num [1:11] NA 6.58 361.58 132.41 532.89 ...
##  $ 2011-08: num [1:11] NA 6.73 353.79 130.62 508.14 ...
##  $ 2011-09: num [1:11] 1291 5 388 137 550 ...
##  $ 2011-10: num [1:11] NA 4.48 398.46 128.72 554.93 ...

View the first 6 rows of mbta

head(mbta)

View a summary of mbta

summary(mbta)
##       ...1          mode              2007-01            2007-02        
##  Min.   : 1.0   Length:11          Min.   :   0.020   Min.   :  -0.040  
##  1st Qu.: 3.5   Class :character   1st Qu.:   4.804   1st Qu.:   4.563  
##  Median : 6.0   Mode  :character   Median :  77.478   Median :  75.707  
##  Mean   : 6.0                      Mean   : 233.397   Mean   : 238.324  
##  3rd Qu.: 8.5                      3rd Qu.: 308.672   3rd Qu.: 314.072  
##  Max.   :11.0                      Max.   :1166.974   Max.   :1191.639  
##                                    NA's   :1          NA's   :1         
##     2007-03            2007-04            2007-05            2007-06        
##  Min.   :   0.114   Min.   :  -0.002   Min.   :   0.049   Min.   :   0.096  
##  1st Qu.:   9.278   1st Qu.:   4.756   1st Qu.:   5.025   1st Qu.:   5.700  
##  Median : 137.700   Median :  76.472   Median :  76.240   Median : 143.000  
##  Mean   : 330.293   Mean   : 249.421   Mean   : 248.956   Mean   : 339.846  
##  3rd Qu.: 399.225   3rd Qu.: 328.011   3rd Qu.: 327.841   3rd Qu.: 413.788  
##  Max.   :1204.725   Max.   :1247.105   Max.   :1244.755   Max.   :1246.129  
##                     NA's   :1          NA's   :1                            
##     2007-07            2007-08            2007-09            2007-10        
##  Min.   :  -0.037   Min.   :   0.004   Min.   :  -0.007   Min.   :  -0.064  
##  1st Qu.:   5.570   1st Qu.:   5.624   1st Qu.:   5.539   1st Qu.:   5.310  
##  Median :  77.851   Median :  77.753   Median : 143.051   Median :  80.582  
##  Mean   : 248.787   Mean   : 244.665   Mean   : 352.554   Mean   : 248.884  
##  3rd Qu.: 328.961   3rd Qu.: 325.336   3rd Qu.: 436.082   3rd Qu.: 336.994  
##  Max.   :1243.952   Max.   :1223.323   Max.   :1310.764   Max.   :1244.453  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2007-11            2007-12            2008-01            2008-02        
##  Min.   :  -0.077   Min.   :  -0.060   Min.   :   0.048   Min.   :   0.061  
##  1st Qu.:   4.478   1st Qu.:   4.385   1st Qu.:   4.475   1st Qu.:   4.485  
##  Median :  79.356   Median : 141.585   Median :  78.023   Median :  78.389  
##  Mean   : 248.371   Mean   : 321.588   Mean   : 244.615   Mean   : 252.803  
##  3rd Qu.: 310.744   3rd Qu.: 380.594   3rd Qu.: 315.549   3rd Qu.: 327.005  
##  Max.   :1241.895   Max.   :1216.890   Max.   :1223.050   Max.   :1263.983  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2008-03            2008-04            2008-05            2008-06        
##  Min.   :   0.058   Min.   :   0.060   Min.   :   0.046   Min.   :   0.060  
##  1st Qu.:   5.170   1st Qu.:   4.689   1st Qu.:   4.629   1st Qu.:   5.742  
##  Median : 137.453   Median :  77.555   Median :  78.506   Median : 142.057  
##  Mean   : 345.604   Mean   : 264.435   Mean   : 260.323   Mean   : 359.667  
##  3rd Qu.: 427.601   3rd Qu.: 343.909   3rd Qu.: 337.515   3rd Qu.: 440.656  
##  Max.   :1274.031   Max.   :1322.146   Max.   :1301.591   Max.   :1320.728  
##                     NA's   :1          NA's   :1                            
##     2008-07            2008-08            2008-09            2008-10        
##  Min.   :   0.069   Min.   :   0.023   Min.   :   0.021   Min.   :   0.054  
##  1st Qu.:   6.019   1st Qu.:   5.887   1st Qu.:   5.691   1st Qu.:   5.087  
##  Median :  80.061   Median :  79.141   Median : 141.907   Median :  82.486  
##  Mean   : 266.027   Mean   : 250.383   Mean   : 362.099   Mean   : 262.440  
##  3rd Qu.: 345.730   3rd Qu.: 330.629   3rd Qu.: 453.430   3rd Qu.: 330.996  
##  Max.   :1330.103   Max.   :1251.905   Max.   :1338.015   Max.   :1312.172  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2008-11            2008-12            2009-01            2009-02       
##  Min.   :   0.002   Min.   :  -0.015   Min.   :  -0.034   Min.   :  -0.02  
##  1st Qu.:   4.829   1st Qu.:   4.689   1st Qu.:   4.186   1st Qu.:   4.38  
##  Median :  82.774   Median : 140.810   Median :  76.874   Median :  76.59  
##  Mean   : 248.871   Mean   : 319.882   Mean   : 236.303   Mean   : 247.65  
##  3rd Qu.: 317.084   3rd Qu.: 386.255   3rd Qu.: 307.814   3rd Qu.: 327.49  
##  Max.   :1244.354   Max.   :1232.655   Max.   :1181.534   Max.   :1238.24  
##  NA's   :1                             NA's   :1          NA's   :1        
##     2009-03            2009-04            2009-05            2009-06        
##  Min.   :  -0.050   Min.   :  -0.048   Min.   :  -0.058   Min.   :  -0.079  
##  1st Qu.:   5.003   1st Qu.:   4.720   1st Qu.:   4.763   1st Qu.:   5.845  
##  Median : 142.893   Median :  76.833   Median :  78.358   Median : 142.006  
##  Mean   : 330.142   Mean   : 251.603   Mean   : 245.116   Mean   : 333.194  
##  3rd Qu.: 410.455   3rd Qu.: 329.093   3rd Qu.: 322.303   3rd Qu.: 410.482  
##  Max.   :1210.912   Max.   :1258.037   Max.   :1225.608   Max.   :1233.085  
##                     NA's   :1          NA's   :1                            
##     2009-07            2009-08            2009-09            2009-10        
##  Min.   :  -0.094   Min.   :  -0.044   Min.   :  -0.035   Min.   :  -0.014  
##  1st Qu.:   6.298   1st Qu.:   6.033   1st Qu.:   5.693   1st Qu.:   4.883  
##  Median :  74.558   Median :  75.604   Median : 139.087   Median :  75.178  
##  Mean   : 241.006   Mean   : 239.427   Mean   : 346.687   Mean   : 258.811  
##  3rd Qu.: 312.185   3rd Qu.: 307.442   3rd Qu.: 437.332   3rd Qu.: 347.086  
##  Max.   :1205.079   Max.   :1197.158   Max.   :1291.564   Max.   :1294.064  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2009-11            2009-12            2010-01            2010-02        
##  Min.   :  -0.022   Min.   :  -0.022   Min.   :   0.004   Min.   :  -0.022  
##  1st Qu.:   4.323   1st Qu.:   4.784   1st Qu.:   4.034   1st Qu.:   4.062  
##  Median :  70.997   Median : 126.066   Median :  71.588   Median :  72.238  
##  Mean   : 243.363   Mean   : 312.962   Mean   : 237.255   Mean   : 242.244  
##  3rd Qu.: 328.913   3rd Qu.: 386.659   3rd Qu.: 322.769   3rd Qu.: 324.137  
##  Max.   :1216.824   Max.   :1207.845   Max.   :1186.271   Max.   :1211.228  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2010-03            2010-04            2010-05            2010-06        
##  Min.   :   0.012   Min.   :   0.007   Min.   :   0.013   Min.   :   0.008  
##  1st Qu.:   5.274   1st Qu.:   5.130   1st Qu.:   5.086   1st Qu.:   6.436  
##  Median : 131.252   Median :  72.370   Median :  70.785   Median : 129.144  
##  Mean   : 332.726   Mean   : 253.446   Mean   : 248.231   Mean   : 335.964  
##  3rd Qu.: 428.420   3rd Qu.: 340.770   3rd Qu.: 339.579   3rd Qu.: 426.769  
##  Max.   :1225.556   Max.   :1267.226   Max.   :1241.148   Max.   :1244.409  
##                     NA's   :1          NA's   :1                            
##     2010-07            2010-08            2010-09            2010-10        
##  Min.   :   0.001   Min.   :  -0.015   Min.   :   0.001   Min.   :   0.009  
##  1st Qu.:   6.531   1st Qu.:   6.281   1st Qu.:   5.567   1st Qu.:   5.006  
##  Median :  64.950   Median :  68.388   Median : 132.892   Median :  69.340  
##  Mean   : 241.180   Mean   : 235.947   Mean   : 346.524   Mean   : 261.255  
##  3rd Qu.: 319.462   3rd Qu.: 310.399   3rd Qu.: 451.361   3rd Qu.: 347.832  
##  Max.   :1205.901   Max.   :1179.745   Max.   :1293.117   Max.   :1306.271  
##  NA's   :1          NA's   :1                             NA's   :1         
##     2010-11            2010-12            2011-01            2011-02        
##  Min.   :   0.022   Min.   :  -0.004   Min.   :  -0.028   Min.   :   0.008  
##  1st Qu.:   4.402   1st Qu.:   4.466   1st Qu.:   4.039   1st Qu.:   4.329  
##  Median :  69.166   Median : 121.422   Median :  69.750   Median :  68.579  
##  Mean   : 248.748   Mean   : 312.917   Mean   : 230.680   Mean   : 244.133  
##  3rd Qu.: 331.437   3rd Qu.: 388.385   3rd Qu.: 300.831   3rd Qu.: 314.647  
##  Max.   :1243.730   Max.   :1216.262   Max.   :1153.413   Max.   :1220.663  
##  NA's   :1                             NA's   :1          NA's   :1         
##     2011-03           2011-04            2011-05            2011-06        
##  Min.   :   0.05   Min.   :   0.036   Min.   :   0.050   Min.   :   0.054  
##  1st Qu.:   6.03   1st Qu.:   5.224   1st Qu.:   5.338   1st Qu.:   6.926  
##  Median : 134.37   Median :  73.384   Median :  74.216   Median : 135.581  
##  Mean   : 345.17   Mean   : 262.660   Mean   : 260.582   Mean   : 353.331  
##  3rd Qu.: 448.56   3rd Qu.: 345.904   3rd Qu.: 345.411   3rd Qu.: 452.923  
##  Max.   :1286.66   Max.   :1313.283   Max.   :1302.884   Max.   :1302.414  
##                    NA's   :1          NA's   :1                            
##     2011-07            2011-08            2011-09            2011-10        
##  Min.   :   0.067   Min.   :   0.052   Min.   :   0.043   Min.   :   0.032  
##  1st Qu.:   6.911   1st Qu.:   7.067   1st Qu.:   6.660   1st Qu.:   5.513  
##  Median :  71.735   Median :  70.853   Median : 136.901   Median :  70.508  
##  Mean   : 257.228   Mean   : 248.259   Mean   : 362.555   Mean   : 269.648  
##  3rd Qu.: 328.941   3rd Qu.: 320.386   3rd Qu.: 469.204   3rd Qu.: 358.284  
##  Max.   :1286.107   Max.   :1241.268   Max.   :1348.754   Max.   :1348.222  
##  NA's   :1          NA's   :1                             NA's   :1

How many total missing values?

sum(is.na(mbta))
## [1] 39

How many missing values are in each column?

colSums(is.na(mbta))
##    ...1    mode 2007-01 2007-02 2007-03 2007-04 2007-05 2007-06 2007-07 2007-08 
##       0       0       1       1       0       1       1       0       1       1 
## 2007-09 2007-10 2007-11 2007-12 2008-01 2008-02 2008-03 2008-04 2008-05 2008-06 
##       0       1       1       0       1       1       0       1       1       0 
## 2008-07 2008-08 2008-09 2008-10 2008-11 2008-12 2009-01 2009-02 2009-03 2009-04 
##       1       1       0       1       1       0       1       1       0       1 
## 2009-05 2009-06 2009-07 2009-08 2009-09 2009-10 2009-11 2009-12 2010-01 2010-02 
##       1       0       1       1       0       1       1       0       1       1 
## 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12 
##       0       1       1       0       1       1       0       1       1       0 
## 2011-01 2011-02 2011-03 2011-04 2011-05 2011-06 2011-07 2011-08 2011-09 2011-10 
##       1       1       0       1       1       0       1       1       0       1

Exercise 3 - Removing unnecessary rows and columns

Remove the first, seventh, and eleventh rows
Remove the first column

mbta <- mbta %>%
  slice(-c(1, 7, 11)) %>%
  select(-1)

What are the dimensions of the new data frame?

dim(mbta)
## [1]  8 59

Exercise 4 - Observations are stored in columns

Pivot rows and columns of the mbta data so that all columns are variables of the data

mbta <- mbta %>%
  pivot_longer(cols = -1, names_to = "date", values_to = "thou_riders")
dim(mbta)
## [1] 464   3

Exercise 5 - Seperating Columns

Split the date column into a month column and a year column

mbta <- mbta %>%
  separate(col = date, into = c("year", "month"), sep = "-")

View the head of the new data set

head(mbta)

Exercise 6 - Dealing with entry error

Locate the position of the incorrect value

which(mbta == 40 & mbta$mode == "Boat", arr.ind = T)
##      row col
## [1,]   3   4

Replace the incorrect value

mbta[3, 4] <- 4
head(mbta)

Exercise 7 - Performing descriptive analysis

Compute the average ridership per mode

avg_per_mode <- mbta %>%
  group_by(mode) %>%
  summarize(avg_ridership = mean(thou_riders))
avg_per_mode

Compute the average ridership per mode for the month of January

avg_mode_Jan <- mbta %>%
  filter(month == "01") %>%
  group_by(mode) %>%
  summarize(avg_ridership = mean(thou_riders))
avg_mode_Jan

Which year had the largest total ridership for the boat mode?

largest_year <- mbta %>%
  filter(mode == "Boat") %>%
  group_by(year) %>%
  summarize(total_riders = sum(thou_riders))
largest_year

The year with the largest total ridership for the boat mode is: 2007.

On average, which month experiences the greatest number of passengers on the Heavy Rail mode?

passenger_heavy <- mbta %>%
  filter(mode == "Heavy Rail") %>%
  group_by(month) %>%
  summarize(total_riders = mean(thou_riders))
passenger_heavy

The month that averages the greatest number of passengers on the heavy rail mode is: September with 517.71 thousands of riders.