library(tidyverse)
library(completejourney)
transactions <- transactions_sample
regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity loyalty_price = (sales_value + coupon_match_disc) / quantity coupon_price = (sales_value - coupon_disc) / quantity
Create three new variables names regular_price, loyalty_price, and coupon_price according to the logic shown above
transactions <- transactions %>%
mutate(regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity,
loyalty_price = (sales_value + coupon_match_disc) / quantity,
coupon_price = (sales_value - coupon_disc) / quantity
) %>%
select(regular_price, loyalty_price, coupon_price, product_id, everything())
Identify the five households with the largest loyalty_price transactions. What is unique about the transactions with the largest loyalty_price value?
transactions %>%
slice_max(order_by = loyalty_price, n = 5)
What is unique is that their loyalty price is the same as the regular price.
Now filter for only those observations where quantity was greater than 0. Now which household(s) have the largest loyalty_price transaction?
transactions %>%
filter(quantity > 0) %>%
slice_max(order_by = loyalty_price, n = 1)
Answer: 1246
Using the first transaction in the result from #2, filter the products data based on the product_id to find out which product the largest loyalty_price transaction is associated with
products %>%
filter(product_id == 12484608)
Answer: COUPON/MISC ITEMS
How many products had a regular price of $1 or less?
transactions %>%
filter(regular_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 2748
How many products had a loyalty price of $1 or less?
transactions %>%
filter(loyalty_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 4648
How many products had a coupon price of $1 or less?
transactions %>%
filter(coupon_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 4844
What proportion of baskets are over $10 in sales value? What proportion of baskets are over $20 in sales value?
baskets_all <- transactions %>%
group_by(basket_id) %>%
summarize(total_sales_value = sum(sales_value))
baskets_over_10 <- baskets_all %>%
filter(total_sales_value > 10) %>%
nrow()
baskets_over_20 <- baskets_all %>%
filter(total_sales_value > 20) %>%
nrow()
all_count <- nrow(baskets_all)
prop_over_10 <- baskets_over_10 / all_count * 100
prop_over_10
## [1] 10.65978
prop_over_20 <- baskets_over_20 / all_count * 100
prop_over_20
## [1] 2.57816
Proportion of baskets over $10: 10.6597803
Proportion of baskets
over $20: 2.5781597
pct_loyalty_disc = 1 - (loyalty_price / regular_price)
Which stores had the largest total sales_value?
transactions %>%
group_by(store_id) %>%
summarize(total_sales_value = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales_value))
Which stores had the largest average loyalty discount as defined above?
transactions %>%
mutate(pct_loyalty_disc = 1 - (loyalty_price / regular_price)) %>%
group_by(store_id) %>%
summarize(avg_pct_loyalty_disc = mean(pct_loyalty_disc, na.rm = TRUE)) %>%
arrange(desc(avg_pct_loyalty_disc))
library(readxl)
excel_sheets(path = 'mbta.xlsx')
## [1] "Sheet1"
mbta <- read_excel(path = 'mbta.xlsx', skip = 1, na = 'NA')
View the structure of mbta
str(mbta)
## tibble [11 × 60] (S3: tbl_df/tbl/data.frame)
## $ ...1 : num [1:11] 1 2 3 4 5 6 7 8 9 10 ...
## $ mode : chr [1:11] "All Modes by Qtr" "Boat" "Bus" "Commuter Rail" ...
## $ 2007-01: num [1:11] NA 4 336 142 435 ...
## $ 2007-02: num [1:11] NA 3.6 338.7 138.5 448.3 ...
## $ 2007-03: num [1:11] 1188 40 340 138 459 ...
## $ 2007-04: num [1:11] NA 4.3 352.2 139.5 472.2 ...
## $ 2007-05: num [1:11] NA 4.9 354.4 139 474.6 ...
## $ 2007-06: num [1:11] 1246 5.8 350.5 143 477 ...
## $ 2007-07: num [1:11] NA 6.52 357.52 142.39 471.74 ...
## $ 2007-08: num [1:11] NA 6.57 355.48 142.36 461.61 ...
## $ 2007-09: num [1:11] 1256.57 5.47 372.6 143.05 499.57 ...
## $ 2007-10: num [1:11] NA 5.14 368.85 146.54 457.74 ...
## $ 2007-11: num [1:11] NA 3.76 330.83 145.09 488.35 ...
## $ 2007-12: num [1:11] 1216.89 2.98 312.92 141.59 448.27 ...
## $ 2008-01: num [1:11] NA 3.17 340.32 142.15 472.62 ...
## $ 2008-02: num [1:11] NA 3.11 352.9 142.61 492.1 ...
## $ 2008-03: num [1:11] 1253.52 3.51 361.15 137.45 494.05 ...
## $ 2008-04: num [1:11] NA 4.16 368.19 140.39 513.2 ...
## $ 2008-05: num [1:11] NA 4.01 363.9 142.59 507.95 ...
## $ 2008-06: num [1:11] 1314.82 5.19 362.96 142.06 518.35 ...
## $ 2008-07: num [1:11] NA 6.02 370.92 145.73 512.31 ...
## $ 2008-08: num [1:11] NA 5.8 361.1 144.6 477 ...
## $ 2008-09: num [1:11] 1307.04 4.59 389.54 141.91 517.32 ...
## $ 2008-10: num [1:11] NA 4.29 357.97 151.96 523.64 ...
## $ 2008-11: num [1:11] NA 3.49 345.42 152.95 487.12 ...
## $ 2008-12: num [1:11] 1232.65 3.01 325.77 140.81 446.74 ...
## $ 2009-01: num [1:11] NA 3.01 338.53 141.45 461 ...
## $ 2009-02: num [1:11] NA 3.2 360.4 143.5 482.4 ...
## $ 2009-03: num [1:11] 1209.79 3.33 353.69 142.89 467.22 ...
## $ 2009-04: num [1:11] NA 4.05 359.38 142.34 493.15 ...
## $ 2009-05: num [1:11] NA 4.12 354.75 144.22 475.63 ...
## $ 2009-06: num [1:11] 1233.1 4.9 347.9 142 473.1 ...
## $ 2009-07: num [1:11] NA 6.44 339.48 137.69 470.83 ...
## $ 2009-08: num [1:11] NA 5.9 332.7 139.2 466.7 ...
## $ 2009-09: num [1:11] 1230.5 4.7 374.3 139.1 500.4 ...
## $ 2009-10: num [1:11] NA 4.21 385.87 137.1 513.41 ...
## $ 2009-11: num [1:11] NA 3.58 366.98 129.34 480.28 ...
## $ 2009-12: num [1:11] 1207.85 3.11 332.39 126.07 440.93 ...
## $ 2010-01: num [1:11] NA 3.21 362.23 130.91 464.07 ...
## $ 2010-02: num [1:11] NA 3.19 361.14 131.92 480.12 ...
## $ 2010-03: num [1:11] 1208.86 3.48 373.44 131.25 483.4 ...
## $ 2010-04: num [1:11] NA 4.45 378.61 131.72 502.37 ...
## $ 2010-05: num [1:11] NA 4.42 380.17 128.8 487.4 ...
## $ 2010-06: num [1:11] 1244.41 5.41 363.27 129.14 490.26 ...
## $ 2010-07: num [1:11] NA 6.51 353.04 122.94 488.59 ...
## $ 2010-08: num [1:11] NA 6.27 343.69 129.73 473.73 ...
## $ 2010-09: num [1:11] 1225.5 4.7 381.6 132.9 521.1 ...
## $ 2010-10: num [1:11] NA 4.4 385 131 532.4 ...
## $ 2010-11: num [1:11] NA 3.73 367.95 130.89 502.89 ...
## $ 2010-12: num [1:11] 1216.26 3.16 326.34 121.42 450.43 ...
## $ 2011-01: num [1:11] NA 3.14 334.96 128.4 468.42 ...
## $ 2011-02: num [1:11] NA 3.28 346.23 125.46 504.07 ...
## $ 2011-03: num [1:11] 1223.45 3.67 380.4 134.37 516.73 ...
## $ 2011-04: num [1:11] NA 4.25 380.45 134.17 528.63 ...
## $ 2011-05: num [1:11] NA 4.43 385.29 136.14 528.12 ...
## $ 2011-06: num [1:11] 1302.41 5.47 376.32 135.58 529.53 ...
## $ 2011-07: num [1:11] NA 6.58 361.58 132.41 532.89 ...
## $ 2011-08: num [1:11] NA 6.73 353.79 130.62 508.14 ...
## $ 2011-09: num [1:11] 1291 5 388 137 550 ...
## $ 2011-10: num [1:11] NA 4.48 398.46 128.72 554.93 ...
View the first 6 rows of mbta
head(mbta)
View a summary of mbta
summary(mbta)
## ...1 mode 2007-01 2007-02
## Min. : 1.0 Length:11 Min. : 0.020 Min. : -0.040
## 1st Qu.: 3.5 Class :character 1st Qu.: 4.804 1st Qu.: 4.563
## Median : 6.0 Mode :character Median : 77.478 Median : 75.707
## Mean : 6.0 Mean : 233.397 Mean : 238.324
## 3rd Qu.: 8.5 3rd Qu.: 308.672 3rd Qu.: 314.072
## Max. :11.0 Max. :1166.974 Max. :1191.639
## NA's :1 NA's :1
## 2007-03 2007-04 2007-05 2007-06
## Min. : 0.114 Min. : -0.002 Min. : 0.049 Min. : 0.096
## 1st Qu.: 9.278 1st Qu.: 4.756 1st Qu.: 5.025 1st Qu.: 5.700
## Median : 137.700 Median : 76.472 Median : 76.240 Median : 143.000
## Mean : 330.293 Mean : 249.421 Mean : 248.956 Mean : 339.846
## 3rd Qu.: 399.225 3rd Qu.: 328.011 3rd Qu.: 327.841 3rd Qu.: 413.788
## Max. :1204.725 Max. :1247.105 Max. :1244.755 Max. :1246.129
## NA's :1 NA's :1
## 2007-07 2007-08 2007-09 2007-10
## Min. : -0.037 Min. : 0.004 Min. : -0.007 Min. : -0.064
## 1st Qu.: 5.570 1st Qu.: 5.624 1st Qu.: 5.539 1st Qu.: 5.310
## Median : 77.851 Median : 77.753 Median : 143.051 Median : 80.582
## Mean : 248.787 Mean : 244.665 Mean : 352.554 Mean : 248.884
## 3rd Qu.: 328.961 3rd Qu.: 325.336 3rd Qu.: 436.082 3rd Qu.: 336.994
## Max. :1243.952 Max. :1223.323 Max. :1310.764 Max. :1244.453
## NA's :1 NA's :1 NA's :1
## 2007-11 2007-12 2008-01 2008-02
## Min. : -0.077 Min. : -0.060 Min. : 0.048 Min. : 0.061
## 1st Qu.: 4.478 1st Qu.: 4.385 1st Qu.: 4.475 1st Qu.: 4.485
## Median : 79.356 Median : 141.585 Median : 78.023 Median : 78.389
## Mean : 248.371 Mean : 321.588 Mean : 244.615 Mean : 252.803
## 3rd Qu.: 310.744 3rd Qu.: 380.594 3rd Qu.: 315.549 3rd Qu.: 327.005
## Max. :1241.895 Max. :1216.890 Max. :1223.050 Max. :1263.983
## NA's :1 NA's :1 NA's :1
## 2008-03 2008-04 2008-05 2008-06
## Min. : 0.058 Min. : 0.060 Min. : 0.046 Min. : 0.060
## 1st Qu.: 5.170 1st Qu.: 4.689 1st Qu.: 4.629 1st Qu.: 5.742
## Median : 137.453 Median : 77.555 Median : 78.506 Median : 142.057
## Mean : 345.604 Mean : 264.435 Mean : 260.323 Mean : 359.667
## 3rd Qu.: 427.601 3rd Qu.: 343.909 3rd Qu.: 337.515 3rd Qu.: 440.656
## Max. :1274.031 Max. :1322.146 Max. :1301.591 Max. :1320.728
## NA's :1 NA's :1
## 2008-07 2008-08 2008-09 2008-10
## Min. : 0.069 Min. : 0.023 Min. : 0.021 Min. : 0.054
## 1st Qu.: 6.019 1st Qu.: 5.887 1st Qu.: 5.691 1st Qu.: 5.087
## Median : 80.061 Median : 79.141 Median : 141.907 Median : 82.486
## Mean : 266.027 Mean : 250.383 Mean : 362.099 Mean : 262.440
## 3rd Qu.: 345.730 3rd Qu.: 330.629 3rd Qu.: 453.430 3rd Qu.: 330.996
## Max. :1330.103 Max. :1251.905 Max. :1338.015 Max. :1312.172
## NA's :1 NA's :1 NA's :1
## 2008-11 2008-12 2009-01 2009-02
## Min. : 0.002 Min. : -0.015 Min. : -0.034 Min. : -0.02
## 1st Qu.: 4.829 1st Qu.: 4.689 1st Qu.: 4.186 1st Qu.: 4.38
## Median : 82.774 Median : 140.810 Median : 76.874 Median : 76.59
## Mean : 248.871 Mean : 319.882 Mean : 236.303 Mean : 247.65
## 3rd Qu.: 317.084 3rd Qu.: 386.255 3rd Qu.: 307.814 3rd Qu.: 327.49
## Max. :1244.354 Max. :1232.655 Max. :1181.534 Max. :1238.24
## NA's :1 NA's :1 NA's :1
## 2009-03 2009-04 2009-05 2009-06
## Min. : -0.050 Min. : -0.048 Min. : -0.058 Min. : -0.079
## 1st Qu.: 5.003 1st Qu.: 4.720 1st Qu.: 4.763 1st Qu.: 5.845
## Median : 142.893 Median : 76.833 Median : 78.358 Median : 142.006
## Mean : 330.142 Mean : 251.603 Mean : 245.116 Mean : 333.194
## 3rd Qu.: 410.455 3rd Qu.: 329.093 3rd Qu.: 322.303 3rd Qu.: 410.482
## Max. :1210.912 Max. :1258.037 Max. :1225.608 Max. :1233.085
## NA's :1 NA's :1
## 2009-07 2009-08 2009-09 2009-10
## Min. : -0.094 Min. : -0.044 Min. : -0.035 Min. : -0.014
## 1st Qu.: 6.298 1st Qu.: 6.033 1st Qu.: 5.693 1st Qu.: 4.883
## Median : 74.558 Median : 75.604 Median : 139.087 Median : 75.178
## Mean : 241.006 Mean : 239.427 Mean : 346.687 Mean : 258.811
## 3rd Qu.: 312.185 3rd Qu.: 307.442 3rd Qu.: 437.332 3rd Qu.: 347.086
## Max. :1205.079 Max. :1197.158 Max. :1291.564 Max. :1294.064
## NA's :1 NA's :1 NA's :1
## 2009-11 2009-12 2010-01 2010-02
## Min. : -0.022 Min. : -0.022 Min. : 0.004 Min. : -0.022
## 1st Qu.: 4.323 1st Qu.: 4.784 1st Qu.: 4.034 1st Qu.: 4.062
## Median : 70.997 Median : 126.066 Median : 71.588 Median : 72.238
## Mean : 243.363 Mean : 312.962 Mean : 237.255 Mean : 242.244
## 3rd Qu.: 328.913 3rd Qu.: 386.659 3rd Qu.: 322.769 3rd Qu.: 324.137
## Max. :1216.824 Max. :1207.845 Max. :1186.271 Max. :1211.228
## NA's :1 NA's :1 NA's :1
## 2010-03 2010-04 2010-05 2010-06
## Min. : 0.012 Min. : 0.007 Min. : 0.013 Min. : 0.008
## 1st Qu.: 5.274 1st Qu.: 5.130 1st Qu.: 5.086 1st Qu.: 6.436
## Median : 131.252 Median : 72.370 Median : 70.785 Median : 129.144
## Mean : 332.726 Mean : 253.446 Mean : 248.231 Mean : 335.964
## 3rd Qu.: 428.420 3rd Qu.: 340.770 3rd Qu.: 339.579 3rd Qu.: 426.769
## Max. :1225.556 Max. :1267.226 Max. :1241.148 Max. :1244.409
## NA's :1 NA's :1
## 2010-07 2010-08 2010-09 2010-10
## Min. : 0.001 Min. : -0.015 Min. : 0.001 Min. : 0.009
## 1st Qu.: 6.531 1st Qu.: 6.281 1st Qu.: 5.567 1st Qu.: 5.006
## Median : 64.950 Median : 68.388 Median : 132.892 Median : 69.340
## Mean : 241.180 Mean : 235.947 Mean : 346.524 Mean : 261.255
## 3rd Qu.: 319.462 3rd Qu.: 310.399 3rd Qu.: 451.361 3rd Qu.: 347.832
## Max. :1205.901 Max. :1179.745 Max. :1293.117 Max. :1306.271
## NA's :1 NA's :1 NA's :1
## 2010-11 2010-12 2011-01 2011-02
## Min. : 0.022 Min. : -0.004 Min. : -0.028 Min. : 0.008
## 1st Qu.: 4.402 1st Qu.: 4.466 1st Qu.: 4.039 1st Qu.: 4.329
## Median : 69.166 Median : 121.422 Median : 69.750 Median : 68.579
## Mean : 248.748 Mean : 312.917 Mean : 230.680 Mean : 244.133
## 3rd Qu.: 331.437 3rd Qu.: 388.385 3rd Qu.: 300.831 3rd Qu.: 314.647
## Max. :1243.730 Max. :1216.262 Max. :1153.413 Max. :1220.663
## NA's :1 NA's :1 NA's :1
## 2011-03 2011-04 2011-05 2011-06
## Min. : 0.05 Min. : 0.036 Min. : 0.050 Min. : 0.054
## 1st Qu.: 6.03 1st Qu.: 5.224 1st Qu.: 5.338 1st Qu.: 6.926
## Median : 134.37 Median : 73.384 Median : 74.216 Median : 135.581
## Mean : 345.17 Mean : 262.660 Mean : 260.582 Mean : 353.331
## 3rd Qu.: 448.56 3rd Qu.: 345.904 3rd Qu.: 345.411 3rd Qu.: 452.923
## Max. :1286.66 Max. :1313.283 Max. :1302.884 Max. :1302.414
## NA's :1 NA's :1
## 2011-07 2011-08 2011-09 2011-10
## Min. : 0.067 Min. : 0.052 Min. : 0.043 Min. : 0.032
## 1st Qu.: 6.911 1st Qu.: 7.067 1st Qu.: 6.660 1st Qu.: 5.513
## Median : 71.735 Median : 70.853 Median : 136.901 Median : 70.508
## Mean : 257.228 Mean : 248.259 Mean : 362.555 Mean : 269.648
## 3rd Qu.: 328.941 3rd Qu.: 320.386 3rd Qu.: 469.204 3rd Qu.: 358.284
## Max. :1286.107 Max. :1241.268 Max. :1348.754 Max. :1348.222
## NA's :1 NA's :1 NA's :1
How many total missing values?
sum(is.na(mbta))
## [1] 39
How many missing values are in each column?
colSums(is.na(mbta))
## ...1 mode 2007-01 2007-02 2007-03 2007-04 2007-05 2007-06 2007-07 2007-08
## 0 0 1 1 0 1 1 0 1 1
## 2007-09 2007-10 2007-11 2007-12 2008-01 2008-02 2008-03 2008-04 2008-05 2008-06
## 0 1 1 0 1 1 0 1 1 0
## 2008-07 2008-08 2008-09 2008-10 2008-11 2008-12 2009-01 2009-02 2009-03 2009-04
## 1 1 0 1 1 0 1 1 0 1
## 2009-05 2009-06 2009-07 2009-08 2009-09 2009-10 2009-11 2009-12 2010-01 2010-02
## 1 0 1 1 0 1 1 0 1 1
## 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12
## 0 1 1 0 1 1 0 1 1 0
## 2011-01 2011-02 2011-03 2011-04 2011-05 2011-06 2011-07 2011-08 2011-09 2011-10
## 1 1 0 1 1 0 1 1 0 1
Remove the first, seventh, and eleventh rows
Remove the first
column
mbta <- mbta %>%
slice(-c(1, 7, 11)) %>%
select(-1)
What are the dimensions of the new data frame?
dim(mbta)
## [1] 8 59
Pivot rows and columns of the mbta data so that all columns are variables of the data
mbta <- mbta %>%
pivot_longer(cols = -1, names_to = "date", values_to = "thou_riders")
dim(mbta)
## [1] 464 3
Split the date column into a month column and a year column
mbta <- mbta %>%
separate(col = date, into = c("year", "month"), sep = "-")
View the head of the new data set
head(mbta)
Locate the position of the incorrect value
which(mbta == 40 & mbta$mode == "Boat", arr.ind = T)
## row col
## [1,] 3 4
Replace the incorrect value
mbta[3, 4] <- 4
head(mbta)
Compute the average ridership per mode
avg_per_mode <- mbta %>%
group_by(mode) %>%
summarize(avg_ridership = mean(thou_riders))
avg_per_mode
Compute the average ridership per mode for the month of January
avg_mode_Jan <- mbta %>%
filter(month == "01") %>%
group_by(mode) %>%
summarize(avg_ridership = mean(thou_riders))
avg_mode_Jan
Which year had the largest total ridership for the boat mode?
largest_year <- mbta %>%
filter(mode == "Boat") %>%
group_by(year) %>%
summarize(total_riders = sum(thou_riders))
largest_year
The year with the largest total ridership for the boat mode is: 2007.
On average, which month experiences the greatest number of passengers on the Heavy Rail mode?
passenger_heavy <- mbta %>%
filter(mode == "Heavy Rail") %>%
group_by(month) %>%
summarize(total_riders = mean(thou_riders))
passenger_heavy
The month that averages the greatest number of passengers on the heavy rail mode is: September with 517.71 thousands of riders.