Part 1
packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
transactions <- transactions_sample
products
## # A tibble: 92,331 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 25671 2 GROCERY Natio… FRZN ICE ICE - CRUSH…
## 2 26081 2 MISCELLANEOUS Natio… <NA> <NA>
## 3 26093 69 PASTRY Priva… BREAD BREAD:ITALI…
## 4 26190 69 GROCERY Priva… FRUIT - SHELF S… APPLE SAUCE
## 5 26355 69 GROCERY Priva… COOKIES/CONES SPECIALTY C…
## 6 26426 69 GROCERY Priva… SPICES & EXTRAC… SPICES & SE…
## 7 26540 69 GROCERY Priva… COOKIES/CONES TRAY PACK/C…
## 8 26601 69 DRUG GM Priva… VITAMINS VITAMIN - M…
## 9 26636 69 PASTRY Priva… BREAKFAST SWEETS SW GDS: SW …
## 10 26691 16 GROCERY Priva… PNT BTR/JELLY/J… HONEY
## # ℹ 92,321 more rows
## # ℹ 1 more variable: package_size <chr>
Excercise 1
transactions <- transactions %>%
mutate(
regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity,
loyalty_price = (sales_value + coupon_match_disc) / quantity,
coupon_price = (sales_value - coupon_disc) / quantity
) %>%
select(regular_price, loyalty_price, coupon_price, product_id, everything())
# Q1
transactions %>%
slice_max(order_by = loyalty_price, n = 5)
## # A tibble: 5 × 14
## regular_price loyalty_price coupon_price product_id household_id store_id
## <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 Inf Inf Inf 13945244 2491 389
## 2 100 100 100 12484608 1246 334
## 3 100. 100. 100. 13040176 2318 381
## 4 88.9 88.9 88.9 15630122 1172 396
## 5 85.0 66.1 66.1 916561 2312 442
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
# Q2
transactions %>%
filter(quantity > 0) %>%
slice_max(order_by = loyalty_price, n = 5)
## # A tibble: 5 × 14
## regular_price loyalty_price coupon_price product_id household_id store_id
## <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 100 100 100 12484608 1246 334
## 2 100. 100. 100. 13040176 2318 381
## 3 88.9 88.9 88.9 15630122 1172 396
## 4 85.0 66.1 66.1 916561 2312 442
## 5 63.8 63.8 63.8 1076056 57 298
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
#Q3
products %>%
filter(product_id == '12484608')
## # A tibble: 1 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 12484608 903 MISCELLANEOUS Private COUPON/MISC ITE… ELECTRONIC …
## # ℹ 1 more variable: package_size <chr>
Excercise 2
# how many products had a regular price of $1 or less
transactions %>%
filter(regular_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 2748
# how many products had a loyalty price of $1 or less
transactions %>%
filter(loyalty_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 4648
# how many products had a coupon price of $1 or less
transactions %>%
filter(coupon_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 4844
Excercise 3
transactions %>%
group_by(basket_id) %>%
summarize(total_sales = sum(sales_value, na.rm = TRUE)) %>%
summarize(
prop_10 = mean(total_sales > 10),
prop_20 = mean(total_sales > 20)
)
## # A tibble: 1 × 2
## prop_10 prop_20
## <dbl> <dbl>
## 1 0.107 0.0258
Excercise 4
# Which stores had the largest total `sales_value`
transactions %>%
group_by(sales_value) %>%
summarize(total_sales_value = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales_value))
## # A tibble: 1,850 × 2
## sales_value total_sales_value
## <dbl> <dbl>
## 1 2 8838
## 2 2.99 8369.
## 3 2.5 7430
## 4 3.99 6576.
## 5 1 5066
## 6 3 4578
## 7 4.99 4456.
## 8 5 4330
## 9 1.99 4310.
## 10 5.99 4067.
## # ℹ 1,840 more rows
# Which stores had the largest average loyalty discount
transactions %>%
mutate(pct_loyalty_disc = 1 - (loyalty_price / regular_price)) %>%
group_by(store_id) %>%
summarize(avg_pct_loyalty_disc = mean(loyalty_price, na.rm = TRUE)) %>%
arrange(desc(avg_pct_loyalty_disc))
## # A tibble: 293 × 2
## store_id avg_pct_loyalty_disc
## <chr> <dbl>
## 1 389 Inf
## 2 602 24.0
## 3 3113 20
## 4 108 14.0
## 5 3163 11.3
## 6 2867 10.5
## 7 3327 8.55
## 8 3253 8.00
## 9 197 6.35
## 10 1829 6.16
## # ℹ 283 more rows
Part 2
Excercise 1
library(readxl)
excel_sheets(path = "data/mbta.xlsx")
## [1] "Sheet1"
mbta <- read_excel(path = "data/mbta.xlsx", skip = 1, na = "NA")
## New names:
## • `` -> `...1`
Excercise 2
# Structure of mbta
mbta
## # A tibble: 11 × 60
## ...1 mode `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 All Modes … NA NA 1188. NA NA 1246.
## 2 2 Boat 4 3.6 40 4.3 4.9 5.8
## 3 3 Bus 336. 339. 340. 352. 354. 351.
## 4 4 Commuter R… 142. 138. 138. 140. 139 143
## 5 5 Heavy Rail 435. 448. 459. 472. 475. 477.
## 6 6 Light Rail 227. 240. 241. 256. 248. 246.
## 7 7 Pct Chg / … 0.02 -0.04 0.114 -0.002 0.049 0.096
## 8 8 Private Bus 4.77 4.42 4.57 4.54 4.77 4.72
## 9 9 RIDE 4.9 5 5.5 5.4 5.4 5.6
## 10 10 Trackless … 12.8 12.9 13.1 13.4 13.5 13.3
## 11 11 TOTAL 1167. 1192. 1205. 1247. 1245. 1246.
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## # `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## # `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## # `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## # `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## # `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## # `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …
# View first 6 rows
head(mbta, n=6)
## # A tibble: 6 × 60
## ...1 mode `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 All Modes b… NA NA 1188. NA NA 1246.
## 2 2 Boat 4 3.6 40 4.3 4.9 5.8
## 3 3 Bus 336. 339. 340. 352. 354. 351.
## 4 4 Commuter Ra… 142. 138. 138. 140. 139 143
## 5 5 Heavy Rail 435. 448. 459. 472. 475. 477.
## 6 6 Light Rail 227. 240. 241. 256. 248. 246.
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## # `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## # `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## # `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## # `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## # `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## # `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …
# Summary of mbta
summary(mbta)
## ...1 mode 2007-01 2007-02
## Min. : 1.0 Length:11 Min. : 0.020 Min. : -0.040
## 1st Qu.: 3.5 Class :character 1st Qu.: 4.804 1st Qu.: 4.563
## Median : 6.0 Mode :character Median : 77.478 Median : 75.707
## Mean : 6.0 Mean : 233.397 Mean : 238.324
## 3rd Qu.: 8.5 3rd Qu.: 308.672 3rd Qu.: 314.072
## Max. :11.0 Max. :1166.974 Max. :1191.639
## NA's :1 NA's :1
## 2007-03 2007-04 2007-05 2007-06
## Min. : 0.114 Min. : -0.002 Min. : 0.049 Min. : 0.096
## 1st Qu.: 9.278 1st Qu.: 4.756 1st Qu.: 5.025 1st Qu.: 5.700
## Median : 137.700 Median : 76.472 Median : 76.240 Median : 143.000
## Mean : 330.293 Mean : 249.421 Mean : 248.956 Mean : 339.846
## 3rd Qu.: 399.225 3rd Qu.: 328.011 3rd Qu.: 327.841 3rd Qu.: 413.788
## Max. :1204.725 Max. :1247.105 Max. :1244.755 Max. :1246.129
## NA's :1 NA's :1
## 2007-07 2007-08 2007-09 2007-10
## Min. : -0.037 Min. : 0.004 Min. : -0.007 Min. : -0.064
## 1st Qu.: 5.570 1st Qu.: 5.624 1st Qu.: 5.539 1st Qu.: 5.310
## Median : 77.851 Median : 77.753 Median : 143.051 Median : 80.582
## Mean : 248.787 Mean : 244.665 Mean : 352.554 Mean : 248.884
## 3rd Qu.: 328.961 3rd Qu.: 325.336 3rd Qu.: 436.082 3rd Qu.: 336.994
## Max. :1243.952 Max. :1223.323 Max. :1310.764 Max. :1244.453
## NA's :1 NA's :1 NA's :1
## 2007-11 2007-12 2008-01 2008-02
## Min. : -0.077 Min. : -0.060 Min. : 0.048 Min. : 0.061
## 1st Qu.: 4.478 1st Qu.: 4.385 1st Qu.: 4.475 1st Qu.: 4.485
## Median : 79.356 Median : 141.585 Median : 78.023 Median : 78.389
## Mean : 248.371 Mean : 321.588 Mean : 244.615 Mean : 252.803
## 3rd Qu.: 310.744 3rd Qu.: 380.594 3rd Qu.: 315.549 3rd Qu.: 327.005
## Max. :1241.895 Max. :1216.890 Max. :1223.050 Max. :1263.983
## NA's :1 NA's :1 NA's :1
## 2008-03 2008-04 2008-05 2008-06
## Min. : 0.058 Min. : 0.060 Min. : 0.046 Min. : 0.060
## 1st Qu.: 5.170 1st Qu.: 4.689 1st Qu.: 4.629 1st Qu.: 5.742
## Median : 137.453 Median : 77.555 Median : 78.506 Median : 142.057
## Mean : 345.604 Mean : 264.435 Mean : 260.323 Mean : 359.667
## 3rd Qu.: 427.601 3rd Qu.: 343.909 3rd Qu.: 337.515 3rd Qu.: 440.656
## Max. :1274.031 Max. :1322.146 Max. :1301.591 Max. :1320.728
## NA's :1 NA's :1
## 2008-07 2008-08 2008-09 2008-10
## Min. : 0.069 Min. : 0.023 Min. : 0.021 Min. : 0.054
## 1st Qu.: 6.019 1st Qu.: 5.887 1st Qu.: 5.691 1st Qu.: 5.087
## Median : 80.061 Median : 79.141 Median : 141.907 Median : 82.486
## Mean : 266.027 Mean : 250.383 Mean : 362.099 Mean : 262.440
## 3rd Qu.: 345.730 3rd Qu.: 330.629 3rd Qu.: 453.430 3rd Qu.: 330.996
## Max. :1330.103 Max. :1251.905 Max. :1338.015 Max. :1312.172
## NA's :1 NA's :1 NA's :1
## 2008-11 2008-12 2009-01 2009-02
## Min. : 0.002 Min. : -0.015 Min. : -0.034 Min. : -0.02
## 1st Qu.: 4.829 1st Qu.: 4.689 1st Qu.: 4.186 1st Qu.: 4.38
## Median : 82.774 Median : 140.810 Median : 76.874 Median : 76.59
## Mean : 248.871 Mean : 319.882 Mean : 236.303 Mean : 247.65
## 3rd Qu.: 317.084 3rd Qu.: 386.255 3rd Qu.: 307.814 3rd Qu.: 327.49
## Max. :1244.354 Max. :1232.655 Max. :1181.534 Max. :1238.24
## NA's :1 NA's :1 NA's :1
## 2009-03 2009-04 2009-05 2009-06
## Min. : -0.050 Min. : -0.048 Min. : -0.058 Min. : -0.079
## 1st Qu.: 5.003 1st Qu.: 4.720 1st Qu.: 4.763 1st Qu.: 5.845
## Median : 142.893 Median : 76.833 Median : 78.358 Median : 142.006
## Mean : 330.142 Mean : 251.603 Mean : 245.116 Mean : 333.194
## 3rd Qu.: 410.455 3rd Qu.: 329.093 3rd Qu.: 322.303 3rd Qu.: 410.482
## Max. :1210.912 Max. :1258.037 Max. :1225.608 Max. :1233.085
## NA's :1 NA's :1
## 2009-07 2009-08 2009-09 2009-10
## Min. : -0.094 Min. : -0.044 Min. : -0.035 Min. : -0.014
## 1st Qu.: 6.298 1st Qu.: 6.033 1st Qu.: 5.693 1st Qu.: 4.883
## Median : 74.558 Median : 75.604 Median : 139.087 Median : 75.178
## Mean : 241.006 Mean : 239.427 Mean : 346.687 Mean : 258.811
## 3rd Qu.: 312.185 3rd Qu.: 307.442 3rd Qu.: 437.332 3rd Qu.: 347.086
## Max. :1205.079 Max. :1197.158 Max. :1291.564 Max. :1294.064
## NA's :1 NA's :1 NA's :1
## 2009-11 2009-12 2010-01 2010-02
## Min. : -0.022 Min. : -0.022 Min. : 0.004 Min. : -0.022
## 1st Qu.: 4.323 1st Qu.: 4.784 1st Qu.: 4.034 1st Qu.: 4.062
## Median : 70.997 Median : 126.066 Median : 71.588 Median : 72.238
## Mean : 243.363 Mean : 312.962 Mean : 237.255 Mean : 242.244
## 3rd Qu.: 328.913 3rd Qu.: 386.659 3rd Qu.: 322.769 3rd Qu.: 324.137
## Max. :1216.824 Max. :1207.845 Max. :1186.271 Max. :1211.228
## NA's :1 NA's :1 NA's :1
## 2010-03 2010-04 2010-05 2010-06
## Min. : 0.012 Min. : 0.007 Min. : 0.013 Min. : 0.008
## 1st Qu.: 5.274 1st Qu.: 5.130 1st Qu.: 5.086 1st Qu.: 6.436
## Median : 131.252 Median : 72.370 Median : 70.785 Median : 129.144
## Mean : 332.726 Mean : 253.446 Mean : 248.231 Mean : 335.964
## 3rd Qu.: 428.420 3rd Qu.: 340.770 3rd Qu.: 339.579 3rd Qu.: 426.769
## Max. :1225.556 Max. :1267.226 Max. :1241.148 Max. :1244.409
## NA's :1 NA's :1
## 2010-07 2010-08 2010-09 2010-10
## Min. : 0.001 Min. : -0.015 Min. : 0.001 Min. : 0.009
## 1st Qu.: 6.531 1st Qu.: 6.281 1st Qu.: 5.567 1st Qu.: 5.006
## Median : 64.950 Median : 68.388 Median : 132.892 Median : 69.340
## Mean : 241.180 Mean : 235.947 Mean : 346.524 Mean : 261.255
## 3rd Qu.: 319.462 3rd Qu.: 310.399 3rd Qu.: 451.361 3rd Qu.: 347.832
## Max. :1205.901 Max. :1179.745 Max. :1293.117 Max. :1306.271
## NA's :1 NA's :1 NA's :1
## 2010-11 2010-12 2011-01 2011-02
## Min. : 0.022 Min. : -0.004 Min. : -0.028 Min. : 0.008
## 1st Qu.: 4.402 1st Qu.: 4.466 1st Qu.: 4.039 1st Qu.: 4.329
## Median : 69.166 Median : 121.422 Median : 69.750 Median : 68.579
## Mean : 248.748 Mean : 312.917 Mean : 230.680 Mean : 244.133
## 3rd Qu.: 331.437 3rd Qu.: 388.385 3rd Qu.: 300.831 3rd Qu.: 314.647
## Max. :1243.730 Max. :1216.262 Max. :1153.413 Max. :1220.663
## NA's :1 NA's :1 NA's :1
## 2011-03 2011-04 2011-05 2011-06
## Min. : 0.05 Min. : 0.036 Min. : 0.050 Min. : 0.054
## 1st Qu.: 6.03 1st Qu.: 5.224 1st Qu.: 5.338 1st Qu.: 6.926
## Median : 134.37 Median : 73.384 Median : 74.216 Median : 135.581
## Mean : 345.17 Mean : 262.660 Mean : 260.582 Mean : 353.331
## 3rd Qu.: 448.56 3rd Qu.: 345.904 3rd Qu.: 345.411 3rd Qu.: 452.923
## Max. :1286.66 Max. :1313.283 Max. :1302.884 Max. :1302.414
## NA's :1 NA's :1
## 2011-07 2011-08 2011-09 2011-10
## Min. : 0.067 Min. : 0.052 Min. : 0.043 Min. : 0.032
## 1st Qu.: 6.911 1st Qu.: 7.067 1st Qu.: 6.660 1st Qu.: 5.513
## Median : 71.735 Median : 70.853 Median : 136.901 Median : 70.508
## Mean : 257.228 Mean : 248.259 Mean : 362.555 Mean : 269.648
## 3rd Qu.: 328.941 3rd Qu.: 320.386 3rd Qu.: 469.204 3rd Qu.: 358.284
## Max. :1286.107 Max. :1241.268 Max. :1348.754 Max. :1348.222
## NA's :1 NA's :1 NA's :1
# How many missing values in each column
sum(is.na(mbta))
## [1] 39
Excercise 3
mbta <- mbta %>%
slice(-c(1, 7, 11)) %>% # Remove the first, seventh, and eleventh rows of mbta.
select(-1) # Remove the first column.
dim(mbta) # Now what is the dimensions of this new data frame?
## [1] 8 59
Excercise 4
mbta <- mbta %>%
pivot_longer(cols = 3, names_to = 'RIDE', values_to = 'thou_riders') # Pivot the rows and columns of the mbta data so that all
# columns are variables of the data. This should result in
# 3 columns - `mode`, `date`, and number of riders in
# thousands (`thou_riders`).
dim(mbta) # Now what is the dimensions of this new data frame?
## [1] 8 60
Excercise 5
mbta <- mbta %>%
separate(col = 'mode', into = c("year", "month"), sep = "-") # Split the month column of mbta at the dash and create a new
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 8 rows [1, 2, 3, 4, 5, 6,
## 7, 8].
# month column with only the month and a year column with only
# the year.
head(mbta) # View the head of this new mbta data set.
## # A tibble: 6 × 61
## year month `2007-01` `2007-03` `2007-04` `2007-05` `2007-06` `2007-07`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Boat <NA> 4 40 4.3 4.9 5.8 6.52
## 2 Bus <NA> 336. 340. 352. 354. 351. 358.
## 3 Commuter Ra… <NA> 142. 138. 140. 139 143 142.
## 4 Heavy Rail <NA> 435. 459. 472. 475. 477. 472.
## 5 Light Rail <NA> 227. 241. 256. 248. 246. 243.
## 6 Private Bus <NA> 4.77 4.57 4.54 4.77 4.72 3.94
## # ℹ 53 more variables: `2007-08` <dbl>, `2007-09` <dbl>, `2007-10` <dbl>,
## # `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>, `2008-02` <dbl>,
## # `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>, `2008-06` <dbl>,
## # `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>, `2008-10` <dbl>,
## # `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>, `2009-02` <dbl>,
## # `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>, `2009-06` <dbl>,
## # `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, `2009-10` <dbl>, …
Excercise 7
mbta %>%
group_by('mode') %>%
summarize(avg_ridership = mean(thou_riders))
## # A tibble: 1 × 2
## `"mode"` avg_ridership
## <chr> <dbl>
## 1 mode 149.
mbta %>%
filter(month == '01') %>%
group_by('mode') %>%
summarize(avg_ridership = mean(thou_riders))
## # A tibble: 0 × 2
## # ℹ 2 variables: "mode" <chr>, avg_ridership <dbl>