### Module 3 Lab
# Part 1 --------------------------------------------------------------------------
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
transactions <- transactions_sample # just using a sample of the entire data
transactions
## # A tibble: 75,000 × 11
## household_id store_id basket_id product_id quantity sales_value retail_disc
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2261 309 31625220889 940996 1 3.86 0.43
## 2 2131 368 32053127496 873902 1 1.59 0.9
## 3 511 316 32445856036 847901 1 1 0.69
## 4 400 388 31932241118 13094913 2 11.9 2.9
## 5 918 340 32074655895 1085604 1 1.29 0
## 6 718 324 32614612029 883203 1 2.5 0.49
## 7 868 323 32074722463 9884484 1 3.49 0
## 8 1688 450 34850403304 1028715 1 2 1.79
## 9 467 31782 31280745102 896613 2 6.55 4.44
## 10 1947 32004 32744181707 978497 1 3.99 0
## # ℹ 74,990 more rows
## # ℹ 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
products
## # A tibble: 92,331 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 25671 2 GROCERY Natio… FRZN ICE ICE - CRUSH…
## 2 26081 2 MISCELLANEOUS Natio… <NA> <NA>
## 3 26093 69 PASTRY Priva… BREAD BREAD:ITALI…
## 4 26190 69 GROCERY Priva… FRUIT - SHELF S… APPLE SAUCE
## 5 26355 69 GROCERY Priva… COOKIES/CONES SPECIALTY C…
## 6 26426 69 GROCERY Priva… SPICES & EXTRAC… SPICES & SE…
## 7 26540 69 GROCERY Priva… COOKIES/CONES TRAY PACK/C…
## 8 26601 69 DRUG GM Priva… VITAMINS VITAMIN - M…
## 9 26636 69 PASTRY Priva… BREAKFAST SWEETS SW GDS: SW …
## 10 26691 16 GROCERY Priva… PNT BTR/JELLY/J… HONEY
## # ℹ 92,321 more rows
## # ℹ 1 more variable: package_size <chr>
# Exercise 1 ----------------------------------------------------------------------
## regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity
## loyalty_price = (sales_value + coupon_match_disc) / quantity
## coupon_price = (sales_value - coupon_disc) / quantity
# Q0: Create three new variables named `regular_price`, `loyalty_price`, and
# `coupon_price` according to the logic shown above
transactions <- transactions %>%
mutate(
regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity,
loyalty_price = (sales_value + coupon_match_disc) / quantity,
coupon_price = (sales_value - coupon_disc) / quantity
) %>%
select(regular_price, loyalty_price, coupon_price, product_id, everything())
# Q1. Identify the five households with the largest `loyalty_price` transactions. What
# is unique about the transaction with the largest `loyalty_price` value?
transactions %>%
slice_max(order_by = loyalty_price, n = 5)
## # A tibble: 5 × 14
## regular_price loyalty_price coupon_price product_id household_id store_id
## <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 Inf Inf Inf 13945244 2491 389
## 2 100 100 100 12484608 1246 334
## 3 100. 100. 100. 13040176 2318 381
## 4 88.9 88.9 88.9 15630122 1172 396
## 5 85.0 66.1 66.1 916561 2312 442
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
## The transaction with the largest `loyalty_price` is Inf: its quantity is 0, so
## all three price formulas divide by zero.
# Q2. Now filter for only those observations where quantity was greater than 0. Which
# household(s) have the largest `loyalty_price` transaction?
transactions %>%
filter(quantity > 0) %>%
slice_max(order_by = loyalty_price, n = 1)
## # A tibble: 1 × 14
## regular_price loyalty_price coupon_price product_id household_id store_id
## <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 100 100 100 12484608 1246 334
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
## Household 1246 has the largest loyalty_price transaction (loyalty_price = 100).
# Q3. Using the first transaction in the result from #2, filter the `products` data based
# on the `product_id` to find out which product the largest `loyalty_price` transaction
# is associated with.
products %>%
  filter(product_id == "12484608") # product_id is stored as character, so quote the id
## # A tibble: 1 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 12484608 903 MISCELLANEOUS Private COUPON/MISC ITE… ELECTRONIC …
## # ℹ 1 more variable: package_size <chr>
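# An alternative sketch (not part of the original solution): look the product up
# with a join instead of copying the id by hand.
transactions %>%
  filter(quantity > 0) %>%
  slice_max(order_by = loyalty_price, n = 1) %>%
  inner_join(products, by = "product_id") %>%
  select(product_id, department, brand, product_category, product_type)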
# Exercise 2 ----------------------------------------------------------------------
## Hint: After filtering, select the product_id column and count
## unique products using the n_distinct() function.
# how many products had a regular price of $1 or less
transactions %>%
filter(regular_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 2748
# how many products had a loyalty price of $1 or less
transactions %>%
filter(loyalty_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 4648
# how many products had a coupon price of $1 or less
transactions %>%
filter(coupon_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 4844
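# An equivalent one-pass sketch: which() drops NA/NaN prices the same way
# filter() does, so these counts should match the three results above.
transactions %>%
  summarize(
    regular_le_1 = n_distinct(product_id[which(regular_price <= 1)]),
    loyalty_le_1 = n_distinct(product_id[which(loyalty_price <= 1)]),
    coupon_le_1  = n_distinct(product_id[which(coupon_price <= 1)])
  )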
# Exercise 3 ----------------------------------------------------------------------
## What proportion of baskets are over $10 in sales value?
## What proportion of baskets are over $20 in sales value?
## Hint: You need to use group_by() and summarize().
## Depending on your approach you may or may not use mutate().
# test your ability to write this code from scratch rather than just
# filling in the blanks :)
library(dplyr)
transactions %>%
group_by(basket_id) %>%
summarize(total_sales = sum(quantity * loyalty_price, na.rm = TRUE)) %>%
  summarize(proportions = mean(total_sales > 10))
## # A tibble: 1 × 1
## proportions
## <dbl>
## 1 0.107
transactions %>%
group_by(basket_id) %>%
summarize(total_sales = sum(quantity * loyalty_price, na.rm = TRUE)) %>%
summarize(proportions = mean(total_sales > 20))
## # A tibble: 1 × 1
## proportions
## <dbl>
## 1 0.0258
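# An alternative sketch that sums sales_value directly and returns both proportions
# in one pass. Results can differ slightly from the answers above because
# quantity * loyalty_price equals sales_value + coupon_match_disc, not sales_value alone.
transactions %>%
  group_by(basket_id) %>%
  summarize(total_sales = sum(sales_value, na.rm = TRUE)) %>%
  summarize(
    prop_over_10 = mean(total_sales > 10),
    prop_over_20 = mean(total_sales > 20)
  )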
# Exercise 4 ----------------------------------------------------------------------
## Which stores had the largest total sales_value
## (hint: sum(sales_value, na.rm = TRUE))?
## Which stores had the largest average loyalty discount as defined below?
## Hint: You can calculate loyalty discount as a percentage
## of regular price using the following logic:
library(dplyr)
transactions
## # A tibble: 75,000 × 14
## regular_price loyalty_price coupon_price product_id household_id store_id
## <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 4.29 3.86 3.86 940996 2261 309
## 2 2.49 1.59 1.59 873902 2131 368
## 3 1.69 1 1 847901 511 316
## 4 7.38 5.94 5.94 13094913 400 388
## 5 1.29 1.29 1.29 1085604 918 340
## 6 2.99 2.5 2.5 883203 718 324
## 7 3.49 3.49 3.49 9884484 868 323
## 8 3.79 2 2 1028715 1688 450
## 9 5.50 3.28 3.28 896613 467 31782
## 10 3.99 3.99 3.99 978497 1947 32004
## # ℹ 74,990 more rows
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
## pct_loyalty_disc = 1 - (loyalty_price / regular_price)
# Which stores had the largest total `sales_value`
transactions %>%
group_by(store_id) %>%
summarize(total_sales_value = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales_value))
## # A tibble: 293 × 2
## store_id total_sales_value
## <chr> <dbl>
## 1 367 7713.
## 2 406 6034.
## 3 429 4702.
## 4 343 4471.
## 5 361 4061.
## 6 356 3807.
## 7 381 3746.
## 8 292 3702.
## 9 31782 3684.
## 10 321 3515.
## # ℹ 283 more rows
# Which stores had the largest average loyalty discount
transactions %>%
mutate(pct_loyalty_disc = 1 - (loyalty_price / regular_price)) %>%
group_by(store_id) %>%
summarize(avg_pct_loyalty_disc = mean(pct_loyalty_disc, na.rm = TRUE)) %>%
arrange(desc(avg_pct_loyalty_disc))
## # A tibble: 293 × 2
## store_id avg_pct_loyalty_disc
## <chr> <dbl>
## 1 224 0.576
## 2 62 0.501
## 3 779 0.475
## 4 3163 0.433
## 5 784 0.430
## 6 2950 0.418
## 7 572 0.401
## 8 65 0.390
## 9 486 0.388
## 10 2839 0.373
## # ℹ 283 more rows
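# Both store-level metrics can also be computed in a single pass; a sketch:
transactions %>%
  mutate(pct_loyalty_disc = 1 - (loyalty_price / regular_price)) %>%
  group_by(store_id) %>%
  summarize(
    total_sales_value    = sum(sales_value, na.rm = TRUE),
    avg_pct_loyalty_disc = mean(pct_loyalty_disc, na.rm = TRUE)
  ) %>%
  arrange(desc(total_sales_value))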
# Part 2 --------------------------------------------------------------------------
# Exercise 1 - Import the data ----------------------------------------------------
library(readxl)
excel_sheets(path = "mbta.xlsx")
## [1] "Sheet1"
mbta <- read_excel(path = "mbta.xlsx", skip = 1, na = "NA")
## New names:
## • `` -> `...1`
# read_excel() auto-names the blank first column `...1`; it is dropped in Exercise 3.
mbta
## # A tibble: 11 × 60
## ...1 mode `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 All Modes … NA NA 1188. NA NA 1246.
## 2 2 Boat 4 3.6 40 4.3 4.9 5.8
## 3 3 Bus 336. 339. 340. 352. 354. 351.
## 4 4 Commuter R… 142. 138. 138. 140. 139 143
## 5 5 Heavy Rail 435. 448. 459. 472. 475. 477.
## 6 6 Light Rail 227. 240. 241. 256. 248. 246.
## 7 7 Pct Chg / … 0.02 -0.04 0.114 -0.002 0.049 0.096
## 8 8 Private Bus 4.77 4.42 4.57 4.54 4.77 4.72
## 9 9 RIDE 4.9 5 5.5 5.4 5.4 5.6
## 10 10 Trackless … 12.8 12.9 13.1 13.4 13.5 13.3
## 11 11 TOTAL 1167. 1192. 1205. 1247. 1245. 1246.
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## # `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## # `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## # `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## # `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## # `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## # `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …
# Exercise 2 - Examine the data ----------------------------------------------------
head(mbta)
## # A tibble: 6 × 60
## ...1 mode `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 All Modes b… NA NA 1188. NA NA 1246.
## 2 2 Boat 4 3.6 40 4.3 4.9 5.8
## 3 3 Bus 336. 339. 340. 352. 354. 351.
## 4 4 Commuter Ra… 142. 138. 138. 140. 139 143
## 5 5 Heavy Rail 435. 448. 459. 472. 475. 477.
## 6 6 Light Rail 227. 240. 241. 256. 248. 246.
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## # `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## # `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## # `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## # `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## # `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## # `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …
view(mbta)
head(mbta, n = 10)
## # A tibble: 10 × 60
## ...1 mode `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 All Modes … NA NA 1188. NA NA 1246.
## 2 2 Boat 4 3.6 40 4.3 4.9 5.8
## 3 3 Bus 336. 339. 340. 352. 354. 351.
## 4 4 Commuter R… 142. 138. 138. 140. 139 143
## 5 5 Heavy Rail 435. 448. 459. 472. 475. 477.
## 6 6 Light Rail 227. 240. 241. 256. 248. 246.
## 7 7 Pct Chg / … 0.02 -0.04 0.114 -0.002 0.049 0.096
## 8 8 Private Bus 4.77 4.42 4.57 4.54 4.77 4.72
## 9 9 RIDE 4.9 5 5.5 5.4 5.4 5.6
## 10 10 Trackless … 12.8 12.9 13.1 13.4 13.5 13.3
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## # `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## # `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## # `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## # `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## # `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## # `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …
summary(mbta)
## ...1 mode 2007-01 2007-02
## Min. : 1.0 Length:11 Min. : 0.020 Min. : -0.040
## 1st Qu.: 3.5 Class :character 1st Qu.: 4.804 1st Qu.: 4.563
## Median : 6.0 Mode :character Median : 77.478 Median : 75.707
## Mean : 6.0 Mean : 233.397 Mean : 238.324
## 3rd Qu.: 8.5 3rd Qu.: 308.672 3rd Qu.: 314.072
## Max. :11.0 Max. :1166.974 Max. :1191.639
## NA's :1 NA's :1
## 2007-03 2007-04 2007-05 2007-06
## Min. : 0.114 Min. : -0.002 Min. : 0.049 Min. : 0.096
## 1st Qu.: 9.278 1st Qu.: 4.756 1st Qu.: 5.025 1st Qu.: 5.700
## Median : 137.700 Median : 76.472 Median : 76.240 Median : 143.000
## Mean : 330.293 Mean : 249.421 Mean : 248.956 Mean : 339.846
## 3rd Qu.: 399.225 3rd Qu.: 328.011 3rd Qu.: 327.841 3rd Qu.: 413.788
## Max. :1204.725 Max. :1247.105 Max. :1244.755 Max. :1246.129
## NA's :1 NA's :1
## 2007-07 2007-08 2007-09 2007-10
## Min. : -0.037 Min. : 0.004 Min. : -0.007 Min. : -0.064
## 1st Qu.: 5.570 1st Qu.: 5.624 1st Qu.: 5.539 1st Qu.: 5.310
## Median : 77.851 Median : 77.753 Median : 143.051 Median : 80.582
## Mean : 248.787 Mean : 244.665 Mean : 352.554 Mean : 248.884
## 3rd Qu.: 328.961 3rd Qu.: 325.336 3rd Qu.: 436.082 3rd Qu.: 336.994
## Max. :1243.952 Max. :1223.323 Max. :1310.764 Max. :1244.453
## NA's :1 NA's :1 NA's :1
## 2007-11 2007-12 2008-01 2008-02
## Min. : -0.077 Min. : -0.060 Min. : 0.048 Min. : 0.061
## 1st Qu.: 4.478 1st Qu.: 4.385 1st Qu.: 4.475 1st Qu.: 4.485
## Median : 79.356 Median : 141.585 Median : 78.023 Median : 78.389
## Mean : 248.371 Mean : 321.588 Mean : 244.615 Mean : 252.803
## 3rd Qu.: 310.744 3rd Qu.: 380.594 3rd Qu.: 315.549 3rd Qu.: 327.005
## Max. :1241.895 Max. :1216.890 Max. :1223.050 Max. :1263.983
## NA's :1 NA's :1 NA's :1
## 2008-03 2008-04 2008-05 2008-06
## Min. : 0.058 Min. : 0.060 Min. : 0.046 Min. : 0.060
## 1st Qu.: 5.170 1st Qu.: 4.689 1st Qu.: 4.629 1st Qu.: 5.742
## Median : 137.453 Median : 77.555 Median : 78.506 Median : 142.057
## Mean : 345.604 Mean : 264.435 Mean : 260.323 Mean : 359.667
## 3rd Qu.: 427.601 3rd Qu.: 343.909 3rd Qu.: 337.515 3rd Qu.: 440.656
## Max. :1274.031 Max. :1322.146 Max. :1301.591 Max. :1320.728
## NA's :1 NA's :1
## 2008-07 2008-08 2008-09 2008-10
## Min. : 0.069 Min. : 0.023 Min. : 0.021 Min. : 0.054
## 1st Qu.: 6.019 1st Qu.: 5.887 1st Qu.: 5.691 1st Qu.: 5.087
## Median : 80.061 Median : 79.141 Median : 141.907 Median : 82.486
## Mean : 266.027 Mean : 250.383 Mean : 362.099 Mean : 262.440
## 3rd Qu.: 345.730 3rd Qu.: 330.629 3rd Qu.: 453.430 3rd Qu.: 330.996
## Max. :1330.103 Max. :1251.905 Max. :1338.015 Max. :1312.172
## NA's :1 NA's :1 NA's :1
## 2008-11 2008-12 2009-01 2009-02
## Min. : 0.002 Min. : -0.015 Min. : -0.034 Min. : -0.02
## 1st Qu.: 4.829 1st Qu.: 4.689 1st Qu.: 4.186 1st Qu.: 4.38
## Median : 82.774 Median : 140.810 Median : 76.874 Median : 76.59
## Mean : 248.871 Mean : 319.882 Mean : 236.303 Mean : 247.65
## 3rd Qu.: 317.084 3rd Qu.: 386.255 3rd Qu.: 307.814 3rd Qu.: 327.49
## Max. :1244.354 Max. :1232.655 Max. :1181.534 Max. :1238.24
## NA's :1 NA's :1 NA's :1
## 2009-03 2009-04 2009-05 2009-06
## Min. : -0.050 Min. : -0.048 Min. : -0.058 Min. : -0.079
## 1st Qu.: 5.003 1st Qu.: 4.720 1st Qu.: 4.763 1st Qu.: 5.845
## Median : 142.893 Median : 76.833 Median : 78.358 Median : 142.006
## Mean : 330.142 Mean : 251.603 Mean : 245.116 Mean : 333.194
## 3rd Qu.: 410.455 3rd Qu.: 329.093 3rd Qu.: 322.303 3rd Qu.: 410.482
## Max. :1210.912 Max. :1258.037 Max. :1225.608 Max. :1233.085
## NA's :1 NA's :1
## 2009-07 2009-08 2009-09 2009-10
## Min. : -0.094 Min. : -0.044 Min. : -0.035 Min. : -0.014
## 1st Qu.: 6.298 1st Qu.: 6.033 1st Qu.: 5.693 1st Qu.: 4.883
## Median : 74.558 Median : 75.604 Median : 139.087 Median : 75.178
## Mean : 241.006 Mean : 239.427 Mean : 346.687 Mean : 258.811
## 3rd Qu.: 312.185 3rd Qu.: 307.442 3rd Qu.: 437.332 3rd Qu.: 347.086
## Max. :1205.079 Max. :1197.158 Max. :1291.564 Max. :1294.064
## NA's :1 NA's :1 NA's :1
## 2009-11 2009-12 2010-01 2010-02
## Min. : -0.022 Min. : -0.022 Min. : 0.004 Min. : -0.022
## 1st Qu.: 4.323 1st Qu.: 4.784 1st Qu.: 4.034 1st Qu.: 4.062
## Median : 70.997 Median : 126.066 Median : 71.588 Median : 72.238
## Mean : 243.363 Mean : 312.962 Mean : 237.255 Mean : 242.244
## 3rd Qu.: 328.913 3rd Qu.: 386.659 3rd Qu.: 322.769 3rd Qu.: 324.137
## Max. :1216.824 Max. :1207.845 Max. :1186.271 Max. :1211.228
## NA's :1 NA's :1 NA's :1
## 2010-03 2010-04 2010-05 2010-06
## Min. : 0.012 Min. : 0.007 Min. : 0.013 Min. : 0.008
## 1st Qu.: 5.274 1st Qu.: 5.130 1st Qu.: 5.086 1st Qu.: 6.436
## Median : 131.252 Median : 72.370 Median : 70.785 Median : 129.144
## Mean : 332.726 Mean : 253.446 Mean : 248.231 Mean : 335.964
## 3rd Qu.: 428.420 3rd Qu.: 340.770 3rd Qu.: 339.579 3rd Qu.: 426.769
## Max. :1225.556 Max. :1267.226 Max. :1241.148 Max. :1244.409
## NA's :1 NA's :1
## 2010-07 2010-08 2010-09 2010-10
## Min. : 0.001 Min. : -0.015 Min. : 0.001 Min. : 0.009
## 1st Qu.: 6.531 1st Qu.: 6.281 1st Qu.: 5.567 1st Qu.: 5.006
## Median : 64.950 Median : 68.388 Median : 132.892 Median : 69.340
## Mean : 241.180 Mean : 235.947 Mean : 346.524 Mean : 261.255
## 3rd Qu.: 319.462 3rd Qu.: 310.399 3rd Qu.: 451.361 3rd Qu.: 347.832
## Max. :1205.901 Max. :1179.745 Max. :1293.117 Max. :1306.271
## NA's :1 NA's :1 NA's :1
## 2010-11 2010-12 2011-01 2011-02
## Min. : 0.022 Min. : -0.004 Min. : -0.028 Min. : 0.008
## 1st Qu.: 4.402 1st Qu.: 4.466 1st Qu.: 4.039 1st Qu.: 4.329
## Median : 69.166 Median : 121.422 Median : 69.750 Median : 68.579
## Mean : 248.748 Mean : 312.917 Mean : 230.680 Mean : 244.133
## 3rd Qu.: 331.437 3rd Qu.: 388.385 3rd Qu.: 300.831 3rd Qu.: 314.647
## Max. :1243.730 Max. :1216.262 Max. :1153.413 Max. :1220.663
## NA's :1 NA's :1 NA's :1
## 2011-03 2011-04 2011-05 2011-06
## Min. : 0.05 Min. : 0.036 Min. : 0.050 Min. : 0.054
## 1st Qu.: 6.03 1st Qu.: 5.224 1st Qu.: 5.338 1st Qu.: 6.926
## Median : 134.37 Median : 73.384 Median : 74.216 Median : 135.581
## Mean : 345.17 Mean : 262.660 Mean : 260.582 Mean : 353.331
## 3rd Qu.: 448.56 3rd Qu.: 345.904 3rd Qu.: 345.411 3rd Qu.: 452.923
## Max. :1286.66 Max. :1313.283 Max. :1302.884 Max. :1302.414
## NA's :1 NA's :1
## 2011-07 2011-08 2011-09 2011-10
## Min. : 0.067 Min. : 0.052 Min. : 0.043 Min. : 0.032
## 1st Qu.: 6.911 1st Qu.: 7.067 1st Qu.: 6.660 1st Qu.: 5.513
## Median : 71.735 Median : 70.853 Median : 136.901 Median : 70.508
## Mean : 257.228 Mean : 248.259 Mean : 362.555 Mean : 269.648
## 3rd Qu.: 328.941 3rd Qu.: 320.386 3rd Qu.: 469.204 3rd Qu.: 358.284
## Max. :1286.107 Max. :1241.268 Max. :1348.754 Max. :1348.222
## NA's :1 NA's :1 NA's :1
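# A more compact sketch of the same check: one line per column with its type.
glimpse(mbta)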
sum(is.na(mbta))
## [1] 39
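# Optional sketch: per-column NA counts rather than one grand total.
colSums(is.na(mbta))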
# Exercise 3 - Removing unnecessary rows and columns ------------------------------
## 1. Remove the first, seventh, and eleventh rows of mbta.
## 2. Remove the first column.
## 3. What are the dimensions of this new data frame?
mbta <- mbta %>%
slice(-c(1,7,11)) %>% # Remove the first, seventh, and eleventh rows of mbta.
select(-1) # Remove the first column.
dim(mbta) # What are the dimensions of this new data frame?
## [1] 8 59
head(mbta)
## # A tibble: 6 × 59
## mode `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06` `2007-07`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Boat 4 3.6 40 4.3 4.9 5.8 6.52
## 2 Bus 336. 339. 340. 352. 354. 351. 358.
## 3 Commute… 142. 138. 138. 140. 139 143 142.
## 4 Heavy R… 435. 448. 459. 472. 475. 477. 472.
## 5 Light R… 227. 240. 241. 256. 248. 246. 243.
## 6 Private… 4.77 4.42 4.57 4.54 4.77 4.72 3.94
## # ℹ 51 more variables: `2007-08` <dbl>, `2007-09` <dbl>, `2007-10` <dbl>,
## # `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>, `2008-02` <dbl>,
## # `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>, `2008-06` <dbl>,
## # `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>, `2008-10` <dbl>,
## # `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>, `2009-02` <dbl>,
## # `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>, `2009-06` <dbl>,
## # `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, `2009-10` <dbl>, …
# Exercise 4 - Observations are stored in columns ---------------------------------
## 1. Pivot the rows and columns of the mbta data so
## that all columns are variables of the data.
## This should result in 3 columns - mode, date, and
## number of riders in thousands (thou_riders).
## 2. What are the new dimensions of this data?
library(dplyr)
library(tidyr)
mbta <- mbta %>%
pivot_longer(
cols = -mode,
names_to = "date",
values_to = "thou_riders"
)
dim(mbta)
## [1] 464 3
## 8 modes x 58 monthly columns pivot to 8 * 58 = 464 rows.
head(mbta)
## # A tibble: 6 × 3
## mode date thou_riders
## <chr> <chr> <dbl>
## 1 Boat 2007-01 4
## 2 Boat 2007-02 3.6
## 3 Boat 2007-03 40
## 4 Boat 2007-04 4.3
## 5 Boat 2007-05 4.9
## 6 Boat 2007-06 5.8
view(mbta)
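# Optional sketch (not the route taken in Exercise 5): parse date into a proper
# Date with lubridate's ym(), loaded with the tidyverse above.
# mbta %>% mutate(date = ym(date))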
# Exercise 5 - Separating columns -------------------------------------------------
mbta <- mbta %>%
separate(date, into = c("year", "month"), sep = "-")
# Split the date column at the dash into a year column with only the
# year and a month column with only the month.
head(mbta)
## # A tibble: 6 × 4
## mode year month thou_riders
## <chr> <chr> <chr> <dbl>
## 1 Boat 2007 01 4
## 2 Boat 2007 02 3.6
## 3 Boat 2007 03 40
## 4 Boat 2007 04 4.3
## 5 Boat 2007 05 4.9
## 6 Boat 2007 06 5.8
# View the head of this new mbta data set.
view(mbta)
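# With tidyr >= 1.3 (1.3.1 is loaded above), the same split could be written with
# separate_wider_delim(); a sketch against the Exercise 4 version of mbta:
# mbta %>% separate_wider_delim(date, delim = "-", names = c("year", "month"))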
# Exercise 6 - Dealing with entry error -------------------------------------------
# Boat ridership is otherwise about 4-6 (thousand), so the 2007-03 value of 40
# looks like a data entry error (misplaced decimal).
incorrect <- mbta %>%
  filter(mode == "Boat", thou_riders == 40)
incorrect
## # A tibble: 1 × 4
## mode year month thou_riders
## <chr> <chr> <chr> <dbl>
## 1 Boat 2007 03 40
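# A possible correction (assuming the 40 is a misplaced decimal for 4.0). Left
# commented out so the Exercise 7 results below match the uncorrected data:
# mbta <- mbta %>%
#   mutate(thou_riders = if_else(mode == "Boat" & thou_riders == 40, 4, thou_riders))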
# Exercise 7 - Performing descriptive analysis ------------------------------------
## 1. Compute the average ridership per mode.
mbta %>%
group_by(mode) %>%
summarize(avg_ridership = mean(thou_riders, na.rm = TRUE))
## # A tibble: 8 × 2
## mode avg_ridership
## <chr> <dbl>
## 1 Boat 5.07
## 2 Bus 359.
## 3 Commuter Rail 137.
## 4 Heavy Rail 489.
## 5 Light Rail 233.
## 6 Private Bus 3.35
## 7 RIDE 6.60
## 8 Trackless Trolley 12.1
## 2. Compute the average ridership per mode for the month of January.
mbta %>%
filter(month == "01") %>%
group_by(mode) %>%
summarize(avg_ridership = mean(thou_riders, na.rm = TRUE))
## # A tibble: 8 × 2
## mode avg_ridership
## <chr> <dbl>
## 1 Boat 3.31
## 2 Bus 342.
## 3 Commuter Rail 137.
## 4 Heavy Rail 460.
## 5 Light Rail 217.
## 6 Private Bus 3.47
## 7 RIDE 5.94
## 8 Trackless Trolley 12.5
## 3. Which year had the largest total ridership for the boat mode?
mbta %>%
filter(mode == "Boat") %>%
group_by(year) %>%
summarize(total_riders = sum(thou_riders, na.rm = TRUE)) %>%
arrange(desc(total_riders)) %>%
slice(1)
## # A tibble: 1 × 2
## year total_riders
## <chr> <dbl>
## 1 2007 93.1
## Note: 2007's total is inflated by the suspect value of 40 flagged in Exercise 6.
## 4. On average, which month experiences the greatest number of passengers on the Heavy Rail mode?
mbta %>%
filter(mode == "Heavy Rail") %>%
group_by(month) %>%
summarize(avg_ridership = mean(thou_riders, na.rm = TRUE)) %>%
arrange(desc(avg_ridership)) %>%
slice(1)
## # A tibble: 1 × 2
## month avg_ridership
## <chr> <dbl>
## 1 09 518.