transactions <- transactions_sample
transactions
## # A tibble: 75,000 × 11
## household_id store_id basket_id product_id quantity sales_value retail_disc
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2261 309 31625220889 940996 1 3.86 0.43
## 2 2131 368 32053127496 873902 1 1.59 0.9
## 3 511 316 32445856036 847901 1 1 0.69
## 4 400 388 31932241118 13094913 2 11.9 2.9
## 5 918 340 32074655895 1085604 1 1.29 0
## 6 718 324 32614612029 883203 1 2.5 0.49
## 7 868 323 32074722463 9884484 1 3.49 0
## 8 1688 450 34850403304 1028715 1 2 1.79
## 9 467 31782 31280745102 896613 2 6.55 4.44
## 10 1947 32004 32744181707 978497 1 3.99 0
## # ℹ 74,990 more rows
## # ℹ 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
# Q0: Create three new variables named `regular_price`, `loyalty_price`, and
# `coupon_price` according to the logic shown above
transactions <- transactions %>%
mutate(
regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity,
loyalty_price = (sales_value + coupon_match_disc) / quantity,
coupon_price = (sales_value - coupon_disc) / quantity
) %>%
select(regular_price, loyalty_price, coupon_price, product_id, everything())
# Q1. Identify the five households with the largest `loyalty_price` transactions. What
# is unique about the transaction with the largest `loyalty_price` value?
transactions %>%
slice_max(order_by = loyalty_price, n = 5)
## # A tibble: 5 × 14
## regular_price loyalty_price coupon_price product_id household_id store_id
## <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 Inf Inf Inf 13945244 2491 389
## 2 100 100 100 12484608 1246 334
## 3 100.0 100.0 100.0 13040176 2318 381
## 4 88.9 88.9 88.9 15630122 1172 396
## 5 85.0 66.1 66.1 916561 2312 442
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
Q1. The transaction with the largest loyalty_price has an infinite value, this is the unique feature of the highest loyalty_price transaction.
# Q2. Now filter for only those observations where quantity was greater than 0. Now which
# household(s) have the largest `loyalty_price` transaction?
transactions %>%
filter(quantity > 0) %>%
slice_max(order_by = loyalty_price, n = 5)
## # A tibble: 5 × 14
## regular_price loyalty_price coupon_price product_id household_id store_id
## <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 100 100 100 12484608 1246 334
## 2 100.0 100.0 100.0 13040176 2318 381
## 3 88.9 88.9 88.9 15630122 1172 396
## 4 85.0 66.1 66.1 916561 2312 442
## 5 63.8 63.8 63.8 1076056 57 298
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
Q2. The household with the largest loyalty_price transaction is household_id 1246, the product_id associated is 12484608.
# Q3. Using the first transaction in the result from #2, filter the `products` data based
# on the `product_id` to find out which product the largest `loyalty_price` transaction
# is associated with.
products %>%
filter(product_id == 12484608)
## # A tibble: 1 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 12484608 903 MISCELLANEOUS Private COUPON/MISC ITE… ELECTRONIC …
## # ℹ 1 more variable: package_size <chr>
Q3. Using the first transaction in the result from Q2, the product with the largest loyalty_price is electronic gift cards activating.
# how many products had a regular price of $1 or less
transactions %>%
filter(regular_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 2748
# how many products had a loyalty price of $1 or less
transactions %>%
filter(loyalty_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 4648
# how many products had a coupon price of $1 or less
transactions %>%
filter(coupon_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 4844
There are 2748 products with a regular price of $1 or less, there are 4648 products with a loyalty price of $1 or less, and there are 4844 products with a coupon price of $1 or less.
#What proportion of baskets are over $10 in sales value?
transactions %>%
group_by(basket_id) %>%
summarise(basket_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
filter(basket_sales > 10) %>%
select(basket_id) %>%
n_distinct() / n_distinct(transactions$basket_id) * 100
## [1] 10.65978
There are 10.66% of baskets over $10 in sales value.
#What proportion of baskets are over $20 in sales value?
transactions %>%
group_by(basket_id) %>%
summarise(basket_sales = sum(sales_value, na.rm = TRUE), .groups = "drop") %>%
filter(basket_sales > 20) %>%
select(basket_id) %>%
n_distinct() / n_distinct(transactions$basket_id) * 100
## [1] 2.57816
There are 2.58% of baskets over $20 in sales value.
# Which stores had the largest total `sales_value`
transactions %>%
group_by(store_id) %>%
summarize(total_sales_value = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales_value))
## # A tibble: 293 × 2
## store_id total_sales_value
## <chr> <dbl>
## 1 367 7713.
## 2 406 6034.
## 3 429 4702.
## 4 343 4471.
## 5 361 4061.
## 6 356 3807.
## 7 381 3746.
## 8 292 3702.
## 9 31782 3684.
## 10 321 3515.
## # ℹ 283 more rows
The stores with the largest total_sales_vale are stores 367, 406, 429, 343, and 361.
transactions %>%
mutate(pct_loyalty_disc = 1 - (loyalty_price / regular_price)) %>%
group_by(store_id) %>%
summarize(avg_pct_loyalty_disc = mean(pct_loyalty_disc, na.rm = TRUE)) %>%
arrange(desc(avg_pct_loyalty_disc))
## # A tibble: 293 × 2
## store_id avg_pct_loyalty_disc
## <chr> <dbl>
## 1 224 0.576
## 2 62 0.501
## 3 779 0.475
## 4 3163 0.433
## 5 784 0.430
## 6 2950 0.418
## 7 572 0.401
## 8 65 0.390
## 9 486 0.388
## 10 2839 0.373
## # ℹ 283 more rows
The stores with the largest average loyalty discount are 224, 62, 779, 3163, and 784.
library(readxl)
excel_sheets(path = "C:/Users/tsander/Downloads/mbta.xlsx")
## [1] "Sheet1"
mbta <- read_excel(path = "C:/Users/tsander/Downloads/mbta.xlsx", skip = 1, na = 'NA')
## New names:
## • `` -> `...1`
#structure of mbta
glimpse(mbta)
## Rows: 11
## Columns: 60
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
## $ mode <chr> "All Modes by Qtr", "Boat", "Bus", "Commuter Rail", "Heavy R…
## $ `2007-01` <dbl> NA, 4.000, 335.819, 142.200, 435.294, 227.231, 0.020, 4.772,…
## $ `2007-02` <dbl> NA, 3.600, 338.675, 138.500, 448.271, 240.262, -0.040, 4.417…
## $ `2007-03` <dbl> 1187.653, 40.000, 339.867, 137.700, 458.583, 241.444, 0.114,…
## $ `2007-04` <dbl> NA, 4.300, 352.162, 139.500, 472.201, 255.557, -0.002, 4.542…
## $ `2007-05` <dbl> NA, 4.900, 354.367, 139.000, 474.579, 248.262, 0.049, 4.768,…
## $ `2007-06` <dbl> 1245.959, 5.800, 350.543, 143.000, 477.032, 246.108, 0.096, …
## $ `2007-07` <dbl> NA, 6.521, 357.519, 142.391, 471.735, 243.286, -0.037, 3.936…
## $ `2007-08` <dbl> NA, 6.572, 355.479, 142.364, 461.605, 234.907, 0.004, 3.946,…
## $ `2007-09` <dbl> 1256.571, 5.469, 372.598, 143.051, 499.566, 265.748, -0.007,…
## $ `2007-10` <dbl> NA, 5.145, 368.847, 146.542, 457.741, 241.434, -0.064, 4.315…
## $ `2007-11` <dbl> NA, 3.763, 330.826, 145.089, 488.348, 250.497, -0.077, 4.081…
## $ `2007-12` <dbl> 1216.890, 2.985, 312.920, 141.585, 448.268, 233.379, -0.060,…
## $ `2008-01` <dbl> NA, 3.175, 340.324, 142.145, 472.624, 241.223, 0.048, 4.122,…
## $ `2008-02` <dbl> NA, 3.111, 352.905, 142.607, 492.100, 249.306, 0.061, 4.079,…
## $ `2008-03` <dbl> 1253.522, 3.512, 361.155, 137.453, 494.046, 253.132, 0.058, …
## $ `2008-04` <dbl> NA, 4.164, 368.189, 140.389, 513.204, 271.070, 0.060, 4.144,…
## $ `2008-05` <dbl> NA, 4.015, 363.903, 142.585, 507.952, 258.351, 0.046, 4.079,…
## $ `2008-06` <dbl> 1314.821, 5.189, 362.962, 142.057, 518.349, 266.961, 0.060, …
## $ `2008-07` <dbl> NA, 6.016, 370.921, 145.731, 512.309, 270.158, 0.069, 4.548,…
## $ `2008-08` <dbl> NA, 5.800, 361.057, 144.565, 476.990, 239.344, 0.023, 4.286,…
## $ `2008-09` <dbl> 1307.041, 4.587, 389.537, 141.907, 517.324, 258.171, 0.021, …
## $ `2008-10` <dbl> NA, 4.285, 357.974, 151.957, 523.644, 250.063, 0.054, 4.556,…
## $ `2008-11` <dbl> NA, 3.488, 345.423, 152.952, 487.115, 232.068, 0.002, 4.302,…
## $ `2008-12` <dbl> 1232.655, 3.007, 325.767, 140.810, 446.743, 205.420, -0.015,…
## $ `2009-01` <dbl> NA, 3.014, 338.532, 141.448, 461.004, 215.660, -0.034, 3.584…
## $ `2009-02` <dbl> NA, 3.196, 360.412, 143.529, 482.407, 228.737, -0.020, 3.604…
## $ `2009-03` <dbl> 1209.792, 3.330, 353.686, 142.893, 467.224, 222.844, -0.050,…
## $ `2009-04` <dbl> NA, 4.049, 359.380, 142.340, 493.152, 238.232, -0.048, 2.824…
## $ `2009-05` <dbl> NA, 4.119, 354.750, 144.225, 475.634, 224.962, -0.058, 2.735…
## $ `2009-06` <dbl> 1233.085, 4.900, 347.865, 142.006, 473.099, 226.259, -0.079,…
## $ `2009-07` <dbl> NA, 6.444, 339.477, 137.691, 470.828, 230.308, -0.094, 2.655…
## $ `2009-08` <dbl> NA, 5.903, 332.661, 139.158, 466.676, 231.783, -0.044, 2.504…
## $ `2009-09` <dbl> 1230.461, 4.696, 374.260, 139.087, 500.403, 250.922, -0.035,…
## $ `2009-10` <dbl> NA, 4.212, 385.868, 137.104, 513.406, 230.739, -0.014, 2.584…
## $ `2009-11` <dbl> NA, 3.576, 366.980, 129.343, 480.278, 214.711, -0.022, 2.721…
## $ `2009-12` <dbl> 1207.845, 3.113, 332.394, 126.066, 440.925, 194.446, -0.022,…
## $ `2010-01` <dbl> NA, 3.207, 362.226, 130.910, 464.069, 204.396, 0.004, 2.682,…
## $ `2010-02` <dbl> NA, 3.195, 361.138, 131.918, 480.121, 213.136, -0.022, 2.498…
## $ `2010-03` <dbl> 1208.857, 3.481, 373.443, 131.252, 483.397, 211.693, 0.012, …
## $ `2010-04` <dbl> NA, 4.452, 378.611, 131.722, 502.374, 227.246, 0.007, 2.639,…
## $ `2010-05` <dbl> NA, 4.415, 380.171, 128.800, 487.400, 217.805, 0.013, 2.686,…
## $ `2010-06` <dbl> 1244.409, 5.411, 363.275, 129.144, 490.263, 215.922, 0.008, …
## $ `2010-07` <dbl> NA, 6.513, 353.040, 122.935, 488.587, 218.729, 0.001, 2.547,…
## $ `2010-08` <dbl> NA, 6.269, 343.688, 129.732, 473.731, 210.530, -0.015, 2.433…
## $ `2010-09` <dbl> 1225.527, 4.699, 381.622, 132.892, 521.099, 236.368, 0.001, …
## $ `2010-10` <dbl> NA, 4.402, 384.987, 131.033, 532.403, 236.366, 0.009, 2.616,…
## $ `2010-11` <dbl> NA, 3.731, 367.955, 130.889, 502.887, 221.881, 0.022, 2.528,…
## $ `2010-12` <dbl> 1216.262, 3.156, 326.338, 121.422, 450.433, 196.211, -0.004,…
## $ `2011-01` <dbl> NA, 3.140, 334.958, 128.396, 468.418, 198.450, -0.028, 2.213…
## $ `2011-02` <dbl> NA, 3.284, 346.234, 125.463, 504.068, 219.886, 0.008, 2.570,…
## $ `2011-03` <dbl> 1223.452, 3.674, 380.399, 134.374, 516.730, 227.935, 0.050, …
## $ `2011-04` <dbl> NA, 4.251, 380.446, 134.169, 528.631, 242.280, 0.036, 2.762,…
## $ `2011-05` <dbl> NA, 4.431, 385.289, 136.140, 528.122, 225.776, 0.050, 2.776,…
## $ `2011-06` <dbl> 1302.414, 5.474, 376.317, 135.581, 529.528, 221.865, 0.054, …
## $ `2011-07` <dbl> NA, 6.581, 361.585, 132.410, 532.888, 231.010, 0.067, 2.671,…
## $ `2011-08` <dbl> NA, 6.733, 353.793, 130.616, 508.145, 220.164, 0.052, 2.655,…
## $ `2011-09` <dbl> 1290.549, 5.003, 388.271, 136.901, 550.137, 244.949, 0.043, …
## $ `2011-10` <dbl> NA, 4.484, 398.456, 128.720, 554.932, 237.768, 0.032, 2.967,…
#first 6 rows of data
head(mbta)
## # A tibble: 6 × 60
## ...1 mode `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 All Modes b… NA NA 1188. NA NA 1246.
## 2 2 Boat 4 3.6 40 4.3 4.9 5.8
## 3 3 Bus 336. 339. 340. 352. 354. 351.
## 4 4 Commuter Ra… 142. 138. 138. 140. 139 143
## 5 5 Heavy Rail 435. 448. 459. 472. 475. 477.
## 6 6 Light Rail 227. 240. 241. 256. 248. 246.
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## # `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## # `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## # `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## # `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## # `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## # `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …
#Summary
summary(mbta)
## ...1 mode 2007-01 2007-02
## Min. : 1.0 Length:11 Min. : 0.020 Min. : -0.040
## 1st Qu.: 3.5 Class :character 1st Qu.: 4.804 1st Qu.: 4.563
## Median : 6.0 Mode :character Median : 77.478 Median : 75.707
## Mean : 6.0 Mean : 233.397 Mean : 238.324
## 3rd Qu.: 8.5 3rd Qu.: 308.672 3rd Qu.: 314.072
## Max. :11.0 Max. :1166.974 Max. :1191.639
## NA's :1 NA's :1
## 2007-03 2007-04 2007-05 2007-06
## Min. : 0.114 Min. : -0.002 Min. : 0.049 Min. : 0.096
## 1st Qu.: 9.278 1st Qu.: 4.756 1st Qu.: 5.025 1st Qu.: 5.700
## Median : 137.700 Median : 76.472 Median : 76.240 Median : 143.000
## Mean : 330.293 Mean : 249.421 Mean : 248.956 Mean : 339.846
## 3rd Qu.: 399.225 3rd Qu.: 328.011 3rd Qu.: 327.841 3rd Qu.: 413.788
## Max. :1204.725 Max. :1247.105 Max. :1244.755 Max. :1246.129
## NA's :1 NA's :1
## 2007-07 2007-08 2007-09 2007-10
## Min. : -0.037 Min. : 0.004 Min. : -0.007 Min. : -0.064
## 1st Qu.: 5.570 1st Qu.: 5.624 1st Qu.: 5.539 1st Qu.: 5.310
## Median : 77.851 Median : 77.753 Median : 143.051 Median : 80.582
## Mean : 248.787 Mean : 244.665 Mean : 352.554 Mean : 248.884
## 3rd Qu.: 328.961 3rd Qu.: 325.336 3rd Qu.: 436.082 3rd Qu.: 336.994
## Max. :1243.952 Max. :1223.323 Max. :1310.764 Max. :1244.453
## NA's :1 NA's :1 NA's :1
## 2007-11 2007-12 2008-01 2008-02
## Min. : -0.077 Min. : -0.060 Min. : 0.048 Min. : 0.061
## 1st Qu.: 4.478 1st Qu.: 4.385 1st Qu.: 4.475 1st Qu.: 4.485
## Median : 79.356 Median : 141.585 Median : 78.023 Median : 78.389
## Mean : 248.371 Mean : 321.588 Mean : 244.615 Mean : 252.803
## 3rd Qu.: 310.744 3rd Qu.: 380.594 3rd Qu.: 315.549 3rd Qu.: 327.005
## Max. :1241.895 Max. :1216.890 Max. :1223.050 Max. :1263.983
## NA's :1 NA's :1 NA's :1
## 2008-03 2008-04 2008-05 2008-06
## Min. : 0.058 Min. : 0.060 Min. : 0.046 Min. : 0.060
## 1st Qu.: 5.170 1st Qu.: 4.689 1st Qu.: 4.629 1st Qu.: 5.742
## Median : 137.453 Median : 77.555 Median : 78.506 Median : 142.057
## Mean : 345.604 Mean : 264.435 Mean : 260.323 Mean : 359.667
## 3rd Qu.: 427.601 3rd Qu.: 343.909 3rd Qu.: 337.515 3rd Qu.: 440.656
## Max. :1274.031 Max. :1322.146 Max. :1301.591 Max. :1320.728
## NA's :1 NA's :1
## 2008-07 2008-08 2008-09 2008-10
## Min. : 0.069 Min. : 0.023 Min. : 0.021 Min. : 0.054
## 1st Qu.: 6.019 1st Qu.: 5.887 1st Qu.: 5.691 1st Qu.: 5.087
## Median : 80.061 Median : 79.141 Median : 141.907 Median : 82.486
## Mean : 266.027 Mean : 250.383 Mean : 362.099 Mean : 262.440
## 3rd Qu.: 345.730 3rd Qu.: 330.629 3rd Qu.: 453.430 3rd Qu.: 330.996
## Max. :1330.103 Max. :1251.905 Max. :1338.015 Max. :1312.172
## NA's :1 NA's :1 NA's :1
## 2008-11 2008-12 2009-01 2009-02
## Min. : 0.002 Min. : -0.015 Min. : -0.034 Min. : -0.02
## 1st Qu.: 4.829 1st Qu.: 4.689 1st Qu.: 4.186 1st Qu.: 4.38
## Median : 82.774 Median : 140.810 Median : 76.874 Median : 76.59
## Mean : 248.871 Mean : 319.882 Mean : 236.303 Mean : 247.65
## 3rd Qu.: 317.084 3rd Qu.: 386.255 3rd Qu.: 307.814 3rd Qu.: 327.49
## Max. :1244.354 Max. :1232.655 Max. :1181.534 Max. :1238.24
## NA's :1 NA's :1 NA's :1
## 2009-03 2009-04 2009-05 2009-06
## Min. : -0.050 Min. : -0.048 Min. : -0.058 Min. : -0.079
## 1st Qu.: 5.003 1st Qu.: 4.720 1st Qu.: 4.763 1st Qu.: 5.845
## Median : 142.893 Median : 76.833 Median : 78.358 Median : 142.006
## Mean : 330.142 Mean : 251.603 Mean : 245.116 Mean : 333.194
## 3rd Qu.: 410.455 3rd Qu.: 329.093 3rd Qu.: 322.303 3rd Qu.: 410.482
## Max. :1210.912 Max. :1258.037 Max. :1225.608 Max. :1233.085
## NA's :1 NA's :1
## 2009-07 2009-08 2009-09 2009-10
## Min. : -0.094 Min. : -0.044 Min. : -0.035 Min. : -0.014
## 1st Qu.: 6.298 1st Qu.: 6.033 1st Qu.: 5.693 1st Qu.: 4.883
## Median : 74.558 Median : 75.604 Median : 139.087 Median : 75.178
## Mean : 241.006 Mean : 239.427 Mean : 346.687 Mean : 258.811
## 3rd Qu.: 312.185 3rd Qu.: 307.442 3rd Qu.: 437.332 3rd Qu.: 347.086
## Max. :1205.079 Max. :1197.158 Max. :1291.564 Max. :1294.064
## NA's :1 NA's :1 NA's :1
## 2009-11 2009-12 2010-01 2010-02
## Min. : -0.022 Min. : -0.022 Min. : 0.004 Min. : -0.022
## 1st Qu.: 4.323 1st Qu.: 4.784 1st Qu.: 4.034 1st Qu.: 4.062
## Median : 70.997 Median : 126.066 Median : 71.588 Median : 72.238
## Mean : 243.363 Mean : 312.962 Mean : 237.255 Mean : 242.244
## 3rd Qu.: 328.913 3rd Qu.: 386.659 3rd Qu.: 322.769 3rd Qu.: 324.137
## Max. :1216.824 Max. :1207.845 Max. :1186.271 Max. :1211.228
## NA's :1 NA's :1 NA's :1
## 2010-03 2010-04 2010-05 2010-06
## Min. : 0.012 Min. : 0.007 Min. : 0.013 Min. : 0.008
## 1st Qu.: 5.274 1st Qu.: 5.130 1st Qu.: 5.086 1st Qu.: 6.436
## Median : 131.252 Median : 72.370 Median : 70.785 Median : 129.144
## Mean : 332.726 Mean : 253.446 Mean : 248.231 Mean : 335.964
## 3rd Qu.: 428.420 3rd Qu.: 340.770 3rd Qu.: 339.579 3rd Qu.: 426.769
## Max. :1225.556 Max. :1267.226 Max. :1241.148 Max. :1244.409
## NA's :1 NA's :1
## 2010-07 2010-08 2010-09 2010-10
## Min. : 0.001 Min. : -0.015 Min. : 0.001 Min. : 0.009
## 1st Qu.: 6.531 1st Qu.: 6.281 1st Qu.: 5.567 1st Qu.: 5.006
## Median : 64.950 Median : 68.388 Median : 132.892 Median : 69.340
## Mean : 241.180 Mean : 235.947 Mean : 346.524 Mean : 261.255
## 3rd Qu.: 319.462 3rd Qu.: 310.399 3rd Qu.: 451.361 3rd Qu.: 347.832
## Max. :1205.901 Max. :1179.745 Max. :1293.117 Max. :1306.271
## NA's :1 NA's :1 NA's :1
## 2010-11 2010-12 2011-01 2011-02
## Min. : 0.022 Min. : -0.004 Min. : -0.028 Min. : 0.008
## 1st Qu.: 4.402 1st Qu.: 4.466 1st Qu.: 4.039 1st Qu.: 4.329
## Median : 69.166 Median : 121.422 Median : 69.750 Median : 68.579
## Mean : 248.748 Mean : 312.917 Mean : 230.680 Mean : 244.133
## 3rd Qu.: 331.437 3rd Qu.: 388.385 3rd Qu.: 300.831 3rd Qu.: 314.647
## Max. :1243.730 Max. :1216.262 Max. :1153.413 Max. :1220.663
## NA's :1 NA's :1 NA's :1
## 2011-03 2011-04 2011-05 2011-06
## Min. : 0.05 Min. : 0.036 Min. : 0.050 Min. : 0.054
## 1st Qu.: 6.03 1st Qu.: 5.224 1st Qu.: 5.338 1st Qu.: 6.926
## Median : 134.37 Median : 73.384 Median : 74.216 Median : 135.581
## Mean : 345.17 Mean : 262.660 Mean : 260.582 Mean : 353.331
## 3rd Qu.: 448.56 3rd Qu.: 345.904 3rd Qu.: 345.411 3rd Qu.: 452.923
## Max. :1286.66 Max. :1313.283 Max. :1302.884 Max. :1302.414
## NA's :1 NA's :1
## 2011-07 2011-08 2011-09 2011-10
## Min. : 0.067 Min. : 0.052 Min. : 0.043 Min. : 0.032
## 1st Qu.: 6.911 1st Qu.: 7.067 1st Qu.: 6.660 1st Qu.: 5.513
## Median : 71.735 Median : 70.853 Median : 136.901 Median : 70.508
## Mean : 257.228 Mean : 248.259 Mean : 362.555 Mean : 269.648
## 3rd Qu.: 328.941 3rd Qu.: 320.386 3rd Qu.: 469.204 3rd Qu.: 358.284
## Max. :1286.107 Max. :1241.268 Max. :1348.754 Max. :1348.222
## NA's :1 NA's :1 NA's :1
#Missing values in each column
colSums(is.na(mbta))
## ...1 mode 2007-01 2007-02 2007-03 2007-04 2007-05 2007-06 2007-07 2007-08
## 0 0 1 1 0 1 1 0 1 1
## 2007-09 2007-10 2007-11 2007-12 2008-01 2008-02 2008-03 2008-04 2008-05 2008-06
## 0 1 1 0 1 1 0 1 1 0
## 2008-07 2008-08 2008-09 2008-10 2008-11 2008-12 2009-01 2009-02 2009-03 2009-04
## 1 1 0 1 1 0 1 1 0 1
## 2009-05 2009-06 2009-07 2009-08 2009-09 2009-10 2009-11 2009-12 2010-01 2010-02
## 1 0 1 1 0 1 1 0 1 1
## 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12
## 0 1 1 0 1 1 0 1 1 0
## 2011-01 2011-02 2011-03 2011-04 2011-05 2011-06 2011-07 2011-08 2011-09 2011-10
## 1 1 0 1 1 0 1 1 0 1
There are 1 missing value in each column.
mbta <- mbta %>%
slice(-c(1,7,11)) %>% # Remove the first, seventh, and eleventh rows of mbta.
select(-1) # Remove the first column.
dim(mbta) # Now what is the dimensions of this new data frame?
## [1] 8 59
The new dimensions are 8 rows and 59 columns.
mbta <- mbta %>%
pivot_longer(
cols = -mode,
names_to = "date",
values_to = "thou_riders",
values_drop_na = FALSE
) # Pivot the rows and columns of the mbta data so that all columns are variables of the data. This should result in 3 columns - `mode`, `date`, and number of riders in thousands (`thou_riders`).
dim(mbta) # Now what is the dimensions of this new data frame?
## [1] 464 3
The dimensions of the new data frame are 464 rows and 3 columns.
mbta <- mbta %>%
separate(date, into = c("year", "month"), sep = "-", remove = FALSE) # Split the month column of mbta at the dash and create a new month column with only the month and a year column with only the year.
head(mbta) # View the head of this new mbta data set.
## # A tibble: 6 × 5
## mode date year month thou_riders
## <chr> <chr> <chr> <chr> <dbl>
## 1 Boat 2007-01 2007 01 4
## 2 Boat 2007-02 2007 02 3.6
## 3 Boat 2007-03 2007 03 40
## 4 Boat 2007-04 2007 04 4.3
## 5 Boat 2007-05 2007 05 4.9
## 6 Boat 2007-06 2007 06 5.8
mbta <- mbta %>%
mutate(
thou_riders = if_else(
str_detect(mode, "(?i)boat") &
thou_riders == 40,
4,
thou_riders
)
)
head(mbta)
## # A tibble: 6 × 5
## mode date year month thou_riders
## <chr> <chr> <chr> <chr> <dbl>
## 1 Boat 2007-01 2007 01 4
## 2 Boat 2007-02 2007 02 3.6
## 3 Boat 2007-03 2007 03 4
## 4 Boat 2007-04 2007 04 4.3
## 5 Boat 2007-05 2007 05 4.9
## 6 Boat 2007-06 2007 06 5.8
#Check to confirm there are no values of 40, should return 0 rows
mbta %>%
filter(str_detect(mode, "(?i)^boat$"), thou_riders == 40)
## # A tibble: 0 × 5
## # ℹ 5 variables: mode <chr>, date <chr>, year <chr>, month <chr>,
## # thou_riders <dbl>
Average ridership by mode
mbta %>%
group_by(mode) %>%
summarize(avg_ridership = mean(thou_riders, na.rm = TRUE))
## # A tibble: 8 × 2
## mode avg_ridership
## <chr> <dbl>
## 1 Boat 4.45
## 2 Bus 359.
## 3 Commuter Rail 137.
## 4 Heavy Rail 489.
## 5 Light Rail 233.
## 6 Private Bus 3.35
## 7 RIDE 6.60
## 8 Trackless Trolley 12.1
Average ridership by mode in January
mbta %>%
filter(month == "01") %>%
group_by(mode) %>%
summarize(avg_ridership = mean(thou_riders, na.rm = TRUE))
## # A tibble: 8 × 2
## mode avg_ridership
## <chr> <dbl>
## 1 Boat 3.31
## 2 Bus 342.
## 3 Commuter Rail 137.
## 4 Heavy Rail 460.
## 5 Light Rail 217.
## 6 Private Bus 3.47
## 7 RIDE 5.94
## 8 Trackless Trolley 12.5
Which year had the largest ridership for the boat mode?
mbta %>%
filter(mode == "Boat") %>%
group_by(year) %>%
summarize(total_riders = sum(thou_riders, na.rm = TRUE)) %>%
slice_max(total_riders, n = 1)
## # A tibble: 1 × 2
## year total_riders
## <chr> <dbl>
## 1 2007 57.1
2007
Which month experiences the greatest number of passengers on the Heavy Rail mode?
mbta %>%
filter(mode == "Heavy Rail") %>%
group_by(month) %>%
summarize(avg_ridership = mean(thou_riders, na.rm = TRUE)) %>%
slice_max(avg_ridership, n = 1)
## # A tibble: 1 × 2
## month avg_ridership
## <chr> <dbl>
## 1 09 518.
September