Installing Packages
install.packages('completejourney')
##
## The downloaded binary packages are in
## /var/folders/k_/ws4k3wfd0mz22w3mtk9q3y000000gn/T//RtmphMrlSj/downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
transactions <- transactions_sample
transactions
## # A tibble: 75,000 × 11
## household_id store_id basket_id product_id quantity sales_value retail_disc
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2261 309 31625220889 940996 1 3.86 0.43
## 2 2131 368 32053127496 873902 1 1.59 0.9
## 3 511 316 32445856036 847901 1 1 0.69
## 4 400 388 31932241118 13094913 2 11.9 2.9
## 5 918 340 32074655895 1085604 1 1.29 0
## 6 718 324 32614612029 883203 1 2.5 0.49
## 7 868 323 32074722463 9884484 1 3.49 0
## 8 1688 450 34850403304 1028715 1 2 1.79
## 9 467 31782 31280745102 896613 2 6.55 4.44
## 10 1947 32004 32744181707 978497 1 3.99 0
## # ℹ 74,990 more rows
## # ℹ 4 more variables: coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
products
## # A tibble: 92,331 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 25671 2 GROCERY Natio… FRZN ICE ICE - CRUSH…
## 2 26081 2 MISCELLANEOUS Natio… <NA> <NA>
## 3 26093 69 PASTRY Priva… BREAD BREAD:ITALI…
## 4 26190 69 GROCERY Priva… FRUIT - SHELF S… APPLE SAUCE
## 5 26355 69 GROCERY Priva… COOKIES/CONES SPECIALTY C…
## 6 26426 69 GROCERY Priva… SPICES & EXTRAC… SPICES & SE…
## 7 26540 69 GROCERY Priva… COOKIES/CONES TRAY PACK/C…
## 8 26601 69 DRUG GM Priva… VITAMINS VITAMIN - M…
## 9 26636 69 PASTRY Priva… BREAKFAST SWEETS SW GDS: SW …
## 10 26691 16 GROCERY Priva… PNT BTR/JELLY/J… HONEY
## # ℹ 92,321 more rows
## # ℹ 1 more variable: package_size <chr>
Q0: Create New Variables
transactions <- transactions %>%
mutate (
regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity,
loyalty_price = (sales_value + coupon_match_disc) / quantity,
coupon_price = (sales_value - coupon_disc) / quantity
) %>%
select(regular_price, loyalty_price, coupon_price, product_id, everything())
Q1. Identify households with larget loyalty_price
transactions
transactions %>%
slice_max(order_by = loyalty_price, n = 5)
## # A tibble: 5 × 14
## regular_price loyalty_price coupon_price product_id household_id store_id
## <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 Inf Inf Inf 13945244 2491 389
## 2 100 100 100 12484608 1246 334
## 3 100. 100. 100. 13040176 2318 381
## 4 88.9 88.9 88.9 15630122 1172 396
## 5 85.0 66.1 66.1 916561 2312 442
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
The regular_price and loyalty_price
observations show infinity, indicating quantity was 0
Q2. Filter for observations > 0
transactions %>%
filter(quantity > 0) %>%
slice_max(order_by = loyalty_price, n = 5)
## # A tibble: 5 × 14
## regular_price loyalty_price coupon_price product_id household_id store_id
## <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 100 100 100 12484608 1246 334
## 2 100. 100. 100. 13040176 2318 381
## 3 88.9 88.9 88.9 15630122 1172 396
## 4 85.0 66.1 66.1 916561 2312 442
## 5 63.8 63.8 63.8 1076056 57 298
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
Households with largest loyalty_price transactions
are:
Q3. Finding the product of largest transaction
products %>%
filter(product_id == 12484608)
## # A tibble: 1 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 12484608 903 MISCELLANEOUS Private COUPON/MISC ITE… ELECTRONIC …
## # ℹ 1 more variable: package_size <chr>
The product is electronic gift card
how many products had a regular price of $1 or less
transactions %>%
filter(regular_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 2748
how many products had a loyalty price of $1 or less
transactions %>%
filter(loyalty_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 4648
how many products had a coupon price of $1 or less
transactions %>%
filter(coupon_price <= 1) %>%
select(product_id) %>%
n_distinct()
## [1] 4844
? What proportion of baskets are over $20 in sales value?
transactions %>%
group_by(basket_id) %>%
summarise(total_sales_value = sum(sales_value, na.rm = TRUE)) %>%
summarise(
proportion_over_10 = mean(total_sales_value > 10),
proportion_over_20 = mean(total_sales_value > 20)
)
## # A tibble: 1 × 2
## proportion_over_10 proportion_over_20
## <dbl> <dbl>
## 1 0.107 0.0258
Which stores had the largest totalsales_value
transactions %>%
group_by(store_id) %>%
summarise(total_sales_value = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales_value))
## # A tibble: 293 × 2
## store_id total_sales_value
## <chr> <dbl>
## 1 367 7713.
## 2 406 6034.
## 3 429 4702.
## 4 343 4471.
## 5 361 4061.
## 6 356 3807.
## 7 381 3746.
## 8 292 3702.
## 9 31782 3684.
## 10 321 3515.
## # ℹ 283 more rows
Which stores had the largest average loyalty discount
transactions %>%
mutate(pct_loyalty_disc = 1 - (loyalty_price/ regular_price)) %>%
group_by(store_id) %>%
summarise(avg_pct_loyalty_disc = mean(pct_loyalty_disc, na.rm = TRUE)) %>%
arrange(desc(avg_pct_loyalty_disc))
## # A tibble: 293 × 2
## store_id avg_pct_loyalty_disc
## <chr> <dbl>
## 1 224 0.576
## 2 62 0.501
## 3 779 0.475
## 4 3163 0.433
## 5 784 0.430
## 6 2950 0.418
## 7 572 0.401
## 8 65 0.390
## 9 486 0.388
## 10 2839 0.373
## # ℹ 283 more rows
Import the data
library(readxl)
excel_sheets('/Users/shamssadin/Documents/4080 Data Mining/mbta.xlsx')
## [1] "Sheet1"
mbta <- read_excel(path = '/Users/shamssadin/Documents/4080 Data Mining/mbta.xlsx', skip = 1, na = 'NA')
## New names:
## • `` -> `...1`
mbta
## # A tibble: 11 × 60
## ...1 mode `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 All Modes … NA NA 1188. NA NA 1246.
## 2 2 Boat 4 3.6 40 4.3 4.9 5.8
## 3 3 Bus 336. 339. 340. 352. 354. 351.
## 4 4 Commuter R… 142. 138. 138. 140. 139 143
## 5 5 Heavy Rail 435. 448. 459. 472. 475. 477.
## 6 6 Light Rail 227. 240. 241. 256. 248. 246.
## 7 7 Pct Chg / … 0.02 -0.04 0.114 -0.002 0.049 0.096
## 8 8 Private Bus 4.77 4.42 4.57 4.54 4.77 4.72
## 9 9 RIDE 4.9 5 5.5 5.4 5.4 5.6
## 10 10 Trackless … 12.8 12.9 13.1 13.4 13.5 13.3
## 11 11 TOTAL 1167. 1192. 1205. 1247. 1245. 1246.
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## # `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## # `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## # `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## # `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## # `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## # `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …
Examine the data
dim(mbta)
## [1] 11 60
str(mbta)
## tibble [11 × 60] (S3: tbl_df/tbl/data.frame)
## $ ...1 : num [1:11] 1 2 3 4 5 6 7 8 9 10 ...
## $ mode : chr [1:11] "All Modes by Qtr" "Boat" "Bus" "Commuter Rail" ...
## $ 2007-01: num [1:11] NA 4 336 142 435 ...
## $ 2007-02: num [1:11] NA 3.6 338.7 138.5 448.3 ...
## $ 2007-03: num [1:11] 1188 40 340 138 459 ...
## $ 2007-04: num [1:11] NA 4.3 352.2 139.5 472.2 ...
## $ 2007-05: num [1:11] NA 4.9 354.4 139 474.6 ...
## $ 2007-06: num [1:11] 1246 5.8 350.5 143 477 ...
## $ 2007-07: num [1:11] NA 6.52 357.52 142.39 471.74 ...
## $ 2007-08: num [1:11] NA 6.57 355.48 142.36 461.61 ...
## $ 2007-09: num [1:11] 1256.57 5.47 372.6 143.05 499.57 ...
## $ 2007-10: num [1:11] NA 5.14 368.85 146.54 457.74 ...
## $ 2007-11: num [1:11] NA 3.76 330.83 145.09 488.35 ...
## $ 2007-12: num [1:11] 1216.89 2.98 312.92 141.59 448.27 ...
## $ 2008-01: num [1:11] NA 3.17 340.32 142.15 472.62 ...
## $ 2008-02: num [1:11] NA 3.11 352.9 142.61 492.1 ...
## $ 2008-03: num [1:11] 1253.52 3.51 361.15 137.45 494.05 ...
## $ 2008-04: num [1:11] NA 4.16 368.19 140.39 513.2 ...
## $ 2008-05: num [1:11] NA 4.01 363.9 142.59 507.95 ...
## $ 2008-06: num [1:11] 1314.82 5.19 362.96 142.06 518.35 ...
## $ 2008-07: num [1:11] NA 6.02 370.92 145.73 512.31 ...
## $ 2008-08: num [1:11] NA 5.8 361.1 144.6 477 ...
## $ 2008-09: num [1:11] 1307.04 4.59 389.54 141.91 517.32 ...
## $ 2008-10: num [1:11] NA 4.29 357.97 151.96 523.64 ...
## $ 2008-11: num [1:11] NA 3.49 345.42 152.95 487.12 ...
## $ 2008-12: num [1:11] 1232.65 3.01 325.77 140.81 446.74 ...
## $ 2009-01: num [1:11] NA 3.01 338.53 141.45 461 ...
## $ 2009-02: num [1:11] NA 3.2 360.4 143.5 482.4 ...
## $ 2009-03: num [1:11] 1209.79 3.33 353.69 142.89 467.22 ...
## $ 2009-04: num [1:11] NA 4.05 359.38 142.34 493.15 ...
## $ 2009-05: num [1:11] NA 4.12 354.75 144.22 475.63 ...
## $ 2009-06: num [1:11] 1233.1 4.9 347.9 142 473.1 ...
## $ 2009-07: num [1:11] NA 6.44 339.48 137.69 470.83 ...
## $ 2009-08: num [1:11] NA 5.9 332.7 139.2 466.7 ...
## $ 2009-09: num [1:11] 1230.5 4.7 374.3 139.1 500.4 ...
## $ 2009-10: num [1:11] NA 4.21 385.87 137.1 513.41 ...
## $ 2009-11: num [1:11] NA 3.58 366.98 129.34 480.28 ...
## $ 2009-12: num [1:11] 1207.85 3.11 332.39 126.07 440.93 ...
## $ 2010-01: num [1:11] NA 3.21 362.23 130.91 464.07 ...
## $ 2010-02: num [1:11] NA 3.2 361.1 131.9 480.1 ...
## $ 2010-03: num [1:11] 1208.86 3.48 373.44 131.25 483.4 ...
## $ 2010-04: num [1:11] NA 4.45 378.61 131.72 502.37 ...
## $ 2010-05: num [1:11] NA 4.42 380.17 128.8 487.4 ...
## $ 2010-06: num [1:11] 1244.41 5.41 363.27 129.14 490.26 ...
## $ 2010-07: num [1:11] NA 6.51 353.04 122.94 488.59 ...
## $ 2010-08: num [1:11] NA 6.27 343.69 129.73 473.73 ...
## $ 2010-09: num [1:11] 1225.5 4.7 381.6 132.9 521.1 ...
## $ 2010-10: num [1:11] NA 4.4 385 131 532.4 ...
## $ 2010-11: num [1:11] NA 3.73 367.95 130.89 502.89 ...
## $ 2010-12: num [1:11] 1216.26 3.16 326.34 121.42 450.43 ...
## $ 2011-01: num [1:11] NA 3.14 334.96 128.4 468.42 ...
## $ 2011-02: num [1:11] NA 3.28 346.23 125.46 504.07 ...
## $ 2011-03: num [1:11] 1223.45 3.67 380.4 134.37 516.73 ...
## $ 2011-04: num [1:11] NA 4.25 380.45 134.17 528.63 ...
## $ 2011-05: num [1:11] NA 4.43 385.29 136.14 528.12 ...
## $ 2011-06: num [1:11] 1302.41 5.47 376.32 135.58 529.53 ...
## $ 2011-07: num [1:11] NA 6.58 361.58 132.41 532.89 ...
## $ 2011-08: num [1:11] NA 6.73 353.79 130.62 508.14 ...
## $ 2011-09: num [1:11] 1291 5 388 137 550 ...
## $ 2011-10: num [1:11] NA 4.48 398.46 128.72 554.93 ...
head(mbta)
## # A tibble: 6 × 60
## ...1 mode `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 All Modes b… NA NA 1188. NA NA 1246.
## 2 2 Boat 4 3.6 40 4.3 4.9 5.8
## 3 3 Bus 336. 339. 340. 352. 354. 351.
## 4 4 Commuter Ra… 142. 138. 138. 140. 139 143
## 5 5 Heavy Rail 435. 448. 459. 472. 475. 477.
## 6 6 Light Rail 227. 240. 241. 256. 248. 246.
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## # `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## # `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## # `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## # `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## # `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## # `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …
summary(mbta)
## ...1 mode 2007-01 2007-02
## Min. : 1.0 Length:11 Min. : 0.020 Min. : -0.040
## 1st Qu.: 3.5 Class :character 1st Qu.: 4.804 1st Qu.: 4.563
## Median : 6.0 Mode :character Median : 77.478 Median : 75.707
## Mean : 6.0 Mean : 233.397 Mean : 238.324
## 3rd Qu.: 8.5 3rd Qu.: 308.672 3rd Qu.: 314.072
## Max. :11.0 Max. :1166.974 Max. :1191.639
## NA's :1 NA's :1
## 2007-03 2007-04 2007-05 2007-06
## Min. : 0.114 Min. : -0.002 Min. : 0.049 Min. : 0.096
## 1st Qu.: 9.278 1st Qu.: 4.756 1st Qu.: 5.025 1st Qu.: 5.700
## Median : 137.700 Median : 76.472 Median : 76.240 Median : 143.000
## Mean : 330.293 Mean : 249.421 Mean : 248.956 Mean : 339.846
## 3rd Qu.: 399.225 3rd Qu.: 328.011 3rd Qu.: 327.841 3rd Qu.: 413.788
## Max. :1204.725 Max. :1247.105 Max. :1244.755 Max. :1246.129
## NA's :1 NA's :1
## 2007-07 2007-08 2007-09 2007-10
## Min. : -0.037 Min. : 0.004 Min. : -0.007 Min. : -0.064
## 1st Qu.: 5.570 1st Qu.: 5.624 1st Qu.: 5.539 1st Qu.: 5.310
## Median : 77.851 Median : 77.753 Median : 143.051 Median : 80.582
## Mean : 248.787 Mean : 244.665 Mean : 352.554 Mean : 248.884
## 3rd Qu.: 328.961 3rd Qu.: 325.336 3rd Qu.: 436.082 3rd Qu.: 336.994
## Max. :1243.952 Max. :1223.323 Max. :1310.764 Max. :1244.453
## NA's :1 NA's :1 NA's :1
## 2007-11 2007-12 2008-01 2008-02
## Min. : -0.077 Min. : -0.060 Min. : 0.048 Min. : 0.061
## 1st Qu.: 4.478 1st Qu.: 4.385 1st Qu.: 4.475 1st Qu.: 4.485
## Median : 79.356 Median : 141.585 Median : 78.023 Median : 78.389
## Mean : 248.371 Mean : 321.588 Mean : 244.615 Mean : 252.803
## 3rd Qu.: 310.744 3rd Qu.: 380.594 3rd Qu.: 315.549 3rd Qu.: 327.005
## Max. :1241.895 Max. :1216.890 Max. :1223.050 Max. :1263.983
## NA's :1 NA's :1 NA's :1
## 2008-03 2008-04 2008-05 2008-06
## Min. : 0.058 Min. : 0.060 Min. : 0.046 Min. : 0.060
## 1st Qu.: 5.170 1st Qu.: 4.689 1st Qu.: 4.629 1st Qu.: 5.742
## Median : 137.453 Median : 77.555 Median : 78.506 Median : 142.057
## Mean : 345.604 Mean : 264.435 Mean : 260.323 Mean : 359.667
## 3rd Qu.: 427.601 3rd Qu.: 343.909 3rd Qu.: 337.515 3rd Qu.: 440.656
## Max. :1274.031 Max. :1322.146 Max. :1301.591 Max. :1320.728
## NA's :1 NA's :1
## 2008-07 2008-08 2008-09 2008-10
## Min. : 0.069 Min. : 0.023 Min. : 0.021 Min. : 0.054
## 1st Qu.: 6.019 1st Qu.: 5.887 1st Qu.: 5.691 1st Qu.: 5.087
## Median : 80.061 Median : 79.141 Median : 141.907 Median : 82.486
## Mean : 266.027 Mean : 250.383 Mean : 362.099 Mean : 262.440
## 3rd Qu.: 345.730 3rd Qu.: 330.629 3rd Qu.: 453.430 3rd Qu.: 330.996
## Max. :1330.103 Max. :1251.905 Max. :1338.015 Max. :1312.172
## NA's :1 NA's :1 NA's :1
## 2008-11 2008-12 2009-01 2009-02
## Min. : 0.002 Min. : -0.015 Min. : -0.034 Min. : -0.02
## 1st Qu.: 4.829 1st Qu.: 4.689 1st Qu.: 4.186 1st Qu.: 4.38
## Median : 82.774 Median : 140.810 Median : 76.874 Median : 76.59
## Mean : 248.871 Mean : 319.882 Mean : 236.303 Mean : 247.65
## 3rd Qu.: 317.084 3rd Qu.: 386.255 3rd Qu.: 307.814 3rd Qu.: 327.49
## Max. :1244.354 Max. :1232.655 Max. :1181.534 Max. :1238.24
## NA's :1 NA's :1 NA's :1
## 2009-03 2009-04 2009-05 2009-06
## Min. : -0.050 Min. : -0.048 Min. : -0.058 Min. : -0.079
## 1st Qu.: 5.003 1st Qu.: 4.720 1st Qu.: 4.763 1st Qu.: 5.845
## Median : 142.893 Median : 76.833 Median : 78.358 Median : 142.006
## Mean : 330.142 Mean : 251.603 Mean : 245.116 Mean : 333.194
## 3rd Qu.: 410.455 3rd Qu.: 329.093 3rd Qu.: 322.303 3rd Qu.: 410.482
## Max. :1210.912 Max. :1258.037 Max. :1225.608 Max. :1233.085
## NA's :1 NA's :1
## 2009-07 2009-08 2009-09 2009-10
## Min. : -0.094 Min. : -0.044 Min. : -0.035 Min. : -0.014
## 1st Qu.: 6.298 1st Qu.: 6.033 1st Qu.: 5.693 1st Qu.: 4.883
## Median : 74.558 Median : 75.604 Median : 139.087 Median : 75.178
## Mean : 241.006 Mean : 239.427 Mean : 346.687 Mean : 258.811
## 3rd Qu.: 312.185 3rd Qu.: 307.442 3rd Qu.: 437.332 3rd Qu.: 347.086
## Max. :1205.079 Max. :1197.158 Max. :1291.564 Max. :1294.064
## NA's :1 NA's :1 NA's :1
## 2009-11 2009-12 2010-01 2010-02
## Min. : -0.022 Min. : -0.022 Min. : 0.004 Min. : -0.022
## 1st Qu.: 4.323 1st Qu.: 4.784 1st Qu.: 4.034 1st Qu.: 4.062
## Median : 70.997 Median : 126.066 Median : 71.588 Median : 72.238
## Mean : 243.363 Mean : 312.962 Mean : 237.255 Mean : 242.244
## 3rd Qu.: 328.913 3rd Qu.: 386.659 3rd Qu.: 322.769 3rd Qu.: 324.137
## Max. :1216.824 Max. :1207.845 Max. :1186.271 Max. :1211.228
## NA's :1 NA's :1 NA's :1
## 2010-03 2010-04 2010-05 2010-06
## Min. : 0.012 Min. : 0.007 Min. : 0.013 Min. : 0.008
## 1st Qu.: 5.274 1st Qu.: 5.130 1st Qu.: 5.086 1st Qu.: 6.436
## Median : 131.252 Median : 72.370 Median : 70.785 Median : 129.144
## Mean : 332.726 Mean : 253.446 Mean : 248.231 Mean : 335.964
## 3rd Qu.: 428.420 3rd Qu.: 340.770 3rd Qu.: 339.579 3rd Qu.: 426.769
## Max. :1225.556 Max. :1267.226 Max. :1241.148 Max. :1244.409
## NA's :1 NA's :1
## 2010-07 2010-08 2010-09 2010-10
## Min. : 0.001 Min. : -0.015 Min. : 0.001 Min. : 0.009
## 1st Qu.: 6.531 1st Qu.: 6.281 1st Qu.: 5.567 1st Qu.: 5.006
## Median : 64.950 Median : 68.388 Median : 132.892 Median : 69.340
## Mean : 241.180 Mean : 235.947 Mean : 346.524 Mean : 261.255
## 3rd Qu.: 319.462 3rd Qu.: 310.399 3rd Qu.: 451.361 3rd Qu.: 347.832
## Max. :1205.901 Max. :1179.745 Max. :1293.117 Max. :1306.271
## NA's :1 NA's :1 NA's :1
## 2010-11 2010-12 2011-01 2011-02
## Min. : 0.022 Min. : -0.004 Min. : -0.028 Min. : 0.008
## 1st Qu.: 4.402 1st Qu.: 4.466 1st Qu.: 4.039 1st Qu.: 4.329
## Median : 69.166 Median : 121.422 Median : 69.750 Median : 68.579
## Mean : 248.748 Mean : 312.917 Mean : 230.680 Mean : 244.133
## 3rd Qu.: 331.437 3rd Qu.: 388.385 3rd Qu.: 300.831 3rd Qu.: 314.647
## Max. :1243.730 Max. :1216.262 Max. :1153.413 Max. :1220.663
## NA's :1 NA's :1 NA's :1
## 2011-03 2011-04 2011-05 2011-06
## Min. : 0.05 Min. : 0.036 Min. : 0.050 Min. : 0.054
## 1st Qu.: 6.03 1st Qu.: 5.224 1st Qu.: 5.338 1st Qu.: 6.926
## Median : 134.37 Median : 73.384 Median : 74.216 Median : 135.581
## Mean : 345.17 Mean : 262.660 Mean : 260.582 Mean : 353.331
## 3rd Qu.: 448.56 3rd Qu.: 345.904 3rd Qu.: 345.411 3rd Qu.: 452.923
## Max. :1286.66 Max. :1313.283 Max. :1302.884 Max. :1302.414
## NA's :1 NA's :1
## 2011-07 2011-08 2011-09 2011-10
## Min. : 0.067 Min. : 0.052 Min. : 0.043 Min. : 0.032
## 1st Qu.: 6.911 1st Qu.: 7.067 1st Qu.: 6.660 1st Qu.: 5.513
## Median : 71.735 Median : 70.853 Median : 136.901 Median : 70.508
## Mean : 257.228 Mean : 248.259 Mean : 362.555 Mean : 269.648
## 3rd Qu.: 328.941 3rd Qu.: 320.386 3rd Qu.: 469.204 3rd Qu.: 358.284
## Max. :1286.107 Max. :1241.268 Max. :1348.754 Max. :1348.222
## NA's :1 NA's :1 NA's :1
colSums(is.na(mbta))
## ...1 mode 2007-01 2007-02 2007-03 2007-04 2007-05 2007-06 2007-07 2007-08
## 0 0 1 1 0 1 1 0 1 1
## 2007-09 2007-10 2007-11 2007-12 2008-01 2008-02 2008-03 2008-04 2008-05 2008-06
## 0 1 1 0 1 1 0 1 1 0
## 2008-07 2008-08 2008-09 2008-10 2008-11 2008-12 2009-01 2009-02 2009-03 2009-04
## 1 1 0 1 1 0 1 1 0 1
## 2009-05 2009-06 2009-07 2009-08 2009-09 2009-10 2009-11 2009-12 2010-01 2010-02
## 1 0 1 1 0 1 1 0 1 1
## 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12
## 0 1 1 0 1 1 0 1 1 0
## 2011-01 2011-02 2011-03 2011-04 2011-05 2011-06 2011-07 2011-08 2011-09 2011-10
## 1 1 0 1 1 0 1 1 0 1
rowSums(is.na(mbta))
## [1] 39 0 0 0 0 0 0 0 0 0 0
Remove unnecessary rows and columns
mbta <- mbta %>%
slice(-c(1,7,11)) %>%
select(-1)
dim(mbta)
## [1] 8 59
Make the data long
mbta <- mbta %>%
pivot_longer(2:59, names_to = 'date', values_to = 'thou_riders')
head(mbta)
## # A tibble: 6 × 3
## mode date thou_riders
## <chr> <chr> <dbl>
## 1 Boat 2007-01 4
## 2 Boat 2007-02 3.6
## 3 Boat 2007-03 40
## 4 Boat 2007-04 4.3
## 5 Boat 2007-05 4.9
## 6 Boat 2007-06 5.8
Seperate years and months from date
mbta <- mbta %>%
separate(date, into = c('year','month'), sep = '-')
head(mbta)
## # A tibble: 6 × 4
## mode year month thou_riders
## <chr> <chr> <chr> <dbl>
## 1 Boat 2007 01 4
## 2 Boat 2007 02 3.6
## 3 Boat 2007 03 40
## 4 Boat 2007 04 4.3
## 5 Boat 2007 05 4.9
## 6 Boat 2007 06 5.8
Fix entry error
mbta <- mbta %>%
mutate(thou_riders = ifelse(thou_riders == 40.0, 4.0, thou_riders))
mbta
## # A tibble: 464 × 4
## mode year month thou_riders
## <chr> <chr> <chr> <dbl>
## 1 Boat 2007 01 4
## 2 Boat 2007 02 3.6
## 3 Boat 2007 03 4
## 4 Boat 2007 04 4.3
## 5 Boat 2007 05 4.9
## 6 Boat 2007 06 5.8
## 7 Boat 2007 07 6.52
## 8 Boat 2007 08 6.57
## 9 Boat 2007 09 5.47
## 10 Boat 2007 10 5.14
## # ℹ 454 more rows
Create descriptive statistics
mbta %>%
group_by(mode) %>%
summarise(avg_ridership = mean(thou_riders))
## # A tibble: 8 × 2
## mode avg_ridership
## <chr> <dbl>
## 1 Boat 4.45
## 2 Bus 359.
## 3 Commuter Rail 137.
## 4 Heavy Rail 489.
## 5 Light Rail 233.
## 6 Private Bus 3.35
## 7 RIDE 6.60
## 8 Trackless Trolley 12.1
mbta %>%
filter(month == '01') %>%
group_by(mode) %>%
summarise(avg_ridership = mean(thou_riders, na.rm = TRUE))
## # A tibble: 8 × 2
## mode avg_ridership
## <chr> <dbl>
## 1 Boat 3.31
## 2 Bus 342.
## 3 Commuter Rail 137.
## 4 Heavy Rail 460.
## 5 Light Rail 217.
## 6 Private Bus 3.47
## 7 RIDE 5.94
## 8 Trackless Trolley 12.5
mbta %>%
filter(mode == 'Boat') %>%
group_by(year) %>%
summarise(total_riders = sum(thou_riders)) %>%
arrange(desc(total_riders))
## # A tibble: 5 × 2
## year total_riders
## <chr> <dbl>
## 1 2007 57.1
## 2 2010 52.9
## 3 2009 50.6
## 4 2008 50.3
## 5 2011 47.1
mbta %>%
filter(mode == 'Heavy Rail') %>%
group_by(month) %>%
summarise(avg_total_riders = mean(thou_riders)) %>%
arrange(desc(avg_total_riders))
## # A tibble: 12 × 2
## month avg_total_riders
## <chr> <dbl>
## 1 09 518.
## 2 10 516.
## 3 04 502.
## 4 06 498.
## 5 07 495.
## 6 05 495.
## 7 11 490.
## 8 03 484.
## 9 02 481.
## 10 08 477.
## 11 01 460.
## 12 12 447.