# Install necessary package
install.packages('tidyverse')
install.packages('completejourney')
# Load necessary package
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(completejourney)
## Welcome to the completejourney package! Learn more about these data
## sets at http://bit.ly/completejourney.
# Load a sample of the transactions data
transactions <- transactions_sample
transactions <- transactions %>%
mutate(
regular_price = (sales_value + retail_disc + coupon_match_disc) / quantity,
loyalty_price = (sales_value + coupon_match_disc) / quantity,
coupon_price = (sales_value - coupon_disc) / quantity
) %>%
select(regular_price, loyalty_price, coupon_price, product_id, everything())
# View the data
head(transactions)
## # A tibble: 6 × 14
## regular_price loyalty_price coupon_price product_id household_id store_id
## <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 4.29 3.86 3.86 940996 2261 309
## 2 2.49 1.59 1.59 873902 2131 368
## 3 1.69 1 1 847901 511 316
## 4 7.38 5.94 5.94 13094913 400 388
## 5 1.29 1.29 1.29 1085604 918 340
## 6 2.99 2.5 2.5 883203 718 324
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
top_loyalty_transactions <- transactions %>%
slice_max(order_by = loyalty_price, n = 5)
top_loyalty_transactions
## # A tibble: 5 × 14
## regular_price loyalty_price coupon_price product_id household_id store_id
## <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 Inf Inf Inf 13945244 2491 389
## 2 100 100 100 12484608 1246 334
## 3 100. 100. 100. 13040176 2318 381
## 4 88.9 88.9 88.9 15630122 1172 396
## 5 85.0 66.1 66.1 916561 2312 442
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
loyalty_price
transactionfiltered_transactions <- transactions %>%
filter(quantity > 0) %>%
slice_max(order_by = loyalty_price, n = 5)
filtered_transactions
## # A tibble: 5 × 14
## regular_price loyalty_price coupon_price product_id household_id store_id
## <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 100 100 100 12484608 1246 334
## 2 100. 100. 100. 13040176 2318 381
## 3 88.9 88.9 88.9 15630122 1172 396
## 4 85.0 66.1 66.1 916561 2312 442
## 5 63.8 63.8 63.8 1076056 57 298
## # ℹ 8 more variables: basket_id <chr>, quantity <dbl>, sales_value <dbl>,
## # retail_disc <dbl>, coupon_disc <dbl>, coupon_match_disc <dbl>, week <int>,
## # transaction_timestamp <dttm>
loyalty_price Transactionlargest_loyalty_product <- products %>%
filter(product_id == filtered_transactions$product_id[1])
largest_loyalty_product
## # A tibble: 1 × 7
## product_id manufacturer_id department brand product_category product_type
## <chr> <chr> <chr> <fct> <chr> <chr>
## 1 12484608 903 MISCELLANEOUS Private COUPON/MISC ITE… ELECTRONIC …
## # ℹ 1 more variable: package_size <chr>
# Count the number of unique products with a regular price of $1 or less
regular_price_count <- transactions %>%
filter(regular_price <= 1) %>%
select(product_id) %>%
n_distinct()
cat("Number of unique products with a regular price of $1 or less:", regular_price_count)
## Number of unique products with a regular price of $1 or less: 2748
loyalty_price_count <- transactions %>%
filter(loyalty_price <= 1) %>%
select(product_id) %>%
n_distinct()
cat("Number of unique products with a loyalty price of $1 or less:", loyalty_price_count)
## Number of unique products with a loyalty price of $1 or less: 4648
coupon_price_count <- transactions %>%
filter(coupon_price <= 1) %>%
select(product_id) %>%
n_distinct()
cat("Number of unique products with a coupon price of $1 or less:", coupon_price_count)
## Number of unique products with a coupon price of $1 or less: 4844
# First, group by basket_id and summarize the total sales value for each basket
basket_summary <- transactions %>%
group_by(basket_id) %>%
summarize(total_sales_value = sum(sales_value))
# Calculate the total number of baskets
total_baskets <- nrow(basket_summary)
# Calculate the number of baskets that are over $10 in sales value
baskets_over_10 <- basket_summary %>%
filter(total_sales_value > 10) %>%
nrow()
# Calculate the number of baskets that are over $20 in sales value
baskets_over_20 <- basket_summary %>%
filter(total_sales_value > 20) %>%
nrow()
# Calculate the proportion of baskets over $10
proportion_over_10 <- baskets_over_10 / total_baskets
# Calculate the proportion of baskets over $20
proportion_over_20 <- baskets_over_20 / total_baskets
# Print the results
print(paste("Proportion of baskets over $10:", round(proportion_over_10, 2)))
## [1] "Proportion of baskets over $10: 0.11"
print(paste("Proportion of baskets over $20:", round(proportion_over_20, 2)))
## [1] "Proportion of baskets over $20: 0.03"
# Identify which stores had the largest total `sales_value`.
transactions %>%
group_by(store_id) %>%
summarize(total_sales_value = sum(sales_value, na.rm = TRUE)) %>%
arrange(desc(total_sales_value))
## # A tibble: 293 × 2
## store_id total_sales_value
## <chr> <dbl>
## 1 367 7713.
## 2 406 6034.
## 3 429 4702.
## 4 343 4471.
## 5 361 4061.
## 6 356 3807.
## 7 381 3746.
## 8 292 3702.
## 9 31782 3684.
## 10 321 3515.
## # ℹ 283 more rows
# Identify which stores had the largest average loyalty discount, using the logic `pct_loyalty_disc = 1 - (loyalty_price / regular_price)`.
transactions %>%
mutate(pct_loyalty_disc = 1 - (loyalty_price / regular_price)) %>%
group_by(store_id) %>%
summarize(avg_pct_loyalty_disc = mean(pct_loyalty_disc, na.rm = TRUE)) %>%
arrange(desc(avg_pct_loyalty_disc))
## # A tibble: 293 × 2
## store_id avg_pct_loyalty_disc
## <chr> <dbl>
## 1 224 0.576
## 2 62 0.501
## 3 779 0.475
## 4 3163 0.433
## 5 784 0.430
## 6 2950 0.418
## 7 572 0.401
## 8 65 0.390
## 9 486 0.388
## 10 2839 0.373
## # ℹ 283 more rows
For this part of the lab we’ll work through the mbta.xlsx data. The Massachusetts Bay Transportation Authority (“MBTA”) manages America’s oldest subway, as well as Greater Boston’s commuter rail, ferry, and bus systems. It’s your first day on the job as the T’s data analyst and you’ve been tasked with analyzing average ridership through time. Complete the following data cleaning tasks and be sure to use proper code styling and commenting throughout your notebook.
# Load necessary package
library(readxl)
sheets <- excel_sheets(path = "./mbta.xlsx")
print(sheets)
## [1] "Sheet1"
mbta <- read_excel(path = "./mbta.xlsx",
sheet = 1, # assuming you want the first sheet; change as needed
skip = 1, # number of rows to skip; adjust if needed
na = "NA") # interpret 'NA' as missing values
## New names:
## • `` -> `...1`
str(mbta)
## tibble [11 × 60] (S3: tbl_df/tbl/data.frame)
## $ ...1 : num [1:11] 1 2 3 4 5 6 7 8 9 10 ...
## $ mode : chr [1:11] "All Modes by Qtr" "Boat" "Bus" "Commuter Rail" ...
## $ 2007-01: num [1:11] NA 40010 336 142 435 ...
## $ 2007-02: num [1:11] NA 3.6 338.7 138.5 448.3 ...
## $ 2007-03: num [1:11] 1188 40 340 138 459 ...
## $ 2007-04: num [1:11] NA 4.3 352.2 139.5 472.2 ...
## $ 2007-05: num [1:11] NA 4.9 354.4 139 474.6 ...
## $ 2007-06: num [1:11] 1246 5.8 350.5 143 477 ...
## $ 2007-07: num [1:11] NA 6.52 357.52 142.39 471.74 ...
## $ 2007-08: num [1:11] NA 6.57 355.48 142.36 461.61 ...
## $ 2007-09: num [1:11] 1256.57 5.47 372.6 143.05 499.57 ...
## $ 2007-10: num [1:11] NA 5.14 368.85 146.54 457.74 ...
## $ 2007-11: num [1:11] NA 3.76 330.83 145.09 488.35 ...
## $ 2007-12: num [1:11] 1216.89 2.98 312.92 141.59 448.27 ...
## $ 2008-01: num [1:11] NA 3.17 340.32 142.15 472.62 ...
## $ 2008-02: num [1:11] NA 3.11 352.9 142.61 492.1 ...
## $ 2008-03: num [1:11] 1253.52 3.51 361.15 137.45 494.05 ...
## $ 2008-04: num [1:11] NA 4.16 368.19 140.39 513.2 ...
## $ 2008-05: num [1:11] NA 4.01 363.9 142.59 507.95 ...
## $ 2008-06: num [1:11] 1314.82 5.19 362.96 142.06 518.35 ...
## $ 2008-07: num [1:11] NA 6.02 370.92 145.73 512.31 ...
## $ 2008-08: num [1:11] NA 5.8 361.1 144.6 477 ...
## $ 2008-09: num [1:11] 1307.04 4.59 389.54 141.91 517.32 ...
## $ 2008-10: num [1:11] NA 4.29 357.97 151.96 523.64 ...
## $ 2008-11: num [1:11] NA 3.49 345.42 152.95 487.12 ...
## $ 2008-12: num [1:11] 1232.65 3.01 325.77 140.81 446.74 ...
## $ 2009-01: num [1:11] NA 3.01 338.53 141.45 461 ...
## $ 2009-02: num [1:11] NA 3.2 360.4 143.5 482.4 ...
## $ 2009-03: num [1:11] 1209.79 3.33 353.69 142.89 467.22 ...
## $ 2009-04: num [1:11] NA 4.05 359.38 142.34 493.15 ...
## $ 2009-05: num [1:11] NA 4.12 354.75 144.22 475.63 ...
## $ 2009-06: num [1:11] 1233.1 4.9 347.9 142 473.1 ...
## $ 2009-07: num [1:11] NA 6.44 339.48 137.69 470.83 ...
## $ 2009-08: num [1:11] NA 5.9 332.7 139.2 466.7 ...
## $ 2009-09: num [1:11] 1230.5 4.7 374.3 139.1 500.4 ...
## $ 2009-10: num [1:11] NA 4.21 385.87 137.1 513.41 ...
## $ 2009-11: num [1:11] NA 3.58 366.98 129.34 480.28 ...
## $ 2009-12: num [1:11] 1207.85 3.11 332.39 126.07 440.93 ...
## $ 2010-01: num [1:11] NA 3.21 362.23 130.91 464.07 ...
## $ 2010-02: num [1:11] NA 3.19 361.14 131.92 480.12 ...
## $ 2010-03: num [1:11] 1208.86 3.48 373.44 131.25 483.4 ...
## $ 2010-04: num [1:11] NA 4.45 378.61 131.72 502.37 ...
## $ 2010-05: num [1:11] NA 4.42 380.17 128.8 487.4 ...
## $ 2010-06: num [1:11] 1244.41 5.41 363.27 129.14 490.26 ...
## $ 2010-07: num [1:11] NA 6.51 353.04 122.94 488.59 ...
## $ 2010-08: num [1:11] NA 6.27 343.69 129.73 473.73 ...
## $ 2010-09: num [1:11] 1225.5 4.7 381.6 132.9 521.1 ...
## $ 2010-10: num [1:11] NA 4.4 385 131 532.4 ...
## $ 2010-11: num [1:11] NA 3.73 367.95 130.89 502.89 ...
## $ 2010-12: num [1:11] 1216.26 3.16 326.34 121.42 450.43 ...
## $ 2011-01: num [1:11] NA 3.14 334.96 128.4 468.42 ...
## $ 2011-02: num [1:11] NA 3.28 346.23 125.46 504.07 ...
## $ 2011-03: num [1:11] 1223.45 3.67 380.4 134.37 516.73 ...
## $ 2011-04: num [1:11] NA 4.25 380.45 134.17 528.63 ...
## $ 2011-05: num [1:11] NA 4.43 385.29 136.14 528.12 ...
## $ 2011-06: num [1:11] 1302.41 5.47 376.32 135.58 529.53 ...
## $ 2011-07: num [1:11] NA 6.58 361.58 132.41 532.89 ...
## $ 2011-08: num [1:11] NA 6.73 353.79 130.62 508.14 ...
## $ 2011-09: num [1:11] 1291 5 388 137 550 ...
## $ 2011-10: num [1:11] NA 4.48 398.46 128.72 554.93 ...
head(mbta, 6)
## # A tibble: 6 × 60
## ...1 mode `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 All Modes b… NA NA 1188. NA NA 1246.
## 2 2 Boat 40010 3.6 40 4.3 4.9 5.8
## 3 3 Bus 336. 339. 340. 352. 354. 351.
## 4 4 Commuter Ra… 142. 138. 138. 140. 139 143
## 5 5 Heavy Rail 435. 448. 459. 472. 475. 477.
## 6 6 Light Rail 227. 240. 241. 256. 248. 246.
## # ℹ 52 more variables: `2007-07` <dbl>, `2007-08` <dbl>, `2007-09` <dbl>,
## # `2007-10` <dbl>, `2007-11` <dbl>, `2007-12` <dbl>, `2008-01` <dbl>,
## # `2008-02` <dbl>, `2008-03` <dbl>, `2008-04` <dbl>, `2008-05` <dbl>,
## # `2008-06` <dbl>, `2008-07` <dbl>, `2008-08` <dbl>, `2008-09` <dbl>,
## # `2008-10` <dbl>, `2008-11` <dbl>, `2008-12` <dbl>, `2009-01` <dbl>,
## # `2009-02` <dbl>, `2009-03` <dbl>, `2009-04` <dbl>, `2009-05` <dbl>,
## # `2009-06` <dbl>, `2009-07` <dbl>, `2009-08` <dbl>, `2009-09` <dbl>, …
summary(mbta)
## ...1 mode 2007-01 2007-02
## Min. : 1.0 Length:11 Min. : 0.02 Min. : -0.040
## 1st Qu.: 3.5 Class :character 1st Qu.: 6.86 1st Qu.: 4.563
## Median : 6.0 Mode :character Median : 184.72 Median : 75.707
## Mean : 6.0 Mean : 4234.00 Mean : 238.324
## 3rd Qu.: 8.5 3rd Qu.: 410.43 3rd Qu.: 314.072
## Max. :11.0 Max. :40010.00 Max. :1191.639
## NA's :1 NA's :1
## 2007-03 2007-04 2007-05 2007-06
## Min. : 0.114 Min. : -0.002 Min. : 0.049 Min. : 0.096
## 1st Qu.: 9.278 1st Qu.: 4.756 1st Qu.: 5.025 1st Qu.: 5.700
## Median : 137.700 Median : 76.472 Median : 76.240 Median : 143.000
## Mean : 330.293 Mean : 249.421 Mean : 248.956 Mean : 339.846
## 3rd Qu.: 399.225 3rd Qu.: 328.011 3rd Qu.: 327.841 3rd Qu.: 413.788
## Max. :1204.725 Max. :1247.105 Max. :1244.755 Max. :1246.129
## NA's :1 NA's :1
## 2007-07 2007-08 2007-09 2007-10
## Min. : -0.037 Min. : 0.004 Min. : -0.007 Min. : -0.064
## 1st Qu.: 5.570 1st Qu.: 5.624 1st Qu.: 5.539 1st Qu.: 5.310
## Median : 77.851 Median : 77.753 Median : 143.051 Median : 80.582
## Mean : 248.787 Mean : 244.665 Mean : 352.554 Mean : 248.884
## 3rd Qu.: 328.961 3rd Qu.: 325.336 3rd Qu.: 436.082 3rd Qu.: 336.994
## Max. :1243.952 Max. :1223.323 Max. :1310.764 Max. :1244.453
## NA's :1 NA's :1 NA's :1
## 2007-11 2007-12 2008-01 2008-02
## Min. : -0.077 Min. : -0.060 Min. : 0.048 Min. : 0.061
## 1st Qu.: 4.478 1st Qu.: 4.385 1st Qu.: 4.475 1st Qu.: 4.485
## Median : 79.356 Median : 141.585 Median : 78.023 Median : 78.389
## Mean : 248.371 Mean : 321.588 Mean : 244.615 Mean : 252.803
## 3rd Qu.: 310.744 3rd Qu.: 380.594 3rd Qu.: 315.549 3rd Qu.: 327.005
## Max. :1241.895 Max. :1216.890 Max. :1223.050 Max. :1263.983
## NA's :1 NA's :1 NA's :1
## 2008-03 2008-04 2008-05 2008-06
## Min. : 0.058 Min. : 0.060 Min. : 0.046 Min. : 0.060
## 1st Qu.: 5.170 1st Qu.: 4.689 1st Qu.: 4.629 1st Qu.: 5.742
## Median : 137.453 Median : 77.555 Median : 78.506 Median : 142.057
## Mean : 345.604 Mean : 264.435 Mean : 260.323 Mean : 359.667
## 3rd Qu.: 427.601 3rd Qu.: 343.909 3rd Qu.: 337.515 3rd Qu.: 440.656
## Max. :1274.031 Max. :1322.146 Max. :1301.591 Max. :1320.728
## NA's :1 NA's :1
## 2008-07 2008-08 2008-09 2008-10
## Min. : 0.069 Min. : 0.023 Min. : 0.021 Min. : 0.054
## 1st Qu.: 6.019 1st Qu.: 5.887 1st Qu.: 5.691 1st Qu.: 5.087
## Median : 80.061 Median : 79.141 Median : 141.907 Median : 82.486
## Mean : 266.027 Mean : 250.383 Mean : 362.099 Mean : 262.440
## 3rd Qu.: 345.730 3rd Qu.: 330.629 3rd Qu.: 453.430 3rd Qu.: 330.996
## Max. :1330.103 Max. :1251.905 Max. :1338.015 Max. :1312.172
## NA's :1 NA's :1 NA's :1
## 2008-11 2008-12 2009-01 2009-02
## Min. : 0.002 Min. : -0.015 Min. : -0.034 Min. : -0.02
## 1st Qu.: 4.829 1st Qu.: 4.689 1st Qu.: 4.186 1st Qu.: 4.38
## Median : 82.774 Median : 140.810 Median : 76.874 Median : 76.59
## Mean : 248.871 Mean : 319.882 Mean : 236.303 Mean : 247.65
## 3rd Qu.: 317.084 3rd Qu.: 386.255 3rd Qu.: 307.814 3rd Qu.: 327.49
## Max. :1244.354 Max. :1232.655 Max. :1181.534 Max. :1238.24
## NA's :1 NA's :1 NA's :1
## 2009-03 2009-04 2009-05 2009-06
## Min. : -0.050 Min. : -0.048 Min. : -0.058 Min. : -0.079
## 1st Qu.: 5.003 1st Qu.: 4.720 1st Qu.: 4.763 1st Qu.: 5.845
## Median : 142.893 Median : 76.833 Median : 78.358 Median : 142.006
## Mean : 330.142 Mean : 251.603 Mean : 245.116 Mean : 333.194
## 3rd Qu.: 410.455 3rd Qu.: 329.093 3rd Qu.: 322.303 3rd Qu.: 410.482
## Max. :1210.912 Max. :1258.037 Max. :1225.608 Max. :1233.085
## NA's :1 NA's :1
## 2009-07 2009-08 2009-09 2009-10
## Min. : -0.094 Min. : -0.044 Min. : -0.035 Min. : -0.014
## 1st Qu.: 6.298 1st Qu.: 6.033 1st Qu.: 5.693 1st Qu.: 4.883
## Median : 74.558 Median : 75.604 Median : 139.087 Median : 75.178
## Mean : 241.006 Mean : 239.427 Mean : 346.687 Mean : 258.811
## 3rd Qu.: 312.185 3rd Qu.: 307.442 3rd Qu.: 437.332 3rd Qu.: 347.086
## Max. :1205.079 Max. :1197.158 Max. :1291.564 Max. :1294.064
## NA's :1 NA's :1 NA's :1
## 2009-11 2009-12 2010-01 2010-02
## Min. : -0.022 Min. : -0.022 Min. : 0.004 Min. : -0.022
## 1st Qu.: 4.323 1st Qu.: 4.784 1st Qu.: 4.034 1st Qu.: 4.062
## Median : 70.997 Median : 126.066 Median : 71.588 Median : 72.238
## Mean : 243.363 Mean : 312.962 Mean : 237.255 Mean : 242.244
## 3rd Qu.: 328.913 3rd Qu.: 386.659 3rd Qu.: 322.769 3rd Qu.: 324.137
## Max. :1216.824 Max. :1207.845 Max. :1186.271 Max. :1211.228
## NA's :1 NA's :1 NA's :1
## 2010-03 2010-04 2010-05 2010-06
## Min. : 0.012 Min. : 0.007 Min. : 0.013 Min. : 0.008
## 1st Qu.: 5.274 1st Qu.: 5.130 1st Qu.: 5.086 1st Qu.: 6.436
## Median : 131.252 Median : 72.370 Median : 70.785 Median : 129.144
## Mean : 332.726 Mean : 253.446 Mean : 248.231 Mean : 335.964
## 3rd Qu.: 428.420 3rd Qu.: 340.770 3rd Qu.: 339.579 3rd Qu.: 426.769
## Max. :1225.556 Max. :1267.226 Max. :1241.148 Max. :1244.409
## NA's :1 NA's :1
## 2010-07 2010-08 2010-09 2010-10
## Min. : 0.001 Min. : -0.015 Min. : 0.001 Min. : 0.009
## 1st Qu.: 6.531 1st Qu.: 6.281 1st Qu.: 5.567 1st Qu.: 5.006
## Median : 64.950 Median : 68.388 Median : 132.892 Median : 69.340
## Mean : 241.180 Mean : 235.947 Mean : 346.524 Mean : 261.255
## 3rd Qu.: 319.462 3rd Qu.: 310.399 3rd Qu.: 451.361 3rd Qu.: 347.832
## Max. :1205.901 Max. :1179.745 Max. :1293.117 Max. :1306.271
## NA's :1 NA's :1 NA's :1
## 2010-11 2010-12 2011-01 2011-02
## Min. : 0.022 Min. : -0.004 Min. : -0.028 Min. : 0.008
## 1st Qu.: 4.402 1st Qu.: 4.466 1st Qu.: 4.039 1st Qu.: 4.329
## Median : 69.166 Median : 121.422 Median : 69.750 Median : 68.579
## Mean : 248.748 Mean : 312.917 Mean : 230.680 Mean : 244.133
## 3rd Qu.: 331.437 3rd Qu.: 388.385 3rd Qu.: 300.831 3rd Qu.: 314.647
## Max. :1243.730 Max. :1216.262 Max. :1153.413 Max. :1220.663
## NA's :1 NA's :1 NA's :1
## 2011-03 2011-04 2011-05 2011-06
## Min. : 0.05 Min. : 0.036 Min. : 0.050 Min. : 0.054
## 1st Qu.: 6.03 1st Qu.: 5.224 1st Qu.: 5.338 1st Qu.: 6.926
## Median : 134.37 Median : 73.384 Median : 74.216 Median : 135.581
## Mean : 345.17 Mean : 262.660 Mean : 260.582 Mean : 353.331
## 3rd Qu.: 448.56 3rd Qu.: 345.904 3rd Qu.: 345.411 3rd Qu.: 452.923
## Max. :1286.66 Max. :1313.283 Max. :1302.884 Max. :1302.414
## NA's :1 NA's :1
## 2011-07 2011-08 2011-09 2011-10
## Min. : 0.067 Min. : 0.052 Min. : 0.043 Min. : 0.032
## 1st Qu.: 6.911 1st Qu.: 7.067 1st Qu.: 6.660 1st Qu.: 5.513
## Median : 71.735 Median : 70.853 Median : 136.901 Median : 70.508
## Mean : 257.228 Mean : 248.259 Mean : 362.555 Mean : 269.648
## 3rd Qu.: 328.941 3rd Qu.: 320.386 3rd Qu.: 469.204 3rd Qu.: 358.284
## Max. :1286.107 Max. :1241.268 Max. :1348.754 Max. :1348.222
## NA's :1 NA's :1 NA's :1
sapply(mbta, function(x) sum(is.na(x)))
## ...1 mode 2007-01 2007-02 2007-03 2007-04 2007-05 2007-06 2007-07 2007-08
## 0 0 1 1 0 1 1 0 1 1
## 2007-09 2007-10 2007-11 2007-12 2008-01 2008-02 2008-03 2008-04 2008-05 2008-06
## 0 1 1 0 1 1 0 1 1 0
## 2008-07 2008-08 2008-09 2008-10 2008-11 2008-12 2009-01 2009-02 2009-03 2009-04
## 1 1 0 1 1 0 1 1 0 1
## 2009-05 2009-06 2009-07 2009-08 2009-09 2009-10 2009-11 2009-12 2010-01 2010-02
## 1 0 1 1 0 1 1 0 1 1
## 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12
## 0 1 1 0 1 1 0 1 1 0
## 2011-01 2011-02 2011-03 2011-04 2011-05 2011-06 2011-07 2011-08 2011-09 2011-10
## 1 1 0 1 1 0 1 1 0 1
# Load necessary package
library(dplyr)
mbta <- mbta %>%
slice(-c(1, 7, 11))
mbta <- mbta %>%
select(-1)
dim(mbta)
## [1] 8 59
# Load necessary package
library(tidyverse)
mbta <- mbta %>%
pivot_longer(cols = -mode, names_to = "date", values_to = "thou_riders")
dim(mbta)
## [1] 464 3
# Load necessary package
library(tidyverse)
mbta <- mbta %>%
separate(date, into = c("year", "month"), sep = "-")
head(mbta)
## # A tibble: 6 × 4
## mode year month thou_riders
## <chr> <chr> <chr> <dbl>
## 1 Boat 2007 01 40010
## 2 Boat 2007 02 3.6
## 3 Boat 2007 03 40
## 4 Boat 2007 04 4.3
## 5 Boat 2007 05 4.9
## 6 Boat 2007 06 5.8
erroneous_row <- which(mbta$mode == "Boat" & mbta$thou_riders > 40000)
if (length(erroneous_row) > 0) {
mbta$thou_riders[erroneous_row] <- 4
}
filtered_data <- mbta %>% filter(mode == "Boat")
print(filtered_data)
## # A tibble: 58 × 4
## mode year month thou_riders
## <chr> <chr> <chr> <dbl>
## 1 Boat 2007 01 4
## 2 Boat 2007 02 3.6
## 3 Boat 2007 03 40
## 4 Boat 2007 04 4.3
## 5 Boat 2007 05 4.9
## 6 Boat 2007 06 5.8
## 7 Boat 2007 07 6.52
## 8 Boat 2007 08 6.57
## 9 Boat 2007 09 5.47
## 10 Boat 2007 10 5.14
## # ℹ 48 more rows
# Load necessary package
library(dplyr)
average_ridership <- mbta %>%
group_by(mode) %>%
summarise(avg_ridership = mean(thou_riders, na.rm = TRUE))
# Show the results
print(average_ridership)
## # A tibble: 8 × 2
## mode avg_ridership
## <chr> <dbl>
## 1 Boat 5.07
## 2 Bus 359.
## 3 Commuter Rail 137.
## 4 Heavy Rail 489.
## 5 Light Rail 233.
## 6 Private Bus 3.35
## 7 RIDE 6.60
## 8 Trackless Trolley 12.1
january_avg_ridership <- mbta %>%
filter(month == "01") %>%
group_by(mode) %>%
summarise(avg_ridership = mean(thou_riders, na.rm = TRUE))
# Show the results
print(january_avg_ridership)
## # A tibble: 8 × 2
## mode avg_ridership
## <chr> <dbl>
## 1 Boat 3.31
## 2 Bus 342.
## 3 Commuter Rail 137.
## 4 Heavy Rail 460.
## 5 Light Rail 217.
## 6 Private Bus 3.47
## 7 RIDE 5.94
## 8 Trackless Trolley 12.5
largest_boat_ridership_year <- mbta %>%
filter(mode == "Boat") %>%
group_by(year) %>%
summarize(total_riders = sum(thou_riders, na.rm = TRUE)) %>%
arrange(desc(total_riders)) %>%
slice_head(n = 1)
# Show the results
print(largest_boat_ridership_year)
## # A tibble: 1 × 2
## year total_riders
## <chr> <dbl>
## 1 2007 93.1
busiest_month_heavy_rail <- mbta %>%
filter(mode == "Heavy Rail") %>%
group_by(month) %>%
summarize(avg_ridership = mean(thou_riders, na.rm = TRUE)) %>%
arrange(desc(avg_ridership)) %>%
slice_head(n = 1)
# Show the results
print(busiest_month_heavy_rail)
## # A tibble: 1 × 2
## month avg_ridership
## <chr> <dbl>
## 1 09 518.