# Load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
# Load the ride summary CSV into a tibble; readr guesses the column types
data <- read_csv(file = "NYC-2016-Summary.csv")
## Rows: 276798 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): day_of_week, user_type
## dbl (3): duration, month, hour
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Three-letter month abbreviations in calendar order. The vector serves both
# as the lookup table for the numeric month and as the ordered factor levels.
month_levels <- c(
  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
# Use the numeric month as an index into month_levels, then store the
# resulting abbreviation as an ordered factor
data <- mutate(
  data,
  month = ordered(month_levels[month], levels = month_levels)
)
# Format the numeric hour as a zero-padded "HH:00" label and store it as an
# ordered factor whose levels run from "00:00" through "23:00"
data <- mutate(
  data,
  hour = factor(
    sprintf("%02d:00", hour),
    levels = sprintf("%02d:00", 0:23),
    ordered = TRUE
  )
)
# Weekday names in display order, with the week starting on Monday
day_levels <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday",
  "Friday", "Saturday", "Sunday"
)
# Store day_of_week as an ordered factor that follows day_levels
data <- mutate(
  data,
  day_of_week = ordered(day_of_week, levels = day_levels)
)
# Rider categories, listed explicitly so that "Subscriber" is the first level
user_levels <- c("Subscriber", "Customer")
# Store user_type as an unordered factor with those levels
data <- mutate(data, user_type = factor(user_type, levels = user_levels))
# Inspect the levels of each factor to confirm they are in the intended order
levels(data$month)
## [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
levels(data$hour)
## [1] "00:00" "01:00" "02:00" "03:00" "04:00" "05:00" "06:00" "07:00" "08:00"
## [10] "09:00" "10:00" "11:00" "12:00" "13:00" "14:00" "15:00" "16:00" "17:00"
## [19] "18:00" "19:00" "20:00" "21:00" "22:00" "23:00"
levels(data$day_of_week)
## [1] "Monday" "Tuesday" "Wednesday" "Thursday" "Friday" "Saturday"
## [7] "Sunday"
levels(data$user_type)
## [1] "Subscriber" "Customer"
# levels() only echoes the level vectors that were passed to factor(); it says
# nothing about the data values. Any value that did not match a level (for
# example a misspelled day name) was silently converted to NA by factor().
# Count the NAs in each converted column so such mismatches become visible.
# A non-zero count can also come from values that were already missing in the
# CSV, so treat it as a prompt to investigate rather than proof of a typo.
colSums(is.na(data[c("month", "hour", "day_of_week", "user_type")]))