monthly_trips <- read.csv("202201-divvy-tripdata.csv")
summary(monthly_trips)
## ride_id rideable_type started_at ended_at
## Length:103770 Length:103770 Length:103770 Length:103770
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_station_name start_station_id end_station_name end_station_id
## Length:103770 Length:103770 Length:103770 Length:103770
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_lat start_lng end_lat end_lng
## Min. :41.65 Min. :-87.83 Min. :41.65 Min. :-87.83
## 1st Qu.:41.88 1st Qu.:-87.66 1st Qu.:41.88 1st Qu.:-87.66
## Median :41.89 Median :-87.64 Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.65 Mean :41.90 Mean :-87.65
## 3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :45.64 Max. :-73.80 Max. :42.07 Max. :-87.52
## NA's :86 NA's :86
## member_casual
## Length:103770
## Class :character
## Mode :character
##
##
##
##
#load dplyr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#glimpse dataframe
glimpse(monthly_trips)
## Rows: 103,770
## Columns: 13
## $ ride_id <chr> "C2F7DD78E82EC875", "A6CF8980A652D272", "BD0F91DFF7…
## $ rideable_type <chr> "electric_bike", "electric_bike", "classic_bike", "…
## $ started_at <chr> "1/13/2022 11:59", "1/10/2022 8:41", "1/25/2022 4:5…
## $ ended_at <chr> "1/13/2022 12:02", "1/10/2022 8:46", "1/25/2022 4:5…
## $ start_station_name <chr> "Glenwood Ave & Touhy Ave", "Glenwood Ave & Touhy A…
## $ start_station_id <chr> "525", "525", "TA1306000016", "KA1504000151", "TA13…
## $ end_station_name <chr> "Clark St & Touhy Ave", "Clark St & Touhy Ave", "Gr…
## $ end_station_id <chr> "RP-007", "RP-007", "TA1307000001", "TA1309000021",…
## $ start_lat <dbl> 42.01280, 42.01276, 41.92560, 41.98359, 41.87785, 4…
## $ start_lng <dbl> -87.66591, -87.66597, -87.65371, -87.66915, -87.624…
## $ end_lat <dbl> 42.01256, 42.01256, 41.92533, 41.96151, 41.88462, 4…
## $ end_lng <dbl> -87.67437, -87.67437, -87.66580, -87.67139, -87.627…
## $ member_casual <chr> "casual", "casual", "member", "casual", "member", "…
monthly_trips[monthly_trips == '' | monthly_trips == ' ' | is.null(monthly_trips)] <- NA
Method based on https://stackoverflow.com/questions/20760547/removing-whitespace-from-a-whole-data-frame-in-r; response from user user10917479
Trim not done since external white spaces (if any) removed on loading with argument strip.white = TRUE.
library(stringr)
monthly_trips <- monthly_trips %>%
mutate(across(where(is.character), str_squish))
#load package
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
#change class from character to date_time, replace old data
monthly_trips$started_at <- mdy_hm(monthly_trips$started_at)
monthly_trips$ended_at <- mdy_hm(monthly_trips$ended_at)
#check to make sure column class are changed from character to date_time
glimpse(monthly_trips)
## Rows: 103,770
## Columns: 13
## $ ride_id <chr> "C2F7DD78E82EC875", "A6CF8980A652D272", "BD0F91DFF7…
## $ rideable_type <chr> "electric_bike", "electric_bike", "classic_bike", "…
## $ started_at <dttm> 2022-01-13 11:59:00, 2022-01-10 08:41:00, 2022-01-…
## $ ended_at <dttm> 2022-01-13 12:02:00, 2022-01-10 08:46:00, 2022-01-…
## $ start_station_name <chr> "Glenwood Ave & Touhy Ave", "Glenwood Ave & Touhy A…
## $ start_station_id <chr> "525", "525", "TA1306000016", "KA1504000151", "TA13…
## $ end_station_name <chr> "Clark St & Touhy Ave", "Clark St & Touhy Ave", "Gr…
## $ end_station_id <chr> "RP-007", "RP-007", "TA1307000001", "TA1309000021",…
## $ start_lat <dbl> 42.01280, 42.01276, 41.92560, 41.98359, 41.87785, 4…
## $ start_lng <dbl> -87.66591, -87.66597, -87.65371, -87.66915, -87.624…
## $ end_lat <dbl> 42.01256, 42.01256, 41.92533, 41.96151, 41.88462, 4…
## $ end_lng <dbl> -87.67437, -87.67437, -87.66580, -87.67139, -87.627…
## $ member_casual <chr> "casual", "casual", "member", "casual", "member", "…
#create started_at_month column; 1 = January
monthly_trips$started_at_month <- month(ymd_hms(monthly_trips$started_at))
## Warning: 17 failed to parse.
#group months to season in started_at_season column
#create copy of month column
monthly_trips$started_at_season <- ''
#combine months to seasons
#spring
monthly_trips$started_at_season[monthly_trips$started_at_month == 3 | monthly_trips$started_at_month == 4 | monthly_trips$started_at_month == 5] <- 'spring'
#summer
monthly_trips$started_at_season[monthly_trips$started_at_month == 6 | monthly_trips$started_at_month == 7 | monthly_trips$started_at_month == 8] <- 'summer'
#fall
monthly_trips$started_at_season[monthly_trips$started_at_month == 9 | monthly_trips$started_at_month == 10 | monthly_trips$started_at_month == 11] <- 'fall'
#winter
monthly_trips$started_at_season[monthly_trips$started_at_month == 1 | monthly_trips$started_at_month == 2 | monthly_trips$started_at_month == 12] <- 'winter'
#####2.2b Create hour columns
monthly_trips$start_time_hour <- hour(ymd_hms(monthly_trips$started_at))
## Warning: 17 failed to parse.
monthly_trips$end_time_hour <- hour(ymd_hms(monthly_trips$ended_at))
## Warning: 23 failed to parse.
#day of week, Sunday = 1; if Monday was day 1 then it becomes dfs <- wday(df1$col_name, week_start = 1)
monthly_trips$started_at_weekday_number <- wday(monthly_trips$started_at)
#instead of numbers, give actual days, class = ordered, factor
monthly_trips$started_at_weekday_name <- wday(monthly_trips$started_at, label = TRUE, abbr = FALSE)
monthly_trips$ride_length_min <- as.integer((difftime(monthly_trips$ended_at, monthly_trips$started_at, units = 'mins')))
glimpse(monthly_trips)
## Rows: 103,770
## Columns: 20
## $ ride_id <chr> "C2F7DD78E82EC875", "A6CF8980A652D272", "BD0…
## $ rideable_type <chr> "electric_bike", "electric_bike", "classic_b…
## $ started_at <dttm> 2022-01-13 11:59:00, 2022-01-10 08:41:00, 2…
## $ ended_at <dttm> 2022-01-13 12:02:00, 2022-01-10 08:46:00, 2…
## $ start_station_name <chr> "Glenwood Ave & Touhy Ave", "Glenwood Ave & …
## $ start_station_id <chr> "525", "525", "TA1306000016", "KA1504000151"…
## $ end_station_name <chr> "Clark St & Touhy Ave", "Clark St & Touhy Av…
## $ end_station_id <chr> "RP-007", "RP-007", "TA1307000001", "TA13090…
## $ start_lat <dbl> 42.01280, 42.01276, 41.92560, 41.98359, 41.8…
## $ start_lng <dbl> -87.66591, -87.66597, -87.65371, -87.66915, …
## $ end_lat <dbl> 42.01256, 42.01256, 41.92533, 41.96151, 41.8…
## $ end_lng <dbl> -87.67437, -87.67437, -87.66580, -87.67139, …
## $ member_casual <chr> "casual", "casual", "member", "casual", "mem…
## $ started_at_month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ started_at_season <chr> "winter", "winter", "winter", "winter", "win…
## $ start_time_hour <int> 11, 8, 4, 0, 1, 18, 18, 12, 7, 15, 18, 12, 1…
## $ end_time_hour <int> 12, 8, 4, 0, 1, 18, 18, 12, 8, 15, 18, 12, 1…
## $ started_at_weekday_number <dbl> 5, 2, 3, 3, 5, 3, 1, 7, 2, 6, 3, 7, 1, 5, 7,…
## $ started_at_weekday_name <ord> Thursday, Monday, Tuesday, Tuesday, Thursday…
## $ ride_length_min <int> 3, 5, 5, 15, 6, 3, 17, 12, 26, 8, 7, 13, 9, …
#Remove rows with missing values
monthly_trips_na <- na.omit(monthly_trips)
#Remove rows with outlier latitude, longitude, and trips < 1 minute duration
monthly_trips_v2 <- monthly_trips_na[!(monthly_trips_na$start_lat>43 | monthly_trips_na$end_lat<40 | monthly_trips_na$ride_length_min<1),]
glimpse(monthly_trips_v2)
## Rows: 79,484
## Columns: 20
## $ ride_id <chr> "C2F7DD78E82EC875", "A6CF8980A652D272", "BD0…
## $ rideable_type <chr> "electric_bike", "electric_bike", "classic_b…
## $ started_at <dttm> 2022-01-13 11:59:00, 2022-01-10 08:41:00, 2…
## $ ended_at <dttm> 2022-01-13 12:02:00, 2022-01-10 08:46:00, 2…
## $ start_station_name <chr> "Glenwood Ave & Touhy Ave", "Glenwood Ave & …
## $ start_station_id <chr> "525", "525", "TA1306000016", "KA1504000151"…
## $ end_station_name <chr> "Clark St & Touhy Ave", "Clark St & Touhy Av…
## $ end_station_id <chr> "RP-007", "RP-007", "TA1307000001", "TA13090…
## $ start_lat <dbl> 42.01280, 42.01276, 41.92560, 41.98359, 41.8…
## $ start_lng <dbl> -87.66591, -87.66597, -87.65371, -87.66915, …
## $ end_lat <dbl> 42.01256, 42.01256, 41.92533, 41.96151, 41.8…
## $ end_lng <dbl> -87.67437, -87.67437, -87.66580, -87.67139, …
## $ member_casual <chr> "casual", "casual", "member", "casual", "mem…
## $ started_at_month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ started_at_season <chr> "winter", "winter", "winter", "winter", "win…
## $ start_time_hour <int> 11, 8, 4, 0, 1, 18, 18, 12, 7, 15, 18, 12, 1…
## $ end_time_hour <int> 12, 8, 4, 0, 1, 18, 18, 12, 8, 15, 18, 12, 1…
## $ started_at_weekday_number <dbl> 5, 2, 3, 3, 5, 3, 1, 7, 2, 6, 3, 7, 1, 5, 7,…
## $ started_at_weekday_name <ord> Thursday, Monday, Tuesday, Tuesday, Thursday…
## $ ride_length_min <int> 3, 5, 5, 15, 6, 3, 17, 12, 26, 8, 7, 13, 9, …
sum_na_v1 <- sum(is.na(monthly_trips))
sum_na_v2 <- sum(is.na(monthly_trips_v2))
sum_na_v1_v2 <- c(v1 = sum_na_v1, v2 = sum_na_v2)
sum_na_v1_v2
## v1 v2
## 68603 0
nrow_v1 <- nrow(monthly_trips)
nrow_v2 <- nrow(monthly_trips_v2)
nrow_na <- nrow(monthly_trips_na)
percent_row_retained_na <- (nrow_na/nrow_v1)*100
percent_row_retained_v2 <- (nrow_v2/nrow_v1)*100
percent_row_retained <- c('NA' = percent_row_retained_na, V2 = percent_row_retained_v2)
percent_row_retained
## NA V2
## 77.18898 76.59632
#For each column, calculate percent NA (missing)
#Create a function to calculate percent NA
percent_na <- function(x) {(sum(is.na(x))/nrow(monthly_trips))*100}
#apply the function to the data frame, MARGIN = 2 means apply column_wise; MARGIN = 1 means apply row-wise
apply(monthly_trips, MARGIN = 2, FUN = percent_na)
## ride_id rideable_type started_at
## 0.00000000 0.00000000 0.00000000
## ended_at start_station_name start_station_id
## 0.00000000 15.66926857 15.66926857
## end_station_name end_station_id start_lat
## 17.27570589 17.27570589 0.00000000
## start_lng end_lat end_lng
## 0.00000000 0.08287559 0.08287559
## member_casual started_at_month started_at_season
## 0.00000000 0.01638238 0.00000000
## start_time_hour end_time_hour started_at_weekday_number
## 0.01638238 0.02216440 0.00000000
## started_at_weekday_name ride_length_min
## 0.00000000 0.00000000
member_trips <- filter(monthly_trips, member_casual == 'member')
casual_trips <- filter(monthly_trips, member_casual == 'casual')
apply(member_trips, MARGIN = 2, FUN = percent_na)
## ride_id rideable_type started_at
## 0.000000000 0.000000000 0.000000000
## ended_at start_station_name start_station_id
## 0.000000000 11.916738942 11.916738942
## end_station_name end_station_id start_lat
## 12.734894478 12.734894478 0.000000000
## start_lng end_lat end_lng
## 0.000000000 0.027946420 0.027946420
## member_casual started_at_month started_at_season
## 0.000000000 0.005782018 0.000000000
## start_time_hour end_time_hour started_at_weekday_number
## 0.005782018 0.015418714 0.000000000
## started_at_weekday_name ride_length_min
## 0.000000000 0.000000000
apply(casual_trips, MARGIN = 2, FUN = percent_na)
## ride_id rideable_type started_at
## 0.000000000 0.000000000 0.000000000
## ended_at start_station_name start_station_id
## 0.000000000 3.752529633 3.752529633
## end_station_name end_station_id start_lat
## 4.540811410 4.540811410 0.000000000
## start_lng end_lat end_lng
## 0.000000000 0.054929170 0.054929170
## member_casual started_at_month started_at_season
## 0.000000000 0.010600366 0.000000000
## start_time_hour end_time_hour started_at_weekday_number
## 0.010600366 0.006745688 0.000000000
## started_at_weekday_name ride_length_min
## 0.000000000 0.000000000
#Create function to calculate percent NA
percent_member_na <- function(x) {(sum(is.na(x))/nrow(member_trips))*100}
#Apply function column-wise to the members only dataframe
apply(member_trips, MARGIN = 2, FUN = percent_member_na)
## ride_id rideable_type started_at
## 0.000000000 0.000000000 0.000000000
## ended_at start_station_name start_station_id
## 0.000000000 14.505571848 14.505571848
## end_station_name end_station_id start_lat
## 15.501466276 15.501466276 0.000000000
## start_lng end_lat end_lng
## 0.000000000 0.034017595 0.034017595
## member_casual started_at_month started_at_season
## 0.000000000 0.007038123 0.000000000
## start_time_hour end_time_hour started_at_weekday_number
## 0.007038123 0.018768328 0.000000000
## started_at_weekday_name ride_length_min
## 0.000000000 0.000000000
#Create function to calculate percent NA
percent_casual_na <- function(x) {(sum(is.na(x))/nrow(casual_trips))*100}
#Apply function column-wise to the casuals only dataframe
apply(casual_trips, MARGIN = 2, FUN = percent_casual_na)
## ride_id rideable_type started_at
## 0.00000000 0.00000000 0.00000000
## ended_at start_station_name start_station_id
## 0.00000000 21.02591793 21.02591793
## end_station_name end_station_id start_lat
## 25.44276458 25.44276458 0.00000000
## start_lng end_lat end_lng
## 0.00000000 0.30777538 0.30777538
## member_casual started_at_month started_at_season
## 0.00000000 0.05939525 0.00000000
## start_time_hour end_time_hour started_at_weekday_number
## 0.05939525 0.03779698 0.00000000
## started_at_weekday_name ride_length_min
## 0.00000000 0.00000000