Cyclistic is a bike-share program that features more than 5,800 bicycles and 600 docking stations. Cyclistic sets itself apart by also offering reclining bikes, hand tricycles, and cargo bikes, making bike-share more inclusive to people with disabilities and riders who can’t use a standard two-wheeled bike. The majority of riders opt for traditional bikes; about 8% of riders use the assistive options. Cyclistic users are more likely to ride for leisure, but about 30% use the bikes to commute to work each day.
In 2016, Cyclistic launched a successful bike-share offering. Since then, the program has grown to a fleet of 5,824 bicycles that are geotracked and locked into a network of 692 stations across Chicago.
Until now, Cyclistic’s marketing strategy relied on building general awareness and appealing to broad consumer segments. One approach that helped make these things possible was the flexibility of its pricing plans: single-ride passes, full-day passes, and annual memberships.
Cyclistic’s finance analysts have concluded that annual members are much more profitable than casual riders. Although the pricing flexibility helps Cyclistic attract more customers, the director of marketing believes that maximizing the number of annual members will be key to future growth.
Installation of packages
# Install only the packages that are not already present, so re-running the
# script does not download and reinstall everything each time.
required_packages <- c("Hmisc", "tidyverse", "ggplot2", "skimr", "lubridate")
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# Load all required packages
# The workspace is deliberately not wiped with rm(list = ls()): that call
# deletes the user's objects without detaching packages or resetting options,
# so it does not give a clean session. Restart R for a clean run instead.
#
# Attach order matters: Hmisc is attached before tidyverse so that the
# summarize() calls further down resolve to dplyr's version, not Hmisc's.
library("Hmisc")
library("tidyverse")
library("ggplot2")
library("skimr")
library("lubridate")
# Data upload to R
# Read the January and February 2022 trip exports, one CSV file per month
jan_2022 <- read.csv(file = "/cloud/project/202201-divvy-tripdata.csv")
feb_2022 <- read.csv(file = "/cloud/project/202202-divvy-tripdata.csv")
# Creating data frames
colnames(jan_2022)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(feb_2022)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
str(jan_2022)
## 'data.frame': 103770 obs. of 13 variables:
## $ ride_id : chr "C2F7DD78E82EC875" "A6CF8980A652D272" "BD0F91DFF741C66D" "CBB80ED419105406" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "classic_bike" "classic_bike" ...
## $ started_at : chr "2022-01-13 11:59:47" "2022-01-10 08:41:56" "2022-01-25 04:53:40" "2022-01-04 00:18:04" ...
## $ ended_at : chr "2022-01-13 12:02:44" "2022-01-10 08:46:17" "2022-01-25 04:58:01" "2022-01-04 00:33:00" ...
## $ start_station_name: chr "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Sheffield Ave & Fullerton Ave" "Clark St & Bryn Mawr Ave" ...
## $ start_station_id : chr "525" "525" "TA1306000016" "KA1504000151" ...
## $ end_station_name : chr "Clark St & Touhy Ave" "Clark St & Touhy Ave" "Greenview Ave & Fullerton Ave" "Paulina St & Montrose Ave" ...
## $ end_station_id : chr "RP-007" "RP-007" "TA1307000001" "TA1309000021" ...
## $ start_lat : num 42 42 41.9 42 41.9 ...
## $ start_lng : num -87.7 -87.7 -87.7 -87.7 -87.6 ...
## $ end_lat : num 42 42 41.9 42 41.9 ...
## $ end_lng : num -87.7 -87.7 -87.7 -87.7 -87.6 ...
## $ member_casual : chr "casual" "casual" "member" "casual" ...
str(feb_2022)
## 'data.frame': 115609 obs. of 13 variables:
## $ ride_id : chr "E1E065E7ED285C02" "1602DCDC5B30FFE3" "BE7DD2AF4B55C4AF" "A1789BDF844412BE" ...
## $ rideable_type : chr "classic_bike" "classic_bike" "classic_bike" "classic_bike" ...
## $ started_at : chr "2022-02-19 18:08:41" "2022-02-20 17:41:30" "2022-02-25 18:55:56" "2022-02-14 11:57:03" ...
## $ ended_at : chr "2022-02-19 18:23:56" "2022-02-20 17:45:56" "2022-02-25 19:09:34" "2022-02-14 12:04:00" ...
## $ start_station_name: chr "State St & Randolph St" "Halsted St & Wrightwood Ave" "State St & Randolph St" "Southport Ave & Waveland Ave" ...
## $ start_station_id : chr "TA1305000029" "TA1309000061" "TA1305000029" "13235" ...
## $ end_station_name : chr "Clark St & Lincoln Ave" "Southport Ave & Wrightwood Ave" "Canal St & Adams St" "Broadway & Sheridan Rd" ...
## $ end_station_id : chr "13179" "TA1307000113" "13011" "13323" ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.6 -87.6 -87.6 -87.7 -87.6 ...
## $ end_lat : num 41.9 41.9 41.9 42 41.9 ...
## $ end_lng : num -87.6 -87.7 -87.6 -87.6 -87.6 ...
## $ member_casual : chr "member" "member" "member" "member" ...
describe(jan_2022)
## jan_2022
##
## 13 Variables 103770 Observations
## --------------------------------------------------------------------------------
## ride_id
## n missing distinct
## 103770 0 103770
##
## lowest : 00010C6E382D644C 00019B33C06D5F19 0001F25D405EAD48 00023BD4A60D4C1B 000299C3B7253D12
## highest: FFFAB2A19178C495 FFFB937135789032 FFFD035553447DF2 FFFE4D9D52BDC81E FFFE5FA260E982B7
## --------------------------------------------------------------------------------
## rideable_type
## n missing distinct
## 103770 0 3
##
## Value classic_bike docked_bike electric_bike
## Frequency 55067 961 47742
## Proportion 0.531 0.009 0.460
## --------------------------------------------------------------------------------
## started_at
## n missing distinct
## 103770 0 100315
##
## lowest : 2022-01-01 00:00:05 2022-01-01 00:01:00 2022-01-01 00:01:16 2022-01-01 00:02:14 2022-01-01 00:02:35
## highest: 2022-01-31 23:50:09 2022-01-31 23:50:42 2022-01-31 23:53:21 2022-01-31 23:54:40 2022-01-31 23:58:37
## --------------------------------------------------------------------------------
## ended_at
## n missing distinct
## 103770 0 100047
##
## lowest : 2022-01-01 00:01:48 2022-01-01 00:04:02 2022-01-01 00:04:39 2022-01-01 00:08:06 2022-01-01 00:09:06
## highest: 2022-02-01 00:02:55 2022-02-01 00:06:18 2022-02-01 00:12:00 2022-02-01 00:12:04 2022-02-01 01:46:16
## --------------------------------------------------------------------------------
## start_station_name
## n missing distinct
## 87510 16260 758
##
## lowest : 2112 W Peterson Ave 63rd St Beach 900 W Harrison St Aberdeen St & Jackson Blvd Aberdeen St & Monroe St
## highest: Woodlawn & 103rd - Olive Harvey Vaccination Site Woodlawn Ave & 55th St Woodlawn Ave & Lake Park Ave Yates Blvd & 75th St Yates Blvd & 93rd St
## --------------------------------------------------------------------------------
## start_station_id
## n missing distinct
## 87510 16260 758
##
## lowest : 13001 13006 13008 13011 13016
## highest: TA1309000066 TA1309000067 WL-008 WL-011 WL-012
## --------------------------------------------------------------------------------
## end_station_name
## n missing distinct
## 85843 17927 724
##
## lowest : 2112 W Peterson Ave 63rd St Beach 900 W Harrison St Aberdeen St & Jackson Blvd Aberdeen St & Monroe St
## highest: Woodlawn & 103rd - Olive Harvey Vaccination Site Woodlawn Ave & 55th St Woodlawn Ave & Lake Park Ave Yates Blvd & 75th St Yates Blvd & 93rd St
## --------------------------------------------------------------------------------
## end_station_id
## n missing distinct
## 85843 17927 724
##
## lowest : 13001 13006 13008 13011 13016
## highest: TA1309000066 TA1309000067 WL-008 WL-011 WL-012
## --------------------------------------------------------------------------------
## start_lat
## n missing distinct Info Mean Gmd .05 .10
## 103770 0 30305 1 41.9 0.05118 41.79 41.84
## .25 .50 .75 .90 .95
## 41.88 41.89 41.93 41.95 41.97
##
## lowest : 41.65000 41.65707 41.65795 41.65841 41.65911
## highest: 42.06434 42.06468 42.06485 42.07000 45.63503
##
## Value 41.65 41.70 41.75 41.80 41.85 41.90 41.95 42.00 42.05 45.65
## Frequency 100 346 528 8164 12193 55650 23259 2824 705 1
## Proportion 0.001 0.003 0.005 0.079 0.118 0.536 0.224 0.027 0.007 0.000
##
## For the frequency table, variable is rounded to the nearest 0.05
## --------------------------------------------------------------------------------
## start_lng
## n missing distinct Info Mean Gmd .05 .10
## 103770 0 30261 1 -87.65 0.0338 -87.71 -87.69
## .25 .50 .75 .90 .95
## -87.66 -87.64 -87.63 -87.62 -87.60
##
## lowest : -87.83000 -87.82000 -87.81000 -87.80000 -87.79000
## highest: -87.53793 -87.53459 -87.53000 -87.52838 -73.79648
##
## Value -87.8 -87.6 -73.8
## Frequency 7344 96425 1
## Proportion 0.071 0.929 0.000
##
## For the frequency table, variable is rounded to the nearest 0.2
## --------------------------------------------------------------------------------
## end_lat
## n missing distinct Info Mean Gmd .05 .10
## 103684 86 768 1 41.9 0.05123 41.79 41.84
## .25 .50 .75 .90 .95
## 41.88 41.90 41.93 41.95 41.97
##
## lowest : 41.64850 41.65000 41.65841 41.65915 41.66000
## highest: 42.06000 42.06360 42.06431 42.06485 42.07000
## --------------------------------------------------------------------------------
## end_lng
## n missing distinct Info Mean Gmd .05 .10
## 103684 86 756 1 -87.65 0.03357 -87.71 -87.69
## .25 .50 .75 .90 .95
## -87.66 -87.64 -87.63 -87.62 -87.60
##
## lowest : -87.83000 -87.82000 -87.81000 -87.80000 -87.79000
## highest: -87.54851 -87.54609 -87.54000 -87.53000 -87.52000
## --------------------------------------------------------------------------------
## member_casual
## n missing distinct
## 103770 0 2
##
## Value casual member
## Frequency 18520 85250
## Proportion 0.178 0.822
## --------------------------------------------------------------------------------
describe(feb_2022)
## feb_2022
##
## 13 Variables 115609 Observations
## --------------------------------------------------------------------------------
## ride_id
## n missing distinct
## 115609 0 115609
##
## lowest : 00000123F60251E6 00019EDC63FCB69F 00021E08BEE8C0F0 000233E3BEE37FF1 0003526E72D4107F
## highest: FFFCD61197B2616B FFFE72631B6B5F5B FFFE824C0A4D034B FFFEDEF1CDC464C7 FFFF3E42F538BE4E
## --------------------------------------------------------------------------------
## rideable_type
## n missing distinct
## 115609 0 3
##
## Value classic_bike docked_bike electric_bike
## Frequency 59414 1361 54834
## Proportion 0.514 0.012 0.474
## --------------------------------------------------------------------------------
## started_at
## n missing distinct
## 115609 0 110689
##
## lowest : 2022-02-01 00:03:18 2022-02-01 00:04:30 2022-02-01 00:08:00 2022-02-01 00:08:02 2022-02-01 00:09:52
## highest: 2022-02-28 23:53:59 2022-02-28 23:54:48 2022-02-28 23:57:57 2022-02-28 23:58:38 2022-02-28 23:58:44
## --------------------------------------------------------------------------------
## ended_at
## n missing distinct
## 115609 0 110615
##
## lowest : 2022-02-01 00:09:37 2022-02-01 00:17:12 2022-02-01 00:30:10 2022-02-01 00:34:33 2022-02-01 00:39:07
## highest: 2022-03-01 02:29:21 2022-03-01 02:29:35 2022-03-01 06:37:04 2022-03-01 07:21:57 2022-03-01 08:55:17
## --------------------------------------------------------------------------------
## start_station_name
## n missing distinct
## 97029 18580 779
##
## lowest : 2112 W Peterson Ave 63rd St Beach 900 W Harrison St Aberdeen St & Jackson Blvd Aberdeen St & Monroe St
## highest: Wood St & Milwaukee Ave Wood St & Taylor St (Temp) Woodlawn Ave & 55th St Woodlawn Ave & Lake Park Ave Yates Blvd & 75th St
## --------------------------------------------------------------------------------
## start_station_id
## n missing distinct
## 97029 18580 779
##
## lowest : 13001 13006 13008 13011 13016
## highest: TA1309000067 Wilton Ave & Diversey Pkwy - Charging WL-008 WL-011 WL-012
## --------------------------------------------------------------------------------
## end_station_name
## n missing distinct
## 95254 20355 740
##
## lowest : 2112 W Peterson Ave 63rd St Beach 900 W Harrison St Aberdeen St & Jackson Blvd Aberdeen St & Monroe St
## highest: Wood St & Milwaukee Ave Wood St & Taylor St (Temp) Woodlawn Ave & 55th St Woodlawn Ave & Lake Park Ave Yates Blvd & 75th St
## --------------------------------------------------------------------------------
## end_station_id
## n missing distinct
## 95254 20355 741
##
## lowest : 13001 13006 13008 13011 13016
## highest: TA1309000067 Wilton Ave & Diversey Pkwy - Charging WL-008 WL-011 WL-012
## --------------------------------------------------------------------------------
## start_lat
## n missing distinct Info Mean Gmd .05 .10
## 115609 0 34524 1 41.89 0.054 41.79 41.80
## .25 .50 .75 .90 .95
## 41.88 41.89 41.92 41.95 41.97
##
## lowest : 41.64861 41.65000 41.65193 41.65707 41.65707
## highest: 42.06432 42.06435 42.06435 42.06485 42.07000
## --------------------------------------------------------------------------------
## start_lng
## n missing distinct Info Mean Gmd .05 .10
## 115609 0 34427 1 -87.65 0.0345 -87.71 -87.69
## .25 .50 .75 .90 .95
## -87.66 -87.64 -87.63 -87.61 -87.60
##
## lowest : -87.83000 -87.82000 -87.81000 -87.80105 -87.80000
## highest: -87.53463 -87.53079 -87.53043 -87.53000 -87.52848
## --------------------------------------------------------------------------------
## end_lat
## n missing distinct Info Mean Gmd .05 .10
## 115532 77 798 1 41.89 0.05408 41.79 41.80
## .25 .50 .75 .90 .95
## 41.88 41.89 41.92 41.95 41.97
##
## lowest : 41.65000 41.65667 41.65841 41.65841 41.65915
## highest: 42.06000 42.06360 42.06431 42.06485 42.07000
## --------------------------------------------------------------------------------
## end_lng
## n missing distinct Info Mean Gmd .05 .10
## 115532 77 785 1 -87.65 0.03449 -87.71 -87.69
## .25 .50 .75 .90 .95
## -87.66 -87.64 -87.63 -87.61 -87.60
##
## lowest : -87.83000 -87.82000 -87.81000 -87.80000 -87.79000
## highest: -87.54000 -87.53793 -87.53084 -87.53043 -87.53000
## --------------------------------------------------------------------------------
## member_casual
## n missing distinct
## 115609 0 2
##
## Value casual member
## Frequency 21416 94193
## Proportion 0.185 0.815
## --------------------------------------------------------------------------------
# Stack both months into one data frame and drop the four coordinate columns
# in the same pipeline, then profile the combined result.
all_trip <- bind_rows(jan_2022, feb_2022) %>%
  select(-all_of(c("start_lat", "start_lng", "end_lat", "end_lng")))
describe(all_trip)
## all_trip
##
## 9 Variables 219379 Observations
## --------------------------------------------------------------------------------
## ride_id
## n missing distinct
## 219379 0 219379
##
## lowest : 00000123F60251E6 00010C6E382D644C 00019B33C06D5F19 00019EDC63FCB69F 0001F25D405EAD48
## highest: FFFE5FA260E982B7 FFFE72631B6B5F5B FFFE824C0A4D034B FFFEDEF1CDC464C7 FFFF3E42F538BE4E
## --------------------------------------------------------------------------------
## rideable_type
## n missing distinct
## 219379 0 3
##
## Value classic_bike docked_bike electric_bike
## Frequency 114481 2322 102576
## Proportion 0.522 0.011 0.468
## --------------------------------------------------------------------------------
## started_at
## n missing distinct
## 219379 0 211004
##
## lowest : 2022-01-01 00:00:05 2022-01-01 00:01:00 2022-01-01 00:01:16 2022-01-01 00:02:14 2022-01-01 00:02:35
## highest: 2022-02-28 23:53:59 2022-02-28 23:54:48 2022-02-28 23:57:57 2022-02-28 23:58:38 2022-02-28 23:58:44
## --------------------------------------------------------------------------------
## ended_at
## n missing distinct
## 219379 0 210662
##
## lowest : 2022-01-01 00:01:48 2022-01-01 00:04:02 2022-01-01 00:04:39 2022-01-01 00:08:06 2022-01-01 00:09:06
## highest: 2022-03-01 02:29:21 2022-03-01 02:29:35 2022-03-01 06:37:04 2022-03-01 07:21:57 2022-03-01 08:55:17
## --------------------------------------------------------------------------------
## start_station_name
## n missing distinct
## 184539 34840 807
##
## lowest : 2112 W Peterson Ave 63rd St Beach 900 W Harrison St Aberdeen St & Jackson Blvd Aberdeen St & Monroe St
## highest: Woodlawn & 103rd - Olive Harvey Vaccination Site Woodlawn Ave & 55th St Woodlawn Ave & Lake Park Ave Yates Blvd & 75th St Yates Blvd & 93rd St
## --------------------------------------------------------------------------------
## start_station_id
## n missing distinct
## 184539 34840 807
##
## lowest : 13001 13006 13008 13011 13016
## highest: TA1309000067 Wilton Ave & Diversey Pkwy - Charging WL-008 WL-011 WL-012
## --------------------------------------------------------------------------------
## end_station_name
## n missing distinct
## 181097 38282 789
##
## lowest : 2112 W Peterson Ave 63rd St Beach 900 W Harrison St Aberdeen St & Jackson Blvd Aberdeen St & Monroe St
## highest: Woodlawn & 103rd - Olive Harvey Vaccination Site Woodlawn Ave & 55th St Woodlawn Ave & Lake Park Ave Yates Blvd & 75th St Yates Blvd & 93rd St
## --------------------------------------------------------------------------------
## end_station_id
## n missing distinct
## 181097 38282 789
##
## lowest : 13001 13006 13008 13011 13016
## highest: TA1309000067 Wilton Ave & Diversey Pkwy - Charging WL-008 WL-011 WL-012
## --------------------------------------------------------------------------------
## member_casual
## n missing distinct
## 219379 0 2
##
## Value casual member
## Frequency 39936 179443
## Proportion 0.182 0.818
## --------------------------------------------------------------------------------
Preparing data for analysis
Inspect all the columns in the newly created data frame
Split the start timestamp into separate date columns so that we have a clear idea of the time and day of each trip, and compute the total duration of each trip
# Derive calendar fields from the trip start time and compute each trip's
# duration. started_at / ended_at themselves are left untouched.
all_trip$date <- as.Date(all_trip$started_at)
# `date` is already a Date, so it is formatted directly.
all_trip$month <- format(all_trip$date, "%m")
all_trip$day <- format(all_trip$date, "%d")
all_trip$year <- format(all_trip$date, "%Y")
# NOTE(review): "%A" yields weekday names in the session locale, while the
# weekday ordering later in this script lists English names.
all_trip$day_of_week <- format(all_trip$date, "%A")
all_trip$hour <- lubridate::hour(all_trip$started_at)
# Pin the unit to seconds. With the default units = "auto", difftime() picks
# secs, mins, hours or days depending on the differences found in the data,
# whereas every later threshold (30, 600, 1200, ...) assumes seconds.
all_trip$ride_length <- difftime(
  all_trip$ended_at,
  all_trip$started_at,
  units = "secs"
)
# Check data type of all column
str(all_trip)
## 'data.frame': 219379 obs. of 16 variables:
## $ ride_id : chr "C2F7DD78E82EC875" "A6CF8980A652D272" "BD0F91DFF741C66D" "CBB80ED419105406" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "classic_bike" "classic_bike" ...
## $ started_at : chr "2022-01-13 11:59:47" "2022-01-10 08:41:56" "2022-01-25 04:53:40" "2022-01-04 00:18:04" ...
## $ ended_at : chr "2022-01-13 12:02:44" "2022-01-10 08:46:17" "2022-01-25 04:58:01" "2022-01-04 00:33:00" ...
## $ start_station_name: chr "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Sheffield Ave & Fullerton Ave" "Clark St & Bryn Mawr Ave" ...
## $ start_station_id : chr "525" "525" "TA1306000016" "KA1504000151" ...
## $ end_station_name : chr "Clark St & Touhy Ave" "Clark St & Touhy Ave" "Greenview Ave & Fullerton Ave" "Paulina St & Montrose Ave" ...
## $ end_station_id : chr "RP-007" "RP-007" "TA1307000001" "TA1309000021" ...
## $ member_casual : chr "casual" "casual" "member" "casual" ...
## $ date : Date, format: "2022-01-13" "2022-01-10" ...
## $ month : chr "01" "01" "01" "01" ...
## $ day : chr "13" "10" "25" "04" ...
## $ year : chr "2022" "2022" "2022" "2022" ...
## $ day_of_week : chr "Thursday" "Monday" "Tuesday" "Tuesday" ...
## $ hour : int 11 8 4 0 1 18 18 12 7 15 ...
## $ ride_length : 'difftime' num 177 261 261 896 ...
## ..- attr(*, "units")= chr "secs"
Convert ride_length to numeric for calculations on data
is.factor(all_trip$ride_length)
## [1] FALSE
# Convert the difftime to a plain number of seconds. Asking for the unit
# explicitly (instead of round-tripping through as.character(), which drops
# the unit) keeps the result in seconds whatever unit the difftime carries.
all_trip$ride_length <- as.numeric(all_trip$ride_length, units = "secs")
is.numeric(all_trip$ride_length)
## [1] TRUE
Remove trips that started at the "HQ QR" station (bikes taken out for quality control) and trips shorter than 30 seconds
# Build the cleaned data set: drop rides that start at the "HQ QR" station and
# rides shorter than 30 seconds.
# The mask is written to be NA-safe. With plain `==` / `<`, a missing station
# name or ride length makes the condition NA, and logical indexing with NA
# inserts all-NA rows into the result.
#   - %in% never returns NA, so a missing station name counts as "not HQ QR";
#   - a ride with a missing length is dropped, because it cannot be shown to
#     last at least 30 seconds.
keep_ride <- !(all_trip$start_station_name %in% "HQ QR") &
  !is.na(all_trip$ride_length) &
  all_trip$ride_length >= 30
all_trips_v2 <- all_trip[keep_ride, ]
# Count rides per duration bucket (ride_length is in seconds) for each rider
# type; one output column per bucket.
all_trips_v2 %>%
  group_by(member_casual) %>%
  summarize(
    `<10 min` = sum(ride_length < 600),
    `10-20 min` = sum(ride_length >= 600 & ride_length < 1200),
    `20-30 min` = sum(ride_length >= 1200 & ride_length < 1800),
    `30-60 min` = sum(ride_length >= 1800 & ride_length < 3600),
    `60-120 min` = sum(ride_length >= 3600 & ride_length < 7200),
    `>120 min` = sum(ride_length >= 7200)
  )
## # A tibble: 2 × 7
## member_casual `<10 min` `10-20 min` `20-30 min` `30-60 min` 60-120 m…¹ >120 …²
## <chr> <int> <int> <int> <int> <int> <int>
## 1 casual 18345 11762 4538 2966 960 767
## 2 member 113201 43328 12382 7071 551 512
## # … with abbreviated variable names ¹​`60-120 min`, ²​`>120 min`
# Label every ride with its duration bucket, stored as an ordered factor.
# Intervals are closed on the left ([600, 1200), ...); include.lowest = TRUE
# closes the last interval on the right as well.
all_trips_v2 <- all_trips_v2 %>%
  mutate(
    ride_length_timestamps = cut(
      ride_length,
      breaks = c(-Inf, 600, 1200, 1800, 3600, 7200, Inf),
      labels = c(
        "<10 min", "10-20 min", "20-30 min",
        "30-60 min", "60-120 min", ">120 min"
      ),
      right = FALSE,
      include.lowest = TRUE,
      ordered_result = TRUE
    )
  )
# Label every ride with the part of the day in which it started, based on the
# start hour (0-23). The ranges do not overlap, so clause order is irrelevant.
all_trips_v2 <- all_trips_v2 %>%
  mutate(
    time_of_day = case_when(
      hour >= 0 & hour < 6 ~ "Mid Night",
      hour >= 6 & hour < 9 ~ "Early Morning",
      hour >= 9 & hour < 12 ~ "Late Morning",
      hour >= 12 & hour < 15 ~ "Afternoon",
      hour >= 15 & hour < 18 ~ "Evening",
      hour >= 18 & hour <= 23 ~ "Night"
    )
  )
# Count rides per part of the day for each rider type; one column per label.
all_trips_v2 %>%
  group_by(member_casual) %>%
  summarize(
    `Early Morning` = sum(time_of_day == "Early Morning"),
    `Late Morning` = sum(time_of_day == "Late Morning"),
    `Afternoon` = sum(time_of_day == "Afternoon"),
    `Evening` = sum(time_of_day == "Evening"),
    `Night` = sum(time_of_day == "Night"),
    `Mid Night` = sum(time_of_day == "Mid Night")
  )
## # A tibble: 2 × 7
## member_casual `Early Morning` `Late Morning` Afternoon Evening Night Mid Nig…¹
## <chr> <int> <int> <int> <int> <int> <int>
## 1 casual 3049 5173 8406 10313 9732 2665
## 2 member 27741 24612 32491 47546 38500 6155
## # … with abbreviated variable name ¹​`Mid Night`
# Fix the order of the time-of-day labels (early morning first, mid night
# last) so tables and plots follow it instead of alphabetical order.
all_trips_v2$time_of_day <- factor(
  all_trips_v2$time_of_day,
  levels = c(
    "Early Morning", "Late Morning", "Afternoon",
    "Evening", "Night", "Mid Night"
  ),
  ordered = TRUE
)
#summarized data
describe(all_trips_v2)
## all_trips_v2
##
## 18 Variables 216383 Observations
## --------------------------------------------------------------------------------
## ride_id
## n missing distinct
## 216383 0 216383
##
## lowest : 00000123F60251E6 00010C6E382D644C 00019B33C06D5F19 00019EDC63FCB69F 0001F25D405EAD48
## highest: FFFE5FA260E982B7 FFFE72631B6B5F5B FFFE824C0A4D034B FFFEDEF1CDC464C7 FFFF3E42F538BE4E
## --------------------------------------------------------------------------------
## rideable_type
## n missing distinct
## 216383 0 3
##
## Value classic_bike docked_bike electric_bike
## Frequency 113399 2316 100668
## Proportion 0.524 0.011 0.465
## --------------------------------------------------------------------------------
## started_at
## n missing distinct
## 216383 0 208214
##
## lowest : 2022-01-01 00:00:05 2022-01-01 00:01:00 2022-01-01 00:01:16 2022-01-01 00:02:14 2022-01-01 00:02:35
## highest: 2022-02-28 23:53:59 2022-02-28 23:54:48 2022-02-28 23:57:57 2022-02-28 23:58:38 2022-02-28 23:58:44
## --------------------------------------------------------------------------------
## ended_at
## n missing distinct
## 216383 0 207896
##
## lowest : 2022-01-01 00:01:48 2022-01-01 00:04:39 2022-01-01 00:08:06 2022-01-01 00:09:06 2022-01-01 00:13:24
## highest: 2022-03-01 02:29:21 2022-03-01 02:29:35 2022-03-01 06:37:04 2022-03-01 07:21:57 2022-03-01 08:55:17
## --------------------------------------------------------------------------------
## start_station_name
## n missing distinct
## 182771 33612 807
##
## lowest : 2112 W Peterson Ave 63rd St Beach 900 W Harrison St Aberdeen St & Jackson Blvd Aberdeen St & Monroe St
## highest: Woodlawn & 103rd - Olive Harvey Vaccination Site Woodlawn Ave & 55th St Woodlawn Ave & Lake Park Ave Yates Blvd & 75th St Yates Blvd & 93rd St
## --------------------------------------------------------------------------------
## start_station_id
## n missing distinct
## 182771 33612 807
##
## lowest : 13001 13006 13008 13011 13016
## highest: TA1309000067 Wilton Ave & Diversey Pkwy - Charging WL-008 WL-011 WL-012
## --------------------------------------------------------------------------------
## end_station_name
## n missing distinct
## 179516 36867 779
##
## lowest : 2112 W Peterson Ave 63rd St Beach 900 W Harrison St Aberdeen St & Jackson Blvd Aberdeen St & Monroe St
## highest: Woodlawn & 103rd - Olive Harvey Vaccination Site Woodlawn Ave & 55th St Woodlawn Ave & Lake Park Ave Yates Blvd & 75th St Yates Blvd & 93rd St
## --------------------------------------------------------------------------------
## end_station_id
## n missing distinct
## 179516 36867 779
##
## lowest : 13001 13006 13008 13011 13016
## highest: TA1309000067 Wilton Ave & Diversey Pkwy - Charging WL-008 WL-011 WL-012
## --------------------------------------------------------------------------------
## member_casual
## n missing distinct
## 216383 0 2
##
## Value casual member
## Frequency 39338 177045
## Proportion 0.182 0.818
## --------------------------------------------------------------------------------
## date
## n missing distinct Info Mean Gmd .05
## 216383 0 59 1 2022-02-01 20.1 2022-01-04
## .10 .25 .50 .75 .90 .95
## 2022-01-08 2022-01-17 2022-02-03 2022-02-16 2022-02-24 2022-02-27
##
## lowest : 2022-01-01 2022-01-02 2022-01-03 2022-01-04 2022-01-05
## highest: 2022-02-24 2022-02-25 2022-02-26 2022-02-27 2022-02-28
## --------------------------------------------------------------------------------
## month
## n missing distinct
## 216383 0 2
##
## Value 1 2
## Frequency 102542 113841
## Proportion 0.474 0.526
## --------------------------------------------------------------------------------
## day
## n missing distinct
## 216383 0 31
##
## lowest : 01 02 03 04 05, highest: 27 28 29 30 31
## --------------------------------------------------------------------------------
## year
## n missing distinct value
## 216383 0 1 2022
##
## Value 2022
## Frequency 216383
## Proportion 1
## --------------------------------------------------------------------------------
## day_of_week
## n missing distinct
## 216383 0 7
##
## lowest : Friday Monday Saturday Sunday Thursday
## highest: Saturday Sunday Thursday Tuesday Wednesday
##
## Value Friday Monday Saturday Sunday Thursday Tuesday
## Frequency 28051 38122 26851 27042 29692 34790
## Proportion 0.130 0.176 0.124 0.125 0.137 0.161
##
## Value Wednesday
## Frequency 31835
## Proportion 0.147
## --------------------------------------------------------------------------------
## hour
## n missing distinct Info Mean Gmd .05 .10
## 216383 0 24 0.996 13.65 5.464 6 7
## .25 .50 .75 .90 .95
## 10 14 17 19 21
##
## lowest : 0 1 2 3 4, highest: 19 20 21 22 23
## --------------------------------------------------------------------------------
## ride_length
## n missing distinct Info Mean Gmd .05 .10
## 216383 0 5710 1 895.6 1005 148 194
## .25 .50 .75 .90 .95
## 297 484 832 1422 1933
##
## lowest : 30 31 32 33 34
## highest: 710261 854286 1586332 1624968 1756266
## --------------------------------------------------------------------------------
## ride_length_timestamps
## n missing distinct
## 216383 0 6
##
## lowest : <10 min 10-20 min 20-30 min 30-60 min 60-120 min
## highest: 10-20 min 20-30 min 30-60 min 60-120 min >120 min
##
## Value <10 min 10-20 min 20-30 min 30-60 min 60-120 min >120 min
## Frequency 131546 55090 16920 10037 1511 1279
## Proportion 0.608 0.255 0.078 0.046 0.007 0.006
## --------------------------------------------------------------------------------
## time_of_day
## n missing distinct
## 216383 0 6
##
## lowest : Early Morning Late Morning Afternoon Evening Night
## highest: Late Morning Afternoon Evening Night Mid Night
##
## Value Early Morning Late Morning Afternoon Evening
## Frequency 30790 29785 40897 57859
## Proportion 0.142 0.138 0.189 0.267
##
## Value Night Mid Night
## Frequency 48232 8820
## Proportion 0.223 0.041
## --------------------------------------------------------------------------------
Analysis on ride length
mean(all_trips_v2$ride_length) #straight average (total ride length / rides)
## [1] 895.562
median(all_trips_v2$ride_length) #midpoint number in the ascending array of ride lengths
## [1] 484
max(all_trips_v2$ride_length) #longest ride
## [1] 1756266
min(all_trips_v2$ride_length) #shortest ride
## [1] 30
summary(all_trips_v2$ride_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.0 297.0 484.0 895.6 832.0 1756266.0
Comparison between casual and annual members
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = mean)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 1730.3300
## 2 member 710.0832
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = median)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 641
## 2 member 456
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = max)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 1756266
## 2 member 89997
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = min)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 30
## 2 member 30
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
## all_trips_v2$member_casual all_trips_v2$day_of_week all_trips_v2$ride_length
## 1 casual Friday 1419.5282
## 2 member Friday 718.2178
## 3 casual Monday 1583.6239
## 4 member Monday 692.2583
## 5 casual Saturday 2035.0455
## 6 member Saturday 732.9314
## 7 casual Sunday 1859.4583
## 8 member Sunday 768.2466
## 9 casual Thursday 1949.2285
## 10 member Thursday 690.1049
## 11 casual Tuesday 1425.3263
## 12 member Tuesday 707.6212
## 13 casual Wednesday 1797.8949
## 14 member Wednesday 684.0684
# Fix the order of the weekdays (Sunday first) so that grouped output is
# listed in calendar order instead of alphabetical order.
all_trips_v2$day_of_week <- factor(
  all_trips_v2$day_of_week,
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  ),
  ordered = TRUE
)
# Average ride time by each day for members vs casual users in order
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
## all_trips_v2$member_casual all_trips_v2$day_of_week all_trips_v2$ride_length
## 1 casual Sunday 1859.4583
## 2 member Sunday 768.2466
## 3 casual Monday 1583.6239
## 4 member Monday 692.2583
## 5 casual Tuesday 1425.3263
## 6 member Tuesday 707.6212
## 7 casual Wednesday 1797.8949
## 8 member Wednesday 684.0684
## 9 casual Thursday 1949.2285
## 10 member Thursday 690.1049
## 11 casual Friday 1419.5282
## 12 member Friday 718.2178
## 13 casual Saturday 2035.0455
## 14 member Saturday 732.9314
# Number of rides and mean ride length for each rider type and weekday
all_trips_v2 %>%
  # weekday label derived from the start time with wday()
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(), # rides in the group
    average_duration = mean(ride_length) # mean ride_length in the group
  ) %>%
  # order the rows by rider type, then weekday
  arrange(member_casual, weekday)
## # A tibble: 14 × 4
## # Groups: member_casual [2]
## member_casual weekday number_of_rides average_duration
## <chr> <ord> <int> <dbl>
## 1 casual Sun 6635 1859.
## 2 casual Mon 6743 1584.
## 3 casual Tue 5112 1425.
## 4 casual Wed 4911 1798.
## 5 casual Thu 4364 1949.
## 6 casual Fri 5072 1420.
## 7 casual Sat 6501 2035.
## 8 member Sun 20407 768.
## 9 member Mon 31379 692.
## 10 member Tue 29678 708.
## 11 member Wed 26924 684.
## 12 member Thu 25328 690.
## 13 member Fri 22979 718.
## 14 member Sat 20350 733.
# Creating a visualization for number of trips taken
# Bar chart: number of rides per weekday, side by side for each rider type.
# Only the ride count is computed; the chart needs no average duration and no
# explicit row ordering. count() returns an ungrouped result.
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  count(member_casual, weekday, name = "number_of_rides") %>%
  ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge")
# Bar chart: average ride length per weekday, side by side for each rider
# type. Only the mean is computed; the chart needs no ride count and no
# explicit row ordering. .groups = "drop" returns an ungrouped summary.
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(average_duration = mean(ride_length), .groups = "drop") %>%
  ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge")
# Stacked bar chart: number of rides per rider type, split by bike type.
# count() replaces group_by() + summarise(n()) and returns an ungrouped
# result; geom_col() is the bar geom for pre-computed heights.
all_trips_v2 %>%
  count(member_casual, rideable_type, name = "number_of_rides") %>%
  ggplot(aes(x = member_casual, y = number_of_rides, fill = rideable_type)) +
  geom_col(width = 0.3)
# Bar chart: number of rides per duration bucket, side by side for each rider
# type. count() replaces group_by() + summarise(n()) and returns an ungrouped
# result.
all_trips_v2 %>%
  count(member_casual, ride_length_timestamps, name = "no_of_rides") %>%
  ggplot(aes(x = ride_length_timestamps, y = no_of_rides, fill = member_casual)) +
  geom_col(position = "dodge")
# Line chart: number of rides per part of the day, one line per rider type.
# count() replaces group_by() + summarise(n()) and returns an ungrouped
# result. The x axis is categorical, so geom_line() needs an explicit group
# to connect the points of each rider type.
all_trips_v2 %>%
  count(member_casual, time_of_day, name = "no_of_rides") %>%
  ggplot(aes(x = time_of_day, y = no_of_rides, color = member_casual)) +
  geom_point() +
  geom_line(aes(group = member_casual))