In this step, we will load all the necessary libraries for data manipulation and visualization operations.
library(tidyverse) # Data manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr) # Data wrangling
library(tidyr) # Data tidying
library(ggplot2) # Data visualization
library(readr) #Data reading
library(lubridate) # Date handling
library(janitor) # Data cleaning
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(skimr) # Data summarization
library(anytime) # Date Parsing
library(hms) # For time objects
##
## Attaching package: 'hms'
##
## The following object is masked from 'package:lubridate':
##
## hms
Next, we will load all the CSV files containing the ride data for the year 2023. We’ll also take a look at the first few rows of each dataset to understand their structure.
jan_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Jan_ride.csv")
head(jan_2023)
## ride_id rideable_type started_at ended_at
## 1 F96D5A74A3E41399 electric_bike 1/21/2023 20:05 1/21/2023 20:16
## 2 13CB7EB698CEDB88 classic_bike 1/10/2023 15:37 1/10/2023 15:46
## 3 BD88A2E670661CE5 electric_bike 1/2/2023 7:51 1/2/2023 8:05
## 4 C90792D034FED968 classic_bike 1/22/2023 10:52 1/22/2023 11:01
## 5 3397017529188E8A classic_bike 1/12/2023 13:58 1/12/2023 14:13
## 6 58E68156DAE3E311 electric_bike 1/31/2023 7:18 1/31/2023 7:21
## start_station_name start_station_id end_station_name
## 1 Lincoln Ave & Fullerton Ave TA1309000058 Hampden Ct & Diversey Ave
## 2 Kimbark Ave & 53rd St TA1309000037 Greenwood Ave & 47th St
## 3 Western Ave & Lunt Ave RP-005 Valli Produce - Evanston Plaza
## 4 Kimbark Ave & 53rd St TA1309000037 Greenwood Ave & 47th St
## 5 Kimbark Ave & 53rd St TA1309000037 Greenwood Ave & 47th St
## 6 Lakeview Ave & Fullerton Pkwy TA1309000019 Hampden Ct & Diversey Ave
## end_station_id start_lat start_lng end_lat end_lng member_casual
## 1 202480 41.92407 -87.64628 41.93000 -87.64000 member
## 2 TA1308000002 41.79957 -87.59475 41.80983 -87.59938 member
## 3 599 42.00857 -87.69048 42.03974 -87.69941 casual
## 4 TA1308000002 41.79957 -87.59475 41.80983 -87.59938 member
## 5 TA1308000002 41.79957 -87.59475 41.80983 -87.59938 member
## 6 202480 41.92607 -87.63886 41.93000 -87.64000 member
## ride_length day_of_week
## 1 0:10:51 Saturday
## 2 0:08:29 Tuesday
## 3 0:13:14 Monday
## 4 0:08:46 Sunday
## 5 0:15:19 Thursday
## 6 0:03:13 Tuesday
feb_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Feb_ride.csv")
head(feb_2023)
## ride_id rideable_type started_at ended_at
## 1 CBCD0D7777F0E45F classic_bike 2/14/2023 11:59 2/14/2023 12:13
## 2 F3EC5FCE5FF39DE9 electric_bike 2/15/2023 13:53 2/15/2023 13:59
## 3 E54C1F27FA9354FF classic_bike 2/19/2023 11:10 2/19/2023 11:35
## 4 3D561E04F739CC45 electric_bike 2/26/2023 16:12 2/26/2023 16:39
## 5 0CB4B4D53B2DBE05 electric_bike 2/20/2023 11:55 2/20/2023 12:05
## 6 C67EB62172C472EB classic_bike 2/24/2023 18:50 2/24/2023 18:56
## start_station_name start_station_id end_station_name
## 1 Southport Ave & Clybourn Ave TA1309000030 Clark St & Schiller St
## 2 Clarendon Ave & Gordon Ter 13379 Sheridan Rd & Lawrence Ave
## 3 Southport Ave & Clybourn Ave TA1309000030 Aberdeen St & Monroe St
## 4 Southport Ave & Clybourn Ave TA1309000030 Franklin St & Adams St (Temp)
## 5 Prairie Ave & Garfield Blvd TA1307000160 Cottage Grove Ave & 63rd St
## 6 Wells St & Concord Ln TA1308000050 Clybourn Ave & Division St
## end_station_id start_lat start_lng end_lat end_lng member_casual
## 1 TA1309000024 41.92077 -87.66371 41.90799 -87.63150 casual
## 2 TA1309000041 41.95788 -87.64958 41.96952 -87.65469 casual
## 3 13156 41.92077 -87.66371 41.88042 -87.65552 member
## 4 TA1309000008 41.92087 -87.66373 41.87943 -87.63550 member
## 5 KA1503000054 41.79483 -87.61879 41.78053 -87.60597 member
## 6 TA1307000115 41.91213 -87.63466 41.90461 -87.64055 member
## ride_length day_of_week
## 1 00:13:56 Tuesday
## 2 00:05:20 Wednesday
## 3 00:24:04 Sunday
## 4 00:27:50 Sunday
## 5 00:10:25 Monday
## 6 00:06:24 Friday
mar_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Mar_ride.csv")
head(mar_2023)
## ride_id rideable_type started_at ended_at
## 1 6842AA605EE9FBB3 electric_bike 3/16/2023 8:20 3/16/2023 8:22
## 2 FF7CF57CFE026D02 classic_bike 3/31/2023 12:28 3/31/2023 12:38
## 3 6B61B916032CB6D6 classic_bike 3/22/2023 14:09 3/22/2023 14:24
## 4 E55E61A5F1260040 electric_bike 3/9/2023 7:15 3/9/2023 7:26
## 5 123AAD676850F53C classic_bike 3/22/2023 17:47 3/22/2023 18:01
## 6 5929D3080983AF4F classic_bike 3/8/2023 19:58 3/8/2023 20:05
## start_station_name start_station_id
## 1 Clark St & Armitage Ave 13146
## 2 Orleans St & Chestnut St (NEXT Apts) 620
## 3 Desplaines St & Kinzie St TA1306000003
## 4 Walsh Park 18067
## 5 Orleans St & Chestnut St (NEXT Apts) 620
## 6 Rush St & Hubbard St KA1503000044
## end_station_name end_station_id start_lat start_lng end_lat
## 1 Larrabee St & Webster Ave 13193 41.91841 -87.63645 41.92182
## 2 Clark St & Randolph St TA1305000030 41.89820 -87.63754 41.88458
## 3 Sheffield Ave & Kingsbury St 13154 41.88872 -87.64445 41.91052
## 4 Sangamon St & Lake St TA1306000015 41.91448 -87.66801 41.88578
## 5 Halsted St & Wrightwood Ave TA1309000061 41.89820 -87.63754 41.92914
## 6 Wells St & Huron St TA1306000012 41.89017 -87.62619 41.89472
## end_lng member_casual ride_length day_of_week
## 1 -87.64414 member 0:02:18 Thursday
## 2 -87.63189 member 0:10:38 Friday
## 3 -87.65311 member 0:15:43 Wednesday
## 4 -87.65102 member 0:11:00 Thursday
## 5 -87.64908 member 0:14:27 Wednesday
## 6 -87.63436 member 0:06:55 Wednesday
apr_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Apr_ride.csv")
head(apr_2023)
## ride_id rideable_type started_at ended_at
## 1 5B6500E1E58655C0 classic_bike 4/10/2023 17:34 4/10/2023 18:02
## 2 AA65D25D69AF771F classic_bike 4/12/2023 12:29 4/12/2023 12:54
## 3 079FB2C196414482 electric_bike 4/13/2023 17:39 4/13/2023 17:40
## 4 599623864C871207 classic_bike 4/29/2023 20:57 4/29/2023 20:57
## 5 63ECC8A13D11A76A classic_bike 4/20/2023 17:03 4/20/2023 17:24
## 6 A396F54F4C1927D8 electric_bike 4/14/2023 22:29 4/14/2023 22:29
## start_station_name start_station_id
## 1 Avenue O & 134th St 20214
## 2 Cottage Grove Ave & 51st St TA1309000067
## 3 Morgan Ave & 14th Pl TA1306000002
## 4 Cottage Grove Ave & 51st St TA1309000067
## 5 California Ave & Division St 13256
## 6 California Ave & Milwaukee Ave 13084
## end_station_name end_station_id start_lat start_lng end_lat
## 1 Avenue O & 134th St 20214 41.65187 -87.53967 41.65187
## 2 Cottage Grove Ave & 51st St TA1309000067 41.80304 -87.60662 41.80304
## 3 Morgan Ave & 14th Pl TA1306000002 41.86243 -87.65115 41.86238
## 4 Cottage Grove Ave & 51st St TA1309000067 41.80304 -87.60662 41.80304
## 5 California Ave & Milwaukee Ave 13084 41.90303 -87.69747 41.92269
## 6 California Ave & Milwaukee Ave 13084 41.92269 -87.69713 41.92269
## end_lng member_casual ride_length day_of_week
## 1 -87.53967 member 00:28:01 Monday
## 2 -87.60662 member 00:24:14 Wednesday
## 3 -87.65106 member 00:01:34 Thursday
## 4 -87.60662 member 00:00:03 Saturday
## 5 -87.69715 casual 00:21:47 Thursday
## 6 -87.69715 member 00:00:03 Friday
may_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_May_ride.csv")
head(may_2023)
## ride_id rideable_type started_at ended_at
## 1 DDEB93BC2CE9AA77 classic_bike 5/10/2023 16:47 5/10/2023 16:59
## 2 C07B70172FC92F59 classic_bike 5/9/2023 18:30 5/9/2023 18:39
## 3 2BA66385DF8F815A classic_bike 5/30/2023 15:01 5/30/2023 15:17
## 4 31EFCCB05F12D8EF docked_bike 5/9/2023 14:13 5/9/2023 14:47
## 5 71DFF834E1D3CE0B classic_bike 5/6/2023 16:47 5/6/2023 16:52
## 6 2117485899B4CEA4 classic_bike 5/15/2023 12:47 5/15/2023 13:00
## start_station_name start_station_id
## 1 Carpenter St & Huron St 13196
## 2 Southport Ave & Clark St TA1308000047
## 3 Clinton St & Madison St TA1305000032
## 4 DuSable Lake Shore Dr & Monroe St 13300
## 5 Shore Dr & 55th St TA1308000009
## 6 Clark St & Schiller St TA1309000024
## end_station_name end_station_id start_lat start_lng end_lat
## 1 Damen Ave & Cortland St 13133 41.89456 -87.65345 41.91598
## 2 Southport Ave & Belmont Ave 13229 41.95708 -87.66420 41.93948
## 3 McClurg Ct & Ohio St TA1306000029 41.88275 -87.64119 41.89259
## 4 Adler Planetarium 13431 41.88096 -87.61674 41.86610
## 5 Harper Ave & 59th St KA1503000070 41.79521 -87.58071 41.78794
## 6 Franklin St & Lake St TA1307000111 41.90799 -87.63150 41.88584
## end_lng member_casual ride_length day_of_week
## 1 -87.67733 member 0:12:51 Wednesday
## 2 -87.66375 member 0:08:54 Tuesday
## 3 -87.61729 member 0:15:39 Tuesday
## 4 -87.60727 casual 0:33:40 Tuesday
## 5 -87.58832 member 0:04:51 Saturday
## 6 -87.63550 member 0:12:39 Monday
jun_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Jun_ride.csv")
head(jun_2023)
## ride_id rideable_type started_at ended_at
## 1 055E6783FCD761FD electric_bike 6/6/2023 20:58 6/6/2023 21:04
## 2 65BC2A1FC12CFFA4 classic_bike 6/28/2023 18:21 6/28/2023 18:56
## 3 4D149B57532FE9C9 classic_bike 6/30/2023 9:53 6/30/2023 9:53
## 4 BAE48AFCA5C36414 classic_bike 6/21/2023 13:23 6/21/2023 13:24
## 5 E3491C5273700783 classic_bike 6/21/2023 13:24 6/21/2023 13:40
## 6 4AE5E4B5B30DBFEF classic_bike 6/2/2023 23:39 6/2/2023 23:40
## start_station_name start_station_id
## 1 California Ave & Milwaukee Ave 13084
## 2 Cottage Grove Ave & 51st St TA1309000067
## 3 Western Ave & Roscoe St 15634
## 4 Cottage Grove Ave & 51st St TA1309000067
## 5 Cottage Grove Ave & 51st St TA1309000067
## 6 California Ave & Milwaukee Ave 13084
## end_station_name end_station_id start_lat start_lng end_lat
## 1 California Ave & Division St 13256 41.92262 -87.69711 41.90303
## 2 Cottage Grove Ave & 51st St TA1309000067 41.80304 -87.60662 41.80304
## 3 Western Ave & Roscoe St 15634 41.94303 -87.68729 41.94303
## 4 Cottage Grove Ave & 51st St TA1309000067 41.80304 -87.60662 41.80304
## 5 Cottage Grove Ave & 51st St TA1309000067 41.80304 -87.60662 41.80304
## 6 California Ave & Milwaukee Ave 13084 41.92269 -87.69715 41.92269
## end_lng member_casual ride_length day_of_week
## 1 -87.69747 member 0:05:47 Tuesday
## 2 -87.60662 member 0:34:40 Wednesday
## 3 -87.68729 member 0:00:22 Friday
## 4 -87.60662 member 0:00:19 Wednesday
## 5 -87.60662 member 0:16:01 Wednesday
## 6 -87.69715 member 0:00:45 Friday
jul_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Jul_ride.csv")
head(jul_2023)
## ride_id rideable_type started_at ended_at
## 1 9340B064F0AEE130 electric_bike 7/23/2023 20:06 7/23/2023 20:22
## 2 D1460EE3CE0D8AF8 classic_bike 7/23/2023 17:05 7/23/2023 17:18
## 3 DF41BE31B895A25E classic_bike 7/23/2023 10:14 7/23/2023 10:24
## 4 9624A293749EF703 electric_bike 7/21/2023 8:27 7/21/2023 8:32
## 5 2F68A6A4CDB4C99A classic_bike 7/8/2023 15:46 7/8/2023 15:58
## 6 9AEE973E6B941A9C classic_bike 7/10/2023 8:44 7/10/2023 8:49
## start_station_name start_station_id end_station_name
## 1 Kedzie Ave & 110th St 20204 Public Rack - Racine Ave & 109th Pl
## 2 Western Ave & Walton St KA1504000103 Milwaukee Ave & Grand Ave
## 3 Western Ave & Walton St KA1504000103 Damen Ave & Pierce Ave
## 4 Racine Ave & Randolph St 13155 Clinton St & Madison St
## 5 Clark St & Leland Ave TA1309000014 Montrose Harbor
## 6 Racine Ave & Randolph St 13155 Sangamon St & Lake St
## end_station_id start_lat start_lng end_lat end_lng member_casual
## 1 877 41.69241 -87.70091 41.69483 -87.65304 member
## 2 13033 41.89842 -87.68660 41.89158 -87.64838 member
## 3 TA1305000041 41.89842 -87.68660 41.90940 -87.67769 member
## 4 TA1305000032 41.88411 -87.65694 41.88275 -87.64119 member
## 5 TA1308000012 41.96709 -87.66729 41.96398 -87.63818 member
## 6 TA1306000015 41.88407 -87.65685 41.88578 -87.65102 member
## ride_length day_of_week
## 1 00:16:30 Sunday
## 2 00:13:30 Sunday
## 3 00:09:36 Sunday
## 4 00:04:56 Friday
## 5 00:11:26 Saturday
## 6 00:04:54 Monday
aug_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Aug_ride.csv")
head(aug_2023)
## ride_id rideable_type started_at ended_at
## 1 903C30C2D810A53B electric_bike 8/19/2023 15:41 8/19/2023 15:53
## 2 6400344C80D626CA electric_bike 8/11/2023 14:27 8/11/2023 14:34
## 3 B56F0D2EC8B33085 electric_bike 8/26/2023 15:49 8/26/2023 16:11
## 4 EE2EF7362A4CE72A electric_bike 8/30/2023 16:00 8/30/2023 16:08
## 5 82511E5639E5254E electric_bike 8/10/2023 12:18 8/10/2023 12:23
## 6 56D52B6F72CBF23C electric_bike 8/15/2023 21:54 8/15/2023 21:58
## start_station_name start_station_id
## 1 LaSalle St & Illinois St 13430
## 2 Clark St & Randolph St TA1305000030
## 3 Sheffield Ave & Kingsbury St 13154
## 4 Orleans St & Merchandise Mart Plaza TA1305000022
## 5 Orleans St & Merchandise Mart Plaza TA1305000022
## 6 Wells St & Elm St KA1504000135
## end_station_name end_station_id start_lat start_lng end_lat
## 1 Clark St & Elm St TA1307000039 41.89072 -87.63148 41.90297
## 2 Dearborn Pkwy & Delaware Pl TA1307000128 41.88497 -87.63144 41.89897
## 3 Sawyer Ave & Irving Park Rd KA1504000096 41.91070 -87.65320 41.95359
## 4 Dearborn Pkwy & Delaware Pl TA1307000128 41.88837 -87.63732 41.89897
## 5 Dearborn Pkwy & Delaware Pl TA1307000128 41.88775 -87.63684 41.89897
## 6 Dearborn Pkwy & Delaware Pl TA1307000128 41.90311 -87.63472 41.89897
## end_lng member_casual ride_length day_of_week
## 1 -87.63128 member 00:11:43 Saturday
## 2 -87.62991 member 00:07:37 Friday
## 3 -87.70931 member 00:22:17 Saturday
## 4 -87.62991 member 00:08:17 Wednesday
## 5 -87.62991 member 00:05:10 Thursday
## 6 -87.62991 member 00:04:02 Tuesday
sep_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Sep_ride.csv")
head(sep_2023)
## ride_id rideable_type started_at ended_at
## 1 011C1903BF4E2E28 classic_bike 9/23/2023 0:27 9/23/2023 0:33
## 2 87DB80E048A1BF9F classic_bike 9/2/2023 9:26 9/2/2023 9:38
## 3 7C2EB7AF669066E3 electric_bike 9/25/2023 18:30 9/25/2023 18:41
## 4 57D197B010269CE3 classic_bike 9/13/2023 15:30 9/13/2023 15:39
## 5 8A2CEA7C8C8074D8 classic_bike 9/18/2023 15:58 9/18/2023 16:05
## 6 40D9EF382CC6C53D classic_bike 9/17/2023 11:58 9/17/2023 12:08
## start_station_name start_station_id end_station_name
## 1 Halsted St & Wrightwood Ave TA1309000061 Sheffield Ave & Wellington Ave
## 2 Clark St & Drummond Pl TA1307000142 Racine Ave & Fullerton Ave
## 3 Financial Pl & Ida B Wells Dr SL-010 Racine Ave & 15th St
## 4 Clark St & Drummond Pl TA1307000142 Racine Ave & Belmont Ave
## 5 Halsted St & Wrightwood Ave TA1309000061 Racine Ave & Fullerton Ave
## 6 Kedzie Ave & Milwaukee Ave 13085 California Ave & Milwaukee Ave
## end_station_id start_lat start_lng end_lat end_lng member_casual
## 1 TA1307000052 41.92914 -87.64908 41.93625 -87.65266 member
## 2 TA1306000026 41.93125 -87.64434 41.92557 -87.65842 member
## 3 13304 41.87506 -87.63314 41.86127 -87.65663 member
## 4 TA1308000019 41.93125 -87.64434 41.93974 -87.65887 member
## 5 TA1306000026 41.92914 -87.64908 41.92557 -87.65842 member
## 6 13084 41.92957 -87.70786 41.92269 -87.69715 member
## ride_length day_of_week
## 1 0:05:37 Saturday
## 2 0:11:36 Saturday
## 3 0:11:28 Monday
## 4 0:08:29 Wednesday
## 5 0:06:06 Monday
## 6 0:09:46 Sunday
oct_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Oct_ride.csv")
head(oct_2023)
## ride_id rideable_type started_at ended_at
## 1 4449097279F8BBE7 classic_bike 10/8/2023 10:36 10/8/2023 10:49
## 2 9CF060543CA7B439 electric_bike 10/11/2023 17:23 10/11/2023 17:36
## 3 667F21F4D6BDE69C electric_bike 10/12/2023 7:02 10/12/2023 7:06
## 4 F92714CC6B019B96 classic_bike 10/24/2023 19:13 10/24/2023 19:18
## 5 5E34BA5DE945A9CC classic_bike 10/9/2023 18:19 10/9/2023 18:30
## 6 F7D7420AFAC53CD9 electric_bike 10/4/2023 17:10 10/4/2023 17:25
## start_station_name start_station_id
## 1 Orleans St & Chestnut St (NEXT Apts) 620
## 2 Desplaines St & Kinzie St TA1306000003
## 3 Orleans St & Chestnut St (NEXT Apts) 620
## 4 Desplaines St & Kinzie St TA1306000003
## 5 Desplaines St & Kinzie St TA1306000003
## 6 Orleans St & Chestnut St (NEXT Apts) 620
## end_station_name end_station_id start_lat start_lng end_lat
## 1 Sheffield Ave & Webster Ave TA1309000033 41.89820 -87.63754 41.92154
## 2 Sheffield Ave & Webster Ave TA1309000033 41.88864 -87.64441 41.92154
## 3 Franklin St & Lake St TA1307000111 41.89807 -87.63751 41.88584
## 4 Franklin St & Lake St TA1307000111 41.88872 -87.64445 41.88584
## 5 Franklin St & Lake St TA1307000111 41.88872 -87.64445 41.88584
## 6 Sheffield Ave & Webster Ave TA1309000033 41.89812 -87.63753 41.92154
## end_lng member_casual ride_length day_of_week
## 1 -87.65382 member 0:12:53 Sunday
## 2 -87.65382 member 0:12:09 Wednesday
## 3 -87.63550 member 0:04:20 Thursday
## 4 -87.63550 member 0:05:26 Tuesday
## 5 -87.63550 member 0:11:30 Monday
## 6 -87.65382 member 0:14:22 Wednesday
nov_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Nov_ride.csv")
head(nov_2023)
## ride_id rideable_type started_at ended_at
## 1 4EAD8F1AD547356B electric_bike 11/30/2023 21:50 11/30/2023 22:13
## 2 6322270563BF5470 electric_bike 11/3/2023 9:44 11/3/2023 10:17
## 3 B37BDE091ECA38E0 electric_bike 11/30/2023 11:39 11/30/2023 11:40
## 4 CF0CA5DD26E4F90E classic_bike 11/8/2023 10:01 11/8/2023 10:27
## 5 EB8381AA641348DB classic_bike 11/3/2023 16:20 11/3/2023 16:54
## 6 B8CF14EA423D6886 electric_bike 11/30/2023 16:15 11/30/2023 16:39
## start_station_name start_station_id end_station_name
## 1 Millennium Park 13008 Pine Grove Ave & Waveland Ave
## 2 Broadway & Sheridan Rd 13323 Broadway & Sheridan Rd
## 3 State St & Pearson St TA1307000061 State St & Pearson St
## 4 Theater on the Lake TA1308000001 Theater on the Lake
## 5 Theater on the Lake TA1308000001 Theater on the Lake
## 6 Pine Grove Ave & Waveland Ave TA1307000150 Millennium Park
## end_station_id start_lat start_lng end_lat end_lng member_casual
## 1 TA1307000150 41.88110 -87.62408 41.94947 -87.64645 member
## 2 13323 41.95287 -87.65003 41.95283 -87.64999 member
## 3 TA1307000061 41.89753 -87.62869 41.89745 -87.62872 member
## 4 TA1308000001 41.92628 -87.63083 41.92628 -87.63083 member
## 5 TA1308000001 41.92628 -87.63083 41.92628 -87.63083 member
## 6 13008 41.94942 -87.64638 41.88103 -87.62408 member
## ride_length day_of_week
## 1 0:23:22 Thursday
## 2 0:33:13 Friday
## 3 0:00:24 Thursday
## 4 0:25:20 Wednesday
## 5 0:34:00 Friday
## 6 0:23:59 Thursday
dec_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Dec_ride.csv")
head(dec_2023)
## ride_id rideable_type started_at ended_at
## 1 84BFC1F137684EAB classic_bike 12/2/2023 23:12 12/2/2023 23:21
## 2 EEC92D30A70471E5 classic_bike 12/14/2023 13:43 12/14/2023 13:44
## 3 1C33464DEEB1F23C electric_bike 12/4/2023 11:57 12/4/2023 12:13
## 4 E0A61810C305E5EC classic_bike 12/4/2023 9:34 12/4/2023 9:35
## 5 0706CEB2E1924F3D classic_bike 12/4/2023 9:36 12/4/2023 9:36
## 6 EB09035006DCCB2C electric_bike 12/2/2023 6:06 12/2/2023 6:09
## start_station_name start_station_id end_station_name
## 1 DuSable Museum KA1503000075 Cottage Grove Ave & 51st St
## 2 California Ave & Division St 13256 California Ave & Division St
## 3 Chicago State University 20106 Chicago State University
## 4 Cottage Grove Ave & 51st St TA1309000067 Cottage Grove Ave & 51st St
## 5 Cottage Grove Ave & 51st St TA1309000067 Cottage Grove Ave & 51st St
## 6 Chicago State University 20106 Chicago State University
## end_station_id start_lat start_lng end_lat end_lng member_casual
## 1 TA1309000067 41.79157 -87.60785 41.80304 -87.60662 member
## 2 13256 41.90303 -87.69747 41.90303 -87.69747 casual
## 3 20106 41.71895 -87.60831 41.71896 -87.60830 casual
## 4 TA1309000067 41.80304 -87.60662 41.80304 -87.60662 casual
## 5 TA1309000067 41.80304 -87.60662 41.80304 -87.60662 casual
## 6 20106 41.71896 -87.60834 41.71896 -87.60830 casual
## ride_length day_of_week
## 1 00:08:10 Saturday
## 2 00:01:00 Thursday
## 3 00:16:55 Monday
## 4 00:01:34 Monday
## 5 00:00:13 Monday
## 6 00:02:34 Saturday
Before combining the datasets, we ensure that the data types of corresponding columns are consistent using the compare_df_cols() function to identify any mismatches.
compare_df_cols(jan_2023,feb_2023,mar_2023,apr_2023,may_2023,jun_2023,jul_2023,
aug_2023,sep_2023,oct_2023,nov_2023,dec_2023, return = "mismatch")
## [1] column_name jan_2023 feb_2023 mar_2023 apr_2023 may_2023
## [7] jun_2023 jul_2023 aug_2023 sep_2023 oct_2023 nov_2023
## [13] dec_2023
## <0 rows> (or 0-length row.names)
We merge different csv files into single data for clearer analysis.
ride_2023 <- bind_rows(jan_2023,feb_2023,mar_2023,apr_2023,may_2023,jun_2023,jul_2023,
aug_2023,sep_2023,oct_2023,nov_2023,dec_2023)
After merging dataframe, we run some function to get summary of the dataframe.
summary(ride_2023)
## ride_id rideable_type started_at ended_at
## Length:4335767 Length:4335767 Length:4335767 Length:4335767
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## start_station_name start_station_id end_station_name end_station_id
## Length:4335767 Length:4335767 Length:4335767 Length:4335767
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## start_lat start_lng end_lat end_lng
## Min. :41.65 Min. :-87.84 Min. : 0.00 Min. :-87.84
## 1st Qu.:41.88 1st Qu.:-87.66 1st Qu.:41.88 1st Qu.:-87.66
## Median :41.90 Median :-87.64 Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.64 Mean :41.90 Mean :-87.64
## 3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :42.06 Max. :-87.53 Max. :42.06 Max. : 0.00
## member_casual ride_length day_of_week
## Length:4335767 Length:4335767 Length:4335767
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
skim_without_charts(ride_2023)
| Name | ride_2023 |
| Number of rows | 4335767 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1 | 8 | 16 | 0 | 4335360 | 0 |
| rideable_type | 0 | 1 | 11 | 13 | 0 | 3 | 0 |
| started_at | 0 | 1 | 13 | 16 | 0 | 442713 | 0 |
| ended_at | 0 | 1 | 13 | 16 | 0 | 443128 | 0 |
| start_station_name | 0 | 1 | 0 | 64 | 12 | 1535 | 0 |
| start_station_id | 0 | 1 | 0 | 35 | 12 | 1468 | 0 |
| end_station_name | 0 | 1 | 0 | 64 | 4048 | 1558 | 0 |
| end_station_id | 0 | 1 | 0 | 36 | 4048 | 1483 | 0 |
| member_casual | 0 | 1 | 6 | 6 | 0 | 2 | 0 |
| ride_length | 0 | 1 | 7 | 255 | 0 | 31049 | 0 |
| day_of_week | 0 | 1 | 6 | 9 | 0 | 7 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.04 | 41.65 | 41.88 | 41.90 | 41.93 | 42.06 |
| start_lng | 0 | 1 | -87.64 | 0.02 | -87.84 | -87.66 | -87.64 | -87.63 | -87.53 |
| end_lat | 0 | 1 | 41.90 | 0.06 | 0.00 | 41.88 | 41.90 | 41.93 | 42.06 |
| end_lng | 0 | 1 | -87.64 | 0.08 | -87.84 | -87.66 | -87.64 | -87.63 | 0.00 |
glimpse(ride_2023)
## Rows: 4,335,767
## Columns: 15
## $ ride_id <chr> "F96D5A74A3E41399", "13CB7EB698CEDB88", "BD88A2E670…
## $ rideable_type <chr> "electric_bike", "classic_bike", "electric_bike", "…
## $ started_at <chr> "1/21/2023 20:05", "1/10/2023 15:37", "1/2/2023 7:5…
## $ ended_at <chr> "1/21/2023 20:16", "1/10/2023 15:46", "1/2/2023 8:0…
## $ start_station_name <chr> "Lincoln Ave & Fullerton Ave", "Kimbark Ave & 53rd …
## $ start_station_id <chr> "TA1309000058", "TA1309000037", "RP-005", "TA130900…
## $ end_station_name <chr> "Hampden Ct & Diversey Ave", "Greenwood Ave & 47th …
## $ end_station_id <chr> "202480", "TA1308000002", "599", "TA1308000002", "T…
## $ start_lat <dbl> 41.92407, 41.79957, 42.00857, 41.79957, 41.79957, 4…
## $ start_lng <dbl> -87.64628, -87.59475, -87.69048, -87.59475, -87.594…
## $ end_lat <dbl> 41.93000, 41.80983, 42.03974, 41.80983, 41.80983, 4…
## $ end_lng <dbl> -87.64000, -87.59938, -87.69941, -87.59938, -87.599…
## $ member_casual <chr> "member", "member", "casual", "member", "member", "…
## $ ride_length <chr> "0:10:51", "0:08:29", "0:13:14", "0:08:46", "0:15:1…
## $ day_of_week <chr> "Saturday", "Tuesday", "Monday", "Sunday", "Thursda…
head(ride_2023)
## ride_id rideable_type started_at ended_at
## 1 F96D5A74A3E41399 electric_bike 1/21/2023 20:05 1/21/2023 20:16
## 2 13CB7EB698CEDB88 classic_bike 1/10/2023 15:37 1/10/2023 15:46
## 3 BD88A2E670661CE5 electric_bike 1/2/2023 7:51 1/2/2023 8:05
## 4 C90792D034FED968 classic_bike 1/22/2023 10:52 1/22/2023 11:01
## 5 3397017529188E8A classic_bike 1/12/2023 13:58 1/12/2023 14:13
## 6 58E68156DAE3E311 electric_bike 1/31/2023 7:18 1/31/2023 7:21
## start_station_name start_station_id end_station_name
## 1 Lincoln Ave & Fullerton Ave TA1309000058 Hampden Ct & Diversey Ave
## 2 Kimbark Ave & 53rd St TA1309000037 Greenwood Ave & 47th St
## 3 Western Ave & Lunt Ave RP-005 Valli Produce - Evanston Plaza
## 4 Kimbark Ave & 53rd St TA1309000037 Greenwood Ave & 47th St
## 5 Kimbark Ave & 53rd St TA1309000037 Greenwood Ave & 47th St
## 6 Lakeview Ave & Fullerton Pkwy TA1309000019 Hampden Ct & Diversey Ave
## end_station_id start_lat start_lng end_lat end_lng member_casual
## 1 202480 41.92407 -87.64628 41.93000 -87.64000 member
## 2 TA1308000002 41.79957 -87.59475 41.80983 -87.59938 member
## 3 599 42.00857 -87.69048 42.03974 -87.69941 casual
## 4 TA1308000002 41.79957 -87.59475 41.80983 -87.59938 member
## 5 TA1308000002 41.79957 -87.59475 41.80983 -87.59938 member
## 6 202480 41.92607 -87.63886 41.93000 -87.64000 member
## ride_length day_of_week
## 1 0:10:51 Saturday
## 2 0:08:29 Tuesday
## 3 0:13:14 Monday
## 4 0:08:46 Sunday
## 5 0:15:19 Thursday
## 6 0:03:13 Tuesday
str(ride_2023)
## 'data.frame': 4335767 obs. of 15 variables:
## $ ride_id : chr "F96D5A74A3E41399" "13CB7EB698CEDB88" "BD88A2E670661CE5" "C90792D034FED968" ...
## $ rideable_type : chr "electric_bike" "classic_bike" "electric_bike" "classic_bike" ...
## $ started_at : chr "1/21/2023 20:05" "1/10/2023 15:37" "1/2/2023 7:51" "1/22/2023 10:52" ...
## $ ended_at : chr "1/21/2023 20:16" "1/10/2023 15:46" "1/2/2023 8:05" "1/22/2023 11:01" ...
## $ start_station_name: chr "Lincoln Ave & Fullerton Ave" "Kimbark Ave & 53rd St" "Western Ave & Lunt Ave" "Kimbark Ave & 53rd St" ...
## $ start_station_id : chr "TA1309000058" "TA1309000037" "RP-005" "TA1309000037" ...
## $ end_station_name : chr "Hampden Ct & Diversey Ave" "Greenwood Ave & 47th St" "Valli Produce - Evanston Plaza" "Greenwood Ave & 47th St" ...
## $ end_station_id : chr "202480" "TA1308000002" "599" "TA1308000002" ...
## $ start_lat : num 41.9 41.8 42 41.8 41.8 ...
## $ start_lng : num -87.6 -87.6 -87.7 -87.6 -87.6 ...
## $ end_lat : num 41.9 41.8 42 41.8 41.8 ...
## $ end_lng : num -87.6 -87.6 -87.7 -87.6 -87.6 ...
## $ member_casual : chr "member" "member" "casual" "member" ...
## $ ride_length : chr "0:10:51" "0:08:29" "0:13:14" "0:08:46" ...
## $ day_of_week : chr "Saturday" "Tuesday" "Monday" "Sunday" ...
as_tibble(ride_2023)
## # A tibble: 4,335,767 × 15
## ride_id rideable_type started_at ended_at start_station_name start_station_id
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 F96D5A… electric_bike 1/21/2023… 1/21/20… Lincoln Ave & Ful… TA1309000058
## 2 13CB7E… classic_bike 1/10/2023… 1/10/20… Kimbark Ave & 53r… TA1309000037
## 3 BD88A2… electric_bike 1/2/2023 … 1/2/202… Western Ave & Lun… RP-005
## 4 C90792… classic_bike 1/22/2023… 1/22/20… Kimbark Ave & 53r… TA1309000037
## 5 339701… classic_bike 1/12/2023… 1/12/20… Kimbark Ave & 53r… TA1309000037
## 6 58E681… electric_bike 1/31/2023… 1/31/20… Lakeview Ave & Fu… TA1309000019
## 7 2F7194… electric_bike 1/15/2023… 1/15/20… Kimbark Ave & 53r… TA1309000037
## 8 DB1CF8… classic_bike 1/25/2023… 1/25/20… Kimbark Ave & 53r… TA1309000037
## 9 34EAB9… electric_bike 1/25/2023… 1/25/20… Kimbark Ave & 53r… TA1309000037
## 10 BC8AB1… classic_bike 1/6/2023 … 1/6/202… Kimbark Ave & 53r… TA1309000037
## # ℹ 4,335,757 more rows
## # ℹ 9 more variables: end_station_name <chr>, end_station_id <chr>,
## # start_lat <dbl>, start_lng <dbl>, end_lat <dbl>, end_lng <dbl>,
## # member_casual <chr>, ride_length <chr>, day_of_week <chr>
We add a column for ride length in minutes to perform meaningful arithmetic analysis.We also round the result to two decimal places.
# Function to convert HH:MM:SS format to seconds
time_to_seconds <- function(time_str) {
time_parts <- as.numeric(strsplit(time_str, ":")[[1]])
hours <- time_parts[1]
minutes <- time_parts[2]
seconds <- time_parts[3]
total_seconds <- hours * 3600 + minutes * 60 + seconds
return(total_seconds)
}
# Use mutate from dplyr to create ride_length_minutes
ride_2023 <- ride_2023 %>%
mutate(
ride_length_minutes = sapply(ride_length, time_to_seconds) / 6
)
## Warning: There were 66 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `ride_length_minutes = sapply(ride_length, time_to_seconds)/6`.
## Caused by warning in `FUN()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 65 remaining warnings.
#Rounding up decimal points to two places
ride_2023$ride_length_minutes=round(ride_2023$ride_length_minutes,2)
We make a copy so we have a place to start in case something goes wrong.
ride_2023V2 <- ride_2023
Now, we clean the data by removing null values, duplicate records and records with non-positive ride lengths.
#Remove the duplicate value
ride_2023V2 <- distinct(ride_2023)
summary(ride_2023V2)
## ride_id rideable_type started_at ended_at
## Length:4335767 Length:4335767 Length:4335767 Length:4335767
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_station_name start_station_id end_station_name end_station_id
## Length:4335767 Length:4335767 Length:4335767 Length:4335767
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_lat start_lng end_lat end_lng
## Min. :41.65 Min. :-87.84 Min. : 0.00 Min. :-87.84
## 1st Qu.:41.88 1st Qu.:-87.66 1st Qu.:41.88 1st Qu.:-87.66
## Median :41.90 Median :-87.64 Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.64 Mean :41.90 Mean :-87.64
## 3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :42.06 Max. :-87.53 Max. :42.06 Max. : 0.00
##
## member_casual ride_length day_of_week ride_length_minutes
## Length:4335767 Length:4335767 Length:4335767 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 56.17
## Mode :character Mode :character Mode :character Median : 98.00
## Mean : 158.96
## 3rd Qu.: 174.67
## Max. :14398.67
## NA's :66
# Remove values with null value
ride_2023V2 <- na.omit(ride_2023)
summary(ride_2023V2)
## ride_id rideable_type started_at ended_at
## Length:4335701 Length:4335701 Length:4335701 Length:4335701
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## start_station_name start_station_id end_station_name end_station_id
## Length:4335701 Length:4335701 Length:4335701 Length:4335701
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## start_lat start_lng end_lat end_lng
## Min. :41.65 Min. :-87.84 Min. : 0.00 Min. :-87.84
## 1st Qu.:41.88 1st Qu.:-87.66 1st Qu.:41.88 1st Qu.:-87.66
## Median :41.90 Median :-87.64 Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.64 Mean :41.90 Mean :-87.64
## 3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :42.06 Max. :-87.53 Max. :42.06 Max. : 0.00
## member_casual ride_length day_of_week ride_length_minutes
## Length:4335701 Length:4335701 Length:4335701 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 56.17
## Mode :character Mode :character Mode :character Median : 98.00
## Mean : 158.96
## 3rd Qu.: 174.67
## Max. :14398.67
#Remove values of ride_length which has value less than or equal to 0
ride_2023V2 <- ride_2023 [!(ride_2023$ride_length<=0),]
summary(ride_2023V2)
## ride_id rideable_type started_at ended_at
## Length:4335701 Length:4335701 Length:4335701 Length:4335701
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## start_station_name start_station_id end_station_name end_station_id
## Length:4335701 Length:4335701 Length:4335701 Length:4335701
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## start_lat start_lng end_lat end_lng
## Min. :41.65 Min. :-87.84 Min. : 0.00 Min. :-87.84
## 1st Qu.:41.88 1st Qu.:-87.66 1st Qu.:41.88 1st Qu.:-87.66
## Median :41.90 Median :-87.64 Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.64 Mean :41.90 Mean :-87.64
## 3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :42.06 Max. :-87.53 Max. :42.06 Max. : 0.00
## member_casual ride_length day_of_week ride_length_minutes
## Length:4335701 Length:4335701 Length:4335701 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 56.17
## Mode :character Mode :character Mode :character Median : 98.00
## Mean : 158.96
## 3rd Qu.: 174.67
## Max. :14398.67
We convert the ‘started_at’ column to a Date object and extract the month, day, and year for proper analysis.
date <- as.Date(ride_2023V2$started_at, format = "%m/%d/%Y %H:%M")
# Assuming ride_2023 is your data frame and started_at is a column in it
ride_2023V2$date <- as.Date(ride_2023V2$started_at, format = "%m/%d/%Y %H:%M")
ride_2023V2$month <- format(as.Date(ride_2023V2$date), "%m")
ride_2023V2$day <- format(as.Date(ride_2023V2$date), "%d")
ride_2023V2$year <- format(as.Date(ride_2023V2$date), "%Y")
summary(ride_2023V2)
## ride_id rideable_type started_at ended_at
## Length:4335701 Length:4335701 Length:4335701 Length:4335701
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## start_station_name start_station_id end_station_name end_station_id
## Length:4335701 Length:4335701 Length:4335701 Length:4335701
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## start_lat start_lng end_lat end_lng
## Min. :41.65 Min. :-87.84 Min. : 0.00 Min. :-87.84
## 1st Qu.:41.88 1st Qu.:-87.66 1st Qu.:41.88 1st Qu.:-87.66
## Median :41.90 Median :-87.64 Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.64 Mean :41.90 Mean :-87.64
## 3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :42.06 Max. :-87.53 Max. :42.06 Max. : 0.00
## member_casual ride_length day_of_week ride_length_minutes
## Length:4335701 Length:4335701 Length:4335701 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 56.17
## Mode :character Mode :character Mode :character Median : 98.00
## Mean : 158.96
## 3rd Qu.: 174.67
## Max. :14398.67
## date month day year
## Min. :2023-01-01 Length:4335701 Length:4335701 Length:4335701
## 1st Qu.:2023-05-20 Class :character Class :character Class :character
## Median :2023-07-20 Mode :character Mode :character Mode :character
## Mean :2023-07-15
## 3rd Qu.:2023-09-16
## Max. :2023-12-31
Ride Length Analysis: In this analysis, we analyze the max,min,average and median of ride length.
# Maximum ride length
ride_2023V2 %>% summarize(max_ride_length = max(ride_length_minutes))
## max_ride_length
## 1 14398.67
# Minimum ride length
ride_2023V2 %>% summarise(min_ride_length= min(ride_length_minutes))
## min_ride_length
## 1 0
# Average ride length
ride_2023V2 %>% summarise(avg_ride_length=mean(ride_length_minutes))
## avg_ride_length
## 1 158.958
# Median ride length
ride_2023V2 %>% summarise(med_ride_length=median(ride_length_minutes))
## med_ride_length
## 1 98
Ride count by other factors:
# Count Unique Rides by ride_id
ride_2023V2 %>% distinct(ride_id) %>% count()
## n
## 1 4335294
# Count number of rides by rideable type
ride_2023V2 %>% count(rideable_type)
## rideable_type n
## 1 classic_bike 2690707
## 2 docked_bike 76124
## 3 electric_bike 1568870
# Count number of rides by member type (member vs casual)
ride_2023V2 %>% count(member_casual)
## member_casual n
## 1 casual 1532770
## 2 member 2802931
# Count number of rides by rideable type and member type
ride_2023V2 %>% group_by(member_casual) %>% count(rideable_type) %>% arrange(n) %>% rename(total_rides = n)
## # A tibble: 5 × 3
## # Groups: member_casual [2]
## member_casual rideable_type total_rides
## <chr> <chr> <int>
## 1 casual docked_bike 76124
## 2 casual electric_bike 583641
## 3 casual classic_bike 873005
## 4 member electric_bike 985229
## 5 member classic_bike 1817702
# Count number of rides by month, rideable type, and member type
ride_2023V2 %>% group_by(month, member_casual, rideable_type) %>% summarise(no_of_ride = n(), avg_ride_length = mean(ride_length_minutes))
## `summarise()` has grouped output by 'month', 'member_casual'. You can override
## using the `.groups` argument.
## # A tibble: 56 × 5
## # Groups: month, member_casual [24]
## month member_casual rideable_type no_of_ride avg_ride_length
## <chr> <chr> <chr> <int> <dbl>
## 1 01 casual classic_bike 13860 172.
## 2 01 casual docked_bike 1682 376.
## 3 01 casual electric_bike 14079 97.5
## 4 01 member classic_bike 76359 106.
## 5 01 member electric_bike 42304 88.0
## 6 02 casual classic_bike 15446 202.
## 7 02 casual docked_bike 2151 429.
## 8 02 casual electric_bike 15621 114.
## 9 02 member classic_bike 74223 111.
## 10 02 member electric_bike 43933 92.0
## # ℹ 46 more rows
# Summarize member (Member, Casual) rides by month
ride_2023V2 %>% group_by(member_casual, month) %>% filter(member_casual == "member") %>% summarise(no_of_ride = n(), avg_ride_length = mean(ride_length_minutes))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
## # Groups: member_casual [1]
## member_casual month no_of_ride avg_ride_length
## <chr> <chr> <int> <dbl>
## 1 member 01 118663 99.6
## 2 member 02 118156 104.
## 3 member 03 155294 102.
## 4 member 04 213657 116.
## 5 member 05 286185 127.
## 6 member 06 314962 129.
## 7 member 07 328663 133.
## 8 member 08 351061 133.
## 9 member 09 309670 127.
## 10 member 10 273478 116.
## 11 member 11 202689 110.
## 12 member 12 130453 108.
ride_2023V2 %>% group_by(member_casual, month) %>% filter(member_casual == "casual") %>% summarise(no_of_ride = n(), avg_ride_length = mean(ride_length_minutes))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
## # Groups: member_casual [1]
## member_casual month no_of_ride avg_ride_length
## <chr> <chr> <int> <dbl>
## 1 casual 01 29621 148.
## 2 casual 02 33218 175.
## 3 casual 03 47399 166.
## 4 casual 04 110537 225.
## 5 casual 05 177037 243.
## 6 casual 06 219793 239.
## 7 casual 07 245292 251.
## 8 casual 08 233851 242.
## 9 casual 09 196962 234.
## 10 casual 10 130297 213.
## 11 casual 11 72077 178.
## 12 casual 12 36686 165.
Time Analysis: In this analysis, we focuses on understanding ride patterns over different times of the day, days of the weeks and months
# Average ride duration (ride_length) by rideable type
ride_2023V2 %>% group_by(rideable_type) %>%
summarise(avg_ride_length=mean(ride_length_minutes))
## # A tibble: 3 × 2
## rideable_type avg_ride_length
## <chr> <dbl>
## 1 classic_bike 171.
## 2 docked_bike 529.
## 3 electric_bike 121.
# Arranging the data of day of week in Sunday, Monday,...etc. format
ride_2023V2$day_of_week <- ordered(ride_2023V2$day_of_week,
levels=c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"))
# Analyze the distribution of rides across different days of the week
ride_2023V2 %>% group_by(day_of_week) %>% summarise(total_ride=n())
## # A tibble: 7 × 2
## day_of_week total_ride
## <ord> <int>
## 1 Sunday 563092
## 2 Monday 562673
## 3 Tuesday 631006
## 4 Wednesday 636539
## 5 Thursday 652182
## 6 Friday 628928
## 7 Saturday 661281
# Compare ride lengths between members
ride_2023V2 %>% group_by(member_casual) %>%
summarise(avg_ride_length=mean(ride_length_minutes),total = n())
## # A tibble: 2 × 3
## member_casual avg_ride_length total
## <chr> <dbl> <int>
## 1 casual 228. 1532770
## 2 member 121. 2802931
# Count the number of subscriber per month
ride_2023V2 %>% group_by(month) %>% filter(member_casual=="casual") %>% count(member_casual)
## # A tibble: 12 × 3
## # Groups: month [12]
## month member_casual n
## <chr> <chr> <int>
## 1 01 casual 29621
## 2 02 casual 33218
## 3 03 casual 47399
## 4 04 casual 110537
## 5 05 casual 177037
## 6 06 casual 219793
## 7 07 casual 245292
## 8 08 casual 233851
## 9 09 casual 196962
## 10 10 casual 130297
## 11 11 casual 72077
## 12 12 casual 36686
ride_2023V2 %>% group_by(month) %>% filter(member_casual=="member") %>% count(member_casual)
## # A tibble: 12 × 3
## # Groups: month [12]
## month member_casual n
## <chr> <chr> <int>
## 1 01 member 118663
## 2 02 member 118156
## 3 03 member 155294
## 4 04 member 213657
## 5 05 member 286185
## 6 06 member 314962
## 7 07 member 328663
## 8 08 member 351061
## 9 09 member 309670
## 10 10 member 273478
## 11 11 member 202689
## 12 12 member 130453
# Total no of ride per month
ride_2023V2 %>% group_by(month) %>% summarise(total_ride=n())
## # A tibble: 12 × 2
## month total_ride
## <chr> <int>
## 1 01 148284
## 2 02 151374
## 3 03 202693
## 4 04 324194
## 5 05 463222
## 6 06 534755
## 7 07 573955
## 8 08 584912
## 9 09 506632
## 10 10 403775
## 11 11 274766
## 12 12 167139
# Month-wise maximum number of rides in a day along with the corresponding day
# Count the number of rides for each day within each month
rides_per_day <- ride_2023V2 %>%
group_by(month, day) %>%
summarise(num_rides = n())
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
# Calculating the maximum number of rides in a day (max_ride_in_a_day) for each month
max_rides_per_month <- rides_per_day %>%
group_by(month) %>%
summarise(max_ride_in_a_day = max(num_rides), day = which.max(num_rides)) %>%
select(month, max_ride_in_a_day, day)
Other analysis:
# Summarize Ride Data for 2023 by Member Type
ride_2023V2 %>% group_by(member_casual) %>% summarise(no_of_ride=n(),max_ride_length=max(ride_length_minutes),min_ride_length=min(ride_length_minutes),mean_ride_length=mean(ride_length_minutes))
## # A tibble: 2 × 5
## member_casual no_of_ride max_ride_length min_ride_length mean_ride_length
## <chr> <int> <dbl> <dbl> <dbl>
## 1 casual 1532770 14377. 0 228.
## 2 member 2802931 14399. 0 121.
# Most popular start and end station
# Start station
ride_2023V2 %>% group_by(start_station_name) %>% summarise(no_of_ride=n()) %>%
arrange(desc(no_of_ride))
## # A tibble: 1,535 × 2
## start_station_name no_of_ride
## <chr> <int>
## 1 Streeter Dr & Grand Ave 59624
## 2 DuSable Lake Shore Dr & Monroe St 38021
## 3 Michigan Ave & Oak St 34793
## 4 DuSable Lake Shore Dr & North Blvd 33171
## 5 Clark St & Elm St 32974
## 6 Kingsbury St & Kinzie St 32323
## 7 Clinton St & Washington Blvd 30240
## 8 Wells St & Concord Ln 30071
## 9 Millennium Park 28113
## 10 Theater on the Lake 28041
## # ℹ 1,525 more rows
# End station
ride_2023V2 %>% group_by(end_station_name) %>% summarise(no_of_ride=n()) %>%
arrange(desc(no_of_ride))
## # A tibble: 1,558 × 2
## end_station_name no_of_ride
## <chr> <int>
## 1 Streeter Dr & Grand Ave 61713
## 2 DuSable Lake Shore Dr & North Blvd 37301
## 3 DuSable Lake Shore Dr & Monroe St 36211
## 4 Michigan Ave & Oak St 35775
## 5 Clark St & Elm St 32313
## 6 Kingsbury St & Kinzie St 31651
## 7 Clinton St & Washington Blvd 31012
## 8 Wells St & Concord Ln 30969
## 9 Millennium Park 29543
## 10 Theater on the Lake 29011
## # ℹ 1,548 more rows
# Most popular start and end station by member
# Popular start station for 'member'
ride_2023V2 %>% group_by(member_casual,start_station_name) %>% filter(member_casual=="member") %>%
summarise(no_of_ride=n()) %>% arrange(desc(no_of_ride))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 1,389 × 3
## # Groups: member_casual [1]
## member_casual start_station_name no_of_ride
## <chr> <chr> <int>
## 1 member Clinton St & Washington Blvd 24312
## 2 member Kingsbury St & Kinzie St 24230
## 3 member Clark St & Elm St 23085
## 4 member Wells St & Concord Ln 19236
## 5 member Clinton St & Madison St 19120
## 6 member Wells St & Elm St 18530
## 7 member University Ave & 57th St 18066
## 8 member Loomis St & Lexington St 17819
## 9 member Ellis Ave & 60th St 16987
## 10 member Broadway & Barry Ave 16836
## # ℹ 1,379 more rows
# Popular end station for 'member'
ride_2023V2 %>% group_by(member_casual,end_station_name) %>% filter(member_casual=="member") %>%
summarise(no_of_ride=n()) %>% arrange(desc(no_of_ride))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 1,397 × 3
## # Groups: member_casual [1]
## member_casual end_station_name no_of_ride
## <chr> <chr> <int>
## 1 member Clinton St & Washington Blvd 25483
## 2 member Kingsbury St & Kinzie St 24314
## 3 member Clark St & Elm St 22963
## 4 member Clinton St & Madison St 20446
## 5 member Wells St & Concord Ln 20096
## 6 member Wells St & Elm St 18627
## 7 member University Ave & 57th St 18370
## 8 member Loomis St & Lexington St 17588
## 9 member Broadway & Barry Ave 17501
## 10 member State St & Chicago Ave 17231
## # ℹ 1,387 more rows
# Popular start station for 'casual'
ride_2023V2 %>% group_by(member_casual,start_station_name) %>% filter(member_casual=="casual") %>%
summarise(no_of_ride=n()) %>% arrange(desc(no_of_ride))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 1,467 × 3
## # Groups: member_casual [1]
## member_casual start_station_name no_of_ride
## <chr> <chr> <int>
## 1 casual Streeter Dr & Grand Ave 43373
## 2 casual DuSable Lake Shore Dr & Monroe St 28719
## 3 casual Michigan Ave & Oak St 21103
## 4 casual Millennium Park 18883
## 5 casual DuSable Lake Shore Dr & North Blvd 18778
## 6 casual Shedd Aquarium 16959
## 7 casual Theater on the Lake 15059
## 8 casual Dusable Harbor 14552
## 9 casual Adler Planetarium 11307
## 10 casual Montrose Harbor 10907
## # ℹ 1,457 more rows
# Popular end station for 'casual'
ride_2023V2 %>% group_by(member_casual,end_station_name) %>% filter(member_casual=="casual") %>%
summarise(no_of_ride=n()) %>% arrange(desc(no_of_ride))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 1,495 × 3
## # Groups: member_casual [1]
## member_casual end_station_name no_of_ride
## <chr> <chr> <int>
## 1 casual Streeter Dr & Grand Ave 47718
## 2 casual DuSable Lake Shore Dr & Monroe St 26356
## 3 casual Michigan Ave & Oak St 22486
## 4 casual DuSable Lake Shore Dr & North Blvd 22338
## 5 casual Millennium Park 21319
## 6 casual Theater on the Lake 16701
## 7 casual Shedd Aquarium 15242
## 8 casual Dusable Harbor 12870
## 9 casual Wells St & Concord Ln 10873
## 10 casual Montrose Harbor 10853
## # ℹ 1,485 more rows
In this step, we plot the analysis for understanding data patterns and communicating insights.
# Total no of ride per month
ride_2023V2 %>% group_by(month) %>% summarise(total_ride=n()) %>%
ggplot(aes(x=month,y=total_ride,fill = month))+geom_col()+
labs(x="Month",y="No of Rides",title = "Total No of Rides by Month")+
theme_minimal()
# Total no of ride by rideable type
ride_2023V2 %>% group_by(rideable_type) %>% summarise(no_of_ride=n())%>%
ggplot(aes(x=rideable_type,y=no_of_ride,fill =rideable_type)) + geom_col()+
labs(x="Type of Ride",y="Count of Rides",title = "No of Rides by Rideable Type") +
theme_minimal()+scale_fill_discrete(name="Ride Type")
# Total no of ride by member casual
ride_2023V2 %>% group_by(member_casual) %>% summarise(no_of_ride=n()) %>%
ggplot(aes(x=member_casual,y=no_of_ride,fill=member_casual))+
geom_col()+labs(x= "Member Type",y="Count of Rides",title="No of Rides by Member ")+
theme_minimal()+ scale_fill_discrete(name="Member Type")
# Total no of ride by rideable type and member casual
ride_2023V2 %>% group_by(member_casual,rideable_type) %>% summarise(no_of_ride=n()) %>%
ggplot(aes(x=rideable_type,y=no_of_ride,fill = member_casual))+geom_col(position = "dodge2")+
labs(x="Ride Type",y="Count of Rides",title="No of Rides by Rideable Type and Member")+
theme_minimal()+scale_fill_discrete(name="Member Type")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
# Total no of ride by month,rideable type and member
ride_2023V2 %>% group_by(month,member_casual,rideable_type) %>%
summarise(no_of_ride=n(),avg_ride_length=mean(ride_length_minutes)) %>%
ggplot(aes(x=month,y=no_of_ride,fill = rideable_type))+geom_col(position = "dodge2")+
facet_wrap(~member_casual)+labs(x="Month",y="No of Rides",title="No of Ride by Month, Bike Type, Member")+
scale_fill_discrete(name="Bike Type")+
theme(
plot.title = element_text(size = 14), # Increase title size
strip.text = element_text(size = 10,face = "bold"),
legend.position = "bottom"
)
## `summarise()` has grouped output by 'month', 'member_casual'. You can override
## using the `.groups` argument.
# Summarize Member(Member,Casual) Rides by Month
ride_2023V2 %>% group_by(member_casual,month) %>%
summarise(no_of_ride=n(),avg_ride_length=mean(ride_length_minutes)) %>%
ggplot(aes(x=month,y=no_of_ride,fill = member_casual))+geom_col(position = "dodge2")+
labs(x="Month",y="Count of Rides",title = "Summarize Member Rides by Month")+
facet_wrap(~member_casual)+scale_fill_discrete(name="Member Type")+
theme(
plot.title = element_text(size=14),
strip.text = element_text(size=10,face="bold"),
legend.position = "bottom"
)
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
# Average ride duration (ride_length) by rideable type
ride_2023V2 %>% group_by(rideable_type) %>%
summarise(avg_ride_length=mean(ride_length_minutes)) %>%
ggplot(aes(x=rideable_type,y=avg_ride_length,fill = rideable_type))+ geom_col()+
labs(x="Type of Rides",y="Average Ride Length",title = "Average Ride Duration by Ride Type")+
theme_minimal()+scale_fill_discrete(name="Bike Type")
# Analyze the distribution of rides (ride_id) across different day_of_week
ride_2023V2 %>% group_by(day_of_week) %>% summarise(total_ride=n()) %>%
ggplot(aes(x=day_of_week,y=total_ride,fill = day_of_week))+geom_col()+
labs(x="Week",y="Count of Rides",title = "Distribution of Rides Across Different Week")+
scale_fill_discrete(name="Week") + theme(axis.text.x = element_text(angle=45,hjust=1))
# No of Rides by rideable types for each day of the week
ride_2023V2 %>% group_by(member_casual,rideable_type,day_of_week) %>% summarise(total_ride=n()) %>%
ggplot(aes(x=day_of_week,y=total_ride,fill = rideable_type))+geom_col(position = "dodge2")+
facet_wrap(~ member_casual) +labs(x="Week",y="Count of Rides",title = " Rides by Rideable Types For Each Day of the Week")+
theme(
axis.text.x = element_text(angle = 45,hjust = 1),
legend.position = "bottom"
)
## `summarise()` has grouped output by 'member_casual', 'rideable_type'. You can
## override using the `.groups` argument.
# Compare ride lengths (ride_length) between member_casual users
ride_2023V2 %>% group_by(member_casual) %>%
summarise(avg_ride_length=mean(ride_length_minutes),total = n()) %>%
mutate(avg_ride_length_rounded = round(avg_ride_length, 2)) %>%
ggplot(aes(x=member_casual,y=total,fill = factor(avg_ride_length_rounded)))+geom_col()+
labs(x="Member Types",y="No of Rides",title="Average Ride Length and Number of Rides by Member Type ")+
theme_minimal() + scale_fill_discrete(name="Average Ride Length")
# Month-wise maximum number of rides in a day along with the corresponding day
rides_per_day %>%
group_by(month) %>%
summarise(max_ride_in_a_day = max(num_rides), day = which.max(num_rides)) %>%
select(month, max_ride_in_a_day, day) %>%
ggplot(aes(x=month,y=max_ride_in_a_day,fill = factor(day)))+geom_col(position = "dodge2")+
labs(x="Month",y="Maximum Ride in a Day",title = "Month-Wise Maximum no of Rides in a Day")+
theme_minimal() + scale_fill_discrete(name = "MonthDay")
# Most popular start and end station
# Top 10 Popular Start station by member_casual(member)
ride_2023V2 %>% group_by(member_casual, start_station_name) %>% filter(member_casual=="member")%>% summarise(no_of_ride=n()) %>%
arrange(desc(no_of_ride)) %>% top_n(10) %>%
ggplot(aes(x=start_station_name,y=no_of_ride,fill = member_casual))+geom_col(position = "dodge2")+
coord_flip() +labs(x="Start Station Name",y="No of Rides",title="Top 10 Popular Start Station by Member")+
theme_minimal() + scale_fill_discrete(name="Member Type")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## Selecting by no_of_ride
# Top 10 Popular Start station by member_casual(casual)
ride_2023V2 %>% group_by(member_casual,start_station_name) %>% filter(member_casual=="casual") %>%
summarise(no_of_ride=n()) %>% arrange(desc(no_of_ride)) %>% top_n(10) %>%
ggplot(aes(x=start_station_name,y=no_of_ride,fill=member_casual))+geom_col(position = "dodge2")+coord_flip()+
labs(x="Start Station Name",y="No of Rides",title = "Top 10 Popular Start Station by Member")+
theme_minimal()+scale_fill_manual(name="Member Type",values = c("casual"="blue"))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## Selecting by no_of_ride
# Top 10 Popular End station by member_casual(member)
ride_2023V2 %>% group_by(member_casual,end_station_name) %>% filter(member_casual=="member") %>% summarise(no_of_ride=n()) %>%
arrange(desc(no_of_ride)) %>% top_n(10) %>%
ggplot(aes(x=end_station_name,y=no_of_ride,fill=member_casual))+geom_col(position = "dodge2")+coord_flip()+
labs(x="End Station Name",y="No of Rides",title = "Top 10 Popular End Station by Member")+
theme_minimal()+scale_fill_discrete(name="Member Type")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## Selecting by no_of_ride
# Top 10 Popular End station by member_casual(casual)
ride_2023V2 %>% group_by(member_casual,end_station_name) %>% filter(member_casual=="casual") %>% summarise(no_of_ride=n()) %>%
arrange(desc(no_of_ride)) %>% top_n(10) %>%
ggplot(aes(x=end_station_name,y=no_of_ride,fill = member_casual))+geom_col(position = "dodge2")+
coord_flip()+labs(x="End Station Name",y="No of Rides",title = "Top 10 Popular End Station By Member")+
theme_minimal()+scale_fill_manual(name="Member Type",values = c("casual"="blue"))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## Selecting by no_of_ride
Finally, we export the cleaned dataframe for further analysis to CSV format.
# Load the package to export
library(writexl)
# Export directly CSV file to local computer
write.csv(ride_2023V2,"ride_2023_V2.csv",row.names = FALSE)