Step 1: Loading the required Library

In this step, we will load all the necessary libraries for data manipulation and visualization operations.

library(tidyverse) # Data manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr) # Data wrangling
library(tidyr) # Data tidying
library(ggplot2) # Data visualization
library(readr) #Data reading
library(lubridate) #  Date handling
library(janitor) # Data cleaning
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(skimr) # Data summarization
library(anytime) # Date Parsing
library(hms) # For time objects
## 
## Attaching package: 'hms'
## 
## The following object is masked from 'package:lubridate':
## 
##     hms

Step 2: Loading the CSV files

Next, we will load all the CSV files containing the ride data for the year 2023. We’ll also take a look at the first few rows of each dataset to understand their structure.

jan_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Jan_ride.csv")
head(jan_2023)
##            ride_id rideable_type      started_at        ended_at
## 1 F96D5A74A3E41399 electric_bike 1/21/2023 20:05 1/21/2023 20:16
## 2 13CB7EB698CEDB88  classic_bike 1/10/2023 15:37 1/10/2023 15:46
## 3 BD88A2E670661CE5 electric_bike   1/2/2023 7:51   1/2/2023 8:05
## 4 C90792D034FED968  classic_bike 1/22/2023 10:52 1/22/2023 11:01
## 5 3397017529188E8A  classic_bike 1/12/2023 13:58 1/12/2023 14:13
## 6 58E68156DAE3E311 electric_bike  1/31/2023 7:18  1/31/2023 7:21
##              start_station_name start_station_id               end_station_name
## 1   Lincoln Ave & Fullerton Ave     TA1309000058      Hampden Ct & Diversey Ave
## 2         Kimbark Ave & 53rd St     TA1309000037        Greenwood Ave & 47th St
## 3        Western Ave & Lunt Ave           RP-005 Valli Produce - Evanston Plaza
## 4         Kimbark Ave & 53rd St     TA1309000037        Greenwood Ave & 47th St
## 5         Kimbark Ave & 53rd St     TA1309000037        Greenwood Ave & 47th St
## 6 Lakeview Ave & Fullerton Pkwy     TA1309000019      Hampden Ct & Diversey Ave
##   end_station_id start_lat start_lng  end_lat   end_lng member_casual
## 1         202480  41.92407 -87.64628 41.93000 -87.64000        member
## 2   TA1308000002  41.79957 -87.59475 41.80983 -87.59938        member
## 3            599  42.00857 -87.69048 42.03974 -87.69941        casual
## 4   TA1308000002  41.79957 -87.59475 41.80983 -87.59938        member
## 5   TA1308000002  41.79957 -87.59475 41.80983 -87.59938        member
## 6         202480  41.92607 -87.63886 41.93000 -87.64000        member
##   ride_length day_of_week
## 1     0:10:51    Saturday
## 2     0:08:29     Tuesday
## 3     0:13:14      Monday
## 4     0:08:46      Sunday
## 5     0:15:19    Thursday
## 6     0:03:13     Tuesday
feb_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Feb_ride.csv")
head(feb_2023)
##            ride_id rideable_type      started_at        ended_at
## 1 CBCD0D7777F0E45F  classic_bike 2/14/2023 11:59 2/14/2023 12:13
## 2 F3EC5FCE5FF39DE9 electric_bike 2/15/2023 13:53 2/15/2023 13:59
## 3 E54C1F27FA9354FF  classic_bike 2/19/2023 11:10 2/19/2023 11:35
## 4 3D561E04F739CC45 electric_bike 2/26/2023 16:12 2/26/2023 16:39
## 5 0CB4B4D53B2DBE05 electric_bike 2/20/2023 11:55 2/20/2023 12:05
## 6 C67EB62172C472EB  classic_bike 2/24/2023 18:50 2/24/2023 18:56
##             start_station_name start_station_id              end_station_name
## 1 Southport Ave & Clybourn Ave     TA1309000030        Clark St & Schiller St
## 2   Clarendon Ave & Gordon Ter            13379    Sheridan Rd & Lawrence Ave
## 3 Southport Ave & Clybourn Ave     TA1309000030       Aberdeen St & Monroe St
## 4 Southport Ave & Clybourn Ave     TA1309000030 Franklin St & Adams St (Temp)
## 5  Prairie Ave & Garfield Blvd     TA1307000160   Cottage Grove Ave & 63rd St
## 6        Wells St & Concord Ln     TA1308000050    Clybourn Ave & Division St
##   end_station_id start_lat start_lng  end_lat   end_lng member_casual
## 1   TA1309000024  41.92077 -87.66371 41.90799 -87.63150        casual
## 2   TA1309000041  41.95788 -87.64958 41.96952 -87.65469        casual
## 3          13156  41.92077 -87.66371 41.88042 -87.65552        member
## 4   TA1309000008  41.92087 -87.66373 41.87943 -87.63550        member
## 5   KA1503000054  41.79483 -87.61879 41.78053 -87.60597        member
## 6   TA1307000115  41.91213 -87.63466 41.90461 -87.64055        member
##   ride_length day_of_week
## 1    00:13:56     Tuesday
## 2    00:05:20   Wednesday
## 3    00:24:04      Sunday
## 4    00:27:50      Sunday
## 5    00:10:25      Monday
## 6    00:06:24      Friday
mar_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Mar_ride.csv")
head(mar_2023)
##            ride_id rideable_type      started_at        ended_at
## 1 6842AA605EE9FBB3 electric_bike  3/16/2023 8:20  3/16/2023 8:22
## 2 FF7CF57CFE026D02  classic_bike 3/31/2023 12:28 3/31/2023 12:38
## 3 6B61B916032CB6D6  classic_bike 3/22/2023 14:09 3/22/2023 14:24
## 4 E55E61A5F1260040 electric_bike   3/9/2023 7:15   3/9/2023 7:26
## 5 123AAD676850F53C  classic_bike 3/22/2023 17:47 3/22/2023 18:01
## 6 5929D3080983AF4F  classic_bike  3/8/2023 19:58  3/8/2023 20:05
##                     start_station_name start_station_id
## 1              Clark St & Armitage Ave            13146
## 2 Orleans St & Chestnut St (NEXT Apts)              620
## 3            Desplaines St & Kinzie St     TA1306000003
## 4                           Walsh Park            18067
## 5 Orleans St & Chestnut St (NEXT Apts)              620
## 6                 Rush St & Hubbard St     KA1503000044
##               end_station_name end_station_id start_lat start_lng  end_lat
## 1    Larrabee St & Webster Ave          13193  41.91841 -87.63645 41.92182
## 2       Clark St & Randolph St   TA1305000030  41.89820 -87.63754 41.88458
## 3 Sheffield Ave & Kingsbury St          13154  41.88872 -87.64445 41.91052
## 4        Sangamon St & Lake St   TA1306000015  41.91448 -87.66801 41.88578
## 5  Halsted St & Wrightwood Ave   TA1309000061  41.89820 -87.63754 41.92914
## 6          Wells St & Huron St   TA1306000012  41.89017 -87.62619 41.89472
##     end_lng member_casual ride_length day_of_week
## 1 -87.64414        member     0:02:18    Thursday
## 2 -87.63189        member     0:10:38      Friday
## 3 -87.65311        member     0:15:43   Wednesday
## 4 -87.65102        member     0:11:00    Thursday
## 5 -87.64908        member     0:14:27   Wednesday
## 6 -87.63436        member     0:06:55   Wednesday
apr_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Apr_ride.csv")
head(apr_2023)
##            ride_id rideable_type      started_at        ended_at
## 1 5B6500E1E58655C0  classic_bike 4/10/2023 17:34 4/10/2023 18:02
## 2 AA65D25D69AF771F  classic_bike 4/12/2023 12:29 4/12/2023 12:54
## 3 079FB2C196414482 electric_bike 4/13/2023 17:39 4/13/2023 17:40
## 4 599623864C871207  classic_bike 4/29/2023 20:57 4/29/2023 20:57
## 5 63ECC8A13D11A76A  classic_bike 4/20/2023 17:03 4/20/2023 17:24
## 6 A396F54F4C1927D8 electric_bike 4/14/2023 22:29 4/14/2023 22:29
##               start_station_name start_station_id
## 1            Avenue O & 134th St            20214
## 2    Cottage Grove Ave & 51st St     TA1309000067
## 3           Morgan Ave & 14th Pl     TA1306000002
## 4    Cottage Grove Ave & 51st St     TA1309000067
## 5   California Ave & Division St            13256
## 6 California Ave & Milwaukee Ave            13084
##                 end_station_name end_station_id start_lat start_lng  end_lat
## 1            Avenue O & 134th St          20214  41.65187 -87.53967 41.65187
## 2    Cottage Grove Ave & 51st St   TA1309000067  41.80304 -87.60662 41.80304
## 3           Morgan Ave & 14th Pl   TA1306000002  41.86243 -87.65115 41.86238
## 4    Cottage Grove Ave & 51st St   TA1309000067  41.80304 -87.60662 41.80304
## 5 California Ave & Milwaukee Ave          13084  41.90303 -87.69747 41.92269
## 6 California Ave & Milwaukee Ave          13084  41.92269 -87.69713 41.92269
##     end_lng member_casual ride_length day_of_week
## 1 -87.53967        member    00:28:01      Monday
## 2 -87.60662        member    00:24:14   Wednesday
## 3 -87.65106        member    00:01:34    Thursday
## 4 -87.60662        member    00:00:03    Saturday
## 5 -87.69715        casual    00:21:47    Thursday
## 6 -87.69715        member    00:00:03      Friday
may_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_May_ride.csv")
head(may_2023)
##            ride_id rideable_type      started_at        ended_at
## 1 DDEB93BC2CE9AA77  classic_bike 5/10/2023 16:47 5/10/2023 16:59
## 2 C07B70172FC92F59  classic_bike  5/9/2023 18:30  5/9/2023 18:39
## 3 2BA66385DF8F815A  classic_bike 5/30/2023 15:01 5/30/2023 15:17
## 4 31EFCCB05F12D8EF   docked_bike  5/9/2023 14:13  5/9/2023 14:47
## 5 71DFF834E1D3CE0B  classic_bike  5/6/2023 16:47  5/6/2023 16:52
## 6 2117485899B4CEA4  classic_bike 5/15/2023 12:47 5/15/2023 13:00
##                  start_station_name start_station_id
## 1           Carpenter St & Huron St            13196
## 2          Southport Ave & Clark St     TA1308000047
## 3           Clinton St & Madison St     TA1305000032
## 4 DuSable Lake Shore Dr & Monroe St            13300
## 5                Shore Dr & 55th St     TA1308000009
## 6            Clark St & Schiller St     TA1309000024
##              end_station_name end_station_id start_lat start_lng  end_lat
## 1     Damen Ave & Cortland St          13133  41.89456 -87.65345 41.91598
## 2 Southport Ave & Belmont Ave          13229  41.95708 -87.66420 41.93948
## 3        McClurg Ct & Ohio St   TA1306000029  41.88275 -87.64119 41.89259
## 4           Adler Planetarium          13431  41.88096 -87.61674 41.86610
## 5        Harper Ave & 59th St   KA1503000070  41.79521 -87.58071 41.78794
## 6       Franklin St & Lake St   TA1307000111  41.90799 -87.63150 41.88584
##     end_lng member_casual ride_length day_of_week
## 1 -87.67733        member     0:12:51   Wednesday
## 2 -87.66375        member     0:08:54     Tuesday
## 3 -87.61729        member     0:15:39     Tuesday
## 4 -87.60727        casual     0:33:40     Tuesday
## 5 -87.58832        member     0:04:51    Saturday
## 6 -87.63550        member     0:12:39      Monday
jun_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Jun_ride.csv")
head(jun_2023)
##            ride_id rideable_type      started_at        ended_at
## 1 055E6783FCD761FD electric_bike  6/6/2023 20:58  6/6/2023 21:04
## 2 65BC2A1FC12CFFA4  classic_bike 6/28/2023 18:21 6/28/2023 18:56
## 3 4D149B57532FE9C9  classic_bike  6/30/2023 9:53  6/30/2023 9:53
## 4 BAE48AFCA5C36414  classic_bike 6/21/2023 13:23 6/21/2023 13:24
## 5 E3491C5273700783  classic_bike 6/21/2023 13:24 6/21/2023 13:40
## 6 4AE5E4B5B30DBFEF  classic_bike  6/2/2023 23:39  6/2/2023 23:40
##               start_station_name start_station_id
## 1 California Ave & Milwaukee Ave            13084
## 2    Cottage Grove Ave & 51st St     TA1309000067
## 3        Western Ave & Roscoe St            15634
## 4    Cottage Grove Ave & 51st St     TA1309000067
## 5    Cottage Grove Ave & 51st St     TA1309000067
## 6 California Ave & Milwaukee Ave            13084
##                 end_station_name end_station_id start_lat start_lng  end_lat
## 1   California Ave & Division St          13256  41.92262 -87.69711 41.90303
## 2    Cottage Grove Ave & 51st St   TA1309000067  41.80304 -87.60662 41.80304
## 3        Western Ave & Roscoe St          15634  41.94303 -87.68729 41.94303
## 4    Cottage Grove Ave & 51st St   TA1309000067  41.80304 -87.60662 41.80304
## 5    Cottage Grove Ave & 51st St   TA1309000067  41.80304 -87.60662 41.80304
## 6 California Ave & Milwaukee Ave          13084  41.92269 -87.69715 41.92269
##     end_lng member_casual ride_length day_of_week
## 1 -87.69747        member     0:05:47     Tuesday
## 2 -87.60662        member     0:34:40   Wednesday
## 3 -87.68729        member     0:00:22      Friday
## 4 -87.60662        member     0:00:19   Wednesday
## 5 -87.60662        member     0:16:01   Wednesday
## 6 -87.69715        member     0:00:45      Friday
jul_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Jul_ride.csv")
head(jul_2023)
##            ride_id rideable_type      started_at        ended_at
## 1 9340B064F0AEE130 electric_bike 7/23/2023 20:06 7/23/2023 20:22
## 2 D1460EE3CE0D8AF8  classic_bike 7/23/2023 17:05 7/23/2023 17:18
## 3 DF41BE31B895A25E  classic_bike 7/23/2023 10:14 7/23/2023 10:24
## 4 9624A293749EF703 electric_bike  7/21/2023 8:27  7/21/2023 8:32
## 5 2F68A6A4CDB4C99A  classic_bike  7/8/2023 15:46  7/8/2023 15:58
## 6 9AEE973E6B941A9C  classic_bike  7/10/2023 8:44  7/10/2023 8:49
##         start_station_name start_station_id                    end_station_name
## 1    Kedzie Ave & 110th St            20204 Public Rack - Racine Ave & 109th Pl
## 2  Western Ave & Walton St     KA1504000103           Milwaukee Ave & Grand Ave
## 3  Western Ave & Walton St     KA1504000103              Damen Ave & Pierce Ave
## 4 Racine Ave & Randolph St            13155             Clinton St & Madison St
## 5    Clark St & Leland Ave     TA1309000014                     Montrose Harbor
## 6 Racine Ave & Randolph St            13155               Sangamon St & Lake St
##   end_station_id start_lat start_lng  end_lat   end_lng member_casual
## 1            877  41.69241 -87.70091 41.69483 -87.65304        member
## 2          13033  41.89842 -87.68660 41.89158 -87.64838        member
## 3   TA1305000041  41.89842 -87.68660 41.90940 -87.67769        member
## 4   TA1305000032  41.88411 -87.65694 41.88275 -87.64119        member
## 5   TA1308000012  41.96709 -87.66729 41.96398 -87.63818        member
## 6   TA1306000015  41.88407 -87.65685 41.88578 -87.65102        member
##   ride_length day_of_week
## 1    00:16:30      Sunday
## 2    00:13:30      Sunday
## 3    00:09:36      Sunday
## 4    00:04:56      Friday
## 5    00:11:26    Saturday
## 6    00:04:54      Monday
aug_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Aug_ride.csv")
head(aug_2023)
##            ride_id rideable_type      started_at        ended_at
## 1 903C30C2D810A53B electric_bike 8/19/2023 15:41 8/19/2023 15:53
## 2 6400344C80D626CA electric_bike 8/11/2023 14:27 8/11/2023 14:34
## 3 B56F0D2EC8B33085 electric_bike 8/26/2023 15:49 8/26/2023 16:11
## 4 EE2EF7362A4CE72A electric_bike 8/30/2023 16:00 8/30/2023 16:08
## 5 82511E5639E5254E electric_bike 8/10/2023 12:18 8/10/2023 12:23
## 6 56D52B6F72CBF23C electric_bike 8/15/2023 21:54 8/15/2023 21:58
##                    start_station_name start_station_id
## 1            LaSalle St & Illinois St            13430
## 2              Clark St & Randolph St     TA1305000030
## 3        Sheffield Ave & Kingsbury St            13154
## 4 Orleans St & Merchandise Mart Plaza     TA1305000022
## 5 Orleans St & Merchandise Mart Plaza     TA1305000022
## 6                   Wells St & Elm St     KA1504000135
##              end_station_name end_station_id start_lat start_lng  end_lat
## 1           Clark St & Elm St   TA1307000039  41.89072 -87.63148 41.90297
## 2 Dearborn Pkwy & Delaware Pl   TA1307000128  41.88497 -87.63144 41.89897
## 3 Sawyer Ave & Irving Park Rd   KA1504000096  41.91070 -87.65320 41.95359
## 4 Dearborn Pkwy & Delaware Pl   TA1307000128  41.88837 -87.63732 41.89897
## 5 Dearborn Pkwy & Delaware Pl   TA1307000128  41.88775 -87.63684 41.89897
## 6 Dearborn Pkwy & Delaware Pl   TA1307000128  41.90311 -87.63472 41.89897
##     end_lng member_casual ride_length day_of_week
## 1 -87.63128        member    00:11:43    Saturday
## 2 -87.62991        member    00:07:37      Friday
## 3 -87.70931        member    00:22:17    Saturday
## 4 -87.62991        member    00:08:17   Wednesday
## 5 -87.62991        member    00:05:10    Thursday
## 6 -87.62991        member    00:04:02     Tuesday
sep_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Sep_ride.csv")
head(sep_2023)
##            ride_id rideable_type      started_at        ended_at
## 1 011C1903BF4E2E28  classic_bike  9/23/2023 0:27  9/23/2023 0:33
## 2 87DB80E048A1BF9F  classic_bike   9/2/2023 9:26   9/2/2023 9:38
## 3 7C2EB7AF669066E3 electric_bike 9/25/2023 18:30 9/25/2023 18:41
## 4 57D197B010269CE3  classic_bike 9/13/2023 15:30 9/13/2023 15:39
## 5 8A2CEA7C8C8074D8  classic_bike 9/18/2023 15:58 9/18/2023 16:05
## 6 40D9EF382CC6C53D  classic_bike 9/17/2023 11:58 9/17/2023 12:08
##              start_station_name start_station_id               end_station_name
## 1   Halsted St & Wrightwood Ave     TA1309000061 Sheffield Ave & Wellington Ave
## 2        Clark St & Drummond Pl     TA1307000142     Racine Ave & Fullerton Ave
## 3 Financial Pl & Ida B Wells Dr           SL-010           Racine Ave & 15th St
## 4        Clark St & Drummond Pl     TA1307000142       Racine Ave & Belmont Ave
## 5   Halsted St & Wrightwood Ave     TA1309000061     Racine Ave & Fullerton Ave
## 6    Kedzie Ave & Milwaukee Ave            13085 California Ave & Milwaukee Ave
##   end_station_id start_lat start_lng  end_lat   end_lng member_casual
## 1   TA1307000052  41.92914 -87.64908 41.93625 -87.65266        member
## 2   TA1306000026  41.93125 -87.64434 41.92557 -87.65842        member
## 3          13304  41.87506 -87.63314 41.86127 -87.65663        member
## 4   TA1308000019  41.93125 -87.64434 41.93974 -87.65887        member
## 5   TA1306000026  41.92914 -87.64908 41.92557 -87.65842        member
## 6          13084  41.92957 -87.70786 41.92269 -87.69715        member
##   ride_length day_of_week
## 1     0:05:37    Saturday
## 2     0:11:36    Saturday
## 3     0:11:28      Monday
## 4     0:08:29   Wednesday
## 5     0:06:06      Monday
## 6     0:09:46      Sunday
oct_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Oct_ride.csv")
head(oct_2023)
##            ride_id rideable_type       started_at         ended_at
## 1 4449097279F8BBE7  classic_bike  10/8/2023 10:36  10/8/2023 10:49
## 2 9CF060543CA7B439 electric_bike 10/11/2023 17:23 10/11/2023 17:36
## 3 667F21F4D6BDE69C electric_bike  10/12/2023 7:02  10/12/2023 7:06
## 4 F92714CC6B019B96  classic_bike 10/24/2023 19:13 10/24/2023 19:18
## 5 5E34BA5DE945A9CC  classic_bike  10/9/2023 18:19  10/9/2023 18:30
## 6 F7D7420AFAC53CD9 electric_bike  10/4/2023 17:10  10/4/2023 17:25
##                     start_station_name start_station_id
## 1 Orleans St & Chestnut St (NEXT Apts)              620
## 2            Desplaines St & Kinzie St     TA1306000003
## 3 Orleans St & Chestnut St (NEXT Apts)              620
## 4            Desplaines St & Kinzie St     TA1306000003
## 5            Desplaines St & Kinzie St     TA1306000003
## 6 Orleans St & Chestnut St (NEXT Apts)              620
##              end_station_name end_station_id start_lat start_lng  end_lat
## 1 Sheffield Ave & Webster Ave   TA1309000033  41.89820 -87.63754 41.92154
## 2 Sheffield Ave & Webster Ave   TA1309000033  41.88864 -87.64441 41.92154
## 3       Franklin St & Lake St   TA1307000111  41.89807 -87.63751 41.88584
## 4       Franklin St & Lake St   TA1307000111  41.88872 -87.64445 41.88584
## 5       Franklin St & Lake St   TA1307000111  41.88872 -87.64445 41.88584
## 6 Sheffield Ave & Webster Ave   TA1309000033  41.89812 -87.63753 41.92154
##     end_lng member_casual ride_length day_of_week
## 1 -87.65382        member     0:12:53      Sunday
## 2 -87.65382        member     0:12:09   Wednesday
## 3 -87.63550        member     0:04:20    Thursday
## 4 -87.63550        member     0:05:26     Tuesday
## 5 -87.63550        member     0:11:30      Monday
## 6 -87.65382        member     0:14:22   Wednesday
nov_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Nov_ride.csv")
head(nov_2023)
##            ride_id rideable_type       started_at         ended_at
## 1 4EAD8F1AD547356B electric_bike 11/30/2023 21:50 11/30/2023 22:13
## 2 6322270563BF5470 electric_bike   11/3/2023 9:44  11/3/2023 10:17
## 3 B37BDE091ECA38E0 electric_bike 11/30/2023 11:39 11/30/2023 11:40
## 4 CF0CA5DD26E4F90E  classic_bike  11/8/2023 10:01  11/8/2023 10:27
## 5 EB8381AA641348DB  classic_bike  11/3/2023 16:20  11/3/2023 16:54
## 6 B8CF14EA423D6886 electric_bike 11/30/2023 16:15 11/30/2023 16:39
##              start_station_name start_station_id              end_station_name
## 1               Millennium Park            13008 Pine Grove Ave & Waveland Ave
## 2        Broadway & Sheridan Rd            13323        Broadway & Sheridan Rd
## 3         State St & Pearson St     TA1307000061         State St & Pearson St
## 4           Theater on the Lake     TA1308000001           Theater on the Lake
## 5           Theater on the Lake     TA1308000001           Theater on the Lake
## 6 Pine Grove Ave & Waveland Ave     TA1307000150               Millennium Park
##   end_station_id start_lat start_lng  end_lat   end_lng member_casual
## 1   TA1307000150  41.88110 -87.62408 41.94947 -87.64645        member
## 2          13323  41.95287 -87.65003 41.95283 -87.64999        member
## 3   TA1307000061  41.89753 -87.62869 41.89745 -87.62872        member
## 4   TA1308000001  41.92628 -87.63083 41.92628 -87.63083        member
## 5   TA1308000001  41.92628 -87.63083 41.92628 -87.63083        member
## 6          13008  41.94942 -87.64638 41.88103 -87.62408        member
##   ride_length day_of_week
## 1     0:23:22    Thursday
## 2     0:33:13      Friday
## 3     0:00:24    Thursday
## 4     0:25:20   Wednesday
## 5     0:34:00      Friday
## 6     0:23:59    Thursday
dec_2023 <- read.csv("D:\\Docs\\Google Data Analytics\\divvy_trip_2023\\2023_Ride_Record\\2023_Dec_ride.csv")
head(dec_2023)
##            ride_id rideable_type       started_at         ended_at
## 1 84BFC1F137684EAB  classic_bike  12/2/2023 23:12  12/2/2023 23:21
## 2 EEC92D30A70471E5  classic_bike 12/14/2023 13:43 12/14/2023 13:44
## 3 1C33464DEEB1F23C electric_bike  12/4/2023 11:57  12/4/2023 12:13
## 4 E0A61810C305E5EC  classic_bike   12/4/2023 9:34   12/4/2023 9:35
## 5 0706CEB2E1924F3D  classic_bike   12/4/2023 9:36   12/4/2023 9:36
## 6 EB09035006DCCB2C electric_bike   12/2/2023 6:06   12/2/2023 6:09
##             start_station_name start_station_id             end_station_name
## 1               DuSable Museum     KA1503000075  Cottage Grove Ave & 51st St
## 2 California Ave & Division St            13256 California Ave & Division St
## 3     Chicago State University            20106     Chicago State University
## 4  Cottage Grove Ave & 51st St     TA1309000067  Cottage Grove Ave & 51st St
## 5  Cottage Grove Ave & 51st St     TA1309000067  Cottage Grove Ave & 51st St
## 6     Chicago State University            20106     Chicago State University
##   end_station_id start_lat start_lng  end_lat   end_lng member_casual
## 1   TA1309000067  41.79157 -87.60785 41.80304 -87.60662        member
## 2          13256  41.90303 -87.69747 41.90303 -87.69747        casual
## 3          20106  41.71895 -87.60831 41.71896 -87.60830        casual
## 4   TA1309000067  41.80304 -87.60662 41.80304 -87.60662        casual
## 5   TA1309000067  41.80304 -87.60662 41.80304 -87.60662        casual
## 6          20106  41.71896 -87.60834 41.71896 -87.60830        casual
##   ride_length day_of_week
## 1    00:08:10    Saturday
## 2    00:01:00    Thursday
## 3    00:16:55      Monday
## 4    00:01:34      Monday
## 5    00:00:13      Monday
## 6    00:02:34    Saturday

Step 3: Confirming Consistent Data Types

Before combining the datasets, we ensure that the data types of corresponding columns are consistent using the compare_df_cols() function to identify any mismatches.

compare_df_cols(jan_2023,feb_2023,mar_2023,apr_2023,may_2023,jun_2023,jul_2023,
                aug_2023,sep_2023,oct_2023,nov_2023,dec_2023, return = "mismatch")
##  [1] column_name jan_2023    feb_2023    mar_2023    apr_2023    may_2023   
##  [7] jun_2023    jul_2023    aug_2023    sep_2023    oct_2023    nov_2023   
## [13] dec_2023   
## <0 rows> (or 0-length row.names)

Step 4: Merging Data Frames

We merge different csv files into single data for clearer analysis.

ride_2023 <- bind_rows(jan_2023,feb_2023,mar_2023,apr_2023,may_2023,jun_2023,jul_2023,
                       aug_2023,sep_2023,oct_2023,nov_2023,dec_2023)

After merging dataframe, we run some function to get summary of the dataframe.

summary(ride_2023)
##    ride_id          rideable_type       started_at          ended_at        
##  Length:4335767     Length:4335767     Length:4335767     Length:4335767    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  start_station_name start_station_id   end_station_name   end_station_id    
##  Length:4335767     Length:4335767     Length:4335767     Length:4335767    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    start_lat       start_lng         end_lat         end_lng      
##  Min.   :41.65   Min.   :-87.84   Min.   : 0.00   Min.   :-87.84  
##  1st Qu.:41.88   1st Qu.:-87.66   1st Qu.:41.88   1st Qu.:-87.66  
##  Median :41.90   Median :-87.64   Median :41.90   Median :-87.64  
##  Mean   :41.90   Mean   :-87.64   Mean   :41.90   Mean   :-87.64  
##  3rd Qu.:41.93   3rd Qu.:-87.63   3rd Qu.:41.93   3rd Qu.:-87.63  
##  Max.   :42.06   Max.   :-87.53   Max.   :42.06   Max.   :  0.00  
##  member_casual      ride_length        day_of_week       
##  Length:4335767     Length:4335767     Length:4335767    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
## 
skim_without_charts(ride_2023)
Data summary
Name ride_2023
Number of rows 4335767
Number of columns 15
_______________________
Column type frequency:
character 11
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1 8 16 0 4335360 0
rideable_type 0 1 11 13 0 3 0
started_at 0 1 13 16 0 442713 0
ended_at 0 1 13 16 0 443128 0
start_station_name 0 1 0 64 12 1535 0
start_station_id 0 1 0 35 12 1468 0
end_station_name 0 1 0 64 4048 1558 0
end_station_id 0 1 0 36 4048 1483 0
member_casual 0 1 6 6 0 2 0
ride_length 0 1 7 255 0 31049 0
day_of_week 0 1 6 9 0 7 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.90 0.04 41.65 41.88 41.90 41.93 42.06
start_lng 0 1 -87.64 0.02 -87.84 -87.66 -87.64 -87.63 -87.53
end_lat 0 1 41.90 0.06 0.00 41.88 41.90 41.93 42.06
end_lng 0 1 -87.64 0.08 -87.84 -87.66 -87.64 -87.63 0.00
glimpse(ride_2023)
## Rows: 4,335,767
## Columns: 15
## $ ride_id            <chr> "F96D5A74A3E41399", "13CB7EB698CEDB88", "BD88A2E670…
## $ rideable_type      <chr> "electric_bike", "classic_bike", "electric_bike", "…
## $ started_at         <chr> "1/21/2023 20:05", "1/10/2023 15:37", "1/2/2023 7:5…
## $ ended_at           <chr> "1/21/2023 20:16", "1/10/2023 15:46", "1/2/2023 8:0…
## $ start_station_name <chr> "Lincoln Ave & Fullerton Ave", "Kimbark Ave & 53rd …
## $ start_station_id   <chr> "TA1309000058", "TA1309000037", "RP-005", "TA130900…
## $ end_station_name   <chr> "Hampden Ct & Diversey Ave", "Greenwood Ave & 47th …
## $ end_station_id     <chr> "202480", "TA1308000002", "599", "TA1308000002", "T…
## $ start_lat          <dbl> 41.92407, 41.79957, 42.00857, 41.79957, 41.79957, 4…
## $ start_lng          <dbl> -87.64628, -87.59475, -87.69048, -87.59475, -87.594…
## $ end_lat            <dbl> 41.93000, 41.80983, 42.03974, 41.80983, 41.80983, 4…
## $ end_lng            <dbl> -87.64000, -87.59938, -87.69941, -87.59938, -87.599…
## $ member_casual      <chr> "member", "member", "casual", "member", "member", "…
## $ ride_length        <chr> "0:10:51", "0:08:29", "0:13:14", "0:08:46", "0:15:1…
## $ day_of_week        <chr> "Saturday", "Tuesday", "Monday", "Sunday", "Thursda…
head(ride_2023)
##            ride_id rideable_type      started_at        ended_at
## 1 F96D5A74A3E41399 electric_bike 1/21/2023 20:05 1/21/2023 20:16
## 2 13CB7EB698CEDB88  classic_bike 1/10/2023 15:37 1/10/2023 15:46
## 3 BD88A2E670661CE5 electric_bike   1/2/2023 7:51   1/2/2023 8:05
## 4 C90792D034FED968  classic_bike 1/22/2023 10:52 1/22/2023 11:01
## 5 3397017529188E8A  classic_bike 1/12/2023 13:58 1/12/2023 14:13
## 6 58E68156DAE3E311 electric_bike  1/31/2023 7:18  1/31/2023 7:21
##              start_station_name start_station_id               end_station_name
## 1   Lincoln Ave & Fullerton Ave     TA1309000058      Hampden Ct & Diversey Ave
## 2         Kimbark Ave & 53rd St     TA1309000037        Greenwood Ave & 47th St
## 3        Western Ave & Lunt Ave           RP-005 Valli Produce - Evanston Plaza
## 4         Kimbark Ave & 53rd St     TA1309000037        Greenwood Ave & 47th St
## 5         Kimbark Ave & 53rd St     TA1309000037        Greenwood Ave & 47th St
## 6 Lakeview Ave & Fullerton Pkwy     TA1309000019      Hampden Ct & Diversey Ave
##   end_station_id start_lat start_lng  end_lat   end_lng member_casual
## 1         202480  41.92407 -87.64628 41.93000 -87.64000        member
## 2   TA1308000002  41.79957 -87.59475 41.80983 -87.59938        member
## 3            599  42.00857 -87.69048 42.03974 -87.69941        casual
## 4   TA1308000002  41.79957 -87.59475 41.80983 -87.59938        member
## 5   TA1308000002  41.79957 -87.59475 41.80983 -87.59938        member
## 6         202480  41.92607 -87.63886 41.93000 -87.64000        member
##   ride_length day_of_week
## 1     0:10:51    Saturday
## 2     0:08:29     Tuesday
## 3     0:13:14      Monday
## 4     0:08:46      Sunday
## 5     0:15:19    Thursday
## 6     0:03:13     Tuesday
str(ride_2023)
## 'data.frame':    4335767 obs. of  15 variables:
##  $ ride_id           : chr  "F96D5A74A3E41399" "13CB7EB698CEDB88" "BD88A2E670661CE5" "C90792D034FED968" ...
##  $ rideable_type     : chr  "electric_bike" "classic_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : chr  "1/21/2023 20:05" "1/10/2023 15:37" "1/2/2023 7:51" "1/22/2023 10:52" ...
##  $ ended_at          : chr  "1/21/2023 20:16" "1/10/2023 15:46" "1/2/2023 8:05" "1/22/2023 11:01" ...
##  $ start_station_name: chr  "Lincoln Ave & Fullerton Ave" "Kimbark Ave & 53rd St" "Western Ave & Lunt Ave" "Kimbark Ave & 53rd St" ...
##  $ start_station_id  : chr  "TA1309000058" "TA1309000037" "RP-005" "TA1309000037" ...
##  $ end_station_name  : chr  "Hampden Ct & Diversey Ave" "Greenwood Ave & 47th St" "Valli Produce - Evanston Plaza" "Greenwood Ave & 47th St" ...
##  $ end_station_id    : chr  "202480" "TA1308000002" "599" "TA1308000002" ...
##  $ start_lat         : num  41.9 41.8 42 41.8 41.8 ...
##  $ start_lng         : num  -87.6 -87.6 -87.7 -87.6 -87.6 ...
##  $ end_lat           : num  41.9 41.8 42 41.8 41.8 ...
##  $ end_lng           : num  -87.6 -87.6 -87.7 -87.6 -87.6 ...
##  $ member_casual     : chr  "member" "member" "casual" "member" ...
##  $ ride_length       : chr  "0:10:51" "0:08:29" "0:13:14" "0:08:46" ...
##  $ day_of_week       : chr  "Saturday" "Tuesday" "Monday" "Sunday" ...
as_tibble(ride_2023)
## # A tibble: 4,335,767 × 15
##    ride_id rideable_type started_at ended_at start_station_name start_station_id
##    <chr>   <chr>         <chr>      <chr>    <chr>              <chr>           
##  1 F96D5A… electric_bike 1/21/2023… 1/21/20… Lincoln Ave & Ful… TA1309000058    
##  2 13CB7E… classic_bike  1/10/2023… 1/10/20… Kimbark Ave & 53r… TA1309000037    
##  3 BD88A2… electric_bike 1/2/2023 … 1/2/202… Western Ave & Lun… RP-005          
##  4 C90792… classic_bike  1/22/2023… 1/22/20… Kimbark Ave & 53r… TA1309000037    
##  5 339701… classic_bike  1/12/2023… 1/12/20… Kimbark Ave & 53r… TA1309000037    
##  6 58E681… electric_bike 1/31/2023… 1/31/20… Lakeview Ave & Fu… TA1309000019    
##  7 2F7194… electric_bike 1/15/2023… 1/15/20… Kimbark Ave & 53r… TA1309000037    
##  8 DB1CF8… classic_bike  1/25/2023… 1/25/20… Kimbark Ave & 53r… TA1309000037    
##  9 34EAB9… electric_bike 1/25/2023… 1/25/20… Kimbark Ave & 53r… TA1309000037    
## 10 BC8AB1… classic_bike  1/6/2023 … 1/6/202… Kimbark Ave & 53r… TA1309000037    
## # ℹ 4,335,757 more rows
## # ℹ 9 more variables: end_station_name <chr>, end_station_id <chr>,
## #   start_lat <dbl>, start_lng <dbl>, end_lat <dbl>, end_lng <dbl>,
## #   member_casual <chr>, ride_length <chr>, day_of_week <chr>

Step 5: Adding Ride Length in Minutes

We add a column for ride length in minutes to perform meaningful arithmetic analysis.We also round the result to two decimal places.

# Function to convert HH:MM:SS format to seconds
time_to_seconds <- function(time_str) {
  time_parts <- as.numeric(strsplit(time_str, ":")[[1]])
  hours <- time_parts[1]
  minutes <- time_parts[2]
  seconds <- time_parts[3]
  total_seconds <- hours * 3600 + minutes * 60 + seconds
  return(total_seconds)
}

# Use mutate from dplyr to create ride_length_minutes
ride_2023 <- ride_2023 %>%
  mutate(
    ride_length_minutes = sapply(ride_length, time_to_seconds) / 6
    
  )
## Warning: There were 66 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `ride_length_minutes = sapply(ride_length, time_to_seconds)/6`.
## Caused by warning in `FUN()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 65 remaining warnings.
#Rounding up decimal points to two places
ride_2023$ride_length_minutes=round(ride_2023$ride_length_minutes,2)

Step 6: Create a copy of dataframe

We make a copy so we have a place to start in case something goes wrong.

ride_2023V2 <- ride_2023

Step 7: Cleaning the Data

Now, we clean the data by removing null values, duplicate records and records with non-positive ride lengths.

#Remove the duplicate value
ride_2023V2 <- distinct(ride_2023)
summary(ride_2023V2)
##    ride_id          rideable_type       started_at          ended_at        
##  Length:4335767     Length:4335767     Length:4335767     Length:4335767    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  start_station_name start_station_id   end_station_name   end_station_id    
##  Length:4335767     Length:4335767     Length:4335767     Length:4335767    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    start_lat       start_lng         end_lat         end_lng      
##  Min.   :41.65   Min.   :-87.84   Min.   : 0.00   Min.   :-87.84  
##  1st Qu.:41.88   1st Qu.:-87.66   1st Qu.:41.88   1st Qu.:-87.66  
##  Median :41.90   Median :-87.64   Median :41.90   Median :-87.64  
##  Mean   :41.90   Mean   :-87.64   Mean   :41.90   Mean   :-87.64  
##  3rd Qu.:41.93   3rd Qu.:-87.63   3rd Qu.:41.93   3rd Qu.:-87.63  
##  Max.   :42.06   Max.   :-87.53   Max.   :42.06   Max.   :  0.00  
##                                                                   
##  member_casual      ride_length        day_of_week        ride_length_minutes
##  Length:4335767     Length:4335767     Length:4335767     Min.   :    0.00   
##  Class :character   Class :character   Class :character   1st Qu.:   56.17   
##  Mode  :character   Mode  :character   Mode  :character   Median :   98.00   
##                                                           Mean   :  158.96   
##                                                           3rd Qu.:  174.67   
##                                                           Max.   :14398.67   
##                                                           NA's   :66
# Remove values with null value
ride_2023V2 <- na.omit(ride_2023)
summary(ride_2023V2)
##    ride_id          rideable_type       started_at          ended_at        
##  Length:4335701     Length:4335701     Length:4335701     Length:4335701    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  start_station_name start_station_id   end_station_name   end_station_id    
##  Length:4335701     Length:4335701     Length:4335701     Length:4335701    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    start_lat       start_lng         end_lat         end_lng      
##  Min.   :41.65   Min.   :-87.84   Min.   : 0.00   Min.   :-87.84  
##  1st Qu.:41.88   1st Qu.:-87.66   1st Qu.:41.88   1st Qu.:-87.66  
##  Median :41.90   Median :-87.64   Median :41.90   Median :-87.64  
##  Mean   :41.90   Mean   :-87.64   Mean   :41.90   Mean   :-87.64  
##  3rd Qu.:41.93   3rd Qu.:-87.63   3rd Qu.:41.93   3rd Qu.:-87.63  
##  Max.   :42.06   Max.   :-87.53   Max.   :42.06   Max.   :  0.00  
##  member_casual      ride_length        day_of_week        ride_length_minutes
##  Length:4335701     Length:4335701     Length:4335701     Min.   :    0.00   
##  Class :character   Class :character   Class :character   1st Qu.:   56.17   
##  Mode  :character   Mode  :character   Mode  :character   Median :   98.00   
##                                                           Mean   :  158.96   
##                                                           3rd Qu.:  174.67   
##                                                           Max.   :14398.67
#Remove values of ride_length which has value less than or equal to 0
ride_2023V2 <- ride_2023 [!(ride_2023$ride_length<=0),]
summary(ride_2023V2)
##    ride_id          rideable_type       started_at          ended_at        
##  Length:4335701     Length:4335701     Length:4335701     Length:4335701    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  start_station_name start_station_id   end_station_name   end_station_id    
##  Length:4335701     Length:4335701     Length:4335701     Length:4335701    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    start_lat       start_lng         end_lat         end_lng      
##  Min.   :41.65   Min.   :-87.84   Min.   : 0.00   Min.   :-87.84  
##  1st Qu.:41.88   1st Qu.:-87.66   1st Qu.:41.88   1st Qu.:-87.66  
##  Median :41.90   Median :-87.64   Median :41.90   Median :-87.64  
##  Mean   :41.90   Mean   :-87.64   Mean   :41.90   Mean   :-87.64  
##  3rd Qu.:41.93   3rd Qu.:-87.63   3rd Qu.:41.93   3rd Qu.:-87.63  
##  Max.   :42.06   Max.   :-87.53   Max.   :42.06   Max.   :  0.00  
##  member_casual      ride_length        day_of_week        ride_length_minutes
##  Length:4335701     Length:4335701     Length:4335701     Min.   :    0.00   
##  Class :character   Class :character   Class :character   1st Qu.:   56.17   
##  Mode  :character   Mode  :character   Mode  :character   Median :   98.00   
##                                                           Mean   :  158.96   
##                                                           3rd Qu.:  174.67   
##                                                           Max.   :14398.67

Step 8: Converting Date and Time

We convert the ‘started_at’ column to a Date object and extract the month, day, and year for proper analysis.

date <- as.Date(ride_2023V2$started_at, format = "%m/%d/%Y %H:%M")
# Assuming ride_2023 is your data frame and started_at is a column in it
ride_2023V2$date <- as.Date(ride_2023V2$started_at, format = "%m/%d/%Y %H:%M")
ride_2023V2$month <- format(as.Date(ride_2023V2$date), "%m")
ride_2023V2$day <- format(as.Date(ride_2023V2$date), "%d")
ride_2023V2$year <- format(as.Date(ride_2023V2$date), "%Y")
summary(ride_2023V2)
##    ride_id          rideable_type       started_at          ended_at        
##  Length:4335701     Length:4335701     Length:4335701     Length:4335701    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  start_station_name start_station_id   end_station_name   end_station_id    
##  Length:4335701     Length:4335701     Length:4335701     Length:4335701    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    start_lat       start_lng         end_lat         end_lng      
##  Min.   :41.65   Min.   :-87.84   Min.   : 0.00   Min.   :-87.84  
##  1st Qu.:41.88   1st Qu.:-87.66   1st Qu.:41.88   1st Qu.:-87.66  
##  Median :41.90   Median :-87.64   Median :41.90   Median :-87.64  
##  Mean   :41.90   Mean   :-87.64   Mean   :41.90   Mean   :-87.64  
##  3rd Qu.:41.93   3rd Qu.:-87.63   3rd Qu.:41.93   3rd Qu.:-87.63  
##  Max.   :42.06   Max.   :-87.53   Max.   :42.06   Max.   :  0.00  
##  member_casual      ride_length        day_of_week        ride_length_minutes
##  Length:4335701     Length:4335701     Length:4335701     Min.   :    0.00   
##  Class :character   Class :character   Class :character   1st Qu.:   56.17   
##  Mode  :character   Mode  :character   Mode  :character   Median :   98.00   
##                                                           Mean   :  158.96   
##                                                           3rd Qu.:  174.67   
##                                                           Max.   :14398.67   
##       date               month               day                year          
##  Min.   :2023-01-01   Length:4335701     Length:4335701     Length:4335701    
##  1st Qu.:2023-05-20   Class :character   Class :character   Class :character  
##  Median :2023-07-20   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2023-07-15                                                           
##  3rd Qu.:2023-09-16                                                           
##  Max.   :2023-12-31

Step 9: Descriptive Analysis

Ride Length Analysis: In this analysis, we analyze the max,min,average and median of ride length.

# Maximum ride length
 ride_2023V2 %>% summarize(max_ride_length = max(ride_length_minutes))
##   max_ride_length
## 1        14398.67
# Minimum ride length
 ride_2023V2 %>% summarise(min_ride_length= min(ride_length_minutes))
##   min_ride_length
## 1               0
# Average ride length
 ride_2023V2 %>% summarise(avg_ride_length=mean(ride_length_minutes))
##   avg_ride_length
## 1         158.958
# Median ride length
 ride_2023V2 %>%  summarise(med_ride_length=median(ride_length_minutes))
##   med_ride_length
## 1              98

Ride count by other factors:

# Count Unique Rides by ride_id
 ride_2023V2 %>% distinct(ride_id) %>% count()
##         n
## 1 4335294
# Count number of rides by rideable type
 ride_2023V2 %>% count(rideable_type)
##   rideable_type       n
## 1  classic_bike 2690707
## 2   docked_bike   76124
## 3 electric_bike 1568870
# Count number of rides by member type (member vs casual)
 ride_2023V2 %>% count(member_casual)
##   member_casual       n
## 1        casual 1532770
## 2        member 2802931
# Count number of rides by rideable type and member type
 ride_2023V2 %>% group_by(member_casual) %>% count(rideable_type) %>% arrange(n) %>% rename(total_rides = n)
## # A tibble: 5 × 3
## # Groups:   member_casual [2]
##   member_casual rideable_type total_rides
##   <chr>         <chr>               <int>
## 1 casual        docked_bike         76124
## 2 casual        electric_bike      583641
## 3 casual        classic_bike       873005
## 4 member        electric_bike      985229
## 5 member        classic_bike      1817702
# Count number of rides by month, rideable type, and member type
 ride_2023V2 %>% group_by(month, member_casual, rideable_type) %>% summarise(no_of_ride = n(), avg_ride_length = mean(ride_length_minutes))
## `summarise()` has grouped output by 'month', 'member_casual'. You can override
## using the `.groups` argument.
## # A tibble: 56 × 5
## # Groups:   month, member_casual [24]
##    month member_casual rideable_type no_of_ride avg_ride_length
##    <chr> <chr>         <chr>              <int>           <dbl>
##  1 01    casual        classic_bike       13860           172. 
##  2 01    casual        docked_bike         1682           376. 
##  3 01    casual        electric_bike      14079            97.5
##  4 01    member        classic_bike       76359           106. 
##  5 01    member        electric_bike      42304            88.0
##  6 02    casual        classic_bike       15446           202. 
##  7 02    casual        docked_bike         2151           429. 
##  8 02    casual        electric_bike      15621           114. 
##  9 02    member        classic_bike       74223           111. 
## 10 02    member        electric_bike      43933            92.0
## # ℹ 46 more rows
# Summarize member (Member, Casual) rides by month
 ride_2023V2 %>% group_by(member_casual, month) %>% filter(member_casual == "member") %>% summarise(no_of_ride = n(), avg_ride_length = mean(ride_length_minutes))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
## # Groups:   member_casual [1]
##    member_casual month no_of_ride avg_ride_length
##    <chr>         <chr>      <int>           <dbl>
##  1 member        01        118663            99.6
##  2 member        02        118156           104. 
##  3 member        03        155294           102. 
##  4 member        04        213657           116. 
##  5 member        05        286185           127. 
##  6 member        06        314962           129. 
##  7 member        07        328663           133. 
##  8 member        08        351061           133. 
##  9 member        09        309670           127. 
## 10 member        10        273478           116. 
## 11 member        11        202689           110. 
## 12 member        12        130453           108.
 ride_2023V2 %>% group_by(member_casual, month) %>% filter(member_casual == "casual") %>% summarise(no_of_ride = n(), avg_ride_length = mean(ride_length_minutes))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
## # Groups:   member_casual [1]
##    member_casual month no_of_ride avg_ride_length
##    <chr>         <chr>      <int>           <dbl>
##  1 casual        01         29621            148.
##  2 casual        02         33218            175.
##  3 casual        03         47399            166.
##  4 casual        04        110537            225.
##  5 casual        05        177037            243.
##  6 casual        06        219793            239.
##  7 casual        07        245292            251.
##  8 casual        08        233851            242.
##  9 casual        09        196962            234.
## 10 casual        10        130297            213.
## 11 casual        11         72077            178.
## 12 casual        12         36686            165.

Time Analysis: In this analysis, we focuses on understanding ride patterns over different times of the day, days of the weeks and months

# Average ride duration (ride_length) by rideable type
ride_2023V2 %>% group_by(rideable_type) %>%
  summarise(avg_ride_length=mean(ride_length_minutes))
## # A tibble: 3 × 2
##   rideable_type avg_ride_length
##   <chr>                   <dbl>
## 1 classic_bike             171.
## 2 docked_bike              529.
## 3 electric_bike            121.
# Arranging the data of day of week in Sunday, Monday,...etc. format
ride_2023V2$day_of_week <- ordered(ride_2023V2$day_of_week, 
        levels=c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"))

# Analyze the distribution of rides across different days of the week
ride_2023V2 %>% group_by(day_of_week) %>% summarise(total_ride=n())
## # A tibble: 7 × 2
##   day_of_week total_ride
##   <ord>            <int>
## 1 Sunday          563092
## 2 Monday          562673
## 3 Tuesday         631006
## 4 Wednesday       636539
## 5 Thursday        652182
## 6 Friday          628928
## 7 Saturday        661281
# Compare ride lengths between members 
ride_2023V2 %>% group_by(member_casual) %>% 
  summarise(avg_ride_length=mean(ride_length_minutes),total = n())
## # A tibble: 2 × 3
##   member_casual avg_ride_length   total
##   <chr>                   <dbl>   <int>
## 1 casual                   228. 1532770
## 2 member                   121. 2802931
# Count the number of subscriber per month
ride_2023V2 %>% group_by(month) %>% filter(member_casual=="casual") %>% count(member_casual)
## # A tibble: 12 × 3
## # Groups:   month [12]
##    month member_casual      n
##    <chr> <chr>          <int>
##  1 01    casual         29621
##  2 02    casual         33218
##  3 03    casual         47399
##  4 04    casual        110537
##  5 05    casual        177037
##  6 06    casual        219793
##  7 07    casual        245292
##  8 08    casual        233851
##  9 09    casual        196962
## 10 10    casual        130297
## 11 11    casual         72077
## 12 12    casual         36686
ride_2023V2 %>% group_by(month) %>% filter(member_casual=="member") %>% count(member_casual)
## # A tibble: 12 × 3
## # Groups:   month [12]
##    month member_casual      n
##    <chr> <chr>          <int>
##  1 01    member        118663
##  2 02    member        118156
##  3 03    member        155294
##  4 04    member        213657
##  5 05    member        286185
##  6 06    member        314962
##  7 07    member        328663
##  8 08    member        351061
##  9 09    member        309670
## 10 10    member        273478
## 11 11    member        202689
## 12 12    member        130453
# Total no of ride  per month
ride_2023V2 %>% group_by(month) %>% summarise(total_ride=n()) 
## # A tibble: 12 × 2
##    month total_ride
##    <chr>      <int>
##  1 01        148284
##  2 02        151374
##  3 03        202693
##  4 04        324194
##  5 05        463222
##  6 06        534755
##  7 07        573955
##  8 08        584912
##  9 09        506632
## 10 10        403775
## 11 11        274766
## 12 12        167139
# Month-wise maximum number of rides in a day along with the corresponding day
# Count the number of rides for each day within each month
rides_per_day <- ride_2023V2 %>%
  group_by(month, day) %>%
  summarise(num_rides = n())
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
# Calculating the maximum number of rides in a day (max_ride_in_a_day) for each month
max_rides_per_month <- rides_per_day %>%
  group_by(month) %>%
  summarise(max_ride_in_a_day = max(num_rides), day = which.max(num_rides)) %>% 
  select(month, max_ride_in_a_day, day)

Other analysis:

# Summarize Ride Data for 2023 by Member Type
ride_2023V2 %>% group_by(member_casual) %>% summarise(no_of_ride=n(),max_ride_length=max(ride_length_minutes),min_ride_length=min(ride_length_minutes),mean_ride_length=mean(ride_length_minutes))
## # A tibble: 2 × 5
##   member_casual no_of_ride max_ride_length min_ride_length mean_ride_length
##   <chr>              <int>           <dbl>           <dbl>            <dbl>
## 1 casual           1532770          14377.               0             228.
## 2 member           2802931          14399.               0             121.
# Most popular start and end station 
# Start station
ride_2023V2 %>% group_by(start_station_name) %>% summarise(no_of_ride=n()) %>%
  arrange(desc(no_of_ride)) 
## # A tibble: 1,535 × 2
##    start_station_name                 no_of_ride
##    <chr>                                   <int>
##  1 Streeter Dr & Grand Ave                 59624
##  2 DuSable Lake Shore Dr & Monroe St       38021
##  3 Michigan Ave & Oak St                   34793
##  4 DuSable Lake Shore Dr & North Blvd      33171
##  5 Clark St & Elm St                       32974
##  6 Kingsbury St & Kinzie St                32323
##  7 Clinton St & Washington Blvd            30240
##  8 Wells St & Concord Ln                   30071
##  9 Millennium Park                         28113
## 10 Theater on the Lake                     28041
## # ℹ 1,525 more rows
# End station
ride_2023V2 %>% group_by(end_station_name) %>% summarise(no_of_ride=n()) %>% 
  arrange(desc(no_of_ride))
## # A tibble: 1,558 × 2
##    end_station_name                   no_of_ride
##    <chr>                                   <int>
##  1 Streeter Dr & Grand Ave                 61713
##  2 DuSable Lake Shore Dr & North Blvd      37301
##  3 DuSable Lake Shore Dr & Monroe St       36211
##  4 Michigan Ave & Oak St                   35775
##  5 Clark St & Elm St                       32313
##  6 Kingsbury St & Kinzie St                31651
##  7 Clinton St & Washington Blvd            31012
##  8 Wells St & Concord Ln                   30969
##  9 Millennium Park                         29543
## 10 Theater on the Lake                     29011
## # ℹ 1,548 more rows
# Most popular start and end station by member 
# Popular start station for 'member' 
ride_2023V2 %>% group_by(member_casual,start_station_name) %>% filter(member_casual=="member") %>% 
  summarise(no_of_ride=n()) %>%  arrange(desc(no_of_ride))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 1,389 × 3
## # Groups:   member_casual [1]
##    member_casual start_station_name           no_of_ride
##    <chr>         <chr>                             <int>
##  1 member        Clinton St & Washington Blvd      24312
##  2 member        Kingsbury St & Kinzie St          24230
##  3 member        Clark St & Elm St                 23085
##  4 member        Wells St & Concord Ln             19236
##  5 member        Clinton St & Madison St           19120
##  6 member        Wells St & Elm St                 18530
##  7 member        University Ave & 57th St          18066
##  8 member        Loomis St & Lexington St          17819
##  9 member        Ellis Ave & 60th St               16987
## 10 member        Broadway & Barry Ave              16836
## # ℹ 1,379 more rows
# Popular end station for 'member'
ride_2023V2 %>% group_by(member_casual,end_station_name) %>% filter(member_casual=="member") %>% 
  summarise(no_of_ride=n()) %>% arrange(desc(no_of_ride))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 1,397 × 3
## # Groups:   member_casual [1]
##    member_casual end_station_name             no_of_ride
##    <chr>         <chr>                             <int>
##  1 member        Clinton St & Washington Blvd      25483
##  2 member        Kingsbury St & Kinzie St          24314
##  3 member        Clark St & Elm St                 22963
##  4 member        Clinton St & Madison St           20446
##  5 member        Wells St & Concord Ln             20096
##  6 member        Wells St & Elm St                 18627
##  7 member        University Ave & 57th St          18370
##  8 member        Loomis St & Lexington St          17588
##  9 member        Broadway & Barry Ave              17501
## 10 member        State St & Chicago Ave            17231
## # ℹ 1,387 more rows
# Popular start station for 'casual'
ride_2023V2 %>% group_by(member_casual,start_station_name) %>% filter(member_casual=="casual") %>% 
  summarise(no_of_ride=n()) %>% arrange(desc(no_of_ride))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 1,467 × 3
## # Groups:   member_casual [1]
##    member_casual start_station_name                 no_of_ride
##    <chr>         <chr>                                   <int>
##  1 casual        Streeter Dr & Grand Ave                 43373
##  2 casual        DuSable Lake Shore Dr & Monroe St       28719
##  3 casual        Michigan Ave & Oak St                   21103
##  4 casual        Millennium Park                         18883
##  5 casual        DuSable Lake Shore Dr & North Blvd      18778
##  6 casual        Shedd Aquarium                          16959
##  7 casual        Theater on the Lake                     15059
##  8 casual        Dusable Harbor                          14552
##  9 casual        Adler Planetarium                       11307
## 10 casual        Montrose Harbor                         10907
## # ℹ 1,457 more rows
# Popular end station for 'casual'
ride_2023V2 %>% group_by(member_casual,end_station_name) %>% filter(member_casual=="casual") %>% 
  summarise(no_of_ride=n()) %>% arrange(desc(no_of_ride))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 1,495 × 3
## # Groups:   member_casual [1]
##    member_casual end_station_name                   no_of_ride
##    <chr>         <chr>                                   <int>
##  1 casual        Streeter Dr & Grand Ave                 47718
##  2 casual        DuSable Lake Shore Dr & Monroe St       26356
##  3 casual        Michigan Ave & Oak St                   22486
##  4 casual        DuSable Lake Shore Dr & North Blvd      22338
##  5 casual        Millennium Park                         21319
##  6 casual        Theater on the Lake                     16701
##  7 casual        Shedd Aquarium                          15242
##  8 casual        Dusable Harbor                          12870
##  9 casual        Wells St & Concord Ln                   10873
## 10 casual        Montrose Harbor                         10853
## # ℹ 1,485 more rows

Step 10 : Visualization

In this step, we plot the analysis for understanding data patterns and communicating insights.

# Total no of ride  per month
ride_2023V2 %>% group_by(month) %>% summarise(total_ride=n()) %>% 
  ggplot(aes(x=month,y=total_ride,fill = month))+geom_col()+
  labs(x="Month",y="No of Rides",title = "Total No of Rides by Month")+
  theme_minimal()  

# Total no of ride by rideable type
ride_2023V2 %>% group_by(rideable_type) %>% summarise(no_of_ride=n())%>%
  ggplot(aes(x=rideable_type,y=no_of_ride,fill =rideable_type)) + geom_col()+ 
  labs(x="Type of Ride",y="Count of Rides",title = "No of Rides by Rideable Type") +
  theme_minimal()+scale_fill_discrete(name="Ride Type")

# Total no of ride by member casual
ride_2023V2 %>% group_by(member_casual) %>% summarise(no_of_ride=n()) %>% 
  ggplot(aes(x=member_casual,y=no_of_ride,fill=member_casual))+
  geom_col()+labs(x= "Member Type",y="Count of Rides",title="No of Rides by Member ")+
  theme_minimal()+ scale_fill_discrete(name="Member Type")

# Total no of ride by rideable type and member casual
ride_2023V2 %>% group_by(member_casual,rideable_type) %>% summarise(no_of_ride=n()) %>% 
  ggplot(aes(x=rideable_type,y=no_of_ride,fill = member_casual))+geom_col(position = "dodge2")+
  labs(x="Ride Type",y="Count of Rides",title="No of Rides by Rideable Type and Member")+
  theme_minimal()+scale_fill_discrete(name="Member Type")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

# Total no of ride by month,rideable type and member
ride_2023V2 %>% group_by(month,member_casual,rideable_type) %>% 
  summarise(no_of_ride=n(),avg_ride_length=mean(ride_length_minutes)) %>% 
  ggplot(aes(x=month,y=no_of_ride,fill = rideable_type))+geom_col(position = "dodge2")+
  facet_wrap(~member_casual)+labs(x="Month",y="No of Rides",title="No of Ride by Month, Bike Type, Member")+
   scale_fill_discrete(name="Bike Type")+
  theme(
       plot.title = element_text(size = 14), # Increase title size 
       strip.text = element_text(size = 10,face = "bold"),  
       legend.position = "bottom"           
       )
## `summarise()` has grouped output by 'month', 'member_casual'. You can override
## using the `.groups` argument.

# Summarize Member(Member,Casual) Rides by Month
ride_2023V2 %>% group_by(member_casual,month) %>% 
  summarise(no_of_ride=n(),avg_ride_length=mean(ride_length_minutes)) %>% 
  ggplot(aes(x=month,y=no_of_ride,fill = member_casual))+geom_col(position = "dodge2")+
  labs(x="Month",y="Count of Rides",title = "Summarize Member Rides by Month")+
  facet_wrap(~member_casual)+scale_fill_discrete(name="Member Type")+
  theme(
    plot.title = element_text(size=14),
    strip.text = element_text(size=10,face="bold"),
    legend.position = "bottom"
  )
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

# Average ride duration (ride_length) by rideable type
ride_2023V2 %>% group_by(rideable_type) %>%
  summarise(avg_ride_length=mean(ride_length_minutes)) %>%
  ggplot(aes(x=rideable_type,y=avg_ride_length,fill = rideable_type))+ geom_col()+
  labs(x="Type of Rides",y="Average Ride Length",title = "Average Ride Duration by Ride Type")+
  theme_minimal()+scale_fill_discrete(name="Bike Type")

# Analyze the distribution of rides (ride_id) across different day_of_week
ride_2023V2 %>% group_by(day_of_week) %>% summarise(total_ride=n()) %>% 
  ggplot(aes(x=day_of_week,y=total_ride,fill = day_of_week))+geom_col()+
  labs(x="Week",y="Count of Rides",title = "Distribution of Rides Across Different Week")+
  scale_fill_discrete(name="Week") + theme(axis.text.x = element_text(angle=45,hjust=1))

# No of Rides by rideable types for each day of the week
ride_2023V2 %>% group_by(member_casual,rideable_type,day_of_week) %>% summarise(total_ride=n()) %>% 
  ggplot(aes(x=day_of_week,y=total_ride,fill = rideable_type))+geom_col(position = "dodge2")+
  facet_wrap(~ member_casual) +labs(x="Week",y="Count of Rides",title = " Rides by Rideable Types For Each Day of the Week")+
  theme(
    axis.text.x = element_text(angle = 45,hjust = 1),
    legend.position = "bottom"
    )
## `summarise()` has grouped output by 'member_casual', 'rideable_type'. You can
## override using the `.groups` argument.

# Compare ride lengths (ride_length) between member_casual users
ride_2023V2 %>% group_by(member_casual) %>% 
  summarise(avg_ride_length=mean(ride_length_minutes),total = n()) %>% 
  mutate(avg_ride_length_rounded = round(avg_ride_length, 2)) %>% 
  ggplot(aes(x=member_casual,y=total,fill = factor(avg_ride_length_rounded)))+geom_col()+
  labs(x="Member Types",y="No of Rides",title="Average Ride Length and Number of Rides by Member Type ")+
  theme_minimal() + scale_fill_discrete(name="Average Ride Length")

# Month-wise maximum number of rides in a day along with the corresponding day
rides_per_day %>%
  group_by(month) %>%
  summarise(max_ride_in_a_day = max(num_rides), day = which.max(num_rides)) %>% 
  select(month, max_ride_in_a_day, day) %>% 
  ggplot(aes(x=month,y=max_ride_in_a_day,fill = factor(day)))+geom_col(position = "dodge2")+
           labs(x="Month",y="Maximum Ride in a Day",title = "Month-Wise Maximum no of Rides in a Day")+
           theme_minimal() +  scale_fill_discrete(name = "MonthDay")

# Most popular start and end station 
# Top 10 Popular Start station by member_casual(member)
ride_2023V2 %>% group_by(member_casual, start_station_name) %>% filter(member_casual=="member")%>% summarise(no_of_ride=n()) %>%
  arrange(desc(no_of_ride)) %>% top_n(10) %>% 
  ggplot(aes(x=start_station_name,y=no_of_ride,fill = member_casual))+geom_col(position = "dodge2")+
  coord_flip() +labs(x="Start Station Name",y="No of Rides",title="Top 10 Popular Start Station by Member")+
  theme_minimal() + scale_fill_discrete(name="Member Type")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## Selecting by no_of_ride

# Top 10 Popular Start station by member_casual(casual)
ride_2023V2 %>% group_by(member_casual,start_station_name) %>% filter(member_casual=="casual") %>% 
  summarise(no_of_ride=n()) %>% arrange(desc(no_of_ride)) %>% top_n(10) %>% 
  ggplot(aes(x=start_station_name,y=no_of_ride,fill=member_casual))+geom_col(position = "dodge2")+coord_flip()+
  labs(x="Start Station Name",y="No of Rides",title = "Top 10 Popular Start Station by Member")+
  theme_minimal()+scale_fill_manual(name="Member Type",values = c("casual"="blue"))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## Selecting by no_of_ride

# Top 10 Popular End station by member_casual(member)
ride_2023V2 %>% group_by(member_casual,end_station_name) %>% filter(member_casual=="member") %>% summarise(no_of_ride=n()) %>% 
  arrange(desc(no_of_ride)) %>% top_n(10) %>% 
  ggplot(aes(x=end_station_name,y=no_of_ride,fill=member_casual))+geom_col(position = "dodge2")+coord_flip()+
  labs(x="End Station Name",y="No of Rides",title = "Top 10 Popular End Station by Member")+
  theme_minimal()+scale_fill_discrete(name="Member Type")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## Selecting by no_of_ride

# Top 10 Popular End station by member_casual(casual)
ride_2023V2 %>% group_by(member_casual,end_station_name) %>% filter(member_casual=="casual") %>% summarise(no_of_ride=n()) %>% 
  arrange(desc(no_of_ride)) %>% top_n(10) %>% 
  ggplot(aes(x=end_station_name,y=no_of_ride,fill = member_casual))+geom_col(position = "dodge2")+
  coord_flip()+labs(x="End Station Name",y="No of Rides",title = "Top 10 Popular End Station By Member")+
  theme_minimal()+scale_fill_manual(name="Member Type",values = c("casual"="blue"))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## Selecting by no_of_ride

Step 11: Export Combined DataFrame

Finally, we export the cleaned dataframe for further analysis to CSV format.

# Load the package to export
library(writexl)

# Export directly CSV file to local computer
write.csv(ride_2023V2,"ride_2023_V2.csv",row.names = FALSE)