BikeShare_Results

Key Steps in Date-Time Conversion

1. Importing and Cleaning Data

The project began with importing four quarterly datasets using read_csv(). Since column names differed across datasets, I standardized them using rename() from the dplyr package.

Loading required packages

library(here)

## here() starts at C:/Users/oscar

library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(skimr)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)

Importing data

q1_2019 <- read_csv("Divvy_Trips_2019_Q1.csv")

## Rows: 365069 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): from_station_name, to_station_name, usertype, gender
## dbl  (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## num  (1): tripduration
## dttm (2): start_time, end_time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q2_2019i <- read_csv("Divvy_Trips_2019_Q2.csv")

## Rows: 1108163 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): 03 - Rental Start Station Name, 02 - Rental End Station Name, User...
## dbl  (5): 01 - Rental Details Rental ID, 01 - Rental Details Bike ID, 03 - R...
## num  (1): 01 - Rental Details Duration In Seconds Uncapped
## dttm (2): 01 - Rental Details Local Start Time, 01 - Rental Details Local En...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q3_2019 <- read_csv("Divvy_Trips_2019_Q3.csv")

## Rows: 1640718 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): from_station_name, to_station_name, usertype, gender
## dbl  (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## num  (1): tripduration
## dttm (2): start_time, end_time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q4_2019 <- read_csv("Divvy_Trips_2019_Q4.csv")

## Rows: 704054 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): from_station_name, to_station_name, usertype, gender
## dbl  (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## num  (1): tripduration
## dttm (2): start_time, end_time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Inspecting and comparing column names

colnames(q1_2019)

##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"

colnames(q2_2019i)

##  [1] "01 - Rental Details Rental ID"                   
##  [2] "01 - Rental Details Local Start Time"            
##  [3] "01 - Rental Details Local End Time"              
##  [4] "01 - Rental Details Bike ID"                     
##  [5] "01 - Rental Details Duration In Seconds Uncapped"
##  [6] "03 - Rental Start Station ID"                    
##  [7] "03 - Rental Start Station Name"                  
##  [8] "02 - Rental End Station ID"                      
##  [9] "02 - Rental End Station Name"                    
## [10] "User Type"                                       
## [11] "Member Gender"                                   
## [12] "05 - Member Details Member Birthday Year"

colnames(q3_2019)

##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"

colnames(q4_2019)

##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"

Renaming column names to make them consistent

q2_2019 <- q2_2019i %>% 
  rename(trip_id = '01 - Rental Details Rental ID',
         start_time = '01 - Rental Details Local Start Time',
         end_time = '01 - Rental Details Local End Time',
         bikeid = '01 - Rental Details Bike ID',
         tripduration = '01 - Rental Details Duration In Seconds Uncapped',
         from_station_id = '03 - Rental Start Station ID',
         from_station_name = '03 - Rental Start Station Name',
         to_station_id = '02 - Rental End Station ID',
         to_station_name = '02 - Rental End Station Name',
         usertype = 'User Type',
         gender = 'Member Gender',
         birthyear = '05 - Member Details Member Birthday Year')

Inspecting dataframes

str(q1_2019)

## spc_tbl_ [365,069 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : num [1:365069] 21742443 21742444 21742445 21742446 21742447 ...
##  $ start_time       : POSIXct[1:365069], format: "2019-01-01 00:04:37" "2019-01-01 00:08:13" ...
##  $ end_time         : POSIXct[1:365069], format: "2019-01-01 00:11:07" "2019-01-01 00:15:34" ...
##  $ bikeid           : num [1:365069] 2167 4386 1524 252 1170 ...
##  $ tripduration     : num [1:365069] 390 441 829 1783 364 ...
##  $ from_station_id  : num [1:365069] 199 44 15 123 173 98 98 211 150 268 ...
##  $ from_station_name: chr [1:365069] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ to_station_id    : num [1:365069] 84 624 644 176 35 49 49 142 148 141 ...
##  $ to_station_name  : chr [1:365069] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ usertype         : chr [1:365069] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender           : chr [1:365069] "Male" "Female" "Female" "Male" ...
##  $ birthyear        : num [1:365069] 1989 1990 1994 1993 1994 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   trip_id = col_double(),
##   ..   start_time = col_datetime(format = ""),
##   ..   end_time = col_datetime(format = ""),
##   ..   bikeid = col_double(),
##   ..   tripduration = col_number(),
##   ..   from_station_id = col_double(),
##   ..   from_station_name = col_character(),
##   ..   to_station_id = col_double(),
##   ..   to_station_name = col_character(),
##   ..   usertype = col_character(),
##   ..   gender = col_character(),
##   ..   birthyear = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(q2_2019)

## spc_tbl_ [1,108,163 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : num [1:1108163] 22178529 22178530 22178531 22178532 22178533 ...
##  $ start_time       : POSIXct[1:1108163], format: "2019-04-01 00:02:22" "2019-04-01 00:03:02" ...
##  $ end_time         : POSIXct[1:1108163], format: "2019-04-01 00:09:48" "2019-04-01 00:20:30" ...
##  $ bikeid           : num [1:1108163] 6251 6226 5649 4151 3270 ...
##  $ tripduration     : num [1:1108163] 446 1048 252 357 1007 ...
##  $ from_station_id  : num [1:1108163] 81 317 283 26 202 420 503 260 211 211 ...
##  $ from_station_name: chr [1:1108163] "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
##  $ to_station_id    : num [1:1108163] 56 59 174 133 129 426 500 499 211 211 ...
##  $ to_station_name  : chr [1:1108163] "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
##  $ usertype         : chr [1:1108163] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender           : chr [1:1108163] "Male" "Female" "Male" "Male" ...
##  $ birthyear        : num [1:1108163] 1975 1984 1990 1993 1992 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `01 - Rental Details Rental ID` = col_double(),
##   ..   `01 - Rental Details Local Start Time` = col_datetime(format = ""),
##   ..   `01 - Rental Details Local End Time` = col_datetime(format = ""),
##   ..   `01 - Rental Details Bike ID` = col_double(),
##   ..   `01 - Rental Details Duration In Seconds Uncapped` = col_number(),
##   ..   `03 - Rental Start Station ID` = col_double(),
##   ..   `03 - Rental Start Station Name` = col_character(),
##   ..   `02 - Rental End Station ID` = col_double(),
##   ..   `02 - Rental End Station Name` = col_character(),
##   ..   `User Type` = col_character(),
##   ..   `Member Gender` = col_character(),
##   ..   `05 - Member Details Member Birthday Year` = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(q3_2019)

## spc_tbl_ [1,640,718 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : num [1:1640718] 23479388 23479389 23479390 23479391 23479392 ...
##  $ start_time       : POSIXct[1:1640718], format: "2019-07-01 00:00:27" "2019-07-01 00:01:16" ...
##  $ end_time         : POSIXct[1:1640718], format: "2019-07-01 00:20:41" "2019-07-01 00:18:44" ...
##  $ bikeid           : num [1:1640718] 3591 5353 6180 5540 6014 ...
##  $ tripduration     : num [1:1640718] 1214 1048 1554 1503 1213 ...
##  $ from_station_id  : num [1:1640718] 117 381 313 313 168 300 168 313 43 43 ...
##  $ from_station_name: chr [1:1640718] "Wilton Ave & Belmont Ave" "Western Ave & Monroe St" "Lakeview Ave & Fullerton Pkwy" "Lakeview Ave & Fullerton Pkwy" ...
##  $ to_station_id    : num [1:1640718] 497 203 144 144 62 232 62 144 195 195 ...
##  $ to_station_name  : chr [1:1640718] "Kimball Ave & Belmont Ave" "Western Ave & 21st St" "Larrabee St & Webster Ave" "Larrabee St & Webster Ave" ...
##  $ usertype         : chr [1:1640718] "Subscriber" "Customer" "Customer" "Customer" ...
##  $ gender           : chr [1:1640718] "Male" NA NA NA ...
##  $ birthyear        : num [1:1640718] 1992 NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   trip_id = col_double(),
##   ..   start_time = col_datetime(format = ""),
##   ..   end_time = col_datetime(format = ""),
##   ..   bikeid = col_double(),
##   ..   tripduration = col_number(),
##   ..   from_station_id = col_double(),
##   ..   from_station_name = col_character(),
##   ..   to_station_id = col_double(),
##   ..   to_station_name = col_character(),
##   ..   usertype = col_character(),
##   ..   gender = col_character(),
##   ..   birthyear = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(q4_2019)

## spc_tbl_ [704,054 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : num [1:704054] 25223640 25223641 25223642 25223643 25223644 ...
##  $ start_time       : POSIXct[1:704054], format: "2019-10-01 00:01:39" "2019-10-01 00:02:16" ...
##  $ end_time         : POSIXct[1:704054], format: "2019-10-01 00:17:20" "2019-10-01 00:06:34" ...
##  $ bikeid           : num [1:704054] 2215 6328 3003 3275 5294 ...
##  $ tripduration     : num [1:704054] 940 258 850 2350 1867 ...
##  $ from_station_id  : num [1:704054] 20 19 84 313 210 156 84 156 156 336 ...
##  $ from_station_name: chr [1:704054] "Sheffield Ave & Kingsbury St" "Throop (Loomis) St & Taylor St" "Milwaukee Ave & Grand Ave" "Lakeview Ave & Fullerton Pkwy" ...
##  $ to_station_id    : num [1:704054] 309 241 199 290 382 226 142 463 463 336 ...
##  $ to_station_name  : chr [1:704054] "Leavitt St & Armitage Ave" "Morgan St & Polk St" "Wabash Ave & Grand Ave" "Kedzie Ave & Palmer Ct" ...
##  $ usertype         : chr [1:704054] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender           : chr [1:704054] "Male" "Male" "Female" "Male" ...
##  $ birthyear        : num [1:704054] 1987 1998 1991 1990 1987 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   trip_id = col_double(),
##   ..   start_time = col_datetime(format = ""),
##   ..   end_time = col_datetime(format = ""),
##   ..   bikeid = col_double(),
##   ..   tripduration = col_number(),
##   ..   from_station_id = col_double(),
##   ..   from_station_name = col_character(),
##   ..   to_station_id = col_double(),
##   ..   to_station_name = col_character(),
##   ..   usertype = col_character(),
##   ..   gender = col_character(),
##   ..   birthyear = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Converting trip_id and bike_id to character data type

q1_2019 <- mutate(q1_2019, trip_id = as.character(trip_id), bikeid = as.character(bikeid))
q2_2019 <- mutate(q2_2019, trip_id = as.character(trip_id), bikeid = as.character(bikeid))
q3_2019 <- mutate(q3_2019, trip_id = as.character(trip_id), bikeid = as.character(bikeid))
q4_2019 <- mutate(q4_2019, trip_id = as.character(trip_id), bikeid = as.character(bikeid))

Combining all data frames to one

TPdt_19 <- bind_rows(q1_2019, q2_2019, q3_2019, q4_2019)

2. Converting and Standardizing Date-Time Fields

The dataset contained start_time and end_time columns, which were initially in string format. To facilitate time-based calculations, I converted them into Date format:

TPdt_19$date <- as.Date(TPdt_19$start_time)

Additionally, I extracted specific time-related attributes to improve analysis:

TPdt_19$month <- format(as.Date(TPdt_19$date), '%m')
TPdt_19$day <- format(as.Date(TPdt_19$date), '%d')
TPdt_19$year <- format(as.Date(TPdt_19$date), '%Y')
TPdt_19$day_of_week <- format(as.Date(TPdt_19$date), '%A')

This transformation allowed for flexible grouping and trend analysis based on weekdays, months, and years.

3. Calculating Ride Duration

To measure trip durations, I computed ride_length using the difftime() function:

TPdt_19$ride_length <- difftime(TPdt_19$end_time, TPdt_19$start_time)
str(TPdt_19$ride_length)

##  'difftime' num [1:3818004] 6.5 7.35 13.8166666666667 29.7166666666667 ...
##  - attr(*, "units")= chr "mins"

Since ride_length was initially stored as a factor, I converted it into a numeric format for proper calculations:

TPdt_19d$ride_length <- as.numeric(as.character(TPdt_19d$ride_length))

4. Filtering and Removing Invalid Data

To ensure data integrity, I filtered out erroneous records, such as rides with negative durations and specific invalid station names:

skim_without_charts(TPdt_19)

Data summary
Name	TPdt_19
Number of rows	3818004
Number of columns	18
_______________________
Column type frequency:
character	10
Date	1
difftime	1
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
trip_id	0	1.00	7	8	3818004
bikeid	0	1.00	1	4	6017
from_station_name	0	1.00	10	43	640
to_station_name	0	1.00	10	43	641
usertype	0	1.00	8	10	2
gender	559206	0.85	4	6	2
month	0	1.00	2	2	12
day	0	1.00	2	2	31
year	0	1.00	4	4	1
day_of_week	0	1.00	6	9	7

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
date	0	1	2019-01-01	2019-12-31	2019-07-25	365

Variable type: difftime

skim_variable	n_missing	complete_rate	min	max	median	n_unique
ride_length	0	1	-56.37 mins	177200.4 mins	11.82 mins	21577

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
tripduration	0	1.00	1450.20	29854.14	61	411	709	1283	10628400
from_station_id	0	1.00	201.67	156.08	1	77	174	289	673
to_station_id	0	1.00	202.64	156.24	1	77	174	291	673
birthyear	538751	0.86	1984.07	10.87	1759	1979	1987	1992	2014

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
start_time	0	1	2019-01-01 00:04:37	2019-12-31 23:57:17	2019-07-25 17:50:54	3306090
end_time	0	1	2019-01-01 00:11:07	2020-01-21 13:54:35	2019-07-25 18:12:23	3238342

TPdt_19 <- TPdt_19[!(TPdt_19$from_station_name == "HQ QR" | TPdt_19$ride_length < 0),]
skim_without_charts(TPdt_19)

Data summary
Name	TPdt_19
Number of rows	3817991
Number of columns	18
_______________________
Column type frequency:
character	10
Date	1
difftime	1
numeric	4
POSIXct	2
________________________
Group variables	None