knitr::opts_chunk$set(echo = TRUE)
library(data.table)
library(fs)
library(arrow)
library(tidyverse)
library(psych)
library(lubridate)
library(dplyr)
library(scales)
library(geosphere)
citibike_4everyone <- read_parquet("data_606_citiBike.parquet")
Answer:
Are the “golden hours” right before and after sunset predictive of Citi Bike biking patterns, defined by the start and stop location of each ride?
Citi Bike has automated tracking and data collection for all their shared bikes in NYC. The data is one row for every single ride and includes:
- a ride ID and bike type
- start and end timestamps
- start and end station names and IDs
- start and end latitude/longitude
- whether the rider is a member or a casual rider
While the data is at the individual ride level, it's stripped of all PII; the only identifier that remains to keep it a ride-level record is the ride ID Citi Bike assigns to each trip. The data is released to the public online and is made available for use under Citi Bike's Data License Agreement. The policy states that all members of the public have a non-exclusive license to use all data released publicly.
That policy governed my download of all files available for January 1st, 2023 through September 30th, 2024 from the Citi Bike System Data tool (Bike). I downloaded 10 zip files total: one for all of 2023 and nine for 2024, one per month from January through September. After a year ends, Citi Bike compresses it all together and makes it available as one zip file. I went folder by folder manually after downloading to see what was in them and then moved everything to one folder on my desktop. The code below starts from having all the downloaded files in one folder.
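As a hedged sketch (the download folder and zip file names are assumptions), that collect step could also be scripted with fs and purrr instead of doing it by hand:
bike_zips <- dir_ls("/Users/uwsthoughts/Downloads", glob = "*citibike*.zip")  # hypothetical download location
walk(bike_zips, ~ unzip(.x, exdir = "/Users/uwsthoughts/Desktop/citibike_files"))  # extract each archive into the one folder of CSVs used below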
# creating a dataframe that will tell me the average size, in bytes, of a row based on the first 1,000 rows of each file. This is so I can determine roughly
# how many rows of data will leave me with a 1GB CSV, which will then become much smaller as I convert it to .parquet and remove unneeded columns.
citi_csvs <- list.files("/Users/uwsthoughts/Desktop/citibike_files", full.names = TRUE, pattern = "\\.csv$")
size_df <- data.frame(
file_name = character(),
sample_size = integer(),
row_bytes = numeric(),
file_rows = integer(),
stringsAsFactors = FALSE
)
for (bike in citi_csvs) {
citi_sample <- fread(bike, nrows = 1000)
sample_bytes <- object.size(citi_sample)
row_bytes <- as.numeric(sample_bytes) / nrow(citi_sample)
total_rows <- fread(bike, select = 1)[, .N]
size_df <- rbind(
size_df,
data.frame(
file_name = basename(bike),
sample_size = nrow(citi_sample),
row_bytes = row_bytes,
file_rows = total_rows
)
)
}
size_df
# fs is a file/directory management library that can be used for tasks like copying, deleting, and getting file details.
# I'm using it here to pull out how many bytes are in 1GB, which is the target file size I want.
# bytes in 1GB
gb_bytes <- as.numeric(as_fs_bytes("1GB"))
# estimated total bytes across all the CSVs (average row size times total rows, summed)
bike_bytes <- sum(size_df$row_bytes * size_df$file_rows)
# target number of bytes to sample from each file, proportional to its share of the total, added as a column in the table
size_df$sample_bytes <- gb_bytes * (size_df$row_bytes * size_df$file_rows) / bike_bytes
# how many rows I need to get a proportional sample across uneven file sizes
size_df$sample_rows <- round(size_df$sample_bytes / size_df$row_bytes)
# printing out the sizes I got above and then the final size_df I can use to figure out a sample
cat("Total bytes in 1GB:\n")
print(gb_bytes)
cat("\nEstimated total bytes across all the CSVs:\n")
print(bike_bytes)
cat("\nHow many times larger that total is than 1GB:\n")
print(bike_bytes/gb_bytes)
cat("\nUpdated byte size tracker:\n")
size_df
# fread from data.table reads files quickly, and rbindlist performs memory-efficient concatenation, so everything can be processed with minimal compute
size_df$file_name <- file.path("/Users/uwsthoughts/Desktop/citibike_files", size_df$file_name)
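# read each CSV and take a random draw of its proportional share of rows (capped at the file's row count)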
biker_sample <- lapply(seq_len(nrow(size_df)), function(i) {
cruiser <- fread(size_df$file_name[i])
cruiser[sample(.N, min(size_df$sample_rows[i], .N))]
})
yellow_jersey <- rbindlist(biker_sample, fill = TRUE)
write_parquet(yellow_jersey, "data_606_citiBike.parquet")
write.csv(yellow_jersey, "data_606_citiBike.csv", row.names = FALSE)
Comparing file sizes for parquet versus CSV to show how much smaller files can become:
citi_parquet <- file_info("data_606_citiBike.parquet")
citi_csv <- file_info("data_606_citiBike.csv")
cat("Citi CSV file size:\n")
## Citi CSV file size:
print(citi_csv$size)
## 1G
cat("\nParquet file size:\n")
##
## Parquet file size:
print(citi_parquet$size)
## 253M
That's how you go from 14GB to 253MB of files. I can now save the Rmd, set eval = FALSE for most of the chunks above, and import the parquet file as a dataframe at the top. If you wanted to work with the 14GB of files directly, congrats! You're done. If you skipped that part and just reviewed the code, that's also fine. I will now proceed with the rest of the data preparation, which uses the 253MB file.
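A minimal sketch of that re-knit setup (the chunk label is my own placeholder): the heavy sampling chunks get eval = FALSE in their chunk headers, and the setup chunk reads the saved parquet back in.
# hypothetical chunk header for the sampling code above: {r build_sample, eval = FALSE}
citibike_4everyone <- arrow::read_parquet("data_606_citiBike.parquet")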
head(citibike_4everyone)
## ride_id rideable_type started_at ended_at
## <char> <char> <POSc> <POSc>
## 1: 78C1395890FA4A77 classic_bike 2023-01-18 15:24:25 2023-01-18 15:28:44
## 2: BAB67EB1A2B7554D classic_bike 2023-01-16 17:32:54 2023-01-16 17:40:58
## 3: E205DF08EB4CD442 classic_bike 2023-01-30 06:49:14 2023-01-30 06:56:59
## 4: B3998CCD21418531 classic_bike 2023-01-28 18:49:58 2023-01-28 18:53:29
## 5: 1F38547DF25FC252 classic_bike 2023-01-29 13:59:08 2023-01-29 14:29:20
## 6: EA8F763848AEB485 classic_bike 2023-01-12 15:35:10 2023-01-12 15:56:30
## start_station_name start_station_id end_station_name
## <char> <char> <char>
## 1: Steinway St & 21 Ave 7137.04 31 St & 23 Ave
## 2: Gansevoort St & Hudson St 6072.14 W 15 St & 6 Ave
## 3: 6 Ave & W 34 St 6364.10 E 33 St & 1 Ave
## 4: Rutgers St & Henry St 5230.02 Catherine St & Monroe St
## 5: W 43 St & 10 Ave 6756.01 Albany St & Greenwich St
## 6: W 63 St & Broadway 7052.01 Central Park W & W 91 St
## end_station_id start_lat start_lng end_lat end_lng member_casual
## <char> <num> <num> <num> <num> <char>
## 1: 7144.01 40.77480 -73.90379 40.77479 -73.91256 member
## 2: 5989.02 40.73945 -74.00507 40.73805 -73.99643 member
## 3: 6197.08 40.74964 -73.98805 40.74323 -73.97450 member
## 4: 5128.04 40.71332 -73.99010 40.71117 -73.99683 member
## 5: 5145.02 40.76009 -73.99462 40.70927 -74.01325 member
## 6: 7453.01 40.77164 -73.98261 40.78866 -73.96680 member
## Unnamed: 0 rideable_type_duplicate_column_name_1
## <int> <char>
## 1: NA <NA>
## 2: NA <NA>
## 3: NA <NA>
## 4: NA <NA>
## 5: NA <NA>
## 6: NA <NA>
Before I go any further, I'm going to drop all the columns I won't need. I'm interested in bike ride patterns, so I will keep ride_id to preserve the by-ride granularity, along with the start and end times, station names, and latitudes/longitudes. The rest can be dropped.
citibike_4everyone <- citibike_4everyone %>%
select(-rideable_type, -start_station_id, -end_station_id, -member_casual, -'Unnamed: 0', -rideable_type_duplicate_column_name_1)
cat("Head of new dataframe with dropped columns removed:\n")
## Head of new dataframe with dropped columns removed:
print(head(citibike_4everyone))
## ride_id started_at ended_at
## <char> <POSc> <POSc>
## 1: 78C1395890FA4A77 2023-01-18 15:24:25 2023-01-18 15:28:44
## 2: BAB67EB1A2B7554D 2023-01-16 17:32:54 2023-01-16 17:40:58
## 3: E205DF08EB4CD442 2023-01-30 06:49:14 2023-01-30 06:56:59
## 4: B3998CCD21418531 2023-01-28 18:49:58 2023-01-28 18:53:29
## 5: 1F38547DF25FC252 2023-01-29 13:59:08 2023-01-29 14:29:20
## 6: EA8F763848AEB485 2023-01-12 15:35:10 2023-01-12 15:56:30
## start_station_name end_station_name start_lat start_lng
## <char> <char> <num> <num>
## 1: Steinway St & 21 Ave 31 St & 23 Ave 40.77480 -73.90379
## 2: Gansevoort St & Hudson St W 15 St & 6 Ave 40.73945 -74.00507
## 3: 6 Ave & W 34 St E 33 St & 1 Ave 40.74964 -73.98805
## 4: Rutgers St & Henry St Catherine St & Monroe St 40.71332 -73.99010
## 5: W 43 St & 10 Ave Albany St & Greenwich St 40.76009 -73.99462
## 6: W 63 St & Broadway Central Park W & W 91 St 40.77164 -73.98261
## end_lat end_lng
## <num> <num>
## 1: 40.77479 -73.91256
## 2: 40.73805 -73.99643
## 3: 40.74323 -73.97450
## 4: 40.71117 -73.99683
## 5: 40.70927 -74.01325
## 6: 40.78866 -73.96680
cat("\n NA counts for new and smaller dataframe: \n")
##
## NA counts for new and smaller dataframe:
print(sapply(citibike_4everyone, function(x) sum(is.na(x))))
## ride_id started_at ended_at start_station_name
## 0 0 0 0
## end_station_name start_lat start_lng end_lat
## 0 0 0 2535
## end_lng
## 2535
cat("I will drop the NAs and set a new variable name to create a fallback point:\n")
## I will drop the NAs and set a new variable name to create a fallback point:
citibike_4life <- na.omit(citibike_4everyone)
I will change started_at and ended_at to actual date and time columns so that one of them can be used as the join key to the Time and Date AS dataset.
citibike_4life$started_at <- as.POSIXct(citibike_4life$started_at, format = "%Y-%m-%d %H:%M:%S")
citibike_4life$ended_at <- as.POSIXct(citibike_4life$ended_at, format = "%Y-%m-%d %H:%M:%S")
I can now pull in the time and date data and join it to the Citi Bike data on date after I remove fields I don’t need.
astro_2023 <- read_csv_arrow('data_606_astroevents_2023.csv')
astro_2024 <- read_csv_arrow('data_606_astroevents_2024.csv')
astro_soar <- rbind(astro_2023, astro_2024)
Time and Date AS is a group based in Stavanger, Norway that specializes in making various forms of data about time and time zones available to the public. Their developer portal includes free and paid data access, use, and download services, depending on how much you're willing to pay and how much data you need (Thorsen). For this project, I was interested in the different time measurements they have for sun-related events, which are accessed through the Astro Events service of their Astronomy API.
When you create an account, you get 500 free API credits that you can use on any service. I used mine on the Data Download: Astronomy service. Here's how you configure the settings:
You can only do 12 months per download request this way. I did two requests: one for the 12 months of 2023 (shown above) and a second for the 12 months of 2024. While the Citi Bike data is based on rides that already happened, the Time and Date data is based on events that can be precisely predicted in advance.
In this data, I'm interested in:
- Date
- Twi6End
- Set

Twi6 is the Time and Date AS API term for civil twilight, which they define as: "The Sun is just below the horizon, so there is generally enough natural light to carry out most outdoor activities." Twi6 usually ends about 20 minutes after sunset. For the purposes of this assignment, I will define the golden hour as the total time between one hour before sunset and the end of Twi6.
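As a minimal sketch of that definition (assuming the Set and Twi6End columns parse as times with lubridate's hms()), the golden-hour window for each date can be computed straight from the astronomy table:
golden_window <- astro_soar %>%
  mutate(
    golden_start = hms(Set) - minutes(60),  # one hour before sunset
    golden_end   = hms(Twi6End)             # end of civil twilight
  ) %>%
  select(Date, golden_start, golden_end)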
astro_soaring <- astro_soar %>%
select(Date, Twi6End, Set)
astro_soaring$Date <- as.Date(astro_soaring$Date, format = "%m/%d/%y")
Now I can join to the Citi Bike dataset using date. Each bike ride within any given day would have the same sunset and related data.
citibike_4life$date_only <- as.Date(citibike_4life$started_at)
astro_soaring$date_only <- as.Date(astro_soaring$Date)
citi_skies_df <- merge(citibike_4life, astro_soaring, by = "date_only", all.x = TRUE)
citi_skies_df <- citi_skies_df[!is.na(citi_skies_df$Set), ]
# parse the sunset time as a lubridate period and compute one hour before sunset
citi_skies_df$Set <- hms(citi_skies_df$Set)
citi_skies_df$minus_sixty <- citi_skies_df$Set - minutes(60)
# strip the period print format ("17H 30M 0S") back down to colon-separated time strings
citi_skies_df$Set <- gsub("H ", ":", gsub("M ", ":", gsub("S", "", citi_skies_df$Set)))
citi_skies_df$minus_sixty <- gsub("H ", ":", gsub("M ", ":", gsub("S", "", citi_skies_df$minus_sixty)))
What are the cases, and how many are there?
There are 4,990,991 cases, where a case is an individual bike ride.
This is an observational study.
The predictor variables are:
- Twi6End: quantitative
- minus_sixty: quantitative
The response variables are:
- started_at: quantitative
- ended_at: quantitative
- start_lat: quantitative
- end_lat: quantitative
Provide summary statistics for each of the variables. Also include appropriate visualizations related to your research question (e.g. scatter plots, boxplots, etc.). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
citi_skies_df$hour <- format(as.POSIXct(citi_skies_df$started_at), "%H")
hourly_counts <- citi_skies_df %>%
group_by(hour) %>%
summarise(count = n())
hourly_counts
## # A tibble: 24 × 2
## hour count
## <chr> <int>
## 1 00 69583
## 2 01 42553
## 3 02 27371
## 4 03 18215
## 5 04 16180
## 6 05 35301
## 7 06 96193
## 8 07 197550
## 9 08 302142
## 10 09 254515
## # ℹ 14 more rows
Bar chart of ride counts by hour:
ggplot(hourly_counts, aes(x = hour, y = count)) +
geom_col() +
scale_y_continuous(labels = comma) +
labs(x = "Hour of the Day", y = "Number of Rides", title = "Citi Bike Rides per Hour") +
theme_minimal()
Average and median bike ride durations during the golden hour:
citi_skies_df$Set_time <- as.POSIXct(paste(citi_skies_df$date_only, citi_skies_df$Set), format = "%Y-%m-%d %H:%M:%S")
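# flag rides that start or end inside the window used here: sunset +/- 60 minutes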
citi_skies_df$golden_hour_start <- citi_skies_df$started_at >= (citi_skies_df$Set_time - minutes(60)) &
citi_skies_df$started_at <= (citi_skies_df$Set_time + minutes(60))
citi_skies_df$golden_hour_end <- citi_skies_df$ended_at >= (citi_skies_df$Set_time - minutes(60)) &
citi_skies_df$ended_at <= (citi_skies_df$Set_time + minutes(60))
golden_hour_stats <- citi_skies_df %>%
filter(golden_hour_start | golden_hour_end) %>%
summarise(
avg_duration = mean(difftime(ended_at, started_at, units = "mins")),
median_duration = median(difftime(ended_at, started_at, units = "mins")),
total_rides = n()
)
print(golden_hour_stats)
## avg_duration median_duration total_rides
## 1 16.94196 mins 10.1572 mins 200432
Most frequent start and end locations for rides during the golden hour:
golden_hour_locations <- citi_skies_df %>%
filter(golden_hour_start | golden_hour_end) %>%
group_by(start_station_name, end_station_name) %>%
summarise(count = n()) %>%
arrange(desc(count))
print(golden_hour_locations)
## # A tibble: 130,388 × 3
## # Groups: start_station_name [2,261]
## start_station_name end_station_name count
## <chr> <chr> <int>
## 1 "" "" 170
## 2 "Greenwich St & W Houston St" "Pier 40 - Hudson River Park" 61
## 3 "Brooklyn Bridge Park - Pier 2" "Brooklyn Bridge Park - Pier 2" 59
## 4 "Eastern Pkwy & Kingston Ave" "Eastern Pkwy & Kingston Ave" 48
## 5 "W 22 St & 8 Ave" "Broadway & W 29 St" 40
## 6 "River Ter & Warren St" "Vesey Pl & River Terrace" 36
## 7 "W 47 St & 10 Ave" "W 47 St & 9 Ave" 33
## 8 "W 70 St & Amsterdam Ave" "Riverside Blvd & W 67 St" 32
## 9 "Amsterdam Ave & W 73 St" "Amsterdam Ave & W 79 St" 31
## 10 "Pier 40 - Hudson River Park" "West St & Chambers St" 31
## # ℹ 130,378 more rows
Average and median distances during the golden hour:
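# straight-line (haversine) distance between each ride's start and end coordinates, converted from meters to kilometers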
citi_skies_df$ride_distance <- distHaversine(
matrix(c(citi_skies_df$start_lng, citi_skies_df$start_lat), ncol = 2),
matrix(c(citi_skies_df$end_lng, citi_skies_df$end_lat), ncol = 2)
) / 1000
golden_hour_distances <- citi_skies_df %>%
filter(golden_hour_start | golden_hour_end) %>%
summarise(
avg_distance = mean(ride_distance, na.rm = TRUE),
median_distance = median(ride_distance, na.rm = TRUE)
)
print(golden_hour_distances)
## avg_distance median_distance
## 1 2.236369 1.602731