# Read the Divvy Q1 2019 trip export from the working directory
raw_data <- read.csv(file = "Divvy_Trips_2019_Q1.csv")
# Attach the tidyverse; the script uses its %>%, drop_na() and distinct()
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Inspect the column names, types and leading values of the raw import
str(object = raw_data)
## 'data.frame': 365069 obs. of 12 variables:
## $ trip_id : int 21742443 21742444 21742445 21742446 21742447 21742448 21742449 21742450 21742451 21742452 ...
## $ start_time : chr "2019-01-01 0:04:37" "2019-01-01 0:08:13" "2019-01-01 0:13:23" "2019-01-01 0:13:45" ...
## $ end_time : chr "2019-01-01 0:11:07" "2019-01-01 0:15:34" "2019-01-01 0:27:12" "2019-01-01 0:43:28" ...
## $ bikeid : int 2167 4386 1524 252 1170 2437 2708 2796 6205 3939 ...
## $ tripduration : chr "390" "441" "829" "1,783.00" ...
## $ from_station_id : int 199 44 15 123 173 98 98 211 150 268 ...
## $ from_station_name: chr "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ to_station_id : int 84 624 644 176 35 49 49 142 148 141 ...
## $ to_station_name : chr "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ usertype : chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr "Male" "Female" "Female" "Male" ...
## $ birthyear : int 1989 1990 1994 1993 1994 1983 1984 1990 1995 1996 ...
# Keep only complete rows: drop_na() discards every row that contains an NA.
# Empty strings are not NA, so rows with a blank gender survive this step
# and are filtered out separately later in the script.
data_cleaned <- drop_na(raw_data)
# Inspect the structure of the NA-free data
str(object = data_cleaned)
## 'data.frame': 347046 obs. of 12 variables:
## $ trip_id : int 21742443 21742444 21742445 21742446 21742447 21742448 21742449 21742450 21742451 21742452 ...
## $ start_time : chr "2019-01-01 0:04:37" "2019-01-01 0:08:13" "2019-01-01 0:13:23" "2019-01-01 0:13:45" ...
## $ end_time : chr "2019-01-01 0:11:07" "2019-01-01 0:15:34" "2019-01-01 0:27:12" "2019-01-01 0:43:28" ...
## $ bikeid : int 2167 4386 1524 252 1170 2437 2708 2796 6205 3939 ...
## $ tripduration : chr "390" "441" "829" "1,783.00" ...
## $ from_station_id : int 199 44 15 123 173 98 98 211 150 268 ...
## $ from_station_name: chr "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ to_station_id : int 84 624 644 176 35 49 49 142 148 141 ...
## $ to_station_name : chr "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ usertype : chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr "Male" "Female" "Female" "Male" ...
## $ birthyear : int 1989 1990 1994 1993 1994 1983 1984 1990 1995 1996 ...
# Row count remaining after the NA filter
dim(data_cleaned)[1]
## [1] 347046
# Remove exact duplicate rows from the NA-free data
data <- data_cleaned %>% distinct()
# Convert start_time and end_time from character to POSIXct.
# NOTE(review): no `tz` is supplied, so the timestamps are parsed in the
# session's time zone -- confirm that matches the zone the data was logged in.
data$start_time <- as.POSIXct(data$start_time, format = "%Y-%m-%d %H:%M:%S")
data$end_time <- as.POSIXct(data$end_time, format = "%Y-%m-%d %H:%M:%S")
# Recompute the trip duration in seconds from the parsed timestamps,
# replacing the character column read from the CSV (e.g. "1,783.00").
# The units are requested explicitly: subtracting two POSIXct vectors lets
# difftime pick its unit from the smallest difference, so multiplying the
# result by 60 is only correct when that automatic unit happens to be minutes.
data$tripduration <- as.numeric(
  difftime(data$end_time, data$start_time, units = "secs")
)
# Check the structure to confirm the conversion
str(data)
## 'data.frame': 347046 obs. of 12 variables:
## $ trip_id : int 21742443 21742444 21742445 21742446 21742447 21742448 21742449 21742450 21742451 21742452 ...
## $ start_time : POSIXct, format: "2019-01-01 00:04:37" "2019-01-01 00:08:13" ...
## $ end_time : POSIXct, format: "2019-01-01 00:11:07" "2019-01-01 00:15:34" ...
## $ bikeid : int 2167 4386 1524 252 1170 2437 2708 2796 6205 3939 ...
## $ tripduration : num 390 441 829 1783 364 ...
## $ from_station_id : int 199 44 15 123 173 98 98 211 150 268 ...
## $ from_station_name: chr "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ to_station_id : int 84 624 644 176 35 49 49 142 148 141 ...
## $ to_station_name : chr "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ usertype : chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr "Male" "Female" "Female" "Male" ...
## $ birthyear : int 1989 1990 1994 1993 1994 1983 1984 1990 1995 1996 ...
# Longest trip duration across all riders
max_duration <- with(data, max(tripduration, na.rm = TRUE))
max_duration
## [1] 6096428
# Longest trip duration among casual riders (usertype "Customer")
max_casual_duration <- with(
  data,
  max(tripduration[usertype == "Customer"], na.rm = TRUE)
)
max_casual_duration
## [1] 477724
# Mean trip duration for casual riders
ave_duration_casual <- with(
  data,
  mean(tripduration[usertype == "Customer"], na.rm = TRUE)
)
ave_duration_casual
## [1] 2212.131
# Mean trip duration for subscribers
ave_duration_subscribers <- with(
  data,
  mean(tripduration[usertype == "Subscriber"], na.rm = TRUE)
)
ave_duration_subscribers
## [1] 833.2299
# Create a new column with the English weekday name of the start time.
# weekdays() returns names in the session locale; in a non-English locale
# they would not match the English labels this script uses for the weekday
# factor levels and plot axis. The name is therefore looked up from the
# locale-independent POSIXlt day-of-week index (0 = Sunday ... 6 = Saturday).
data$weekday <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)[as.POSIXlt(data$start_time)$wday + 1L]
# Rides per user type (rows) and weekday (columns, alphabetical here)
table(data$usertype, data$weekday)
##
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## Customer 888 577 1355 985 795 794 643
## Subscriber 59543 48372 29215 24145 63831 58125 57778
# Turn weekday into a factor ordered Monday-to-Sunday so that the table
# columns (and therefore the bars) follow the calendar week
data$weekday <- factor(
  data$weekday,
  levels = c(
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday"
  )
)
daily_counts <- table(data$usertype, data$weekday)
# Grouped bar chart: one pair of bars (casual, subscriber) per weekday.
# The legend labels are hard-coded and rely on the table rows being in
# sorted order ("Customer" before "Subscriber").
barplot(
  daily_counts,
  main = "Daily Rides: Subscribers vs Casual Riders",
  xlab = "Day of the Week",
  ylab = "Number of Rides",
  beside = TRUE,                        # bars side by side, not stacked
  col = c("lightblue", "lightgreen"),   # one colour per user type
  names.arg = levels(data$weekday),     # axis labels in factor-level order
  cex.names = 1,
  legend.text = c("Casual Rides", "Subscribers"),
  args.legend = list(x = "topright", bty = "n")
)

# Strip surrounding whitespace so whitespace-only genders collapse to ""
data$gender <- trimws(data$gender, which = "both")
# Drop the rows whose gender is empty after trimming.
# NOTE: this filter runs after the duration and weekday summaries earlier in
# the script, so those figures still include the blank-gender rows.
data <- data[!(data$gender == ""), ]
# Rides per user type
table(data[["usertype"]])
##
## Customer Subscriber
## 5934 339423
# Rides per gender
table(data[["gender"]])
##
## Female Male
## 66918 278439
# Rides taken by female subscribers
nFemale_Subscribers <- with(
  data,
  as.numeric(sum(usertype == "Subscriber" & gender == "Female", na.rm = TRUE))
)
nFemale_Subscribers
## [1] 65043
# Rides taken by male subscribers
nMale_Subscribers <- with(
  data,
  as.numeric(sum(usertype == "Subscriber" & gender == "Male", na.rm = TRUE))
)
nMale_Subscribers
## [1] 274380
# Three start stations with the most rides
station_counts <- table(data$from_station_name)
top_3_stations <- station_counts %>%
  sort(decreasing = TRUE) %>%
  head(3)
top_3_stations
##
## Clinton St & Washington Blvd Clinton St & Madison St
## 7596 6329
## Canal St & Adams St
## 6254
# Restrict to rides taken by subscribers
subscriber_data <- data[data$usertype == "Subscriber", ]
# Tally subscriber rides by start station
subscriber_station_counts <- table(subscriber_data[["from_station_name"]])
# Name of the start station with the largest tally (first one on ties)
most_used_by_subscribers <- subscriber_station_counts %>%
  which.max() %>%
  names()
# Show that station together with its ride count
subscriber_station_counts[most_used_by_subscribers]
## Clinton St & Washington Blvd
## 7563
# Restrict to rides taken by casual riders (usertype "Customer")
casual_data <- data[data$usertype == "Customer", ]
# Tally casual rides by start station
casual_station_counts <- table(casual_data[["from_station_name"]])
# Name of the start station with the largest tally (first one on ties)
most_used_by_casuals <- casual_station_counts %>%
  which.max() %>%
  names()
# Show that station together with its ride count
casual_station_counts[most_used_by_casuals]
## Lake Shore Dr & Monroe St
## 280
# Histogram of subscribers' birth years in 10-year bins
subscribers_data <- data[data$usertype == "Subscriber", ]
# Range of birth years among subscribers
birthyear_range <- range(subscribers_data$birthyear, na.rm = TRUE)
# Bin width in years
bin_width <- 10
# Breaks from the decade at or below the minimum to the decade at or above
# the maximum
breaks_seq <- seq(floor(birthyear_range[1] / bin_width) * bin_width,
                  ceiling(birthyear_range[2] / bin_width) * bin_width,
                  by = bin_width)
# Pad one extra bin on each side of the computed breaks
breaks_seq <- c(breaks_seq[1] - bin_width,
                breaks_seq,
                breaks_seq[length(breaks_seq)] + bin_width)
# Plot the subscribers only: the title, y-axis label and breaks all describe
# subscribers, so the histogram must not be fed the birth years of all riders.
hist_data <- hist(subscribers_data$birthyear, breaks = breaks_seq,
                  main = "Histogram of Subscribers' Birth Year",
                  xlab = "Birth Year",
                  ylab = "Subscribers",
                  col = "lightblue",
                  border = "black")
# Label every bar with its count; text() is vectorized, so a single call
# places all labels just above the bar tops (pos = 3)
text(hist_data$mids, hist_data$counts,
     labels = as.character(hist_data$counts),
     pos = 3, cex = 0.8, col = "black")

# Restrict to rides taken by casual riders (usertype "Customer")
casuals_data <- data[data$usertype == "Customer", ]
# Range of birth years among casual riders
birthyear_range_casuals <- range(casuals_data$birthyear, na.rm = TRUE)
# Breaks from the bin boundary at or below the minimum to the one at or above
# the maximum, stepping by bin_width (set earlier in the script)
breaks_seq_casuals <- seq(
  from = floor(birthyear_range_casuals[1] / bin_width) * bin_width,
  to = ceiling(birthyear_range_casuals[2] / bin_width) * bin_width,
  by = bin_width
)
# Pad one extra bin on each side of the computed breaks
breaks_seq_casuals <- c(
  breaks_seq_casuals[1] - bin_width,
  breaks_seq_casuals,
  breaks_seq_casuals[length(breaks_seq_casuals)] + bin_width
)
# Draw the histogram for casual riders and keep the bin statistics
hist_data_casuals <- hist(
  casuals_data$birthyear,
  breaks = breaks_seq_casuals,
  col = "lightgreen",
  border = "black",
  main = "Histogram of Birth Year for Casual Customers",
  xlab = "Birth Year",
  ylab = "Casual Customers"
)
# Label every bar with its count, placed just above the bar top (pos = 3)
text(
  x = hist_data_casuals$mids,
  y = hist_data_casuals$counts,
  labels = as.character(hist_data_casuals$counts),
  pos = 3, cex = 0.8, col = "black"
)

# Cross-tabulate user type (rows) against gender (columns)
gender_counts <- table(data$usertype, data$gender)
# Grouped bar chart: one pair of bars (customer, subscriber) per gender.
# The legend labels are hard-coded and rely on the table rows being in
# sorted order ("Customer" before "Subscriber").
barplot(gender_counts,
        beside = TRUE, # Place bars beside each other
        col = c("lightblue", "lightgreen"), # Colors for each bar
        main = "Gender Comparison: Subscribers vs Customers",
        xlab = "Gender",
        ylab = "Number of Users",
        legend.text = c("Customers", "Subscribers"),
        # legend() ignores `y` whenever `x` is a position keyword, so the
        # former y = "top" had no effect and is dropped; the legend stays
        # centred in the plot region exactly as before.
        args.legend = list(x = "center", bty = "n"))
