# Read the trip CSV (path is relative to the working directory) into a
# data frame.
raw_data <- read.csv(file = "Divvy_Trips_2019_Q1.csv")

#Load the library necessary for analysis
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Print a compact overview of the raw data: each column's name, type and
# first few values.
utils::str(raw_data)
## 'data.frame':    365069 obs. of  12 variables:
##  $ trip_id          : int  21742443 21742444 21742445 21742446 21742447 21742448 21742449 21742450 21742451 21742452 ...
##  $ start_time       : chr  "2019-01-01 0:04:37" "2019-01-01 0:08:13" "2019-01-01 0:13:23" "2019-01-01 0:13:45" ...
##  $ end_time         : chr  "2019-01-01 0:11:07" "2019-01-01 0:15:34" "2019-01-01 0:27:12" "2019-01-01 0:43:28" ...
##  $ bikeid           : int  2167 4386 1524 252 1170 2437 2708 2796 6205 3939 ...
##  $ tripduration     : chr  "390" "441" "829" "1,783.00" ...
##  $ from_station_id  : int  199 44 15 123 173 98 98 211 150 268 ...
##  $ from_station_name: chr  "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ to_station_id    : int  84 624 644 176 35 49 49 142 148 141 ...
##  $ to_station_name  : chr  "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ usertype         : chr  "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender           : chr  "Male" "Female" "Female" "Male" ...
##  $ birthyear        : int  1989 1990 1994 1993 1994 1983 1984 1990 1995 1996 ...

# Keep only the rows that have no NA in any column. Empty strings are not
# NA, so blank text fields are not removed by this step.
data_cleaned <- tidyr::drop_na(raw_data)

# Show the structure of the filtered data frame
utils::str(data_cleaned)
## 'data.frame':    347046 obs. of  12 variables:
##  $ trip_id          : int  21742443 21742444 21742445 21742446 21742447 21742448 21742449 21742450 21742451 21742452 ...
##  $ start_time       : chr  "2019-01-01 0:04:37" "2019-01-01 0:08:13" "2019-01-01 0:13:23" "2019-01-01 0:13:45" ...
##  $ end_time         : chr  "2019-01-01 0:11:07" "2019-01-01 0:15:34" "2019-01-01 0:27:12" "2019-01-01 0:43:28" ...
##  $ bikeid           : int  2167 4386 1524 252 1170 2437 2708 2796 6205 3939 ...
##  $ tripduration     : chr  "390" "441" "829" "1,783.00" ...
##  $ from_station_id  : int  199 44 15 123 173 98 98 211 150 268 ...
##  $ from_station_name: chr  "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ to_station_id    : int  84 624 644 176 35 49 49 142 148 141 ...
##  $ to_station_name  : chr  "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ usertype         : chr  "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender           : chr  "Male" "Female" "Female" "Male" ...
##  $ birthyear        : int  1989 1990 1994 1993 1994 1983 1984 1990 1995 1996 ...
# Number of rows left after the rows containing NA were dropped
dim(data_cleaned)[1]
## [1] 347046
# Drop exact duplicate rows
data <- data_cleaned %>% distinct()

# Parse the timestamp strings into POSIXct. No `tz` is supplied, so the
# session time zone is used.
data$start_time <- as.POSIXct(data$start_time, format = "%Y-%m-%d %H:%M:%S")
data$end_time <- as.POSIXct(data$end_time, format = "%Y-%m-%d %H:%M:%S")

# Recompute the trip duration in seconds from the parsed timestamps,
# replacing the character column read from the CSV. The unit is requested
# explicitly: subtracting two POSIXct vectors lets R choose the unit (secs,
# mins, hours or days) from the data, so a fixed `* 60` conversion is only
# correct when R happens to choose minutes.
data$tripduration <- as.numeric(
  difftime(data$end_time, data$start_time, units = "secs")
)

# Check the structure to confirm the conversion
str(data)
## 'data.frame':    347046 obs. of  12 variables:
##  $ trip_id          : int  21742443 21742444 21742445 21742446 21742447 21742448 21742449 21742450 21742451 21742452 ...
##  $ start_time       : POSIXct, format: "2019-01-01 00:04:37" "2019-01-01 00:08:13" ...
##  $ end_time         : POSIXct, format: "2019-01-01 00:11:07" "2019-01-01 00:15:34" ...
##  $ bikeid           : int  2167 4386 1524 252 1170 2437 2708 2796 6205 3939 ...
##  $ tripduration     : num  390 441 829 1783 364 ...
##  $ from_station_id  : int  199 44 15 123 173 98 98 211 150 268 ...
##  $ from_station_name: chr  "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ to_station_id    : int  84 624 644 176 35 49 49 142 148 141 ...
##  $ to_station_name  : chr  "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ usertype         : chr  "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender           : chr  "Male" "Female" "Female" "Male" ...
##  $ birthyear        : int  1989 1990 1994 1993 1994 1983 1984 1990 1995 1996 ...
# Longest trip over all riders (tripduration is in seconds)
max_duration <- with(data, max(tripduration, na.rm = TRUE))
max_duration
## [1] 6096428
# Longest trip among casual riders (usertype "Customer")
max_casual_duration <- with(
  data,
  max(tripduration[usertype == "Customer"], na.rm = TRUE)
)
max_casual_duration
## [1] 477724
# Mean trip duration for casual riders
ave_duration_casual <- with(
  data,
  mean(tripduration[usertype == "Customer"], na.rm = TRUE)
)
ave_duration_casual
## [1] 2212.131
# Mean trip duration for subscribers
ave_duration_subscribers <- with(
  data,
  mean(tripduration[usertype == "Subscriber"], na.rm = TRUE)
)
ave_duration_subscribers
## [1] 833.2299
# Add the day-of-week name of each trip's start time (weekdays() returns
# names in the session locale), then cross-tabulate rides by user type and
# day.
data[["weekday"]] <- weekdays(data[["start_time"]])
table(data[["usertype"]], data[["weekday"]])
##             
##              Friday Monday Saturday Sunday Thursday Tuesday Wednesday
##   Customer      888    577     1355    985      795     794       643
##   Subscriber  59543  48372    29215  24145    63831   58125     57778
# Turn weekday into a factor ordered Monday to Sunday so tables and plots
# follow calendar order instead of alphabetical order. Values that are not
# one of these English day names become NA.
data$weekday <- factor(
  data$weekday,
  levels = c(
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday"
  )
)

# Rides per user type (rows) and weekday (columns)
daily_counts <- table(data$usertype, data$weekday)

# Grouped bar plot: one pair of bars per weekday. The table rows are sorted
# alphabetically ("Customer", "Subscriber"), which is the order the colours
# and legend labels below rely on.
barplot(
  daily_counts,
  beside = TRUE,  # Place bars side by side
  col = c("lightblue", "lightgreen"),  # Colors for each user type
  main = "Daily Rides: Subscribers vs Casual Riders",
  xlab = "Day of the Week",  # X-axis label
  ylab = "Number of Rides",  # Y-axis label
  legend.text = c("Casual Riders", "Subscribers"),  # Rider type per colour
  args.legend = list(x = "topright", bty = "n"),
  names.arg = colnames(daily_counts),  # Day labels taken from the table
  cex.names = 1  # Ensure labels are appropriately sized
)

# Strip surrounding whitespace from gender so that blank entries become ""
data[["gender"]] <- trimws(data[["gender"]])
# Discard the rows whose gender is empty after trimming
data <- data[!(data[["gender"]] == ""), ]
# Number of rides per user type
table(data[["usertype"]])
## 
##   Customer Subscriber 
##       5934     339423
# Number of rides per gender
table(data[["gender"]])
## 
## Female   Male 
##  66918 278439
# Rides taken by female subscribers
nFemale_Subscribers <- as.numeric(
  sum(with(data, usertype == "Subscriber" & gender == "Female"), na.rm = TRUE)
)
nFemale_Subscribers
## [1] 65043
# Rides taken by male subscribers
nMale_Subscribers <- as.numeric(
  sum(with(data, usertype == "Subscriber" & gender == "Male"), na.rm = TRUE)
)
nMale_Subscribers
## [1] 274380
# The three start stations with the most departures
station_counts <- table(data[["from_station_name"]])
top_3_stations <- station_counts %>%
  sort(decreasing = TRUE) %>%
  head(n = 3)
top_3_stations
## 
## Clinton St & Washington Blvd      Clinton St & Madison St 
##                         7596                         6329 
##          Canal St & Adams St 
##                         6254
# Restrict to subscriber rides
subscriber_data <- data[data[["usertype"]] == "Subscriber", ]
# Tally departures per start station
subscriber_station_counts <- table(subscriber_data[["from_station_name"]])
# Name of the station with the largest tally (the first one on ties)
most_used_by_subscribers <- subscriber_station_counts %>%
  which.max() %>%
  names()
# Show that station together with its ride count
subscriber_station_counts[most_used_by_subscribers]
## Clinton St & Washington Blvd 
##                         7563
# Restrict to casual rides (usertype "Customer")
casual_data <- data[data[["usertype"]] == "Customer", ]
# Tally departures per start station
casual_station_counts <- table(casual_data[["from_station_name"]])
# Name of the station with the largest tally (the first one on ties)
most_used_by_casuals <- casual_station_counts %>%
  which.max() %>%
  names()
# Show that station together with its ride count
casual_station_counts[most_used_by_casuals]
## Lake Shore Dr & Monroe St 
##                       280
# Histogram of subscribers' birth years

# Keep subscriber rides only
subscribers_data <- data[data$usertype == "Subscriber", ]

# Observed span of subscriber birth years
birthyear_range <- range(subscribers_data$birthyear, na.rm = TRUE)

# Width of each histogram bin, in years
bin_width <- 10

# Bin edges on multiples of bin_width that enclose the observed span
breaks_seq <- seq(floor(birthyear_range[1] / bin_width) * bin_width,
                  ceiling(birthyear_range[2] / bin_width) * bin_width,
                  by = bin_width)

# Pad with one extra bin on each side
breaks_seq <- c(breaks_seq[1] - bin_width,
                breaks_seq,
                breaks_seq[length(breaks_seq)] + bin_width)

# Plot the subscriber subset. The breaks, title and axis label all refer to
# subscribers, so plotting every row of `data` would mix casual riders into
# the chart.
hist_data <- hist(subscribers_data$birthyear, breaks = breaks_seq,
     main = "Histogram of Subscribers' Birth Year",
     xlab = "Birth Year",
     ylab = "Subscribers",
     col = "lightblue",
     border = "black")

# Write each bin's count above its bar; text() is vectorised, so a single
# call labels every bar.
text(hist_data$mids, hist_data$counts,
     labels = as.character(hist_data$counts),
     pos = 3, cex = 0.8, col = "black")

# Histogram of casual customers' birth years

# Keep casual rides only (usertype "Customer")
casuals_data <- data[data$usertype == "Customer", ]

# Observed span of birth years among casual customers
birthyear_range_casuals <- range(casuals_data$birthyear, na.rm = TRUE)

# Bin edges on multiples of bin_width (set earlier in the script) that
# enclose the observed span
breaks_seq_casuals <- seq(floor(birthyear_range_casuals[1] / bin_width) * bin_width,
                          ceiling(birthyear_range_casuals[2] / bin_width) * bin_width,
                          by = bin_width)

# Pad with one extra bin on each side so the breaks cover the whole range
breaks_seq_casuals <- c(breaks_seq_casuals[1] - bin_width,
                        breaks_seq_casuals,
                        breaks_seq_casuals[length(breaks_seq_casuals)] + bin_width)

# Draw the histogram and keep the returned object for the bar labels
hist_data_casuals <- hist(casuals_data$birthyear,
                          breaks = breaks_seq_casuals,
                          main = "Histogram of Birth Year for Casual Customers",
                          xlab = "Birth Year",
                          ylab = "Casual Customers",
                          col = "lightgreen",
                          border = "black")

# Write each bin's count above its bar; text() is vectorised, so a single
# call replaces the 1:length() index loop.
text(hist_data_casuals$mids, hist_data_casuals$counts,
     labels = as.character(hist_data_casuals$counts),
     pos = 3, cex = 0.8, col = "black")

# Rides per user type (rows) and gender (columns)
gender_counts <- table(data$usertype, data$gender)

# Grouped bar plot: one pair of bars per gender. The table rows are sorted
# alphabetically ("Customer", "Subscriber"), which is the order the colours
# and legend labels below rely on.
barplot(gender_counts,
        beside = TRUE,  # Place bars beside each other
        col = c("lightblue", "lightgreen"),  # Colors for each bar
        main = "Gender Comparison: Subscribers vs Customers",
        xlab = "Gender",
        ylab = "Number of Users",
        legend.text = c("Customers", "Subscribers"),
        # legend() ignores `y` when `x` is a position keyword, so the former
        # x = "center", y = "top" pair drew the legend in the middle of the
        # plot. The single keyword "top" places it at the top centre.
        args.legend = list(x = "top", bty = "n"))