# Case Study 1

#setting the working directory
setwd("~/Desktop/GA Capstone Project/Project 2_ Aug2024/RAW")

#Load Packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(lubridate)
library(tibble)

#Import CSV
Jul_2024 <- read.csv("202407-divvy-tripdata.csv")
Jun_2024 <- read.csv("202406-divvy-tripdata.csv")
May_2024 <- read.csv("202405-divvy-tripdata.csv")
Apr_2024 <- read.csv("202404-divvy-tripdata.csv")
Mar_2024 <- read.csv("202403-divvy-tripdata.csv")
Feb_2024 <- read.csv("202402-divvy-tripdata.csv")
Jan_2024 <- read.csv("202401-divvy-tripdata.csv")
Dec_2023 <- read.csv("202312-divvy-tripdata.csv")
Nov_2023 <- read.csv("202311-divvy-tripdata.csv")
Oct_2023 <- read.csv("202310-divvy-tripdata.csv")
Sep_2023 <- read.csv("202309-divvy-tripdata.csv")
Aug_2023 <- read.csv("202308-divvy-tripdata.csv")

# Compare column names each of the files
colnames(Jul_2024)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(Jun_2024)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(May_2024)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(Apr_2024)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(Mar_2024)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(Feb_2024)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(Jan_2024)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(Dec_2023)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(Nov_2023)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(Oct_2023)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(Sep_2023)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(Aug_2023)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
str(Jul_2024)
## 'data.frame':    748962 obs. of  13 variables:
##  $ ride_id           : chr  "2658E319B13141F9" "B2176315168A47CE" "C2A9D33DF7EBB422" "8BFEA406DF01D8AD" ...
##  $ rideable_type     : chr  "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : chr  "2024-07-11 08:15:14.784" "2024-07-11 15:45:07.851" "2024-07-11 08:24:48.192" "2024-07-11 08:46:06.864" ...
##  $ ended_at          : chr  "2024-07-11 08:17:56.335" "2024-07-11 16:06:04.243" "2024-07-11 08:28:05.237" "2024-07-11 09:14:11.664" ...
##  $ start_station_name: chr  "" "" "" "" ...
##  $ start_station_id  : chr  "" "" "" "" ...
##  $ end_station_name  : chr  "" "" "" "" ...
##  $ end_station_id    : chr  "" "" "" "" ...
##  $ start_lat         : num  41.8 41.8 41.8 41.9 42 ...
##  $ start_lng         : num  -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num  41.8 41.8 41.8 41.9 41.9 ...
##  $ end_lng           : num  -87.6 -87.6 -87.6 -87.7 -87.6 ...
##  $ member_casual     : chr  "casual" "casual" "casual" "casual" ...
# Merge
all_trips <- bind_rows(Jul_2024, Jun_2024, May_2024, Apr_2024, Mar_2024, Feb_2024, Jan_2024, 
                       Dec_2023, Nov_2023, Oct_2023, Sep_2023, Aug_2023)

# Inspect the new table that has been created
colnames(all_trips)  #List of column names
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
nrow(all_trips)  #How many rows are in data frame?
## [1] 5715693
dim(all_trips)  #Dimensions of the data frame?
## [1] 5715693      13
head(all_trips)  #See the first 6 rows of data frame.  Also tail(all_trips)
##            ride_id rideable_type              started_at
## 1 2658E319B13141F9 electric_bike 2024-07-11 08:15:14.784
## 2 B2176315168A47CE electric_bike 2024-07-11 15:45:07.851
## 3 C2A9D33DF7EBB422 electric_bike 2024-07-11 08:24:48.192
## 4 8BFEA406DF01D8AD electric_bike 2024-07-11 08:46:06.864
## 5 ECD3EF02E5EB73B6 electric_bike 2024-07-11 18:18:16.588
## 6 A3C62391BBBAC107 electric_bike 2024-07-11 16:03:59.708
##                  ended_at start_station_name start_station_id end_station_name
## 1 2024-07-11 08:17:56.335                                                     
## 2 2024-07-11 16:06:04.243                                                     
## 3 2024-07-11 08:28:05.237                                                     
## 4 2024-07-11 09:14:11.664                                                     
## 5 2024-07-11 18:30:20.288                                                     
## 6 2024-07-11 16:32:38.635                                                     
##   end_station_id start_lat start_lng end_lat end_lng member_casual
## 1                    41.80    -87.59   41.79  -87.59        casual
## 2                    41.79    -87.60   41.80  -87.59        casual
## 3                    41.79    -87.59   41.79  -87.60        casual
## 4                    41.88    -87.64   41.90  -87.67        casual
## 5                    41.95    -87.64   41.91  -87.62        casual
## 6                    41.70    -87.61   41.70  -87.61        casual
str(all_trips)  #See list of columns and data types (numeric, character, etc)
## 'data.frame':    5715693 obs. of  13 variables:
##  $ ride_id           : chr  "2658E319B13141F9" "B2176315168A47CE" "C2A9D33DF7EBB422" "8BFEA406DF01D8AD" ...
##  $ rideable_type     : chr  "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : chr  "2024-07-11 08:15:14.784" "2024-07-11 15:45:07.851" "2024-07-11 08:24:48.192" "2024-07-11 08:46:06.864" ...
##  $ ended_at          : chr  "2024-07-11 08:17:56.335" "2024-07-11 16:06:04.243" "2024-07-11 08:28:05.237" "2024-07-11 09:14:11.664" ...
##  $ start_station_name: chr  "" "" "" "" ...
##  $ start_station_id  : chr  "" "" "" "" ...
##  $ end_station_name  : chr  "" "" "" "" ...
##  $ end_station_id    : chr  "" "" "" "" ...
##  $ start_lat         : num  41.8 41.8 41.8 41.9 42 ...
##  $ start_lng         : num  -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num  41.8 41.8 41.8 41.9 41.9 ...
##  $ end_lng           : num  -87.6 -87.6 -87.6 -87.7 -87.6 ...
##  $ member_casual     : chr  "casual" "casual" "casual" "casual" ...
summary(all_trips)  #Statistical summary of data. Mainly for numerics
##    ride_id          rideable_type       started_at          ended_at        
##  Length:5715693     Length:5715693     Length:5715693     Length:5715693    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  start_station_name start_station_id   end_station_name   end_station_id    
##  Length:5715693     Length:5715693     Length:5715693     Length:5715693    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    start_lat       start_lng         end_lat         end_lng       
##  Min.   :41.63   Min.   :-87.94   Min.   : 0.00   Min.   :-144.05  
##  1st Qu.:41.88   1st Qu.:-87.66   1st Qu.:41.88   1st Qu.: -87.66  
##  Median :41.90   Median :-87.64   Median :41.90   Median : -87.64  
##  Mean   :41.90   Mean   :-87.65   Mean   :41.90   Mean   : -87.65  
##  3rd Qu.:41.93   3rd Qu.:-87.63   3rd Qu.:41.93   3rd Qu.: -87.63  
##  Max.   :42.07   Max.   :-87.46   Max.   :87.96   Max.   :   0.00  
##                                   NA's   :7756    NA's   :7756     
##  member_casual     
##  Length:5715693    
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
all_trips$date <- as.Date(all_trips$started_at) #The default format is yyyy-mm-dd
all_trips$month <- format(as.Date(all_trips$date), "%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")

# Add a "ride_length" calculation to all_trips (in seconds)
all_trips$ride_length <- difftime(all_trips$ended_at,all_trips$started_at)

str(all_trips)
## 'data.frame':    5715693 obs. of  19 variables:
##  $ ride_id           : chr  "2658E319B13141F9" "B2176315168A47CE" "C2A9D33DF7EBB422" "8BFEA406DF01D8AD" ...
##  $ rideable_type     : chr  "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : chr  "2024-07-11 08:15:14.784" "2024-07-11 15:45:07.851" "2024-07-11 08:24:48.192" "2024-07-11 08:46:06.864" ...
##  $ ended_at          : chr  "2024-07-11 08:17:56.335" "2024-07-11 16:06:04.243" "2024-07-11 08:28:05.237" "2024-07-11 09:14:11.664" ...
##  $ start_station_name: chr  "" "" "" "" ...
##  $ start_station_id  : chr  "" "" "" "" ...
##  $ end_station_name  : chr  "" "" "" "" ...
##  $ end_station_id    : chr  "" "" "" "" ...
##  $ start_lat         : num  41.8 41.8 41.8 41.9 42 ...
##  $ start_lng         : num  -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num  41.8 41.8 41.8 41.9 41.9 ...
##  $ end_lng           : num  -87.6 -87.6 -87.6 -87.7 -87.6 ...
##  $ member_casual     : chr  "casual" "casual" "casual" "casual" ...
##  $ date              : Date, format: "2024-07-11" "2024-07-11" ...
##  $ month             : chr  "07" "07" "07" "07" ...
##  $ day               : chr  "11" "11" "11" "11" ...
##  $ year              : chr  "2024" "2024" "2024" "2024" ...
##  $ day_of_week       : chr  "Thursday" "Thursday" "Thursday" "Thursday" ...
##  $ ride_length       : 'difftime' num  161.551000118256 1256.39199995995 197.045000076294 1684.79999995232 ...
##   ..- attr(*, "units")= chr "secs"
#quality check with results in excel
check <- mean(all_trips$ride_length[all_trips$month == "07"])
print(check/60)
## Time difference of 19.54278 secs
# New version without un usable data (ride_length<10sec, faulty bike put back, inspection, problem with application, etc)
all_trips_v2 <- all_trips[!(all_trips$ride_length<10),]

summary(all_trips_v2$ride_length)
##   Length    Class     Mode 
##  5686857 difftime  numeric
# Compare members and casual users
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = mean)
##   all_trips_v2$member_casual all_trips_v2$ride_length
## 1                     casual           1637.7863 secs
## 2                     member            783.8447 secs
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = median)
##   all_trips_v2$member_casual all_trips_v2$ride_length
## 1                     casual              735.66 secs
## 2                     member              526.00 secs
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = max)
##   all_trips_v2$member_casual all_trips_v2$ride_length
## 1                     casual             5909344 secs
## 2                     member               93588 secs
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = min)
##   all_trips_v2$member_casual all_trips_v2$ride_length
## 1                     casual                  10 secs
## 2                     member                  10 secs
# The average ride time by each day for members vs casual users
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
##    all_trips_v2$member_casual all_trips_v2$day_of_week all_trips_v2$ride_length
## 1                      casual                   Friday           1627.6873 secs
## 2                      member                   Friday            770.2110 secs
## 3                      casual                   Monday           1583.2426 secs
## 4                      member                   Monday            746.1417 secs
## 5                      casual                 Saturday           1830.5872 secs
## 6                      member                 Saturday            864.6809 secs
## 7                      casual                   Sunday           1924.1781 secs
## 8                      member                   Sunday            875.2154 secs
## 9                      casual                 Thursday           1453.2443 secs
## 10                     member                 Thursday            751.3384 secs
## 11                     casual                  Tuesday           1413.1502 secs
## 12                     member                  Tuesday            752.9224 secs
## 13                     casual                Wednesday           1403.4551 secs
## 14                     member                Wednesday            763.4795 secs
# The days of the week are out of order. Let's fix that.
all_trips_v2$day_of_week <- ordered(all_trips_v2$day_of_week, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))

# Run the average ride time by each day for members vs casual users
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
##    all_trips_v2$member_casual all_trips_v2$day_of_week all_trips_v2$ride_length
## 1                      casual                   Sunday           1924.1781 secs
## 2                      member                   Sunday            875.2154 secs
## 3                      casual                   Monday           1583.2426 secs
## 4                      member                   Monday            746.1417 secs
## 5                      casual                  Tuesday           1413.1502 secs
## 6                      member                  Tuesday            752.9224 secs
## 7                      casual                Wednesday           1403.4551 secs
## 8                      member                Wednesday            763.4795 secs
## 9                      casual                 Thursday           1453.2443 secs
## 10                     member                 Thursday            751.3384 secs
## 11                     casual                   Friday           1627.6873 secs
## 12                     member                   Friday            770.2110 secs
## 13                     casual                 Saturday           1830.5872 secs
## 14                     member                 Saturday            864.6809 secs
# analyze ridership data by type and weekday
all_trips_v2 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>%  #creates weekday field using wday()
  group_by(member_casual, weekday) %>%  #groups by usertype and weekday
  summarise(number_of_rides = n()                           #calculates the number of rides and average duration 
            ,average_duration = mean(ride_length)) %>%      # calculates the average duration
  arrange(member_casual, weekday)                               # sorts
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 4
## # Groups:   member_casual [2]
##    member_casual weekday number_of_rides average_duration
##    <chr>         <ord>             <int> <drtn>          
##  1 casual        Sun              339167 1924.1781 secs  
##  2 casual        Mon              224878 1583.2426 secs  
##  3 casual        Tue              233743 1413.1502 secs  
##  4 casual        Wed              264908 1403.4551 secs  
##  5 casual        Thu              254357 1453.2443 secs  
##  6 casual        Fri              298764 1627.6873 secs  
##  7 casual        Sat              413230 1830.5872 secs  
##  8 member        Sun              406512  875.2154 secs  
##  9 member        Mon              501426  746.1417 secs  
## 10 member        Tue              580200  752.9224 secs  
## 11 member        Wed              612827  763.4795 secs  
## 12 member        Thu              573679  751.3384 secs  
## 13 member        Fri              514913  770.2110 secs  
## 14 member        Sat              468253  864.6809 secs
# Visualization of the number of rides by rider type
all_trips_v2 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>% 
  group_by(member_casual, weekday) %>% 
  summarise(number_of_rides = n()
            ,average_duration = mean(ride_length)) %>% 
  arrange(member_casual, weekday)  %>% 
  ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

# Visualization for average duration
all_trips_v2 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>% 
  group_by(member_casual, weekday) %>% 
  summarise(number_of_rides = n()
            ,average_duration = mean(ride_length)) %>% 
  arrange(member_casual, weekday)  %>% 
  ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## Don't know how to automatically pick scale for object of type <difftime>.
## Defaulting to continuous.

# Analyze ridership data by type of membership and month
all_trips_v2 %>% 
  group_by(member_casual, month) %>% 
  summarise(number_of_rides = n(),average_duration = mean(ride_length)) %>% 
  arrange(member_casual, month) %>% 
  ggplot(aes(x = month, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge") + labs(title="Total Number of Ride by Month", x = "Month", y = "Number of Rides") + theme(axis.text.x = element_text(angle = 60, hjust = 1))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

# Created column to organize by seasons
all_trips_v2 <- all_trips_v2 %>%
  mutate(seasons = 
           lapply(month, function(x) {
             switch(x,
                    "06" = "Summer",
                    "07" = "Summer",
                    "08" = "Summer",
                    "09" = "Fall",
                    "10" = "Fall",
                    "11" = "Fall",
                    "12" = "Winter",
                    "01" = "Winter",
                    "02" = "Winter",
                    "Spring")
           }))


#Evaluate what season casual members spend time riding, and how it is different from members
member_casual_by_season <- all_trips_v2 %>%
  group_by(member_casual, seasons = factor(seasons, levels = c("Summer", "Fall", "Winter", "Spring"))) %>%
  summarise(number_of_rides = n(), .groups = 'drop') %>%
  group_by(member_casual) %>%
  mutate(percentage = number_of_rides / sum(number_of_rides) * 100)

print(member_casual_by_season)
## # A tibble: 8 × 4
## # Groups:   member_casual [2]
##   member_casual seasons number_of_rides percentage
##   <chr>         <fct>             <int>      <dbl>
## 1 casual        Summer           928438      45.8 
## 2 casual        Fall             534813      26.4 
## 3 casual        Winter           122736       6.05
## 4 casual        Spring           443060      21.8 
## 5 member        Summer          1291645      35.3 
## 6 member        Fall            1023988      28.0 
## 7 member        Winter           466589      12.8 
## 8 member        Spring           875588      23.9