About the Project

This analysis was performed by Anthony at the beginning of June as part of his Coursera Google Data Analytics certification. The data was provided through Coursera, and the study aims to understand the characteristics of cyclists who purchase a membership.

About the Analysis Process

S1: Load necessary libraries and datasets

We load the data with the read_csv() function and merge the two datasets, which have similar but not identical columns. Before cleaning the data, we inspect the columns and their classes and consider which modifications and formatting are needed, since these choices affect the visualizations.

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(skimr)
library(lubridate)
library(readxl)
library(viridis)
## Loading required package: viridisLite
options(warn = -1)

# -- S2: Load, Clean, and Extrapolate the Data -- #
# Directory holding the two pre-cleaned CSV exports. It defaults to the
# original author's location; set the DIVVY_DATA_DIR environment variable to
# run the analysis from a different machine or folder.
data_dir <- Sys.getenv(
  "DIVVY_DATA_DIR",
  unset = "C:/Users/Anthony Morciglio/OneDrive/Coursera Google Data Analytics/Divvy_Trips_2019_2020"
)
f1 <- file.path(data_dir, "Divvy_Trips_2019_Q1_CL.csv")
f2 <- file.path(data_dir, "Divvy_Trips_2020_Q1_CL.csv")

# Read the data that was cleaned in Excel and exported as a CSV file.
Divvy_Trips_2019_Q1 <- read_csv(f1)
## Rows: 365069 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): start_time, end_time, trip_duration, weekday, month, start_station_...
## dbl (7): trip_id, tripduration_mins, year, bikeid, start_station_id, end_sta...
## num (1): tripduration
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the 2020 Q1 export; column types are guessed by readr.
Divvy_Trips_2020_Q1 <- readr::read_csv(file = f2)
## Rows: 426887 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): trip_id, start_time, end_time, weekday, month, start_station_name,...
## dbl  (9): tripduration, tripduration_mins, year, start_station_id, end_stati...
## time (1): trip_duration
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Combine the two quarters into one data frame.
# trip_id is coerced to character in both inputs first so the key column has
# the same type on each side.
Divvy_Trips_2019_Q1$trip_id <- as.character(Divvy_Trips_2019_Q1$trip_id)
Divvy_Trips_2020_Q1$trip_id <- as.character(Divvy_Trips_2020_Q1$trip_id)
# No `by` is given, so merge() joins on every column name the two frames
# share; `all = TRUE` keeps unmatched rows from both sides (full outer join).
df_merge <- merge(
  x = Divvy_Trips_2019_Q1,
  y = Divvy_Trips_2020_Q1,
  all = TRUE
)

# Glance at the data

# Glance at the data
# Show the last six rows of the merged data.
tail(df_merge, n = 6L)
##                 trip_id      start_time        end_time trip_duration
## 791951 FFFF2D5AE185EFFB  3/8/2020 20:51  3/8/2020 20:57           393
## 791952 FFFF33C12C91FAC9  3/12/2020 9:04  3/12/2020 9:04          <NA>
## 791953 FFFF35F930C6A3B1 3/29/2020 15:44 3/29/2020 15:48           199
## 791954 FFFF484842A1315A 1/27/2020 16:30 1/27/2020 16:35           309
## 791955 FFFF6663FA25657F   3/2/2020 8:50   3/2/2020 9:01           652
## 791956 FFFFED71E01CE28F 2/21/2020 16:15 2/21/2020 16:33          1095
##        tripduration tripduration_mins  weekday    month year
## 791951          393              6.55   Sunday    March 2020
## 791952           NA                NA Thursday    March 2020
## 791953          199              3.32   Sunday    March 2020
## 791954          309              5.15   Monday  January 2020
## 791955          652             10.87   Monday    March 2020
## 791956         1095             18.25   Friday February 2020
##               start_station_name start_station_id          end_station_name
## 791951      Halsted St & Polk St              108  Loomis St & Lexington St
## 791952                     HQ QR              675                     HQ QR
## 791953        Shore Dr & 55th St              247   Lake Park Ave & 56th St
## 791954     Canal St & Madison St              174 Milwaukee Ave & Grand Ave
## 791955 Clark St & Ida B Wells Dr               50         Adler Planetarium
## 791956        Daley Center Plaza               81 Sedgwick St & Webster Ave
##        end_station_id   usertype bikeid gender birthyear start_lat start_lng
## 791951            320 Subscriber     NA   <NA>        NA   41.8718  -87.6466
## 791952            675   Customer     NA   <NA>        NA   41.8899  -87.6803
## 791953            345 Subscriber     NA   <NA>        NA   41.7952  -87.5807
## 791954             84 Subscriber     NA   <NA>        NA   41.8821  -87.6398
## 791955            341 Subscriber     NA   <NA>        NA   41.8759  -87.6306
## 791956            143 Subscriber     NA   <NA>        NA   41.8842  -87.6296
##        end_lat  end_lng
## 791951 41.8722 -87.6615
## 791952 41.8899 -87.6803
## 791953 41.7932 -87.5878
## 791954 41.8916 -87.6484
## 791955 41.8661 -87.6073
## 791956 41.9222 -87.6389
# One line per column: name, type, and the first few values.
dplyr::glimpse(df_merge)
## Rows: 791,956
## Columns: 21
## $ trip_id            <chr> "000054ABAD1C067C", "0000D320A07EE21F", "0000D37202…
## $ start_time         <chr> "3/3/2020 18:08", "2/11/2020 6:44", "2/10/2020 16:1…
## $ end_time           <chr> "3/3/2020 18:19", "2/11/2020 6:56", "2/10/2020 16:2…
## $ trip_duration      <chr> "675", "738", "473", "2", "577", "648", "742", "388…
## $ tripduration       <dbl> 675, 738, 473, 2, 577, 648, 742, 388, 1418, 1333, 2…
## $ tripduration_mins  <dbl> 11.25, 12.30, 7.88, 0.03, 9.62, 10.80, 12.37, 6.47,…
## $ weekday            <chr> "Tuesday", "Tuesday", "Monday", "Thursday", "Wednes…
## $ month              <chr> "March", "February", "February", "February", "Febru…
## $ year               <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 202…
## $ start_station_name <chr> "State St & Randolph St", "Streeter Dr & Grand Ave"…
## $ start_station_id   <dbl> 44, 35, 125, 224, 127, 194, 111, 45, 195, 313, 423,…
## $ end_station_name   <chr> "Wells St & Polk St", "Clinton St & Lake St", "Clin…
## $ end_station_id     <dbl> 175, 66, 91, 224, 331, 111, 52, 175, 338, 67, 345, …
## $ usertype           <chr> "Subscriber", "Subscriber", "Subscriber", "Subscrib…
## $ bikeid             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ gender             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ birthyear          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ start_lat          <dbl> 41.8847, 41.8923, 41.8902, 41.9139, 41.9259, 41.886…
## $ start_lng          <dbl> -87.6277, -87.6120, -87.6262, -87.6488, -87.6493, -…
## $ end_lat            <dbl> 41.8726, 41.8856, 41.8834, 41.9139, 41.9097, 41.894…
## $ end_lng            <dbl> -87.6335, -87.6418, -87.6412, -87.6488, -87.6481, -…
# Compact structure listing of the merged data frame.
str(object = df_merge)
## 'data.frame':    791956 obs. of  21 variables:
##  $ trip_id           : chr  "000054ABAD1C067C" "0000D320A07EE21F" "0000D372025B3040" "00011A7CBF765993" ...
##  $ start_time        : chr  "3/3/2020 18:08" "2/11/2020 6:44" "2/10/2020 16:19" "2/6/2020 8:10" ...
##  $ end_time          : chr  "3/3/2020 18:19" "2/11/2020 6:56" "2/10/2020 16:27" "2/6/2020 8:10" ...
##  $ trip_duration     : chr  "675" "738" "473" "2" ...
##  $ tripduration      : num  675 738 473 2 577 ...
##  $ tripduration_mins : num  11.25 12.3 7.88 0.03 9.62 ...
##  $ weekday           : chr  "Tuesday" "Tuesday" "Monday" "Thursday" ...
##  $ month             : chr  "March" "February" "February" "February" ...
##  $ year              : num  2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
##  $ start_station_name: chr  "State St & Randolph St" "Streeter Dr & Grand Ave" "Rush St & Hubbard St" "Halsted St & Willow St" ...
##  $ start_station_id  : num  44 35 125 224 127 194 111 45 195 313 ...
##  $ end_station_name  : chr  "Wells St & Polk St" "Clinton St & Lake St" "Clinton St & Washington Blvd" "Halsted St & Willow St" ...
##  $ end_station_id    : num  175 66 91 224 331 111 52 175 338 67 ...
##  $ usertype          : chr  "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ bikeid            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ gender            : chr  NA NA NA NA ...
##  $ birthyear         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ start_lat         : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num  -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num  -87.6 -87.6 -87.6 -87.6 -87.6 ...
# Per-column summary: quantiles and NA counts for numeric columns,
# length/class/mode for character columns.
summary(object = df_merge)
##    trip_id           start_time          end_time         trip_duration     
##  Length:791956      Length:791956      Length:791956      Length:791956     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   tripduration      tripduration_mins     weekday             month          
##  Min.   :       0   Min.   :     0.00   Length:791956      Length:791956     
##  1st Qu.:     328   1st Qu.:     5.48   Class :character   Class :character  
##  Median :     537   Median :     8.97   Mode  :character   Mode  :character  
##  Mean   :    1184   Mean   :    19.74                                        
##  3rd Qu.:     910   3rd Qu.:    15.18                                        
##  Max.   :10628400   Max.   :177140.00                                        
##  NA's   :117        NA's   :117                                              
##       year      start_station_name start_station_id end_station_name  
##  Min.   :2019   Length:791956      Min.   :  2.0    Length:791956     
##  1st Qu.:2019   Class :character   1st Qu.: 77.0    Class :character  
##  Median :2020   Mode  :character   Median :174.0    Mode  :character  
##  Mean   :2020                      Mean   :204.4                      
##  3rd Qu.:2020                      3rd Qu.:291.0                      
##  Max.   :2020                      Max.   :675.0                      
##                                                                       
##  end_station_id    usertype             bikeid          gender         
##  Min.   :  2.0   Length:791956      Min.   :   1     Length:791956     
##  1st Qu.: 77.0   Class :character   1st Qu.:1777     Class :character  
##  Median :174.0   Mode  :character   Median :3489     Mode  :character  
##  Mean   :204.4                      Mean   :3430                       
##  3rd Qu.:291.0                      3rd Qu.:5157                       
##  Max.   :675.0                      Max.   :6471                       
##  NA's   :1                          NA's   :426887                     
##    birthyear        start_lat        start_lng         end_lat      
##  Min.   :1900     Min.   :41.7     Min.   :-87.8    Min.   :41.7    
##  1st Qu.:1975     1st Qu.:41.9     1st Qu.:-87.7    1st Qu.:41.9    
##  Median :1985     Median :41.9     Median :-87.6    Median :41.9    
##  Mean   :1982     Mean   :41.9     Mean   :-87.6    Mean   :41.9    
##  3rd Qu.:1990     3rd Qu.:41.9     3rd Qu.:-87.6    3rd Qu.:41.9    
##  Max.   :2003     Max.   :42.1     Max.   :-87.5    Max.   :42.1    
##  NA's   :444910   NA's   :365069   NA's   :365069   NA's   :365070  
##     end_lng      
##  Min.   :-87.8   
##  1st Qu.:-87.7   
##  Median :-87.6   
##  Mean   :-87.6   
##  3rd Qu.:-87.6   
##  Max.   :-87.5   
##  NA's   :365070
# Per-column skim summary of df_merge (missing counts, completeness, and
# distribution statistics), printed without the inline histogram column.
skim_without_charts(df_merge)
Data summary
Name df_merge
Number of rows 791956
Number of columns 21
_______________________
Column type frequency:
character 10
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
trip_id 0 1.00 7 16 0 791956 0
start_time 0 1.00 13 15 0 172752 0
end_time 0 1.00 13 15 0 172920 0
trip_duration 213 1.00 1 10 0 14915 0
weekday 0 1.00 6 9 0 7 0
month 0 1.00 5 8 0 3 0
start_station_name 0 1.00 5 43 0 636 0
end_station_name 1 1.00 5 43 0 636 0
usertype 0 1.00 8 10 0 2 0
gender 446598 0.44 4 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
tripduration 117 1.00 1183.92 33211.18 0.00 328.00 537.00 910.00 10628400.00
tripduration_mins 117 1.00 19.74 553.52 0.00 5.48 8.97 15.18 177140.00
year 0 1.00 2019.54 0.50 2019.00 2019.00 2020.00 2020.00 2020.00
start_station_id 0 1.00 204.40 158.92 2.00 77.00 174.00 291.00 675.00
end_station_id 1 1.00 204.38 159.32 2.00 77.00 174.00 291.00 675.00
bikeid 426887 0.46 3429.48 1923.32 1.00 1777.00 3489.00 5157.00 6471.00
birthyear 444910 0.44 1981.67 11.25 1900.00 1975.00 1985.00 1990.00 2003.00
start_lat 365069 0.54 41.90 0.04 41.74 41.88 41.89 41.92 42.06
start_lng 365069 0.54 -87.64 0.02 -87.77 -87.66 -87.64 -87.63 -87.55
end_lat 365070 0.54 41.90 0.04 41.74 41.88 41.89 41.92 42.06
end_lng 365070 0.54 -87.64 0.02 -87.77 -87.66 -87.64 -87.63 -87.55

S2: Clean and mutate existing datasets

After exploring the original dataset, we notice that the trip duration is recorded in seconds. To express it in minutes, we need to mutate the dataframe. Additionally, there are null values for gender, which can be replaced with Other. Lastly, we are given the birth year of the cyclists and the start time of each trip, which are used to determine the age of the cyclist at the time of travel.

S3: Group the Data based on Categorical Factors and Perform Hypothesis Testing

After observing the columns/variables of the dataset with skim_without_charts(), we notice there are a few categorical variables, such as gender and usertype. The dataframe df_merge can be grouped by these factors to check for differences in measures of central tendency and spread, such as the mean and variance.
We evaluate the summary statistics and notice there is not much of a difference in average trip duration between men and women, but there is a difference between men/women and other. Likewise, there is a difference in average trip duration for subscriber and customer usertypes. We will group df_merge by the factors and perform hypothesis testing to validate the observations.

# -- Grouping Data -- #
# warn = -1 silences all warnings from this point on.
options(warn = -1)
# Group data based on Gender and UserType
# Console banner marking the start of the per-gender summary output.
print("--- Summarizing key statistics based on Gender ---")
## [1] "--- Summarizing key statistics based on Gender ---"
# Summary statistics of trip duration (minutes) for each gender.
# Only rows with a missing trip duration are discarded. A bare drop_na()
# removes every row that has an NA in *any* column, and each quarter lacks
# columns the other one has (e.g. bikeid vs. start_lat), so it left no rows
# to summarise.
df_gender <- df_merge %>%
  drop_na(tripduration_mins) %>%
  group_by(gender) %>%
  summarize(
    avg_tripduration_mins = mean(tripduration_mins),
    med_tripduration_mins = median(tripduration_mins),
    sd_tripduration_mins = sd(tripduration_mins),
    min_tripduration_mins = min(tripduration_mins),
    max_tripduration_mins = max(tripduration_mins)
  )
print(tail(df_gender))
## # A tibble: 0 × 6
## # ℹ 6 variables: gender <chr>, avg_tripduration_mins <dbl>,
## #   med_tripduration_mins <dbl>, sd_tripduration_mins <dbl>,
## #   min_tripduration_mins <dbl>, max_tripduration_mins <dbl>
# Console banner marking the start of the per-usertype summary output.
print("--- Summarizing key statistics based on Usertype ---")
## [1] "--- Summarizing key statistics based on Usertype ---"
# Summary statistics of trip duration (minutes) for each user type.
# Only rows with a missing trip duration are discarded. A bare drop_na()
# removes every row that has an NA in *any* column, and each quarter lacks
# columns the other one has (e.g. bikeid vs. start_lat), so it left no rows
# to summarise.
df_users <- df_merge %>%
  drop_na(tripduration_mins) %>%
  group_by(usertype) %>%
  summarize(
    avg_tripduration_mins = mean(tripduration_mins),
    med_tripduration_mins = median(tripduration_mins),
    sd_tripduration_mins = sd(tripduration_mins),
    min_tripduration_mins = min(tripduration_mins),
    max_tripduration_mins = max(tripduration_mins)
  )
print(tail(df_users))
## # A tibble: 0 × 6
## # ℹ 6 variables: usertype <chr>, avg_tripduration_mins <dbl>,
## #   med_tripduration_mins <dbl>, sd_tripduration_mins <dbl>,
## #   min_tripduration_mins <dbl>, max_tripduration_mins <dbl>
# Filter and evaluate key metrics: tripduration
# Per-gender subsets. Columns are referenced by bare name; dplyr verbs
# evaluate them inside the data frame.
df_gender_M <- filter(df_merge, gender == "Male")
df_gender_F <- filter(df_merge, gender == "Female")
df_gender_O <- filter(df_merge, gender == "Other")

# Per-usertype subsets.
df_users_S <- filter(df_merge, usertype == "Subscriber")
df_users_C <- filter(df_merge, usertype == "Customer")


# There appears to be a difference in avg trip duration between Other and Male/Female
# We will validate the empirical observation using Hypothesis Testing
# -- Hypothesis Testing for Gender -- #
# F test for the ratio of the two population variances.
Fstat_mf <- var.test(
  df_gender_M$tripduration_mins,
  df_gender_F$tripduration_mins,
  alternative = "two.sided"
)
# Kept at its original value because later statements still read it.
var_diff_bool <- FALSE
tmp_df <- df_merge %>% filter(gender %in% c("Male", "Female"))
# Two-sample t test for the difference in population means. The pooled
# (equal-variance) form is used only when the F test above did not reject
# equal variances at the 5% level; otherwise Welch's unequal-variance test is
# run, instead of always assuming equal variances.
Tstat_mf <- t.test(
  tripduration_mins ~ gender,
  data = tmp_df,
  var.equal = Fstat_mf$p.value > 0.05
)

if (Tstat_mf$p.value <= 0.05) {
  print("There is a Statistical difference between the average tripduration of Male and Female")
} else {
  print("There is no Statistical difference between the average tripduration of Male and Female")
}
## [1] "There is no Statistical difference between the average tripduration of Male and Female"
# Report whether the Male/Female variance F test is significant at the 5% level.
print(
  if (Fstat_mf$p.value <= 0.05) {
    "There is a Statistical difference between the variance of tripduration of Male and Female"
  } else {
    "There is no Statistical difference between the variance of tripduration of Male and Female"
  }
)
## [1] "There is a Statistical difference between the variance of tripduration of Male and Female"
# -- Apply same method for M/F and Other in Gender -- #
# F test for the ratio of the Male and Other population variances.
Fstat_mo <- var.test(
  df_gender_M$tripduration_mins,
  df_gender_O$tripduration_mins,
  alternative = "two.sided"
)
tmp_df <- df_merge %>% filter(gender %in% c("Male", "Other"))
# Pooled t test only when the F test did not reject equal variances at the 5%
# level; otherwise Welch's unequal-variance t test.
Tstat_mo <- t.test(
  tripduration_mins ~ gender,
  data = tmp_df,
  var.equal = Fstat_mo$p.value > 0.05
)

if (Tstat_mo$p.value <= 0.05) {
  print("There is a Statistical difference between the average tripduration of Male and Other")
} else {
  print("There is no Statistical difference between the average tripduration of Male and Other")
}
## [1] "There is a Statistical difference between the average tripduration of Male and Other"
# Report whether the Male/Other variance F test is significant at the 5% level.
print(
  if (Fstat_mo$p.value <= 0.05) {
    "There is a Statistical difference between the variance of tripduration of Male and Other"
  } else {
    "There is no Statistical difference between the variance of tripduration of Male and Other"
  }
)
## [1] "There is a Statistical difference between the variance of tripduration of Male and Other"
# -- Hypothesis testing for UserType -- #
# F test for the ratio of the Subscriber and Customer population variances.
Fstat_sc <- var.test(
  df_users_S$tripduration_mins,
  df_users_C$tripduration_mins,
  alternative = "two.sided"
)
tmp_df <- df_merge %>% filter(usertype %in% c("Subscriber", "Customer"))
# Pooled t test only when the F test did not reject equal variances at the 5%
# level; otherwise Welch's unequal-variance t test.
Tstat_sc <- t.test(
  tripduration_mins ~ usertype,
  data = tmp_df,
  var.equal = Fstat_sc$p.value > 0.05
)

if (Tstat_sc$p.value <= 0.05) {
  print("There is a Statistical difference between the average tripduration of Subscriber and Customer")
} else {
  print("There is no Statistical difference between the average tripduration of Subscriber and Customer")
}
## [1] "There is a Statistical difference between the average tripduration of Subscriber and Customer"
# Report whether the Subscriber/Customer variance F test is significant at the
# 5% level.
print(
  if (Fstat_sc$p.value <= 0.05) {
    "There is a Statistical difference between the variance of tripduration of Subscriber and Customer"
  } else {
    "There is no Statistical difference between the variance of tripduration of Subscriber and Customer"
  }
)
## [1] "There is a Statistical difference between the variance of tripduration of Subscriber and Customer"

S4: Plot and Visualize the Results obtained from Hypothesis Testing

We first exclude outliers that significantly skew the results and make it difficult to extract insights from the visualizations. Using the interquartile range (IQR) together with the 1st and 3rd quartiles (Q1 and Q3), we restrict the trip duration to the range [Q1 - 5*IQR, Q3 + 5*IQR] and visualize the remaining dataframe: df_lim.
We visualize the trip duration vs categorical variables such as gender, usertype, weekday, month, and year using boxplots, violin plots, and histograms.
The visualizations suggest that Subscribers and Customers have significantly different distributions: a greater proportion of Customers use the cycles for trips longer than 40 minutes, whereas almost all Subscribers travel for no more than 40 minutes. Among the individuals who purchase the annual subscription, the age group with the smallest median trip duration is the elderly (60 - 130], suggesting that we can expect the elderly to use the cycles the least. Lastly, we find that Saturday and Sunday have the highest 3rd quartiles (the 75% point of the boxplot) and that March has the largest number of trips (compared to January and February), indicating possible seasonal growth during the spring. Further analysis would need to compare the data by season. Additional visualizations can be found on my Tableau Public profile.

# -- Plotting Section -- #
options(warn = -1)
# Outlier fences for cleaner plots: keep trips whose duration lies within
# five interquartile ranges of the first and third quartiles.
trip_mins <- df_merge$tripduration_mins
q1 <- quantile(trip_mins, probs = 0.25, na.rm = TRUE)
q3 <- quantile(trip_mins, probs = 0.75, na.rm = TRUE)
IQR <- q3 - q1

# -- the following commented code does not help produce better visualizations since the data is highly skewed (there are many outliers). -- #
# UB <- mean(df_merge$tripduration_mins, na.rm = TRUE) + 1*sd(df_merge$tripduration_mins, na.rm = TRUE)
# LB <- mean(df_merge$tripduration_mins, na.rm = TRUE) - 1*sd(df_merge$tripduration_mins, na.rm = TRUE)
# -- #

LB <- q1 - 5 * IQR
UB <- q3 + 5 * IQR
# Rows with a missing duration fail both comparisons and are dropped as well.
df_lim <- filter(df_merge, tripduration_mins >= LB, tripduration_mins <= UB)

# Boxplots
# Violin plots of trip duration with a narrow white box plot drawn on top,
# first by gender and then by user type. Every layer inherits the data and the
# x/y/fill mapping from ggplot(); columns are named directly because
# `df_lim$col` inside aes() bypasses ggplot2's data masking and is flagged by
# ggplot2 as discouraged.
ggplot(df_lim, aes(x = gender, y = tripduration_mins, fill = gender)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_boxplot(width = 0.075, alpha = 0.5, fill = "white") +
  labs(
    x = "Gender",
    y = "Trip Duration (Minutes)",
    title = "Box Plot: Gender vs Trip Duration (Minutes)",
    fill = "Gender"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$gender)))) +
  theme_linedraw()

ggplot(df_lim, aes(x = usertype, y = tripduration_mins, fill = usertype)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_boxplot(width = 0.075, alpha = 0.5, fill = "white") +
  labs(
    x = "UserType",
    y = "Trip Duration (Minutes)",
    title = "Box Plot: UserType vs Trip Duration (Minutes)",
    fill = "UserType"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$usertype)))) +
  theme_linedraw()

# Trip duration by age group: a violin + white box plot filled by age group,
# then box plots split by user type. Layers inherit the mapping from ggplot()
# rather than re-specifying it with `df_lim$col`, which ggplot2 discourages.
# NOTE(review): age_group is not created in the code visible here; it is
# presumably added to the data in an earlier step - confirm.
ggplot(df_lim, aes(x = age_group, y = tripduration_mins, fill = age_group)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_boxplot(width = 0.075, alpha = 0.5, fill = "white") +
  labs(
    x = "Age Group",
    y = "Trip Duration (Minutes)",
    title = "Box Plot: Age Group vs Trip Duration (Minutes)",
    fill = "Age Group"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$age_group)))) +
  theme_linedraw()

ggplot(df_lim, aes(x = age_group, y = tripduration_mins, fill = usertype)) +
  geom_boxplot() +
  labs(
    x = "Age Group",
    y = "Trip Duration (Minutes)",
    title = "Box Plot: Age Group vs Trip Duration (Minutes)",
    fill = "UserType"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$usertype)))) +
  theme_linedraw()

# Density plots
# Stacked histogram of trip duration, filled by gender.
# alpha is a fixed setting, so it is passed outside aes(): inside aes() it is
# treated as a mapped variable, which adds a meaningless "alpha" legend and
# does not render the bars at the requested opacity. Raise the value if the
# bars look too faint.
ggplot(df_lim, aes(x = tripduration_mins, fill = gender)) +
  geom_histogram(alpha = 0.1) +
  labs(
    x = "Trip Duration (Minutes)",
    y = "Frequency",
    title = "Histogram: Trip Duration (Minutes)",
    fill = "Gender"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$gender)))) +
  theme_linedraw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Kernel density estimate of trip duration for each user type.
# alpha is a fixed setting and therefore sits outside aes(); inside aes() it
# would be mapped as a variable and add a spurious legend. Columns are named
# directly instead of through `df_lim$`, which ggplot2 discourages in aes().
# The y axis shows a density, not a probability, so it is labelled as such.
ggplot(df_lim, aes(x = tripduration_mins, fill = usertype)) +
  geom_density(alpha = 0.1) +
  labs(
    x = "Trip Duration (Minutes)",
    y = "Probability Density",
    title = "PDF: Trip Duration (Minutes)",
    fill = "UserType"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$usertype)))) +
  theme_linedraw()

# Empirical cumulative distribution of trip duration for each user type.
ggplot(df_lim, aes(x = tripduration_mins, color = usertype)) +
  stat_ecdf(geom = "step") +
  labs(
    x = "Trip Duration (Minutes)",
    y = "Probability",
    title = "CDF: Trip Duration (Minutes)",
    color = "UserType"
  ) +
  scale_color_manual(values = viridis(n = length(unique(df_lim$usertype)))) +
  theme_linedraw()

# Calendar order for the weekday axis (character values sort alphabetically).
weekday_levels <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
)

# Violin + white box plot of trip duration for each weekday.
# The box-plot layer inherits the re-levelled `weekday` from the pipeline;
# mapping it to `df_lim$weekday` would substitute the original, unordered
# character column for that layer.
df_lim %>%
  mutate(weekday = factor(weekday, levels = weekday_levels)) %>%
  ggplot(aes(x = weekday, y = tripduration_mins, fill = weekday)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_boxplot(width = 0.075, alpha = 0.5, fill = "white") +
  labs(
    x = "Weekday",
    y = "Trip Duration (Minutes)",
    title = "Box Plot: Weekday vs Trip Duration (Minutes)",
    fill = "Weekday"
  ) +
  scale_fill_manual(values = viridis(n = length(weekday_levels))) +
  theme_linedraw()

# Trip duration for each weekday, split by user type. The weekdays appear in
# calendar order because the layer uses the re-levelled factor.
df_lim %>%
  mutate(weekday = factor(weekday, levels = weekday_levels)) %>%
  ggplot(aes(x = weekday, y = tripduration_mins, fill = usertype)) +
  geom_boxplot(alpha = 0.5) +
  labs(
    x = "Weekday",
    y = "Trip Duration (Minutes)",
    title = "Box Plot: Weekday vs Trip Duration (Minutes) by UserType",
    fill = "UserType"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$usertype)))) +
  theme_linedraw()

# Trip duration for each age group, split by user type.
# NOTE(review): age_group is not created in the code visible here; it is
# presumably added to the data in an earlier step - confirm.
ggplot(df_lim, aes(x = age_group, y = tripduration_mins, fill = usertype)) +
  geom_boxplot(alpha = 0.5) +
  labs(
    x = "Age Group",
    y = "Trip Duration (Minutes)",
    title = "Box Plot: Age Group vs Trip Duration (Minutes) by UserType",
    fill = "UserType"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$usertype)))) +
  theme_linedraw()

# Plain box plot of trip duration for each weekday, in calendar order.
df_lim %>%
  mutate(weekday = factor(weekday, levels = weekday_levels)) %>%
  ggplot(aes(x = weekday, y = tripduration_mins, fill = weekday)) +
  geom_boxplot(alpha = 0.5) +
  labs(
    x = "Weekday",
    y = "Trip Duration (Minutes)",
    title = "Box Plot: Weekday vs Trip Duration (Minutes)",
    fill = "Weekday"
  ) +
  scale_fill_manual(values = viridis(n = length(weekday_levels))) +
  theme_linedraw()

# -- Time Plots -- #
# Number of trips per month, stacked by year.
# - geom_bar() is the counting geom for a discrete x; geom_histogram() with
#   stat = "count" only reaches the same result by overriding its binning stat.
# - month is re-levelled to calendar order (character values sort
#   alphabetically). Assumes month holds full English month names, as in the
#   merged data shown earlier - confirm if the column is recoded upstream.
# - year is wrapped in factor() so the manual (discrete) fill scale is valid
#   whether year arrives as a number or as an already-discrete column.
df_lim %>%
  mutate(month = factor(month, levels = month.name)) %>%
  ggplot(aes(x = month, fill = factor(year))) +
  geom_bar() +
  labs(
    x = "Month",
    y = "Number of Trips",
    title = "Number of Trips vs Month",
    fill = "Year"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$year)))) +
  theme_linedraw()

# Number of trips per week number, stacked by year.
# NOTE(review): week_num is not created in the code visible here; it is
# presumably added to the data in an earlier step - confirm.
ggplot(df_lim, aes(x = week_num, fill = factor(year))) +
  geom_histogram() +
  labs(
    x = "Week Number",
    y = "Number of Trips",
    title = "Number of Trips vs Week Number",
    fill = "Year"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$year)))) +
  theme_linedraw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

S5: Summary and Recommendations