This analysis was performed by Anthony at the beginning of June as part of his Coursera Google Data Analytics certification. The data is provided through Coursera, and the study aims to understand the characteristics of cyclists who purchase a membership.
We load the data using the read_csv() function and merge the two datasets (which have similar, but not identical, columns). Before cleaning the data, we inspect the columns and their classes, and consider which modifications and formatting are needed, since these affect the visualizations.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(skimr)
library(lubridate)
library(readxl)
library(viridis)
## Loading required package: viridisLite
options(warn = -1)
# -- S2: Load, Clean, and Extrapolate the Data -- #
# Folder holding the two quarterly exports. It defaults to the original
# location, but can be redirected with the DIVVY_DATA_DIR environment variable
# so the script also runs on machines with a different directory layout.
data_dir <- Sys.getenv("DIVVY_DATA_DIR", unset = "")
if (!nzchar(data_dir)) {
  data_dir <- "C:/Users/Anthony Morciglio/OneDrive/Coursera Google Data Analytics/Divvy_Trips_2019_2020"
}
f1 <- file.path(data_dir, "Divvy_Trips_2019_Q1_CL.csv")
f2 <- file.path(data_dir, "Divvy_Trips_2020_Q1_CL.csv")
# Read the data that was cleaned in Excel and exported as a CSV file.
Divvy_Trips_2019_Q1 <- read_csv(f1)
## Rows: 365069 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): start_time, end_time, trip_duration, weekday, month, start_station_...
## dbl (7): trip_id, tripduration_mins, year, bikeid, start_station_id, end_sta...
## num (1): tripduration
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the 2020 Q1 export the same way; no column types are given, so readr
# guesses them from the file contents.
Divvy_Trips_2020_Q1 <- read_csv(file = f2)
## Rows: 426887 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): trip_id, start_time, end_time, weekday, month, start_station_name,...
## dbl (9): tripduration, tripduration_mins, year, start_station_id, end_stati...
## time (1): trip_duration
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Join the data
# Bring trip_id to character in both frames so the column has a single type.
# The 2019 ids were parsed as doubles, and as.character() on a double may
# switch to scientific notation (22000000 becomes "2.2e+07"), which corrupts
# the identifier. Numeric ids are therefore written in fixed notation.
trip_id_to_chr <- function(id) {
  if (!is.numeric(id)) {
    return(as.character(id))
  }
  out <- format(id, scientific = FALSE, trim = TRUE)
  out[is.na(id)] <- NA_character_ # format() would otherwise yield the text "NA"
  out
}
Divvy_Trips_2019_Q1$trip_id <- trip_id_to_chr(Divvy_Trips_2019_Q1$trip_id)
Divvy_Trips_2020_Q1$trip_id <- trip_id_to_chr(Divvy_Trips_2020_Q1$trip_id)
# merge() without `by` joins on every column name the two frames share, and
# all = TRUE keeps the unmatched rows of both sides (full outer join). The row
# counts printed in this report (365,069 + 426,887 = 791,956) show that no
# 2019 row matched a 2020 row, so the result is effectively the two frames
# stacked; columns that exist in only one file are NA for the other file.
df_merge <- merge(Divvy_Trips_2019_Q1, Divvy_Trips_2020_Q1, all = TRUE)
# Glance at the data
# tail() prints the last six rows (its default) of the merged frame.
tail(df_merge)
## trip_id start_time end_time trip_duration
## 791951 FFFF2D5AE185EFFB 3/8/2020 20:51 3/8/2020 20:57 393
## 791952 FFFF33C12C91FAC9 3/12/2020 9:04 3/12/2020 9:04 <NA>
## 791953 FFFF35F930C6A3B1 3/29/2020 15:44 3/29/2020 15:48 199
## 791954 FFFF484842A1315A 1/27/2020 16:30 1/27/2020 16:35 309
## 791955 FFFF6663FA25657F 3/2/2020 8:50 3/2/2020 9:01 652
## 791956 FFFFED71E01CE28F 2/21/2020 16:15 2/21/2020 16:33 1095
## tripduration tripduration_mins weekday month year
## 791951 393 6.55 Sunday March 2020
## 791952 NA NA Thursday March 2020
## 791953 199 3.32 Sunday March 2020
## 791954 309 5.15 Monday January 2020
## 791955 652 10.87 Monday March 2020
## 791956 1095 18.25 Friday February 2020
## start_station_name start_station_id end_station_name
## 791951 Halsted St & Polk St 108 Loomis St & Lexington St
## 791952 HQ QR 675 HQ QR
## 791953 Shore Dr & 55th St 247 Lake Park Ave & 56th St
## 791954 Canal St & Madison St 174 Milwaukee Ave & Grand Ave
## 791955 Clark St & Ida B Wells Dr 50 Adler Planetarium
## 791956 Daley Center Plaza 81 Sedgwick St & Webster Ave
## end_station_id usertype bikeid gender birthyear start_lat start_lng
## 791951 320 Subscriber NA <NA> NA 41.8718 -87.6466
## 791952 675 Customer NA <NA> NA 41.8899 -87.6803
## 791953 345 Subscriber NA <NA> NA 41.7952 -87.5807
## 791954 84 Subscriber NA <NA> NA 41.8821 -87.6398
## 791955 341 Subscriber NA <NA> NA 41.8759 -87.6306
## 791956 143 Subscriber NA <NA> NA 41.8842 -87.6296
## end_lat end_lng
## 791951 41.8722 -87.6615
## 791952 41.8899 -87.6803
## 791953 41.7932 -87.5878
## 791954 41.8916 -87.6484
## 791955 41.8661 -87.6073
## 791956 41.9222 -87.6389
# Compact column-by-column preview: name, type and the first few values.
glimpse(df_merge)
## Rows: 791,956
## Columns: 21
## $ trip_id <chr> "000054ABAD1C067C", "0000D320A07EE21F", "0000D37202…
## $ start_time <chr> "3/3/2020 18:08", "2/11/2020 6:44", "2/10/2020 16:1…
## $ end_time <chr> "3/3/2020 18:19", "2/11/2020 6:56", "2/10/2020 16:2…
## $ trip_duration <chr> "675", "738", "473", "2", "577", "648", "742", "388…
## $ tripduration <dbl> 675, 738, 473, 2, 577, 648, 742, 388, 1418, 1333, 2…
## $ tripduration_mins <dbl> 11.25, 12.30, 7.88, 0.03, 9.62, 10.80, 12.37, 6.47,…
## $ weekday <chr> "Tuesday", "Tuesday", "Monday", "Thursday", "Wednes…
## $ month <chr> "March", "February", "February", "February", "Febru…
## $ year <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 202…
## $ start_station_name <chr> "State St & Randolph St", "Streeter Dr & Grand Ave"…
## $ start_station_id <dbl> 44, 35, 125, 224, 127, 194, 111, 45, 195, 313, 423,…
## $ end_station_name <chr> "Wells St & Polk St", "Clinton St & Lake St", "Clin…
## $ end_station_id <dbl> 175, 66, 91, 224, 331, 111, 52, 175, 338, 67, 345, …
## $ usertype <chr> "Subscriber", "Subscriber", "Subscriber", "Subscrib…
## $ bikeid <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ gender <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ birthyear <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ start_lat <dbl> 41.8847, 41.8923, 41.8902, 41.9139, 41.9259, 41.886…
## $ start_lng <dbl> -87.6277, -87.6120, -87.6262, -87.6488, -87.6493, -…
## $ end_lat <dbl> 41.8726, 41.8856, 41.8834, 41.9139, 41.9097, 41.894…
## $ end_lng <dbl> -87.6335, -87.6418, -87.6412, -87.6488, -87.6481, -…
# Base-R structure listing of the same frame: class, dimensions, column types.
str(df_merge)
## 'data.frame': 791956 obs. of 21 variables:
## $ trip_id : chr "000054ABAD1C067C" "0000D320A07EE21F" "0000D372025B3040" "00011A7CBF765993" ...
## $ start_time : chr "3/3/2020 18:08" "2/11/2020 6:44" "2/10/2020 16:19" "2/6/2020 8:10" ...
## $ end_time : chr "3/3/2020 18:19" "2/11/2020 6:56" "2/10/2020 16:27" "2/6/2020 8:10" ...
## $ trip_duration : chr "675" "738" "473" "2" ...
## $ tripduration : num 675 738 473 2 577 ...
## $ tripduration_mins : num 11.25 12.3 7.88 0.03 9.62 ...
## $ weekday : chr "Tuesday" "Tuesday" "Monday" "Thursday" ...
## $ month : chr "March" "February" "February" "February" ...
## $ year : num 2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
## $ start_station_name: chr "State St & Randolph St" "Streeter Dr & Grand Ave" "Rush St & Hubbard St" "Halsted St & Willow St" ...
## $ start_station_id : num 44 35 125 224 127 194 111 45 195 313 ...
## $ end_station_name : chr "Wells St & Polk St" "Clinton St & Lake St" "Clinton St & Washington Blvd" "Halsted St & Willow St" ...
## $ end_station_id : num 175 66 91 224 331 111 52 175 338 67 ...
## $ usertype : chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ bikeid : num NA NA NA NA NA NA NA NA NA NA ...
## $ gender : chr NA NA NA NA ...
## $ birthyear : num NA NA NA NA NA NA NA NA NA NA ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ end_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num -87.6 -87.6 -87.6 -87.6 -87.6 ...
# Per-column summary: quartiles, mean and NA counts for numeric columns;
# length and class for character columns.
summary(df_merge)
## trip_id start_time end_time trip_duration
## Length:791956 Length:791956 Length:791956 Length:791956
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## tripduration tripduration_mins weekday month
## Min. : 0 Min. : 0.00 Length:791956 Length:791956
## 1st Qu.: 328 1st Qu.: 5.48 Class :character Class :character
## Median : 537 Median : 8.97 Mode :character Mode :character
## Mean : 1184 Mean : 19.74
## 3rd Qu.: 910 3rd Qu.: 15.18
## Max. :10628400 Max. :177140.00
## NA's :117 NA's :117
## year start_station_name start_station_id end_station_name
## Min. :2019 Length:791956 Min. : 2.0 Length:791956
## 1st Qu.:2019 Class :character 1st Qu.: 77.0 Class :character
## Median :2020 Mode :character Median :174.0 Mode :character
## Mean :2020 Mean :204.4
## 3rd Qu.:2020 3rd Qu.:291.0
## Max. :2020 Max. :675.0
##
## end_station_id usertype bikeid gender
## Min. : 2.0 Length:791956 Min. : 1 Length:791956
## 1st Qu.: 77.0 Class :character 1st Qu.:1777 Class :character
## Median :174.0 Mode :character Median :3489 Mode :character
## Mean :204.4 Mean :3430
## 3rd Qu.:291.0 3rd Qu.:5157
## Max. :675.0 Max. :6471
## NA's :1 NA's :426887
## birthyear start_lat start_lng end_lat
## Min. :1900 Min. :41.7 Min. :-87.8 Min. :41.7
## 1st Qu.:1975 1st Qu.:41.9 1st Qu.:-87.7 1st Qu.:41.9
## Median :1985 Median :41.9 Median :-87.6 Median :41.9
## Mean :1982 Mean :41.9 Mean :-87.6 Mean :41.9
## 3rd Qu.:1990 3rd Qu.:41.9 3rd Qu.:-87.6 3rd Qu.:41.9
## Max. :2003 Max. :42.1 Max. :-87.5 Max. :42.1
## NA's :444910 NA's :365069 NA's :365069 NA's :365070
## end_lng
## Min. :-87.8
## 1st Qu.:-87.7
## Median :-87.6
## Mean :-87.6
## 3rd Qu.:-87.6
## Max. :-87.5
## NA's :365070
# skimr overview of every column (missing counts, completeness and
# distribution statistics), split by column type.
skim_without_charts(df_merge)
| Name | df_merge |
| Number of rows | 791956 |
| Number of columns | 21 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| numeric | 11 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| trip_id | 0 | 1.00 | 7 | 16 | 0 | 791956 | 0 |
| start_time | 0 | 1.00 | 13 | 15 | 0 | 172752 | 0 |
| end_time | 0 | 1.00 | 13 | 15 | 0 | 172920 | 0 |
| trip_duration | 213 | 1.00 | 1 | 10 | 0 | 14915 | 0 |
| weekday | 0 | 1.00 | 6 | 9 | 0 | 7 | 0 |
| month | 0 | 1.00 | 5 | 8 | 0 | 3 | 0 |
| start_station_name | 0 | 1.00 | 5 | 43 | 0 | 636 | 0 |
| end_station_name | 1 | 1.00 | 5 | 43 | 0 | 636 | 0 |
| usertype | 0 | 1.00 | 8 | 10 | 0 | 2 | 0 |
| gender | 446598 | 0.44 | 4 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| tripduration | 117 | 1.00 | 1183.92 | 33211.18 | 0.00 | 328.00 | 537.00 | 910.00 | 10628400.00 |
| tripduration_mins | 117 | 1.00 | 19.74 | 553.52 | 0.00 | 5.48 | 8.97 | 15.18 | 177140.00 |
| year | 0 | 1.00 | 2019.54 | 0.50 | 2019.00 | 2019.00 | 2020.00 | 2020.00 | 2020.00 |
| start_station_id | 0 | 1.00 | 204.40 | 158.92 | 2.00 | 77.00 | 174.00 | 291.00 | 675.00 |
| end_station_id | 1 | 1.00 | 204.38 | 159.32 | 2.00 | 77.00 | 174.00 | 291.00 | 675.00 |
| bikeid | 426887 | 0.46 | 3429.48 | 1923.32 | 1.00 | 1777.00 | 3489.00 | 5157.00 | 6471.00 |
| birthyear | 444910 | 0.44 | 1981.67 | 11.25 | 1900.00 | 1975.00 | 1985.00 | 1990.00 | 2003.00 |
| start_lat | 365069 | 0.54 | 41.90 | 0.04 | 41.74 | 41.88 | 41.89 | 41.92 | 42.06 |
| start_lng | 365069 | 0.54 | -87.64 | 0.02 | -87.77 | -87.66 | -87.64 | -87.63 | -87.55 |
| end_lat | 365070 | 0.54 | 41.90 | 0.04 | 41.74 | 41.88 | 41.89 | 41.92 | 42.06 |
| end_lng | 365070 | 0.54 | -87.64 | 0.02 | -87.77 | -87.66 | -87.64 | -87.63 | -87.55 |
After exploring the original dataset, we notice that the trip duration is recorded in seconds; to work in minutes, we need to mutate the dataframe. Additionally, there are null values for gender, which can be replaced with "Other". Lastly, we are given the birth year of the cyclists and the start time of each trip, which will be used to determine the age of the cyclist at the time of travel.
After observing the columns/variables of the dataset with
skim_without_charts(), we notice there are a few categorical variables
such as gender and usertype. The dataframe df_merge can be grouped by
these factors to see if there are differences in measures of central
tendency/spread, such as mean/variance.
We evaluate the summary statistics and notice there is not much of a
difference in average trip duration between men and women, but there is
a difference between men/women and other. Likewise, there is a
difference in average trip duration for subscriber and customer
usertypes. We will group df_merge by the factors and perform hypothesis
testing to validate the observations.
# -- Grouping Data -- #
# NOTE(review): warn = -1 silences every warning for the rest of the session,
# which can hide real problems; consider removing it.
options(warn = -1)
# Group data based on Gender and UserType
print("--- Summarizing key statistics based on Gender ---")
## [1] "--- Summarizing key statistics based on Gender ---"
# Trip-duration statistics (in minutes) per gender.
# Only rows with a missing trip duration are dropped. A bare drop_na() removes
# rows that have an NA in ANY column, and every merged row has one (2019 rows
# have no coordinates; 2020 rows have no gender, bikeid or birthyear), so it
# used to empty the table. Rows with a missing gender are kept and summarised
# as their own NA group.
df_gender <- df_merge %>%
  drop_na(tripduration_mins) %>%
  group_by(gender) %>%
  summarize(
    avg_tripduration_mins = mean(tripduration_mins),
    med_tripduration_mins = median(tripduration_mins),
    sd_tripduration_mins = sd(tripduration_mins),
    min_tripduration_mins = min(tripduration_mins),
    max_tripduration_mins = max(tripduration_mins)
  )
print(tail(df_gender))
## # A tibble: 0 × 6
## # ℹ 6 variables: gender <chr>, avg_tripduration_mins <dbl>,
## # med_tripduration_mins <dbl>, sd_tripduration_mins <dbl>,
## # min_tripduration_mins <dbl>, max_tripduration_mins <dbl>
print("--- Summarizing key statistics based on Usertype ---")
## [1] "--- Summarizing key statistics based on Usertype ---"
# Trip-duration statistics (in minutes) per user type.
# Only rows with a missing trip duration are dropped. A bare drop_na() removes
# rows that have an NA in ANY column, and every merged row has one (2019 rows
# have no coordinates; 2020 rows have no gender, bikeid or birthyear), so it
# used to empty the table.
df_users <- df_merge %>%
  drop_na(tripduration_mins) %>%
  group_by(usertype) %>%
  summarize(
    avg_tripduration_mins = mean(tripduration_mins),
    med_tripduration_mins = median(tripduration_mins),
    sd_tripduration_mins = sd(tripduration_mins),
    min_tripduration_mins = min(tripduration_mins),
    max_tripduration_mins = max(tripduration_mins)
  )
print(tail(df_users))
## # A tibble: 0 × 6
## # ℹ 6 variables: usertype <chr>, avg_tripduration_mins <dbl>,
## # med_tripduration_mins <dbl>, sd_tripduration_mins <dbl>,
## # min_tripduration_mins <dbl>, max_tripduration_mins <dbl>
# Filter and evaluate key metrics: tripduration
# One subset per gender value. Rows whose gender is NA satisfy none of the
# comparisons and therefore land in none of the subsets.
df_gender_M <- df_merge %>% filter(gender == "Male")
df_gender_F <- df_merge %>% filter(gender == "Female")
df_gender_O <- df_merge %>% filter(gender == "Other")
# One subset per user type.
df_users_S <- df_merge %>% filter(usertype == "Subscriber")
df_users_C <- df_merge %>% filter(usertype == "Customer")
# There appears to be a difference in avg trip duration between Other and Male/Female
# We will validate the empirical observation using Hypothesis Testing
# -- Hypothesis Testing for Gender -- #
# F test on the ratio of the Male and Female population variances.
Fstat_mf <- var.test(df_gender_M$tripduration_mins, df_gender_F$tripduration_mins, alternative = "two.sided")
# Script-wide default flag; left at FALSE because the later comparisons in this
# script read it.
var_diff_bool <- FALSE
# Let the F test choose the t-test variant: pooled variances only when equal
# variances are not rejected at the 5% level, Welch's test otherwise. Before,
# equal variances were assumed unconditionally and the F test was ignored.
equal_var_mf <- !isTRUE(Fstat_mf$p.value <= 0.05)
tmp_df <- df_merge %>% filter(gender %in% c("Male", "Female"))
# T Statistic for difference in population averages
Tstat_mf <- t.test(tripduration_mins ~ gender, data = tmp_df, var.equal = equal_var_mf)
if (Tstat_mf$p.value <= 0.05) {
  print("There is a Statistical difference between the average tripduration of Male and Female")
} else {
  print("There is no Statistical difference between the average tripduration of Male and Female")
}
## [1] "There is no Statistical difference between the average tripduration of Male and Female"
# Report the Male/Female variance F test at the 5% significance level.
print(
  if (Fstat_mf[["p.value"]] <= 0.05) {
    "There is a Statistical difference between the variance of tripduration of Male and Female"
  } else {
    "There is no Statistical difference between the variance of tripduration of Male and Female"
  }
)
## [1] "There is a Statistical difference between the variance of tripduration of Male and Female"
# -- Apply same method for M/F and Other in Gender -- #
# NOTE(review): the skim output in this report lists only two distinct gender
# values, so "Other" is presumably created by a step not shown here - confirm.
# F test on the ratio of the Male and Other population variances.
Fstat_mo <- var.test(df_gender_M$tripduration_mins, df_gender_O$tripduration_mins, alternative = "two.sided")
# Let the F test choose the t-test variant: pooled variances only when equal
# variances are not rejected at the 5% level, Welch's test otherwise. Before,
# equal variances were assumed unconditionally and the F test was ignored.
equal_var_mo <- !isTRUE(Fstat_mo$p.value <= 0.05)
tmp_df <- df_merge %>% filter(gender %in% c("Male", "Other"))
Tstat_mo <- t.test(tripduration_mins ~ gender, data = tmp_df, var.equal = equal_var_mo)
if (Tstat_mo$p.value <= 0.05) {
  print("There is a Statistical difference between the average tripduration of Male and Other")
} else {
  print("There is no Statistical difference between the average tripduration of Male and Other")
}
## [1] "There is a Statistical difference between the average tripduration of Male and Other"
# Report the Male/Other variance F test at the 5% significance level.
print(
  if (Fstat_mo[["p.value"]] <= 0.05) {
    "There is a Statistical difference between the variance of tripduration of Male and Other"
  } else {
    "There is no Statistical difference between the variance of tripduration of Male and Other"
  }
)
## [1] "There is a Statistical difference between the variance of tripduration of Male and Other"
# -- Hypothesis testing for UserType -- #
# F test on the ratio of the Subscriber and Customer population variances.
Fstat_sc <- var.test(df_users_S$tripduration_mins, df_users_C$tripduration_mins, alternative = "two.sided")
# Let the F test choose the t-test variant: pooled variances only when equal
# variances are not rejected at the 5% level, Welch's test otherwise. Before,
# equal variances were assumed unconditionally and the F test was ignored.
equal_var_sc <- !isTRUE(Fstat_sc$p.value <= 0.05)
tmp_df <- df_merge %>% filter(usertype %in% c("Subscriber", "Customer"))
Tstat_sc <- t.test(tripduration_mins ~ usertype, data = tmp_df, var.equal = equal_var_sc)
if (Tstat_sc$p.value <= 0.05) {
  print("There is a Statistical difference between the average tripduration of Subscriber and Customer")
} else {
  print("There is no Statistical difference between the average tripduration of Subscriber and Customer")
}
## [1] "There is a Statistical difference between the average tripduration of Subscriber and Customer"
# Report the Subscriber/Customer variance F test at the 5% significance level.
print(
  if (Fstat_sc[["p.value"]] <= 0.05) {
    "There is a Statistical difference between the variance of tripduration of Subscriber and Customer"
  } else {
    "There is no Statistical difference between the variance of tripduration of Subscriber and Customer"
  }
)
## [1] "There is a Statistical difference between the variance of tripduration of Subscriber and Customer"
We first limit our data to exclude outliers that significantly skew the
results and make it difficult to extract insights from the
visualizations. Using the interquartile range (IQR) together with the 1st
and 3rd quartiles, we restrict the trip duration to the range
[Q1 - 5 x IQR, Q3 + 5 x IQR] and visualize the remaining dataframe: df_lim.
We visualize the trip duration vs categorical variables such as gender,
usertype, weekday, month, and year using boxplots, violin plots, and
histograms.
The visualizations suggest that Subscribers and Customers have
significantly different trip-duration distributions: a greater proportion
of Customers ride for more than 40 minutes, whereas almost all
Subscribers ride for no more than 40 minutes. Among the individuals who
purchase the annual subscription, the age group with the smallest median
trip duration is the elderly, (60, 130], suggesting we can expect the
elderly to use the cycles the least.
Lastly, we find that Saturday and Sunday have the highest 3rd quartiles
(the 75% point of the boxplot), and the month of March has the largest
number of cyclist trips (compared to January and February), indicating
possible seasonal growth during the spring. Further analysis
would need to compare the data by season. Additional
visualizations can be found on my Tableau
Public page.
# -- Plotting Section -- #
options(warn = -1)
# Trim extreme trip durations so the plots stay readable.
# Quartiles of the trip duration in minutes, ignoring missing values.
q1 <- quantile(df_merge$tripduration_mins, probs = 0.25, na.rm = TRUE)
q3 <- quantile(df_merge$tripduration_mins, probs = 0.75, na.rm = TRUE)
# Interquartile range. NOTE(review): this variable has the same name as the
# stats::IQR() function.
IQR <- q3 - q1
# -- the following commented code does not help produce better visualizations since the data is highly skewed (there are many outliers). -- #
# UB <- mean(df_merge$tripduration_mins, na.rm = TRUE) + 1*sd(df_merge$tripduration_mins, na.rm = TRUE)
# LB <- mean(df_merge$tripduration_mins, na.rm = TRUE) - 1*sd(df_merge$tripduration_mins, na.rm = TRUE)
# -- #
# Fences sit 5 IQRs beyond the quartiles.
LB <- q1 - 5 * IQR
UB <- q3 + 5 * IQR
# Keep the trips inside [LB, UB]; rows with a missing duration are dropped too.
df_lim <- filter(df_merge, tripduration_mins >= LB, tripduration_mins <= UB)
# Boxplots
# Violin of trip duration per gender with quartile lines, overlaid with a
# narrow white box plot. Both layers take data and x/y from ggplot().
df_lim %>%
  ggplot(aes(x = gender, y = tripduration_mins, fill = gender)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_boxplot(width = 0.075, alpha = 0.5, fill = "white") +
  labs(
    x = "Gender",
    y = "Trip Duration (Minutes)",
    title = "Box Plot: Gender vs Trip Duration (Minutes)",
    fill = "Gender"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$gender)))) +
  theme_linedraw()
# Violin of trip duration per user type with quartile lines, overlaid with a
# narrow white box plot. Both layers take data and x/y from ggplot().
df_lim %>%
  ggplot(aes(x = usertype, y = tripduration_mins, fill = usertype)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_boxplot(width = 0.075, alpha = 0.5, fill = "white") +
  labs(
    x = "UserType",
    y = "Trip Duration (Minutes)",
    title = "Box Plot: UserType vs Trip Duration (Minutes)",
    fill = "UserType"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$usertype)))) +
  theme_linedraw()
# Violin of trip duration per age group with quartile lines, overlaid with a
# narrow white box plot.
# NOTE(review): age_group is not created anywhere in the code shown in this
# report; it presumably comes from a step that is not displayed - confirm.
ggplot(data = df_lim, mapping = aes(x = age_group, y = tripduration_mins, fill = age_group)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_boxplot(width = 0.075, alpha = 0.5, fill = "white") +
  labs(
    x = "Age Group",
    y = "Trip Duration (Minutes)",
    title = "Box Plot: Age Group vs Trip Duration (Minutes)",
    fill = "Age Group"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$age_group)))) +
  theme_linedraw()
# Box plots of trip duration per age group, split (filled) by user type.
# NOTE(review): age_group is not created anywhere in the code shown in this
# report; it presumably comes from a step that is not displayed - confirm.
df_lim %>%
  ggplot(aes(x = age_group, y = tripduration_mins, fill = usertype)) +
  geom_boxplot() +
  labs(
    x = "Age Group",
    y = "Trip Duration (Minutes)",
    title = "Box Plot: Age Group vs Trip Duration (Minutes)",
    fill = "UserType"
  ) +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$usertype)))) +
  theme_linedraw()
# Density plots
# Stacked histogram of trip duration, filled by gender.
# alpha is a fixed setting, so it belongs outside aes(); inside aes() it is
# treated as a data value mapped through the alpha scale (and gets its own
# legend entry) instead of being used as the literal opacity. Raise the value
# if the bars look too faint.
# bins = 30 is the value stat_bin() falls back to; stating it silences the
# "Pick better value" message.
ggplot(data = df_lim, aes(x = tripduration_mins, fill = gender)) +
  geom_histogram(bins = 30, alpha = 0.1) +
  labs(x = "Trip Duration (Minutes)", y = "Frequency", title = "Histogram: Trip Duration (Minutes)", fill = "Gender") +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$gender)))) +
  theme_linedraw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Overlapping density curves of trip duration, filled by user type.
# alpha is a fixed setting, so it belongs outside aes(); inside aes() it is
# treated as a data value mapped through the alpha scale (and gets its own
# legend entry) instead of being used as the literal opacity.
# Columns are referenced by name so the layer inherits data and mapping.
ggplot(data = df_lim, aes(x = tripduration_mins, fill = usertype)) +
  geom_density(alpha = 0.1) +
  labs(x = "Trip Duration (Minutes)", y = "Probability", title = "PDF: Trip Duration (Minutes)", fill = "UserType") +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$usertype)))) +
  theme_linedraw()
# Empirical cumulative distribution of trip duration, one step curve per
# user type.
df_lim %>%
  ggplot(aes(x = tripduration_mins, color = usertype)) +
  stat_ecdf(geom = "step") +
  labs(
    x = "Trip Duration (Minutes)",
    y = "Probability",
    title = "CDF: Trip Duration (Minutes)",
    color = "UserType"
  ) +
  scale_color_manual(values = viridis(n = length(unique(df_lim$usertype)))) +
  theme_linedraw()
# Violin of trip duration per weekday (Monday to Sunday) with quartile lines,
# overlaid with a narrow white box plot.
# The box-plot layer now inherits x/y from ggplot(); it previously used
# df_lim$weekday, i.e. the original character column, and so bypassed the
# re-levelled factor created in mutate(). The title named UserType by mistake.
df_lim %>%
  mutate(weekday = factor(weekday, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%
  ggplot(aes(x = weekday, y = tripduration_mins, fill = weekday)) +
  geom_violin(trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_boxplot(width = 0.075, alpha = 0.5, fill = "white") +
  labs(x = "Weekday", y = "Trip Duration (Minutes)", title = "Box Plot: Weekday vs Trip Duration (Minutes)", fill = "Weekday") +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$weekday)))) +
  theme_linedraw()
# Box plots of trip duration per weekday, split (filled) by user type, with
# the weekdays ordered Monday to Sunday.
# The ordering did not take effect before because the box-plot layer mapped
# df_lim$weekday (the original character column) instead of the re-levelled
# factor created in mutate(); the layer now inherits the mapping from ggplot().
df_lim %>%
  mutate(weekday = factor(weekday, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%
  ggplot(aes(x = weekday, y = tripduration_mins, fill = usertype)) +
  geom_boxplot(alpha = 0.5) +
  labs(x = "Weekday", y = "Trip Duration (Minutes)", title = "Box Plot: UserType vs Trip Duration (Minutes)", fill = "UserType") +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$usertype)))) +
  theme_linedraw()
# Semi-transparent box plots of trip duration per age group, split (filled)
# by user type.
# The x axis shows age groups, so the axis label and title say so (they read
# "Weekday" / "UserType" before, copied from the weekday plot).
# NOTE(review): age_group is not created anywhere in the code shown in this
# report; it presumably comes from a step that is not displayed - confirm.
df_lim %>%
  ggplot(aes(x = age_group, y = tripduration_mins, fill = usertype)) +
  geom_boxplot(alpha = 0.5) +
  labs(x = "Age Group", y = "Trip Duration (Minutes)", title = "Box Plot: Age Group vs Trip Duration (Minutes)", fill = "UserType") +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$usertype)))) +
  theme_linedraw()
# Semi-transparent box plots of trip duration per weekday.
# Weekdays are turned into a factor ordered Monday to Sunday (a character
# column would be laid out alphabetically), and the title now names the
# variable actually plotted instead of UserType.
df_lim %>%
  mutate(weekday = factor(weekday, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%
  ggplot(aes(x = weekday, y = tripduration_mins, fill = weekday)) +
  #geom_violin(data = df_lim, trim = FALSE, draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_boxplot(alpha = 0.5) +
  labs(x = "Weekday", y = "Trip Duration (Minutes)", title = "Box Plot: Weekday vs Trip Duration (Minutes)", fill = "Weekday") +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$weekday)))) +
  theme_linedraw()
# -- Time Plots -- #
# Number of trips per month, stacked by year.
# - year is converted to a factor: the data summaries in this report show it
#   as numeric, and a manual fill scale only accepts discrete values.
# - month is ordered by calendar (month.name) instead of alphabetically.
# - geom_bar() counts rows per x value, which is what
#   geom_histogram(stat = 'count') was used for.
df_lim %>%
  mutate(
    month = factor(month, levels = month.name),
    year = factor(year)
  ) %>%
  ggplot(aes(x = month, fill = year)) +
  geom_bar() +
  labs(x = "Month", y = "Number of Trips", title = "Number of Trips vs Month", fill = "Year") +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$year)))) +
  theme_linedraw()
# Number of trips per week number, stacked by year.
# year is converted to a factor: the data summaries in this report show it as
# numeric, and a manual fill scale only accepts discrete values.
# bins = 30 is the value stat_bin() falls back to; stating it silences the
# "Pick better value" message.
# NOTE(review): week_num is not created anywhere in the code shown in this
# report. If it holds whole week numbers, binwidth = 1 would give one bar per
# week - confirm.
df_lim %>%
  mutate(year = factor(year)) %>%
  ggplot(aes(x = week_num, fill = year)) +
  geom_histogram(bins = 30) +
  labs(x = "Week Number", y = "Number of Trips", title = "Number of Trips vs Week Number", fill = "Year") +
  scale_fill_manual(values = viridis(n = length(unique(df_lim$year)))) +
  theme_linedraw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.