library(tidyverse)
library(magrittr)
library(rmarkdown)
# Folder that holds the twelve monthly trip exports (October 2021 - September 2022).
data_dir <- "~/Desktop/FINAL PROJECT/2021-2022 original files"

# Read one monthly export, identified by the "YYYYMM" prefix of its file name.
read_month <- function(yyyymm) {
  read_csv(file.path(data_dir, paste0(yyyymm, "-divvy-tripdata.csv")))
}

Oct_2021 <- read_month("202110")
Nov_2021 <- read_month("202111")
Dec_2021 <- read_month("202112")
Jan_2022 <- read_month("202201")
Feb_2022 <- read_month("202202")
Mar_2022 <- read_month("202203")
Apr_2022 <- read_month("202204")
May_2022 <- read_month("202205")
June_2022 <- read_month("202206")
July_2022 <- read_month("202207")
Aug_2022 <- read_month("202208")
Sept_2022 <- read_month("202209")
dim(Oct_2021)
[1] 631226 13
dim(Nov_2021)
[1] 359978 13
dim(Dec_2021)
[1] 247540 13
dim(Jan_2022)
[1] 103770 13
dim(Feb_2022)
[1] 115609 13
dim(Mar_2022)
[1] 284042 13
dim(Apr_2022)
[1] 371249 13
dim(May_2022)
[1] 634858 13
dim(June_2022)
[1] 769204 13
dim(July_2022)
[1] 823488 13
dim(Aug_2022)
[1] 785932 13
dim(Sept_2022)
[1] 701339 13
colnames(Oct_2021)
[1] "ride_id" "rideable_type" "started_at"
[4] "ended_at" "start_station_name" "start_station_id"
[7] "end_station_name" "end_station_id" "start_lat"
[10] "start_lng" "end_lat" "end_lng"
[13] "member_casual"
colnames(Nov_2021)
[1] "ride_id" "rideable_type" "started_at"
[4] "ended_at" "start_station_name" "start_station_id"
[7] "end_station_name" "end_station_id" "start_lat"
[10] "start_lng" "end_lat" "end_lng"
[13] "member_casual"
colnames(Dec_2021)
[1] "ride_id" "rideable_type" "started_at"
[4] "ended_at" "start_station_name" "start_station_id"
[7] "end_station_name" "end_station_id" "start_lat"
[10] "start_lng" "end_lat" "end_lng"
[13] "member_casual"
colnames(Jan_2022)
[1] "ride_id" "rideable_type" "started_at"
[4] "ended_at" "start_station_name" "start_station_id"
[7] "end_station_name" "end_station_id" "start_lat"
[10] "start_lng" "end_lat" "end_lng"
[13] "member_casual"
colnames(Feb_2022)
[1] "ride_id" "rideable_type" "started_at"
[4] "ended_at" "start_station_name" "start_station_id"
[7] "end_station_name" "end_station_id" "start_lat"
[10] "start_lng" "end_lat" "end_lng"
[13] "member_casual"
colnames(Mar_2022)
[1] "ride_id" "rideable_type" "started_at"
[4] "ended_at" "start_station_name" "start_station_id"
[7] "end_station_name" "end_station_id" "start_lat"
[10] "start_lng" "end_lat" "end_lng"
[13] "member_casual"
colnames(Apr_2022)
[1] "ride_id" "rideable_type" "started_at"
[4] "ended_at" "start_station_name" "start_station_id"
[7] "end_station_name" "end_station_id" "start_lat"
[10] "start_lng" "end_lat" "end_lng"
[13] "member_casual"
colnames(May_2022)
[1] "ride_id" "rideable_type" "started_at"
[4] "ended_at" "start_station_name" "start_station_id"
[7] "end_station_name" "end_station_id" "start_lat"
[10] "start_lng" "end_lat" "end_lng"
[13] "member_casual"
colnames(June_2022)
[1] "ride_id" "rideable_type" "started_at"
[4] "ended_at" "start_station_name" "start_station_id"
[7] "end_station_name" "end_station_id" "start_lat"
[10] "start_lng" "end_lat" "end_lng"
[13] "member_casual"
colnames(July_2022)
[1] "ride_id" "rideable_type" "started_at"
[4] "ended_at" "start_station_name" "start_station_id"
[7] "end_station_name" "end_station_id" "start_lat"
[10] "start_lng" "end_lat" "end_lng"
[13] "member_casual"
colnames(Aug_2022)
[1] "ride_id" "rideable_type" "started_at"
[4] "ended_at" "start_station_name" "start_station_id"
[7] "end_station_name" "end_station_id" "start_lat"
[10] "start_lng" "end_lat" "end_lng"
[13] "member_casual"
colnames(Sept_2022)
[1] "ride_id" "rideable_type" "started_at"
[4] "ended_at" "start_station_name" "start_station_id"
[7] "end_station_name" "end_station_id" "start_lat"
[10] "start_lng" "end_lat" "end_lng"
[13] "member_casual"
Note many NAs.
View(Oct_2021)
# Stack the monthly tables into four quarters of three consecutive months each,
# starting with October 2021.
fiscal_year_2021_2022_1Q <- bind_rows(list(Oct_2021, Nov_2021, Dec_2021))
fiscal_year_2021_2022_2Q <- bind_rows(list(Jan_2022, Feb_2022, Mar_2022))
fiscal_year_2021_2022_3Q <- bind_rows(list(Apr_2022, May_2022, June_2022))
fiscal_year_2021_2022_4Q <- bind_rows(list(July_2022, Aug_2022, Sept_2022))
dim(fiscal_year_2021_2022_1Q)
[1] 1238744 13
dim(fiscal_year_2021_2022_2Q)
[1] 503421 13
dim(fiscal_year_2021_2022_3Q)
[1] 1775311 13
dim(fiscal_year_2021_2022_4Q)
[1] 2310759 13
# Combine the four quarters into one table covering the whole year.
cyclistic_2021_2022 <- list(
  fiscal_year_2021_2022_1Q,
  fiscal_year_2021_2022_2Q,
  fiscal_year_2021_2022_3Q,
  fiscal_year_2021_2022_4Q
) %>%
  bind_rows()
glimpse(cyclistic_2021_2022)
Rows: 5,828,235
Columns: 13
$ ride_id <chr> "620BC6107255BF4C", "4471C70731AB2E45", "26CA69D43D15EE14…
$ rideable_type <chr> "electric_bike", "electric_bike", "electric_bike", "elect…
$ started_at <dttm> 2021-10-22 12:46:42, 2021-10-21 09:12:37, 2021-10-16 16:…
$ ended_at <dttm> 2021-10-22 12:49:50, 2021-10-21 09:14:14, 2021-10-16 16:…
$ start_station_name <chr> "Kingsbury St & Kinzie St", NA, NA, NA, NA, NA, NA, NA, N…
$ start_station_id <chr> "KA1503000043", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ end_station_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ end_station_id <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ start_lat <dbl> 41.88919, 41.93000, 41.92000, 41.92000, 41.89000, 41.8900…
$ start_lng <dbl> -87.63850, -87.70000, -87.70000, -87.69000, -87.71000, -8…
$ end_lat <dbl> 41.89000, 41.93000, 41.94000, 41.92000, 41.89000, 41.9300…
$ end_lng <dbl> -87.63000, -87.71000, -87.72000, -87.69000, -87.69000, -8…
$ member_casual <chr> "member", "member", "member", "member", "member", "member…
table(cyclistic_2021_2022$member_casual)
casual member
2401286 3426949
table(cyclistic_2021_2022$rideable_type)
classic_bike docked_bike electric_bike
2740516 192475 2895244
# Give five of the raw columns more descriptive names (new_name = old_name).
cyclistic_2021_2022 <- rename(
  cyclistic_2021_2022,
  trip_id = ride_id,
  bike_type = rideable_type,
  start_time = started_at,
  end_time = ended_at,
  user_type = member_casual
)
names(cyclistic_2021_2022)
[1] "trip_id" "bike_type" "start_time"
[4] "end_time" "start_station_name" "start_station_id"
[7] "end_station_name" "end_station_id" "start_lat"
[10] "start_lng" "end_lat" "end_lng"
[13] "user_type"
# TRUE when at least one cell is missing. anyNA() checks column by column, so it
# does not build the full rows x columns logical matrix that is.na() would.
anyNA(cyclistic_2021_2022)
[1] TRUE
library(skimr)
library(janitor)
skim(cyclistic_2021_2022)
── Data Summary ────────────────────────
Values
Name cyclistic_2021_2022
Number of rows 5828235
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None
5828235 rows; the trip_id has a consistent string length of 16 characters
3 unique types of bikes
1591 unique start stations, which do not match the unique start station ids at 1302
1609 unique end stations, which do not match the unique end station ids at 1309
Both of these suggest that the station names are not consistent
Also, curious that the number of unique start and end stations is not the same.
895032 start stations and ids each are missing
958227 end stations and ids each are missing
0 missing start latitudes or longitudes
5844 missing end latitudes and longitudes each
2 unique user types, with no white space
# Keep only complete rows. With no column selection, drop_na() checks every
# column, which is what listing all thirteen columns did.
cyclistic_2021_2022_V2 <- drop_na(cyclistic_2021_2022)
dim(cyclistic_2021_2022_V2)
[1] 4474141 13
sum(duplicated(cyclistic_2021_2022_V2$trip_id))
[1] 0
cyclistic_2021_2022_V2 <- clean_names(cyclistic_2021_2022_V2)
n_distinct(cyclistic_2021_2022_V2$start_station_name)
[1] 1480
n_distinct(cyclistic_2021_2022_V2$end_station_name)
[1] 1513
skim(cyclistic_2021_2022_V2)
── Data Summary ────────────────────────
Values
Name cyclistic_2021_2022_V2
Number of rows 4474141
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None
5828235 rows went to 4474141
1480 unique start stations, which do not match the unique start station ids at 1252
1513 unique end stations, which do not match the unique end station ids at 1266
Also, curious that the number of unique start and end stations is not the same.
0 start stations and ids are missing
0 end stations and ids are missing
0 missing start latitudes or longitudes
0 missing end latitudes and longitudes
2 unique user types
# Station names to exclude from the analysis. Judging by the names these are
# depots, repair/test locations and charging docks -- confirm against the
# station list if in doubt.
excluded_stations <- c(
  "Base - 2132 W Hubbard",
  "Base - 2132 W Hubbard Warehouse",
  "Hastings WH 2",
  "DIVVY CASSETTE REPAIR MOBILE STATION",
  "Throop/Hastings Mobile Station",
  "Bissell St & Armitage Ave - Charging",
  "Lincoln Ave & Roscoe St - Charging",
  "Pawel Bialowas - Test- PBSC charging station",
  "Wilton Ave & Diversey Pkwy - Charging"
)

# Drop every trip that starts or ends at one of the excluded stations.
# %in% never returns NA, so a missing station name cannot produce all-NA rows
# the way a chain of `==` comparisons inside `[` would.
cyclistic_2021_2022_V3 <- cyclistic_2021_2022_V2 %>%
  filter(
    !start_station_name %in% excluded_stations,
    !end_station_name %in% excluded_stations
  )
skim(cyclistic_2021_2022_V3)
── Data Summary ────────────────────────
Values
Name cyclistic_2021_2022_V3
Number of rows 4472599
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None
5828235 rows became 4474141 became 4472599
1472 unique start stations, which do not match the unique start station ids at 1245
1502 unique end stations, which do not match the unique end station ids at 1258
0 start stations and ids are missing
0 end stations and ids are missing
0 missing start latitudes or longitudes
0 missing end latitudes and longitudes
2 unique user types
# Derive calendar fields from the trip start time. month, day and year are
# zero-padded / four-digit strings; day_of_week is the full weekday name.
cyclistic_2021_2022_V3 <- cyclistic_2021_2022_V3 %>%
  mutate(
    date = as.Date(start_time),
    month = format(date, "%m"),
    day = format(date, "%d"),
    year = format(date, "%Y"),
    day_of_week = format(date, "%A")
  )
colnames(cyclistic_2021_2022_V3)
[1] "trip_id" "bike_type" "start_time"
[4] "end_time" "start_station_name" "start_station_id"
[7] "end_station_name" "end_station_id" "start_lat"
[10] "start_lng" "end_lat" "end_lng"
[13] "user_type" "date" "month"
[16] "day" "year" "day_of_week"
Note: the month is stored as a number; change it to a month name later.
View(cyclistic_2021_2022_V3)
Note to retype to integer.
# Trip length as end_time - start_time. The unit is pinned to seconds because
# the later outlier thresholds (60 and 86400) are expressed in seconds; with the
# default units = "auto", difftime() picks the unit from the data.
cyclistic_2021_2022_V3$trip_duration <- difftime(
  cyclistic_2021_2022_V3$end_time,
  cyclistic_2021_2022_V3$start_time,
  units = "secs"
)
glimpse(cyclistic_2021_2022_V3)
Rows: 4,472,599
Columns: 19
$ trip_id <chr> "614B15BC42810184", "ADCC6E3CF9C04688", "6184CC57243AEF3C…
$ bike_type <chr> "docked_bike", "classic_bike", "docked_bike", "docked_bik…
$ start_time <dttm> 2021-10-05 10:56:05, 2021-10-06 13:55:33, 2021-10-16 10:…
$ end_time <dttm> 2021-10-05 11:38:48, 2021-10-06 13:58:16, 2021-10-16 12:…
$ start_station_name <chr> "Michigan Ave & Oak St", "Desplaines St & Kinzie St", "Mi…
$ start_station_id <chr> "13042", "TA1306000003", "13042", "13042", "KA1503000043"…
$ end_station_name <chr> "Michigan Ave & Oak St", "Kingsbury St & Kinzie St", "Mic…
$ end_station_id <chr> "13042", "KA1503000043", "13042", "13042", "TA1306000003"…
$ start_lat <dbl> 41.90096, 41.88872, 41.90096, 41.90096, 41.88918, 42.0582…
$ start_lng <dbl> -87.62378, -87.64445, -87.62378, -87.62378, -87.63851, -8…
$ end_lat <dbl> 41.90096, 41.88918, 41.90096, 41.90096, 41.88872, 42.0582…
$ end_lng <dbl> -87.62378, -87.63851, -87.62378, -87.62378, -87.64445, -8…
$ user_type <chr> "casual", "member", "casual", "casual", "member", "member…
$ date <date> 2021-10-05, 2021-10-06, 2021-10-16, 2021-10-24, 2021-10-…
$ month <chr> "10", "10", "10", "10", "10", "10", "10", "10", "10", "10…
$ day <chr> "05", "06", "16", "24", "23", "25", "01", "21", "08", "31…
$ year <chr> "2021", "2021", "2021", "2021", "2021", "2021", "2021", "…
$ day_of_week <chr> "Tuesday", "Wednesday", "Saturday", "Sunday", "Saturday",…
$ trip_duration <drtn> 2563 secs, 163 secs, 6097 secs, 7587 secs, 125 secs, 307…
Note that trip_duration contains trips shorter than 60 seconds (the minimum is -7621 seconds) and trips longer than 86400 seconds, i.e. over 24 hours (the maximum is 2442301 seconds). These outliers likely represent false starts, breakdowns, or theft and should be removed.
# Convert the difftime column to a plain number of seconds. Asking for
# units = "secs" keeps the result in seconds whatever unit the difftime carries,
# and avoids the round trip through character.
cyclistic_2021_2022_V3$trip_duration <- as.numeric(
  cyclistic_2021_2022_V3$trip_duration,
  units = "secs"
)
summary(cyclistic_2021_2022_V3)
trip_id bike_type start_time
Length:4472599 Length:4472599 Min. :2021-10-01 00:00:09.00
Class :character Class :character 1st Qu.:2022-03-05 17:19:39.50
Mode :character Mode :character Median :2022-06-09 21:10:45.00
Mean :2022-05-08 21:08:27.41
3rd Qu.:2022-08-02 08:37:36.00
Max. :2022-09-30 23:59:56.00
end_time start_station_name start_station_id
Min. :2021-10-01 00:03:51.0 Length:4472599 Length:4472599
1st Qu.:2022-03-05 17:48:25.5 Class :character Class :character
Median :2022-06-09 21:29:44.0 Mode :character Mode :character
Mean :2022-05-08 21:25:53.9
3rd Qu.:2022-08-02 08:51:07.5
Max. :2022-10-01 14:22:35.0
end_station_name end_station_id start_lat start_lng
Length:4472599 Length:4472599 Min. :41.65 Min. :-87.83
Class :character Class :character 1st Qu.:41.88 1st Qu.:-87.66
Mode :character Mode :character Median :41.90 Median :-87.64
Mean :41.90 Mean :-87.64
3rd Qu.:41.93 3rd Qu.:-87.63
Max. :42.06 Max. :-87.53
end_lat end_lng user_type date
Min. :41.65 Min. :-87.83 Length:4472599 Min. :2021-10-01
1st Qu.:41.88 1st Qu.:-87.66 Class :character 1st Qu.:2022-03-05
Median :41.90 Median :-87.64 Mode :character Median :2022-06-09
Mean :41.90 Mean :-87.64 Mean :2022-05-08
3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:2022-08-02
Max. :42.06 Max. :-87.53 Max. :2022-09-30
month day year day_of_week
Length:4472599 Length:4472599 Length:4472599 Length:4472599
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
trip_duration
Min. : -7621
1st Qu.: 370
Median : 647
Mean : 1046
3rd Qu.: 1160
Max. :2442301
Note that 74443 + 269 = 74712 trips fall outside the accepted duration range (60 seconds or less, or 24 hours or more).
length(which(cyclistic_2021_2022_V3$trip_duration <= 60))
[1] 74443
length(which(cyclistic_2021_2022_V3$trip_duration >= 86400))
[1] 269
Note that 4472599-74712 = 4397887.
# Keep trips strictly longer than 60 seconds and strictly shorter than
# 86400 seconds (24 hours).
cyclistic_2021_2022_CLEAN <- filter(
  cyclistic_2021_2022_V3,
  trip_duration > 60,
  trip_duration < 86400
)
dim(cyclistic_2021_2022_CLEAN)
[1] 4397887 19
Note that no trips of 60 seconds or less, or of 24 hours or more, remain in the cleaned data.
length(which(cyclistic_2021_2022_CLEAN$trip_duration <= 60))
[1] 0
length(which(cyclistic_2021_2022_CLEAN$trip_duration >= 86400))
[1] 0
# Save the cleaned table to the working directory. The file name must be wrapped
# in plain ASCII double quotes; typographic quotes are a syntax error in R.
write.csv(cyclistic_2021_2022_CLEAN, "cyclistic_2021_2022_CLEAN.csv")
# Work on a copy for the analysis so the cleaned table stays as written.
cyclistic_analysis <- cyclistic_2021_2022_CLEAN
library(scales)
library(data.table)
library(formattable)
# Cell formatter for share columns: bold text coloured "tomato" above 0.50,
# "steelblue" below 0.50 and "black" at exactly 0.50, preceded by an up arrow
# above 0.50 and a down arrow otherwise.
improvement_formatter <- formatter(
  "span",
  style = x ~ style(
    font.weight = "bold",
    color = ifelse(x > .50, "tomato", ifelse(x < .50, "steelblue", "black"))
  ),
  x ~ icontext(ifelse(x > .50, "arrow-up", "arrow-down"), x)
)
# Ride count per user type and each type's share of all rides.
user_percent <- cyclistic_analysis %>%
  count(user_type, name = "total_rides") %>%
  mutate(percent = percent(total_rides / sum(total_rides), 0))

formattable(
  user_percent,
  align = c("l", "c", "r"),
  list(
    'user_type' = color_tile("seashell2", "seashell3"),
    'percent' = improvement_formatter
  )
)

# Pie chart of the same shares (one stacked bar drawn in polar coordinates).
ggplot(user_percent, aes(x = "", y = percent, fill = user_type)) +
  geom_col(color = "black") +
  geom_label(
    aes(label = percent),
    color = c("white", 1),
    position = position_stack(vjust = 0.5),
    show.legend = FALSE
  ) +
  coord_polar(theta = "y") +
  scale_fill_viridis_d() +
  labs(fill = "Percentage of Total Rides by User Type") +
  theme_void()
Note that month references should be changed to names.
table(cyclistic_2021_2022_CLEAN$month)
01 02 03 04 05 06 07 08 09 10 11 12
79015 87606 212719 268413 493923 609513 630474 593928 525033 471244 252093 173926
Note that the month names are now out of chronological order because the values are strings and sort alphabetically.
# Start again from the cleaned table and replace the two-digit month strings
# with the built-in three-letter abbreviations ("01" -> "Jan", ...).
cyclistic_analysis <- cyclistic_2021_2022_CLEAN
cyclistic_analysis$month <- month.abb[as.integer(cyclistic_analysis$month)]
table(cyclistic_analysis$month)
Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep
268413 593928 173926 87606 79015 630474 609513 212719 493923 252093 471244 525033
# Make month an ordered factor in calendar order (month.abb is Jan ... Dec) so
# tables and plots no longer sort it alphabetically.
cyclistic_analysis$month <- factor(
  cyclistic_analysis$month,
  levels = month.abb,
  ordered = TRUE
)
table(cyclistic_analysis$month)
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
79015 87606 212719 268413 493923 609513 630474 593928 525033 471244 252093 173926
# Remove a leftover object only if it exists; a bare rm() on a missing object
# raises a warning.
if (exists("month_totals", inherits = FALSE)) {
  rm(month_totals)
}

# Ride count per month and user type. After summarize() the table is still
# grouped by month, so percent is each user type's share within that month.
rides_by_month <- cyclistic_analysis %>%
  group_by(month, user_type) %>%
  summarize(total_rides = n(), .groups = "drop_last") %>%
  mutate(percent = percent(total_rides / sum(total_rides), 0))

# One alignment entry per column (month, user_type, total_rides, percent).
formattable(
  rides_by_month,
  align = c("l", "c", "c", "r"),
  list(
    'month' = color_tile("pink", "lightblue"),
    'percent' = improvement_formatter
  )
)
# Ride counts per month, one bar per user type placed side by side.
cyclistic_analysis %>%
  group_by(user_type, month) %>%
  summarise(number_of_rides = n()) %>%
  arrange(user_type, month) %>%
  ggplot(aes(x = month, y = number_of_rides, fill = user_type)) +
  geom_col(position = "dodge", color = "black") +
  scale_fill_hue(l = 40) +
  labs(title = "Ride Count by User by Month", fill = "User Type")

# Each user type's share of the rides within a month.
cyclistic_analysis %>%
  group_by(month, user_type) %>%
  summarise(total_rides = n()) %>%
  mutate(percent = percent(total_rides / sum(total_rides))) %>%
  arrange(user_type, month) %>%
  ggplot(aes(x = month, y = percent, fill = user_type)) +
  geom_col(position = "dodge", color = "black") +
  scale_fill_hue(l = 40) +
  labs(title = "Percentage Rides by User by Month", fill = "User Type")
table(cyclistic_2021_2022_CLEAN$day_of_week)
Friday Monday Saturday Sunday Thursday Tuesday Wednesday
626214 573444 726464 605710 626459 620765 618831
# Make day_of_week an ordered factor running Sunday through Saturday so tables
# and plots no longer sort it alphabetically.
cyclistic_analysis$day_of_week <- factor(
  cyclistic_analysis$day_of_week,
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  ),
  ordered = TRUE
)
table(cyclistic_analysis$day_of_week)
Sunday Monday Tuesday Wednesday Thursday Friday Saturday
605710 573444 620765 618831 626459 626214 726464
# Ride count per weekday and user type. After summarize() the table is still
# grouped by day_of_week, so percent is each user type's share within that day.
rides_by_day_of_week <- cyclistic_analysis %>%
  group_by(day_of_week, user_type) %>%
  summarize(total_rides = n(), .groups = "drop_last") %>%
  mutate(percent = percent(total_rides / sum(total_rides), 0))

# One alignment entry per column (day_of_week, user_type, total_rides, percent).
formattable(
  rides_by_day_of_week,
  align = c("l", "c", "c", "r"),
  list(
    'day_of_week' = color_tile("pink", "lightblue"),
    'percent' = improvement_formatter
  )
)
# Ride counts per weekday, one bar per user type placed side by side.
cyclistic_analysis %>%
  group_by(user_type, day_of_week) %>%
  summarise(number_of_rides = n()) %>%
  arrange(user_type, day_of_week) %>%
  ggplot(aes(x = day_of_week, y = number_of_rides, fill = user_type)) +
  geom_col(position = "dodge", color = "black") +
  scale_fill_hue(l = 40) +
  labs(title = "Ride Count by User by Day", fill = "User Type")

# Each user type's share of the rides within a weekday.
cyclistic_analysis %>%
  group_by(day_of_week, user_type) %>%
  summarise(total_rides = n()) %>%
  mutate(percent = formattable::percent(total_rides / sum(total_rides))) %>%
  arrange(user_type, day_of_week) %>%
  ggplot(aes(x = day_of_week, y = percent, fill = user_type)) +
  geom_col(position = "dodge", color = "black") +
  scale_fill_hue(l = 40) +
  labs(title = "Percentage of Rides by User by Day", fill = "User Type")
round(summary(cyclistic_analysis$trip_duration)/60)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1 6 11 18 20 1439
Note that the average duration of a casual user’s ride is roughly twice as long as a member’s ride.
# Mean, median, shortest and longest trip per user type. trip_duration is in
# seconds, so dividing by 60 gives minutes (rounded to whole minutes).
summary_trip_duration <- cyclistic_analysis %>%
  group_by(user_type) %>%
  summarise(
    average_duration = round(mean(trip_duration) / 60),
    median_duration = round(median(trip_duration) / 60),
    min_duration = round(min(trip_duration) / 60),
    max_duration = round(max(trip_duration) / 60)
  )
formattable(summary_trip_duration)
# Cell formatter for duration columns: bold text coloured "tomato" above 20,
# "steelblue" below 20 and "black" at exactly 20, preceded by an up arrow at
# 20 or more and a down arrow otherwise.
improvement <- formatter(
  "span",
  style = x ~ style(
    font.weight = "bold",
    color = ifelse(x > 20, "tomato", ifelse(x < 20, "steelblue", "black"))
  ),
  x ~ icontext(ifelse(x >= 20, "arrow-up", "arrow-down"), x)
)
# Average trip length in whole minutes for every month / user type pair.
by_month <- cyclistic_analysis %>%
  group_by(month, user_type) %>%
  summarise(average_duration = round(mean(trip_duration) / 60)) %>%
  arrange(month, user_type)

formattable(
  by_month,
  align = c("l", "c", "c"),
  list(
    'month' = color_tile("pink", "lightblue"),
    'average_duration' = improvement
  )
)
# Average trip length (minutes) per month for each user type.
# Bars are placed side by side: stacking would add the two averages together,
# and the resulting bar height has no meaning.
cyclistic_analysis %>%
  group_by(user_type, month) %>%
  summarize(average_trip_duration = mean(trip_duration) / 60) %>%
  arrange(user_type, month) %>%
  ggplot(aes(x = month, y = average_trip_duration, fill = user_type)) +
  geom_col(position = "dodge", color = "black") +
  scale_fill_hue(l = 40) +
  guides(fill = guide_legend(title = "User Type")) +
  ggtitle("Average Trip Duration by User by Month")
# Average trip length in whole minutes for every weekday / user type pair.
by_day_of_week <- cyclistic_analysis %>%
  group_by(day_of_week, user_type) %>%
  summarise(average_duration = round(mean(trip_duration) / 60)) %>%
  arrange(day_of_week, user_type)
tibble(by_day_of_week)

formattable(
  by_day_of_week,
  align = c("l", "c", "c"),
  list(
    'day_of_week' = color_tile("pink", "lightblue"),
    'average_duration' = improvement
  )
)
# Average trip length (minutes) per weekday for each user type.
# Bars are placed side by side: stacking would add the two averages together,
# and the resulting bar height has no meaning.
cyclistic_analysis %>%
  group_by(user_type, day_of_week) %>%
  summarize(average_duration = mean(trip_duration) / 60) %>%
  arrange(user_type, day_of_week) %>%
  ggplot(aes(x = day_of_week, y = average_duration, fill = user_type)) +
  geom_col(position = "dodge", color = "black") +
  scale_fill_hue(l = 40) +
  guides(fill = guide_legend(title = "User Type")) +
  ggtitle("Average Trip Duration by User by Day")
# Ride count per bike type and each type's share of all rides.
by_bike_type <- cyclistic_analysis %>%
  count(bike_type, name = "total_number") %>%
  mutate(percent = percent(total_number / sum(total_number), 0))

formattable(
  by_bike_type,
  align = c("l", "c", "r"),
  list(
    'bike_type' = color_tile("pink", "red"),
    'percent' = improvement_formatter
  )
)

# Pie chart of the same shares (one stacked bar drawn in polar coordinates).
ggplot(by_bike_type, aes(x = "", y = percent, fill = bike_type)) +
  geom_col(color = "black") +
  geom_label(
    aes(label = percent),
    color = c("white", 1, 1),
    position = position_stack(vjust = 0.5),
    show.legend = FALSE
  ) +
  coord_polar(theta = "y") +
  scale_fill_viridis_d() +
  labs(fill = "Percentage of Use of Bike Types") +
  theme_void()
# Ride count and average trip length (whole minutes) per bike type and user
# type. After summarize() the table is still grouped by bike_type, so percent is
# each user type's share of the rides on that bike type.
bike_preference <- cyclistic_analysis %>%
  group_by(bike_type, user_type) %>%
  summarize(
    number_of_rides = n(),
    average_duration = round(mean(trip_duration) / 60),
    .groups = "drop_last"
  ) %>%
  mutate(percent = percent(number_of_rides / sum(number_of_rides), 0)) %>%
  arrange(user_type, bike_type)

# One alignment entry per column
# (bike_type, user_type, number_of_rides, average_duration, percent).
formattable(
  bike_preference,
  align = c("l", "c", "c", "c", "r"),
  list(
    'bike_type' = color_tile("pink", "lightblue"),
    'percent' = improvement_formatter
  )
)

bike_preference %>%
  ggplot(aes(x = bike_type, y = number_of_rides, fill = user_type)) +
  geom_col(position = "dodge", color = "black") +
  scale_fill_hue(l = 40) +
  guides(fill = guide_legend(title = "User Type")) +
  ggtitle("Ride Count by Bike and User Type")

bike_preference %>%
  ggplot(aes(x = bike_type, y = percent, fill = user_type)) +
  geom_col(position = "dodge", color = "black") +
  scale_fill_hue(l = 40) +
  guides(fill = guide_legend(title = "User Type")) +
  ggtitle("Percentage of Rides by Bike and User Type")

bike_preference %>%
  ggplot(aes(x = bike_type, y = average_duration, fill = user_type)) +
  geom_col(position = "dodge", color = "black") +
  scale_fill_hue(l = 40) +
  guides(fill = guide_legend(title = "User Type")) +
  ggtitle("Average Duration of Rides by Bike and User Type")
Note that electric bikes were chosen roughly 30% of the time for both user types.
# Bike type usage among casual riders: ride count and share of casual rides.
by_casual <- cyclistic_analysis %>%
  filter(user_type == "casual") %>%
  group_by(user_type, bike_type) %>%
  summarise(total_number = n()) %>%
  mutate(percent = percent(total_number / sum(total_number), 0))

formattable(
  by_casual,
  align = c("l", "c", "c", "r"),
  list(
    'bike_type' = color_tile("pink", "lightblue"),
    'percent' = improvement_formatter
  )
)

# Pie chart of the same shares (one stacked bar drawn in polar coordinates).
ggplot(by_casual, aes(x = "", y = percent, fill = bike_type)) +
  geom_col(color = "black") +
  geom_label(
    aes(label = percent),
    color = c("white", 1, 1),
    position = position_stack(vjust = 0.5),
    show.legend = FALSE
  ) +
  coord_polar(theta = "y") +
  scale_fill_viridis_d() +
  labs(fill = "Percentage of Bike Type Use by Casual Users") +
  theme_void()
# Bike type usage among members: ride count and share of member rides.
by_member <- cyclistic_analysis %>%
  filter(user_type == "member") %>%
  group_by(user_type, bike_type) %>%
  summarize(total_number = n(), percent = n()) %>%
  mutate(percent = percent(percent / sum(percent), 0))

# Render the member table (not the casual one computed earlier).
formattable(
  by_member,
  align = c("l", "c", "c", "r"),
  list(
    'bike_type' = color_tile("pink", "lightblue"),
    'percent' = improvement_formatter
  )
)

# Pie chart of the same shares (one stacked bar drawn in polar coordinates).
# NOTE(review): the two label colours assume the member table has exactly two
# rows (two bike types) -- confirm against the data.
ggplot(by_member, aes(x = "", y = percent, fill = bike_type)) +
  geom_col(color = "black") +
  geom_label(
    aes(label = percent),
    color = c("white", 1),
    position = position_stack(vjust = 0.5),
    show.legend = FALSE
  ) +
  guides(fill = guide_legend(title = "Percentage of Bike Type Use by Member Users")) +
  scale_fill_viridis_d() +
  coord_polar(theta = "y") +
  theme_void()
# Flag trips that end at the station they started from.
cyclistic_analysis_V2 <- cyclistic_analysis %>%
  mutate(round_trip = start_station_id == end_station_id)

# Ride count and average trip length (whole minutes) per user type, split by the
# round-trip flag. After summarize() the table is still grouped by user_type, so
# percent_round_trips is the share of that user type's rides in each row.
round_trip <- cyclistic_analysis_V2 %>%
  group_by(user_type, round_trip) %>%
  summarize(
    number_of_rides = n(),
    average_duration = round(mean(trip_duration) / 60),
    .groups = "drop_last"
  ) %>%
  mutate(percent_round_trips = percent(number_of_rides / sum(number_of_rides), 0)) %>%
  arrange(user_type, round_trip)

# Only percent_round_trips gets a formatter; this table has no bike_type column
# for a colour tile to apply to.
formattable(
  round_trip,
  align = c("l", "c", "c", "c", "r"),
  list(
    'percent_round_trips' = formatter(
      "span",
      style = x ~ style(
        font.weight = "bold",
        color = ifelse(x > .90, "tomato", ifelse(x < .90, "steelblue", "black"))
      )
    )
  )
)

round_trip %>%
  ggplot(aes(x = round_trip, y = percent_round_trips, fill = user_type)) +
  geom_col(position = "dodge", color = "black") +
  scale_fill_hue(l = 40) +
  guides(fill = guide_legend(title = "User Type")) +
  ggtitle("Percentage of Round Trips by Users")
# Same breakdown with bike type added. The result replaces the earlier
# round_trip table. percent_round_trips is computed within each
# user_type / round_trip pair, i.e. it is the bike-type split of that pair.
round_trip <- cyclistic_analysis_V2 %>%
  group_by(user_type, round_trip, bike_type) %>%
  summarise(
    number_of_rides = n(),
    average_duration = round(mean(trip_duration) / 60)
  ) %>%
  mutate(
    percent_round_trips = percent(number_of_rides / sum(number_of_rides), 0)
  ) %>%
  arrange(user_type, round_trip, bike_type)

formattable(
  round_trip,
  align = c("l", "c", "c", "c", "c", "r"),
  list(
    'percent_round_trips' = formatter(
      "span",
      style = x ~ style(font.weight = "bold")
    )
  )
)

# Keep only the rows describing round trips and plot them per bike type.
round_trip_TRUE <- filter(round_trip, round_trip == TRUE)

round_trip_TRUE %>%
  ggplot(aes(x = bike_type, y = percent_round_trips, fill = user_type)) +
  geom_col(position = "dodge", color = "black") +
  scale_fill_hue(l = 40) +
  labs(
    title = "Percentage of Round Trips by Bike and User Types",
    fill = "User Type"
  )
# Five busiest start stations across all riders, with the average trip length
# in whole minutes.
most_popular_stations <- cyclistic_analysis_V2 %>%
  group_by(start_station_id, start_station_name) %>%
  summarise(
    number_of_rides = n(),
    average_duration = round(mean(trip_duration) / 60)
  ) %>%
  arrange(desc(number_of_rides)) %>%
  head(5)
formattable(most_popular_stations)

# The same ranking restricted to casual riders.
most_popular_casual <- cyclistic_analysis_V2 %>%
  filter(user_type == "casual") %>%
  group_by(start_station_id, start_station_name) %>%
  summarise(
    number_of_rides = n(),
    average_duration = round(mean(trip_duration) / 60)
  ) %>%
  arrange(desc(number_of_rides)) %>%
  head(5)
formattable(most_popular_casual)
Note that Streeter Dr & Grand Ave is Navy Pier on the Lakefront Trail.
# Five most frequent destinations of casual rides that start at
# "Streeter Dr & Grand Ave", with the average trip length in whole minutes.
LFT <- cyclistic_analysis_V2 %>%
  filter(user_type == "casual" & start_station_name == "Streeter Dr & Grand Ave") %>%
  group_by(start_station_name, end_station_name) %>%
  summarise(
    number_of_rides = n(),
    average_duration = round(mean(trip_duration) / 60)
  ) %>%
  arrange(desc(number_of_rides)) %>%
  head(5)
formattable(LFT)
# Five busiest start stations among members, with the average trip length in
# whole minutes.
most_popular_member <- cyclistic_analysis_V2 %>%
  filter(user_type == "member") %>%
  group_by(start_station_id, start_station_name) %>%
  summarise(
    number_of_rides = n(),
    average_duration = round(mean(trip_duration) / 60)
  ) %>%
  arrange(desc(number_of_rides)) %>%
  head(5)
formattable(most_popular_member)
Note that Clinton St & Madison St is Union Station.
# Five most frequent destinations of member rides that start at
# "Clinton St & Madison St", with the average trip length in whole minutes.
Union_Station <- cyclistic_analysis_V2 %>%
  filter(user_type == "member" & start_station_name == "Clinton St & Madison St") %>%
  group_by(start_station_name, end_station_name) %>%
  summarise(
    number_of_rides = n(),
    average_duration = round(mean(trip_duration) / 60)
  ) %>%
  arrange(desc(number_of_rides)) %>%
  head(5)
formattable(Union_Station)
# Five start stations with the fewest rides across all riders.
least_popular <- cyclistic_analysis_V2 %>%
  count(start_station_name, name = "number_of_rides") %>%
  arrange(number_of_rides) %>%
  head(5)
formattable(least_popular)

# The same ranking restricted to casual riders.
least_popular_casual <- cyclistic_analysis_V2 %>%
  filter(user_type == "casual") %>%
  count(start_station_name, name = "number_of_rides") %>%
  arrange(number_of_rides) %>%
  head(5)
formattable(least_popular_casual)
# Five start stations with the fewest rides among members. The filter must
# select "member" rows; selecting "casual" would only repeat the casual ranking.
least_popular_member <- cyclistic_analysis_V2 %>%
  filter(user_type == "member") %>%
  group_by(start_station_name) %>%
  summarize(number_of_rides = n()) %>%
  arrange(number_of_rides) %>%
  head(n = 5)
formattable(least_popular_member)