library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(readr)
station <- read_csv("station_data.csv", col_names = FALSE)
## Parsed with column specification:
## cols(
## X1 = col_double(),
## X2 = col_character(),
## X3 = col_double(),
## X4 = col_double(),
## X5 = col_double(),
## X6 = col_character(),
## X7 = col_character()
## )
trip <- read_csv("trip_data.csv", col_names = FALSE)
## Parsed with column specification:
## cols(
## X1 = col_double(),
## X2 = col_double(),
## X3 = col_character(),
## X4 = col_character(),
## X5 = col_double(),
## X6 = col_character(),
## X7 = col_character(),
## X8 = col_double(),
## X9 = col_double(),
## X10 = col_character(),
## X11 = col_character()
## )
colnames(station) <- c("id","Name","lat","long","dockcount","landmark","installation")
colnames(trip) <- c("trip_id","duration","startdate","startstation","startterminal", "enddate","endstation","endterminal","bike", "subscriptionType", "ZzipCode")
The most popular bike, i.e.ย the bike that has made the highest number of trips (Exclude trips that start and end at the same station)
trip %>%
filter(!startterminal == endterminal) %>%
group_by(bike) %>%
count(sort=TRUE) %>%
head(1)
## # A tibble: 1 x 2
## # Groups: bike [1]
## bike n
## <dbl> <int>
## 1 878 1090
Find the number of trips made by each subscription type
trip %>%
filter(!startterminal == endterminal) %>%
group_by(subscriptionType) %>%
count()
## # A tibble: 2 x 2
## # Groups: subscriptionType [2]
## subscriptionType n
## <chr> <int>
## 1 Customer 37038
## 2 Subscriber 306838
A table that shows which stations are connected, and the minimum duration between them.
trip %>%
filter(!startstation == endstation) %>%
group_by(startstation, endstation) %>%
summarise(minduration=min(duration)) %>%
arrange(startstation,endstation, minduration) %>%
head(5)
## # A tibble: 5 x 3
## # Groups: startstation [1]
## startstation endstation minduration
## <chr> <chr> <dbl>
## 1 2nd at Folsom 2nd at South Park 61
## 2 2nd at Folsom 2nd at Townsend 137
## 3 2nd at Folsom 5th at Howard 215
## 4 2nd at Folsom Beale at Market 219
## 5 2nd at Folsom Broadway St at Battery St 351
The number of trips originating from each landmark.
table1 <- trip %>%
left_join(station, by = c("startterminal" = "id"))
table2 <- table1 %>%
filter(!startstation == endstation) %>%
group_by(landmark) %>%
count()
table2
## # A tibble: 5 x 2
## # Groups: landmark [5]
## landmark n
## <chr> <int>
## 1 Mountain View 9361
## 2 Palo Alto 2370
## 3 Redwood City 1790
## 4 San Francisco 313528
## 5 San Jose 16827
Number of trips crossing landmarks, i.e trips that originate in one landmark and end in another.
table3 <- trip %>%
left_join(station, by = c("startterminal" = "id"))
colnames(table3)[colnames(table3)=="landmark"] <- "start_landmark"
table4 <- trip %>%
left_join(station, by = c("endterminal"= "id"))
colnames(table4)[colnames(table4)=="landmark"] <- "end_landmark"
table3 %>%
left_join(table4, by = c("trip_id" = "trip_id"))%>%
filter(!start_landmark == end_landmark) %>%
group_by(start_landmark, end_landmark) %>%
count()
## # A tibble: 13 x 3
## # Groups: start_landmark, end_landmark [13]
## start_landmark end_landmark n
## <chr> <chr> <int>
## 1 Mountain View Palo Alto 198
## 2 Mountain View Redwood City 3
## 3 Mountain View San Francisco 4
## 4 Mountain View San Jose 6
## 5 Palo Alto Mountain View 182
## 6 Palo Alto Redwood City 36
## 7 Palo Alto San Francisco 4
## 8 Redwood City Mountain View 1
## 9 Redwood City Palo Alto 64
## 10 San Francisco Mountain View 2
## 11 San Francisco Redwood City 2
## 12 San Jose Mountain View 6
## 13 San Jose San Francisco 1