This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Print the current working directory (shown before it is changed below).
getwd()
## [1] "C:/Users/TrungDang/Desktop/Data analysis projects/SFbike"
# Move into the project folder only when it exists on this machine.
# An unconditional setwd() stops with an error on any other computer; with
# the guard, the relative data paths used below are resolved against the
# current working directory instead.
project_dir <- "C:/Users/TrungDang/Desktop/Data analysis projects/SFbike"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.1
## -- Attaching packages ---------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.1
## Warning: package 'dplyr' was built under R version 3.6.1
## -- Conflicts ------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(readr)
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 3.6.1
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.6.1
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Load the station table; the file has no header row, so readr assigns
# placeholder names X1..X7.
station <- readr::read_csv(file = "station_data.csv", col_names = FALSE)
## Parsed with column specification:
## cols(
## X1 = col_double(),
## X2 = col_character(),
## X3 = col_double(),
## X4 = col_double(),
## X5 = col_double(),
## X6 = col_character(),
## X7 = col_character()
## )
# Load the trip table; the file has no header row, so readr assigns
# placeholder names X1..X11.
trip <- readr::read_csv(file = "trip_data.csv", col_names = FALSE)
## Parsed with column specification:
## cols(
## X1 = col_double(),
## X2 = col_double(),
## X3 = col_character(),
## X4 = col_character(),
## X5 = col_double(),
## X6 = col_character(),
## X7 = col_character(),
## X8 = col_double(),
## X9 = col_double(),
## X10 = col_character(),
## X11 = col_character()
## )
# Inspect the column types and first values of the raw station table.
str(station)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 70 obs. of 7 variables:
## $ X1: num 2 3 4 5 6 7 8 9 10 11 ...
## $ X2: chr "San Jose Diridon Caltrain Station" "San Jose Civic Center" "Santa Clara at Almaden" "Adobe on Almaden" ...
## $ X3: num 37.3 37.3 37.3 37.3 37.3 ...
## $ X4: num -122 -122 -122 -122 -122 ...
## $ X5: num 27 15 11 19 15 15 15 15 15 19 ...
## $ X6: chr "San Jose" "San Jose" "San Jose" "San Jose" ...
## $ X7: chr "8/6/2013" "8/5/2013" "8/6/2013" "8/5/2013" ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. X2 = col_character(),
## .. X3 = col_double(),
## .. X4 = col_double(),
## .. X5 = col_double(),
## .. X6 = col_character(),
## .. X7 = col_character()
## .. )
# Inspect the column types and first values of the raw trip table.
str(trip)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 354152 obs. of 11 variables:
## $ X1 : num 913460 913459 913455 913454 913453 ...
## $ X2 : num 765 1036 307 409 789 ...
## $ X3 : chr "8/31/2015 23:26" "8/31/2015 23:11" "8/31/2015 23:13" "8/31/2015 23:10" ...
## $ X4 : chr "Harry Bridges Plaza (Ferry Building)" "San Antonio Shopping Center" "Post at Kearny" "San Jose City Hall" ...
## $ X5 : num 50 31 47 10 51 68 51 60 56 47 ...
## $ X6 : chr "8/31/2015 23:39" "8/31/2015 23:28" "8/31/2015 23:18" "8/31/2015 23:17" ...
## $ X7 : chr "San Francisco Caltrain (Townsend at 4th)" "Mountain View City Hall" "2nd at South Park" "San Salvador at 1st" ...
## $ X8 : num 70 27 64 8 60 70 60 74 55 66 ...
## $ X9 : num 288 35 468 68 487 538 363 470 439 472 ...
## $ X10: chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ X11: chr "2139" "95032" "94107" "95113" ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. X2 = col_double(),
## .. X3 = col_character(),
## .. X4 = col_character(),
## .. X5 = col_double(),
## .. X6 = col_character(),
## .. X7 = col_character(),
## .. X8 = col_double(),
## .. X9 = col_double(),
## .. X10 = col_character(),
## .. X11 = col_character()
## .. )
As the raw data does not contain column names, we will assign a name to each column.
# Give both tables descriptive column names, in file column order.
station_cols <- c(
  "stationid", "name", "lat", "long",
  "dockcount", "landmark", "installation"
)
trip_cols <- c(
  "tripid", "duration", "startdate", "startstation", "startterminal",
  "enddate", "endstation", "endterminal", "bikeid",
  "subscriptiontype", "zipcode"
)
names(station) <- station_cols
names(trip) <- trip_cols
There are 70 stations with their corresponding names. The data also contains the location (lat and long) and the number of docks at each station. Lastly, the installation date is also reported.
# Inspect the trip table again, now with the column names assigned above.
str(trip)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 354152 obs. of 11 variables:
## $ tripid : num 913460 913459 913455 913454 913453 ...
## $ duration : num 765 1036 307 409 789 ...
## $ startdate : chr "8/31/2015 23:26" "8/31/2015 23:11" "8/31/2015 23:13" "8/31/2015 23:10" ...
## $ startstation : chr "Harry Bridges Plaza (Ferry Building)" "San Antonio Shopping Center" "Post at Kearny" "San Jose City Hall" ...
## $ startterminal : num 50 31 47 10 51 68 51 60 56 47 ...
## $ enddate : chr "8/31/2015 23:39" "8/31/2015 23:28" "8/31/2015 23:18" "8/31/2015 23:17" ...
## $ endstation : chr "San Francisco Caltrain (Townsend at 4th)" "Mountain View City Hall" "2nd at South Park" "San Salvador at 1st" ...
## $ endterminal : num 70 27 64 8 60 70 60 74 55 66 ...
## $ bikeid : num 288 35 468 68 487 538 363 470 439 472 ...
## $ subscriptiontype: chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ zipcode : chr "2139" "95032" "94107" "95113" ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. X2 = col_double(),
## .. X3 = col_character(),
## .. X4 = col_character(),
## .. X5 = col_double(),
## .. X6 = col_character(),
## .. X7 = col_character(),
## .. X8 = col_double(),
## .. X9 = col_double(),
## .. X10 = col_character(),
## .. X11 = col_character()
## .. )
The table trip contains 11 variables and 354152 rows. Here is the data dictionary.
Data transformation: convert some of the fields to factors.
# Treat identifier-like columns as categorical values.
trip$tripid <- as.factor(trip$tripid) # we may not need that column
trip$startterminal <- as.factor(trip$startterminal)
# Fixed: the target column was misspelled ("endternminal"), which added a
# stray column and left endterminal unconverted.
trip$endterminal <- as.factor(trip$endterminal)
trip$subscriptiontype <- as.factor(trip$subscriptiontype)
trip$bikeid <- as.factor(trip$bikeid)
trip$zipcode <- as.factor(trip$zipcode)
Convert start and end dates to date times.
# Parse the "month/day/year hour:minute" text into POSIXct date-times.
# No tz is passed, so the session's time zone is used.
stamp_format <- "%m/%d/%Y %H:%M"
trip[["startdate"]] <- as.POSIXct(trip[["startdate"]], format = stamp_format)
trip[["enddate"]] <- as.POSIXct(trip[["enddate"]], format = stamp_format)
DEALING WITH DUPLICATE
# Number of distinct rows in trip (equal to the row count when nothing is duplicated).
trip %>%
  distinct() %>%
  nrow()
## [1] 354152
# Number of distinct rows in station.
station %>%
  distinct() %>%
  nrow()
## [1] 70
DEALING WITH MISSING VALUES
# DataExplorer chart of missing values per column of trip.
plot_missing(trip)
Dealing with missing values
# Count the rows whose zipcode is NA.
sum(is.na(trip$zipcode))
## [1] 278
In the column zipcode, 278 rows (about 0.08% of the data) are missing.
INITIAL ANALYSIS: The dataset trip. UNIVARIATE ANALYSIS. How many unique trip ids are in the dataset?
# Number of distinct trip ids.
count(distinct(trip, tripid))
## # A tibble: 1 x 1
## n
## <int>
## 1 354152
DURATION: the longest trip and the shortest trips
# Row(s) with the maximum duration, showing duration and both stations.
trip %>%
  filter(duration == max(duration)) %>%
  select(duration, startstation, endstation)
## # A tibble: 1 x 3
## duration startstation endstation
## <dbl> <chr> <chr>
## 1 17270400 South Van Ness at Market 2nd at Folsom
We would like to see what are these trips
# Row(s) with the minimum duration, showing both stations and duration.
trip %>%
  filter(duration == min(duration)) %>%
  select(startstation, endstation, duration)
## # A tibble: 27 x 3
## startstation endstation duration
## <chr> <chr> <dbl>
## 1 Temporary Transbay Terminal (Ho~ Temporary Transbay Terminal (~ 60
## 2 San Francisco Caltrain 2 (330 T~ San Francisco Caltrain 2 (330~ 60
## 3 Howard at 2nd Howard at 2nd 60
## 4 San Francisco Caltrain (Townsen~ San Francisco Caltrain 2 (330~ 60
## 5 Post at Kearny 2nd at South Park 60
## 6 Spear at Folsom Spear at Folsom 60
## 7 Steuart at Market Steuart at Market 60
## 8 Howard at 2nd Howard at 2nd 60
## 9 Harry Bridges Plaza (Ferry Buil~ Harry Bridges Plaza (Ferry Bu~ 60
## 10 San Francisco Caltrain (Townsen~ San Francisco Caltrain (Towns~ 60
## # ... with 17 more rows
There are 27 rows where the duration of the trip is only 60 seconds. Let us see whether the starting station and the ending station are the same.
# Minimum-duration trips that start and end at the same station.
# min(duration) is taken over the whole table, not over the round trips only.
trip %>%
  filter(startstation == endstation & duration == min(duration)) %>%
  select(startstation, endstation, duration)
## # A tibble: 23 x 3
## startstation endstation duration
## <chr> <chr> <dbl>
## 1 Temporary Transbay Terminal (Ho~ Temporary Transbay Terminal (~ 60
## 2 San Francisco Caltrain 2 (330 T~ San Francisco Caltrain 2 (330~ 60
## 3 Howard at 2nd Howard at 2nd 60
## 4 Spear at Folsom Spear at Folsom 60
## 5 Steuart at Market Steuart at Market 60
## 6 Howard at 2nd Howard at 2nd 60
## 7 Harry Bridges Plaza (Ferry Buil~ Harry Bridges Plaza (Ferry Bu~ 60
## 8 San Francisco Caltrain (Townsen~ San Francisco Caltrain (Towns~ 60
## 9 Embarcadero at Folsom Embarcadero at Folsom 60
## 10 Embarcadero at Sansome Embarcadero at Sansome 60
## # ... with 13 more rows
Among the 27 trips whose duration is 60 s, there are 23 trips where the bike was taken and dropped off at the same station. We can speculate that these riders changed their minds and chose not to ride just after picking up the bike.
# Minimum-duration trips that end at a different station than they started.
trip %>%
  filter(startstation != endstation & duration == min(duration)) %>%
  select(startstation, endstation, duration)
## # A tibble: 4 x 3
## startstation endstation duration
## <chr> <chr> <dbl>
## 1 San Francisco Caltrain (Townsend~ San Francisco Caltrain 2 (330~ 60
## 2 Post at Kearny 2nd at South Park 60
## 3 Howard at 2nd 2nd at South Park 60
## 4 South Van Ness at Market Market at 10th 60
# Five-number summary and mean of the trip duration.
summary(trip[["duration"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 60 342 511 1046 739 17270400
The mean duration is 1046 seconds. Distribution: box plot and histogram
# Histogram of trip duration with the x-axis limited to 0-10000; rows outside
# the limits are dropped by ggplot (see the warnings below).
ggplot(trip, aes(x = duration)) +
  geom_histogram(bins = 300, colour = "blue", fill = "blue", alpha = 0.5) +
  scale_x_continuous(name = "Duration", limits = c(0, 10000)) +
  labs(y = "Count", title = "Duration distribution") +
  theme(panel.background = element_blank())
## Warning: Removed 3879 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
Visualize the distribution of all the trips that last 1 hour maximum.
# Same histogram with the x-axis limited to 0-3600 (one hour) and 100 bins.
ggplot(trip, aes(x = duration)) +
  geom_histogram(bins = 100, colour = "blue", fill = "blue", alpha = 0.5) +
  scale_x_continuous(name = "Duration", limits = c(0, 3600)) +
  labs(y = "Count", title = "Duration distribution") +
  theme(panel.background = element_blank())
## Warning: Removed 9438 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
Visualize the distribution of all the trips that last 30 minutes maximum.
# Same histogram with the x-axis limited to 0-1800 (30 minutes) and 36 bins.
ggplot(trip, aes(x = duration)) +
  geom_histogram(bins = 36, colour = "blue", fill = "blue", alpha = 0.5) +
  scale_x_continuous(name = "Duration (seconds)", limits = c(0, 1800)) +
  labs(y = "Frequency", title = "Duration distribution of all the trips") +
  theme(panel.background = element_blank())
## Warning: Removed 15798 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
How many trip where the duration is less than or equal to 1 hour.
# Number of trips lasting at most 3600 seconds.
count(filter(trip, duration <= 3600))
## # A tibble: 1 x 1
## n
## <int>
## 1 344714
# Number of trips lasting more than 3600 seconds.
count(filter(trip, duration > 3600))
## # A tibble: 1 x 1
## n
## <int>
## 1 9438
There are 9438 trips whose duration is longer than 1 hour. There are outliers in the duration column.
# Box plot of trip duration; the y-axis is limited to 0-1800 so the box
# itself remains visible despite the long-duration outliers.
boxplot(
  trip$duration,
  main = "Distribution of Duration",
  ylab = "Duration (s)",
  names = "Duration",
  ylim = c(0, 1800),
  col = "blue",
  border = "red",
  boxwex = 0.5
)
WORKING WITH DATES: STARTDATE and ENDDATE
# Confirm that startdate is now a POSIXct vector.
str(trip$startdate)
## POSIXct[1:354152], format: "2015-08-31 23:26:00" "2015-08-31 23:11:00" ...
# Show the first parsed start timestamps, including the time zone they print in.
head(trip$startdate)
## [1] "2015-08-31 23:26:00 EDT" "2015-08-31 23:11:00 EDT"
## [3] "2015-08-31 23:13:00 EDT" "2015-08-31 23:10:00 EDT"
## [5] "2015-08-31 23:09:00 EDT" "2015-08-31 23:07:00 EDT"
We should split this column into date parts (date, year, month, hour, day of week, day of month). We will likewise split the column enddate into the corresponding end_* columns.
# lubridate package
# Derive calendar parts from the start and end timestamps.
# Fixed: `trip_ym` was assigned twice in the same mutate(), so the value
# based on startdate was silently overwritten by the one based on enddate.
# Both months are now kept under explicit names; `trip_ym` keeps the value it
# ended up with before (the end month) so existing uses are unaffected.
trip <- trip %>%
  mutate(
    start_date = date(startdate),
    start_year = year(startdate),
    start_month = month(startdate),
    start_hour = hour(startdate),
    start_dayofweek = wday(startdate),
    start_dayofmonth = mday(startdate),
    start_ym = floor_date(startdate, "month"),
    end_date = date(enddate),
    end_year = year(enddate),
    end_month = month(enddate),
    end_hour = hour(enddate),
    end_dayofweek = wday(enddate),
    end_dayofmonth = mday(enddate),
    end_ym = floor_date(enddate, "month"),
    trip_ym = end_ym
  )
As we want to see the day of week, we will create a column of day of week
# Preview the month and weekday columns (first 10 rows).
head(
  select(trip, start_month, start_dayofweek, end_month, end_dayofweek),
  10
)
## # A tibble: 10 x 4
## start_month start_dayofweek end_month end_dayofweek
## <dbl> <dbl> <dbl> <dbl>
## 1 8 2 8 2
## 2 8 2 8 2
## 3 8 2 8 2
## 4 8 2 8 2
## 5 8 2 8 2
## 6 8 2 8 2
## 7 8 2 8 2
## 8 8 2 8 2
## 9 8 2 8 2
## 10 8 2 8 2
We should convert these columns into the proper format For day of week
# Label the weekday codes 1..7 as Sunday..Saturday.
# Fixed: assigning `levels<-` positionally mislabels every day if any code is
# absent from the data; explicit levels/labels pin the mapping. "Tueday" is
# also corrected to "Tuesday".
weekday_names <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)
trip$start_dayofweek <- factor(
  trip$start_dayofweek,
  levels = 1:7,
  labels = weekday_names
)
trip$end_dayofweek <- factor(
  trip$end_dayofweek,
  levels = 1:7,
  labels = weekday_names
)
For month
# Label the month numbers 1..12 with the English month names.
# Fixed: assigning `levels<-` positionally mislabels months whenever a month
# is absent from the data; explicit levels/labels pin the mapping.
# `month.name` is base R's vector "January" ... "December".
trip$start_month <- factor(trip$start_month, levels = 1:12, labels = month.name)
trip$end_month <- factor(trip$end_month, levels = 1:12, labels = month.name)
Checking these four columns
# Preview the relabelled month and weekday columns (first 10 rows).
head(
  select(trip, start_month, start_dayofweek, end_month, end_dayofweek),
  10
)
## # A tibble: 10 x 4
## start_month start_dayofweek end_month end_dayofweek
## <fct> <fct> <fct> <fct>
## 1 August Monday August Monday
## 2 August Monday August Monday
## 3 August Monday August Monday
## 4 August Monday August Monday
## 5 August Monday August Monday
## 6 August Monday August Monday
## 7 August Monday August Monday
## 8 August Monday August Monday
## 9 August Monday August Monday
## 10 August Monday August Monday
# Number of trips per start year (result stays grouped by start_year).
count(group_by(trip, start_year))
## # A tibble: 2 x 2
## # Groups: start_year [2]
## start_year n
## <dbl> <int>
## 1 2014 111095
## 2 2015 243057
# Trips per start month.
startmonth_trip <- trip %>%
  group_by(start_month) %>%
  summarise(counts = n())

# Make the classic theme the session default as its own statement.
# Fixed: a misplaced parenthesis put theme_set() inside theme(), where it was
# added to element_text() instead of the plot, so the axis-text styling was
# not applied as written.
theme_set(theme_classic())

g_startmonth <- ggplot(startmonth_trip, aes(x = start_month, y = counts))
g_startmonth +
  geom_bar(stat = "identity", width = 0.5, fill = "cornflowerblue") +
  labs(
    title = "Number of trips per month",
    caption = "SF Bay bike share",
    x = "Month",
    y = "Counts"
  ) +
  theme(axis.text.x = element_text(size = 18, angle = 65, vjust = 0.6)) +
  # Print each bar's count just above it.
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25
  )
# Trips per end month.
endmonth_trip <- trip %>%
  group_by(end_month) %>%
  summarise(counts = n())

# Make the classic theme the session default as its own statement.
# Fixed: a misplaced parenthesis put theme_set() inside theme(), where it was
# added to element_text() instead of the plot, so the axis-text styling was
# not applied as written.
theme_set(theme_classic())

g_endmonth <- ggplot(endmonth_trip, aes(x = end_month, y = counts))
g_endmonth +
  geom_bar(stat = "identity", width = 0.5, fill = "purple") +
  labs(
    title = "Number of trips per month",
    caption = "SF Bay bike share",
    x = "Month",
    y = "Counts"
  ) +
  theme(axis.text.x = element_text(size = 18, angle = 65, vjust = 0.6)) +
  # Print each bar's count just above it.
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25
  )
It is obvious that these two bar charts are almost identical, except for a few long trips. When looking at the monthly number of trips, we should keep in mind that seven months have 31 days and that February has only 28 days.
# Start and end timestamps of the longest trip.
select(filter(trip, duration == max(duration)), startdate, enddate)
## # A tibble: 1 x 2
## startdate enddate
## <dttm> <dttm>
## 1 2014-12-06 21:59:00 2015-06-24 20:18:00
# Trips that end in a different calendar year than they started.
spans_years <- filter(trip, start_year != end_year)
select(spans_years, startdate, enddate, bikeid, startstation, endstation)
## # A tibble: 1 x 5
## startdate enddate bikeid startstation endstation
## <dttm> <dttm> <fct> <chr> <chr>
## 1 2014-12-06 21:59:00 2015-06-24 20:18:00 535 South Van Ness~ 2nd at Fo~
The user took the bike (bikeid 535) for 199 days.
# Trips whose start month and end month differ.
spans_months <- filter(trip, start_month != end_month)
select(spans_months, startdate, enddate, duration, bikeid, startstation, endstation)
## # A tibble: 28 x 6
## startdate enddate duration bikeid startstation
## <dttm> <dttm> <dbl> <fct> <chr>
## 1 2015-07-31 23:59:00 2015-08-01 00:14:00 917 583 Embarcadero~
## 2 2015-07-31 23:56:00 2015-08-01 00:01:00 330 344 2nd at Town~
## 3 2015-07-31 23:54:00 2015-08-01 00:14:00 1198 528 Embarcadero~
## 4 2015-07-31 22:59:00 2015-08-01 00:56:00 7057 531 Embarcadero~
## 5 2015-07-31 22:56:00 2015-08-01 00:56:00 7183 214 Embarcadero~
## 6 2015-06-28 21:50:00 2015-07-23 15:27:00 2137000 466 Market at S~
## 7 2015-05-31 18:20:00 2015-06-01 16:22:00 79336 207 Park at Oli~
## 8 2015-05-31 17:39:00 2015-06-01 16:22:00 81783 648 Park at Oli~
## 9 2015-05-31 17:05:00 2015-06-01 16:22:00 83830 695 Park at Oli~
## 10 2015-04-30 23:57:00 2015-05-01 00:03:00 370 399 Townsend at~
## # ... with 18 more rows, and 1 more variable: endstation <chr>
day of month
# Trips per day of the month, by start date.
start_dayofmonth_tbl <- trip %>%
  group_by(start_dayofmonth) %>%
  count()

# Set the default theme as its own statement.
# Fixed: `+ theme_set(...)` adds theme_set()'s return value (the previously
# active theme) to the plot, not the classic theme itself.
theme_set(theme_classic())

g_occurrenceday <- ggplot(
  start_dayofmonth_tbl,
  aes(x = start_dayofmonth, y = n)
)
g_occurrenceday +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "purple", size = 3) +
  labs(
    # Fixed: the title said "crimes" (copy-paste slip); the data are trips.
    title = "Number of trips by day of month",
    caption = "Source: SF Bay bike share",
    x = "Day of month",
    y = "Counts"
  )
# Trips per day of the month, by end date.
end_dayofmonth_tbl <- trip %>%
  group_by(end_dayofmonth) %>%
  count()

# Set the default theme as its own statement.
# Fixed: `+ theme_set(...)` adds theme_set()'s return value (the previously
# active theme) to the plot, not the classic theme itself.
theme_set(theme_classic())

g_occurrenceday <- ggplot(
  end_dayofmonth_tbl,
  aes(x = end_dayofmonth, y = n)
)
g_occurrenceday +
  geom_line(color = "purple", size = 1) +
  geom_point(color = "blue", size = 3) +
  labs(
    # Fixed: the title said "crimes" (copy-paste slip); the data are trips.
    title = "Number of trips by day of month",
    caption = "Source: SF Bay bike share",
    x = "Day of month",
    y = "Counts"
  )
When comparing the total number of trips by day of month, we should remember that only 7 months have 31 days.
On which day of the week were bikes used the most?
# Trips per weekday, by start date.
start_dayofweek_trip <- trip %>%
  group_by(start_dayofweek) %>%
  summarise(counts = n())

# Make the classic theme the session default as its own statement.
# Fixed: a misplaced parenthesis put theme_set() inside theme(), where it was
# added to element_text() instead of the plot, so the axis-text styling was
# not applied as written.
theme_set(theme_classic())

g_start_dayofweek <- ggplot(
  start_dayofweek_trip,
  aes(x = start_dayofweek, y = counts)
)
g_start_dayofweek +
  geom_bar(stat = "identity", width = 0.5, fill = "purple") +
  labs(
    title = "Number of trips from Monday to Sunday (At starting station)",
    caption = "SF Bay bike share",
    x = "Day of week",
    y = "Counts"
  ) +
  theme(axis.text.x = element_text(size = 18, angle = 65, vjust = 0.6)) +
  # Print each bar's count just above it.
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25
  )
# Trips per weekday, by end date.
end_dayofweek_trip <- trip %>%
  group_by(end_dayofweek) %>%
  summarise(counts = n())

# Make the classic theme the session default as its own statement.
# Fixed: a misplaced parenthesis put theme_set() inside theme(), where it was
# added to element_text() instead of the plot, so the axis-text styling was
# not applied as written.
theme_set(theme_classic())

# The variable name is reused from the start-date plot and kept unchanged.
g_start_dayofweek <- ggplot(
  end_dayofweek_trip,
  aes(x = end_dayofweek, y = counts)
)
g_start_dayofweek +
  geom_bar(stat = "identity", width = 0.5, fill = "grey") +
  labs(
    title = "Number of trips from Monday to Sunday (At ending station)",
    caption = "SF Bay bike share",
    x = "Day of week",
    y = "Counts"
  ) +
  theme(axis.text.x = element_text(size = 18, angle = 65, vjust = 0.6)) +
  # Print each bar's count just above it.
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25
  )
Once again, the two graphs are almost identical. There are fewer trips on Friday than on the other weekdays. Remarkably, bike use plunges on Saturday and Sunday.
Time of the day
# Trips per hour of the day, by start time.
starthour_tbl <- trip %>%
  group_by(start_hour) %>%
  summarise(counts = n())

# Set the default theme as its own statement.
# Fixed: `+ theme_set(...)` adds theme_set()'s return value (the previously
# active theme) to the plot, not the classic theme itself.
theme_set(theme_classic())

g_occurrencehour <- ggplot(starthour_tbl, aes(x = start_hour, y = counts))
g_occurrencehour +
  geom_line(color = "grey") +
  geom_point(color = "blue", size = 4) +
  labs(
    title = "Number of trips at starting stations by time of the day",
    caption = "Source: SF Bay bike share",
    x = "Time of the day",
    y = "Counts"
  )
How many trips start between 8 pm and 7 am (start hours 20-23 and 0-6)?
# Trips per start hour for hours 20-23 and 0-6, i.e. everything that is not
# strictly between hour 6 and hour 20. The result stays grouped by start_hour.
trip %>%
  filter(!(start_hour > 6 & start_hour < 20)) %>%
  group_by(start_hour) %>%
  count()
## # A tibble: 11 x 2
## # Groups: start_hour [11]
## start_hour n
## <int> <int>
## 1 0 1014
## 2 1 511
## 3 2 282
## 4 3 156
## 5 4 640
## 6 5 1848
## 7 6 8014
## 8 20 8251
## 9 21 5741
## 10 22 3662
## 11 23 2207
There are still some trips made between midnight and 5 am. The number of trips increases gradually, peaks at 8 am and has decreased by 10 am. Bike use from 10 am to 3 pm is steady, jumps dramatically at 4 pm and reaches its highest level at 5 pm. In other words, the number of trips (bike use) is significantly higher during rush hours.
The following graph shows the frequency of bike use at the ending stations.
# Trips per hour of the day, by end time.
endhour_tbl <- trip %>%
  group_by(end_hour) %>%
  summarise(counts = n())

# Set the default theme as its own statement.
# Fixed: `+ theme_set(...)` adds theme_set()'s return value (the previously
# active theme) to the plot, not the classic theme itself.
theme_set(theme_classic())

g_occurrencehour <- ggplot(endhour_tbl, aes(x = end_hour, y = counts))
g_occurrencehour +
  geom_line(color = "grey") +
  geom_point(color = "blue", size = 4) +
  labs(
    title = "Number of trips at ending stations by time of the day",
    caption = "Source: SF Bay bike share",
    x = "Time of the day",
    y = "Counts"
  )
STARTSTATION The top 10 used start station
# Ten most used start stations. top_n() is called without a weight column,
# so it ranks by the last column (counts) and reports that choice.
top_startstation <- top_n(
  arrange(
    summarise(group_by(trip, startstation), counts = n()),
    desc(counts)
  ),
  10
)
## Selecting by counts
# Horizontal bar chart of the most used start stations, ordered by count.
# Make the classic theme the session default as its own statement.
# Fixed: a misplaced parenthesis put theme_set() inside theme(), where it was
# added to element_text() instead of the plot, so the axis-text styling was
# not applied as written.
theme_set(theme_classic())

g1a <- ggplot(
  top_startstation,
  aes(x = reorder(startstation, counts), y = counts)
)
g1a +
  geom_bar(stat = "identity", width = 0.5, fill = "blue") +
  labs(
    title = "The frequency the most trips at startstation",
    caption = "Source: SF Bay Area Bike Share",
    x = "Starting Stations",
    y = "Number of trips"
  ) +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 65, vjust = 1)) +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -1
  )
# Ten least used start stations (ascending by trip count), then print them.
least_startstation <- trip %>%
  group_by(startstation) %>%
  summarise(counts = n()) %>%
  arrange(counts) %>%
  head(10)
least_startstation
## # A tibble: 10 x 2
## startstation counts
## <chr> <int>
## 1 Franklin at Maple 81
## 2 Redwood City Public Library 118
## 3 San Mateo County Center 127
## 4 Redwood City Medical Center 150
## 5 Mezes Park 212
## 6 Park at Olive 376
## 7 California Ave Caltrain Station 400
## 8 Stanford in Redwood City 436
## 9 SJSU 4th at San Carlos 475
## 10 SJSU - San Salvador at 9th 494
# Horizontal bar chart of the least used start stations.
# Make the classic theme the session default as its own statement.
# Fixed: a misplaced parenthesis put theme_set() inside theme(), where it was
# added to element_text() instead of the plot, so the axis-text styling was
# not applied as written.
theme_set(theme_classic())

g1b <- ggplot(
  least_startstation,
  aes(x = reorder(startstation, -counts), y = counts)
)
g1b +
  geom_bar(stat = "identity", width = 0.5, fill = "blue") +
  labs(
    title = "The frequency the least trips at startstation",
    caption = "Source: SF Bay Area Bike Share",
    x = "Starting Stations",
    y = "Number of trips"
  ) +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -1
  )
ENDSTATION
# Ten most used end stations. top_n() is called without a weight column,
# so it ranks by the last column (counts) and reports that choice.
top_endstation <- top_n(
  arrange(
    summarise(group_by(trip, endstation), counts = n()),
    desc(counts)
  ),
  10
)
## Selecting by counts
# Display the ten busiest end stations.
print(top_endstation)
## # A tibble: 10 x 2
## endstation counts
## <chr> <int>
## 1 San Francisco Caltrain (Townsend at 4th) 34810
## 2 San Francisco Caltrain 2 (330 Townsend) 22523
## 3 Harry Bridges Plaza (Ferry Building) 17810
## 4 2nd at Townsend 15463
## 5 Townsend at 7th 15422
## 6 Embarcadero at Sansome 15065
## 7 Market at Sansome 13916
## 8 Steuart at Market 13617
## 9 Temporary Transbay Terminal (Howard at Beale) 12966
## 10 Powell Street BART 10239
# Horizontal bar chart of the most used end stations, ordered by count.
# Make the classic theme the session default as its own statement.
# Fixed: a misplaced parenthesis put theme_set() inside theme(), where it was
# added to element_text() instead of the plot, so the axis-text styling was
# not applied as written.
theme_set(theme_classic())

g2a <- ggplot(
  top_endstation,
  aes(x = reorder(endstation, counts), y = counts)
)
g2a +
  geom_bar(stat = "identity", width = 0.5, fill = "orange") +
  labs(
    title = "The frequency the most trips at the terminal station",
    caption = "Source: SF Bay Area Bike Share",
    # Fixed: the axis shows end stations but was labelled "Starting Stations".
    x = "Ending Stations",
    y = "Number of trips"
  ) +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 65, vjust = 1)) +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -1
  )
# Ten least used end stations (ascending by trip count).
least_endstation <- trip %>%
  group_by(endstation) %>%
  summarise(counts = n()) %>%
  arrange(counts) %>%
  head(10)
# Horizontal bar chart of the least used end stations.
# Make the classic theme the session default as its own statement.
# Fixed: a misplaced parenthesis put theme_set() inside theme(), where it was
# added to element_text() instead of the plot, so the axis-text styling was
# not applied as written.
theme_set(theme_classic())

g2b <- ggplot(
  least_endstation,
  aes(x = reorder(endstation, -counts), y = counts)
)
g2b +
  geom_bar(stat = "identity", width = 0.5, fill = "orange") +
  labs(
    title = "The frequency the least trips at the terminal station",
    caption = "Source: SF Bay Area Bike Share",
    # Fixed: the axis shows end stations but was labelled "Starting Stations".
    x = "Ending Stations",
    y = "Number of trips"
  ) +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 65, vjust = 1)) +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -1
  )
BIKE
How many bikes are available?
# Number of distinct bike ids that appear in the trips.
trip %>%
  pull(bikeid) %>%
  n_distinct()
## [1] 668
There are 668 bikes that have been used.
Which bikes are used the most and the least?
# Ten bikes with the most trips.
bike_use <- summarise(group_by(trip, bikeid), frencency_bikes = n())
head(arrange(bike_use, desc(frencency_bikes)), 10)
## # A tibble: 10 x 2
## bikeid frencency_bikes
## <fct> <int>
## 1 878 1121
## 2 392 1102
## 3 489 1101
## 4 463 1085
## 5 532 1074
## 6 558 1071
## 7 306 1060
## 8 29 1057
## 9 66 1053
## 10 589 1052
# Ten most used bikes with their trip count divided by 12 (per month) and by
# 365 (per day), both rounded to whole numbers.
trip %>%
  group_by(bikeid) %>%
  summarise(
    frencency_bike = n(),
    usepermonth = round(frencency_bike / 12),
    userperday = round(frencency_bike / 365)
  ) %>%
  arrange(desc(frencency_bike)) %>%
  head(10)
## # A tibble: 10 x 4
## bikeid frencency_bike usepermonth userperday
## <fct> <int> <dbl> <dbl>
## 1 878 1121 93 3
## 2 392 1102 92 3
## 3 489 1101 92 3
## 4 463 1085 90 3
## 5 532 1074 90 3
## 6 558 1071 89 3
## 7 306 1060 88 3
## 8 29 1057 88 3
## 9 66 1053 88 3
## 10 589 1052 88 3
Each of these 10 bikes was used about 90 times per month, i.e. about 3 times a day.
# Ten bikes with the fewest trips.
bike_use <- summarise(group_by(trip, bikeid), frencency_bikes = n())
head(arrange(bike_use, frencency_bikes), 10)
## # A tibble: 10 x 2
## bikeid frencency_bikes
## <fct> <int>
## 1 62 4
## 2 49 7
## 3 58 26
## 4 24 28
## 5 673 33
## 6 139 34
## 7 35 37
## 8 26 39
## 9 641 52
## 10 216 54
On average, how many times is a bike used?
# Average number of trips per bike.
# Fixed: the divisor was the hard-coded bike count 668; it is now computed
# from the data so the result stays correct if the data set changes.
trip %>%
  summarise(
    frencency_bike = n(),
    average = frencency_bike / n_distinct(bikeid)
  )
## # A tibble: 1 x 2
## frencency_bike average
## <int> <dbl>
## 1 354152 530.
Let us check the duration distribution of bike 878.
# Durations of all trips made with bike 878, then a preview.
bike878 <- select(filter(trip, bikeid == "878"), duration)
head(bike878)
## # A tibble: 6 x 1
## duration
## <dbl>
## 1 692
## 2 749
## 3 905
## 4 5773
## 5 370
## 6 323
# Histogram of trip durations for bike 878 in 60-second bins.
# Fixed: the axis label and title were left over from an unrelated ozone
# example, and axis breaks every 60 units over 0-30000 produced hundreds of
# overlapping tick labels.
his1 <- ggplot(bike878, aes(x = duration)) +
  geom_histogram(binwidth = 60, colour = "blue", fill = "blue") +
  scale_x_continuous(
    name = "Duration (seconds)",
    breaks = seq(0, 30000, 5000),
    limits = c(0, 30000)
  ) +
  scale_y_continuous(name = "Count") +
  ggtitle("Frequency histogram of trip duration for bike 878")
his1
## Warning: Removed 2 rows containing missing values (geom_bar).
# Durations of bike 878's trips, longest first (tibble grouped by duration),
# followed by the ten longest.
duration_bike878 <- trip %>%
  filter(bikeid == "878") %>%
  select(duration) %>%
  group_by(duration) %>%
  arrange(desc(duration))
head(duration_bike878, 10)
## # A tibble: 10 x 1
## # Groups: duration [10]
## duration
## <dbl>
## 1 29137
## 2 23783
## 3 23518
## 4 22545
## 5 15271
## 6 14758
## 7 12761
## 8 11571
## 9 10521
## 10 9690
# Durations of bike 878's trips, shortest first (tibble grouped by duration),
# followed by the ten shortest.
duration_bike878_least <- trip %>%
  filter(bikeid == "878") %>%
  select(duration) %>%
  group_by(duration) %>%
  arrange(duration)
head(duration_bike878_least, 10)
## # A tibble: 10 x 1
## # Groups: duration [10]
## duration
## <dbl>
## 1 61
## 2 66
## 3 78
## 4 80
## 5 85
## 6 88
## 7 100
## 8 110
## 9 119
## 10 123
# Durations of bike 62's trips, longest first (tibble grouped by duration),
# followed by up to ten of them.
duration_bike62 <- trip %>%
  filter(bikeid == "62") %>%
  select(duration) %>%
  group_by(duration) %>%
  arrange(desc(duration))
head(duration_bike62, 10)
## # A tibble: 4 x 1
## # Groups: duration [4]
## duration
## <dbl>
## 1 3752
## 2 1395
## 3 522
## 4 184
# Duration histogram for bike 878 with the x-axis limited to 0-30000, 600 bins.
ggplot(bike878, aes(x = duration)) +
  geom_histogram(bins = 600, colour = "blue", fill = "blue", alpha = 0.5) +
  scale_x_continuous(name = "Duration", limits = c(0, 30000)) +
  labs(y = "Count", title = "Duration distribution for bike 878") +
  theme(panel.background = element_blank())
## Warning: Removed 2 rows containing missing values (geom_bar).
# Duration histogram for bike 878 with the x-axis limited to 0-3600, 60 bins.
ggplot(bike878, aes(x = duration)) +
  geom_histogram(bins = 60, colour = "blue", fill = "blue", alpha = 0.5) +
  scale_x_continuous(name = "Duration", limits = c(0, 3600)) +
  labs(y = "Count", title = "Duration distribution for bike 878") +
  theme(panel.background = element_blank())
## Warning: Removed 37 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
How many trips that last more than 1 h
# Number of trips lasting more than 3600 seconds.
count(filter(trip, duration > 3600))
## # A tibble: 1 x 1
## n
## <int>
## 1 9438
There are 9438 trips whose duration is longer than 60 min: (9438 * 100% / 354152) = 2.66 %
SUBscription TYPE
# Tabulate the number of trips per subscription type.
summary(trip$subscriptiontype)
## Customer Subscriber
## 43935 310217
# Ensure the subscription type is a factor (a no-op if it already is one),
# then tabulate its levels.
trip[["subscriptiontype"]] <- as.factor(trip[["subscriptiontype"]])
summary(trip[["subscriptiontype"]])
## Customer Subscriber
## 43935 310217
There are 43935 Customer trips and 310217 Subscriber trips.
Creating a list where we found the min duration between two station. We exclude the trips where the startstation and the end station is the same.
# Trips that start and end at the same station, counted per station, then
# the overall total.
samepoint <- trip %>%
  filter(startstation == endstation) %>%
  select(startstation, endstation, duration) %>%
  group_by(startstation, endstation) %>%
  summarise(counts = n())
sum(samepoint[["counts"]])
## [1] 10276
# Trips between two different stations, counted per station pair, then the
# overall total.
different_point <- trip %>%
  filter(startstation != endstation) %>%
  select(startstation, endstation, duration) %>%
  group_by(startstation, endstation) %>%
  summarise(counts = n())
sum(different_point[["counts"]])
## [1] 343876
List of starting station and ending station, with the minimal duration of trip We will exclude all the trip where the user pick up and drop off the bike at the same station.
# For every pair of different stations keep the trip(s) with the shortest
# duration for that pair (ties are kept), sorted by duration; show the first 30.
trip_minduration <- trip %>%
  select(startstation, endstation, duration) %>%
  filter(startstation != endstation) %>%
  group_by(startstation, endstation) %>%
  filter(duration == min(duration)) %>%
  arrange(duration)
head(trip_minduration, 30)
## # A tibble: 30 x 3
## # Groups: startstation, endstation [26]
## startstation endstation duration
## <chr> <chr> <dbl>
## 1 San Francisco Caltrain (Townsend ~ San Francisco Caltrain 2 (3~ 60
## 2 Post at Kearny 2nd at South Park 60
## 3 Howard at 2nd 2nd at South Park 60
## 4 South Van Ness at Market Market at 10th 60
## 5 Market at Sansome 2nd at South Park 61
## 6 Temporary Transbay Terminal (Howa~ 2nd at South Park 61
## 7 Embarcadero at Folsom Spear at Folsom 61
## 8 Market at Sansome 2nd at South Park 61
## 9 2nd at Folsom 2nd at South Park 61
## 10 Post at Kearny Washington at Kearny 61
## # ... with 20 more rows
# All trips from Castro Street and El Camino Real to Howard at 2nd.
trip %>%
  filter(
    startstation == 'Castro Street and El Camino Real' &
      endstation == 'Howard at 2nd'
  ) %>%
  select(startstation, endstation, duration)
## # A tibble: 2 x 3
## startstation endstation duration
## <chr> <chr> <dbl>
## 1 Castro Street and El Camino Real Howard at 2nd 179095
## 2 Castro Street and El Camino Real Howard at 2nd 179330
# All trips from MLK Library to Mountain View Caltrain Station.
trip %>%
  filter(
    startstation == 'MLK Library' &
      endstation == 'Mountain View Caltrain Station'
  ) %>%
  select(startstation, endstation, duration)
## # A tibble: 2 x 3
## startstation endstation duration
## <chr> <chr> <dbl>
## 1 MLK Library Mountain View Caltrain Station 18493
## 2 MLK Library Mountain View Caltrain Station 18475
# All trips from San Antonio Caltrain Station to Stanford in Redwood City.
trip %>%
  filter(
    startstation == 'San Antonio Caltrain Station' &
      endstation == 'Stanford in Redwood City'
  ) %>%
  select(startstation, endstation, duration)
## # A tibble: 2 x 3
## startstation endstation duration
## <chr> <chr> <dbl>
## 1 San Antonio Caltrain Station Stanford in Redwood City 2985
## 2 San Antonio Caltrain Station Stanford in Redwood City 2975
# All trips from Beale at Market to Harry Bridges Plaza (Ferry Building).
trip %>%
  filter(
    startstation == 'Beale at Market' &
      endstation == 'Harry Bridges Plaza (Ferry Building)'
  ) %>%
  select(startstation, endstation, duration)
## # A tibble: 122 x 3
## startstation endstation duration
## <chr> <chr> <dbl>
## 1 Beale at Market Harry Bridges Plaza (Ferry Building) 198
## 2 Beale at Market Harry Bridges Plaza (Ferry Building) 314
## 3 Beale at Market Harry Bridges Plaza (Ferry Building) 228
## 4 Beale at Market Harry Bridges Plaza (Ferry Building) 1081
## 5 Beale at Market Harry Bridges Plaza (Ferry Building) 205
## 6 Beale at Market Harry Bridges Plaza (Ferry Building) 156
## 7 Beale at Market Harry Bridges Plaza (Ferry Building) 178
## 8 Beale at Market Harry Bridges Plaza (Ferry Building) 140
## 9 Beale at Market Harry Bridges Plaza (Ferry Building) 157
## 10 Beale at Market Harry Bridges Plaza (Ferry Building) 114
## # ... with 112 more rows
Examining the data set Station
# Inspect the station table with its assigned column names.
str(station)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 70 obs. of 7 variables:
## $ stationid : num 2 3 4 5 6 7 8 9 10 11 ...
## $ name : chr "San Jose Diridon Caltrain Station" "San Jose Civic Center" "Santa Clara at Almaden" "Adobe on Almaden" ...
## $ lat : num 37.3 37.3 37.3 37.3 37.3 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ dockcount : num 27 15 11 19 15 15 15 15 15 19 ...
## $ landmark : chr "San Jose" "San Jose" "San Jose" "San Jose" ...
## $ installation: chr "8/6/2013" "8/5/2013" "8/6/2013" "8/5/2013" ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. X2 = col_character(),
## .. X3 = col_double(),
## .. X4 = col_double(),
## .. X5 = col_double(),
## .. X6 = col_character(),
## .. X7 = col_character()
## .. )
Transformation data Convert some of the fields to factors.
# Treat the station id, name and landmark as categorical values.
station_factor_cols <- c("stationid", "name", "landmark")
station[station_factor_cols] <- lapply(station[station_factor_cols], as.factor)
(TO BE CONTINUED)