R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Show the working directory of the session (the rendered output follows).
getwd()
## [1] "C:/Users/TrungDang/Desktop/Data analysis projects/SFbike"
# NOTE(review): this hard-coded absolute path ties the script to one machine.
# The relative file names passed to read_csv() are resolved against it.
setwd("C:/Users/TrungDang/Desktop/Data analysis projects/SFbike")
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.1
## -- Attaching packages ---------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.1
## Warning: package 'dplyr' was built under R version 3.6.1
## -- Conflicts ------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(readr)
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 3.6.1
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.6.1
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# Load the station table. No header row is read, so readr names the columns
# X1, X2, ... until they are renamed.
station <- read_csv(file = "station_data.csv", col_names = FALSE)
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   X2 = col_character(),
##   X3 = col_double(),
##   X4 = col_double(),
##   X5 = col_double(),
##   X6 = col_character(),
##   X7 = col_character()
## )
# Load the trip table. No header row is read, so readr names the columns
# X1, X2, ... until they are renamed.
trip <- read_csv(file = "trip_data.csv", col_names = FALSE)
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   X2 = col_double(),
##   X3 = col_character(),
##   X4 = col_character(),
##   X5 = col_double(),
##   X6 = col_character(),
##   X7 = col_character(),
##   X8 = col_double(),
##   X9 = col_double(),
##   X10 = col_character(),
##   X11 = col_character()
## )
# Inspect the column types of the station table.
str(object = station)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 70 obs. of  7 variables:
##  $ X1: num  2 3 4 5 6 7 8 9 10 11 ...
##  $ X2: chr  "San Jose Diridon Caltrain Station" "San Jose Civic Center" "Santa Clara at Almaden" "Adobe on Almaden" ...
##  $ X3: num  37.3 37.3 37.3 37.3 37.3 ...
##  $ X4: num  -122 -122 -122 -122 -122 ...
##  $ X5: num  27 15 11 19 15 15 15 15 15 19 ...
##  $ X6: chr  "San Jose" "San Jose" "San Jose" "San Jose" ...
##  $ X7: chr  "8/6/2013" "8/5/2013" "8/6/2013" "8/5/2013" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   X2 = col_character(),
##   ..   X3 = col_double(),
##   ..   X4 = col_double(),
##   ..   X5 = col_double(),
##   ..   X6 = col_character(),
##   ..   X7 = col_character()
##   .. )
# Inspect the column types of the trip table.
str(object = trip)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 354152 obs. of  11 variables:
##  $ X1 : num  913460 913459 913455 913454 913453 ...
##  $ X2 : num  765 1036 307 409 789 ...
##  $ X3 : chr  "8/31/2015 23:26" "8/31/2015 23:11" "8/31/2015 23:13" "8/31/2015 23:10" ...
##  $ X4 : chr  "Harry Bridges Plaza (Ferry Building)" "San Antonio Shopping Center" "Post at Kearny" "San Jose City Hall" ...
##  $ X5 : num  50 31 47 10 51 68 51 60 56 47 ...
##  $ X6 : chr  "8/31/2015 23:39" "8/31/2015 23:28" "8/31/2015 23:18" "8/31/2015 23:17" ...
##  $ X7 : chr  "San Francisco Caltrain (Townsend at 4th)" "Mountain View City Hall" "2nd at South Park" "San Salvador at 1st" ...
##  $ X8 : num  70 27 64 8 60 70 60 74 55 66 ...
##  $ X9 : num  288 35 468 68 487 538 363 470 439 472 ...
##  $ X10: chr  "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ X11: chr  "2139" "95032" "94107" "95113" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   X2 = col_double(),
##   ..   X3 = col_character(),
##   ..   X4 = col_character(),
##   ..   X5 = col_double(),
##   ..   X6 = col_character(),
##   ..   X7 = col_character(),
##   ..   X8 = col_double(),
##   ..   X9 = col_double(),
##   ..   X10 = col_character(),
##   ..   X11 = col_character()
##   .. )

As the raw data does not contain column names, we will assign a name to each column.

# Attach descriptive column names, in file order, to both tables.
names(station) <- c(
  "stationid", "name", "lat", "long", "dockcount", "landmark", "installation"
)
names(trip) <- c(
  "tripid", "duration", "startdate", "startstation", "startterminal",
  "enddate", "endstation", "endterminal", "bikeid", "subscriptiontype",
  "zipcode"
)

There are 70 stations with their corresponding names. The data also contains the location of each station (lat and long) and the number of docks at each station. Lastly, the installation date is also reported.

# Inspect the trip table again now that the columns are named.
str(object = trip)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 354152 obs. of  11 variables:
##  $ tripid          : num  913460 913459 913455 913454 913453 ...
##  $ duration        : num  765 1036 307 409 789 ...
##  $ startdate       : chr  "8/31/2015 23:26" "8/31/2015 23:11" "8/31/2015 23:13" "8/31/2015 23:10" ...
##  $ startstation    : chr  "Harry Bridges Plaza (Ferry Building)" "San Antonio Shopping Center" "Post at Kearny" "San Jose City Hall" ...
##  $ startterminal   : num  50 31 47 10 51 68 51 60 56 47 ...
##  $ enddate         : chr  "8/31/2015 23:39" "8/31/2015 23:28" "8/31/2015 23:18" "8/31/2015 23:17" ...
##  $ endstation      : chr  "San Francisco Caltrain (Townsend at 4th)" "Mountain View City Hall" "2nd at South Park" "San Salvador at 1st" ...
##  $ endterminal     : num  70 27 64 8 60 70 60 74 55 66 ...
##  $ bikeid          : num  288 35 468 68 487 538 363 470 439 472 ...
##  $ subscriptiontype: chr  "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ zipcode         : chr  "2139" "95032" "94107" "95113" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   X2 = col_double(),
##   ..   X3 = col_character(),
##   ..   X4 = col_character(),
##   ..   X5 = col_double(),
##   ..   X6 = col_character(),
##   ..   X7 = col_character(),
##   ..   X8 = col_double(),
##   ..   X9 = col_double(),
##   ..   X10 = col_character(),
##   ..   X11 = col_character()
##   .. )

The trip table contains 11 variables and 354152 rows. Here is the data dictionary.

Data transformation: convert some of the fields to factors.

# Convert identifier and category columns to factors.
# Fixed: the original assigned to a misspelled `trip$endternminal`, which added
# a stray column and left `endterminal` numeric.
factor_cols <- c(
  "tripid", # we may not need that column
  "startterminal",
  "endterminal",
  "subscriptiontype",
  "bikeid",
  "zipcode"
)
trip[factor_cols] <- lapply(trip[factor_cols], as.factor)

Convert start and end dates to date times.

# Parse the "month/day/year hour:minute" strings into POSIXct.
# The time zone is pinned so the result does not depend on the machine that
# runs the script; without `tz`, as.POSIXct() uses the session's local zone.
# NOTE(review): assumes the timestamps are Bay Area local time — confirm.
trip_tz <- "America/Los_Angeles"
trip$startdate <- as.POSIXct(
  trip$startdate,
  format = "%m/%d/%Y %H:%M",
  tz = trip_tz
)
trip$enddate <- as.POSIXct(
  trip$enddate,
  format = "%m/%d/%Y %H:%M",
  tz = trip_tz
)

DEALING WITH DUPLICATES

# Row counts after dropping exact duplicate rows, for comparison with the raw
# row counts of each table.
trip %>% distinct() %>% nrow()
## [1] 354152
station %>% distinct() %>% nrow()
## [1] 70

DEALING WITH MISSING VALUES

# Plot the share of missing values per column (DataExplorer).
plot_missing(data = trip)

Dealing with missing values

# Number of missing zip codes (summing the logical NA mask counts the TRUEs).
sum(is.na(trip$zipcode))
## [1] 278

The zipcode column has 278 missing values (about 0.08% of the rows).

INITIAL ANALYSIS: the trip dataset. UNIVARIATE ANALYSIS. How many unique trip ids (tripid) are in the dataset?

 # Count the distinct trip ids.
 trip %>%
   distinct(tripid) %>%
   summarise(n = n())
## # A tibble: 1 x 1
##        n
##    <int>
## 1 354152

DURATION: the longest trip and the shortest trips

# Row(s) whose duration equals the maximum duration in the table.
trip %>%
  filter(duration == max(duration)) %>%
  select(duration, startstation, endstation)
## # A tibble: 1 x 3
##   duration startstation             endstation   
##      <dbl> <chr>                    <chr>        
## 1 17270400 South Van Ness at Market 2nd at Folsom

We would also like to see which trips are the shortest.

# Rows whose duration equals the minimum duration in the table.
trip %>%
  filter(duration == min(duration)) %>%
  select(startstation, endstation, duration)
## # A tibble: 27 x 3
##    startstation                     endstation                     duration
##    <chr>                            <chr>                             <dbl>
##  1 Temporary Transbay Terminal (Ho~ Temporary Transbay Terminal (~       60
##  2 San Francisco Caltrain 2 (330 T~ San Francisco Caltrain 2 (330~       60
##  3 Howard at 2nd                    Howard at 2nd                        60
##  4 San Francisco Caltrain (Townsen~ San Francisco Caltrain 2 (330~       60
##  5 Post at Kearny                   2nd at South Park                    60
##  6 Spear at Folsom                  Spear at Folsom                      60
##  7 Steuart at Market                Steuart at Market                    60
##  8 Howard at 2nd                    Howard at 2nd                        60
##  9 Harry Bridges Plaza (Ferry Buil~ Harry Bridges Plaza (Ferry Bu~       60
## 10 San Francisco Caltrain (Townsen~ San Francisco Caltrain (Towns~       60
## # ... with 17 more rows

There are 27 rows where the trip duration is only 60 seconds. Let us see whether the starting station and the ending station are the same.

# Minimum-duration trips that were returned to the station they started from.
# min(duration) is evaluated over the whole table, as in a single filter().
trip %>%
  filter(duration == min(duration), startstation == endstation) %>%
  select(startstation, endstation, duration)
## # A tibble: 23 x 3
##    startstation                     endstation                     duration
##    <chr>                            <chr>                             <dbl>
##  1 Temporary Transbay Terminal (Ho~ Temporary Transbay Terminal (~       60
##  2 San Francisco Caltrain 2 (330 T~ San Francisco Caltrain 2 (330~       60
##  3 Howard at 2nd                    Howard at 2nd                        60
##  4 Spear at Folsom                  Spear at Folsom                      60
##  5 Steuart at Market                Steuart at Market                    60
##  6 Howard at 2nd                    Howard at 2nd                        60
##  7 Harry Bridges Plaza (Ferry Buil~ Harry Bridges Plaza (Ferry Bu~       60
##  8 San Francisco Caltrain (Townsen~ San Francisco Caltrain (Towns~       60
##  9 Embarcadero at Folsom            Embarcadero at Folsom                60
## 10 Embarcadero at Sansome           Embarcadero at Sansome               60
## # ... with 13 more rows

Among the 27 trips that lasted 60 s, 23 started and ended at the same station. We can speculate that these riders changed their minds and chose not to ride just after picking up the bike.

# Minimum-duration trips that ended at a different station.
trip %>%
  filter(duration == min(duration), startstation != endstation) %>%
  select(startstation, endstation, duration)
## # A tibble: 4 x 3
##   startstation                      endstation                     duration
##   <chr>                             <chr>                             <dbl>
## 1 San Francisco Caltrain (Townsend~ San Francisco Caltrain 2 (330~       60
## 2 Post at Kearny                    2nd at South Park                    60
## 3 Howard at 2nd                     2nd at South Park                    60
## 4 South Van Ness at Market          Market at 10th                       60
# Five-number summary plus the mean of the trip duration.
summary(trip[["duration"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##       60      342      511     1046      739 17270400

The mean duration is 1046 seconds. Distribution: box plot and histogram

# Histogram of trip duration with the x axis limited to [0, 10000]; rows
# outside the limits are dropped from the plot.
ggplot(trip, aes(x = duration)) +
  geom_histogram(bins = 300, colour = "blue", fill = "blue", alpha = 0.5) +
  scale_x_continuous(name = "Duration", limits = c(0, 10000)) +
  scale_y_continuous(name = "Count") +
  labs(title = "Duration distribution") +
  theme(panel.background = element_blank())
## Warning: Removed 3879 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

Visualize the distribution of all the trips that last 1 hour maximum.

# Histogram of trip duration with the x axis limited to [0, 3600]; rows
# outside the limits are dropped from the plot.
ggplot(trip, aes(x = duration)) +
  geom_histogram(bins = 100, colour = "blue", fill = "blue", alpha = 0.5) +
  scale_x_continuous(name = "Duration", limits = c(0, 3600)) +
  scale_y_continuous(name = "Count") +
  labs(title = "Duration distribution") +
  theme(panel.background = element_blank())
## Warning: Removed 9438 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

Visualize the distribution of all the trips that last 30 minutes maximum.

# Histogram of trip duration with the x axis limited to [0, 1800]; rows
# outside the limits are dropped from the plot.
ggplot(trip, aes(x = duration)) +
  geom_histogram(bins = 36, colour = "blue", fill = "blue", alpha = 0.5) +
  scale_x_continuous(name = "Duration (seconds)", limits = c(0, 1800)) +
  scale_y_continuous(name = "Frequency") +
  labs(title = "Duration distribution of all the trips") +
  theme(panel.background = element_blank())
## Warning: Removed 15798 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

How many trips have a duration of at most 1 hour?

# Number of trips lasting at most 3600 seconds.
trip %>%
  filter(duration <= 3600) %>%
  summarise(n = n())
## # A tibble: 1 x 1
##        n
##    <int>
## 1 344714
# Number of trips lasting more than 3600 seconds.
trip %>%
  filter(duration > 3600) %>%
  summarise(n = n())
## # A tibble: 1 x 1
##       n
##   <int>
## 1  9438

There are 9438 trips whose duration is longer than 1 hour. There are outliers in the duration column.

# Box plot of trip duration; the y axis is cut at 1800 so the box stays
# readable despite the long upper tail.
boxplot(
  trip$duration,
  main = "Distribution of Duration",
  ylab = "Duration (s)",
  names = "Duration",
  ylim = c(0, 1800),
  col = "blue",
  border = "red",
  boxwex = 0.5
)

WORKING WITH DATES: STARTDATE and ENDDATE

# Check the parsed start timestamps: their class and the first few values.
str(object = trip$startdate)
##  POSIXct[1:354152], format: "2015-08-31 23:26:00" "2015-08-31 23:11:00" ...
head(x = trip$startdate)
## [1] "2015-08-31 23:26:00 EDT" "2015-08-31 23:11:00 EDT"
## [3] "2015-08-31 23:13:00 EDT" "2015-08-31 23:10:00 EDT"
## [5] "2015-08-31 23:09:00 EDT" "2015-08-31 23:07:00 EDT"

We should split this column into startyear, startmonth, startday, and starthour. Likewise, we will split the enddate column into four columns: endyear, endmonth, endday, and endhour.

# lubridate package
# lubridate package
# Derive calendar components from the start and end timestamps.
trip <- trip %>%
  mutate(
    start_date = date(startdate),
    start_year = year(startdate),
    start_month = month(startdate),
    start_hour = hour(startdate),
    start_dayofweek = wday(startdate),
    start_dayofmonth = mday(startdate),
    start_ym = floor_date(startdate, "month"),
    end_date = date(enddate),
    end_year = year(enddate),
    end_month = month(enddate),
    end_hour = hour(enddate),
    end_dayofweek = wday(enddate),
    end_dayofmonth = mday(enddate),
    end_ym = floor_date(enddate, "month"),
    # The original mutate() assigned `trip_ym` twice, so the start-based value
    # was silently overwritten by the end-based one. Both now have their own
    # column; `trip_ym` keeps the value it effectively had.
    trip_ym = end_ym
  )

As we want to see the day of week, we will create a column of day of week

# Preview the first ten rows of the month and day-of-week columns.
head(
  select(trip, start_month, start_dayofweek, end_month, end_dayofweek),
  10
)
## # A tibble: 10 x 4
##    start_month start_dayofweek end_month end_dayofweek
##          <dbl>           <dbl>     <dbl>         <dbl>
##  1           8               2         8             2
##  2           8               2         8             2
##  3           8               2         8             2
##  4           8               2         8             2
##  5           8               2         8             2
##  6           8               2         8             2
##  7           8               2         8             2
##  8           8               2         8             2
##  9           8               2         8             2
## 10           8               2         8             2

We should convert these columns into the proper format. For day of week:

# Label the numeric day-of-week codes (1 = Sunday, as the rendered output
# shows code 2 printing as Monday).
# Codes are mapped to labels explicitly: assigning `levels<-` positionally
# mislabels or fails whenever a day is absent from the data.
# Fixed: the label "Tueday" is now "Tuesday".
day_names <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"
)
trip$start_dayofweek <- factor(
  trip$start_dayofweek,
  levels = 1:7,
  labels = day_names
)

trip$end_dayofweek <- factor(
  trip$end_dayofweek,
  levels = 1:7,
  labels = day_names
)

For month

# Label the numeric month codes 1..12 with the English month names.
# Codes are mapped to labels explicitly: assigning `levels<-` positionally
# mislabels or fails whenever a month is absent from the data.
# `month.name` is base R's vector "January" ... "December".
trip$start_month <- factor(
  trip$start_month,
  levels = 1:12,
  labels = month.name
)

trip$end_month <- factor(
  trip$end_month,
  levels = 1:12,
  labels = month.name
)

Checking these four columns

# Preview the first ten rows of the relabelled month and day-of-week columns.
head(
  select(trip, start_month, start_dayofweek, end_month, end_dayofweek),
  10
)
## # A tibble: 10 x 4
##    start_month start_dayofweek end_month end_dayofweek
##    <fct>       <fct>           <fct>     <fct>        
##  1 August      Monday          August    Monday       
##  2 August      Monday          August    Monday       
##  3 August      Monday          August    Monday       
##  4 August      Monday          August    Monday       
##  5 August      Monday          August    Monday       
##  6 August      Monday          August    Monday       
##  7 August      Monday          August    Monday       
##  8 August      Monday          August    Monday       
##  9 August      Monday          August    Monday       
## 10 August      Monday          August    Monday
# Number of trips per start year.
count(group_by(trip, start_year))
## # A tibble: 2 x 2
## # Groups:   start_year [2]
##   start_year      n
##        <dbl>  <int>
## 1       2014 111095
## 2       2015 243057
# Trips per start month, drawn as a labelled bar chart.
startmonth_trip <- trip %>%
  group_by(start_month) %>%
  summarise(counts = n())

# theme_set() changes the session default and returns the previous theme, so
# it is called on its own. The original nested `+ theme_set(...)` inside
# `theme(axis.text.x = ...)` because of a misplaced parenthesis, so the
# axis-text element was not passed to theme() as written.
theme_set(theme_classic())

g_startmonth <- ggplot(startmonth_trip, aes(x = start_month, y = counts))
g_startmonth +
  geom_col(width = 0.5, fill = "cornflowerblue") +
  labs(
    title = "Number of trips per month",
    caption = "SF Bay bike share",
    x = "Month",
    y = "Counts"
  ) +
  theme(axis.text.x = element_text(size = 18, angle = 65, vjust = 0.6)) +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25
  )

# Trips per end month, drawn as a labelled bar chart.
endmonth_trip <- trip %>%
  group_by(end_month) %>%
  summarise(counts = n())

# theme_set() changes the session default and returns the previous theme, so
# it is called on its own. The original nested `+ theme_set(...)` inside
# `theme(axis.text.x = ...)` because of a misplaced parenthesis, so the
# axis-text element was not passed to theme() as written.
theme_set(theme_classic())

g_endmonth <- ggplot(endmonth_trip, aes(x = end_month, y = counts))
g_endmonth +
  geom_col(width = 0.5, fill = "purple") +
  labs(
    title = "Number of trips per month",
    caption = "SF Bay bike share",
    x = "Month",
    y = "Counts"
  ) +
  theme(axis.text.x = element_text(size = 18, angle = 65, vjust = 0.6)) +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25
  )

It is obvious that these two bar charts are almost identical, except for a few long trips. When looking at the monthly number of trips, we should keep in mind that seven months have 31 days and that February has only 28 days.

# Start and end timestamps of the maximum-duration trip.
select(
  filter(trip, duration == max(duration)),
  startdate, enddate
)
## # A tibble: 1 x 2
##   startdate           enddate            
##   <dttm>              <dttm>             
## 1 2014-12-06 21:59:00 2015-06-24 20:18:00
# Trips that start in one calendar year and end in another.
select(
  filter(trip, start_year != end_year),
  startdate, enddate, bikeid, startstation, endstation
)
## # A tibble: 1 x 5
##   startdate           enddate             bikeid startstation    endstation
##   <dttm>              <dttm>              <fct>  <chr>           <chr>     
## 1 2014-12-06 21:59:00 2015-06-24 20:18:00 535    South Van Ness~ 2nd at Fo~

The user kept the bike (bikeid 535) for almost 200 days.

# Trips whose start month differs from their end month.
select(
  filter(trip, start_month != end_month),
  startdate, enddate, duration, bikeid, startstation, endstation
)
## # A tibble: 28 x 6
##    startdate           enddate             duration bikeid startstation
##    <dttm>              <dttm>                 <dbl> <fct>  <chr>       
##  1 2015-07-31 23:59:00 2015-08-01 00:14:00      917 583    Embarcadero~
##  2 2015-07-31 23:56:00 2015-08-01 00:01:00      330 344    2nd at Town~
##  3 2015-07-31 23:54:00 2015-08-01 00:14:00     1198 528    Embarcadero~
##  4 2015-07-31 22:59:00 2015-08-01 00:56:00     7057 531    Embarcadero~
##  5 2015-07-31 22:56:00 2015-08-01 00:56:00     7183 214    Embarcadero~
##  6 2015-06-28 21:50:00 2015-07-23 15:27:00  2137000 466    Market at S~
##  7 2015-05-31 18:20:00 2015-06-01 16:22:00    79336 207    Park at Oli~
##  8 2015-05-31 17:39:00 2015-06-01 16:22:00    81783 648    Park at Oli~
##  9 2015-05-31 17:05:00 2015-06-01 16:22:00    83830 695    Park at Oli~
## 10 2015-04-30 23:57:00 2015-05-01 00:03:00      370 399    Townsend at~
## # ... with 18 more rows, and 1 more variable: endstation <chr>

day of month

# Trips per day of month, by start date.
start_dayofmonth_tbl <- trip %>%
  group_by(start_dayofmonth) %>%
  count()

# theme_set() returns the previously active theme, so adding its result to a
# plot applies the old theme rather than theme_classic(). Set the default on
# its own instead.
theme_set(theme_classic())

g_occurrenceday <- ggplot(
  start_dayofmonth_tbl,
  aes(x = start_dayofmonth, y = n)
)
# Fixed: the title said "crimes", but this data set counts bike trips.
g_occurrenceday +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "purple", size = 3) +
  labs(
    title = "Number of trips by day of month",
    caption = "Source: SF Bay bike share",
    x = "Day of month",
    y = "Counts"
  )

# Trips per day of month, by end date.
end_dayofmonth_tbl <- trip %>%
  group_by(end_dayofmonth) %>%
  count()

# theme_set() returns the previously active theme, so adding its result to a
# plot applies the old theme rather than theme_classic(). Set the default on
# its own instead.
theme_set(theme_classic())

g_occurrenceday <- ggplot(
  end_dayofmonth_tbl,
  aes(x = end_dayofmonth, y = n)
)
# Fixed: the title said "crimes", but this data set counts bike trips.
g_occurrenceday +
  geom_line(color = "purple", size = 1) +
  geom_point(color = "blue", size = 3) +
  labs(
    title = "Number of trips by day of month",
    caption = "Source: SF Bay bike share",
    x = "Day of month",
    y = "Counts"
  )

When comparing the total number of trips by day of month, we should remember that only seven months have 31 days.

On which day of the week were bikes used the most?

# Trips per day of week, by start date, drawn as a labelled bar chart.
start_dayofweek_trip <- trip %>%
  group_by(start_dayofweek) %>%
  summarise(counts = n())

# theme_set() changes the session default and returns the previous theme, so
# it is called on its own. The original nested `+ theme_set(...)` inside
# `theme(axis.text.x = ...)` because of a misplaced parenthesis, so the
# axis-text element was not passed to theme() as written.
theme_set(theme_classic())

g_start_dayofweek <- ggplot(
  start_dayofweek_trip,
  aes(x = start_dayofweek, y = counts)
)
g_start_dayofweek +
  geom_col(width = 0.5, fill = "purple") +
  labs(
    title = "Number of trips from Monday to Sunday (At starting station)",
    caption = "SF Bay bike share",
    x = "Day of week",
    y = "Counts"
  ) +
  theme(axis.text.x = element_text(size = 18, angle = 65, vjust = 0.6)) +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25
  )

# Trips per day of week, by end date, drawn as a labelled bar chart.
end_dayofweek_trip <- trip %>%
  group_by(end_dayofweek) %>%
  summarise(counts = n())

# theme_set() changes the session default and returns the previous theme, so
# it is called on its own. The original nested `+ theme_set(...)` inside
# `theme(axis.text.x = ...)` because of a misplaced parenthesis, so the
# axis-text element was not passed to theme() as written.
theme_set(theme_classic())

# NOTE(review): the variable keeps its original name even though it now holds
# the end-date plot, in case later code refers to it.
g_start_dayofweek <- ggplot(
  end_dayofweek_trip,
  aes(x = end_dayofweek, y = counts)
)
g_start_dayofweek +
  geom_col(width = 0.5, fill = "grey") +
  labs(
    title = "Number of trips from Monday to Sunday (At ending station)",
    caption = "SF Bay bike share",
    x = "Day of week",
    y = "Counts"
  ) +
  theme(axis.text.x = element_text(size = 18, angle = 65, vjust = 0.6)) +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -0.25
  )

Once again, the two graphs are almost identical. There are fewer trips on Friday than on the other weekdays. Remarkably, bike use plunges on Saturday and Sunday.

Time of the day

# Trips per start hour of the day.
starthour_tbl <- trip %>%
  group_by(start_hour) %>%
  summarise(counts = n())

# theme_set() returns the previously active theme, so adding its result to a
# plot applies the old theme rather than theme_classic(). Set the default on
# its own instead.
theme_set(theme_classic())

g_occurrencehour <- ggplot(starthour_tbl, aes(x = start_hour, y = counts))
g_occurrencehour +
  geom_line(color = "grey") +
  geom_point(color = "blue", size = 4) +
  labs(
    title = "Number of trips at starting stations by time of the day",
    caption = "Source: SF Bay bike share",
    x = "Time of the day",
    y = "Counts"
  )

How many trips start between 8 pm and 7 am (start hours 20–23 and 0–6)?

# Trips per start hour for the evening/night hours only: keep hours that are
# NOT strictly between 6 and 20, i.e. 0-6 and 20-23.
trip %>%
  filter(!(start_hour < 20 & start_hour > 6)) %>%
  group_by(start_hour) %>%
  count()
## # A tibble: 11 x 2
## # Groups:   start_hour [11]
##    start_hour     n
##         <int> <int>
##  1          0  1014
##  2          1   511
##  3          2   282
##  4          3   156
##  5          4   640
##  6          5  1848
##  7          6  8014
##  8         20  8251
##  9         21  5741
## 10         22  3662
## 11         23  2207

Some trips are still made between midnight and 5 am. The number of trips increases gradually, peaks at 8 am, and has dropped by 10 am. Bike use from 10 am to 3 pm is steady, then jumps dramatically at 4 pm and reaches its highest level at 5 pm. In other words, the number of trips (bike use) is significantly higher during rush hours.

The following graph shows the frequency of bike use at the ending station.

# Trips per end hour of the day.
endhour_tbl <- trip %>%
  group_by(end_hour) %>%
  summarise(counts = n())

# theme_set() returns the previously active theme, so adding its result to a
# plot applies the old theme rather than theme_classic(). Set the default on
# its own instead.
theme_set(theme_classic())

g_occurrencehour <- ggplot(endhour_tbl, aes(x = end_hour, y = counts))
g_occurrencehour +
  geom_line(color = "grey") +
  geom_point(color = "blue", size = 4) +
  labs(
    title = "Number of trips at ending stations by time of the day",
    caption = "Source: SF Bay bike share",
    x = "Time of the day",
    y = "Counts"
  )

STARTSTATION: the 10 most-used start stations

# The ten start stations with the most trips (ties at the cut-off are kept).
# The ranking column is named explicitly instead of relying on top_n()'s
# implicit choice of the last column.
top_startstation <- trip %>%
  group_by(startstation) %>%
  summarise(counts = n()) %>%
  arrange(desc(counts)) %>%
  top_n(10, counts)

# theme_set() changes the session default and returns the previous theme, so
# it is called on its own. The original nested `+ theme_set(...)` inside
# `theme(axis.text.x = ...)` because of a misplaced parenthesis, so the
# axis-text element was not passed to theme() as written.
theme_set(theme_classic())

g1a <- ggplot(
  top_startstation,
  aes(x = reorder(startstation, counts), y = counts)
)
g1a +
  geom_col(width = 0.5, fill = "blue") +
  labs(
    title = "The frequency the most trips at startstation",
    caption = "Source: SF Bay Area Bike Share",
    x = "Starting Stations",
    y = "Number of trips"
  ) +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 65, vjust = 1)) +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -1
  )

# The ten start stations with the fewest trips, in ascending order.
least_startstation <- trip %>%
  group_by(startstation) %>%
  summarise(counts = n()) %>%
  arrange(counts) %>%
  head(10)

least_startstation
## # A tibble: 10 x 2
##    startstation                    counts
##    <chr>                            <int>
##  1 Franklin at Maple                   81
##  2 Redwood City Public Library        118
##  3 San Mateo County Center            127
##  4 Redwood City Medical Center        150
##  5 Mezes Park                         212
##  6 Park at Olive                      376
##  7 California Ave Caltrain Station    400
##  8 Stanford in Redwood City           436
##  9 SJSU 4th at San Carlos             475
## 10 SJSU - San Salvador at 9th         494
# Bar chart of the least-used start stations.
# theme_set() changes the session default and returns the previous theme, so
# it is called on its own. The original nested `+ theme_set(...)` inside
# `theme(axis.text.x = ...)` because of a misplaced parenthesis, so the
# axis-text element was not passed to theme() as written.
theme_set(theme_classic())

g1b <- ggplot(
  least_startstation,
  aes(x = reorder(startstation, -counts), y = counts)
)
g1b +
  geom_col(width = 0.5, fill = "blue") +
  labs(
    title = "The frequency the least trips at startstation",
    caption = "Source: SF Bay Area Bike Share",
    x = "Starting Stations",
    y = "Number of trips"
  ) +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -1
  )

ENDSTATION

# The ten end stations with the most trips (ties at the cut-off are kept).
# The ranking column is named explicitly instead of relying on top_n()'s
# implicit choice of the last column.
top_endstation <- trip %>%
  group_by(endstation) %>%
  summarise(counts = n()) %>%
  arrange(desc(counts)) %>%
  top_n(10, counts)

top_endstation
## # A tibble: 10 x 2
##    endstation                                    counts
##    <chr>                                          <int>
##  1 San Francisco Caltrain (Townsend at 4th)       34810
##  2 San Francisco Caltrain 2 (330 Townsend)        22523
##  3 Harry Bridges Plaza (Ferry Building)           17810
##  4 2nd at Townsend                                15463
##  5 Townsend at 7th                                15422
##  6 Embarcadero at Sansome                         15065
##  7 Market at Sansome                              13916
##  8 Steuart at Market                              13617
##  9 Temporary Transbay Terminal (Howard at Beale)  12966
## 10 Powell Street BART                             10239
# Bar chart of the most-used end stations.
# theme_set() changes the session default and returns the previous theme, so
# it is called on its own. The original nested `+ theme_set(...)` inside
# `theme(axis.text.x = ...)` because of a misplaced parenthesis, so the
# axis-text element was not passed to theme() as written.
theme_set(theme_classic())

g2a <- ggplot(
  top_endstation,
  aes(x = reorder(endstation, counts), y = counts)
)
# Fixed: the axis label said "Starting Stations", but the data are end stations.
g2a +
  geom_col(width = 0.5, fill = "orange") +
  labs(
    title = "The frequency the most trips at the terminal station",
    caption = "Source: SF Bay Area Bike Share",
    x = "Ending Stations",
    y = "Number of trips"
  ) +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 65, vjust = 1)) +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -1
  )

# The ten end stations with the fewest trips, in ascending order.
least_endstation <- trip %>%
  group_by(endstation) %>%
  summarise(counts = n()) %>%
  arrange(counts) %>%
  head(10)

# Bar chart of the least-used end stations.
# theme_set() changes the session default and returns the previous theme, so
# it is called on its own. The original nested `+ theme_set(...)` inside
# `theme(axis.text.x = ...)` because of a misplaced parenthesis, so the
# axis-text element was not passed to theme() as written.
theme_set(theme_classic())

g2b <- ggplot(
  least_endstation,
  aes(x = reorder(endstation, -counts), y = counts)
)
# Fixed: the axis label said "Starting Stations", but the data are end stations.
g2b +
  geom_col(width = 0.5, fill = "orange") +
  labs(
    title = "The frequency the least trips at the terminal station",
    caption = "Source: SF Bay Area Bike Share",
    x = "Ending Stations",
    y = "Number of trips"
  ) +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 65, vjust = 1)) +
  geom_text(
    aes(label = counts),
    position = position_dodge(width = 0.9),
    vjust = -1
  )

BIKE

How many bikes are available?

# Number of distinct bike ids that appear in the trip table.
length(unique(trip$bikeid))
## [1] 668

There are 668 bikes that have been used.

Which bikes were used the most and the least?

# The ten bikes with the most trips.
trip %>%
  group_by(bikeid) %>%
  summarise(frencency_bikes = n()) %>%
  arrange(-frencency_bikes) %>%
  head(n = 10)
## # A tibble: 10 x 2
##    bikeid frencency_bikes
##    <fct>            <int>
##  1 878               1121
##  2 392               1102
##  3 489               1101
##  4 463               1085
##  5 532               1074
##  6 558               1071
##  7 306               1060
##  8 29                1057
##  9 66                1053
## 10 589               1052
# The ten most-used bikes with rounded per-month and per-day usage.
# NOTE(review): the divisors 12 and 365 assume the data spans exactly one
# year — confirm against the date range.
trip %>%
  group_by(bikeid) %>%
  summarise(frencency_bike = n()) %>%
  mutate(
    usepermonth = round(frencency_bike / 12),
    userperday = round(frencency_bike / 365)
  ) %>%
  arrange(-frencency_bike) %>%
  head(n = 10)
## # A tibble: 10 x 4
##    bikeid frencency_bike usepermonth userperday
##    <fct>           <int>       <dbl>      <dbl>
##  1 878              1121          93          3
##  2 392              1102          92          3
##  3 489              1101          92          3
##  4 463              1085          90          3
##  5 532              1074          90          3
##  6 558              1071          89          3
##  7 306              1060          88          3
##  8 29               1057          88          3
##  9 66               1053          88          3
## 10 589              1052          88          3

Each of these 10 bikes was used about 90 times per month, i.e. about 3 times a day.

# The ten bikes with the fewest trips.
trip %>%
  group_by(bikeid) %>%
  summarise(frencency_bikes = n()) %>%
  arrange(frencency_bikes) %>%
  head(n = 10)
## # A tibble: 10 x 2
##    bikeid frencency_bikes
##    <fct>            <int>
##  1 62                   4
##  2 49                   7
##  3 58                  26
##  4 24                  28
##  5 673                 33
##  6 139                 34
##  7 35                  37
##  8 26                  39
##  9 641                 52
## 10 216                 54

On average, how many times is a bike used?

# Total number of trips and the average number of trips per bike.
# The bike count is computed from the data instead of the hard-coded 668, so
# the average stays correct if the data changes.
trip %>%
  summarise(
    frencency_bike = n(),
    average = frencency_bike / n_distinct(bikeid)
  )
## # A tibble: 1 x 2
##   frencency_bike average
##            <int>   <dbl>
## 1         354152    530.

Let's check the duration distribution of bike 878.

# Keep only the duration column of the trips made with bike 878,
# then preview the first rows.
bike878 <- select(filter(trip, bikeid == "878"), duration)

head(bike878, n = 6)
## # A tibble: 6 x 1
##   duration
##      <dbl>
## 1      692
## 2      749
## 3      905
## 4     5773
## 5      370
## 6      323
# Frequency histogram of trip durations for bike 878, one bin per minute
# (durations are in seconds, so binwidth = 60).
# The axis label and title now describe trip duration (they were left over
# from an unrelated ozone example), and the x-axis ticks are spaced every
# 5000 s instead of every 60 s, which produced ~500 overlapping labels.
his1 <- ggplot(bike878, aes(x = duration)) +
  geom_histogram(binwidth = 60, colour = "blue", fill = "blue") +
  scale_x_continuous(
    name = "Duration (seconds)",
    breaks = seq(0, 30000, 5000),
    limits = c(0, 30000)
  ) +
  scale_y_continuous(name = "Count") +
  ggtitle("Frequency histogram of trip duration for bike 878")
his1
## Warning: Removed 2 rows containing missing values (geom_bar).

# Durations of the trips made with bike 878, longest first
# (the result stays grouped by duration).
duration_bike878 <- trip %>%
  filter(bikeid == "878") %>%
  select(duration) %>%
  group_by(duration) %>%
  arrange(desc(duration))

head(duration_bike878, n = 10)
## # A tibble: 10 x 1
## # Groups:   duration [10]
##    duration
##       <dbl>
##  1    29137
##  2    23783
##  3    23518
##  4    22545
##  5    15271
##  6    14758
##  7    12761
##  8    11571
##  9    10521
## 10     9690
# Durations of the trips made with bike 878, shortest first
# (the result stays grouped by duration).
duration_bike878_least <- trip %>%
  filter(bikeid == "878") %>%
  select(duration) %>%
  group_by(duration) %>%
  arrange(duration)

head(duration_bike878_least, n = 10)
## # A tibble: 10 x 1
## # Groups:   duration [10]
##    duration
##       <dbl>
##  1       61
##  2       66
##  3       78
##  4       80
##  5       85
##  6       88
##  7      100
##  8      110
##  9      119
## 10      123
# Durations of the trips made with bike 62 (the least used bike),
# longest first (the result stays grouped by duration).
duration_bike62 <- trip %>%
  filter(bikeid == "62") %>%
  select(duration) %>%
  group_by(duration) %>%
  arrange(desc(duration))

head(duration_bike62, n = 10)
## # A tibble: 4 x 1
## # Groups:   duration [4]
##   duration
##      <dbl>
## 1     3752
## 2     1395
## 3      522
## 4      184
# Duration histogram for bike 878 over the full 0-30000 range, using
# 600 bins, drawn on a blank panel background.
ggplot(bike878, aes(x = duration)) +
  geom_histogram(
    bins = 600,
    colour = "blue",
    fill = "blue",
    alpha = 0.5
  ) +
  scale_x_continuous(name = "Duration", limits = c(0, 30000)) +
  scale_y_continuous(name = "Count") +
  labs(title = "Duration distribution for bike 878") +
  theme(panel.background = element_blank())
## Warning: Removed 2 rows containing missing values (geom_bar).

# Duration histogram for bike 878 restricted to 0-3600 on the x axis,
# using 60 bins; values outside the scale limits are dropped by ggplot2
# (hence the "non-finite values" warning).
ggplot(bike878, aes(x = duration)) +
  geom_histogram(
    bins = 60,
    colour = "blue",
    fill = "blue",
    alpha = 0.5
  ) +
  scale_x_continuous(name = "Duration", limits = c(0, 3600)) +
  scale_y_continuous(name = "Count") +
  labs(title = "Duration distribution for bike 878") +
  theme(panel.background = element_blank())
## Warning: Removed 37 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

How many trips last more than 1 hour?

# Number of trips whose duration exceeds 3600 (i.e. longer than one hour).
count(filter(trip, duration > 3600))
## # A tibble: 1 x 1
##       n
##   <int>
## 1  9438

There are 9438 trips whose duration is longer than 60 minutes (9438 × 100 / 354152 ≈ 2.66 %).

Subscription type

# Summarise the subscription type, make sure the column is stored as a
# factor, and summarise again to confirm the counts per level.
summary(trip[["subscriptiontype"]])
trip[["subscriptiontype"]] <- as.factor(trip[["subscriptiontype"]])
summary(trip[["subscriptiontype"]])
##   Customer Subscriber 
##      43935     310217

There are 43935 trips by Customers and 310217 trips by Subscribers.

We create a list of the minimum trip duration between each pair of stations. We exclude the trips where the start station and the end station are the same.

# Round trips: trips that start and end at the same station, counted per
# station pair; the total over all pairs is printed afterwards.
samepoint <- trip %>%
  filter(startstation == endstation) %>%
  group_by(startstation, endstation) %>%
  summarize(counts = n())

sum(samepoint[["counts"]])
## [1] 10276
# One-way trips: trips whose start and end stations differ, counted per
# station pair; the total over all pairs is printed afterwards.
different_point <- trip %>%
  filter(startstation != endstation) %>%
  group_by(startstation, endstation) %>%
  summarize(counts = n())

sum(different_point[["counts"]])
## [1] 343876

List of start and end stations with the minimum trip duration. We exclude all trips where the user picked up and dropped off the bike at the same station.

# For every (start, end) station pair with distinct stations, keep the
# trip(s) whose duration equals the pair's minimum, sorted by duration.
# Ties are all retained, so a pair may appear more than once.
trip_minduration <- trip %>%
  select(startstation, endstation, duration) %>%
  filter(startstation != endstation) %>%
  group_by(startstation, endstation) %>%
  filter(duration == min(duration)) %>%
  arrange(duration)

head(trip_minduration, n = 30)
## # A tibble: 30 x 3
## # Groups:   startstation, endstation [26]
##    startstation                       endstation                   duration
##    <chr>                              <chr>                           <dbl>
##  1 San Francisco Caltrain (Townsend ~ San Francisco Caltrain 2 (3~       60
##  2 Post at Kearny                     2nd at South Park                  60
##  3 Howard at 2nd                      2nd at South Park                  60
##  4 South Van Ness at Market           Market at 10th                     60
##  5 Market at Sansome                  2nd at South Park                  61
##  6 Temporary Transbay Terminal (Howa~ 2nd at South Park                  61
##  7 Embarcadero at Folsom              Spear at Folsom                    61
##  8 Market at Sansome                  2nd at South Park                  61
##  9 2nd at Folsom                      2nd at South Park                  61
## 10 Post at Kearny                     Washington at Kearny               61
## # ... with 20 more rows
# All trips from "Castro Street and El Camino Real" to "Howard at 2nd".
trip %>%
  filter(
    startstation == "Castro Street and El Camino Real" &
      endstation == "Howard at 2nd"
  ) %>%
  select(startstation, endstation, duration)
## # A tibble: 2 x 3
##   startstation                     endstation    duration
##   <chr>                            <chr>            <dbl>
## 1 Castro Street and El Camino Real Howard at 2nd   179095
## 2 Castro Street and El Camino Real Howard at 2nd   179330
# All trips from "MLK Library" to "Mountain View Caltrain Station".
trip %>%
  filter(
    startstation == "MLK Library" &
      endstation == "Mountain View Caltrain Station"
  ) %>%
  select(startstation, endstation, duration)
## # A tibble: 2 x 3
##   startstation endstation                     duration
##   <chr>        <chr>                             <dbl>
## 1 MLK Library  Mountain View Caltrain Station    18493
## 2 MLK Library  Mountain View Caltrain Station    18475
# All trips from "San Antonio Caltrain Station" to
# "Stanford in Redwood City".
trip %>%
  filter(
    startstation == "San Antonio Caltrain Station" &
      endstation == "Stanford in Redwood City"
  ) %>%
  select(startstation, endstation, duration)
## # A tibble: 2 x 3
##   startstation                 endstation               duration
##   <chr>                        <chr>                       <dbl>
## 1 San Antonio Caltrain Station Stanford in Redwood City     2985
## 2 San Antonio Caltrain Station Stanford in Redwood City     2975
# All trips from "Beale at Market" to
# "Harry Bridges Plaza (Ferry Building)".
trip %>%
  filter(
    startstation == "Beale at Market" &
      endstation == "Harry Bridges Plaza (Ferry Building)"
  ) %>%
  select(startstation, endstation, duration)
## # A tibble: 122 x 3
##    startstation    endstation                           duration
##    <chr>           <chr>                                   <dbl>
##  1 Beale at Market Harry Bridges Plaza (Ferry Building)      198
##  2 Beale at Market Harry Bridges Plaza (Ferry Building)      314
##  3 Beale at Market Harry Bridges Plaza (Ferry Building)      228
##  4 Beale at Market Harry Bridges Plaza (Ferry Building)     1081
##  5 Beale at Market Harry Bridges Plaza (Ferry Building)      205
##  6 Beale at Market Harry Bridges Plaza (Ferry Building)      156
##  7 Beale at Market Harry Bridges Plaza (Ferry Building)      178
##  8 Beale at Market Harry Bridges Plaza (Ferry Building)      140
##  9 Beale at Market Harry Bridges Plaza (Ferry Building)      157
## 10 Beale at Market Harry Bridges Plaza (Ferry Building)      114
## # ... with 112 more rows

Examining the station data set

# Show the structure (column names, types, first values) of the station data.
utils::str(station)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 70 obs. of  7 variables:
##  $ stationid   : num  2 3 4 5 6 7 8 9 10 11 ...
##  $ name        : chr  "San Jose Diridon Caltrain Station" "San Jose Civic Center" "Santa Clara at Almaden" "Adobe on Almaden" ...
##  $ lat         : num  37.3 37.3 37.3 37.3 37.3 ...
##  $ long        : num  -122 -122 -122 -122 -122 ...
##  $ dockcount   : num  27 15 11 19 15 15 15 15 15 19 ...
##  $ landmark    : chr  "San Jose" "San Jose" "San Jose" "San Jose" ...
##  $ installation: chr  "8/6/2013" "8/5/2013" "8/6/2013" "8/5/2013" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   X2 = col_character(),
##   ..   X3 = col_double(),
##   ..   X4 = col_double(),
##   ..   X5 = col_double(),
##   ..   X6 = col_character(),
##   ..   X7 = col_character()
##   .. )

Data transformation: convert some of the fields to factors.

# Store the identifier-like station columns (id, name, landmark) as factors.
station[["stationid"]] <- as.factor(station[["stationid"]])
station[["name"]] <- as.factor(station[["name"]])
station[["landmark"]] <- as.factor(station[["landmark"]])

(TO BE CONTINUED)