Analysis of citi bike database for May of 2016 at New Jersey and how the weather affectes

the usage of bikes

Data:

The site for the dataset can be found at Citi Bike Dataset

This is the exact Dataset i used for my analysis

The data includes:

Data notes:

Weather Database:

I download from Kaggle the temperature.csv, weather_description.csv and wind_speed.csv. I bind them and pick the May of 2016 to make my final weather.csv which i’m gonna use for my analysis. Also i change the temperature from kelvin to Celsius.

tempr <- read.csv("temperature.csv", sep = ','
                    ,stringsAsFactors = T, header = T)
descr <- read.csv("weather_description.csv", sep = ','
                  ,stringsAsFactors = T, header = T)
wind <- read.csv("wind_speed.csv", sep = ','
                 ,stringsAsFactors = T, header = T)


weather <- data.frame(tempr$datetime, tempr$New.York, descr$New.York, wind$New.York)
weather <- weather[grepl("2016-05",weather$tempr.datetime), ]

names(weather) <- c("timestamp","temperature","description","wind")
weather$temperature <- weather$temperature - 273.15

write.csv(weather, file = "weather.csv",row.names = F)

Loading the libraries

library(ggplot2)
library(data.table)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggmap)
library(randomForest)
library(GGally)
library(RColorBrewer)

downloading

temp <- tempfile()
download.file("https://s3.amazonaws.com/tripdata/JC-201605-citibike-tripdata.csv.zip",temp, mode="wb")
unzip(temp, "JC-201605-citibike-tripdata.csv")
unlink(temp)

Loading the datasets

data <- read.csv("JC-201605-citibike-tripdata.csv", sep = ','
                 ,stringsAsFactors = TRUE, header = TRUE, na.strings =c(""," "))
weather <- read.csv("weather.csv", sep = ','
                    ,stringsAsFactors = TRUE, header = TRUE)

Data Preprocessing

head(data,5)
##   Trip.Duration          Start.Time           Stop.Time Start.Station.ID
## 1           243 2016-05-01 00:04:31 2016-05-01 00:08:34             3203
## 2           118 2016-05-01 00:06:47 2016-05-01 00:08:46             3186
## 3          2891 2016-05-01 00:13:14 2016-05-01 01:01:25             3192
## 4           303 2016-05-01 00:19:32 2016-05-01 00:24:35             3202
## 5            93 2016-05-01 00:21:34 2016-05-01 00:23:08             3211
##   Start.Station.Name Start.Station.Latitude Start.Station.Longitude
## 1      Hamilton Park               40.72760               -74.04425
## 2      Grove St PATH               40.71959               -74.04312
## 3 Liberty Light Rail               40.71124               -74.05570
## 4       Newport PATH               40.72722               -74.03376
## 5         Newark Ave               40.72153               -74.04630
##   End.Station.ID   End.Station.Name End.Station.Latitude End.Station.Longitude
## 1           3202       Newport PATH             40.72722             -74.03376
## 2           3211         Newark Ave             40.72153             -74.04630
## 3           3192 Liberty Light Rail             40.71124             -74.05570
## 4           3203      Hamilton Park             40.72760             -74.04425
## 5           3213     Van Vorst Park             40.71849             -74.04773
##   Bike.ID  User.Type Birth.Year Gender
## 1   24507 Subscriber       1964      1
## 2   24660 Subscriber       1967      1
## 3   24687 Subscriber       1991      1
## 4   24551 Subscriber       1966      1
## 5   14717 Subscriber       1989      1

Data Structure and dimension

dim(data)
## [1] 19488    15
str(data)
## 'data.frame':    19488 obs. of  15 variables:
##  $ Trip.Duration          : int  243 118 2891 303 93 241 278 254 156 324 ...
##  $ Start.Time             : Factor w/ 19265 levels "2016-05-01 00:04:31",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Stop.Time              : Factor w/ 19223 levels "2016-05-01 00:08:34",..: 1 2 13 4 3 5 6 8 7 9 ...
##  $ Start.Station.ID       : int  3203 3186 3192 3202 3211 3186 3186 3195 3195 3202 ...
##  $ Start.Station.Name     : Factor w/ 35 levels "5 Corners Library",..: 14 13 18 23 22 13 13 31 31 23 ...
##  $ Start.Station.Latitude : num  40.7 40.7 40.7 40.7 40.7 ...
##  $ Start.Station.Longitude: num  -74 -74 -74.1 -74 -74 ...
##  $ End.Station.ID         : int  3202 3211 3192 3203 3213 3183 3209 3225 3194 3187 ...
##  $ End.Station.Name       : Factor w/ 42 levels "5 Corners Library",..: 27 26 22 18 39 14 5 2 24 41 ...
##  $ End.Station.Latitude   : num  40.7 40.7 40.7 40.7 40.7 ...
##  $ End.Station.Longitude  : num  -74 -74 -74.1 -74 -74 ...
##  $ Bike.ID                : int  24507 24660 24687 24551 14717 24667 24618 24532 24411 24507 ...
##  $ User.Type              : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Birth.Year             : int  1964 1967 1991 1966 1989 1982 1994 1978 1990 1990 ...
##  $ Gender                 : int  1 1 1 1 1 1 1 2 1 1 ...
data$Start.Time <- as.POSIXct(data$Start.Time,format = "%Y-%m-%d %H:%M:%S")
data$Stop.Time <- as.POSIXct(data$Stop.Time,format = "%Y-%m-%d %H:%M:%S")
data$Start.Station.ID <- as.factor(data$Start.Station.ID)
data$End.Station.ID <- as.factor(data$End.Station.ID)

Changing the Gender entries for better plot understanding

data$Gender <- as.character(data$Gender)
data$Gender <- ifelse(data$Gender=="0","Unknown",ifelse(data$Gender=="1","Male","Female"))
data$Gender <- as.factor(data$Gender)

Changing the Birth Year column with a Age column

data <- data %>% mutate(Age=2016-Birth.Year) %>% select(-c("Birth.Year"))

Checking for NA

sapply(data, function(x) sum(is.na(x)))
##           Trip.Duration              Start.Time               Stop.Time 
##                       0                       0                       0 
##        Start.Station.ID      Start.Station.Name  Start.Station.Latitude 
##                       0                       0                       0 
## Start.Station.Longitude          End.Station.ID        End.Station.Name 
##                       0                       0                       0 
##    End.Station.Latitude   End.Station.Longitude                 Bike.ID 
##                       0                       0                       0 
##               User.Type                  Gender                     Age 
##                       8                       0                    1726

removing the rows with NA User Type. It’s only 8 so we can drop them.

data<-data[!is.na(data$User.Type),]
sum(is.na(data$User.Type))
## [1] 0

Checking the Trip Duration column for anomalies and outliers

summary(data$Trip.Duration)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     61.0    270.0    431.0    818.3    743.0 237444.0

Using a bike for 237444 sec = 3957 min has no sense. So i am gonna remove trip durations more than a hour(3600 sec) but also i’m gonna save the upper whisker for more useful plots.

u_whisker <- boxplot.stats(data$Trip.Duration)$stats[5]
data<-data[data$Trip.Duration<3600,]

Implement a random forest model to fill NA at Age column

rf <- randomForest(Age~Trip.Duration+Start.Station.ID+End.Station.ID+User.Type+Gender,
                   ntree=10,data=data,na.action=na.omit)
year_pred <- predict(rf, newdata = data[is.na(data$Age), ][c("Trip.Duration","Start.Station.ID",
                                                             "End.Station.ID","User.Type",
                                                             "Gender")])
data[is.na(data$Age),]$Age <- round(year_pred)
sum(is.na(data$Age))
## [1] 0

Exploratory Data Analysis

ggpairs(data[,c(1,13,14,15)])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Trip duration graph for entries lower than upper whisker

ggplot(data[data$Trip.Duration<u_whisker, ],aes(x=Trip.Duration)) + 
  geom_line(aes(fill="count"), stat="bin", binwidth=50,size=1,color="Blue") +
  labs(title="Plot of Trip Durations", x="Trip Duration") 
## Warning: Ignoring unknown aesthetics: fill

User types and mean duration per type

ggplot(data, aes(x=User.Type))+ geom_bar(aes(fill=User.Type)) + 
  labs(title = "Number of User Types", x= "User Type")

ggplot(data, aes(x=User.Type, y=Trip.Duration, fill=User.Type)) + 
  stat_summary(fun.y="mean", geom="bar") + 
  labs(title = "Mean Trip Duration per User Type", x= "User Type", y="mean Trip Duration")
## Warning: `fun.y` is deprecated. Use `fun` instead.

boxplot(Trip.Duration~User.Type, data=data[data$Trip.Duration<u_whisker,],main = "Trip Duration per User Type ", 
        xlab = "User Type", ylab = "Trip Duration", col = "Green")

As we can see the Subscibers (annual membership) are more than Customers (24 hour or 3-day pass) but the Customers use the bikes for longer duration

Number of Gender per type and mean trip duration per Gender. As we can see the users who dont declare the sex use bikes for longer duration

ggplot(data, aes(x=Gender))+ geom_bar(aes(fill=Gender)) + labs(title = "Number of Gender Types")

ggplot(data, aes(x=Gender, y=Trip.Duration, fill=Gender)) + 
  stat_summary(fun.y="mean", geom="bar") + 
  labs(title = "Mean Trip Duration per Gender", y="mean Trip Duration")
## Warning: `fun.y` is deprecated. Use `fun` instead.

Most users are between 20 and 40 and also the users who dont declare the sex are between 30 and 35.

ggplot(data, aes(x=Age,group=Gender,colour=Gender)) + geom_freqpoly(stat = "bin",size=1) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data, aes(x=Age,y=Trip.Duration ,colour=Gender)) + 
  stat_summary(fun.y="mean", geom="line", size=1) + facet_grid(Gender ~ .) + 
  labs(title = "Relationship between Mean Trip Duration and Age per Gender", y="mean Trip Duration")
## Warning: `fun.y` is deprecated. Use `fun` instead.

We can see that the trips during the weekdays are more than weekend but also sorter. It seems logical because as we can assume that the purpose of usage at weekdays is more about going to work or different kind of business. On the contrary we can assume tha the purpose of weekend usage is more about fun puproses while the user have not the pressure of time. Also subscribers use bikes more often at the weekday and customers more often at the weekend.

data$day <- wday(data$Start.Time,label = TRUE,abbr = FALSE,week_start = getOption("lubridate.week.start", 7), locale = "English_United States")

ggplot(data, aes(x=day, y=Trip.Duration, fill=day)) + 
  stat_summary(fun.y="mean", geom="bar") + 
  labs(title = "Mean Trip Duration per Day", y="mean Trip Duration")
## Warning: `fun.y` is deprecated. Use `fun` instead.

ggplot(data, aes(x=day,fill=day))+geom_bar() + 
  labs(title = "Number of trips per day")

ggplot(data, aes(x=day,fill=User.Type))+geom_bar() + 
  labs(title = "Number of trips per day") + facet_grid(User.Type ~ .)

Rush hour graph

We can assume that the most subscribers use the bike as mean of transport to go or leave from work this is why we have so many entries around 8 am and 18 pm. Usually customers rent a bike for 24 hours so its seems reasonable to assume that most of them are tourists and using them to move around to see the sights of New Jersey.

ggplot(data, aes(x=hour(data$Start.Time),fill=as.factor(hour(data$Start.Time))))+geom_bar() + 
  labs(title = "Rush Hour", x="Hour") + facet_grid(User.Type ~ .) + theme(legend.position = "none") 

Heatmap of Start stations and top five Start Station position

chi_bb <- c(left = min(data$Start.Station.Longitude),
            bottom = min(data$Start.Station.Latitude),
            right = max(data$Start.Station.Longitude+0.01),
            top = max(data$Start.Station.Latitude))
JC_stamen <- get_stamenmap(bbox = chi_bb,zoom = 13)
## Source : http://tile.stamen.com/terrain/13/2409/3078.png
## Source : http://tile.stamen.com/terrain/13/2410/3078.png
## Source : http://tile.stamen.com/terrain/13/2411/3078.png
## Source : http://tile.stamen.com/terrain/13/2409/3079.png
## Source : http://tile.stamen.com/terrain/13/2410/3079.png
## Source : http://tile.stamen.com/terrain/13/2411/3079.png
## Source : http://tile.stamen.com/terrain/13/2409/3080.png
## Source : http://tile.stamen.com/terrain/13/2410/3080.png
## Source : http://tile.stamen.com/terrain/13/2411/3080.png
pointstolabel <- c("Grove St PATH", "Exchange Place", "Sip Ave", "Hamilton Park", "Newport PATH") 
ggmap(JC_stamen) +
  stat_density_2d(data = data,
              mapping = aes(x = Start.Station.Longitude,
              y = Start.Station.Latitude,fill = stat(level)),
              alpha = .2,
              bins = 50,
              geom = "polygon") + scale_fill_gradientn(colors = brewer.pal(7, "YlOrRd")) +
  geom_point(data = data, color=I("red"), 
             mapping = aes(x = Start.Station.Longitude,
             y = Start.Station.Latitude)) + 
  geom_text(data=subset(data, Start.Station.Name %in% pointstolabel), 
            aes(x= Start.Station.Longitude, y = Start.Station.Latitude, 
            label = Start.Station.Name, color=I("Grey20"))) + 
  theme(legend.position = "none") + 
  labs(title = "Heatmap of Start Station Popularity", y = "Latitude", x = "Longitude")

Heatmap of End stations and top five End Station position

ggmap(JC_stamen) +
  stat_density_2d(data = data,
                  mapping = aes(x = End.Station.Longitude,
                  y = End.Station.Latitude,fill = stat(level)),
                  alpha = .2,
                  bins = 50,
                  geom = "polygon") + scale_fill_gradientn(colors = brewer.pal(7, "YlOrRd")) +
  geom_point(data = data, color=I("red"), 
            mapping = aes(x = End.Station.Longitude,
            y = End.Station.Latitude)) + 
  geom_text(data=subset(data, End.Station.Name %in% pointstolabel), 
            aes(x= End.Station.Longitude, y = End.Station.Latitude, 
            label = End.Station.Name, color=I("Grey20"))) + 
  theme(legend.position = "none") + 
  labs(title = "Heatmap of End Station Popularity", y = "Latitude", x = "Longitude")

Joining data and weather dataset

Creating common column for joining purposes.

weather$mc <- substr(weather[,1],1,13)
data$mc <- substr(data[,as.character(Start.Time)], 1, 13)
total <- merge(data,weather, by="mc")
total <- select(total,-c(mc,timestamp))
total$temperature <- round(total$temperature)

description engineering

total$description <- as.character(total$description)
total$description[total$description %in% c("thunderstorm with light rain","thunderstorm",
                                           "proximity thunderstorm")] <- "Thunderstorm"
total$description[total$description %in% c("drizzle","light intensity drizzle",
                                           "moderate rain","light rain")] <- "Rainy"
total$description[total$description %in% c("broken clouds","few clouds","overcast clouds",
                                           "scattered clouds")] <- "Cloudy"
total$description[total$description %in% c("fog","haze","mist")] <- "Mist"
total$description[total$description=="sky is clear"] <- "Clear"

total$description <- as.factor(total$description)
levels(total$description)
## [1] "Clear"        "Cloudy"       "Mist"         "Rainy"        "Thunderstorm"

Impact of weather on the trip duration

We can see that the most routes start with cloudy or misty weather and it seem logical because is usual to have mist around 7-9 am and because we are at May the weather can not be so clear or having thunderstorms.

ggplot(total, aes(x=description))+ geom_bar(aes(fill=description)) + labs(title = "Number of routes per weather condition")

ggplot(total, aes(x=hour(total$Start.Time),fill=description))+geom_bar() + 
  labs(title = "Rush Hour", x="Hour")

We can see that the weather conditions does not have so much impact at trip duration.

boxplot(Trip.Duration~description, data= total[total$Trip.Duration<u_whisker, ], col = "orange", main = "Impact of weather on trip duration",  xlab = "Weather conditions", ylab = "Trip Duration")

If the wind is more than 5 there are not so many bike users, but the wind’s impact is not so big regarding to the mean trip duration.

total$wind <- as.factor(total$wind)
ggplot(total, aes(x=wind))+ geom_bar(aes(fill=wind)) + labs(title = "Number of routes per wind condition") + theme(legend.position = "none")

boxplot(Trip.Duration~wind, data= total[total$Trip.Duration<u_whisker,], col = "blue", main = "Impact of wind on trip duration",  xlab = "Wind", ylab = "Trip Duration")

The temperature has impact on trip duration

ggplot(total, aes(x=temperature))+ geom_bar(aes(fill=as.factor(temperature))) + labs(title = "Number of routes per temperature") + theme(legend.position = "none")

ggplot(total, aes(x=temperature,y=Trip.Duration)) + 
  stat_summary(fun.y="mean", geom="line", color="red",size=1) + 
  labs(title = "Mean Trip Duration per Temperature", y="mean Trip Duration")
## Warning: `fun.y` is deprecated. Use `fun` instead.