The site for the dataset can be found at Citi Bike Dataset
This is the exact Dataset i used for my analysis
I download from Kaggle the temperature.csv, weather_description.csv and wind_speed.csv. I bind them and pick the May of 2016 to make my final weather.csv which i’m gonna use for my analysis. Also i change the temperature from kelvin to Celsius.
tempr <- read.csv("temperature.csv", sep = ','
,stringsAsFactors = T, header = T)
descr <- read.csv("weather_description.csv", sep = ','
,stringsAsFactors = T, header = T)
wind <- read.csv("wind_speed.csv", sep = ','
,stringsAsFactors = T, header = T)
weather <- data.frame(tempr$datetime, tempr$New.York, descr$New.York, wind$New.York)
weather <- weather[grepl("2016-05",weather$tempr.datetime), ]
names(weather) <- c("timestamp","temperature","description","wind")
weather$temperature <- weather$temperature - 273.15
write.csv(weather, file = "weather.csv",row.names = F)
library(ggplot2)
library(data.table)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggmap)
library(randomForest)
library(GGally)
library(RColorBrewer)
temp <- tempfile()
download.file("https://s3.amazonaws.com/tripdata/JC-201605-citibike-tripdata.csv.zip",temp, mode="wb")
unzip(temp, "JC-201605-citibike-tripdata.csv")
unlink(temp)
data <- read.csv("JC-201605-citibike-tripdata.csv", sep = ','
,stringsAsFactors = TRUE, header = TRUE, na.strings =c(""," "))
weather <- read.csv("weather.csv", sep = ','
,stringsAsFactors = TRUE, header = TRUE)
head(data,5)
## Trip.Duration Start.Time Stop.Time Start.Station.ID
## 1 243 2016-05-01 00:04:31 2016-05-01 00:08:34 3203
## 2 118 2016-05-01 00:06:47 2016-05-01 00:08:46 3186
## 3 2891 2016-05-01 00:13:14 2016-05-01 01:01:25 3192
## 4 303 2016-05-01 00:19:32 2016-05-01 00:24:35 3202
## 5 93 2016-05-01 00:21:34 2016-05-01 00:23:08 3211
## Start.Station.Name Start.Station.Latitude Start.Station.Longitude
## 1 Hamilton Park 40.72760 -74.04425
## 2 Grove St PATH 40.71959 -74.04312
## 3 Liberty Light Rail 40.71124 -74.05570
## 4 Newport PATH 40.72722 -74.03376
## 5 Newark Ave 40.72153 -74.04630
## End.Station.ID End.Station.Name End.Station.Latitude End.Station.Longitude
## 1 3202 Newport PATH 40.72722 -74.03376
## 2 3211 Newark Ave 40.72153 -74.04630
## 3 3192 Liberty Light Rail 40.71124 -74.05570
## 4 3203 Hamilton Park 40.72760 -74.04425
## 5 3213 Van Vorst Park 40.71849 -74.04773
## Bike.ID User.Type Birth.Year Gender
## 1 24507 Subscriber 1964 1
## 2 24660 Subscriber 1967 1
## 3 24687 Subscriber 1991 1
## 4 24551 Subscriber 1966 1
## 5 14717 Subscriber 1989 1
dim(data)
## [1] 19488 15
str(data)
## 'data.frame': 19488 obs. of 15 variables:
## $ Trip.Duration : int 243 118 2891 303 93 241 278 254 156 324 ...
## $ Start.Time : Factor w/ 19265 levels "2016-05-01 00:04:31",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Stop.Time : Factor w/ 19223 levels "2016-05-01 00:08:34",..: 1 2 13 4 3 5 6 8 7 9 ...
## $ Start.Station.ID : int 3203 3186 3192 3202 3211 3186 3186 3195 3195 3202 ...
## $ Start.Station.Name : Factor w/ 35 levels "5 Corners Library",..: 14 13 18 23 22 13 13 31 31 23 ...
## $ Start.Station.Latitude : num 40.7 40.7 40.7 40.7 40.7 ...
## $ Start.Station.Longitude: num -74 -74 -74.1 -74 -74 ...
## $ End.Station.ID : int 3202 3211 3192 3203 3213 3183 3209 3225 3194 3187 ...
## $ End.Station.Name : Factor w/ 42 levels "5 Corners Library",..: 27 26 22 18 39 14 5 2 24 41 ...
## $ End.Station.Latitude : num 40.7 40.7 40.7 40.7 40.7 ...
## $ End.Station.Longitude : num -74 -74 -74.1 -74 -74 ...
## $ Bike.ID : int 24507 24660 24687 24551 14717 24667 24618 24532 24411 24507 ...
## $ User.Type : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 2 2 2 2 2 2 ...
## $ Birth.Year : int 1964 1967 1991 1966 1989 1982 1994 1978 1990 1990 ...
## $ Gender : int 1 1 1 1 1 1 1 2 1 1 ...
data$Start.Time <- as.POSIXct(data$Start.Time,format = "%Y-%m-%d %H:%M:%S")
data$Stop.Time <- as.POSIXct(data$Stop.Time,format = "%Y-%m-%d %H:%M:%S")
data$Start.Station.ID <- as.factor(data$Start.Station.ID)
data$End.Station.ID <- as.factor(data$End.Station.ID)
data$Gender <- as.character(data$Gender)
data$Gender <- ifelse(data$Gender=="0","Unknown",ifelse(data$Gender=="1","Male","Female"))
data$Gender <- as.factor(data$Gender)
data <- data %>% mutate(Age=2016-Birth.Year) %>% select(-c("Birth.Year"))
sapply(data, function(x) sum(is.na(x)))
## Trip.Duration Start.Time Stop.Time
## 0 0 0
## Start.Station.ID Start.Station.Name Start.Station.Latitude
## 0 0 0
## Start.Station.Longitude End.Station.ID End.Station.Name
## 0 0 0
## End.Station.Latitude End.Station.Longitude Bike.ID
## 0 0 0
## User.Type Gender Age
## 8 0 1726
data<-data[!is.na(data$User.Type),]
sum(is.na(data$User.Type))
## [1] 0
summary(data$Trip.Duration)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 61.0 270.0 431.0 818.3 743.0 237444.0
u_whisker <- boxplot.stats(data$Trip.Duration)$stats[5]
data<-data[data$Trip.Duration<3600,]
rf <- randomForest(Age~Trip.Duration+Start.Station.ID+End.Station.ID+User.Type+Gender,
ntree=10,data=data,na.action=na.omit)
year_pred <- predict(rf, newdata = data[is.na(data$Age), ][c("Trip.Duration","Start.Station.ID",
"End.Station.ID","User.Type",
"Gender")])
data[is.na(data$Age),]$Age <- round(year_pred)
sum(is.na(data$Age))
## [1] 0
ggpairs(data[,c(1,13,14,15)])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(as.data.frame(sort(table(data$Start.Station.Name),decreasing=TRUE)[1:5]),aes(Var1, Freq)) +
geom_col(aes(fill=Var1))+ coord_flip() + theme(legend.position = "none") +
labs(title = "Most Frequent Start Stations", y = "Frequency", x = "Stations") +
geom_text(aes(label= Freq), size = 3, position = position_stack(vjust = 0.5))
ggplot(as.data.frame(sort(table(data$End.Station.Name),decreasing=TRUE)[1:5]),aes(Var1, Freq)) +
geom_col(aes(fill=Var1)) + coord_flip() + theme(legend.position = "none") +
labs(title = "Most Frequent End Stations", y = "Frequency", x = "Stations") +
geom_text(aes(label= Freq), size = 3, position = position_stack(vjust = 0.5))
Number_of_routes <- setDT(data)[, .N, by=.(Start.Station.Name,End.Station.Name)][, .SD[which.max(N)], by = Start.Station.Name]
Number_of_routes <- Number_of_routes[order(Start.Station.Name),]
Number_of_routes$Percentage <- round(Number_of_routes[order(Start.Station.Name),]$N/table(data$Start.Station.Name),2)
Number_of_routes$Percentage <- Number_of_routes$Percentage*100
names(Number_of_routes) <- c("Start Station Name", "End Station Name", "Number of routes", "% Percentage")
Number_of_routes
## Start Station Name End Station Name Number of routes % Percentage
## 1: 5 Corners Library Pershing Field 46 21
## 2: Baldwin at Montgomery Sip Ave 146 31
## 3: Bayside Park Grove St PATH 18 55
## 4: Brunswick St Grove St PATH 612 65
## 5: Central Ave Sip Ave 103 42
## 6: Christ Hospital Sip Ave 57 27
## 7: City Hall Exchange Place 74 13
## 8: Columbia Park Sip Ave 2 33
## 9: Dey St Sip Ave 105 54
## 10: Essex Light Rail Exchange Place 422 43
## 11: Exchange Place Essex Light Rail 400 25
## 12: Garfield Ave Station Grove St PATH 38 46
## 13: Grove St PATH Brunswick St 418 18
## 14: Hamilton Park Grove St PATH 562 43
## 15: Heights Elevator Central Ave 29 18
## 16: Hilltop Oakland Ave 23 11
## 17: JC Medical Center Grove St PATH 140 33
## 18: Liberty Light Rail Grove St PATH 172 27
## 19: Lincoln Park Sip Ave 200 70
## 20: McGinley Square Sip Ave 373 69
## 21: MLK Light Rail Grove St PATH 10 40
## 22: Newark Ave Grove St PATH 151 19
## 23: Newport PATH Hamilton Park 312 25
## 24: Newport Pkwy Newport Pkwy 188 22
## 25: NJCU Sip Ave 13 42
## 26: North St Sip Ave 32 19
## 27: Oakland Ave Sip Ave 205 59
## 28: Paulus Hook Exchange Place 87 13
## 29: Pershing Field Sip Ave 74 20
## 30: Riverview Park Grove St PATH 31 18
## 31: Sip Ave McGinley Square 332 21
## 32: Union St Sip Ave 15 62
## 33: Van Vorst Park Grove St PATH 340 46
## 34: Warren St Exchange Place 95 15
## 35: West Side Light Rail Sip Ave 2 18
## Start Station Name End Station Name Number of routes % Percentage
myFreqs <- data %>%
group_by(Start.Station.Name, End.Station.Name) %>%
summarise(Freq = n())
myFreqs$routes <- paste(myFreqs$Start.Station.Name,"-",myFreqs$End.Station.Name)
myFreqs <- myFreqs[order(myFreqs$Freq, decreasing = T),]
ggplot(data=myFreqs[1:5,],aes(x=routes,y=Freq)) +
geom_col(aes(fill=routes)) + coord_flip() + theme(legend.position = "none") +
labs(title = "Most popular routes", y = "Frequency", x = "Routes") +
geom_text(aes(label= Freq), size = 3, position = position_stack(vjust = 0.5))
ggplot(data[data$Trip.Duration<u_whisker, ],aes(x=Trip.Duration)) +
geom_line(aes(fill="count"), stat="bin", binwidth=50,size=1,color="Blue") +
labs(title="Plot of Trip Durations", x="Trip Duration")
## Warning: Ignoring unknown aesthetics: fill
ggplot(data, aes(x=User.Type))+ geom_bar(aes(fill=User.Type)) +
labs(title = "Number of User Types", x= "User Type")
ggplot(data, aes(x=User.Type, y=Trip.Duration, fill=User.Type)) +
stat_summary(fun.y="mean", geom="bar") +
labs(title = "Mean Trip Duration per User Type", x= "User Type", y="mean Trip Duration")
## Warning: `fun.y` is deprecated. Use `fun` instead.
boxplot(Trip.Duration~User.Type, data=data[data$Trip.Duration<u_whisker,],main = "Trip Duration per User Type ",
xlab = "User Type", ylab = "Trip Duration", col = "Green")
ggplot(data, aes(x=Gender))+ geom_bar(aes(fill=Gender)) + labs(title = "Number of Gender Types")
ggplot(data, aes(x=Gender, y=Trip.Duration, fill=Gender)) +
stat_summary(fun.y="mean", geom="bar") +
labs(title = "Mean Trip Duration per Gender", y="mean Trip Duration")
## Warning: `fun.y` is deprecated. Use `fun` instead.
ggplot(data, aes(x=Age,group=Gender,colour=Gender)) + geom_freqpoly(stat = "bin",size=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data, aes(x=Age,y=Trip.Duration ,colour=Gender)) +
stat_summary(fun.y="mean", geom="line", size=1) + facet_grid(Gender ~ .) +
labs(title = "Relationship between Mean Trip Duration and Age per Gender", y="mean Trip Duration")
## Warning: `fun.y` is deprecated. Use `fun` instead.
pop_bikes <- sort(table(data$Bike.ID),decreasing=TRUE)[1:5]
ggplot(as.data.frame(pop_bikes),aes(Var1, Freq)) + coord_flip() + geom_col(aes(fill=Var1)) +
labs(title = "Most used bikes", y="Number of times used",x="Bike ID") + theme(legend.position = "none") +
geom_text(aes(label= Freq), size = 3, position = position_stack(vjust = 0.5))
data$day <- wday(data$Start.Time,label = TRUE,abbr = FALSE,week_start = getOption("lubridate.week.start", 7), locale = "English_United States")
ggplot(data, aes(x=day, y=Trip.Duration, fill=day)) +
stat_summary(fun.y="mean", geom="bar") +
labs(title = "Mean Trip Duration per Day", y="mean Trip Duration")
## Warning: `fun.y` is deprecated. Use `fun` instead.
ggplot(data, aes(x=day,fill=day))+geom_bar() +
labs(title = "Number of trips per day")
ggplot(data, aes(x=day,fill=User.Type))+geom_bar() +
labs(title = "Number of trips per day") + facet_grid(User.Type ~ .)
ggplot(data, aes(x=hour(data$Start.Time),fill=as.factor(hour(data$Start.Time))))+geom_bar() +
labs(title = "Rush Hour", x="Hour") + facet_grid(User.Type ~ .) + theme(legend.position = "none")
chi_bb <- c(left = min(data$Start.Station.Longitude),
bottom = min(data$Start.Station.Latitude),
right = max(data$Start.Station.Longitude+0.01),
top = max(data$Start.Station.Latitude))
JC_stamen <- get_stamenmap(bbox = chi_bb,zoom = 13)
## Source : http://tile.stamen.com/terrain/13/2409/3078.png
## Source : http://tile.stamen.com/terrain/13/2410/3078.png
## Source : http://tile.stamen.com/terrain/13/2411/3078.png
## Source : http://tile.stamen.com/terrain/13/2409/3079.png
## Source : http://tile.stamen.com/terrain/13/2410/3079.png
## Source : http://tile.stamen.com/terrain/13/2411/3079.png
## Source : http://tile.stamen.com/terrain/13/2409/3080.png
## Source : http://tile.stamen.com/terrain/13/2410/3080.png
## Source : http://tile.stamen.com/terrain/13/2411/3080.png
pointstolabel <- c("Grove St PATH", "Exchange Place", "Sip Ave", "Hamilton Park", "Newport PATH")
ggmap(JC_stamen) +
stat_density_2d(data = data,
mapping = aes(x = Start.Station.Longitude,
y = Start.Station.Latitude,fill = stat(level)),
alpha = .2,
bins = 50,
geom = "polygon") + scale_fill_gradientn(colors = brewer.pal(7, "YlOrRd")) +
geom_point(data = data, color=I("red"),
mapping = aes(x = Start.Station.Longitude,
y = Start.Station.Latitude)) +
geom_text(data=subset(data, Start.Station.Name %in% pointstolabel),
aes(x= Start.Station.Longitude, y = Start.Station.Latitude,
label = Start.Station.Name, color=I("Grey20"))) +
theme(legend.position = "none") +
labs(title = "Heatmap of Start Station Popularity", y = "Latitude", x = "Longitude")
ggmap(JC_stamen) +
stat_density_2d(data = data,
mapping = aes(x = End.Station.Longitude,
y = End.Station.Latitude,fill = stat(level)),
alpha = .2,
bins = 50,
geom = "polygon") + scale_fill_gradientn(colors = brewer.pal(7, "YlOrRd")) +
geom_point(data = data, color=I("red"),
mapping = aes(x = End.Station.Longitude,
y = End.Station.Latitude)) +
geom_text(data=subset(data, End.Station.Name %in% pointstolabel),
aes(x= End.Station.Longitude, y = End.Station.Latitude,
label = End.Station.Name, color=I("Grey20"))) +
theme(legend.position = "none") +
labs(title = "Heatmap of End Station Popularity", y = "Latitude", x = "Longitude")
weather$mc <- substr(weather[,1],1,13)
data$mc <- substr(data[,as.character(Start.Time)], 1, 13)
total <- merge(data,weather, by="mc")
total <- select(total,-c(mc,timestamp))
total$temperature <- round(total$temperature)
total$description <- as.character(total$description)
total$description[total$description %in% c("thunderstorm with light rain","thunderstorm",
"proximity thunderstorm")] <- "Thunderstorm"
total$description[total$description %in% c("drizzle","light intensity drizzle",
"moderate rain","light rain")] <- "Rainy"
total$description[total$description %in% c("broken clouds","few clouds","overcast clouds",
"scattered clouds")] <- "Cloudy"
total$description[total$description %in% c("fog","haze","mist")] <- "Mist"
total$description[total$description=="sky is clear"] <- "Clear"
total$description <- as.factor(total$description)
levels(total$description)
## [1] "Clear" "Cloudy" "Mist" "Rainy" "Thunderstorm"
ggplot(total, aes(x=description))+ geom_bar(aes(fill=description)) + labs(title = "Number of routes per weather condition")
ggplot(total, aes(x=hour(total$Start.Time),fill=description))+geom_bar() +
labs(title = "Rush Hour", x="Hour")
boxplot(Trip.Duration~description, data= total[total$Trip.Duration<u_whisker, ], col = "orange", main = "Impact of weather on trip duration", xlab = "Weather conditions", ylab = "Trip Duration")
total$wind <- as.factor(total$wind)
ggplot(total, aes(x=wind))+ geom_bar(aes(fill=wind)) + labs(title = "Number of routes per wind condition") + theme(legend.position = "none")
boxplot(Trip.Duration~wind, data= total[total$Trip.Duration<u_whisker,], col = "blue", main = "Impact of wind on trip duration", xlab = "Wind", ylab = "Trip Duration")
ggplot(total, aes(x=temperature))+ geom_bar(aes(fill=as.factor(temperature))) + labs(title = "Number of routes per temperature") + theme(legend.position = "none")
ggplot(total, aes(x=temperature,y=Trip.Duration)) +
stat_summary(fun.y="mean", geom="line", color="red",size=1) +
labs(title = "Mean Trip Duration per Temperature", y="mean Trip Duration")
## Warning: `fun.y` is deprecated. Use `fun` instead.