library(ggplot2)
library(ggthemes)
library(lubridate)
library(dplyr)
library(tidyr)
library(DT)
library(scales)
# Manual fill palette (seven hex colours) handed to scale_fill_manual() in the
# grouped bar charts further down.
colorsData <- c(
  "#CC1011", "#665555", "#05a399", "#cfcaca",
  "#f5e840", "#0683c9", "#e075b0"
)
Now, we will read several csv files that contain the data from April 2014 to September 2014.
# One raw pickup file per month, April through September 2014. The files are
# read from the current working directory; the first row is the header
# (read.csv's default, spelled out here).
aprilData     <- read.csv(file = "uber-raw-data-apr14.csv", header = TRUE)
mayData       <- read.csv(file = "uber-raw-data-may14.csv", header = TRUE)
juneData      <- read.csv(file = "uber-raw-data-jun14.csv", header = TRUE)
julyData      <- read.csv(file = "uber-raw-data-jul14.csv", header = TRUE)
augustData    <- read.csv(file = "uber-raw-data-aug14.csv", header = TRUE)
septemberData <- read.csv(file = "uber-raw-data-sep14.csv", header = TRUE)
Now we will combine all of these files into a single dataframe.
# Stack the six monthly frames row-wise, in calendar order.
uberData <- do.call(
  rbind,
  list(aprilData, mayData, juneData, julyData, augustData, septemberData)
)

# Peek at the first six rows of the combined data.
head(uberData, n = 6)
## Date.Time Lat Lon Base
## 1 4/1/2014 0:11:00 40.7690 -73.9549 B02512
## 2 4/1/2014 0:17:00 40.7267 -74.0345 B02512
## 3 4/1/2014 0:21:00 40.7316 -73.9873 B02512
## 4 4/1/2014 0:28:00 40.7588 -73.9776 B02512
## 5 4/1/2014 0:33:00 40.7594 -73.9722 B02512
## 6 4/1/2014 0:33:00 40.7383 -74.0403 B02512

# Row and column counts of the combined data.
dim(uberData)
## [1] 4534327 4
Next, we will convert the Date.Time column to a proper date-time format.
# Parse the raw "month/day/year hour:minute:second" strings exactly once.
#  * The format string carries the space that separates date and time in the
#    raw data (see the head() output: "4/1/2014 0:11:00").
#  * The time zone is pinned to UTC so the parsed wall-clock values do not
#    depend on the machine's local zone or its daylight-saving transitions.
#  * The value is not passed through ymd_hms() afterwards: that re-parses the
#    character form of the POSIXct, and on recent R versions the character
#    form of a midnight timestamp can omit the time part, which then fails to
#    parse and becomes NA.
uberData$Date.Time <- as.POSIXct(
  as.character(uberData$Date.Time),
  format = "%m/%d/%Y %H:%M:%S",
  tz = "UTC"
)

# Time of day as an "HH:MM:SS" string, taken from the parsed timestamp.
uberData$Time <- format(uberData$Date.Time, format = "%H:%M:%S")
Now create factors of time objects like day, month, year etc.
# Calendar components of the timestamp, each stored as a factor column.
uberData[["day"]] <- uberData[["Date.Time"]] %>% day() %>% factor()
uberData[["month"]] <- uberData[["Date.Time"]] %>% month(label = TRUE) %>% factor()
uberData[["year"]] <- uberData[["Date.Time"]] %>% year() %>% factor()
uberData[["daysofweek"]] <- uberData[["Date.Time"]] %>% wday(label = TRUE) %>% factor()

# Clock components, extracted from the "HH:MM:SS" Time string via hms().
uberData[["hour"]] <- uberData[["Time"]] %>% hms() %>% hour() %>% factor()
uberData[["minute"]] <- uberData[["Time"]] %>% hms() %>% minute() %>% factor()
uberData[["second"]] <- uberData[["Time"]] %>% hms() %>% second() %>% factor()
In the next step, we will use dplyr to aggregate our data and the ggplot function to plot the number of trips made during each hour of the day. The resulting visualizations show how the number of trips varies throughout the day.
# Work on a separate name from here on.
dataUber <- uberData

# Number of trips per hour of day: one row per hour, count in `Total`.
hourData <- dplyr::summarise(
  dplyr::group_by(dataUber, hour),
  Total = dplyr::n()
)
head(hourData, n = 10)
## # A tibble: 10 x 2
## hour Total
## <fct> <int>
## 1 0 103836
## 2 1 67227
## 3 2 45865
## 4 3 48287
## 5 4 55230
## 6 5 83939
## 7 6 143213
## 8 7 193094
## 9 8 190504
## 10 9 159967
# Bar chart: total trips for each hour of the day.
ggplot(hourData, aes(hour, Total)) +
  geom_bar(stat = "identity", fill = "orange", color = "black") +
  ggtitle("Trips Every Hour") +
  # theme_light() is a complete theme and replaces any theme() settings added
  # before it, so the legend tweak has to follow it to take effect.
  theme_light() +
  theme(legend.position = "none") +
  scale_y_continuous(labels = comma) +
  xlab("Hour") +
  ylab("Total Trips")
# Trip counts for every month/hour combination.
monthHour <- dplyr::summarise(
  dplyr::group_by(dataUber, month, hour),
  Total = dplyr::n()
)

# Stacked bars: hourly totals, with each bar split by month.
ggplot(data = monthHour, mapping = aes(x = hour, y = Total, fill = month)) +
  geom_col() +
  labs(title = "Trips By Hour And Month", x = "Hour") +
  theme_light() +
  scale_y_continuous(labels = scales::comma)
From the resulting visualization, we observe that the 30th of the month had the highest number of trips in the year, to which the month of April contributed the most.
# Number of trips per day of the month (all months pooled).
dayGroup <- dplyr::summarise(
  dplyr::group_by(dataUber, day),
  Total = dplyr::n()
)
head(dayGroup, n = 10)
## # A tibble: 10 x 2
## day Total
## <fct> <int>
## 1 1 127430
## 2 2 143201
## 3 3 142983
## 4 4 140923
## 5 5 147054
## 6 6 139886
## 7 7 143503
## 8 8 145984
## 9 9 155135
## 10 10 152500
# Bar chart: total trips for each day of the month.
ggplot(dayGroup, aes(day, Total)) +
  geom_bar(stat = "identity", fill = "salmon", color = "white") +
  ggtitle("Trips Every Day") +
  scale_y_continuous(labels = comma) +
  # theme_light() is a complete theme and replaces any theme() settings added
  # before it, so the legend tweak has to follow it to take effect.
  theme_light() +
  theme(legend.position = "none") +
  xlab("Day")
# Trip counts per calendar day: one row per month/day pair, tagged with its
# weekday. Kept at this granularity because the month-by-day heat map reads it.
daymg <- dataUber %>%
  group_by(month, day, daysofweek) %>%
  dplyr::summarize(Total = n())

# Grouped bars: trips per weekday within each month.
# daymg holds several rows for every month/weekday pair (one per calendar
# day). Dodged identity bars for those rows would be drawn on top of each
# other and show only the tallest day, so the rows are summed first.
daymg %>%
  group_by(month, daysofweek) %>%
  dplyr::summarize(Total = sum(Total)) %>%
  ggplot(aes(month, Total, fill = daysofweek)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Trips By Day And Month") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = colorsData) +
  xlab("Month") +
  theme_light()
We will now visualize the number of trips taking place in each month of the year. In the output visualization, we observe that most trips were made during the month of September. Furthermore, we also obtain visual reports of the number of trips made on each day of the week.
# Number of trips per month.
monthGroup <- dplyr::summarise(
  dplyr::group_by(dataUber, month),
  Total = dplyr::n()
)
head(monthGroup)
## # A tibble: 6 x 2
## month Total
## <ord> <int>
## 1 Apr 564516
## 2 May 652435
## 3 Jun 663844
## 4 Jul 796121
## 5 Aug 829275
## 6 Sep 1028136
# Bar chart: total trips per month, one fill colour per month.
ggplot(monthGroup, aes(month, Total, fill = month)) +
  geom_bar(stat = "identity") +
  ggtitle("Trips By Month") +
  scale_y_continuous(labels = comma) +
  # The fill legend only repeats the x axis, so it is hidden. theme_light() is
  # a complete theme and would reset legend.position if added afterwards,
  # hence the order of these two calls.
  theme_light() +
  theme(legend.position = "none") +
  xlab("Month")
# Trip counts for every month/weekday combination.
monthWeekday <- dplyr::summarise(
  dplyr::group_by(dataUber, month, daysofweek),
  Total = dplyr::n()
)

# Grouped bars: one cluster per month, one bar per weekday.
ggplot(data = monthWeekday, mapping = aes(x = month, y = Total, fill = daysofweek)) +
  geom_col(position = "dodge") +
  labs(title = "Trips By Day And Month", x = "Month") +
  scale_fill_manual(values = colorsData) +
  scale_y_continuous(labels = scales::comma) +
  theme_light()
In the following visualizations, we plot the number of trips taken by passengers from each of the bases. There are five bases in all, of which B02617 had the highest number of trips. Furthermore, the breakdown by base and month shows in which month this base recorded its highest number of trips. Thursday saw the highest number of trips in three of the bases: B02598, B02617, and B02682.
# Bar chart: number of rows (trips) recorded for each base.
ggplot(data = dataUber, mapping = aes(x = Base)) +
  geom_bar(fill = "salmon") +
  labs(title = "Trips By Bases", y = "Count") +
  scale_y_continuous(labels = scales::comma) +
  theme_light()
# Grouped bars: trips per base, one bar per month within each base.
ggplot(data = dataUber, mapping = aes(x = Base, fill = month)) +
  geom_bar(position = "dodge") +
  labs(title = "Trips By Bases And Month", y = "Count") +
  scale_fill_manual(values = colorsData) +
  scale_y_continuous(labels = scales::comma) +
  theme_light()
# Grouped bars: trips per base, one bar per weekday within each base.
ggplot(data = dataUber, mapping = aes(x = Base, fill = daysofweek)) +
  geom_bar(position = "dodge") +
  labs(title = "Trips By Bases And Day Of Week", y = "Count") +
  scale_fill_manual(values = colorsData) +
  scale_y_continuous(labels = scales::comma) +
  theme_light()
# Trip counts for every day-of-month/hour combination.
dayHour <- dplyr::summarise(
  dplyr::group_by(dataUber, day, hour),
  Total = dplyr::n()
)

# Heat map: day of month on x, hour on y, tile colour = trip count.
ggplot(data = dayHour, mapping = aes(x = day, y = hour, fill = Total)) +
  geom_tile(color = "White") +
  labs(title = "Heat Map By Hour And Day")
# Heat map: day of month on x, month on y, tile colour = trip count.
ggplot(data = daymg, mapping = aes(x = day, y = month, fill = Total)) +
  geom_tile(color = "white") +
  labs(title = "Heat Map By Month And Day")
# Heat map: weekday on x, month on y, tile colour = trip count.
ggplot(data = monthWeekday, mapping = aes(x = daysofweek, y = month, fill = Total)) +
  geom_tile(color = "white") +
  labs(title = "Heat Map By Month And Day Of Week")
# Trip counts for every base/month combination.
monthBase <- dplyr::summarise(
  dplyr::group_by(dataUber, Base, month),
  Total = dplyr::n()
)

# Heat map: base on x, month on y, tile colour = trip count.
ggplot(data = monthBase, mapping = aes(x = Base, y = month, fill = Total)) +
  geom_tile(color = "white") +
  labs(title = "Heat Map By Month And Bases", y = "Month") +
  theme_light()
# Trip counts for every base/weekday combination.
daysBases <- dplyr::summarise(
  dplyr::group_by(dataUber, Base, daysofweek),
  Total = dplyr::n()
)

# Heat map: base on x, weekday on y, tile colour = trip count.
ggplot(data = daysBases, mapping = aes(x = Base, y = daysofweek, fill = Total)) +
  geom_tile(color = "white") +
  labs(title = "Heat Map By Day Of Week And Bases", y = "Day Of Week")
Now we will visualize the rides in New York City by creating a geo-plot of the rides made during 2014 (April – September), first for all rides and then coloured by base over the same period.
# Axis limits for the map plots below, in decimal degrees.
# Longitude (x axis):
minLong <- -74.15
maxLong <- -73.7004
# Latitude (y axis):
minLat <- 40.5774
maxLat <- 40.9176
# Keep only the rows that have no missing value in any column.
uber <- stats::na.omit(dataUber)

# Scatter of pickup coordinates, restricted to the limits defined above.
ggplot(data = uber, mapping = aes(x = Lon, y = Lat)) +
  geom_point(size = 1, color = "blue") +
  scale_x_continuous(limits = c(minLong, maxLong)) +
  scale_y_continuous(limits = c(minLat, maxLat)) +
  theme_map() +
  labs(title = "New York Map Based On Uber Rides During 2014 From April To September")
# Same scatter of pickup coordinates, coloured by dispatching base. This one
# reads dataUber directly rather than the NA-filtered `uber`.
ggplot(data = dataUber, mapping = aes(x = Lon, y = Lat, color = Base)) +
  geom_point(size = 1) +
  scale_x_continuous(limits = c(minLong, maxLong)) +
  scale_y_continuous(limits = c(minLat, maxLat)) +
  theme_map() +
  labs(title = "New York Map Based On Uber Rides During 2014 From April To September By Base")