Import Libraries

library(ggplot2)
library(ggthemes)
library(lubridate)
library(dplyr)
library(tidyr)
library(DT)
library(scales)

Create Vector Of Colors To Be Implemented In Our Plots

# Seven-colour palette handed to scale_fill_manual() by the plots that fill
# bars by month or by day of week.
colorsData <- c(
  "#CC1011", "#665555", "#05a399", "#cfcaca",
  "#f5e840", "#0683c9", "#e075b0"
)

Reading Data

Now, we will read several csv files that contain the data from April 2014 to September 2014.

# Load the six monthly raw CSV exports (April to September 2014), one data
# frame per month. The file names are relative, so they are resolved against
# the current working directory.
aprilData <- read.csv(file = "uber-raw-data-apr14.csv")
mayData <- read.csv(file = "uber-raw-data-may14.csv")
juneData <- read.csv(file = "uber-raw-data-jun14.csv")
julyData <- read.csv(file = "uber-raw-data-jul14.csv")
augustData <- read.csv(file = "uber-raw-data-aug14.csv")
septemberData <- read.csv(file = "uber-raw-data-sep14.csv")

Combine All Files In One Dataset

Now we will combine all of these files into a single dataframe.

# Stack the six monthly data frames row-wise, in calendar order, into a single
# data frame, then inspect its first rows and its dimensions.
uberData <- rbind(
  aprilData, mayData, juneData,
  julyData, augustData, septemberData
)
head(uberData, n = 6L)
##          Date.Time     Lat      Lon   Base
## 1 4/1/2014 0:11:00 40.7690 -73.9549 B02512
## 2 4/1/2014 0:17:00 40.7267 -74.0345 B02512
## 3 4/1/2014 0:21:00 40.7316 -73.9873 B02512
## 4 4/1/2014 0:28:00 40.7588 -73.9776 B02512
## 5 4/1/2014 0:33:00 40.7594 -73.9722 B02512
## 6 4/1/2014 0:33:00 40.7383 -74.0403 B02512
dim(uberData)
## [1] 4534327       4

Formatting Of Date And Time Column

We will now convert the Date.Time column into a proper date-time format.

# Parse the raw "month/day/year hour:minute:second" strings exactly once.
#  * The format string contains the space that separates the date from the
#    time in the data (see the head() output: "4/1/2014 0:11:00").
#  * The time zone is pinned to UTC so the result does not depend on the
#    machine's local zone; a local daylight-saving gap could otherwise turn
#    valid timestamps into NA. No zone conversion is applied: the wall-clock
#    values are kept exactly as written in the files.
# The result is a POSIXct vector in UTC, so no second parsing pass is needed.
uberData$Date.Time <- as.POSIXct(
  uberData$Date.Time,
  format = "%m/%d/%Y %H:%M:%S",
  tz = "UTC"
)

# Time of day as an "HH:MM:SS" string, derived from the parsed timestamp.
uberData$Time <- format(uberData$Date.Time, format = "%H:%M:%S")

Create Factors

Now create factors of time objects like day, month, year etc.

# Calendar components, taken from the parsed timestamp and stored as factors
# so that the plots treat them as discrete categories.
uberData$day <- factor(day(uberData$Date.Time))
uberData$month <- factor(month(uberData$Date.Time, label = TRUE))
uberData$year <- factor(year(uberData$Date.Time))
uberData$daysofweek <- factor(wday(uberData$Date.Time, label = TRUE))

# Clock components. The "HH:MM:SS" strings are parsed into a lubridate Period
# once and reused, instead of re-parsing the whole column for each component.
timeOfDay <- hms(uberData$Time)
uberData$hour <- factor(hour(timeOfDay))
uberData$minute <- factor(minute(timeOfDay))
uberData$second <- factor(second(timeOfDay))

Plotting the trips by the hours in a day

In the next step we will use the ggplot function to plot the number of trips that the passengers had made in a day. We will also use dplyr to aggregate our data. In the resulting visualizations, we can understand how the number of passengers fares throughout the day.

# Keep the prepared data under the name used by the rest of the analysis,
# then count the number of trips for each hour of the day.
dataUber <- uberData
hourData <- dataUber %>%
  group_by(hour) %>%
  dplyr::summarise(Total = n())
head(hourData, n = 10)
## # A tibble: 10 x 2
##    hour   Total
##    <fct>  <int>
##  1 0     103836
##  2 1      67227
##  3 2      45865
##  4 3      48287
##  5 4      55230
##  6 5      83939
##  7 6     143213
##  8 7     193094
##  9 8     190504
## 10 9     159967

Trips Every Hour

# Bar chart of the total number of trips for each hour of the day.
# theme_light() is a complete theme and replaces any theme() settings added
# before it, so it is applied first and the legend setting comes afterwards.
ggplot(hourData, aes(x = hour, y = Total)) +
  geom_bar(stat = "identity", fill = "orange", color = "black") +
  ggtitle("Trips Every Hour") +
  xlab("Hour") +
  ylab("Total Trips") +
  scale_y_continuous(labels = comma) +
  theme_light() +
  theme(legend.position = "none")

Trips By Hour And Month

# Count trips for every month/hour pair, then draw one bar per hour with the
# months stacked inside it.
monthHour <- dataUber %>%
  group_by(month, hour) %>%
  dplyr::summarise(Total = n())
ggplot(monthHour, aes(x = hour, y = Total, fill = month)) +
  geom_col() +
  labs(title = "Trips By Hour And Month", x = "Hour") +
  scale_y_continuous(labels = comma) +
  theme_light()

Plotting Data By Trips During Every Day Of The Month

We observe from the resulting visualization that the 30th of the month had the highest number of trips in the year, which is mostly contributed by the month of April.

# Count the number of trips for each day of the month (all months pooled).
dayGroup <- dataUber %>%
  group_by(day) %>%
  dplyr::summarise(Total = n())
head(dayGroup, n = 10)
## # A tibble: 10 x 2
##    day    Total
##    <fct>  <int>
##  1 1     127430
##  2 2     143201
##  3 3     142983
##  4 4     140923
##  5 5     147054
##  6 6     139886
##  7 7     143503
##  8 8     145984
##  9 9     155135
## 10 10    152500
# Bar chart of the total number of trips for each day of the month.
# theme_light() is a complete theme and replaces any theme() settings added
# before it, so it is applied first and the legend setting comes afterwards.
ggplot(dayGroup, aes(x = day, y = Total)) +
  geom_bar(stat = "identity", fill = "salmon", color = "white") +
  ggtitle("Trips Every Day") +
  xlab("Day") +
  scale_y_continuous(labels = comma) +
  theme_light() +
  theme(legend.position = "none")

Trips By Day And Month

# Count trips for every month / day-of-month / day-of-week combination.
daymg <- dataUber %>%
  group_by(month, day, daysofweek) %>%
  dplyr::summarise(Total = n())

# Dodged bars per month, coloured by day of week.
# NOTE(review): daymg holds one row per calendar day, while only month and
# daysofweek are mapped here, so several rows share each bar slot; confirm
# whether per-day values or per-weekday totals are intended.
ggplot(daymg, aes(x = month, y = Total, fill = daysofweek)) +
  geom_col(position = "dodge") +
  labs(title = "Trips By Day And Month", x = "Month") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = colorsData) +
  theme_light()

Number Of Trips Taking Place During Months In A Year

We will visualize the number of trips that are taking place each month of the year. In the output visualization, we observe that most trips were made during the month of September. Furthermore, we also obtain visual reports of the number of trips that were made on every day of the week.

# Count the number of trips in each month.
monthGroup <- dataUber %>%
  group_by(month) %>%
  dplyr::summarise(Total = n())
head(monthGroup, n = 6L)
## # A tibble: 6 x 2
##   month   Total
##   <ord>   <int>
## 1 Apr    564516
## 2 May    652435
## 3 Jun    663844
## 4 Jul    796121
## 5 Aug    829275
## 6 Sep   1028136
# Bar chart of total trips per month, one fill colour per month.
# The fill legend would only repeat the x axis, so it is switched off.
# theme_light() is a complete theme and replaces any theme() settings added
# before it; it therefore has to come first, otherwise the legend reappears.
ggplot(monthGroup, aes(x = month, y = Total, fill = month)) +
  geom_bar(stat = "identity") +
  ggtitle("Trips By Month") +
  xlab("Month") +
  scale_y_continuous(labels = comma) +
  theme_light() +
  theme(legend.position = "none")

# Count trips for every month / day-of-week pair.
monthWeekday <- dataUber %>%
  group_by(month, daysofweek) %>%
  summarise(Total = n())

# Dodged bars per month, one bar per day of the week.
ggplot(monthWeekday, aes(x = month, y = Total, fill = daysofweek)) +
  geom_col(position = "dodge") +
  labs(title = "Trips By Day And Month", x = "Month") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = colorsData) +
  theme_light()

Number Of Trips By Bases

In the following visualizations, we plot the number of trips that have been taken by the passengers from each of the bases. There are five bases in all, out of which we observe that B02617 had the highest number of trips. Furthermore, the plot by month shows the month in which this base recorded its highest number of trips. Thursday saw the highest number of trips for three of the bases – B02598, B02617, B02682.

# Number of trips per base; geom_bar() counts the rows for each Base value.
ggplot(dataUber, aes(x = Base)) +
  geom_bar(fill = "salmon") +
  labs(title = "Trips By Bases", y = "Count") +
  scale_y_continuous(labels = comma) +
  theme_light()

# Number of trips per base, split into side-by-side bars for each month.
ggplot(dataUber, aes(x = Base, fill = month)) +
  geom_bar(position = "dodge") +
  labs(title = "Trips By Bases And Month", y = "Count") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = colorsData) +
  theme_light()

# Number of trips per base, split into side-by-side bars for each weekday.
ggplot(dataUber, aes(x = Base, fill = daysofweek)) +
  geom_bar(position = "dodge") +
  labs(title = "Trips By Bases And Day Of Week", y = "Count") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = colorsData) +
  theme_light()

Creating A Heatmap Visualization Of Day, Hour And Month

# Count trips for every day-of-month / hour pair.
dayHour <- dataUber %>%
  group_by(day, hour) %>%
  summarise(Total = n())

# Heat map: day of month on x, hour on y, tile colour = number of trips.
ggplot(dayHour, aes(x = day, y = hour, fill = Total)) +
  geom_tile(color = "White") +
  labs(title = "Heat Map By Hour And Day")

# Heat map: day of month on x, month on y, tile colour = number of trips.
ggplot(daymg, aes(x = day, y = month, fill = Total)) +
  geom_tile(color = "white") +
  labs(title = "Heat Map By Month And Day")

# Heat map: day of week on x, month on y, tile colour = number of trips.
ggplot(monthWeekday, aes(x = daysofweek, y = month, fill = Total)) +
  geom_tile(color = "white") +
  labs(title = "Heat Map By Month And Day Of Week")

# Count trips for every base / month pair.
monthBase <- dataUber %>%
  group_by(Base, month) %>%
  summarise(Total = n())

# Heat map: base on x, month on y, tile colour = number of trips.
ggplot(monthBase, aes(x = Base, y = month, fill = Total)) +
  geom_tile(color = "white") +
  labs(title = "Heat Map By Month And Bases", y = "Month") +
  theme_light()

# Count trips for every base / day-of-week pair.
daysBases <- dataUber %>%
  group_by(Base, daysofweek) %>%
  summarise(Total = n())

# Heat map: base on x, day of week on y, tile colour = number of trips.
ggplot(daysBases, aes(x = Base, y = daysofweek, fill = Total)) +
  geom_tile(color = "white") +
  labs(title = "Heat Map By Day Of Week And Bases", y = "Day Of Week")

Creating A Map Visualization Of Rides In New York

Now we will visualize the rides in New York City by creating a geo-plot that will help us to visualize the rides during 2014 (Apr – Sep) and by the bases in the same period.

# Latitude/longitude window used as axis limits for the map plots.
minLat <- 40.5774
maxLat <- 40.9176
minLong <- -74.15
maxLong <- -73.7004

# Drop every row that has a missing value in any column, then plot each
# remaining pickup as a blue point inside the window defined above.
uber <- na.omit(dataUber)
ggplot(uber, aes(x = Lon, y = Lat)) +
  geom_point(size = 1, color = "blue") +
  xlim(minLong, maxLong) +
  ylim(minLat, maxLat) +
  labs(
    title = "New York Map Based On Uber Rides During 2014 From April To September"
  ) +
  theme_map()

# Same map window as above, drawn from the full data set with one point
# colour per base.
ggplot(dataUber, aes(x = Lon, y = Lat, color = Base)) +
  geom_point(size = 1) +
  xlim(minLong, maxLong) +
  ylim(minLat, maxLat) +
  labs(
    title = "New York Map Based On Uber Rides During 2014 From April To September By Base"
  ) +
  theme_map()