Uber Data Analysis
Let’s load some data.
#Importing Library
library(ggplot2)
library(ggthemes)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(DT)
library(scales)
# Seven hex colours used as the manual fill palette in the grouped charts.
colorsData <- c(
  "#CC1011", "#665555", "#05a399", "#cfcaca",
  "#f5e840", "#0683c9", "#e075b0"
)
# Reading Data
# Each path is built from a single directory variable instead of calling
# setwd(), so the script does not change the session's working directory as a
# side effect. Point dataDir at the folder holding the six monthly CSV files.
dataDir <- "/Volumes/gyan1 /uber data"
aprilData <- read.csv(file.path(dataDir, "uber-raw-data-apr14.csv"))
mayData <- read.csv(file.path(dataDir, "uber-raw-data-may14.csv"))
juneData <- read.csv(file.path(dataDir, "uber-raw-data-jun14.csv"))
julyData <- read.csv(file.path(dataDir, "uber-raw-data-jul14.csv"))
augustData <- read.csv(file.path(dataDir, "uber-raw-data-aug14.csv"))
septemberData <- read.csv(file.path(dataDir, "uber-raw-data-sep14.csv"))
# Combine All Files in one Dataset
# Rows are stacked in calendar order, April through September.
uberData <- rbind(
  aprilData, mayData, juneData,
  julyData, augustData, septemberData
)
head(uberData)
## Date.Time Lat Lon Base
## 1 4/1/2014 0:11:00 40.7690 -73.9549 B02512
## 2 4/1/2014 0:17:00 40.7267 -74.0345 B02512
## 3 4/1/2014 0:21:00 40.7316 -73.9873 B02512
## 4 4/1/2014 0:28:00 40.7588 -73.9776 B02512
## 5 4/1/2014 0:33:00 40.7594 -73.9722 B02512
## 6 4/1/2014 0:33:00 40.7383 -74.0403 B02512
dim(uberData)
## [1] 4534327 4
# Formatting Of Date and Time Column
# Parse the raw "month/day/year hour:minute:second" text exactly once. The
# format string includes the space that separates date and time in the data,
# and the time zone is pinned to UTC so the stored wall-clock values do not
# depend on the machine's local zone and never have to be converted back to
# text and parsed a second time.
uberData$Date.Time <- as.POSIXct(
  uberData$Date.Time,
  format = "%m/%d/%Y %H:%M:%S",
  tz = "UTC"
)
# Clock time as "HH:MM:SS" text, kept as its own column.
uberData$Time <- format(uberData$Date.Time, format = "%H:%M:%S")
# Create Factors
# Every component is read straight from the parsed timestamp, rather than
# re-parsing the Time strings separately for hour, minute and second.
uberData$day <- factor(day(uberData$Date.Time))
uberData$month <- factor(month(uberData$Date.Time, label = TRUE))
uberData$year <- factor(year(uberData$Date.Time))
uberData$daysofweek <- factor(wday(uberData$Date.Time, label = TRUE))
uberData$hour <- factor(hour(uberData$Date.Time))
uberData$minute <- factor(minute(uberData$Date.Time))
uberData$second <- factor(second(uberData$Date.Time))
# Plotting the trips by the hours in a day
# dataUber is the working copy that the summaries and plots read from.
dataUber <- uberData
# One row per hour of the day, with the number of trips in that hour.
hourData <- dataUber %>%
  group_by(hour) %>%
  dplyr::summarize(Total = n())
## `summarise()` ungrouping output (override with `.groups` argument)
head(hourData, 10)
## # A tibble: 10 x 2
## hour Total
## <fct> <int>
## 1 0 103836
## 2 1 67227
## 3 2 45865
## 4 3 48287
## 5 4 55230
## 6 5 83939
## 7 6 143213
## 8 7 193094
## 9 8 190504
## 10 9 159967
# Trips Every Hour
# theme_light() is a complete theme and replaces theme() settings added before
# it, so the legend setting is applied after it.
ggplot(hourData, aes(hour, Total)) +
  geom_bar(stat = "identity", fill = "red", color = "yellow") +
  ggtitle("Trips Every Hour") +
  theme_light() +
  theme(legend.position = "none") +
  scale_y_continuous(labels = comma) +
  xlab("Hour") +
  ylab("Total Trips")
# Trips per hour of day, with each bar stacked by month.
monthHour <- dataUber %>%
  dplyr::group_by(month, hour) %>%
  dplyr::summarise(Total = dplyr::n())
## `summarise()` regrouping output by 'month' (override with `.groups` argument)
ggplot(data = monthHour, mapping = aes(x = hour, y = Total, fill = month)) +
  geom_col() +
  labs(title = "Trips By Hour And Month", x = "Hour") +
  scale_y_continuous(labels = scales::comma) +
  theme_light()
# Plotting Data By Trips During Every Day of the Month
# One row per day-of-month number, counting trips across all loaded months.
dayGroup <- dataUber %>%
  group_by(day) %>%
  dplyr::summarize(Total = n())
## `summarise()` ungrouping output (override with `.groups` argument)
head(dayGroup, 10)
## # A tibble: 10 x 2
## day Total
## <fct> <int>
## 1 1 127430
## 2 2 143201
## 3 3 142983
## 4 4 140923
## 5 5 147054
## 6 6 139886
## 7 7 143503
## 8 8 145984
## 9 9 155135
## 10 10 152500
# theme_light() is a complete theme and replaces theme() settings added before
# it, so the legend setting is applied after it.
ggplot(dayGroup, aes(day, Total)) +
  geom_bar(stat = "identity", fill = "purple", color = "white") +
  ggtitle("Trips Every Day") +
  scale_y_continuous(labels = comma) +
  theme_light() +
  theme(legend.position = "none") +
  xlab("Day")
# Trip counts for every month / day-of-month / weekday combination.
daymg <- dataUber %>%
  dplyr::group_by(month, day, daysofweek) %>%
  dplyr::summarise(Total = dplyr::n())
## `summarise()` regrouping output by 'month', 'day' (override with `.groups` argument)
# NOTE(review): daymg has one row per calendar day, so several rows share each
# month/weekday slot and the dodged bars are drawn on top of one another.
# Confirm whether a per-month/weekday total was intended here.
ggplot(data = daymg, mapping = aes(x = month, y = Total, fill = daysofweek)) +
  geom_col(position = "dodge") +
  labs(title = "Trips By Day And Month", x = "Month") +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = colorsData) +
  theme_light()
# Number of Trips Taking place During months in A Year
# One row per month with its total trip count.
monthGroup <- dataUber %>%
  group_by(month) %>%
  dplyr::summarize(Total = n())
## `summarise()` ungrouping output (override with `.groups` argument)
head(monthGroup)
## # A tibble: 6 x 2
## month Total
## <ord> <int>
## 1 Apr 564516
## 2 May 652435
## 3 Jun 663844
## 4 Jul 796121
## 5 Aug 829275
## 6 Sep 1028136
# The bars are already labelled on the x axis, so the fill legend is hidden.
# theme_light() is a complete theme that replaces theme() settings added before
# it, so legend.position must be set after it for the legend to be removed.
ggplot(monthGroup, aes(month, Total, fill = month)) +
  geom_bar(stat = "identity") +
  ggtitle("Trips By Month") +
  scale_y_continuous(labels = comma) +
  theme_light() +
  theme(legend.position = "none") +
  xlab("Month")
# Trip totals for each month / weekday pair, drawn as side-by-side bars.
monthWeekday <- dataUber %>%
  dplyr::group_by(month, daysofweek) %>%
  dplyr::summarise(Total = dplyr::n())
## `summarise()` regrouping output by 'month' (override with `.groups` argument)
ggplot(
  data = monthWeekday,
  mapping = aes(x = month, y = Total, fill = daysofweek)
) +
  geom_col(position = "dodge") +
  labs(title = "Trips By Day And Month", x = "Month") +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = colorsData) +
  theme_light()
# Trip counts per dispatching Base: overall, then split by month and by weekday.
# geom_bar() counts the rows itself, so the raw trip table is plotted directly.

# Overall count per Base.
ggplot(data = dataUber, mapping = aes(x = Base)) +
  geom_bar(fill = "salmon") +
  labs(title = "Trips By Bases", y = "Count") +
  scale_y_continuous(labels = scales::comma) +
  theme_light()

# Count per Base, one dodged bar per month.
ggplot(data = dataUber, mapping = aes(x = Base, fill = month)) +
  geom_bar(position = "dodge") +
  labs(title = "Trips By Bases And Month", y = "Count") +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = colorsData) +
  theme_light()

# Count per Base, one dodged bar per day of the week.
ggplot(data = dataUber, mapping = aes(x = Base, fill = daysofweek)) +
  geom_bar(position = "dodge") +
  labs(title = "Trips By Bases And Day Of Week", y = "Count") +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = colorsData) +
  theme_light()
# Heat maps of trip volume across day, hour and month.
# ..... Day and Hour .....
# One tile per day-of-month / hour pair, shaded by the number of trips.
dayHour <- dataUber %>%
  dplyr::group_by(day, hour) %>%
  dplyr::summarise(Total = dplyr::n())
## `summarise()` regrouping output by 'day' (override with `.groups` argument)
ggplot(data = dayHour, mapping = aes(x = day, y = hour, fill = Total)) +
  geom_tile(color = "White") +
  labs(title = "Heat Map By Hour And Day")
# ..... Days and Month .....
# Reuses daymg (trip counts per month / day / weekday): one tile per
# day-of-month within each month.
ggplot(data = daymg, mapping = aes(x = day, y = month, fill = Total)) +
  geom_tile(color = "white") +
  labs(title = "Heat Map By Month And Day")
# ..... Month and Week Days .....
# Reuses monthWeekday: one tile per weekday within each month.
ggplot(
  data = monthWeekday,
  mapping = aes(x = daysofweek, y = month, fill = Total)
) +
  geom_tile(color = "white") +
  labs(title = "Heat Map By Month And Day Of Week")
# ..... Month and Bases .....
# Trip totals for each Base / month pair, shown as one tile per pair.
monthBase <- dataUber %>%
  dplyr::group_by(Base, month) %>%
  dplyr::summarise(Total = dplyr::n())
## `summarise()` regrouping output by 'Base' (override with `.groups` argument)
ggplot(data = monthBase, mapping = aes(x = Base, y = month, fill = Total)) +
  geom_tile(color = "white") +
  labs(title = "Heat Map By Month And Bases", y = "Month") +
  theme_light()
# ..... Days Of Week and Bases .....
# Trip totals for each Base / weekday pair, shown as one tile per pair.
daysBases <- dataUber %>%
  dplyr::group_by(Base, daysofweek) %>%
  dplyr::summarise(Total = dplyr::n())
## `summarise()` regrouping output by 'Base' (override with `.groups` argument)
ggplot(
  data = daysBases,
  mapping = aes(x = Base, y = daysofweek, fill = Total)
) +
  geom_tile(color = "white") +
  labs(title = "Heat Map By Day Of Week And Bases", y = "Day Of Week")
# Creating A Map Visualization of Rides in NEW YORK
# Latitude / longitude bounding box used to crop both maps.
minLat <- 40.5774
maxLat <- 40.9176
minLong <- -74.15
maxLong <- -73.7004
# Drop incomplete rows, then keep only the points inside the bounding box.
# Filtering explicitly means both maps draw from the same cleaned data, and
# ggplot no longer has to discard out-of-range points with a warning (the
# recorded run reported 71701 such rows removed for each map).
uber <- na.omit(dataUber)
uberInBox <- uber %>%
  dplyr::filter(
    dplyr::between(Lon, minLong, maxLong),
    dplyr::between(Lat, minLat, maxLat)
  )
# All rides in a single colour. The scale limits are kept so the axes always
# span the full bounding box.
ggplot(uberInBox, aes(Lon, Lat)) +
  geom_point(size = 1, color = "blue") +
  scale_x_continuous(limits = c(minLong, maxLong)) +
  scale_y_continuous(limits = c(minLat, maxLat)) +
  theme_map() +
  ggtitle("New York Map Based On Uber Rides During 2014 From April To September")
# Same map with each point coloured by its Base.
ggplot(uberInBox, aes(x = Lon, y = Lat, color = Base)) +
  geom_point(size = 1) +
  scale_x_continuous(limits = c(minLong, maxLong)) +
  scale_y_continuous(limits = c(minLat, maxLat)) +
  theme_map() +
  ggtitle("New York Map Based On Uber Rides During 2014 From April To September By Base")