Import the necessary libraries
library(ggplot2)
library(ggthemes)
library(lubridate)
## Loading required package: timechange
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::intersect() masks base::intersect()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lubridate::setdiff() masks base::setdiff()
## ✖ lubridate::union() masks base::union()
library(DT)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
Creating a vector of colors for the plots
# Seven-colour palette passed to scale_fill_manual() in the plots below.
colors <- c(
  "#030303", "#BF3EFF", "#05a399", "#cfcaca",
  "#B22222", "#0683c9", "#f83599"
)
Read the data for each month (April to September 2014)
library(readr)
# Load the six monthly pickup files, one data frame per month.
# The paths are hard-coded to a local Downloads folder.
apr  <- read.csv(file = "~/Downloads/Uber-dataset/uber-raw-data-apr14.csv")
may  <- read.csv(file = "~/Downloads/Uber-dataset/uber-raw-data-may14.csv")
jun  <- read.csv(file = "~/Downloads/Uber-dataset/uber-raw-data-jun14.csv")
jul  <- read.csv(file = "~/Downloads/Uber-dataset/uber-raw-data-jul14.csv")
aug  <- read.csv(file = "~/Downloads/Uber-dataset/uber-raw-data-aug14.csv")
sept <- read.csv(file = "~/Downloads/Uber-dataset/uber-raw-data-sep14.csv")
Combine all the files into a single dataframe or dataset
# Stack the monthly data frames row-wise, April first and September last.
uber_data <- do.call(rbind, list(apr, may, jun, jul, aug, sept))
Return the first 10 rows of the dataset, uber_data
# Preview the first ten rows of the combined data.
head(uber_data, n = 10)
## Date.Time Lat Lon Base
## 1 4/1/2014 0:11:00 40.7690 -73.9549 B02512
## 2 4/1/2014 0:17:00 40.7267 -74.0345 B02512
## 3 4/1/2014 0:21:00 40.7316 -73.9873 B02512
## 4 4/1/2014 0:28:00 40.7588 -73.9776 B02512
## 5 4/1/2014 0:33:00 40.7594 -73.9722 B02512
## 6 4/1/2014 0:33:00 40.7383 -74.0403 B02512
## 7 4/1/2014 0:39:00 40.7223 -73.9887 B02512
## 8 4/1/2014 0:45:00 40.7620 -73.9790 B02512
## 9 4/1/2014 0:55:00 40.7524 -73.9960 B02512
## 10 4/1/2014 1:01:00 40.7575 -73.9846 B02512
What are the dimensions of the dataset?
# Number of rows and columns in the combined data frame.
dim(uber_data)
## [1] 4534327 4
What is the class of the date/time?
# Inspect the storage class of the Date.Time column before parsing it.
class(uber_data[["Date.Time"]])
## [1] "character"
Parse the Date.Time strings into a date-time object and keep the clock time in a separate Time column.
library(lubridate)
# Parse the raw "month/day/year hour:minute:second" strings in a single step.
# mdy_hms() returns POSIXct in UTC, so the result does not depend on the
# session's local time zone (a local-zone parse yields NA for wall-clock times
# skipped by daylight saving) and avoids re-parsing an already parsed value
# through a character round-trip.
# NOTE(review): newer R versions may render midnight POSIXct values without a
# time part in as.character(), which the old ymd_hms() re-parse relied on;
# confirm against the R NEWS if the previous form is ever restored.
uber_data$Date.Time <- mdy_hms(uber_data$Date.Time)
# Keep the clock time as an "HH:MM:SS" string in its own column.
uber_data$Time <- format(uber_data$Date.Time, format = "%H:%M:%S")
Create individual columns for the day, month, year, day of the week, hour, minute, and second.
# Calendar components of each pickup, stored as factors so they can be used
# directly as discrete axes and grouping keys.
uber_data$day <- factor(day(uber_data$Date.Time))
uber_data$month <- factor(month(uber_data$Date.Time, label = TRUE))
uber_data$year <- factor(year(uber_data$Date.Time))
uber_data$dayofweek <- factor(wday(uber_data$Date.Time, label = TRUE))
# Time variables ~ hour, minute, and second.
# Parse the "HH:MM:SS" strings once and reuse the result, instead of parsing
# the whole column separately for each of the three components.
trip_clock <- hms(uber_data$Time)
uber_data$hour <- factor(hour(trip_clock))
uber_data$minute <- factor(minute(trip_clock))
uber_data$second <- factor(second(trip_clock))
# Drop the temporary so the workspace holds the same objects as before.
rm(trip_clock)
Let’s see if we got what we wanted
# Preview the first ten rows, now including the derived columns.
head(uber_data, n = 10)
## Date.Time Lat Lon Base Time day month year
## 1 2014-04-01 00:11:00 40.7690 -73.9549 B02512 00:11:00 1 Apr 2014
## 2 2014-04-01 00:17:00 40.7267 -74.0345 B02512 00:17:00 1 Apr 2014
## 3 2014-04-01 00:21:00 40.7316 -73.9873 B02512 00:21:00 1 Apr 2014
## 4 2014-04-01 00:28:00 40.7588 -73.9776 B02512 00:28:00 1 Apr 2014
## 5 2014-04-01 00:33:00 40.7594 -73.9722 B02512 00:33:00 1 Apr 2014
## 6 2014-04-01 00:33:00 40.7383 -74.0403 B02512 00:33:00 1 Apr 2014
## 7 2014-04-01 00:39:00 40.7223 -73.9887 B02512 00:39:00 1 Apr 2014
## 8 2014-04-01 00:45:00 40.7620 -73.9790 B02512 00:45:00 1 Apr 2014
## 9 2014-04-01 00:55:00 40.7524 -73.9960 B02512 00:55:00 1 Apr 2014
## 10 2014-04-01 01:01:00 40.7575 -73.9846 B02512 01:01:00 1 Apr 2014
## dayofweek hour minute second
## 1 Tue 0 11 0
## 2 Tue 0 17 0
## 3 Tue 0 21 0
## 4 Tue 0 28 0
## 5 Tue 0 33 0
## 6 Tue 0 33 0
## 7 Tue 0 39 0
## 8 Tue 0 45 0
## 9 Tue 0 55 0
## 10 Tue 1 1 0
Data Visualization
Plot the total number of trips for each hour of the day
# Second name for the prepared data; the summaries and plots below read from it.
dataUber <- uber_data
# Number of trips recorded for each hour value.
hourData <- dplyr::summarise(
  group_by(dataUber, hour),
  Total = n()
)
# Bar chart of total trips per hour.
# theme_light() is a complete theme and replaces theme() settings added before
# it, so the legend tweak is applied after it to take effect.
ggplot(hourData, aes(x = hour, y = Total)) +
  geom_col(fill = "purple", color = "black") +
  scale_y_continuous(labels = comma) +
  labs(title = "Trips Per Hour", x = "Hour", y = "Total Trips") +
  theme_light() +
  theme(legend.position = "none")
Plot the number of trips for each hour of the day, broken down by month
# Number of trips for every month/hour combination.
monthHour <- dplyr::summarise(
  group_by(dataUber, month, hour),
  Total = n()
)
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
# Stacked bar chart of trips per hour, with each bar split by month.
# The legend is kept on purpose: it is the only key to the month colours. A
# theme(legend.position = "none") placed before theme_light() has no effect
# because the complete theme replaces it, so that call is not included here.
ggplot(monthHour, aes(x = hour, y = Total, fill = month)) +
  geom_col() +
  scale_y_continuous(labels = comma) +
  labs(title = "Trips Per Hour and Month", x = "Hour", y = "Total Trips") +
  theme_light()
Plot the number of trips per day of the month
# Aggregate the data by day of the month
dayGroup <- dplyr::summarise(
  group_by(dataUber, day),
  Total = n()
)
# Show the first ten daily totals.
head(dayGroup, n = 10)
## # A tibble: 10 × 2
## day Total
## <fct> <int>
## 1 1 127430
## 2 2 143201
## 3 3 142983
## 4 4 140923
## 5 5 147054
## 6 6 139886
## 7 7 143503
## 8 8 145984
## 9 9 155135
## 10 10 152500
# Plot the total trips for each day of the month.
# theme_light() is a complete theme and replaces theme() settings added before
# it, so the legend tweak is applied after it to take effect.
ggplot(dayGroup, aes(x = day, y = Total)) +
  geom_col(fill = "mediumvioletred", color = "white") +
  scale_y_continuous(labels = comma) +
  labs(title = "Trips Every Day", x = "Day", y = "Total Trips") +
  theme_light() +
  theme(legend.position = "none")
# Count trips for every month / day-of-month / weekday combination.
day_month_data <- dplyr::summarise(
  group_by(dataUber, month, day, dayofweek),
  Total = n()
)
## `summarise()` has grouped output by 'month', 'day'. You can override using the
## `.groups` argument.
# Grouped bar chart: total trips for each weekday within each month.
# day_month_data holds one row per month/day/weekday combination, so several
# rows share each month/weekday slot. They are summed to one row per slot
# first; plotting the daily rows directly would draw the bars on top of each
# other, so the chart would show the busiest single day instead of the total.
day_month_data %>%
  group_by(month, dayofweek) %>%
  dplyr::summarize(Total = sum(Total), .groups = "drop") %>%
  ggplot(aes(x = month, y = Total, fill = dayofweek)) +
  geom_col(position = "dodge") +
  ggtitle("Trips By Day And Month") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = colors) +
  xlab("Month") +
  ylab("Total Trips") +
  theme_light() +
  labs(fill = "Day of the Week")
Plot the number of trips in each month
# Total number of trips in each month.
monthGroup <- dplyr::summarise(
  group_by(dataUber, month),
  Total = n()
)
# Print the monthly totals.
monthGroup
## # A tibble: 6 × 2
## month Total
## <ord> <int>
## 1 Apr 564516
## 2 May 652435
## 3 Jun 663844
## 4 Jul 796121
## 5 Aug 829275
## 6 Sep 1028136
# One bar per month, coloured from the shared palette; the legend is hidden
# because the x axis already names each month.
ggplot(monthGroup, aes(x = month, y = Total, fill = month)) +
  geom_col() +
  scale_fill_manual(values = colors) +
  scale_y_continuous(labels = comma) +
  labs(title = "Trips Per Month", x = "Months", y = "Number of Trips") +
  theme(legend.position = "none")
Plot the number of trips by day of the week for each month
# Number of trips for every month/weekday combination.
monthWeekday <- dplyr::summarise(
  group_by(dataUber, month, dayofweek),
  Total = n()
)
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
# Side-by-side bars: one bar per weekday within each month.
ggplot(monthWeekday, aes(x = month, y = Total, fill = dayofweek)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = colors) +
  labs(
    title = "Trips By Day And Month",
    x = "Month",
    y = "Total Trips",
    fill = "Day of the Week"
  ) +
  theme_light()
Plot the number of trips by each base
# Count of rows (trips) per base; geom_bar() does the counting.
ggplot(dataUber, aes(x = Base)) +
  geom_bar(fill = "aquamarine1") +
  scale_y_continuous(labels = comma) +
  labs(title = "Trips By Bases", y = "Trips") +
  theme_light()
Plot the number of trips by each base and month
# Trips per base, with one dodged bar per month.
ggplot(dataUber, aes(x = Base, fill = month)) +
  geom_bar(position = "dodge") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = colors) +
  labs(title = "Trips By Bases And Month", y = "Trips") +
  theme_light()
Plot the number of trips by each base and day of the week
# Trips per base, with one dodged bar per day of the week.
ggplot(dataUber, aes(x = Base, fill = dayofweek)) +
  geom_bar(position = "dodge") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = colors) +
  labs(
    title = "Trips By Bases And Day of the Week",
    y = "Trips",
    fill = "Day of the Week"
  ) +
  theme_light()
Create a heat map visualization of day, hour, and month
# Number of trips for every day-of-month/hour combination.
day_hour_data <- dplyr::summarise(
  group_by(uber_data, day, hour),
  Total = n()
)
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
# Heat map: day of the month on x, hour on y, tile colour = trip count.
ggplot(day_hour_data, aes(x = day, y = hour, fill = Total)) +
  geom_tile(color = "yellow") +
  scale_fill_gradient(low = "#ff5e62", high = "#ff9966", guide = "colorbar") +
  labs(title = "Heat Map by Hour and Day", x = "Day", y = "Hour")
# Heat map: day of the month on x, month on y, tile colour = trip count.
ggplot(day_month_data, aes(x = day, y = month, fill = Total)) +
  geom_tile(color = "yellow") +
  scale_fill_gradient(low = "#185a9d", high = "#43cea2", guide = "colorbar") +
  labs(title = "Heat Map by Month and Day", x = "Day", y = "Month")
# Heat map: weekday on x, month on y, tile colour = trip count.
ggplot(monthWeekday, aes(x = dayofweek, y = month, fill = Total)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "#753a88", high = "#cc2b5e", guide = "colorbar") +
  labs(
    title = "Heat Map By Month And Day Of Week",
    x = "Day of the Week",
    y = "Month"
  )
# Heat map by month and base: first count the trips per base/month pair.
monthBase <- dplyr::summarise(
  group_by(dataUber, Base, month),
  Total = n()
)
## `summarise()` has grouped output by 'Base'. You can override using the
## `.groups` argument.
# Heat map: base on x, month on y, tile colour = trip count (default gradient).
ggplot(monthBase, aes(x = Base, y = month, fill = Total)) +
  geom_tile(color = "white") +
  labs(title = "Heat Map By Month And Bases", x = "Base", y = "Month") +
  theme_light()
# Heat map by day of the week and base: first count trips per base/weekday pair.
dayofweekBase <- dplyr::summarise(
  group_by(dataUber, Base, dayofweek),
  Total = n()
)
## `summarise()` has grouped output by 'Base'. You can override using the
## `.groups` argument.
# Heat map: base on x, weekday on y, tile colour = trip count.
ggplot(dayofweekBase, aes(x = Base, y = dayofweek, fill = Total)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "#19547b", high = "#ffd89b", guide = "colorbar") +
  labs(
    title = "Heat Map By Day of the Week And Bases",
    x = "Base",
    y = "Day of the Week"
  ) +
  theme_light()
Create a Map Visualization of Rides in New York
# Bounding box for the map: latitude and longitude limits of the plotted area.
minLat <- 40.5774
maxLat <- 40.9176
minLong <- -74.15
maxLong <- -73.7004
# Drop rows that contain any missing value before plotting.
uber <- na.omit(dataUber)
# Scatter of pickup coordinates; points outside the limits are removed by the
# scales (ggplot2 reports how many in a warning).
ggplot(uber, aes(x = Lon, y = Lat)) +
  geom_point(size = 1, color = "blue") +
  xlim(minLong, maxLong) +
  ylim(minLat, maxLat) +
  theme_map() +
  labs(title = "NYC Map Based On Uber Rides During 2014 (APR - SEPT)")
## Warning: Removed 71701 rows containing missing values (`geom_point()`).
# Same scatter coloured by base. Note that this one reads dataUber directly
# rather than the NA-filtered copy.
ggplot(dataUber, aes(x = Lon, y = Lat, color = Base)) +
  geom_point(size = 1) +
  xlim(minLong, maxLong) +
  ylim(minLat, maxLat) +
  theme_map() +
  labs(title = "NYC Map Based On Uber Rides During 2014 By Base (APR - SEPT)")
## Warning: Removed 71701 rows containing missing values (`geom_point()`).