The objective of this study is to predict the demand for Uber pick-ups in the Manhattan area using time series models. As the first step of the study, a descriptive analysis of the datasets is presented here, including: 1. Data source 2. Summary statistics 3. Time series plots 4. Autocorrelation analysis (ACF, PACF, CCF) 5. Stationarity tests
There are several steps involved in the data cleaning process: 1. Read in the data (Uber 2014, Uber 2015, weather, holiday). 2. For the Uber 2014 data, longitude and latitude are used to geo-locate the pick-ups in the Manhattan area; for the Uber 2015 data, pickups in zones within the Manhattan area are selected for the study. 3. The pickup data are aggregated to daily counts, merged with the weather and holiday data, and converted to time series objects.
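The geo-location of the 2014 pickups was done in ArcGIS and is not reproduced in this document. Purely as an illustration of what that step does, a crude bounding-box filter in R might look like the sketch below; raw14 (a hypothetical unfiltered pickup table), the Lat/Lon column names, and the coordinate limits are all assumptions, not the actual Manhattan polygon used in ArcGIS.
# Illustration only: rough bounding-box filter for Manhattan pickups
# (raw14, the Lat/Lon columns, and the coordinates are assumed, not the ArcGIS polygon)
manhattan14 <- subset(raw14, Lat > 40.70 & Lat < 40.88 & Lon > -74.02 & Lon < -73.91)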
# set working directory
setwd("~/Documents/uber")
# import libraries
library(zoo)
## Warning: package 'zoo' was built under R version 3.1.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(lmtest)
## Warning: package 'lmtest' was built under R version 3.1.3
library(sandwich)
## Warning: package 'sandwich' was built under R version 3.1.3
library(plyr)
## Warning: package 'plyr' was built under R version 3.1.3
library(tseries)
# read in the uber data
uber15 <- read.csv("janjune15.csv") # 2015 Jan - June
uber1404<-read.csv("201404.csv") # 2014 Apr-Sep; the 2014 data has been processed in ArcGIS and now only contains the Manhattan Area Data
uber1404<-uber1404[,-(5:11)] # keep only the relevent first four columns
uber1405<-read.csv("201405.csv")
uber1406<-read.csv("201406.csv")
uber1407<-read.csv("201407.csv")
uber1408<-read.csv("201408.csv")
uber1409<-read.csv("201409.csv")
uber14<-rbind(uber1404, uber1405, uber1406, uber1407, uber1408, uber1409) # merge all the 2014 pickups
# change date format
uber15$date<- as.Date(uber15$Pickup_date)
uber15<- uber15[order(uber15$date),]
uber14$date<- as.Date(uber14$Date.Time,format = "%m/%d/%Y")
uber14<-uber14[order(uber14$date),]
# aggregate the 2014 data to daily counts (need the *plyr* package)
uber14<-count(uber14, "date")
# subset the 2015 pickup data in the Manhattan Area and estimate the daily counts
locationlookup<- read.csv("taxi-zone-lookup.csv") # lookup table mapping location ID (in the pickup data) to borough/zone information
uber15$LocationID <- uber15$locationID
uber15<-merge(uber15, locationlookup, by="LocationID") # merge in the columns containing borough and zone information
uber15<-subset(uber15, uber15$Borough == "Manhattan")
uber15<-count(uber15, "date")
# read in the weather data
weather <- read.csv("weather.csv")
weather$date<-as.Date(weather$DATE, format = "%m/%d/%y")
# read in the holiday data
holiday <-read.csv("holiday data.csv")
holiday$date <-as.Date(holiday$Date, format = "%m/%d/%y")
# merge pickup, weather and holiday data
uber14<-merge(uber14, holiday,by = "date") # the merge function can take in two data frames at a time
uber14<-merge(uber14, weather, by = "date")
uber14$holiday<-as.factor(uber14$holiday)
uber14$weekend<-as.factor(uber14$weekend)
uber15<-merge(uber15, holiday,by = "date")
uber15<-merge(uber15, weather, by = "date")
uber15$holiday<-as.factor(uber15$holiday)
uber15$weekend<-as.factor(uber15$weekend)
# convert to zoo time series objects indexed by date
#2014 data
uber14ts<-zoo(uber14$freq, uber14$date)
prcp14 <-zoo(uber14$PRCP, uber14$date)
tmax14 <-zoo(uber14$TMAX, uber14$date)
tmin14<-zoo(uber14$TMIN, uber14$date)
holiday14<-zoo(uber14$holiday, uber14$date)
weekend14<-zoo(uber14$weekend, uber14$date)
#2015 data
uber15ts<-zoo(uber15$freq, uber15$date)
prcp15 <-zoo(uber15$PRCP, uber15$date)
snwd15<-zoo(uber15$SNWD, uber15$date)
snow15<-zoo(uber15$SNOW, uber15$date)
tmax15 <-zoo(uber15$TMAX, uber15$date)
tmin15<-zoo(uber15$TMIN, uber15$date)
holiday15<-zoo(uber15$holiday, uber15$date)
weekend15<-zoo(uber15$weekend, uber15$date)
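Before the autocorrelation analysis below, a quick sanity check (a sketch, not part of the original code) that the daily series are complete and evenly spaced, since the ACF/PACF estimates assume regularly spaced observations:
# Sanity check (sketch): confirm the daily series have no missing days
range(index(uber14ts)); sum(diff(index(uber14ts)) > 1) # expect 0 gaps
range(index(uber15ts)); sum(diff(index(uber15ts)) > 1)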
The time series plots and summary statistics show that the number of pickups increased dramatically in 2015. The number of rides also appears to be higher during the summer months (May-September), probably due to an increase in the number of travelers in the city.
# time series trends for the pickup data
plot(uber14ts,ylim = c(5000,40000))
plot(uber15ts,ylim = c(20000,100000))
# summary statistics
summary(uber14)
## date freq Date holiday
## Min. :2014-04-01 Min. : 5809 4/1/14 : 1 0:180
## 1st Qu.:2014-05-16 1st Qu.:14510 4/10/14: 1 1: 3
## Median :2014-07-01 Median :18700 4/11/14: 1
## Mean :2014-07-01 Mean :18817 4/12/14: 1
## 3rd Qu.:2014-08-15 3rd Qu.:22668 4/13/14: 1
## Max. :2014-09-30 Max. :32892 4/14/14: 1
## (Other):177
## weekday weekend DATE PRCP SNWD
## Min. :1.000 0:131 4/1/14 : 1 Min. : 0.00 Min. :0
## 1st Qu.:2.000 1: 52 4/10/14: 1 1st Qu.: 0.00 1st Qu.:0
## Median :4.000 4/11/14: 1 Median : 0.00 Median :0
## Mean :3.989 4/12/14: 1 Mean : 35.43 Mean :0
## 3rd Qu.:6.000 4/13/14: 1 3rd Qu.: 13.00 3rd Qu.:0
## Max. :7.000 4/14/14: 1 Max. :1262.00 Max. :0
## (Other):177
## SNOW TMAX TMIN
## Min. :0 Min. : 83 Min. : -5.0
## 1st Qu.:0 1st Qu.:217 1st Qu.:125.0
## Median :0 Median :256 Median :172.0
## Mean :0 Mean :246 Mean :156.8
## 3rd Qu.:0 3rd Qu.:283 3rd Qu.:200.0
## Max. :0 Max. :333 Max. :250.0
##
summary(uber15)
## date freq Date holiday
## Min. :2015-01-01 Min. :19687 1/1/15 : 1 0:177
## 1st Qu.:2015-02-15 1st Qu.:50028 1/10/15: 1 1: 4
## Median :2015-04-01 Median :57321 1/11/15: 1
## Mean :2015-04-01 Mean :57299 1/12/15: 1
## 3rd Qu.:2015-05-16 3rd Qu.:66013 1/13/15: 1
## Max. :2015-06-30 Max. :96118 1/14/15: 1
## (Other):175
## weekday weekend DATE PRCP SNWD
## Min. :1.000 0:129 1/1/15 : 1 Min. : 0.00 Min. : 0.00
## 1st Qu.:2.000 1: 52 1/10/15: 1 1st Qu.: 0.00 1st Qu.: 0.00
## Median :4.000 1/11/15: 1 Median : 0.00 Median : 0.00
## Mean :4.006 1/12/15: 1 Mean : 29.09 Mean : 64.81
## 3rd Qu.:6.000 1/13/15: 1 3rd Qu.: 13.00 3rd Qu.:100.00
## Max. :7.000 1/14/15: 1 Max. :533.00 Max. :480.00
## (Other):175
## SNOW TMAX TMIN
## Min. : 0.000 Min. :-71.0 Min. :-166.00
## 1st Qu.: 0.000 1st Qu.: 39.0 1st Qu.: -43.00
## Median : 0.000 Median :122.0 Median : 39.00
## Mean : 6.889 Mean :131.8 Mean : 44.97
## 3rd Qu.: 0.000 3rd Qu.:233.0 3rd Qu.: 133.00
## Max. :191.000 Max. :322.0 Max. : 239.00
##
# histogram for the pickups (holiday vs non-holiday)
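The histogram referenced by the comment above is not included in the original output; a minimal sketch of what it might look like, using the uber14 data frame built earlier:
# Sketch: distribution of daily pickups on non-holidays vs holidays
hist(uber14$freq[uber14$holiday == "0"], main = "2014 daily pickups (non-holiday)", xlab = "pickups")
hist(uber14$freq[uber14$holiday == "1"], main = "2014 daily pickups (holiday)", xlab = "pickups")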
The ADF test results indicate that the 2015 series is stationary (p ≈ 0.02); for the 2014 series, however, the unit-root null cannot be rejected at the 5% level (p ≈ 0.14).
adf.test(uber14ts) # p = 0.14: cannot reject the unit-root null at the 5% level
##
## Augmented Dickey-Fuller Test
##
## data: uber14ts
## Dickey-Fuller = -3.0429, Lag order = 5, p-value = 0.1402
## alternative hypothesis: stationary
adf.test(uber15ts) # stationary
##
## Augmented Dickey-Fuller Test
##
## data: uber15ts
## Dickey-Fuller = -3.804, Lag order = 5, p-value = 0.02031
## alternative hypothesis: stationary
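Because the 2014 series does not clearly reject the unit-root null, a complementary check is worthwhile. The sketch below (not part of the original analysis) runs the KPSS test, whose null hypothesis is stationarity, and re-runs the ADF test on the first-differenced 2014 series; kpss.test comes from the already-loaded tseries package.
# Complementary stationarity checks (sketch)
kpss.test(coredata(uber14ts)) # null hypothesis: level stationarity
adf.test(diff(coredata(uber14ts))) # ADF on the first-differenced 2014 series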
For the 2014 pickups, the partial autocorrelation dies out after the 7th lag, indicating that we should probably include 7 lags in the model. The cross-correlation estimates show that the weather attributes and the weekend attribute may be strongly correlated with the Uber pickups.
For the 2015 pickups, the partial autocorrelation dies out after the 8th lag, indicating that we should probably include 8 lags in the model. The cross-correlation estimates show that the temperature and weekend attributes seem to be strongly correlated with the Uber pickups.
# 2014 pickups
acf(uber14ts, main = "ACF of uber14")
pacf(uber14ts, main= "PACF of uber14")
ccf(uber14ts, prcp14)
ccf(uber14ts, tmax14)
ccf(uber14ts, tmin14)
ccf(uber14ts, holiday14)
ccf(uber14ts, weekend14)
# 2015 pickups
acf(uber15ts, main = "ACF of uber15")
pacf(uber15ts, main= "PACF of uber15")
ccf(uber15ts, prcp15)
ccf(uber15ts, snwd15)
ccf(uber15ts, snow15)
ccf(uber15ts, tmax15)
ccf(uber15ts, tmin15)
ccf(uber15ts, holiday15)
ccf(uber15ts, weekend15)
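As a follow-up to the PACF interpretation above, the sketch below (not one of the models fitted in this study) shows how an AR(7) specification for the 2014 series could be estimated:
# Sketch: AR(7) model suggested by the 2014 PACF (illustration only)
ar7 <- arima(coredata(uber14ts), order = c(7, 0, 0))
ar7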
The objective of this step is to conduct a regression analysis on the Uber pickups, identify the significant influencing factors, and use them for prediction.
Two dependent variables are examined in the regression models: 1. 2014 daily pickups; 2. 2015 daily pickups.
For both the 2014 and 2015 data, weather data and “trend” data (i.e., holiday or not, weekend or not) are included in the models as independent variables; since the 2015 pickup data cover winter months, seasonal weather variables such as snow depth and snowfall have also been included as independent variables.
Models 1, 3, and 4 are for the 2014 pickups, and Model 2 is for the 2015 pickups. In Model 1, all of the selected variables are statistically significant: more people took Uber on rainy days and on days with lower maximum temperatures, while fewer people took Uber on hot days, on holidays, or on weekends. In Model 2, none of the selected variables are statistically significant at this point. Model 3 builds on the findings from the autocorrelation analysis: given the complicated dependence structure revealed there, an ARMA(1,1) model is applied, and both the autoregressive and moving-average terms are significant. Model 4, which adds a one-day lag of the pickup counts, is the best-fitting model, with about 80% of the variance explained by the selected attributes. The prediction-versus-actual plot shows that the model generally tracks the dependent variable well, and the residual-versus-fitted plot shows no obvious trend, with the residuals clustered around y = 0, which is good.
# LR for 2014 pickups
model1<-lm(uber14ts~prcp14+tmax14+tmin14+weekend14+holiday14)
summary(model1)
##
## Call:
## lm(formula = uber14ts ~ prcp14 + tmax14 + tmin14 + weekend14 +
## holiday14)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10370.1 -2851.5 -353.4 2044.9 15358.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39323.725 3392.463 11.591 < 2e-16 ***
## prcp14 8.979 2.913 3.082 0.00239 **
## tmax14 -49.634 14.164 -3.504 0.00058 ***
## tmin14 73.441 14.111 5.205 5.35e-07 ***
## weekend14 -5156.012 756.814 -6.813 1.44e-10 ***
## holiday14 -13288.325 2672.517 -4.972 1.56e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4545 on 177 degrees of freedom
## Multiple R-squared: 0.4031, Adjusted R-squared: 0.3863
## F-statistic: 23.91 on 5 and 177 DF, p-value: < 2.2e-16
model2<-lm(uber15ts~prcp15+snwd15)
summary(model2)
##
## Call:
## lm(formula = uber15ts ~ prcp15 + snwd15)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38473 -6997 122 8633 35510
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.689e+04 1.067e+03 53.340 <2e-16 ***
## prcp15 1.308e+01 1.208e+01 1.083 0.280
## snwd15 3.801e-01 7.671e+00 0.050 0.961
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11840 on 178 degrees of freedom
## Multiple R-squared: 0.006572, Adjusted R-squared: -0.00459
## F-statistic: 0.5888 on 2 and 178 DF, p-value: 0.5561
model3 <- arma(uber14ts, order = c(1,1) )
summary(model3)
##
## Call:
## arma(x = uber14ts, order = c(1, 1))
##
## Model:
## ARMA(1,1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11138.5 -3228.7 548.7 3016.7 11208.3
##
## Coefficient(s):
## Estimate Std. Error t value Pr(>|t|)
## ar1 4.907e-01 8.130e-02 6.036 1.58e-09 ***
## ma1 4.293e-01 7.878e-02 5.450 5.04e-08 ***
## intercept 9.650e+03 1.583e+03 6.096 1.09e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Fit:
## sigma^2 estimated as 15920203, Conditional Sum-of-Squares = 2881556795, AIC = 3560.04
lag14<-lag(uber14ts,-1)
data4<-merge(uber14ts, lag14, prcp14,tmax14,tmin14,weekend14,holiday14)
model4<-lm(uber14ts~lag14+prcp14+tmax14+tmin14+weekend14+holiday14, data = data4)
summary(model4)
##
## Call:
## lm(formula = uber14ts ~ lag14 + prcp14 + tmax14 + tmin14 + weekend14 +
## holiday14, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10097.5 -1077.3 76.9 1262.9 8941.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.249e+04 2.142e+03 10.498 < 2e-16 ***
## lag14 6.745e-01 3.551e-02 18.995 < 2e-16 ***
## prcp14 8.127e+00 1.672e+00 4.862 2.57e-06 ***
## tmax14 -1.621e+01 8.311e+00 -1.950 0.05275 .
## tmin14 2.721e+01 8.478e+00 3.210 0.00158 **
## weekend14 -6.386e+03 4.391e+02 -14.543 < 2e-16 ***
## holiday14 -8.529e+03 1.552e+03 -5.494 1.36e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2605 on 175 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.8045, Adjusted R-squared: 0.7978
## F-statistic: 120 on 6 and 175 DF, p-value: < 2.2e-16
uber14fit<-zoo(fitted(model1), uber14$date) # fitted values from Model 1
uber14residue<-zoo(residuals(model1), uber14$date) # residuals from Model 1
# compare default and HAC (Newey-West) standard errors for Model 4
coeftest(model4)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22492.43624 2142.49443 10.4982 < 2.2e-16 ***
## lag14 0.67450 0.03551 18.9949 < 2.2e-16 ***
## prcp14 8.12748 1.67159 4.8621 2.572e-06 ***
## tmax14 -16.20719 8.31053 -1.9502 0.05275 .
## tmin14 27.21473 8.47825 3.2099 0.00158 **
## weekend14 -6385.73114 439.09624 -14.5429 < 2.2e-16 ***
## holiday14 -8529.08933 1552.31663 -5.4944 1.365e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(model4,NeweyWest(model4,lag = 1, prewhite = FALSE)) # Newey-West (HAC) covariance
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22492.4362 2423.0512 9.2827 < 2.2e-16 ***
## prcp14 8.1275 2.2810 3.5631 0.0004724 ***
## tmax14 -16.2072 13.8033 -1.1742 0.2419300
## tmin14 27.2147 12.0358 2.2611 0.0249815 *
## weekend14 -6385.7311 806.7552 -7.9153 2.714e-13 ***
## holiday14 -8529.0893 910.8506 -9.3639 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# combine actual, fitted, and residual series
df0<-merge(uber14ts, uber14fit)
df<-merge(df0,uber14residue)
head(df)
## uber14ts uber14fit uber14residue
## 2014-04-01 11730 16000.702 -4270.7021
## 2014-04-02 14500 18936.759 -4436.7588
## 2014-04-03 17235 17140.438 94.5617
## 2014-04-04 22582 20467.056 2114.9439
## 2014-04-05 14741 12899.451 1841.5489
## 2014-04-06 9209 9348.018 -139.0180
# ts plot of fitted
plot(df[,1:2],plot.type = "single", col = c("red", "blue"), lwd = 2)
# residuals
plot(df[,2],df[,3])
abline(h=0)
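To complement the residual-versus-fitted plot, the sketch below adds two formal checks for serial correlation in the Model 4 residuals (not part of the original analysis); dwtest comes from the already-loaded lmtest package and Box.test from base R.
# Sketch: formal residual checks for Model 4
dwtest(model4) # Durbin-Watson test for first-order serial correlation
Box.test(residuals(model4), lag = 7, type = "Ljung-Box") # portmanteau test at 7 lags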
Several experiments have been conducted to explore how “far” ahead the model can be used for prediction. Both the R-squared and HAC t statistics indicate that the model performs best at short prediction horizons (i.e., about 6 days ahead).
# search for best lag
# by R^2
for(i in 1:8)
{
if(i==1) cat("horizon, R squared","\n")
uber14tsi<-lag(uber14ts,i)
datai<-merge(uber14tsi, lag14,prcp14, tmax14, weekend14, holiday14)
fiti<-lm(uber14tsi~lag14+prcp14+tmax14+weekend14+holiday14, data = datai)
cat(sprintf("%7d %9.4f \n", i, summary(fiti)$r.squared))
}
## horizon, R squared
## 1 0.4778
## 2 0.1087
## 3 0.1262
## 4 0.2775
## 5 0.4732
## 6 0.5540
## 7 0.5389
## 8 0.4057
# by HAC t stat
for(i in 1:8)
{
if(i==1) cat("horizon, HAC t stat","\n")
uber14tsi<-lag(uber14ts,i)
datai<-merge(uber14tsi, lag14,prcp14, tmax14, weekend14, holiday14)
fiti<-lm(uber14tsi~lag14+prcp14+tmax14+weekend14+holiday14, data = datai)
ct<-coeftest(fiti,NeweyWest(fiti,lag = i, prewhite = FALSE))
cat(sprintf("%7d %9.4f \n", i, ct[2,3]))
}
## horizon, HAC t stat
## 1 5.1844
## 2 0.2520
## 3 -0.2382
## 4 2.2081
## 5 8.1330
## 6 11.4624
## 7 7.8261
## 8 1.5725
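To visualize where predictability peaks, the horizon-by-horizon R-squared values from the loop above can be collected and plotted; this sketch simply repeats the same regressions and is not part of the original analysis.
# Sketch: collect R^2 by horizon and plot it
r2 <- sapply(1:8, function(i) {
  uber14tsi <- lag(uber14ts, i)
  datai <- merge(uber14tsi, lag14, prcp14, tmax14, weekend14, holiday14)
  summary(lm(uber14tsi ~ lag14 + prcp14 + tmax14 + weekend14 + holiday14, data = datai))$r.squared
})
plot(1:8, r2, type = "b", xlab = "horizon (days)", ylab = "R-squared")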