In this section we will model our refined data to find a regression model that explains the
library(glmnet)
library(lme4)
library(readr)
library(Hmisc)
library(osmdata)
library(darksky)
library(ggplot2)
library(tidyverse)
library(rgdal)
library(rgeos)
library(tmap)
library(leaflet)
library(RColorBrewer)
library(sp)
library(spatialEco)
library(mapview)
library(sf)
library(dplyr)
library(tidyr)
library(plm)
library(corrplot)
wd = "~/Uni Temp Work/Big Data"
plsep <- .Platform$file.sep
regression <- read.csv(paste(wd, plsep, "regression.csv", sep =""))
regression1 <- regression[complete.cases(regression),]
factors <- c("hour", "day", "ZIPCODE","summary")
numeric <- c("apparentTemperature", "POPULATION", "AREA", "bar", "nightclub", "attraction", "hospital", "social_facility", "total_passengers")
regression1[, factors] <- lapply(regression1[factors], as.factor)
regression1[, numeric] <- lapply(regression1[numeric], as.numeric)
regression1$time = as.POSIXct(regression1$time,format = "%m/%d/%Y %H")
regression1 <- regression1[regression1$POPULATION > 0 ,]
regression1 <- regression1[!duplicated(c("regression1$ZIPCODE","regression1$time")), ]
regression1$number_activity <- regression1$bar + regression1$nightclub + regression1$attraction + regression1$hospital +regression1$social_facility
summary(regression1)
## ZIPCODE time summary
## 10035 : 796 Min. :2016-03-01 00:00:00 Clear :15644
## 11103 : 744 1st Qu.:2016-03-08 18:00:00 Partly Cloudy :10575
## 11106 : 744 Median :2016-03-16 13:00:00 Overcast : 6901
## 11201 : 744 Mean :2016-03-16 11:52:48 Mostly Cloudy : 4398
## 11211 : 744 3rd Qu.:2016-03-24 06:00:00 Light Rain : 1193
## 11215 : 744 Max. :2016-03-31 23:00:00 Possible Drizzle: 940
## (Other):36898 (Other) : 1763
## day hour apparentTemperature POPULATION
## Friday :5431 15 : 1855 Min. :-7.680 Min. : 2349
## Monday :5266 18 : 1850 1st Qu.: 2.930 1st Qu.: 38234
## Saturday :5590 17 : 1849 Median : 7.320 Median : 57010
## Sunday :5473 19 : 1848 Mean : 7.529 Mean : 58491
## Thursday :6636 16 : 1840 3rd Qu.:11.450 3rd Qu.: 76130
## Tuesday :6453 14 : 1835 Max. :24.710 Max. :109069
## Wednesday:6565 (Other):30337
## AREA total_passengers total_trip_distance bar
## Min. : 581237 Min. : 0.00 Min. : 0.01 Min. : 0.000
## 1st Qu.: 23159566 1st Qu.: 5.00 1st Qu.: 11.76 1st Qu.: 0.000
## Median : 37640607 Median : 14.00 Median : 33.33 Median : 1.000
## Mean : 41193470 Mean : 39.57 Mean : 83.61 Mean : 3.323
## 3rd Qu.: 55879065 3rd Qu.: 48.00 3rd Qu.: 101.89 3rd Qu.: 5.000
## Max. :215835794 Max. :761.00 Max. :1877.27 Max. :27.000
##
## nightclub attraction hospital social_facility
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.3309 Mean :0.1533 Mean :0.1842 Mean :0.4841
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :5.0000 Max. :3.0000 Max. :3.0000 Max. :4.0000
##
## number_activity
## Min. : 1.000
## 1st Qu.: 1.000
## Median : 2.000
## Mean : 4.476
## 3rd Qu.: 6.000
## Max. :29.000
##
\(Y = X\beta + \epsilon\)
Assumptions: 1) \(E[\epsilon] = 0\) 2) \(V[\epsilon] = \sigma^2\) 3) \(Cov[\epsilon_i, \epsilon_j] = 0\) \(\forall\) \(i \neq j\)
If these assumptions are found to be true, the Gauss-Markov theorem states that OLS is the best linear unbiased estimator for the dataset (BLUE).
ols_pass <- lm(total_passengers ~ ZIPCODE+day+hour+apparentTemperature+summary+AREA+POPULATION+bar+nightclub+hospital+social_facility, data = regression1)
ols_pass <- lm(total_passengers ~ ZIPCODE+day+hour+apparentTemperature+summary+AREA+POPULATION+bar, data = regression1)
summary(ols_pass)
##
## Call:
## lm(formula = total_passengers ~ ZIPCODE + day + hour + apparentTemperature +
## summary + AREA + POPULATION + bar, data = regression1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -219.37 -15.66 -2.38 11.87 678.61
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.240e+02 4.125e+01 3.006 0.002647 **
## ZIPCODE10002 7.544e+01 4.936e+01 1.528 0.126452
## ZIPCODE10007 -6.791e+01 5.716e+01 -1.188 0.234797
## ZIPCODE10016 -2.950e+01 5.678e+01 -0.520 0.603406
## ZIPCODE10018 -5.301e+01 5.690e+01 -0.932 0.351457
## ZIPCODE10021 -4.128e+01 5.690e+01 -0.725 0.468150
## ZIPCODE10022 3.878e-01 5.683e+01 0.007 0.994556
## ZIPCODE10024 4.934e+01 4.034e+01 1.223 0.221330
## ZIPCODE10025 7.421e+01 4.018e+01 1.847 0.064759 .
## ZIPCODE10027 2.218e+02 4.034e+01 5.498 3.86e-08 ***
## ZIPCODE10029 2.050e+02 4.026e+01 5.091 3.58e-07 ***
## ZIPCODE10031 5.662e+01 4.017e+01 1.410 0.158682
## ZIPCODE10032 1.039e+02 4.027e+01 2.581 0.009867 **
## ZIPCODE10035 9.978e+01 4.018e+01 2.483 0.013020 *
## ZIPCODE10038 -4.377e+01 5.706e+01 -0.767 0.442997
## ZIPCODE10040 1.169e+01 4.018e+01 0.291 0.770989
## ZIPCODE10044 -7.616e+01 4.068e+01 -1.872 0.061203 .
## ZIPCODE10075 -9.741e+01 5.719e+01 -1.703 0.088539 .
## ZIPCODE10128 -3.044e+01 4.034e+01 -0.755 0.450451
## ZIPCODE10301 6.599e+02 6.200e+01 10.643 < 2e-16 ***
## ZIPCODE10304 6.043e+02 5.912e+01 10.222 < 2e-16 ***
## ZIPCODE10309 1.494e+03 1.199e+02 12.455 < 2e-16 ***
## ZIPCODE10310 2.823e+02 5.272e+01 5.356 8.56e-08 ***
## ZIPCODE10452 9.842e+01 4.050e+01 2.430 0.015109 *
## ZIPCODE10455 3.796e+01 4.018e+01 0.945 0.344826
## ZIPCODE10456 1.112e+02 4.069e+01 2.732 0.006299 **
## ZIPCODE10457 1.653e+02 4.154e+01 3.980 6.90e-05 ***
## ZIPCODE10461 3.547e+02 4.680e+01 7.579 3.55e-14 ***
## ZIPCODE10466 2.866e+02 4.492e+01 6.381 1.78e-10 ***
## ZIPCODE10467 4.010e+02 4.868e+01 8.239 < 2e-16 ***
## ZIPCODE10468 1.445e+02 4.114e+01 3.512 0.000446 ***
## ZIPCODE10469 3.844e+02 4.831e+01 7.958 1.80e-15 ***
## ZIPCODE10470 3.805e+01 4.044e+01 0.941 0.346773
## ZIPCODE11040 -9.242e+01 4.530e+01 -2.040 0.041332 *
## ZIPCODE11101 5.491e+02 5.175e+01 10.611 < 2e-16 ***
## ZIPCODE11102 6.115e+01 4.017e+01 1.522 0.127996
## ZIPCODE11103 1.112e+02 4.019e+01 2.767 0.005665 **
## ZIPCODE11104 -6.049e+00 4.028e+01 -0.150 0.880642
## ZIPCODE11105 2.802e+02 4.344e+01 6.452 1.12e-10 ***
## ZIPCODE11106 1.081e+02 4.030e+01 2.683 0.007289 **
## ZIPCODE11109 -1.118e+02 4.121e+01 -2.712 0.006697 **
## ZIPCODE11201 4.212e+02 4.205e+01 10.019 < 2e-16 ***
## ZIPCODE11203 3.345e+02 4.622e+01 7.239 4.61e-13 ***
## ZIPCODE11204 2.015e+02 4.256e+01 4.736 2.19e-06 ***
## ZIPCODE11205 1.063e+02 4.027e+01 2.640 0.008296 **
## ZIPCODE11206 2.163e+02 4.198e+01 5.152 2.58e-07 ***
## ZIPCODE11207 4.341e+02 5.011e+01 8.661 < 2e-16 ***
## ZIPCODE11209 3.086e+02 4.551e+01 6.781 1.21e-11 ***
## ZIPCODE11211 2.917e+02 4.195e+01 6.953 3.62e-12 ***
## ZIPCODE11212 1.955e+02 4.219e+01 4.635 3.59e-06 ***
## ZIPCODE11215 4.224e+02 4.656e+01 9.072 < 2e-16 ***
## ZIPCODE11216 1.339e+02 4.043e+01 3.313 0.000924 ***
## ZIPCODE11217 1.561e+02 4.022e+01 3.881 0.000104 ***
## ZIPCODE11218 1.569e+02 4.144e+01 3.787 0.000153 ***
## ZIPCODE11220 2.319e+02 4.319e+01 5.370 7.92e-08 ***
## ZIPCODE11221 1.859e+02 4.167e+01 4.460 8.21e-06 ***
## ZIPCODE11222 2.570e+02 4.233e+01 6.072 1.27e-09 ***
## ZIPCODE11224 2.276e+02 4.303e+01 5.290 1.23e-07 ***
## ZIPCODE11225 7.455e+01 4.029e+01 1.850 0.064294 .
## ZIPCODE11226 1.870e+02 4.179e+01 4.474 7.69e-06 ***
## ZIPCODE11231 2.162e+02 4.178e+01 5.173 2.31e-07 ***
## ZIPCODE11232 3.000e+02 4.501e+01 6.665 2.68e-11 ***
## ZIPCODE11233 1.810e+02 4.157e+01 4.355 1.34e-05 ***
## ZIPCODE11234 1.409e+03 1.082e+02 13.021 < 2e-16 ***
## ZIPCODE11235 4.019e+02 4.859e+01 8.271 < 2e-16 ***
## ZIPCODE11237 9.631e+01 4.050e+01 2.378 0.017409 *
## ZIPCODE11238 1.590e+02 4.064e+01 3.912 9.19e-05 ***
## ZIPCODE11249 1.098e+02 4.017e+01 2.733 0.006269 **
## ZIPCODE11354 3.632e+02 4.657e+01 7.800 6.36e-15 ***
## ZIPCODE11361 2.558e+02 4.391e+01 5.826 5.72e-09 ***
## ZIPCODE11368 4.367e+02 4.964e+01 8.797 < 2e-16 ***
## ZIPCODE11369 1.120e+02 4.070e+01 2.752 0.005918 **
## ZIPCODE11370 6.654e+01 4.029e+01 1.652 0.098640 .
## ZIPCODE11375 3.540e+02 4.494e+01 7.878 3.41e-15 ***
## ZIPCODE11377 5.014e+02 4.944e+01 10.143 < 2e-16 ***
## ZIPCODE11378 3.707e+02 4.778e+01 7.758 8.80e-15 ***
## ZIPCODE11379 3.036e+02 4.549e+01 6.674 2.53e-11 ***
## ZIPCODE11385 8.080e+02 6.980e+01 11.576 < 2e-16 ***
## ZIPCODE11418 1.207e+02 4.089e+01 2.953 0.003151 **
## ZIPCODE11432 3.305e+02 4.610e+01 7.170 7.61e-13 ***
## ZIPCODE11693 4.275e+01 4.217e+01 1.014 0.310711
## dayMonday -1.081e+01 8.401e-01 -12.864 < 2e-16 ***
## daySaturday 9.725e+00 8.300e-01 11.717 < 2e-16 ***
## daySunday 1.088e+00 8.255e-01 1.317 0.187723
## dayThursday -3.943e+00 7.890e-01 -4.998 5.82e-07 ***
## dayTuesday -1.028e+01 7.808e-01 -13.169 < 2e-16 ***
## dayWednesday -7.400e+00 7.814e-01 -9.470 < 2e-16 ***
## hour1 -1.013e+01 1.414e+00 -7.167 7.79e-13 ***
## hour2 -2.137e+01 1.442e+00 -14.817 < 2e-16 ***
## hour3 -2.642e+01 1.447e+00 -18.256 < 2e-16 ***
## hour4 -3.213e+01 1.459e+00 -22.025 < 2e-16 ***
## hour5 -3.810e+01 1.452e+00 -26.237 < 2e-16 ***
## hour6 -3.080e+01 1.408e+00 -21.873 < 2e-16 ***
## hour7 -1.418e+01 1.380e+00 -10.275 < 2e-16 ***
## hour8 -1.378e+00 1.367e+00 -1.008 0.313608
## hour9 -6.469e-01 1.373e+00 -0.471 0.637535
## hour10 -4.564e+00 1.383e+00 -3.301 0.000963 ***
## hour11 -5.606e+00 1.387e+00 -4.042 5.31e-05 ***
## hour12 -5.718e+00 1.379e+00 -4.147 3.37e-05 ***
## hour13 -4.869e+00 1.375e+00 -3.541 0.000399 ***
## hour14 -4.167e-01 1.380e+00 -0.302 0.762692
## hour15 2.354e+00 1.370e+00 1.719 0.085613 .
## hour16 7.090e+00 1.376e+00 5.152 2.59e-07 ***
## hour17 1.458e+01 1.388e+00 10.508 < 2e-16 ***
## hour18 2.191e+01 1.368e+00 16.017 < 2e-16 ***
## hour19 2.202e+01 1.377e+00 15.994 < 2e-16 ***
## hour20 1.829e+01 1.370e+00 13.349 < 2e-16 ***
## hour21 1.638e+01 1.378e+00 11.884 < 2e-16 ***
## hour22 1.291e+01 1.374e+00 9.395 < 2e-16 ***
## hour23 9.613e+00 1.379e+00 6.973 3.16e-12 ***
## apparentTemperature -5.761e-02 3.659e-02 -1.574 0.115392
## summaryDrizzle 8.421e+00 5.355e+00 1.572 0.115844
## summaryFoggy 1.452e+01 2.483e+00 5.849 4.99e-09 ***
## summaryLight Rain 6.134e-01 1.243e+00 0.493 0.621814
## summaryLight Snow 2.321e+00 3.130e+00 0.741 0.458456
## summaryMostly Cloudy 1.203e+00 7.336e-01 1.640 0.101016
## summaryOvercast 5.642e-01 6.380e-01 0.884 0.376476
## summaryPartly Cloudy 1.104e+00 5.529e-01 1.996 0.045899 *
## summaryPossible Drizzle 1.310e+00 1.392e+00 0.941 0.346570
## summaryPossible Flurries 5.413e+00 3.864e+00 1.401 0.161285
## summaryPossible Light Rain 5.842e+00 2.013e+00 2.903 0.003701 **
## summaryPossible Light Snow 1.730e-01 5.352e+00 0.032 0.974213
## summaryRain -1.098e+00 1.727e+00 -0.636 0.525095
## AREA -7.420e-06 5.335e-07 -13.910 < 2e-16 ***
## POPULATION NA NA NA NA
## bar NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.12 on 41290 degrees of freedom
## Multiple R-squared: 0.6083, Adjusted R-squared: 0.6071
## F-statistic: 521.3 on 123 and 41290 DF, p-value: < 2.2e-16
ols_distance <- lm(total_trip_distance ~ ZIPCODE+day+hour+apparentTemperature+summary+AREA+POPULATION+bar+nightclub+hospital+social_facility, data = regression1)
summary(ols_distance)
##
## Call:
## lm(formula = total_trip_distance ~ ZIPCODE + day + hour + apparentTemperature +
## summary + AREA + POPULATION + bar + nightclub + hospital +
## social_facility, data = regression1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -522.01 -29.76 -4.76 22.53 1668.85
##
## Coefficients: (5 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.183e+02 8.785e+01 2.485 0.012953 *
## ZIPCODE10002 1.436e+02 1.051e+02 1.366 0.171889
## ZIPCODE10007 -1.157e+02 1.217e+02 -0.950 0.341950
## ZIPCODE10016 -3.314e+01 1.209e+02 -0.274 0.784066
## ZIPCODE10018 -8.368e+01 1.212e+02 -0.691 0.489825
## ZIPCODE10021 -7.420e+01 1.212e+02 -0.612 0.540378
## ZIPCODE10022 -7.752e-01 1.210e+02 -0.006 0.994890
## ZIPCODE10024 1.018e+02 8.592e+01 1.184 0.236288
## ZIPCODE10025 1.546e+02 8.557e+01 1.806 0.070850 .
## ZIPCODE10027 4.250e+02 8.590e+01 4.947 7.57e-07 ***
## ZIPCODE10029 3.851e+02 8.575e+01 4.491 7.10e-06 ***
## ZIPCODE10031 1.350e+02 8.555e+01 1.578 0.114590
## ZIPCODE10032 2.655e+02 8.576e+01 3.096 0.001965 **
## ZIPCODE10035 1.962e+02 8.557e+01 2.293 0.021848 *
## ZIPCODE10038 -7.955e+01 1.215e+02 -0.655 0.512724
## ZIPCODE10040 4.025e+01 8.557e+01 0.470 0.638032
## ZIPCODE10044 -1.278e+02 8.664e+01 -1.475 0.140092
## ZIPCODE10075 -1.576e+02 1.218e+02 -1.294 0.195656
## ZIPCODE10128 -4.586e+01 8.591e+01 -0.534 0.593435
## ZIPCODE10301 1.225e+03 1.320e+02 9.275 < 2e-16 ***
## ZIPCODE10304 1.126e+03 1.259e+02 8.941 < 2e-16 ***
## ZIPCODE10309 2.781e+03 2.554e+02 10.888 < 2e-16 ***
## ZIPCODE10310 5.248e+02 1.123e+02 4.675 2.95e-06 ***
## ZIPCODE10452 1.932e+02 8.626e+01 2.239 0.025143 *
## ZIPCODE10455 7.996e+01 8.558e+01 0.934 0.350134
## ZIPCODE10456 2.151e+02 8.665e+01 2.483 0.013044 *
## ZIPCODE10457 3.147e+02 8.847e+01 3.558 0.000375 ***
## ZIPCODE10461 6.637e+02 9.968e+01 6.658 2.81e-11 ***
## ZIPCODE10466 5.391e+02 9.567e+01 5.635 1.76e-08 ***
## ZIPCODE10467 7.549e+02 1.037e+02 7.281 3.36e-13 ***
## ZIPCODE10468 2.767e+02 8.762e+01 3.158 0.001592 **
## ZIPCODE10469 7.168e+02 1.029e+02 6.966 3.30e-12 ***
## ZIPCODE10470 8.357e+01 8.612e+01 0.970 0.331873
## ZIPCODE11040 -1.443e+02 9.648e+01 -1.495 0.134825
## ZIPCODE11101 1.043e+03 1.102e+02 9.467 < 2e-16 ***
## ZIPCODE11102 1.190e+02 8.556e+01 1.390 0.164407
## ZIPCODE11103 2.021e+02 8.559e+01 2.361 0.018223 *
## ZIPCODE11104 5.182e+00 8.579e+01 0.060 0.951839
## ZIPCODE11105 5.173e+02 9.251e+01 5.592 2.26e-08 ***
## ZIPCODE11106 2.036e+02 8.582e+01 2.373 0.017672 *
## ZIPCODE11109 -1.886e+02 8.777e+01 -2.149 0.031630 *
## ZIPCODE11201 9.012e+02 8.955e+01 10.065 < 2e-16 ***
## ZIPCODE11203 6.268e+02 9.843e+01 6.368 1.93e-10 ***
## ZIPCODE11204 3.819e+02 9.064e+01 4.214 2.52e-05 ***
## ZIPCODE11205 2.203e+02 8.577e+01 2.568 0.010233 *
## ZIPCODE11206 4.147e+02 8.942e+01 4.638 3.53e-06 ***
## ZIPCODE11207 8.119e+02 1.067e+02 7.608 2.85e-14 ***
## ZIPCODE11209 5.818e+02 9.692e+01 6.003 1.95e-09 ***
## ZIPCODE11211 5.731e+02 8.934e+01 6.415 1.42e-10 ***
## ZIPCODE11212 3.713e+02 8.985e+01 4.133 3.59e-05 ***
## ZIPCODE11215 7.939e+02 9.915e+01 8.007 1.21e-15 ***
## ZIPCODE11216 2.743e+02 8.611e+01 3.185 0.001446 **
## ZIPCODE11217 3.331e+02 8.566e+01 3.888 0.000101 ***
## ZIPCODE11218 3.021e+02 8.826e+01 3.423 0.000619 ***
## ZIPCODE11220 4.384e+02 9.198e+01 4.767 1.88e-06 ***
## ZIPCODE11221 3.592e+02 8.874e+01 4.048 5.17e-05 ***
## ZIPCODE11222 5.024e+02 9.015e+01 5.574 2.51e-08 ***
## ZIPCODE11224 4.392e+02 9.165e+01 4.793 1.65e-06 ***
## ZIPCODE11225 1.559e+02 8.581e+01 1.817 0.069222 .
## ZIPCODE11226 3.627e+02 8.900e+01 4.075 4.61e-05 ***
## ZIPCODE11231 4.334e+02 8.899e+01 4.870 1.12e-06 ***
## ZIPCODE11232 5.653e+02 9.587e+01 5.897 3.74e-09 ***
## ZIPCODE11233 3.495e+02 8.853e+01 3.947 7.91e-05 ***
## ZIPCODE11234 2.598e+03 2.305e+02 11.272 < 2e-16 ***
## ZIPCODE11235 7.476e+02 1.035e+02 7.224 5.13e-13 ***
## ZIPCODE11237 1.950e+02 8.625e+01 2.261 0.023777 *
## ZIPCODE11238 3.249e+02 8.656e+01 3.753 0.000175 ***
## ZIPCODE11249 2.655e+02 8.555e+01 3.103 0.001914 **
## ZIPCODE11354 6.947e+02 9.919e+01 7.004 2.52e-12 ***
## ZIPCODE11361 4.818e+02 9.351e+01 5.153 2.58e-07 ***
## ZIPCODE11368 8.090e+02 1.057e+02 7.652 2.03e-14 ***
## ZIPCODE11369 2.166e+02 8.668e+01 2.499 0.012456 *
## ZIPCODE11370 1.305e+02 8.580e+01 1.521 0.128165
## ZIPCODE11375 6.749e+02 9.571e+01 7.052 1.80e-12 ***
## ZIPCODE11377 8.980e+02 1.053e+02 8.528 < 2e-16 ***
## ZIPCODE11378 6.910e+02 1.018e+02 6.790 1.14e-11 ***
## ZIPCODE11379 5.680e+02 9.688e+01 5.863 4.58e-09 ***
## ZIPCODE11385 1.494e+03 1.486e+02 10.048 < 2e-16 ***
## ZIPCODE11418 2.332e+02 8.709e+01 2.678 0.007417 **
## ZIPCODE11432 6.225e+02 9.817e+01 6.341 2.31e-10 ***
## ZIPCODE11693 9.256e+01 8.982e+01 1.031 0.302766
## dayMonday -2.363e+01 1.789e+00 -13.207 < 2e-16 ***
## daySaturday 2.169e+01 1.768e+00 12.273 < 2e-16 ***
## daySunday 6.718e+00 1.758e+00 3.821 0.000133 ***
## dayThursday -9.756e+00 1.680e+00 -5.806 6.46e-09 ***
## dayTuesday -2.331e+01 1.663e+00 -14.020 < 2e-16 ***
## dayWednesday -1.755e+01 1.664e+00 -10.545 < 2e-16 ***
## hour1 -2.009e+01 3.011e+00 -6.671 2.58e-11 ***
## hour2 -4.278e+01 3.072e+00 -13.926 < 2e-16 ***
## hour3 -5.174e+01 3.082e+00 -16.786 < 2e-16 ***
## hour4 -5.920e+01 3.107e+00 -19.055 < 2e-16 ***
## hour5 -6.673e+01 3.093e+00 -21.578 < 2e-16 ***
## hour6 -4.762e+01 2.999e+00 -15.876 < 2e-16 ***
## hour7 -1.664e+01 2.938e+00 -5.662 1.51e-08 ***
## hour8 5.023e+00 2.912e+00 1.725 0.084512 .
## hour9 6.113e+00 2.924e+00 2.090 0.036585 *
## hour10 -6.430e+00 2.944e+00 -2.184 0.028997 *
## hour11 -9.678e+00 2.954e+00 -3.276 0.001052 **
## hour12 -1.024e+01 2.937e+00 -3.487 0.000489 ***
## hour13 -9.165e+00 2.928e+00 -3.130 0.001751 **
## hour14 1.551e-01 2.939e+00 0.053 0.957921
## hour15 3.792e+00 2.917e+00 1.300 0.193555
## hour16 1.078e+01 2.931e+00 3.676 0.000237 ***
## hour17 2.009e+01 2.956e+00 6.799 1.07e-11 ***
## hour18 2.940e+01 2.913e+00 10.090 < 2e-16 ***
## hour19 2.999e+01 2.932e+00 10.226 < 2e-16 ***
## hour20 2.591e+01 2.917e+00 8.881 < 2e-16 ***
## hour21 2.678e+01 2.935e+00 9.122 < 2e-16 ***
## hour22 2.439e+01 2.926e+00 8.335 < 2e-16 ***
## hour23 2.102e+01 2.936e+00 7.159 8.27e-13 ***
## apparentTemperature 3.485e-02 7.792e-02 0.447 0.654711
## summaryDrizzle 1.010e+01 1.140e+01 0.886 0.375841
## summaryFoggy 2.636e+01 5.288e+00 4.986 6.20e-07 ***
## summaryLight Rain 7.782e-01 2.648e+00 0.294 0.768876
## summaryLight Snow 7.160e+00 6.666e+00 1.074 0.282795
## summaryMostly Cloudy 1.680e+00 1.562e+00 1.075 0.282374
## summaryOvercast 1.637e-01 1.359e+00 0.120 0.904119
## summaryPartly Cloudy 2.300e+00 1.178e+00 1.954 0.050758 .
## summaryPossible Drizzle 8.866e-01 2.964e+00 0.299 0.764832
## summaryPossible Flurries 1.226e+01 8.229e+00 1.489 0.136434
## summaryPossible Light Rain 1.143e+01 4.286e+00 2.666 0.007676 **
## summaryPossible Light Snow 4.174e-01 1.140e+01 0.037 0.970786
## summaryRain -5.541e+00 3.679e+00 -1.506 0.132075
## AREA -1.361e-05 1.136e-06 -11.977 < 2e-16 ***
## POPULATION NA NA NA NA
## bar NA NA NA NA
## nightclub NA NA NA NA
## hospital NA NA NA NA
## social_facility NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 85.45 on 41290 degrees of freedom
## Multiple R-squared: 0.595, Adjusted R-squared: 0.5937
## F-statistic: 493.1 on 123 and 41290 DF, p-value: < 2.2e-16
Why do you think we get NA values ??
ols_pass$rank
## [1] 124
ncol(model.matrix(ols_pass))
## [1] 126
## Correlation Matrix
# Numerical
numeric <- regression1[, -c(1:5)]
M<-cor(numeric)
corrplot(M, method="pie")
What OLS assumption does this violate? Why is OLS regression unsuitable for this analysis? What type of regression should we employ instead?
regression2<- regression1[, c("total_passengers", "total_trip_distance", "ZIPCODE", "day", "hour", "apparentTemperature", "summary", "AREA", "number_activity")]
ols_pass1 <- lm(total_passengers ~ day+hour+apparentTemperature+summary+AREA+number_activity, data = regression2)
summary(ols_pass1)
##
## Call:
## lm(formula = total_passengers ~ day + hour + apparentTemperature +
## summary + AREA + number_activity, data = regression2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -138.78 -29.23 -12.26 8.71 719.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.116e+01 1.855e+00 22.186 < 2e-16 ***
## dayMonday -1.017e+01 1.207e+00 -8.431 < 2e-16 ***
## daySaturday 8.826e+00 1.193e+00 7.400 1.39e-13 ***
## daySunday 4.969e-01 1.186e+00 0.419 0.675314
## dayThursday -3.507e+00 1.134e+00 -3.093 0.001980 **
## dayTuesday -9.016e+00 1.122e+00 -8.036 9.56e-16 ***
## dayWednesday -6.694e+00 1.123e+00 -5.961 2.52e-09 ***
## hour1 -8.266e+00 2.031e+00 -4.069 4.72e-05 ***
## hour2 -1.804e+01 2.072e+00 -8.708 < 2e-16 ***
## hour3 -2.214e+01 2.079e+00 -10.652 < 2e-16 ***
## hour4 -2.758e+01 2.095e+00 -13.165 < 2e-16 ***
## hour5 -3.317e+01 2.085e+00 -15.905 < 2e-16 ***
## hour6 -2.870e+01 2.022e+00 -14.191 < 2e-16 ***
## hour7 -1.470e+01 1.982e+00 -7.417 1.22e-13 ***
## hour8 -3.114e+00 1.964e+00 -1.586 0.112768
## hour9 -2.016e+00 1.972e+00 -1.022 0.306696
## hour10 -5.770e+00 1.985e+00 -2.907 0.003652 **
## hour11 -6.181e+00 1.991e+00 -3.104 0.001910 **
## hour12 -6.730e+00 1.981e+00 -3.398 0.000679 ***
## hour13 -6.267e+00 1.975e+00 -3.173 0.001508 **
## hour14 -2.102e+00 1.982e+00 -1.061 0.288886
## hour15 5.096e-01 1.967e+00 0.259 0.795599
## hour16 5.562e+00 1.977e+00 2.813 0.004906 **
## hour17 1.286e+01 1.993e+00 6.452 1.12e-10 ***
## hour18 2.007e+01 1.964e+00 10.215 < 2e-16 ***
## hour19 2.005e+01 1.978e+00 10.140 < 2e-16 ***
## hour20 1.669e+01 1.968e+00 8.483 < 2e-16 ***
## hour21 1.518e+01 1.980e+00 7.667 1.80e-14 ***
## hour22 1.147e+01 1.973e+00 5.814 6.15e-09 ***
## hour23 9.114e+00 1.981e+00 4.600 4.23e-06 ***
## apparentTemperature -6.542e-02 5.258e-02 -1.244 0.213445
## summaryDrizzle 8.485e+00 7.697e+00 1.102 0.270266
## summaryFoggy 1.403e+01 3.568e+00 3.932 8.46e-05 ***
## summaryLight Rain 9.542e-01 1.787e+00 0.534 0.593394
## summaryLight Snow 1.288e+00 4.498e+00 0.286 0.774672
## summaryMostly Cloudy 7.030e-01 1.054e+00 0.667 0.504831
## summaryOvercast 5.444e-01 9.168e-01 0.594 0.552668
## summaryPartly Cloudy 1.127e+00 7.945e-01 1.419 0.155951
## summaryPossible Drizzle 1.110e+00 2.000e+00 0.555 0.579027
## summaryPossible Flurries 5.094e+00 5.553e+00 0.917 0.358992
## summaryPossible Light Rain 5.263e+00 2.892e+00 1.820 0.068818 .
## summaryPossible Light Snow -4.356e-01 7.692e+00 -0.057 0.954841
## summaryRain -8.283e-01 2.483e+00 -0.334 0.738666
## AREA -3.468e-07 1.021e-08 -33.978 < 2e-16 ***
## number_activity 3.925e+00 5.139e-02 76.372 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57.67 on 41369 degrees of freedom
## Multiple R-squared: 0.1891, Adjusted R-squared: 0.1883
## F-statistic: 219.3 on 44 and 41369 DF, p-value: < 2.2e-16
ols_distance1 <- lm(total_trip_distance ~ day+hour+apparentTemperature+summary+AREA+number_activity, data = regression2)
summary(ols_distance1)
##
## Call:
## lm(formula = total_trip_distance ~ day + hour + apparentTemperature +
## summary + AREA + number_activity, data = regression2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -302.46 -57.58 -26.29 17.17 1782.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.451e+01 3.907e+00 21.633 < 2e-16 ***
## dayMonday -2.241e+01 2.542e+00 -8.816 < 2e-16 ***
## daySaturday 1.988e+01 2.512e+00 7.914 2.55e-15 ***
## daySunday 5.493e+00 2.498e+00 2.199 0.027894 *
## dayThursday -8.901e+00 2.388e+00 -3.727 0.000194 ***
## dayTuesday -2.084e+01 2.363e+00 -8.820 < 2e-16 ***
## dayWednesday -1.617e+01 2.365e+00 -6.837 8.20e-12 ***
## hour1 -1.636e+01 4.278e+00 -3.824 0.000131 ***
## hour2 -3.624e+01 4.364e+00 -8.304 < 2e-16 ***
## hour3 -4.331e+01 4.378e+00 -9.892 < 2e-16 ***
## hour4 -5.029e+01 4.412e+00 -11.397 < 2e-16 ***
## hour5 -5.713e+01 4.391e+00 -13.009 < 2e-16 ***
## hour6 -4.357e+01 4.259e+00 -10.230 < 2e-16 ***
## hour7 -1.757e+01 4.173e+00 -4.209 2.57e-05 ***
## hour8 1.687e+00 4.136e+00 0.408 0.683358
## hour9 3.459e+00 4.153e+00 0.833 0.404922
## hour10 -8.739e+00 4.180e+00 -2.090 0.036587 *
## hour11 -1.072e+01 4.194e+00 -2.557 0.010567 *
## hour12 -1.216e+01 4.171e+00 -2.916 0.003543 **
## hour13 -1.186e+01 4.159e+00 -2.851 0.004360 **
## hour14 -3.084e+00 4.174e+00 -0.739 0.460110
## hour15 2.700e-01 4.143e+00 0.065 0.948032
## hour16 7.887e+00 4.164e+00 1.894 0.058207 .
## hour17 1.682e+01 4.198e+00 4.007 6.17e-05 ***
## hour18 2.589e+01 4.137e+00 6.259 3.91e-10 ***
## hour19 2.616e+01 4.165e+00 6.280 3.42e-10 ***
## hour20 2.286e+01 4.144e+00 5.517 3.46e-08 ***
## hour21 2.449e+01 4.169e+00 5.873 4.31e-09 ***
## hour22 2.158e+01 4.156e+00 5.193 2.08e-07 ***
## hour23 2.003e+01 4.172e+00 4.802 1.58e-06 ***
## apparentTemperature 1.949e-02 1.107e-01 0.176 0.860295
## summaryDrizzle 1.011e+01 1.621e+01 0.624 0.532637
## summaryFoggy 2.540e+01 7.514e+00 3.380 0.000725 ***
## summaryLight Rain 1.438e+00 3.764e+00 0.382 0.702334
## summaryLight Snow 4.989e+00 9.473e+00 0.527 0.598397
## summaryMostly Cloudy 6.954e-01 2.220e+00 0.313 0.754088
## summaryOvercast 1.159e-01 1.931e+00 0.060 0.952125
## summaryPartly Cloudy 2.316e+00 1.673e+00 1.384 0.166239
## summaryPossible Drizzle 5.636e-01 4.212e+00 0.134 0.893547
## summaryPossible Flurries 1.179e+01 1.169e+01 1.009 0.313209
## summaryPossible Light Rain 1.042e+01 6.091e+00 1.711 0.087044 .
## summaryPossible Light Snow -8.055e-01 1.620e+01 -0.050 0.960341
## summaryRain -4.963e+00 5.228e+00 -0.949 0.342511
## AREA -7.223e-07 2.149e-08 -33.606 < 2e-16 ***
## number_activity 8.566e+00 1.082e-01 79.154 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 121.5 on 41369 degrees of freedom
## Multiple R-squared: 0.1801, Adjusted R-squared: 0.1793
## F-statistic: 206.6 on 44 and 41369 DF, p-value: < 2.2e-16
$y_{it}=\beta_{1}x_{it} + a_{i} + u_{it}$
pdata <- regression1
pdata$time = as.numeric(pdata$time)
pdata$ZIPCODE = as.numeric(pdata$ZIPCODE)
pdata <- pdata.frame(pdata, index=c("ZIPCODE", "time"))
## Warning in pdata.frame(pdata, index = c("ZIPCODE", "time")): duplicate couples (id-time) in resulting pdata.frame
## to find out which, use e.g. table(index(your_pdataframe), useNA = "ifany")
#view(table(index(pdata), useNA = "ifany")) ## Remove duplicates
pdata <- pdata[pdata$time != 1456768800 & pdata$ZIPCODE != 14,]
pdata <- pdata[pdata$POPULATION > 0, ]
view(table(index(pdata), useNA = "ifany"))
re_passenger <- plm(total_passengers ~ summary+apparentTemperature+number_activity+day+hour, data = pdata, model= "random", index=c("ZIPCODE", "time"))
summary(re_passenger)
## Oneway (individual) effect Random Effect Model
## (Swamy-Arora's transformation)
##
## Call:
## plm(formula = total_passengers ~ summary + apparentTemperature +
## number_activity + day + hour, data = pdata, model = "random",
## index = c("ZIPCODE", "time"))
##
## Unbalanced Panel: n = 80, T = 1-743, N = 40575
##
## Effects:
## var std.dev share
## idiosyncratic 1585.12 39.81 0.476
## individual 1744.68 41.77 0.524
## theta:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3100 0.9628 0.9648 0.9620 0.9650 0.9651
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -214.45 -15.36 -2.47 0.27 11.26 679.59
##
## Coefficients:
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept) 20.7252535 6.2107584 3.3370 0.0008469 ***
## summaryDrizzle 8.5170708 5.3591308 1.5893 0.1120009
## summaryFoggy 14.3300273 2.4879975 5.7597 8.428e-09 ***
## summaryLight Rain 1.2285535 1.2676362 0.9692 0.3324609
## summaryLight Snow 2.5365655 3.1404665 0.8077 0.4192613
## summaryMostly Cloudy 1.2280465 0.7349091 1.6710 0.0947181 .
## summaryOvercast 0.3738778 0.6390918 0.5850 0.5585382
## summaryPartly Cloudy 1.2029922 0.5540708 2.1712 0.0299169 *
## summaryPossible Drizzle 1.3477698 1.3934907 0.9672 0.3334492
## summaryPossible Flurries 5.7190933 3.8678703 1.4786 0.1392431
## summaryPossible Light Rain 6.1635497 2.0155236 3.0580 0.0022279 **
## summaryPossible Light Snow 0.0040267 5.3560518 0.0008 0.9994001
## summaryRain -0.9666104 1.7296746 -0.5588 0.5762713
## apparentTemperature -0.0525022 0.0366650 -1.4319 0.1521599
## number_activity 2.9890804 0.8602671 3.4746 0.0005116 ***
## dayMonday -10.9027564 0.8416146 -12.9546 < 2.2e-16 ***
## daySaturday 9.7296553 0.8314843 11.7015 < 2.2e-16 ***
## daySunday 0.9395213 0.8271222 1.1359 0.2560019
## dayThursday -4.0963715 0.7904723 -5.1822 2.193e-07 ***
## dayTuesday -10.2183666 0.7829472 -13.0512 < 2.2e-16 ***
## dayWednesday -7.4585241 0.7827420 -9.5287 < 2.2e-16 ***
## hour1 -9.3562197 1.4275756 -6.5539 5.605e-11 ***
## hour2 -20.9162115 1.4455307 -14.4696 < 2.2e-16 ***
## hour3 -25.8345536 1.4513402 -17.8005 < 2.2e-16 ***
## hour4 -31.6431551 1.4626907 -21.6335 < 2.2e-16 ***
## hour5 -37.7876954 1.4561885 -25.9497 < 2.2e-16 ***
## hour6 -31.0390598 1.4121230 -21.9804 < 2.2e-16 ***
## hour7 -15.1945579 1.3817725 -10.9964 < 2.2e-16 ***
## hour8 -2.6661926 1.3694037 -1.9470 0.0515379 .
## hour9 -1.9770392 1.3755348 -1.4373 0.1506363
## hour10 -5.9414749 1.3847556 -4.2906 1.782e-05 ***
## hour11 -6.9487518 1.3891331 -5.0022 5.667e-07 ***
## hour12 -6.8868992 1.3809785 -4.9870 6.133e-07 ***
## hour13 -5.9914853 1.3769275 -4.3513 1.353e-05 ***
## hour14 -1.8305943 1.3822849 -1.3243 0.1853952
## hour15 0.8106615 1.3713395 0.5911 0.5544228
## hour16 5.5540364 1.3786451 4.0286 5.611e-05 ***
## hour17 12.8996199 1.3901754 9.2791 < 2.2e-16 ***
## hour18 19.8383764 1.3695387 14.4854 < 2.2e-16 ***
## hour19 20.2586946 1.3789899 14.6910 < 2.2e-16 ***
## hour20 16.8369249 1.3720747 12.2711 < 2.2e-16 ***
## hour21 14.9913583 1.3799659 10.8636 < 2.2e-16 ***
## hour22 12.0620431 1.3755012 8.7692 < 2.2e-16 ***
## hour23 9.4580229 1.3811241 6.8481 7.486e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 76121000
## Residual Sum of Squares: 64237000
## R-Squared: 0.15616
## Adj. R-Squared: 0.15527
## Chisq: 7510.15 on 43 DF, p-value: < 2.22e-16
\(\sum^{n}_{i=1} \left(y_i-\beta_0-\sum^{p}_{j=1} \beta_j x_{ij}\right)^2 + \lambda \underbrace{\sum^{p}_{j=1} \mid{\beta_j}\mid}_{\ell^1-penalisation}\)
#Partition Data into Training and Test Data:
lasso <- data.matrix(regression1)
smp_size <- floor(0.75 * nrow(lasso))
set.seed(6789)
train_ind <- sample(seq_len(nrow(lasso)), size = smp_size)
length(lasso)
## [1] 662624
x <- data.matrix(lasso[, -10])
y <- data.matrix(lasso[, 10])
train.x <- data.matrix(x[train_ind, ])
test.x <- data.matrix(x[-train_ind, ])
train.y <- data.matrix(y[train_ind, ])
test.y <- data.matrix(y[-train_ind, ])
Next we create an empty list to store our resutls. This will create a space for all our different values of alpha which we will run 10 iterations for.Then we will run the regression for different alpha values.
list.of.fits <- list()
for (i in 0:10) {
fit.name <- paste0("alpha", i/10)
list.of.fits[[fit.name]] <-
cv.glmnet(train.x, train.y, type.measure="mse", alpha=i/10,
family="gaussian")
}
Now we see which alpha (0, 0.1, … , 0.9, 1) does the best job atpredicting the values in the Testing dataset.
results <- data.frame()
for (i in 0:10) {
fit.name <- paste0("alpha", i/10)
## Use each model to predict 'y' given the Testing dataset
predicted <-
predict(list.of.fits[[fit.name]],
s=list.of.fits[[fit.name]]$lambda.1se, newx=test.x)
## Calculate the Mean Squared Error...
mse <- mean((test.y - predicted)^2)
## Store the results
temp <- data.frame(alpha=i/10, mse=mse, fit.name=fit.name)
results <- rbind(results, temp)
}
View Results
results
## alpha mse fit.name
## 1 0.0 1233.375 alpha0
## 2 0.1 1075.381 alpha0.1
## 3 0.2 1075.798 alpha0.2
## 4 0.3 1060.052 alpha0.3
## 5 0.4 1075.950 alpha0.4
## 6 0.5 1060.970 alpha0.5
## 7 0.6 1060.612 alpha0.6
## 8 0.7 1057.216 alpha0.7
## 9 0.8 1081.844 alpha0.8
## 10 0.9 1053.314 alpha0.9
## 11 1.0 1069.142 alpha1