In this section we will model our refined data to find a regression model that explains the

library(glmnet)
library(lme4)
library(readr)
library(Hmisc)
library(osmdata)
library(darksky)
library(ggplot2)
library(tidyverse)
library(rgdal)
library(rgeos)
library(tmap)
library(leaflet)
library(RColorBrewer)
library(sp)
library(spatialEco)
library(mapview)
library(sf)
library(dplyr)
library(tidyr)
library(plm)
library(corrplot)
wd = "~/Uni Temp Work/Big Data"
plsep <- .Platform$file.sep
regression <- read.csv(paste(wd, plsep, "regression.csv", sep =""))
regression1 <- regression[complete.cases(regression),]
factors <- c("hour", "day", "ZIPCODE","summary")
numeric <- c("apparentTemperature", "POPULATION", "AREA", "bar", "nightclub", "attraction", "hospital", "social_facility", "total_passengers")
regression1[, factors] <- lapply(regression1[factors], as.factor)
regression1[, numeric] <- lapply(regression1[numeric], as.numeric)
regression1$time = as.POSIXct(regression1$time,format = "%m/%d/%Y %H")
regression1 <- regression1[regression1$POPULATION > 0 ,]
regression1 <- regression1[!duplicated(c("regression1$ZIPCODE","regression1$time")), ]
regression1$number_activity <- regression1$bar + regression1$nightclub + regression1$attraction + regression1$hospital +regression1$social_facility
summary(regression1)
##     ZIPCODE           time                                 summary     
##  10035  :  796   Min.   :2016-03-01 00:00:00   Clear           :15644  
##  11103  :  744   1st Qu.:2016-03-08 18:00:00   Partly Cloudy   :10575  
##  11106  :  744   Median :2016-03-16 13:00:00   Overcast        : 6901  
##  11201  :  744   Mean   :2016-03-16 11:52:48   Mostly Cloudy   : 4398  
##  11211  :  744   3rd Qu.:2016-03-24 06:00:00   Light Rain      : 1193  
##  11215  :  744   Max.   :2016-03-31 23:00:00   Possible Drizzle:  940  
##  (Other):36898                                 (Other)         : 1763  
##         day            hour       apparentTemperature   POPULATION    
##  Friday   :5431   15     : 1855   Min.   :-7.680      Min.   :  2349  
##  Monday   :5266   18     : 1850   1st Qu.: 2.930      1st Qu.: 38234  
##  Saturday :5590   17     : 1849   Median : 7.320      Median : 57010  
##  Sunday   :5473   19     : 1848   Mean   : 7.529      Mean   : 58491  
##  Thursday :6636   16     : 1840   3rd Qu.:11.450      3rd Qu.: 76130  
##  Tuesday  :6453   14     : 1835   Max.   :24.710      Max.   :109069  
##  Wednesday:6565   (Other):30337                                       
##       AREA           total_passengers total_trip_distance      bar        
##  Min.   :   581237   Min.   :  0.00   Min.   :   0.01     Min.   : 0.000  
##  1st Qu.: 23159566   1st Qu.:  5.00   1st Qu.:  11.76     1st Qu.: 0.000  
##  Median : 37640607   Median : 14.00   Median :  33.33     Median : 1.000  
##  Mean   : 41193470   Mean   : 39.57   Mean   :  83.61     Mean   : 3.323  
##  3rd Qu.: 55879065   3rd Qu.: 48.00   3rd Qu.: 101.89     3rd Qu.: 5.000  
##  Max.   :215835794   Max.   :761.00   Max.   :1877.27     Max.   :27.000  
##                                                                           
##    nightclub        attraction        hospital      social_facility 
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.3309   Mean   :0.1533   Mean   :0.1842   Mean   :0.4841  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :5.0000   Max.   :3.0000   Max.   :3.0000   Max.   :4.0000  
##                                                                     
##  number_activity 
##  Min.   : 1.000  
##  1st Qu.: 1.000  
##  Median : 2.000  
##  Mean   : 4.476  
##  3rd Qu.: 6.000  
##  Max.   :29.000  
## 

1) Ordinary Least Square Regression

\(Y = X\beta + \epsilon\)

Assumptions: 1) \(E[\epsilon] = 0\) 2) \(V[\epsilon] = \sigma^2\) 3) \(Cov[\epsilon_i, \epsilon_j] = 0\) \(\forall\) \(i \neq j\)

If these assumptions are found to be true, the Gauss-Markov theorem states that OLS is the best linear unbiased estimator for the dataset (BLUE).

ols_pass <-  lm(total_passengers ~ ZIPCODE+day+hour+apparentTemperature+summary+AREA+POPULATION+bar+nightclub+hospital+social_facility, data = regression1)

ols_pass <-  lm(total_passengers ~ ZIPCODE+day+hour+apparentTemperature+summary+AREA+POPULATION+bar, data = regression1)
summary(ols_pass)
## 
## Call:
## lm(formula = total_passengers ~ ZIPCODE + day + hour + apparentTemperature + 
##     summary + AREA + POPULATION + bar, data = regression1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -219.37  -15.66   -2.38   11.87  678.61 
## 
## Coefficients: (2 not defined because of singularities)
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 1.240e+02  4.125e+01   3.006 0.002647 ** 
## ZIPCODE10002                7.544e+01  4.936e+01   1.528 0.126452    
## ZIPCODE10007               -6.791e+01  5.716e+01  -1.188 0.234797    
## ZIPCODE10016               -2.950e+01  5.678e+01  -0.520 0.603406    
## ZIPCODE10018               -5.301e+01  5.690e+01  -0.932 0.351457    
## ZIPCODE10021               -4.128e+01  5.690e+01  -0.725 0.468150    
## ZIPCODE10022                3.878e-01  5.683e+01   0.007 0.994556    
## ZIPCODE10024                4.934e+01  4.034e+01   1.223 0.221330    
## ZIPCODE10025                7.421e+01  4.018e+01   1.847 0.064759 .  
## ZIPCODE10027                2.218e+02  4.034e+01   5.498 3.86e-08 ***
## ZIPCODE10029                2.050e+02  4.026e+01   5.091 3.58e-07 ***
## ZIPCODE10031                5.662e+01  4.017e+01   1.410 0.158682    
## ZIPCODE10032                1.039e+02  4.027e+01   2.581 0.009867 ** 
## ZIPCODE10035                9.978e+01  4.018e+01   2.483 0.013020 *  
## ZIPCODE10038               -4.377e+01  5.706e+01  -0.767 0.442997    
## ZIPCODE10040                1.169e+01  4.018e+01   0.291 0.770989    
## ZIPCODE10044               -7.616e+01  4.068e+01  -1.872 0.061203 .  
## ZIPCODE10075               -9.741e+01  5.719e+01  -1.703 0.088539 .  
## ZIPCODE10128               -3.044e+01  4.034e+01  -0.755 0.450451    
## ZIPCODE10301                6.599e+02  6.200e+01  10.643  < 2e-16 ***
## ZIPCODE10304                6.043e+02  5.912e+01  10.222  < 2e-16 ***
## ZIPCODE10309                1.494e+03  1.199e+02  12.455  < 2e-16 ***
## ZIPCODE10310                2.823e+02  5.272e+01   5.356 8.56e-08 ***
## ZIPCODE10452                9.842e+01  4.050e+01   2.430 0.015109 *  
## ZIPCODE10455                3.796e+01  4.018e+01   0.945 0.344826    
## ZIPCODE10456                1.112e+02  4.069e+01   2.732 0.006299 ** 
## ZIPCODE10457                1.653e+02  4.154e+01   3.980 6.90e-05 ***
## ZIPCODE10461                3.547e+02  4.680e+01   7.579 3.55e-14 ***
## ZIPCODE10466                2.866e+02  4.492e+01   6.381 1.78e-10 ***
## ZIPCODE10467                4.010e+02  4.868e+01   8.239  < 2e-16 ***
## ZIPCODE10468                1.445e+02  4.114e+01   3.512 0.000446 ***
## ZIPCODE10469                3.844e+02  4.831e+01   7.958 1.80e-15 ***
## ZIPCODE10470                3.805e+01  4.044e+01   0.941 0.346773    
## ZIPCODE11040               -9.242e+01  4.530e+01  -2.040 0.041332 *  
## ZIPCODE11101                5.491e+02  5.175e+01  10.611  < 2e-16 ***
## ZIPCODE11102                6.115e+01  4.017e+01   1.522 0.127996    
## ZIPCODE11103                1.112e+02  4.019e+01   2.767 0.005665 ** 
## ZIPCODE11104               -6.049e+00  4.028e+01  -0.150 0.880642    
## ZIPCODE11105                2.802e+02  4.344e+01   6.452 1.12e-10 ***
## ZIPCODE11106                1.081e+02  4.030e+01   2.683 0.007289 ** 
## ZIPCODE11109               -1.118e+02  4.121e+01  -2.712 0.006697 ** 
## ZIPCODE11201                4.212e+02  4.205e+01  10.019  < 2e-16 ***
## ZIPCODE11203                3.345e+02  4.622e+01   7.239 4.61e-13 ***
## ZIPCODE11204                2.015e+02  4.256e+01   4.736 2.19e-06 ***
## ZIPCODE11205                1.063e+02  4.027e+01   2.640 0.008296 ** 
## ZIPCODE11206                2.163e+02  4.198e+01   5.152 2.58e-07 ***
## ZIPCODE11207                4.341e+02  5.011e+01   8.661  < 2e-16 ***
## ZIPCODE11209                3.086e+02  4.551e+01   6.781 1.21e-11 ***
## ZIPCODE11211                2.917e+02  4.195e+01   6.953 3.62e-12 ***
## ZIPCODE11212                1.955e+02  4.219e+01   4.635 3.59e-06 ***
## ZIPCODE11215                4.224e+02  4.656e+01   9.072  < 2e-16 ***
## ZIPCODE11216                1.339e+02  4.043e+01   3.313 0.000924 ***
## ZIPCODE11217                1.561e+02  4.022e+01   3.881 0.000104 ***
## ZIPCODE11218                1.569e+02  4.144e+01   3.787 0.000153 ***
## ZIPCODE11220                2.319e+02  4.319e+01   5.370 7.92e-08 ***
## ZIPCODE11221                1.859e+02  4.167e+01   4.460 8.21e-06 ***
## ZIPCODE11222                2.570e+02  4.233e+01   6.072 1.27e-09 ***
## ZIPCODE11224                2.276e+02  4.303e+01   5.290 1.23e-07 ***
## ZIPCODE11225                7.455e+01  4.029e+01   1.850 0.064294 .  
## ZIPCODE11226                1.870e+02  4.179e+01   4.474 7.69e-06 ***
## ZIPCODE11231                2.162e+02  4.178e+01   5.173 2.31e-07 ***
## ZIPCODE11232                3.000e+02  4.501e+01   6.665 2.68e-11 ***
## ZIPCODE11233                1.810e+02  4.157e+01   4.355 1.34e-05 ***
## ZIPCODE11234                1.409e+03  1.082e+02  13.021  < 2e-16 ***
## ZIPCODE11235                4.019e+02  4.859e+01   8.271  < 2e-16 ***
## ZIPCODE11237                9.631e+01  4.050e+01   2.378 0.017409 *  
## ZIPCODE11238                1.590e+02  4.064e+01   3.912 9.19e-05 ***
## ZIPCODE11249                1.098e+02  4.017e+01   2.733 0.006269 ** 
## ZIPCODE11354                3.632e+02  4.657e+01   7.800 6.36e-15 ***
## ZIPCODE11361                2.558e+02  4.391e+01   5.826 5.72e-09 ***
## ZIPCODE11368                4.367e+02  4.964e+01   8.797  < 2e-16 ***
## ZIPCODE11369                1.120e+02  4.070e+01   2.752 0.005918 ** 
## ZIPCODE11370                6.654e+01  4.029e+01   1.652 0.098640 .  
## ZIPCODE11375                3.540e+02  4.494e+01   7.878 3.41e-15 ***
## ZIPCODE11377                5.014e+02  4.944e+01  10.143  < 2e-16 ***
## ZIPCODE11378                3.707e+02  4.778e+01   7.758 8.80e-15 ***
## ZIPCODE11379                3.036e+02  4.549e+01   6.674 2.53e-11 ***
## ZIPCODE11385                8.080e+02  6.980e+01  11.576  < 2e-16 ***
## ZIPCODE11418                1.207e+02  4.089e+01   2.953 0.003151 ** 
## ZIPCODE11432                3.305e+02  4.610e+01   7.170 7.61e-13 ***
## ZIPCODE11693                4.275e+01  4.217e+01   1.014 0.310711    
## dayMonday                  -1.081e+01  8.401e-01 -12.864  < 2e-16 ***
## daySaturday                 9.725e+00  8.300e-01  11.717  < 2e-16 ***
## daySunday                   1.088e+00  8.255e-01   1.317 0.187723    
## dayThursday                -3.943e+00  7.890e-01  -4.998 5.82e-07 ***
## dayTuesday                 -1.028e+01  7.808e-01 -13.169  < 2e-16 ***
## dayWednesday               -7.400e+00  7.814e-01  -9.470  < 2e-16 ***
## hour1                      -1.013e+01  1.414e+00  -7.167 7.79e-13 ***
## hour2                      -2.137e+01  1.442e+00 -14.817  < 2e-16 ***
## hour3                      -2.642e+01  1.447e+00 -18.256  < 2e-16 ***
## hour4                      -3.213e+01  1.459e+00 -22.025  < 2e-16 ***
## hour5                      -3.810e+01  1.452e+00 -26.237  < 2e-16 ***
## hour6                      -3.080e+01  1.408e+00 -21.873  < 2e-16 ***
## hour7                      -1.418e+01  1.380e+00 -10.275  < 2e-16 ***
## hour8                      -1.378e+00  1.367e+00  -1.008 0.313608    
## hour9                      -6.469e-01  1.373e+00  -0.471 0.637535    
## hour10                     -4.564e+00  1.383e+00  -3.301 0.000963 ***
## hour11                     -5.606e+00  1.387e+00  -4.042 5.31e-05 ***
## hour12                     -5.718e+00  1.379e+00  -4.147 3.37e-05 ***
## hour13                     -4.869e+00  1.375e+00  -3.541 0.000399 ***
## hour14                     -4.167e-01  1.380e+00  -0.302 0.762692    
## hour15                      2.354e+00  1.370e+00   1.719 0.085613 .  
## hour16                      7.090e+00  1.376e+00   5.152 2.59e-07 ***
## hour17                      1.458e+01  1.388e+00  10.508  < 2e-16 ***
## hour18                      2.191e+01  1.368e+00  16.017  < 2e-16 ***
## hour19                      2.202e+01  1.377e+00  15.994  < 2e-16 ***
## hour20                      1.829e+01  1.370e+00  13.349  < 2e-16 ***
## hour21                      1.638e+01  1.378e+00  11.884  < 2e-16 ***
## hour22                      1.291e+01  1.374e+00   9.395  < 2e-16 ***
## hour23                      9.613e+00  1.379e+00   6.973 3.16e-12 ***
## apparentTemperature        -5.761e-02  3.659e-02  -1.574 0.115392    
## summaryDrizzle              8.421e+00  5.355e+00   1.572 0.115844    
## summaryFoggy                1.452e+01  2.483e+00   5.849 4.99e-09 ***
## summaryLight Rain           6.134e-01  1.243e+00   0.493 0.621814    
## summaryLight Snow           2.321e+00  3.130e+00   0.741 0.458456    
## summaryMostly Cloudy        1.203e+00  7.336e-01   1.640 0.101016    
## summaryOvercast             5.642e-01  6.380e-01   0.884 0.376476    
## summaryPartly Cloudy        1.104e+00  5.529e-01   1.996 0.045899 *  
## summaryPossible Drizzle     1.310e+00  1.392e+00   0.941 0.346570    
## summaryPossible Flurries    5.413e+00  3.864e+00   1.401 0.161285    
## summaryPossible Light Rain  5.842e+00  2.013e+00   2.903 0.003701 ** 
## summaryPossible Light Snow  1.730e-01  5.352e+00   0.032 0.974213    
## summaryRain                -1.098e+00  1.727e+00  -0.636 0.525095    
## AREA                       -7.420e-06  5.335e-07 -13.910  < 2e-16 ***
## POPULATION                         NA         NA      NA       NA    
## bar                                NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.12 on 41290 degrees of freedom
## Multiple R-squared:  0.6083, Adjusted R-squared:  0.6071 
## F-statistic: 521.3 on 123 and 41290 DF,  p-value: < 2.2e-16
ols_distance <-  lm(total_trip_distance ~ ZIPCODE+day+hour+apparentTemperature+summary+AREA+POPULATION+bar+nightclub+hospital+social_facility, data = regression1)

summary(ols_distance)
## 
## Call:
## lm(formula = total_trip_distance ~ ZIPCODE + day + hour + apparentTemperature + 
##     summary + AREA + POPULATION + bar + nightclub + hospital + 
##     social_facility, data = regression1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -522.01  -29.76   -4.76   22.53 1668.85 
## 
## Coefficients: (5 not defined because of singularities)
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 2.183e+02  8.785e+01   2.485 0.012953 *  
## ZIPCODE10002                1.436e+02  1.051e+02   1.366 0.171889    
## ZIPCODE10007               -1.157e+02  1.217e+02  -0.950 0.341950    
## ZIPCODE10016               -3.314e+01  1.209e+02  -0.274 0.784066    
## ZIPCODE10018               -8.368e+01  1.212e+02  -0.691 0.489825    
## ZIPCODE10021               -7.420e+01  1.212e+02  -0.612 0.540378    
## ZIPCODE10022               -7.752e-01  1.210e+02  -0.006 0.994890    
## ZIPCODE10024                1.018e+02  8.592e+01   1.184 0.236288    
## ZIPCODE10025                1.546e+02  8.557e+01   1.806 0.070850 .  
## ZIPCODE10027                4.250e+02  8.590e+01   4.947 7.57e-07 ***
## ZIPCODE10029                3.851e+02  8.575e+01   4.491 7.10e-06 ***
## ZIPCODE10031                1.350e+02  8.555e+01   1.578 0.114590    
## ZIPCODE10032                2.655e+02  8.576e+01   3.096 0.001965 ** 
## ZIPCODE10035                1.962e+02  8.557e+01   2.293 0.021848 *  
## ZIPCODE10038               -7.955e+01  1.215e+02  -0.655 0.512724    
## ZIPCODE10040                4.025e+01  8.557e+01   0.470 0.638032    
## ZIPCODE10044               -1.278e+02  8.664e+01  -1.475 0.140092    
## ZIPCODE10075               -1.576e+02  1.218e+02  -1.294 0.195656    
## ZIPCODE10128               -4.586e+01  8.591e+01  -0.534 0.593435    
## ZIPCODE10301                1.225e+03  1.320e+02   9.275  < 2e-16 ***
## ZIPCODE10304                1.126e+03  1.259e+02   8.941  < 2e-16 ***
## ZIPCODE10309                2.781e+03  2.554e+02  10.888  < 2e-16 ***
## ZIPCODE10310                5.248e+02  1.123e+02   4.675 2.95e-06 ***
## ZIPCODE10452                1.932e+02  8.626e+01   2.239 0.025143 *  
## ZIPCODE10455                7.996e+01  8.558e+01   0.934 0.350134    
## ZIPCODE10456                2.151e+02  8.665e+01   2.483 0.013044 *  
## ZIPCODE10457                3.147e+02  8.847e+01   3.558 0.000375 ***
## ZIPCODE10461                6.637e+02  9.968e+01   6.658 2.81e-11 ***
## ZIPCODE10466                5.391e+02  9.567e+01   5.635 1.76e-08 ***
## ZIPCODE10467                7.549e+02  1.037e+02   7.281 3.36e-13 ***
## ZIPCODE10468                2.767e+02  8.762e+01   3.158 0.001592 ** 
## ZIPCODE10469                7.168e+02  1.029e+02   6.966 3.30e-12 ***
## ZIPCODE10470                8.357e+01  8.612e+01   0.970 0.331873    
## ZIPCODE11040               -1.443e+02  9.648e+01  -1.495 0.134825    
## ZIPCODE11101                1.043e+03  1.102e+02   9.467  < 2e-16 ***
## ZIPCODE11102                1.190e+02  8.556e+01   1.390 0.164407    
## ZIPCODE11103                2.021e+02  8.559e+01   2.361 0.018223 *  
## ZIPCODE11104                5.182e+00  8.579e+01   0.060 0.951839    
## ZIPCODE11105                5.173e+02  9.251e+01   5.592 2.26e-08 ***
## ZIPCODE11106                2.036e+02  8.582e+01   2.373 0.017672 *  
## ZIPCODE11109               -1.886e+02  8.777e+01  -2.149 0.031630 *  
## ZIPCODE11201                9.012e+02  8.955e+01  10.065  < 2e-16 ***
## ZIPCODE11203                6.268e+02  9.843e+01   6.368 1.93e-10 ***
## ZIPCODE11204                3.819e+02  9.064e+01   4.214 2.52e-05 ***
## ZIPCODE11205                2.203e+02  8.577e+01   2.568 0.010233 *  
## ZIPCODE11206                4.147e+02  8.942e+01   4.638 3.53e-06 ***
## ZIPCODE11207                8.119e+02  1.067e+02   7.608 2.85e-14 ***
## ZIPCODE11209                5.818e+02  9.692e+01   6.003 1.95e-09 ***
## ZIPCODE11211                5.731e+02  8.934e+01   6.415 1.42e-10 ***
## ZIPCODE11212                3.713e+02  8.985e+01   4.133 3.59e-05 ***
## ZIPCODE11215                7.939e+02  9.915e+01   8.007 1.21e-15 ***
## ZIPCODE11216                2.743e+02  8.611e+01   3.185 0.001446 ** 
## ZIPCODE11217                3.331e+02  8.566e+01   3.888 0.000101 ***
## ZIPCODE11218                3.021e+02  8.826e+01   3.423 0.000619 ***
## ZIPCODE11220                4.384e+02  9.198e+01   4.767 1.88e-06 ***
## ZIPCODE11221                3.592e+02  8.874e+01   4.048 5.17e-05 ***
## ZIPCODE11222                5.024e+02  9.015e+01   5.574 2.51e-08 ***
## ZIPCODE11224                4.392e+02  9.165e+01   4.793 1.65e-06 ***
## ZIPCODE11225                1.559e+02  8.581e+01   1.817 0.069222 .  
## ZIPCODE11226                3.627e+02  8.900e+01   4.075 4.61e-05 ***
## ZIPCODE11231                4.334e+02  8.899e+01   4.870 1.12e-06 ***
## ZIPCODE11232                5.653e+02  9.587e+01   5.897 3.74e-09 ***
## ZIPCODE11233                3.495e+02  8.853e+01   3.947 7.91e-05 ***
## ZIPCODE11234                2.598e+03  2.305e+02  11.272  < 2e-16 ***
## ZIPCODE11235                7.476e+02  1.035e+02   7.224 5.13e-13 ***
## ZIPCODE11237                1.950e+02  8.625e+01   2.261 0.023777 *  
## ZIPCODE11238                3.249e+02  8.656e+01   3.753 0.000175 ***
## ZIPCODE11249                2.655e+02  8.555e+01   3.103 0.001914 ** 
## ZIPCODE11354                6.947e+02  9.919e+01   7.004 2.52e-12 ***
## ZIPCODE11361                4.818e+02  9.351e+01   5.153 2.58e-07 ***
## ZIPCODE11368                8.090e+02  1.057e+02   7.652 2.03e-14 ***
## ZIPCODE11369                2.166e+02  8.668e+01   2.499 0.012456 *  
## ZIPCODE11370                1.305e+02  8.580e+01   1.521 0.128165    
## ZIPCODE11375                6.749e+02  9.571e+01   7.052 1.80e-12 ***
## ZIPCODE11377                8.980e+02  1.053e+02   8.528  < 2e-16 ***
## ZIPCODE11378                6.910e+02  1.018e+02   6.790 1.14e-11 ***
## ZIPCODE11379                5.680e+02  9.688e+01   5.863 4.58e-09 ***
## ZIPCODE11385                1.494e+03  1.486e+02  10.048  < 2e-16 ***
## ZIPCODE11418                2.332e+02  8.709e+01   2.678 0.007417 ** 
## ZIPCODE11432                6.225e+02  9.817e+01   6.341 2.31e-10 ***
## ZIPCODE11693                9.256e+01  8.982e+01   1.031 0.302766    
## dayMonday                  -2.363e+01  1.789e+00 -13.207  < 2e-16 ***
## daySaturday                 2.169e+01  1.768e+00  12.273  < 2e-16 ***
## daySunday                   6.718e+00  1.758e+00   3.821 0.000133 ***
## dayThursday                -9.756e+00  1.680e+00  -5.806 6.46e-09 ***
## dayTuesday                 -2.331e+01  1.663e+00 -14.020  < 2e-16 ***
## dayWednesday               -1.755e+01  1.664e+00 -10.545  < 2e-16 ***
## hour1                      -2.009e+01  3.011e+00  -6.671 2.58e-11 ***
## hour2                      -4.278e+01  3.072e+00 -13.926  < 2e-16 ***
## hour3                      -5.174e+01  3.082e+00 -16.786  < 2e-16 ***
## hour4                      -5.920e+01  3.107e+00 -19.055  < 2e-16 ***
## hour5                      -6.673e+01  3.093e+00 -21.578  < 2e-16 ***
## hour6                      -4.762e+01  2.999e+00 -15.876  < 2e-16 ***
## hour7                      -1.664e+01  2.938e+00  -5.662 1.51e-08 ***
## hour8                       5.023e+00  2.912e+00   1.725 0.084512 .  
## hour9                       6.113e+00  2.924e+00   2.090 0.036585 *  
## hour10                     -6.430e+00  2.944e+00  -2.184 0.028997 *  
## hour11                     -9.678e+00  2.954e+00  -3.276 0.001052 ** 
## hour12                     -1.024e+01  2.937e+00  -3.487 0.000489 ***
## hour13                     -9.165e+00  2.928e+00  -3.130 0.001751 ** 
## hour14                      1.551e-01  2.939e+00   0.053 0.957921    
## hour15                      3.792e+00  2.917e+00   1.300 0.193555    
## hour16                      1.078e+01  2.931e+00   3.676 0.000237 ***
## hour17                      2.009e+01  2.956e+00   6.799 1.07e-11 ***
## hour18                      2.940e+01  2.913e+00  10.090  < 2e-16 ***
## hour19                      2.999e+01  2.932e+00  10.226  < 2e-16 ***
## hour20                      2.591e+01  2.917e+00   8.881  < 2e-16 ***
## hour21                      2.678e+01  2.935e+00   9.122  < 2e-16 ***
## hour22                      2.439e+01  2.926e+00   8.335  < 2e-16 ***
## hour23                      2.102e+01  2.936e+00   7.159 8.27e-13 ***
## apparentTemperature         3.485e-02  7.792e-02   0.447 0.654711    
## summaryDrizzle              1.010e+01  1.140e+01   0.886 0.375841    
## summaryFoggy                2.636e+01  5.288e+00   4.986 6.20e-07 ***
## summaryLight Rain           7.782e-01  2.648e+00   0.294 0.768876    
## summaryLight Snow           7.160e+00  6.666e+00   1.074 0.282795    
## summaryMostly Cloudy        1.680e+00  1.562e+00   1.075 0.282374    
## summaryOvercast             1.637e-01  1.359e+00   0.120 0.904119    
## summaryPartly Cloudy        2.300e+00  1.178e+00   1.954 0.050758 .  
## summaryPossible Drizzle     8.866e-01  2.964e+00   0.299 0.764832    
## summaryPossible Flurries    1.226e+01  8.229e+00   1.489 0.136434    
## summaryPossible Light Rain  1.143e+01  4.286e+00   2.666 0.007676 ** 
## summaryPossible Light Snow  4.174e-01  1.140e+01   0.037 0.970786    
## summaryRain                -5.541e+00  3.679e+00  -1.506 0.132075    
## AREA                       -1.361e-05  1.136e-06 -11.977  < 2e-16 ***
## POPULATION                         NA         NA      NA       NA    
## bar                                NA         NA      NA       NA    
## nightclub                          NA         NA      NA       NA    
## hospital                           NA         NA      NA       NA    
## social_facility                    NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 85.45 on 41290 degrees of freedom
## Multiple R-squared:  0.595,  Adjusted R-squared:  0.5937 
## F-statistic: 493.1 on 123 and 41290 DF,  p-value: < 2.2e-16

Why do you think we get NA values ??

ols_pass$rank
## [1] 124
ncol(model.matrix(ols_pass))
## [1] 126
## Correlation Matrix 
# Numerical 
numeric <- regression1[, -c(1:5)]
M<-cor(numeric)
corrplot(M, method="pie")

What OLS assumption does this violate? Why is OLS regression unsuitable for this analysis? What type of regression should we employ instead?

regression2<- regression1[, c("total_passengers", "total_trip_distance", "ZIPCODE", "day", "hour", "apparentTemperature", "summary", "AREA", "number_activity")]
ols_pass1 <-  lm(total_passengers ~ day+hour+apparentTemperature+summary+AREA+number_activity, data = regression2)
summary(ols_pass1)
## 
## Call:
## lm(formula = total_passengers ~ day + hour + apparentTemperature + 
##     summary + AREA + number_activity, data = regression2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -138.78  -29.23  -12.26    8.71  719.28 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 4.116e+01  1.855e+00  22.186  < 2e-16 ***
## dayMonday                  -1.017e+01  1.207e+00  -8.431  < 2e-16 ***
## daySaturday                 8.826e+00  1.193e+00   7.400 1.39e-13 ***
## daySunday                   4.969e-01  1.186e+00   0.419 0.675314    
## dayThursday                -3.507e+00  1.134e+00  -3.093 0.001980 ** 
## dayTuesday                 -9.016e+00  1.122e+00  -8.036 9.56e-16 ***
## dayWednesday               -6.694e+00  1.123e+00  -5.961 2.52e-09 ***
## hour1                      -8.266e+00  2.031e+00  -4.069 4.72e-05 ***
## hour2                      -1.804e+01  2.072e+00  -8.708  < 2e-16 ***
## hour3                      -2.214e+01  2.079e+00 -10.652  < 2e-16 ***
## hour4                      -2.758e+01  2.095e+00 -13.165  < 2e-16 ***
## hour5                      -3.317e+01  2.085e+00 -15.905  < 2e-16 ***
## hour6                      -2.870e+01  2.022e+00 -14.191  < 2e-16 ***
## hour7                      -1.470e+01  1.982e+00  -7.417 1.22e-13 ***
## hour8                      -3.114e+00  1.964e+00  -1.586 0.112768    
## hour9                      -2.016e+00  1.972e+00  -1.022 0.306696    
## hour10                     -5.770e+00  1.985e+00  -2.907 0.003652 ** 
## hour11                     -6.181e+00  1.991e+00  -3.104 0.001910 ** 
## hour12                     -6.730e+00  1.981e+00  -3.398 0.000679 ***
## hour13                     -6.267e+00  1.975e+00  -3.173 0.001508 ** 
## hour14                     -2.102e+00  1.982e+00  -1.061 0.288886    
## hour15                      5.096e-01  1.967e+00   0.259 0.795599    
## hour16                      5.562e+00  1.977e+00   2.813 0.004906 ** 
## hour17                      1.286e+01  1.993e+00   6.452 1.12e-10 ***
## hour18                      2.007e+01  1.964e+00  10.215  < 2e-16 ***
## hour19                      2.005e+01  1.978e+00  10.140  < 2e-16 ***
## hour20                      1.669e+01  1.968e+00   8.483  < 2e-16 ***
## hour21                      1.518e+01  1.980e+00   7.667 1.80e-14 ***
## hour22                      1.147e+01  1.973e+00   5.814 6.15e-09 ***
## hour23                      9.114e+00  1.981e+00   4.600 4.23e-06 ***
## apparentTemperature        -6.542e-02  5.258e-02  -1.244 0.213445    
## summaryDrizzle              8.485e+00  7.697e+00   1.102 0.270266    
## summaryFoggy                1.403e+01  3.568e+00   3.932 8.46e-05 ***
## summaryLight Rain           9.542e-01  1.787e+00   0.534 0.593394    
## summaryLight Snow           1.288e+00  4.498e+00   0.286 0.774672    
## summaryMostly Cloudy        7.030e-01  1.054e+00   0.667 0.504831    
## summaryOvercast             5.444e-01  9.168e-01   0.594 0.552668    
## summaryPartly Cloudy        1.127e+00  7.945e-01   1.419 0.155951    
## summaryPossible Drizzle     1.110e+00  2.000e+00   0.555 0.579027    
## summaryPossible Flurries    5.094e+00  5.553e+00   0.917 0.358992    
## summaryPossible Light Rain  5.263e+00  2.892e+00   1.820 0.068818 .  
## summaryPossible Light Snow -4.356e-01  7.692e+00  -0.057 0.954841    
## summaryRain                -8.283e-01  2.483e+00  -0.334 0.738666    
## AREA                       -3.468e-07  1.021e-08 -33.978  < 2e-16 ***
## number_activity             3.925e+00  5.139e-02  76.372  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57.67 on 41369 degrees of freedom
## Multiple R-squared:  0.1891, Adjusted R-squared:  0.1883 
## F-statistic: 219.3 on 44 and 41369 DF,  p-value: < 2.2e-16
ols_distance1 <-  lm(total_trip_distance ~ day+hour+apparentTemperature+summary+AREA+number_activity, data = regression2)
summary(ols_distance1)
## 
## Call:
## lm(formula = total_trip_distance ~ day + hour + apparentTemperature + 
##     summary + AREA + number_activity, data = regression2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -302.46  -57.58  -26.29   17.17 1782.50 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 8.451e+01  3.907e+00  21.633  < 2e-16 ***
## dayMonday                  -2.241e+01  2.542e+00  -8.816  < 2e-16 ***
## daySaturday                 1.988e+01  2.512e+00   7.914 2.55e-15 ***
## daySunday                   5.493e+00  2.498e+00   2.199 0.027894 *  
## dayThursday                -8.901e+00  2.388e+00  -3.727 0.000194 ***
## dayTuesday                 -2.084e+01  2.363e+00  -8.820  < 2e-16 ***
## dayWednesday               -1.617e+01  2.365e+00  -6.837 8.20e-12 ***
## hour1                      -1.636e+01  4.278e+00  -3.824 0.000131 ***
## hour2                      -3.624e+01  4.364e+00  -8.304  < 2e-16 ***
## hour3                      -4.331e+01  4.378e+00  -9.892  < 2e-16 ***
## hour4                      -5.029e+01  4.412e+00 -11.397  < 2e-16 ***
## hour5                      -5.713e+01  4.391e+00 -13.009  < 2e-16 ***
## hour6                      -4.357e+01  4.259e+00 -10.230  < 2e-16 ***
## hour7                      -1.757e+01  4.173e+00  -4.209 2.57e-05 ***
## hour8                       1.687e+00  4.136e+00   0.408 0.683358    
## hour9                       3.459e+00  4.153e+00   0.833 0.404922    
## hour10                     -8.739e+00  4.180e+00  -2.090 0.036587 *  
## hour11                     -1.072e+01  4.194e+00  -2.557 0.010567 *  
## hour12                     -1.216e+01  4.171e+00  -2.916 0.003543 ** 
## hour13                     -1.186e+01  4.159e+00  -2.851 0.004360 ** 
## hour14                     -3.084e+00  4.174e+00  -0.739 0.460110    
## hour15                      2.700e-01  4.143e+00   0.065 0.948032    
## hour16                      7.887e+00  4.164e+00   1.894 0.058207 .  
## hour17                      1.682e+01  4.198e+00   4.007 6.17e-05 ***
## hour18                      2.589e+01  4.137e+00   6.259 3.91e-10 ***
## hour19                      2.616e+01  4.165e+00   6.280 3.42e-10 ***
## hour20                      2.286e+01  4.144e+00   5.517 3.46e-08 ***
## hour21                      2.449e+01  4.169e+00   5.873 4.31e-09 ***
## hour22                      2.158e+01  4.156e+00   5.193 2.08e-07 ***
## hour23                      2.003e+01  4.172e+00   4.802 1.58e-06 ***
## apparentTemperature         1.949e-02  1.107e-01   0.176 0.860295    
## summaryDrizzle              1.011e+01  1.621e+01   0.624 0.532637    
## summaryFoggy                2.540e+01  7.514e+00   3.380 0.000725 ***
## summaryLight Rain           1.438e+00  3.764e+00   0.382 0.702334    
## summaryLight Snow           4.989e+00  9.473e+00   0.527 0.598397    
## summaryMostly Cloudy        6.954e-01  2.220e+00   0.313 0.754088    
## summaryOvercast             1.159e-01  1.931e+00   0.060 0.952125    
## summaryPartly Cloudy        2.316e+00  1.673e+00   1.384 0.166239    
## summaryPossible Drizzle     5.636e-01  4.212e+00   0.134 0.893547    
## summaryPossible Flurries    1.179e+01  1.169e+01   1.009 0.313209    
## summaryPossible Light Rain  1.042e+01  6.091e+00   1.711 0.087044 .  
## summaryPossible Light Snow -8.055e-01  1.620e+01  -0.050 0.960341    
## summaryRain                -4.963e+00  5.228e+00  -0.949 0.342511    
## AREA                       -7.223e-07  2.149e-08 -33.606  < 2e-16 ***
## number_activity             8.566e+00  1.082e-01  79.154  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 121.5 on 41369 degrees of freedom
## Multiple R-squared:  0.1801, Adjusted R-squared:  0.1793 
## F-statistic: 206.6 on 44 and 41369 DF,  p-value: < 2.2e-16

2) Panel Data: Random Effects Regression

$y_{it}=\beta_{1}x_{it} + a_{i} + u_{it}$
pdata <- regression1
pdata$time = as.numeric(pdata$time)
pdata$ZIPCODE = as.numeric(pdata$ZIPCODE)

pdata <- pdata.frame(pdata, index=c("ZIPCODE", "time"))
## Warning in pdata.frame(pdata, index = c("ZIPCODE", "time")): duplicate couples (id-time) in resulting pdata.frame
##  to find out which, use e.g. table(index(your_pdataframe), useNA = "ifany")
#view(table(index(pdata), useNA = "ifany")) ## Remove duplicates
pdata <- pdata[pdata$time != 1456768800 & pdata$ZIPCODE != 14,]
pdata <- pdata[pdata$POPULATION > 0, ]
view(table(index(pdata), useNA = "ifany"))
re_passenger <- plm(total_passengers ~ summary+apparentTemperature+number_activity+day+hour, data = pdata, model= "random", index=c("ZIPCODE", "time"))
summary(re_passenger)
## Oneway (individual) effect Random Effect Model 
##    (Swamy-Arora's transformation)
## 
## Call:
## plm(formula = total_passengers ~ summary + apparentTemperature + 
##     number_activity + day + hour, data = pdata, model = "random", 
##     index = c("ZIPCODE", "time"))
## 
## Unbalanced Panel: n = 80, T = 1-743, N = 40575
## 
## Effects:
##                   var std.dev share
## idiosyncratic 1585.12   39.81 0.476
## individual    1744.68   41.77 0.524
## theta:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3100  0.9628  0.9648  0.9620  0.9650  0.9651 
## 
## Residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -214.45  -15.36   -2.47    0.27   11.26  679.59 
## 
## Coefficients:
##                               Estimate  Std. Error  z-value  Pr(>|z|)    
## (Intercept)                 20.7252535   6.2107584   3.3370 0.0008469 ***
## summaryDrizzle               8.5170708   5.3591308   1.5893 0.1120009    
## summaryFoggy                14.3300273   2.4879975   5.7597 8.428e-09 ***
## summaryLight Rain            1.2285535   1.2676362   0.9692 0.3324609    
## summaryLight Snow            2.5365655   3.1404665   0.8077 0.4192613    
## summaryMostly Cloudy         1.2280465   0.7349091   1.6710 0.0947181 .  
## summaryOvercast              0.3738778   0.6390918   0.5850 0.5585382    
## summaryPartly Cloudy         1.2029922   0.5540708   2.1712 0.0299169 *  
## summaryPossible Drizzle      1.3477698   1.3934907   0.9672 0.3334492    
## summaryPossible Flurries     5.7190933   3.8678703   1.4786 0.1392431    
## summaryPossible Light Rain   6.1635497   2.0155236   3.0580 0.0022279 ** 
## summaryPossible Light Snow   0.0040267   5.3560518   0.0008 0.9994001    
## summaryRain                 -0.9666104   1.7296746  -0.5588 0.5762713    
## apparentTemperature         -0.0525022   0.0366650  -1.4319 0.1521599    
## number_activity              2.9890804   0.8602671   3.4746 0.0005116 ***
## dayMonday                  -10.9027564   0.8416146 -12.9546 < 2.2e-16 ***
## daySaturday                  9.7296553   0.8314843  11.7015 < 2.2e-16 ***
## daySunday                    0.9395213   0.8271222   1.1359 0.2560019    
## dayThursday                 -4.0963715   0.7904723  -5.1822 2.193e-07 ***
## dayTuesday                 -10.2183666   0.7829472 -13.0512 < 2.2e-16 ***
## dayWednesday                -7.4585241   0.7827420  -9.5287 < 2.2e-16 ***
## hour1                       -9.3562197   1.4275756  -6.5539 5.605e-11 ***
## hour2                      -20.9162115   1.4455307 -14.4696 < 2.2e-16 ***
## hour3                      -25.8345536   1.4513402 -17.8005 < 2.2e-16 ***
## hour4                      -31.6431551   1.4626907 -21.6335 < 2.2e-16 ***
## hour5                      -37.7876954   1.4561885 -25.9497 < 2.2e-16 ***
## hour6                      -31.0390598   1.4121230 -21.9804 < 2.2e-16 ***
## hour7                      -15.1945579   1.3817725 -10.9964 < 2.2e-16 ***
## hour8                       -2.6661926   1.3694037  -1.9470 0.0515379 .  
## hour9                       -1.9770392   1.3755348  -1.4373 0.1506363    
## hour10                      -5.9414749   1.3847556  -4.2906 1.782e-05 ***
## hour11                      -6.9487518   1.3891331  -5.0022 5.667e-07 ***
## hour12                      -6.8868992   1.3809785  -4.9870 6.133e-07 ***
## hour13                      -5.9914853   1.3769275  -4.3513 1.353e-05 ***
## hour14                      -1.8305943   1.3822849  -1.3243 0.1853952    
## hour15                       0.8106615   1.3713395   0.5911 0.5544228    
## hour16                       5.5540364   1.3786451   4.0286 5.611e-05 ***
## hour17                      12.8996199   1.3901754   9.2791 < 2.2e-16 ***
## hour18                      19.8383764   1.3695387  14.4854 < 2.2e-16 ***
## hour19                      20.2586946   1.3789899  14.6910 < 2.2e-16 ***
## hour20                      16.8369249   1.3720747  12.2711 < 2.2e-16 ***
## hour21                      14.9913583   1.3799659  10.8636 < 2.2e-16 ***
## hour22                      12.0620431   1.3755012   8.7692 < 2.2e-16 ***
## hour23                       9.4580229   1.3811241   6.8481 7.486e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    76121000
## Residual Sum of Squares: 64237000
## R-Squared:      0.15616
## Adj. R-Squared: 0.15527
## Chisq: 7510.15 on 43 DF, p-value: < 2.22e-16

3) GLM (Generalised linear models) with a lasso selection technique

\(\sum^{n}_{i=1} \left(y_i-\beta_0-\sum^{p}_{j=1} \beta_j x_{ij}\right)^2 + \lambda \underbrace{\sum^{p}_{j=1} \mid{\beta_j}\mid}_{\ell^1-penalisation}\)

#Partition Data into Training and Test Data:

lasso <- data.matrix(regression1)
smp_size <- floor(0.75 * nrow(lasso))
set.seed(6789)
train_ind <- sample(seq_len(nrow(lasso)), size = smp_size)

length(lasso)
## [1] 662624
x <- data.matrix(lasso[, -10])
y <- data.matrix(lasso[, 10])

train.x <- data.matrix(x[train_ind, ])
test.x <- data.matrix(x[-train_ind, ])

train.y <- data.matrix(y[train_ind, ])
test.y <- data.matrix(y[-train_ind, ])

Next we create an empty list to store our resutls. This will create a space for all our different values of alpha which we will run 10 iterations for.Then we will run the regression for different alpha values.

list.of.fits <- list()
for (i in 0:10) {
  fit.name <- paste0("alpha", i/10)
  list.of.fits[[fit.name]] <-
    cv.glmnet(train.x, train.y, type.measure="mse", alpha=i/10, 
              family="gaussian")
}

Now we see which alpha (0, 0.1, … , 0.9, 1) does the best job atpredicting the values in the Testing dataset.

results <- data.frame()
for (i in 0:10) {
  fit.name <- paste0("alpha", i/10)
  
  ## Use each model to predict 'y' given the Testing dataset
  predicted <- 
    predict(list.of.fits[[fit.name]], 
            s=list.of.fits[[fit.name]]$lambda.1se, newx=test.x)
  
  ## Calculate the Mean Squared Error...
  mse <- mean((test.y - predicted)^2)
  
  ## Store the results
  temp <- data.frame(alpha=i/10, mse=mse, fit.name=fit.name)
  results <- rbind(results, temp)
}

View Results

results
##    alpha      mse fit.name
## 1    0.0 1233.375   alpha0
## 2    0.1 1075.381 alpha0.1
## 3    0.2 1075.798 alpha0.2
## 4    0.3 1060.052 alpha0.3
## 5    0.4 1075.950 alpha0.4
## 6    0.5 1060.970 alpha0.5
## 7    0.6 1060.612 alpha0.6
## 8    0.7 1057.216 alpha0.7
## 9    0.8 1081.844 alpha0.8
## 10   0.9 1053.314 alpha0.9
## 11   1.0 1069.142   alpha1