Regression on the number of day trips

#######################################################season 
Season <- function(data2) {
  d<- data2$Start.Date
  WS <- as.Date("21/12/2017", format = "%d/%m/%Y") # Winter 
  SE <- as.Date("20/3/2017",  format = "%d/%m/%Y") # Spring 
  SS <- as.Date("21/6/2017",  format = "%d/%m/%Y") # Summer 
  FE <- as.Date("22/9/2017",  format = "%d/%m/%Y") # Autumn
  
  ifelse (d >= WS | d < SE, "Winter",
          ifelse (d >= SE & d < SS, "Spring",
                  ifelse (d >= SS & d < FE, "Summer", "Autumn")))
}

bikedata$Start.Date<- as.Date(strptime(bikedata$Start.Date,
                                              format="%d/%m/%Y %H:%M",tz="UTC"))

bikedata$weekdays<- wday(bikedata$Start.Date,label = T)

bikedata$weekdays<- factor(bikedata$weekdays,ordered = FALSE)

bikedata$season<-as.factor(Season(data2=bikedata))

str(bikedata)

## Classes 'tbl_df', 'tbl' and 'data.frame':    8434530 obs. of  8 variables:
##  $ X1             : chr  "1:1" "1:2" "1:3" "1:4" ...
##  $ Duration       : int  660 660 600 660 600 420 420 480 540 360 ...
##  $ End.Date       : chr  "05/01/2017 07:15" "04/01/2017 16:16" "04/01/2017 07:27" "05/01/2017 16:13" ...
##  $ EndStation.Id  : int  819 217 819 217 819 246 433 162 358 427 ...
##  $ Start.Date     : Date, format: "2017-01-05" "2017-01-04" ...
##  $ StartStation.Id: int  217 819 217 273 217 433 594 246 246 66 ...
##  $ weekdays       : Factor w/ 7 levels "Sun","Mon","Tues",..: 5 4 4 5 6 5 5 5 4 6 ...
##  $ season         : Factor w/ 4 levels "Autumn","Spring",..: 4 4 4 4 4 4 4 4 4 4 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 6
##   .. ..$ X1             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Duration       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ End.Date       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ EndStation.Id  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Start.Date     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ StartStation.Id: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

head(bikedata,20)

## # A tibble: 20 x 8
##       X1 Duration         End.Date EndStation.Id Start.Date
##    <chr>    <int>            <chr>         <int>     <date>
##  1   1:1      660 05/01/2017 07:15           819 2017-01-05
##  2   1:2      660 04/01/2017 16:16           217 2017-01-04
##  3   1:3      600 04/01/2017 07:27           819 2017-01-04
##  4   1:4      660 05/01/2017 16:13           217 2017-01-05
##  5   1:5      600 06/01/2017 07:26           819 2017-01-06
##  6   1:6      420 05/01/2017 21:12           246 2017-01-05
##  7   1:7      420 05/01/2017 20:52           433 2017-01-05
##  8   1:8      480 05/01/2017 13:08           162 2017-01-05
##  9   1:9      540 04/01/2017 14:31           358 2017-01-04
## 10  1:10      360 06/01/2017 14:07           427 2017-01-06
## 11  1:11      360 06/01/2017 12:46            66 2017-01-06
## 12  1:12      180 04/01/2017 18:16           674 2017-01-04
## 13  1:13      300 06/01/2017 08:51           804 2017-01-06
## 14  1:14      300 05/01/2017 19:43           674 2017-01-05
## 15  1:15      240 06/01/2017 17:09           674 2017-01-06
## 16  1:16      180 06/01/2017 23:39           128 2017-01-06
## 17  1:17      300 06/01/2017 19:13           589 2017-01-06
## 18  1:18      180 05/01/2017 07:53           127 2017-01-05
## 19  1:19      300 04/01/2017 18:16           537 2017-01-04
## 20  1:20      900 04/01/2017 17:54           506 2017-01-04
## # ... with 3 more variables: StartStation.Id <int>, weekdays <fctr>,
## #   season <fctr>

sjp.frq(bikedata$weekdays)

sjp.frq(bikedata$season)

x<-plyr::count(bikedata,vars = c("Start.Date","weekdays","season"))
colnames(x)[4]<- c("ntrips")
bikedata<-join(x,bikedata)

## Joining by: Start.Date, weekdays, season

bikedata<- bikedata%>% dplyr::select("Start.Date","weekdays","season","ntrips")

head(bikedata,20)

##    Start.Date weekdays season ntrips
## 1  2017-01-04      Wed Winter  21955
## 2  2017-01-04      Wed Winter  21955
## 3  2017-01-04      Wed Winter  21955
## 4  2017-01-04      Wed Winter  21955
## 5  2017-01-04      Wed Winter  21955
## 6  2017-01-04      Wed Winter  21955
## 7  2017-01-04      Wed Winter  21955
## 8  2017-01-04      Wed Winter  21955
## 9  2017-01-04      Wed Winter  21955
## 10 2017-01-04      Wed Winter  21955
## 11 2017-01-04      Wed Winter  21955
## 12 2017-01-04      Wed Winter  21955
## 13 2017-01-04      Wed Winter  21955
## 14 2017-01-04      Wed Winter  21955
## 15 2017-01-04      Wed Winter  21955
## 16 2017-01-04      Wed Winter  21955
## 17 2017-01-04      Wed Winter  21955
## 18 2017-01-04      Wed Winter  21955
## 19 2017-01-04      Wed Winter  21955
## 20 2017-01-04      Wed Winter  21955

bikedata<- bikedata[!duplicated(bikedata),]

bikedata<- na.omit(bikedata)
str(bikedata)

## 'data.frame':    287 obs. of  4 variables:
##  $ Start.Date: Date, format: "2017-01-04" "2017-01-05" ...
##  $ weekdays  : Factor w/ 7 levels "Sun","Mon","Tues",..: 4 5 6 7 1 2 3 4 5 6 ...
##  $ season    : Factor w/ 4 levels "Autumn","Spring",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ ntrips    : int  21955 23350 18810 13722 13335 37595 27843 28002 17000 20764 ...

head(bikedata,20)

##        Start.Date weekdays season ntrips
## 1      2017-01-04      Wed Winter  21955
## 21956  2017-01-05    Thurs Winter  23350
## 45306  2017-01-06      Fri Winter  18810
## 64116  2017-01-07      Sat Winter  13722
## 77838  2017-01-08      Sun Winter  13335
## 91173  2017-01-09      Mon Winter  37595
## 128768 2017-01-10     Tues Winter  27843
## 156611 2017-01-11      Wed Winter  28002
## 184613 2017-01-12    Thurs Winter  17000
## 201613 2017-01-13      Fri Winter  20764
## 222377 2017-01-14      Sat Winter  13615
## 235992 2017-01-15      Sun Winter   6672
## 242664 2017-01-16      Mon Winter  19049
## 261713 2017-01-17     Tues Winter  27060
## 288773 2017-01-18      Wed Winter  26969
## 315742 2017-01-19    Thurs Winter  26591
## 342333 2017-01-20      Fri Winter  24900
## 367233 2017-01-21      Sat Winter  17311
## 384544 2017-01-22      Sun Winter  15415
## 399959 2017-01-23      Mon Winter  23089

sjp.frq(bikedata$weekdays)

sjp.frq(bikedata$season)

ggplot(data = bikedata,aes(x=weekdays,y=ntrips,fill=season))+geom_boxplot()+facet_wrap(~season)

set.seed(14)
ind<- sample(2,nrow(bikedata),replace = T,prob = c(0.8,0.2))
train<- bikedata[ind==1,]
test<- bikedata[ind==2,]
############################################model
str(train)

## 'data.frame':    237 obs. of  4 variables:
##  $ Start.Date: Date, format: "2017-01-04" "2017-01-05" ...
##  $ weekdays  : Factor w/ 7 levels "Sun","Mon","Tues",..: 4 5 7 2 4 5 6 1 2 5 ...
##  $ season    : Factor w/ 4 levels "Autumn","Spring",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ ntrips    : int  21955 23350 13722 37595 28002 17000 20764 6672 19049 26591 ...

model1<- glm(train,formula =  ntrips~ weekdays+season, family = "poisson")

model3<-glm.nb(train,formula =  ntrips~weekdays+season)

model32<-glm.nb(train,formula =  ntrips~weekdays+season+
    weekdays:season)


summary(model3)

## 
## Call:
## glm.nb(formula = ntrips ~ weekdays + season, data = train, init.theta = 20.84804221, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.3678  -0.4795   0.0414   0.5469   2.2805  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   10.02982    0.04787 209.534  < 2e-16 ***
## weekdaysMon    0.23582    0.05326   4.428 9.52e-06 ***
## weekdaysTues   0.27747    0.05628   4.930 8.22e-07 ***
## weekdaysWed    0.24130    0.05362   4.500 6.79e-06 ***
## weekdaysThurs  0.25857    0.05393   4.795 1.63e-06 ***
## weekdaysFri    0.23254    0.05582   4.166 3.11e-05 ***
## weekdaysSat    0.03805    0.05659   0.672    0.501    
## seasonSpring   0.15869    0.04037   3.931 8.47e-05 ***
## seasonSummer   0.20952    0.04370   4.794 1.63e-06 ***
## seasonWinter  -0.19230    0.04365  -4.406 1.05e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(20.848) family taken to be 1)
## 
##     Null deviance: 394.97  on 236  degrees of freedom
## Residual deviance: 238.93  on 227  degrees of freedom
## AIC: 4838.9
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  20.85 
##           Std. Err.:  1.90 
## 
##  2 x log-likelihood:  -4816.938

anova(model3)

## Warning in anova.negbin(model3): tests made without re-estimating 'theta'

## Analysis of Deviance Table
## 
## Model: Negative Binomial(20.848), link: log
## 
## Response: ntrips
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                       236     394.97              
## weekdays  6   42.975       230     352.00  1.18e-07 ***
## season    3  113.068       227     238.93 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

d<- 2 * (logLik(model1) - logLik(model3))
pchisq(d, df = 1, lower.tail = FALSE)

## 'log Lik.' 1 (df=10)

Evdence of nb is more suitable

pchisq(model32$deviance, df=model32$df.residual, lower.tail=FALSE)

## [1] 0.07895388

the P value of Chi squared fitness test is greater than 0.05, thus we failed to reject the H0, and conclude that the predicted value is similar to the observed value.

list(c(AIC(model3),"model3"),
c(AIC(model32),"model32"))

## [[1]]
## [1] "4838.93774211209" "model3"          
## 
## [[2]]
## [1] "4812.53894020949" "model32"

list(c(logLik(model3),"model3"),
c(logLik(model32),"model32"))

## [[1]]
## [1] "-2408.46887105604" "model3"           
## 
## [[2]]
## [1] "-2377.26947010474" "model32"

list(c(model32$deviance,"model32",
c(model3$deviance,"model3")))

## [[1]]
## [1] "238.495461191019" "model32"          "238.927947040877"
## [4] "model3"

AIC balances between the complexity of the model and the goodnesss of fit, since increasing the number of parameters in the model will mostly increase fit, but often it may not worth duo to complexity. Follow the equation of AIC, we want the lower value among all the models. thus model32 is lying in our favour. Also model32 has higher lower deviance and higher loglikehood.

sjp.glm(model32)

## Waiting for profiling to be done...

summary(model32)

## 
## Call:
## glm.nb(formula = ntrips ~ weekdays + season + weekdays:season, 
##     data = train, init.theta = 27.03471804, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.8185  -0.5816  -0.0857   0.4662   2.7347  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 9.94151    0.06804 146.109  < 2e-16 ***
## weekdaysMon                 0.33466    0.09622   3.478 0.000505 ***
## weekdaysTues                0.43358    0.10970   3.952 7.74e-05 ***
## weekdaysWed                 0.42166    0.09959   4.234 2.30e-05 ***
## weekdaysThurs               0.41298    0.09959   4.147 3.37e-05 ***
## weekdaysFri                 0.35734    0.10970   3.257 0.001124 ** 
## weekdaysSat                -0.04431    0.09960  -0.445 0.656428    
## seasonSpring                0.41854    0.09128   4.585 4.53e-06 ***
## seasonSummer                0.46127    0.10392   4.439 9.05e-06 ***
## seasonWinter               -0.57696    0.10397  -5.549 2.87e-08 ***
## weekdaysMon:seasonSpring   -0.38186    0.12573  -3.037 0.002388 ** 
## weekdaysTues:seasonSpring  -0.37580    0.13821  -2.719 0.006546 ** 
## weekdaysWed:seasonSpring   -0.40722    0.12925  -3.151 0.001629 ** 
## weekdaysThurs:seasonSpring -0.36237    0.12925  -2.804 0.005053 ** 
## weekdaysFri:seasonSpring   -0.31199    0.13719  -2.274 0.022957 *  
## weekdaysSat:seasonSpring    0.03126    0.13163   0.237 0.812271    
## weekdaysMon:seasonSummer   -0.24260    0.13979  -1.736 0.082650 .  
## weekdaysTues:seasonSummer  -0.36110    0.14939  -2.417 0.015640 *  
## weekdaysWed:seasonSummer   -0.38546    0.14068  -2.740 0.006144 ** 
## weekdaysThurs:seasonSummer -0.33771    0.14393  -2.346 0.018959 *  
## weekdaysFri:seasonSummer   -0.29451    0.15327  -1.921 0.054672 .  
## weekdaysSat:seasonSummer   -0.12685    0.14920  -0.850 0.395220    
## weekdaysMon:seasonWinter    0.49029    0.13982   3.506 0.000454 ***
## weekdaysTues:seasonWinter   0.40197    0.15615   2.574 0.010047 *  
## weekdaysWed:seasonWinter    0.33388    0.14217   2.348 0.018850 *  
## weekdaysThurs:seasonWinter  0.32937    0.14071   2.341 0.019250 *  
## weekdaysFri:seasonWinter    0.35370    0.15114   2.340 0.019269 *  
## weekdaysSat:seasonWinter    0.56767    0.14626   3.881 0.000104 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(27.0347) family taken to be 1)
## 
##     Null deviance: 512.05  on 236  degrees of freedom
## Residual deviance: 238.50  on 209  degrees of freedom
## AIC: 4812.5
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  27.03 
##           Std. Err.:  2.47 
## 
##  2 x log-likelihood:  -4754.539

From the summary and glm output, most of weekdays have higher impact than Sunday while Winter remains to be the season of least number of trips. With all other cofficients remains constant, Monday will have exp(0.33) more trips than Sunday.