#######################################################season
Season <- function(data2) {
d<- data2$Start.Date
WS <- as.Date("21/12/2017", format = "%d/%m/%Y") # Winter
SE <- as.Date("20/3/2017", format = "%d/%m/%Y") # Spring
SS <- as.Date("21/6/2017", format = "%d/%m/%Y") # Summer
FE <- as.Date("22/9/2017", format = "%d/%m/%Y") # Autumn
ifelse (d >= WS | d < SE, "Winter",
ifelse (d >= SE & d < SS, "Spring",
ifelse (d >= SS & d < FE, "Summer", "Autumn")))
}
bikedata$Start.Date<- as.Date(strptime(bikedata$Start.Date,
format="%d/%m/%Y %H:%M",tz="UTC"))
bikedata$weekdays<- wday(bikedata$Start.Date,label = T)
bikedata$weekdays<- factor(bikedata$weekdays,ordered = FALSE)
bikedata$season<-as.factor(Season(data2=bikedata))
str(bikedata)
## Classes 'tbl_df', 'tbl' and 'data.frame': 8434530 obs. of 8 variables:
## $ X1 : chr "1:1" "1:2" "1:3" "1:4" ...
## $ Duration : int 660 660 600 660 600 420 420 480 540 360 ...
## $ End.Date : chr "05/01/2017 07:15" "04/01/2017 16:16" "04/01/2017 07:27" "05/01/2017 16:13" ...
## $ EndStation.Id : int 819 217 819 217 819 246 433 162 358 427 ...
## $ Start.Date : Date, format: "2017-01-05" "2017-01-04" ...
## $ StartStation.Id: int 217 819 217 273 217 433 594 246 246 66 ...
## $ weekdays : Factor w/ 7 levels "Sun","Mon","Tues",..: 5 4 4 5 6 5 5 5 4 6 ...
## $ season : Factor w/ 4 levels "Autumn","Spring",..: 4 4 4 4 4 4 4 4 4 4 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 6
## .. ..$ X1 : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ Duration : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ End.Date : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ EndStation.Id : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ Start.Date : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ StartStation.Id: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
head(bikedata,20)
## # A tibble: 20 x 8
## X1 Duration End.Date EndStation.Id Start.Date
## <chr> <int> <chr> <int> <date>
## 1 1:1 660 05/01/2017 07:15 819 2017-01-05
## 2 1:2 660 04/01/2017 16:16 217 2017-01-04
## 3 1:3 600 04/01/2017 07:27 819 2017-01-04
## 4 1:4 660 05/01/2017 16:13 217 2017-01-05
## 5 1:5 600 06/01/2017 07:26 819 2017-01-06
## 6 1:6 420 05/01/2017 21:12 246 2017-01-05
## 7 1:7 420 05/01/2017 20:52 433 2017-01-05
## 8 1:8 480 05/01/2017 13:08 162 2017-01-05
## 9 1:9 540 04/01/2017 14:31 358 2017-01-04
## 10 1:10 360 06/01/2017 14:07 427 2017-01-06
## 11 1:11 360 06/01/2017 12:46 66 2017-01-06
## 12 1:12 180 04/01/2017 18:16 674 2017-01-04
## 13 1:13 300 06/01/2017 08:51 804 2017-01-06
## 14 1:14 300 05/01/2017 19:43 674 2017-01-05
## 15 1:15 240 06/01/2017 17:09 674 2017-01-06
## 16 1:16 180 06/01/2017 23:39 128 2017-01-06
## 17 1:17 300 06/01/2017 19:13 589 2017-01-06
## 18 1:18 180 05/01/2017 07:53 127 2017-01-05
## 19 1:19 300 04/01/2017 18:16 537 2017-01-04
## 20 1:20 900 04/01/2017 17:54 506 2017-01-04
## # ... with 3 more variables: StartStation.Id <int>, weekdays <fctr>,
## # season <fctr>
sjp.frq(bikedata$weekdays)
sjp.frq(bikedata$season)
x<-plyr::count(bikedata,vars = c("Start.Date","weekdays","season"))
colnames(x)[4]<- c("ntrips")
bikedata<-join(x,bikedata)
## Joining by: Start.Date, weekdays, season
bikedata<- bikedata%>% dplyr::select("Start.Date","weekdays","season","ntrips")
head(bikedata,20)
## Start.Date weekdays season ntrips
## 1 2017-01-04 Wed Winter 21955
## 2 2017-01-04 Wed Winter 21955
## 3 2017-01-04 Wed Winter 21955
## 4 2017-01-04 Wed Winter 21955
## 5 2017-01-04 Wed Winter 21955
## 6 2017-01-04 Wed Winter 21955
## 7 2017-01-04 Wed Winter 21955
## 8 2017-01-04 Wed Winter 21955
## 9 2017-01-04 Wed Winter 21955
## 10 2017-01-04 Wed Winter 21955
## 11 2017-01-04 Wed Winter 21955
## 12 2017-01-04 Wed Winter 21955
## 13 2017-01-04 Wed Winter 21955
## 14 2017-01-04 Wed Winter 21955
## 15 2017-01-04 Wed Winter 21955
## 16 2017-01-04 Wed Winter 21955
## 17 2017-01-04 Wed Winter 21955
## 18 2017-01-04 Wed Winter 21955
## 19 2017-01-04 Wed Winter 21955
## 20 2017-01-04 Wed Winter 21955
bikedata<- bikedata[!duplicated(bikedata),]
bikedata<- na.omit(bikedata)
str(bikedata)
## 'data.frame': 287 obs. of 4 variables:
## $ Start.Date: Date, format: "2017-01-04" "2017-01-05" ...
## $ weekdays : Factor w/ 7 levels "Sun","Mon","Tues",..: 4 5 6 7 1 2 3 4 5 6 ...
## $ season : Factor w/ 4 levels "Autumn","Spring",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ ntrips : int 21955 23350 18810 13722 13335 37595 27843 28002 17000 20764 ...
head(bikedata,20)
## Start.Date weekdays season ntrips
## 1 2017-01-04 Wed Winter 21955
## 21956 2017-01-05 Thurs Winter 23350
## 45306 2017-01-06 Fri Winter 18810
## 64116 2017-01-07 Sat Winter 13722
## 77838 2017-01-08 Sun Winter 13335
## 91173 2017-01-09 Mon Winter 37595
## 128768 2017-01-10 Tues Winter 27843
## 156611 2017-01-11 Wed Winter 28002
## 184613 2017-01-12 Thurs Winter 17000
## 201613 2017-01-13 Fri Winter 20764
## 222377 2017-01-14 Sat Winter 13615
## 235992 2017-01-15 Sun Winter 6672
## 242664 2017-01-16 Mon Winter 19049
## 261713 2017-01-17 Tues Winter 27060
## 288773 2017-01-18 Wed Winter 26969
## 315742 2017-01-19 Thurs Winter 26591
## 342333 2017-01-20 Fri Winter 24900
## 367233 2017-01-21 Sat Winter 17311
## 384544 2017-01-22 Sun Winter 15415
## 399959 2017-01-23 Mon Winter 23089
sjp.frq(bikedata$weekdays)
sjp.frq(bikedata$season)
ggplot(data = bikedata,aes(x=weekdays,y=ntrips,fill=season))+geom_boxplot()+facet_wrap(~season)
set.seed(14)
ind<- sample(2,nrow(bikedata),replace = T,prob = c(0.8,0.2))
train<- bikedata[ind==1,]
test<- bikedata[ind==2,]
############################################model
str(train)
## 'data.frame': 237 obs. of 4 variables:
## $ Start.Date: Date, format: "2017-01-04" "2017-01-05" ...
## $ weekdays : Factor w/ 7 levels "Sun","Mon","Tues",..: 4 5 7 2 4 5 6 1 2 5 ...
## $ season : Factor w/ 4 levels "Autumn","Spring",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ ntrips : int 21955 23350 13722 37595 28002 17000 20764 6672 19049 26591 ...
model1<- glm(train,formula = ntrips~ weekdays+season, family = "poisson")
model3<-glm.nb(train,formula = ntrips~weekdays+season)
model32<-glm.nb(train,formula = ntrips~weekdays+season+
weekdays:season)
summary(model3)
##
## Call:
## glm.nb(formula = ntrips ~ weekdays + season, data = train, init.theta = 20.84804221,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.3678 -0.4795 0.0414 0.5469 2.2805
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 10.02982 0.04787 209.534 < 2e-16 ***
## weekdaysMon 0.23582 0.05326 4.428 9.52e-06 ***
## weekdaysTues 0.27747 0.05628 4.930 8.22e-07 ***
## weekdaysWed 0.24130 0.05362 4.500 6.79e-06 ***
## weekdaysThurs 0.25857 0.05393 4.795 1.63e-06 ***
## weekdaysFri 0.23254 0.05582 4.166 3.11e-05 ***
## weekdaysSat 0.03805 0.05659 0.672 0.501
## seasonSpring 0.15869 0.04037 3.931 8.47e-05 ***
## seasonSummer 0.20952 0.04370 4.794 1.63e-06 ***
## seasonWinter -0.19230 0.04365 -4.406 1.05e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(20.848) family taken to be 1)
##
## Null deviance: 394.97 on 236 degrees of freedom
## Residual deviance: 238.93 on 227 degrees of freedom
## AIC: 4838.9
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 20.85
## Std. Err.: 1.90
##
## 2 x log-likelihood: -4816.938
anova(model3)
## Warning in anova.negbin(model3): tests made without re-estimating 'theta'
## Analysis of Deviance Table
##
## Model: Negative Binomial(20.848), link: log
##
## Response: ntrips
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 236 394.97
## weekdays 6 42.975 230 352.00 1.18e-07 ***
## season 3 113.068 227 238.93 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
d<- 2 * (logLik(model1) - logLik(model3))
pchisq(d, df = 1, lower.tail = FALSE)
## 'log Lik.' 1 (df=10)
Evdence of nb is more suitable
pchisq(model32$deviance, df=model32$df.residual, lower.tail=FALSE)
## [1] 0.07895388
the P value of Chi squared fitness test is greater than 0.05, thus we failed to reject the H0, and conclude that the predicted value is similar to the observed value.
list(c(AIC(model3),"model3"),
c(AIC(model32),"model32"))
## [[1]]
## [1] "4838.93774211209" "model3"
##
## [[2]]
## [1] "4812.53894020949" "model32"
list(c(logLik(model3),"model3"),
c(logLik(model32),"model32"))
## [[1]]
## [1] "-2408.46887105604" "model3"
##
## [[2]]
## [1] "-2377.26947010474" "model32"
list(c(model32$deviance,"model32",
c(model3$deviance,"model3")))
## [[1]]
## [1] "238.495461191019" "model32" "238.927947040877"
## [4] "model3"
AIC balances between the complexity of the model and the goodnesss of fit, since increasing the number of parameters in the model will mostly increase fit, but often it may not worth duo to complexity. Follow the equation of AIC, we want the lower value among all the models. thus model32 is lying in our favour. Also model32 has higher lower deviance and higher loglikehood.
sjp.glm(model32)
## Waiting for profiling to be done...
summary(model32)
##
## Call:
## glm.nb(formula = ntrips ~ weekdays + season + weekdays:season,
## data = train, init.theta = 27.03471804, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.8185 -0.5816 -0.0857 0.4662 2.7347
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 9.94151 0.06804 146.109 < 2e-16 ***
## weekdaysMon 0.33466 0.09622 3.478 0.000505 ***
## weekdaysTues 0.43358 0.10970 3.952 7.74e-05 ***
## weekdaysWed 0.42166 0.09959 4.234 2.30e-05 ***
## weekdaysThurs 0.41298 0.09959 4.147 3.37e-05 ***
## weekdaysFri 0.35734 0.10970 3.257 0.001124 **
## weekdaysSat -0.04431 0.09960 -0.445 0.656428
## seasonSpring 0.41854 0.09128 4.585 4.53e-06 ***
## seasonSummer 0.46127 0.10392 4.439 9.05e-06 ***
## seasonWinter -0.57696 0.10397 -5.549 2.87e-08 ***
## weekdaysMon:seasonSpring -0.38186 0.12573 -3.037 0.002388 **
## weekdaysTues:seasonSpring -0.37580 0.13821 -2.719 0.006546 **
## weekdaysWed:seasonSpring -0.40722 0.12925 -3.151 0.001629 **
## weekdaysThurs:seasonSpring -0.36237 0.12925 -2.804 0.005053 **
## weekdaysFri:seasonSpring -0.31199 0.13719 -2.274 0.022957 *
## weekdaysSat:seasonSpring 0.03126 0.13163 0.237 0.812271
## weekdaysMon:seasonSummer -0.24260 0.13979 -1.736 0.082650 .
## weekdaysTues:seasonSummer -0.36110 0.14939 -2.417 0.015640 *
## weekdaysWed:seasonSummer -0.38546 0.14068 -2.740 0.006144 **
## weekdaysThurs:seasonSummer -0.33771 0.14393 -2.346 0.018959 *
## weekdaysFri:seasonSummer -0.29451 0.15327 -1.921 0.054672 .
## weekdaysSat:seasonSummer -0.12685 0.14920 -0.850 0.395220
## weekdaysMon:seasonWinter 0.49029 0.13982 3.506 0.000454 ***
## weekdaysTues:seasonWinter 0.40197 0.15615 2.574 0.010047 *
## weekdaysWed:seasonWinter 0.33388 0.14217 2.348 0.018850 *
## weekdaysThurs:seasonWinter 0.32937 0.14071 2.341 0.019250 *
## weekdaysFri:seasonWinter 0.35370 0.15114 2.340 0.019269 *
## weekdaysSat:seasonWinter 0.56767 0.14626 3.881 0.000104 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(27.0347) family taken to be 1)
##
## Null deviance: 512.05 on 236 degrees of freedom
## Residual deviance: 238.50 on 209 degrees of freedom
## AIC: 4812.5
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 27.03
## Std. Err.: 2.47
##
## 2 x log-likelihood: -4754.539
From the summary and glm output, most of weekdays have higher impact than Sunday while Winter remains to be the season of least number of trips. With all other cofficients remains constant, Monday will have exp(0.33) more trips than Sunday.