## Rows: 8,645
## Columns: 15
## $ season <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ mnth <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan,~
## $ day <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ hr <fct> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1~
## $ holiday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ weekday <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,~
## $ workingday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ weathersit <fct> clear, clear, clear, clear, clear, cloudy/misty, clear, cle~
## $ temp <dbl> 0.24, 0.22, 0.22, 0.24, 0.24, 0.24, 0.22, 0.20, 0.24, 0.32,~
## $ atemp <dbl> 0.2879, 0.2727, 0.2727, 0.2879, 0.2879, 0.2576, 0.2727, 0.2~
## $ hum <dbl> 0.81, 0.80, 0.80, 0.75, 0.75, 0.75, 0.80, 0.86, 0.75, 0.76,~
## $ windspeed <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0896, 0.0000, 0.0~
## $ casual <dbl> 3, 8, 5, 3, 0, 0, 2, 1, 1, 8, 12, 26, 29, 47, 35, 40, 41, 1~
## $ registered <dbl> 13, 32, 27, 10, 1, 1, 0, 2, 7, 6, 24, 30, 55, 47, 71, 70, 5~
## $ bikers <dbl> 16, 40, 32, 13, 1, 1, 2, 3, 8, 14, 36, 56, 84, 94, 106, 110~
Plotting
Not a good candidate for linear regression: the variance is not constant — as temp increases, the spread of bikers increases as well.
Check whether the same issue appears when the data are split by working day.
Also not good for linear regression.
Creating model
##
## Call:
## glm(formula = bikers ~ temp + workingday + weathersit, family = "poisson",
## data = Bikeshare)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.885010 0.003231 1202.490 < 2e-16 ***
## temp 2.129054 0.004789 444.587 < 2e-16 ***
## workingday -0.008888 0.001943 -4.574 4.78e-06 ***
## weathersitcloudy/misty -0.042219 0.002132 -19.805 < 2e-16 ***
## weathersitlight rain/snow -0.432090 0.004023 -107.412 < 2e-16 ***
## weathersitheavy rain/snow -0.760995 0.166681 -4.566 4.98e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1052921 on 8644 degrees of freedom
## Residual deviance: 816754 on 8639 degrees of freedom
## AIC: 869804
##
## Number of Fisher Scoring iterations: 5
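According to the summary, all predictors are statistically significant, with p-values far below 0.05. Note, however, that the residual deviance (816,754 on 8,639 degrees of freedom) is much larger than its degrees of freedom, which points to overdispersion, so the reported standard errors and p-values are likely too optimistic.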
Smoothing the model
# Scatter of bikers against temp, one panel per workingday value, with a
# Poisson GLM trend line (no confidence band) for each weathersit level.
ggplot(
  data = Bikeshare,
  mapping = aes(x = temp, y = bikers, colour = weathersit)
) +
  geom_point(alpha = 0.05) +
  geom_smooth(
    method = "glm",
    method.args = list(family = "poisson"),
    se = FALSE
  ) +
  scale_colour_brewer(palette = "Dark2") +
  facet_wrap(vars(workingday))
## `geom_smooth()` using formula = 'y ~ x'
Some hours are missing from the data: a full year has 365 days × 24 hours = 8,760 hours, but we have only 8,645 rows, so 115 hourly observations are missing.
Dealing with missing values
## [1] 0
None. Hours with zero bikers appear to have been left out of the data rather than recorded, so we will add the missing rows to our original Bikeshare data.
# Build the complete day-by-hour grid (365 days x 24 hours) and attach the
# observed Bikeshare rows to it; day/hour combinations with no match in
# Bikeshare keep NA in every joined column.
bikes_new <- left_join(
  x = expand_grid(
    day = 1:365,
    hr = as.factor(0:23)
  ),
  y = Bikeshare,
  by = join_by(day, hr)
)
## # A tibble: 6 x 15
## day hr season mnth holiday weekday workingday weathersit temp atemp
## <dbl> <fct> <dbl> <fct> <dbl> <dbl> <dbl> <fct> <dbl> <dbl>
## 1 1 0 1 Jan 0 6 0 clear 0.24 0.288
## 2 1 1 1 Jan 0 6 0 clear 0.22 0.273
## 3 1 2 1 Jan 0 6 0 clear 0.22 0.273
## 4 1 3 1 Jan 0 6 0 clear 0.24 0.288
## 5 1 4 1 Jan 0 6 0 clear 0.24 0.288
## 6 1 5 1 Jan 0 6 0 cloudy/misty 0.24 0.258
## # i 5 more variables: hum <dbl>, windspeed <dbl>, casual <dbl>,
## # registered <dbl>, bikers <dbl>
We will impute missing values as appropriate for each case
# Fill the gaps introduced by the day-by-hour grid.
#   bikers     : a missing hour is counted as zero riders.
#   temp       : mean of the neighbouring hours, else the next hour, else the
#                previous hour.
#   weathersit : previous hour's value, else the next hour's.
#   workingday : the day's mean over its non-missing hours.
# lead()/lag() are evaluated within each day (.by = day) and reach only one
# row, so a longer run of missing hours can still contain NA afterwards.
bikes_imputed <- bikes_new %>%
  mutate(bikers = replace_na(bikers, 0)) %>%
  mutate(
    temp = coalesce(temp, (lead(temp) + lag(temp)) / 2),
    temp = coalesce(temp, lead(temp)),
    temp = coalesce(temp, lag(temp)),
    weathersit = coalesce(weathersit, lag(weathersit)),
    weathersit = coalesce(weathersit, lead(weathersit)),
    workingday = mean(workingday, na.rm = TRUE),
    .by = day
  )
head(bikes_imputed)
## # A tibble: 6 x 15
## day hr season mnth holiday weekday workingday weathersit temp atemp
## <dbl> <fct> <dbl> <fct> <dbl> <dbl> <dbl> <fct> <dbl> <dbl>
## 1 1 0 1 Jan 0 6 0 clear 0.24 0.288
## 2 1 1 1 Jan 0 6 0 clear 0.22 0.273
## 3 1 2 1 Jan 0 6 0 clear 0.22 0.273
## 4 1 3 1 Jan 0 6 0 clear 0.24 0.288
## 5 1 4 1 Jan 0 6 0 clear 0.24 0.288
## 6 1 5 1 Jan 0 6 0 cloudy/misty 0.24 0.258
## # i 5 more variables: hum <dbl>, windspeed <dbl>, casual <dbl>,
## # registered <dbl>, bikers <dbl>
Compare poisson models
Summary of each model
## # A tibble: 6 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.89 0.00323 1202. 0
## 2 temp 2.13 0.00479 445. 0
## 3 workingday -0.00889 0.00194 -4.57 4.78e- 6
## 4 weathersitcloudy/misty -0.0422 0.00213 -19.8 2.68e-87
## 5 weathersitlight rain/snow -0.432 0.00402 -107. 0
## 6 weathersitheavy rain/snow -0.761 0.167 -4.57 4.98e- 6
## # A tibble: 6 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.86 0.00323 1197. 0
## 2 temp 2.16 0.00478 452. 0
## 3 workingday -0.0106 0.00194 -5.48 4.21e- 8
## 4 weathersitcloudy/misty -0.0419 0.00213 -19.6 6.52e-86
## 5 weathersitlight rain/snow -0.442 0.00402 -110. 0
## 6 weathersitheavy rain/snow -0.745 0.167 -4.47 7.74e- 6