glimpse(Bikeshare)

## Rows: 8,645
## Columns: 15
## $ season     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ mnth       <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan,~
## $ day        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ hr         <fct> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1~
## $ holiday    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ weekday    <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,~
## $ workingday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ weathersit <fct> clear, clear, clear, clear, clear, cloudy/misty, clear, cle~
## $ temp       <dbl> 0.24, 0.22, 0.22, 0.24, 0.24, 0.24, 0.22, 0.20, 0.24, 0.32,~
## $ atemp      <dbl> 0.2879, 0.2727, 0.2727, 0.2879, 0.2879, 0.2576, 0.2727, 0.2~
## $ hum        <dbl> 0.81, 0.80, 0.80, 0.75, 0.75, 0.75, 0.80, 0.86, 0.75, 0.76,~
## $ windspeed  <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0896, 0.0000, 0.0~
## $ casual     <dbl> 3, 8, 5, 3, 0, 0, 2, 1, 1, 8, 12, 26, 29, 47, 35, 40, 41, 1~
## $ registered <dbl> 13, 32, 27, 10, 1, 1, 0, 2, 7, 6, 24, 30, 55, 47, 71, 70, 5~
## $ bikers     <dbl> 16, 40, 32, 13, 1, 1, 2, 3, 8, 14, 36, 56, 84, 94, 106, 110~

Plotting

ggplot(Bikeshare, aes(y=bikers,
                      x= temp))+
  geom_point(alpha=.1)+
facet_wrap(~weathersit)

Not a good candidate for linear regression since no constant variation, as temp increase, spread increase.

Check if same issue according to working days!

ggplot(Bikeshare, aes(y=bikers,
                      x= temp))+
  geom_point(alpha=.1)+  
  facet_wrap(~workingday)

Also not good for linear regression.

Creating model

model <- glm(bikers~temp+workingday+weathersit,
             data= Bikeshare,
             family="poisson")

summary(model)

## 
## Call:
## glm(formula = bikers ~ temp + workingday + weathersit, family = "poisson", 
##     data = Bikeshare)
## 
## Coefficients:
##                            Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)                3.885010   0.003231 1202.490  < 2e-16 ***
## temp                       2.129054   0.004789  444.587  < 2e-16 ***
## workingday                -0.008888   0.001943   -4.574 4.78e-06 ***
## weathersitcloudy/misty    -0.042219   0.002132  -19.805  < 2e-16 ***
## weathersitlight rain/snow -0.432090   0.004023 -107.412  < 2e-16 ***
## weathersitheavy rain/snow -0.760995   0.166681   -4.566 4.98e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1052921  on 8644  degrees of freedom
## Residual deviance:  816754  on 8639  degrees of freedom
## AIC: 869804
## 
## Number of Fisher Scoring iterations: 5

According to the summary, all predictors are statistically significant with p-values <<

Smoothing the model

ggplot(Bikeshare, aes(y=bikers,
                      x= temp,
                      color=weathersit))+
  geom_point(alpha=.05)+ 
  geom_smooth(method="glm",
              se=FALSE,
              method.args=list(family="poisson"))+
  facet_wrap(~workingday)+
  scale_color_brewer(palette="Dark2")

## `geom_smooth()` using formula = 'y ~ x'

We observe our data is missing by computing hours of the year, (365 days by 24 hrs)= 8760 hrs but we have only 8654 rows, so we have 115 missing values

Dealing with missing values

sum(Bikeshare$bikers==0)

## [1] 0

None, they deal with zero biking hours as non imputed, So we will add missing rows to our original Bikeshare data

bikes_new <- expand_grid(day=1:365, 
                         hr = as.factor(0:23)) %>%   
  left_join(Bikeshare,
            by = join_by(day,
                         hr))

head(bikes_new)

## # A tibble: 6 x 15
##     day hr    season mnth  holiday weekday workingday weathersit    temp atemp
##   <dbl> <fct>  <dbl> <fct>   <dbl>   <dbl>      <dbl> <fct>        <dbl> <dbl>
## 1     1 0          1 Jan         0       6          0 clear         0.24 0.288
## 2     1 1          1 Jan         0       6          0 clear         0.22 0.273
## 3     1 2          1 Jan         0       6          0 clear         0.22 0.273
## 4     1 3          1 Jan         0       6          0 clear         0.24 0.288
## 5     1 4          1 Jan         0       6          0 clear         0.24 0.288
## 6     1 5          1 Jan         0       6          0 cloudy/misty  0.24 0.258
## # i 5 more variables: hum <dbl>, windspeed <dbl>, casual <dbl>,
## #   registered <dbl>, bikers <dbl>

We will impute missing values as appropriate for each case

bikes_imputed <- bikes_new %>% 
  mutate(bikers=replace_na(bikers,0))

bikes_imputed<-bikes_imputed %>% 
  mutate(temp=ifelse(is.na(temp),
                      (lead(temp)+lag(temp))/2,
                     temp),
         temp=ifelse(is.na(temp),
                     lead(temp),
                     temp),
         temp=ifelse(is.na(temp),
                     lag(temp),
                     temp),
         weathersit=case_when(!is.na(weathersit)~weathersit,
                              TRUE~lag(weathersit)),
         weathersit=case_when(!is.na(weathersit)~weathersit,
                              TRUE~lead(weathersit)),
         workingday=mean(workingday, na.rm=TRUE),
                         .by=day)

head(bikes_imputed)

## # A tibble: 6 x 15
##     day hr    season mnth  holiday weekday workingday weathersit    temp atemp
##   <dbl> <fct>  <dbl> <fct>   <dbl>   <dbl>      <dbl> <fct>        <dbl> <dbl>
## 1     1 0          1 Jan         0       6          0 clear         0.24 0.288
## 2     1 1          1 Jan         0       6          0 clear         0.22 0.273
## 3     1 2          1 Jan         0       6          0 clear         0.22 0.273
## 4     1 3          1 Jan         0       6          0 clear         0.24 0.288
## 5     1 4          1 Jan         0       6          0 clear         0.24 0.288
## 6     1 5          1 Jan         0       6          0 cloudy/misty  0.24 0.258
## # i 5 more variables: hum <dbl>, windspeed <dbl>, casual <dbl>,
## #   registered <dbl>, bikers <dbl>

Compare poisson models

model_old <- glm(bikers~temp+workingday+weathersit,
                 data=Bikeshare,
                 family="poisson")
model_new <- glm(bikers~temp+workingday+weathersit,
                 data=bikes_imputed,
                 family="poisson")

Summary of each model

tidy(model_old)

## # A tibble: 6 x 5
##   term                      estimate std.error statistic  p.value
##   <chr>                        <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                3.89      0.00323   1202.   0       
## 2 temp                       2.13      0.00479    445.   0       
## 3 workingday                -0.00889   0.00194     -4.57 4.78e- 6
## 4 weathersitcloudy/misty    -0.0422    0.00213    -19.8  2.68e-87
## 5 weathersitlight rain/snow -0.432     0.00402   -107.   0       
## 6 weathersitheavy rain/snow -0.761     0.167       -4.57 4.98e- 6

tidy(model_new)

## # A tibble: 6 x 5
##   term                      estimate std.error statistic  p.value
##   <chr>                        <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                 3.86     0.00323   1197.   0       
## 2 temp                        2.16     0.00478    452.   0       
## 3 workingday                 -0.0106   0.00194     -5.48 4.21e- 8
## 4 weathersitcloudy/misty     -0.0419   0.00213    -19.6  6.52e-86
## 5 weathersitlight rain/snow  -0.442    0.00402   -110.   0       
## 6 weathersitheavy rain/snow  -0.745    0.167       -4.47 7.74e- 6

One test value

new_data <- data.frame(temp=.5,
                       workingday=0,
                       weathersit="clear")

predict(model_old,
        new_data,
        type="response")

##        1 
## 141.1096

predict(model_new,
        new_data,
        type="response")

##        1 
## 140.4326

Bikeshare Project