##########################
library(tidyverse)
library(lubridate)
library(pander) # for prettier tables
library(scales) # for making prettier axes in plots
library(plotly) 
library(devtools)
library(maps)
library(corrgram) # to investigate the correlation between variables
library(corrplot) # to plot correlation data
library(knitr)
library(kableExtra)
library(caTools)
library(modelr)
library(GGally)
library(viridis)
library(factoextra)
library(readr)
##########################

Load the bikeshare dataset

# Read the raw bike-share records (path is relative to the working directory)
# and preview the first rows.
bike <- read.csv(file = "bikeshare.csv")
head(x = bike)
##              datetime season holiday workingday weather temp  atemp
## 1 2011-01-01 00:00:00      1       0          0       1 9.84 14.395
## 2 2011-01-01 01:00:00      1       0          0       1 9.02 13.635
## 3 2011-01-01 02:00:00      1       0          0       1 9.02 13.635
## 4 2011-01-01 03:00:00      1       0          0       1 9.84 14.395
## 5 2011-01-01 04:00:00      1       0          0       1 9.84 14.395
## 6 2011-01-01 05:00:00      1       0          0       2 9.84 12.880
##   humidity windspeed casual registered count
## 1       81    0.0000      3         13    16
## 2       80    0.0000      8         32    40
## 3       80    0.0000      5         27    32
## 4       75    0.0000      3         10    13
## 5       75    0.0000      0          1     1
## 6       75    6.0032      0          1     1

We try to do some exploratory data analysis (EDA).

# Scatter plot of rental count against temperature; points are semi-transparent
# and coloured by temperature.
ggplot(data = bike, mapping = aes(x = temp, y = count)) +
  geom_point(mapping = aes(color = temp), alpha = 0.4) +
  theme_bw()

Now we convert the datetime column to a date-time (POSIXct) type.

# Parse the timestamps with an explicit format and a fixed time zone.
# Without `tz`, as.POSIXct() interprets the strings in the session's local
# time zone, so the result depends on the machine, and wall-clock times that
# fall into a daylight-saving gap may not parse as written. UTC has no DST,
# so every "YYYY-MM-DD HH:MM:SS" string keeps its hour exactly as recorded.
bike$datetime <- as.POSIXct(
  bike$datetime,
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)

Now try to plot with datetime

# Rental counts over time, coloured by temperature. A custom two-colour
# gradient is used so that individual points are easier to tell apart.
ggplot(data = bike, mapping = aes(x = datetime, y = count)) +
  geom_point(mapping = aes(color = temp), alpha = 0.4) +
  scale_color_continuous(low = "#55D8CE", high = "#FF6E2E") +
  theme_bw()

From the graph one can see that bike rental counts increase during the summer.

Next, we investigate the correlation between temperature and rental counts.

# Correlation matrix of temperature and rental count (cor() defaults to the
# Pearson method); the result is kept in `cor_test1` and printed.
cor_test1 <- cor(x = bike[, c("temp", "count")])
cor_test1
##            temp     count
## temp  1.0000000 0.3944536
## count 0.3944536 1.0000000

The coefficient of about 0.39 indicates a moderate positive correlation between temp and count.

Now we investigate the rental counts in the different seasons.

To do that, one can convert the season column to a factor column.

# Treat season as a categorical variable so that each season gets its own box.
bike$season <- as.factor(bike$season)

# Distribution of rental counts per season, one coloured box per season.
ggplot(data = bike, mapping = aes(x = season, y = count)) +
  geom_boxplot(mapping = aes(color = season)) +
  labs(
    x = "Seasons",
    y = "bike rental counts",
    title = "Rental count for each season"
  ) +
  theme_bw()

It is difficult to say that the rental count is a linear function of a season.

One can also investigate the hourly rental counts and see what will happen.

#' Extract the hour-of-day component of a date-time value.
#'
#' @param x A date-time object (e.g. POSIXct); may contain several elements.
#' @return A character vector with the two-digit, 24-hour clock hour
#'   ("00"-"23") of each element of `x`.
getHours <- function(x) format(x, "%H")
# format() is vectorised over date-time columns, so the helper can be applied
# to the whole column in one call. This avoids the per-element sapply() loop
# and always yields a character vector (sapply() would return an empty list
# for a zero-row data frame).
bike$Hours <- getHours(bike$datetime)

head(bike)
##              datetime season holiday workingday weather temp  atemp
## 1 2011-01-01 00:00:00      1       0          0       1 9.84 14.395
## 2 2011-01-01 01:00:00      1       0          0       1 9.02 13.635
## 3 2011-01-01 02:00:00      1       0          0       1 9.02 13.635
## 4 2011-01-01 03:00:00      1       0          0       1 9.84 14.395
## 5 2011-01-01 04:00:00      1       0          0       1 9.84 14.395
## 6 2011-01-01 05:00:00      1       0          0       2 9.84 12.880
##   humidity windspeed casual registered count Hours
## 1       81    0.0000      3         13    16    00
## 2       80    0.0000      8         32    40    01
## 3       80    0.0000      5         27    32    02
## 4       75    0.0000      3         10    13    03
## 5       75    0.0000      0          1     1    04
## 6       75    6.0032      0          1     1    05

Now try to plot this

 # Hourly rental counts on working days, coloured by temperature.
 # Points are jittered horizontally only (height = 0), so plotted counts are
 # exact. Arguments are spelled out in full instead of relying on partial
 # matching of `w`/`h`.
 bike %>%
   filter(workingday == 1) %>%
   ggplot(aes(x = Hours, y = count)) +
   geom_point(
     aes(color = temp),
     position = position_jitter(width = 1, height = 0),
     alpha = 0.5
   ) +
   scale_color_gradientn(
     colours = c(
       'dark blue', 'blue', 'light blue', 'light green',
       'yellow', 'orange', 'red'
     )
   ) +
   labs(
     x = "Hours",
     y = "Counts",
     title = "Hourly rental counts"
   ) +
   theme_bw()

 # hour_rental
#  ggplotly(hour_rental)

NB: On working days we can see peaks between 7-9 and 17-19 hours.

Non-Working day plot

Now we look at what happens on non-working days.

# Hourly rental counts on non-working days, coloured by temperature.
# Points are jittered horizontally only (height = 0), so plotted counts are
# exact. Arguments are spelled out in full instead of relying on partial
# matching of `w`/`h`.
bike %>%
  filter(workingday == 0) %>%
  ggplot(aes(x = Hours, y = count)) +
  geom_point(
    aes(color = temp),
    position = position_jitter(width = 1, height = 0),
    alpha = 0.5
  ) +
  scale_color_gradientn(
    colours = c(
      'dark blue', 'blue', 'light blue', 'light green',
      'yellow', 'orange', 'red'
    )
  ) +
  labs(
    x = "Hours",
    y = "Counts",
    title = "Hourly rental counts"
  ) +
  theme_bw()

 # hour_rental
#  ggplotly(hour_rental)

Building a Linear Regression Model

# Simple linear regression of rental count on temperature, followed by the
# standard coefficient / goodness-of-fit summary.
model <- lm(formula = count ~ temp, data = bike)
summary(object = model)
## 
## Call:
## lm(formula = count ~ temp, data = bike)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -293.32 -112.36  -33.36   78.98  741.44 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.0462     4.4394   1.362    0.173    
## temp          9.1705     0.2048  44.783   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 166.5 on 10884 degrees of freedom
## Multiple R-squared:  0.1556, Adjusted R-squared:  0.1555 
## F-statistic:  2006 on 1 and 10884 DF,  p-value: < 2.2e-16

We got an R-squared of 0.1556, which is a poor score: temperature alone explains only about 16% of the variance in rental counts, so we cannot say that a linear regression model is the best way to predict the bike rental count. In what follows, we will investigate some other models.