##########################
library(tidyverse)
library(lubridate)
library(pander) # for prettier tables
library(scales) # for making prettier axes in plots
library(plotly)
library(devtools)
library(maps)
library(corrgram) # to investigate the correlation between variables
library(corrplot) # to plot correlation data
library(knitr)
library(kableExtra)
library(caTools)
library(modelr)
library(GGally)
library(viridis)
library(factoextra)
library(readr)
##########################
Load the bikeshare dataset.
# Read the bike-share records from the CSV file and preview the first six rows.
bike <- read.csv(file = "bikeshare.csv")
head(bike, n = 6L)
## datetime season holiday workingday weather temp atemp
## 1 2011-01-01 00:00:00 1 0 0 1 9.84 14.395
## 2 2011-01-01 01:00:00 1 0 0 1 9.02 13.635
## 3 2011-01-01 02:00:00 1 0 0 1 9.02 13.635
## 4 2011-01-01 03:00:00 1 0 0 1 9.84 14.395
## 5 2011-01-01 04:00:00 1 0 0 1 9.84 14.395
## 6 2011-01-01 05:00:00 1 0 0 2 9.84 12.880
## humidity windspeed casual registered count
## 1 81 0.0000 3 13 16
## 2 80 0.0000 8 32 40
## 3 80 0.0000 5 27 32
## 4 75 0.0000 3 10 13
## 5 75 0.0000 0 1 1
## 6 75 6.0032 0 1 1
We start with some exploratory data analysis (EDA).
# Scatter plot of rental count against temperature. Points are coloured by
# temperature and drawn semi-transparent (alpha = 0.4).
ggplot(data = bike, mapping = aes(x = temp, y = count)) +
  geom_point(mapping = aes(color = temp), alpha = 0.4) +
  theme_bw()
Now we convert the datetime column from text to a date-time (POSIXct) type.
# Parse the timestamp strings into POSIXct. The format and time zone are given
# explicitly so the result does not depend on the machine's local time zone:
# with the default (local) zone, a timestamp that falls into a daylight-saving
# gap may not convert cleanly. UTC has no DST, so the wall-clock hours stored
# in the file are kept as written.
# NOTE(review): assumes every value looks like "2011-01-01 00:00:00" - confirm;
# values that do not match the format become NA.
bike$datetime <- as.POSIXct(
  bike$datetime,
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
Now plot the rental counts against datetime.
# Rental count over time, with points coloured by temperature.
ggplot(data = bike, mapping = aes(x = datetime, y = count)) +
  geom_point(mapping = aes(color = temp), alpha = 0.4) +
  theme_bw() +
  # Custom low/high colours so the points are easier to tell apart.
  scale_color_continuous(low = "#55D8CE", high = "#FF6E2E")
From the graph one can see that the bike rental counts increase during summer.
Now we investigate the correlation between temperature and rental counts.
# Pearson correlation matrix (cor()'s default method) for temp and count.
cor_test1 <- cor(x = bike[, c("temp", "count")], method = "pearson")
cor_test1
## temp count
## temp 1.0000000 0.3944536
## count 0.3944536 1.0000000
One can see a moderate positive correlation (about 0.39) between temp and count.
Now we investigate the rental counts in the different seasons.
To do that, we convert the season column to a factor column.
# Treat season as a categorical variable so each season gets its own box.
bike$season <- as.factor(bike$season)

# Box plot of rental counts per season, one outline colour per season.
bike %>%
  ggplot(aes(x = season, y = count)) +
  geom_boxplot(aes(color = season)) +
  labs(
    x = "Seasons",
    y = "bike rental counts",
    # Typo fix: the title previously read "Rantal".
    title = "Rental count for each season"
  ) +
  theme_bw()
It is difficult to say that the rental count is a linear function of a season.
One can also investigate the hourly rental counts and see what will happen.
#' Extract the hour of day from date-time values.
#'
#' @param x Value(s) handed to `format()`; this script passes POSIXct
#'   timestamps.
#' @return The result of `format(x, "%H")`; for POSIXct input this is the
#'   two-digit hour as a character string (e.g. "05").
getHours <- function(x) format(x, "%H")
# format() is vectorised over POSIXct, so getHours() is applied to the whole
# column in one call instead of once per row through sapply(), which is slower
# and whose return type depends on the input (e.g. a list for empty input).
bike$Hours <- getHours(bike$datetime)
head(bike)
## datetime season holiday workingday weather temp atemp
## 1 2011-01-01 00:00:00 1 0 0 1 9.84 14.395
## 2 2011-01-01 01:00:00 1 0 0 1 9.02 13.635
## 3 2011-01-01 02:00:00 1 0 0 1 9.02 13.635
## 4 2011-01-01 03:00:00 1 0 0 1 9.84 14.395
## 5 2011-01-01 04:00:00 1 0 0 1 9.84 14.395
## 6 2011-01-01 05:00:00 1 0 0 2 9.84 12.880
## humidity windspeed casual registered count Hours
## 1 81 0.0000 3 13 16 00
## 2 80 0.0000 8 32 40 01
## 3 80 0.0000 5 27 32 02
## 4 75 0.0000 3 10 13 03
## 5 75 0.0000 0 1 1 04
## 6 75 6.0032 0 1 1 05
Now try to plot this
# Hourly rental counts on working days (workingday == 1), coloured by
# temperature.
bike %>%
  filter(workingday == 1) %>%
  ggplot(aes(x = Hours, y = count)) +
  # Jitter horizontally only. Argument names are spelled out in full rather
  # than relying on partial matching of `w` / `h`.
  geom_point(
    mapping = aes(color = temp),
    position = position_jitter(width = 1, height = 0),
    alpha = 0.5
  ) +
  scale_color_gradientn(
    colours = c(
      "dark blue", "blue", "light blue", "light green",
      "yellow", "orange", "red"
    )
  ) +
  labs(
    x = "Hours",
    y = "Counts",
    # The title names the subset so this plot can be told apart from the
    # non-working-day plot.
    title = "Hourly rental counts (working days)"
  ) +
  theme_bw()
# hour_rental
# ggplotly(hour_rental)
NB: We can see peaks between 7-9 and between 17-19.
Now let us see what happens on non-working days.
# Hourly rental counts on non-working days (workingday == 0), coloured by
# temperature.
bike %>%
  filter(workingday == 0) %>%
  ggplot(aes(x = Hours, y = count)) +
  # Jitter horizontally only. Argument names are spelled out in full rather
  # than relying on partial matching of `w` / `h`.
  geom_point(
    mapping = aes(color = temp),
    position = position_jitter(width = 1, height = 0),
    alpha = 0.5
  ) +
  scale_color_gradientn(
    colours = c(
      "dark blue", "blue", "light blue", "light green",
      "yellow", "orange", "red"
    )
  ) +
  labs(
    x = "Hours",
    y = "Counts",
    # The title names the subset so this plot can be told apart from the
    # working-day plot.
    title = "Hourly rental counts (non-working days)"
  ) +
  theme_bw()
# hour_rental
# ggplotly(hour_rental)
# Simple linear regression of rental count on temperature.
model <- lm(formula = count ~ temp, data = bike)
summary(model)
##
## Call:
## lm(formula = count ~ temp, data = bike)
##
## Residuals:
## Min 1Q Median 3Q Max
## -293.32 -112.36 -33.36 78.98 741.44
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.0462 4.4394 1.362 0.173
## temp 9.1705 0.2048 44.783 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 166.5 on 10884 degrees of freedom
## Multiple R-squared: 0.1556, Adjusted R-squared: 0.1555
## F-statistic: 2006 on 1 and 10884 DF, p-value: < 2.2e-16
We got an R-squared of 0.1556, which is not a good score, so we cannot say that a linear regression model is the best way to predict the bike rental count. In what follows, we will investigate some other models.