Reading the bikeshare dataset
# Load the bike-share data from a CSV file (path is relative to the working
# directory) and preview the first six rows.
bike <- read.csv(file = "bikeshare.csv")
head(bike, n = 6)
## datetime season holiday workingday weather temp atemp humidity
## 1 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81
## 2 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80
## 3 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80
## 4 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75
## 5 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75
## 6 2011-01-01 05:00:00 1 0 0 2 9.84 12.880 75
## windspeed casual registered count
## 1 0.0000 3 13 16
## 2 0.0000 8 32 40
## 3 0.0000 5 27 32
## 4 0.0000 3 10 13
## 5 0.0000 0 1 1
## 6 6.0032 0 1 1
Plotting
library(ggplot2)
# Scatter plot of rental count against temperature. Points are shaded by
# temperature and drawn semi-transparent to reduce overplotting.
ggplot(data = bike, mapping = aes(x = temp, y = count)) +
  geom_point(mapping = aes(color = temp), alpha = 0.4)

# Convert the datetime column to POSIXct so it can be used as a time axis.
# NOTE(review): no `tz` is supplied, so the session time zone is used -- confirm
# that this matches how the timestamps were recorded.
bike$datetime <- as.POSIXct(bike$datetime)

# Rental count over time, coloured by temperature on a teal-to-orange gradient.
ggplot(data = bike, mapping = aes(x = datetime, y = count)) +
  geom_point(mapping = aes(color = temp), alpha = 0.5) +
  scale_color_continuous(low = "#55D8CE", high = "#FF6E2E") +
  theme_bw()

# Correlation matrix of temperature and rental count.
cor(bike[, c("temp", "count")])
## temp count
## temp 1.0000000 0.3944536
## count 0.3944536 1.0000000
Let’s explore the season data
# Box plot of rental count per season; season is treated as a categorical
# variable for both the x axis and the outline colour.
ggplot(data = bike, mapping = aes(x = factor(season), y = count)) +
  geom_boxplot(mapping = aes(color = factor(season)))
Feature Engineering: Create an “hour” column that takes the hour from the datetime column. You’ll probably need to apply some function to the entire datetime column and reassign it.
# Extract the hour of day ("00" to "23") from each timestamp as a character
# column. format() is vectorised over POSIXct, so the whole column is converted
# in a single call instead of invoking an anonymous function once per row.
bike$hour <- format(bike$datetime, "%H")
head(bike)
## datetime season holiday workingday weather temp atemp humidity
## 1 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81
## 2 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80
## 3 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80
## 4 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75
## 5 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75
## 6 2011-01-01 05:00:00 1 0 0 2 9.84 12.880 75
## windspeed casual registered count hour
## 1 0.0000 3 13 16 00
## 2 0.0000 8 32 40 01
## 3 0.0000 5 27 32 02
## 4 0.0000 3 10 13 03
## 5 0.0000 0 1 1 04
## 6 6.0032 0 1 1 05
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Rental count by hour of day for working days only (workingday == 1).
# Points are jittered horizontally (no vertical jitter) and coloured by
# temperature on a blue-to-red gradient.
pl <- ggplot(filter(bike, workingday == 1), aes(x = hour, y = count)) +
  geom_point(
    mapping = aes(color = temp),
    position = position_jitter(width = 1, height = 0),
    alpha = 0.5
  ) +
  scale_color_gradientn(
    colours = c(
      "dark blue", "blue", "light blue", "light green",
      "yellow", "orange", "red"
    )
  )
pl + theme_bw()
The same plot for non-working days
# Rental count by hour of day for non-working days only (workingday == 0).
# Points are jittered horizontally (no vertical jitter) and coloured by
# temperature on a blue-to-red gradient.
pl <- ggplot(filter(bike, workingday == 0), aes(x = hour, y = count)) +
  geom_point(
    mapping = aes(color = temp),
    position = position_jitter(width = 1, height = 0),
    alpha = 0.5
  ) +
  scale_color_gradientn(
    colours = c(
      "dark blue", "blue", "light blue", "light green",
      "yellow", "orange", "red"
    )
  )
pl + theme_bw()
Building a model
# Simple linear regression of rental count on temperature alone, followed by
# the standard coefficient / goodness-of-fit summary.
temp.model <- lm(formula = count ~ temp, data = bike)
summary(object = temp.model)
##
## Call:
## lm(formula = count ~ temp, data = bike)
##
## Residuals:
## Min 1Q Median 3Q Max
## -293.32 -112.36 -33.36 78.98 741.44
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.0462 4.4394 1.362 0.173
## temp 9.1705 0.2048 44.783 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 166.5 on 10884 degrees of freedom
## Multiple R-squared: 0.1556, Adjusted R-squared: 0.1555
## F-statistic: 2006 on 1 and 10884 DF, p-value: < 2.2e-16
Prediction of the count at temp = 25, computed by hand from the fitted coefficients
# Hand-computed prediction from the temperature-only model:
# intercept + slope * temp, evaluated at temp = 25.
# The slope is entered as 9.17, i.e. the fitted estimate 9.1705 rounded.
6.0462 + 9.17 * 25
## [1] 235.2962
# Convert the hour column from character ("00" to "23") to numeric so it enters
# the model as a single numeric predictor. as.numeric() is vectorised; calling
# it directly avoids a per-element sapply() and the element names that sapply()
# attaches when iterating over a character vector.
bike$hour <- as.numeric(bike$hour)
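Build a final model with the following features: season, holiday, workingday, weather, temp, humidity, windspeed, hour (factor). Note that the code below enters hour as a numeric predictor rather than a factor, so it receives a single coefficient.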
# Multiple linear regression of count on every column of `bike` except
# casual, registered, datetime and atemp, which are removed from the `.`
# (all-other-columns) shorthand.
final.model <- lm(
  formula = count ~ . - casual - registered - datetime - atemp,
  data = bike
)
Summary of the model
# Print residual quantiles, coefficient table and fit statistics for the
# final model.
summary(object = final.model)
##
## Call:
## lm(formula = count ~ . - casual - registered - datetime - atemp,
## data = bike)
##
## Residuals:
## Min 1Q Median 3Q Max
## -324.61 -96.88 -31.01 55.27 688.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.91369 8.45147 5.551 2.91e-08 ***
## season 21.70333 1.35409 16.028 < 2e-16 ***
## holiday -10.29914 8.79069 -1.172 0.241
## workingday -0.71781 3.14463 -0.228 0.819
## weather -3.20909 2.49731 -1.285 0.199
## temp 7.01953 0.19135 36.684 < 2e-16 ***
## humidity -2.21174 0.09083 -24.349 < 2e-16 ***
## windspeed 0.20271 0.18639 1.088 0.277
## hour 7.61283 0.21688 35.102 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 147.8 on 10877 degrees of freedom
## Multiple R-squared: 0.3344, Adjusted R-squared: 0.3339
## F-statistic: 683 on 8 and 10877 DF, p-value: < 2.2e-16
Thank you !!