Problem Statement

The bike rental data-set is a classic data analytics problem. It is set in the context of bike-sharing system where users can rent a bike at location A and return it at location B. The users’ movement is monitored for sensing mobility or events in the city and understand if the users are using the bikes in a conducive environment (i.e. health issues)

I will attempt to answer 5 business questions related to mobility/events in city or users’ health issues

Load packages

loading the packages at the start allows you to code with ease

#library 
pacman::p_load(xlsx,tidyverse, reshape, lubridate, ggthemes, ggrepel)

Import data

#import raw data
bikes.df <- read_csv("D:/Documents/Family/TKC/20180818_MTEBA/Academic/2019Sem1/EBA5001/3/Lab/assignment/data/bikes_2011.csv")
## Parsed with column specification:
## cols(
##   instant = col_double(),
##   dteday = col_date(format = ""),
##   season = col_double(),
##   yr = col_double(),
##   mnth = col_double(),
##   hr = col_double(),
##   holiday = col_double(),
##   weekday = col_double(),
##   workingday = col_double(),
##   weathersit = col_double(),
##   temp = col_double(),
##   atemp = col_double(),
##   hum = col_double(),
##   windspeed = col_double(),
##   casual = col_double(),
##   registered = col_double(),
##   cnt = col_double()
## )
#view actual students' response data
knitr::kable(
  bikes.df[1:5, ], 
  caption = "Bike Rental Data")
Bike Rental Data
instant dteday season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
1 2011-01-01 1 0 1 0 0 6 0 1 0.24 0.2879 0.81 0 3 13 16
2 2011-01-01 1 0 1 1 0 6 0 1 0.22 0.2727 0.80 0 8 32 40
3 2011-01-01 1 0 1 2 0 6 0 1 0.22 0.2727 0.80 0 5 27 32
4 2011-01-01 1 0 1 3 0 6 0 1 0.24 0.2879 0.75 0 3 10 13
5 2011-01-01 1 0 1 4 0 6 0 1 0.24 0.2879 0.75 0 0 1 1

Check missing data

#check for missing data
missing = bikes.df %>%
  filter(!complete.cases(.))

nrow(missing)/nrow(bikes.df) #no missing data
## [1] 0

Check similar variables - temp and atemp

Temp is a normalised temperature and atemp is a normalised feeling temperature

#understand the 5 statistical values of either temp or atemp
summary(bikes.df$temp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0200  0.3200  0.5000  0.4891  0.6600  0.9600
summary(bikes.df$atemp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3182  0.4848  0.4690  0.6212  1.0000
#check if they have strong correlation
cor(bikes.df$temp,bikes.df$atemp) 
## [1] 0.9920215
#visualise the data in plot to confirm that it has a linear relationship
# Basic scatter plot
ggplot(bikes.df, aes(x=temp, y=atemp)) + 
  geom_point(size=2,alpha=0.1) +
  stat_smooth(method = 'lm') + 
  theme_economist() +
  theme(panel.border = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_text(size=7)) +
  labs(x= "Temperature", y= "Feel Temperature", 
       title = "Temperature vs Feel Temperature", subtitle = "Find if the both variables are linearly related")

#Both of them are linearly related and very similar so the use of either temp is acceptable.

Feature engineer variables

Some variables that are binary in nature contain a singular value. Instead, the feature-engineered variable can display 2 or more values

#feature engineer variable to make it more useful
#since the purpose is either for work or holiday, a purpose variable can be made from the 2 variables - 'workingday' or 'holiday'
bikes.df.clean <- bikes.df
table(bikes.df.clean$holiday , bikes.df.clean$workingday) 
##    
##        0    1
##   0 2495 5911
##   1  239    0
#'workingday' and 'holiday' are mutually exclusive
# if neither of the purposes is used for cycling, then it will be categorised as 'weekends' reason for cycling after checking the data

bikes.df.clean <- bikes.df.clean %>%
  mutate(purpose=case_when(workingday==1 ~ "Work",
                           holiday==1 ~ "Holiday",
                           TRUE ~ "Weekend"))

#remove 'workingday' and 'holiday' variables
bikes.df.clean <- bikes.df.clean %>%
  select(-workingday,-holiday)

#check month against season ====
table(bikes.df.clean$mnth , bikes.df.clean$season)
##     
##        1   2   3   4
##   1  688   0   0   0
##   2  649   0   0   0
##   3  470 260   0   0
##   4    0 719   0   0
##   5    0 744   0   0
##   6    0 480 240   0
##   7    0   0 744   0
##   8    0   0 731   0
##   9    0   0 525 192
##   10   0   0   0 743
##   11   0   0   0 719
##   12 261   0   0 480
#conclusion: months and seasons are similar except that seasons variable has a higher grouping level
#drop season
bikes.df.clean <- bikes.df.clean %>%
  select(-season)

#recode the variables  ====
bikes.df.clean <- bikes.df.clean %>%
  mutate(weathersit=case_when(weathersit==1 ~ "1:Clear", 
                              weathersit==2 ~ "2:Mist",
                              weathersit==3 ~ "3:Light Snow",
                              weathersit==4 ~ "4:Heavy Rain"),
         daynight= case_when(hr>=4 & hr <10 ~ "Morning",
                             hr>= 10 & hr < 16 ~ "Noon",
                             hr>= 16 & hr < 22 ~ "Evening",
                             TRUE ~ "Night"),
         temp=temp*(39+8)-8,
         atemp=round((atemp*(50+16)-16), digits = 2),
         hum=hum*100,
         windspeed=round((windspeed *67), digits = 2))

knitr::kable(
  bikes.df.clean[1:5, ], 
  caption = "Bike Rental Data with feature engineered variables")
Bike Rental Data with feature engineered variables
instant dteday yr mnth hr weekday weathersit temp atemp hum windspeed casual registered cnt purpose daynight
1 2011-01-01 0 1 0 6 1:Clear 3.28 3 81 0 3 13 16 Weekend Night
2 2011-01-01 0 1 1 6 1:Clear 2.34 2 80 0 8 32 40 Weekend Night
3 2011-01-01 0 1 2 6 1:Clear 2.34 2 80 0 5 27 32 Weekend Night
4 2011-01-01 0 1 3 6 1:Clear 3.28 3 75 0 3 10 13 Weekend Night
5 2011-01-01 0 1 4 6 1:Clear 3.28 3 75 0 0 1 1 Weekend Morning

Reshape the data

#gather the data for number of registered and casual users into long form ====
bikes.df.clean.reshaped <- bikes.df.clean %>%
  gather(type, count, casual:cnt)

#Question 1 : What was the total bikes rented in each month? (high level overview)

bikes.df.clean.reshaped %>%
  filter(type!="cnt") %>%
  group_by(mnth,type) %>%
  dplyr::summarize(average=round(mean(count),0)) %>%
  mutate(monthf=factor(month(mnth),levels=as.character(1:12),labels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"),ordered=TRUE),
         percent=round(100*average/sum(average),0)) %>%
  ggplot(aes(x = as.factor(monthf), 
             y = average,fill=as.factor(type))) +
  geom_bar(stat='identity') +
  geom_text(aes(label=paste0(average, "\n" ,"(",percent,"%",")")), color="black", size=4 , position =position_stack(vjust = 0.5)) +
  labs(x= "Month", y= "Number of Bikes", 
       title = "What was the average bikes rented in each month?", subtitle = "Understand average hourly bike rental demand across the months", fill = "Type of Bikes") +
  theme_economist_white() +
  theme(#panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_text(size=6,hjust=1),
    axis.title.x=element_blank(),
    #axis.text.x=element_blank(),
    axis.ticks.x=element_blank(),
    #legend.title=element_blank(),
    legend.position = "bottom")

#Question 2 : What was the toal bikes rented in each month? (high level overview)

bikes.df.clean.reshaped %>%
  filter(type!="cnt") %>%
  group_by(hr,type, weekday) %>%
  dplyr::summarize(average=round(mean(count),0)) %>%
  mutate(weekday=ifelse(weekday==0,7,weekday),
         weekday=case_when(weekday==1 ~ "1. Mon",
                           weekday==2 ~ "2. Tue",
                           weekday==3 ~ "3. Wed",
                           weekday==4 ~ "4. Thu",
                           weekday==5 ~ "5. Fri",
                           weekday==6 ~ "6. Sat",
                           weekday==7 ~ "7. Sun")) %>%
  ggplot(aes(x = as.factor(hr), 
             y = average,fill=as.factor(type))) +
  geom_bar(stat='identity') +
  facet_grid(type~weekday) +
  #geom_text(aes(label=paste0(average)) , vjust=-0.2, color="white", size=4.5) +
  labs(x= "Month", y= "Number of Bikes", 
       title = "What was the average number of bikes rented in the different hours of the day?", subtitle = "Understand average hourly bike rental demand across the months", fill = "Type of Bikes") +
  theme_economist_white() +
  theme(#panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_text(size=8,hjust=1),
    axis.title.x=element_blank(),
    #axis.text.x=element_blank(),
    axis.ticks.x=element_blank(),
    #legend.title=element_blank(),
    legend.position = "bottom") +
  scale_x_discrete(breaks = seq(0, 20, by = 5))

#Question 3: Typically, how many bikes would be rented amongst the different types of users?

five.values <- bikes.df.clean.reshaped %>%
  group_by(type) %>% 
  summarise(five = list(fivenum(count))) %>% 
  tidyr::unnest()

bikes.df.clean.reshaped$type <- factor(bikes.df.clean.reshaped$type, levels = c("casual", "registered", "cnt"))  #order the type bar chart


bikes.df.clean.reshaped %>%
  ggplot(aes(x = factor(type), y = count, fill=type)) +
  geom_boxplot(alpha=0.3) +
  geom_text_repel(data = five.values, 
                  aes(x = factor(type), y = five, label = five), 
                  nudge_x = .5,
                  nudge_y = .5,
                  segment.alpha=.3) +
  scale_fill_brewer(palette="Dark2", name= "User Type" ,labels=c("Casual","Registered","Total")) +
  theme_economist_white() +
  labs(fill = "cnt" ,title="Typically, how many bikes were rented amongst the different types of users?", subtitle = "Benchmarking the hourly bike rental demand") +
  theme(#panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_text(size=8,hjust=1),
    axis.title.x=element_blank(),
    #axis.text.x=element_blank(),
    axis.ticks.x=element_blank(),
    #legend.title=element_blank(),
    legend.position = "bottom") +
  scale_x_discrete(labels=c("Casual", "Registered", "Total"))

#Question 4: What were the reasons that users rented bikes for?

five.values.1 <- bikes.df.clean.reshaped %>%
  group_by(type,purpose) %>% 
  summarise(five = list(fivenum(count))) %>% 
  tidyr::unnest()

bikes.df.clean.reshaped$type <- factor(bikes.df.clean.reshaped$type, levels = c("casual", "registered", "cnt")) #order the type bar chart

bikes.df.clean.reshaped %>%
  ggplot(aes(x = factor(type), y = count, fill=type)) +
  geom_boxplot(alpha=0.3, width=0.5) +
  facet_grid(.~purpose, scales = "free" , space = "free") +
  geom_text_repel(data = five.values.1, 
                  aes(x = factor(type), y = five, label = five), 
                  nudge_x = .5,
                  nudge_y = .5,
                  segment.alpha=.3,
                  size=3) +
  scale_fill_brewer(palette="Dark2", name= "User Type" ,labels=c("Casual","Registered","Total")) +
  theme_economist_white() +
  labs(fill = "cnt" ,title="What were the reasons that users rented bikes for?", subtitle = "Finding which purpose constituted the highest demand") +
  theme(#panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_text(size=8,hjust=.8),
    axis.title.x=element_blank(),
    #axis.text.x=element_blank(),
    axis.ticks.x=element_blank(),
    #legend.title=element_blank(),
    legend.position = "bottom") +
  scale_x_discrete(labels=c("Casual", "Registered", "Total")) 

#Question 5: When will the major events be happening?

library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:lubridate':
## 
##     here
## The following objects are masked from 'package:reshape':
## 
##     rename, round_any
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
#customisng the data frame for time series calendar heat map
t <- bikes.df.clean %>%
  mutate(weekday=ifelse(weekday==0,7,weekday), #recode sunday from 0 to 7
         weekdayf=factor(weekday,levels=rev(1:7),labels=rev(c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")),ordered=TRUE), #order mon to sun chronologically
         monthf=factor(month(bikes.df.clean$mnth),levels=as.character(1:12),labels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"),ordered=TRUE), #order Jan to Dec chronologically
         weekno=as.numeric(format(bikes.df.clean$dteday,"%W")), #obtain week number
         yearmonth=factor(as.yearmon(bikes.df.clean$dteday)), #obtain year month
         daynightf=factor(bikes.df.clean$daynight,levels=c("Morning","Noon","Evening","Night"))) #order Moring to Night chronologically

t<-ddply(t,.(yearmonth),transform,monthweek=1+weekno-min(weekno)) # normalizing the week to start at 1 for every month


#check colour theme needed
#ggthemes_data$economist$fg

ggplot(t, aes(monthweek, weekdayf, fill = bikes.df.clean$cnt)) + 
  geom_tile(colour = "white") + 
  facet_grid(daynightf~monthf) + 
  scale_fill_gradient(low="blue", high="red") + 
  xlab("Week of Month") + ylab("")  +
  labs(fill = "cnt" ,title="When did the major events happen?", subtitle = "Time-Series Calendar Heatmap: Observe Bike Rental Demand and major events that happened within the city") +
  theme_economist() +
  theme(#panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_text(size=8,hjust=1),
    axis.title.x=element_blank(),
    #axis.text.x=element_blank(),
    axis.ticks.x=element_blank(),
    #legend.title=element_blank(),
    legend.position = "bottom") 

detach("package:plyr") #detach after use since it will clash with dplyr

#Question 6: identify users who are cycling in for various purpose in each hour are under high temperature

colors.temperature <- c("#5e4fa2", "#3288bd", "#66c2a5", "#abdda4", "#e6f598",
                        "#fee08b", "#fdae61", "#f46d43", "#d53e4f", "#9e0142") #provide colours for temperature to show whether bike users cycle under extremem temperature

bikes.df.clean %>%
  ggplot(aes(hr, cnt, color=atemp, group = 1)) +
  facet_grid(purpose ~ weathersit) +
  geom_point() +
  #geom_smooth() +
  stat_summary(fun.y=mean, colour="blue", geom="line", size = 1) + # draw a mean line in the data 
  labs(title="Daily Bike Rental Demand \nPer Time of Day, Work/Nonwork day, and Weather") + 
  labs(x="Hour of Day") + 
  labs(y="Bike Rentals Initiated per Hour") +
  scale_colour_gradientn("Atemp", colours=colors.temperature) +
  theme_economist() +
  labs(fill = "cnt" ,title="Who were the users that cycled under high temperature?", subtitle = "Identify the users (in red) who cycled under high temperature") +
  theme(plot.title = element_text(size = rel(1.5)),
        legend.position = "right")
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

#Question 7: how do the weather conditions affect the bike rental volume?

bikes.df.clean.reshaped %>%
  select(temp,atemp,windspeed,hum,count) %>% #select the weather condition variables
  gather(-count,key="var",value="value") %>% #reshape from wide to long format
  ggplot(aes(x=value,y=count)) +
  geom_point(aes(color=count)) +
  scale_color_gradient(low = "blue", high = "red") +
  stat_smooth(color="grey13") + 
  facet_wrap(~var, scales = "free") +
  theme_economist_white() +
  labs(fill = "cnt" ,title="How did the weather affect the bike rental demand?", subtitle = "Identify which condition had a correlation with the number of users") +
  theme(#panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    #axis.ticks.y = element_blank(),
    axis.text.y = element_text(size=8,hjust=1),
    axis.title.x=element_blank(),
    #axis.text.x=element_blank(),
    #axis.ticks.x=element_blank(),
    #legend.title=element_blank(),
    legend.position = "bottom") 
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'