The bike rental data-set is a classic data analytics problem. It is set in the context of bike-sharing system where users can rent a bike at location A and return it at location B. The users’ movement is monitored for sensing mobility or events in the city and understand if the users are using the bikes in a conducive environment (i.e. health issues)
I will attempt to answer 5 business questions related to mobility/events in city or users’ health issues
loading the packages at the start allows you to code with ease
#library
pacman::p_load(xlsx,tidyverse, reshape, lubridate, ggthemes, ggrepel)
#import raw data
bikes.df <- read_csv("D:/Documents/Family/TKC/20180818_MTEBA/Academic/2019Sem1/EBA5001/3/Lab/assignment/data/bikes_2011.csv")
## Parsed with column specification:
## cols(
## instant = col_double(),
## dteday = col_date(format = ""),
## season = col_double(),
## yr = col_double(),
## mnth = col_double(),
## hr = col_double(),
## holiday = col_double(),
## weekday = col_double(),
## workingday = col_double(),
## weathersit = col_double(),
## temp = col_double(),
## atemp = col_double(),
## hum = col_double(),
## windspeed = col_double(),
## casual = col_double(),
## registered = col_double(),
## cnt = col_double()
## )
#view actual students' response data
knitr::kable(
bikes.df[1:5, ],
caption = "Bike Rental Data")
| instant | dteday | season | yr | mnth | hr | holiday | weekday | workingday | weathersit | temp | atemp | hum | windspeed | casual | registered | cnt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2011-01-01 | 1 | 0 | 1 | 0 | 0 | 6 | 0 | 1 | 0.24 | 0.2879 | 0.81 | 0 | 3 | 13 | 16 |
| 2 | 2011-01-01 | 1 | 0 | 1 | 1 | 0 | 6 | 0 | 1 | 0.22 | 0.2727 | 0.80 | 0 | 8 | 32 | 40 |
| 3 | 2011-01-01 | 1 | 0 | 1 | 2 | 0 | 6 | 0 | 1 | 0.22 | 0.2727 | 0.80 | 0 | 5 | 27 | 32 |
| 4 | 2011-01-01 | 1 | 0 | 1 | 3 | 0 | 6 | 0 | 1 | 0.24 | 0.2879 | 0.75 | 0 | 3 | 10 | 13 |
| 5 | 2011-01-01 | 1 | 0 | 1 | 4 | 0 | 6 | 0 | 1 | 0.24 | 0.2879 | 0.75 | 0 | 0 | 1 | 1 |
#check for missing data
missing = bikes.df %>%
filter(!complete.cases(.))
nrow(missing)/nrow(bikes.df) #no missing data
## [1] 0
Temp is a normalised temperature and atemp is a normalised feeling temperature
#understand the 5 statistical values of either temp or atemp
summary(bikes.df$temp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0200 0.3200 0.5000 0.4891 0.6600 0.9600
summary(bikes.df$atemp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.3182 0.4848 0.4690 0.6212 1.0000
#check if they have strong correlation
cor(bikes.df$temp,bikes.df$atemp)
## [1] 0.9920215
#visualise the data in plot to confirm that it has a linear relationship
# Basic scatter plot
ggplot(bikes.df, aes(x=temp, y=atemp)) +
geom_point(size=2,alpha=0.1) +
stat_smooth(method = 'lm') +
theme_economist() +
theme(panel.border = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_text(size=7)) +
labs(x= "Temperature", y= "Feel Temperature",
title = "Temperature vs Feel Temperature", subtitle = "Find if the both variables are linearly related")
#Both of them are linearly related and very similar so the use of either temp is acceptable.
Some variables that are binary in nature contain a singular value. Instead, the feature-engineered variable can display 2 or more values
#feature engineer variable to make it more useful
#since the purpose is either for work or holiday, a purpose variable can be made from the 2 variables - 'workingday' or 'holiday'
bikes.df.clean <- bikes.df
table(bikes.df.clean$holiday , bikes.df.clean$workingday)
##
## 0 1
## 0 2495 5911
## 1 239 0
#'workingday' and 'holiday' are mutually exclusive
# if neither of the purposes is used for cycling, then it will be categorised as 'weekends' reason for cycling after checking the data
bikes.df.clean <- bikes.df.clean %>%
mutate(purpose=case_when(workingday==1 ~ "Work",
holiday==1 ~ "Holiday",
TRUE ~ "Weekend"))
#remove 'workingday' and 'holiday' variables
bikes.df.clean <- bikes.df.clean %>%
select(-workingday,-holiday)
#check month against season ====
table(bikes.df.clean$mnth , bikes.df.clean$season)
##
## 1 2 3 4
## 1 688 0 0 0
## 2 649 0 0 0
## 3 470 260 0 0
## 4 0 719 0 0
## 5 0 744 0 0
## 6 0 480 240 0
## 7 0 0 744 0
## 8 0 0 731 0
## 9 0 0 525 192
## 10 0 0 0 743
## 11 0 0 0 719
## 12 261 0 0 480
#conclusion: months and seasons are similar except that seasons variable has a higher grouping level
#drop season
bikes.df.clean <- bikes.df.clean %>%
select(-season)
#recode the variables ====
bikes.df.clean <- bikes.df.clean %>%
mutate(weathersit=case_when(weathersit==1 ~ "1:Clear",
weathersit==2 ~ "2:Mist",
weathersit==3 ~ "3:Light Snow",
weathersit==4 ~ "4:Heavy Rain"),
daynight= case_when(hr>=4 & hr <10 ~ "Morning",
hr>= 10 & hr < 16 ~ "Noon",
hr>= 16 & hr < 22 ~ "Evening",
TRUE ~ "Night"),
temp=temp*(39+8)-8,
atemp=round((atemp*(50+16)-16), digits = 2),
hum=hum*100,
windspeed=round((windspeed *67), digits = 2))
knitr::kable(
bikes.df.clean[1:5, ],
caption = "Bike Rental Data with feature engineered variables")
| instant | dteday | yr | mnth | hr | weekday | weathersit | temp | atemp | hum | windspeed | casual | registered | cnt | purpose | daynight |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2011-01-01 | 0 | 1 | 0 | 6 | 1:Clear | 3.28 | 3 | 81 | 0 | 3 | 13 | 16 | Weekend | Night |
| 2 | 2011-01-01 | 0 | 1 | 1 | 6 | 1:Clear | 2.34 | 2 | 80 | 0 | 8 | 32 | 40 | Weekend | Night |
| 3 | 2011-01-01 | 0 | 1 | 2 | 6 | 1:Clear | 2.34 | 2 | 80 | 0 | 5 | 27 | 32 | Weekend | Night |
| 4 | 2011-01-01 | 0 | 1 | 3 | 6 | 1:Clear | 3.28 | 3 | 75 | 0 | 3 | 10 | 13 | Weekend | Night |
| 5 | 2011-01-01 | 0 | 1 | 4 | 6 | 1:Clear | 3.28 | 3 | 75 | 0 | 0 | 1 | 1 | Weekend | Morning |
#gather the data for number of registered and casual users into long form ====
bikes.df.clean.reshaped <- bikes.df.clean %>%
gather(type, count, casual:cnt)
bikes.df.clean.reshaped %>%
filter(type!="cnt") %>%
group_by(mnth,type) %>%
dplyr::summarize(average=round(mean(count),0)) %>%
mutate(monthf=factor(month(mnth),levels=as.character(1:12),labels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"),ordered=TRUE),
percent=round(100*average/sum(average),0)) %>%
ggplot(aes(x = as.factor(monthf),
y = average,fill=as.factor(type))) +
geom_bar(stat='identity') +
geom_text(aes(label=paste0(average, "\n" ,"(",percent,"%",")")), color="black", size=4 , position =position_stack(vjust = 0.5)) +
labs(x= "Month", y= "Number of Bikes",
title = "What was the average bikes rented in each month?", subtitle = "Understand average hourly bike rental demand across the months", fill = "Type of Bikes") +
theme_economist_white() +
theme(#panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_text(size=6,hjust=1),
axis.title.x=element_blank(),
#axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
#legend.title=element_blank(),
legend.position = "bottom")
bikes.df.clean.reshaped %>%
filter(type!="cnt") %>%
group_by(hr,type, weekday) %>%
dplyr::summarize(average=round(mean(count),0)) %>%
mutate(weekday=ifelse(weekday==0,7,weekday),
weekday=case_when(weekday==1 ~ "1. Mon",
weekday==2 ~ "2. Tue",
weekday==3 ~ "3. Wed",
weekday==4 ~ "4. Thu",
weekday==5 ~ "5. Fri",
weekday==6 ~ "6. Sat",
weekday==7 ~ "7. Sun")) %>%
ggplot(aes(x = as.factor(hr),
y = average,fill=as.factor(type))) +
geom_bar(stat='identity') +
facet_grid(type~weekday) +
#geom_text(aes(label=paste0(average)) , vjust=-0.2, color="white", size=4.5) +
labs(x= "Month", y= "Number of Bikes",
title = "What was the average number of bikes rented in the different hours of the day?", subtitle = "Understand average hourly bike rental demand across the months", fill = "Type of Bikes") +
theme_economist_white() +
theme(#panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_text(size=8,hjust=1),
axis.title.x=element_blank(),
#axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
#legend.title=element_blank(),
legend.position = "bottom") +
scale_x_discrete(breaks = seq(0, 20, by = 5))
five.values <- bikes.df.clean.reshaped %>%
group_by(type) %>%
summarise(five = list(fivenum(count))) %>%
tidyr::unnest()
bikes.df.clean.reshaped$type <- factor(bikes.df.clean.reshaped$type, levels = c("casual", "registered", "cnt")) #order the type bar chart
bikes.df.clean.reshaped %>%
ggplot(aes(x = factor(type), y = count, fill=type)) +
geom_boxplot(alpha=0.3) +
geom_text_repel(data = five.values,
aes(x = factor(type), y = five, label = five),
nudge_x = .5,
nudge_y = .5,
segment.alpha=.3) +
scale_fill_brewer(palette="Dark2", name= "User Type" ,labels=c("Casual","Registered","Total")) +
theme_economist_white() +
labs(fill = "cnt" ,title="Typically, how many bikes were rented amongst the different types of users?", subtitle = "Benchmarking the hourly bike rental demand") +
theme(#panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_text(size=8,hjust=1),
axis.title.x=element_blank(),
#axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
#legend.title=element_blank(),
legend.position = "bottom") +
scale_x_discrete(labels=c("Casual", "Registered", "Total"))
five.values.1 <- bikes.df.clean.reshaped %>%
group_by(type,purpose) %>%
summarise(five = list(fivenum(count))) %>%
tidyr::unnest()
bikes.df.clean.reshaped$type <- factor(bikes.df.clean.reshaped$type, levels = c("casual", "registered", "cnt")) #order the type bar chart
bikes.df.clean.reshaped %>%
ggplot(aes(x = factor(type), y = count, fill=type)) +
geom_boxplot(alpha=0.3, width=0.5) +
facet_grid(.~purpose, scales = "free" , space = "free") +
geom_text_repel(data = five.values.1,
aes(x = factor(type), y = five, label = five),
nudge_x = .5,
nudge_y = .5,
segment.alpha=.3,
size=3) +
scale_fill_brewer(palette="Dark2", name= "User Type" ,labels=c("Casual","Registered","Total")) +
theme_economist_white() +
labs(fill = "cnt" ,title="What were the reasons that users rented bikes for?", subtitle = "Finding which purpose constituted the highest demand") +
theme(#panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_text(size=8,hjust=.8),
axis.title.x=element_blank(),
#axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
#legend.title=element_blank(),
legend.position = "bottom") +
scale_x_discrete(labels=c("Casual", "Registered", "Total"))
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:lubridate':
##
## here
## The following objects are masked from 'package:reshape':
##
## rename, round_any
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
#customisng the data frame for time series calendar heat map
t <- bikes.df.clean %>%
mutate(weekday=ifelse(weekday==0,7,weekday), #recode sunday from 0 to 7
weekdayf=factor(weekday,levels=rev(1:7),labels=rev(c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")),ordered=TRUE), #order mon to sun chronologically
monthf=factor(month(bikes.df.clean$mnth),levels=as.character(1:12),labels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"),ordered=TRUE), #order Jan to Dec chronologically
weekno=as.numeric(format(bikes.df.clean$dteday,"%W")), #obtain week number
yearmonth=factor(as.yearmon(bikes.df.clean$dteday)), #obtain year month
daynightf=factor(bikes.df.clean$daynight,levels=c("Morning","Noon","Evening","Night"))) #order Moring to Night chronologically
t<-ddply(t,.(yearmonth),transform,monthweek=1+weekno-min(weekno)) # normalizing the week to start at 1 for every month
#check colour theme needed
#ggthemes_data$economist$fg
ggplot(t, aes(monthweek, weekdayf, fill = bikes.df.clean$cnt)) +
geom_tile(colour = "white") +
facet_grid(daynightf~monthf) +
scale_fill_gradient(low="blue", high="red") +
xlab("Week of Month") + ylab("") +
labs(fill = "cnt" ,title="When did the major events happen?", subtitle = "Time-Series Calendar Heatmap: Observe Bike Rental Demand and major events that happened within the city") +
theme_economist() +
theme(#panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_text(size=8,hjust=1),
axis.title.x=element_blank(),
#axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
#legend.title=element_blank(),
legend.position = "bottom")
detach("package:plyr") #detach after use since it will clash with dplyr
colors.temperature <- c("#5e4fa2", "#3288bd", "#66c2a5", "#abdda4", "#e6f598",
"#fee08b", "#fdae61", "#f46d43", "#d53e4f", "#9e0142") #provide colours for temperature to show whether bike users cycle under extremem temperature
bikes.df.clean %>%
ggplot(aes(hr, cnt, color=atemp, group = 1)) +
facet_grid(purpose ~ weathersit) +
geom_point() +
#geom_smooth() +
stat_summary(fun.y=mean, colour="blue", geom="line", size = 1) + # draw a mean line in the data
labs(title="Daily Bike Rental Demand \nPer Time of Day, Work/Nonwork day, and Weather") +
labs(x="Hour of Day") +
labs(y="Bike Rentals Initiated per Hour") +
scale_colour_gradientn("Atemp", colours=colors.temperature) +
theme_economist() +
labs(fill = "cnt" ,title="Who were the users that cycled under high temperature?", subtitle = "Identify the users (in red) who cycled under high temperature") +
theme(plot.title = element_text(size = rel(1.5)),
legend.position = "right")
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
bikes.df.clean.reshaped %>%
select(temp,atemp,windspeed,hum,count) %>% #select the weather condition variables
gather(-count,key="var",value="value") %>% #reshape from wide to long format
ggplot(aes(x=value,y=count)) +
geom_point(aes(color=count)) +
scale_color_gradient(low = "blue", high = "red") +
stat_smooth(color="grey13") +
facet_wrap(~var, scales = "free") +
theme_economist_white() +
labs(fill = "cnt" ,title="How did the weather affect the bike rental demand?", subtitle = "Identify which condition had a correlation with the number of users") +
theme(#panel.grid.major.y = element_blank(),
panel.border = element_blank(),
#axis.ticks.y = element_blank(),
axis.text.y = element_text(size=8,hjust=1),
axis.title.x=element_blank(),
#axis.text.x=element_blank(),
#axis.ticks.x=element_blank(),
#legend.title=element_blank(),
legend.position = "bottom")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'