# Day of the week on which each trip started, as a labelled factor.
# `wday()` is presumably lubridate's (the file also uses `hm()`/`hour()`).
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
bikedata$weekdays <- wday(
  strptime(bikedata$Start.Date, format = "%d/%m/%Y %H:%M", tz = "UTC"),
  label = TRUE
)
##################day time function
#' Classify each trip's start time into a part of the day.
#'
#' @param data4 Data frame with a `Start.Date` column holding timestamps
#'   formatted as "%d/%m/%Y %H:%M". Defaults to the global `bikedata`.
#' @return Factor with levels "Night" [00:00, 06:00), "Morning"
#'   [06:00, 12:00), "Afternoon" [12:00, 18:00) and "Evening" [18:00, 24:00);
#'   `NA` where the timestamp cannot be parsed.
daytime <- function(data4 = bikedata) {
  # strptime() returns POSIXlt, whose `hour` field is the 0-23 hour of day.
  start_hour <- strptime(
    data4$Start.Date,
    format = "%d/%m/%Y %H:%M",
    tz = "UTC"
  )$hour

  # Left-closed intervals: 06:00 starts "Morning", 12:00 starts "Afternoon"
  # and 18:00 starts "Evening". With cut()'s default right-closed intervals
  # those three hours would fall into the preceding bucket.
  cut(
    x = start_hour,
    breaks = c(0, 6, 12, 18, 24),
    labels = c("Night", "Morning", "Afternoon", "Evening"),
    right = FALSE,
    include.lowest = TRUE
  )
}
#######################################################season
#' Assign a season to each trip from the calendar day of its start.
#'
#' Boundaries are compared on month and day only, so the result does not
#' depend on the year of the data: Spring starts on 20 March, Summer on
#' 21 June, Autumn on 22 September and Winter on 21 December.
#'
#' @param data2 Data frame with a `Start.Date` column holding timestamps
#'   formatted as "%d/%m/%Y %H:%M".
#' @return Character vector of "Winter", "Spring", "Summer" or "Autumn", one
#'   per row; `NA` where the timestamp cannot be parsed.
Season <- function(data2) {
  start_day <- as.Date(
    strptime(data2$Start.Date, format = "%d/%m/%Y %H:%M", tz = "UTC")
  )

  # Encode month and day as the integer MMDD (20 March -> 320) so that the
  # boundaries can be compared without reference to a particular year.
  month_day <- as.integer(format(start_day, "%m%d"))

  # Interval 0 (before 20 March) and interval 4 (from 21 December) are both
  # Winter, hence the repeated label.
  season_starts <- c(320L, 621L, 922L, 1221L)
  season_labels <- c("Winter", "Spring", "Summer", "Autumn", "Winter")
  season_labels[findInterval(month_day, season_starts) + 1L]
}
# Derive the season and time-of-day factors while Start.Date still holds the
# raw timestamp string; both helpers parse it themselves.
bikedata$season <- as.factor(Season(data2 = bikedata))
bikedata$Daytime <- daytime(data4 = bikedata)

# From here on only the calendar day of the trip is kept.
bikedata$Start.Date <- as.Date(
  strptime(bikedata$Start.Date, format = "%d/%m/%Y %H:%M", tz = "UTC")
)
bikedata$weekdays <- factor(bikedata$weekdays, ordered = FALSE)

# Number of trips per calendar day, attached to every trip of that day.
x <- plyr::count(bikedata$Start.Date)
colnames(x) <- c("Start.Date", "ntrips")
# The join key is spelled out so the join cannot silently pick up any other
# column the two tables happen to share; this also means plyr no longer prints
# its "Joining by: Start.Date" message.
bikedata <- plyr::join(x, bikedata, by = "Start.Date")

# Keep only the modelling columns and drop rows with any missing value.
bikedata <- bikedata %>%
  dplyr::select("ntrips", "weekdays", "season", "Daytime", "Duration")
bikedata <- na.omit(bikedata)
str(bikedata)
## 'data.frame': 8434530 obs. of 5 variables:
## $ ntrips : int 21955 21955 21955 21955 21955 21955 21955 21955 21955 21955 ...
## $ weekdays: Factor w/ 7 levels "Sun","Mon","Tues",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ season : Factor w/ 4 levels "Autumn","Spring",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Daytime : Factor w/ 4 levels "Night","Morning",..: 3 2 3 3 3 3 2 3 4 3 ...
## $ Duration: int 660 600 540 180 300 900 420 600 660 420 ...
Bar charts
# Number of trips: bars per weekday, stacked and coloured by time of day,
# with one panel per season.
# NOTE(review): ntrips is the per-day count repeated on every trip row, so
# stat = "identity" adds it up once per trip; confirm that this bar height is
# what the chart is meant to show.
ggplot(bikedata, aes(x = weekdays, y = ntrips)) +
  geom_bar(aes(fill = Daytime), stat = "identity") +
  facet_wrap(~season, nrow = 2, ncol = 2)

# Duration: the same layout, with bar height given by the stacked trip
# durations.
ggplot(bikedata, aes(x = weekdays, y = Duration, fill = Daytime)) +
  geom_bar(stat = "identity") +
  facet_wrap(~season, nrow = 2, ncol = 2)
There is a dramatic difference between Autumn/Winter and Spring/Summer, which may be triggered by the change in temperature.
Weekdays are more popular with cyclists than weekends, especially during the colder seasons; however, it is worth mentioning that during spring the change of pattern across weekdays has only a tiny impact, unlike in the other seasons. Most people tend to cycle from morning to afternoon (6am-6pm), which suggests that people may travel to and from work by bicycle. We can analyse the impact of a specific level within a factor by regression, to see whether our assumptions above make sense. The content of the bar chart with duration may raise a concern about a potential intercept in our model.
Splitting the data
# Randomly assign every row to partition 1, 2 or 3 with probabilities
# 0.6 / 0.2 / 0.2, then split into training, validation and test sets.
# NOTE(review): no seed is set in this chunk, so unless one is set earlier in
# the script the split differs on every run; confirm whether that is intended.
ind <- sample(3, nrow(bikedata), replace = TRUE, prob = c(0.6, 0.2, 0.2))
train <- bikedata[ind == 1, ]
valid <- bikedata[ind == 2, ]
test <- bikedata[ind == 3, ]
############################################model
str(train)
## 'data.frame': 5061466 obs. of 5 variables:
## $ ntrips : int 21955 21955 21955 21955 21955 21955 21955 21955 21955 21955 ...
## $ weekdays: Factor w/ 7 levels "Sun","Mon","Tues",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ season : Factor w/ 4 levels "Autumn","Spring",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Daytime : Factor w/ 4 levels "Night","Morning",..: 3 3 3 3 4 3 2 3 3 4 ...
## $ Duration: int 660 180 900 600 660 420 600 360 720 1140 ...