This lab will walk you through visualizing Divvy data and help you use RMarkdown to develop a report you can deploy to the web http://rmarkdown.rstudio.com.
Let’s first do a map in R of some of the Divvy data.
These are the packages you will need. You can install them via RStudio and the CRAN repository. Make sure you download the data file from D2L and save it in your working directory.
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
library(scales)
library(rCharts)
library(lubridate)
library(RColorBrewer)
library(ggplot2)
First call in the data for July, 2014.
data <- read.csv("Divvy_Trips_2014-Q3-07.csv") ##If you are using the Rdata file from d2L, then call:
### data <- july14
head(data)
## trip_id starttime stoptime bikeid tripduration
## 1 2886259 7/31/2014 23:56 8/1/2014 0:03 2602 386
## 2 2886258 7/31/2014 23:58 8/1/2014 0:07 2403 495
## 3 2886257 7/31/2014 23:58 8/1/2014 2:10 669 7947
## 4 2886256 7/31/2014 23:58 8/1/2014 0:19 2431 1282
## 5 2886255 7/31/2014 23:57 8/1/2014 2:10 2885 7972
## 6 2886254 7/31/2014 23:57 8/1/2014 2:28 2307 9067
## from_station_id from_station_name to_station_id
## 1 291 Wells St & Evergreen Ave 53
## 2 98 LaSalle St & Washington St 106
## 3 240 Sheridan Rd & Irving Park Rd 240
## 4 47 State St & Kinzie St 14
## 5 240 Sheridan Rd & Irving Park Rd 240
## 6 28 Larrabee St & Menomonee St 28
## to_station_name usertype gender birthyear
## 1 Wells St & Erie St Subscriber 2 1979
## 2 State St & Pearson St Subscriber 1 1974
## 3 Sheridan Rd & Irving Park Rd Customer 0 NA
## 4 Morgan St & 18th St Customer 0 NA
## 5 Sheridan Rd & Irving Park Rd Customer 0 NA
## 6 Larrabee St & Menomonee St Customer 0 NA
Now call in the file with the stations and subset data of all the trips leaving Sheffield & Fullerton
stations <- read.csv("Divvy_Stations_2014-Q3Q4.csv") ## D2L Rdata file users, try stations <- stations
fullerton <- subset(data, from_station_name == "Sheffield Ave & Fullerton Ave")
fullerton.dest <- data.frame(table(fullerton$to_station_name)) # Makes a frequency table for all of the departures from Sheffield & Fullerton
m1 <- merge(fullerton.dest, stations, by.x = "Var1", by.y = "name")
## Create a data frame of the frequency of rides departing Fullerton to various stations
fulldepart <- subset(m1, Freq > 0)
# Now map it
map <- Leaflet$new()
map$setView(c(41.925271, -87.653732), zoom = 12) #centers on Fullerton and Sheffield
map$tileLayer(provider = 'Stamen.TonerLite')
for (i in 1:nrow(fulldepart)) {
map$marker(c(fulldepart[i, "latitude"], fulldepart[i, "longitude"]), bindPopup = as.factor(fulldepart[i, "Freq"]))
}
You should get something like this, which is interactive:
Here are the summary statistics for frequency of travel from Fullerton:
summary(fulldepart$Freq)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 4.50 10.79 13.00 139.00
Let’s do the heatmap plot:
df <- data.frame(table(data$from_station_name))
data_df <- tbl_df(data)
fullerton <- filter(data_df, from_station_name == "Sheffield Ave & Fullerton Ave") #All Trips from Shef
fullerton.arrive <- filter(data_df, to_station_name == "Sheffield Ave & Fullerton Ave") #All Trips arriving at this staion
# Takes time/date variable & formats
fullerton$date <- as.POSIXct(fullerton$starttime, format = "%m/%d/%Y %H:%M")
# Make sa day of week
fullerton$dow <- wday(fullerton$date)
fullerton$cdow <- wday(fullerton$date, label = T)
fullerton$tod <- as.numeric(fullerton$date - as.POSIXct(strftime(fullerton$date, format = "%Y-%m-%d")))/60 # you may have to adjust to 3600 depending on the format
## Create bins for the heat map
fullerton$bins <- cut(fullerton$tod, breaks = 1:24, labels = F)
# Create a new object based on the counts
counts <- aggregate(trip_id ~ bins + dow, fullerton, length)
colnames(counts)[ncol(counts)] <- "Events"
ggplot(counts, aes(x=bins,y=8-dow))+ geom_tile(aes(fill=Events)) +
scale_fill_gradientn(colours=brewer.pal(9,"YlOrRd"),
breaks=seq(0,max(counts$Events),by=22), name = "Number of Trips") +
scale_y_continuous(breaks=7:1,labels=c("Sun","Mon","Tues","Wed","Thurs","Fri","Sat"))+
labs(x="Time of Day", y="Day of Week")+
coord_fixed()+
ggtitle(label="Trips Leaving the Sheffield-Fullerton Divvy Station, July 2014")+
theme(plot.title = element_text(colour="blue"))
Lets do a plot that looks at gender and duration.
### Gets the number of trips by destination
fullerton.dest <- data.frame(table(fullerton$to_station_name))
### Gets the number of trips by gender of arrival
### Need to separate subscriber vs. daily user
fullerton.arrive.gender <- filter(fullerton.arrive, usertype == "Subscriber")
### This next line adds a new column that re-formats the starttime variable to a date/time class
fullerton.arrive.ts <- data.frame(fullerton.arrive.gender, as.POSIXct(strptime(fullerton.arrive.gender$starttime, "%m/%d/%Y %H:%M")))
#Changes the column name of the new date classed variable
names(fullerton.arrive.ts)[13] <- "starttimeformat" #Changes the column name of t
### Next line creates new column with the tripduration formatted to be numeric for plotting
fullerton.arrive.ts.factor <- data.frame(fullerton.arrive.ts, as.numeric(gsub(",", "", as.character(fullerton.arrive.ts$tripduration))))
#Changes the column name to the new tripduration formatted variable
names(fullerton.arrive.ts.factor)[14] <- "tripdurationnumber"
fullerton.arrive.filter <- filter(fullerton.arrive.ts.factor, tripdurationnumber >= 600)
fullerton.arrive.filter <- filter(fullerton.arrive.filter, tripdurationnumber<=10000)
fullerton.arrive.filter$gender <- factor(fullerton.arrive.filter$gender)
# Create a plot of date/time vs. duration
fullerton.arrival.plot <-ggplot(fullerton.arrive.filter, aes(starttimeformat, tripdurationnumber))
#prints a plot with each trip by gender
fullerton.arrival.plot + geom_point(aes(color = factor(gender, labels = c("Female", "Male")))) +
labs(color = "Gender")
fullerton.arrival.plot + geom_point(aes(color = factor(gender, labels = c("Female", "Male")))) +
labs(color = "Gender") +
xlab("Date") +
ylab("trip duration in seconds") +
ggtitle("divvy trips by members arriving to the Sheffield and Fullerton station")
When developing your markdown document, save it early into the same directory. The markdown document
data_aug_sep <- read.csv("Divvy_Trips_2014-Q3-0809.csv")
Combine them with rbind
total.q3 <- rbind(data, data_aug_sep)
str(total.q3)
## 'data.frame': 1110970 obs. of 12 variables:
## $ trip_id : int 2886259 2886258 2886257 2886256 2886255 2886254 2886253 2886252 2886251 2886250 ...
## $ starttime : Factor w/ 112525 levels "7/1/2014 0:16",..: 30589 30591 30591 30591 30590 30590 30590 30589 30589 30589 ...
## $ stoptime : Factor w/ 112179 levels "7/1/2014 0:29",..: 38138 38142 38166 38152 38166 38167 38159 38150 38145 38168 ...
## $ bikeid : int 2602 2403 669 2431 2885 2307 192 1543 2801 3030 ...
## $ tripduration : int 386 495 7947 1282 7972 9067 6006 1161 824 9142 ...
## $ from_station_id : int 291 98 240 47 240 28 255 288 29 28 ...
## $ from_station_name: Factor w/ 300 levels "900 W Harrison",..: 281 156 242 262 242 151 134 149 203 151 ...
## $ to_station_id : int 53 106 240 14 240 28 181 17 47 28 ...
## $ to_station_name : Factor w/ 300 levels "900 W Harrison",..: 280 263 242 198 242 151 154 295 262 151 ...
## $ usertype : Factor w/ 3 levels "Customer","Dependent",..: 3 3 1 1 1 1 1 1 1 1 ...
## $ gender : int 2 1 0 0 0 0 0 0 0 0 ...
## $ birthyear : int 1979 1974 NA NA NA NA NA NA NA NA ...
I would like you all to try and prepare your final projects as an RMarkdown document so we can deploy them online. This is particularly valuable if you have anything interactive. Rstudio operates a website called rpubs where you can upload an html-ready document directly from r.
The syntax is quite simple and is explained here