Installing and loading packages
pkgs <- c("tidyverse", "sf","tmap","spatstat","maptools",
"sqldf","spatstat.geom", "leaflet.providers",
"markdown", "polyclip","ggmap")
list.packages <- pkgs[!(pkgs %in%
installed.packages()[,"Package"])]
if(length(list.packages)) install.packages(list.packages)
invisible(lapply((pkgs), library, character.only = T))
Setting environment and reading Citi Bike Data
setwd(choose.dir())
rdf <-read.csv("201306-citibike-tripdata.csv")
Q1. There are \(577,703\) rows and \(15\) columns.
str(rdf)
## 'data.frame': 577703 obs. of 15 variables:
## $ tripduration : int 695 693 2059 123 1521 2028 2057 369 1829 829 ...
## $ starttime : chr "2013-06-01 00:00:01" "2013-06-01 00:00:08" "2013-06-01 00:00:44" "2013-06-01 00:01:04" ...
## $ stoptime : chr "2013-06-01 00:11:36" "2013-06-01 00:11:41" "2013-06-01 00:35:03" "2013-06-01 00:03:07" ...
## $ start.station.id : int 444 444 406 475 2008 485 285 509 265 404 ...
## $ start.station.name : chr "Broadway & W 24 St" "Broadway & W 24 St" "Hicks St & Montague St" "E 15 St & Irving Pl" ...
## $ start.station.latitude : num 40.7 40.7 40.7 40.7 40.7 ...
## $ start.station.longitude: num -74 -74 -74 -74 -74 ...
## $ end.station.id : chr "434" "434" "406" "262" ...
## $ end.station.name : chr "9 Ave & W 18 St" "9 Ave & W 18 St" "Hicks St & Montague St" "Washington Park" ...
## $ end.station.latitude : chr "40.74317449" "40.74317449" "40.69512845" "40.6917823" ...
## $ end.station.longitude : chr "-74.00366443" "-74.00366443" "-73.99595065" "-73.9737299" ...
## $ bikeid : int 19678 16649 19599 16352 15567 18445 15693 16100 15234 16400 ...
## $ usertype : chr "Subscriber" "Subscriber" "Customer" "Subscriber" ...
## $ birth.year : chr "1983" "1984" "NULL" "1960" ...
## $ gender : int 1 1 0 1 1 0 1 1 1 1 ...
Q2.The average age is \(38\).
rdf$birth_year<-sapply(rdf$birth.year, as.numeric) #for some reason this code takes longer to execute in R_markdown but work faster in .R. So I have set eval = False.
mean(2013 - rdf$birth_year, na.rm = TRUE)
Q3. POSIXct stores date and time in seconds with the number of seconds beginning at 1 January 1970. Thus, the POSIXct format stores each date and time a single value in units of seconds. as.POSIXct method converts a date-time string into a POSIXct class.
Q4. The pipe (%>%) operator will forward a value or the result of an expression into the next function call/expression. Pipe(%>%) operator is useful when we want to execute multiple functions. This helps readability and productivity as it’s easier to follow the flow of multiple functions through the pipe(%>%) operator than going backwards when multiple function are nested. Also, using pipe(%>%) makes for more efficient and legible code.
Q5. The two codes are similar because both help us to select records from objects (e.g. dataframe) that meet the parameters we specify. In this case the parameter used for the selection is hour = 07, 08, 09. Sqldf() is the SQL.
Q6. CRS means Coordinate Reference System. 4326 is World Geographic System 1984, has ellipsoid: WGS 84
Q7. There were \(3\) map packages. They are ggmap, tmap and base R map.
Q8. The process for deriving surface from points is called interpolation.
Q9.The numbers in the density map is small because it shows the probability density function and since probability is \(0≤P≤1\), the values will fall within this range.
Q10. Creating interactive map and filling the table
rdfClean <- rdf %>%
mutate(date = as.Date(starttime),
time = format(as.POSIXct(starttime), format = "%H:%M:%S")) %>% ## SPECIAL FORMATTING TO GET H,M,S.
mutate(hour = substring(.$time, first = 1, last = 2)) %>% ## TRUNCATES THE STRING
mutate(gender = as.character(gender)) %>% ## CHANGES NUMBERS into STRINGS
dplyr::select(date, time, hour, everything()) %>% ## REORDERS THE COLUMNS
filter("birth_year" > 1930)
#Plots
tripsByHour <- rdfClean %>%
group_by(`hour`, `gender`) %>%
summarise(trips=n()) %>%
mutate(hour = as.numeric(hour))
## `summarise()` has grouped output by 'hour'. You can override using the `.groups` argument.
plot(tripsByHour$hour, tripsByHour$trips, xlab = "Hour of the Day", ylab = "Number of Trips")
tripsByHour$Gender<-factor(tripsByHour$gender, levels = c(0,1,2), labels=c('NA',"Male",'Female'))
ggplot(tripsByHour, aes(x=hour, y=trips, colour=Gender)) + geom_line()
Selecting morning station trips
morning <- rdfClean %>% filter(hour %in% c('07', '08', '09'))
morningStations <- morning %>%
group_by(`start.station.id`) %>%
summarise(lat=as.numeric(`start.station.latitude`[1]),
lon=as.numeric(`start.station.longitude`[1]),
name=`start.station.name`[1],
trips=n())
Interactive Map for Morning Trips
# Turns it into a gis file
morningStationsGeo <- st_as_sf(morningStations, coords = c("lon", "lat"), dim = "XY", crs = 4326)
# Save if you feel like it! # st_write(popularStations, "popularStations.shp", delete_layer = TRUE)
# Look at the map
tmap_mode('view')
## tmap mode set to interactive viewing
tm_shape(morningStationsGeo %>%
filter(!is.na(trips)) %>%
mutate(trips = as.numeric(trips))) + tm_dots(col = "trips", style="quantile", id = "name")
Kernel Density
boroughs <- st_read("borough.shp")
## Reading layer `borough' from data source
## `C:\Users\cod\Desktop\PhD Files\Course Works\2nd Year Falls Semester\Urban Analytics\Week 3\Assignment__Citibikes_export\ForClass\borough.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 5 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -74.25559 ymin: 40.49612 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS: WGS84(DD)
boroughs_prj <- st_transform(boroughs, crs = 2263)
df.size.proj <- st_transform(morningStationsGeo, crs = 2263)
p.sp <- as(df.size.proj, "Spatial")
p.ppp <- as(p.sp, "ppp")
kernelDensityMorningEvening <- (density(p.ppp, sigma = 1000))
plot(kernelDensityMorningEvening,
main = "Map of Morning Citibike Origins")
plot(boroughs_prj$geometry,add = T)
Heatmap in ggplot
library(ggplot2)
library(ggmap)
df.size.degree <- st_transform(process.dataGeo, crs = 4326) %>%
mutate(lat = st_coordinates(.)[,2],
lon = st_coordinates(.)[,1])
##Get borders
height <- max(st_coordinates(df.size.degree)[,2]) - min(st_coordinates(df.size.degree)[,2])
width <- max(st_coordinates(df.size.degree)[,1]) - min(st_coordinates(df.size.degree)[,1])
sac_borders <- c(bottom = min(st_coordinates(df.size.degree)[,2]) - 0.1*height,
top = max(st_coordinates(df.size.degree)[,2]) + 0.1*height,
left = min(st_coordinates(df.size.degree)[,1]) - 0.1*width,
right = max(st_coordinates(df.size.degree)[,1]) + 0.1*width)
map <- get_stamenmap(sac_borders, zoom = 12, maptype = "toner-lite")
ggmap(map) +
geom_density2d(data = df.size.degree, aes(x = lon, y = lat), size = 0.3) +
stat_density2d(data = df.size.degree, aes(x = lon, y = lat, fill = ..level.., alpha = ..level..),
geom = "polygon", bins = 10) +
scale_fill_gradient(low = "green", high = "red") +
scale_alpha(range = c(0, 0.4), guide = FALSE)+
labs(x = 'Longitude', y = 'Latitude', color = 'Router')+
labs(title="Heat Map, Morning Trips")
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
Selecting evening station trips
evening <- rdfClean %>% filter(hour %in% c("17","18","19"))
eveningStations <- evening %>%
group_by(`start.station.id`)%>%
summarise(lat=as.numeric(`start.station.latitude`[1]),
lon=as.numeric(`start.station.longitude`[1]),
name=`start.station.name`[1],
e_trips=n())
Merging morning station trips and evening station trips
process.data <- merge(morningStations,eveningStations, by.x = "start.station.id", by.y = "start.station.id")%>%
mutate(Trip_Diff = trips - e_trips)
Creating spatial data using longitude and latitude.
process.dataGeo <- st_as_sf(process.data, coords = c("lon.x", "lat.x"), dim = "XY", crs = 4326)
Creating interactive map
tmap_mode('view')
## tmap mode set to interactive viewing
tm_shape(process.dataGeo %>%
filter(!is.na(Trip_Diff)) %>%
mutate(Trip_Diff = as.numeric(Trip_Diff))) + tm_dots(col = "Trip_Diff", style="quantile", id = "name")
## Variable(s) "Trip_Diff" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
morning.Trip <- process.data[process.data$name.x %in% c("8 Ave & W 31 St N",
"Pershing Square North","W 33 St & 7 Ave"),]
morning.data <- data.frame(Morn.station = morning.Trip$name.x,Morn.Trips = morning.Trip$trips,
Even.Trips = morning.Trip$e_trips, Diff = morning.Trip$Trip_Diff)%>%
arrange(desc(Morn.Trips));morning.data
## Morn.station Morn.Trips Even.Trips Diff
## 1 8 Ave & W 31 St N 1088 1008 80
## 2 Pershing Square North 873 1221 -348
## 3 W 33 St & 7 Ave 769 711 58
Evening.Trip <- process.data[process.data$name.x %in% c("E 17 St & Broadway",
"W 20 St & 11 Ave","Broadway & W 24 St"),]
Evening.data <- data.frame(Evening.station = Evening.Trip$name.x,Morn.Trips = Evening.Trip$trips,
Even.Trips = Evening.Trip$e_trips, Diff = Evening.Trip$Trip_Diff)%>%
arrange(desc(Even.Trips));Evening.data
## Evening.station Morn.Trips Even.Trips Diff
## 1 E 17 St & Broadway 394 1675 -1281
## 2 W 20 St & 11 Ave 621 1577 -956
## 3 Broadway & W 24 St 357 1439 -1082
a<-cbind(morning.data,Evening.data);a
## Morn.station Morn.Trips Even.Trips Diff Evening.station
## 1 8 Ave & W 31 St N 1088 1008 80 E 17 St & Broadway
## 2 Pershing Square North 873 1221 -348 W 20 St & 11 Ave
## 3 W 33 St & 7 Ave 769 711 58 Broadway & W 24 St
## Morn.Trips Even.Trips Diff
## 1 394 1675 -1281
## 2 621 1577 -956
## 3 357 1439 -1082
Completed table.