library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.1
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.1
## Warning: package 'tidyr' was built under R version 3.6.1
## Warning: package 'readr' was built under R version 3.6.1
## Warning: package 'purrr' was built under R version 3.6.1
## Warning: package 'dplyr' was built under R version 3.6.1
## Warning: package 'stringr' was built under R version 3.6.1
## Warning: package 'forcats' was built under R version 3.6.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(sf)
## Warning: package 'sf' was built under R version 3.6.1
## Linking to GEOS 3.6.1, GDAL 2.2.3, PROJ 4.9.3
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.6.1
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(osrm)
## Warning: package 'osrm' was built under R version 3.6.1
## Data: (c) OpenStreetMap contributors, ODbL 1.0 - http://www.openstreetmap.org/copyright
## Routing: OSRM - http://project-osrm.org/
Vamos a trabajar con los datos abiertos de bicicletas de la Ciudad de Nueva York.
NYcitibike <- read.csv("201907-citibike-tripdata.csv", encoding = "UTF-8")
Transformamos las variables principales con las que vamos a trabajar, de tipo factor a numéricas.
NYcitibike$start.station.id <- as.numeric(NYcitibike$start.station.id)
NYcitibike$end.station.id <- as.numeric(NYcitibike$end.station.id)
str(NYcitibike)
## 'data.frame': 2181064 obs. of 15 variables:
## $ tripduration : int 897 267 2201 1660 109 106 550 338 469 562 ...
## $ starttime : Factor w/ 2179730 levels "2019-07-01 00:00:00.1320",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ stoptime : Factor w/ 2179656 levels "2019-07-01 00:01:59.6060",..: 50 3 269 181 2 1 18 7 13 21 ...
## $ start.station.id : num 746 181 206 49 479 523 433 732 761 1 ...
## $ start.station.name : Factor w/ 788 levels "1 Ave & E 110 St",..: 730 109 370 459 692 72 577 17 21 703 ...
## $ start.station.latitude : num 40.8 40.8 40.7 40.7 40.8 ...
## $ start.station.longitude: num -74 -74 -74 -74 -74 ...
## $ end.station.id : num 724 225 417 596 476 533 491 675 727 731 ...
## $ end.station.name : Factor w/ 801 levels "1 Ave & E 110 St",..: 363 763 479 777 704 274 422 726 16 722 ...
## $ end.station.latitude : num 40.8 40.8 40.7 40.7 40.8 ...
## $ end.station.longitude : num -74 -74 -74 -74 -74 ...
## $ bikeid : int 18340 21458 39874 38865 30256 16875 34139 39703 28266 26757 ...
## $ usertype : Factor w/ 2 levels "Customer","Subscriber": 2 1 2 2 2 2 2 2 2 2 ...
## $ birth.year : int 1966 1996 1986 1988 1997 1988 1992 1995 1989 1965 ...
## $ gender : int 1 1 1 1 1 1 1 1 1 1 ...
Visualizamos en un mapa las estaciones de bicicletas.
NYbbox <- make_bbox(NYcitibike$start.station.longitude, NYcitibike$start.station.latitude)
NYCmapa <- get_stamenmap(bbox = NYbbox,
maptype = "toner-lite",
zoom = 12)
## Source : http://tile.stamen.com/toner-lite/12/1205/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1541.png
ggmap(NYCmapa) +
geom_point(data = NYcitibike, aes(x = start.station.longitude, y = start.station.latitude), color = "skyblue3") +
labs(title = "Estaciones de bicicletas | NYCbike",
subtitle = "Julio 2019",
caption = "Fuente: citibikenyc.com",
x = "Longitud",
y = "Latitud") +
theme_minimal()
Mediante un mapa de calor buscaremos visualizar la cantidad de viajes origen-destino para conocer la concentración de los mismos. Posteriormente sacaremos los 10 viajes más realizados.
sumaNYbike <- NYcitibike %>%
group_by(start.station.id, end.station.id) %>%
summarise(sumatoria=n())
ggplot() +
geom_tile(data = sumaNYbike,
aes(x = as.factor(start.station.id), y = as.factor(end.station.id),
fill = sumatoria)) +
labs(title = "Viajes origen-destino | NYCbike",
subtitle = "Julio 2019",
caption = "Fuente: citibikenyc.com",
x = "Estación origen",
y = "Estación destino") +
scale_color_distiller(palette = "Spectral") +
theme_minimal() +
viridis::scale_fill_viridis(trans = 'log10')
Se observa concentración de los viajes en los distintos extremos de la matriz origen-destino, visualizando asimismo una diagonal que cruza desde el margen inferior izquierdo hacia el margen superior derecho.
Sacamos un top 10:
NY_top10 <- sumaNYbike %>%
ungroup() %>%
filter(start.station.id != end.station.id) %>%
rename(START.ID = start.station.id,
END.ID = end.station.id)%>%
top_n(10)
## Selecting by sumatoria
NY_top10
## # A tibble: 10 x 3
## START.ID END.ID sumatoria
## <dbl> <dbl> <int>
## 1 23 253 698
## 2 215 241 677
## 3 234 215 648
## 4 234 241 902
## 5 234 422 778
## 6 236 705 621
## 7 383 393 753
## 8 412 241 768
## 9 695 247 820
## 10 761 705 691
Y graficamos nuevamente:
ggplot() +
geom_tile(data = NY_top10,
aes(x = as.factor(START.ID),
y = as.factor(END.ID),
fill = sumatoria)) +
labs(title = "Top 10 recorridos realizados | NYCbike",
subtitle = "Julio 2019",
caption = "Fuente: citibikenyc.com",
x = "Estación origen",
y = "Estación destino") +
scale_color_distiller(palette = "Spectral") +
theme_minimal() +
viridis::scale_fill_viridis(trans = 'log10')
Agregamos las coordenadas a los recorridos para volvarlos en el mapa de la ciudad.
NY_top10 <- NY_top10 %>%
ungroup() %>%
left_join(unique (NYcitibike [c("start.station.name", "start.station.longitude", "start.station.latitude", "start.station.id")]),
by = c("START.ID" = "start.station.id"))
NY_top10 <- NY_top10 %>%
ungroup() %>%
left_join(unique (NYcitibike [c("end.station.name", "end.station.longitude", "end.station.latitude", "end.station.id")]),
by = c("END.ID" = "end.station.id"))
Confeccionamos mapa con las coordenadas obtenidas.
NYbboxTop10 <- make_bbox(NY_top10$start.station.longitude, NY_top10$start.station.latitude)
NYbboxTop10
## left bottom right top
## -74.02780 40.65582 -73.97389 40.77115
NYmapaTop10 <- get_stamenmap(NYbboxTop10, maptype = "toner-lite", zoom = 12)
ggmap(NYmapaTop10) +
geom_point(data = NY_top10, aes(x = start.station.longitude,
y = start.station.latitude),
color = "skyblue3", size = 2) +
geom_point(data = NY_top10, aes(x = end.station.longitude,
y = end.station.latitude),
color = "skyblue3", size = 2) +
labs(title = "Top 10 recorridos realizados | NYCbike",
subtitle = "Julio - 2019",
caption = "Fuente: citibikenyc.com",
x = "Longitud",
y = "Latitud") +
theme_minimal()
## Warning: Removed 1 rows containing missing values (geom_point).
Obtenemos los recorridos y trazamos las rutas…
obtener_recorrido <- function(start.station.name, start.station.longitude, start.station.latitude, end.station.name, end.station.longitude, end.station.latitude) {
ruta <- osrmRoute(src = c(start.station.name, start.station.longitude, start.station.latitude),
dst = c(end.station.name, end.station.longitude, end.station.latitude))
cbind(start.station.name = start.station.name, end.station.name = end.station.name, ruta)}
argumentosNY <- list(NY_top10$start.station.name, NY_top10$start.station.longitude, NY_top10$start.station.latitude,
NY_top10$end.station.name, NY_top10$end.station.longitude, NY_top10$end.station.latitude)
recorridosNY <- pmap_df(argumentosNY, obtener_recorrido)
recorridosNY <- recorridosNY %>%
mutate(ID = paste(start.station.name, "-", end.station.name))
recorridosNY <-left_join(recorridosNY, NY_top10)
## Joining, by = c("start.station.name", "end.station.name")
…y lo graficamos en el mapa
ggmap(NYCmapa) +
geom_path(data = recorridosNY, aes(x = lon, y = lat, color=sumatoria), size = 1.5) +
labs(title = "Top 10 Recorridos | NYCbike ",
subtitle = "Julio - 2019",
caption = "Fuente: citibikenyc.com",
x = "Longitud",
y = "Latitud") +
theme_minimal()+
scale_color_viridis_c()
Con el gráfico podemos visualizar que la mayor cantidad de viajes se presentan en la zona sur, principalmente en Manhattan y con conexiones hacia ella, lo que podría deberse a la concentración de los atractivos turísticos de la ciudad en la misma.