entrega 4

library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.0     v purrr   0.3.2
## v tibble  2.1.1     v dplyr   0.8.1
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ------------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(lubridate)

## Warning: package 'lubridate' was built under R version 3.6.1

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

# Load the raw bike-trip records from the local CSV, declaring the text as UTF-8.
bici19_cdn <- read.csv(
  file = "bici19.csv",
  encoding = "UTF-8"
)

# Print a per-column overview of the loaded data.
summary(bici19_cdn)

##  bici_id_usuario          bici_Fecha_hora_retiro bici_tiempo_uso 
##  Min.   :    33   2019-01-07 20:13:12:     4     Min.   :  0.00  
##  1st Qu.:177438   2019-01-11 14:58:09:     4     1st Qu.:  8.00  
##  Median :375196   2019-01-14 12:50:25:     4     Median : 14.00  
##  Mean   :359739   2019-01-16 15:54:47:     4     Mean   : 16.51  
##  3rd Qu.:547898   2019-01-17 14:16:34:     4     3rd Qu.: 21.00  
##  Max.   :693680   2019-01-25 08:21:43:     4     Max.   :118.00  
##                   (Other)            :204083     NA's   :131     
##           bici_nombre_estacion_origen bici_estacion_origen
##  Vera Peñaloza          :  3092       Min.   :  1.0       
##  Facultad de Medicina   :  3046       1st Qu.: 54.0       
##  Independencia          :  2902       Median : 98.0       
##  Lima                   :  2781       Mean   :100.5       
##  Godoy Cruz y Libertador:  2600       3rd Qu.:149.0       
##  Don Bosco              :  2526       Max.   :600.0       
##  (Other)                :187160                           
##        bici_nombre_estacion_destino bici_estacion_destino
##  Vera Peñaloza       :  3091        Min.   :  1.0        
##  Facultad de Medicina:  3040        1st Qu.: 55.0        
##  Independencia       :  2698        Median : 99.0        
##  Lima                :  2663        Mean   :101.2        
##  Saavedra            :  2600        3rd Qu.:149.0        
##  Don Bosco           :  2523        Max.   :600.0        
##  (Other)             :187492        NA's   :131          
##         bici_sexo        bici_edad    
##  FEMENINO    : 54268   Min.   :16.00  
##  MASCULINO   :149804   1st Qu.:25.00  
##  NO INFORMADO:    35   Median :30.00  
##                        Mean   :33.26  
##                        3rd Qu.:39.00  
##                        Max.   :87.00  
##

# Parse the pick-up timestamp column into date-times with lubridate.
bicisCABA <- bici19_cdn %>%
  mutate(bici_Fecha_hora_retiro = ymd_hms(bici_Fecha_hora_retiro))

# Fix the RNG state so the sample below is reproducible. The seed is given as
# a number instead of the string "99", which only worked through implicit
# coercion to integer.
set.seed(99)
muestra_de_fechas <- bicisCABA %>%
  sample_n(5) %>%
  pull(bici_Fecha_hora_retiro)

# Print five parsed timestamps as a sanity check of the conversion.
muestra_de_fechas

## [1] "2019-01-01 01:46:58 UTC" "2019-01-18 09:30:57 UTC"
## [3] "2019-01-13 15:45:12 UTC" "2019-01-31 17:06:52 UTC"
## [5] "2019-01-16 14:07:01 UTC"

# Penalise scientific notation so large numbers are printed in fixed notation.
options(scipen = 20)

# Bar chart: number of trips per hour of the day of the pick-up timestamp.
ggplot(
  data = bicisCABA,
  mapping = aes(x = hour(bici_Fecha_hora_retiro))
) +
  geom_bar()

La muestra registra que durante la mañana el pico de uso es a las 8 am, desciende al mediodía y comienza a subir hasta el nuevo pico a las 6 pm.

# Bar chart: number of trips per day of the month of the pick-up timestamp.
ggplot(
  data = bicisCABA,
  mapping = aes(x = day(bici_Fecha_hora_retiro))
) +
  geom_bar()

Los resultados no permiten comparar entre días pico de todo el año, ya que el dataset tiene registros de unos pocos meses en el 2019.

Analizando la distribución de datos:

# Table of the five origin stations with the most trips, busiest first.
# top_n() is left without an explicit weight, so it selects by the `n`
# column produced by count().
bicisCABA %>%
  count(bici_nombre_estacion_origen) %>%
  top_n(n = 5) %>%
  arrange(-n)

## Selecting by n

## # A tibble: 5 x 2
##   bici_nombre_estacion_origen     n
##   <fct>                       <int>
## 1 Vera Peñaloza                3092
## 2 Facultad de Medicina         3046
## 3 Independencia                2902
## 4 Lima                         2781
## 5 Godoy Cruz y Libertador      2600

# Names of the five origin stations with the most trips, extracted as a
# vector so they can be used in %in% filters.
estacionesfrecuentes <- pull(
  top_n(count(bicisCABA, bici_nombre_estacion_origen), 5),
  bici_nombre_estacion_origen
)

## Selecting by n

# Stacked bar chart of 2019 trips per month for the most frequent origin
# stations, one fill colour per station.
bicisCABA %>%
  filter(
    bici_nombre_estacion_origen %in% estacionesfrecuentes,
    year(bici_Fecha_hora_retiro) == 2019
  ) %>%
  ggplot(
    aes(
      x = month(bici_Fecha_hora_retiro, label = TRUE),
      fill = bici_nombre_estacion_origen
    )
  ) +
  geom_bar()

# Same monthly chart as a grouped bar chart: the bars of each station are
# placed side by side instead of stacked.
bicisCABA %>%
  filter(
    bici_nombre_estacion_origen %in% estacionesfrecuentes,
    year(bici_Fecha_hora_retiro) == 2019
  ) %>%
  ggplot(
    aes(
      x = month(bici_Fecha_hora_retiro, label = TRUE),
      fill = bici_nombre_estacion_origen
    )
  ) +
  geom_bar(position = "dodge")

# Trips per (origin station, month) for the most frequent origin stations
# in 2019.
conteo_ <- bicisCABA %>%
  filter(
    bici_nombre_estacion_origen %in% estacionesfrecuentes,
    year(bici_Fecha_hora_retiro) == 2019
  ) %>%
  count(
    bici_nombre_estacion_origen,
    mes = month(bici_Fecha_hora_retiro, label = TRUE)
  )

# One line per station showing its monthly trip count.
ggplot(
  conteo_,
  aes(
    x = mes,
    y = n,
    group = bici_nombre_estacion_origen,
    color = bici_nombre_estacion_origen
  )
) +
  geom_line()

# Trips per (origin station, day of the week) for the most frequent origin
# stations in 2019. This overwrites the previous contents of `conteo_`.
conteo_ <- bicisCABA %>%
  filter(
    bici_nombre_estacion_origen %in% estacionesfrecuentes,
    year(bici_Fecha_hora_retiro) == 2019
  ) %>%
  count(
    bici_nombre_estacion_origen,
    diasemana = wday(bici_Fecha_hora_retiro, label = TRUE)
  )

# One line per station showing its trip count by day of the week.
ggplot(
  conteo_,
  aes(
    x = diasemana,
    y = n,
    group = bici_nombre_estacion_origen,
    color = bici_nombre_estacion_origen
  )
) +
  geom_line()

# Express each weekday count as a percentage of its station's total, so
# stations with different volumes can be compared on the same scale.
# ungroup() is applied afterwards so `conteo_` does not stay grouped by
# station and silently change the behaviour of later verbs.
conteo_ <- conteo_ %>%
  group_by(bici_nombre_estacion_origen) %>%
  mutate(pct = n / sum(n) * 100) %>%
  ungroup()

# One line per station showing the share of its trips on each weekday.
ggplot(conteo_) +
  geom_line(
    aes(
      x = diasemana,
      y = pct,
      group = bici_nombre_estacion_origen,
      color = bici_nombre_estacion_origen
    )
  )

El domingo es el día en que menos se usan las bicis, seguido por el sábado, aunque hay algunas estaciones como Vera Peñaloza y Godoy Cruz y Libertador que en el domingo tienen mayor uso.

library(ggmap)

## Warning: package 'ggmap' was built under R version 3.6.1

## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.

## Please cite ggmap if you use it! See citation("ggmap") for details.

# Load the station table from the local CSV, declaring the text as UTF-8.
estaciones <- read.csv(
  file = "estaciones_bici_geo.csv",
  encoding = "UTF-8"
)

# Print a per-column overview of the station data.
summary(estaciones)

##        X                Y                        NOMBRE   
##  Min.   :-58.46   Min.   :-34.64   15 DE NOVIEMBRE  :  1  
##  1st Qu.:-58.42   1st Qu.:-34.62   25 DE MAYO       :  1  
##  Median :-58.40   Median :-34.60   9 DE JULIO       :  1  
##  Mean   :-58.40   Mean   :-34.61   ACEVEDO          :  1  
##  3rd Qu.:-58.38   3rd Qu.:-34.59   ACUÑA DE FIGUEROA:  1  
##  Max.   :-58.36   Max.   :-34.57   ADUANA           :  1  
##                                    (Other)          :193  
##                                                           DOMICILIO  
##  PLAZA FUERZA AEREA: AV. DR. J. RAMOS MEJIA Y AV DEL LIBERTADOR:  2  
##  15 DE NOVIEMBRE DE 1889 2687 ENTRE CATAMARCA Y JUJUY          :  1  
##  25 DE MAYO Y LAVALLE                                          :  1  
##  33 ORIENTALES 1439, ENTRE AV. PAVON Y AV. GARAY               :  1  
##  9 DE JULIO Y MORENO                                           :  1  
##  ACEVEDO Y PADILLA                                             :  1  
##  (Other)                                                       :192  
##                      IMAGEN          AUTOMAT   
##  ESTACION_AUTOMATICA.PNG:199   AUTOMATICA:199  
##                                                
##                                                
##                                                
##                                                
##                                                
##                                                
##                                           OBSERV       NRO_EST     
##  ABRIL 2015 (PASO DE SER MANUAL A AUTOMATICA): 26   Min.   :  1.0  
##  mar-17                                      : 17   1st Qu.: 51.5  
##  abr-17                                      : 16   Median :101.0  
##  may-17                                      : 16   Mean   :100.9  
##  ago-17                                      : 11   3rd Qu.:150.5  
##  jul-17                                      : 11   Max.   :200.0  
##  (Other)                                     :102                  
##                                              HORARIO   
##  ESTACION AUTOMATICA: DISPONIBILIDAD LAS 24 HORAS:199  
##                                                        
##                                                        
##                                                        
##                                                        
##                                                        
##                                                        
##                                  DIRE_NORM  
##  DEL LIBERTADOR AV Y RAMOS MEJIA DR AV:  2  
##  1 ZAPIOLA                            :  1  
##  101 BALCARCE                         :  1  
##  101 VIAMONTE                         :  1  
##  1016 PERU                            :  1  
##  1027 SAAVDREA                        :  1  
##  (Other)                              :192

# Rename the origin-station number column to NRO_EST, the name used for the
# station number in the `estaciones` table.
bicisCABA <- rename(bicisCABA, NRO_EST = bici_estacion_origen)

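De la tabla de estaciones solo se usan las columnas de coordenadas y el número de estación: `estaciones[c("X", "Y", "NRO_EST")]`.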

# Attach the station coordinates (X, Y) to every trip, matching on the
# station number. Trips without a matching station get NA coordinates.
bicisCABA2 <- left_join(
  bicisCABA,
  estaciones[c("X", "Y", "NRO_EST")],
  by = "NRO_EST"
)

# Bounding box around the joined coordinates; X is passed as longitude and
# Y as latitude.
bbox <- make_bbox(lon = bicisCABA2$X, lat = bicisCABA2$Y)

# Print the box limits.
bbox

##      left    bottom     right       top 
## -58.45966 -34.64202 -58.35849 -34.56455

# Download the Stamen base map for the bounding box at zoom level 12,
# rendered in black and white.
CABA_mapa <- get_stamenmap(bbox = bbox, zoom = 12, color = "bw")

## Source : http://tile.stamen.com/terrain/12/1382/2467.png

## Source : http://tile.stamen.com/terrain/12/1383/2467.png

## Source : http://tile.stamen.com/terrain/12/1384/2467.png

## Source : http://tile.stamen.com/terrain/12/1382/2468.png

## Source : http://tile.stamen.com/terrain/12/1383/2468.png

## Source : http://tile.stamen.com/terrain/12/1384/2468.png

# Plot one point per trip at the joined station coordinates over the base map.
ggmap(CABA_mapa) +
  geom_point(
    mapping = aes(x = X, y = Y),
    data = bicisCABA2
  )

## Warning: Removed 2 rows containing missing values (geom_point).

# Same point map with small, nearly transparent blue points, so locations
# with many overlapping trips appear darker.
ggmap(CABA_mapa) +
  geom_point(
    mapping = aes(x = X, y = Y),
    data = bicisCABA2,
    alpha = 0.1,
    size = 0.1,
    color = "blue"
  )

## Warning: Removed 2 rows containing missing values (geom_point).

#Mapa de Densidad

# Density map: count trips in rectangular bins over the base map, using the
# default number of bins.
ggmap(CABA_mapa) +
  geom_bin2d(
    mapping = aes(x = X, y = Y),
    data = bicisCABA2
  )

## Warning: Removed 2 rows containing non-finite values (stat_bin2d).

Se puede aumentar la resolución:

# Same binned density map with 100 bins for a finer grid, coloured with the
# continuous viridis fill scale.
ggmap(CABA_mapa) +
  geom_bin2d(
    mapping = aes(x = X, y = Y),
    data = bicisCABA2,
    bins = 100
  ) +
  scale_fill_viridis_c()

## Warning: Removed 2 rows containing non-finite values (stat_bin2d).

Otra opción para mapear densidad:

# Contour lines of a 2D density estimate of the trip coordinates; each
# contour is coloured by its computed density level.
ggmap(CABA_mapa) +
  geom_density2d(
    mapping = aes(x = X, y = Y, color = stat(level)),
    data = bicisCABA2
  ) +
  scale_color_viridis_c()

## Warning: Removed 2 rows containing non-finite values (stat_density2d).

En este caso, marca las zonas de influencia o las zonas cubiertas por las estaciones. La densidad se percibe en las áreas que casi no tienen sectores sin zona de influencia de estaciones.

Visualizar Múltiples Categorías:

# Names of the five destination stations with the most trips, extracted as
# a vector so they can be used in %in% filters.
estacionesfrecuentes2 <- pull(
  top_n(count(bicisCABA2, bici_nombre_estacion_destino), 5),
  bici_nombre_estacion_destino
)

## Selecting by n

# Print the selected station names.
estacionesfrecuentes2

## [1] Facultad de Medicina Independencia        Lima                
## [4] Saavedra             Vera Peñaloza       
## 171 Levels:  25 de Mayo 9 de Julio Acevedo Acuña de Figueroa ... Yatay

# Point map of the trips whose destination is one of the most frequent
# destination stations, coloured by destination station.
ggmap(CABA_mapa) +
  geom_point(
    mapping = aes(x = X, y = Y, color = bici_nombre_estacion_destino),
    data = filter(
      bicisCABA2,
      bici_nombre_estacion_destino %in% estacionesfrecuentes2
    ),
    alpha = 0.1,
    size = 0.1
  ) +
  # The plotted points are tiny and nearly transparent; draw the legend keys
  # larger and fully opaque so the colours can be read.
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) +
  scale_color_brewer(palette = "Set1")

Para mostrar cada categoría, en este caso cada estación, se hace un gráfico facetado:

# Faceted density map: one panel per origin station.
# NOTE(review): the filter matches origin stations against
# `estacionesfrecuentes2`, which is built from destination counts; confirm
# this is intended.
ggmap(CABA_mapa) +
  geom_density2d(
    mapping = aes(x = X, y = Y, color = stat(level)),
    data = filter(
      bicisCABA2,
      bici_nombre_estacion_origen %in% estacionesfrecuentes2
    )
  ) +
  scale_color_viridis_c() +
  facet_wrap(~bici_nombre_estacion_origen)

## Warning: Computation failed in `stat_density2d()`:
## bandwidths must be strictly positive

## Warning: Computation failed in `stat_density2d()`:
## bandwidths must be strictly positive

## Warning: Computation failed in `stat_density2d()`:
## bandwidths must be strictly positive

## Warning: Computation failed in `stat_density2d()`:
## bandwidths must be strictly positive

## Warning: Computation failed in `stat_density2d()`:
## bandwidths must be strictly positive

Se puede realizar el facetado, pero no aporta demasiado, ya que la información es poco densa: la variable son las estaciones, y las “categorías” en este caso son las distintas estaciones más frecuentes, y no los viajes, que darían un mapa más denso. En este dataset, los viajes o retiros de bicicletas no tienen coordenadas.

En el caso de tener una variable que contenga esa información, para comparar la densidad de los datos en el tiempo habría que realizar lo siguiente:

# Add the day of the week of each pick-up to the trip table.
# The weekday is derived from the trips in `bicisCABA2`: `estacionesfrecuentes2`
# is a vector of station names, not a data frame, so mutate() cannot be
# applied to it and it has no timestamp or coordinate columns.
bicis_por_dia <- bicisCABA2 %>%
  mutate(dia_semana = wday(bici_Fecha_hora_retiro, label = TRUE))

# Point map of the trips that start at the two selected stations, with one
# panel per day of the week. Station names use straight quotes; the
# typographic quotes in the earlier version are not valid R syntax.
ggmap(CABA_mapa) +
  geom_point(
    data = filter(
      bicis_por_dia,
      bici_nombre_estacion_origen %in% c("Lima", "Saavedra")
    ),
    aes(x = X, y = Y, color = bici_nombre_estacion_origen),
    alpha = .5,
    size = .2
  ) +
  facet_wrap(~dia_semana)