ACTIVIDAD 1

library(tidyverse)
library(sf)
library(openxlsx)

2.a. Mapa coroplético que muestre la distribución geográfica de una variable numérica.

A partir del portal de datos de la Ciudad de Buenos Aires, analizamos si la ubicación de las estaciones de bicicletas públicas guarda alguna relación con características demográficas y habitacionales de la Ciudad.

Primero cargamos y ordenamos las bases de manera que queden listas para ser usadas como insumo para los mapas:

# Read the comuna boundaries with sf and rename the identifier column to
# `Comuna`, the key name used when joining the other tables.
base_CABA <- rename(
  read_sf("ACTIVIDADES/CABA/comunas.csv"),
  Comuna = comunas
)

head(base_CABA)

## Simple feature collection with 6 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -58.53152 ymin: -34.67451 xmax: -58.3707 ymax: -34.56827
## CRS:           NA
## # A tibble: 6 x 8
##   wkt    barrios  perimetro  area  Comuna id    objeto                  geometry
##   <chr>  <chr>    <chr>      <chr> <chr>  <chr> <chr>             <MULTIPOLYGON>
## 1 MULTI~ RECOLETA 21452.838~ 6317~ 2      1     LIMIT~ (((-58.38 -34.57002, -58~
## 2 MULTI~ ALMAGRO~ 12323.432~ 6660~ 5      2     LIMIT~ (((-58.41287 -34.61412, ~
## 3 MULTI~ CABALLI~ 10990.964~ 6851~ 6      3     LIMIT~ (((-58.43061 -34.60705, ~
## 4 MULTI~ FLORES ~ 17972.257~ 1242~ 7      4     LIMIT~ (((-58.452 -34.62975, -5~
## 5 MULTI~ LINIERS~ 21411.738~ 1650~ 9      5     LIMIT~ (((-58.51925 -34.63301, ~
## 6 MULTI~ FLOREST~ 18332.037~ 1265~ 10     6     LIMIT~ (((-58.48834 -34.62016, ~

# Share of households with unmet basic needs (NBI), one row per comuna.
NBI <- read_csv("ACTIVIDADES/CABA/NBI-por-comuna.csv")

head(NBI)

## # A tibble: 6 x 2
##   Comuna `Hogares con NBI`
##   <chr>              <dbl>
## 1 1                   15.9
## 2 2                    2  
## 3 3                   11.9
## 4 4                   12.7
## 5 5                    6.1
## 6 6                    2.2

# Population table from the 2019 workbook. Note that the first row printed
# below is a city-wide "Total" row, not a comuna.
poblac <- read.xlsx("ACTIVIDADES/CABA/Poblacion CABA 2019.xlsx")

head(poblac)

##   Comuna   Total   Varón   Mujer Superficie.(km2)
## 1  Total 3072029 1441350 1630679        204.02176
## 2      1  255457  126828  128629         17.76490
## 3      2  149510   66744   82766          6.30245
## 4      3  193115   91151  101964          6.38500
## 5      4  239712  114629  125083         21.66900
## 6      5  187348   86159  101189          6.65905
##   Densidad.poblacional.(hab/km2)
## 1                       15057.36
## 2                       14379.87
## 3                       23722.52
## 4                       30245.11
## 5                       11062.44
## 6                       28134.34

# Attach the NBI and population tables to the comuna polygons (both keyed by
# `Comuna`) and give the density column a short, syntactic name.
base_CABA <- base_CABA %>%
  left_join(NBI, by = "Comuna") %>%
  left_join(poblac, by = "Comuna") %>%
  rename(den_pob = `Densidad.poblacional.(hab/km2)`)

head(base_CABA)

## Simple feature collection with 6 features and 13 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -58.53152 ymin: -34.67451 xmax: -58.3707 ymax: -34.56827
## CRS:           NA
## # A tibble: 6 x 14
##   wkt   barrios perimetro area  Comuna id    objeto                  geometry
##   <chr> <chr>   <chr>     <chr> <chr>  <chr> <chr>             <MULTIPOLYGON>
## 1 MULT~ RECOLE~ 21452.83~ 6317~ 2      1     LIMIT~ (((-58.38 -34.57002, -58~
## 2 MULT~ ALMAGR~ 12323.43~ 6660~ 5      2     LIMIT~ (((-58.41287 -34.61412, ~
## 3 MULT~ CABALL~ 10990.96~ 6851~ 6      3     LIMIT~ (((-58.43061 -34.60705, ~
## 4 MULT~ FLORES~ 17972.25~ 1242~ 7      4     LIMIT~ (((-58.452 -34.62975, -5~
## 5 MULT~ LINIER~ 21411.73~ 1650~ 9      5     LIMIT~ (((-58.51925 -34.63301, ~
## 6 MULT~ FLORES~ 18332.03~ 1265~ 10     6     LIMIT~ (((-58.48834 -34.62016, ~
## # ... with 6 more variables: `Hogares con NBI` <dbl>, Total <dbl>, Varón <dbl>,
## #   Mujer <dbl>, `Superficie.(km2)` <dbl>, den_pob <dbl>

# Public bike-share stations, read with sf as point features.
estaciones_bicicletas <- st_read("ACTIVIDADES/CABA/nuevas-estaciones-bicicletas-publicas.csv")

## Reading layer `nuevas-estaciones-bicicletas-publicas' from data source `D:\Curso ciencia de datos para politicas publicas\segundo modulo\ACTIVIDADES\CABA\nuevas-estaciones-bicicletas-publicas.csv' using driver `CSV'
## Simple feature collection with 229 features and 8 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -58.51142 ymin: -34.66051 xmax: -58.35574 ymax: -34.5445
## CRS:           NA

Y a continuación realizamos el análisis gráfico.

Analizando por la densidad poblacional de cada una de las comunas, encontramos que:

# Choropleth of population density per comuna, with the bike stations drawn
# on top as points and every comuna labelled with its number.
ggplot() +
  geom_sf(data = base_CABA, aes(fill = den_pob)) +
  geom_sf(data = estaciones_bicicletas) +
  geom_sf_label(data = base_CABA, aes(label = Comuna), size = 2) +
  # The former diverging scale used midpoint = .02, far below the densities in
  # the data (tens of thousands of hab/km2), so its "grey" end was never
  # reached. A sequential white-to-brown scale anchored at 0 expresses the
  # same colour mapping explicitly.
  scale_fill_gradient(low = "white", high = "brown", limits = c(0, NA)) +
  labs(title = "Densidad poblacional y ubicación de estaciones de bicicletas por Comuna",
       subtitle = "CABA - Mayo 2021",
       fill = "Densidad poblacional (hab./km2)",
       caption = "Fuente: CABA") +
  # Axis titles are drawn in white so that they do not show on the map.
  theme(plot.title = element_text(size = 12, face = "bold"),
        plot.subtitle = element_text(size = 9, face = "bold"),
        axis.text.y = element_text(colour = "gray35", size = 6),
        axis.title.x = element_text(colour = "white"),
        axis.text.x = element_text(colour = "gray35", size = 6),
        axis.title.y = element_text(colour = "white"))

Existe una acumulación importante de estaciones en las comunas más densamente pobladas. Las comunas del norte y la comuna 1 parecen escapar a este patrón. Veamos si hay alguna otra explicación para esta distribución, usando la proporción de hogares con necesidades básicas insatisfechas como proxy para identificar cuáles son las comunas más vulnerables socialmente.

# Choropleth of the share of households with unmet basic needs (NBI) per
# comuna, with the bike stations drawn on top as points.
ggplot() +
  geom_sf(data = base_CABA, aes(fill = `Hogares con NBI`)) +
  geom_sf(data = estaciones_bicicletas) +
  geom_sf_label(data = base_CABA, aes(label = Comuna), size = 2) +
  # The former diverging scale used midpoint = .02, below the NBI percentages
  # in the data, so its "grey" end was never reached. A sequential
  # white-to-brown scale anchored at 0 expresses the same mapping explicitly.
  scale_fill_gradient(low = "white", high = "brown", limits = c(0, NA)) +
  labs(title = "Hogares con NBI y ubicación de estaciones de bicicletas por Comuna",
       subtitle = "CABA - Mayo 2021",
       fill = "Porcentaje de hogares con NBI",
       caption = "Fuente: CABA") +
  # Axis titles are drawn in white so that they do not show on the map.
  theme(plot.title = element_text(size = 12, face = "bold"),
        plot.subtitle = element_text(size = 9, face = "bold"),
        axis.text.y = element_text(colour = "gray35", size = 6),
        axis.title.x = element_text(colour = "white"),
        axis.text.x = element_text(colour = "gray35", size = 6),
        axis.title.y = element_text(colour = "white"))

Encontramos que, efectivamente, la comuna 1 es la que mayor porcentaje de hogares con necesidades básicas insatisfechas posee, lo que podría ser un dato más a tener en cuenta para la colocación de las estaciones de bicicletas. Por otro lado, mirando ambos mapas, pareciera ser muy baja la cantidad de estaciones de bicicletas presentes en la comuna 8, en contraposición a la alta cantidad en las comunas 12, 13, 14 y 15.

2.b. Mapa coroplético que muestre la distribución geográfica de una variable categórica

Analizamos el índice de feminidad de las comunas, categorizándolas según si se encuentran con un índice mayor o menor al promedio de CABA.

Primero organizamos los datos:

# Locations of the women's assistance centres; the map below plots them using
# their `long` / `lat` columns.
centros_mujer <- read.csv("ACTIVIDADES/CABA/centros-integrales-de-la-mujer.csv")

# Femininity index (women per 100 men) per comuna, classified as above
# ("Mayor") or not above ("Menor") the mean of the comuna indices.
base_sexo <- poblac %>%
  # Drop the city-wide aggregate row. The comparison has to be against the
  # string "Total": comparing against the bare name `Total` referred to the
  # numeric column and removed nothing, so the aggregate leaked into the mean.
  filter(Comuna != "Total") %>%
  as_tibble() %>%
  mutate(indice_fem = Mujer / (Varón / 100),
         fem = case_when(indice_fem > mean(indice_fem) ~ "Mayor",
                         indice_fem <= mean(indice_fem) ~ "Menor")) %>%
  # Select by name rather than by position so the result does not depend on
  # the column layout of the spreadsheet.
  select(Comuna, indice_fem, fem)

head(base_sexo)

## # A tibble: 6 x 3
##   Comuna indice_fem fem  
##   <chr>       <dbl> <chr>
## 1 Total        113. Menor
## 2 1            101. Menor
## 3 2            124. Mayor
## 4 3            112. Menor
## 5 4            109. Menor
## 6 5            117. Mayor

# Attach the femininity index and its category to each comuna polygon.
base_CABA <- left_join(base_CABA, base_sexo, by = "Comuna")

Y a continuación realizamos el análisis gráfico:

# Categorical choropleth of the femininity index (above / not above the mean),
# with the women's assistance centres overlaid as points.
ggplot() +
  geom_sf(
    data = base_CABA,
    aes(fill = fem),
    color = "white"
  ) +
  geom_point(
    data = centros_mujer,
    aes(x = long, y = lat),
    size = 3
  ) +
  geom_sf_label(
    data = base_CABA,
    aes(label = Comuna),
    size = 2
  ) +
  scale_fill_manual(values = c("springgreen4", "magenta4")) +
  labs(
    title = "Proporción de mujeres cada 100 varones por comuna",
    subtitle = "CABA - Mayo 2021",
    fill = "Índice de feminidad en relación al promedio de CABA",
    caption = "Fuente: CABA"
  ) +
  theme_light() +
  # Axis titles are drawn in white so that they do not show on the map.
  theme(
    plot.title = element_text(size = 12, face = "bold"),
    plot.subtitle = element_text(size = 9, face = "bold"),
    axis.title.x = element_text(colour = "white"),
    axis.title.y = element_text(colour = "white"),
    axis.text.x = element_text(colour = "gray35", size = 6),
    axis.text.y = element_text(colour = "gray35", size = 6)
  )

Encontramos que es muy marcado que las comunas del norte son las que mayor índice de feminidad tienen. Al mismo tiempo, se muestra la presencia de un centro de asistencia integral a la mujer en cada una de las comunas.

ACTIVIDAD 2

3. Realizar un “join espacial”, asignando a cada registro geo-referenciado la unidad geográfica (barrio, comuna, etc) que le corresponda y utilizar ggplot() para realizar lo siguiente:

3.1. Un mapa coroplético con los límites de los barrios, cuyo color de relleno indique la cantidad encontrada en cada uno

library(ggmap)

Ordenamos los datos para poder realizar los gráficos. En este caso utilicé datos de Londres.

# London borough polygons from the GeoPackage.
londres <- st_read("ACTIVIDADES/Londres/London_Boroughs.gpkg")

## Reading layer `london_boroughs' from data source `D:\Curso ciencia de datos para politicas publicas\segundo modulo\ACTIVIDADES\Londres\London_Boroughs.gpkg' using driver `GPKG'
## Simple feature collection with 33 features and 7 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 503568.2 ymin: 155850.8 xmax: 561957.5 ymax: 200933.9
## Projected CRS: OSGB 1936 / British National Grid

# Reproject to EPSG:4326, the CRS that is also assigned to the school points
# below before the spatial join.
londres <- st_transform(londres, 4326)


head(londres)

## Simple feature collection with 6 features and 7 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -0.4615023 ymin: 51.28676 xmax: 0.3340156 ymax: 51.63173
## Geodetic CRS:  WGS 84
##   objectid                 name  gss_code  hectares nonld_area ons_inner
## 1        1 Kingston upon Thames E09000021  3726.117      0.000         F
## 2        2              Croydon E09000008  8649.441      0.000         F
## 3        3              Bromley E09000006 15013.487      0.000         F
## 4        4             Hounslow E09000018  5658.541     60.755         F
## 5        5               Ealing E09000009  5554.428      0.000         F
## 6        6             Havering E09000016 11445.735    210.763         F
##   sub_2011                           geom
## 1    South POLYGON ((-0.330679 51.3290...
## 2    South POLYGON ((-0.0640212 51.318...
## 3    South POLYGON ((0.01213098 51.299...
## 4     West POLYGON ((-0.4220979 51.466...
## 5     West POLYGON ((-0.3354462 51.496...
## 6     East POLYGON ((0.2105254 51.4902...

Después, agregué una base csv de escuelas y la uní con la base de Londres del paso anterior a partir de la información geográfica, previo transformarla en una base de tipo geográfica, eliminando las escuelas fuera de Londres

# School coordinates (columns `x` and `y`).
escuelas <- read.csv("ACTIVIDADES/Londres/all_schools_xy_2016.csv")

# Turn the schools into point features, keep only the geometry, tag every
# point with the borough it falls in and discard the points that match no
# borough (schools outside London).
escuelas_geo <- st_as_sf(escuelas, coords = c("x", "y"), crs = 4326) %>%
  select(geometry) %>%
  st_join(londres) %>%
  filter(!is.na(name))

head(escuelas_geo)

## Simple feature collection with 6 features and 7 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -0.378496 ymin: 51.5075 xmax: -0.0425897 ymax: 51.594
## Geodetic CRS:  WGS 84
##   objectid           name  gss_code hectares nonld_area ons_inner sub_2011
## 1        5         Ealing E09000009 5554.428      0.000         F     West
## 2       10         Barnet E09000003 8674.837      0.000         F    North
## 3       15 Waltham Forest E09000031 3880.793      0.000         F     East
## 4       26    Westminster E09000033 2203.005     54.308         T  Central
## 5       27         Camden E09000007 2178.932      0.000         T  Central
## 6       30        Hackney E09000012 1904.902      0.000         T     East
##                     geometry
## 1  POINT (-0.378496 51.5075)
## 2   POINT (-0.241628 51.579)
## 3  POINT (-0.0425897 51.594)
## 4  POINT (-0.150409 51.5176)
## 5  POINT (-0.193367 51.5404)
## 6 POINT (-0.0769998 51.5453)

A continuación, resumí en una base la cantidad de escuelas en cada barrio y eliminé las características geográficas para que pueda unirse con la base de Londres.

# Number of schools per borough as a plain (non-spatial) table.
# The geometry is dropped *before* summarising: summarising an sf object
# combines the geometries of every group, which is wasted work here because
# the geometry was discarded right afterwards anyway.
escuelas_geo_sum <- escuelas_geo %>%
  st_drop_geometry() %>%
  as_tibble() %>%
  group_by(name) %>%
  summarise(cantidad = n())

head(escuelas_geo_sum)

## # A tibble: 6 x 2
##   name                 cantidad
##   <chr>                   <int>
## 1 Barking and Dagenham       65
## 2 Barnet                    171
## 3 Bexley                     87
## 4 Brent                     108
## 5 Bromley                   133
## 6 Camden                    111

# Borough polygons with their school count attached (joined by borough name).
londres_reg <- left_join(londres, escuelas_geo_sum, by ="name")

head(londres_reg)

## Simple feature collection with 6 features and 8 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -0.4615023 ymin: 51.28676 xmax: 0.3340156 ymax: 51.63173
## Geodetic CRS:  WGS 84
##   objectid                 name  gss_code  hectares nonld_area ons_inner
## 1        1 Kingston upon Thames E09000021  3726.117      0.000         F
## 2        2              Croydon E09000008  8649.441      0.000         F
## 3        3              Bromley E09000006 15013.487      0.000         F
## 4        4             Hounslow E09000018  5658.541     60.755         F
## 5        5               Ealing E09000009  5554.428      0.000         F
## 6        6             Havering E09000016 11445.735    210.763         F
##   sub_2011 cantidad                           geom
## 1    South       64 POLYGON ((-0.330679 51.3290...
## 2    South      159 POLYGON ((-0.0640212 51.318...
## 3    South      133 POLYGON ((0.01213098 51.299...
## 4     West       97 POLYGON ((-0.4220979 51.466...
## 5     West      124 POLYGON ((-0.3354462 51.496...
## 6     East       95 POLYGON ((0.2105254 51.4902...

Por último, creé una base adicional que realiza el análisis de la cantidad de escuelas, pero para cada una de las regiones.

# Schools per hectare for each sub-region (`sub_2011`): total schools divided
# by total hectares of the boroughs that make up the sub-region.
londres_subreg <- summarise(
  group_by(londres_reg, sub_2011),
  cantidad_subreg = sum(cantidad) / sum(hectares)
)

head(londres_subreg)

## Simple feature collection with 5 features and 2 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -0.5103751 ymin: 51.28676 xmax: 0.3340156 ymax: 51.69187
## Geodetic CRS:  WGS 84
## # A tibble: 5 x 3
##   sub_2011 cantidad_subreg                                                  geom
##   <chr>              <dbl>                                         <POLYGON [°]>
## 1 Central           0.0462 ((-0.1912135 51.48358, -0.1913915 51.48377, -0.19170~
## 2 East              0.0213 ((-0.08105036 51.52195, -0.08128417 51.52143, -0.081~
## 3 North             0.0194 ((-0.1713584 51.5724, -0.171697 51.57224, -0.1718111~
## 4 South             0.0163 ((-0.3529809 51.40872, -0.3538099 51.40914, -0.35414~
## 5 West              0.0186 ((-0.4592037 51.45657, -0.4592915 51.45661, -0.45938~

Así, con los datos ya organizados, creamos los mapas coropléticos.

En el primer mapa se realiza un análisis de la cantidad de escuelas por barrio:

# Basemap tiles for the bounding box of the London boroughs.
bbox_londres <- londres %>%
  st_bbox() %>%
  as.numeric()

mapa_londres <- get_stamenmap(
  bbox = bbox_londres,
  maptype = "toner-background",
  zoom = 10
)

# Number of schools per borough. Only the boroughs with at least 160 schools
# or with the minimum count are labelled.
ggmap(mapa_londres) +
  geom_sf(
    data = londres_reg,
    aes(fill = cantidad),
    alpha = 0.75,
    inherit.aes = FALSE
  ) +
  geom_sf_label(
    data = filter(londres_reg, cantidad >= 160 | cantidad == min(cantidad)),
    aes(label = name),
    size = 2,
    inherit.aes = FALSE,
    vjust = 1.3
  ) +
  scale_fill_distiller(palette = "YlOrRd", direction = 1) +
  labs(
    title = "Cantidad de escuelas en cada barrio de Londres",
    subtitle = "Mayo 2021",
    fill = "Cantidad de escuelas",
    caption = "Fuente: LONDON DATASTORE"
  ) +
  theme_light() +
  theme(
    plot.title = element_text(size = 12, face = "bold"),
    plot.subtitle = element_text(size = 9, face = "bold"),
    axis.title.x = element_text(colour = "white"),
    axis.title.y = element_text(colour = "white"),
    axis.text.x = element_text(colour = "gray35", size = 6),
    axis.text.y = element_text(colour = "gray35", size = 6)
  )

Se puede ver que City of London es el barrio que menos escuelas posee, mientras que Barnet es el que más posee.

Pero, si realizamos el mismo análisis ajustando la cantidad de escuelas por la cantidad de hectáreas de cada barrio:

# Schools per hectare in each borough. Only the boroughs with a ratio of at
# least 0.06 or with the minimum ratio are labelled.
ggmap(mapa_londres) +
  geom_sf(
    data = londres_reg,
    aes(fill = cantidad/hectares),
    alpha = 0.75,
    inherit.aes = FALSE
  ) +
  geom_sf_label(
    data = filter(
      londres_reg,
      cantidad/hectares >= 0.06 | cantidad/hectares == min(cantidad/hectares)
    ),
    aes(label = name),
    size = 2,
    inherit.aes = FALSE
  ) +
  scale_fill_distiller(palette = "YlOrRd", direction = 1) +
  labs(
    title = "Cantidad de escuelas por hectárea en cada barrio de Londres",
    subtitle = "Mayo 2021",
    fill = "Escuelas/hectárea",
    caption = "Fuente: LONDON DATASTORE"
  ) +
  theme_light() +
  theme(
    plot.title = element_text(size = 12, face = "bold"),
    plot.subtitle = element_text(size = 9, face = "bold"),
    axis.title.x = element_text(colour = "white"),
    axis.title.y = element_text(colour = "white"),
    axis.text.x = element_text(colour = "gray35", size = 6),
    axis.text.y = element_text(colour = "gray35", size = 6)
  )

Se puede ver que los barrios que más escuelas por hectárea tienen son Kensington and Chelsea y Hackney, mientras que el barrio que menos escuelas por hectárea tiene es Havering.

Por último, realicé el mismo mapa pero ahora para regiones:

# Schools per hectare by sub-region. The borough outlines are drawn faintly on
# top, and both the highlighted boroughs and the sub-regions are labelled.
ggmap(mapa_londres) +
  geom_sf(
    data = londres_subreg,
    aes(fill = cantidad_subreg),
    alpha = 0.75,
    inherit.aes = FALSE,
    size = 1.5,
    color = "black"
  ) +
  geom_sf(data = londres, alpha = 0.20, inherit.aes = FALSE) +
  geom_sf_label(
    data = filter(
      londres_reg,
      cantidad/hectares >= 0.06 | cantidad/hectares == min(cantidad/hectares)
    ),
    aes(label = name),
    size = 2,
    inherit.aes = FALSE
  ) +
  geom_sf_label(
    data = londres_subreg,
    aes(label = sub_2011),
    vjust = 1.5,
    color = "grey75",
    size = 6,
    inherit.aes = FALSE
  ) +
  scale_fill_distiller(palette = "YlOrRd", direction = 1) +
  labs(
    title = "Cantidad de escuelas por hectárea en cada región de Londres",
    subtitle = "Mayo 2021",
    fill = "Escuelas/hectárea",
    caption = "Fuente: LONDON DATASTORE"
  ) +
  theme_light() +
  theme(
    plot.title = element_text(size = 12, face = "bold"),
    plot.subtitle = element_text(size = 9, face = "bold"),
    axis.title.x = element_text(colour = "white"),
    axis.title.y = element_text(colour = "white"),
    axis.text.x = element_text(colour = "gray35", size = 6),
    axis.text.y = element_text(colour = "gray35", size = 6)
  )

Se puede ver que la zona central es la que más cantidad de escuelas por hectárea tiene, mientras que la zona sur es la que menor cantidad tiene.

ACTIVIDAD 3

library(lubridate)

2. Análisis Temporal: Realizar al menos 1 gráfico que les permita analizar la temporalidad de los datos. ¿Detectan algún patrón temporal? ¿A qué puede deberse?

En este caso voy a trabajar con datos de CABA sobre las obras civiles, públicas y privadas, nuevas o de remodelación registradas ante la Dirección General de Registro de Obras y Catastro.

Primero ordeno los datos para poder hacer el análisis temporal:

# Registered building works. Only the columns used in the analysis are kept
# and rows with any missing value are dropped.
# `stringsAsFactors` is spelled out in full; the abbreviated `stringsAsFactor`
# only worked through R's partial argument matching.
construcciones_caba <- read.csv("obrasregistradas-acumulado.csv",
                                stringsAsFactors = TRUE,
                                encoding = "UTF-8") %>%
  select(Fecha.de.registro, Comuna, Barrio, Comuna.1, Tipo.obra, Superficie..m2., Lon, Lat) %>%
  drop_na()

head(construcciones_caba)

##   Fecha.de.registro Comuna       Barrio  Comuna.1  Tipo.obra Superficie..m2.
## 1        2018-06-05     13     Belgrano Comuna 13 Obra Mayor         3740.93
## 2        2017-11-21     15 Villa Crespo Comuna 15 Obra Mayor         8467.87
## 3        2018-11-28     13        Nuñez Comuna 13 Obra Mayor         6059.00
## 4        2018-06-19     13        Nuñez Comuna 13 Obra Media         1905.20
## 5        2017-10-11     13        Nuñez Comuna 13 Obra Menor          394.67
## 6        2018-10-19     13        Nuñez Comuna 13 Obra Media         1380.12
##         Lon       Lat
## 1 -58.44747 -34.55714
## 2 -58.44125 -34.59772
## 3 -58.45675 -34.55188
## 4 -58.45708 -34.55163
## 5 -58.46443 -34.54158
## 6 -58.45760 -34.55090

Voy a realizar un análisis de la evolución en el tiempo del registro de obras en el norte y en el sur de CABA, primero por trimestre, y luego por mes.

Como los datos empiezan en septiembre de 2017 (último mes del 3er trimestre), voy a quedarme con los datos a partir de octubre de ese año, para que comience el análisis en un trimestre completo.

# Parse the registration date, keep only records registered after
# 2017-09-30, derive month and year helper columns and sort chronologically.
construcciones_caba <- construcciones_caba %>%
  mutate(fecha_registro = ymd(Fecha.de.registro)) %>%
  select(-Fecha.de.registro) %>%
  filter(fecha_registro > as.Date("2017-09-30")) %>%
  mutate(
    mes_registro = month(fecha_registro),
    # Same month as an abbreviated label, used for the plot axes.
    mes_registro2 = month(fecha_registro, label = TRUE, abbr = TRUE),
    año_registro = year(fecha_registro),
    año_registro2 = year(fecha_registro)
  ) %>%
  drop_na() %>%
  arrange(fecha_registro)

head(construcciones_caba)

##   Comuna           Barrio  Comuna.1  Tipo.obra Superficie..m2.       Lon
## 1     14          Palermo Comuna 14 Obra Media         1680.81 -58.44185
## 2      1      San Nicolas  Comuna 1 Obra Media         1122.00 -58.37855
## 3     10         Floresta Comuna 10 Obra Media          520.25 -58.48675
## 4     15         Paternal Comuna 15 Obra Media         1675.78 -58.47033
## 5     14          Palermo Comuna 14 Obra Media         1349.48 -58.44279
## 6      4 Parque Patricios  Comuna 4 Obra Mayor         2874.00 -58.39920
##         Lat fecha_registro mes_registro mes_registro2 año_registro
## 1 -34.58303     2017-10-02           10           oct         2017
## 2 -34.60343     2017-10-02           10           oct         2017
## 3 -34.62732     2017-10-02           10           oct         2017
## 4 -34.60226     2017-10-03           10           oct         2017
## 5 -34.56949     2017-10-03           10           oct         2017
## 6 -34.64414     2017-10-03           10           oct         2017
##   año_registro2
## 1          2017
## 2          2017
## 3          2017
## 4          2017
## 5          2017
## 6          2017

La separación de Comunas entre sur y norte se hizo de la siguiente manera:

# Number of registered works per quarter and region (Norte / Sur).
#
# The quarter label ("<quarter>.<year>") is derived from the date instead of
# being enumerated by hand, and its factor levels are fixed explicitly in
# chronological order. Previously that order only arose as a side effect of
# running `mutate()` on grouped data, and quarters outside 2017-2019 would
# have been labelled NA.
construcciones_caba_resumen <- construcciones_caba %>%
  arrange(fecha_registro) %>%
  mutate(
    # fct_inorder() keeps the order of first appearance, i.e. chronological
    # order after the arrange() above.
    periodo_registro = fct_inorder(
      paste0(quarter(fecha_registro), ".", año_registro)
    ),
    # Level order ("Sur", "Norte") is stated explicitly; it is the order the
    # previous implementation produced.
    Región = factor(
      case_when(Comuna %in% c(1, 3, 4, 5, 7, 8, 9) ~ "Sur",
                Comuna %in% c(2, 6, 10, 11, 12, 13, 14, 15) ~ "Norte"),
      levels = c("Sur", "Norte")
    )
  ) %>%
  group_by(periodo_registro, Región) %>%
  # "drop_last" keeps the result grouped by `periodo_registro`, as before.
  summarise(cantidad = n(), .groups = "drop_last")



head(construcciones_caba_resumen)

## # A tibble: 6 x 3
## # Groups:   periodo_registro [3]
##   periodo_registro Región cantidad
##   <fct>            <fct>     <int>
## 1 4.2017           Sur         164
## 2 4.2017           Norte       389
## 3 1.2018           Sur         133
## 4 1.2018           Norte       324
## 5 2.2018           Sur         175
## 6 2.2018           Norte       407

# Tag every comuna polygon with its region, using the same Norte / Sur split
# that is applied to the building works.
base_CABA <- mutate(
  base_CABA,
  Región = factor(case_when(
    Comuna %in% c(1,3,4,5,7,8,9) ~ "Sur",
    Comuna %in% c(2,6,10,11,12,13,14,15) ~ "Norte"
  ))
)


# Basemap tiles for the bounding box of the comunas.
bbox_caba <- base_CABA %>%
  st_bbox() %>%
  as.numeric()

mapa_caba <- get_stamenmap(
  bbox = bbox_caba,
  maptype = "terrain",
  zoom = 13
)


# Reference map showing which comunas belong to each region.
ggmap(mapa_caba) +
  geom_sf(data = base_CABA, inherit.aes = FALSE) +
  geom_sf(
    data = base_CABA,
    aes(fill = Región),
    inherit.aes = FALSE,
    alpha = 0.5
  ) +
  geom_sf_label(
    data = base_CABA,
    aes(label = Comuna),
    size = 2,
    inherit.aes = FALSE
  ) +
  scale_fill_manual(values = c("skyblue3", "palegreen3")) +
  labs(
    title = "Regiones Norte-Sur",
    subtitle = "CABA",
    fill = "Región"
  ) +
  theme(
    plot.title = element_text(size = 12, face = "bold"),
    plot.subtitle = element_text(size = 9, face = "bold"),
    axis.title.x = element_text(colour = "white"),
    axis.title.y = element_text(colour = "white"),
    axis.text.x = element_text(colour = "gray35", size = 6),
    axis.text.y = element_text(colour = "gray35", size = 6)
  )

Y por último organicé los datos para trabajar con un período mensual, con el objetivo de comparar el año 2018 contra el 2019, ya que son los dos años que aparecen completos en el dataset original.

Para análisis agregado:

# Works registered per month and year, for the years after 2017.
# tally() on grouped data counts rows per group and, like summarise(), leaves
# the result grouped by the remaining variable (`mes_registro2`).
construcciones_caba_meses <- construcciones_caba %>%
  filter(año_registro > 2017) %>%
  group_by(mes_registro2, año_registro) %>%
  tally(name = "cantidad")


head(construcciones_caba_meses)

## # A tibble: 6 x 3
## # Groups:   mes_registro2 [3]
##   mes_registro2 año_registro cantidad
##   <ord>                <dbl>    <int>
## 1 ene                   2018      160
## 2 ene                   2019      221
## 3 feb                   2018      130
## 4 feb                   2019      168
## 5 mar                   2018      167
## 6 mar                   2019      149

Para análisis por región:

# Works registered per month, year and region, for the years after 2017.
construcciones_caba_meses_region <- construcciones_caba %>%
  filter(año_registro > 2017) %>%
  mutate(
    Región = factor(case_when(
      Comuna %in% c(1,3,4,5,7,8,9) ~ "Sur",
      Comuna %in% c(2,6,10,11,12,13,14,15) ~ "Norte"
    ))
  ) %>%
  group_by(mes_registro2, año_registro, Región) %>%
  # tally() counts rows per group and drops the last grouping level, exactly
  # as summarise(cantidad = n()) does.
  tally(name = "cantidad")



head(construcciones_caba_meses_region)

## # A tibble: 6 x 4
## # Groups:   mes_registro2, año_registro [3]
##   mes_registro2 año_registro Región cantidad
##   <ord>                <dbl> <fct>     <int>
## 1 ene                   2018 Norte       116
## 2 ene                   2018 Sur          44
## 3 ene                   2019 Norte       159
## 4 ene                   2019 Sur          62
## 5 feb                   2018 Norte        93
## 6 feb                   2018 Sur          37

Una vez organizados los datos, pasamos a analizarlos gráficamente:

# Quarterly evolution of registered works, one line per region.
ggplot(construcciones_caba_resumen) +
  geom_line(aes(x = periodo_registro, y = cantidad, group = Región, color = Región), size = 1.3) +
  geom_point(aes(x = periodo_registro, y = cantidad, color = Región, shape = Región), size = 3) +
  # Value labels are shifted 20 units above each point.
  geom_text(aes(x = periodo_registro, y = cantidad + 20, label = cantidad), size = 3) +
  theme_light() +
  # Colours and shapes are assigned by region name. With positional values
  # they followed the factor level order, which in this table (Sur, Norte) is
  # the reverse of the monthly table (Norte, Sur), so the two regions swapped
  # colours between the quarterly and the monthly charts.
  scale_color_manual(values = c(Norte = "royalblue2", Sur = "lightgoldenrod2")) +
  scale_shape_manual(values = c(Norte = 16, Sur = 17)) +
  # The unused `fill` label was removed (no layer maps `fill`) and the
  # subtitle typos were corrected.
  labs(title = "Cantidad de construcciones registradas por trimestre - Norte y Sur",
       subtitle = "Cuarto trimestre 2017 a cuarto trimestre 2019 - CABA",
       caption = "Fuente: CABA",
       y = "Cantidad",
       x = "Trimestre") +
  theme(plot.title = element_text(size = 12, face = "bold"),
        plot.subtitle = element_text(size = 9, face = "bold"),
        axis.text.y = element_text(colour = "gray35", size = 8),
        axis.title.x = element_text(face = "bold", vjust = -2),
        axis.text.x = element_text(colour = "gray35", size = 8, angle = 30, vjust = 0.5),
        axis.title.y = element_text(face = "bold", vjust = 3))

Analizando la evolución temporal del registro de construcciones entre el sur y el norte se puede ver, primero, que la cantidad de construcciones es mayor en el norte que en el sur en todo momento. Segundo, que en el último trimestre pareciera haber un aumento que se repite tanto en 2018 como en 2019, más pronunciado en 2019, sobre todo en el norte. Esto último, ¿podría deberse a que el último trimestre de 2019 fue período electoral, lo que habitualmente se traduce en un aumento de la obra pública?

Si comparamos 2018 y 2019 de manera agregada para todo CABA:

# Monthly bars of registered works, one facet per year. The bar height comes
# from `cantidad` (passed as weight); the fill legend is hidden because the
# facets already identify the year.
ggplot(construcciones_caba_meses) +
  geom_bar(
    aes(x = mes_registro2, weight = cantidad, fill = año_registro)
  ) +
  facet_wrap(~año_registro) +
  scale_fill_viridis_c(option = "turbo") +
  labs(
    title = "Cantidad de construcciones registradas por mes",
    subtitle = "2018-2019 - CABA",
    caption = "Fuente: CABA",
    y = "Cantidad",
    x = "Mes"
  ) +
  theme_light() +
  theme(
    plot.title = element_text(size = 12, face = "bold"),
    plot.subtitle = element_text(size = 9, face = "bold"),
    axis.title.x = element_text(face = "bold", vjust = -2),
    axis.title.y = element_text(face = "bold", vjust = 3),
    axis.text.x = element_text(colour = "gray35", size = 8, angle = 30, vjust = 0.5),
    axis.text.y = element_text(colour = "gray35", size = 8),
    legend.position = "none"
  )

Se ve, al igual que en el gráfico anterior, que en el último trimestre se produce un aumento de las construcciones registradas. En 2019 destacan julio y diciembre, con gran diferencia con respecto a 2018.

# Monthly evolution of registered works per region, one facet per year.
ggplot(construcciones_caba_meses_region) +
  geom_line(aes(x = mes_registro2, y = cantidad, group = Región, color = Región), size = 1.3) +
  geom_point(aes(x = mes_registro2, y = cantidad, color = Región, shape = Región), size = 3) +
  # Value labels are shifted 8 units above each point.
  geom_text(aes(x = mes_registro2, y = cantidad + 8, label = cantidad), size = 3) +
  theme_light() +
  # Colours and shapes are pinned to the region names so they no longer depend
  # on the factor level order. The values are the ones the positional mapping
  # produced for this table (levels Norte, Sur).
  scale_color_manual(values = c(Norte = "royalblue2", Sur = "lightgoldenrod2")) +
  scale_shape_manual(values = c(Norte = 16, Sur = 17)) +
  # The unused `fill` label was removed: no layer maps `fill`.
  labs(title = "Cantidad de construcciones registradas por mes - Norte y Sur",
       subtitle = "2018-2019 - CABA",
       caption = "Fuente: CABA",
       y = "Cantidad",
       x = "Mes") +
  theme(plot.title = element_text(size = 12, face = "bold"),
        plot.subtitle = element_text(size = 9, face = "bold"),
        axis.text.y = element_text(colour = "gray35", size = 8),
        axis.title.x = element_text(face = "bold", vjust = -2),
        axis.text.x = element_text(colour = "gray35", size = 8, angle = 30, vjust = 0.5),
        axis.title.y = element_text(face = "bold", vjust = 3)) +
  facet_wrap(~año_registro)

En este último gráfico se puede ver de nuevo, muy claramente, lo que se veía en los gráficos anteriores. En 2019 se ve una tendencia alcista importante en períodos preelectorales (julio y octubre de 2019), y una caída muy fuerte en agosto de 2019 que, ¿podría deberse a la devaluación y la incertidumbre económica post PASO 2019? Habría que analizar si las caídas en septiembre y diciembre de 2018 podrían deberse a la incertidumbre económica o si, por el contrario, se deben a algo estacional, porque tanto en septiembre de 2018 como en agosto-septiembre de 2019 hay una fuerte caída.

Por otro lado, si bien los movimientos del norte y del sur de CABA parecen moverse siempre en la misma dirección, en el norte los movimientos son más pronunciados tanto al alza como en la caída.

3. Análisis Espacial: Analizar la distribución espacial de los datos a partir de al menos 1 mapa de densidad que muestre donde se concentran la mayor cantidad de observaciones. Comparar la densidad de los datos en el tiempo (facetar). ¿Los patrones espaciales de los datos elegidos se mantienen o varían en el tiempo?

Comparando la cantidad de registros geográficamente entre 2018 y 2019:

# Basemap tiles for the bounding box of the comunas.
bbox_caba <- as.numeric(st_bbox(base_CABA))

mapa_caba <- get_stamenmap(bbox = bbox_caba,
                           maptype = "terrain",
                           zoom = 13)



# 2D density of the registered works for the years after 2017, one facet per
# year, with the comuna outlines drawn on top (brown = Sur, dark grey = Norte).
ggmap(mapa_caba) +
  # after_stat() replaces the deprecated stat() helper.
  stat_density_2d(data = construcciones_caba %>%
                    filter(año_registro > 2017),
                  aes(x = Lon, y = Lat, fill = after_stat(level)),
                  geom = "polygon", alpha = 0.8) +
  geom_sf(data = base_CABA %>% filter(Región == "Sur"), colour = "brown", inherit.aes = FALSE, alpha = 0.2, size = 0.45) +
  geom_sf(data = base_CABA %>% filter(Región == "Norte"), colour = "grey3", inherit.aes = FALSE, alpha = 0.2, size = 0.45) +
  geom_sf_label(data = base_CABA, aes(label = Comuna), size = 1.8, inherit.aes = FALSE) +
  facet_wrap(~año_registro) +
  # Only one fill scale is kept. The viridis scale that used to be added
  # first was replaced by this one, so it had no effect on the plot.
  scale_fill_distiller(palette = "Spectral") +
  labs(title = "Cantidad de construcciones registradas",
       subtitle = "2018-2019 - CABA",
       caption = "Fuente: CABA",
       fill = "Nivel") +
  theme(plot.title = element_text(size = 12, face = "bold"),
        plot.subtitle = element_text(size = 9, face = "bold"),
        axis.text.y = element_text(colour = "gray35", size = 6),
        axis.title.x = element_text(colour = "white"),
        axis.text.x = element_text(colour = "gray35", size = 6),
        axis.title.y = element_text(colour = "white"))

No pareciera haber grandes diferencias. Lo único que puede verse a simple vista es la desaparición de gran parte de los registros en la comuna 8 en 2019.

Por último, probé si hay alguna diferencia al comparar el período preelectoral con el año previo (septiembre y octubre de 2019 vs. 2018):

# 2D density of the works registered in September and October (months 9 and
# 10) of the years after 2017, one facet per year.
ggmap(mapa_caba) +
  # after_stat() replaces the deprecated stat() helper.
  stat_density_2d(data = construcciones_caba %>%
                    filter(año_registro > 2017 & mes_registro %in% c(9, 10)),
                  aes(x = Lon, y = Lat, fill = after_stat(level)),
                  geom = "polygon", alpha = 0.8) +
  geom_sf(data = base_CABA %>% filter(Región == "Sur"), colour = "brown", inherit.aes = FALSE, alpha = 0.2, size = 0.45) +
  geom_sf(data = base_CABA %>% filter(Región == "Norte"), colour = "grey3", inherit.aes = FALSE, alpha = 0.2, size = 0.45) +
  geom_sf_label(data = base_CABA, aes(label = Comuna), size = 1.8, inherit.aes = FALSE) +
  facet_wrap(~año_registro) +
  # Only one fill scale is kept. The viridis scale that used to be added
  # first was replaced by this one, so it had no effect on the plot.
  scale_fill_distiller(palette = "Spectral") +
  # The subtitle now names the months actually selected by the filter; it
  # previously also listed November.
  labs(title = "Cantidad de construcciones registradas",
       subtitle = "Septiembre y Octubre - 2018-2019 - CABA",
       caption = "Fuente: CABA",
       fill = "Nivel") +
  theme(plot.title = element_text(size = 12, face = "bold"),
        plot.subtitle = element_text(size = 9, face = "bold"),
        axis.text.y = element_text(colour = "gray35", size = 6),
        axis.title.x = element_text(colour = "white"),
        axis.text.x = element_text(colour = "gray35", size = 6),
        axis.title.y = element_text(colour = "white"))

No parece haber grandes diferencias. Se ve el mismo resultado que en el mapa anterior en cuanto a los registros en la comuna 8.

ACTIVIDAD 4

library(rtweet)
library(leaflet)
library(lubridate)
library(ggrepel)

1. Descargar tweets que se originen en los alrededores de la Ciudad con la que están trabajando.

Descargué tweets con palabras relacionadas con la economía, originados en la ciudad de Londres:

# Query that was used to download the tweets. It is kept commented out so the
# report does not hit the Twitter API again; the result is read back from disk
# below.
# NOTE(review): "BUSSINES" and "BANKRUPCY" look like misspellings of
# "BUSINESS" and "BANKRUPTCY" - confirm whether those terms were intended
# before re-running the download.
#tweets <- search_tweets(q = "ECONOMY OR BUSSINES OR INFLATION OR SALARY OR SALARIES OR UNEMPLOYMENT 
#                        OR EMPLOYMENT OR ECONOMIC+CRISIS OR ECONOMIC+GROWTH OR POVERTY OR 
#                        ECONOMIC+ACTIVITY OR GDP OR BITCOIN OR
#                        FINANCIAL OR FINANCE OR STOCK+MARKETS OR
#                        INVESTMENT OR FISCAL+DEFICIT OR CENTRAL+BANK OR ECONOMIST OR BANKRUPCY",
#                        n = 180000,
#                        lang = "en",
#                        include_rts = FALSE,
#                        geocode = "51.51542,-0.09316,16mi") 

# Load the previously saved tweets from the local RDS file.
tweets <- readRDS("tweets.rds")

2. Realizar al menos 2 de las siguientes consignas:

a. ¿Cuáles son los mensajes con más repercusión? ¿Qué dicen?

# Ten tweets with the highest engagement: sorted by favourites, with
# retweets, quotes and replies used as successive tie-breakers.
tweets_mayor_repercursion <- tweets %>%
  select(screen_name, favorite_count, retweet_count, quote_count, reply_count, text) %>%
  # desc() accepts a single vector, so each sort key gets its own desc() call.
  # Passing several columns to one desc() either drops all but the first
  # (older dplyr) or raises an error (dplyr >= 1.1).
  arrange(
    desc(favorite_count),
    desc(retweet_count),
    desc(quote_count),
    desc(reply_count)
  ) %>%
  head(10)

# Print the text of the selected tweets.
tweets_mayor_repercursion$text

##  [1] "if the salary is so competitive why won't you tell me what it is? <U+0001F60C>"                                                                                                                                                                                                              
##  [2] "NOT TO BLAME FOR WHITE KIDS UNDERACHIEVING:\nSchool funding crisis\nYouth services slashed\nCuts to school support staff/social workers\nLibraries closed down\n4.3m kids now in poverty\nFamilies unable to afford food or heating\nAusterity\n\nTO BLAME:\nPeople saying “white privilege”"
##  [3] "I'm trying to send money from my bank account in one country to my bank account in another country. \n\nIt's been 3 days. The money hasn't arrived and I have no idea where it is.\n\nI don't want to hear #Bitcoin is too slow ever again."                                                 
##  [4] "If you voted leave, you voted to downgrade our economy. It doesn’t matter what other reasons you also had, you have to take responsibility for the full range of consequences such as they are"                                                                                              
##  [5] "Everywhere you look people telling you to leave the 9-5 and become a trader. <U+0001F602>\n\nBut remember, even if you work in a 9-5 you hate, there’s  still a decent chance you’re doing something more worthwhile than providing liquidity to a bunch of cunts in the financial markets." 
##  [6] "This is a government that:\nTrebled tuition fees\nAbolished the Educational Maintenance Allowance\nCut per pupil funding in real terms\nOversees a surge in youth unemployment during the pandemic\n\nAnd now claim they are the defenders of the education of 'white working class kids'!"  
##  [7] "We've had a pretty cushioned life here in the UK. Under this deceitful government all that has changed. Vast swathes of the country left behind, a division that has broken communities, festered resentment, increased poverty &amp; left millions in despair. Time for action! Apathy RIP" 
##  [8] "Want to know what we’re doing at 9am? We’re working through our caseload of doctors requiring legal support+employment advice, checking that we’re doing all we can to help. \n\nNHS staff are badly treated by this government and its policies. We’re pushing back. @EveryDoctorUK"        
##  [9] "I will also finance an agent <U+0001F64F><U+0001F3FE><U+270A><U+0001F3FE>#ZimbabweanLivesMatter https://t.co/UIwxNjgYb9"                                                                                                                                                                     
## [10] "Actually. If you wanted to fund the health service well, it is possible. The average nurse's salary is £33-35k per year. Lets just round that up to £37k for easier maths. Dido Harding oversees a £37 BILLION budget for Test and Trace. That could fund a MILLION nurses for a year."

Prácticamente todos los mensajes están asociados a críticas, especialmente asociadas al gasto en educación y a las condiciones de los empleados de la salud. También se destaca la palabra “Bitcoin”, que en bajadas anteriores también aparecía con bastante frecuencia, evidenciando la importancia creciente que tiene en la discusión económica.

b. ¿Cómo se distribuye la popularidad de los usuarios? ¿Quiénes son los 5 que más seguidores tienen? Graficar.

# One row per distinct (user, follower count, verified flag) combination.
# distinct() removes the duplicated rows directly; the previous
# group_by() + unique() produced the same rows but left the result grouped by
# screen_name, which no later step relies on.
tweets_popularidad_usuarios <- tweets %>%
  select(screen_name, followers_count, verified) %>%
  distinct()

# Penalise scientific notation so large numbers are printed in fixed notation.
options(scipen = 999)

# Histogram of the follower counts over all collected users.
tweets_popularidad_usuarios %>%
  ggplot(aes(x = followers_count)) +
  geom_histogram(fill = "cornflowerblue") +
  labs(
    x = "Cantidad de seguidores",
    y = "Cantidad de usuarios",
    title = "Cantidad de usuarios según cantidad de seguidores",
    subtitle = "Junio 2021",
    caption = "Fuente: Elaboración propia en base a Twitter"
  ) +
  theme_light() +
  theme(
    plot.background = element_rect(fill = "gray89", colour = "black"),
    plot.title = element_text(size = 12, face = "bold"),
    plot.subtitle = element_text(size = 9, face = "bold"),
    axis.title.x = element_text(face = "bold", vjust = -2),
    axis.title.y = element_text(face = "bold", vjust = 3),
    axis.text.x = element_text(colour = "gray35", size = 8, vjust = 0.5),
    axis.text.y = element_text(colour = "gray35", size = 10)
  )

Casi todos los usuarios recolectados poseen pocos seguidores

Para sacar alguna información útil, limité el análisis a los usuarios con hasta 855 seguidores (aproximadamente la mitad con menos seguidores, ya que la mediana es de 795):

# Quartiles (0%, 25%, 50%, 75%, 100%) of the follower counts.
quantile(tweets_popularidad_usuarios[["followers_count"]])

##          0%         25%         50%         75%        100% 
##        0.00      194.00      795.00     3168.75 25624656.00

# Same histogram as before, restricted to users with at most 855 followers so
# the shape of the lower end of the distribution becomes visible.
tweets_popularidad_usuarios %>%
  filter(followers_count <= 855) %>%
  ggplot(aes(x = followers_count)) +
  geom_histogram(colour = "black", fill = "cornflowerblue") +
  labs(
    x = "Cantidad de seguidores",
    y = "Cantidad de usuarios",
    title = "Cantidad de usuarios según cantidad de seguidores",
    subtitle = "Junio 2021",
    caption = "Fuente: Elaboración propia en base a Twitter"
  ) +
  theme_light() +
  theme(
    plot.background = element_rect(fill = "gray89", colour = "black"),
    plot.title = element_text(size = 12, face = "bold"),
    plot.subtitle = element_text(size = 9, face = "bold"),
    axis.title.x = element_text(face = "bold", vjust = -0.8),
    axis.title.y = element_text(face = "bold", vjust = 3),
    axis.text.x = element_text(colour = "gray35", size = 8, vjust = 0.5),
    axis.text.y = element_text(colour = "gray35", size = 10)
  )

Se puede ver cómo hay cada vez menos usuarios en la muestra a medida que aumenta la cantidad de seguidores de cada usuario.

Para ver cuáles son las cuentas con más seguidores que están tuiteando sobre economía:

# Horizontal bar chart of the ten rows with the most followers, expressed in
# millions of followers.
tweets_popularidad_usuarios %>%
  arrange(desc(followers_count)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(screen_name, followers_count),
             weight = followers_count / 1000000)) +
  geom_bar(colour = "black", fill = "cornflowerblue") +
  scale_y_continuous(breaks = seq(0, 40, by = 5)) +
  coord_flip() +
  labs(
    x = "Usuario de Twitter",
    y = "Cantidad de millones de seguidores",
    title = "Usuarios más populares tuiteando sobre economía",
    subtitle = "Londres - Junio 2021",
    caption = "Fuente: Twitter"
  ) +
  theme_light() +
  theme(
    plot.background = element_rect(fill = "gray89", colour = "black"),
    plot.title = element_text(size = 12, face = "bold"),
    plot.subtitle = element_text(size = 9, face = "bold"),
    axis.title.x = element_text(face = "bold", vjust = -2),
    axis.title.y = element_text(face = "bold", vjust = 3),
    axis.text.x = element_text(colour = "gray35", size = 10, vjust = 0.5),
    axis.text.y = element_text(colour = "gray35", size = 10)
  )

# Five rows with the most followers, printed as a tibble grouped by user.
tweets_popularidad_usuarios %>%
  group_by(screen_name) %>%
  distinct() %>%
  arrange(desc(followers_count)) %>%
  head(n = 5)

## # A tibble: 5 x 3
## # Groups:   screen_name [5]
##   screen_name  followers_count verified
##   <chr>                  <int> <lgl>   
## 1 TheEconomist        25624656 TRUE    
## 2 BBCNews             12344426 TRUE    
## 3 guardian             9753985 TRUE    
## 4 SkyNews              7157767 TRUE    
## 5 FT                   4606474 TRUE

Son todos medios informativos, destacándose entre los 5 primeros, The Economist y Financial Times, dedicados a la economía.

Por último, filtré por cuentas no verificadas, para analizar los usuarios más populares que tuiteen sobre economía que no sean medios de comunicación.

# Horizontal bar chart of the ten non-verified rows with the most followers,
# expressed in thousands of followers.
ggplot(tweets_popularidad_usuarios %>%
         arrange(desc(followers_count)) %>%
         # `verified` is a logical column, so negate it directly instead of
         # comparing it with the string "FALSE" (which relied on coercion).
         filter(!verified) %>%
         head(10)) +
  geom_bar(aes(x = reorder(screen_name, followers_count), weight = followers_count / 1000),
           fill = "cornflowerblue", colour = "black") +
  coord_flip() +
  theme_light() +
  labs(title = "Usuarios más populares tuiteando sobre economía",
       subtitle = "Londres - Junio 2021",
       caption = "Fuente: Twitter",
       y = "Cantidad de miles de seguidores",
       x = "Usuario de Twitter") +
  scale_y_continuous(breaks = seq(0, 600, by = 100)) +
  theme(plot.title = element_text(size = 12, face = "bold"),
        plot.subtitle = element_text(size = 9, face = "bold"),
        axis.text.y = element_text(colour = "gray35", size = 10),
        axis.title.x = element_text(face = "bold", vjust = -2),
        axis.text.x = element_text(colour = "gray35", size = 10, vjust = 0.5),
        axis.title.y = element_text(face = "bold", vjust = 3),
        plot.background = element_rect(fill = "gray89", colour = "black"))

# Five non-verified rows with the most followers, printed as a tibble grouped
# by user.
tweets_popularidad_usuarios %>%
  # `verified` is logical: negate it rather than comparing with "FALSE".
  filter(!verified) %>%
  group_by(screen_name) %>%
  unique() %>%
  arrange(desc(followers_count)) %>%
  head(5)

## # A tibble: 5 x 3
## # Groups:   screen_name [5]
##   screen_name     followers_count verified
##   <chr>                     <int> <lgl>   
## 1 DefendAssange            580547 FALSE   
## 2 guardianworld            473528 FALSE   
## 3 CryptoCred               319479 FALSE   
## 4 thomaspower              299966 FALSE   
## 5 JonathanPieNews          299626 FALSE

Lo que se puede ver es que, de las 10 cuentas más populares hablando de economía (que no están verificadas), hay 3 que son cuentas especializadas en criptomonedas: CryptoCred, thomaspower y Block_Flash, lo que nuevamente muestra la creciente importancia de las criptomonedas en la conversación económica de Twitter.

c. ¿En qué momento del día se realiza la mayor cantidad de tweets? Graficar.

# Time series of the number of tweets in 30-minute bins, in London local time.
# "Europe/London" is the valid time zone name for the UK; the previous value
# "Europe/England" is not a recognised zone.
ts_plot(tweets, by = "30 minutes", tz = "Europe/London") +
  labs(title = "Evolución de tweets hablando sobre economía",
       x = "Hora",
       y = "Cantidad de tweets")

El 21 de junio fue el día con más tweets

# Tweets created on day 21 (London local time).
# created_at is converted to London clock time and then relabelled as UTC, so
# a later plot that assumes UTC shows London hours. This is the same result
# the previous format()/ymd_hms() string round trip produced.
# NOTE(review): assumes created_at is a POSIXct column - confirm.
tweets_junio21 <- tweets %>%
  mutate(
    created_at = force_tz(with_tz(created_at, tzone = "Europe/London"), tzone = "UTC"),
    # Day of the month as a number, so it is compared with 21 without
    # relying on character-to-number coercion.
    dia = day(created_at)
  ) %>%
  filter(dia == 21)



# Time series of the 21 June tweets in 10-minute bins.
tweets_junio21 %>%
  ts_plot(by = "10 minutes") +
  labs(
    x = "Hora",
    y = "Cantidad de tweets",
    title = "Evolución de tweets hablando sobre economía"
  )

Y, analizando el 21 de junio, a media mañana se produjo la mayor cantidad de tweets, especialmente entre las 9am y las 12pm

d. Aislando los tweets que poseen coordenadas geográficas (lat y long), crear al menos 1 mapa que muestre posición de los tweets y cantidad de seguidores del usuario que tuitea.

# Add lat/lng columns derived from the listed coordinate fields.
tweets <- tweets %>%
  lat_lng(coords = c("coords_coords", "bbox_coords", "geo_coords"))

# Keep only the tweets that have both coordinates.
tweets_geo <- tweets %>%
  filter(!(is.na(lat) | is.na(lng)))

# Base map tiles for the London bounding box.
# NOTE(review): Stamen tile hosting has changed over time - confirm that
# get_stamenmap() still works with the installed ggmap version.
mapa_londres <- get_stamenmap(
  bbox = bbox_londres,
  zoom = 10,
  maptype = "terrain-background"
)

# Density map of the geolocated tweets over London, with the sub-region and
# region outlines and labels for the central regions plus two outer ones.
ggmap(mapa_londres) +
  stat_density_2d(
    # Use the rows that have coordinates (tweets_geo) instead of all tweets,
    # so rows with missing lat/lng are not passed to the density estimate.
    # tweets_geo is still a plain data frame with lat/lng columns here.
    data = tweets_geo,
    # after_stat() replaces the deprecated stat() helper.
    aes(x = lng, y = lat, fill = after_stat(level)),
    geom = "polygon",
    alpha = 0.8
  ) +
  geom_sf(data = londres_subreg, fill = NA, alpha = 0.75, inherit.aes = FALSE, size = 1.5, color = "black") +
  geom_sf(data = londres_reg, fill = NA, inherit.aes = FALSE) +
  geom_label_repel(data = londres_reg %>% filter(sub_2011 == "Central"),
                   aes(label = name, geometry = geom), stat = "sf_coordinates",
                   min.segment.length = 0, inherit.aes = FALSE, size = 2) +
  geom_label_repel(data = londres_reg %>% filter(name %in% c("Kingston upon Thames", "Newham")),
                   aes(label = name, geometry = geom), stat = "sf_coordinates",
                   min.segment.length = 0, inherit.aes = FALSE, size = 2) +
  theme_light() +
  scale_fill_distiller(palette = "Spectral") +
  labs(title = "Cantidad de tweets hablando sobre economía",
       subtitle = "Londres - Junio 2021",
       caption = "Fuente: Elaboración propia en base a Twitter y LONDON DATASTORE") +
  theme(plot.title = element_text(size = 12, face = "bold"),
        plot.subtitle = element_text(size = 9, face = "bold"),
        axis.text.y = element_text(colour = "gray35", size = 6),
        axis.title.x = element_text(colour = "white"),
        axis.text.x = element_text(colour = "gray35", size = 6),
        axis.title.y = element_text(colour = "white"))

La mayor cantidad de tweets se observa en la región central de Londres y alrededores, lo que tiene bastante lógica, porque allí se concentra la actividad económica y financiera. Resaltan, un poco más alejados del centro, los municipios de Kingston upon Thames y Newham.

Por último hice un mapa interactivo con los tweets más relevantes de la zona central, para poder ver sobre qué se está hablando allí:

# Turn the geolocated tweets into an sf point layer (EPSG:4326).
tweets_geo <- st_as_sf(tweets_geo, coords = c("lng", "lat"), crs = 4326)

# Attach the sub-region attributes to each tweet and keep the ones whose
# sub-region is "Central".
tweets_geo_col <- tweets_geo %>%
  st_join(londres_subreg) %>%
  filter(sub_2011 == "Central")

# Interactive map of the ten central-London tweets with the highest
# engagement; each marker shows the tweet text in its popup.
leaflet(
  tweets_geo_col %>%
    # One desc() per sort key: desc() accepts a single vector, so passing
    # several columns to one call either drops all but the first (older
    # dplyr) or raises an error (dplyr >= 1.1).
    arrange(
      desc(favorite_count),
      desc(retweet_count),
      desc(quote_count),
      desc(reply_count)
    ) %>%
    head(10)
) %>%
  addTiles() %>%
  addAwesomeMarkers(popup = ~text, icon = awesomeIcons(icon = "twitter", library = "fa", iconColor = "black", markerColor = "blue"))

Nuevamente se puede ver la relevancia del Bitcoin, ya que 2 de los 5 tweets que aparecen lo mencionan.

Ciencia de Datos para Ciudades

Federico Kisza

25/5/2021