#TP3 Capturando y explorando datos de Twitter

Antes de comenzar el TP3 procedemos a instalar los paquetes necesarios para su análisis:

#install.packages("rtweet")
library(rtweet)
## Warning: package 'rtweet' was built under R version 3.6.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'readr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'stringr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()  masks stats::filter()
## x purrr::flatten() masks rtweet::flatten()
## x dplyr::lag()     masks stats::lag()

Luego creamos el token de Twitter a partir de las claves provistas por la cuenta de desarrollador de Twitter, tramitada en la página de la aplicación:

# Build the rtweet OAuth token from credentials held in environment variables
# (for example, set in ~/.Renviron) so that no secret is stored in the source
# or in the rendered report.
# SECURITY: the keys that used to be hard-coded here were published together
# with this document; they must be revoked and regenerated in the Twitter
# developer portal.
twitter_credentials <- Sys.getenv(c(
  "TWITTER_CONSUMER_KEY",
  "TWITTER_CONSUMER_SECRET",
  "TWITTER_ACCESS_TOKEN",
  "TWITTER_ACCESS_SECRET"
))

# Sys.getenv() returns "" for unset variables; fail early with a clear message
# instead of sending empty credentials to the API.
missing_credentials <- names(twitter_credentials)[twitter_credentials == ""]
if (length(missing_credentials) > 0) {
  stop(
    "Missing Twitter credentials in the environment: ",
    paste(missing_credentials, collapse = ", "),
    call. = FALSE
  )
}

twitter_token <- create_token(
  app = "adanage",
  consumer_key = twitter_credentials[["TWITTER_CONSUMER_KEY"]],
  consumer_secret = twitter_credentials[["TWITTER_CONSUMER_SECRET"]],
  access_token = twitter_credentials[["TWITTER_ACCESS_TOKEN"]],
  access_secret = twitter_credentials[["TWITTER_ACCESS_SECRET"]]
)

Descargar tweets que se originen en los alrededores de la Ciudad con la que están trabajando, y explorar las columnas/variables que contiene.

# Fetch up to 3000 recent tweets matching the keyword, then preview the
# per-user columns of the result.
tweets <- search_tweets("independentzia", n = 3000)
head(users_data(tweets))
## # A tibble: 6 x 20
##   user_id screen_name name  location description url   protected followers_count
##   <chr>   <chr>       <chr> <chr>    <chr>       <chr> <lgl>               <int>
## 1 125232~ TomasBeraza "Sab~ "Baraka~ "Beldur be~ <NA>  FALSE                  33
## 2 144190~ Igor_XIV    "Igo~ "Bizkai~ "Taking a ~ <NA>  FALSE                 316
## 3 266300~ desdemireb~ "Des~ "PLENTZ~ "Aquí a ve~ http~ FALSE                 327
## 4 768128~ arkait_z    "Ark~ "Oreret~ "\U0001f59~ <NA>  FALSE                 289
## 5 869180~ mikel100779 "Mik~ ""       "Maialen e~ <NA>  FALSE                 251
## 6 769896~ nusky2011   "Ore~ "Gipuzk~ "Abertzale~ <NA>  FALSE                 143
## # ... with 12 more variables: friends_count <int>, listed_count <int>,
## #   statuses_count <int>, favourites_count <int>, account_created_at <dttm>,
## #   verified <lgl>, profile_url <chr>, profile_expanded_url <chr>,
## #   account_lang <lgl>, profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>
# List every column available in the downloaded tweets table.
colnames(tweets)
##  [1] "user_id"                 "status_id"              
##  [3] "created_at"              "screen_name"            
##  [5] "text"                    "source"                 
##  [7] "display_text_width"      "reply_to_status_id"     
##  [9] "reply_to_user_id"        "reply_to_screen_name"   
## [11] "is_quote"                "is_retweet"             
## [13] "favorite_count"          "retweet_count"          
## [15] "quote_count"             "reply_count"            
## [17] "hashtags"                "symbols"                
## [19] "urls_url"                "urls_t.co"              
## [21] "urls_expanded_url"       "media_url"              
## [23] "media_t.co"              "media_expanded_url"     
## [25] "media_type"              "ext_media_url"          
## [27] "ext_media_t.co"          "ext_media_expanded_url" 
## [29] "ext_media_type"          "mentions_user_id"       
## [31] "mentions_screen_name"    "lang"                   
## [33] "quoted_status_id"        "quoted_text"            
## [35] "quoted_created_at"       "quoted_source"          
## [37] "quoted_favorite_count"   "quoted_retweet_count"   
## [39] "quoted_user_id"          "quoted_screen_name"     
## [41] "quoted_name"             "quoted_followers_count" 
## [43] "quoted_friends_count"    "quoted_statuses_count"  
## [45] "quoted_location"         "quoted_description"     
## [47] "quoted_verified"         "retweet_status_id"      
## [49] "retweet_text"            "retweet_created_at"     
## [51] "retweet_source"          "retweet_favorite_count" 
## [53] "retweet_retweet_count"   "retweet_user_id"        
## [55] "retweet_screen_name"     "retweet_name"           
## [57] "retweet_followers_count" "retweet_friends_count"  
## [59] "retweet_statuses_count"  "retweet_location"       
## [61] "retweet_description"     "retweet_verified"       
## [63] "place_url"               "place_name"             
## [65] "place_full_name"         "place_type"             
## [67] "country"                 "country_code"           
## [69] "geo_coords"              "coords_coords"          
## [71] "bbox_coords"             "status_url"             
## [73] "name"                    "location"               
## [75] "description"             "url"                    
## [77] "protected"               "followers_count"        
## [79] "friends_count"           "listed_count"           
## [81] "statuses_count"          "favourites_count"       
## [83] "account_created_at"      "verified"               
## [85] "profile_url"             "profile_expanded_url"   
## [87] "account_lang"            "profile_banner_url"     
## [89] "profile_background_url"  "profile_image_url"

¿Cómo se distribuye la popularidad de los usuarios? ¿Quiénes son los 5 que más seguidores tienen? Graficar.

# Top 5 accounts by number of followers.
# `tweets` holds one row per tweet, so an account that tweeted several times
# would occupy more than one slot of the ranking; keep a single row per user
# before ranking.
# NOTE: the printed output that follows this chunk comes from a run without
# the de-duplication step; re-knit the document to refresh it.
tweets %>%
  distinct(user_id, .keep_all = TRUE) %>%
  top_n(5, followers_count) %>%
  arrange(desc(followers_count)) %>%
  select(screen_name, followers_count, location, text)
## # A tibble: 5 x 4
##   screen_name  followers_count location     text                                
##   <chr>                  <int> <chr>        <chr>                               
## 1 jpermach               20472 Euskal Herr~ "41 urte betetzen dira JARRAI sortu~
## 2 MartxeloDiaz           12443 Iruñea-Dono~ "41 urte betetzen dira JARRAI sortu~
## 3 matalaz                 7012 Basque Coun~ "@Herripublika Independentzia esate~
## 4 bakunin69               4974 Madrid, Esp~ "Frantziar eta espainiar estatu inp~
## 5 bakunin69               4974 Madrid, Esp~ "Espainiar eta Frantziar estatuek e~
# Prefer fixed notation over scientific notation on the axis labels.
options(scipen = 20)

# Distribution of follower counts across users, on a log10 x axis.
# `tweets` has one row per tweet; without de-duplicating by user, accounts
# that tweet a lot would be counted once per tweet and skew the histogram.
tweets %>%
  distinct(user_id, .keep_all = TRUE) %>%
  ggplot() +
  geom_histogram(aes(x = followers_count)) +
  labs(
    title = "Distribución de los usuarios mas populares en Twitter",
    subtitle = "Ciudad de Bilbao, España",
    caption = "Fuente: Database Desarrollo de Twitter"
  ) +
  theme_light() +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

¿Cuáles son los mensajes con más repercusión? ¿Qué dicen?

Histograma de los tweets más populares:

# Histogram of retweet counts, restricted to original tweets (retweets are
# dropped before plotting).
tweets %>%
  filter(!is_retweet) %>%
  ggplot(mapping = aes(x = retweet_count)) +
  geom_histogram() +
  labs(
    title = "Distribución de los tweets mas populares",
    subtitle = "Ciudad de Bilbao, España",
    caption = "Fuente: Database Desarrollo de Twitter"
  ) +
  theme_light()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Tweet más popular:

# Original (non-retweet) tweet(s) with the highest retweet count.
# The two filters stay separate on purpose: the maximum must be computed only
# over the rows that survive the first filter.
filter(tweets, !is_retweet) %>%
  filter(retweet_count == max(retweet_count)) %>%
  select(
    screen_name,
    retweet_count,
    followers_count,
    location,
    text
  )
## # A tibble: 1 x 5
##   screen_name   retweet_count followers_count location text                     
##   <chr>                 <int>           <int> <chr>    <chr>                    
## 1 Herriexisten~            25             311 ""       "Espainiar eta Frantziar~

Traducción del euskera al español: “Los estados español y francés deciden quién, cuándo, cómo y dónde podemos ir en nuestro país. ¿DÓNDE ESTÁN LAS ESTRUCTURAS ESTATALES AHORA QUE LAS ESTRUCTURAS ESTATALES SE VENDIERON? ¿Dónde está nuestro autogobierno? Es suficiente”

¿En qué momento del día se realiza la mayor cantidad de tweets? Graficar.

library(lubridate)
## Warning: package 'lubridate' was built under R version 3.6.3
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
# Make sure `created_at` is a date-time before building the time series.
# Parse only when the column is not already POSIXct: re-parsing an existing
# date-time forces a round-trip through character, which is redundant and can
# lose the time-of-day part for some values (NOTE(review): rtweet normally
# returns this column as POSIXct already -- confirm for the installed version).
if (!inherits(tweets$created_at, "POSIXct")) {
  tweets <- tweets %>%
    mutate(created_at = ymd_hms(created_at))
}

# Number of tweets aggregated by hour.
ts_plot(tweets, "hours") +
  labs(
    title = "Distribución diaria en la que se realiza mayor cantidad de Tweets",
    subtitle = "Ciudad de Bilbao, España",
    caption = "Fuente: Database Desarrollo de Twitter"
  ) +
  theme_light()

Procedencia de los usuarios:

# Bar chart of the 10 most frequent self-reported user locations (ties kept by
# top_n), ignoring missing and empty values. Bars are ordered by frequency and
# drawn horizontally.
tweets %>%
  filter(!is.na(location), location != "") %>%
  count(location) %>%
  top_n(10, n) %>%
  ggplot(mapping = aes(x = reorder(location, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Procedencia de los usuarios",
    subtitle = "Ciudad de Bilbao, España",
    caption = "Fuente: Database Desarrollo de Twitter",
    x = "ubicación",
    y = "cantidad"
  ) +
  theme_light()

Aislando los tweets que poseen coordenadas geográficas (lat y long), crear mapas que muestren posición de los tweets y cantidad de seguidores del usuario que tuitea.

Primero cargamos los paquetes necesarios para trabajar los datos y luego mapearlos:

#install.packages("osmdata")
#install.packages("leaflet")
library(osmdata)
## Warning: package 'osmdata' was built under R version 3.6.3
## Data (c) OpenStreetMap contributors, ODbL 1.0. https://www.openstreetmap.org/copyright
library(leaflet)
## Warning: package 'leaflet' was built under R version 3.6.3
library(tidyverse) 
library(sf) 
## Warning: package 'sf' was built under R version 3.6.3
## Linking to GEOS 3.6.1, GDAL 2.2.3, PROJ 4.9.3

Tomamos las coordenadas del Museo Guggenheim Bilbao como centro de la búsqueda. La palabra buscada en los tweets es "athletic", en referencia al club de fútbol local.

# Search for original tweets (no retweets) containing "athletic" within 30
# miles of the given latitude/longitude, waiting out API rate limits until the
# requested amount is reached or results run out.
tweets_fanaticos <- search_tweets(
  "athletic",
  n = 100000,
  include_rts = FALSE,
  retryonratelimit = TRUE,
  geocode = "43.268270,-2.933573,30mi"
)

Extraemos las coordenadas:

# Derive `lat`/`lng` columns from the raw coordinate columns, then drop the
# raw columns, which are no longer needed.
tweets_fanaticos <- tweets_fanaticos %>%
  lat_lng() %>%
  select(-c(geo_coords, coords_coords, bbox_coords))

# Keep only the tweets that carry a usable position and report how many remain.
tweets_fanaticos_geo <- filter(tweets_fanaticos, !is.na(lat) & !is.na(lng))
nrow(tweets_fanaticos_geo)
## [1] 56

Cargamos el paquete necesario:

library(ggmap)
## Warning: package 'ggmap' was built under R version 3.6.3
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
# Bounding box enclosing every geolocated tweet; printed for inspection.
bbox <- make_bbox(
  lon = tweets_fanaticos_geo[["lng"]],
  lat = tweets_fanaticos_geo[["lat"]]
)

print(bbox)
##      left    bottom     right       top 
## -3.054023 42.999960 -2.366474 43.368245
# Download the base map for the bounding box ("terrain" is the default style).
mapa_bilbao <- get_stamenmap(bbox = bbox, zoom = 11, maptype = "terrain")
## Source : http://tile.stamen.com/terrain/11/1006/749.png
## Source : http://tile.stamen.com/terrain/11/1007/749.png
## Source : http://tile.stamen.com/terrain/11/1008/749.png
## Source : http://tile.stamen.com/terrain/11/1009/749.png
## Source : http://tile.stamen.com/terrain/11/1010/749.png
## Source : http://tile.stamen.com/terrain/11/1006/750.png
## Source : http://tile.stamen.com/terrain/11/1007/750.png
## Source : http://tile.stamen.com/terrain/11/1008/750.png
## Source : http://tile.stamen.com/terrain/11/1009/750.png
## Source : http://tile.stamen.com/terrain/11/1010/750.png
## Source : http://tile.stamen.com/terrain/11/1006/751.png
## Source : http://tile.stamen.com/terrain/11/1007/751.png
## Source : http://tile.stamen.com/terrain/11/1008/751.png
## Source : http://tile.stamen.com/terrain/11/1009/751.png
## Source : http://tile.stamen.com/terrain/11/1010/751.png
## Source : http://tile.stamen.com/terrain/11/1006/752.png
## Source : http://tile.stamen.com/terrain/11/1007/752.png
## Source : http://tile.stamen.com/terrain/11/1008/752.png
## Source : http://tile.stamen.com/terrain/11/1009/752.png
## Source : http://tile.stamen.com/terrain/11/1010/752.png
# Draw the terrain base map.
ggmap(ggmap = mapa_bilbao)

# Download a lighter, low-contrast base map that works better under data
# points. Note the capital "B": this is a different object from `mapa_bilbao`.
mapa_Bilbao <- get_stamenmap(bbox = bbox, zoom = 11, maptype = "toner-lite")
## Source : http://tile.stamen.com/toner-lite/11/1006/749.png
## Source : http://tile.stamen.com/toner-lite/11/1007/749.png
## Source : http://tile.stamen.com/toner-lite/11/1008/749.png
## Source : http://tile.stamen.com/toner-lite/11/1009/749.png
## Source : http://tile.stamen.com/toner-lite/11/1010/749.png
## Source : http://tile.stamen.com/toner-lite/11/1006/750.png
## Source : http://tile.stamen.com/toner-lite/11/1007/750.png
## Source : http://tile.stamen.com/toner-lite/11/1008/750.png
## Source : http://tile.stamen.com/toner-lite/11/1009/750.png
## Source : http://tile.stamen.com/toner-lite/11/1010/750.png
## Source : http://tile.stamen.com/toner-lite/11/1006/751.png
## Source : http://tile.stamen.com/toner-lite/11/1007/751.png
## Source : http://tile.stamen.com/toner-lite/11/1008/751.png
## Source : http://tile.stamen.com/toner-lite/11/1009/751.png
## Source : http://tile.stamen.com/toner-lite/11/1010/751.png
## Source : http://tile.stamen.com/toner-lite/11/1006/752.png
## Source : http://tile.stamen.com/toner-lite/11/1007/752.png
## Source : http://tile.stamen.com/toner-lite/11/1008/752.png
## Source : http://tile.stamen.com/toner-lite/11/1009/752.png
## Source : http://tile.stamen.com/toner-lite/11/1010/752.png
# Draw the light base map on its own.
ggmap(ggmap = mapa_Bilbao)

# Same base map with one semi-transparent point per geolocated tweet.
ggmap(ggmap = mapa_Bilbao) +
  geom_point(
    data = tweets_fanaticos_geo,
    mapping = aes(x = lng, y = lat),
    color = "salmon",
    size = 4,
    alpha = 0.5
  )

# Sort by follower count so that points of the most-followed users are drawn
# last, i.e. on top of the others.
tweets_fanaticos_geo <- tweets_fanaticos_geo %>%
  arrange(followers_count)

# Map of tweet positions coloured by the author's follower count.
ggmap(ggmap = mapa_Bilbao) +
  geom_point(
    data = tweets_fanaticos_geo,
    mapping = aes(x = lng, y = lat, color = followers_count),
    size = 5,
    alpha = 0.5
  ) +
  labs(
    title = "Tweet según popularidad del usuario",
    subtitle = "Ciudad de Bilbao, España",
    caption = "Fuente: Database Desarrollo de Twitter",
    x = "Longitud",
    y = "Latitud"
  ) +
  scale_color_distiller(palette = "Spectral")

# Map of tweet positions: colour encodes the author's follower count and point
# size encodes the number of retweets.
ggmap(ggmap = mapa_Bilbao) +
  geom_point(
    data = tweets_fanaticos_geo,
    mapping = aes(
      x = lng,
      y = lat,
      size = retweet_count,
      color = followers_count
    ),
    alpha = 0.5
  ) +
  labs(
    title = "Tweet según popularidad del usuario y cantidad de retweets",
    subtitle = "Ciudad de Bilbao, España",
    caption = "Fuente: Database Desarrollo de Twitter",
    x = "Longitud",
    y = "Latitud"
  ) +
  scale_color_distiller(palette = "Spectral")

Realizamos mapas interactivos con los tweets descargados

# Continuous colour scale over the follower counts of the geolocated tweets.
paleta <- colorNumeric(
  palette = "viridis",
  domain = tweets_fanaticos_geo$followers_count
)

# Interactive map: one circle per tweet, radius given by retweet count, colour
# by follower count, tweet text shown in the popup.
# Only the CartoDB Positron base layer is added: the default OSM layer that
# was added before it was entirely covered by Positron, so it only caused
# extra tile downloads.
leaflet(tweets_fanaticos_geo) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    radius = ~retweet_count,
    popup = ~text,
    color = ~paleta(followers_count)
  ) %>%
  addLegend(title = "seguidores", pal = paleta, values = ~followers_count)
## Assuming "lng" and "lat" are longitude and latitude, respectively

## damos por finalizado el TP3.-