library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.1
## -- Attaching packages ------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.0 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.1
## Warning: package 'tidyr' was built under R version 3.6.1
## Warning: package 'readr' was built under R version 3.6.1
## Warning: package 'purrr' was built under R version 3.6.1
## Warning: package 'dplyr' was built under R version 3.6.1
## Warning: package 'stringr' was built under R version 3.6.1
## Warning: package 'forcats' was built under R version 3.6.1
## -- Conflicts ---------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(rtweet)
## Warning: package 'rtweet' was built under R version 3.6.1
##
## Attaching package: 'rtweet'
## The following object is masked from 'package:purrr':
##
## flatten
library(sf)
## Warning: package 'sf' was built under R version 3.6.1
## Linking to GEOS 3.6.1, GDAL 2.2.3, PROJ 4.9.3
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.6.1
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(leaflet)
## Warning: package 'leaflet' was built under R version 3.6.1
Para poder descargar datos de Twitter debemos pedir autorización a Twitter desde RStudio, usando el nombre de la aplicación y las claves otorgadas. Luego buscamos tweets por palabras de interés con la ubicación de la ciudad elegida. (En este caso vamos a trabajar con el archivo csv de mi grupo, por la demora en obtener la autorización de Twitter).
# Credentials of the Twitter app used for this analysis. The values below are
# placeholders: replace them with the app name and keys issued by Twitter.
appname <- "RTWEET"
consumer_key <- "caracteres_1"
consumer_secret <- "caracteres_2"

# Build the token from the app name and its consumer key/secret.
twitter_token <- create_token(
  app = appname,
  consumer_key = consumer_key,
  consumer_secret = consumer_secret
)
Buscamos las palabras de interés relacionadas al transporte con las coordenadas de Manhattan y en un radio de 30 millas.
# Search tweets that mention any of the transport keywords, limited to a
# 30-mile radius around the Manhattan coordinates (geocode is given as
# "latitude,longitude,radius"). Retweets are excluded; up to 100000 tweets are
# requested, retrying when the rate limit is hit.
tweets_NYC <- search_tweets(
  q = "subway OR bicycle OR rail OR taxi OR helicopter",
  geocode = "40.7834282,-73.9662476,30mi",
  include_rts = FALSE,
  n = 100000,
  retryonratelimit = TRUE
)
Extraemos de los datos descargados las coordenadas de los tweets para generar mapas con su ubicación. Para ello extraemos la latitud y la longitud, las guardamos en sus columnas correspondientes, las sumamos al dataset y posteriormente descartamos los campos en formato lista.
# Convert a list-column of coordinate pairs into a two-column table.
#
# campo_coordenadas: list whose elements hold the longitude in position 1 and
#   the latitude in position 2.
# Returns one row per element, with columns `lon` and `lat`.
coordenadas <- function(campo_coordenadas) {
  # Helper: turn a single coordinate pair into a one-row table.
  extraer_coordenadas <- function(lista_coords) {
    # tibble() replaces the deprecated data_frame().
    tibble(lon = lista_coords[1], lat = lista_coords[2])
  }

  map_df(campo_coordenadas, extraer_coordenadas)
}

# Append the lon/lat columns to the tweets, then drop the coordinate columns
# that are stored as lists.
tweets_NYC <- tweets_NYC %>%
  cbind(coordenadas(tweets_NYC$coords_coords)) %>%
  select(-geo_coords, -coords_coords, -bbox_coords)
Guardamos el dataset en formato csv y luego lo cargamos desde el archivo.
# Load the tweets previously saved as CSV.
# encoding = "UTF-8" marks the text as UTF-8; without it, characters such as
# the typographic apostrophe are printed as byte escapes (e.g. "â\200\231").
# stringsAsFactors = FALSE keeps text columns as character vectors.
tweets_NYTr <- read.csv(
  "E:/Documentos/Ciencia de Datos II/Tw/tweets_NYC.csv",
  encoding = "UTF-8",
  stringsAsFactors = FALSE
)

# Preview the user-level fields of the first rows.
users_data(tweets_NYTr) %>% head()
## user_id screen_name name location
## 1 x795789 carynrose Caryn Rose New York, NY
## 2 x788158375980859393 __Naranja naranja means orange Bronx, NY
## 3 x14871302 BriHReed Brian Reed New York, NY
## 4 x21625936 DavidFBrand David Brand Queens
## 5 x21625936 DavidFBrand David Brand Queens
## 6 x21625936 DavidFBrand David Brand Queens
## description
## 1 Music writer, crabby old punk rock lady. Clips, books & newsletter @ link. WHY PATTI SMITH MATTERS tk fm @utexaspress. She / Her
## 2 spread love
## 3 Senior Producer @thisamerlife. Host of @stownpodcast.
## 4 Hype Managing Editor @QueensEagle + Licensed Social Worker || Queens, Justice/Reform, Mets || Words in @xanga @livejournal @angelfire @geocities || #SoccerInNYC
## 5 Hype Managing Editor @QueensEagle + Licensed Social Worker || Queens, Justice/Reform, Mets || Words in @xanga @livejournal @angelfire @geocities || #SoccerInNYC
## 6 Hype Managing Editor @QueensEagle + Licensed Social Worker || Queens, Justice/Reform, Mets || Words in @xanga @livejournal @angelfire @geocities || #SoccerInNYC
## url protected followers_count friends_count
## 1 https://t.co/fPuRWdkofG FALSE 7661 3282
## 2 FALSE 474 409
## 3 https://t.co/q3amcUcZV5 FALSE 34636 494
## 4 https://t.co/a8SWUEOKHQ FALSE 1786 989
## 5 https://t.co/a8SWUEOKHQ FALSE 1786 989
## 6 https://t.co/a8SWUEOKHQ FALSE 1786 989
## listed_count statuses_count favourites_count account_created_at
## 1 217 227103 136225 2007-02-26 19:22:24
## 2 0 6860 19574 2016-10-17 23:22:55
## 3 195 380 885 2008-05-22 18:04:04
## 4 40 4755 22423 2009-02-23 03:04:17
## 5 40 4755 22423 2009-02-23 03:04:17
## 6 40 4755 22423 2009-02-23 03:04:17
## verified profile_url profile_expanded_url
## 1 TRUE https://t.co/fPuRWdkofG https://manylink.co/@caryn_rose
## 2 FALSE
## 3 TRUE https://t.co/q3amcUcZV5 http://stownpodcast.org
## 4 FALSE https://t.co/a8SWUEOKHQ http://QueensEagle.com
## 5 FALSE https://t.co/a8SWUEOKHQ http://QueensEagle.com
## 6 FALSE https://t.co/a8SWUEOKHQ http://QueensEagle.com
## account_lang
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## profile_banner_url
## 1 https://pbs.twimg.com/profile_banners/795789/1540002040
## 2 https://pbs.twimg.com/profile_banners/788158375980859393/1546491222
## 3 https://pbs.twimg.com/profile_banners/14871302/1489457635
## 4 https://pbs.twimg.com/profile_banners/21625936/1547323019
## 5 https://pbs.twimg.com/profile_banners/21625936/1547323019
## 6 https://pbs.twimg.com/profile_banners/21625936/1547323019
## profile_background_url
## 1 http://abs.twimg.com/images/themes/theme15/bg.png
## 2
## 3 http://abs.twimg.com/images/themes/theme1/bg.png
## 4 http://abs.twimg.com/images/themes/theme8/bg.gif
## 5 http://abs.twimg.com/images/themes/theme8/bg.gif
## 6 http://abs.twimg.com/images/themes/theme8/bg.gif
## profile_image_url
## 1 http://pbs.twimg.com/profile_images/807997823597813760/Sc8l2N-S_normal.jpg
## 2 http://pbs.twimg.com/profile_images/1144299137871089666/NEvc9OYg_normal.jpg
## 3 http://pbs.twimg.com/profile_images/841471771912097797/-FjSncjp_normal.jpg
## 4 http://pbs.twimg.com/profile_images/1149156021060931584/MJV-lCsK_normal.jpg
## 5 http://pbs.twimg.com/profile_images/1149156021060931584/MJV-lCsK_normal.jpg
## 6 http://pbs.twimg.com/profile_images/1149156021060931584/MJV-lCsK_normal.jpg
De los datos obtenidos vamos a buscar cuáles son los mensajes con mayor repercusión.
# Histogram of retweet counts, keeping only rows that are not retweets.
ggplot(filter(tweets_NYTr, !is_retweet)) +
  # bins = 30 is the value stat_bin() uses when none is given; stating it
  # keeps the plot unchanged and avoids the "Pick better value" message.
  geom_histogram(aes(x = retweet_count), bins = 30, fill = "skyblue3") +
  labs(
    title = "Cantidad de Tweets con mayor repercusión",
    subtitle = "Palabras claves: subway, rail, bicycle and taxi - NYC",
    caption = "Fuente: Twitter"
  ) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
El gráfico muestra una distribución de “power law”, que significa la existencia de una gran masa de usuarios que cuentan con una popularidad mínima, o apenas un puñado de seguidores, y un número muy pequeño de usuarios que alcanza una cantidad exponencial de seguidores, cientos o miles de veces superior a la de la mayoría.
Visualizamos los usuarios con más seguidores, origen y contenido de los mensajes.
# Non-retweet rows with at least 1000 retweets, showing the author, reach,
# location and message text. The stray `5` that was being passed to filter()
# was removed: it is not a logical condition.
tweets_NYTr %>%
  filter(!is_retweet, retweet_count >= 1000) %>%
  select(screen_name, retweet_count, followers_count, location, text)
## screen_name retweet_count followers_count location
## 1 Amy_Siskind 6374 371072 New York
## 2 rafaelshimunov 1621 25932 queens, nyc
## 3 aravahshifra 1050 298 New York, NY
## 4 MMFlint 1149 6041980 Michigan/New York City
## text
## 1 SAVE THE DATE:\nWE THE PEOPLE MARCH in DC will take place tent. on Sept 21. We have permits, just need a 24 hour period for any kinks. We will be launching a website with more info in the next few days, including bus/carpooling and solidarity marches. Stay tuned! #WeThePeopleMarch
## 2 There are so many #JewsAgainstICE arrested at Amazon here that the NYPD took over an MTA bus #NeveragainMeans https://t.co/LJBBPXPdev
## 3 Video of the first arrests at #JewsAgainstICE #NeverAgainIsNow in Midtown (before the bus showed up)\nSo proud of the way this community shows up\n\nHeard an estimate of 40 arrests, not sure if thereâ\200\231s an official number https://t.co/muxdpspQxm
## 4 As the day ends, I want to say that Presidents can be sent packing. Theyâ\200\231re not kings. Theyâ\200\231re our SERVANT. 45yrs ago today Nixon boarded a helicopter on the White House lawn & left for good. Aug 9 should be a national holiday to remind Americans that we the people hold the power https://t.co/q3UAPpeGGI
Buscando conocer el momento del día en el cual se realiza la mayor cantidad de tweets, hacemos un gráfico que presenta los picos de actividad durante el día.
# Time series of the number of tweets, aggregated by hour.
ts_plot(tweets_NYTr, "hours") +
  labs(
    title = "Cantidad de Tweets por hora",
    subtitle = "Palabras claves: Subway, rail, bicycle and taxi - NYC",
    caption = "Fuente: Twitter",
    x = "Fecha",
    y = "Cantidad de tweets"
  ) +
  theme_minimal()
Vamos a visualizar a través de un gráfico cómo se distribuye la popularidad de los usuarios y aquellos que tienen más seguidores.
# Penalize scientific notation so large follower counts print as plain digits.
options(scipen = 20)

# Histogram of follower counts across all rows of the dataset.
ggplot(tweets_NYTr) +
  # bins = 30 is the value stat_bin() uses when none is given; stating it
  # keeps the plot unchanged and avoids the "Pick better value" message.
  geom_histogram(aes(x = followers_count), bins = 30, fill = "skyblue3") +
  labs(
    title = "Popularidad de los usuarios",
    subtitle = "Palabras claves: subway, rail, bicycle y taxi - NYC",
    caption = "Fuente: Twitter"
  ) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Al igual que el primer histograma, el actual muestra una distribución de “power law”, donde unos pocos usuarios concentran muchos seguidores.
Vamos a usar otro modo de conocer la información que buscamos.
# Tweets written by the accounts with the most followers, largest first.
# Only the columns of interest are kept up front; top_n() then retains the
# rows holding the five highest follower counts (tied rows are all kept, so
# more than five rows can be returned).
tweets_NYTr %>%
  select(screen_name, followers_count, location, text) %>%
  top_n(n = 5, wt = followers_count) %>%
  arrange(desc(followers_count))
## screen_name followers_count location
## 1 nytimes 43902439 New York City
## 2 nytimes 43902439 New York City
## 3 nytimes 43902439 New York City
## 4 nytimes 43902439 New York City
## 5 WSJ 16805450 New York, NY
## 6 WSJ 16805450 New York, NY
## 7 WSJ 16805450 New York, NY
## text
## 1 Italy's Parliament rejected motions to stop construction of a high-speed rail link between Turin and Lyon, France. Some officials called the vote a test of the governing coalition, but a political analyst noted, "Everyone gets something.â\200\235\n https://t.co/I8FmqCqx1T
## 2 The protest came after a night in which the police fired tear gas inside a subway station and charged at protesters on an escalator in another station. Many protesters were angry that a female demonstrator was hit by a projectile in her eye. https://t.co/WDnH98bc9J
## 3 Uber, craving growth, is looking to public transit for riders and revenue. Cities arenâ\200\231t sure whether to welcome it. https://t.co/aGfslLKUE4
## 4 The ambitious but contentious plan to improve bus traffic by banning nearly all cars on one of New York Cityâ\200\231s most congested routes was supposed to start this morning. But for the second time in 2 months, a court blocked the rollout of the plan. https://t.co/xDVc5pWwtl
## 5 Hong Kong police have whacked protesters with billy clubs and fired tear gas into a subway station following the return of a hard-line veteran officer to the force https://t.co/RON0Qxrnac
## 6 Police have whacked protesters with billy clubs and fired tear gas into a subway station following the return of a hard-line veteran officer to the force https://t.co/O8L5fkLjOw
## 7 Officials are trying to identify a woman who walked away after being saved from being hit by a subway train https://t.co/X84PsK9dmk
Finalmente, vamos a mostrar en un mapa la posición de los tweets y la cantidad de seguidores de los twitteros. Vamos a filtrar las observaciones que no tengan información de coordenadas.
# Keep only the tweets that carry both coordinates: a row is dropped when
# either its latitude or its longitude is missing.
tweets_NYT_geo <- tweets_NYTr %>%
  filter(!(is.na(lat) | is.na(lon)))

# Number of geolocated tweets that remain.
nrow(tweets_NYT_geo)
## [1] 408
# Bounding box of the geolocated tweets, ordered as
# (min lon, min lat, max lon, max lat).
bbox <- with(
  tweets_NYT_geo,
  c(min(lon), min(lat), max(lon), max(lat))
)
Descargamos un mapa-base para visualizar los datos en escala de grises.
# Download the "toner-lite" Stamen basemap covering the bounding box.
mapa_NY <- get_stamenmap(
  bbox = bbox,
  maptype = "toner-lite"
)
## Source : http://tile.stamen.com/toner-lite/10/300/383.png
## Source : http://tile.stamen.com/toner-lite/10/301/383.png
## Source : http://tile.stamen.com/toner-lite/10/302/383.png
## Source : http://tile.stamen.com/toner-lite/10/303/383.png
## Source : http://tile.stamen.com/toner-lite/10/300/384.png
## Source : http://tile.stamen.com/toner-lite/10/301/384.png
## Source : http://tile.stamen.com/toner-lite/10/302/384.png
## Source : http://tile.stamen.com/toner-lite/10/303/384.png
## Source : http://tile.stamen.com/toner-lite/10/300/385.png
## Source : http://tile.stamen.com/toner-lite/10/301/385.png
## Source : http://tile.stamen.com/toner-lite/10/302/385.png
## Source : http://tile.stamen.com/toner-lite/10/303/385.png
Posicionamos en un mapa la ubicación de los usuarios con la mayor cantidad de seguidores.
# Static map of the geolocated tweets drawn over the Stamen basemap:
# point colour encodes the author's follower count and point size encodes
# the tweet's retweet count.
ggmap(mapa_NY) +
  geom_point(
    data = tweets_NYT_geo,
    aes(x = lon, y = lat, color = followers_count, size = retweet_count),
    alpha = .5
  ) +
  scale_color_distiller(palette = "Spectral") +
  labs(
    title = "Posición de Tweets y Cantidad de usuarios",
    subtitle = "Palabras claves: subway, rail, bicycle and taxi - NYC",
    # The basemap is fetched with get_stamenmap(), so Stamen is credited.
    caption = "Fuente: Datos de Twitter y Stamen Maps"
  ) +
  theme_void()
Para generar una visualización interactiva de los datos utilizamos los mapas interactivos con el paquete leaflet, con la posibilidad de cambiar el nivel de zoom para observar los datos.
Primero definimos la paleta de colores que vamos a usar.
# Continuous colour scale ("inferno") spanning the follower counts of the
# geolocated tweets; used to colour the interactive map markers.
paleta <- colorNumeric(
  domain = tweets_NYT_geo[["followers_count"]],
  palette = "inferno"
)
# Interactive map of the geolocated tweets: marker radius follows the
# retweet count, colour follows the author's follower count, and clicking a
# marker shows the tweet text.
leaflet(tweets_NYT_geo) %>%
  addTiles() %>%
  addCircleMarkers(
    # Coordinates are given explicitly so leaflet does not have to guess
    # which columns hold them.
    lng = ~lon,
    lat = ~lat,
    radius = ~retweet_count,
    popup = ~text,
    color = ~paleta(followers_count)
  ) %>%
  addLegend(title = "Seguidores", pal = paleta, values = ~followers_count)
## Assuming "lon" and "lat" are longitude and latitude, respectively