##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Open the "tweets_mongo_covid19" collection; the connection URL is taken from
# the document parameter `con`.
tweets <- mongo("tweets_mongo_covid19", url = params$con)

# Query with an empty filter document and keep the result as a data frame.
# NOTE(review): `all` shadows base::all() as a variable name; function calls
# such as all(x) still resolve to the base function.
all <- tweets$find(query = '{}')
df <- data.frame(all)

# Rows where the scalar location-related columns are populated.
# NOTE(review): only `lat` is checked for the "lat / long" category.
lat <- df[!is.na(df$lat), ]
country <- df[!is.na(df$country), ]
# `location` is free text and may hold an empty string instead of NA, so
# blank values are treated as missing too.
location <- df[!is.na(df$location) & nzchar(trimws(df$location)), ]

# Rows where the element of each coordinate column contains no NA.
# `NA %in% .x` is already a single TRUE/FALSE, so no all() wrapper is needed.
with_coords <- df %>%
  filter(map_lgl(coords_coords, ~ !(NA %in% .x)))
with_geo <- df %>%
  filter(map_lgl(geo_coords, ~ !(NA %in% .x)))
with_bbox <- df %>%
  filter(map_lgl(bbox_coords, ~ !(NA %in% .x)))
# One row per location-related column with the number of tweets that have it.
amounts <- data.frame(
  column = c(
    "lat / long", "country", "location",
    "coords_coords", "geo_coords", "bbox_coords"
  ),
  count = vapply(
    list(lat, country, location, with_coords, with_geo, with_bbox),
    nrow,
    integer(1)
  )
)
# Bar chart of the absolute counts, bars ordered from largest to smallest.
ggplot(amounts, aes(x = reorder(column, -count), y = count)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Cantidad de Tweets con campos de location (absolutos)",
    x = "Columna",
    y = "Tweets"
  )
# Express each count as a fraction of all tweets in `df`.
total <- nrow(df)
rel <- amounts
rel$count <- rel$count / total

# Same bar chart as the absolute one, drawn from the relative counts.
ggplot(rel, aes(x = reorder(column, -count), y = count)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Cantidad de Tweets con campos de location (relativos)",
    x = "Columna",
    y = "Tweets"
  )
Location es un campo libre, así que hay que ver si contiene información con sentido o no.
# Show the first few raw values of the free-text location column.
head(df[["location"]])
## [1] "Los Mochis, Sinaloa" "" "Peru"
## [4] NA NA "Mexico"
Vamos a utilizar la librería countrycode para hacer una normalización muy ingenua.
# Map each free-text location to a country name; values that cannot be
# matched come back as NA (see the printed sample).
raw_location <- df[["location"]]
countries <- countryname(raw_location)
head(countries)
## [1] NA NA "Peru" NA NA "Mexico"
Vemos la cantidad de países normalizados.
# Keep only the locations that were resolved to a country, then count them.
is_resolved <- !is.na(countries)
valid <- countries[is_resolved]
length(valid)
## [1] 8525
# Percentage of all tweets whose location resolved to a country.
length(valid) / nrow(df) * 100
## [1] 29.49113
Y buscamos la frecuencia de cada uno
# Tabulate tweets per normalized country and sort by descending frequency.
# Row names are kept, so they still point at positions in `valid_freq`.
valid_freq <- data.frame(table(valid))
by_freq_desc <- order(-valid_freq[["Freq"]])
sorted_valid_freq <- valid_freq[by_freq_desc, ]
head(sorted_valid_freq)
## valid Freq
## 5 Argentina 2001
## 19 Colombia 1000
## 49 Mexico 865
## 88 Venezuela 806
## 17 Chile 728
## 24 Ecuador 460
# Ten most frequent countries, plotted as absolute tweet counts.
top_country <- head(sorted_valid_freq, n = 10)
ggplot(top_country, aes(x = reorder(valid, -Freq), y = Freq)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Cantidad de Tweets por pais (absoluto)",
    x = "Pais",
    y = "Tweets"
  )
# Same top-10 countries, as a percentage of all tweets with a resolved country.
total_valid_country <- length(valid)
top_country_rel <- top_country
top_country_rel$Freq <- top_country_rel$Freq / total_valid_country * 100
ggplot(top_country_rel, aes(x = reorder(valid, -Freq), y = Freq)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Cantidad de Tweets por pais (relativo)",
    x = "Pais",
    y = "Tweets"
  )