## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Open the Mongo collection holding the COVID-19 tweets and pull every
# document (empty query) into a data frame.
# NOTE(review): the variable `all` shadows base::all() by name; function
# calls still resolve to the base function.
tweets <- mongo("tweets_mongo_covid19", url = params$con)
all <- tweets$find(query = '{}')
df <- data.frame(all)

# Subsets of tweets whose scalar location-related field is not NA.
# NOTE(review): only NA is excluded, so empty strings in `location` are
# still counted as present.
lat <- df[which(!is.na(df$lat)), ]
country <- df[which(!is.na(df$country)), ]
location <- df[which(!is.na(df$location)), ]

# Subsets of tweets whose coordinate list-column entry contains no NA.
with_coords <- filter(
  df,
  map_lgl(coords_coords, function(v) !(NA %in% v))
)
with_geo <- filter(
  df,
  map_lgl(geo_coords, function(v) !(NA %in% v))
)
with_bbox <- filter(
  df,
  map_lgl(bbox_coords, function(v) !(NA %in% v))
)

# One row per location-related field with the number of tweets that carry it.
amounts <- data.frame(
  column = c(
    "lat / long", "country", "location",
    "coords_coords", "geo_coords", "bbox_coords"
  ),
  count = vapply(
    list(lat, country, location, with_coords, with_geo, with_bbox),
    nrow,
    integer(1)
  )
)

# Bar chart of the absolute counts, bars ordered from most to least frequent.
ggplot(amounts, aes(x = reorder(column, -count), y = count)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    title = "Cantidad de Tweets con campos de location (absolutos)",
    x = "Columna",
    y = "Tweets"
  )

# Share of all tweets that carry each location-related field.
# `rel` has the same columns as `amounts`, with `count` rescaled to a
# fraction of the total number of tweets.
total <- nrow(df)
rel <- transform(amounts, count = count / total)
ggplot(
  data = rel,
  aes(x = reorder(column, -count), y = count)
) +
  geom_bar(stat = "identity", fill = "steelblue") +
  xlab("Columna") +
  # The bars are fractions of `total`, not tweet counts, so label them as such.
  ylab("Proporción de Tweets") +
  labs(title = "Cantidad de Tweets con campos de location (relativos)")

Location es un campo de texto libre, así que hay que ver si contiene información con sentido o no.

# Peek at the first six raw values of the free-text location field.
head(df$location, n = 6L)
## [1] "Los Mochis, Sinaloa" ""                    "Peru"               
## [4] NA                    NA                    "Mexico"

Vamos a utilizar la librería countrycode para hacer una normalización muy ingenua: intenta convertir cada valor de location en un nombre de país y devuelve NA cuando no lo reconoce.

# Map each free-text location to a country name; unmatched values become NA.
countries <- df$location %>% countryname()
head(countries, n = 6L)
## [1] NA       NA       "Peru"   NA       NA       "Mexico"

Veamos la cantidad de tweets cuyo location pudo normalizarse a un país y qué porcentaje del total representan.

# Keep only the locations that were resolved to a country.
valid <- countries[which(!is.na(countries))]
length(valid)
## [1] 8525
# Percentage of all tweets whose location was resolved to a country.
length(valid) / nrow(df) * 100
## [1] 29.49113

Y calculamos la frecuencia de cada país.

# Tweets per normalized country, then sorted from most to least frequent.
valid_freq <- data.frame(table(valid))
sorted_valid_freq <- valid_freq[with(valid_freq, order(-Freq)), ]
head(sorted_valid_freq, n = 6L)
##        valid Freq
## 5  Argentina 2001
## 19  Colombia 1000
## 49    Mexico  865
## 88 Venezuela  806
## 17     Chile  728
## 24   Ecuador  460
# The ten countries with the most tweets, plotted as absolute counts.
top_country <- head(sorted_valid_freq, n = 10)
ggplot(top_country, aes(x = reorder(valid, -Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(
    title = "Cantidad de Tweets por pais (absoluto)",
    x = "Pais",
    y = "Tweets"
  )

# Top-ten countries as a percentage of all tweets with a resolved country.
# The denominator is every resolved tweet (not just the top ten), so the bars
# do not necessarily add up to 100.
total_valid_country <- length(valid)
top_country_rel <- transform(top_country, Freq = (Freq / total_valid_country) * 100)
ggplot(
  data = top_country_rel,
  aes(x = reorder(valid, -Freq), y = Freq)
) +
  geom_bar(stat = "identity", fill = "steelblue") +
  xlab("Pais") +
  # `Freq` holds percentages here, not tweet counts, so label the unit.
  ylab("Tweets (%)") +
  labs(title = "Cantidad de Tweets por pais (relativo)")