This data started in an excel document downloaded from Kaggle, I then

MONGODB connection information

Public Key:

mypsqeek

Private Key:

ad00fd45-9d82-4b50-89fe-e9eac52eeed3

library(tidyverse)
library(mongolite)
library(maps)
library(ggmap)
library(sf)
library(rnaturalearth)
library(countrycode)
library(ggrepel)
library(stringr)
library(DataExplorer)
library(skimr)
library(httpgd)
library(rmarkdown)
library(installr) ##Pandoc and knitting issues
connection_string = 'mongodb+srv://admin:320w49thst@dbquake.p12jh8r.mongodb.net/'
quake_collection = mongo(collection="607", db="QUAKEDATA", url=connection_string)

Pulling in the MONGODB data

quake_collection$iterate()$one()
## $`Date & Time`
## [1] "11/21/2023 17:43"
## 
## $Latitude
## [1] 31.592
## 
## $Longitude
## [1] -104.549
## 
## $Depth
## [1] 3
## 
## $Magnitude
## [1] 2.7
## 
## $Lands
## [1] "WESTERN"
## 
## $Country
## [1] "TEXAS"
quake_collection$find(limit = 3)
##        Date & Time Latitude Longitude Depth Magnitude   Lands   Country
## 1 11/21/2023 17:43   31.592  -104.549     3       2.7 WESTERN     TEXAS
## 2 11/21/2023 17:31  -24.200   -67.580   198       4.0  SALTA, ARGENTINA
## 3 11/21/2023 17:27   31.647  -104.017     5       2.9 WESTERN     TEXAS

Making my mongodb db into a dataframe in R

quake <- as_tibble(quake_collection$find())

Pulling in the poverty data from GitHub

pov <- read.csv("https://raw.githubusercontent.com/owid/poverty-data/main/datasets/pip_dataset.csv", header = T)
pov_cl <- pov %>% 
    select(country, gini, reporting_level, welfare_type, survey_year) %>% 
    filter(reporting_level == "national" & welfare_type == "income") %>% 
    rename("region"=country)

The poverty data is broken down via consumption and income. For the purposes I will be using income only.

unique(pov_cl$welfare_type)
## [1] "income"
mapdata <- map_data("world")
quake <- quake %>% 
rename("Lat"=2, "Long"=3)

Gini to the magnitude of data?

map_data("world") %>% 
ggplot() +
geom_polygon(aes(x=long, y=lat, group=group))

map_data("world") %>% 
ggplot() +
borders(fill = "gray")

map_data("world") %>% 
ggplot() +
geom_polygon(aes(x=long, y=lat, group=group), color = "gray85", fill = "gray80")+
geom_point(data = quake, 
            aes(x = Long, y = Lat), 
            color = "purple1", alpha = 0.5)+
scale_size_continuous(range = c(1,5))+ 
labs(size = "Magnitude")+
ggtitle("Earthquakes Worldwide in 2023")+
theme_minimal()

sa_countries <- c("Argentina", "Bolivia", "Brazil", "Chile", "Colombia", "Ecuador", "Guyana", "Paraguay", "Peru", "Suriname", "Uruguay", "Venezuela" )

map_data_sa <- map_data("world") %>% 
filter(region %in% sa_countries) %>% 
ggplot(aes(x = long, y = lat)) +
geom_polygon(aes(group=group, fill = region, color = "#838383"))+
theme_minimal()+
theme(legend.position = "none")

map_data_sa

labels_data_sa <-
    map_data("world") %>% 
    filter(region %in% sa_countries) %>% 
    group_by(region) %>% 
    summarise(long = mean(long), lat = mean(lat))
theme_xy_blank <- theme(
  axis.text.x = element_blank(),
  axis.text.y = element_blank(), 
  axis.ticks = element_blank(),
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_rect(fill = "#3b727c", color = "black")
)

Note, I tried isolating South American countries to see if I could look into continents further but I pulled back. I will leave the SA map generation in though.

sa_labeled_map <- map_data_sa +
  geom_text(data = labels_data_sa, 
  aes(label = region), 
  size = 3.5, hjust = 0.5, angle = -45)+
  theme_xy_blank  

sa_labeled_map

Moving away from mapping into statistical analysis

Changing the two columns of lands and Country into one (for matching purposes)

countries_world <- c("Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina Faso", "Burundi", "Cabo Verde", "Cambodia", "Cameroon", "Canada", "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo", "Costa Rica", "Croatia", "Cuba", "Cyprus", "Czechia", "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini", "Ethiopia", "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana", "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "Korea, North", "Korea, South", "Kosovo", "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg", "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar", "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Macedonia", "Norway", "Oman", "Pakistan", "Palau", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Romania", "Russia", "Rwanda", "Saint Kitts and Nevis", "Saint Lucia", "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa", "South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Sweden", "Switzerland", "Syria", "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City", "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe")

countries_world <- toupper(countries_world)
quake$Country <- paste(quake$Country, quake$Lands, sep = ", ")

This funtion runs through my quake dataset and should extract any column variables that match any country from my list.

find_first_match <- function(text, search_list) {
  match <- str_extract(text, paste(search_list, collapse = "|"))
  if (!is.na(match)) {
    return(match)
  } else {
    return(NA)
  }
}

Where I applied the function

quake$Country <- mapply(find_first_match, quake$Country, list(countries_world), SIMPLIFY = FALSE)

Cleaned data for grouping

quake_cleaned <- quake %>% 
  select(2, 3, 4, 5, 7)

Removing non-country matches

quake_cleaned <- quake_cleaned[!is.na(quake_cleaned$Country), , drop = FALSE]

This shows that in the matching process 125 of the 195 total countries are present within the data.

quake_cleaned$Country = as.character(quake_cleaned$Country)

highest_mag <- quake_cleaned %>% 
    select(everything()) %>%
    group_by(Country) %>%
    summarise(mean_mag = mean(Magnitude))

Using survey year 2016 because it gas the most data and preparing data for the join

gini<- pov_cl %>% 
  filter(survey_year == 2016) %>% 
  mutate(region = toupper(region)) %>% 
  distinct(region, .keep_all = TRUE) 

Now that the data transformation is out of the way I want to run some statistical analysis

gini_magnitude <- inner_join(gini, highest_mag, by = c("region"="Country"))
DataExplorer::plot_correlation(gini_magnitude)
## 1 features with more than 20 categories ignored!
## region: 41 categories
## Warning in cor(x = structure(list(gini = c(0.385656427056021,
## 0.336858105452475, : the standard deviation is zero
## Warning: Removed 18 rows containing missing values (`geom_text()`).

DataExplorer::plot_density(gini_magnitude)

DataExplorer::plot_histogram(gini_magnitude)

skim(gini_magnitude)
Data summary
Name gini_magnitude
Number of rows 41
Number of columns 6
_______________________
Column type frequency:
character 3
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
region 0 1 4 11 0 41 0
reporting_level 0 1 8 8 0 1 0
welfare_type 0 1 6 6 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
gini 0 1 0.36 0.07 0.25 0.31 0.33 0.39 0.53 ▅▇▃▂▂
survey_year 0 1 2016.00 0.00 2016.00 2016.00 2016.00 2016.00 2016.00 ▁▁▇▁▁
mean_mag 0 1 2.84 0.97 1.02 2.05 2.80 3.42 5.00 ▃▆▇▃▂

Frequency Plot Magnitude

gini_magnitude %>% 
  ggplot(aes(x = mean_mag))+
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Frequency Plot GINI

gini_magnitude %>% 
  ggplot(aes(x = gini))+
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Linear Regression

gini_mag_rg <- lm(gini ~ mean_mag, data = gini_magnitude)
summary(gini_mag_rg)
## 
## Call:
## lm(formula = gini ~ mean_mag, data = gini_magnitude)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.160068 -0.054429 -0.009499  0.037868  0.172978 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.28078    0.03472   8.087 7.25e-10 ***
## mean_mag     0.02619    0.01158   2.263   0.0293 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0711 on 39 degrees of freedom
## Multiple R-squared:  0.1161, Adjusted R-squared:  0.09339 
## F-statistic:  5.12 on 1 and 39 DF,  p-value: 0.02929
gini_magnitude %>% 
  ggplot(aes(x = mean_mag, y = gini, size = mean_mag, color = "palevioletred1", fill = "gray81"))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(
    x = "Mean Magnitude Across Countries",
    y = "GINI COEFFICIENT",
    title = "Inequality ~ Magnitude Scale"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: size
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

World earthquake map

map_data("world") %>% 
ggplot() +
geom_polygon(aes(x=long, y=lat, group=group), color = "gray85", fill = "gray80")+
geom_point(data = quake_cleaned, 
            aes(x = Long, y = Lat), 
            color = "purple1", alpha = 0.5)+
scale_size_continuous(range = c(1,5))+ 
labs(size = "Magnitude")+
ggtitle("Earthquakes Worldwide in 2023 (Only Countries)")+
theme_minimal()