Final Project Code 607

This data started in an excel document downloaded from Kaggle, I then

MONGODB connection information

Public Key:

mypsqeek

Private Key:

ad00fd45-9d82-4b50-89fe-e9eac52eeed3

library(tidyverse)
library(mongolite)
library(maps)
library(ggmap)
library(sf)
library(rnaturalearth)
library(countrycode)
library(ggrepel)
library(stringr)
library(DataExplorer)
library(skimr)
library(httpgd)
library(rmarkdown)
library(installr) ##Pandoc and knitting issues

connection_string = 'mongodb+srv://admin:320w49thst@dbquake.p12jh8r.mongodb.net/'
quake_collection = mongo(collection="607", db="QUAKEDATA", url=connection_string)

Pulling in the MONGODB data

quake_collection$iterate()$one()

## $`Date & Time`
## [1] "11/21/2023 17:43"
## 
## $Latitude
## [1] 31.592
## 
## $Longitude
## [1] -104.549
## 
## $Depth
## [1] 3
## 
## $Magnitude
## [1] 2.7
## 
## $Lands
## [1] "WESTERN"
## 
## $Country
## [1] "TEXAS"

quake_collection$find(limit = 3)

##        Date & Time Latitude Longitude Depth Magnitude   Lands   Country
## 1 11/21/2023 17:43   31.592  -104.549     3       2.7 WESTERN     TEXAS
## 2 11/21/2023 17:31  -24.200   -67.580   198       4.0  SALTA, ARGENTINA
## 3 11/21/2023 17:27   31.647  -104.017     5       2.9 WESTERN     TEXAS

Making my mongodb db into a dataframe in R

quake <- as_tibble(quake_collection$find())

Pulling in the poverty data from GitHub

pov <- read.csv("https://raw.githubusercontent.com/owid/poverty-data/main/datasets/pip_dataset.csv", header = T)

pov_cl <- pov %>% 
    select(country, gini, reporting_level, welfare_type, survey_year) %>% 
    filter(reporting_level == "national" & welfare_type == "income") %>% 
    rename("region"=country)

The poverty data is broken down via consumption and income. For the purposes I will be using income only.

unique(pov_cl$welfare_type)

## [1] "income"

mapdata <- map_data("world")

quake <- quake %>% 
rename("Lat"=2, "Long"=3)

Gini to the magnitude of data?

map_data("world") %>% 
ggplot() +
geom_polygon(aes(x=long, y=lat, group=group))

map_data("world") %>% 
ggplot() +
borders(fill = "gray")

map_data("world") %>% 
ggplot() +
geom_polygon(aes(x=long, y=lat, group=group), color = "gray85", fill = "gray80")+
geom_point(data = quake, 
            aes(x = Long, y = Lat), 
            color = "purple1", alpha = 0.5)+
scale_size_continuous(range = c(1,5))+ 
labs(size = "Magnitude")+
ggtitle("Earthquakes Worldwide in 2023")+
theme_minimal()

sa_countries <- c("Argentina", "Bolivia", "Brazil", "Chile", "Colombia", "Ecuador", "Guyana", "Paraguay", "Peru", "Suriname", "Uruguay", "Venezuela" )

map_data_sa <- map_data("world") %>% 
filter(region %in% sa_countries) %>% 
ggplot(aes(x = long, y = lat)) +
geom_polygon(aes(group=group, fill = region, color = "#838383"))+
theme_minimal()+
theme(legend.position = "none")

map_data_sa

labels_data_sa <-
    map_data("world") %>% 
    filter(region %in% sa_countries) %>% 
    group_by(region) %>% 
    summarise(long = mean(long), lat = mean(lat))

theme_xy_blank <- theme(
  axis.text.x = element_blank(),
  axis.text.y = element_blank(), 
  axis.ticks = element_blank(),
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_rect(fill = "#3b727c", color = "black")
)

Note, I tried isolating South American countries to see if I could look into continents further but I pulled back. I will leave the SA map generation in though.

sa_labeled_map <- map_data_sa +
  geom_text(data = labels_data_sa, 
  aes(label = region), 
  size = 3.5, hjust = 0.5, angle = -45)+
  theme_xy_blank  

sa_labeled_map

Moving away from mapping into statistical analysis

Changing the two columns of lands and Country into one (for matching purposes)

countries_world <- c("Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina Faso", "Burundi", "Cabo Verde", "Cambodia", "Cameroon", "Canada", "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo", "Costa Rica", "Croatia", "Cuba", "Cyprus", "Czechia", "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini", "Ethiopia", "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana", "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "Korea, North", "Korea, South", "Kosovo", "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg", "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar", "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Macedonia", "Norway", "Oman", "Pakistan", "Palau", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Romania", "Russia", "Rwanda", "Saint Kitts and Nevis", "Saint Lucia", "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa", "South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Sweden", "Switzerland", "Syria", "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City", "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe")

countries_world <- toupper(countries_world)
quake$Country <- paste(quake$Country, quake$Lands, sep = ", ")

This funtion runs through my quake dataset and should extract any column variables that match any country from my list.

find_first_match <- function(text, search_list) {
  match <- str_extract(text, paste(search_list, collapse = "|"))
  if (!is.na(match)) {
    return(match)
  } else {
    return(NA)
  }
}

Where I applied the function

quake$Country <- mapply(find_first_match, quake$Country, list(countries_world), SIMPLIFY = FALSE)

Cleaned data for grouping

quake_cleaned <- quake %>% 
  select(2, 3, 4, 5, 7)

Removing non-country matches

quake_cleaned <- quake_cleaned[!is.na(quake_cleaned$Country), , drop = FALSE]

This shows that in the matching process 125 of the 195 total countries are present within the data.

quake_cleaned$Country = as.character(quake_cleaned$Country)

highest_mag <- quake_cleaned %>% 
    select(everything()) %>%
    group_by(Country) %>%
    summarise(mean_mag = mean(Magnitude))

Using survey year 2016 because it gas the most data and preparing data for the join

gini<- pov_cl %>% 
  filter(survey_year == 2016) %>% 
  mutate(region = toupper(region)) %>% 
  distinct(region, .keep_all = TRUE)

Now that the data transformation is out of the way I want to run some statistical analysis

gini_magnitude <- inner_join(gini, highest_mag, by = c("region"="Country"))

DataExplorer::plot_correlation(gini_magnitude)

## 1 features with more than 20 categories ignored!
## region: 41 categories

## Warning in cor(x = structure(list(gini = c(0.385656427056021,
## 0.336858105452475, : the standard deviation is zero

## Warning: Removed 18 rows containing missing values (`geom_text()`).

DataExplorer::plot_density(gini_magnitude)

DataExplorer::plot_histogram(gini_magnitude)

skim(gini_magnitude)

Data summary
Name	gini_magnitude
Number of rows	41
Number of columns	6
_______________________
Column type frequency:
character	3
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
region	1	4	11	41
reporting_level	1	8	8	1
welfare_type	1	6	6	1

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
gini	1	0.36	0.07	0.25	0.31	0.33	0.39	0.53	▅▇▃▂▂
survey_year	1	2016.00	0.00	2016.00	2016.00	2016.00	2016.00	2016.00	▁▁▇▁▁
mean_mag	1	2.84	0.97	1.02	2.05	2.80	3.42	5.00	▃▆▇▃▂

Frequency Plot Magnitude

gini_magnitude %>% 
  ggplot(aes(x = mean_mag))+
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Frequency Plot GINI

gini_magnitude %>% 
  ggplot(aes(x = gini))+
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Linear Regression

gini_mag_rg <- lm(gini ~ mean_mag, data = gini_magnitude)

summary(gini_mag_rg)

## 
## Call:
## lm(formula = gini ~ mean_mag, data = gini_magnitude)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.160068 -0.054429 -0.009499  0.037868  0.172978 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.28078    0.03472   8.087 7.25e-10 ***
## mean_mag     0.02619    0.01158   2.263   0.0293 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0711 on 39 degrees of freedom
## Multiple R-squared:  0.1161, Adjusted R-squared:  0.09339 
## F-statistic:  5.12 on 1 and 39 DF,  p-value: 0.02929

gini_magnitude %>% 
  ggplot(aes(x = mean_mag, y = gini, size = mean_mag, color = "palevioletred1", fill = "gray81"))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(
    x = "Mean Magnitude Across Countries",
    y = "GINI COEFFICIENT",
    title = "Inequality ~ Magnitude Scale"
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `geom_smooth()` using formula = 'y ~ x'

## Warning: The following aesthetics were dropped during statistical transformation: size
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

World earthquake map

map_data("world") %>% 
ggplot() +
geom_polygon(aes(x=long, y=lat, group=group), color = "gray85", fill = "gray80")+
geom_point(data = quake_cleaned, 
            aes(x = Long, y = Lat), 
            color = "purple1", alpha = 0.5)+
scale_size_continuous(range = c(1,5))+ 
labs(size = "Magnitude")+
ggtitle("Earthquakes Worldwide in 2023 (Only Countries)")+
theme_minimal()

Final Project Code 607

Jonathan Burns

2023-12-13

This data started in an excel document downloaded from Kaggle, I then

MONGODB connection information

Public Key:

mypsqeek

Private Key:

ad00fd45-9d82-4b50-89fe-e9eac52eeed3

Pulling in the MONGODB data

Making my mongodb db into a dataframe in R

Pulling in the poverty data from GitHub

The poverty data is broken down via consumption and income. For the purposes I will be using income only.

Gini to the magnitude of data?

Note, I tried isolating South American countries to see if I could look into continents further but I pulled back. I will leave the SA map generation in though.

Moving away from mapping into statistical analysis

Changing the two columns of lands and Country into one (for matching purposes)

This funtion runs through my quake dataset and should extract any column variables that match any country from my list.

Where I applied the function

Cleaned data for grouping

Removing non-country matches

This shows that in the matching process 125 of the 195 total countries are present within the data.

Using survey year 2016 because it gas the most data and preparing data for the join

Now that the data transformation is out of the way I want to run some statistical analysis

Frequency Plot Magnitude

Frequency Plot GINI

Linear Regression

World earthquake map