Project 3: Overview

Over the last decade, birth rate trends have emerged as a significant topic amid global economic uncertainty, wealth inequality, climate change, urbanization, and an ever-changing geopolitical landscape. Should we be more concerned about overpopulation or a population plateau? Will the increasing costs of child-rearing deter people from having children altogether? These are critical questions we aim to explore in this project.

For this analysis, we focus on fertility rates across a carefully selected set of countries. Using data from the World Bank (births per 1,000 people and per woman), we will compare fertility trends with other key indicators such as GDP, death rate, and urbanization. The goal is to uncover interesting correlations that can contribute to the discussion on population dynamics and economic factors.

Countries:
Niger, Japan, US, China, Benin, Angola, Czech Republic, Italy, Egypt, Bangladesh, South Korea

Group Members, Collaboration Tools, Data Sources

Project Group:
Ahmed Hassan, Zach Rose, Amish Rasheed, Allie Wrubel

Collaboration Tools:
To facilitate communication and efficient teamwork, we will utilize several collaboration tools throughout the project. Slack will serve as our primary communication platform, allowing for real-time discussions and file sharing. Email for formal communication and sharing larger documents or updates. Google Colab for collaboratively developing and running R scripts. All files will be stored in a shared public repository on GitHub <https://github.com/awrubes/Data607_project3>.

Data Sources:
Datasets are sourced from the World Bank Group website <https://www.worldbank.org>

Birth Rate <https://github.com/awrubes/Data607_project3/tree/main/data/birth_rate>
Death Rate <https://github.com/awrubes/Data607_project3/tree/main/data/death_rate>
Fertility Replacement Rate <https://github.com/awrubes/Data607_project3/tree/main/data/fertility_replacement_rate>
GDP <https://github.com/awrubes/Data607_project3/tree/main/data/gdp>
Population <https://github.com/awrubes/Data607_project3/tree/main/data/population>
Urbanization <https://github.com/awrubes/Data607_project3/tree/main/data/urbanization>

Loading Data

All of our data is available as csv files in a public GitHub repository. In order to import the data using the below R script you must clone the github repo to your local computer.

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
library(dplyr)
library(ggplot2)
library(httr)
library(purrr)
library(readr)
library(scales)
library(lubridate)
read_and_clean_csv_files <- function(data_directory, pattern, suffix) {
  csv_files <- list.files(path = data_directory, pattern = pattern, full.names = TRUE)
  
  read_and_clean_csv <- function(file) {
    country_name <- gsub(suffix, "", basename(file))
    country_name <- gsub("-", " ", country_name)
    country_name <- str_to_title(country_name)
    
    df <- read.csv(file, stringsAsFactors = FALSE, sep = ",")
    df$country_name <- country_name
    
    return(df)
  }
  
  list_of_dfs <- map(csv_files, read_and_clean_csv)
  combined_data <- bind_rows(list_of_dfs)
  return(combined_data)
}

# Load different datasets using the generic function
combined_birth_data <- read_and_clean_csv_files("data/birth_rate", "*.csv", "_birth_rate.csv")
combined_death_data <- read_and_clean_csv_files("data/death_rate", "*.csv", "_death_rate.csv")
combined_fertility_data <- read_and_clean_csv_files("data/fertility_replacement_rate", "*.csv", "_fertility_replacement_rate.csv")
combined_gdp_data <- read_and_clean_csv_files("data/gdp", "*.csv", "-gdp-gross-domestic-product.csv")
combined_population_data <- read_and_clean_csv_files("data/population", "*.csv", "-population-2024-10-12.csv")
combined_urbanization_data <- read_and_clean_csv_files("data/urbanization", "*.csv", "-urban-population.csv")


global_population_data_df <- read.csv("data/population/world_population.csv", stringsAsFactors = FALSE, sep = ",")
global_fertility_data_df <- read.csv("data/fertility_replacement_rate/world_fertility_rate.csv", stringsAsFactors = FALSE, sep = ",")

Global Data

global_population_data_df$date <- as.Date(global_population_data_df$date, format = "%m/%d/%Y")

ggplot(global_population_data_df, aes(x = date, y = Population)) +
  geom_line(color = "blue") +
  geom_point(color = "red") +  
  labs(title = "Global Population Over Time", 
       x = "Year", 
       y = "Population (in Billions)") +
  scale_y_continuous(
    breaks = seq(1e9, 8e9, by = 1e9),  
    labels = function(x) paste0(x / 1e9, "B") 
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  

global_fertility_data_df$date <- as.Date(global_fertility_data_df$date, format = "%m/%d/%Y")

ggplot(global_fertility_data_df, aes(x = date, y = Births.per.Woman)) +
  geom_line(color = "green") +
  geom_point(color = "red") +  
  labs(title = "Global Fertility Rate Since 1950", 
       x = "Year", 
       y = "Births per Woman") +
    scale_y_continuous(breaks = seq(min(global_fertility_data_df$Births.per.Woman), 
                                  max(global_fertility_data_df$Births.per.Woman), 
                                  by = 0.5)) +  
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  

Birth rates

combined_birth_data$date <- as.Date(combined_birth_data$date, format = "%m/%d/%Y")


combined_birth_data <- combined_birth_data %>%
  filter(!is.na(date) & !is.na(Births.per.1000.People))

plot_country_trends <- function(df, x_column, y_column, title, x_label, y_label) {
  ggplot(df, aes_string(x = x_column, y = y_column, color = "country_name", group = "country_name")) +
    geom_line(size = 1) +
    geom_point(size = 1.5) +  
    labs(title = title, x = x_label, y = y_label) +
    scale_x_date(date_breaks = "10 years", date_labels = "%Y", 
                 limits = c(as.Date("1950-01-01"), as.Date("2024-12-31"))) +  
    theme_minimal() +
    theme(legend.title = element_blank())
}

birth_rate_plot <- plot_country_trends(
  df = combined_birth_data,
  x_column = "date",           
  y_column = "Births.per.1000.People", 
  title = "Birth Rate by Country",
  x_label = "Year",
  y_label = "Births per 1000 People"
)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(birth_rate_plot)
## Warning: Removed 760 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 760 rows containing missing values or values outside the scale range
## (`geom_point()`).

Death Rate

combined_death_data$date <- as.Date(combined_death_data$date, format = "%Y-%m-%d")

combined_death_data <- combined_death_data %>%
  filter(!is.na(date) & !is.na(Deaths.per.1000.People) & 
         date >= as.Date("1950-12-31") & date <= as.Date("2024-12-31"))

plot_country_trends <- function(df, x_column, y_column, title, x_label, y_label) {
  ggplot(df, aes_string(x = x_column, y = y_column, color = "country_name", group = "country_name")) +
    geom_line(size = 1) +
    geom_point(size = 1.5) +  
    labs(title = title, x = x_label, y = y_label) +
    scale_x_date(date_breaks = "10 years", date_labels = "%Y") +  
    theme_minimal() +
    theme(legend.title = element_blank())
}

death_rate_plot <- plot_country_trends(
  df = combined_death_data,
  x_column = "date",          
  y_column = "Deaths.per.1000.People", 
  title = "Death Rate by Country",
  x_label = "Year",
  y_label = "Deaths per 1000 People"
)

print(death_rate_plot)

fertility rate

combined_fertility_data$date <- as.Date(combined_fertility_data$date, format = "%Y-%m-%d")

combined_fertility_data <- combined_fertility_data %>%
  filter(!is.na(date) & !is.na(Births.per.Woman) & 
         date >= as.Date("1950-01-01") & date <= as.Date("2024-12-31"))

plot_country_trends <- function(df, x_column, y_column, title, x_label, y_label) {
  ggplot(df, aes_string(x = x_column, y = y_column, color = "country_name", group = "country_name")) +
    geom_line(size = 1) +
    geom_point(size = 1.5) +  
    labs(title = title, x = x_label, y = y_label) +
    scale_x_date(date_breaks = "10 years", date_labels = "%Y") +  
    theme_minimal() +
    theme(legend.title = element_blank())
}

fertility_rate_plot <- plot_country_trends(
  df = combined_fertility_data,
  x_column = "date",           
  y_column = "Births.per.Woman", 
  title = "Fertility Rate by Country",
  x_label = "Year",
  y_label = "Births per Woman"
)

print(fertility_rate_plot)

GDP

combined_gdp_data$date <- as.Date(combined_gdp_data$date, format = "%Y-%m-%d")
combined_gdp_data <- combined_gdp_data %>%
  filter(!is.na(date) & !is.na(GDP...Billions.of.US...) & 
         date >= as.Date("1950-01-01") & date <= as.Date("2024-12-31"))

plot_country_trends <- function(df, x_column, y_column, title, x_label, y_label) {
  ggplot(df, aes_string(x = x_column, y = y_column, color = "country_name", group = "country_name")) +
    geom_line(size = 1) +
    geom_point(size = 1.5) +  
    labs(title = title, x = x_label, y = y_label) +
    scale_x_date(date_breaks = "10 years", date_labels = "%Y") +  
    scale_y_continuous(labels = function(x) paste0(x / 1e3, "B")) +  
    theme_minimal() +
    theme(legend.title = element_blank())
}



advanced_economies <- combined_gdp_data %>%
  filter(country_name %in% c("United States", "China", "Japan", "Italy", "South Korea"))

gdp_other_countries <- combined_gdp_data %>%
  filter(!country_name %in% c("United States", "China", "Japan", "Italy", "South Korea"))

advanced_economies_plot <- plot_country_trends(
  df = advanced_economies,
  x_column = "date",             
  y_column = "GDP...Billions.of.US...",  
  title = "GDP for USA, China, Japan, Italy, and South Korea",
  x_label = "Year",
  y_label = "GDP in Billions of US Dollars"
)

gdp_other_countries_plot <- plot_country_trends(
  df = gdp_other_countries,
  x_column = "date",             
  y_column = "GDP...Billions.of.US...",  
  title = "GDP for Other Countries",
  x_label = "Year",
  y_label = "GDP in Billions of US Dollars"
)

print(advanced_economies_plot)

print(gdp_other_countries_plot)

population

combined_population_data$date <- as.Date(combined_population_data$date, format = "%Y-%m-%d")

combined_population_data <- combined_population_data %>%
  filter(!is.na(date) & !is.na(Population) & 
         date >= as.Date("1950-01-01") & date <= as.Date("2024-12-31"))

china_population_data <- combined_population_data %>%
  filter(country_name == "China")

rest_of_countries_population_data <- combined_population_data %>%
  filter(country_name != "China")

plot_country_trends <- function(df, x_column, y_column, title, x_label, y_label) {
  ggplot(df, aes_string(x = x_column, y = y_column, color = "country_name", group = "country_name")) +
    geom_line(size = 1) +
    geom_point(size = 1.5) +  
    labs(title = title, x = x_label, y = y_label) +
    scale_x_date(date_breaks = "10 years", date_labels = "%Y") +
    scale_y_continuous(
      breaks = seq(0, max(df[[y_column]], na.rm = TRUE), by = 50e6),   
      labels = function(x) paste0(x / 1e6, "M")  
    ) +
    theme_minimal() +
    theme(legend.title = element_blank())
}

china_population_plot <- plot_country_trends(
  df = china_population_data,
  x_column = "date",             
  y_column = "Population",      
  title = "Population for China",
  x_label = "Year",
  y_label = "Population (in Millions)"
)

rest_of_countries_population_plot <- plot_country_trends(
  df = rest_of_countries_population_data,
  x_column = "date",             
  y_column = "Population",      
  title = "Population for Other Countries",
  x_label = "Year",
  y_label = "Population (in Millions)"
)

print(china_population_plot)

print(rest_of_countries_population_plot)

urbanization

combined_urbanization_data$date <- as.Date(combined_urbanization_data$date, format = "%Y-%m-%d")

combined_urbanization_data <- combined_urbanization_data %>%
  filter(!is.na(date) & !is.na(Urban.Population) & 
         date >= as.Date("1950-01-01") & date <= as.Date("2024-12-31"))
china_urbanization_data <- combined_urbanization_data %>%
  filter(country_name == "China")

japan_usa_urbanization_data <- combined_urbanization_data %>%
  filter(country_name %in% c("Japan", "United States"))

rest_of_countries_urbanization_data <- combined_urbanization_data %>%
  filter(!country_name %in% c("China", "Japan", "United States"))

plot_country_trends <- function(df, x_column, y_column, title, x_label, y_label, y_break) {
  ggplot(df, aes_string(x = x_column, y = y_column, color = "country_name", group = "country_name")) +
    geom_line(size = 1) +
    geom_point(size = 1.5) +  
    labs(title = title, x = x_label, y = y_label) +
    scale_x_date(date_breaks = "10 years", date_labels = "%Y") +
    scale_y_continuous(
      breaks = seq(0, max(df[[y_column]], na.rm = TRUE), by = y_break),  
      labels = function(x) paste0(x / 1e6, "M")  
    ) +
    theme_minimal() +
    theme(legend.title = element_blank())
}

china_urban_population_plot <- plot_country_trends(
  df = china_urbanization_data,
  x_column = "date",             
  y_column = "Urban.Population",      
  title = "Urban Population for China",
  x_label = "Year",
  y_label = "Urban Population (in Millions)",
  y_break = 1e8  
)

japan_usa_urban_population_plot <- plot_country_trends(
  df = japan_usa_urbanization_data,
  x_column = "date",             
  y_column = "Urban.Population",      
  title = "Urban Population for Japan and USA",
  x_label = "Year",
  y_label = "Urban Population (in Millions)",
  y_break = 5e7  
)

rest_of_countries_urban_population_plot <- plot_country_trends(
  df = rest_of_countries_urbanization_data,
  x_column = "date",             
  y_column = "Urban.Population",      
  title = "Urban Population for Other Countries",
  x_label = "Year",
  y_label = "Urban Population (in Millions)",
  y_break = 1e7 
)

print(china_urban_population_plot)

print(japan_usa_urban_population_plot)

print(rest_of_countries_urban_population_plot)

library(DBI)
## Warning: package 'DBI' was built under R version 4.3.3
library(RMySQL)
## Warning: package 'RMySQL' was built under R version 4.3.3
host <- "database-1.cdyqsq4eynri.us-east-2.rds.amazonaws.com"
port <- 3306
dbname <- "Project_3"
user <- "admin"
password <- "Data607_Project3"
con <- dbConnect(RMySQL::MySQL(),
                 dbname = dbname,
                 host = host,
                 port = port,
                 user = user,
                 password = password)
library(dplyr)
library(ggplot2)
birth_data <- dbGetQuery(con, "SELECT country_name, year, birth_rate_value AS birth_rate FROM birth_rate")
## Warning in dbSendQuery(conn, statement, ...): Decimal MySQL column 2 imported
## as numeric
urbanization_data <- dbGetQuery(con, "SELECT country_name, year, urban_population_value FROM urbanization")
## Warning in dbSendQuery(conn, statement, ...): Decimal MySQL column 2 imported
## as numeric
merged_data <- birth_data %>%
  left_join(urbanization_data, by = c("country_name", "year"))

merged_data <- na.omit(merged_data)
country_models <- merged_data %>%
  group_by(country_name) %>%
  do(model = lm(birth_rate ~ urban_population_value, data = .))

model_summaries <- country_models %>%
  summarise(
    country_name,
    model_summary = list(summary(model))
  )

for (i in 1:nrow(model_summaries)) {
  cat("Country:", model_summaries$country_name[i], "\n")
  print(model_summaries$model_summary[[i]])
  cat("\n")
}
## Country: italy 
## 
## Call:
## lm(formula = birth_rate ~ urban_population_value, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0628 -1.4345  0.2116  1.1704  2.4041 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             5.141e+01  2.111e+00   24.36   <2e-16 ***
## urban_population_value -1.053e-06  5.578e-08  -18.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.411 on 61 degrees of freedom
## Multiple R-squared:  0.8538, Adjusted R-squared:  0.8514 
## F-statistic: 356.2 on 1 and 61 DF,  p-value: < 2.2e-16
## 
## 
## Country: japan 
## 
## Call:
## lm(formula = birth_rate ~ urban_population_value, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9897 -1.3071 -0.0551  0.8993  3.7216 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             3.282e+01  1.040e+00   31.54   <2e-16 ***
## urban_population_value -2.184e-07  1.080e-08  -20.23   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.47 on 61 degrees of freedom
## Multiple R-squared:  0.8702, Adjusted R-squared:  0.8681 
## F-statistic: 409.1 on 1 and 61 DF,  p-value: < 2.2e-16
## 
## 
## Country: niger 
## 
## Call:
## lm(formula = birth_rate ~ urban_population_value, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0278 -0.4454 -0.1182  0.6175  0.7684 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             5.917e+01  1.218e-01  485.86   <2e-16 ***
## urban_population_value -3.310e-06  6.125e-08  -54.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5652 on 61 degrees of freedom
## Multiple R-squared:  0.9795, Adjusted R-squared:  0.9792 
## F-statistic:  2921 on 1 and 61 DF,  p-value: < 2.2e-16
## 
## 
## Country: south korea 
## 
## Call:
## lm(formula = birth_rate ~ urban_population_value, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2006 -1.4620  0.1739  0.8176  5.6976 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             4.193e+01  6.201e-01   67.62   <2e-16 ***
## urban_population_value -8.186e-07  2.003e-08  -40.88   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.909 on 61 degrees of freedom
## Multiple R-squared:  0.9648, Adjusted R-squared:  0.9642 
## F-statistic:  1671 on 1 and 61 DF,  p-value: < 2.2e-16
## 
## 
## Country: united states 
## 
## Call:
## lm(formula = birth_rate ~ urban_population_value, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1342 -0.6606  0.0657  0.3879  4.0259 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             2.467e+01  7.166e-01   34.42   <2e-16 ***
## urban_population_value -4.741e-08  3.507e-09  -13.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.267 on 61 degrees of freedom
## Multiple R-squared:  0.7497, Adjusted R-squared:  0.7456 
## F-statistic: 182.8 on 1 and 61 DF,  p-value: < 2.2e-16
urban_coefficients_df <- country_models %>%
  summarise(
    country_name,
    coefficient = coef(summary(model))["urban_population_value", "Estimate"]
  )
print(urban_coefficients_df   )
## # A tibble: 5 × 2
##   country_name    coefficient
##   <chr>                 <dbl>
## 1 italy         -0.00000105  
## 2 japan         -0.000000218 
## 3 niger         -0.00000331  
## 4 south korea   -0.000000819 
## 5 united states -0.0000000474
ggplot(urban_coefficients_df, aes(x = reorder(country_name, coefficient), y = coefficient)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +  # Flip coordinates for better visibility
  labs(title = "Effect of Urbanization on Birth Rate by Country",
       x = "Country",
       y = "Coefficient of Urban Population") +
  theme_minimal()

death_data <- dbGetQuery(con, "SELECT country_name, year, death_rate_value AS death_rate FROM death_rate")
## Warning in dbSendQuery(conn, statement, ...): Decimal MySQL column 2 imported
## as numeric
merged_data <- birth_data %>%
  left_join(death_data, by = c("country_name", "year"))

merged_data <- na.omit(merged_data)
country_models <- merged_data %>%
  group_by(country_name) %>%
  do(model = lm(birth_rate ~ death_rate, data = .))

model_summaries <- country_models %>%
  summarise(
    country_name,
    model_summary = list(summary(model))
  )

for (i in 1:nrow(model_summaries)) {
  cat("Country:", model_summaries$country_name[i], "\n")
  print(model_summaries$model_summary[[i]])
  cat("\n")
}
## Country: angola 
## 
## Call:
## lm(formula = birth_rate ~ death_rate, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.850  -5.564   1.492   4.703   9.609 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 22.53731    0.92457   24.38   <2e-16 ***
## death_rate   1.18777    0.06202   19.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.118 on 149 degrees of freedom
## Multiple R-squared:  0.7111, Adjusted R-squared:  0.7092 
## F-statistic: 366.8 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: bangladesh 
## 
## Call:
## lm(formula = birth_rate ~ death_rate, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.277 -10.935   1.986  11.392  15.250 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.9621     2.2922   1.292    0.198    
## death_rate    1.7148     0.1802   9.513   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.88 on 149 degrees of freedom
## Multiple R-squared:  0.3779, Adjusted R-squared:  0.3737 
## F-statistic:  90.5 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: benin 
## 
## Call:
## lm(formula = birth_rate ~ death_rate, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.8996  -6.6335   0.8011   7.1681  11.1540 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 21.66386    1.16471   18.60   <2e-16 ***
## death_rate   0.90827    0.07674   11.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.633 on 149 degrees of freedom
## Multiple R-squared:  0.4846, Adjusted R-squared:  0.4811 
## F-statistic: 140.1 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: china 
## 
## Call:
## lm(formula = birth_rate ~ death_rate, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.230  -9.436  -2.712  10.053  22.046 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.2277     2.2817   2.291   0.0234 *  
## death_rate    0.9543     0.1806   5.284 4.42e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.07 on 149 degrees of freedom
## Multiple R-squared:  0.1578, Adjusted R-squared:  0.1521 
## F-statistic: 27.92 on 1 and 149 DF,  p-value: 4.416e-07
## 
## 
## Country: egypt 
## 
## Call:
## lm(formula = birth_rate ~ death_rate, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.392  -5.752   1.727   5.330  10.584 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.5155     1.1408   7.464 6.48e-12 ***
## death_rate    1.8533     0.1054  17.591  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.729 on 149 degrees of freedom
## Multiple R-squared:  0.675,  Adjusted R-squared:  0.6728 
## F-statistic: 309.4 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: italy 
## 
## Call:
## lm(formula = birth_rate ~ death_rate, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9742 -2.5657 -0.3183  1.2446  6.6633 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  21.9838     1.3046  16.851   <2e-16 ***
## death_rate   -1.0037     0.1065  -9.428   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.039 on 149 degrees of freedom
## Multiple R-squared:  0.3736, Adjusted R-squared:  0.3694 
## F-statistic: 88.88 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: japan 
## 
## Call:
## lm(formula = birth_rate ~ death_rate, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3209 -2.8988 -0.1982  1.1616 15.2330 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 19.93333    0.96350   20.69   <2e-16 ***
## death_rate  -0.85063    0.08139  -10.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.602 on 149 degrees of freedom
## Multiple R-squared:  0.423,  Adjusted R-squared:  0.4191 
## F-statistic: 109.2 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: niger 
## 
## Call:
## lm(formula = birth_rate ~ death_rate, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.6058  -3.8409  -0.5412   6.7992  11.3783 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.60250    1.02891   23.91   <2e-16 ***
## death_rate   1.25826    0.06298   19.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.512 on 149 degrees of freedom
## Multiple R-squared:  0.7282, Adjusted R-squared:  0.7264 
## F-statistic: 399.2 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: south korea 
## 
## Call:
## lm(formula = birth_rate ~ death_rate, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.156 -7.406 -6.689  2.545 30.138 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.15865    2.28627   5.756 4.76e-08 ***
## death_rate   0.06468    0.17935   0.361    0.719    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.31 on 149 degrees of freedom
## Multiple R-squared:  0.000872,   Adjusted R-squared:  -0.005834 
## F-statistic:  0.13 on 1 and 149 DF,  p-value: 0.7189
## 
## 
## Country: united states 
## 
## Call:
## lm(formula = birth_rate ~ death_rate, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9785 -1.6734 -1.1075  0.0941 10.7214 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  32.8183     3.4112   9.621  < 2e-16 ***
## death_rate   -1.9973     0.3525  -5.665 7.34e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.508 on 149 degrees of freedom
## Multiple R-squared:  0.1772, Adjusted R-squared:  0.1717 
## F-statistic:  32.1 on 1 and 149 DF,  p-value: 7.342e-08
death_coefficients_df <- country_models %>%
  summarise(
    country_name,
    coefficient = coef(summary(model))["death_rate", "Estimate"]
  )

ggplot(death_coefficients_df, aes(x = reorder(country_name, coefficient), y = coefficient)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +  # Flip coordinates for better visibility
  labs(title = "Effect of Death Rate on Birth Rate by Country",
       x = "Country",
       y = "Coefficient of Death Rate") +
  theme_minimal()

gdp_data <- dbGetQuery(con, "SELECT country_name, year, gdp_value FROM gdp")
## Warning in dbSendQuery(conn, statement, ...): Decimal MySQL column 2 imported
## as numeric
merged_data <- birth_data %>%
  left_join(gdp_data, by = c("country_name", "year"))

merged_data <- na.omit(merged_data)
country_models <- merged_data %>%
  group_by(country_name) %>%
  do(model = lm(birth_rate ~ gdp_value, data = .))

model_summaries <- country_models %>%
  summarise(
    country_name,
    model_summary = list(summary(model))
  )

for (i in 1:nrow(model_summaries)) {
  cat("Country:", model_summaries$country_name[i], "\n")
  print(model_summaries$model_summary[[i]])
  cat("\n")
}
## Country: angola 
## 
## Call:
## lm(formula = birth_rate ~ gdp_value, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4208 -0.7999  0.4482  1.2130  3.3679 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 49.704775   0.305648  162.62  < 2e-16 ***
## gdp_value   -0.065275   0.006453  -10.12 1.14e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.987 on 61 degrees of freedom
## Multiple R-squared:  0.6265, Adjusted R-squared:  0.6204 
## F-statistic: 102.3 on 1 and 61 DF,  p-value: 1.145e-14
## 
## 
## Country: bangladesh 
## 
## Call:
## lm(formula = birth_rate ~ gdp_value, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.899  -7.092   1.212   6.778  12.406 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 40.134460   1.120798  35.809  < 2e-16 ***
## gdp_value   -0.077082   0.008402  -9.174 4.34e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.254 on 61 degrees of freedom
## Multiple R-squared:  0.5798, Adjusted R-squared:  0.5729 
## F-statistic: 84.16 on 1 and 61 DF,  p-value: 4.34e-13
## 
## 
## Country: benin 
## 
## Call:
## lm(formula = birth_rate ~ gdp_value, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5041 -0.8663  0.1368  0.8891  1.6261 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 46.66354    0.17755  262.82   <2e-16 ***
## gdp_value   -0.72273    0.02616  -27.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.061 on 61 degrees of freedom
## Multiple R-squared:  0.926,  Adjusted R-squared:  0.9248 
## F-statistic: 763.4 on 1 and 61 DF,  p-value: < 2.2e-16
## 
## 
## Country: china 
## 
## Call:
## lm(formula = birth_rate ~ gdp_value, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.248  -6.837  -1.568   5.711  14.314 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 25.2557569  1.1952592  21.130  < 2e-16 ***
## gdp_value   -0.0011594  0.0002064  -5.617 5.08e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.034 on 61 degrees of freedom
## Multiple R-squared:  0.3409, Adjusted R-squared:  0.3301 
## F-statistic: 31.56 on 1 and 61 DF,  p-value: 5.081e-07
## 
## 
## Country: egypt 
## 
## Call:
## lm(formula = birth_rate ~ gdp_value, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.449 -5.209  1.753  3.575  9.270 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 38.021186   0.862944  44.060  < 2e-16 ***
## gdp_value   -0.042368   0.005492  -7.714 1.37e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.28 on 61 degrees of freedom
## Multiple R-squared:  0.4938, Adjusted R-squared:  0.4855 
## F-statistic:  59.5 on 1 and 61 DF,  p-value: 1.368e-10
## 
## 
## Country: italy 
## 
## Call:
## lm(formula = birth_rate ~ gdp_value, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4238 -1.5673 -0.2536  1.7440  3.3284 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 15.7386264  0.4038154   38.98   <2e-16 ***
## gdp_value   -0.0038827  0.0003095  -12.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.95 on 61 degrees of freedom
## Multiple R-squared:  0.7207, Adjusted R-squared:  0.7161 
## F-statistic: 157.4 on 1 and 61 DF,  p-value: < 2.2e-16
## 
## 
## Country: japan 
## 
## Call:
## lm(formula = birth_rate ~ gdp_value, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6648 -0.7969  0.1478  0.7383  2.3937 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.737e+01  2.653e-01   65.45   <2e-16 ***
## gdp_value   -1.802e-03  7.354e-05  -24.51   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.239 on 61 degrees of freedom
## Multiple R-squared:  0.9078, Adjusted R-squared:  0.9063 
## F-statistic: 600.6 on 1 and 61 DF,  p-value: < 2.2e-16
## 
## 
## Country: niger 
## 
## Call:
## lm(formula = birth_rate ~ gdp_value, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0056 -0.6220  0.1334  0.8075  1.2503 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 57.57144    0.16543  348.02   <2e-16 ***
## gdp_value   -0.91676    0.02853  -32.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9334 on 61 degrees of freedom
## Multiple R-squared:  0.9442, Adjusted R-squared:  0.9433 
## F-statistic:  1033 on 1 and 61 DF,  p-value: < 2.2e-16
## 
## 
## Country: south korea 
## 
## Call:
## lm(formula = birth_rate ~ gdp_value, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.046 -4.750 -1.728  3.757 16.074 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 25.938472   1.052350   24.65  < 2e-16 ***
## gdp_value   -0.013648   0.001322  -10.32 5.18e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.138 on 61 degrees of freedom
## Multiple R-squared:  0.636,  Adjusted R-squared:  0.6301 
## F-statistic: 106.6 on 1 and 61 DF,  p-value: 5.184e-15
## 
## 
## Country: united states 
## 
## Call:
## lm(formula = birth_rate ~ gdp_value, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1250 -0.7122 -0.2668  0.2492  5.3030 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.755e+01  3.001e-01   58.47  < 2e-16 ***
## gdp_value   -2.831e-04  2.784e-05  -10.17 9.36e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.543 on 61 degrees of freedom
## Multiple R-squared:  0.629,  Adjusted R-squared:  0.6229 
## F-statistic: 103.4 on 1 and 61 DF,  p-value: 9.356e-15
gdp_coefficients_df <- country_models %>%
  summarise(
    country_name,
    coefficient = coef(summary(model))["gdp_value", "Estimate"]
  )

ggplot(gdp_coefficients_df, aes(x = reorder(country_name, coefficient), y = coefficient)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +
  labs(title = "Effect of GDP on Birth Rate by Country",
       x = "Country",
       y = "Coefficient of GDP") +
  theme_minimal()

fertility_data <- dbGetQuery(con, "SELECT country_name, year, fertility_rate_value AS fertility_rate FROM fertility_rate")
## Warning in dbSendQuery(conn, statement, ...): Decimal MySQL column 2 imported
## as numeric
merged_data <- birth_data %>%
  left_join(fertility_data, by = c("country_name", "year"))


merged_data <- na.omit(merged_data)
country_models <- merged_data %>%
  group_by(country_name) %>%
  do(model = lm(birth_rate ~ fertility_rate, data = .))

model_summaries <- country_models %>%
  summarise(
    country_name,
    model_summary = list(summary(model))
  )

for (i in 1:nrow(model_summaries)) {
  cat("Country:", model_summaries$country_name[i], "\n")
  print(model_summaries$model_summary[[i]])
  cat("\n")
}
## Country: angola 
## 
## Call:
## lm(formula = birth_rate ~ fertility_rate, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6957 -1.4473  0.1783  0.9413  6.1621 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.57296    0.43944   10.41   <2e-16 ***
## fertility_rate  6.46684    0.08182   79.04   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.737 on 149 degrees of freedom
## Multiple R-squared:  0.9767, Adjusted R-squared:  0.9765 
## F-statistic:  6247 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: bangladesh 
## 
## Call:
## lm(formula = birth_rate ~ fertility_rate, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5947 -2.6923 -0.3835  2.3383  5.0008 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.2379     0.4299  -0.553    0.581    
## fertility_rate   7.1829     0.1132  63.435   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.847 on 149 degrees of freedom
## Multiple R-squared:  0.9643, Adjusted R-squared:  0.9641 
## F-statistic:  4024 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: benin 
## 
## Call:
## lm(formula = birth_rate ~ fertility_rate, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.15412 -0.90259  0.05851  1.01378  1.65059 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.1679     0.2648   15.74   <2e-16 ***
## fertility_rate   6.3558     0.0543  117.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.103 on 149 degrees of freedom
## Multiple R-squared:  0.9892, Adjusted R-squared:  0.9892 
## F-statistic: 1.37e+04 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: china 
## 
## Call:
## lm(formula = birth_rate ~ fertility_rate, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.855 -2.244 -1.508  2.152  6.949 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.9334     0.4344  -2.149   0.0333 *  
## fertility_rate   6.8692     0.1466  46.867   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.765 on 149 degrees of freedom
## Multiple R-squared:  0.9365, Adjusted R-squared:  0.936 
## F-statistic:  2197 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: egypt 
## 
## Call:
## lm(formula = birth_rate ~ fertility_rate, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0239 -1.4536 -0.0835  1.0647  4.7417 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.82612    0.35165   2.349   0.0201 *  
## fertility_rate  7.02772    0.08882  79.121   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.8 on 149 degrees of freedom
## Multiple R-squared:  0.9768, Adjusted R-squared:  0.9766 
## F-statistic:  6260 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: italy 
## 
## Call:
## lm(formula = birth_rate ~ fertility_rate, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.12768 -1.79400 -0.04313  1.46212  3.15430 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -5.1852     0.6330  -8.191 1.08e-13 ***
## fertility_rate   9.2279     0.3775  24.443  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.716 on 149 degrees of freedom
## Multiple R-squared:  0.8004, Adjusted R-squared:  0.7991 
## F-statistic: 597.5 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: japan 
## 
## Call:
## lm(formula = birth_rate ~ fertility_rate, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8205 -2.1327 -0.4062  2.0687  3.3994 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -9.0070     0.8087  -11.14   <2e-16 ***
## fertility_rate  11.5322     0.4710   24.48   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.116 on 149 degrees of freedom
## Multiple R-squared:  0.8009, Adjusted R-squared:  0.7996 
## F-statistic: 599.5 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: niger 
## 
## Call:
## lm(formula = birth_rate ~ fertility_rate, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3355 -1.1621  0.0442  0.9917  4.5663 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2.56700    0.52926    4.85 3.07e-06 ***
## fertility_rate  6.87081    0.08852   77.62  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.238 on 149 degrees of freedom
## Multiple R-squared:  0.9759, Adjusted R-squared:  0.9757 
## F-statistic:  6025 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: south korea 
## 
## Call:
## lm(formula = birth_rate ~ fertility_rate, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9535 -2.8703 -0.4689  1.7278  5.5789 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1.9248     0.4043  -4.761 4.52e-06 ***
## fertility_rate   7.3675     0.1550  47.537  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.814 on 149 degrees of freedom
## Multiple R-squared:  0.9381, Adjusted R-squared:  0.9377 
## F-statistic:  2260 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: united states 
## 
## Call:
## lm(formula = birth_rate ~ fertility_rate, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6648 -1.2379 -0.2820  0.5419  3.5155 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1.6976     0.5363  -3.166  0.00188 ** 
## fertility_rate   7.4912     0.2565  29.210  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.491 on 149 degrees of freedom
## Multiple R-squared:  0.8513, Adjusted R-squared:  0.8503 
## F-statistic: 853.2 on 1 and 149 DF,  p-value: < 2.2e-16
fertility_coefficients_df <- country_models %>%
  summarise(
    country_name,
    coefficient = coef(summary(model))["fertility_rate", "Estimate"]
  )

ggplot(fertility_coefficients_df, aes(x = reorder(country_name, coefficient), y = coefficient)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +  # Flip coordinates for better visibility
  labs(title = "Effect of Fertility Rate on Birth Rate by Country",
       x = "Country",
       y = "Coefficient of Fertility Rate") +
  theme_minimal()

population_data <- dbGetQuery(con, "SELECT country_name, year, population_value AS population FROM population")
## Warning in dbSendQuery(conn, statement, ...): Decimal MySQL column 2 imported
## as numeric
merged_data <- birth_data %>%
  left_join(population_data, by = c("country_name", "year"))

merged_data <- na.omit(merged_data)
country_models <- merged_data %>%
  group_by(country_name) %>%
  do(model = lm(birth_rate ~ population, data = .))

model_summaries <- country_models %>%
  summarise(
    country_name,
    model_summary = list(summary(model))
  )

population_coefficients_df <- country_models %>%
  summarise(
    country_name,
    coefficient = coef(summary(model))["population", "Estimate"]
  )

for (i in 1:nrow(model_summaries)) {
  cat("Country:", model_summaries$country_name[i], "\n")
  print(model_summaries$model_summary[[i]])
  cat("\n")
}
## Country: angola 
## 
## Call:
## lm(formula = birth_rate ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5772 -1.8108  0.1097  1.6348  2.7613 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.085e+01  2.265e-01  224.48   <2e-16 ***
## population  -2.593e-07  3.369e-09  -76.99   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.783 on 149 degrees of freedom
## Multiple R-squared:  0.9755, Adjusted R-squared:  0.9753 
## F-statistic:  5927 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: bangladesh 
## 
## Call:
## lm(formula = birth_rate ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2297 -1.2146  0.5564  1.9864  3.4478 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.227e+01  5.997e-01  103.83   <2e-16 ***
## population  -2.644e-07  3.758e-09  -70.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.575 on 149 degrees of freedom
## Multiple R-squared:  0.9708, Adjusted R-squared:  0.9706 
## F-statistic:  4950 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: benin 
## 
## Call:
## lm(formula = birth_rate ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4147 -2.1532 -0.1038  2.1635  3.6999 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.645e+01  3.053e-01  152.14   <2e-16 ***
## population  -7.082e-07  1.296e-08  -54.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.317 on 149 degrees of freedom
## Multiple R-squared:  0.9525, Adjusted R-squared:  0.9522 
## F-statistic:  2988 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: china 
## 
## Call:
## lm(formula = birth_rate ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.275  -5.470   2.174   4.470  13.766 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.826e+01  2.853e+00   16.92   <2e-16 ***
## population  -2.887e-08  2.524e-09  -11.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.004 on 149 degrees of freedom
## Multiple R-squared:  0.4677, Adjusted R-squared:  0.4641 
## F-statistic: 130.9 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: egypt 
## 
## Call:
## lm(formula = birth_rate ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.8554 -1.8844  0.3184  1.7123 10.2288 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.652e+01  6.142e-01   75.74   <2e-16 ***
## population  -1.771e-07  4.677e-09  -37.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.622 on 149 degrees of freedom
## Multiple R-squared:  0.9058, Adjusted R-squared:  0.9052 
## F-statistic:  1433 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: italy 
## 
## Call:
## lm(formula = birth_rate ~ population, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.147 -2.621 -1.776  0.014  8.874 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 8.125e+00  2.351e+00   3.456 0.000715 ***
## population  3.466e-08  4.531e-08   0.765 0.445478    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.833 on 149 degrees of freedom
## Multiple R-squared:  0.003912,   Adjusted R-squared:  -0.002773 
## F-statistic: 0.5852 on 1 and 149 DF,  p-value: 0.4455
## 
## 
## Country: japan 
## 
## Call:
## lm(formula = birth_rate ~ population, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.447 -3.152 -2.894  1.351 15.895 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.305e+01  2.372e+00   5.501  1.6e-07 ***
## population  -2.576e-08  2.224e-08  -1.158    0.249    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.721 on 149 degrees of freedom
## Multiple R-squared:  0.008924,   Adjusted R-squared:  0.002272 
## F-statistic: 1.342 on 1 and 149 DF,  p-value: 0.2486
## 
## 
## Country: niger 
## 
## Call:
## lm(formula = birth_rate ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1365 -3.6672  0.7692  2.9749  6.6127 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.516e+01  4.040e-01  136.56   <2e-16 ***
## population  -2.677e-07  5.479e-09  -48.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.492 on 149 degrees of freedom
## Multiple R-squared:  0.9412, Adjusted R-squared:  0.9409 
## F-statistic:  2387 on 1 and 149 DF,  p-value: < 2.2e-16
## 
## 
## Country: south korea 
## 
## Call:
## lm(formula = birth_rate ~ population, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.2971  -7.4657   0.8455   5.4037  20.4056 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.968e+01  3.240e+00  12.248  < 2e-16 ***
## population  -6.541e-07  7.991e-08  -8.185 1.12e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.396 on 149 degrees of freedom
## Multiple R-squared:  0.3102, Adjusted R-squared:  0.3055 
## F-statistic:    67 on 1 and 149 DF,  p-value: 1.119e-13
## 
## 
## Country: united states 
## 
## Call:
## lm(formula = birth_rate ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6648 -1.2379 -0.2820  0.5419  3.5155 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.6976     0.5363  -3.166  0.00188 ** 
## population    7.4912     0.2565  29.210  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.491 on 149 degrees of freedom
## Multiple R-squared:  0.8513, Adjusted R-squared:  0.8503 
## F-statistic: 853.2 on 1 and 149 DF,  p-value: < 2.2e-16
positive_coefficients <- population_coefficients_df %>%
  filter(coefficient > 0)

ggplot(positive_coefficients, aes(x = reorder(country_name, coefficient), y = coefficient)) +
  geom_bar(stat = "identity", fill = "lightgreen") +
  coord_flip() +
  labs(title = "Positive Effect of Population on Birth Rate",
       x = "Country",
       y = "Coefficient of Population") +
  theme_minimal()

negative_coefficients <- population_coefficients_df %>%
  filter(coefficient <= 0)

ggplot(negative_coefficients, aes(x = reorder(country_name, coefficient), y = coefficient)) +
  geom_bar(stat = "identity", fill = "salmon") +
  coord_flip() +
  labs(title = "Negative Effect of Population on Birth Rate",
       x = "Country",
       y = "Coefficient of Population") +
  theme_minimal()