Forecasting birth rates by economic and demographic factors
1.Birth_rate: https://databank.worldbank.org/reports.aspx?source=2&series=SP.DYN.CBRT.IN&country=
2.Unemployment: https://data.worldbank.org/indicator/SL.UEM.TOTL.ZS?end=2022&name_desc=false&start=2011
3.Education https://data.worldbank.org/indicator/SE.TER.ENRR?end=2022&start=2010
4.GDP: https://databank.worldbank.org/indicator/NY.GDP.MKTP.KD.ZG/1ff4a498/Popular-Indicators
In recent years, many countries have experienced population problems. Three of the five countries with the lowest birth rates in the world are in Asia, which shows that the problem of declining birth rates in Asia cannot be ignored. Therefore, it is of great significance to explore the birth rate and its influencing factors in Asia. This study aims to explore the factors that influence birth rates and predict future birth rates. We selected the birth rates and economic-related factors of 10 Asian countries and regions over the past 20 years. The selected countries include the three countries with the lowest birth rates in Asia: South Korea, Japan, and Singapore.
First of all, the birth rate is an important indicator of a country’s population replacement and progress. Declining birth rates and an aging population will inevitably bring serious social and economic consequences, such as increased pressure on pension funds and medical expenses; labor shortages hindering economic growth, etc. Second, lower birth rates may lead to labor shortages and demographic instability. This situation will put huge pressure on social welfare systems, health services and economic growth. To sum up, the continued decline in birth rates will undoubtedly have profound consequences for the country’s future.
library(tidyverse)
## Warning: 程辑包'tidyverse'是用R版本4.3.2 来建造的
## Warning: 程辑包'ggplot2'是用R版本4.3.2 来建造的
## Warning: 程辑包'tibble'是用R版本4.3.2 来建造的
## Warning: 程辑包'tidyr'是用R版本4.3.2 来建造的
## Warning: 程辑包'readr'是用R版本4.3.2 来建造的
## Warning: 程辑包'purrr'是用R版本4.3.2 来建造的
## Warning: 程辑包'dplyr'是用R版本4.3.2 来建造的
## Warning: 程辑包'forcats'是用R版本4.3.2 来建造的
## Warning: 程辑包'lubridate'是用R版本4.3.2 来建造的
library(dplyr)
library(readxl)
library(ggplot2)
library(reshape2)
## Warning: 程辑包'reshape2'是用R版本4.3.2 来建造的
library(Metrics)
## Warning: 程辑包'Metrics'是用R版本4.3.2 来建造的
library(randomForest)
## Warning: 程辑包'randomForest'是用R版本4.3.2 来建造的
library(tree)
## Warning: 程辑包'tree'是用R版本4.3.2 来建造的
library(caret)
## Warning: 程辑包'caret'是用R版本4.3.2 来建造的
library(rpart)
## Warning: 程辑包'rpart'是用R版本4.3.2 来建造的
library(rpart.plot)
## Warning: 程辑包'rpart.plot'是用R版本4.3.2 来建造的
library(corrplot)
## Warning: 程辑包'corrplot'是用R版本4.3.2 来建造的
# We choose 10 Asia countries
interested_countries = c("Malaysia","China","Japan","Singapore", "Korea, Rep.", "India","Thailand","Indonesia","Macao SAR, China","Hong Kong SAR, China")
# 01: clean births dataset
df <- read.csv('births.csv')
df_selected_countries <- df %>%
filter(Country.Name %in% interested_countries) %>% select(-Series.Name,-Series.Code,-Country.Code)
df <- df_selected_countries
dfBirth<- df %>%
rename(Country = Country.Name) %>%
gather(key = "Year", value = "Birth_rate", -Country) %>%
mutate(Year = sub("X(\\d{4}).*", "\\1", Year)) %>%
filter(Birth_rate != "..")
# 02: clean unemployment dataset
df <- read.csv("unemployment.csv")
df <- df %>% filter(Country.Name %in% interested_countries) %>% select(-Series.Name,-Series.Code,-Country.Code)
dfUnemp<- df %>%
rename(Country = Country.Name) %>%
gather(key = "Year", value = "Unemployment", -Country) %>%
mutate(Year = sub("X(\\d{4}).*", "\\1", Year)) %>%
filter(Unemployment != "..")
# 03: clean education dataset
df <- read.csv("education.csv")
df <- df %>% filter(Country.Name %in% interested_countries) %>% select(-Series.Name,-Series.Code,-Country.Code)
dfEdu<- df %>%
rename(Country = Country.Name) %>%
gather(key = "Year", value = "Education", -Country) %>%
mutate(Year = sub("X(\\d{4}).*", "\\1", Year)) %>%
filter(Education != "..")
dfEdu$Education <- round(as.numeric(dfEdu$Education), digits = 2)
# 04: clean GDP dataset
df <- read.csv("GDP.csv")
dfGdp <- df %>%
subset(grepl("GDP growth", Series.Name, ignore.case = TRUE)) %>%
filter(Country.Name %in% interested_countries)%>% select(-Series.Name,-Series.Code,-Country.Code)
dfGdp<- dfGdp %>%
rename(Country = Country.Name) %>%
gather(key = "Year", value = "GDP", -Country) %>%
mutate(Year = sub("X(\\d{4}).*", "\\1", Year)) %>%
filter(GDP != "..")
dfGdp$GDP <- round(as.numeric(dfGdp$GDP), digits = 2)
# 05: merge
dfList <- list( dfBirth, dfUnemp, dfEdu, dfGdp)
dfMerged <- Reduce(function(x, y) merge(x, y, all=TRUE), dfList)
# clean data by omitting null value
dfMerged <- na.omit(dfMerged)
# convert variable to numeric except for Country column
dfMerged <- dfMerged %>% mutate(across(-Country, as.numeric))
# Preliminary exploration
# GDP over years
ggplot(dfMerged, aes(x = Year, y = GDP, color = Country, group = Country)) +
geom_line() +
labs(title = "GDP Over the Years",
x = "Year",
y = "GDP") +
theme_minimal() +
scale_x_continuous(breaks = seq(min(dfMerged$Year), max(dfMerged$Year), by = 5))
# Education over years
ggplot(dfMerged, aes(x = Year, y = Education, color = Country, group = Country)) +
geom_line() +
labs(title = "Education over the Years",
x = "Year",
y = "Education") +
theme_minimal() +
scale_x_continuous(breaks = seq(min(dfMerged$Year), max(dfMerged$Year), by = 5))
# Unemployment over years
ggplot(dfMerged, aes(x = Year, y = Unemployment, color = Country, group = Country)) +
geom_line() +
labs(title = "Unemployment over the Years",
x = "Year",
y = "Unemployment") +
theme_minimal() +
scale_x_continuous(breaks = seq(min(dfMerged$Year), max(dfMerged$Year), by = 5))
# Birth rate over years
ggplot(dfMerged, aes(x = Year, y = Birth_rate, color = Country, group = Country)) +
geom_line() +
labs(title = "Birth rate over the Years",
x = "Year",
y = "Birth rate") +
theme_minimal() +
scale_x_continuous(breaks = seq(min(dfMerged$Year), max(dfMerged$Year), by = 5))
# Function to create scatter plot for a given country
create_scatter_plot <- function(dfCountry, country) {
# List of variables to plot
variables_of_interest <- c("Unemployment", "Education", "GDP")
# Reshape data to long format
df_long <- dfCountry %>%
pivot_longer(cols = variables_of_interest,
names_to = "Variable",
values_to = "Value")
ggplot(df_long, aes(x = Year, color = Birth_rate)) +
geom_point(aes(y = Value)) +
labs(title = paste("Scatterplot of Variables against Birth rate over Years for", country),
x = "Year",
color = "Birth rate") +
scale_color_gradient(name = "Birth rate", low = "red", high = "lightblue") +
theme_minimal() +
facet_wrap(~ Variable, scales = "free_y", ncol = 1)
}
# List of countries
countries <- unique(dfMerged$Country)
# Loop through countries and display scatter plots
for (country in countries) {
dfCountry <- dfMerged %>% filter(Country == country)
g <- create_scatter_plot(dfCountry, country)
print(g)
}
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(variables_of_interest)
##
## # Now:
## data %>% select(all_of(variables_of_interest))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Correlation
# Choose data
correlation_data <- dfMerged[c("Birth_rate", "Unemployment", "Year", "Education", "GDP")]
# Draw heat map
non_numeric_columns <- names(correlation_data[sapply(correlation_data, function(x) !is.numeric(x))])
# Calculate correlation matrix
correlation_matrix <- cor(correlation_data, use = "complete.obs")
corrplot(correlation_matrix, method = "circle", type = "lower", tl.cex = 0.8)
# Create a copy of the data
regression_data <- dfMerged %>% select(-Year, -Country)
train_and_evaluate_regression <- function(model_type, data) {
set.seed(123) # Set seed for reproducibility
# Split the data set into training and testing sets
train_index <- sample(seq_len(nrow(data)), 0.8 * nrow(data))
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
# Create the regression model
model <- switch(model_type,
linear = lm(Birth_rate ~ Unemployment + Education + GDP, data = train_data),
tree = tree(Birth_rate ~ Unemployment + Education + GDP, data = train_data),
rf = randomForest(Birth_rate ~ ., data = train_data),
stop("Invalid model type. Supported types: linear, tree, rf.")
)
# Make predictions on the test set
predictions <- predict(model, newdata = test_data)
# Calculate RMSE and R-squared
rmse <- sqrt(mean((test_data$Birth_rate - predictions)^2))
r_squared <- 1 - sum((test_data$Birth_rate - predictions)^2) / sum((test_data$Birth_rate - mean(test_data$Birth_rate))^2)
# Output performance metrics
cat("Model Type:", model_type, "\n")
cat("RMSE:", rmse, "\n")
cat("R-squared:", r_squared, "\n\n")
# Return the performance metrics as a data frame
return(data.frame(
Model = model_type,
RMSE = rmse,
R_squared = r_squared
))
}
combine_regression_metrics <- function(model_types, data) {
metrics_list <- list()
cat("Regression Model Evaluation Metrics:\n")
cat("==================================\n")
for (model_type in model_types) {
model <- train_and_evaluate_regression(model_type, data)
# Store metrics in the list
metrics_list <- c(metrics_list, list(model))
}
# Combine metrics into a data frame
combined_metrics <- do.call(rbind, metrics_list)
# Return the combined data frame
return(combined_metrics)
}
# Specify the model types
model_types <- c("linear", "tree", "rf")
# combine the regression metrics for specified models
evaluation_metrics_regression <- combine_regression_metrics(model_types, regression_data)
## Regression Model Evaluation Metrics:
## ==================================
## Model Type: linear
## RMSE: 3.162653
## R-squared: 0.6144027
##
## Model Type: tree
## RMSE: 2.303022
## R-squared: 0.7955311
##
## Model Type: rf
## RMSE: 2.34786
## R-squared: 0.787492
classification_data <- dfMerged %>% select(-Year)
# Create classification labels
breaks <- c(-Inf, 10, 20, Inf)
labels <- c("low", "medium", "high")
classification_data$Birth_rate <- cut(classification_data$Birth_rate, breaks = breaks, labels = labels, include.lowest = TRUE)
# Function to train and evaluate classification models
train_and_evaluate_classification <- function(model_type, data) {
set.seed(123) # Set seed for reproducibility
# Split the dataset
splitIndex <- createDataPartition(data$Birth_rate, p = .8, list = FALSE, times = 1)
trainData <- data[splitIndex, ]
testData <- data[-splitIndex, ]
# Create the classification model
if (model_type == "decision_tree") {
model <- rpart(Birth_rate ~ ., data = trainData, method = "class")
} else if (model_type == "random_forest") {
model <- randomForest(Birth_rate ~ ., data = trainData)
} else if (model_type == "log_reg" ){
model <- glm(Birth_rate ~ ., data=trainData, family = binomial)
}else {
stop("Invalid model type. Supported types: decision_tree, random_forest, log_reg")
}
# Make predictions on the test set
if( model_type == "log_reg" ){
predictions <- ifelse(predict(model, testData, type = "response") > 0.5, "medium", "low")
} else {
predictions <- predict(model, testData, type = "class")
}
# Calculate evaluation metrics
confusionMatrix <- table(predictions, testData$Birth_rate)
accuracy <- sum(diag(confusionMatrix)) / sum(confusionMatrix)
recall <- confusionMatrix["medium", "medium"] / sum(confusionMatrix["medium", ])
precision <- confusionMatrix["medium", "medium"] / sum(confusionMatrix[, "medium"])
f1_score <- (2 * precision * recall) / (precision + recall)
# Output classification metrics
cat(paste("Model Type:", model_type, "\n"))
cat(paste("Accuracy:", accuracy, "\n"))
cat(paste("Recall:", recall, "\n"))
cat(paste("Precision:", precision, "\n"))
cat(paste("F1 Score:", f1_score, "\n\n"))
cat("Confusion Matrix:\n")
print(confusionMatrix)
cat("\n")
# Return the performance metrics as a data frame
return(data.frame(
Model = model_type,
Accuracy = accuracy,
Recall = recall,
Precision = precision,
F1_Score = f1_score
))
}
combine_classification_metrics <- function(model_types, data) {
metrics_list <- list()
cat("Classification Model Evaluation Metrics:\n")
cat("==================================\n")
for (model_type in model_types) {
model <- train_and_evaluate_classification(model_type, data)
# Store metrics in the list
metrics_list <- c(metrics_list, list(model))
}
# Combine metrics into a data frame
combined_metrics <- do.call(rbind, metrics_list)
# Return the combined data frame
return(combined_metrics)
}
# Specify the model types
model_types <- c("decision_tree", "random_forest", "log_reg")
# combine the classification metrics for specified models
evaluation_metrics_classification <- combine_classification_metrics(model_types, classification_data)
## Classification Model Evaluation Metrics:
## ==================================
## Model Type: decision_tree
## Accuracy: 0.777777777777778
## Recall: 0.823529411764706
## Precision: 0.736842105263158
## F1 Score: 0.777777777777778
##
## Confusion Matrix:
##
## predictions low medium high
## low 10 4 0
## medium 2 14 1
## high 0 1 4
##
## Model Type: random_forest
## Accuracy: 0.833333333333333
## Recall: 0.882352941176471
## Precision: 0.789473684210526
## F1 Score: 0.833333333333333
##
## Confusion Matrix:
##
## predictions low medium high
## low 11 4 0
## medium 1 15 1
## high 0 0 4
##
## Model Type: log_reg
## Accuracy: 0.722222222222222
## Recall: 0.714285714285714
## Precision: 0.789473684210526
## F1 Score: 0.75
##
## Confusion Matrix:
##
## predictions low medium high
## low 11 4 0
## medium 1 15 5
## Evaluation metrics for regression models
## Model RMSE R_squared
## 1 linear 3.162653 0.6144027
## 2 tree 2.303022 0.7955311
## 3 rf 2.347860 0.7874920
## Evaluation metrics for classification models
## Model Accuracy Recall Precision F1_Score
## 1 decision_tree 0.7777778 0.8235294 0.7368421 0.7777778
## 2 random_forest 0.8333333 0.8823529 0.7894737 0.8333333
## 3 log_reg 0.7222222 0.7142857 0.7894737 0.7500000
## Feature Importance
## Education Education 2528.242
## Unemployment Unemployment 1726.888
## GDP GDP 1296.355