This project explores factors related to unemployment fluctuations in Malaysia, Singapore, Indonesia, and Thailand.
Organization: Government
Target User: Workforce Personnel in Malaysia
Potential Benefits:
To identify the factors contributing to unemployment fluctuations in Malaysia, Singapore, Thailand, and Indonesia.
To develop predictive models for forecasting future unemployment rates in all four countries.
To evaluate the forecasting performance of the predictive models.
# Load necessary packages (they must already be installed)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(caret)
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ randomForest::combine() masks dplyr::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# Read the employment dataset from the CSV file in the working directory
data <- read.csv(file = "Employment_Dataset.csv")
# Summarise the columns of a data frame.
#   data_frame: the data frame to inspect; anything else raises an error.
# Prints a knitr::kable() table of column names and classes, and returns the
# same information as a data.table with columns Column_Name and Data_Type.
check_data_info <- function(data_frame) {
  # Validate first so bad input fails before any package code is touched
  if (!is.data.frame(data_frame)) {
    stop("Input is not a data frame.")
  }

  col_names <- names(data_frame)
  # vapply() guarantees exactly one string per column; a column carrying
  # several classes is collapsed with "/" instead of turning the result
  # into a list, which is what sapply(data_frame, class) would produce
  data_types <- vapply(
    data_frame,
    function(column) paste(class(column), collapse = "/"),
    character(1)
  )

  # Referenced by namespace so the package is not re-attached on every call
  info_table <- data.table::data.table(
    Column_Name = col_names,
    Data_Type = data_types
  )

  print(knitr::kable(info_table))
  info_table
}
# Print the column overview and keep the returned table
result_table <- data %>% check_data_info()
##
##
## |Column_Name |Data_Type |
## |:-------------------------|:---------|
## |Year |integer |
## |Country |character |
## |UnEmployment |numeric |
## |GDP |numeric |
## |Inflation |numeric |
## |Migration |integer |
## |FDI_Inflows |numeric |
## |FDI_Outflows |numeric |
## |Trade |numeric |
## |Dollar_Exchange_Rate |numeric |
## |Employment_In_Industry |numeric |
## |Employment_In_Agriculture |numeric |
## |Employment_In_Services |numeric |
## |Self_Employment |numeric |
## |Population |integer |
## |Labour_Force |integer |
## |Youth_Unemployment |numeric |
# Show the dimensions, column types and first values of the dataset
str(object = data)
## 'data.frame': 84 obs. of 17 variables:
## $ Year : int 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 ...
## $ Country : chr "Malaysia" "Malaysia" "Malaysia" "Malaysia" ...
## $ UnEmployment : num 3.53 3.48 3.61 3.54 3.53 3.32 3.23 3.32 3.66 3.39 ...
## $ GDP : num 0.518 5.391 5.788 6.783 5.332 ...
## $ Inflation : num 1.42 1.81 1.09 1.42 2.98 ...
## $ Migration : int 198361 224999 226003 244359 245376 251101 247032 217538 194571 114491 ...
## $ FDI_Inflows : num 0.597 3.166 2.921 3.508 2.734 ...
## $ FDI_Outflows : num 0.288 1.878 1.919 1.453 2.042 ...
## $ Trade : num 203 199 194 210 204 ...
## $ Dollar_Exchange_Rate : num 3.8 3.8 3.8 3.8 3.79 ...
## $ Employment_In_Industry : num 33.1 32 32 30.1 29.7 ...
## $ Employment_In_Agriculture: num 15.1 14.9 14.3 14.6 14.6 ...
## $ Employment_In_Services : num 51.8 53.1 53.7 55.3 55.6 ...
## $ Self_Employment : num 24.6 23.3 23.8 25.4 24.5 ...
## $ Population : int 23542517 24142445 24739411 25333247 25923536 26509413 27092604 27664296 28217204 28717731 ...
## $ Labour_Force : int 9627564 9908787 10200556 10504004 10820941 11150813 11495136 11759089 12181055 12521566 ...
## $ Youth_Unemployment : num 11.2 11.2 11.6 11.5 11.4 ...
# Per-column descriptive statistics
summary(object = data)
## Year Country UnEmployment GDP
## Min. :2001 Length:84 Min. :0.250 Min. :-6.067
## 1st Qu.:2006 Class :character 1st Qu.:2.810 1st Qu.: 3.442
## Median :2011 Mode :character Median :3.626 Median : 4.837
## Mean :2011 Mean :3.580 Mean : 4.311
## 3rd Qu.:2016 3rd Qu.:4.353 3rd Qu.: 5.861
## Max. :2021 Max. :8.060 Max. :14.520
## Inflation Migration FDI_Inflows FDI_Outflows
## Min. :-1.1387 Min. :-108492 Min. :-1.856 Min. :-1.2442
## 1st Qu.: 0.9434 1st Qu.: -2898 1st Qu.: 1.971 1st Qu.: 0.8123
## Median : 2.1050 Median : 58896 Median : 3.040 Median : 2.0950
## Mean : 2.9307 Mean : 53757 Mean : 6.943 Mean : 4.5964
## 3rd Qu.: 4.3062 3rd Qu.: 103874 3rd Qu.: 5.734 3rd Qu.: 6.0274
## Max. :13.1087 Max. : 251101 Max. :32.691 Max. :22.5941
## Trade Dollar_Exchange_Rate Employment_In_Industry
## Min. : 32.97 Min. : 1.249 Min. :14.42
## 1st Qu.: 90.80 1st Qu.: 2.741 1st Qu.:19.70
## Median :131.92 Median : 17.380 Median :22.24
## Mean :174.51 Mean : 2814.865 Mean :22.74
## 3rd Qu.:233.56 3rd Qu.: 2149.649 3rd Qu.:26.49
## Max. :437.33 Max. :14481.000 Max. :33.11
## Employment_In_Agriculture Employment_In_Services Self_Employment
## Min. : 0.3247 Min. :32.92 Min. :12.55
## 1st Qu.: 7.5040 1st Qu.:41.22 1st Qu.:21.30
## Median :21.9279 Median :50.51 Median :38.93
## Mean :22.1142 Mean :55.15 Mean :38.01
## 3rd Qu.:38.7549 3rd Qu.:64.92 3rd Qu.:55.61
## Max. :47.6099 Max. :85.26 Max. :65.47
## Population Labour_Force Youth_Unemployment
## Min. : 4114826 Min. : 2194671 Min. : 1.253
## 1st Qu.: 19082780 1st Qu.: 8102533 1st Qu.: 7.618
## Median : 48611883 Median : 26291618 Median :10.598
## Mean : 87233397 Mean : 43506686 Mean :10.879
## 3rd Qu.:107978936 3rd Qu.: 56318272 3rd Qu.:12.985
## Max. :273753191 Max. :136201932 Max. :26.379
# Count the NA entries in every column
missing_values <- colSums(is.na(data))
# Report only those columns that contain at least one NA
cat("Columns with missing data:\n")
## Columns with missing data:
print(missing_values[which(missing_values > 0)])
## named numeric(0)
# Columns that get standardised (the unemployment target plus the
# macro-economic indicators)
columns_to_scale <- c(
  "UnEmployment", "GDP", "Trade", "FDI_Inflows", "FDI_Outflows",
  "Migration", "Inflation", "Dollar_Exchange_Rate"
)
# Standardise those columns over all rows (scale() defaults: centre on the
# mean, divide by the standard deviation); other columns are copied as-is
scaled_values <- scale(data[columns_to_scale])
data_scaled <- data
data_scaled[columns_to_scale] <- scaled_values
# Distinct country names, in order of first appearance
unique_countries <- unique(data[["Country"]])
# Return the rows of the scaled dataset that belong to one country.
#   country: name matched against the Country column of data_scaled
getFilteredData <- function(country) {
  data_scaled %>% filter(Country == country)
}
# Plot unemployment trends
# Print the country name and draw its unemployment rate over time.
#   country: name matched against the Country column and used in the title
plot_employment_trend <- function(country) {
  # Use the raw rows rather than data_scaled: the y-axis is labelled
  # "Unemployment Rate", so it should show the rate itself and not the
  # standardised values produced by scale()
  graphData <- data %>% filter(Country == country)
  print(country)
  p <- ggplot(graphData, aes(x = Year, y = UnEmployment)) +
    geom_line() +
    labs(
      title = paste("Unemployment Trends Over Time in", country),
      x = "Year",
      y = "Unemployment Rate"
    )
  print(p)
}
# Draw the unemployment trend chart for every country in turn
for (nation in unique_countries) {
  plot_employment_trend(country = nation)
}
## [1] "Malaysia"
## [1] "Singapore"
## [1] "Indonesia"
## [1] "Thailand"
# Draw a stacked bar chart of employment in industry, agriculture and
# services per year for one country.
#   country: name matched against the Country column and used in the title
plot_employment_by_industry <- function(country) {
  cat("Country:", country, "\n")
  country_data <- data %>% filter(Country == country)
  # Keep the year plus the three sector columns
  employment_data <- country_data %>%
    select(Year, Employment_In_Industry, Employment_In_Agriculture, Employment_In_Services)
  # Reshape to one row per year and sector; pivot_longer() replaces the
  # superseded gather()
  employment_data_long <- tidyr::pivot_longer(
    employment_data,
    cols = -Year,
    names_to = "Industry",
    values_to = "Employment"
  )
  # Stacked bars: one bar per year, one fill colour per sector
  p <- ggplot(employment_data_long, aes(x = Year, y = Employment, fill = Industry)) +
    geom_col() +
    labs(
      title = paste("Employment by Industry -", country),
      x = "Year", y = "Employment"
    ) +
    theme_minimal()
  print(p)
}
# Chart every country; an error for one country is reported and the loop
# moves on to the next
for (nation in unique_countries) {
  tryCatch(
    plot_employment_by_industry(nation),
    error = function(err) {
      cat("Error for country", nation, ":", conditionMessage(err), "\n")
    }
  )
}
## Country: Malaysia
## Country: Singapore
## Country: Indonesia
## Country: Thailand
# Print the country name and draw its self-employment series as a line chart.
#   country: name matched against the Country column and used in the title
plot_self_employment_trend <- function(country) {
  cat("Country:", country, "\n")
  # Year and self-employment columns for the requested country only
  self_employment_data <- data %>%
    filter(Country == country) %>%
    select(Year, Self_Employment)
  chart <- ggplot(
    data = self_employment_data,
    mapping = aes(x = Year, y = Self_Employment)
  )
  chart <- chart + geom_line()
  chart <- chart + labs(
    title = paste("Self-Employment Trend -", country),
    x = "Year", y = "Self-Employment"
  )
  chart <- chart + theme_minimal()
  print(chart)
}
# Chart every country; an error for one country is reported and the loop
# moves on to the next
for (nation in unique_countries) {
  tryCatch(
    plot_self_employment_trend(nation),
    error = function(err) {
      cat("Error for country", nation, ":", conditionMessage(err), "\n")
    }
  )
}
## Country: Malaysia
## Country: Singapore
## Country: Indonesia
## Country: Thailand
# Compute correlation matrix
# Print the country name and the correlation matrix of unemployment and the
# macro-economic factors for that country, then draw it as a heatmap.
#   country: name passed to getFilteredData() and appended to the title
get_correlation_matrix <- function(country) {
  graphData <- getFilteredData(country)
  print(country)
  factor_columns <- c(
    "UnEmployment", "GDP", "Trade", "FDI_Inflows", "FDI_Outflows",
    "Migration", "Inflation", "Dollar_Exchange_Rate"
  )
  correlation_matrix <- cor(graphData[, factor_columns])
  print(correlation_matrix)
  # as.table() + as.data.frame() gives one row per cell with columns
  # Var1, Var2 and Freq (the correlation value)
  heatmap_data <- as.data.frame(as.table(correlation_matrix))
  p <- ggplot(data = heatmap_data, aes(x = Var1, y = Var2, fill = Freq)) +
    geom_tile() +
    scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
    labs(
      title = paste("Correlation Heatmap of Factors and Unemployment", country),
      x = "Factors",
      y = "Factors"
    ) +
    # Every layer must be chained with `+`; previously the chain broke after
    # the axis-text tweak, so the remaining theme calls were never applied.
    # The complete theme comes first because adding it later would reset
    # the element-level settings below.
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
      legend.position = "right",
      plot.title = element_text(hjust = 0.5)
    )
  print(p)
}
# Print and plot the correlation matrix for every country in turn
for (nation in unique_countries) {
  get_correlation_matrix(country = nation)
}
## [1] "Malaysia"
## UnEmployment GDP Trade FDI_Inflows
## UnEmployment 1.00000000 -0.7209383 -0.058615901 -0.283634817
## GDP -0.72093834 1.0000000 0.266091343 0.646733520
## Trade -0.05861590 0.2660913 1.000000000 0.003286075
## FDI_Inflows -0.28363482 0.6467335 0.003286075 1.000000000
## FDI_Outflows -0.50799041 0.4064109 -0.033823275 0.567009357
## Migration -0.05904085 0.2079650 0.957180185 -0.029282437
## Inflation -0.49630059 0.5563658 0.214721277 0.498880022
## Dollar_Exchange_Rate 0.56143216 -0.3126269 -0.245503690 -0.187254412
## FDI_Outflows Migration Inflation Dollar_Exchange_Rate
## UnEmployment -0.50799041 -0.05904085 -0.4963006 0.5614322
## GDP 0.40641095 0.20796496 0.5563658 -0.3126269
## Trade -0.03382328 0.95718019 0.2147213 -0.2455037
## FDI_Inflows 0.56700936 -0.02928244 0.4988800 -0.1872544
## FDI_Outflows 1.00000000 0.05758928 0.5310946 -0.7535614
## Migration 0.05758928 1.00000000 0.2358385 -0.2568324
## Inflation 0.53109455 0.23583852 1.0000000 -0.2987648
## Dollar_Exchange_Rate -0.75356136 -0.25683242 -0.2987648 1.0000000
## [1] "Singapore"
## UnEmployment GDP Trade FDI_Inflows
## UnEmployment 1.00000000 0.12109199 0.3634263 -0.5089832
## GDP 0.12109199 1.00000000 0.3869397 0.2901947
## Trade 0.36342625 0.38693969 1.0000000 -0.3759824
## FDI_Inflows -0.50898323 0.29019469 -0.3759824 1.0000000
## FDI_Outflows -0.35443654 0.01460203 -0.2147825 0.5487100
## Migration 0.01209435 0.25157930 0.6264947 -0.1978181
## Inflation -0.26854907 0.24248095 0.5299305 -0.2078929
## Dollar_Exchange_Rate 0.67580508 0.01710725 0.3073204 -0.3428536
## FDI_Outflows Migration Inflation Dollar_Exchange_Rate
## UnEmployment -0.35443654 0.01209435 -0.2685491 0.67580508
## GDP 0.01460203 0.25157930 0.2424810 0.01710725
## Trade -0.21478245 0.62649470 0.5299305 0.30732040
## FDI_Inflows 0.54871005 -0.19781809 -0.2078929 -0.34285359
## FDI_Outflows 1.00000000 0.19060401 -0.1888395 -0.12037788
## Migration 0.19060401 1.00000000 0.4347900 -0.12016148
## Inflation -0.18883953 0.43479001 1.0000000 -0.35746107
## Dollar_Exchange_Rate -0.12037788 -0.12016148 -0.3574611 1.00000000
## [1] "Indonesia"
## UnEmployment GDP Trade FDI_Inflows
## UnEmployment 1.0000000 0.2826872 0.7895883 -0.3049496
## GDP 0.2826872 1.0000000 0.3617690 0.1900522
## Trade 0.7895883 0.3617690 1.0000000 -0.4152607
## FDI_Inflows -0.3049496 0.1900522 -0.4152607 1.0000000
## FDI_Outflows 0.2567562 0.2372453 0.2761383 0.5758054
## Migration -0.9465180 -0.3719661 -0.7093818 0.2962782
## Inflation 0.7132115 0.2797169 0.8455061 -0.3378844
## Dollar_Exchange_Rate -0.7696329 -0.4061431 -0.7186374 0.3091353
## FDI_Outflows Migration Inflation Dollar_Exchange_Rate
## UnEmployment 0.2567562 -0.9465180 0.7132115 -0.7696329
## GDP 0.2372453 -0.3719661 0.2797169 -0.4061431
## Trade 0.2761383 -0.7093818 0.8455061 -0.7186374
## FDI_Inflows 0.5758054 0.2962782 -0.3378844 0.3091353
## FDI_Outflows 1.0000000 -0.2152340 0.1629397 -0.1849486
## Migration -0.2152340 1.0000000 -0.6490843 0.7926288
## Inflation 0.1629397 -0.6490843 1.0000000 -0.5835666
## Dollar_Exchange_Rate -0.1849486 0.7926288 -0.5835666 1.0000000
## [1] "Thailand"
## UnEmployment GDP Trade FDI_Inflows
## UnEmployment 1.000000000 0.08949185 -0.22520029 0.3058054
## GDP 0.089491846 1.00000000 0.39359833 0.7121966
## Trade -0.225200286 0.39359833 1.00000000 0.4936688
## FDI_Inflows 0.305805395 0.71219660 0.49366876 1.0000000
## FDI_Outflows -0.606509715 -0.36730847 -0.27539159 -0.4353973
## Migration 0.658130679 0.38203130 0.22838751 0.3764940
## Inflation 0.008600351 0.38148076 0.79239992 0.5333555
## Dollar_Exchange_Rate 0.837469797 0.38221861 -0.09179996 0.4242800
## FDI_Outflows Migration Inflation Dollar_Exchange_Rate
## UnEmployment -0.6065097 0.6581307 0.008600351 0.83746980
## GDP -0.3673085 0.3820313 0.381480759 0.38221861
## Trade -0.2753916 0.2283875 0.792399918 -0.09179996
## FDI_Inflows -0.4353973 0.3764940 0.533355504 0.42428000
## FDI_Outflows 1.0000000 -0.6756076 -0.383583475 -0.70584003
## Migration -0.6756076 1.0000000 0.256603886 0.71585041
## Inflation -0.3835835 0.2566039 1.000000000 0.06453896
## Dollar_Exchange_Rate -0.7058400 0.7158504 0.064538957 1.00000000
# Variables shown in the pair plots
selected_columns <- c(
  "UnEmployment", "GDP", "Trade", "FDI_Inflows", "FDI_Outflows",
  "Migration", "Inflation", "Dollar_Exchange_Rate"
)
# Print the country name and draw a GGally pair plot of the selected
# variables for that country.
#   country: name passed to getFilteredData()
plot_pairplot <- function(country) {
  country_rows <- getFilteredData(country)
  print(country)
  pair_plot <- ggpairs(
    data = country_rows,
    columns = selected_columns,
    title = "Pair Plot of Selected Variables"
  )
  print(pair_plot)
}
# Draw the pair plot for every country in turn
for (nation in unique_countries) {
  plot_pairplot(country = nation)
}
## [1] "Malaysia"
## [1] "Singapore"
## [1] "Indonesia"
## [1] "Thailand"
# Explanatory variables that are each charted against unemployment
factors_of_interest <- c(
  "GDP", "Trade", "FDI_Inflows", "FDI_Outflows",
  "Migration", "Inflation", "Dollar_Exchange_Rate"
)
# Empty list that can hold one plot per factor
plots_list <- vector("list", length = 0L)
# For one country, chart scaled unemployment against each factor in
# factors_of_interest over time (one chart per factor) and print the charts.
#   country: name passed to getFilteredData() and used in the titles
plot_againt_unemployment <- function(country) {
  graphData <- getFilteredData(country)
  print(country)

  # Build the chart for a single factor. The factor name is injected with
  # `!!` so each plot keeps its own column and legend key even though
  # ggplot evaluates aesthetics only when the plot is rendered.
  build_factor_plot <- function(factor_name) {
    series <- c("Unemployment", factor_name)
    series_colours <- setNames(c("red", "blue"), series)
    series_linetypes <- setNames(c("solid", "dashed"), series)
    ggplot(graphData, aes(x = Year)) +
      geom_line(aes(
        y = UnEmployment,
        colour = "Unemployment",
        linetype = "Unemployment"
      )) +
      geom_line(aes(
        y = !!rlang::sym(factor_name),
        colour = !!factor_name,
        linetype = !!factor_name
      )) +
      # Colour and line type are mapped (not fixed) so that a legend is
      # drawn and the two series can be told apart; identical names and
      # keys let ggplot merge both scales into one legend
      scale_colour_manual(name = NULL, values = series_colours) +
      scale_linetype_manual(name = NULL, values = series_linetypes) +
      labs(
        # paste0() because the literals already carry their own spaces
        title = paste0(country, " Unemployment Rate and ", factor_name, " Over Time"),
        x = "Year",
        y = "Scaled Value"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(hjust = 0.5),
        legend.position = "bottom",
        panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(color = "black")
      )
  }

  plots_list <- lapply(factors_of_interest, build_factor_plot)
  names(plots_list) <- factors_of_interest
  # Printing the named list renders every chart under its factor name
  print(plots_list)
}
# Draw the factor-versus-unemployment charts for every country in turn
for (nation in unique_countries) {
  plot_againt_unemployment(country = nation)
}
## [1] "Malaysia"
## $GDP
##
## $Trade
##
## $FDI_Inflows
##
## $FDI_Outflows
##
## $Migration
##
## $Inflation
##
## $Dollar_Exchange_Rate
##
## [1] "Singapore"
## $GDP
##
## $Trade
##
## $FDI_Inflows
##
## $FDI_Outflows
##
## $Migration
##
## $Inflation
##
## $Dollar_Exchange_Rate
##
## [1] "Indonesia"
## $GDP
##
## $Trade
##
## $FDI_Inflows
##
## $FDI_Outflows
##
## $Migration
##
## $Inflation
##
## $Dollar_Exchange_Rate
##
## [1] "Thailand"
## $GDP
##
## $Trade
##
## $FDI_Inflows
##
## $FDI_Outflows
##
## $Migration
##
## $Inflation
##
## $Dollar_Exchange_Rate
# Unscaled copy of the dataset used by the linear regression section
mdata <- read.csv(file = "Employment_Dataset.csv")
################ Linear regression model ################
# Fit a linear regression of UnEmployment on all other retained columns for
# one country, evaluate it on a hold-out sample and print the results.
#   country: name matched against the Country column of mdata
linear_models <- function(country) {
  cat("Country:", country, "\n")
  country_data <- mdata %>%
    filter(Country == country) %>%
    # Only Country is dropped here; Year stays and acts as a predictor
    select(-Country) %>%
    # Columns with a single distinct value cannot inform the fit
    select(where(~ n_distinct(.x) > 1))
  # Fixed seed so the 80/20 partition is repeatable
  set.seed(42)
  splitIndex <- createDataPartition(country_data$UnEmployment, p = .80, list = FALSE)
  train_data <- country_data[splitIndex, ]
  test_data <- country_data[-splitIndex, ]
  # Fit on the training rows, predict the held-out rows
  lr_model <- lm(UnEmployment ~ ., data = train_data)
  predictions <- predict(lr_model, test_data)
  # RMSE is measured on the held-out rows
  rmse <- sqrt(mean((predictions - test_data$UnEmployment)^2))
  # NOTE(review): this R² comes from the fit on the training rows, so it is
  # not directly comparable with the hold-out RMSE above - confirm that
  # mixing the two is intended
  r2 <- summary(lr_model)$r.squared
  print(paste("RMSE:", rmse))
  print(paste("R²:", r2))
  cat("\n")
  # Coefficient table and fit statistics
  cat("Linear regression model:\n")
  print(summary(lr_model))
  cat("---------------------------------------------------\n")
}
# Fit and report the linear regression for every country in turn
for (nation in unique_countries) {
  linear_models(country = nation)
}
## Country: Malaysia
## [1] "RMSE: 0.297698716605363"
## [1] "R²: 0.999331422026865"
##
## Linear regression model:
##
## Call:
## lm(formula = UnEmployment ~ ., data = train_data)
##
## Residuals:
## 1 2 3 4 6 7 8
## 0.0023042 0.0047021 0.0026684 -0.0163718 -0.0047621 0.0200403 -0.0092978
## 11 13 14 15 16 17 18
## 0.0008147 0.0100383 -0.0077662 -0.0119501 -0.0062913 0.0208227 0.0001181
## 19 20 21
## -0.0041081 0.0002419 -0.0012033
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.174e+04 1.179e+05 -0.354 0.783
## Year -6.033e-01 5.078e-01 -1.188 0.445
## GDP -1.102e-01 1.880e-02 -5.864 0.108
## Inflation 4.002e-02 2.249e-02 1.779 0.326
## Migration 2.364e-06 2.733e-06 0.865 0.546
## FDI_Inflows 6.375e-02 4.448e-02 1.433 0.388
## FDI_Outflows 4.016e-02 1.165e-01 0.345 0.789
## Trade 1.014e-02 1.355e-02 0.749 0.591
## Dollar_Exchange_Rate -2.040e-01 2.382e-01 -0.857 0.549
## Employment_In_Industry 4.292e+02 1.182e+03 0.363 0.778
## Employment_In_Agriculture 4.290e+02 1.182e+03 0.363 0.778
## Employment_In_Services 4.294e+02 1.183e+03 0.363 0.778
## Self_Employment -9.237e-02 3.881e-02 -2.380 0.253
## Population 2.623e-07 4.889e-07 0.537 0.686
## Labour_Force 9.847e-07 6.841e-07 1.439 0.387
## Youth_Unemployment 4.858e-01 1.001e-01 4.854 0.129
##
## Residual standard error: 0.04012 on 1 degrees of freedom
## Multiple R-squared: 0.9993, Adjusted R-squared: 0.9893
## F-statistic: 99.65 on 15 and 1 DF, p-value: 0.07847
##
## ---------------------------------------------------
## Country: Singapore
## [1] "RMSE: 0.931830315189885"
## [1] "R²: 0.989811910302992"
##
## Linear regression model:
##
## Call:
## lm(formula = UnEmployment ~ ., data = train_data)
##
## Residuals:
## 1 2 3 5 6 7 8 9
## -0.020015 0.068549 -0.020348 -0.033207 0.121053 -0.178033 0.009088 0.063678
## 10 12 13 15 17 18 19 20
## -0.045584 -0.040575 0.130056 -0.067351 0.055698 -0.111036 0.119893 -0.109380
## 21
## 0.057515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.659e+06 1.638e+06 1.013 0.496
## Year 8.510e-01 4.936e-01 1.724 0.335
## GDP -1.920e-02 3.843e-02 -0.500 0.705
## Inflation -1.726e-01 1.921e-01 -0.898 0.534
## Migration -1.878e-05 1.366e-05 -1.375 0.400
## FDI_Inflows 1.687e-02 5.034e-02 0.335 0.794
## FDI_Outflows -1.118e-02 4.413e-02 -0.253 0.842
## Trade 1.939e-02 2.012e-02 0.964 0.512
## Dollar_Exchange_Rate 9.530e+00 7.646e+00 1.246 0.430
## Employment_In_Industry -1.661e+04 1.639e+04 -1.013 0.496
## Employment_In_Agriculture -1.660e+04 1.638e+04 -1.013 0.496
## Employment_In_Services -1.661e+04 1.639e+04 -1.013 0.496
## Self_Employment 1.050e+00 6.075e-01 1.729 0.334
## Population -5.346e-06 4.903e-06 -1.090 0.472
## Labour_Force 7.687e-06 6.393e-06 1.203 0.442
## Youth_Unemployment 2.404e-01 2.188e-01 1.099 0.470
##
## Residual standard error: 0.3569 on 1 degrees of freedom
## Multiple R-squared: 0.9898, Adjusted R-squared: 0.837
## F-statistic: 6.477 on 15 and 1 DF, p-value: 0.3001
##
## ---------------------------------------------------
## Country: Indonesia
## [1] "RMSE: 1.40561178355356"
## [1] "R²: 0.999584296799524"
##
## Linear regression model:
##
## Call:
## lm(formula = UnEmployment ~ ., data = train_data)
##
## Residuals:
## 1 2 4 5 7 8 9 10
## -0.033308 0.030935 0.040429 -0.018176 -0.036158 0.038491 -0.041968 0.016267
## 11 12 14 15 16 18 19 20
## -0.020531 0.041570 -0.036695 0.002557 0.010944 0.030779 -0.035627 0.009344
## 21
## 0.001149
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.994e+06 3.262e+05 -6.113 0.1032
## Year 3.976e+00 5.739e-01 6.928 0.0913 .
## GDP 3.086e-01 7.013e-02 4.400 0.1423
## Inflation -8.378e-03 5.010e-02 -0.167 0.8945
## Migration 5.732e-06 1.026e-05 0.559 0.6756
## FDI_Inflows -2.722e-01 1.437e-01 -1.895 0.3091
## FDI_Outflows 8.048e-02 1.254e-01 0.642 0.6368
## Trade -6.272e-02 2.221e-02 -2.824 0.2167
## Dollar_Exchange_Rate 3.969e-04 1.631e-04 2.434 0.2482
## Employment_In_Industry 1.987e+04 3.256e+03 6.102 0.1034
## Employment_In_Agriculture 1.987e+04 3.256e+03 6.102 0.1034
## Employment_In_Services 1.987e+04 3.256e+03 6.102 0.1034
## Self_Employment -3.767e-01 1.327e-01 -2.839 0.2156
## Population -1.610e-06 2.519e-07 -6.393 0.0988 .
## Labour_Force 6.233e-07 1.137e-07 5.480 0.1149
## Youth_Unemployment 3.651e-01 5.018e-02 7.275 0.0870 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1215 on 1 degrees of freedom
## Multiple R-squared: 0.9996, Adjusted R-squared: 0.9933
## F-statistic: 160.3 on 15 and 1 DF, p-value: 0.06191
##
## ---------------------------------------------------
## Country: Thailand
## [1] "RMSE: 0.226690747694035"
## [1] "R²: 0.998658412659223"
##
## Linear regression model:
##
## Call:
## lm(formula = UnEmployment ~ ., data = train_data)
##
## Residuals:
## 1 2 4 5 6 8 9 10
## 0.030806 -0.020634 -0.004023 -0.004164 -0.024489 0.015084 -0.017998 0.026284
## 11 13 14 15 16 17 19 20
## 0.001596 -0.014367 -0.003321 0.020852 0.034665 -0.043432 0.002017 0.002534
## 21
## -0.001408
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.857e+05 4.419e+05 -0.420 0.747
## Year 1.616e-01 1.285e-01 1.258 0.428
## GDP -1.951e-02 4.606e-02 -0.424 0.745
## Inflation -9.795e-02 1.177e-01 -0.832 0.558
## Migration 9.073e-07 8.639e-06 0.105 0.933
## FDI_Inflows -3.471e-02 1.593e-01 -0.218 0.863
## FDI_Outflows 7.253e-02 2.471e-01 0.293 0.818
## Trade 2.810e-02 2.662e-02 1.056 0.483
## Dollar_Exchange_Rate -5.422e-02 1.112e-01 -0.488 0.711
## Employment_In_Industry 1.854e+03 4.420e+03 0.420 0.747
## Employment_In_Agriculture 1.854e+03 4.420e+03 0.420 0.747
## Employment_In_Services 1.854e+03 4.420e+03 0.420 0.747
## Self_Employment -1.189e-01 1.284e-01 -0.926 0.524
## Population -6.420e-07 5.893e-07 -1.089 0.473
## Labour_Force -1.930e-07 2.702e-07 -0.714 0.605
## Youth_Unemployment 1.795e-01 5.801e-02 3.095 0.199
##
## Residual standard error: 0.0837 on 1 degrees of freedom
## Multiple R-squared: 0.9987, Adjusted R-squared: 0.9785
## F-statistic: 49.63 on 15 and 1 DF, p-value: 0.111
##
## ---------------------------------------------------
# Unscaled copy of the dataset used by the random forest section
rdata <- read.csv(file = "Employment_Dataset.csv")
################ Random forest model ################
# Fit a random forest regression of UnEmployment for one country, evaluate
# it on a hold-out sample and print the metrics, the model summary and the
# hold-out predictions.
#   country: name matched against the Country column of rdata
random_forest_models <- function(country) {
  # Rows for this country without the Year and Country columns
  country_data <- rdata %>%
    filter(Country == country) %>%
    select(-Year, -Country)
  # Fixed seed so the 80/20 partition (and the forest) is repeatable
  set.seed(42)
  splitIndex <- createDataPartition(country_data$UnEmployment, p = .80, list = FALSE)
  train_data <- country_data[splitIndex, ]
  test_data <- country_data[-splitIndex, ]
  # Fit on the training rows, predict the held-out rows
  rf_model <- randomForest(UnEmployment ~ ., data = train_data, ntree = 500)
  rm_predictions <- predict(rf_model, test_data)
  # Both metrics are computed on the held-out rows; R² is
  # 1 - (residual sum of squares) / (total sum of squares)
  residuals_test <- rm_predictions - test_data$UnEmployment
  rmse <- sqrt(mean(residuals_test^2))
  r2 <- 1 - sum(residuals_test^2) /
    sum((test_data$UnEmployment - mean(test_data$UnEmployment))^2)
  print(paste("RMSE:", rmse))
  print(paste("R²:", r2))
  cat("\n")
  # summary() of the fitted object lists its components
  cat("Random Forest model:\n")
  print(summary(rf_model))
  # Hold-out predictions, named by the row index within the country's data
  cat("Predicted Unemployment Values:\n")
  print(rm_predictions)
}
# Fit and report the random forest for every country in turn
for (nation in unique_countries) {
  random_forest_models(country = nation)
}
## [1] "RMSE: 0.185353668723145"
## [1] "R²: 0.205642022910306"
##
## Random Forest model:
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 17 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 17 -none- numeric
## importance 14 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 17 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
## Predicted Unemployment Values:
## 5 9 10 12
## 3.411901 3.385876 3.176018 3.150444
## [1] "RMSE: 0.476955709021393"
## [1] "R²: 0.683064011399075"
##
## Random Forest model:
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 17 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 17 -none- numeric
## importance 14 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 17 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
## Predicted Unemployment Values:
## 4 11 14 16
## 4.929990 4.084264 3.820721 3.886161
## [1] "RMSE: 0.284429665884742"
## [1] "R²: 0.967063098579755"
##
## Random Forest model:
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 17 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 17 -none- numeric
## importance 14 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 17 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
## Predicted Unemployment Values:
## 3 6 13 17
## 6.706036 7.251022 4.401768 4.257785
## [1] "RMSE: 0.159957766288499"
## [1] "R²: 0.814615862004212"
##
## Random Forest model:
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 17 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 17 -none- numeric
## importance 14 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 17 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
## Predicted Unemployment Values:
## 3 7 12 18
## 1.7556431 1.2077277 0.8132156 0.7438152
################ ARIMA model ################
# Unscaled copy of the dataset used by the ARIMA section
adata <- read.csv(file = "Employment_Dataset.csv")
# Fit an automatically selected ARIMA model to one country's unemployment
# series, forecast the held-out final years and print the RMSE and summary.
#   country: name matched against the Country column of adata
arima_models <- function(country) {
  # This country's rows without the Country column, ordered by year and
  # with incomplete rows removed
  country_data <- adata %>%
    filter(Country == country) %>%
    select(-Country) %>%
    arrange(Year) %>%
    na.omit()
  # Years after the cut-off (the last three calendar years) form the test set
  split_year <- max(country_data$Year) - 3
  train_data <- country_data %>% filter(Year <= split_year)
  test_data <- country_data %>% filter(Year > split_year)
  # Annual series of the training-period unemployment values
  first_year <- min(train_data$Year)
  last_year <- max(train_data$Year)
  train_ts <- ts(train_data$UnEmployment, start = first_year, end = last_year, frequency = 1)
  # The series keeps the name train_ts because it appears in the printed summary
  arima_model <- auto.arima(train_ts)
  # Forecast exactly as many steps as there are test rows
  test_years <- length(test_data$Year)
  forecasted_values <- forecast(arima_model, h = test_years)
  # Root mean square error of the point forecasts against the test values
  forecast_errors <- forecasted_values$mean - test_data$UnEmployment
  rmse <- sqrt(mean(forecast_errors^2))
  print(paste("RMSE:", rmse))
  cat("Arima model:\n")
  print(summary(arima_model))
  cat("---------------------------------------------------\n")
}
# Fit and report the ARIMA model for every country in turn
for (nation in unique_countries) {
  arima_models(country = nation)
}
## [1] "RMSE: 0.835806197631963"
## Arima model:
## Series: train_ts
## ARIMA(0,1,0)
##
## sigma^2 = 0.03825: log likelihood = 3.62
## AIC=-5.24 AICc=-4.97 BIC=-4.4
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.01258167 0.1900749 0.1496406 -0.5368283 4.558351 0.9456838
## ACF1
## Training set 0.003483444
## ---------------------------------------------------
## [1] "RMSE: 0.413748313188908"
## Arima model:
## Series: train_ts
## ARIMA(0,1,0)
##
## sigma^2 = 0.7329: log likelihood = -21.48
## AIC=44.96 AICc=45.23 BIC=45.8
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.006402222 0.8320021 0.5323756 -1.656461 11.33675 0.9448152
## ACF1
## Training set -0.1118996
## ---------------------------------------------------
## [1] "RMSE: 0.569561234635925"
## Arima model:
## Series: train_ts
## ARIMA(0,1,0)
##
## sigma^2 = 0.3154: log likelihood = -14.31
## AIC=30.63 AICc=30.89 BIC=31.46
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.09355111 0.5457481 0.4764489 -2.260104 8.496725 0.9451145
## ACF1
## Training set 0.2099203
## ---------------------------------------------------
## [1] "RMSE: 0.231433215708838"
## Arima model:
## Series: train_ts
## ARIMA(0,1,0)
##
## sigma^2 = 0.1084: log likelihood = -5.23
## AIC=12.47 AICc=12.73 BIC=13.3
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1015222 0.3199225 0.2051444 -14.76246 27.77432 0.9451099
## ACF1
## Training set -0.1883341
## ---------------------------------------------------