Data Collection

We collected datasets from the World Bank database for four countries, encompassing population statistics, workforce data, employment sectors, unemployment rates, and economic factors.

# Install and load necessary packages
library(GGally)

## Loading required package: ggplot2

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(caret)

## Loading required package: lattice

library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ randomForest::combine() masks dplyr::combine()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::lift()           masks caret::lift()
## ✖ randomForest::margin()  masks ggplot2::margin()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

# Load data
data <- read.csv('Employment_Dataset.csv')

check_data_info <- function(data_frame) {
  if (!is.data.frame(data_frame)) {
    stop("Input is not a data frame.")
  }
  
  library(data.table)
  
  # Extract column names and data types
  col_names <- names(data_frame)
  data_types <- sapply(data_frame, class)
  
  # Create a data.table for better formatting
  info_table <- data.table(Column_Name = col_names, Data_Type = data_types)
  
  # Display result in a table format
  print(knitr::kable(info_table))
  
  return(info_table)
}


result_table <- check_data_info(data)

## 
## 
## |Column_Name               |Data_Type |
## |:-------------------------|:---------|
## |Year                      |integer   |
## |Country                   |character |
## |UnEmployment              |numeric   |
## |GDP                       |numeric   |
## |Inflation                 |numeric   |
## |Migration                 |integer   |
## |FDI_Inflows               |numeric   |
## |FDI_Outflows              |numeric   |
## |Trade                     |numeric   |
## |Dollar_Exchange_Rate      |numeric   |
## |Employment_In_Industry    |numeric   |
## |Employment_In_Agriculture |numeric   |
## |Employment_In_Services    |numeric   |
## |Self_Employment           |numeric   |
## |Population                |integer   |
## |Labour_Force              |integer   |
## |Youth_Unemployment        |numeric   |

# View structure
str(data)

## 'data.frame':    84 obs. of  17 variables:
##  $ Year                     : int  2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 ...
##  $ Country                  : chr  "Malaysia" "Malaysia" "Malaysia" "Malaysia" ...
##  $ UnEmployment             : num  3.53 3.48 3.61 3.54 3.53 3.32 3.23 3.32 3.66 3.39 ...
##  $ GDP                      : num  0.518 5.391 5.788 6.783 5.332 ...
##  $ Inflation                : num  1.42 1.81 1.09 1.42 2.98 ...
##  $ Migration                : int  198361 224999 226003 244359 245376 251101 247032 217538 194571 114491 ...
##  $ FDI_Inflows              : num  0.597 3.166 2.921 3.508 2.734 ...
##  $ FDI_Outflows             : num  0.288 1.878 1.919 1.453 2.042 ...
##  $ Trade                    : num  203 199 194 210 204 ...
##  $ Dollar_Exchange_Rate     : num  3.8 3.8 3.8 3.8 3.79 ...
##  $ Employment_In_Industry   : num  33.1 32 32 30.1 29.7 ...
##  $ Employment_In_Agriculture: num  15.1 14.9 14.3 14.6 14.6 ...
##  $ Employment_In_Services   : num  51.8 53.1 53.7 55.3 55.6 ...
##  $ Self_Employment          : num  24.6 23.3 23.8 25.4 24.5 ...
##  $ Population               : int  23542517 24142445 24739411 25333247 25923536 26509413 27092604 27664296 28217204 28717731 ...
##  $ Labour_Force             : int  9627564 9908787 10200556 10504004 10820941 11150813 11495136 11759089 12181055 12521566 ...
##  $ Youth_Unemployment       : num  11.2 11.2 11.6 11.5 11.4 ...

# Summary statistics
summary(data)

##       Year        Country           UnEmployment        GDP        
##  Min.   :2001   Length:84          Min.   :0.250   Min.   :-6.067  
##  1st Qu.:2006   Class :character   1st Qu.:2.810   1st Qu.: 3.442  
##  Median :2011   Mode  :character   Median :3.626   Median : 4.837  
##  Mean   :2011                      Mean   :3.580   Mean   : 4.311  
##  3rd Qu.:2016                      3rd Qu.:4.353   3rd Qu.: 5.861  
##  Max.   :2021                      Max.   :8.060   Max.   :14.520  
##    Inflation         Migration        FDI_Inflows      FDI_Outflows    
##  Min.   :-1.1387   Min.   :-108492   Min.   :-1.856   Min.   :-1.2442  
##  1st Qu.: 0.9434   1st Qu.:  -2898   1st Qu.: 1.971   1st Qu.: 0.8123  
##  Median : 2.1050   Median :  58896   Median : 3.040   Median : 2.0950  
##  Mean   : 2.9307   Mean   :  53757   Mean   : 6.943   Mean   : 4.5964  
##  3rd Qu.: 4.3062   3rd Qu.: 103874   3rd Qu.: 5.734   3rd Qu.: 6.0274  
##  Max.   :13.1087   Max.   : 251101   Max.   :32.691   Max.   :22.5941  
##      Trade        Dollar_Exchange_Rate Employment_In_Industry
##  Min.   : 32.97   Min.   :    1.249    Min.   :14.42         
##  1st Qu.: 90.80   1st Qu.:    2.741    1st Qu.:19.70         
##  Median :131.92   Median :   17.380    Median :22.24         
##  Mean   :174.51   Mean   : 2814.865    Mean   :22.74         
##  3rd Qu.:233.56   3rd Qu.: 2149.649    3rd Qu.:26.49         
##  Max.   :437.33   Max.   :14481.000    Max.   :33.11         
##  Employment_In_Agriculture Employment_In_Services Self_Employment
##  Min.   : 0.3247           Min.   :32.92          Min.   :12.55  
##  1st Qu.: 7.5040           1st Qu.:41.22          1st Qu.:21.30  
##  Median :21.9279           Median :50.51          Median :38.93  
##  Mean   :22.1142           Mean   :55.15          Mean   :38.01  
##  3rd Qu.:38.7549           3rd Qu.:64.92          3rd Qu.:55.61  
##  Max.   :47.6099           Max.   :85.26          Max.   :65.47  
##    Population         Labour_Force       Youth_Unemployment
##  Min.   :  4114826   Min.   :  2194671   Min.   : 1.253    
##  1st Qu.: 19082780   1st Qu.:  8102533   1st Qu.: 7.618    
##  Median : 48611883   Median : 26291618   Median :10.598    
##  Mean   : 87233397   Mean   : 43506686   Mean   :10.879    
##  3rd Qu.:107978936   3rd Qu.: 56318272   3rd Qu.:12.985    
##  Max.   :273753191   Max.   :136201932   Max.   :26.379

# Check for missing data
missing_values <- colSums(is.na(data))

# Display columns with missing data
cat("Columns with missing data:\n")

## Columns with missing data:

print(missing_values[missing_values > 0])

## named numeric(0)

# Extract the columns that need  to be scaled
columns_to_scale <- c("UnEmployment", "GDP", "Trade", "FDI_Inflows", "FDI_Outflows", "Migration","Inflation", "Dollar_Exchange_Rate")

# Scale the selected columns
data_scaled <- data
data_scaled[columns_to_scale] <- scale(data[columns_to_scale])

# selecting all the country list from the dataset
unique_countries <- unique(data$Country)

getFilteredData <- function(country){
   country_data <- filter(data_scaled, Country == country)
   return (country_data)
}

# Plot unemployment trends
# Function to plot unemployment trends
plot_employment_trend <- function(country) {
 
 graphData <- getFilteredData(country)
 print(country)
 p <- ggplot(graphData, aes(x = Year, y = UnEmployment)) +
    geom_line() +
    labs(title = paste("Unemployment Trends Over Time in", country),
         x = "Year",
         y = "Unemployment Rate")
 print(p)
}

# Loop through each unique country
for (country in unique_countries) {
  plot_employment_trend(country)
}

## [1] "Malaysia"

## [1] "Singapore"

## [1] "Indonesia"

## [1] "Thailand"

# Function to plot bar chart for employment by industry
plot_employment_by_industry <- function(country) {
  cat("Country:", country, "\n")
  country_data <- data %>% filter(Country == country)

  # Extract relevant columns for employment in different industries
  employment_data <- country_data %>% 
    select(Year, Employment_In_Industry, Employment_In_Agriculture, Employment_In_Services)

   # Convert data to long format for ggplot
  employment_data_long <- tidyr::gather(employment_data, key = "Industry", value = "Employment", -Year)
  
  # Plot line graph using ggplot2
 p <- ggplot(employment_data_long, aes(x = Year, y = Employment, fill = Industry)) +
  geom_bar(stat = "identity") +
  labs(title = paste("Employment by Industry -", country),
       x = "Year", y = "Employment") +
  theme_minimal()
  print(p)
}

# Loop through unique countries and call the plot_employment_by_industry function
for (country in unique_countries) {
  tryCatch({
    plot_employment_by_industry(country)
  }, error = function(e) {
    cat("Error for country", country, ":", conditionMessage(e), "\n")
  })
}

## Country: Malaysia

## Country: Singapore

## Country: Indonesia

## Country: Thailand

# Function to plot line graph for self-employment trends by country
plot_self_employment_trend <- function(country) {
  cat("Country:", country, "\n")
  country_data <- data %>% filter(Country == country)

  # Extract relevant columns for self-employment
  self_employment_data <- country_data %>% 
    select(Year, Self_Employment)

  # Plot line graph for self-employment
  p <- ggplot(self_employment_data, aes(x = Year, y = Self_Employment)) +
    geom_line() +
    labs(title = paste("Self-Employment Trend -", country),
         x = "Year", y = "Self-Employment") +
    theme_minimal()
  
  print(p)
}

# Loop through unique countries and call the plot_self_employment_trend function
for (country in unique_countries) {
  tryCatch({
    plot_self_employment_trend(country)
  }, error = function(e) {
    cat("Error for country", country, ":", conditionMessage(e), "\n")
  })
}

## Country: Malaysia

## Country: Singapore

## Country: Indonesia

## Country: Thailand

# Compute correlation matrix
get_correlation_matrix <- function(country){
  
  graphData <- getFilteredData(country)
  print(country)
  
  correlation_data <- graphData[, c("UnEmployment", "GDP", "Trade", "FDI_Inflows", "FDI_Outflows", "Migration","Inflation", "Dollar_Exchange_Rate")]
  
  correlation_matrix <- cor(correlation_data)
  print(correlation_matrix)

# Plot heatmap with larger dimensions
p <- ggplot(data = as.data.frame(as.table(correlation_matrix)),
       aes(x = Var1, y = Var2, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  labs(title = paste("Correlation Heatmap of Factors and Unemployment", country),
       x = "Factors",
       y = "Factors") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
  theme_minimal() +
  theme(legend.position = "right") +
  theme(plot.title = element_text(hjust = 0.5))

print(p)

}

# Loop through each unique country
for (country in unique_countries) {
  get_correlation_matrix(country)
}

## [1] "Malaysia"
##                      UnEmployment        GDP        Trade  FDI_Inflows
## UnEmployment           1.00000000 -0.7209383 -0.058615901 -0.283634817
## GDP                   -0.72093834  1.0000000  0.266091343  0.646733520
## Trade                 -0.05861590  0.2660913  1.000000000  0.003286075
## FDI_Inflows           -0.28363482  0.6467335  0.003286075  1.000000000
## FDI_Outflows          -0.50799041  0.4064109 -0.033823275  0.567009357
## Migration             -0.05904085  0.2079650  0.957180185 -0.029282437
## Inflation             -0.49630059  0.5563658  0.214721277  0.498880022
## Dollar_Exchange_Rate   0.56143216 -0.3126269 -0.245503690 -0.187254412
##                      FDI_Outflows   Migration  Inflation Dollar_Exchange_Rate
## UnEmployment          -0.50799041 -0.05904085 -0.4963006            0.5614322
## GDP                    0.40641095  0.20796496  0.5563658           -0.3126269
## Trade                 -0.03382328  0.95718019  0.2147213           -0.2455037
## FDI_Inflows            0.56700936 -0.02928244  0.4988800           -0.1872544
## FDI_Outflows           1.00000000  0.05758928  0.5310946           -0.7535614
## Migration              0.05758928  1.00000000  0.2358385           -0.2568324
## Inflation              0.53109455  0.23583852  1.0000000           -0.2987648
## Dollar_Exchange_Rate  -0.75356136 -0.25683242 -0.2987648            1.0000000

## [1] "Singapore"
##                      UnEmployment        GDP      Trade FDI_Inflows
## UnEmployment           1.00000000 0.12109199  0.3634263  -0.5089832
## GDP                    0.12109199 1.00000000  0.3869397   0.2901947
## Trade                  0.36342625 0.38693969  1.0000000  -0.3759824
## FDI_Inflows           -0.50898323 0.29019469 -0.3759824   1.0000000
## FDI_Outflows          -0.35443654 0.01460203 -0.2147825   0.5487100
## Migration              0.01209435 0.25157930  0.6264947  -0.1978181
## Inflation             -0.26854907 0.24248095  0.5299305  -0.2078929
## Dollar_Exchange_Rate   0.67580508 0.01710725  0.3073204  -0.3428536
##                      FDI_Outflows   Migration  Inflation Dollar_Exchange_Rate
## UnEmployment          -0.35443654  0.01209435 -0.2685491           0.67580508
## GDP                    0.01460203  0.25157930  0.2424810           0.01710725
## Trade                 -0.21478245  0.62649470  0.5299305           0.30732040
## FDI_Inflows            0.54871005 -0.19781809 -0.2078929          -0.34285359
## FDI_Outflows           1.00000000  0.19060401 -0.1888395          -0.12037788
## Migration              0.19060401  1.00000000  0.4347900          -0.12016148
## Inflation             -0.18883953  0.43479001  1.0000000          -0.35746107
## Dollar_Exchange_Rate  -0.12037788 -0.12016148 -0.3574611           1.00000000

## [1] "Indonesia"
##                      UnEmployment        GDP      Trade FDI_Inflows
## UnEmployment            1.0000000  0.2826872  0.7895883  -0.3049496
## GDP                     0.2826872  1.0000000  0.3617690   0.1900522
## Trade                   0.7895883  0.3617690  1.0000000  -0.4152607
## FDI_Inflows            -0.3049496  0.1900522 -0.4152607   1.0000000
## FDI_Outflows            0.2567562  0.2372453  0.2761383   0.5758054
## Migration              -0.9465180 -0.3719661 -0.7093818   0.2962782
## Inflation               0.7132115  0.2797169  0.8455061  -0.3378844
## Dollar_Exchange_Rate   -0.7696329 -0.4061431 -0.7186374   0.3091353
##                      FDI_Outflows  Migration  Inflation Dollar_Exchange_Rate
## UnEmployment            0.2567562 -0.9465180  0.7132115           -0.7696329
## GDP                     0.2372453 -0.3719661  0.2797169           -0.4061431
## Trade                   0.2761383 -0.7093818  0.8455061           -0.7186374
## FDI_Inflows             0.5758054  0.2962782 -0.3378844            0.3091353
## FDI_Outflows            1.0000000 -0.2152340  0.1629397           -0.1849486
## Migration              -0.2152340  1.0000000 -0.6490843            0.7926288
## Inflation               0.1629397 -0.6490843  1.0000000           -0.5835666
## Dollar_Exchange_Rate   -0.1849486  0.7926288 -0.5835666            1.0000000

## [1] "Thailand"
##                      UnEmployment         GDP       Trade FDI_Inflows
## UnEmployment          1.000000000  0.08949185 -0.22520029   0.3058054
## GDP                   0.089491846  1.00000000  0.39359833   0.7121966
## Trade                -0.225200286  0.39359833  1.00000000   0.4936688
## FDI_Inflows           0.305805395  0.71219660  0.49366876   1.0000000
## FDI_Outflows         -0.606509715 -0.36730847 -0.27539159  -0.4353973
## Migration             0.658130679  0.38203130  0.22838751   0.3764940
## Inflation             0.008600351  0.38148076  0.79239992   0.5333555
## Dollar_Exchange_Rate  0.837469797  0.38221861 -0.09179996   0.4242800
##                      FDI_Outflows  Migration    Inflation Dollar_Exchange_Rate
## UnEmployment           -0.6065097  0.6581307  0.008600351           0.83746980
## GDP                    -0.3673085  0.3820313  0.381480759           0.38221861
## Trade                  -0.2753916  0.2283875  0.792399918          -0.09179996
## FDI_Inflows            -0.4353973  0.3764940  0.533355504           0.42428000
## FDI_Outflows            1.0000000 -0.6756076 -0.383583475          -0.70584003
## Migration              -0.6756076  1.0000000  0.256603886           0.71585041
## Inflation              -0.3835835  0.2566039  1.000000000           0.06453896
## Dollar_Exchange_Rate   -0.7058400  0.7158504  0.064538957           1.00000000

# Select columns for analysis
selected_columns <- c("UnEmployment", "GDP", "Trade", "FDI_Inflows", "FDI_Outflows", "Migration","Inflation", "Dollar_Exchange_Rate")

# Plot pairs plot

plot_pairplot <- function(country){
  graphData <- getFilteredData(country)
  print(country)
 p <- ggpairs(graphData, columns = selected_columns, title = "Pair Plot of Selected Variables") 
 print(p)
}

# Loop through each unique country
for (country in unique_countries) {
  plot_pairplot(country)
}

## [1] "Malaysia"

## [1] "Singapore"

## [1] "Indonesia"

## [1] "Thailand"

# Select columns for analysis
factors_of_interest <- c( "GDP", "Trade", "FDI_Inflows", "FDI_Outflows", "Migration","Inflation", "Dollar_Exchange_Rate")

# Create separate plots for each factor
plots_list <- list()

plot_againt_unemployment <- function(country){
  plots_list <- list()
  
  graphData <- getFilteredData(country)
  
  print(country)
  
 for (factor in factors_of_interest) {
  # Plot unemployment_rate against the current factor with year and two lines
  plot <- ggplot(graphData, aes(x = Year, y = !!rlang::sym(factor))) +
    geom_line(aes(y = UnEmployment), color = "red", linetype = "solid") +
    geom_line(color = "blue", linetype = "dashed") +
    labs(title = paste(country, " Unemployment Rate and ", factor, " Over Time"),
         x = "Year",
         y = "Scaled Value") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5)) +
    theme(legend.position = "bottom") +  # Optional: Adjust legend position
    theme(panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.line = element_line(color = "black"))

  plots_list[[factor]] <- plot
}

  # Display the list of plots
  print(plots_list)
  
}

# Loop through each unique country
for (country in unique_countries) {
  plot_againt_unemployment(country)
}

## [1] "Malaysia"
## $GDP

## 
## $Trade

## 
## $FDI_Inflows

## 
## $FDI_Outflows

## 
## $Migration

## 
## $Inflation

## 
## $Dollar_Exchange_Rate

## 
## [1] "Singapore"
## $GDP

## 
## $Trade

## 
## $FDI_Inflows

## 
## $FDI_Outflows

## 
## $Migration

## 
## $Inflation

## 
## $Dollar_Exchange_Rate

## 
## [1] "Indonesia"
## $GDP

## 
## $Trade

## 
## $FDI_Inflows

## 
## $FDI_Outflows

## 
## $Migration

## 
## $Inflation

## 
## $Dollar_Exchange_Rate

## 
## [1] "Thailand"
## $GDP

## 
## $Trade

## 
## $FDI_Inflows

## 
## $FDI_Outflows

## 
## $Migration

## 
## $Inflation

## 
## $Dollar_Exchange_Rate

mdata <- read.csv('Employment_Dataset.csv')

################ Linear regression model ################
linear_models <- function(country) {
  cat("Country:", country, "\n")
  
  # Filter data for the specified country
  country_data <- mdata %>% filter(Country == country)
  
  # Remove 'Year' & 'Country' columns
  country_data <- country_data %>% select(-Country)
  
  # Remove variables with only one unique value
  country_data <- country_data %>% 
    select_if(~ n_distinct(.) > 1)
  
  # Set target variable (unemployment rate)
  target <- country_data$UnEmployment  # Assuming UnemploymentRate is the column you want to predict
  
  # Remove target variable column
  features <- country_data %>% select(-UnEmployment)
  
  # Split the data into training and testing sets
  set.seed(42) # For repeatability of results
  splitIndex <- createDataPartition(target, p = .80, list = FALSE)
  train_data <- country_data[splitIndex,]
  test_data <- country_data[-splitIndex,]
  
  # Create a linear regression model
  lr_model <- lm(UnEmployment ~ ., data = train_data)
  
  # Use test set for prediction
  predictions <- predict(lr_model, test_data)
  
  # Calculate root mean square error (RMSE) and coefficient of determination (R²)
  rmse <- sqrt(mean((predictions - test_data$UnEmployment)^2))
  r2 <- summary(lr_model)$r.squared
  
  # Print performance indicators
  print(paste("RMSE:", rmse))
  print(paste("R²:", r2))
  cat("\n")
  
  # View model summary
  cat("Linear regression model:\n")
  print(summary(lr_model))
  cat("---------------------------------------------------\n")
  
}

for (country in unique_countries) {
  linear_models(country)
}

## Country: Malaysia 
## [1] "RMSE: 0.297698716605363"
## [1] "R²: 0.999331422026865"
## 
## Linear regression model:
## 
## Call:
## lm(formula = UnEmployment ~ ., data = train_data)
## 
## Residuals:
##          1          2          3          4          6          7          8 
##  0.0023042  0.0047021  0.0026684 -0.0163718 -0.0047621  0.0200403 -0.0092978 
##         11         13         14         15         16         17         18 
##  0.0008147  0.0100383 -0.0077662 -0.0119501 -0.0062913  0.0208227  0.0001181 
##         19         20         21 
## -0.0041081  0.0002419 -0.0012033 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)
## (Intercept)               -4.174e+04  1.179e+05  -0.354    0.783
## Year                      -6.033e-01  5.078e-01  -1.188    0.445
## GDP                       -1.102e-01  1.880e-02  -5.864    0.108
## Inflation                  4.002e-02  2.249e-02   1.779    0.326
## Migration                  2.364e-06  2.733e-06   0.865    0.546
## FDI_Inflows                6.375e-02  4.448e-02   1.433    0.388
## FDI_Outflows               4.016e-02  1.165e-01   0.345    0.789
## Trade                      1.014e-02  1.355e-02   0.749    0.591
## Dollar_Exchange_Rate      -2.040e-01  2.382e-01  -0.857    0.549
## Employment_In_Industry     4.292e+02  1.182e+03   0.363    0.778
## Employment_In_Agriculture  4.290e+02  1.182e+03   0.363    0.778
## Employment_In_Services     4.294e+02  1.183e+03   0.363    0.778
## Self_Employment           -9.237e-02  3.881e-02  -2.380    0.253
## Population                 2.623e-07  4.889e-07   0.537    0.686
## Labour_Force               9.847e-07  6.841e-07   1.439    0.387
## Youth_Unemployment         4.858e-01  1.001e-01   4.854    0.129
## 
## Residual standard error: 0.04012 on 1 degrees of freedom
## Multiple R-squared:  0.9993, Adjusted R-squared:  0.9893 
## F-statistic: 99.65 on 15 and 1 DF,  p-value: 0.07847
## 
## ---------------------------------------------------
## Country: Singapore 
## [1] "RMSE: 0.931830315189885"
## [1] "R²: 0.989811910302992"
## 
## Linear regression model:
## 
## Call:
## lm(formula = UnEmployment ~ ., data = train_data)
## 
## Residuals:
##         1         2         3         5         6         7         8         9 
## -0.020015  0.068549 -0.020348 -0.033207  0.121053 -0.178033  0.009088  0.063678 
##        10        12        13        15        17        18        19        20 
## -0.045584 -0.040575  0.130056 -0.067351  0.055698 -0.111036  0.119893 -0.109380 
##        21 
##  0.057515 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)
## (Intercept)                1.659e+06  1.638e+06   1.013    0.496
## Year                       8.510e-01  4.936e-01   1.724    0.335
## GDP                       -1.920e-02  3.843e-02  -0.500    0.705
## Inflation                 -1.726e-01  1.921e-01  -0.898    0.534
## Migration                 -1.878e-05  1.366e-05  -1.375    0.400
## FDI_Inflows                1.687e-02  5.034e-02   0.335    0.794
## FDI_Outflows              -1.118e-02  4.413e-02  -0.253    0.842
## Trade                      1.939e-02  2.012e-02   0.964    0.512
## Dollar_Exchange_Rate       9.530e+00  7.646e+00   1.246    0.430
## Employment_In_Industry    -1.661e+04  1.639e+04  -1.013    0.496
## Employment_In_Agriculture -1.660e+04  1.638e+04  -1.013    0.496
## Employment_In_Services    -1.661e+04  1.639e+04  -1.013    0.496
## Self_Employment            1.050e+00  6.075e-01   1.729    0.334
## Population                -5.346e-06  4.903e-06  -1.090    0.472
## Labour_Force               7.687e-06  6.393e-06   1.203    0.442
## Youth_Unemployment         2.404e-01  2.188e-01   1.099    0.470
## 
## Residual standard error: 0.3569 on 1 degrees of freedom
## Multiple R-squared:  0.9898, Adjusted R-squared:  0.837 
## F-statistic: 6.477 on 15 and 1 DF,  p-value: 0.3001
## 
## ---------------------------------------------------
## Country: Indonesia 
## [1] "RMSE: 1.40561178355356"
## [1] "R²: 0.999584296799524"
## 
## Linear regression model:
## 
## Call:
## lm(formula = UnEmployment ~ ., data = train_data)
## 
## Residuals:
##         1         2         4         5         7         8         9        10 
## -0.033308  0.030935  0.040429 -0.018176 -0.036158  0.038491 -0.041968  0.016267 
##        11        12        14        15        16        18        19        20 
## -0.020531  0.041570 -0.036695  0.002557  0.010944  0.030779 -0.035627  0.009344 
##        21 
##  0.001149 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)               -1.994e+06  3.262e+05  -6.113   0.1032  
## Year                       3.976e+00  5.739e-01   6.928   0.0913 .
## GDP                        3.086e-01  7.013e-02   4.400   0.1423  
## Inflation                 -8.378e-03  5.010e-02  -0.167   0.8945  
## Migration                  5.732e-06  1.026e-05   0.559   0.6756  
## FDI_Inflows               -2.722e-01  1.437e-01  -1.895   0.3091  
## FDI_Outflows               8.048e-02  1.254e-01   0.642   0.6368  
## Trade                     -6.272e-02  2.221e-02  -2.824   0.2167  
## Dollar_Exchange_Rate       3.969e-04  1.631e-04   2.434   0.2482  
## Employment_In_Industry     1.987e+04  3.256e+03   6.102   0.1034  
## Employment_In_Agriculture  1.987e+04  3.256e+03   6.102   0.1034  
## Employment_In_Services     1.987e+04  3.256e+03   6.102   0.1034  
## Self_Employment           -3.767e-01  1.327e-01  -2.839   0.2156  
## Population                -1.610e-06  2.519e-07  -6.393   0.0988 .
## Labour_Force               6.233e-07  1.137e-07   5.480   0.1149  
## Youth_Unemployment         3.651e-01  5.018e-02   7.275   0.0870 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1215 on 1 degrees of freedom
## Multiple R-squared:  0.9996, Adjusted R-squared:  0.9933 
## F-statistic: 160.3 on 15 and 1 DF,  p-value: 0.06191
## 
## ---------------------------------------------------
## Country: Thailand 
## [1] "RMSE: 0.226690747694035"
## [1] "R²: 0.998658412659223"
## 
## Linear regression model:
## 
## Call:
## lm(formula = UnEmployment ~ ., data = train_data)
## 
## Residuals:
##         1         2         4         5         6         8         9        10 
##  0.030806 -0.020634 -0.004023 -0.004164 -0.024489  0.015084 -0.017998  0.026284 
##        11        13        14        15        16        17        19        20 
##  0.001596 -0.014367 -0.003321  0.020852  0.034665 -0.043432  0.002017  0.002534 
##        21 
## -0.001408 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)
## (Intercept)               -1.857e+05  4.419e+05  -0.420    0.747
## Year                       1.616e-01  1.285e-01   1.258    0.428
## GDP                       -1.951e-02  4.606e-02  -0.424    0.745
## Inflation                 -9.795e-02  1.177e-01  -0.832    0.558
## Migration                  9.073e-07  8.639e-06   0.105    0.933
## FDI_Inflows               -3.471e-02  1.593e-01  -0.218    0.863
## FDI_Outflows               7.253e-02  2.471e-01   0.293    0.818
## Trade                      2.810e-02  2.662e-02   1.056    0.483
## Dollar_Exchange_Rate      -5.422e-02  1.112e-01  -0.488    0.711
## Employment_In_Industry     1.854e+03  4.420e+03   0.420    0.747
## Employment_In_Agriculture  1.854e+03  4.420e+03   0.420    0.747
## Employment_In_Services     1.854e+03  4.420e+03   0.420    0.747
## Self_Employment           -1.189e-01  1.284e-01  -0.926    0.524
## Population                -6.420e-07  5.893e-07  -1.089    0.473
## Labour_Force              -1.930e-07  2.702e-07  -0.714    0.605
## Youth_Unemployment         1.795e-01  5.801e-02   3.095    0.199
## 
## Residual standard error: 0.0837 on 1 degrees of freedom
## Multiple R-squared:  0.9987, Adjusted R-squared:  0.9785 
## F-statistic: 49.63 on 15 and 1 DF,  p-value: 0.111
## 
## ---------------------------------------------------

rdata <- read.csv('Employment_Dataset.csv')

################ Random forest model ################
random_forest_models <- function(country) {
  
  country_data <- rdata %>% filter(Country == country)
  
  # remove'Year'&'Country'column
  country_data <- country_data %>% select(-Year, -Country)
  
  # set target variable (unemployment)
  target <- country_data$UnEmployment
  
  # remove target variable column
  features <- country_data %>% select(-UnEmployment)
  
  # split the data into training and testing sets
  set.seed(42) #  For repeatability of results
  splitIndex <- createDataPartition(target, p = .80, list = FALSE)
  train_data <- country_data[splitIndex,]
  test_data <- country_data[-splitIndex,]
  
  # create a random forest model
  rf_model <- randomForest(UnEmployment ~ ., data = train_data, ntree = 500)
  
  # use test set prediction
  rm_predictions <- predict(rf_model, test_data)
  
  # calculate root mean square error (RMSE) and coefficient of determination (R²)
  rmse <- sqrt(mean((rm_predictions - test_data$UnEmployment)^2))
  r2 <- 1 - sum((rm_predictions - test_data$UnEmployment)^2) / sum((test_data$UnEmployment - mean(test_data$UnEmployment))^2)
  
  # printing performance indicators
  print(paste("RMSE:", rmse))
  print(paste("R²:", r2))
  cat("\n")
  
  # view model summary
  cat("Random Forest model:\n")
  print(summary(rf_model))
  
   # print predicted values
  cat("Predicted Unemployment Values:\n")
  print(rm_predictions)
}

for (country in unique_countries) {
  random_forest_models(country)
}

## [1] "RMSE: 0.185353668723145"
## [1] "R²: 0.205642022910306"
## 
## Random Forest model:
##                 Length Class  Mode     
## call              4    -none- call     
## type              1    -none- character
## predicted        17    -none- numeric  
## mse             500    -none- numeric  
## rsq             500    -none- numeric  
## oob.times        17    -none- numeric  
## importance       14    -none- numeric  
## importanceSD      0    -none- NULL     
## localImportance   0    -none- NULL     
## proximity         0    -none- NULL     
## ntree             1    -none- numeric  
## mtry              1    -none- numeric  
## forest           11    -none- list     
## coefs             0    -none- NULL     
## y                17    -none- numeric  
## test              0    -none- NULL     
## inbag             0    -none- NULL     
## terms             3    terms  call     
## Predicted Unemployment Values:
##        5        9       10       12 
## 3.411901 3.385876 3.176018 3.150444 
## [1] "RMSE: 0.476955709021393"
## [1] "R²: 0.683064011399075"
## 
## Random Forest model:
##                 Length Class  Mode     
## call              4    -none- call     
## type              1    -none- character
## predicted        17    -none- numeric  
## mse             500    -none- numeric  
## rsq             500    -none- numeric  
## oob.times        17    -none- numeric  
## importance       14    -none- numeric  
## importanceSD      0    -none- NULL     
## localImportance   0    -none- NULL     
## proximity         0    -none- NULL     
## ntree             1    -none- numeric  
## mtry              1    -none- numeric  
## forest           11    -none- list     
## coefs             0    -none- NULL     
## y                17    -none- numeric  
## test              0    -none- NULL     
## inbag             0    -none- NULL     
## terms             3    terms  call     
## Predicted Unemployment Values:
##        4       11       14       16 
## 4.929990 4.084264 3.820721 3.886161 
## [1] "RMSE: 0.284429665884742"
## [1] "R²: 0.967063098579755"
## 
## Random Forest model:
##                 Length Class  Mode     
## call              4    -none- call     
## type              1    -none- character
## predicted        17    -none- numeric  
## mse             500    -none- numeric  
## rsq             500    -none- numeric  
## oob.times        17    -none- numeric  
## importance       14    -none- numeric  
## importanceSD      0    -none- NULL     
## localImportance   0    -none- NULL     
## proximity         0    -none- NULL     
## ntree             1    -none- numeric  
## mtry              1    -none- numeric  
## forest           11    -none- list     
## coefs             0    -none- NULL     
## y                17    -none- numeric  
## test              0    -none- NULL     
## inbag             0    -none- NULL     
## terms             3    terms  call     
## Predicted Unemployment Values:
##        3        6       13       17 
## 6.706036 7.251022 4.401768 4.257785 
## [1] "RMSE: 0.159957766288499"
## [1] "R²: 0.814615862004212"
## 
## Random Forest model:
##                 Length Class  Mode     
## call              4    -none- call     
## type              1    -none- character
## predicted        17    -none- numeric  
## mse             500    -none- numeric  
## rsq             500    -none- numeric  
## oob.times        17    -none- numeric  
## importance       14    -none- numeric  
## importanceSD      0    -none- NULL     
## localImportance   0    -none- NULL     
## proximity         0    -none- NULL     
## ntree             1    -none- numeric  
## mtry              1    -none- numeric  
## forest           11    -none- list     
## coefs             0    -none- NULL     
## y                17    -none- numeric  
## test              0    -none- NULL     
## inbag             0    -none- NULL     
## terms             3    terms  call     
## Predicted Unemployment Values:
##         3         7        12        18 
## 1.7556431 1.2077277 0.8132156 0.7438152

################ ARIMA model ################
adata <- read.csv('Employment_Dataset.csv')

arima_models <- function(country)  {
  
  country_data <- adata %>% filter(Country == country)
  
  # remove Country' column
  country_data <- country_data %>% select(-Country)
  
  country_data <- country_data %>% arrange(Year) %>% na.omit() # ensure data is sorted by year and remove missing values
  
  # split the data into training and testing sets
  split_year <- max(country_data$Year) - 3 # using the last three years of data as the test set
  train_data <- country_data %>% filter(Year <= split_year)
  test_data <- country_data %>% filter(Year > split_year)
  
  # set target variable (unemployment)
  train_ts <- ts(train_data$UnEmployment, start = min(train_data$Year), end = max(train_data$Year), frequency = 1)
  
  # fit ARIMA model
  arima_model <- auto.arima(train_ts)
  
  # use test set prediction
  test_years <- length(test_data$Year)
  forecasted_values <- forecast(arima_model, h = test_years)
  
  # calculate root mean square error (RMSE)
  rmse <- sqrt(mean((forecasted_values$mean - test_data$UnEmployment)^2))
  
  # printing performance indicators
  print(paste("RMSE:", rmse))
  
  # view model summary
  cat("Arima model:\n")
  print(summary(arima_model))
  cat("---------------------------------------------------\n")
}

for (country in unique_countries) {
  arima_models(country)
}

## [1] "RMSE: 0.835806197631963"
## Arima model:
## Series: train_ts 
## ARIMA(0,1,0) 
## 
## sigma^2 = 0.03825:  log likelihood = 3.62
## AIC=-5.24   AICc=-4.97   BIC=-4.4
## 
## Training set error measures:
##                       ME      RMSE       MAE        MPE     MAPE      MASE
## Training set -0.01258167 0.1900749 0.1496406 -0.5368283 4.558351 0.9456838
##                     ACF1
## Training set 0.003483444
## ---------------------------------------------------
## [1] "RMSE: 0.413748313188908"
## Arima model:
## Series: train_ts 
## ARIMA(0,1,0) 
## 
## sigma^2 = 0.7329:  log likelihood = -21.48
## AIC=44.96   AICc=45.23   BIC=45.8
## 
## Training set error measures:
##                        ME      RMSE       MAE       MPE     MAPE      MASE
## Training set -0.006402222 0.8320021 0.5323756 -1.656461 11.33675 0.9448152
##                    ACF1
## Training set -0.1118996
## ---------------------------------------------------
## [1] "RMSE: 0.569561234635925"
## Arima model:
## Series: train_ts 
## ARIMA(0,1,0) 
## 
## sigma^2 = 0.3154:  log likelihood = -14.31
## AIC=30.63   AICc=30.89   BIC=31.46
## 
## Training set error measures:
##                       ME      RMSE       MAE       MPE     MAPE      MASE
## Training set -0.09355111 0.5457481 0.4764489 -2.260104 8.496725 0.9451145
##                   ACF1
## Training set 0.2099203
## ---------------------------------------------------
## [1] "RMSE: 0.231433215708838"
## Arima model:
## Series: train_ts 
## ARIMA(0,1,0) 
## 
## sigma^2 = 0.1084:  log likelihood = -5.23
## AIC=12.47   AICc=12.73   BIC=13.3
## 
## Training set error measures:
##                      ME      RMSE       MAE       MPE     MAPE      MASE
## Training set -0.1015222 0.3199225 0.2051444 -14.76246 27.77432 0.9451099
##                    ACF1
## Training set -0.1883341
## ---------------------------------------------------

Unemployment Analysis

7-Jan-2024

Project Background

Objectives

Data Collection