Project Overview

This project employs ARIMA (AutoRegressive Integrated Moving Average) modeling to analyze Kenya’s unemployment rate from 2000 to 2024 and to forecast it through 2029. Using 25 annual observations (2000-2024), the study identifies underlying patterns and structural shifts in Kenya’s labor market dynamics. The analysis is particularly relevant given the dramatic changes observed in recent years, including the sharp rise in unemployment after 2017, potentially attributable to economic disruptions, policy transitions, and the global COVID-19 pandemic. By applying rigorous econometric techniques, this research provides evidence-based projections to inform policymakers, economists, and stakeholders in developing targeted employment strategies and fiscal interventions aimed at addressing Kenya’s persistent unemployment challenges.

Setting working directory

# Point the R session at the project folder; relative file paths used
# afterwards resolve against this directory.
# NOTE(review): this is a hard-coded absolute Windows path, so the call fails
# on any machine where the directory does not exist.
setwd("D:/Analysis Projects")

Loading necessary libraries

library(readxl)      # For Excel files
library(tidyverse)   # Data manipulation

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(forecast)    # ARIMA modeling
library(tseries)     # Time series tests

## Warning: package 'tseries' was built under R version 4.5.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(ggplot2)     # Visualization
library(lubridate)   # Date handling

Data importation

# Read the annual unemployment table from the working directory and preview
# its first six rows.
data <- read.csv(file = "unemployment.csv")
head(data, n = 6L)

##   Year Unemployment_Rate
## 1 2000              2.89
## 2 2001              2.88
## 3 2002              2.92
## 4 2003              2.87
## 5 2004              2.85
## 6 2005              2.75

Data inspection

# Compact structural summary: row count, column names and column types.
utils::str(object = data)

## 'data.frame':    25 obs. of  2 variables:
##  $ Year             : int  2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 ...
##  $ Unemployment_Rate: num  2.89 2.88 2.92 2.87 2.85 2.75 2.68 2.65 2.76 2.82 ...

Check missing values

# Number of missing entries in each column of the imported table.
vapply(data, function(column) sum(is.na(column)), numeric(1))

##              Year Unemployment_Rate 
##                 0                 0

Create time series object

# Build the annual time series from the imported table.
# The start year is taken from the Year column instead of being hard-coded,
# and no `end` is supplied: with a fixed `end`, ts() silently truncates or
# recycles the values whenever the CSV gains or loses rows.
year_order <- order(data$Year)
# ts() assumes equally spaced observations, so fail early on gaps/duplicates.
stopifnot(
  "Year column must contain consecutive years" =
    all(diff(data$Year[year_order]) == 1)
)
unemp_ts <- ts(data$Unemployment_Rate[year_order],
               start = min(data$Year),
               frequency = 1)

Visualize raw data

# Base-graphics line chart of the raw annual series with a reference grid.
plot(
  x = unemp_ts,
  col = "steelblue",
  lwd = 2,
  xlab = "Year",
  ylab = "Unemployment Rate (%)",
  main = "Kenya Unemployment Rate (2000-2024)"
)
grid()

Exploratory Data Analysis & Stationarity Checks

# ggplot view of the series with a red LOESS smoother overlaid to expose the
# long-run trend.
autoplot(unemp_ts) +
  theme_minimal() +
  labs(
    subtitle = "Source: World Bank/ILO Estimates",
    title = "Kenya Unemployment: Trend Analysis"
  ) +
  geom_smooth(color = "red", method = "loess")

## `geom_smooth()` using formula = 'y ~ x'

Stationarity Tests

ADF Test (Null: series has unit root = non-stationary)

# Augmented Dickey-Fuller test on the level series; a large p-value means the
# unit-root (non-stationary) null cannot be rejected.
adf_test <- adf.test(x = unemp_ts, alternative = "stationary")
print(adf_test)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  unemp_ts
## Dickey-Fuller = -2.4039, Lag order = 2, p-value = 0.4185
## alternative hypothesis: stationary

KPSS Test (Null: series is stationary)

# KPSS test on the level series; here the null is level stationarity, so a
# small p-value is evidence of non-stationarity.
kpss_test <- kpss.test(x = unemp_ts, null = "Level")
print(kpss_test)

## 
##  KPSS Test for Level Stationarity
## 
## data:  unemp_ts
## KPSS Level = 0.64405, Truncation lag parameter = 2, p-value = 0.01863

# Difference the series to remove the unit root.
# `differences = k` already applies the lag-1 difference k times, so every
# call must start from the original series. Chaining diff() calls with
# differences = 1, 2, 3 (as before) yields the 6th difference, which badly
# over-differences a 25-point series and does not match the
# "Second Difference" label used when this object is plotted.
unemp_diff <- diff(unemp_ts, differences = 1)   # first difference
unemp_diff1 <- unemp_diff                       # alias kept so the name still resolves
unemp_diff2 <- diff(unemp_ts, differences = 2)  # second difference
# Re-test stationarity on the second-differenced series.
adf.test(unemp_diff2)

## Warning in adf.test(unemp_diff2): p-value smaller than printed p-value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  unemp_diff2
## Dickey-Fuller = -4.5445, Lag order = 2, p-value = 0.01
## alternative hypothesis: stationary

Plot the differenced series

# Line chart of the differenced series used for model identification.
plot(
  x = unemp_diff2,
  main = "Second Difference of Unemployment Rate"
)

ACF & PACF plots for model identification

# Stack the ACF and PACF of the differenced series in a 2 x 1 layout.
# par() returns the previous settings; restoring them afterwards keeps the
# panel layout from leaking into later plots in the same graphics device.
old_par <- par(mfrow = c(2, 1))
acf(unemp_diff2, main = "ACF - Differenced Series")
pacf(unemp_diff2, main = "PACF - Differenced Series")
par(old_par)

Auto ARIMA (recommended starting point)

# Let auto.arima() choose the ARIMA order for the level series.
#   seasonal = FALSE       annual data, so no seasonal terms are considered
#   stepwise = FALSE       search all candidate models, not the stepwise path
#   approximation = FALSE  fit every candidate exactly
auto_model <- auto.arima(
  y = unemp_ts,
  approximation = FALSE,
  stepwise = FALSE,
  seasonal = FALSE
)
summary(auto_model)

## Series: unemp_ts 
## ARIMA(1,1,0) 
## 
## Coefficients:
##          ar1
##       0.7374
## s.e.  0.1250
## 
## sigma^2 = 0.03938:  log likelihood = 4.88
## AIC=-5.75   AICc=-5.18   BIC=-3.4
## 
## Training set error measures:
##                      ME      RMSE       MAE       MPE     MAPE      MASE
## Training set 0.02279687 0.1903484 0.1037104 0.7685899 2.799581 0.6285479
##                   ACF1
## Training set 0.1740154

Model Diagnostics

Residual analysis

# Residual time plot, ACF and histogram plus a Ljung-Box test in one call.
checkresiduals(object = auto_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)
## Q* = 4.9559, df = 4, p-value = 0.2919
## 
## Model df: 1.   Total lags used: 5

Ljung-Box test for residual autocorrelation

# Ljung-Box test for remaining autocorrelation in the model residuals.
# When the input is residuals from a fitted ARMA(p, q) model, the test's
# degrees of freedom must be reduced by p + q; without `fitdf` the p-value is
# computed against the wrong chi-squared distribution.
n_arma_params <- sum(arimaorder(auto_model)[c("p", "q")])
Box.test(residuals(auto_model),
         lag = 10,
         type = "Ljung-Box",
         fitdf = n_arma_params)

## 
##  Box-Ljung test
## 
## data:  auto_model$residuals
## X-squared = 7.3708, df = 10, p-value = 0.69

Normality test on residuals

# Shapiro-Wilk test; a small p-value indicates the residuals are not normal.
shapiro.test(x = auto_model$residuals)

## 
##  Shapiro-Wilk normality test
## 
## data:  auto_model$residuals
## W = 0.73652, p-value = 2.354e-05

Plot residuals

# 2 x 2 diagnostic panel: residual trace, histogram, normal Q-Q plot and ACF.
# The previous graphics settings are saved and restored so the panel layout
# does not carry over to the forecast plot drawn afterwards.
old_par <- par(mfrow = c(2, 2))
plot(auto_model$residuals, type = "l", main = "Residuals")
hist(auto_model$residuals, breaks = 10, main = "Residual Histogram")
qqnorm(auto_model$residuals)
qqline(auto_model$residuals)
acf(auto_model$residuals, main = "Residual ACF")
par(old_par)

Forecasting and Visualization

Generate forecasts (5 years ahead)

# Project the fitted model five years past the end of the sample, keeping the
# default 80% and 95% interval levels explicit.
forecast_horizon <- 5
unemp_forecast <- forecast(
  object = auto_model,
  h = forecast_horizon,
  level = c(80, 95)
)

Plot forecast with prediction intervals

# Plot the history together with the point forecast and shaded intervals.
# `include` is the NUMBER of historical observations to show, not a vector of
# years; passing 2000:2024 is a misuse that should not be relied on. Use the
# length of the series stored in the forecast object instead.
plot(unemp_forecast,
     include = length(unemp_forecast$x),
     main = "Kenya Unemployment Rate Forecast (2025-2029)",
     xlab = "Year", ylab = "Unemployment Rate (%)",
     col = "blue", shadecols = "lightgray")
# The forecast mean is a ts object, so lines() takes the x positions from its
# own time index rather than from a hard-coded year range.
lines(unemp_forecast$mean, col = "red", lwd = 2)
legend("topleft", legend = c("Historical", "Forecast", "95% CI"),
       col = c("blue", "red", "gray"), lwd = c(1, 2, 10), bty = "n")

Extract forecast values

# Tabulate the point forecasts with their 95% interval bounds.
# Years come from the forecast's own time index, and the bounds are selected
# by the "95%" column name rather than by position, so the table stays correct
# if the horizon or the requested interval levels change.
forecast_df <- data.frame(
  Year = as.integer(round(time(unemp_forecast$mean))),
  Forecast = as.numeric(unemp_forecast$mean),
  Lo95 = as.numeric(unemp_forecast$lower[, "95%"]),
  Hi95 = as.numeric(unemp_forecast$upper[, "95%"])
)
print(forecast_df)

##   Year Forecast     Lo95     Hi95
## 1 2025 5.326765 4.937807 5.715724
## 2 2026 5.250641 4.470924 6.030358
## 3 2027 5.194507 4.013319 6.375695
## 4 2028 5.153114 3.577196 6.729033
## 5 2029 5.122592 3.166831 7.078353

Project Summary

The analysis reveals a concerning structural shift in Kenya’s unemployment trajectory. After remaining relatively stable at 2.7-2.9% from 2000 to 2016, unemployment surged dramatically to approximately 5.6% by 2020-2022. The ARIMA(1,1,0) forecast indicates a modest declining trend, with rates projected to decrease gradually from 5.3% in 2025 to approximately 5.1% by 2029. However, the widening 95% prediction interval (roughly 3.2% to 7.1% by 2029) reflects increasing uncertainty in long-term predictions. Moreover, the Shapiro-Wilk test rejects normality of the residuals (p < 0.001), so these intervals, which assume normally distributed errors, should be treated as approximate. While the slight downward trend is encouraging, unemployment rates remain significantly elevated compared to historical levels, indicating the need for sustained policy interventions to accelerate job creation and economic recovery.

Unemployment Rate Prediction Using ARIMA

CANON TUMWET

2026-05-23