# UNICEF Education Data Analysis
# Author: Elvira Khwatenge
# Date: Sys.Date() (placeholder text; not evaluated inside a comment)
if (!require("tidyverse")) install.packages("tidyverse", dependencies = TRUE)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
if (!require("janitor")) install.packages("janitor", dependencies = TRUE)
## Loading required package: janitor
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
if (!require("dplyr")) install.packages("dplyr", dependencies = TRUE)
if (!require("ggplot2")) install.packages("ggplot2", dependencies = TRUE)
if (!require("magrittr")) install.packages("magrittr", dependencies = TRUE)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(tidyverse)
library(janitor)
library(dplyr)
library(ggplot2)
library(magrittr)
# Read the CSV, standardise the column names with janitor::clean_names(),
# drop rows whose obs_value is missing, and keep the nine listed columns.
education_data <- "EDUCATION.csv" %>%
  read.csv() %>%
  clean_names() %>%
  filter(!is.na(obs_value)) %>%
  select(
    ref_area, geographic_area, indicator, sex, education_level,
    wealth_quintile, residence, time_period, obs_value
  )

# Abort early when no row survives the missing-value filter.
if (nrow(education_data) < 1) {
  stop("No data available after initial filtering.")
}
# Mean obs_value per geographic_area and sex, using only the rows whose sex
# is "Male" or "Female".
# `.groups = "drop"` returns an ungrouped tibble, so no grouping leaks into
# later steps and summarise() no longer emits its "grouped output" message.
gender_disparity <- education_data %>%
  filter(sex %in% c("Male", "Female")) %>%
  group_by(geographic_area, sex) %>%
  summarise(
    avg_completion_rate = mean(obs_value, na.rm = TRUE),
    .groups = "drop"
  )

# Plot gender disparities by country: one pair of side-by-side bars per area.
ggplot(
  gender_disparity,
  aes(x = geographic_area, y = avg_completion_rate, fill = sex)
) +
  geom_col(position = "dodge") +
  # Rotate the area labels so long names do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Average Completion Rate by Gender and Country",
    x = "Country",
    y = "Completion Rate"
  )

# Mean obs_value for each wealth quintile, leaving out the "Total" rows.
wealth_disparity <- education_data %>%
  filter(wealth_quintile != "Total") %>%
  group_by(wealth_quintile) %>%
  summarise(avg_completion_rate = mean(obs_value, na.rm = TRUE))

# Plot completion rates by wealth quintile: one bar per quintile, coloured
# by the same variable that is on the x axis.
ggplot(
  data = wealth_disparity,
  mapping = aes(
    x = wealth_quintile,
    y = avg_completion_rate,
    fill = wealth_quintile
  )
) +
  geom_col() +
  labs(
    title = "Completion Rate by Wealth Quintile",
    x = "Wealth Quintile",
    y = "Completion Rate"
  )

# Mean obs_value per geographic_area and residence type, using only the rows
# whose residence is "Urban" or "Rural".
# `.groups = "drop"` returns an ungrouped tibble, so no grouping leaks into
# later steps and summarise() no longer emits its "grouped output" message.
residence_disparity <- education_data %>%
  filter(residence %in% c("Urban", "Rural")) %>%
  group_by(geographic_area, residence) %>%
  summarise(
    avg_completion_rate = mean(obs_value, na.rm = TRUE),
    .groups = "drop"
  )

# Plot urban vs rural completion rates: side-by-side bars per area.
ggplot(
  residence_disparity,
  aes(x = geographic_area, y = avg_completion_rate, fill = residence)
) +
  geom_col(position = "dodge") +
  # Rotate the area labels so long names do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Completion Rate by Residence Type and Country",
    x = "Country",
    y = "Completion Rate"
  )

# Mean obs_value for each time_period, taken over every row of
# education_data (no filtering by sex, residence, or wealth quintile).
time_trends <- education_data %>%
  group_by(time_period) %>%
  summarise(avg_completion_rate = mean(obs_value, na.rm = TRUE))

# Plot trends in completion rates over time: a line with a marker per period.
# NOTE(review): presumably time_period is read as a numeric year; if it is
# character, geom_line() needs an explicit group aesthetic -- confirm.
ggplot(
  data = time_trends,
  mapping = aes(x = time_period, y = avg_completion_rate)
) +
  geom_line() +
  geom_point() +
  ggtitle("Trends in Completion Rate Over Time") +
  xlab("Year") +
  ylab("Average Completion Rate")

# Mean obs_value for each geographic_area over every row of education_data.
country_comparison <- education_data %>%
  group_by(geographic_area) %>%
  summarise(avg_completion_rate = mean(obs_value, na.rm = TRUE))

# Plot completion rates by country with horizontal bars, ordered by the
# average so the highest value ends up at the top after coord_flip().
ggplot(
  data = country_comparison,
  mapping = aes(
    x = reorder(geographic_area, avg_completion_rate),
    y = avg_completion_rate
  )
) +
  geom_col(fill = "green") +
  coord_flip() +
  labs(
    title = "Average Completion Rate by Country",
    x = "Country",
    y = "Completion Rate"
  ) +
  theme_minimal()

# Option: Display Only the Top and Bottom 10 Countries
# Ten highest averages (descending) and ten lowest averages (ascending).
top_countries <- country_comparison %>%
  arrange(desc(avg_completion_rate)) %>%
  head(10)
bottom_countries <- country_comparison %>%
  arrange(avg_completion_rate) %>%
  head(10)

# With fewer than 20 areas the two selections overlap. distinct() removes the
# repeated rows; otherwise the identity bars for those areas would stack and
# show twice their real value.
selected_countries <- bind_rows(top_countries, bottom_countries) %>%
  distinct()

# Plot top and bottom countries with horizontal bars, ordered by the average.
ggplot(
  selected_countries,
  aes(
    x = reorder(geographic_area, avg_completion_rate),
    y = avg_completion_rate
  )
) +
  geom_col(fill = "blue") +
  coord_flip() +
  labs(
    title = "Top and Bottom Countries by Completion Rate",
    x = "Country",
    y = "Completion Rate"
  ) +
  theme_minimal()

# Print the narrative summary to the console: a heading followed by four
# statements, one per line. The text is fixed; it is not computed from the
# data objects in this script.
cat(
  "Findings and Discussion\n",
  "Significant differences in education completion rates exist between males and females, with females often having lower rates in some regions, suggesting a need for gender-focused educational support.\n",
  "Wealth quintiles correlate with completion rates, as students from higher-income backgrounds tend to have better outcomes, pointing to a need for financial support for lower-income students.\n",
  "Rural areas consistently report lower completion rates than urban counterparts, indicating a need for improved educational infrastructure in rural areas.\n",
  "While there is a general improvement in education completion rates over recent years, some regions show stagnation or decline, highlighting areas that may require targeted interventions.\n",
  sep = ""
)