It contains 31 columns, including variables such as:
REF_AREA / Geographic area: country codes and names. INDICATOR / Indicator: codes and descriptions of the education-related indicators. SEX / Sex: the sex the observation refers to (female, male, or total). EDUCATION_LEVEL / Education Level: the level of education. WEALTH_QUINTILE / Wealth Quintile: household wealth category (the total or a specific quintile). RESIDENCE / Residence: urban or rural classification. TIME_PERIOD: the year of the observation. OBS_VALUE: the value of the educational indicator. Many columns have missing values, especially fields like OBS_STATUS and OBS_CONF; FREQ_COLL is populated in the rows shown below, but only with the placeholder code 0 ("Not Known").
##```{r, echo=FALSE} # Load Libraries
# View initial data structure
str(education_data)
## 'data.frame': 1177 obs. of 31 variables:
## $ REF_AREA : chr "AFG" "AFG" "AFG" "AFG" ...
## $ Geographic.area : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ INDICATOR : chr "ED_CR_L3" "ED_CR_L3" "ED_CR_L3" "ED_CR_L3" ...
## $ Indicator : chr "Completion rate for youth of upper secondary education school age" "Completion rate for youth of upper secondary education school age" "Completion rate for youth of upper secondary education school age" "Completion rate for youth of upper secondary education school age" ...
## $ SEX : chr "F" "M" "_T" "_T" ...
## $ Sex : chr "Female" "Male" "Total" "Total" ...
## $ EDUCATION_LEVEL : chr "ISCED11_3" "ISCED11_3" "ISCED11_3" "ISCED11_3" ...
## $ Education.Level : chr "Upper secondary education" "Upper secondary education" "Upper secondary education" "Upper secondary education" ...
## $ WEALTH_QUINTILE : chr "_T" "_T" "Q1" "Q2" ...
## $ Wealth.Quintile : chr "Total" "Total" "Lowest" "Second" ...
## $ RESIDENCE : chr "_T" "_T" "_T" "_T" ...
## $ Residence : chr "Total" "Total" "Total" "Total" ...
## $ UNIT_MEASURE : chr "PCNT" "PCNT" "PCNT" "PCNT" ...
## $ Unit.of.measure : chr "%" "%" "%" "%" ...
## $ UNIT_MULTIPLIER : logi NA NA NA NA NA NA ...
## $ Unit.multiplier : logi NA NA NA NA NA NA ...
## $ SOURCE_LINK : logi NA NA NA NA NA NA ...
## $ SERIES_FOOTNOTE : logi NA NA NA NA NA NA ...
## $ TIME_PERIOD : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ OBS_VALUE : num 14.4 32.3 11.2 13.5 13.3 ...
## $ OBS_STATUS : logi NA NA NA NA NA NA ...
## $ Observation.Status : logi NA NA NA NA NA NA ...
## $ DATA_SOURCE : chr "DHS 2015" "DHS 2015" "DHS 2015" "DHS 2015" ...
## $ OBS_CONF : logi NA NA NA NA NA NA ...
## $ Observation.confidentaility : logi NA NA NA NA NA NA ...
## $ TIME_PERIOD_METHOD : logi NA NA NA NA NA NA ...
## $ Time.period.activity.related.to.when.the.data.are.collected: logi NA NA NA NA NA NA ...
## $ COVERAGE_TIME : logi NA NA NA NA NA NA ...
## $ OBS_FOOTNOTE : logi NA NA NA NA NA NA ...
## $ FREQ_COLL : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Time.interval.at.which.the.source.data.are.collected : chr "Not Known" "Not Known" "Not Known" "Not Known" ...
colnames(education_data)
## [1] "REF_AREA"
## [2] "Geographic.area"
## [3] "INDICATOR"
## [4] "Indicator"
## [5] "SEX"
## [6] "Sex"
## [7] "EDUCATION_LEVEL"
## [8] "Education.Level"
## [9] "WEALTH_QUINTILE"
## [10] "Wealth.Quintile"
## [11] "RESIDENCE"
## [12] "Residence"
## [13] "UNIT_MEASURE"
## [14] "Unit.of.measure"
## [15] "UNIT_MULTIPLIER"
## [16] "Unit.multiplier"
## [17] "SOURCE_LINK"
## [18] "SERIES_FOOTNOTE"
## [19] "TIME_PERIOD"
## [20] "OBS_VALUE"
## [21] "OBS_STATUS"
## [22] "Observation.Status"
## [23] "DATA_SOURCE"
## [24] "OBS_CONF"
## [25] "Observation.confidentaility"
## [26] "TIME_PERIOD_METHOD"
## [27] "Time.period.activity.related.to.when.the.data.are.collected"
## [28] "COVERAGE_TIME"
## [29] "OBS_FOOTNOTE"
## [30] "FREQ_COLL"
## [31] "Time.interval.at.which.the.source.data.are.collected"
# Standardise the column names to snake_case.
education_data <- janitor::clean_names(education_data)
# Keep the nine analysis columns and, in the same step, discard every
# observation that has no value.
education_data <- subset(
  education_data,
  !is.na(obs_value),
  select = c(
    ref_area, geographic_area, indicator, sex,
    education_level, wealth_quintile, residence,
    time_period, obs_value
  )
)
# Abort early when the filtering left nothing to analyse.
if (nrow(education_data) == 0) {
  stop("No data available after initial filtering.")
}
# Build `education_data_selected`: the nine analysis columns of
# `education_data`, restricted to rows that have an observation value.
education_data_selected <- dplyr::select(
  education_data,
  ref_area, geographic_area, indicator, sex,
  education_level, wealth_quintile, residence,
  time_period, obs_value
)
# View the selected columns
colnames(education_data_selected)
## [1] "ref_area" "geographic_area" "indicator" "sex"
## [5] "education_level" "wealth_quintile" "residence" "time_period"
## [9] "obs_value"
# Remove rows with NA in the obs_value column (base R alternative).
# The column is lower-case after clean_names(); indexing the old upper-case
# name returns NULL, which makes the row filter logical(0) and silently
# drops every row.
education_data_selected <- education_data_selected[
  !is.na(education_data_selected$obs_value),
]
# View data structure
str(education_data)
## 'data.frame': 1177 obs. of 9 variables:
## $ ref_area : chr "AFG" "AFG" "AFG" "AFG" ...
## $ geographic_area: chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ indicator : chr "ED_CR_L3" "ED_CR_L3" "ED_CR_L3" "ED_CR_L3" ...
## $ sex : chr "F" "M" "_T" "_T" ...
## $ education_level: chr "ISCED11_3" "ISCED11_3" "ISCED11_3" "ISCED11_3" ...
## $ wealth_quintile: chr "_T" "_T" "Q1" "Q2" ...
## $ residence : chr "_T" "_T" "_T" "_T" ...
## $ time_period : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ obs_value : num 14.4 32.3 11.2 13.5 13.3 ...
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# Gender disparity: average completion rate of females and males per country.
# The `sex` column kept after clean_names() holds the codes "F", "M" and "_T"
# (see the str() output above), not the labels "Female"/"Male". Filtering on
# the labels matched no rows, so the analysis was always skipped.
gender_disparity <- education_data[education_data$sex %in% c("F", "M"), ]
if (nrow(gender_disparity) > 0) {
  # Replace the codes with readable labels so the legend is self-explanatory.
  gender_disparity$sex <- factor(
    gender_disparity$sex,
    levels = c("F", "M"),
    labels = c("Female", "Male")
  )
  # Aggregate the data by geographic area and sex
  gender_disparity <- aggregate(
    obs_value ~ geographic_area + sex,
    data = gender_disparity, FUN = mean, na.rm = TRUE
  )
  colnames(gender_disparity)[3] <- "avg_completion_rate"
  # Plot gender disparities as side-by-side bars per country
  ggplot(
    gender_disparity,
    aes(x = geographic_area, y = avg_completion_rate, fill = sex)
  ) +
    geom_bar(stat = "identity", position = "dodge") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(
      title = "Average Completion Rate by Gender and Country",
      x = "Country", y = "Completion Rate"
    )
} else {
  print("No data available for gender disparity analysis.")
}
Influence of Wealth on Education
# Wealth disparity: average completion rate per wealth quintile.
# ggplot2 is attached here but deliberately not installed from inside the
# script: installing at run time is a side effect on the user's library and
# fails in non-interactive sessions without a configured CRAN mirror.
library(ggplot2)
# The `wealth_quintile` column holds codes ("_T" for the total, "Q1", "Q2",
# ... as shown in the str() output above), so the total must be excluded by
# its code. Comparing against the label "Total" excluded nothing and the
# total was plotted as if it were a quintile. Including NA in the exclusion
# set also keeps NA values from producing all-NA rows when indexing.
wealth_disparity <- education_data[
  !(education_data$wealth_quintile %in% c("_T", NA)),
]
if (nrow(wealth_disparity) > 0) {
  # Aggregate data by wealth quintile
  wealth_disparity <- aggregate(
    obs_value ~ wealth_quintile,
    data = wealth_disparity, FUN = mean, na.rm = TRUE
  )
  colnames(wealth_disparity)[2] <- "avg_completion_rate"
  # Plot completion rates by wealth quintile
  ggplot(
    wealth_disparity,
    aes(x = wealth_quintile, y = avg_completion_rate, fill = wealth_quintile)
  ) +
    geom_bar(stat = "identity") +
    labs(
      title = "Completion Rate by Wealth Quintile",
      x = "Wealth Quintile", y = "Completion Rate"
    )
} else {
  print("No data available for wealth disparity analysis.")
}
# Urban vs. rural disparity: average completion rate per country and
# residence type.
# The `residence` column kept after clean_names() holds codes (the str()
# output above shows "_T" for the total), not the labels "Urban"/"Rural".
# Filtering on the labels matched no rows, so the analysis was always skipped.
# NOTE(review): "U" and "R" are assumed to be the urban/rural codes; confirm
# with unique(education_data$residence).
residence_disparity <- education_data[
  education_data$residence %in% c("U", "R"),
]
if (nrow(residence_disparity) > 0) {
  # Replace the codes with readable labels so the legend is self-explanatory.
  residence_disparity$residence <- factor(
    residence_disparity$residence,
    levels = c("U", "R"),
    labels = c("Urban", "Rural")
  )
  # Aggregate data by geographic area and residence
  residence_disparity <- aggregate(
    obs_value ~ geographic_area + residence,
    data = residence_disparity, FUN = mean, na.rm = TRUE
  )
  colnames(residence_disparity)[3] <- "avg_completion_rate"
  # Plot urban vs rural completion rates as side-by-side bars per country
  ggplot(
    residence_disparity,
    aes(x = geographic_area, y = avg_completion_rate, fill = residence)
  ) +
    geom_bar(stat = "identity", position = "dodge") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(
      title = "Completion Rate by Residence Type and Country",
      x = "Country", y = "Completion Rate"
    )
} else {
  print("No data available for urban vs. rural analysis.")
}
# Yearly trend: mean obs_value per time_period, drawn as a line with a
# marker on every year. Prints a notice instead when there are no rows.
if (nrow(education_data) == 0) {
  print("No data available for time trends analysis.")
} else {
  # One row per year; the averaged column is renamed for the plot.
  time_trends <- setNames(
    aggregate(
      obs_value ~ time_period,
      data = education_data, FUN = mean, na.rm = TRUE
    ),
    c("time_period", "avg_completion_rate")
  )
  ggplot(time_trends, aes(x = time_period, y = avg_completion_rate)) +
    geom_line() +
    geom_point() +
    labs(
      x = "Year",
      y = "Average Completion Rate",
      title = "Trends in Completion Rate Over Time"
    )
}
# Country comparison: mean obs_value per geographic_area, drawn as bars
# ordered from the highest to the lowest average. Prints a notice instead
# when there are no rows.
if (nrow(education_data) == 0) {
  print("No data available for country comparison analysis.")
} else {
  # One row per country; the averaged column is renamed for the plot.
  country_comparison <- setNames(
    aggregate(
      obs_value ~ geographic_area,
      data = education_data, FUN = mean, na.rm = TRUE
    ),
    c("geographic_area", "avg_completion_rate")
  )
  ggplot(
    country_comparison,
    aes(
      x = reorder(geographic_area, -avg_completion_rate),
      y = avg_completion_rate
    )
  ) +
    geom_col(fill = "blue") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    labs(
      x = "Country",
      y = "Completion Rate",
      title = "Average Completion Rate by Country"
    )
}
Option 1: Horizontal Bar Plot
# Horizontal bar chart of the per-country averages, sorted by average
# completion rate; coord_flip() turns the bars sideways so the country names
# stay readable.
ggplot(
  country_comparison,
  aes(
    x = reorder(geographic_area, avg_completion_rate),
    y = avg_completion_rate
  )
) +
  geom_col(fill = "green") +
  coord_flip() +
  labs(
    x = "Country",
    y = "Completion Rate",
    title = "Average Completion Rate by Country"
  ) +
  theme_minimal()
Option 2: Display Only the Top and Bottom 10 Countries
# Top/bottom view: keep only the 10 countries with the highest and the 10
# with the lowest average completion rate, then plot them as horizontal bars.
top_countries <- head(
  country_comparison[order(-country_comparison$avg_completion_rate), ],
  10
)
bottom_countries <- head(
  country_comparison[order(country_comparison$avg_completion_rate), ],
  10
)
# Combine the top and bottom countries into one data frame.
# With fewer than 20 countries the two sets overlap; unique() removes the
# repeated rows, which the identity bar geom would otherwise stack on top of
# each other and so inflate those countries' bars.
selected_countries <- unique(rbind(top_countries, bottom_countries))
# Plot only the top and bottom countries with a horizontal bar chart
ggplot(
  selected_countries,
  aes(
    x = reorder(geographic_area, avg_completion_rate),
    y = avg_completion_rate
  )
) +
  geom_bar(stat = "identity", fill = "blue") +
  coord_flip() +
  labs(
    title = "Top and Bottom Countries by Completion Rate",
    x = "Country", y = "Completion Rate"
  ) +
  theme_minimal()
There are significant differences in education completion rates between males and females in many regions, with females often having lower rates in certain countries; hence the need for gender-focused educational support programs. The wealth quintiles show a correlation with education completion rates, with students from higher wealth brackets having better educational outcomes. Therefore, financial support and scholarship programs for lower-income families could reduce this disparity. Rural areas consistently report lower completion rates than their urban counterparts, highlighting a need for improved educational infrastructure and resources in rural regions to bridge this gap. While there has been a general improvement in education completion rates over recent years, some countries and regions show stagnation or decline, indicating areas where interventions might be urgently required.