This project analyzes the impact of COVID-19 on student education, focusing on internet access, dropout rates, and financial conditions. The goal is to identify key factors contributing to educational disruption and predict dropout risk.
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(moments)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.3
# Location of the survey CSV. The original author's local path is kept as the
# default so existing runs behave the same; set the COVID_EDUCATION_CSV
# environment variable to point at the file on another machine.
data_path <- Sys.getenv(
  "COVID_EDUCATION_CSV",
  unset = "C:/Users/PANASHE/OneDrive/open_one_time_covid_education_impact.csv"
)
# Fail early with a readable message rather than read.csv()'s generic
# "cannot open file" error.
if (!file.exists(data_path)) {
  stop("Survey CSV not found: ", data_path, call. = FALSE)
}
data <- read.csv(data_path)
# Compact overview of the loaded data frame: one line per column with its
# type and first few values.
str(object = data)
## 'data.frame': 4436 obs. of 27 variables:
## $ submission_id : num 4.57e+15 6.44e+15 5.00e+15 5.52e+15 5.03e+15 ...
## $ submission_date : chr "2021-03-17" "2021-03-29" "2021-03-18" "2021-03-24" ...
## $ gender : chr "Female" "Male" "Female" "Male" ...
## $ age : chr "Over 45 years old" "26 to 35 years old" "26 to 35 years old" "36 to 45 years old" ...
## $ geography : chr "Suburban/Peri-urban" "Suburban/Peri-urban" "City center or metropolitan area" "Suburban/Peri-urban" ...
## $ financial_situation : chr "I can afford food and regular expenses, but nothing else" "I cannot afford enough food for my family" "I can comfortably afford food, clothes, and furniture, and I have savings" "I can afford food, but nothing else" ...
## $ education : chr "University or college degree completed" "University or college degree completed" "University or college degree completed" "University or college degree completed" ...
## $ employment_status : chr "I am unemployed" "I am unemployed" "I work full-time, either as an employee or self-employed" "I work full-time, either as an employee or self-employed" ...
## $ submission_state : chr "Miranda" "Miranda" "Miranda" "Miranda" ...
## $ are_there_children_0_to_2_yrs_out_of_educational_system : int 0 0 1 0 0 0 0 0 0 1 ...
## $ were_children_3_to_17_yrs_enrolled_and_did_not_return_to_school : int 1 1 1 0 1 0 1 0 0 1 ...
## $ are_there_children_who_stopped_enrolling_in_primary_education : int 1 0 1 0 0 1 0 0 0 0 ...
## $ are_there_children_who_stopped_enrolling_in_secondary_education : int 0 0 1 0 0 1 0 0 0 0 ...
## $ are_children_attending_face_to_face_classes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ can_children_observe_deterioration_of_basic_services_of_school : int 1 1 1 1 1 0 1 1 1 1 ...
## $ do_children_3_and_17_yrs_receive_regular_school_meals : chr "Every day" "No" "No" "No" ...
## $ are_there_teachers_at_scheduled_class_hours : chr "Irregularly" "Irregularly" "There are not enough" "There are enough" ...
## $ are_children_3_to_17_yrs_dealing_with_irregular_school_activity : int 0 1 1 1 1 0 1 1 0 0 ...
## $ are_children_being_teached_by_unqualified_people : int 0 0 1 1 0 1 0 0 1 0 ...
## $ did_teachers_leave_the_educational_system : int 0 1 1 1 1 1 0 1 1 0 ...
## $ do_school_and_the_teachers_have_internet_connection : int 1 0 0 0 0 1 1 0 1 1 ...
## $ do_children_have_internet_connection : int 1 1 1 1 1 0 1 0 0 1 ...
## $ do_children_3_to_17_yrs_miss_virtual_class_due_to_lack_of_electricity : int 0 1 0 0 1 0 0 1 1 0 ...
## $ does_home_shows_severe_deficit_of_electricity : int 0 0 1 0 0 0 0 0 0 1 ...
## $ does_home_shows_severe_deficit_of_internet : int 0 0 0 0 0 0 0 0 0 0 ...
## $ do_children_3_to_17_yrs_miss_class_or_in_lower_grade : int 0 0 0 0 0 0 0 0 0 0 ...
## $ are_children_promoted_with_a_modality_different_from_formal_evaluation: int 0 0 1 0 1 1 0 0 1 0 ...
# Per-column summary: quartiles and mean for numeric columns, length/class
# for character columns.
summary(object = data)
## submission_id submission_date gender age
## Min. :4.504e+15 Length:4436 Length:4436 Length:4436
## 1st Qu.:5.077e+15 Class :character Class :character Class :character
## Median :5.642e+15 Mode :character Mode :character Mode :character
## Mean :5.633e+15
## 3rd Qu.:6.188e+15
## Max. :6.755e+15
## geography financial_situation education employment_status
## Length:4436 Length:4436 Length:4436 Length:4436
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## submission_state are_there_children_0_to_2_yrs_out_of_educational_system
## Length:4436 Min. :0.0000
## Class :character 1st Qu.:0.0000
## Mode :character Median :0.0000
## Mean :0.2949
## 3rd Qu.:1.0000
## Max. :1.0000
## were_children_3_to_17_yrs_enrolled_and_did_not_return_to_school
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.6132
## 3rd Qu.:1.0000
## Max. :1.0000
## are_there_children_who_stopped_enrolling_in_primary_education
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2065
## 3rd Qu.:0.0000
## Max. :1.0000
## are_there_children_who_stopped_enrolling_in_secondary_education
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1943
## 3rd Qu.:0.0000
## Max. :1.0000
## are_children_attending_face_to_face_classes
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1637
## 3rd Qu.:0.0000
## Max. :1.0000
## can_children_observe_deterioration_of_basic_services_of_school
## Min. :0.0000
## 1st Qu.:1.0000
## Median :1.0000
## Mean :0.8005
## 3rd Qu.:1.0000
## Max. :1.0000
## do_children_3_and_17_yrs_receive_regular_school_meals
## Length:4436
## Class :character
## Mode :character
##
##
##
## are_there_teachers_at_scheduled_class_hours
## Length:4436
## Class :character
## Mode :character
##
##
##
## are_children_3_to_17_yrs_dealing_with_irregular_school_activity
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.6431
## 3rd Qu.:1.0000
## Max. :1.0000
## are_children_being_teached_by_unqualified_people
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3165
## 3rd Qu.:1.0000
## Max. :1.0000
## did_teachers_leave_the_educational_system
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.6643
## 3rd Qu.:1.0000
## Max. :1.0000
## do_school_and_the_teachers_have_internet_connection
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.5604
## 3rd Qu.:1.0000
## Max. :1.0000
## do_children_have_internet_connection
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.6285
## 3rd Qu.:1.0000
## Max. :1.0000
## do_children_3_to_17_yrs_miss_virtual_class_due_to_lack_of_electricity
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.6655
## 3rd Qu.:1.0000
## Max. :1.0000
## does_home_shows_severe_deficit_of_electricity
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2845
## 3rd Qu.:1.0000
## Max. :1.0000
## does_home_shows_severe_deficit_of_internet
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.5791
## 3rd Qu.:1.0000
## Max. :1.0000
## do_children_3_to_17_yrs_miss_class_or_in_lower_grade
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2464
## 3rd Qu.:0.0000
## Max. :1.0000
## are_children_promoted_with_a_modality_different_from_formal_evaluation
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4272
## 3rd Qu.:1.0000
## Max. :1.0000
# List every column name of the data frame.
names(data)
## [1] "submission_id"
## [2] "submission_date"
## [3] "gender"
## [4] "age"
## [5] "geography"
## [6] "financial_situation"
## [7] "education"
## [8] "employment_status"
## [9] "submission_state"
## [10] "are_there_children_0_to_2_yrs_out_of_educational_system"
## [11] "were_children_3_to_17_yrs_enrolled_and_did_not_return_to_school"
## [12] "are_there_children_who_stopped_enrolling_in_primary_education"
## [13] "are_there_children_who_stopped_enrolling_in_secondary_education"
## [14] "are_children_attending_face_to_face_classes"
## [15] "can_children_observe_deterioration_of_basic_services_of_school"
## [16] "do_children_3_and_17_yrs_receive_regular_school_meals"
## [17] "are_there_teachers_at_scheduled_class_hours"
## [18] "are_children_3_to_17_yrs_dealing_with_irregular_school_activity"
## [19] "are_children_being_teached_by_unqualified_people"
## [20] "did_teachers_leave_the_educational_system"
## [21] "do_school_and_the_teachers_have_internet_connection"
## [22] "do_children_have_internet_connection"
## [23] "do_children_3_to_17_yrs_miss_virtual_class_due_to_lack_of_electricity"
## [24] "does_home_shows_severe_deficit_of_electricity"
## [25] "does_home_shows_severe_deficit_of_internet"
## [26] "do_children_3_to_17_yrs_miss_class_or_in_lower_grade"
## [27] "are_children_promoted_with_a_modality_different_from_formal_evaluation"
dim(data)
## [1] 4436 27
# The survey records unanswered questions as the literal string
# "Not Available" (it shows up in the age, financial_situation and geography
# tabulations of this analysis), which is.na() cannot detect. Count that
# placeholder per text column so the NA checks below are not misread as
# "no missing data".
vapply(
  Filter(is.character, data),
  function(column) sum(column == "Not Available", na.rm = TRUE),
  integer(1)
)
# Total number of true NA cells, then the same count broken down by column.
sum(is.na(data))
## [1] 0
colSums(is.na(data))
## submission_id
## 0
## submission_date
## 0
## gender
## 0
## age
## 0
## geography
## 0
## financial_situation
## 0
## education
## 0
## employment_status
## 0
## submission_state
## 0
## are_there_children_0_to_2_yrs_out_of_educational_system
## 0
## were_children_3_to_17_yrs_enrolled_and_did_not_return_to_school
## 0
## are_there_children_who_stopped_enrolling_in_primary_education
## 0
## are_there_children_who_stopped_enrolling_in_secondary_education
## 0
## are_children_attending_face_to_face_classes
## 0
## can_children_observe_deterioration_of_basic_services_of_school
## 0
## do_children_3_and_17_yrs_receive_regular_school_meals
## 0
## are_there_teachers_at_scheduled_class_hours
## 0
## are_children_3_to_17_yrs_dealing_with_irregular_school_activity
## 0
## are_children_being_teached_by_unqualified_people
## 0
## did_teachers_leave_the_educational_system
## 0
## do_school_and_the_teachers_have_internet_connection
## 0
## do_children_have_internet_connection
## 0
## do_children_3_to_17_yrs_miss_virtual_class_due_to_lack_of_electricity
## 0
## does_home_shows_severe_deficit_of_electricity
## 0
## does_home_shows_severe_deficit_of_internet
## 0
## do_children_3_to_17_yrs_miss_class_or_in_lower_grade
## 0
## are_children_promoted_with_a_modality_different_from_formal_evaluation
## 0
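The dataset contains both categorical and numerical variables. Missing values are checked before analysis to ensure data reliability: no column contains NA values. Note, however, that some categorical columns use the placeholder "Not Available" for unanswered questions, which these NA checks do not count as missing.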
# Convert the respondent-profile text columns to factors in a single call;
# all other columns are left untouched.
data <- transform(
  data,
  gender = as.factor(gender),
  age = as.factor(age),
  geography = as.factor(geography),
  education = as.factor(education),
  employment_status = as.factor(employment_status)
)
Categorical variables are converted into factors to ensure proper statistical analysis and modeling.
# Percentage of respondents whose children have an internet connection.
100 * mean(data[["do_children_have_internet_connection"]])
## [1] 62.84941
# Frequency of the "miss virtual class due to lack of electricity" indicator.
table(data[["do_children_3_to_17_yrs_miss_virtual_class_due_to_lack_of_electricity"]])
##
## 0 1
## 1484 2952
# Frequency of the "enrolled and did not return to school" indicator.
table(data[["were_children_3_to_17_yrs_enrolled_and_did_not_return_to_school"]])
##
## 0 1
## 1716 2720
# The same indicator expressed as a percentage of all respondents.
100 * mean(data[["were_children_3_to_17_yrs_enrolled_and_did_not_return_to_school"]])
## [1] 61.3165
# Frequency of the "attending face-to-face classes" indicator.
table(data[["are_children_attending_face_to_face_classes"]])
##
## 0 1
## 3710 726
# Respondent age group (rows) against the did-not-return indicator (columns).
table(
  data[["age"]],
  data[["were_children_3_to_17_yrs_enrolled_and_did_not_return_to_school"]]
)
##
## 0 1
## 16 to 25 years old 549 753
## 26 to 35 years old 464 840
## 36 to 45 years old 413 674
## Not Available 1 2
## Over 45 years old 289 450
## Under 16 0 1
# Number of respondents in each self-reported financial situation.
table(data[["financial_situation"]])
##
## I can afford food and regular expenses, but nothing else
## 1060
## I can afford food, but nothing else
## 1445
## I can afford food, regular expenses, and clothes, but nothing else
## 244
## I can comfortably afford food, clothes, and furniture, and I have savings
## 157
## I can comfortably afford food, clothes, and furniture, but I don’t have savings
## 127
## I cannot afford enough food for my family
## 1163
## Not Available
## 1
## Prefer not to answer
## 239
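Roughly 37% of respondents report that their children have no internet connection, indicating a digital divide. Financial conditions vary widely (more than half of respondents can afford food at best), which may influence access to education and return rates; this relationship is examined in the cross-tabulations below.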
# Children's internet connection (rows) against missing virtual class due to
# lack of electricity (columns).
table(
  data[["do_children_have_internet_connection"]],
  data[["do_children_3_to_17_yrs_miss_virtual_class_due_to_lack_of_electricity"]]
)
##
## 0 1
## 0 485 1163
## 1 999 1789
# Severe home electricity deficit (rows) against missing virtual class due to
# lack of electricity (columns).
table(
  data[["does_home_shows_severe_deficit_of_electricity"]],
  data[["do_children_3_to_17_yrs_miss_virtual_class_due_to_lack_of_electricity"]]
)
##
## 0 1
## 0 1270 1904
## 1 214 1048
# Self-reported financial situation (rows) against the did-not-return
# indicator (columns).
table(
  data[["financial_situation"]],
  data[["were_children_3_to_17_yrs_enrolled_and_did_not_return_to_school"]]
)
##
## 0
## I can afford food and regular expenses, but nothing else 442
## I can afford food, but nothing else 530
## I can afford food, regular expenses, and clothes, but nothing else 89
## I can comfortably afford food, clothes, and furniture, and I have savings 56
## I can comfortably afford food, clothes, and furniture, but I don’t have savings 54
## I cannot afford enough food for my family 434
## Not Available 1
## Prefer not to answer 110
##
## 1
## I can afford food and regular expenses, but nothing else 618
## I can afford food, but nothing else 915
## I can afford food, regular expenses, and clothes, but nothing else 155
## I can comfortably afford food, clothes, and furniture, and I have savings 101
## I can comfortably afford food, clothes, and furniture, but I don’t have savings 73
## I cannot afford enough food for my family 729
## Not Available 0
## Prefer not to answer 129
# Severe home internet deficit (rows) against the did-not-return indicator
# (columns).
table(
  data[["does_home_shows_severe_deficit_of_internet"]],
  data[["were_children_3_to_17_yrs_enrolled_and_did_not_return_to_school"]]
)
##
## 0 1
## 0 828 1039
## 1 888 1681
# Respondent geography (rows) against the did-not-return indicator (columns).
table(
  data[["geography"]],
  data[["were_children_3_to_17_yrs_enrolled_and_did_not_return_to_school"]]
)
##
## 0 1
## City center or metropolitan area 748 1172
## Not Available 1 0
## Rural 406 735
## Suburban/Peri-urban 561 813
# Irregular school activity (rows) against the did-not-return indicator
# (columns).
table(
  data[["are_children_3_to_17_yrs_dealing_with_irregular_school_activity"]],
  data[["were_children_3_to_17_yrs_enrolled_and_did_not_return_to_school"]]
)
##
## 0 1
## 0 748 835
## 1 968 1885
# Teachers leaving the educational system (rows) against irregular school
# activity (columns).
table(
  data[["did_teachers_leave_the_educational_system"]],
  data[["are_children_3_to_17_yrs_dealing_with_irregular_school_activity"]]
)
##
## 0 1
## 0 934 555
## 1 649 2298
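The cross-tabulation of financial situation against non-return shows only modest differences between groups: the share of respondents reporting children who did not return ranges from about 57% to 64% across the substantive categories, with no consistent gradient from lower to higher income (about 63% among those who cannot afford enough food versus about 64% among those with savings). Larger gaps appear for a severe home internet deficit (about 65% versus 56%) and for irregular school activity (about 66% versus 53%).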
# Drop respondents whose financial situation or geography is the placeholder
# "Not Available". In this survey that is a single respondent carrying the
# placeholder in both columns, which gives the model two identical
# one-observation dummy variables: the fit becomes rank-deficient and that
# row's fitted probability collapses to ~0 (quasi-complete separation).
# %in% is used instead of == so that a true NA can never enter the row index.
is_placeholder <- data$financial_situation %in% "Not Available" |
  data$geography %in% "Not Available"
data <- droplevels(data[!is_placeholder, ])

# Model variables as factors.
# NOTE: despite its name, return_to_school is built from the "enrolled and did
# NOT return to school" indicator, so the event being modelled (level "1") is
# presumably non-return -- confirm against the survey codebook.
data$internet_access <- as.factor(data$do_children_have_internet_connection)
data$return_to_school <- as.factor(data$were_children_3_to_17_yrs_enrolled_and_did_not_return_to_school)
data$irregular_activity <- as.factor(data$are_children_3_to_17_yrs_dealing_with_irregular_school_activity)
data$electricity_issue <- as.factor(data$does_home_shows_severe_deficit_of_electricity)
data$financial_status <- as.factor(data$financial_situation)

# Logistic regression of the outcome on internet access, electricity deficit,
# financial status and geography. For a factor response glm() treats the first
# level as failure, so the fitted values are P(return_to_school == "1").
model2 <- glm(
  return_to_school ~ internet_access + electricity_issue + financial_status + geography,
  data = data,
  family = "binomial"
)
# Fitted probability for every remaining respondent.
data$dropout_risk <- predict(model2, type = "response")
# ================================
# CORRELATION ANALYSIS
# ================================
# as.numeric() on a factor returns the internal level codes (1, 2, ...), not
# the labels, so go through as.character() to recover the original 0/1 values.
data$internet_num <- as.numeric(as.character(data$internet_access))
data$return_num <- as.numeric(as.character(data$return_to_school))
data$electricity_num <- as.numeric(as.character(data$electricity_issue))

# financial_status levels are ordered alphabetically, so their level codes
# carry no meaning. Rank the answers explicitly from worst-off (1) to
# best-off (6). Answers matching no pattern ("Prefer not to answer",
# "Not Available") stay NA. Regular expressions are used because one label
# contains a typographic apostrophe whose encoding may vary between systems.
financial_patterns <- c(
  "^I cannot afford enough food",
  "^I can afford food, but nothing else$",
  "^I can afford food and regular expenses",
  "^I can afford food, regular expenses, and clothes",
  "^I can comfortably afford .*but I don.+t have savings$",
  "^I can comfortably afford .*and I have savings$"
)
financial_text <- as.character(data$financial_status)
data$financial_num <- NA_real_
for (level_rank in seq_along(financial_patterns)) {
  matches_level <- grepl(financial_patterns[level_rank], financial_text)
  data$financial_num[matches_level] <- level_rank
}

# Pairwise-complete observations keep the respondents without a financial rank
# in the correlations that do not involve financial_num.
cor_matrix <- cor(
  data[, c("internet_num", "return_num", "electricity_num", "financial_num", "dropout_risk")],
  use = "pairwise.complete.obs"
)
cor_matrix
## internet_num return_num electricity_num financial_num
## internet_num 1.000000000 0.009096232 -0.11804453 -0.049153182
## return_num 0.009096232 1.000000000 0.09149783 -0.002601488
## electricity_num -0.118044528 0.091497829 1.00000000 0.050075172
## financial_num -0.049153182 -0.002601488 0.05007517 1.000000000
## dropout_risk 0.079628692 0.114286379 0.80097447 -0.022773434
## dropout_risk
## internet_num 0.07962869
## return_num 0.11428638
## electricity_num 0.80097447
## financial_num -0.02277343
## dropout_risk 1.00000000
# Reshape the correlation matrix to long form (Var1, Var2, value) and draw it
# as a tile plot with each coefficient printed, rounded to two decimals.
cor_melt <- melt(cor_matrix)
ggplot(data = cor_melt, mapping = aes(x = Var1, y = Var2)) +
  geom_tile(aes(fill = value)) +
  geom_text(aes(label = round(value, digits = 2))) +
  ggtitle("Correlation Heatmap") +
  theme_minimal()
# ================================
# PRESCRIPTIVE ANALYSIS
# ================================
# Row-wise proportions: within each internet_access level, the share of each
# return_to_school level.
prop.table(
  table(data[["internet_access"]], data[["return_to_school"]]),
  margin = 1
)
##
## 0 1
## 0 0.3925971 0.6074029
## 1 0.3834290 0.6165710
# Keep the respondents whose fitted risk exceeds 0.7 and report how many
# there are.
high_risk_students <- data[data[["dropout_risk"]] > 0.7, , drop = FALSE]
dim(high_risk_students)[1]
## [1] 381
# Mean fitted risk for every observed combination of financial status and
# geography.
aggregate(
  dropout_risk ~ financial_status + geography,
  data = data,
  FUN = mean
)
## financial_status
## 1 I can afford food and regular expenses, but nothing else
## 2 I can afford food, but nothing else
## 3 I can afford food, regular expenses, and clothes, but nothing else
## 4 I can comfortably afford food, clothes, and furniture, and I have savings
## 5 I can comfortably afford food, clothes, and furniture, but I don’t have savings
## 6 I cannot afford enough food for my family
## 7 Prefer not to answer
## 8 Not Available
## 9 I can afford food and regular expenses, but nothing else
## 10 I can afford food, but nothing else
## 11 I can afford food, regular expenses, and clothes, but nothing else
## 12 I can comfortably afford food, clothes, and furniture, and I have savings
## 13 I can comfortably afford food, clothes, and furniture, but I don’t have savings
## 14 I cannot afford enough food for my family
## 15 Prefer not to answer
## 16 I can afford food and regular expenses, but nothing else
## 17 I can afford food, but nothing else
## 18 I can afford food, regular expenses, and clothes, but nothing else
## 19 I can comfortably afford food, clothes, and furniture, and I have savings
## 20 I can comfortably afford food, clothes, and furniture, but I don’t have savings
## 21 I cannot afford enough food for my family
## 22 Prefer not to answer
## geography dropout_risk
## 1 City center or metropolitan area 5.856496e-01
## 2 City center or metropolitan area 6.288990e-01
## 3 City center or metropolitan area 6.378863e-01
## 4 City center or metropolitan area 6.459355e-01
## 5 City center or metropolitan area 5.726864e-01
## 6 City center or metropolitan area 6.237103e-01
## 7 City center or metropolitan area 5.373316e-01
## 8 Not Available 9.482496e-06
## 9 Rural 6.161654e-01
## 10 Rural 6.597386e-01
## 11 Rural 6.578040e-01
## 12 Rural 6.699145e-01
## 13 Rural 6.086975e-01
## 14 Rural 6.499037e-01
## 15 Rural 5.712774e-01
## 16 Suburban/Peri-urban 5.652749e-01
## 17 Suburban/Peri-urban 6.128347e-01
## 18 Suburban/Peri-urban 6.207245e-01
## 19 Suburban/Peri-urban 6.225738e-01
## 20 Suburban/Peri-urban 5.633299e-01
## 21 Suburban/Peri-urban 6.026952e-01
## 22 Suburban/Peri-urban 5.130942e-01
# ================================
# VISUALIZATION
# ================================
# Respondent counts per internet_access level.
ggplot(data) +
  geom_bar(aes(x = internet_access, fill = internet_access))

# Respondent counts per return_to_school level.
ggplot(data) +
  geom_bar(aes(x = return_to_school, fill = return_to_school))

# Counts per age group, with return_to_school levels side by side.
ggplot(data) +
  geom_bar(aes(x = age, fill = return_to_school), position = "dodge")

# Share of each return_to_school level within every financial status.
ggplot(data) +
  geom_bar(aes(x = financial_status, fill = return_to_school), position = "fill")

# Histogram of the fitted risk; the dashed red line marks the 0.7 cut-off.
ggplot(data, aes(x = dropout_risk)) +
  geom_histogram(alpha = 0.7, fill = "purple", bins = 20) +
  geom_vline(xintercept = 0.7, linetype = "dashed", colour = "red")

# Mean fitted risk for each return_to_school level.
ggplot(data) +
  stat_summary(
    aes(x = return_to_school, y = dropout_risk, fill = return_to_school),
    fun = mean,
    geom = "bar"
  )

# Density of the fitted risk, one curve per return_to_school level.
ggplot(data) +
  geom_density(aes(x = dropout_risk, fill = return_to_school), alpha = 0.4)

# Fitted risk against the numeric financial score, with a linear trend line.
ggplot(data, aes(financial_num, dropout_risk)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", colour = "blue")
## `geom_smooth()` using formula = 'y ~ x'
# Shape of the fitted-risk distribution, using the moments package:
# skewness first, then kurtosis.
moments::skewness(data[["dropout_risk"]])
## [1] 0.003830186
moments::kurtosis(data[["dropout_risk"]])
## [1] 5.629831
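The distribution of predicted risk is almost perfectly symmetric (skewness ≈ 0) but heavy-tailed (kurtosis ≈ 5.6, compared with 3 for a normal distribution): most predictions are concentrated in a narrow band, with a small number of extreme cases.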
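The analysis examines internet access, electricity availability, and financial status as potential drivers of children not returning to school. In this sample, a severe household electricity deficit is the factor most strongly associated with the predicted risk (correlation of about 0.80 with dropout_risk), whereas children's internet connection and financial status show only weak associations with non-return. Households facing infrastructure deficits and irregular school activity are at higher risk. Predictive modeling helps identify vulnerable groups, allowing targeted interventions to reduce educational inequality.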