library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(readr)
# Load the school-level education inequality dataset from the Excel workbook
# located in the working directory.
education_inequality <- readxl::read_excel(
  path = "education_inequality_data.xlsx"
)
Column Descriptions
id — Unique identifier for each school entry.
school_name — Name of the school (synthetically generated).
state — U.S. state where the school is located.
school_type — Type of institution: Public, Private, or Charter.
grade_level — Primary level served by the school: Elementary,
Middle, or High.
funding_per_student_usd — Annual funding per student in U.S.
dollars.
avg_test_score_percent — Average student performance score
(0–100%).
student_teacher_ratio — Average number of students per teacher.
percent_low_income — Percentage of students from low-income
households.
percent_minority — Percentage of students from minority
backgrounds.
internet_access_percent — Percentage of students with internet
access at school.
dropout_rate_percent — Annual dropout rate among students (as a
percentage).
# Print per-column summary statistics for the loaded dataset.
education_inequality |>
  summary()
## id school_name state school_type
## Min. : 1.0 Length:1000 Length:1000 Length:1000
## 1st Qu.: 250.8 Class :character Class :character Class :character
## Median : 500.5 Mode :character Mode :character Mode :character
## Mean : 500.5
## 3rd Qu.: 750.2
## Max. :1000.0
## grade_level funding_per_student_usd avg_test_score_percent
## Length:1000 Min. : 5013 Min. : 40.00
## Class :character 1st Qu.: 9726 1st Qu.: 54.58
## Mode :character Median :15362 Median : 69.90
## Mean :15154 Mean : 70.00
## 3rd Qu.:20280 3rd Qu.: 85.62
## Max. :24982 Max. :100.00
## student_teacher_ratio percent_low_income percent_minority
## Min. :10.00 Min. :10.20 Min. : 5.00
## 1st Qu.:14.80 1st Qu.:33.40 1st Qu.:29.30
## Median :19.40 Median :54.10 Median :49.45
## Mean :19.73 Mean :53.76 Mean :49.65
## 3rd Qu.:25.00 3rd Qu.:74.60 3rd Qu.:69.95
## Max. :30.00 Max. :94.90 Max. :94.90
## internet_access_percent dropout_rate_percent
## Min. : 50.00 Min. : 0.500
## 1st Qu.: 62.40 1st Qu.: 4.050
## Median : 74.75 Median : 7.685
## Mean : 74.69 Mean : 7.766
## 3rd Qu.: 87.33 3rd Qu.:11.492
## Max. :100.00 Max. :14.990
# Histogram of the percentage of low-income students across schools.
#
# Axis limits are intentionally NOT set here: `labs()` only sets labels, so
# the previous `xlim = ` / `ylim = ` arguments were silently ignored. The old
# x range (5000-25000) also did not match this variable, which is a percentage;
# applying it as real limits would have removed every observation. If limits
# are ever needed, add `coord_cartesian(xlim = ..., ylim = ...)` as a layer.
ggplot(education_inequality, aes(x = percent_low_income)) +
  geom_histogram(bins = 30, color = "red") +
  labs(
    title = "Frequency of Low-income Students by Percentage",
    x = "Low-income Students (%)",
    y = "Number of Schools"
  )

# Scatter plot of each school's dropout rate against its percentage of
# low-income students, with a linear-model trend line.
# Points are coloured by grade level and shaped by school type; those
# aesthetics are set inside geom_point() so geom_smooth() fits a single
# overall line rather than one per group.
ggplot(
  education_inequality,
  aes(x = percent_low_income, y = dropout_rate_percent)
) +
  geom_point(aes(color = grade_level, shape = school_type)) +
  geom_smooth(method = "lm") +
  labs(
    title = "Dropout Rate vs. Percentage of Low-income Students",
    subtitle = "Visual correlation between low-income share and dropout rate per school",
    # The x variable is the share of low-income students, not income itself.
    x = "Low-income Students (%)",
    y = "Annual Dropout Rate (%)",
    color = "Grade Level",
    shape = "School Type"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Correlation (cor()'s default method, Pearson) between the percentage of
# low-income students and the dropout rate.
with(
  education_inequality,
  cor(percent_low_income, dropout_rate_percent)
)
## [1] 0.001965227