library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(readr)
# Read the school-level education inequality data from the Excel workbook
# (path is relative to the working directory).
education_inequality <- read_excel(
  path = "education_inequality_data.xlsx"
)

Column Descriptions

id — Unique identifier for each school entry.

school_name — Name of the school (synthetically generated).

state — U.S. state where the school is located.

school_type — Type of institution: Public, Private, or Charter.

grade_level — Primary level served by the school: Elementary, Middle, or High.

funding_per_student_usd — Annual funding per student in U.S. dollars.

avg_test_score_percent — Average student performance score (0–100%).

student_teacher_ratio — Average number of students per teacher.

percent_low_income — Percentage of students from low-income households.

percent_minority — Percentage of students from minority backgrounds.

internet_access_percent — Percentage of students with internet access at school.

dropout_rate_percent — Annual dropout rate among students (as a percentage).

# Print per-column summary statistics (quartiles and mean for numeric columns,
# length/class for character columns).
education_inequality %>%
  summary()
##        id         school_name           state           school_type       
##  Min.   :   1.0   Length:1000        Length:1000        Length:1000       
##  1st Qu.: 250.8   Class :character   Class :character   Class :character  
##  Median : 500.5   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 500.5                                                           
##  3rd Qu.: 750.2                                                           
##  Max.   :1000.0                                                           
##  grade_level        funding_per_student_usd avg_test_score_percent
##  Length:1000        Min.   : 5013           Min.   : 40.00        
##  Class :character   1st Qu.: 9726           1st Qu.: 54.58        
##  Mode  :character   Median :15362           Median : 69.90        
##                     Mean   :15154           Mean   : 70.00        
##                     3rd Qu.:20280           3rd Qu.: 85.62        
##                     Max.   :24982           Max.   :100.00        
##  student_teacher_ratio percent_low_income percent_minority
##  Min.   :10.00         Min.   :10.20      Min.   : 5.00   
##  1st Qu.:14.80         1st Qu.:33.40      1st Qu.:29.30   
##  Median :19.40         Median :54.10      Median :49.45   
##  Mean   :19.73         Mean   :53.76      Mean   :49.65   
##  3rd Qu.:25.00         3rd Qu.:74.60      3rd Qu.:69.95   
##  Max.   :30.00         Max.   :94.90      Max.   :94.90   
##  internet_access_percent dropout_rate_percent
##  Min.   : 50.00          Min.   : 0.500      
##  1st Qu.: 62.40          1st Qu.: 4.050      
##  Median : 74.75          Median : 7.685      
##  Mean   : 74.69          Mean   : 7.766      
##  3rd Qu.: 87.33          3rd Qu.:11.492      
##  Max.   :100.00          Max.   :14.990
# Histogram of the percentage of low-income students across school entries.
#
# Axis limits must not be passed to labs(): labs() only sets labels, so
# `xlim`/`ylim` there are silently ignored. The y limit is applied with
# coord_cartesian(), which zooms the view without dropping any rows.
# The previous x range (5000-25000) did not match a percentage variable
# (it looks like the funding range), so the x axis is left to scale itself.
# NOTE(review): ylim of 0-100 assumes no bin holds more than 100 schools;
# widen it if a bar appears clipped.
ggplot(education_inequality, aes(x = percent_low_income)) +
  geom_histogram(bins = 30, color = "red") +
  labs(
    title = "Frequency of Low-income Students by Percentage",
    x = "Low-income Students (%)",
    y = "Number of Schools"
  ) +
  coord_cartesian(ylim = c(0, 100))

# Scatter plot of dropout rate (y) against the percentage of low-income
# students (x), one point per school entry. Points are coloured by grade level
# and shaped by school type; a single linear-model fit (with its default
# confidence band) is drawn over all points.
#
# Labels describe the plotted columns: x is the share of low-income students
# (not income), and y is the school's dropout rate in percent.
ggplot(
  education_inequality,
  aes(x = percent_low_income, y = dropout_rate_percent)
) +
  geom_point(aes(color = grade_level, shape = school_type)) +
  geom_smooth(method = "lm") +
  labs(
    title = "Dropout Rate by Percentage of Low-income Students",
    subtitle = "Relationship between low-income share and dropout rate, per school",
    x = "Low-income Students (%)",
    y = "Dropout Rate (%)",
    color = "Grade Level",
    shape = "School Type"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Pearson correlation (cor()'s default method) between the share of low-income
# students and the dropout rate.
with(
  education_inequality,
  cor(percent_low_income, dropout_rate_percent)
)
## [1] 0.001965227