library(tidyverse) # For data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl) # For reading Excel files
library(car) # For regression diagnostics
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(lmtest) # For testing linear models
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
district_data <- read_excel("district_data.xlsx")
# Look at the data structure
str(district_data)
## tibble [82 × 136] (S3: tbl_df/tbl/data.frame)
## $ DISTNAME : chr [1:82] "ALAMO HEIGHTS ISD" "HARLANDALE ISD" "EDGEWOOD ISD" "SAN ANTONIO ISD" ...
## $ DISTRICT : chr [1:82] "015901" "015904" "015905" "015907" ...
## $ DZCNTYNM : chr [1:82] "015 BEXAR" "015 BEXAR" "015 BEXAR" "015 BEXAR" ...
## $ REGION : chr [1:82] "20" "20" "20" "20" ...
## $ DZCAMPUS : num [1:82] 6 28 27 97 18 8 75 15 19 124 ...
## $ DPETALLC : num [1:82] 4731 12160 8165 45212 7871 ...
## $ DPETBLAP : num [1:82] 2 0.5 1.4 5.6 1.3 0.5 6.9 8.8 3.6 7 ...
## $ DPETHISP : num [1:82] 39.9 97.2 96.3 90.1 95.2 91.7 61.5 74.4 89.7 68 ...
## $ DPETWHIP : num [1:82] 52.5 2 0.8 3 2.8 6.8 22.8 13.8 4.9 16.7 ...
## $ DPETINDP : num [1:82] 0.3 0 0 0.1 0.2 0.1 0.2 0.2 0.1 0.1 ...
## $ DPETASIP : num [1:82] 2.8 0.1 1.2 0.5 0.2 0.2 4.3 0.6 0.5 4 ...
## $ DPETPCIP : num [1:82] 0.1 0 0 0 0.1 0.1 0.1 0.1 0.1 0.2 ...
## $ DPETTWOP : num [1:82] 2.4 0.2 0.2 0.7 0.3 0.7 4 2.1 1.2 4 ...
## $ DPETECOP : num [1:82] 21.6 87.8 94.2 88.6 93.6 86.1 52.5 59.6 81.4 54.3 ...
## $ DPETLEPP : num [1:82] 7.3 19.8 24.5 23.5 21 22.1 18.4 13.6 19.1 12 ...
## $ DPETSPEP : num [1:82] 11.4 12.9 12.8 14.5 14.9 12.1 14 13.3 17.5 14.7 ...
## $ DPETBILP : num [1:82] 12.8 22.1 24.4 26 21 21 20.7 15.9 19 11.2 ...
## $ DPETVOCP : num [1:82] 19.1 24.7 32 22.6 29 30.7 24.5 27.9 25.3 21.2 ...
## $ DPETGIFP : num [1:82] 22.1 6.7 2.7 7.3 6 5 9 7.2 10.7 7.7 ...
## $ DA0AT22R : num [1:82] 93.8 87.6 85.6 85 86.1 91.2 92.5 89.5 88.6 91.8 ...
## $ DA0912DR22R : num [1:82] 0.5 2.8 5.7 2.7 6.3 2.4 2.4 3.6 1.4 1.1 ...
## $ DAGC4X22R : num [1:82] 98.5 89.8 84.1 87 77.6 93.5 89.4 88 94.4 93.6 ...
## $ DAGC5X21R : num [1:82] 98.3 92.6 88.7 89.7 86.1 88.6 92.4 92.7 97.2 95.5 ...
## $ DAGC6X20R : num [1:82] 99.3 95.5 89 90.3 87.7 90.8 94.1 95.1 96.6 95.7 ...
## $ DA0GR22N : num [1:82] 422 917 575 2678 543 ...
## $ DA0GS22N : num [1:82] 363 701 537 2339 438 ...
## $ DDA00A001S23R: num [1:82] 89 67 55 61 56 81 80 63 69 77 ...
## $ DDA00A001223R: num [1:82] 69 33 24 31 25 52 55 30 38 50 ...
## $ DDA00A001323R: num [1:82] 32 8 6 9 7 18 24 7 11 19 ...
## $ DDA00AR01S23R: num [1:82] 90 66 56 61 54 80 80 66 72 78 ...
## $ DDA00AR01223R: num [1:82] 74 36 27 35 28 54 58 36 44 54 ...
## $ DDA00AR01323R: num [1:82] 32 8 5 10 7 19 24 8 12 18 ...
## $ DDA00AM01S23R: num [1:82] 88 65 52 58 54 80 79 58 64 74 ...
## $ DDA00AM01223R: num [1:82] 64 28 20 25 19 52 49 22 29 41 ...
## $ DDA00AM01323R: num [1:82] 29 7 5 7 5 21 20 6 9 15 ...
## $ DDA00AC01S23R: num [1:82] 89 69 56 62 59 85 82 63 69 79 ...
## $ DDA00AC01223R: num [1:82] 67 35 19 29 26 48 57 24 34 51 ...
## $ DDA00AC01323R: num [1:82] 32 9 4 8 6 12 27 4 8 20 ...
## $ DDA00AS01S23R: num [1:82] 87 73 63 69 69 82 83 67 74 82 ...
## $ DDA00AS01223R: num [1:82] 68 40 32 35 32 47 61 36 43 58 ...
## $ DDA00AS01323R: num [1:82] 44 15 12 15 12 18 38 14 17 33 ...
## $ DDB00A001S23R: num [1:82] 81 58 58 55 59 87 67 57 70 73 ...
## $ DDB00A001223R: num [1:82] 58 25 22 24 21 58 37 25 38 43 ...
## $ DDB00A001323R: num [1:82] 15 4 4 6 8 23 13 5 11 14 ...
## $ DDH00A001S23R: num [1:82] 81 67 55 61 56 80 77 62 68 75 ...
## $ DDH00A001223R: num [1:82] 55 33 23 31 25 51 48 28 37 45 ...
## $ DDH00A001323R: num [1:82] 21 8 5 9 7 17 18 6 11 15 ...
## $ DDW00A001S23R: num [1:82] 94 63 50 72 64 86 90 72 80 87 ...
## $ DDW00A001223R: num [1:82] 78 33 17 50 27 62 72 39 51 66 ...
## $ DDW00A001323R: num [1:82] 41 8 6 24 10 28 37 12 17 31 ...
## $ DDI00A001S23R: num [1:82] 82 67 NA 47 53 57 79 67 71 71 ...
## $ DDI00A001223R: num [1:82] 82 33 NA 16 29 29 49 40 17 48 ...
## $ DDI00A001323R: num [1:82] 29 33 NA 4 18 14 24 13 0 18 ...
## $ DD300A001S23R: num [1:82] 93 100 80 62 78 100 91 77 85 82 ...
## $ DD300A001223R: num [1:82] 79 79 52 36 50 95 77 47 61 64 ...
## $ DD300A001323R: num [1:82] 46 64 24 16 33 63 48 18 13 34 ...
## $ DD400A001S23R: num [1:82] 100 NA -1 50 33 100 73 76 79 87 ...
## $ DD400A001223R: num [1:82] 89 NA -1 38 0 83 41 65 42 62 ...
## $ DD400A001323R: num [1:82] 22 NA -1 13 0 67 16 12 32 24 ...
## $ DD200A001S23R: num [1:82] 95 86 75 69 53 91 88 74 70 83 ...
## $ DD200A001223R: num [1:82] 78 41 39 45 16 62 66 42 34 59 ...
## $ DD200A001323R: num [1:82] 45 14 7 18 0 15 33 12 8 26 ...
## $ DDE00A001S23R: num [1:82] 73 65 54 59 55 79 70 60 67 69 ...
## $ DDE00A001223R: num [1:82] 43 32 23 28 24 49 40 27 34 37 ...
## $ DDE00A001323R: num [1:82] 13 8 5 8 6 17 13 6 9 11 ...
## $ DA0CT22R : num [1:82] 68.7 83.9 77.6 81.9 69.8 87.3 84.1 40.9 85.3 88.5 ...
## $ DA0CC22R : num [1:82] 66.6 14.2 7.8 12 9.8 9.2 38.6 22.1 9.6 29.4 ...
## $ dA0csa22r : num [1:82] 1161 902 857 890 879 ...
## $ dA0caa22r : num [1:82] 25.3 19.9 20.5 16.1 NA 18.3 23.4 19.5 15.8 24.7 ...
## $ DPSATOFC : num [1:82] 660 1972 1314 7028 1152 ...
## $ DPSTTOFC : num [1:82] 358 846 551 3004 509 ...
## $ DPSCTOFP : num [1:82] 1 1.2 1.7 0.5 1.2 1.2 0.7 2.3 0.4 0.3 ...
## $ DPSSTOFP : num [1:82] 3 2.7 4.8 2.8 3.4 2.6 2.8 2.7 2.5 2.1 ...
## $ DPSUTOFP : num [1:82] 7 14 10.8 16.5 11.2 11.6 12.7 10.2 11 13.3 ...
## $ DPSTTOFP : num [1:82] 54.3 42.9 42 42.7 44.2 43.5 50.4 46.2 45.6 52.1 ...
## $ DPSETOFP : num [1:82] 14.9 10.4 9.3 10.4 9.9 9.6 8.6 11.9 7.6 8.3 ...
## $ DPSXTOFP : num [1:82] 19.8 28.9 31.5 27.2 30.2 31.5 24.9 26.7 32.8 24 ...
## $ DPSCTOSA : num [1:82] 144939 105680 130381 125407 115823 ...
## $ DPSSTOSA : num [1:82] 88400 83689 75694 97339 82259 ...
## $ DPSUTOSA : num [1:82] 67118 69312 70200 67938 69213 ...
## $ DPSTTOSA : num [1:82] 60575 61216 60199 58682 61254 ...
## $ DPSAMIFP : num [1:82] 49.4 85.1 89.6 84.1 88 78.7 56.1 61.5 78.7 64.5 ...
## $ DPSAKIDR : num [1:82] 7.2 6.2 6.2 6.4 6.8 6.9 7.3 7.9 6.6 7.7 ...
## $ DPSTKIDR : num [1:82] 13.2 14.4 14.8 15.1 15.5 15.9 14.4 17.1 14.4 14.9 ...
## $ DPST05FP : num [1:82] 20.6 31.1 52 40.9 30 37 28.6 41.5 31.7 30.3 ...
## $ DPSTEXPA : num [1:82] 14.1 11.3 8.8 9.7 11.5 10.1 10.7 9.4 11.5 11.8 ...
## $ DPSTADFP : num [1:82] 40.9 28.4 34.3 37.8 30.2 21.6 34.2 28.3 31.9 32.3 ...
## $ DPSTURNR : num [1:82] 20.7 18.7 27 23.9 22.6 19.5 21.1 24 20.1 14.2 ...
## $ DPSTBLFP : num [1:82] 2.8 2.2 4.9 8.5 1.8 3.1 4.2 9 2.4 3.5 ...
## $ DPSTHIFP : num [1:82] 35.3 75.9 77.6 68.9 81.3 64.8 41 45.7 67.6 51.9 ...
## $ DPSTWHFP : num [1:82] 60.2 21.1 16 20.3 16.1 31 52.4 43.1 27.9 41.8 ...
## $ DPSTINFP : num [1:82] 0 0.2 0 0 0 0 0.1 0.1 0.2 0.3 ...
## $ DPSTASFP : num [1:82] 0.6 0.1 0.9 0.8 0.2 0.4 1.4 0.5 0.6 0.9 ...
## $ DPSTPIFP : num [1:82] 0.6 0.1 0 0.1 0 0 0.2 0 0.1 0.1 ...
## $ DPSTTWFP : num [1:82] 0.6 0.2 0.5 1.3 0.6 0.7 0.6 1.6 1.2 1.4 ...
## $ DPSTREFP : num [1:82] 68.9 68.9 81.4 65.2 64.2 66.5 65.9 58.8 76.5 62.6 ...
## $ DPSTSPFP : num [1:82] 11.3 14.2 8.3 11.3 11.1 9.8 15.3 11.8 11.5 9.9 ...
## $ DPSTCOFP : num [1:82] 5 0 0 6.9 5.1 2.3 3.2 10.9 1 3.9 ...
## $ DPSTBIFP : num [1:82] 1.8 8.3 3.9 11.3 9.5 8.3 5 6.6 7.2 11.3 ...
## [list output truncated]
## Select and Clean Data
names(district_data)
## [1] "DISTNAME" "DISTRICT" "DZCNTYNM" "REGION"
## [5] "DZCAMPUS" "DPETALLC" "DPETBLAP" "DPETHISP"
## [9] "DPETWHIP" "DPETINDP" "DPETASIP" "DPETPCIP"
## [13] "DPETTWOP" "DPETECOP" "DPETLEPP" "DPETSPEP"
## [17] "DPETBILP" "DPETVOCP" "DPETGIFP" "DA0AT22R"
## [21] "DA0912DR22R" "DAGC4X22R" "DAGC5X21R" "DAGC6X20R"
## [25] "DA0GR22N" "DA0GS22N" "DDA00A001S23R" "DDA00A001223R"
## [29] "DDA00A001323R" "DDA00AR01S23R" "DDA00AR01223R" "DDA00AR01323R"
## [33] "DDA00AM01S23R" "DDA00AM01223R" "DDA00AM01323R" "DDA00AC01S23R"
## [37] "DDA00AC01223R" "DDA00AC01323R" "DDA00AS01S23R" "DDA00AS01223R"
## [41] "DDA00AS01323R" "DDB00A001S23R" "DDB00A001223R" "DDB00A001323R"
## [45] "DDH00A001S23R" "DDH00A001223R" "DDH00A001323R" "DDW00A001S23R"
## [49] "DDW00A001223R" "DDW00A001323R" "DDI00A001S23R" "DDI00A001223R"
## [53] "DDI00A001323R" "DD300A001S23R" "DD300A001223R" "DD300A001323R"
## [57] "DD400A001S23R" "DD400A001223R" "DD400A001323R" "DD200A001S23R"
## [61] "DD200A001223R" "DD200A001323R" "DDE00A001S23R" "DDE00A001223R"
## [65] "DDE00A001323R" "DA0CT22R" "DA0CC22R" "dA0csa22r"
## [69] "dA0caa22r" "DPSATOFC" "DPSTTOFC" "DPSCTOFP"
## [73] "DPSSTOFP" "DPSUTOFP" "DPSTTOFP" "DPSETOFP"
## [77] "DPSXTOFP" "DPSCTOSA" "DPSSTOSA" "DPSUTOSA"
## [81] "DPSTTOSA" "DPSAMIFP" "DPSAKIDR" "DPSTKIDR"
## [85] "DPST05FP" "DPSTEXPA" "DPSTADFP" "DPSTURNR"
## [89] "DPSTBLFP" "DPSTHIFP" "DPSTWHFP" "DPSTINFP"
## [93] "DPSTASFP" "DPSTPIFP" "DPSTTWFP" "DPSTREFP"
## [97] "DPSTSPFP" "DPSTCOFP" "DPSTBIFP" "DPSTVOFP"
## [101] "DPSTGOFP" "DPFVTOTK" "DPFTADPR" "DPFRAALLT"
## [105] "DPFRAALLK" "DPFRAOPRT" "DPFRASTAP" "DZRVLOCP"
## [109] "DPFRAFEDP" "DPFRAORVT" "DPFUNAB1T" "DPFUNA4T"
## [113] "DPFEAALLT" "DPFEAOPFT" "DPFEAOPFK" "DPFEAINSP"
## [117] "DZEXADMP" "DZEXADSP" "DZEXPLAP" "DZEXOTHP"
## [121] "DPFEAINST" "DPFEAINSK" "DPFPAREGP" "DPFPASPEP"
## [125] "DPFPACOMP" "DPFPABILP" "DPFPAVOCP" "DPFPAGIFP"
## [129] "DPFPAATHP" "DPFPAHSAP" "DPFPREKP" "DPFPAOTHP"
## [133] "DISTSIZE" "COMMTYPE" "PROPWLTH" "TAXRATE"
# Select only columns we need
clean_data <- district_data %>%
select(DISTNAME, DPFEAOPFK, DAGC4X22R, DPETECOP)
# Remove any rows with missing values
clean_data <- clean_data %>%
filter(!is.na(DPFEAOPFK), !is.na(DAGC4X22R), !is.na(DPETECOP))
# Basic summary statistics
summary(clean_data)
## DISTNAME DPFEAOPFK DAGC4X22R DPETECOP
## Length:82 Min. : 9592 Min. :77.6 Min. : 0.40
## Class :character 1st Qu.:10652 1st Qu.:89.5 1st Qu.:43.45
## Mode :character Median :11595 Median :93.6 Median :59.90
## Mean :11748 Mean :92.7 Mean :57.71
## 3rd Qu.:12492 3rd Qu.:96.7 3rd Qu.:84.03
## Max. :15612 Max. :99.8 Max. :94.20
# More detailed summary statistics
clean_data %>%
summarize(
mean_spending = mean(DPFEAOPFK),
median_spending = median(DPFEAOPFK),
sd_spending = sd(DPFEAOPFK),
mean_grad_rate = mean(DAGC4X22R),
median_grad_rate = median(DAGC4X22R),
sd_grad_rate = sd(DAGC4X22R),
mean_econ_disadv = mean(DPETECOP),
median_econ_disadv = median(DPETECOP),
sd_econ_disadv = sd(DPETECOP)
)
## # A tibble: 1 × 9
## mean_spending median_spending sd_spending mean_grad_rate median_grad_rate
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 11748. 11595 1261. 92.7 93.6
## # ℹ 4 more variables: sd_grad_rate <dbl>, mean_econ_disadv <dbl>,
## # median_econ_disadv <dbl>, sd_econ_disadv <dbl>
# Calculate correlations between key variables
cor.test(clean_data$DPFEAOPFK, clean_data$DAGC4X22R)
##
## Pearson's product-moment correlation
##
## data: clean_data$DPFEAOPFK and clean_data$DAGC4X22R
## t = -5.4086, df = 80, p-value = 6.427e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.6603094 -0.3384454
## sample estimates:
## cor
## -0.5174485
cor.test(clean_data$DPETECOP, clean_data$DAGC4X22R)
##
## Pearson's product-moment correlation
##
## data: clean_data$DPETECOP and clean_data$DAGC4X22R
## t = -9.3772, df = 80, p-value = 1.596e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8129615 -0.6009769
## sample estimates:
## cor
## -0.7236131
cor.test(clean_data$DPFEAOPFK, clean_data$DPETECOP)
##
## Pearson's product-moment correlation
##
## data: clean_data$DPFEAOPFK and clean_data$DPETECOP
## t = 7.9587, df = 80, p-value = 9.744e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5232170 0.7705928
## sample estimates:
## cor
## 0.6647475
# Histogram of graduation rates
hist(clean_data$DAGC4X22R,
main = "Distribution of Graduation Rates",
xlab = "Graduation Rate (%)",
breaks = 10)
# Histogram of per-pupil spending
hist(clean_data$DPFEAOPFK,
main = "Distribution of Per-Pupil Spending",
xlab = "Per-Pupil Spending ($)",
breaks = 10)
# Histogram of economically disadvantaged
hist(clean_data$DPETECOP,
main = "Distribution of Economically Disadvantaged Students",
xlab = "Economically Disadvantaged (%)",
breaks = 10)
## Scatterplots
# Graduation rate vs. per-pupil spending
ggplot(clean_data, aes(x = DPFEAOPFK, y = DAGC4X22R)) +
geom_point() +
geom_smooth(method = "lm") +
labs(
title = "Graduation Rate vs. Per-Pupil Spending",
x = "Per-Pupil Spending ($)",
y = "Graduation Rate (%)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Graduation rate vs. economically disadvantaged
ggplot(clean_data, aes(x = DPETECOP, y = DAGC4X22R)) +
geom_point() +
geom_smooth(method = "lm") +
labs(
title = "Graduation Rate vs. Economically Disadvantaged",
x = "Economically Disadvantaged (%)",
y = "Graduation Rate (%)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Simple Linear Regression Analysis
# Model 1: Graduation rate vs. per-pupil spending
model1 <- lm(DAGC4X22R ~ DPFEAOPFK, data = clean_data)
summary(model1)
##
## Call:
## lm(formula = DAGC4X22R ~ DPFEAOPFK, data = clean_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.954 -1.911 0.913 2.863 7.508
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.176e+02 4.624e+00 25.427 < 2e-16 ***
## DPFEAOPFK -2.117e-03 3.914e-04 -5.409 6.43e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.443 on 80 degrees of freedom
## Multiple R-squared: 0.2678, Adjusted R-squared: 0.2586
## F-statistic: 29.25 on 1 and 80 DF, p-value: 6.427e-07
# Model 2: Graduation rate vs. economically disadvantaged
model2 <- lm(DAGC4X22R ~ DPETECOP, data = clean_data)
summary(model2)
##
## Call:
## lm(formula = DAGC4X22R ~ DPETECOP, data = clean_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.1424 -2.1035 0.1437 2.2392 9.4902
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 100.67698 0.93798 107.334 < 2e-16 ***
## DPETECOP -0.13819 0.01474 -9.377 1.6e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.583 on 80 degrees of freedom
## Multiple R-squared: 0.5236, Adjusted R-squared: 0.5177
## F-statistic: 87.93 on 1 and 80 DF, p-value: 1.596e-14
# Model 3: Multiple regression with both predictors
model3 <- lm(DAGC4X22R ~ DPFEAOPFK + DPETECOP, data = clean_data)
summary(model3)
##
## Call:
## lm(formula = DAGC4X22R ~ DPFEAOPFK + DPETECOP, data = clean_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0428 -1.9355 0.0577 2.4315 9.3163
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.033e+02 4.327e+00 23.88 < 2e-16 ***
## DPFEAOPFK -2.670e-04 4.241e-04 -0.63 0.531
## DPETECOP -1.299e-01 1.980e-02 -6.56 5.11e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.597 on 79 degrees of freedom
## Multiple R-squared: 0.526, Adjusted R-squared: 0.514
## F-statistic: 43.83 on 2 and 79 DF, p-value: 1.561e-13
# Basic diagnostic plots for the multiple regression model
par(mfrow = c(2, 2))
plot(model3)
par(mfrow = c(1, 1))
#Looking at Bexar County as compared ot other surrounding counties
clean_data <- clean_data %>%
mutate(county = str_extract(district_data$DZCNTYNM, "\\d+ ([A-Z]+)"))
clean_data <- clean_data %>%
mutate(county = str_replace(county, "\\d+ ", ""))
# Calculate summary statistics by county
county_comparison <- clean_data %>%
group_by(county) %>%
summarize(
count = n(),
mean_spending = mean(DPFEAOPFK),
mean_graduation = mean(DAGC4X22R),
mean_econ_disadv = mean(DPETECOP),
spending_grad_correlation = cor(DPFEAOPFK, DAGC4X22R)
) %>%
arrange(desc(mean_econ_disadv))
print(county_comparison)
## # A tibble: 6 × 6
## county count mean_spending mean_graduation mean_econ_disadv
## <chr> <int> <dbl> <dbl> <dbl>
## 1 BEXAR 12 12758 89.4 73.2
## 2 HARRIS 19 11832. 90.6 67.1
## 3 DALLAS 14 11956. 93.1 61.5
## 4 TARRANT 16 11584. 92.5 60.0
## 5 TRAVIS 7 11698. 94.5 42.9
## 6 COLLIN 14 10771. 97.4 32.7
## # ℹ 1 more variable: spending_grad_correlation <dbl>
# Create a visualization comparing counties
ggplot(county_comparison, aes(x = mean_econ_disadv, y = mean_graduation, size = mean_spending, color = county)) +
geom_point(alpha = 0.7) +
geom_text(aes(label = county), vjust = -1, size = 3) +
labs(
title = "County Comparison: Graduation Rates vs. Economic Disadvantage",
x = "Mean Economic Disadvantage (%)",
y = "Mean Graduation Rate (%)",
size = "Mean Per-Pupil\nSpending ($)"
) +
theme_minimal() +
theme(legend.position = "right")
# Visualize the per-pupil spending and graduation rate relationship by county
ggplot(clean_data, aes(x = DPFEAOPFK, y = DAGC4X22R, color = county)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "Graduation Rate vs. Per-Pupil Spending by County",
x = "Per-Pupil Spending ($)",
y = "Graduation Rate (%)"
) +
theme_minimal() +
theme(legend.position = "right")
## `geom_smooth()` using formula = 'y ~ x'
# Run separate correlation tests for each county
county_correlations <- clean_data %>%
group_by(county) %>%
summarize(
correlation = cor(DPFEAOPFK, DAGC4X22R),
p_value = cor.test(DPFEAOPFK, DAGC4X22R)$p.value,
sample_size = n()
) %>%
arrange(correlation)
print(county_correlations)
## # A tibble: 6 × 4
## county correlation p_value sample_size
## <chr> <dbl> <dbl> <int>
## 1 TARRANT -0.636 0.00812 16
## 2 DALLAS -0.573 0.0322 14
## 3 HARRIS -0.499 0.0298 19
## 4 TRAVIS -0.328 0.472 7
## 5 BEXAR -0.198 0.537 12
## 6 COLLIN -0.134 0.647 14