Capstone Data Analysis

R Markdown

library(tidyverse)  # For data manipulation and visualization

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readxl)     # For reading Excel files
library(car)        # For regression diagnostics

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

library(lmtest)     # For testing linear models

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

district_data <- read_excel("district_data.xlsx")

# Look at the data structure 
str(district_data)

## tibble [82 × 136] (S3: tbl_df/tbl/data.frame)
##  $ DISTNAME     : chr [1:82] "ALAMO HEIGHTS ISD" "HARLANDALE ISD" "EDGEWOOD ISD" "SAN ANTONIO ISD" ...
##  $ DISTRICT     : chr [1:82] "015901" "015904" "015905" "015907" ...
##  $ DZCNTYNM     : chr [1:82] "015 BEXAR" "015 BEXAR" "015 BEXAR" "015 BEXAR" ...
##  $ REGION       : chr [1:82] "20" "20" "20" "20" ...
##  $ DZCAMPUS     : num [1:82] 6 28 27 97 18 8 75 15 19 124 ...
##  $ DPETALLC     : num [1:82] 4731 12160 8165 45212 7871 ...
##  $ DPETBLAP     : num [1:82] 2 0.5 1.4 5.6 1.3 0.5 6.9 8.8 3.6 7 ...
##  $ DPETHISP     : num [1:82] 39.9 97.2 96.3 90.1 95.2 91.7 61.5 74.4 89.7 68 ...
##  $ DPETWHIP     : num [1:82] 52.5 2 0.8 3 2.8 6.8 22.8 13.8 4.9 16.7 ...
##  $ DPETINDP     : num [1:82] 0.3 0 0 0.1 0.2 0.1 0.2 0.2 0.1 0.1 ...
##  $ DPETASIP     : num [1:82] 2.8 0.1 1.2 0.5 0.2 0.2 4.3 0.6 0.5 4 ...
##  $ DPETPCIP     : num [1:82] 0.1 0 0 0 0.1 0.1 0.1 0.1 0.1 0.2 ...
##  $ DPETTWOP     : num [1:82] 2.4 0.2 0.2 0.7 0.3 0.7 4 2.1 1.2 4 ...
##  $ DPETECOP     : num [1:82] 21.6 87.8 94.2 88.6 93.6 86.1 52.5 59.6 81.4 54.3 ...
##  $ DPETLEPP     : num [1:82] 7.3 19.8 24.5 23.5 21 22.1 18.4 13.6 19.1 12 ...
##  $ DPETSPEP     : num [1:82] 11.4 12.9 12.8 14.5 14.9 12.1 14 13.3 17.5 14.7 ...
##  $ DPETBILP     : num [1:82] 12.8 22.1 24.4 26 21 21 20.7 15.9 19 11.2 ...
##  $ DPETVOCP     : num [1:82] 19.1 24.7 32 22.6 29 30.7 24.5 27.9 25.3 21.2 ...
##  $ DPETGIFP     : num [1:82] 22.1 6.7 2.7 7.3 6 5 9 7.2 10.7 7.7 ...
##  $ DA0AT22R     : num [1:82] 93.8 87.6 85.6 85 86.1 91.2 92.5 89.5 88.6 91.8 ...
##  $ DA0912DR22R  : num [1:82] 0.5 2.8 5.7 2.7 6.3 2.4 2.4 3.6 1.4 1.1 ...
##  $ DAGC4X22R    : num [1:82] 98.5 89.8 84.1 87 77.6 93.5 89.4 88 94.4 93.6 ...
##  $ DAGC5X21R    : num [1:82] 98.3 92.6 88.7 89.7 86.1 88.6 92.4 92.7 97.2 95.5 ...
##  $ DAGC6X20R    : num [1:82] 99.3 95.5 89 90.3 87.7 90.8 94.1 95.1 96.6 95.7 ...
##  $ DA0GR22N     : num [1:82] 422 917 575 2678 543 ...
##  $ DA0GS22N     : num [1:82] 363 701 537 2339 438 ...
##  $ DDA00A001S23R: num [1:82] 89 67 55 61 56 81 80 63 69 77 ...
##  $ DDA00A001223R: num [1:82] 69 33 24 31 25 52 55 30 38 50 ...
##  $ DDA00A001323R: num [1:82] 32 8 6 9 7 18 24 7 11 19 ...
##  $ DDA00AR01S23R: num [1:82] 90 66 56 61 54 80 80 66 72 78 ...
##  $ DDA00AR01223R: num [1:82] 74 36 27 35 28 54 58 36 44 54 ...
##  $ DDA00AR01323R: num [1:82] 32 8 5 10 7 19 24 8 12 18 ...
##  $ DDA00AM01S23R: num [1:82] 88 65 52 58 54 80 79 58 64 74 ...
##  $ DDA00AM01223R: num [1:82] 64 28 20 25 19 52 49 22 29 41 ...
##  $ DDA00AM01323R: num [1:82] 29 7 5 7 5 21 20 6 9 15 ...
##  $ DDA00AC01S23R: num [1:82] 89 69 56 62 59 85 82 63 69 79 ...
##  $ DDA00AC01223R: num [1:82] 67 35 19 29 26 48 57 24 34 51 ...
##  $ DDA00AC01323R: num [1:82] 32 9 4 8 6 12 27 4 8 20 ...
##  $ DDA00AS01S23R: num [1:82] 87 73 63 69 69 82 83 67 74 82 ...
##  $ DDA00AS01223R: num [1:82] 68 40 32 35 32 47 61 36 43 58 ...
##  $ DDA00AS01323R: num [1:82] 44 15 12 15 12 18 38 14 17 33 ...
##  $ DDB00A001S23R: num [1:82] 81 58 58 55 59 87 67 57 70 73 ...
##  $ DDB00A001223R: num [1:82] 58 25 22 24 21 58 37 25 38 43 ...
##  $ DDB00A001323R: num [1:82] 15 4 4 6 8 23 13 5 11 14 ...
##  $ DDH00A001S23R: num [1:82] 81 67 55 61 56 80 77 62 68 75 ...
##  $ DDH00A001223R: num [1:82] 55 33 23 31 25 51 48 28 37 45 ...
##  $ DDH00A001323R: num [1:82] 21 8 5 9 7 17 18 6 11 15 ...
##  $ DDW00A001S23R: num [1:82] 94 63 50 72 64 86 90 72 80 87 ...
##  $ DDW00A001223R: num [1:82] 78 33 17 50 27 62 72 39 51 66 ...
##  $ DDW00A001323R: num [1:82] 41 8 6 24 10 28 37 12 17 31 ...
##  $ DDI00A001S23R: num [1:82] 82 67 NA 47 53 57 79 67 71 71 ...
##  $ DDI00A001223R: num [1:82] 82 33 NA 16 29 29 49 40 17 48 ...
##  $ DDI00A001323R: num [1:82] 29 33 NA 4 18 14 24 13 0 18 ...
##  $ DD300A001S23R: num [1:82] 93 100 80 62 78 100 91 77 85 82 ...
##  $ DD300A001223R: num [1:82] 79 79 52 36 50 95 77 47 61 64 ...
##  $ DD300A001323R: num [1:82] 46 64 24 16 33 63 48 18 13 34 ...
##  $ DD400A001S23R: num [1:82] 100 NA -1 50 33 100 73 76 79 87 ...
##  $ DD400A001223R: num [1:82] 89 NA -1 38 0 83 41 65 42 62 ...
##  $ DD400A001323R: num [1:82] 22 NA -1 13 0 67 16 12 32 24 ...
##  $ DD200A001S23R: num [1:82] 95 86 75 69 53 91 88 74 70 83 ...
##  $ DD200A001223R: num [1:82] 78 41 39 45 16 62 66 42 34 59 ...
##  $ DD200A001323R: num [1:82] 45 14 7 18 0 15 33 12 8 26 ...
##  $ DDE00A001S23R: num [1:82] 73 65 54 59 55 79 70 60 67 69 ...
##  $ DDE00A001223R: num [1:82] 43 32 23 28 24 49 40 27 34 37 ...
##  $ DDE00A001323R: num [1:82] 13 8 5 8 6 17 13 6 9 11 ...
##  $ DA0CT22R     : num [1:82] 68.7 83.9 77.6 81.9 69.8 87.3 84.1 40.9 85.3 88.5 ...
##  $ DA0CC22R     : num [1:82] 66.6 14.2 7.8 12 9.8 9.2 38.6 22.1 9.6 29.4 ...
##  $ dA0csa22r    : num [1:82] 1161 902 857 890 879 ...
##  $ dA0caa22r    : num [1:82] 25.3 19.9 20.5 16.1 NA 18.3 23.4 19.5 15.8 24.7 ...
##  $ DPSATOFC     : num [1:82] 660 1972 1314 7028 1152 ...
##  $ DPSTTOFC     : num [1:82] 358 846 551 3004 509 ...
##  $ DPSCTOFP     : num [1:82] 1 1.2 1.7 0.5 1.2 1.2 0.7 2.3 0.4 0.3 ...
##  $ DPSSTOFP     : num [1:82] 3 2.7 4.8 2.8 3.4 2.6 2.8 2.7 2.5 2.1 ...
##  $ DPSUTOFP     : num [1:82] 7 14 10.8 16.5 11.2 11.6 12.7 10.2 11 13.3 ...
##  $ DPSTTOFP     : num [1:82] 54.3 42.9 42 42.7 44.2 43.5 50.4 46.2 45.6 52.1 ...
##  $ DPSETOFP     : num [1:82] 14.9 10.4 9.3 10.4 9.9 9.6 8.6 11.9 7.6 8.3 ...
##  $ DPSXTOFP     : num [1:82] 19.8 28.9 31.5 27.2 30.2 31.5 24.9 26.7 32.8 24 ...
##  $ DPSCTOSA     : num [1:82] 144939 105680 130381 125407 115823 ...
##  $ DPSSTOSA     : num [1:82] 88400 83689 75694 97339 82259 ...
##  $ DPSUTOSA     : num [1:82] 67118 69312 70200 67938 69213 ...
##  $ DPSTTOSA     : num [1:82] 60575 61216 60199 58682 61254 ...
##  $ DPSAMIFP     : num [1:82] 49.4 85.1 89.6 84.1 88 78.7 56.1 61.5 78.7 64.5 ...
##  $ DPSAKIDR     : num [1:82] 7.2 6.2 6.2 6.4 6.8 6.9 7.3 7.9 6.6 7.7 ...
##  $ DPSTKIDR     : num [1:82] 13.2 14.4 14.8 15.1 15.5 15.9 14.4 17.1 14.4 14.9 ...
##  $ DPST05FP     : num [1:82] 20.6 31.1 52 40.9 30 37 28.6 41.5 31.7 30.3 ...
##  $ DPSTEXPA     : num [1:82] 14.1 11.3 8.8 9.7 11.5 10.1 10.7 9.4 11.5 11.8 ...
##  $ DPSTADFP     : num [1:82] 40.9 28.4 34.3 37.8 30.2 21.6 34.2 28.3 31.9 32.3 ...
##  $ DPSTURNR     : num [1:82] 20.7 18.7 27 23.9 22.6 19.5 21.1 24 20.1 14.2 ...
##  $ DPSTBLFP     : num [1:82] 2.8 2.2 4.9 8.5 1.8 3.1 4.2 9 2.4 3.5 ...
##  $ DPSTHIFP     : num [1:82] 35.3 75.9 77.6 68.9 81.3 64.8 41 45.7 67.6 51.9 ...
##  $ DPSTWHFP     : num [1:82] 60.2 21.1 16 20.3 16.1 31 52.4 43.1 27.9 41.8 ...
##  $ DPSTINFP     : num [1:82] 0 0.2 0 0 0 0 0.1 0.1 0.2 0.3 ...
##  $ DPSTASFP     : num [1:82] 0.6 0.1 0.9 0.8 0.2 0.4 1.4 0.5 0.6 0.9 ...
##  $ DPSTPIFP     : num [1:82] 0.6 0.1 0 0.1 0 0 0.2 0 0.1 0.1 ...
##  $ DPSTTWFP     : num [1:82] 0.6 0.2 0.5 1.3 0.6 0.7 0.6 1.6 1.2 1.4 ...
##  $ DPSTREFP     : num [1:82] 68.9 68.9 81.4 65.2 64.2 66.5 65.9 58.8 76.5 62.6 ...
##  $ DPSTSPFP     : num [1:82] 11.3 14.2 8.3 11.3 11.1 9.8 15.3 11.8 11.5 9.9 ...
##  $ DPSTCOFP     : num [1:82] 5 0 0 6.9 5.1 2.3 3.2 10.9 1 3.9 ...
##  $ DPSTBIFP     : num [1:82] 1.8 8.3 3.9 11.3 9.5 8.3 5 6.6 7.2 11.3 ...
##   [list output truncated]

## Select and Clean Data 
names(district_data)

##   [1] "DISTNAME"      "DISTRICT"      "DZCNTYNM"      "REGION"       
##   [5] "DZCAMPUS"      "DPETALLC"      "DPETBLAP"      "DPETHISP"     
##   [9] "DPETWHIP"      "DPETINDP"      "DPETASIP"      "DPETPCIP"     
##  [13] "DPETTWOP"      "DPETECOP"      "DPETLEPP"      "DPETSPEP"     
##  [17] "DPETBILP"      "DPETVOCP"      "DPETGIFP"      "DA0AT22R"     
##  [21] "DA0912DR22R"   "DAGC4X22R"     "DAGC5X21R"     "DAGC6X20R"    
##  [25] "DA0GR22N"      "DA0GS22N"      "DDA00A001S23R" "DDA00A001223R"
##  [29] "DDA00A001323R" "DDA00AR01S23R" "DDA00AR01223R" "DDA00AR01323R"
##  [33] "DDA00AM01S23R" "DDA00AM01223R" "DDA00AM01323R" "DDA00AC01S23R"
##  [37] "DDA00AC01223R" "DDA00AC01323R" "DDA00AS01S23R" "DDA00AS01223R"
##  [41] "DDA00AS01323R" "DDB00A001S23R" "DDB00A001223R" "DDB00A001323R"
##  [45] "DDH00A001S23R" "DDH00A001223R" "DDH00A001323R" "DDW00A001S23R"
##  [49] "DDW00A001223R" "DDW00A001323R" "DDI00A001S23R" "DDI00A001223R"
##  [53] "DDI00A001323R" "DD300A001S23R" "DD300A001223R" "DD300A001323R"
##  [57] "DD400A001S23R" "DD400A001223R" "DD400A001323R" "DD200A001S23R"
##  [61] "DD200A001223R" "DD200A001323R" "DDE00A001S23R" "DDE00A001223R"
##  [65] "DDE00A001323R" "DA0CT22R"      "DA0CC22R"      "dA0csa22r"    
##  [69] "dA0caa22r"     "DPSATOFC"      "DPSTTOFC"      "DPSCTOFP"     
##  [73] "DPSSTOFP"      "DPSUTOFP"      "DPSTTOFP"      "DPSETOFP"     
##  [77] "DPSXTOFP"      "DPSCTOSA"      "DPSSTOSA"      "DPSUTOSA"     
##  [81] "DPSTTOSA"      "DPSAMIFP"      "DPSAKIDR"      "DPSTKIDR"     
##  [85] "DPST05FP"      "DPSTEXPA"      "DPSTADFP"      "DPSTURNR"     
##  [89] "DPSTBLFP"      "DPSTHIFP"      "DPSTWHFP"      "DPSTINFP"     
##  [93] "DPSTASFP"      "DPSTPIFP"      "DPSTTWFP"      "DPSTREFP"     
##  [97] "DPSTSPFP"      "DPSTCOFP"      "DPSTBIFP"      "DPSTVOFP"     
## [101] "DPSTGOFP"      "DPFVTOTK"      "DPFTADPR"      "DPFRAALLT"    
## [105] "DPFRAALLK"     "DPFRAOPRT"     "DPFRASTAP"     "DZRVLOCP"     
## [109] "DPFRAFEDP"     "DPFRAORVT"     "DPFUNAB1T"     "DPFUNA4T"     
## [113] "DPFEAALLT"     "DPFEAOPFT"     "DPFEAOPFK"     "DPFEAINSP"    
## [117] "DZEXADMP"      "DZEXADSP"      "DZEXPLAP"      "DZEXOTHP"     
## [121] "DPFEAINST"     "DPFEAINSK"     "DPFPAREGP"     "DPFPASPEP"    
## [125] "DPFPACOMP"     "DPFPABILP"     "DPFPAVOCP"     "DPFPAGIFP"    
## [129] "DPFPAATHP"     "DPFPAHSAP"     "DPFPREKP"      "DPFPAOTHP"    
## [133] "DISTSIZE"      "COMMTYPE"      "PROPWLTH"      "TAXRATE"

# Select only columns we need 
clean_data <- district_data %>%
  select(DISTNAME, DPFEAOPFK, DAGC4X22R, DPETECOP)

# Remove any rows with missing values 
clean_data <- clean_data %>%
  filter(!is.na(DPFEAOPFK), !is.na(DAGC4X22R), !is.na(DPETECOP))

Descriptive Statistics

# Basic summary statistics
summary(clean_data)

##    DISTNAME           DPFEAOPFK       DAGC4X22R       DPETECOP    
##  Length:82          Min.   : 9592   Min.   :77.6   Min.   : 0.40  
##  Class :character   1st Qu.:10652   1st Qu.:89.5   1st Qu.:43.45  
##  Mode  :character   Median :11595   Median :93.6   Median :59.90  
##                     Mean   :11748   Mean   :92.7   Mean   :57.71  
##                     3rd Qu.:12492   3rd Qu.:96.7   3rd Qu.:84.03  
##                     Max.   :15612   Max.   :99.8   Max.   :94.20

# More detailed summary statistics 
clean_data %>%
  summarize(
    mean_spending = mean(DPFEAOPFK),
    median_spending = median(DPFEAOPFK),
    sd_spending = sd(DPFEAOPFK),
    
    mean_grad_rate = mean(DAGC4X22R),
    median_grad_rate = median(DAGC4X22R),
    sd_grad_rate = sd(DAGC4X22R),
    
    mean_econ_disadv = mean(DPETECOP),
    median_econ_disadv = median(DPETECOP),
    sd_econ_disadv = sd(DPETECOP)
  )

## # A tibble: 1 × 9
##   mean_spending median_spending sd_spending mean_grad_rate median_grad_rate
##           <dbl>           <dbl>       <dbl>          <dbl>            <dbl>
## 1        11748.           11595       1261.           92.7             93.6
## # ℹ 4 more variables: sd_grad_rate <dbl>, mean_econ_disadv <dbl>,
## #   median_econ_disadv <dbl>, sd_econ_disadv <dbl>

# Calculate correlations between key variables
cor.test(clean_data$DPFEAOPFK, clean_data$DAGC4X22R)

## 
##  Pearson's product-moment correlation
## 
## data:  clean_data$DPFEAOPFK and clean_data$DAGC4X22R
## t = -5.4086, df = 80, p-value = 6.427e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.6603094 -0.3384454
## sample estimates:
##        cor 
## -0.5174485

cor.test(clean_data$DPETECOP, clean_data$DAGC4X22R)

## 
##  Pearson's product-moment correlation
## 
## data:  clean_data$DPETECOP and clean_data$DAGC4X22R
## t = -9.3772, df = 80, p-value = 1.596e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8129615 -0.6009769
## sample estimates:
##        cor 
## -0.7236131

cor.test(clean_data$DPFEAOPFK, clean_data$DPETECOP)

## 
##  Pearson's product-moment correlation
## 
## data:  clean_data$DPFEAOPFK and clean_data$DPETECOP
## t = 7.9587, df = 80, p-value = 9.744e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5232170 0.7705928
## sample estimates:
##       cor 
## 0.6647475

Data Visualization

# Histogram of graduation rates 
hist(clean_data$DAGC4X22R, 
     main = "Distribution of Graduation Rates",
     xlab = "Graduation Rate (%)",
     breaks = 10)

# Histogram of per-pupil spending
hist(clean_data$DPFEAOPFK, 
     main = "Distribution of Per-Pupil Spending",
     xlab = "Per-Pupil Spending ($)",
     breaks = 10)

# Histogram of economically disadvantaged
hist(clean_data$DPETECOP, 
     main = "Distribution of Economically Disadvantaged Students",
     xlab = "Economically Disadvantaged (%)",
     breaks = 10)

## Scatterplots

# Graduation rate vs. per-pupil spending
ggplot(clean_data, aes(x = DPFEAOPFK, y = DAGC4X22R)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Graduation Rate vs. Per-Pupil Spending",
    x = "Per-Pupil Spending ($)",
    y = "Graduation Rate (%)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Graduation rate vs. economically disadvantaged
ggplot(clean_data, aes(x = DPETECOP, y = DAGC4X22R)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Graduation Rate vs. Economically Disadvantaged",
    x = "Economically Disadvantaged (%)",
    y = "Graduation Rate (%)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Simple Linear Regression Analysis

# Model 1: Graduation rate vs. per-pupil spending
model1 <- lm(DAGC4X22R ~ DPFEAOPFK, data = clean_data)
summary(model1)

## 
## Call:
## lm(formula = DAGC4X22R ~ DPFEAOPFK, data = clean_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.954  -1.911   0.913   2.863   7.508 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.176e+02  4.624e+00  25.427  < 2e-16 ***
## DPFEAOPFK   -2.117e-03  3.914e-04  -5.409 6.43e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.443 on 80 degrees of freedom
## Multiple R-squared:  0.2678, Adjusted R-squared:  0.2586 
## F-statistic: 29.25 on 1 and 80 DF,  p-value: 6.427e-07

# Model 2: Graduation rate vs. economically disadvantaged
model2 <- lm(DAGC4X22R ~ DPETECOP, data = clean_data)
summary(model2)

## 
## Call:
## lm(formula = DAGC4X22R ~ DPETECOP, data = clean_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.1424  -2.1035   0.1437   2.2392   9.4902 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 100.67698    0.93798 107.334  < 2e-16 ***
## DPETECOP     -0.13819    0.01474  -9.377  1.6e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.583 on 80 degrees of freedom
## Multiple R-squared:  0.5236, Adjusted R-squared:  0.5177 
## F-statistic: 87.93 on 1 and 80 DF,  p-value: 1.596e-14

# Model 3: Multiple regression with both predictors
model3 <- lm(DAGC4X22R ~ DPFEAOPFK + DPETECOP, data = clean_data)
summary(model3)

## 
## Call:
## lm(formula = DAGC4X22R ~ DPFEAOPFK + DPETECOP, data = clean_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.0428  -1.9355   0.0577   2.4315   9.3163 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.033e+02  4.327e+00   23.88  < 2e-16 ***
## DPFEAOPFK   -2.670e-04  4.241e-04   -0.63    0.531    
## DPETECOP    -1.299e-01  1.980e-02   -6.56 5.11e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.597 on 79 degrees of freedom
## Multiple R-squared:  0.526,  Adjusted R-squared:  0.514 
## F-statistic: 43.83 on 2 and 79 DF,  p-value: 1.561e-13

Regression Diagnostics

# Basic diagnostic plots for the multiple regression model
par(mfrow = c(2, 2))
plot(model3)

par(mfrow = c(1, 1))

#Looking at Bexar County as compared ot other surrounding counties

clean_data <- clean_data %>%
  mutate(county = str_extract(district_data$DZCNTYNM, "\\d+ ([A-Z]+)"))

clean_data <- clean_data %>%
  mutate(county = str_replace(county, "\\d+ ", ""))

# Calculate summary statistics by county
county_comparison <- clean_data %>%
  group_by(county) %>%
  summarize(
    count = n(),
    mean_spending = mean(DPFEAOPFK),
    mean_graduation = mean(DAGC4X22R),
    mean_econ_disadv = mean(DPETECOP),
    spending_grad_correlation = cor(DPFEAOPFK, DAGC4X22R)
  ) %>%
  arrange(desc(mean_econ_disadv))

print(county_comparison)

## # A tibble: 6 × 6
##   county  count mean_spending mean_graduation mean_econ_disadv
##   <chr>   <int>         <dbl>           <dbl>            <dbl>
## 1 BEXAR      12        12758             89.4             73.2
## 2 HARRIS     19        11832.            90.6             67.1
## 3 DALLAS     14        11956.            93.1             61.5
## 4 TARRANT    16        11584.            92.5             60.0
## 5 TRAVIS      7        11698.            94.5             42.9
## 6 COLLIN     14        10771.            97.4             32.7
## # ℹ 1 more variable: spending_grad_correlation <dbl>

# Create a visualization comparing counties
ggplot(county_comparison, aes(x = mean_econ_disadv, y = mean_graduation, size = mean_spending, color = county)) +
  geom_point(alpha = 0.7) +
  geom_text(aes(label = county), vjust = -1, size = 3) +
  labs(
    title = "County Comparison: Graduation Rates vs. Economic Disadvantage",
    x = "Mean Economic Disadvantage (%)",
    y = "Mean Graduation Rate (%)",
    size = "Mean Per-Pupil\nSpending ($)"
  ) +
  theme_minimal() +
  theme(legend.position = "right")

# Visualize the per-pupil spending and graduation rate relationship by county
ggplot(clean_data, aes(x = DPFEAOPFK, y = DAGC4X22R, color = county)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Graduation Rate vs. Per-Pupil Spending by County",
    x = "Per-Pupil Spending ($)",
    y = "Graduation Rate (%)"
  ) +
  theme_minimal() +
  theme(legend.position = "right")

## `geom_smooth()` using formula = 'y ~ x'

# Run separate correlation tests for each county
county_correlations <- clean_data %>%
  group_by(county) %>%
  summarize(
    correlation = cor(DPFEAOPFK, DAGC4X22R),
    p_value = cor.test(DPFEAOPFK, DAGC4X22R)$p.value,
    sample_size = n()
  ) %>%
  arrange(correlation)

print(county_correlations)

## # A tibble: 6 × 4
##   county  correlation p_value sample_size
##   <chr>         <dbl>   <dbl>       <int>
## 1 TARRANT      -0.636 0.00812          16
## 2 DALLAS       -0.573 0.0322           14
## 3 HARRIS       -0.499 0.0298           19
## 4 TRAVIS       -0.328 0.472             7
## 5 BEXAR        -0.198 0.537            12
## 6 COLLIN       -0.134 0.647            14

Capstone Data Analysis

Masha Sturrock/Dee Torres

2025-03-25

R Markdown

Descriptive Statistics

Data Visualization

Regression Diagnostics