library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readr)
# Read the Bexar County school data and show a five-number summary (plus NA
# counts) for the three variables analysed below: share of economically
# disadvantaged students, grade 9-12 dropout rate, and share of students
# at/above the college admissions criterion.
bexar_schools <- read.csv(file = "bexar_schools.csv")
summary(
  subset(bexar_schools, select = c(DPETECOP, DA0912DR21R, DA0CC21R))
)
##     DPETECOP      DA0912DR21R        DA0CC21R    
##  Min.   : 3.70   Min.   : 0.000   Min.   :-1.00  
##  1st Qu.:57.23   1st Qu.: 0.000   1st Qu.:11.60  
##  Median :80.40   Median : 1.100   Median :19.40  
##  Mean   :66.88   Mean   : 2.906   Mean   :26.67  
##  3rd Qu.:87.33   3rd Qu.: 3.500   3rd Qu.:35.00  
##  Max.   :97.30   Max.   :22.600   Max.   :97.70  
##                  NA's   :7        NA's   :11
# Keep only the schools that have a value for all three analysis variables.
Cleand_Data <- filter(
  bexar_schools,
  !is.na(DPETECOP),
  !is.na(DA0912DR21R),
  !is.na(DA0CC21R)
)

# Descriptive statistics (count, range, central tendency, dispersion) for the
# complete-case sample.
variables_description <- Cleand_Data %>%
  select(DPETECOP, DA0912DR21R, DA0CC21R) %>%
  pastecs::stat.desc()
print(variables_description)
##                  DPETECOP DA0912DR21R   DA0CC21R
## nbr.val        29.0000000  29.0000000  29.000000
## nbr.null        0.0000000   8.0000000   1.000000
## nbr.na          0.0000000   0.0000000   0.000000
## min             3.7000000   0.0000000  -1.000000
## max            90.3000000   5.2000000  97.700000
## range          86.6000000   5.2000000  98.700000
## sum          1821.8000000  46.2000000 773.500000
## median         68.7000000   0.8000000  19.400000
## mean           62.8206897   1.5931034  26.672414
## SE.mean         5.1531744   0.3065754   4.176995
## CI.mean.0.95   10.5557992   0.6279911   8.556187
## var           770.1009852   2.7256650 505.971355
## std.dev        27.7506934   1.6509588  22.493807
## coef.var        0.4417445   1.0363161   0.843336

DPETECOP (Percentage of Economically Disadvantaged Students):

This measure reflects the share of students at a particular school who come from an economically disadvantaged background. A higher value indicates that a larger proportion of the school's students come from lower-income households.

The data reveal that both low and high poverty levels occur among the schools, with values ranging from 3.7% to 90.3%. The median of 68.7% means that in half of the schools at least 68.7% of students are economically disadvantaged. The mean of 62.82% is lower than the median, which indicates a left (negative) skew: a few schools with very low percentages pull the average down. The wide range (86.6 percentage points) between the minimum and the maximum values shows that economic circumstances differ greatly across the various schools in the district.

DA0912DR21R (Annual Dropout Rate for Grades 9-12):

This measure indicates the percentage of high school students (grades 9-12) who left school during the 2020-2021 academic year. Lower values are desirable because they mean that fewer students are leaving school prematurely.

The rates vary from 0% to 5.2%, with a rather low median of 0.8%. The mean (1.59%) exceeds the median, suggesting that some schools have much higher dropout rates than the others, which pulls the average up. We also need to note that eight of the 29 schools in the cleaned sample report a dropout rate of exactly 0% (the `nbr.null` row counts zero values, not missing ones), while the seven schools with a missing dropout rate were removed during cleaning, which could change the perception of the situation.

DA0CC21R (Percentage At/Above College Admissions Criterion): This measure reflects the proportion of students who have met or surpassed a certain standard for the purpose of college admissions. A higher value indicates that more students are prepared for further studies in various schools.

College readiness also varies widely across schools, from -1% up to 97.7%. The -1% value is not a valid percentage and requires additional analysis (it is likely a placeholder code rather than a real measurement). The median (19.4%) is lower than the mean (26.67%), which indicates a right (positive) skew: some schools in the sample have college readiness rates that are far above the mean.

# Quartile summary of the three analysis variables after removing incomplete
# rows.
Cleand_Data %>%
  select(DPETECOP, DA0912DR21R, DA0CC21R) %>%
  summary()
##     DPETECOP      DA0912DR21R       DA0CC21R    
##  Min.   : 3.70   Min.   :0.000   Min.   :-1.00  
##  1st Qu.:48.90   1st Qu.:0.000   1st Qu.:11.60  
##  Median :68.70   Median :0.800   Median :19.40  
##  Mean   :62.82   Mean   :1.593   Mean   :26.67  
##  3rd Qu.:86.10   3rd Qu.:2.700   3rd Qu.:35.00  
##  Max.   :90.30   Max.   :5.200   Max.   :97.70
# Distribution of the percentage of economically disadvantaged students,
# drawn with bins 5 units wide.
ggplot(data = Cleand_Data, mapping = aes(x = DPETECOP)) +
  geom_histogram(binwidth = 5, color = "black", fill = "skyblue") +
  ggtitle("Histogram of Economically Disadvantaged Students") +
  xlab("Percentage of Economically Disadvantaged Students") +
  ylab("Frequency")

# Distribution of the grade 9-12 annual dropout rate, drawn with bins 1 unit
# wide.
ggplot(data = Cleand_Data, mapping = aes(x = DA0912DR21R)) +
  geom_histogram(binwidth = 1, color = "black", fill = "lightgreen") +
  ggtitle("Histogram of Annual Dropout Rate (Grades 9-12)") +
  xlab("Annual Dropout Rate (%)") +
  ylab("Frequency")

# Distribution of the percentage of students at/above the college admissions
# criterion, drawn with bins 5 units wide.
ggplot(data = Cleand_Data, mapping = aes(x = DA0CC21R)) +
  geom_histogram(binwidth = 5, color = "black", fill = "salmon") +
  ggtitle("Histogram of College Admissions Criterion") +
  xlab("Percentage At/Above College Admissions Criterion") +
  ylab("Frequency")

# Add square-root transformed copies of the three analysis variables.
#
# DA0CC21R still contains a -1 entry after cleaning (it is the minimum in the
# summaries above). A percentage cannot be negative, so this is presumably a
# masking/placeholder code rather than a measurement -- TODO confirm against
# the data dictionary. Calling sqrt() on it directly yields NaN together with
# a "NaNs produced" warning, so negative values are mapped to NA first. The
# affected school is then explicitly missing in DA0CC21R_sqrt and the
# transform runs without warnings. The untransformed columns are unchanged.
bexar_schools_transformed <- Cleand_Data %>%
  mutate(
    DPETECOP_sqrt = sqrt(DPETECOP),
    DA0912DR21R_sqrt = sqrt(DA0912DR21R),
    DA0CC21R_sqrt = sqrt(replace(DA0CC21R, DA0CC21R < 0, NA))
  )
# Distribution of the square-root transformed percentage of economically
# disadvantaged students, drawn with bins 0.5 units wide.
ggplot(data = bexar_schools_transformed, mapping = aes(x = DPETECOP_sqrt)) +
  geom_histogram(binwidth = 0.5, color = "black", fill = "skyblue") +
  ggtitle("Histogram of Sqrt Transformed Economically Disadvantaged Students") +
  xlab("Square Root of Percentage") +
  ylab("Frequency")

# Distribution of the square-root transformed annual dropout rate, drawn with
# bins 0.2 units wide.
ggplot(data = bexar_schools_transformed, mapping = aes(x = DA0912DR21R_sqrt)) +
  geom_histogram(binwidth = 0.2, color = "black", fill = "lightgreen") +
  ggtitle("Histogram of Sqrt Transformed Annual Dropout Rate") +
  xlab("Square Root of Dropout Rate") +
  ylab("Frequency")

# Distribution of the square-root transformed college admissions percentage,
# drawn with bins 0.5 units wide. One row has no finite transformed value and
# is dropped by the histogram stat.
ggplot(data = bexar_schools_transformed, mapping = aes(x = DA0CC21R_sqrt)) +
  geom_histogram(binwidth = 0.5, color = "black", fill = "salmon") +
  ggtitle("Histogram of Sqrt Transformed College Admissions Criterion") +
  xlab("Square Root of Percentage") +
  ylab("Frequency")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# Pairwise Pearson correlations between the three untransformed variables.
# NOTE(review): the -1 entry in DA0CC21R is still part of this calculation;
# confirm whether it should be excluded as a placeholder value.
cor_matrix <- bexar_schools_transformed %>%
  select(DPETECOP, DA0912DR21R, DA0CC21R) %>%
  cor(use = "complete.obs")
print(cor_matrix)
##               DPETECOP DA0912DR21R   DA0CC21R
## DPETECOP     1.0000000   0.4182435 -0.7557066
## DA0912DR21R  0.4182435   1.0000000 -0.3009042
## DA0CC21R    -0.7557066  -0.3009042  1.0000000