library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readr)
# Load the Bexar County school data from the working directory.
bexar_schools <- read.csv(file = "bexar_schools.csv")

# Five-number summary (plus mean and NA counts) for the three measures
# analysed below: economic disadvantage, dropout rate, college readiness.
summary(
  bexar_schools[, c("DPETECOP", "DA0912DR21R", "DA0CC21R")]
)
## DPETECOP DA0912DR21R DA0CC21R
## Min. : 3.70 Min. : 0.000 Min. :-1.00
## 1st Qu.:57.23 1st Qu.: 0.000 1st Qu.:11.60
## Median :80.40 Median : 1.100 Median :19.40
## Mean :66.88 Mean : 2.906 Mean :26.67
## 3rd Qu.:87.33 3rd Qu.: 3.500 3rd Qu.:35.00
## Max. :97.30 Max. :22.600 Max. :97.70
## NA's :7 NA's :11
# Keep only the schools that report all three measures, so every later
# statistic and plot is computed on the same set of rows.
Cleand_Data <- bexar_schools %>%
  filter(complete.cases(DPETECOP, DA0912DR21R, DA0CC21R))

# Descriptive statistics (counts, range, centre, spread) for the three
# measures. Note that in stat.desc() `nbr.null` is the number of values equal
# to zero, while `nbr.na` is the number of missing values.
variables_description <- Cleand_Data[, c("DPETECOP", "DA0912DR21R", "DA0CC21R")] %>%
  pastecs::stat.desc()
print(variables_description)
## DPETECOP DA0912DR21R DA0CC21R
## nbr.val 29.0000000 29.0000000 29.000000
## nbr.null 0.0000000 8.0000000 1.000000
## nbr.na 0.0000000 0.0000000 0.000000
## min 3.7000000 0.0000000 -1.000000
## max 90.3000000 5.2000000 97.700000
## range 86.6000000 5.2000000 98.700000
## sum 1821.8000000 46.2000000 773.500000
## median 68.7000000 0.8000000 19.400000
## mean 62.8206897 1.5931034 26.672414
## SE.mean 5.1531744 0.3065754 4.176995
## CI.mean.0.95 10.5557992 0.6279911 8.556187
## var 770.1009852 2.7256650 505.971355
## std.dev 27.7506934 1.6509588 22.493807
## coef.var 0.4417445 1.0363161 0.843336
DPETECOP (Percentage of Economically Disadvantaged Students):
This measure reflects the share of students at a given school who come from an economically disadvantaged background. A higher value indicates that more of the school's students come from lower-income households.
The data show that both low- and high-poverty schools are present, with values ranging from 3.7% to 90.3%. The median of 68.7% means that in half of the schools at least 68.7% of the students are economically disadvantaged. The mean of 62.82% is lower than the median, which indicates a left (negative) skew: a few schools with low percentages pull the average down. The wide range between the minimum and maximum values (86.6 percentage points) shows that the economic makeup of the student body differs greatly across the schools in the district.
DA0912DR21R (Annual Dropout Rate for Grades 9-12):
This measure indicates the percentage of students in grades 9-12 who left high school during the 2020-2021 academic year. Lower values are desirable because they indicate that fewer students are leaving school prematurely.
The rates vary from 0% to 5.2%, with a low median of 0.8%. The mean (1.59%) exceeds the median, suggesting that a few schools have much higher dropout rates than the others, which pulls the average up. Note also that eight schools have a dropout rate of exactly 0%: `nbr.null` in the `stat.desc()` output counts values equal to zero, not missing values (`nbr.na` is 0 for every variable), so no dropout data are missing after cleaning. This cluster of zeros should be kept in mind when interpreting the distribution.
DA0CC21R (Percentage At/Above College Admissions Criterion): This measure reflects the proportion of students who met or exceeded a defined criterion for college admissions. A higher value indicates that more of a school's students are prepared for further study.
College readiness varies widely across schools, from -1% (not a valid percentage; presumably a sentinel or masked-value code, which requires additional analysis) up to 97.7%. The median (19.4%) is lower than the mean (26.67%), which indicates a right (positive) skew: some schools in the sample have college readiness rates far above those of the typical school.
# Same summary as for the raw data, now restricted to the complete cases.
summary(
  Cleand_Data[, c("DPETECOP", "DA0912DR21R", "DA0CC21R")]
)
## DPETECOP DA0912DR21R DA0CC21R
## Min. : 3.70 Min. :0.000 Min. :-1.00
## 1st Qu.:48.90 1st Qu.:0.000 1st Qu.:11.60
## Median :68.70 Median :0.800 Median :19.40
## Mean :62.82 Mean :1.593 Mean :26.67
## 3rd Qu.:86.10 3rd Qu.:2.700 3rd Qu.:35.00
## Max. :90.30 Max. :5.200 Max. :97.70
# Histograms of the three untransformed measures (complete cases only).

# Economic disadvantage, 5-point bins.
ggplot(data = Cleand_Data, mapping = aes(x = DPETECOP)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "skyblue") +
  labs(
    x = "Percentage of Economically Disadvantaged Students",
    y = "Frequency",
    title = "Histogram of Economically Disadvantaged Students"
  )

# Annual dropout rate, 1-point bins (the values span a much narrower range).
ggplot(data = Cleand_Data, mapping = aes(x = DA0912DR21R)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "lightgreen") +
  labs(
    x = "Annual Dropout Rate (%)",
    y = "Frequency",
    title = "Histogram of Annual Dropout Rate (Grades 9-12)"
  )

# College admissions criterion, 5-point bins.
ggplot(data = Cleand_Data, mapping = aes(x = DA0CC21R)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "salmon") +
  labs(
    x = "Percentage At/Above College Admissions Criterion",
    y = "Frequency",
    title = "Histogram of College Admissions Criterion"
  )
# Square root that returns NA, rather than NaN plus a warning, for negative
# input. sqrt() is undefined for negative numbers, and DA0CC21R contains a -1
# (see its minimum in the summaries above).
# NOTE(review): -1 is not a valid percentage and looks like a sentinel or
# masked-value code; confirm against the data dictionary.
sqrt_or_na <- function(x) {
  sqrt(replace(x, !is.na(x) & x < 0, NA))
}

# Add square-root-transformed copies of the three measures. The original
# columns are left untouched, so downstream code that reads them (e.g. the
# correlation matrix) is unaffected.
bexar_schools_transformed <- Cleand_Data %>%
  mutate(
    DPETECOP_sqrt = sqrt_or_na(DPETECOP),
    DA0912DR21R_sqrt = sqrt_or_na(DA0912DR21R),
    DA0CC21R_sqrt = sqrt_or_na(DA0CC21R)
  )
# Histograms of the square-root-transformed measures.

# Economic disadvantage, square-root scale.
ggplot(data = bexar_schools_transformed, mapping = aes(x = DPETECOP_sqrt)) +
  geom_histogram(binwidth = 0.5, colour = "black", fill = "skyblue") +
  labs(
    x = "Square Root of Percentage",
    y = "Frequency",
    title = "Histogram of Sqrt Transformed Economically Disadvantaged Students"
  )

# Annual dropout rate, square-root scale (narrower bins for the smaller range).
ggplot(data = bexar_schools_transformed, mapping = aes(x = DA0912DR21R_sqrt)) +
  geom_histogram(binwidth = 0.2, colour = "black", fill = "lightgreen") +
  labs(
    x = "Square Root of Dropout Rate",
    y = "Frequency",
    title = "Histogram of Sqrt Transformed Annual Dropout Rate"
  )

# College admissions criterion, square-root scale. One school has no finite
# transformed value (its raw value is -1), so ggplot2 drops that row and
# reports it in a warning.
ggplot(data = bexar_schools_transformed, mapping = aes(x = DA0CC21R_sqrt)) +
  geom_histogram(binwidth = 0.5, colour = "black", fill = "salmon") +
  labs(
    x = "Square Root of Percentage",
    y = "Frequency",
    title = "Histogram of Sqrt Transformed College Admissions Criterion"
  )
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
# Pairwise Pearson correlations (cor()'s default method) between the three
# measures. These are the original, untransformed columns; rows with any
# missing value among them are excluded via use = "complete.obs".
cor_matrix <- cor(
  x = bexar_schools_transformed[, c("DPETECOP", "DA0912DR21R", "DA0CC21R")],
  use = "complete.obs"
)
print(cor_matrix)
## DPETECOP DA0912DR21R DA0CC21R
## DPETECOP 1.0000000 0.4182435 -0.7557066
## DA0912DR21R 0.4182435 1.0000000 -0.3009042
## DA0CC21R -0.7557066 -0.3009042 1.0000000