library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
# Load the two raw CSV inputs from the working directory.
# NOTE(review): the str() output further down shows student_spending arriving
# with its title text as the first column name and placeholder names
# X ... X.26 -- the file presumably has multi-row headers that need skipping;
# confirm against the raw CSV before using this table.
student_spending <- read.csv(file = "student_spending.csv")
grad_rates <- read.csv(file = "grad_rates.csv")

# Show the column names of the graduation-rate table.
names(grad_rates)
## [1] "School.Year" "State" "NCES.LEA.ID" "LEA"
## [5] "School" "NCES.SCH.ID" "Data.Group" "Data.Description"
## [9] "Value" "Numerator" "Denominator" "Population"
## [13] "Subgroup" "Characteristics" "Age.Grade" "Academic.Subject"
## [17] "Outcome" "Program.Type"
# Descriptive statistics for Value. At this point the column is still
# character (percent strings such as "86.60%" plus "N/A"), so every statistic
# comes back NA; the numeric conversion happens further down.
pastecs::stat.desc(grad_rates$Value)
## nbr.val nbr.null nbr.na min max range sum median
## NA NA NA NA NA NA NA NA
## mean SE.mean CI.mean var std.dev coef.var
## NA NA NA NA NA NA
# Inspect how Value is stored, to see why the summary above is all NA.
str(grad_rates$Value)
## chr [1:701] "86.60%" "73.90%" "93.70%" "N/A" "N/A" "81.00%" "82.80%" "N/A" ...
# Inspect the structure of the spending table as read.
# NOTE(review): the output shows the table title used as the first column
# name, placeholder names X ... X.26, and header labels sitting in the first
# data rows -- the file presumably needs its header rows skipped and its
# columns renamed before use; confirm against the raw CSV.
str(student_spending)
## 'data.frame': 91 obs. of 28 variables:
## $ Table.4..Student.membership.and.current.expenditures.per.pupil.for.public.elementary.and.secondary.education..by.function..subfunction..and.state.or.jurisdiction..FY.2021: chr "State or jurisdiction" "" "" "United States6" ...
## $ X : chr "School year 2020\x9621 \nstudent membership1" "" "" "49,211,213" ...
## $ X.1 : logi NA NA NA NA NA NA ...
## $ X.2 : chr "Current expenditures per pupil2" "" "Total" "$14,295" ...
## $ X.3 : chr "" "" "" "7, 8" ...
## $ X.4 : chr "" "" "Instruction" "$8,680" ...
## $ X.5 : chr "" "" "" "7, 8" ...
## $ X.6 : chr "" "Support services3" "Total support services" "$5,161" ...
## $ X.7 : chr "" "" "" "7, 8" ...
## $ X.8 : chr "" "" "Student support services5" "$927" ...
## $ X.9 : chr "" "" "" "7, 8" ...
## $ X.10 : chr "" "" "Instruc-\ntional staff\nsupport" "$727" ...
## $ X.11 : chr "" "" "" "7, 8" ...
## $ X.12 : chr "" "" "General \nadminis-\ntration" "$279" ...
## $ X.13 : chr "" "" "" "7, 8" ...
## $ X.14 : chr "" "" "School \nadminis-\ntration" "$818" ...
## $ X.15 : chr "" "" "" "7, 8" ...
## $ X.16 : chr "" "" "Operations \nand \nmaintenance" "$1,331" ...
## $ X.17 : chr "" "" "" "7, 8" ...
## $ X.18 : chr "" "" "Student \ntrans-\nportation" "$501" ...
## $ X.19 : chr "" "" "" "7, 8" ...
## $ X.20 : chr "" "" "Other \nsupport \nservices" "$580" ...
## $ X.21 : chr "" "" "" "7, 8" ...
## $ X.22 : chr "" "" "Food services" "$435" ...
## $ X.23 : chr "" "" "" "7, 8" ...
## $ X.24 : chr "" "" "Enterprise operations4" "$19" ...
## $ X.25 : int NA NA NA 8 NA NA NA NA NA 8 ...
## $ X.26 : logi NA NA NA NA NA NA ...
# List the spending table's column names (the same names str() reported above).
colnames(student_spending)
## [1] "Table.4..Student.membership.and.current.expenditures.per.pupil.for.public.elementary.and.secondary.education..by.function..subfunction..and.state.or.jurisdiction..FY.2021"
## [2] "X"
## [3] "X.1"
## [4] "X.2"
## [5] "X.3"
## [6] "X.4"
## [7] "X.5"
## [8] "X.6"
## [9] "X.7"
## [10] "X.8"
## [11] "X.9"
## [12] "X.10"
## [13] "X.11"
## [14] "X.12"
## [15] "X.13"
## [16] "X.14"
## [17] "X.15"
## [18] "X.16"
## [19] "X.17"
## [20] "X.18"
## [21] "X.19"
## [22] "X.20"
## [23] "X.21"
## [24] "X.22"
## [25] "X.23"
## [26] "X.24"
## [27] "X.25"
## [28] "X.26"
# Convert Value from percent strings (e.g. "86.60%") to numeric percentages.
# The file marks missing rates as "N/A"; map that marker to NA explicitly so
# as.numeric() never has to coerce it. If an "NAs introduced by coercion"
# warning still appears after this, it points at a genuinely unexpected value
# rather than at the known missing-data marker.
value_text <- trimws(grad_rates$Value)
value_text[value_text %in% "N/A"] <- NA_character_
grad_rates$Value <- as.numeric(gsub("%", "", value_text, fixed = TRUE))
# Confirm that Value is now numeric, with NA where the raw text was not a rate.
str(grad_rates$Value)
## num [1:701] 86.6 73.9 93.7 NA NA 81 82.8 NA 89.8 71.4 ...
# Re-run the descriptive statistics now that Value is numeric.
pastecs::stat.desc(grad_rates$Value)
## nbr.val nbr.null nbr.na min max range
## 6.760000e+02 0.000000e+00 2.500000e+01 3.000000e+01 9.700000e+01 6.700000e+01
## sum median mean SE.mean CI.mean.0.95 var
## 5.265680e+04 8.000000e+01 7.789467e+01 4.721739e-01 9.271061e-01 1.507129e+02
## std.dev coef.var
## 1.227652e+01 1.576041e-01
The average graduation rate is 77.9% (median 80%), with a range from 30% to 97% across the 676 non-missing records, indicating variation across different states and districts.
# Histogram of graduation rates in 5-percentage-point bins.
# na.rm = TRUE drops the rows with a missing rate on purpose, so the plot no
# longer emits a "Removed ... rows containing non-finite" warning.
ggplot(grad_rates, aes(x = Value)) +
  geom_histogram(binwidth = 5, na.rm = TRUE) +
  labs(
    title = "Distribution of Graduation Rates",
    x = "Graduation Rate (%)",
    y = "Count"
  ) +
  theme_minimal()
# Add a square-root-transformed copy of the graduation rate.
grad_rates_clean <- grad_rates %>%
  dplyr::mutate(Sqrt_Grad_Rate = sqrt(Value))

# Histogram of the transformed rate.
# na.rm = TRUE drops the rows with a missing rate on purpose, so the plot no
# longer emits a "Removed ... rows containing non-finite" warning.
# NOTE(review): the rates span 30-97, so the square roots span roughly
# 5.5-9.8 and binwidth = 1 yields only about five bars; consider a narrower
# bin width if the shape of the distribution matters.
ggplot(grad_rates_clean, aes(x = Sqrt_Grad_Rate)) +
  geom_histogram(binwidth = 1, na.rm = TRUE) +
  labs(
    title = "Histogram of Transformed Graduation Rates",
    x = "Square Root of Graduation Rate",
    y = "Count"
  ) +
  theme_minimal()