##Introduction My research question is, “Is there a relationship between the enrollment status of students and the student’s race?” The dataset I selected contains information from the State Education Department for the public schools of the state of New York. I retrieved this data set from GitHub, which led me to data.nysed.gov. The data set includes data on student enrollment, country, Needs-To-Resources, group, district, and public schools. This data set provides a variety of information about New York’s public schools from 2023 to 2025.
To answer my proposed research question, I am focusing on the demographic factors data set and looking at the following variables: NUM_WHITE, NUM_HISP, NUM_BLACK, NUM_ASIAN, NUM_Multi, NUM_AM_IND. Each race is a column that holds enrollment counts for a variety of entities (schools, districts, locations) over the course of 3 years, from 2023 to 2025. I will use these columns to compute averages and see whether there is a relationship between average enrollment and student race.
#Importing My Data Set
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Load the demographics CSV into a tibble.
# The path is absolute and specific to this machine.
dem_factors <- read_csv(file = "C:/DATA101/demographics.csv")
## Rows: 16606 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): ENTITY_CD, ENTITY_NAME
## dbl (33): YEAR, NUM_ELL, PER_ELL, NUM_AM_IND, PER_AM_IND, NUM_BLACK, PER_BLA...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Exploring My Data
# Show the structure (column names, types, first values) of the data.
dem_factors |> str()
## spc_tbl_ [16,606 × 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ENTITY_CD : chr [1:16606] "000000000001" "000000000002" "000000000003" "000000000004" ...
## $ ENTITY_NAME : chr [1:16606] "NYC Public Schools" "Large Cities" "High Need/Resource Urban-Suburban Districts" "High Need/Resource Rural Districts" ...
## $ YEAR : num [1:16606] 2023 2023 2023 2023 2023 ...
## $ NUM_ELL : num [1:16606] 129782 14300 35438 2143 33962 ...
## $ PER_ELL : num [1:16606] 16 16 18 2 5 4 9 6 0 2 ...
## $ NUM_AM_IND : num [1:16606] 9743 509 580 2136 2529 ...
## $ PER_AM_IND : num [1:16606] 1 1 0 2 0 0 1 0 0 0 ...
## $ NUM_BLACK : num [1:16606] 159551 34103 39603 3894 44836 ...
## $ PER_BLACK : num [1:16606] 20 38 21 3 7 4 49 20 1 9 ...
## $ NUM_HISP : num [1:16606] 336708 29646 87002 11476 126167 ...
## $ PER_HISP : num [1:16606] 42 33 45 9 18 16 39 11 2 8 ...
## $ NUM_ASIAN : num [1:16606] 151113 6779 7905 1055 29061 ...
## $ PER_ASIAN : num [1:16606] 19 8 4 1 4 14 4 11 1 4 ...
## $ NUM_WHITE : num [1:16606] 125091 13658 46914 108669 457875 ...
## $ PER_WHITE : num [1:16606] 16 15 24 82 66 63 6 51 94 71 ...
## $ NUM_Multi : num [1:16606] 15638 4027 9895 5173 28665 ...
## $ PER_Multi : num [1:16606] 2 5 5 4 4 4 2 6 2 8 ...
## $ NUM_SWD : num [1:16606] 189036 19399 32099 22936 109933 ...
## $ PER_SWD : num [1:16606] 24 22 17 17 16 15 18 14 18 15 ...
## $ NUM_FEMALE : num [1:16606] 384400 43293 92919 64894 336061 ...
## $ PER_FEMALE : num [1:16606] 48 49 48 49 49 49 51 49 49 49 ...
## $ NUM_MALE : num [1:16606] 413331 45403 98945 67415 352691 ...
## $ PER_MALE : num [1:16606] 52 51 52 51 51 51 49 51 51 51 ...
## $ NUM_NONBINARY: num [1:16606] 113 26 35 94 381 135 33 16 5 24 ...
## $ PER_NONBINARY: num [1:16606] 0 0 0 0 0 0 0 0 0 0 ...
## $ NUM_ECDIS : num [1:16606] 601178 74564 142102 79437 301330 ...
## $ PER_ECDIS : num [1:16606] 75 84 74 60 44 19 82 46 56 54 ...
## $ NUM_MIGRANT : num [1:16606] 23 23 286 775 737 98 14 20 3 1 ...
## $ PER_MIGRANT : num [1:16606] 0 0 0 1 0 0 0 0 0 0 ...
## $ NUM_HOMELESS : num [1:16606] 73675 3833 8274 2952 9279 ...
## $ PER_HOMELESS : num [1:16606] 9 4 4 2 1 0 8 3 1 3 ...
## $ NUM_FOSTER : num [1:16606] 4196 254 782 553 1537 ...
## $ PER_FOSTER : num [1:16606] 1 0 0 0 0 0 0 0 0 0 ...
## $ NUM_ARMED : num [1:16606] 3215 17 286 3206 2021 ...
## $ PER_ARMED : num [1:16606] 0 0 0 2 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. ENTITY_CD = col_character(),
## .. ENTITY_NAME = col_character(),
## .. YEAR = col_double(),
## .. NUM_ELL = col_double(),
## .. PER_ELL = col_double(),
## .. NUM_AM_IND = col_double(),
## .. PER_AM_IND = col_double(),
## .. NUM_BLACK = col_double(),
## .. PER_BLACK = col_double(),
## .. NUM_HISP = col_double(),
## .. PER_HISP = col_double(),
## .. NUM_ASIAN = col_double(),
## .. PER_ASIAN = col_double(),
## .. NUM_WHITE = col_double(),
## .. PER_WHITE = col_double(),
## .. NUM_Multi = col_double(),
## .. PER_Multi = col_double(),
## .. NUM_SWD = col_double(),
## .. PER_SWD = col_double(),
## .. NUM_FEMALE = col_double(),
## .. PER_FEMALE = col_double(),
## .. NUM_MALE = col_double(),
## .. PER_MALE = col_double(),
## .. NUM_NONBINARY = col_double(),
## .. PER_NONBINARY = col_double(),
## .. NUM_ECDIS = col_double(),
## .. PER_ECDIS = col_double(),
## .. NUM_MIGRANT = col_double(),
## .. PER_MIGRANT = col_double(),
## .. NUM_HOMELESS = col_double(),
## .. PER_HOMELESS = col_double(),
## .. NUM_FOSTER = col_double(),
## .. PER_FOSTER = col_double(),
## .. NUM_ARMED = col_double(),
## .. PER_ARMED = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Preview the first six rows.
dem_factors |> head()
## # A tibble: 6 × 35
## ENTITY_CD ENTITY_NAME YEAR NUM_ELL PER_ELL NUM_AM_IND PER_AM_IND NUM_BLACK
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 000000000001 NYC Public… 2023 129782 16 9743 1 159551
## 2 000000000002 Large Citi… 2023 14300 16 509 1 34103
## 3 000000000003 High Need/… 2023 35438 18 580 0 39603
## 4 000000000004 High Need/… 2023 2143 2 2136 2 3894
## 5 000000000005 Average Ne… 2023 33962 5 2529 0 44836
## 6 000000000006 Low Need D… 2023 13643 4 584 0 14339
## # ℹ 27 more variables: PER_BLACK <dbl>, NUM_HISP <dbl>, PER_HISP <dbl>,
## # NUM_ASIAN <dbl>, PER_ASIAN <dbl>, NUM_WHITE <dbl>, PER_WHITE <dbl>,
## # NUM_Multi <dbl>, PER_Multi <dbl>, NUM_SWD <dbl>, PER_SWD <dbl>,
## # NUM_FEMALE <dbl>, PER_FEMALE <dbl>, NUM_MALE <dbl>, PER_MALE <dbl>,
## # NUM_NONBINARY <dbl>, PER_NONBINARY <dbl>, NUM_ECDIS <dbl>, PER_ECDIS <dbl>,
## # NUM_MIGRANT <dbl>, PER_MIGRANT <dbl>, NUM_HOMELESS <dbl>,
## # PER_HOMELESS <dbl>, NUM_FOSTER <dbl>, PER_FOSTER <dbl>, NUM_ARMED <dbl>, …
# Per-column summary statistics (quartiles, mean, NA counts).
dem_factors |> summary()
## ENTITY_CD ENTITY_NAME YEAR NUM_ELL
## Length:16606 Length:16606 Min. :2023 Min. : 0
## Class :character Class :character 1st Qu.:2023 1st Qu.: 8
## Mode :character Mode :character Median :2024 Median : 28
## Mean :2024 Mean : 283
## 3rd Qu.:2025 3rd Qu.: 77
## Max. :2025 Max. :280312
## NA's :1292
## PER_ELL NUM_AM_IND PER_AM_IND NUM_BLACK
## Min. : 0.00 Min. : 0.00 Min. : 0.0000 Min. : 0.0
## 1st Qu.: 2.00 1st Qu.: 0.00 1st Qu.: 0.0000 1st Qu.: 7.0
## Median : 6.00 Median : 1.00 Median : 0.0000 Median : 33.0
## Mean : 10.54 Mean : 17.89 Mean : 0.7473 Mean : 364.7
## 3rd Qu.: 15.00 3rd Qu.: 4.00 3rd Qu.: 1.0000 3rd Qu.: 106.0
## Max. :100.00 Max. :18122.00 Max. :99.0000 Max. :382380.0
## NA's :1292
## PER_BLACK NUM_HISP PER_HISP NUM_ASIAN
## Min. : 0.00 Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 1.00 1st Qu.: 29.0 1st Qu.: 6.00 1st Qu.: 3.0
## Median : 6.00 Median : 89.0 Median : 19.00 Median : 12.0
## Mean : 15.71 Mean : 719.2 Mean : 27.33 Mean : 255.6
## 3rd Qu.: 22.00 3rd Qu.: 225.0 3rd Qu.: 43.00 3rd Qu.: 49.0
## Max. :100.00 Max. :744672.0 Max. :100.00 Max. :257714.0
##
## PER_ASIAN NUM_WHITE PER_WHITE NUM_Multi
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 23.0 1st Qu.: 6.00 1st Qu.: 4.00
## Median : 2.000 Median : 197.0 Median : 46.00 Median : 13.00
## Mean : 7.291 Mean : 888.2 Mean : 45.22 Mean : 78.47
## 3rd Qu.: 8.000 3rd Qu.: 388.0 3rd Qu.: 81.00 3rd Qu.: 29.00
## Max. :94.000 Max. :980161.0 Max. :100.00 Max. :87228.00
##
## PER_Multi NUM_SWD PER_SWD NUM_FEMALE
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0
## 1st Qu.: 1.000 1st Qu.: 55.0 1st Qu.: 15.00 1st Qu.: 144
## Median : 3.000 Median : 85.0 Median : 18.00 Median : 215
## Mean : 3.513 Mean : 455.5 Mean : 20.27 Mean : 1131
## 3rd Qu.: 5.000 3rd Qu.: 138.0 3rd Qu.: 23.00 3rd Qu.: 355
## Max. :25.000 Max. :480579.0 Max. :100.00 Max. :1179812
## NA's :89 NA's :89
## PER_FEMALE NUM_MALE PER_MALE NUM_NONBINARY
## Min. : 0.00 Min. : 0 Min. : 0.00 Min. : 0.0000
## 1st Qu.: 47.00 1st Qu.: 153 1st Qu.: 49.00 1st Qu.: 0.0000
## Median : 49.00 Median : 228 Median : 51.00 Median : 0.0000
## Mean : 48.51 Mean : 1192 Mean : 51.31 Mean : 0.8777
## 3rd Qu.: 51.00 3rd Qu.: 370 3rd Qu.: 53.00 3rd Qu.: 0.0000
## Max. :100.00 Max. :1242577 Max. :100.00 Max. :1023.0000
##
## PER_NONBINARY NUM_ECDIS PER_ECDIS NUM_MIGRANT
## Min. : 0.00000 Min. : 0 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00000 1st Qu.: 144 1st Qu.: 39.00 1st Qu.: 0.000
## Median : 0.00000 Median : 250 Median : 58.00 Median : 0.000
## Mean : 0.02391 Mean : 1400 Mean : 58.82 Mean : 2.739
## 3rd Qu.: 0.00000 3rd Qu.: 429 3rd Qu.: 84.00 3rd Qu.: 0.000
## Max. :14.00000 Max. :1440928 Max. :100.00 Max. :2388.000
## NA's :63 NA's :63 NA's :5262
## PER_MIGRANT NUM_HOMELESS PER_HOMELESS NUM_FOSTER
## Min. : 0.0000 Min. : 0.0 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.0000 1st Qu.: 5.0 1st Qu.: 1.000 1st Qu.: 0.00
## Median : 0.0000 Median : 16.0 Median : 3.000 Median : 1.00
## Mean : 0.1444 Mean : 153.9 Mean : 6.529 Mean : 11.11
## 3rd Qu.: 0.0000 3rd Qu.: 45.0 3rd Qu.: 9.000 3rd Qu.: 4.00
## Max. :14.0000 Max. :155242.0 Max. :84.000 Max. :8637.00
## NA's :5262 NA's :1654 NA's :1654 NA's :4676
## PER_FOSTER NUM_ARMED PER_ARMED
## Min. : 0.0000 Min. : 0.00 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.00 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.00 Median : 0.0000
## Mean : 0.4381 Mean : 12.93 Mean : 0.4614
## 3rd Qu.: 1.0000 3rd Qu.: 2.00 3rd Qu.: 0.0000
## Max. :100.0000 Max. :11317.00 Max. :80.0000
## NA's :4676 NA's :4787 NA's :4787
#Utilizing EDA Functions
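#Clean variable names by changing them to lowercase for easier reading (disabled: the line below is commented out, and the later code relies on the original upper-case column names)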
#names(dem_factors) <- tolower(names(dem_factors))
#Check for any NAs
# Count the missing values in every column.
dem_factors |>
  is.na() |>
  colSums()
## ENTITY_CD ENTITY_NAME YEAR NUM_ELL PER_ELL
## 0 0 0 1292 1292
## NUM_AM_IND PER_AM_IND NUM_BLACK PER_BLACK NUM_HISP
## 0 0 0 0 0
## PER_HISP NUM_ASIAN PER_ASIAN NUM_WHITE PER_WHITE
## 0 0 0 0 0
## NUM_Multi PER_Multi NUM_SWD PER_SWD NUM_FEMALE
## 0 0 89 89 0
## PER_FEMALE NUM_MALE PER_MALE NUM_NONBINARY PER_NONBINARY
## 0 0 0 0 0
## NUM_ECDIS PER_ECDIS NUM_MIGRANT PER_MIGRANT NUM_HOMELESS
## 63 63 5262 5262 1654
## PER_HOMELESS NUM_FOSTER PER_FOSTER NUM_ARMED PER_ARMED
## 1654 4676 4676 4787 4787
#Selecting and Summarizing Necessary Data
# Keep only the six per-race enrollment count columns used in the analysis.
necessary_cols <- select(
  dem_factors,
  NUM_BLACK, NUM_AM_IND, NUM_ASIAN, NUM_HISP, NUM_WHITE, NUM_Multi
)
print(necessary_cols)
## # A tibble: 16,606 × 6
## NUM_BLACK NUM_AM_IND NUM_ASIAN NUM_HISP NUM_WHITE NUM_Multi
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 159551 9743 151113 336708 125091 15638
## 2 34103 509 6779 29646 13658 4027
## 3 39603 580 7905 87002 46914 9895
## 4 3894 2136 1055 11476 108669 5173
## 5 44836 2529 29061 126167 457875 28665
## 6 14339 584 46954 54698 217893 12637
## 7 86054 1650 6273 67736 10061 3614
## 8 7770 99 4213 4198 19556 2429
## 9 77 11 44 102 5099 85
## 10 2198 48 1000 1938 16968 1850
## # ℹ 16,596 more rows
# Average enrollment for each race column over every row of `necessary_cols`,
# rounded to two decimal places. The result is a one-row tibble.
# NOTE(review): the source table mixes aggregate entities (e.g. "NYC Public
# Schools", "Large Cities") with the other rows, so these means are likely
# inflated by the aggregate totals -- confirm whether those rows should be
# filtered out before averaging.
dem_race <- necessary_cols |>
  summarise(
    avg_black = mean(NUM_BLACK),
    avg_ind = mean(NUM_AM_IND),
    avg_asian = mean(NUM_ASIAN),
    avg_hisp = mean(NUM_HISP),
    avg_white = mean(NUM_WHITE),
    avg_multi = mean(NUM_Multi)
  ) |>
  mutate(across(everything(), \(avg) round(avg, digits = 2)))
dem_race
## # A tibble: 1 × 6
## avg_black avg_ind avg_asian avg_hisp avg_white avg_multi
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 365. 17.9 256. 719. 888. 78.5
#Displaying Information as a Bar Plot
# Bar plot of the average enrollment per race.
# `names.arg` labels each bar; the labels are listed in exactly the same order
# as the heights, so each bar can be matched to its race (the original plot
# drew six unlabeled bars). `cex.names` shrinks the labels so that all six fit
# along the axis.
barplot(
  c(
    dem_race$avg_white, dem_race$avg_hisp, dem_race$avg_black,
    dem_race$avg_asian, dem_race$avg_multi, dem_race$avg_ind
  ),
  names.arg = c("White", "Hispanic", "Black", "Asian", "Multi", "Am. Indian"),
  cex.names = 0.8,
  ylab = "Average Enrollment Amount",
  xlab = "Races"
)
#Conclusion