## Introduction

My research question is, “Is there a relationship between the enrollment status of students and the student’s race?” The dataset I selected contains information from the State Education Department for the public schools of the state of New York. I retrieved this data set from GitHub, which led me to data.nysed.gov. The data set includes data on student enrollment, country, Needs-To-Resources, group, district, and public schools. This data set provides a variety of information on New York’s public schools from 2023 to 2025.

To answer my proposed research question, I am focusing on the demographic factors data set and looking at the following variables: NUM_WHITE, NUM_HISP, NUM_BLACK, NUM_ASIAN, NUM_Multi, and NUM_AM_IND. Each race is a column that holds the number of enrolled students for a variety of entities (schools, districts, locations) over the course of 3 years, from 2023 to 2025. I will use these columns to compute averages and see whether there is a relationship between the average enrollment count and student race.

#Importing My Data Set

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

# Load the demographics CSV into a tibble; column types are guessed by readr.
# NOTE(review): absolute Windows path -- this only runs on a machine that has
# the file at this exact location; consider a project-relative path.
dem_factors <- readr::read_csv(file = "C:/DATA101/demographics.csv")
## Rows: 16606 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): ENTITY_CD, ENTITY_NAME
## dbl (33): YEAR, NUM_ELL, PER_ELL, NUM_AM_IND, PER_AM_IND, NUM_BLACK, PER_BLA...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#Exploring My Data

# Show the structure of the data: dimensions, column names, and column types.
dem_factors |> str()
## spc_tbl_ [16,606 × 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ENTITY_CD    : chr [1:16606] "000000000001" "000000000002" "000000000003" "000000000004" ...
##  $ ENTITY_NAME  : chr [1:16606] "NYC Public Schools" "Large Cities" "High Need/Resource Urban-Suburban Districts" "High Need/Resource Rural Districts" ...
##  $ YEAR         : num [1:16606] 2023 2023 2023 2023 2023 ...
##  $ NUM_ELL      : num [1:16606] 129782 14300 35438 2143 33962 ...
##  $ PER_ELL      : num [1:16606] 16 16 18 2 5 4 9 6 0 2 ...
##  $ NUM_AM_IND   : num [1:16606] 9743 509 580 2136 2529 ...
##  $ PER_AM_IND   : num [1:16606] 1 1 0 2 0 0 1 0 0 0 ...
##  $ NUM_BLACK    : num [1:16606] 159551 34103 39603 3894 44836 ...
##  $ PER_BLACK    : num [1:16606] 20 38 21 3 7 4 49 20 1 9 ...
##  $ NUM_HISP     : num [1:16606] 336708 29646 87002 11476 126167 ...
##  $ PER_HISP     : num [1:16606] 42 33 45 9 18 16 39 11 2 8 ...
##  $ NUM_ASIAN    : num [1:16606] 151113 6779 7905 1055 29061 ...
##  $ PER_ASIAN    : num [1:16606] 19 8 4 1 4 14 4 11 1 4 ...
##  $ NUM_WHITE    : num [1:16606] 125091 13658 46914 108669 457875 ...
##  $ PER_WHITE    : num [1:16606] 16 15 24 82 66 63 6 51 94 71 ...
##  $ NUM_Multi    : num [1:16606] 15638 4027 9895 5173 28665 ...
##  $ PER_Multi    : num [1:16606] 2 5 5 4 4 4 2 6 2 8 ...
##  $ NUM_SWD      : num [1:16606] 189036 19399 32099 22936 109933 ...
##  $ PER_SWD      : num [1:16606] 24 22 17 17 16 15 18 14 18 15 ...
##  $ NUM_FEMALE   : num [1:16606] 384400 43293 92919 64894 336061 ...
##  $ PER_FEMALE   : num [1:16606] 48 49 48 49 49 49 51 49 49 49 ...
##  $ NUM_MALE     : num [1:16606] 413331 45403 98945 67415 352691 ...
##  $ PER_MALE     : num [1:16606] 52 51 52 51 51 51 49 51 51 51 ...
##  $ NUM_NONBINARY: num [1:16606] 113 26 35 94 381 135 33 16 5 24 ...
##  $ PER_NONBINARY: num [1:16606] 0 0 0 0 0 0 0 0 0 0 ...
##  $ NUM_ECDIS    : num [1:16606] 601178 74564 142102 79437 301330 ...
##  $ PER_ECDIS    : num [1:16606] 75 84 74 60 44 19 82 46 56 54 ...
##  $ NUM_MIGRANT  : num [1:16606] 23 23 286 775 737 98 14 20 3 1 ...
##  $ PER_MIGRANT  : num [1:16606] 0 0 0 1 0 0 0 0 0 0 ...
##  $ NUM_HOMELESS : num [1:16606] 73675 3833 8274 2952 9279 ...
##  $ PER_HOMELESS : num [1:16606] 9 4 4 2 1 0 8 3 1 3 ...
##  $ NUM_FOSTER   : num [1:16606] 4196 254 782 553 1537 ...
##  $ PER_FOSTER   : num [1:16606] 1 0 0 0 0 0 0 0 0 0 ...
##  $ NUM_ARMED    : num [1:16606] 3215 17 286 3206 2021 ...
##  $ PER_ARMED    : num [1:16606] 0 0 0 2 0 0 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ENTITY_CD = col_character(),
##   ..   ENTITY_NAME = col_character(),
##   ..   YEAR = col_double(),
##   ..   NUM_ELL = col_double(),
##   ..   PER_ELL = col_double(),
##   ..   NUM_AM_IND = col_double(),
##   ..   PER_AM_IND = col_double(),
##   ..   NUM_BLACK = col_double(),
##   ..   PER_BLACK = col_double(),
##   ..   NUM_HISP = col_double(),
##   ..   PER_HISP = col_double(),
##   ..   NUM_ASIAN = col_double(),
##   ..   PER_ASIAN = col_double(),
##   ..   NUM_WHITE = col_double(),
##   ..   PER_WHITE = col_double(),
##   ..   NUM_Multi = col_double(),
##   ..   PER_Multi = col_double(),
##   ..   NUM_SWD = col_double(),
##   ..   PER_SWD = col_double(),
##   ..   NUM_FEMALE = col_double(),
##   ..   PER_FEMALE = col_double(),
##   ..   NUM_MALE = col_double(),
##   ..   PER_MALE = col_double(),
##   ..   NUM_NONBINARY = col_double(),
##   ..   PER_NONBINARY = col_double(),
##   ..   NUM_ECDIS = col_double(),
##   ..   PER_ECDIS = col_double(),
##   ..   NUM_MIGRANT = col_double(),
##   ..   PER_MIGRANT = col_double(),
##   ..   NUM_HOMELESS = col_double(),
##   ..   PER_HOMELESS = col_double(),
##   ..   NUM_FOSTER = col_double(),
##   ..   PER_FOSTER = col_double(),
##   ..   NUM_ARMED = col_double(),
##   ..   PER_ARMED = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Preview the first six rows (six is head()'s default, spelled out here).
dem_factors |> head(n = 6)
## # A tibble: 6 × 35
##   ENTITY_CD    ENTITY_NAME  YEAR NUM_ELL PER_ELL NUM_AM_IND PER_AM_IND NUM_BLACK
##   <chr>        <chr>       <dbl>   <dbl>   <dbl>      <dbl>      <dbl>     <dbl>
## 1 000000000001 NYC Public…  2023  129782      16       9743          1    159551
## 2 000000000002 Large Citi…  2023   14300      16        509          1     34103
## 3 000000000003 High Need/…  2023   35438      18        580          0     39603
## 4 000000000004 High Need/…  2023    2143       2       2136          2      3894
## 5 000000000005 Average Ne…  2023   33962       5       2529          0     44836
## 6 000000000006 Low Need D…  2023   13643       4        584          0     14339
## # ℹ 27 more variables: PER_BLACK <dbl>, NUM_HISP <dbl>, PER_HISP <dbl>,
## #   NUM_ASIAN <dbl>, PER_ASIAN <dbl>, NUM_WHITE <dbl>, PER_WHITE <dbl>,
## #   NUM_Multi <dbl>, PER_Multi <dbl>, NUM_SWD <dbl>, PER_SWD <dbl>,
## #   NUM_FEMALE <dbl>, PER_FEMALE <dbl>, NUM_MALE <dbl>, PER_MALE <dbl>,
## #   NUM_NONBINARY <dbl>, PER_NONBINARY <dbl>, NUM_ECDIS <dbl>, PER_ECDIS <dbl>,
## #   NUM_MIGRANT <dbl>, PER_MIGRANT <dbl>, NUM_HOMELESS <dbl>,
## #   PER_HOMELESS <dbl>, NUM_FOSTER <dbl>, PER_FOSTER <dbl>, NUM_ARMED <dbl>, …
# Per-column summary: quartiles, mean, and NA counts for numeric columns;
# length/class/mode for character columns.
dem_factors |> summary()
##   ENTITY_CD         ENTITY_NAME             YEAR         NUM_ELL      
##  Length:16606       Length:16606       Min.   :2023   Min.   :     0  
##  Class :character   Class :character   1st Qu.:2023   1st Qu.:     8  
##  Mode  :character   Mode  :character   Median :2024   Median :    28  
##                                        Mean   :2024   Mean   :   283  
##                                        3rd Qu.:2025   3rd Qu.:    77  
##                                        Max.   :2025   Max.   :280312  
##                                                       NA's   :1292    
##     PER_ELL         NUM_AM_IND         PER_AM_IND        NUM_BLACK       
##  Min.   :  0.00   Min.   :    0.00   Min.   : 0.0000   Min.   :     0.0  
##  1st Qu.:  2.00   1st Qu.:    0.00   1st Qu.: 0.0000   1st Qu.:     7.0  
##  Median :  6.00   Median :    1.00   Median : 0.0000   Median :    33.0  
##  Mean   : 10.54   Mean   :   17.89   Mean   : 0.7473   Mean   :   364.7  
##  3rd Qu.: 15.00   3rd Qu.:    4.00   3rd Qu.: 1.0000   3rd Qu.:   106.0  
##  Max.   :100.00   Max.   :18122.00   Max.   :99.0000   Max.   :382380.0  
##  NA's   :1292                                                            
##    PER_BLACK         NUM_HISP           PER_HISP        NUM_ASIAN       
##  Min.   :  0.00   Min.   :     0.0   Min.   :  0.00   Min.   :     0.0  
##  1st Qu.:  1.00   1st Qu.:    29.0   1st Qu.:  6.00   1st Qu.:     3.0  
##  Median :  6.00   Median :    89.0   Median : 19.00   Median :    12.0  
##  Mean   : 15.71   Mean   :   719.2   Mean   : 27.33   Mean   :   255.6  
##  3rd Qu.: 22.00   3rd Qu.:   225.0   3rd Qu.: 43.00   3rd Qu.:    49.0  
##  Max.   :100.00   Max.   :744672.0   Max.   :100.00   Max.   :257714.0  
##                                                                         
##    PER_ASIAN        NUM_WHITE          PER_WHITE        NUM_Multi       
##  Min.   : 0.000   Min.   :     0.0   Min.   :  0.00   Min.   :    0.00  
##  1st Qu.: 1.000   1st Qu.:    23.0   1st Qu.:  6.00   1st Qu.:    4.00  
##  Median : 2.000   Median :   197.0   Median : 46.00   Median :   13.00  
##  Mean   : 7.291   Mean   :   888.2   Mean   : 45.22   Mean   :   78.47  
##  3rd Qu.: 8.000   3rd Qu.:   388.0   3rd Qu.: 81.00   3rd Qu.:   29.00  
##  Max.   :94.000   Max.   :980161.0   Max.   :100.00   Max.   :87228.00  
##                                                                         
##    PER_Multi         NUM_SWD            PER_SWD         NUM_FEMALE     
##  Min.   : 0.000   Min.   :     0.0   Min.   :  0.00   Min.   :      0  
##  1st Qu.: 1.000   1st Qu.:    55.0   1st Qu.: 15.00   1st Qu.:    144  
##  Median : 3.000   Median :    85.0   Median : 18.00   Median :    215  
##  Mean   : 3.513   Mean   :   455.5   Mean   : 20.27   Mean   :   1131  
##  3rd Qu.: 5.000   3rd Qu.:   138.0   3rd Qu.: 23.00   3rd Qu.:    355  
##  Max.   :25.000   Max.   :480579.0   Max.   :100.00   Max.   :1179812  
##                   NA's   :89         NA's   :89                        
##    PER_FEMALE        NUM_MALE          PER_MALE      NUM_NONBINARY      
##  Min.   :  0.00   Min.   :      0   Min.   :  0.00   Min.   :   0.0000  
##  1st Qu.: 47.00   1st Qu.:    153   1st Qu.: 49.00   1st Qu.:   0.0000  
##  Median : 49.00   Median :    228   Median : 51.00   Median :   0.0000  
##  Mean   : 48.51   Mean   :   1192   Mean   : 51.31   Mean   :   0.8777  
##  3rd Qu.: 51.00   3rd Qu.:    370   3rd Qu.: 53.00   3rd Qu.:   0.0000  
##  Max.   :100.00   Max.   :1242577   Max.   :100.00   Max.   :1023.0000  
##                                                                         
##  PER_NONBINARY        NUM_ECDIS         PER_ECDIS       NUM_MIGRANT      
##  Min.   : 0.00000   Min.   :      0   Min.   :  0.00   Min.   :   0.000  
##  1st Qu.: 0.00000   1st Qu.:    144   1st Qu.: 39.00   1st Qu.:   0.000  
##  Median : 0.00000   Median :    250   Median : 58.00   Median :   0.000  
##  Mean   : 0.02391   Mean   :   1400   Mean   : 58.82   Mean   :   2.739  
##  3rd Qu.: 0.00000   3rd Qu.:    429   3rd Qu.: 84.00   3rd Qu.:   0.000  
##  Max.   :14.00000   Max.   :1440928   Max.   :100.00   Max.   :2388.000  
##                     NA's   :63        NA's   :63       NA's   :5262      
##   PER_MIGRANT       NUM_HOMELESS       PER_HOMELESS      NUM_FOSTER     
##  Min.   : 0.0000   Min.   :     0.0   Min.   : 0.000   Min.   :   0.00  
##  1st Qu.: 0.0000   1st Qu.:     5.0   1st Qu.: 1.000   1st Qu.:   0.00  
##  Median : 0.0000   Median :    16.0   Median : 3.000   Median :   1.00  
##  Mean   : 0.1444   Mean   :   153.9   Mean   : 6.529   Mean   :  11.11  
##  3rd Qu.: 0.0000   3rd Qu.:    45.0   3rd Qu.: 9.000   3rd Qu.:   4.00  
##  Max.   :14.0000   Max.   :155242.0   Max.   :84.000   Max.   :8637.00  
##  NA's   :5262      NA's   :1654       NA's   :1654     NA's   :4676     
##    PER_FOSTER         NUM_ARMED          PER_ARMED      
##  Min.   :  0.0000   Min.   :    0.00   Min.   : 0.0000  
##  1st Qu.:  0.0000   1st Qu.:    0.00   1st Qu.: 0.0000  
##  Median :  0.0000   Median :    0.00   Median : 0.0000  
##  Mean   :  0.4381   Mean   :   12.93   Mean   : 0.4614  
##  3rd Qu.:  1.0000   3rd Qu.:    2.00   3rd Qu.: 0.0000  
##  Max.   :100.0000   Max.   :11317.00   Max.   :80.0000  
##  NA's   :4676       NA's   :4787       NA's   :4787

#Utilizing EDA Functions

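#Clean variable names by changing them to lowercase for easier reading (this step is currently commented out, so the original column names such as NUM_Multi are used in the code below)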

#names(dem_factors) <- tolower(names(dem_factors))

# Count the missing values in each column: is.na() flags every NA cell,
# and colSums() totals those flags per column.
dem_factors |>
  is.na() |>
  colSums()
##     ENTITY_CD   ENTITY_NAME          YEAR       NUM_ELL       PER_ELL 
##             0             0             0          1292          1292 
##    NUM_AM_IND    PER_AM_IND     NUM_BLACK     PER_BLACK      NUM_HISP 
##             0             0             0             0             0 
##      PER_HISP     NUM_ASIAN     PER_ASIAN     NUM_WHITE     PER_WHITE 
##             0             0             0             0             0 
##     NUM_Multi     PER_Multi       NUM_SWD       PER_SWD    NUM_FEMALE 
##             0             0            89            89             0 
##    PER_FEMALE      NUM_MALE      PER_MALE NUM_NONBINARY PER_NONBINARY 
##             0             0             0             0             0 
##     NUM_ECDIS     PER_ECDIS   NUM_MIGRANT   PER_MIGRANT  NUM_HOMELESS 
##            63            63          5262          5262          1654 
##  PER_HOMELESS    NUM_FOSTER    PER_FOSTER     NUM_ARMED     PER_ARMED 
##          1654          4676          4676          4787          4787

#Selecting and Summarizing Necessary Data

# Keep only the six per-race enrollment count columns needed for the analysis.
necessary_cols <- select(
  dem_factors,
  NUM_BLACK,
  NUM_AM_IND,
  NUM_ASIAN,
  NUM_HISP,
  NUM_WHITE,
  NUM_Multi
)

# Print the reduced tibble to confirm the selection.
necessary_cols
## # A tibble: 16,606 × 6
##    NUM_BLACK NUM_AM_IND NUM_ASIAN NUM_HISP NUM_WHITE NUM_Multi
##        <dbl>      <dbl>     <dbl>    <dbl>     <dbl>     <dbl>
##  1    159551       9743    151113   336708    125091     15638
##  2     34103        509      6779    29646     13658      4027
##  3     39603        580      7905    87002     46914      9895
##  4      3894       2136      1055    11476    108669      5173
##  5     44836       2529     29061   126167    457875     28665
##  6     14339        584     46954    54698    217893     12637
##  7     86054       1650      6273    67736     10061      3614
##  8      7770         99      4213     4198     19556      2429
##  9        77         11        44      102      5099        85
## 10      2198         48      1000     1938     16968      1850
## # ℹ 16,596 more rows
# One-row summary: the mean of every race count column over all rows of
# necessary_cols, rounded to 2 decimal places, then renamed to avg_* names.
# NOTE(review): the rows mix aggregate entities (e.g. "NYC Public Schools",
# "Large Cities") with other entities across 2023-2025, so these means pool
# different reporting levels -- confirm that is intended.
dem_race <- necessary_cols |>
  summarise(across(everything(), \(counts) round(mean(counts), digits = 2))) |>
  rename(
    avg_black = NUM_BLACK,
    avg_ind = NUM_AM_IND,
    avg_asian = NUM_ASIAN,
    avg_hisp = NUM_HISP,
    avg_white = NUM_WHITE,
    avg_multi = NUM_Multi
  )

# Print the averages.
dem_race
## # A tibble: 1 × 6
##   avg_black avg_ind avg_asian avg_hisp avg_white avg_multi
##       <dbl>   <dbl>     <dbl>    <dbl>     <dbl>     <dbl>
## 1      365.    17.9      256.     719.      888.      78.5

#Displaying Information as a Bar Plot

# Bar chart of the average enrollment count per race column in dem_race.
# Bars are ordered White, Hisp, Black, Asian, Multi, Am-Ind to match the
# heights vector below.
# names.arg labels each bar; without it the x-axis carries no group names
# and the six bars cannot be told apart.
barplot(
  c(
    dem_race$avg_white, dem_race$avg_hisp, dem_race$avg_black,
    dem_race$avg_asian, dem_race$avg_multi, dem_race$avg_ind
  ),
  names.arg = c("White", "Hisp", "Black", "Asian", "Multi", "Am-Ind"),
  ylab = "Average Enrollment Amount",
  xlab = "Races"
)

#Conclusion Conclusion: