Import Library/Packages
library(tidyverse)
package 㤼㸱tidyverse㤼㸲 was built under R version 4.0.5Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages --------------------------------------------------- tidyverse 1.3.1 --
v ggplot2 3.3.5 v purrr 0.3.4
v tibble 3.1.3 v dplyr 1.0.7
v tidyr 1.1.3 v stringr 1.4.0
v readr 2.0.0 v forcats 0.5.1
package 㤼㸱ggplot2㤼㸲 was built under R version 4.0.5package 㤼㸱readr㤼㸲 was built under R version 4.0.5package 㤼㸱dplyr㤼㸲 was built under R version 4.0.5-- Conflicts ------------------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
Load the processed public school file from Part 2 https://rpubs.com/caryjim/nces-ps-data-processing
public_school <- read.csv("./Datasets/public_school_processed.csv", header = T, stringsAsFactors = T)
# This time, stringsAsFactors is applied to ensure all characters are converted
View columns
colnames(public_school)
[1] "State.Name" "ST" "School.Name"
[4] "NCES.SchID" "Agency_Name" "County_Name"
[7] "COUNTY.ID" "Urban.centric.Locale" "Latitude"
[10] "Longitude" "Title.I.School.Status" "Updated.Status"
[13] "Agency.Type" "School.Type" "Members"
[16] "Male" "Female" "FTE.Equivalent"
[19] "Pupil.Teacher.Ratio"
View data type
str(public_school)
'data.frame': 94619 obs. of 19 variables:
$ State.Name : Factor w/ 102 levels "Alabama","ALABAMA",..: 43 95 47 47 45 55 29 77 88 99 ...
$ ST : Factor w/ 51 levels "AK","AL","AR",..: 20 48 24 24 23 30 16 39 44 49 ...
$ School.Name : Factor w/ 82716 levels "1 LT Charles W. Whitcomb School",..: 1 2 3 4 5 6 7 8 9 10 ...
$ NCES.SchID : num 2.51e+11 5.30e+11 2.71e+11 2.71e+11 2.63e+11 ...
$ Agency_Name : Factor w/ 16253 levels "21st Century Charter Sch of Gary",..: 8707 8775 4196 4196 11576 10432 1 2 15484 9138 ...
$ County_Name : Factor w/ 1874 levels "Abbeville County",..: 1105 1552 278 278 863 712 934 323 765 447 ...
$ COUNTY.ID : int 25017 53061 27019 27019 26077 31079 18089 42029 48215 55025 ...
$ Urban.centric.Locale : Factor w/ 4 levels "City","Rural",..: 3 3 3 3 1 2 1 3 3 3 ...
$ Latitude : num 42.4 48.1 44.8 44.8 42.2 ...
$ Longitude : num -71.5 -122.2 -93.6 -93.6 -85.6 ...
$ Title.I.School.Status: Factor w/ 7 levels "1-Title I targeted assistance eligible school-No program",..: 5 6 2 2 6 6 5 6 4 1 ...
$ Updated.Status : Factor w/ 5 levels "1-Open","3-New",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Agency.Type : Factor w/ 6 levels "1-Regular local school district that is NOT a component of a supervisory union",..: 1 1 1 1 1 1 4 4 1 1 ...
$ School.Type : Factor w/ 4 levels "1-Regular school",..: 1 1 4 4 1 1 1 1 1 1 ...
$ Members : int 1308 178 37 366 530 182 934 1039 331 38 ...
$ Male : int 681 77 20 198 269 96 481 383 159 20 ...
$ Female : int 627 101 17 168 261 86 453 656 172 18 ...
$ FTE.Equivalent : num 118.8 7.3 NA NA 29.3 ...
$ Pupil.Teacher.Ratio : num 11 24.4 NA NA 18.1 ...
In the original public school dataset, state names entries are inconsistent. There are some state names as ALL CAPS and some as lower cases. Therefore, using state abbreviation is a better varaible unless we parse the chartacters in the full state names.
# A quick count of schools by each state in the processed data
table(public_school$ST)
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL
506 1342 1060 2055 10026 1891 1001 224 219 4004 2300 292 1322 730 3983
IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE
1874 1315 1392 1370 1845 1395 571 3329 2112 2282 900 817 2639 478 1005
NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT
488 2566 865 687 4757 3500 1798 1243 2903 290 1213 683 1782 8774 1028
VA VT WA WI WV WY
1858 296 2334 2240 669 366
Order the number of school per state
sort(table(public_school$ST))
DE DC RI HI VT WY ND NH AK ME WV SD NV ID MT
219 224 290 292 296 366 478 488 506 571 669 683 687 730 817
NM MS CT NE UT AR SC OR KS IA AL LA KY MD TN
865 900 1001 1005 1028 1060 1213 1243 1315 1322 1342 1370 1392 1395 1782
OK MA VA IN CO AZ MN WI MO GA WA NJ NC PA MI
1798 1845 1858 1874 1891 2055 2112 2240 2282 2300 2334 2566 2639 2903 3329
OH IL FL NY TX CA
3500 3983 4004 4757 8774 10026
Note. It would be interesting to see the number of public school per state and compare it to the current digital opportunity status of these regions.
table(public_school$Urban.centric.Locale)
City Rural Suburb Town
25897 26370 30087 12265
table(public_school$Title.I.School.Status)
1-Title I targeted assistance eligible school-No program
5271
2-Title I targeted assistance school
9311
3-Title I schoolwide eligible-Title I targeted assistance program
2508
4-Title I schoolwide eligible school-No program
6155
5-Title I schoolwide school
46136
6-Not a Title I school
24429
Unknown
809
Aggregate the types of Locale by State and Provide a Count
# Collapse the data by State and their locale
public_school %>%
group_by(ST, Urban.centric.Locale) %>%
summarize(locale = n())
`summarise()` has grouped output by 'ST'. You can override using the `.groups` argument.
Review the Title 1 information by state
public_school %>%
group_by(ST, Title.I.School.Status) %>%
summarize(each_count = n())
`summarise()` has grouped output by 'ST'. You can override using the `.groups` argument.
We can see that some states have more reported categories of Title I status
public_school %>%
group_by(ST) %>%
summarize(total_enrollment = sum(Members))
Group selected variables by state level and calculate a percentage of that information or a total count
# Create a new dataframe with state level information
schools_state <-
public_school %>%
group_by(ST) %>%
summarise(Num_of_school = n(), Total_enrollment = sum(Members),
male = sum(Male, na.rm = TRUE), Female = sum(Female, na.rm = TRUE))
Beware that some schools have missing values for one of the gender and the use of sum() of those return a NA value. So we have to use a different method by adding na.rm = True.
head(schools_state) #There are 51 rows
public_school %>%
group_by(ST) %>%
summarise(Urbanicity = n_distinct(Urban.centric.Locale))
# This method gives me the count of categories, but not how many within the category
t1 <- public_school %>%
group_by(ST, Urban.centric.Locale) %>%
summarise(Locale_count = n())
`summarise()` has grouped output by 'ST'. You can override using the `.groups` argument.
t1
t1<- spread(t1, Urban.centric.Locale, Locale_count)
t1
#Row number 8 and row 40 in the t1 dataframe
t1[8, 4] <- 0
t1[8, 5] <- 0
t1[40, 5] <- 0
t1
#Create a total column
t1$total <- t1$City + t1$Suburb + t1$Town + t1$Rural
#Replace each locale by its own percentage
#Create a list first to check if calculation are done correctly
t1$City <- t1$City/t1$total
#Replace each locale by its own percentage
t1$Suburb <- t1$Suburb/t1$total
t1$Town <- t1$Town/t1$total
t1$Rural <- t1$Rural/t1$total
t1
Note. The percentage of each locale by school types can also be used as a comparison to digital opportunity.
t2 <- public_school %>%
group_by(ST, Title.I.School.Status) %>%
summarise(count = n())
`summarise()` has grouped output by 'ST'. You can override using the `.groups` argument.
head(t2)
levels(t2$Title.I.School.Status)
[1] "1-Title I targeted assistance eligible school-No program"
[2] "2-Title I targeted assistance school"
[3] "3-Title I schoolwide eligible-Title I targeted assistance program"
[4] "4-Title I schoolwide eligible school-No program"
[5] "5-Title I schoolwide school"
[6] "6-Not a Title I school"
[7] "Unknown"
levels(t2$Title.I.School.Status) <- c("TAE_No", "TAE", "SW_TAE", "SW_No", "SW", "NOT_TitleI", "Unknown")
head(t2)
t2 <- spread(t2, Title.I.School.Status, count)
t2
Since we have to perform calculation, we will impute all NA as zero
t2[is.na(t2)] = 0
t2$total <- rowSums(t2[,2:8])
t2
t2$Unknown <- t2$Unknown/t2$total
t2$TAE_No <- t2$TAE_No/t2$total
t2$TAE <- t2$TAE/t2$total
t2$SW_TAE <- t2$SW_TAE/t2$total
t2$SW_No <-t2$SW_No/t2$total
t2$SW <- t2$SW/t2$total
t2$NOT_TitleI <- t2$NOT_TitleI/t2$total
Verify the calculation
#Confirming the percentages are calculated correctly
public_school %>%
group_by(ST, Title.I.School.Status) %>%
summarise(each_count = n()) %>%
mutate(Title1_percent = each_count/sum(each_count))
`summarise()` has grouped output by 'ST'. You can override using the `.groups` argument.
Merge schools_state with urbanciity/rurality table
schools_state <- left_join(schools_state, t1, by = "ST")
head(schools_state)
#We can drop the total count for urbanicity/rurality -column index 10
schools_state <- schools_state[-c(10)]
head(schools_state)
Merge schools_state with Title 1 info
#Append the title 1 information by each state
schools_state <- left_join(schools_state, t2, by = "ST")
head(schools_state)
#Again, drop the total count column , index 17
schools_state <- schools_state[-c(17)]
head(schools_state)