remove previous vectors
rm(list = ls())
graphics.off()
cat("\014")
libraries
library(foreign)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
library(haven)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
library(skimr)
## Warning: package 'skimr' was built under R version 4.3.3
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.3.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
Working Directory
setwd("~/Yale University/1semester/EPH505/data/codebuild")
getwd()
## [1] "B:/OneDrive/Documents/Yale University/1semester/EPH505/data/codebuild"
# extract data-frames (pre-fix, pipe-install, filter-install)
prefx = read_excel("School Testing Results.xlsx",sheet = 1)
## New names:
## • `Result (mg/L)` -> `Result (mg/L)...3`
## • `Result (mg/L)` -> `Result (mg/L)...5`
pipes = read_excel("School Testing Results.xlsx",sheet = 2)
## New names:
## • `Result (ppb)` -> `Result (ppb)...3`
## • `Result (ppb)` -> `Result (ppb)...7`
filts = read_excel("School Testing Results.xlsx",sheet = 3)
## New names:
## • `Result (ppb)` -> `Result (ppb)...3`
## • `Result (ppb)` -> `Result (ppb)...7`
# convert from tibble to data-frame
prefx = as.data.frame(prefx)
pipes = as.data.frame(pipes)
filts = as.data.frame(filts)
{
# read-data pre-data from file, condition, filter for eisenhower-gym
prefx=read_excel("School Testing Results.xlsx",sheet = 1)
prefx=as.data.frame(prefx)
# read-data post-data from file, condition, filter for eisenhower-gym (i love wininger)
filts = read_excel("School Testing Results.xlsx",sheet = 3)
filts = as.data.frame(filts)
}
## New names:
## New names:
## • `Result (mg/L)` -> `Result (mg/L)...3`
## • `Result (mg/L)` -> `Result (mg/L)...5`
check data corners, overview
{
View(filts)
View(pipes)
View(prefx)
}
prefx data frame
# Convert lead test result column to numeric * 1000 for prefx in ppb
prefx$`LeadResult_ppb` <- as.numeric(as.character((prefx$`Result (mg/L)...3`)))
## Warning: NAs introduced by coercion
prefx$`LeadResult_ppb`<- (prefx$`LeadResult_ppb`) * 1000
#convert copper test result column to numeric * 1000 for ppb
prefx$`CopperResult_ppb` <- as.numeric(as.character(prefx$`Result (mg/L)...5`))
## Warning: NAs introduced by coercion
prefx$CopperResult_ppb <- (prefx$CopperResult_ppb) * 1000
# Check for NAs introduced
prefx_na_count_copper <- sum(is.na(prefx$CopperResult_ppb))
prefx_na_count_lead <- sum(is.na(prefx$`LeadResult_ppb`))
print(paste("Number of NAs introduced for lead", prefx_na_count_lead,
"and copper", prefx_na_count_copper))
## [1] "Number of NAs introduced for lead 3 and copper 157"
pipes data frame
#convert lead test result column to numeric in pipes
pipes$`Result (ppb)...3` <- as.numeric(as.character(pipes$`Result (ppb)...3`))
## Warning: NAs introduced by coercion
pipes$`Result (ppb)...7` <- as.numeric(as.character(pipes$`Result (ppb)...7`))
## Warning: NAs introduced by coercion
pipes_na_count_lead <- sum(is.na(pipes$`Result (ppb)...3`))
pipes_na_count_copper <- sum(is.na(pipes$`Result (ppb)...7`))
print(paste("number of NAs introduced for lead results was",
pipes_na_count_lead, "and for copper was", pipes_na_count_copper))
## [1] "number of NAs introduced for lead results was 1438 and for copper was 529"
filter data frame
#convert lead test result column to numeric in filts
filts$`Result (ppb)...3` <- as.numeric(as.character(filts$`Result (ppb)...3`))
## Warning: NAs introduced by coercion
filts$`Result (ppb)...7` <- as.numeric(as.character(filts$`Result (ppb)...7`))
## Warning: NAs introduced by coercion
filts_na_count_lead <- sum(is.na(filts$`Result (ppb)...3`))
filts_na_count_copper <- sum(is.na(filts$`Result (ppb)...7`))
print(paste("number of NAs introduced for lead results was",
filts_na_count_lead, "and for copper was", filts_na_count_copper))
## [1] "number of NAs introduced for lead results was 2069 and for copper was 1027"
##dates of assessments
maxdate_prefx <- max(prefx$`Sampling Assessment Date`)
mindate_prefx <- min(prefx$`Sampling Assessment Date`)
print(prefx_date_range <- interval(mindate_prefx, maxdate_prefx))
## [1] 2015-10-02 UTC--2016-01-23 UTC
maxdate_filts <- max(filts$`Sampling Assessment Date`)
mindate_filts <- min(filts$`Sampling Assessment Date`)
print(filts_date_range <- interval(mindate_filts, maxdate_filts))
## [1] 2015-10-31 UTC--2016-12-03 UTC
maxdate_pipes <- max(pipes$`Sampling Assessment Date`, na.rm = TRUE)
mindate_pipes <- min(pipes$`Sampling Assessment Date`, na.rm = TRUE)
print(pipes_date_range <- interval(mindate_pipes, maxdate_pipes))
## [1] 2016-04-05 UTC--2016-08-11 UTC
print(paste("the dates of examinations before the fix was", prefx_date_range))
## [1] "the dates of examinations before the fix was 2015-10-02 UTC--2016-01-23 UTC"
print(paste("the range for exams after new pipes were fitted was",
pipes_date_range))
## [1] "the range for exams after new pipes were fitted was 2016-04-05 UTC--2016-08-11 UTC"
print(paste("and the range for exams after new filters was", filts_date_range))
## [1] "and the range for exams after new filters was 2015-10-31 UTC--2016-12-03 UTC"
##number of schools in each data frame
schools_number <- data.frame(
DataSource = c("prefx", "pipes", "filts"),
UniqueCount = c(
length(unique(prefx$SCH_Name)),
length(unique(pipes$SCH_Name)),
length(unique(filts$SCH_Name))
)
)
print(schools_number)
## DataSource UniqueCount
## 1 prefx 16
## 2 pipes 16
## 3 filts 7
admin building data frame, lead danger above 15 ppb.
#dangerous lead results in the admin building data frame
lead_danger <- 15
lead_danger_prefx_admin <-
prefx %>% filter(LeadResult_ppb > lead_danger,
SCH_Name == "Administration Building")
lead_danger_prefx <- prefx%>% filter(LeadResult_ppb > lead_danger)
print(nrow(lead_danger_prefx_admin))
## [1] 4
print(nrow(lead_danger_prefx))
## [1] 435
dangerous lead results data frame and table of schools with dangerous results
#dangerous lead results in all buildings prefx data frame
lead_danger_prefx <- prefx %>% filter(LeadResult_ppb > 15)
#dangerous lead results in all buildings pipes data frame
lead_danger_pipes <- pipes %>% filter(`Result (ppb)...3` > 15)
#dangerous lead results in all buildings filts data frame
lead_danger_filts <- filts %>% filter(`Result (ppb)...3` > 15)
#table of dangerous test results from all schools, all data frames
schools_table_pipes <- data.frame(
UniqueCount = c(table(lead_danger_pipes$SCH_Name))
)
schools_table_prefx <- data.frame(
UniqueCount = c(table(lead_danger_prefx$SCH_Name))
)
schools_table_filts <- data.frame(
UniqueCount = c(table(lead_danger_filts$SCH_Name))
)
#print tables on number of schools and school testing numbers
print(schools_table_prefx)
## UniqueCount
## Administration Building 4
## Brownell STEM Academy 17
## Doyle/Ryder Elementary 34
## Durant Tuuri Mott Elementary 45
## Eagles Nest Academy 4
## Eisenhower Elementary 24
## Flint Schools Central Kitchen 6
## Freeman Elementary 13
## Holmes STEM Academy 23
## Manley School 22
## Michigan School for the Deaf & Learning Resource Center 1
## Neithercut Elementary School 31
## Northwestern High School 67
## Pierce Elementary 46
## Potter Elementary 28
## Southwestern Classical Academy 70
print(schools_table_filts)
## UniqueCount
## Brownell STEM Academy 31
## Durant Tuuri Mott Elementary 68
## Eisenhower Elementary 27
## Freeman Elementary 21
## Holmes STEM Academy 58
## Michigan School for the Deaf & Learning Resource Center 20
print(schools_table_pipes)
## UniqueCount
## Administration Building 6
## Cathedral of Faith Head Start 1
## Central Kitchen 14
## Eagles Nest Academy 4
## Genesee STEM Academy 1
## Mott Early Childhood Learning 4
## New Standard Academy 23
## St. John Vianney School 2
## St. Pius X School 2
## Summerfield Academy 3
dangerous test results for copper in all data frames above 1300 ppb
#dangerous test results for copper in all data frames
copper_danger_prefx <- prefx %>% filter(CopperResult_ppb > 1300)
print(nrow(copper_danger_prefx))
## [1] 61
copper_danger_pipes <- pipes %>% filter(`Result (ppb)...7` > 1300)
print(nrow(copper_danger_pipes))
## [1] 27
copper_danger_filts <- filts %>% filter(`Result (ppb)...7` > 1300)
print(nrow(copper_danger_filts))
## [1] 44
separated a data frame for filter tests that happened after August 3, 2016, since that’s when the last pipe test was done. it’s unclear if the last pipes were replaced after filter tests began, which can influence results.
{
august_filts_dangerous <- filts %>% filter(`Sampling Assessment Date` >
as.Date("2016-08-03"), )
august_filts_dangerous <- filts %>% filter(`Result (ppb)...3` > 15)
}
table of dangerous lead test results from all zip codes, all data frames
#table of dangerous test results from all zip codes, all data frames
lead_danger_pipes_zips <- data.frame(
UniqueCount = c(table(lead_danger_pipes$SCH_Zip))
)
lead_danger_prefx_zips <- data.frame(
UniqueCount = c(table(lead_danger_prefx$SCH_Zip))
)
lead_danger_filts_zips <- data.frame(
UniqueCount = c(table(lead_danger_filts$SCH_zip))
)
print(lead_danger_prefx_zips)
## UniqueCount
## 48503 69
## 48504 89
## 48505 67
## 48506 74
## 48507 136
print(lead_danger_pipes_zips)
## UniqueCount
## 48503 24
## 48504 9
## 48505 24
## 48532 3
print(lead_danger_filts_zips)
## [1] UniqueCount
## <0 rows> (or 0-length row.names)
##number of dangerous lead tests as a box plot
{
boxplot(
lead_danger_prefx$LeadResult_ppb,
lead_danger_filts$`Result (ppb)...3`,
lead_danger_pipes$`Result (ppb)...3`,
august_filts_dangerous$`Result (ppb)...3`,
names = c("pre-fix lead", "post-filters", "post-pipes",
"filters after August 3"),
ylab = "Lead Concentration (ppb)",
main = "dangerous lead test results"
)
}
##number of dangerous lead tests as a bar plot
{
barplot(c(nrow(lead_danger_prefx), nrow(lead_danger_filts),
nrow(lead_danger_pipes), nrow(august_filts_dangerous)),
names.arg = c("pre-fix Lead", "post-filters",
"post-pipes", "filters after august 3"),
ylab = "number of dangerous lead test results",
main = "dangerous lead test results")
}
##number of dangerous lead test results by zipcode as a bar plot
zip_codes_prefx <- c(48503, 48504, 48505, 48506, 48507)
# Add the new columns
lead_danger_prefx_zips <- cbind(lead_danger_prefx_zips,
ZipCode = zip_codes_prefx)
#bar plot lead danger prefx
barplot(lead_danger_prefx_zips$UniqueCount,
names.arg = zip_codes_prefx,
main = "number of dangerous lead test results before fix",
xlab = "zip code")
#bar plot lead danger pipes
zip_codes_pipes <- c(48503, 48504, 48505, 48532)
lead_danger_pipes_zips <- cbind(lead_danger_pipes_zips,
ZipCode = zip_codes_pipes)
barplot(lead_danger_pipes_zips$UniqueCount,
names.arg = zip_codes_pipes,
main = "number of dangerous lead test results after pipes installed",
xlab = "zip code")
##summary statistics, prefx pipes and filts for copper and lead
summary(prefx$LeadResult_ppb, na.rm = TRUE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 2.00 14.29 9.00 2856.00 3
prefx_lead_sd <- sd(prefx$LeadResult_ppb, na.rm = TRUE)
print(prefx_lead_sd)
## [1] 84.95139
summary(prefx$CopperResult_ppb, na.rm = TRUE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 60.0 130.0 257.9 280.0 23600.0 157
prefx_copper_sd <- sd(prefx$CopperResult_ppb, na.rm = TRUE)
print(prefx_copper_sd)
## [1] 728.1431
summary(pipes$`Result (ppb)...3`, na.rm = TRUE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 1.00 2.00 13.47 6.00 415.00 1438
pipes_lead_sd <- sd(pipes$`Result (ppb)...3`, na.rm = TRUE)
print(pipes_lead_sd)
## [1] 44.38871
summary(pipes$`Result (ppb)...7`)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 50.0 90.0 170.0 248.1 270.0 5880.0 529
pipes_copper_sd <- sd(pipes$`Result (ppb)...7`, na.rm = TRUE)
print(pipes_copper_sd)
## [1] 333.8137
summary(filts$`Result (ppb)...3`)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 2.00 4.00 23.38 9.00 2856.00 2069
filts_lead_sd <- sd(filts$`Result (ppb)...3`, na.rm = TRUE)
print(filts_lead_sd)
## [1] 138.6359
summary(filts$`Result (ppb)...7`)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 50.0 100.0 180.0 301.8 370.0 23600.0 1027
filts_copper_sd <- sd(filts$`Result (ppb)...7`, na.rm = TRUE)
print(filts_copper_sd)
## [1] 705.2229
mean changes when 0 values in the prefx data frame are removed and treated as NA’s
zero_count <- prefx %>%
summarise(count_of_zeros = sum(`Result (mg/L)...3` == 0, na.rm = TRUE))
print(zero_count)
## count_of_zeros
## 1 912
see number of zeros below
prefx <- prefx %>%
mutate(LeadResult_ppb_NAs = ifelse(LeadResult_ppb == 0, NA, LeadResult_ppb))
summary(prefx$LeadResult_ppb_NAs)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.0 2.0 5.0 20.6 13.0 2856.0 915
summary(prefx$`LeadResult_ppb`)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 2.00 14.29 9.00 2856.00 3