rm(list = ls())
library(utf8)
library(cli)
library(crayon)
library(dplyr)
library(tidyr)
library(lubridate)
library(readxl)
library(sas7bdat)*N = 735,228
# when updating, change date range in line ~37
cases <- read.csv("inputs/hhs.protect.cases.091220.csv")
nrow(cases) # N = 3,193 counties (cumulative cases stored in columns)[1] 3193
[1] 238
*N = 238
# calculate last day of dataset
data.day.1 <- ymd("2020/01/22")
data.day.1 + ncol(cases.org) - 4 - 1 # last date = 2020-09-11[1] "2020-09-11"
colnames(cases.org)[1] <- "county.fips"
cases.org$county.fips <- as.character(cases.org$county.fips)
# add leading zero to county.fips with <6 digits
nrow(cases.org) # N = 3,193[1] 3193
[1] 2869
[1] 324
cases.org[nchar(cases.org$county.fips)==4, ]$county.fips <-
paste0("0", cases.org[nchar(cases.org$county.fips)==4, ]$county.fips)
nrow(cases.org[nchar(cases.org$county.fips)==5, ]) # N = 3,193[1] 3193
# convert from wide to long
cases.long <- cases.org %>%
gather(key=date, n.cases, X1.22.2020:X9.11.2020) %>%
group_by(county.fips) %>%
mutate(lag.n.cases = dplyr::lag(n.cases)) %>%
mutate(daily.n.cases = n.cases - lag.n.cases)
cases.long# import county population data
county.p <- read.sas7bdat("inputs/uscounties2.sas7bdat", debug = FALSE)
nrow(county.p) # N = 3,142[1] 3142
colnames(county.p)[3] <- "pop.2019"
colnames(county.p)[5] <- "county.fips"
colnames(county.p)[6] <- "msa"
county.p#temp <- county.p %>% dplyr::select(County, State, county.fips, pop.2019)
#write.csv(temp, "all.county.fips.codes.csv")county.pop <- county.p
# data cleaning
county.pop$county.fips <- as.character(county.pop$county.fips)
# add leading zeros to FIPs
nrow(county.pop) # N = 3,142[1] 3142
[1] 2826
[1] 316
county.pop[nchar(county.pop$county.fips)==4, ]$county.fips <-
paste0("0",county.pop[nchar(county.pop$county.fips)==4, ]$county.fips)
nrow(county.pop[nchar(county.pop$county.fips)==5, ]) # N = 3,193[1] 3142
# join case and population data
# calculate daily incidence / 100k population
incidence.by.county <- cases.long %>%
left_join(county.pop, by = "county.fips") %>%
mutate(n.case.per.100k = ((daily.n.cases/pop.2019) * 100000))
incidence.by.county# calculate moving average over 7 days, as: average of cases over 7 days / 100k population
# using current day and averages over 6 lagged days
incidence.by.county <- incidence.by.county %>%
group_by(county.fips) %>%
mutate(lag1 = lag(daily.n.cases),
lag2 = lag(daily.n.cases, 2),
lag3 = lag(daily.n.cases, 3),
lag4 = lag(daily.n.cases, 4),
lag5 = lag(daily.n.cases, 5),
lag6 = lag(daily.n.cases, 6),
lag.avg.7day = ((daily.n.cases + lag1 + lag2 + lag3 + lag4 + lag5 + lag6)/7/pop.2019) * 100000)
# calculate moving average over 7 days, as: avg of cases over 7 days / 100k population
# calculate using current day and 7 leads
incidence.by.county <- incidence.by.county %>%
group_by(county.fips) %>%
mutate(lead1 = lead(daily.n.cases),
lead2 = lead(daily.n.cases, 2),
lead3 = lead(daily.n.cases, 3),
lead4 = lead(daily.n.cases, 4),
lead5 = lead(daily.n.cases, 5),
lead6 = lead(daily.n.cases, 6),
lead.avg.7day = ((daily.n.cases + lead1 + lead2 + lead3 + lead4 + lead5 + lead6)/7/pop.2019) * 100000)
incidence.by.county# calculate moving average over 7 days, as: avg of cases over 7 days / 100k population
# calculate using current day and 7 leads
incidence.by.county <- incidence.by.county %>%
group_by(county.fips) %>%
mutate(avg.7day = ((daily.n.cases + lead1 + lead2 + lead3 + lag1 + lag2 + lag3)/7/pop.2019) * 100000)
incidence.by.county# clean up data
incidence.by.county$date <- gsub("X", "", incidence.by.county$date)
incidence.by.county$date <- mdy(incidence.by.county$date)
# exclude cases that were not allocated to a county
# n = 50 county.fips codes designated for "unallocated" cases, for each state
incidence.by.county %>%
ungroup() %>%
select(county.fips) %>%
distinct() %>%
summarize('n.counties' = n()) # N = 3,193[1] 747162
incidence.by.county <- incidence.by.county[!grepl("Unallocated", incidence.by.county$county), ]
nrow(incidence.by.county) # N = 722,890 ***(N = 735,462?)***[1] 735462
incidence.by.county %>%
ungroup() %>%
select(county.fips) %>%
distinct() %>%
summarize ('n.counties' = n()) # N = 3,143# there is one county.fips code with no county name or data
incidence.by.county <- incidence.by.county %>%
filter(county.fips != "02270")
nrow(incidence.by.county) # N = 722,660 ***(N = 735,228?)***[1] 735228
incidence.by.county %>%
ungroup() %>%
select(county.fips) %>%
distinct() %>%
summarize ('n.counties' = n()) # N = 3,192 ***(N = 3,142?)***# import data
hs.counties <- read_excel("inputs/Emerging Counties Processed 2020-09-09.xlsx", sheet = "History")
nrow(hs.counties) # N = 13,726 for 9/9/2020 data[1] 13726
tibble [13,726 x 10] (S3: tbl_df/tbl/data.frame)
$ Data Date : POSIXct[1:13726], format: "2020-03-08" "2020-03-09" ...
$ FIPS : num [1:13726] 53033 53033 36119 53033 36119 ...
$ County : chr [1:13726] "King County" "King County" "Westchester County" "King County" ...
$ State : chr [1:13726] "WA" "WA" "NY" "WA" ...
$ FEMA Region : num [1:13726] 10 10 2 10 2 10 9 2 10 9 ...
$ CBSA Code : num [1:13726] 42660 42660 35620 42660 35620 ...
$ CBSA : chr [1:13726] "Seattle-Tacoma-Bellevue, WA" "Seattle-Tacoma-Bellevue, WA" "New York-Newark-Jersey City, NY-NJ-PA" "Seattle-Tacoma-Bellevue, WA" ...
$ Last on List - Data Date: POSIXct[1:13726], format: NA "2020-03-08" ...
$ Days Since Last on List : num [1:13726] NA 1 NA 1 1 1 NA 1 1 1 ...
$ New by CDC Definition : logi [1:13726] TRUE FALSE TRUE FALSE FALSE FALSE ...
# convert to date format
colnames(hs.counties)[1] <- "data.date"
hs.counties$data.date <- ymd(hs.counties$data.date)
# convert fips code to character
colnames(hs.counties)[2] <- "county.fips"
hs.counties$county.fips <- as.character(hs.counties$county.fips)
# add leading zeros
nrow(hs.counties) # N = 13,560 ***(N = 13,726?)***[1] 13726
[1] 11640
nrow(hs.counties[nchar(hs.counties$county.fips)==4, ]) # N = 2,078, ***(N = 2,086?) counties with dropped leading zeros in fips codes[1] 2086
hs.counties[nchar(hs.counties$county.fips)==4, ]$county.fips <- paste0("0", hs.counties[nchar(hs.counties$county.fips)==4, ]$county.fips)
nrow(hs.counties[nchar(hs.counties$county.fips)==5, ]) # N = 13,560 ***(N = 13,726?)***[1] 13726
# convert date format
colnames(hs.counties)[10] <- "new.cdc"
# add indicator for newly identified HS
hs.counties$new.hs <- 0
hs.counties[!is.na(hs.counties$new.cdc) & hs.counties$new.cdc == TRUE, "new.hs"] <- 1
# add indicator variable to differentiate HS counties after merge
hs.counties$hs.status <- 1
# limit dataset to fips code and hs status
hs.counties <- hs.counties %>%
select(county.fips, data.date, hs.status, new.cdc, new.hs)# import data
ihe.data <- read_excel("inputs/ihe.hotspots.abstraction.091320.xlsx", sheet = "university.sample.082620")
nrow(ihe.data) # N = 152 for 9/13/2020 data[1] 152
# convert fips to character, add leading zeros
ihe.data$county.fips <- as.character(ihe.data$county.fips)
# add a leading zero to any county.fips that only have 6 digits
nrow(ihe.data) # N = 152[1] 152
[1] 119
[1] 33
ihe.data[nchar(ihe.data$county.fips)==4, ]$county.fips <-
paste0("0", ihe.data[nchar(ihe.data$county.fips)==4, ]$county.fips)
nrow(ihe.data[nchar(ihe.data$county.fips)==5, ]) # N = 152[1] 152
# add variable indicating IHE after merge
ihe.data$ihe.status <- 1
# recast dates
ihe.data$open.date <- ymd(ihe.data$open.date)
# exclude observations
ihe.data <- ihe.data %>%
filter(include == 1)
nrow(ihe.data) # N = 149[1] 149
# finalize dataset for use in next step
#ihe.data <- ihe.data %>%
# select(ihe.status, unitid, county.fips, include, open.date, open.type)
#str(ihe.data)
#nrow(ihe.data) # N = 149#ihe.data <- ihe.data %>%
# group_by(county.fips) %>%
# filter(open.date == min(open.date))
#nrow(ihe.data) # N = 134 (16 were dropped because they shared a county with another school)
## there is one county w 2 IHEs with identical open.dates and open.type: 06037
#ihe.data <- ihe.data %>%
# group_by(county.fips) %>%
# slice(1)
#nrow(ihe.data) # N = 133, (1 more dropped for a total of 17)
#sum(ihe.data$ihe.status)# join the two datasets
all.county.data <- left_join(county.pop, ihe.data, by = "county.fips")
# assign ihs=0 to counties with no IHE
all.county.data[is.na(all.county.data$ihe.status), "ihe.status"] <- 0
# get median and set as open.date for ihe==0 counties
median.date <- median(all.county.data$open.date, na.rm = TRUE)
all.county.data$open.date.v1 <- all.county.data$open.date # copy over IHE county open.dates
all.county.data[all.county.data$ihe.status == 0, "open.date.v1"] <- median.date # replace NA with median value
# drop the include variable
all.county.data <- all.county.data %>%
select(-matches("include"))[1] 149
all.county.data %>%
select(county.fips) %>%
distinct() %>%
summarize ('n.counties' = n()) # N = 133 ***(N = 3,142?)***
2020-07-27 2020-08-10 2020-08-12 2020-08-17 2020-08-18 2020-08-19 2020-08-20
1 3 1 23 1 7 4
2020-08-22 2020-08-24 2020-08-25 2020-08-26 2020-08-27 2020-08-29 2020-08-31
4 52 3 6 1 2 10
2020-09-01 2020-09-02 2020-09-07 2020-09-08 2020-09-09 2020-09-14 2020-09-16
5 7 1 2 2 1 1
2020-09-19 2020-09-23 2020-09-27 2020-09-28 2020-09-29 2020-09-30
1 2 1 6 1 1
### write file to send to GRASP (and in general)
#write.csv(all.county.data, "outputs/ihe.hotspots.all.counties.csv")
write.csv(all.county.data, "outputs/ihe.hotspots.all.counties2.csv")# drop msa type and population counts, because they'll be brought in with the incidence file
# also drop open.date.v1, since we'll assign that in analysis file based on subsample used in analysis
all.county.data_1 <- all.county.data %>%
select(-matches(c("msa", "pop.2019", "open.date.v1")))# NOTE: merged county info will be exported to GRASP to return matches
# merge incidence and hotspot data
temp <- left_join(incidence.by.county,hs.counties,
by = c("county.fips"="county.fips",
"date" = "data.date"))
# merge incidence/hotspot and ihe abstraction data
temp <- left_join(temp, all.county.data,
by = "county.fips")# simple diagnostics on data
# counties in dataset
a.data %>%
ungroup %>%
select(county.fips) %>%
distinct() %>%
summarize ('n.counties' = n()) # N = 3,142# counties with no IHEs
a.data %>%
ungroup %>%
filter(ihe.status == 0) %>%
select(county.fips) %>%
distinct() %>%
summarize ('n.counties' = n()) # N = 3,009# counties with IHEs
a.data %>%
ungroup %>%
filter(ihe.status == 1) %>%
select(county.fips) %>%
distinct() %>%
summarize ('n.counties' = n()) # N = 133# count by ihe.status
(t0.a <- a.data %>%
select(county.fips, ihe.status) %>%
distinct() %>%
group_by(ihe.status) %>%
summarize(count = n()))# count by open.type
(t0.b <- a.data %>%
select(county.fips, ihe.status, open.type) %>%
distinct() %>%
group_by(ihe.status, open.type) %>%
summarize(count = n()) %>%
tidyr::spread(key = open.type, value = count))[1] "2020-08-26"
[1] "2020-08-24"
# make exclusions
# exclude data from the first part of the year, drop all before June 1st
a.data <- a.data %>%
filter((date >= mdy("6/1/2020")))
nrow(a.data) # N = 311,157 ***(N = 323,626)[1] 325274
# counties in dataset
a.data %>%
ungroup %>%
select(county.fips) %>%
distinct() %>%
summarize ('n.counties' = n()) # N = 3,142# exclude any IHE counties that have open date after cutoff (8/21/2020)
# cut off date is 8/24/2020, -14 will allow for calc 7 day average at day +11
cut.off.date <- mdy("9/11/2020") - 14
# drop IHE counties with open date after cut.off.date
a.data <- a.data %>%
filter(ihe.status == 0 | open.date <= cut.off.date)
# total counties in dataset
a.data %>%
ungroup %>%
select(county.fips) %>%
distinct() %>%
summarize ('n.counties' = n()) # N = 3100? ***(N = 3,110)***# count by ihe.status
(t0.a <- a.data %>%
select(county.fips, ihe.status) %>%
distinct() %>%
group_by(ihe.status) %>%
summarize(count = n()))# count by open.type
(t0.b <- a.data %>%
select(county.fips, ihe.status, open.type) %>%
distinct() %>%
group_by(ihe.status, open.type) %>%
summarize(count = n()) %>%
tidyr::spread(key = open.type, value = count))[1] "2020-08-21"
[1] "2020-08-24"
# 1.0 make any assumptions
# 1.1 establish the open date for any non-IHE counties
median.o.date <- median(a.data$open.date, na.rm = TRUE)
a.data$open.date.v2 <- a.data$open.date # copy over open.date for IHE counties
a.data[a.data$ihe.status == 0, "open.date.v2"] <- median.o.date
# 1.2 assign day 0
a.data$days.from.open.date <- a.data$date - a.data$open.date.v2
# 1.3 remove any data outside observation window
a.data <- a.data %>%
filter(days.from.open.date >= -28)
a.data %>%
ungroup %>%
select(county.fips) %>%
distinct() %>%
summarize ('n.counties' = n()) # N = 3100 ***(N = 3,110***)[1] 146695
# counties with IHEs in MSA = 1
a.data %>%
ungroup %>%
filter(ihe.status == 1) %>%
select(county.fips) %>%
distinct() %>%
summarize ('n.counties' = n()) # N = 91 ***(N = 101)***# counties with IHEs in MSA = 1
a.data %>%
ungroup %>%
filter(ihe.status == 0) %>%
select(county.fips) %>%
distinct() %>%
summarize ('n.counties' = n()) # N = 91 ***(N = 3009)***# set the windows of observation
looks <- c(-21, -14, -7, 0, 7, 14)
# means
(t1.a <- a.data %>%
filter(days.from.open.date %in% looks) %>%
group_by(ihe.status, days.from.open.date) %>%
summarize(avg = mean(avg.7day, na.rm = TRUE)) %>%
tidyr::spread(key = days.from.open.date, value = avg))# medians
(t1.a <- a.data %>%
filter(days.from.open.date %in% looks) %>%
group_by(ihe.status, days.from.open.date) %>%
summarize(median = median(avg.7day, na.rm = TRUE)) %>%
tidyr::spread(key = days.from.open.date, value = median))# counts
(t1.a <- a.data %>%
filter(days.from.open.date %in% looks) %>%
group_by(ihe.status, days.from.open.date) %>%
summarize(count = n()) %>%
tidyr::spread(key = days.from.open.date, value = count))# hotspots, counts
# generate simple before/after variable
a.data$after.day.0 <- a.data$days.from.open.date > 0
# counts of new hotspots
(t2.a <- a.data %>%
group_by(ihe.status, after.day.0) %>%
summarize(count = sum(new.hs, na.rm = TRUE)) %>%
tidyr::spread(key = after.day.0, value = count))# counts of total counties
(t2.b <- a.data %>%
group_by(ihe.status, after.day.0) %>%
distinct(county.fips) %>%
summarize(count = n()) %>%
tidyr::spread(key = after.day.0, value = count))[1] 0.03356597
[1] 0.1089109
[1] 0.05018278
[1] 0.3168317
# means by ihe status and open type
(t1.a <- a.data %>%
filter(days.from.open.date %in% looks) %>%
group_by(ihe.status, open.type, days.from.open.date) %>%
summarize(avg = mean(avg.7day, na.rm = TRUE)) %>%
tidyr::spread(key = days.from.open.date, value = avg))