tidyDisasters and tidyYRBS
Catalina Cañizares, MSc; Mark J. Macgowan, PhD; Gabriel Odom, PhD, ThD
tidyDisasters package
tidyYRBS package
Growing evidence suggests that surviving a natural disaster or a terrorist attack increases the likelihood of developing mental health disorders and disrupted psychological functioning, and may increase the likelihood of suicidality (Fitzpatrick & Spialek, 2020; Tang et al., 2018; Zakour, 2019).
FEMA
ID:
2ece821d-39e1-40a1-9556-31ab914686fa
Variables of interest:
State
County
Start date
End date
Disaster type
Challenge:
No way to filter by the number of people killed
IDs carry no information
EMDAT
ID:
1990-0707-USA
Variables of interest:
State
County
Start date
End date
Disaster type
Number of people killed
Number of people wounded
Challenge:
States and counties are smashed into a single cell, and locations and end dates are missing.
GTD
ID:
201606120001
Variables of interest:
State
County
Start date
End date
Disaster type
Number of people killed
Number of people wounded
FEMA: 60,499 × 6
# A tibble: 7 × 8
id disas…¹ state place…² disas…³ incidentBeginDate incidentEndDate
<chr> <dbl> <chr> <dbl> <chr> <dttm> <dttm>
1 2ece821… 852 WA 99067 DR 1990-01-06 00:00:00 1990-01-14 00:00:00
2 7ccbd3b… 852 WA 99005 DR 1990-01-06 00:00:00 1990-01-14 00:00:00
3 2768b59… 852 WA 99069 DR 1990-01-06 00:00:00 1990-01-14 00:00:00
4 fd19e41… 852 WA 99053 DR 1990-01-06 00:00:00 1990-01-14 00:00:00
5 45c152c… 852 WA 99033 DR 1990-01-06 00:00:00 1990-01-14 00:00:00
6 0b84703… 852 WA 99027 DR 1990-01-06 00:00:00 1990-01-14 00:00:00
7 8341af9… 852 WA 99041 DR 1990-01-06 00:00:00 1990-01-14 00:00:00
# … with 1 more variable: incidentType <chr>, and abbreviated variable names
# ¹disasterNumber, ²placeCode, ³disasterType
EMDAT: 2,766 × 12
# A tibble: 1 × 11
`Dis No` Disast…¹ Locat…² Start…³ Start…⁴ Start…⁵ End Y…⁶ End M…⁷ End D…⁸
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1990-0707-USA Flood Washin… 1990 1 9 1990 1 11
# … with 2 more variables: `Total Deaths` <dbl>, `No Injured` <dbl>, and
# abbreviated variable names ¹`Disaster Type`, ²Location, ³`Start Year`,
# ⁴`Start Month`, ⁵`Start Day`, ⁶`End Year`, ⁷`End Month`, ⁸`End Day`
The Location Vector
[1] "Washington, Oregon, Idaho, Montana, Wyoming, Colorado"
The GTD: 39 × 12
# A tibble: 1 × 8
eventid iyear imonth iday provstate city nkill nwound
<dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 201606120001 2016 6 12 Florida Orlando 50 53
# Example of separation and unnecessary information
emdatWrangled1_df %>%
  select(event_id, Location) %>%
  mutate(LocationOrig = Location) %>%
  separate_rows(Location, sep = ",|;") %>%
  mutate(Location = str_to_lower(Location)) %>%
  mutate(
    Location = str_replace(
      Location,
      pattern = "provinces", replacement = "province"
    )
  ) %>%
  mutate(
    Location = str_remove(Location, pattern = "province")
  ) %>%
  mutate(Location = str_trim(Location, side = "both")) %>%
  mutate(
    Location = str_remove(Location, pattern = " \\)")
  )
# Mapped the states to state abbreviations
emdatLocWrangled1_df <-
  emdatWrangled1_df %>%
  select(event_id, Location) %>%
  mutate(LocationOrig = Location) %>%
  mutate(
    # replacePattern1_char is a dictionary created previously
    Location = str_replace_all(Location, replacePattern1_char)
  ) %>%
  mutate(Location = str_to_lower(Location))

emdatLocWrangled2_df <-
  emdatLocWrangled1_df %>%
### Spelling issues ###
mutate(
Location = str_replace_all(
Location, pattern = "georgie", replacement = "georgia"
),
Location = str_replace_all(
Location, pattern = "virginie", replacement = "virginia"
),
Location = str_replace_all(
Location, pattern = "virgina", replacement = "virginia"
),
Location = str_replace_all(
Location, pattern = "caroline du nord", replacement = "north carolina"
),
Location = str_replace_all(
Location, pattern = "north craolina", replacement = "north carolina"
),
Location = str_replace_all(
Location, pattern = "nth.carolina", replacement = "north carolina"
),
Location = str_replace_all(
Location, pattern = "norh carolina", replacement = "north carolina"
),
Location = str_replace_all(
Location, pattern = "californie", replacement = "california"
),
Location = str_replace_all(
Location, pattern = "louisiane", replacement = "louisiana"
),
Location = str_replace_all(
Location, pattern = "lousiana", replacement = "louisiana"
),
# MS smart quotes strike again
Location = str_replace_all(
Location, pattern = "hawai‘i", replacement = "hawaii"
),
Location = str_replace_all(
Location, pattern = "hawai", replacement = "hawaii"
),
Location = str_replace_all(
Location, pattern = "hawaiii", replacement = "hawaii"
),
Location = str_replace_all(
Location, pattern = "okhlahoma", replacement = "oklahoma"
),
Location = str_replace_all(
Location, pattern = "ohlahoma", replacement = "oklahoma"
),
Location = str_replace_all(
Location, pattern = "okhahoma", replacement = "oklahoma"
),
Location = str_replace_all(
Location, pattern = "washigton", replacement = "washington"
),
Location = str_replace_all(
Location, pattern = "wsahington", replacement = "washington"
),
Location = str_replace_all(
Location, pattern = "mississipi", replacement = "mississippi"
),
Location = str_replace_all(
Location, pattern = "mississppi", replacement = "mississippi"
),
Location = str_replace_all(
Location, pattern = "mssissippi", replacement = "mississippi"
),
Location = str_replace_all(
Location, pattern = "pennsylvannia", replacement = "pennsylvania"
),
Location = str_replace_all(
Location, pattern = "pensylvania", replacement = "pennsylvania"
),
Location = str_replace_all(
Location, pattern = "pennsylvanie", replacement = "pennsylvania"
),
Location = str_replace_all(
Location, pattern = "pennyslvania", replacement = "pennsylvania"
),
Location = str_replace_all(
Location, pattern = "nebarska", replacement = "nebraska"
),
Location = str_replace_all(
Location, pattern = "ilinois", replacement = "illinois"
),
Location = str_replace_all(
Location, pattern = "new mexique", replacement = "new mexico"
),
Location = str_replace_all(
Location, pattern = "missourri", replacement = "missouri"
),
Location = str_replace_all(
Location, pattern = "arkanasas", replacement = "arkansas"
),
Location = str_replace_all(
Location, pattern = "arkensas", replacement = "arkansas"
),
Location = str_replace_all(
Location, pattern = "tannesse", replacement = "tennessee"
),
Location = str_replace_all(
Location, pattern = "tennesse", replacement = "tennessee"
),
Location = str_replace_all(
Location, pattern = "tennesseee", replacement = "tennessee"
),
Location = str_replace_all(
Location, pattern = "kensas", replacement = "kansas"
),
Location = str_replace_all(
Location, pattern = "hansas", replacement = "kansas"
),
Location = str_replace_all(
Location, pattern = "massachusets", replacement = "massachusetts"
),
Location = str_replace_all(
Location, pattern = "massachussetts", replacement = "massachusetts"
),
Location = str_replace_all(
Location, pattern = "massachussets", replacement = "massachusetts"
),
Location = str_replace_all(
Location, pattern = "minissotta", replacement = "minnesota"
),
Location = str_replace_all(
Location, pattern = "minnessota", replacement = "minnesota"
),
Location = str_replace_all(
Location, pattern = "indianaa", replacement = "indiana"
),
Location = str_replace_all(
Location, pattern = "indiania", replacement = "indiana"
),
Location = str_replace_all(
Location, pattern = "montania", replacement = "montana"
),
Location = str_replace_all(
Location, pattern = "rhode islands", replacement = "rhode island"
),
Location = str_replace_all(
Location, pattern = "sth.carolina", replacement = "SC"
),
Location = str_replace_all(
Location, pattern = "floride", replacement = "florida"
),
Location = str_replace_all(
Location, pattern = "new jesey", replacement = "new jersey"
),
Location = str_replace_all(
Location, pattern = "south dacota", replacement = "south dakota"
)
  ) %>%
  # Added states where only the city was mentioned
  mutate(
Location = case_when(
event_id == "1991-0426-USA" ~ "oakland (CA)",
TRUE ~ Location
),
Location = case_when(
event_id == "1994-0149-USA" ~
"los angeles (CA)",
TRUE ~ Location
),
Location = case_when(
event_id == "1995-0288-USA" ~
"atlanta (GA), detroit (MI), philadelphia (PA), chicago (IL), milwaukee (WI)",
TRUE ~ Location
),
Location = case_when(
event_id == "1995-0073-USA" ~
"los angeles (CA), dallas (TX), fort worth (TX), NM",
TRUE ~ Location
    )
  )

### Remove Directional Words ###
# Eliminate the words "southern", "south", "eastern", "north", "west",
# "western", "area", "near", "areas", "northern"
emdatLocWrangled3_df <-
  emdatLocWrangled2_df %>%
mutate(
Location = str_remove_all(Location, pattern = "central")
) %>%
mutate(
Location = str_remove_all(Location, pattern = "southern")
) %>%
mutate(
Location = str_remove_all(Location, pattern = "south")
) %>%
mutate(
Location = str_remove_all(Location, pattern = "eastern")
) %>%
mutate(
Location = str_remove_all(Location, pattern = "east")
# Replaced New England with the actual states
mutate (
Location = str_replace_all(
Location, pattern = "new england",
replacement = "RI, CT, MA, NH, VT, ME"
)
  ) %>%
  # Split state abbreviations from regions
  # (inProgress below is assumed to be a logical flag, created in an earlier
  # step, marking rows whose Location still mixes "ST" abbreviations with
  # free-text region names)
  rowwise() %>%
mutate(
State = case_when(
!inProgress ~ Location,
inProgress ~ paste(
unlist(
str_extract_all(
Location, pattern = "[A-Z]{2}"
)
),
collapse = ", "
)
)
) %>%
mutate(
Region = case_when(
!inProgress ~ NA_character_,
inProgress ~ paste(
unlist(
str_remove_all(
Location, pattern = "[A-Z]{2}"
)
),
collapse = ", "
)
)
) %>%
  ungroup()

From here:
# A tibble: 762 × 2
`Dis No` Location
<chr> <chr>
1 1990-0346-USA Oklahoma, Kansas, Nebraska, Missouri, Iowa, Illinois, Alabama,…
2 1990-0357-USA Texas, Oklahoma, Arkansas, Virginie, Caroline du Nord
3 1990-0359-USA Texas, Oklahoma, Arkansas, Kansas, Missouri, Illinois, Indiana…
4 1990-0360-USA Nebraska, Iowa, Kansas, Missouri, Texas, Oklahoma, Arkansas
5 1990-0363-USA Colorado, Kansas, Missouri, Indiana, Kentucky
6 1990-0350-USA Texas
7 1990-0353-USA Oklahoma
8 1990-0355-USA Texas, Kansas, Louisiane, Alabama, Georgie, Caroline du Sud
9 1990-0362-USA Indiana, Illinois, Ohio, Kansas, Kentucky
10 1990-0707-USA Washington, Oregon, Idaho, Montana, Wyoming, Colorado
# … with 752 more rows
To here:
# A tibble: 1,369 × 3
event_id State County
<chr> <chr> <chr>
1 1990-0073-USA OH belmont
2 1991-0192-USA CA los angeles
3 1991-0218-USA NY richmond
4 1991-0426-USA CA alameda
5 1992-0120-USA CA humboldt
6 1992-0215-USA CA san bernardino
7 1992-0271-USA HI kauai
8 1992-0271-USA HI honolulu
9 1993-0126-USA NE boyd
10 1993-0126-USA NE butler
# … with 1,359 more rows
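The county-level rows above come from strings such as "los angeles (CA)". A minimal sketch of how such strings could be split, assuming the "place (ST)" convention seen earlier (the package's actual code may differ):

library(dplyr)
library(stringr)

# Hypothetical example row following the "place (ST)" convention
loc_df <- tibble(event_id = "1991-0192-USA", Location = "los angeles (CA)")

loc_df %>%
  mutate(
    # Pull the two-letter state abbreviation out of the parentheses
    State = str_extract(Location, "(?<=\\()[A-Z]{2}(?=\\))"),
    # Whatever remains, trimmed, is the place name
    County = str_trim(str_remove(Location, "\\s*\\([A-Z]{2}\\)"))
  )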
###### Pivot the dates to make the ranges long ################################
emdatclean_final <-
  emdatLocClean3_df %>%
  mutate(diff = end_date - begin_date) %>%
  group_by(region_id) %>%
  mutate(Day = list(seq(begin_date, end_date, by = "day"))) %>%
  unnest(Day) %>%
  ungroup() %>%
  select(Day, diff, begin_date, end_date, everything())
# We have some events where the end happens before the beginning
femaRaw1_df %>%
  mutate(duration = end_date - begin_date) %>%
  filter(duration < 0)
femaRaw2_df <-
  femaRaw1_df %>%
  mutate(
    # Hurricane Earl: https://www.fema.gov/disaster/1241
    begin_date = case_when(
      eventid == "DR-1241-FL" ~ as_date("1998-09-03"),
      TRUE ~ begin_date
    ),
    end_date = case_when(
      eventid == "DR-1241-FL" ~ as_date("1998-09-04"),
      TRUE ~ end_date
    )
  )

### Imputation by Mean ###
ImputeMean <- function(x) {
  replace(x, is.na(x), mean(x, na.rm = TRUE))
}

# A tibble: 16,167 × 4
Day diff begin_date end_date
<date> <drtn> <date> <date>
1 1990-06-21 0 days 1990-06-21 1990-06-21
2 1990-07-01 27 days 1990-07-01 1990-07-28
3 1990-07-02 27 days 1990-07-01 1990-07-28
4 1990-07-03 27 days 1990-07-01 1990-07-28
5 1990-07-04 27 days 1990-07-01 1990-07-28
6 1990-07-05 27 days 1990-07-01 1990-07-28
7 1990-07-06 27 days 1990-07-01 1990-07-28
8 1990-07-07 27 days 1990-07-01 1990-07-28
9 1990-07-08 27 days 1990-07-01 1990-07-28
10 1990-07-09 27 days 1990-07-01 1990-07-28
# … with 16,157 more rows
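ImputeMean can then be applied column-wise. A minimal sketch, using the GTD casualty columns nkill and nwound (gtdRaw_df is a hypothetical name for the raw GTD table):

library(dplyr)

# Impute missing casualty counts with each column's mean
gtdImputed_df <- gtdRaw_df %>%
  mutate(across(c(nkill, nwound), ImputeMean))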
36,224,806 × 5
# A tibble: 14 × 5
Day state county femaID emdatID
<date> <chr> <chr> <chr> <chr>
1 2008-04-04 TX san saba EM-3284-TX_43119 2008-0131-USA_0011
2 2008-04-08 TX crosby EM-3284-TX_11121 2008-0131-USA_0011
3 1998-06-07 FL manatee DR-1223-FL_1011 1998-0185-USA_0001
4 2008-04-15 TX brazos EM-3284-TX_2424 2008-0131-USA_0011
5 2005-12-28 TX san saba DR-1624-TX_47129 2005-0724-USA_0001
6 2008-04-04 TX guadalupe EM-3284-TX_19500 2008-0131-USA_0011
7 2006-01-01 TX harris DR-1624-TX_22625 2005-0724-USA_0001
8 2005-12-29 TX swisher DR-1624-TX_50469 2005-0724-USA_0001
9 2008-04-03 TX jeff davis EM-3284-TX_25413 2008-0131-USA_0011
10 2011-06-05 TX mason DR-1999-TX_9514 2011-0639-USA_0003
11 2008-04-02 TX ochiltree EM-3284-TX_40318 2008-0131-USA_0011
12 2008-03-19 TX jones EM-3284-TX_26062 2008-0131-USA_0011
13 2011-04-24 TX wheeler DR-1999-TX_13219 2011-0140-USA_0001
14 2008-03-21 TX williamson EM-3284-TX_50365 2008-0131-USA_0011
This key contains the beginning year, month, state, and a ticker to differentiate events with the same year, month and state.
# A tibble: 7 × 4
femaID emdatID smashedID eventKey
<chr> <chr> <chr> <chr>
1 DR-852-WA_001 1990-0707-USA_0001 DR-852-WA_1990-0707-USA_0001 1990_01_WA_02
2 DR-852-WA_002 1990-0707-USA_0001 DR-852-WA_1990-0707-USA_0001 1990_01_WA_02
3 DR-852-WA_003 1990-0707-USA_0001 DR-852-WA_1990-0707-USA_0001 1990_01_WA_02
4 DR-852-WA_004 1990-0707-USA_0001 DR-852-WA_1990-0707-USA_0001 1990_01_WA_02
5 DR-852-WA_005 1990-0707-USA_0001 DR-852-WA_1990-0707-USA_0001 1990_01_WA_02
6 DR-852-WA_006 1990-0707-USA_0001 DR-852-WA_1990-0707-USA_0001 1990_01_WA_02
7 DR-852-WA_007 1990-0707-USA_0001 DR-852-WA_1990-0707-USA_0001 1990_01_WA_02
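A minimal sketch of how such a key might be assembled from the merged events (the toy table, column names, and the dense_rank ticker are illustrative assumptions, not necessarily the package's construction):

library(dplyr)
library(lubridate)

# Toy stand-in for the merged FEMA/EM-DAT event table
events_df <- tibble(
  event_id   = c("DR-852-WA_001", "1990-0707-USA_0001", "DR-853-WA_001"),
  state      = c("WA", "WA", "WA"),
  begin_date = as_date(c("1990-01-06", "1990-01-06", "1990-01-20"))
)

events_df %>%
  mutate(yms = sprintf("%d_%02d_%s", year(begin_date), month(begin_date), state)) %>%
  group_by(yms) %>%
  # Ticker: differentiates events sharing the same year, month, and state
  mutate(eventKey = paste0(yms, "_", sprintf("%02d", dense_rank(event_id)))) %>%
  ungroup() %>%
  select(event_id, eventKey)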
FEMA and EMDAT had different ways of categorizing and naming the types of natural disasters. Therefore, we mapped both sets of labels onto a common hazard taxonomy:
# A tibble: 10,412 × 4
eventKey incident_type hazard_cluster hazard_type
<chr> <chr> <chr> <chr>
1 1990_01_MS_01 Severe Storms Precipitation-related METEOROLOGICAL and HYDR…
2 1990_01_OR_02 Flood Flood METEOROLOGICAL and HYDR…
3 1990_01_WA_02 Flood Flood METEOROLOGICAL and HYDR…
4 1990_02_AL_01 Severe Storms Precipitation-related METEOROLOGICAL and HYDR…
5 1990_02_GA_01 Severe Storms Precipitation-related METEOROLOGICAL and HYDR…
6 1990_02_IL_01 Severe Ice Storm Precipitation-related METEOROLOGICAL and HYDR…
7 1990_02_TN_01 Flood Flood METEOROLOGICAL and HYDR…
8 1990_03_AL_02 Severe Storms Precipitation-related METEOROLOGICAL and HYDR…
9 1990_03_FL_01 Flood Flood METEOROLOGICAL and HYDR…
10 1990_03_NY_01 Snow Precipitation-related METEOROLOGICAL and HYDR…
# … with 10,402 more rows
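One way to express such a mapping is a hand-built lookup table joined onto the events. A minimal sketch using values from the table above (the join itself is an illustration; events_df is a hypothetical table with one incident_type per eventKey):

library(dplyr)

hazard_lookup <- tribble(
  ~incident_type,     ~hazard_cluster,
  "Flood",            "Flood",
  "Severe Storms",    "Precipitation-related",
  "Severe Ice Storm", "Precipitation-related",
  "Snow",             "Precipitation-related"
)

events_df %>%
  left_join(hazard_lookup, by = "incident_type")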
tidyDisasters Package
- allKeys_df
- disastCasualties_df
- disastDates_df
- disastLocations_df
- disastTypes_df
- emdat_hazard_cluster_df
- fema_hazard_cluster_df
- hazard_report_df
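With casualties split into their own table, the original FEMA limitation (no way to filter by the number of people killed) disappears. A minimal sketch, assuming a deaths column such as nKill (an assumed name; check the package documentation for the actual column):

library(tidyDisasters)
library(dplyr)

data("disastCasualties_df")

# Keep only events with at least 100 recorded deaths
# (nKill is an assumed column name)
disastCasualties_df %>%
  filter(nKill >= 100)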
tidyDisasters package
# Can we find the largest ice storm since the 90s?
library(tidyDisasters)
library(dplyr)
library(ggplot2)
library(lubridate)
library(gghighlight)
data("disastLocations_df")
data("disastTypes_df")
data("disastDates_df")
disastLocations_df %>%
left_join(disastTypes_df) %>%
left_join(disastDates_df) %>%
mutate(Year = year(eventStart)) %>%
filter(incident_type %in% c("Severe Ice Storm", "Snow")) %>%
group_by(state, county, Year) %>%
summarise(Frozen = n() >= 1L, .groups = "keep") %>%
group_by(Year) %>%
summarise(Count = sum(Frozen)) %>%
ggplot() +
aes(x = Year, y = Count) +
labs(
title = "Number of Counties Affected by Snow Storms Since the 90s",
caption = "Data from the tidyDisasters R Package",
y = "No. Counties Frozen"
) +
scale_x_continuous(breaks = seq(1990,2020,5)) +
theme_classic() +
geom_point() +
  gghighlight(Count > 400)

We were able to plot and find the 1993 “Storm of the Century,” which affected almost 800 counties, and the 1996 blizzard, which affected around 500 counties and is remembered as the greatest snowstorm in terms of the amount of snow that fell.
tidyDisasters package
# Are disasters becoming more prevalent?
data("disastTypes_df")
data("disastDates_df")
disastTypes_df %>%
left_join(disastDates_df) %>%
mutate(Year = year(eventStart)) %>%
filter(
hazard_cluster %in% c(
"Precipitation-related", " Wind-related",
"Environmental degradation (Forestry)", "Flood", "Convective-related",
"Pressure-related"
)
) %>%
group_by(Year, hazard_cluster) %>%
count(hazard_cluster) %>%
ggplot() +
aes(x = Year, y = n, color = hazard_cluster) +
labs(
title = "Disasters tendency since 1990",
caption = "Data from the tidyDisasters R Package",
y = "Count"
) +
geom_point() +
facet_wrap(~hazard_cluster, scales = "free_y") +
theme_classic() +
theme(legend.position = "none")This plot evidences the trend on six different types of hazard clusters.
tidyYRBS: R and Wrangle
The SPSS syntax provided with the data could not be used because it exceeded the number of characters allowed and generated an error, so the cleaning was rewritten in R.

# A tibble: 235,761 × 6
record is_hopeless suicide_considered suicide_planned suicide_attem…¹ suici…²
<chr> <lgl> <lgl> <lgl> <int> <lgl>
1 120 TRUE FALSE TRUE 0 FALSE
2 121 FALSE FALSE FALSE 0 FALSE
3 123 FALSE FALSE FALSE 0 FALSE
4 125 TRUE TRUE TRUE 3 TRUE
5 126 TRUE TRUE TRUE 6 TRUE
6 127 TRUE TRUE TRUE 1 FALSE
7 5864 TRUE TRUE FALSE 0 FALSE
8 7370 FALSE TRUE TRUE 1 FALSE
9 7371 FALSE FALSE FALSE 0 FALSE
10 9659 TRUE FALSE FALSE 0 FALSE
# … with 235,751 more rows, and abbreviated variable names ¹suicide_attempts,
# ²suicide_injury
# A tibble: 33,634 × 10
record nAlcoholDays nMariju…¹ used_…² used_…³ used_…⁴ used_…⁵ used_…⁶ used_…⁷
<chr> <dbl> <dbl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
1 136 0 0 FALSE FALSE FALSE FALSE FALSE FALSE
2 137 0 0 FALSE FALSE FALSE FALSE FALSE FALSE
3 3976 1.5 4.5 FALSE FALSE FALSE FALSE FALSE FALSE
4 5027 1.5 1.12 TRUE TRUE TRUE TRUE TRUE TRUE
5 5871 1.5 1.12 TRUE TRUE FALSE TRUE TRUE TRUE
6 7382 0 0 FALSE FALSE FALSE FALSE FALSE FALSE
7 7383 0 0 FALSE FALSE FALSE FALSE FALSE FALSE
8 9664 1.5 1.12 FALSE FALSE TRUE FALSE FALSE FALSE
9 9667 0 1.12 FALSE FALSE FALSE FALSE FALSE FALSE
10 9668 7.5 4.5 TRUE TRUE FALSE TRUE TRUE TRUE
# … with 33,624 more rows, 1 more variable: used_injection <lgl>, and
# abbreviated variable names ¹nMarijuanaDays, ²used_prescrip_pain,
# ³used_cocaine, ⁴used_inhalant, ⁵used_heroin, ⁶used_methamphetamine,
# ⁷used_MDMA
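Values such as 1.5 and 7.5 in nAlcoholDays suggest that categorical day-range responses were recoded to their midpoints. A minimal sketch of that idea, with hypothetical category labels:

library(dplyr)

# Hypothetical raw responses for "days of alcohol use"
tibble(q41 = c("0 days", "1 or 2 days", "6 to 9 days")) %>%
  mutate(
    nAlcoholDays = case_when(
      q41 == "0 days"      ~ 0,    # no use
      q41 == "1 or 2 days" ~ 1.5,  # midpoint of 1-2
      q41 == "6 to 9 days" ~ 7.5   # midpoint of 6-9
    )
  )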
tidyYRBS Package
- hs_district
- hs_demographics
- hs_suicide
- hs_substance_use
- clean_yrbs_2019
- raw_yrbs_2019

library(tidyYRBS)
library(dplyr)
library(ggplot2)
data(hs_substance_use)
data(hs_demographics)
the_data <- left_join(hs_demographics, hs_substance_use)
# Weights
the_data_weights <- the_data |>
  srvyr::as_survey_design(
    ids = PSU,
    weights = weight,
    strata = stratum,
    nest = TRUE
  )
# Weighted prevalence (srvyr requires survey_* verbs inside summarise)
alc_by_sex <- the_data_weights |>
  mutate(
    used_alcohol = case_when(
      nAlcoholDays == 0 ~ 0,
      nAlcoholDays > 0 ~ 1
    )
  ) |>
  filter(Grade == 12) |>
  group_by(year, Sex) |>
  summarise(
    prevalence = srvyr::survey_mean(used_alcohol, na.rm = TRUE),
    n = srvyr::unweighted(n())
  ) |>
  filter(!is.na(Sex))
# Unweighted version, computed on the raw data (used for the plot below)
alc_by_sex <- the_data |>
  mutate(
    used_alcohol = case_when(
      nAlcoholDays == 0 ~ 0,
      nAlcoholDays > 0 ~ 1
    )
  ) |>
  filter(Grade == 12) |>
  group_by(year, Sex) |>
  summarise(prevalence = mean(used_alcohol, na.rm = TRUE)) |>
  filter(!is.na(Sex))
library(geomtextpath)  # provides geom_textline()

ggplot(alc_by_sex, aes(year, prevalence, color = Sex, label = Sex)) +
  geom_smooth(alpha = 0.1, size = 0) +
  geom_textline() +
  scale_color_manual(values = c("#420039", "indianred1")) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  scale_x_continuous(breaks = seq(1990, 2020, 2)) +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_y_continuous(
    limits = c(0.25, 0.6),
    breaks = seq(0, 1, 0.05),
    labels = scales::percent
  ) +
  labs(
    y = "Alcohol use prevalence", x = "",
    title = "Any alcohol use",
    caption = "Grade 12. YRBS, 1990-2019"
  )

tidyDisasters to CRAN
tidyYRBS to CRAN

The End
2 years in 20 minutes…
tidyDisasters and tidyYRBS