Synopsis

The National Oceanic and Atmospheric Administration maintains a database of significant weather events via the National Climatic Data Center. The following r code outlines the steps taken to obtain this data from the NOAA and then process it for use in a choropleth map showing the frequency of severe weather and its economic impact in the continental United States.

Load required libraries

library(stringr)
library(dplyr)
library(ggplot2)
library(lubridate)
library(datasets)
library(htmltab)
library(data.table)


Obtain raw data from NOAA storm events archive

Read the file listing on the NOAA website and the save the file names in a character vector.

filenames <- htmltab("https://www1.ncdc.noaa.gov/pub/data/swdi/stormevents/csvfiles/",which = 1)
filenames <- filenames$Name

Create full paths for file names retrieved above.

filepaths <- paste("http://www1.ncdc.noaa.gov/pub/data/swdi/stormevents/csvfiles/", filenames, sep = "")

Create a new directory and download all files with “details” in the filename. These are the primary data files (‘fatality’ and ‘location’ files contain supplementary information and are not necessary for this analysis). A regular expression is used to extract the file name from the end of the file path as the parameter for the ‘destfile’ argument.

if(!dir.exists("data")){dir.create("data")}

filepaths <- filepaths[grep("details", filepaths)]

sapply(filepaths, function(x){
     download.file(x, destfile = paste("data/", str_sub(x, regexpr("\\/[^\\/]*$", x)+1, -1), sep = ""))
})

Read all csv files in “data” directory and bind into a single data frame. Variable names are converted to lower case characters to simplify code.

dat <- data.frame()

for(i in 1:length(list.files("data"))){
     dat <- rbind(dat, read.csv(list.files("data", full.names = TRUE)[i], stringsAsFactors = FALSE))
}

names(dat) <- tolower(names(dat))


Clean and process data

The data documentation notes that many event types were not recorded prior to 1996. A barplot illustrates the impact of this change in record keeping.

ggplot(dat, aes(factor(year))) +
     geom_bar(fill = "dark blue") +
     theme(axis.text.x = element_text(size = 9, angle = 90, vjust = 0.5, hjust=1)) +
     xlab("Year") + ylab ("Event Count")

To remove any bias introduced by this inconsistency in data collection, the data is subset to include only the years from 1996 to 2015.

dat <- filter(dat, dat$year >= 1996)

The ‘event_type’ variable records the category of the weather event. As many of these categories are quite similar (i.e. “high wind” and “strong wind”), they are grouped together into clusters of like events.

dat$event_type <- gsub("Marine Hail", "Hail", dat$event_type)
dat$event_type <- gsub("Flash Flood|Lakeshore Flood|Coastal Flood", "Flood", dat$event_type)
dat$event_type <- gsub(
     "High Wind|Strong Wind|Marine High Wind|Marine Strong Wind|Thunderstorm Wind|Marine Thunderstorm Wind", 
     "High Winds", dat$event_type)
dat$event_type <- gsub(
     "Heavy Snow|Winter Weather|Winter Storm|Blizzard|Ice Storm|Frost\\/Freeze|Lake-Effect Snow|Sleet", 
     "Snow/Ice", dat$event_type)
dat$event_type <- gsub("Waterspout|Funnel Cloud", "Tornado", dat$event_type)
dat$event_type <- gsub("Excessive Heat", "Heat", dat$event_type)
dat$event_type <- gsub("Dense Fog|Marine Dense Fog|Freezing Fog", "Fog", dat$event_type)
dat$event_type <- gsub("Dense Smoke", "Wildfire", dat$event_type)
dat$event_type <- gsub("Debris Flow", "Avalanche", dat$event_type)
dat$event_type <- gsub("Sneakerwave|Tsunami|High Surf|Seiche", "Heavy Waves", dat$event_type)
dat$event_type <- gsub("Cold\\/Wind Chill|Extreme Cold\\/Wind Chill", "Cold", dat$event_type)
dat$event_type <- gsub("Marine Lightning", "Lightning", dat$event_type)
dat$event_type <- gsub("Hurricane \\(Typhoon\\)|Tropical Storm|Tropical Depression", "Hurricane",
                       dat$event_type)
dat$event_type <- gsub("Dust Devil", "Dust Storm", dat$event_type)

Combine injuries_direct with injuries_indirect, as well as deaths_direct with deaths_indirect.

dat <- mutate(dat, injuries = injuries_direct + injuries_indirect, deaths = deaths_direct + deaths_indirect)

The two columns that represent damage in dollars, ‘damage_property’ and ’damage_crops“, are stored as character vectors, not numeric values. The letters”K“,”M“, and”B" are appended to the numbers to indicate thousands, millions, and billions. These values must be standardized in order to perform an accurate analysis of economic damage.

dat$pdamexp <- ifelse(grepl("K|M|B|T", dat$damage_property), str_sub(dat$damage_property, -1, -1), "")
dat$pdamexp <- gsub("K", 3, dat$pdamexp)
dat$pdamexp <- gsub("M", 6, dat$pdamexp)
dat$pdamexp <- gsub("B", 9, dat$pdamexp)
dat$pdamexp <- gsub("T", 12, dat$pdamexp)
dat$damage_property <- ifelse(grepl("K|M|B|T", dat$damage_property), 
                              as.numeric(str_sub(dat$damage_property, 1, -2))*10^as.numeric(dat$pdamexp),
                              as.numeric(dat$damage_property))

dat$cdamexp <- ifelse(grepl("K|M|B|T", dat$damage_crops), str_sub(dat$damage_crops, -1, -1), "")
dat$cdamexp <- gsub("K", 3, dat$cdamexp)
dat$cdamexp <- gsub("M", 6, dat$cdamexp)
dat$cdamexp <- gsub("B", 9, dat$cdamexp)
dat$cdamexp <- gsub("T", 12, dat$cdamexp)
dat$damage_crops <- ifelse(grepl("K|M|B|T", dat$damage_crops), 
                              as.numeric(str_sub(dat$damage_crops, 1, -2))*10^as.numeric(dat$cdamexp),
                              as.numeric(dat$damage_crops))

The dollar values obtained after the processing above need to be adjusted for inflation. A monthly CPI index is obtained from the St. Louis Federal Reserve and used to convert all dollar values to today’s dollars.

#download.file("http://research.stlouisfed.org/fred2/data/CPIAUCSL.csv", destfile = "cpi.csv")
cpidat <-read.csv("cpi.csv", header = TRUE, stringsAsFactors = FALSE)

cpidat <- cpidat %>%
     mutate(year = year(DATE)) %>%
     group_by(year) %>%
     summarize(cpi = mean(VALUE)) %>%
     mutate(adj = cpi[year == 2015]/cpi)

dat <- dat %>%
     mutate(damage_total = ifelse(is.na(damage_property), 0, damage_property) +
                 ifelse(is.na(damage_crops), 0, damage_crops)) %>%
     mutate(damage_total_adj = damage_total * cpidat$adj[match(dat$year, cpidat$year)]) %>%
     select(episode_id, event_id, state, state_fips, year, event_type, cz_type, 
            cz_fips, cz_name, wfo, begin_date_time, end_date_time, episode_narrative, 
            event_narrative, injuries, deaths, damage_total_adj)

In order to create an accurate, county-level, choropleth map of storm activity, an inconsistency in the cz_name variable must be addressed. Three types of names are captured in this field - U.S. counties, National Weather Service forecast zones, and marine zones. As the forecast zones may represent multiple counties, they must be sub-divided into their consituent parts in order to create a map of storm events by county.
First a county-to-forecast-zone correlation file is downloaded from the NOAA and read into a data frame. The forecast zone data is then subset to include only the necessary variables.

download.file("http://www.nws.noaa.gov/geodata/catalog/wsom/data/bp10nv15.dbx", "zones.dbx")

zones <- read.csv("zones.dbx", header = FALSE, sep = "|", stringsAsFactors = FALSE, 
                  col.names = c("state", "zone", "cwa", "name", "state_zone", "countyname", 
                                "fips", "time_zone", "fe_area", "lat", "lon"))

zones <- zones %>%
     select(state, zone, cwa, name, state_zone, countyname) %>%
     mutate(state_name = toupper(state.name[match(zones$state, state.abb)])) %>%
     distinct()

Next, the storm data is subset to include only the necessary fields. The data is also filtered to remove all events not in the continental United States.

dat <- dat %>%
     filter(state %in% toupper(state.name), state != "HAWAII", state != "ALASKA", cz_type != "M") %>%
     select(episode_id, event_id, state, event_type, cz_type, cz_fips, cz_name, begin_date_time, 
            damage_total_adj, injuries, deaths)

A function is used to create a new data frame with accurate county-level data. Data from each row with a ‘cz_type’ of ‘county’ is copied directly to new data frame. Data from each row with a zone that represents a single county is also copied directly to a new data frame. Data from each row that represents a multi-county zone is copied once for each county in that zone. Categorical variables (episode id, event id, state, event type, and date/time) are repeated, while quantitative variables (damage, injuries, and deaths) are attributed proportionally to each county.

datrows <- as.numeric(row.names(dat))

weatherdat <- lapply(datrows, function(x){
    if(dat$cz_type[x] == "C"){
     applydat <- dat[x,c(1:4,7:11)]
    }
    
    else{
        matchzones <- filter(zones, state_name == dat$state[x])
        matchlist <- grep(paste("^",dat$cz_fips[x],"$", sep = ""), matchzones$zone)
        matchcount <- length(matchlist)
        
        if(matchcount == 0){
            applydat <- dat[x,c(1:4,7:11)]
        }
        
        else{
            applydat <- data.frame(matrix(nrow = matchcount, ncol = 9))
            colnames(applydat) <- c('episode_id', 'event_id', 'state', 'event_type', 'cz_name', 
                                    'begin_date_time', 'damage_total_adj', 'injuries', 'deaths')
            
            applydat$episode_id[1:matchcount] <- rep(dat$episode_id[x], matchcount)
            applydat$event_id[1:matchcount] <- rep(dat$event_id[x], matchcount)
            applydat$state[1:matchcount] <- rep(dat$state[x], matchcount)
            applydat$event_type[1:matchcount] <- rep(dat$event_type[x], matchcount)
            applydat$cz_name[1:matchcount] <- matchzones$countyname[matchlist]
            applydat$begin_date_time[1:matchcount] <- rep(dat$begin_date_time[x], matchcount)
            applydat$damage_total_adj[1:matchcount] <- rep(dat$damage_total_adj[x]/matchcount,matchcount)
            applydat$injuries[1:matchcount] <- rep(dat$injuries[x]/matchcount, matchcount)
            applydat$deaths[1:matchcount] <- rep(dat$deaths[x]/matchcount, matchcount)
            return(applydat)
        }
    }
    
})

weatherdat <- rbindlist(weatherdat)
weatherdat <- weatherdat %>%
    select(episodeid = episode_id, eventid = event_id, state= state, event_type = event_type,
           county = cz_name, datetime = begin_date_time, damage = damage_total_adj, 
           injuries = injuries, deaths = deaths)

The county names used in the NOAA data set are not always uniform. Occasionally, additional descriptive words are appended (e.g. “Coastal Monterey County” and “Pima County Deserts”). The following regular expressions are used to standardize the county names as much as possible.

weatherdat$county <- gsub(" \\(C\\)", "", weatherdat$county)
weatherdat$county <- gsub("^C ", "", weatherdat$county)
weatherdat$county <- gsub("^CITY OF ", "", weatherdat$county)
weatherdat$county <- gsub("^COASTAL ", "", weatherdat$county)
weatherdat$county <- gsub("^E ", "", weatherdat$county)
weatherdat$county <- gsub("^EASTERN ", "", weatherdat$county)
weatherdat$county <- gsub("^INLAND ", "", weatherdat$county)
weatherdat$county <- gsub(" COUNTY DESERTS", "", weatherdat$county)
weatherdat$county <- gsub(" COUNTY MOUNTAINS", "", weatherdat$county)
weatherdat$county <- gsub(" DESERTS", "", weatherdat$county)
weatherdat$county <- gsub(" COUNTY VALLEYS", "", weatherdat$county)
weatherdat$county <- gsub("^LOWER ", "", weatherdat$county)
weatherdat$county <- gsub("^MAINLAND ", "", weatherdat$county)
weatherdat$county <- gsub(" COUNTY EXCEPT SURPRISE VALLEY", "", weatherdat$county)
weatherdat$county <- gsub(" T X E", "", weatherdat$county)
weatherdat$county <- gsub("NE ", "", weatherdat$county)
weatherdat$county <- gsub("NC ", "", weatherdat$county)
weatherdat$county <- gsub("^NORTHERN ", "", weatherdat$county)
weatherdat$county <- gsub(" COUNTY COASTAL AREAS", "", weatherdat$county)
weatherdat$county <- gsub(" COUNTY COASTAL PLAIN", "", weatherdat$county)
weatherdat$county <- gsub(" / E LARAMIE", "", weatherdat$county)
weatherdat$county <- gsub(" COUNTY EASTERN DESERTS", "", weatherdat$county)
weatherdat$county <- gsub(" COUNTY VALLEY/THE INLAND EMPIRE", "", weatherdat$county)
weatherdat$county <- gsub(" T SW \\& X SE", "", weatherdat$county)
weatherdat$county <- gsub(" CO\\.", "", weatherdat$county)
weatherdat$county <- gsub(" COUNTY", "", weatherdat$county)
weatherdat$county <- gsub(" FOOTHILLS", "", weatherdat$county)
weatherdat$county <- gsub(" COUNTY LAKES", "", weatherdat$county)
weatherdat$county <- gsub(" COUNTY LAKES REGION", "", weatherdat$county)
weatherdat$county <- gsub(" COUNTY LAKES REGION / SIERRA EXCEPT W", "", weatherdat$county)
weatherdat$county <- gsub("^SOUTHWEST ", "", weatherdat$county)
weatherdat$county <- gsub("^.*ELKO.*$", "ELKO", weatherdat$county)
weatherdat$county <- gsub("^.*KERN.*$", "KERN", weatherdat$county)
weatherdat$county <- gsub("^.*GILA.*$", "GILA", weatherdat$county)
weatherdat$county <- gsub("^SW ", "", weatherdat$county)
weatherdat$county <- gsub("^THURSTON.*$", "THURSTON", weatherdat$county)
weatherdat$county <- gsub(" COUNTY MOUNTAINS", "", weatherdat$county)
weatherdat$county <- gsub(" MTNS", "", weatherdat$county)
weatherdat$county <- gsub("^UPPER ", "", weatherdat$county)
weatherdat$county <- gsub(" MTNS", "", weatherdat$county)
weatherdat$county <- gsub("^W ", "", weatherdat$county)
weatherdat$county <- gsub("^W/S ", "", weatherdat$county)
weatherdat$county <- gsub("^WESTERN ", "", weatherdat$county)
weatherdat$county <- gsub("^EASTERN/CENTRAL ", "", weatherdat$county)
weatherdat$county <- gsub(" AND VICINITY", "", weatherdat$county)

The data is now grouped by state, county, and event type. Summary statistics are computed for damage, injuries and deaths, and each event type is ranked by its count. The resulting data frame is written to a .csv file and then used to produce a visualization in Tableau.

weatherdat <- weatherdat %>%
     group_by(state, county, event_type) %>%
     summarise(event_count = n_distinct(eventid), damage = sum(damage), injuries = sum(injuries), 
          deaths = sum(deaths)) %>%
     mutate(catrank = row_number(-event_count), catpercent = event_count/sum(event_count),
            drank = row_number(-damage), dpercent = damage/sum(damage),
            stateabbr = state.abb[match(state, toupper(state.name))])

write.csv(weatherdat, "weatherdat.csv", row.names = FALSE)


Results

Link to visualization