Synopsis

The United States Storm Data are analyzed based on observed data from the National Weather Service (NWS). As emerges from the analysis, some severe storm events impact the United States more than others, as shown in the Results section.

The following analysis involves data processing, caching, loading the observed data from the source, extraction of the table of 48 standard event types, and finally the creation of a new data set obtained by manipulating the data into a tidy format.

There are 48 standard storm event types. The first step of the analysis matched 44 out of 48 events using stringr::str_replace_all(); applying the amatch() function from the {stringdist} package with the "osa" method matched the whole set of 48 events: not precisely, but acceptably.
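As a minimal sketch of how this fuzzy matching behaves (the misspelled event names below are invented for illustration, not taken from the data set):

# toy event names with typos, matched against three standard categories
messy    <- c("thunderstorm winds", "flash floodings", "rip currnts")
standard <- c("thunderstorm wind", "flash flood", "rip current")

# "osa" = Optimal String Alignment distance; maxDist = Inf forces the
# nearest standard category to be returned for every input
i <- stringdist::amatch(messy, standard, method = "osa", maxDist = Inf)
standard[i]
## [1] "thunderstorm wind" "flash flood"       "rip current"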

The selection of distinct event types led to a clearer infographic visualization of the damages caused by the storm events.

Further adjustments to the original data included checks of the provided geocodes and the addition of some extra information, such as US state codes from the US FIPS codes website.

In particular, two questions are addressed:

  • Across the United States, which types of events are most harmful with respect to population health (fatalities and injuries)?

  • Across the United States, which types of events have the greatest economic consequences (property and crop damages)?

Data Processing

# set a cache folder 
R.cache::setCacheRootPath("pa2_cache/data")

# set the starting time for processing 
t.start<-Sys.time()

# Download, unzip and remove the compressed data file.
bz2File<-"pa2_cache/data/repdata-data-StormData.csv.bz2"

if (!file.exists(bz2File)){
    url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
    download.file(url, bz2File, mode="wb")
}

# Load the data file into a data frame.
df <- read.table("pa2_cache/data/repdata-data-StormData.csv.bz2", comment.char = "#", 
                  header = TRUE, sep = ",", na.strings = "")

dim(df) # 902297     37
## [1] 902297     37
# set ending time
t.end<-Sys.time()
t.tot<-t.end-t.start
t.tot # total time for processing data
## Time difference of 55.18155 secs
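The cache root set above is not otherwise used in this chunk; as a minimal sketch of how the parsed data frame could be memoized with {R.cache} on later knits (the cache key below is an illustrative assumption, not part of the original code):

# illustrative only: skip the slow read.table() on subsequent runs
key <- list("storm_data", bz2File)
df <- R.cache::loadCache(key)                    # returns NULL on a cache miss
if (is.null(df)) {
  df <- read.table(bz2File, comment.char = "#",
                   header = TRUE, sep = ",", na.strings = "")
  R.cache::saveCache(df, key = key)              # store for the next run
}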

The analysis shows which types of events are the most harmful with respect to population health (fatalities and injuries) and to the economy (property and crop damages) across the United States of America.

To answer the questions I decided to manipulate the data to reduce the redundancy in the information and to produce a plot and a map. I added the state codes from the FIPS codes website, as well as the standard list of storm event types, to the original data; this procedure reduced the redundancy of the information dramatically.

# select only unique state codes
url <- "https://www.census.gov/2010census/xls/fips_codes_website.xls"
# utils::download.file(url,"fips_codes_website.xls") # download the .xls file directly from the web site

# read the file 
fips_codes_website <- readxl::read_excel("fips_codes_website.xls")

names(fips_codes_website) <- janitor::make_clean_names(names(fips_codes_website))

# assigning a variable to the unique FIPS codes
state_off <- fips_codes_website %>% count(state_abbreviation,state_fips_code)

# state abbreviations to substitute in the source data set due to misleading entries
states <- state_off %>% select(-n) %>% mutate(state_fips_code=as.double(state_fips_code))
# dim(states) # 52  2

Then I extracted the table of the 48 standard event categories from the Storm Data Documentation to identify the main categories of storms.

# library(tabulizer) is the package needed for the PDF table extraction
# it requires a Java installation for the OS

# click "Done" on the locate-areas page that opens in the Viewer, as the area of interest has already been located below
# event_tb<- tabulizer::locate_areas("https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf",pages=6)

# once the table is located, extract it
event_tb_located <- tabulizer::extract_tables("https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf",
                                  output = "data.frame",
                                  pages=c(6),
                                  area=list(c(191.27184,59.94175,560.35922,559.74757)),
                                  guess=FALSE)

# reduce the extracted list of tables by binding rows (purrr::reduce) to obtain a workable data set
event_cat <- reduce(event_tb_located,bind_rows)
rm(event_tb_located) # remove the originals

# select and rename the variables
part_one <- event_cat %>% select(Event.Name,Designator)
part_two <- event_cat %>% select(Event.Name="Event.Name.1",Designator="Designator.1")

# bind the rows of the two parts
my_eve_tb <- rbind(part_one,part_two)
rm(part_one,part_two) # remove the originals
rm(event_cat)

# remove empty rows
my_eve_tb <- my_eve_tb %>% janitor::remove_empty("rows") #library(janitor)
  
# select and adjust   
my_eve_tb <- my_eve_tb[c(2:25,27:50),] %>%
  mutate(
    evtype=tolower(Event.Name),
    ev_zone=Designator) %>%
  select(evtype,ev_zone)

# save a copy as .csv file in the cache folder
write.csv(my_eve_tb,"pa2_cache/data/standard_events.csv")

# read it back 
my_eve_tb <- read_csv("pa2_cache/data/standard_events.csv",
                    col_types = cols(X1 = col_skip()))
# dim(my_eve_tb) # 48  2
# View(my_eve_tb%>%count(evtype))

To add workable latitude and longitude values to the original data set, I decided to extract and add the values provided by the {rnaturalearth} package for the US states.

# load the data for the US states
us <- rnaturalearth::ne_states(country = "United States of America", returnclass = "sf")

geo_codes <- us %>%
  dplyr::select(postal,latitude,longitude) 

# dim(geo_codes) # 51  4 - missing "PR" Puerto Rico 

All the information is united in a single data frame, and processing continues with some further cleaning of the storm names.

# The event type column (evtype) contains several typos and other out of standard terms for event types
# View(plyr::count(df$EVTYPE))


# making some changes to df
my_df <- df %>%
  janitor::clean_names() %>%                           # clean all variable names
  rename(fips = state, state=state_2) %>%              # rename: state becomes fips, state_2 becomes state
  select("fips","county","countyname","state",
          "bgn_date",                                  # select only the needed columns
         "evtype","mag","fatalities","injuries",
         "propdmg","propdmgexp","cropdmg","cropdmgexp") %>%
  mutate(evtype=tolower(evtype)) %>%                   # lower-case the event types (898 distinct values remain)
  filter(!evtype=="?",                                 # drop the event types equal to "?"
         !grepl("^summary",evtype)) %>%                # drop the event types defined as 'summary...'
  mutate(evtype=gsub("^ ","",evtype),                  # cut off blank spaces at the beginning of the event type
         evtype=gsub("\\d+.*$","",evtype),             # digits and everything after them
         evtype=gsub("[//]"," ",evtype),               # punctuation
         evtype=gsub("[\\]"," ",evtype),
         evtype=gsub("[-]"," ",evtype),
         evtype=gsub("[(]","",evtype),
         evtype=gsub("[;]","",evtype),
         evtype=gsub(" [fg]$","",evtype),              # some spare trailing letters
         evtype=gsub(", may $","",evtype),             # 'may' at the end
         evtype=gsub("\\W+$","",evtype),               # trailing non-word characters
         evtype=gsub(" $","",evtype),                  # blank spaces at the end
         evtype=gsub("tstm","thunderstorm",evtype),
         evtype=gsub("^urban.*","flood",evtype),
         evtype=gsub("^vog","dense fog",evtype),
         evtype=gsub("^beach.*","flood",evtype),
         evtype=gsub(".*coastal.*","flood",evtype),
         bgn_date=as.Date(bgn_date,"%m/%d/%Y"),        # format the date column as date
         bgn_date=as.factor(bgn_date)) %>%
  arrange(bgn_date) %>%                                # order by the earliest date
  left_join(states,by=c("fips"="state_fips_code","state"="state_abbreviation")) %>% # add the correct state abbreviations by the fips codes
  left_join(geo_codes,by=c("state"="postal")) %>%       # add latitude and longitude by state
  select(-geometry) 

The original data set contained 985 unique event types; after some major correction of typos, the event type column is made of 665 distinct entries, while the standard table foresees just 48 categories. So there are still some other steps to take. The first step is to further tidy the event type column using the stringr::str_replace_all() function, which replaces evtype entries containing the words found in the standard list. The result reduced the original list to 400 elements, but one more step was required to get to 48. Then I tried stringdist::amatch() with method="osa" (Optimal String Alignment distance), which was able to find the 48 distinct entries. After checking the three vectors, I decided to keep the last one, since the event types with the highest frequencies are the ones that answer the questions.

 # evtype = original
 # evtype_std = first step
 # unique_std_type = second step

# First step: substitute with stringr::str_replace_all()
std_type <- my_eve_tb$evtype                              # a vector with the 48 standard event types
names(std_type) = paste0('.*', std_type, '.*')            # the names become regex patterns, the values the replacements
# after this step the distinct event types reduce to 400 out of 985
my_df$evtype_std <- stringr::str_replace_all(my_df$evtype, std_type) 

# Second step: apply the amatch() method (Optimal String Alignment distance), which finds all 48 entries
i <- stringdist::amatch(my_df$evtype,std_type, method="osa", maxDist=Inf)
merged <- data.frame(my_df%>%select(evtype,evtype_std),std_type[i]) 
names(merged) <- c("evtype","evtype_std","unique_std_type")        # naming

# View(merged%>%count(evtype,evtype_std,unique_std_type))


# decided to keep just the second step
my_df_short <- cbind(merged$unique_std_type,my_df[,-c(6,16)]) %>%
  rename(evtype_48="merged$unique_std_type")

The next step of the data cleaning is to check the property and crop damage variables:

  • propdmgexp is the Property Damage Estimate exponent (EXP = exponent): it is coded with letters and numbers, meaning the multipliers of propdmg range from 10 to 1,000,000,000 (B or b = billion, M or m = million, K or k = thousand, H or h = hundred)

  • cropdmgexp is the Crop Damage Estimate exponent, coded in the same way as the property damage variable; a worked example follows below
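As a minimal worked example of this coding (the values below are invented for illustration):

# a damage value of 25 with exponent "k" means 25 * 1,000 = $25,000
dmg  <- c(25, 2.5, 50)
exps <- c("k", "m", "h")
mult <- c(h = 1e2, k = 1e3, m = 1e6, b = 1e9)   # hundred, thousand, million, billion
unname(dmg * mult[exps])
## [1]   25000 2500000    5000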

# turn scientific notation off
options(scipen = 999)    

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# cleaning of the property and crop variables with transformation and homogenization of the data
my_df_short <- my_df_short %>% 
  mutate(propdmgexp=tolower(propdmgexp),
         cropdmgexp=tolower(cropdmgexp),
         propdmgexp=gsub("[-?]","0",propdmgexp),                    # decoding into binary
         propdmgexp=gsub("[+]","1",propdmgexp),
         cropdmgexp=gsub("[?]","0",cropdmgexp))


my_df_short$propdmgexp[my_df_short$propdmgexp == ""] <- 0            # assigning 0-value to empty rows 
my_df_short$cropdmgexp[my_df_short$cropdmgexp == ""] <- 0

# digit codes that will be replaced by a common multiplier
numeric <- c(0,1,2,3,4,5,6,7,8)

my_df_short$propdmgexp[my_df_short$propdmgexp %in% numeric] <- 10   # treat plain digit exponents as a multiplier of 10
my_df_short$cropdmgexp[my_df_short$cropdmgexp %in% numeric] <- 10

my_df_short$propdmgexp[my_df_short$propdmgexp=="h"] <- 100
my_df_short$propdmgexp[my_df_short$propdmgexp=="k"] <- 1000
my_df_short$propdmgexp[my_df_short$propdmgexp=="m"] <- 1000000
my_df_short$propdmgexp[my_df_short$propdmgexp=="b"] <- 1000000000

my_df_short$cropdmgexp[my_df_short$cropdmgexp=="k"] <- 1000
my_df_short$cropdmgexp[my_df_short$cropdmgexp=="m"] <- 1000000
my_df_short$cropdmgexp[my_df_short$cropdmgexp=="b"] <- 1000000000

my_df_short$propdmgexp <- as.double(my_df_short$propdmgexp)         # format as double 
my_df_short$cropdmgexp<- as.double(my_df_short$cropdmgexp)

my_df_short$propdmgexp[is.na(my_df_short$propdmgexp)] <- 0          # replace NA values with 0
my_df_short$cropdmgexp[is.na(my_df_short$cropdmgexp)] <- 0

my_df_short <- my_df_short %>%
  mutate(prop_dmg=ifelse(propdmg<1,0,round((propdmg*propdmgexp)/10^3)),   # damages expressed in thousand $
         crop_dmg=ifelse(cropdmg<1,0,round((cropdmg*cropdmgexp)/10^3))) 

The original data set contained some extra state identification codes; since they amounted to less than 2% of the whole geo information, I decided to drop these values. Below are some checks of the data.

# missing latitude and longitude for PR = Puerto Rico
# dim(my_df_short) # checking dimensions
# sum(is.na(my_df_short$longitude)) # and the number of NAs in longitude/latitude
# my_df_short%>%filter(state=="XX")
# sum(is.na(my_df_short)) # 1589
# DataExplorer::profile_missing(my_df_short)



# found that the missing values are for PR, adding Puerto Rico lat=18.220833    lng=-66.590149
# my_df_short$state[my_df_short$state=="PR"]
# my_df_short%>%filter(state=="PR")
my_df_short$latitude[my_df_short$state=="PR"] <- 18.220833      # adding latitude
my_df_short$longitude[my_df_short$state=="PR"] <- -66.590149    # adding longitude

Results

deaths_plot <- my_df_short %>% 
  arrange(fatalities) %>%
  group_by(evtype_48) %>% 
  summarize(tot_deaths=sum(fatalities)) %>% 
  ungroup() %>% 
  mutate(evtype_48 = stringr::str_to_title(evtype_48)) %>%       # convert event names to title case
  arrange(-tot_deaths) %>%
  filter(tot_deaths>=226) %>% 
  # plotting
  ggplot(aes(x=tot_deaths,y=fct_reorder(evtype_48,tot_deaths), fill=evtype_48,color=evtype_48)) + 
  geom_col() + 
  geom_text(aes(label= scales::comma(tot_deaths,accuracy=2)),  
            position=position_dodge(width = 0.5), hjust=-0.1, size=3) +
  guides(fill="none",color="none") + 
  labs(y="",x="Total fatalities (1950 - 2011)",
       title="Which types of events are most harmful with respect to population fatalities?") + 
  xlim(0,6000) +
  theme_classic() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        plot.title=element_text(size=11))
injury_plot <- my_df_short %>% 
  arrange(injuries) %>%
  group_by(evtype_48) %>% 
  summarize(tot_injuries=sum(injuries)) %>% 
  ungroup() %>% 
  mutate(evtype_48 = stringr::str_to_title(evtype_48)) %>%       # convert event names to title case
  arrange(-tot_injuries) %>%
  filter(tot_injuries>=1520) %>%
  # plotting
  ggplot(aes(x=tot_injuries, y=fct_reorder(evtype_48,tot_injuries), fill=evtype_48,color=evtype_48)) + 
  geom_col() + 
  geom_text(aes(label= scales::comma(tot_injuries,accuracy=2)),
            position=position_dodge(width = 0.5), hjust=-0.1, size=3) +
  guides(fill="none",color="none") + 
  labs(y="",x="Total injuries (1950 - 2011)",
       title="Which types of events are most harmful with respect to population injuries?",
       caption="**Datasource: United States Storm Data information are analyzed based on observed data from the National Weather Service (NWS)") + 
  xlim(0,95000) +
  theme_classic() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        plot.title=element_text(size=11))
require(patchwork)
deaths_plot/injury_plot

eco_dmg_df <- my_df_short %>%
  select(1,bgn_date,10:15) %>%
  arrange(-propdmg) %>%
  mutate(bgn_date=as.Date(bgn_date,"%Y-%m-%d"),
         yearmon=zoo::as.yearmon(bgn_date),
         year=lubridate::year(bgn_date),
         evtype_48 = stringr::str_to_title(evtype_48))
  

prop_dmg_plot <- eco_dmg_df %>% 
  group_by(year,evtype_48) %>% 
  summarize(tot_prop_dmg=sum(propdmg),tot_crop_dmg=sum(cropdmg)) %>%
  ungroup() %>%
  filter(tot_prop_dmg>0) %>% 
  arrange(tot_prop_dmg) %>%
  group_by(evtype_48) %>%
  summarize(Property=sum(tot_prop_dmg)) %>%
  ungroup() %>%
  arrange(-Property) %>%
  filter(Property>=127585.49) %>% # summary(Property)
  ggplot(aes(x=Property,y=fct_reorder(evtype_48,Property),fill=evtype_48,color=evtype_48)) +
  geom_col(position = "dodge") +
  geom_text(aes(label= scales::dollar(Property,accuracy=2)),
            position=position_dodge(width = 0.5), hjust=-0.1, size=3) + 
  guides(fill="none",color="none") +
  xlim(0,3500000) +
  labs(title="Which types of events cause the greatest Property damages?",
       subtitle = "1950 - 2011",
       y="",x="Property damages (thousand$)") +
  theme_classic()+
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x.top = element_text())
crop_dmg_plot <- eco_dmg_df %>% 
  group_by(year,evtype_48) %>% 
  summarize(tot_prop_dmg=sum(propdmg),tot_crop_dmg=sum(cropdmg)) %>%
  ungroup() %>%
  filter(tot_crop_dmg>0) %>% 
  arrange(tot_crop_dmg) %>%
  group_by(evtype_48) %>%
  summarize(Crops=sum(tot_crop_dmg)) %>%
  ungroup() %>%
  arrange(-Crops) %>%
  filter(Crops>=8102.16) %>% # summary(Crops)
  ggplot(aes(x=Crops,y=fct_reorder(evtype_48,Crops),fill=evtype_48,color=evtype_48)) +
  geom_col(position = "dodge") +
  geom_text(aes(label= scales::dollar(Crops,accuracy=2)),
            position=position_dodge(width = 0.5), hjust=-0.1, size=3) + 
  guides(fill="none",color="none") +
  xlim(0,650000) +
  labs(title="Which types of events cause the greatest Crop damages?",
       subtitle = "1950 - 2011",
       caption="**Datasource: United States Storm Data information are analyzed based on observed data from the National Weather Service (NWS)",
       y="",x="Crop damages (thousand$)") +
  theme_classic() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x.top = element_text())
require(patchwork)

prop_dmg_plot/crop_dmg_plot

This map shows the total property and crop economic damages located by state coordinates, with point size proportional to the damages.

# map
library(maps)
us_states <- map_data("state")


map_plot <- ggplot(data=subset(my_df_short,!latitude>50 & !latitude<25),   # continental US latitudes only
       aes(longitude,latitude,size=prop_dmg+crop_dmg,color=evtype_48)) +   # point size = total damages (thousand $)
  geom_point(aes(group=evtype_48),alpha=0.3) +                             # alpha set in the layer, not in ggplot()
  geom_text(aes(label=evtype_48),check_overlap = TRUE,size=2, color="black")+
  geom_polygon(data=us_states,aes(long,lat,group=group),color="black",fill=NA,size=0.1) +
  guides(color="none",size="none") +
  viridis::scale_color_viridis(discrete=TRUE) + 
  coord_map(projection = "albers",lat0=39,lat1=45) +
  labs(title="Where did the top storm events hit the US economy the most?",
       subtitle="Total Property and Crop damages between 1950 - 2011",
       caption="**Data source: United States Storm Data, observed data from the National Weather Service (NWS)",
       x="",y="",color="Event type") +
  ggthemes::theme_map()+
  theme( plot.title = element_text(size=14),
         plot.title.position = "panel",
         panel.grid = element_blank(),
         axis.text = element_blank(),
         axis.ticks.x = element_blank(),
         axis.line.x = element_blank(),
         strip.text = element_text(),
         legend.position = "none",
         strip.background = element_blank())

map_plot

Resources: