Synopsis

Storms and other severe weather conditions cause both public health and economic problems. Many weather conditions can cause fatalities, injuries, and damage to property and crops. Limiting the extent of such damage is a key goal for the authorities.

This project uses the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to compare the extent of damages caused by various weather events. The data analysis is presented below.

Data Processing

Prepare for the data analysis by loading the R packages that we will need throughout this analysis.

knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

Read the data

Assuming that you have downloaded the file, you can continue towards reading the data and cleaning it up for the analysis.

# Load the storm data from the .bz2 CSV file; text columns are kept as
# character vectors rather than being converted to factors.
data <- read.csv("./data/FStormData.csv.bz2", stringsAsFactors = FALSE)

Check the data

# Inspect the structure of the loaded data frame: its dimensions plus the
# name, type, and first few values of each column.
str(data)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

Cleaning the data

For our analysis, we will keep only the events that caused some harm or damage and only the columns that are relevant, and do the following: 1. Change variable names to lower case. 2. Change event types to upper case.

# Lower-case the column names so they are easier to type.
colnames(data) <- tolower(colnames(data))

# Keep only events that caused at least one injury, fatality, or some
# property/crop damage, and only the columns used by the analysis.
data <- subset(
  x = data,
  injuries > 0 | fatalities > 0 | propdmg > 0 | cropdmg > 0,
  select = c(evtype, fatalities, injuries, propdmg, propdmgexp, cropdmg,
             cropdmgexp)
)

data$evtype <- toupper(data$evtype)

# Now check the exponent labels used in propdmgexp and cropdmgexp
unique(data$propdmgexp)
##  [1] "K" "M" ""  "B" "m" "+" "0" "5" "6" "4" "h" "2" "7" "3" "H" "-"
unique(data$cropdmgexp)
## [1] ""  "M" "K" "m" "B" "?" "0" "k"
# Convert all exponent labels into upper case
data$propdmgexp <- toupper(data$propdmgexp)
data$cropdmgexp <- toupper(data$cropdmgexp)

# Translate exponent labels into numeric multipliers.
#
# labels: character vector of (upper-cased) exponent labels.
# key:    named numeric vector mapping a label to its multiplier.
#
# Returns an unnamed numeric vector the same length as `labels`. Any label
# not present in `key` gets a multiplier of 1. This includes the empty
# label "": R never matches "" when indexing by name, so it cannot be listed
# in `key` and is handled by the fallback instead.
exponent_multiplier <- function(labels, key) {
  multiplier <- unname(key[as.character(labels)])
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

# Assign numeric values to property damage alphanumeric exponents.
propdmgKey <- c(
  "-" = 10^0,
  "+" = 10^0,
  "0" = 10^0,
  "1" = 10^1,
  "2" = 10^2,
  "3" = 10^3,
  "4" = 10^4,
  "5" = 10^5,
  "6" = 10^6,
  "7" = 10^7,
  "8" = 10^8,
  "9" = 10^9,
  "H" = 10^2,
  "K" = 10^3,
  "M" = 10^6,
  "B" = 10^9
)
data$propdmgexp <- exponent_multiplier(data$propdmgexp, propdmgKey)

# Do the same for crop damage exponent labels
cropdmgKey <- c(
  "?" = 10^0,
  "0" = 10^0,
  "K" = 10^3,
  "M" = 10^6,
  "B" = 10^9
)
data$cropdmgexp <- exponent_multiplier(data$cropdmgexp, cropdmgKey)

Now regroup the similar categories in the event type column, and sum the data for all the event types.

# Collapse the many free-text event labels into a few broad categories.
# Each row is a (pattern, replacement) pair. The rules are applied in the
# order listed, and a label rewritten by an earlier rule is what later rules
# see, so the order matters (e.g. anything containing STORM becomes STORM
# before the WIND rule runs).
evtype_rules <- matrix(
  c(
    '.*STORM.*',       'STORM',
    '.*FLOOD.*',       'FLOOD',
    '.*WIND.*',        'WIND',
    '.*TORN.*',        'TORNADO',
    '.*HAIL.*',        'HAIL',
    '.*HURRICANE.*',   'HURRICANE',
    '.*RAIN.*',        'RAIN',
    '.*SNOW.*',        'SNOW',
    '.*COLD.*',        'COLD',
    '.*LOW.*TEMPER.*', 'COLD',
    '.*FROST.*',       'COLD',
    '.*HIGH.*TEMPER.*', 'HEAT',
    '.*HEAT.*',        'HEAT',
    '.*FIRE.*',        'FIRE'
  ),
  ncol = 2,
  byrow = TRUE
)

for (rule in seq_len(nrow(evtype_rules))) {
  data$evtype <- gsub(evtype_rules[rule, 1], evtype_rules[rule, 2], data$evtype)
}

# Total the health and economic impact for each event category.
#
# propdmg and cropdmg only hold the leading digits of each damage figure;
# by this point propdmgexp and cropdmgexp hold the matching numeric
# multipliers (1, 100, 1e3, 1e6, 1e9, ...). The damage totals must therefore
# be sums of value * multiplier. Summing the raw values would count "25 K"
# and "25 B" as the same amount.
#
# Grouping is done inline so that `data` itself is not left grouped.
#
# NOTE(review): applying the multipliers may change the damage rankings
# compared with summing the raw values; re-knit and re-check the Result text.
data2 <- summarise(
  group_by(data, evtype),
  all_fatalities = sum(fatalities),
  all_injuries = sum(injuries),
  all_propdmg = sum(propdmg * propdmgexp),
  all_cropdmg = sum(cropdmg * cropdmgexp)
)

Separate out the data according to fatalities, injuries, property damage and crop damage. This will help us in plotting separate graphs side-by-side, which will allow us to compare the impact of weather conditions easily.

Impact Due to Severe Weather Events on Public Health

# Fatalities data and plot
# Take the (up to) ten event categories with the most fatalities. head() is
# used instead of [1:10, ] so that no all-NA rows are produced if fewer than
# ten categories exist.
data_fatalities <- head(
  arrange(select(data2, evtype, all_fatalities), desc(all_fatalities)),
  10
)

# Columns are referenced by bare name inside aes() (not data$col), and the
# x axis is reordered so bars appear from highest to lowest total.
fatalities_plot <- ggplot(
  data_fatalities,
  aes(x = reorder(evtype, -all_fatalities), y = all_fatalities, fill = evtype)
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  xlab("Weather Events") +
  ylab("No. of Fatalities") +
  ggtitle("Top 10 Events Causing Fatalities") +
  theme(
    axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
    title = element_text(size = 12)
  )

# Injuries data and plot
data_injuries <- head(
  arrange(select(data2, evtype, all_injuries), desc(all_injuries)),
  10
)

injuries_plot <- ggplot(
  data_injuries,
  aes(x = reorder(evtype, -all_injuries), y = all_injuries, fill = evtype)
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  xlab("Weather Events") +
  ylab("No. of Injuries") +
  ggtitle("Top 10 Events Causing Injuries") +
  theme(
    axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
    title = element_text(size = 12)
  )

Compare the impact

# Draw the fatalities and injuries charts side by side for comparison.
grid.arrange(grobs = list(fatalities_plot, injuries_plot), ncol = 2)

Impact Due to Severe Weather Events on Economy

# Property damage data and plot
# Take the (up to) ten event categories with the largest property damage
# total. head() is used instead of [1:10, ] so that no all-NA rows are
# produced if fewer than ten categories exist.
data_propdmg <- head(
  arrange(select(data2, evtype, all_propdmg), desc(all_propdmg)),
  10
)

# Columns are referenced by bare name inside aes() (not data$col), and the
# x axis is reordered so bars appear from highest to lowest total. The y
# axis shows a summed damage value, not a count, so it is labelled as such.
propdmg_plot <- ggplot(
  data_propdmg,
  aes(x = reorder(evtype, -all_propdmg), y = all_propdmg, fill = evtype)
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  xlab("Weather Events") +
  ylab("Total Property Damage") +
  ggtitle("Top 10 Events Causing Prop Damages") +
  theme(
    axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
    title = element_text(size = 9)
  )

# Crop damage data and plot
data_cropdmg <- head(
  arrange(select(data2, evtype, all_cropdmg), desc(all_cropdmg)),
  10
)

cropdmg_plot <- ggplot(
  data_cropdmg,
  aes(x = reorder(evtype, -all_cropdmg), y = all_cropdmg, fill = evtype)
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  xlab("Weather Events") +
  ylab("Total Crop Damage") +
  ggtitle("Top 10 Events Causing Crop Damages") +
  theme(
    axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
    title = element_text(size = 9)
  )

Compare the impact

# Draw the property and crop damage charts side by side for comparison.
grid.arrange(grobs = list(propdmg_plot, cropdmg_plot), ncol = 2)

Result

From the first set of graphs above, we can see that TORNADOs cause the highest number of fatalities, followed by HEAT. As for injuries, TORNADOs cause the most injuries, followed by HEAT, FLOOD, and WIND. These three (HEAT, FLOOD, and WIND) cause a similar number of injuries.

From the second set of graphs, we can deduce that HAIL and FLOOD cause maximum crop damage and TORNADOs cause maximum property damages.