Storms and other severe weather conditions cause both public health and economic problems. Many weather conditions can cause fatalities, injuries, and damage to property and crops. Limiting the extent of such damage is a key goal for the authorities.
This project uses the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to compare the extent of damages caused by various weather events. The data analysis is presented below.
Prepare for the data analysis by loading the R packages that we will need throughout this analysis.
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
Assuming that you have already downloaded the data file, you can now read the data and clean it up for the analysis.
# Load the raw storm records straight from the bzip2-compressed CSV
# (comma-separated, with a header row) and keep text columns as character
# vectors rather than factors. Then print the structure of the result.
data <- read.csv(
  file = "./data/FStormData.csv.bz2",
  stringsAsFactors = FALSE
)
str(object = data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
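For our analysis, we will keep only the events that caused at least one injury or fatality, or some property or crop damage, and only the columns that are relevant to this analysis. We will also do the following:

1. Change variable names to lower case
2. Change event types to upper case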
# Use lower-case column names throughout the rest of the analysis.
names(data) <- tolower(names(data))

# Keep only the events that harmed people or caused property/crop damage, and
# only the seven columns the analysis needs. Rows where the condition
# evaluates to NA are dropped.
data <- data[
  with(data, {
    harmful <- injuries > 0 | fatalities > 0 | propdmg > 0 | cropdmg > 0
    harmful & !is.na(harmful)
  }),
  c("evtype", "fatalities", "injuries",
    "propdmg", "propdmgexp", "cropdmg", "cropdmgexp"),
  drop = FALSE
]

# Event type labels are compared case-insensitively later on, so upper-case
# them once here.
data[["evtype"]] <- toupper(data[["evtype"]])
# List the distinct exponent labels that occur in each damage-exponent column
# before they are mapped to numeric multipliers.
unique(data[["propdmgexp"]])
## [1] "K" "M" "" "B" "m" "+" "0" "5" "6" "4" "h" "2" "7" "3" "H" "-"
unique(data[["cropdmgexp"]])
## [1] "" "M" "K" "m" "B" "?" "0" "k"
# Upper-case both exponent columns so that mixed-case labels such as
# "k"/"K", "m"/"M" and "h"/"H" collapse into a single label each.
exp_cols <- c("propdmgexp", "cropdmgexp")
data[exp_cols] <- lapply(data[exp_cols], toupper)
# Translate the damage-exponent labels into numeric multipliers, so that a
# damage amount in dollars is `propdmg * propdmgexp` (and likewise for crops).
#
# exp_multiplier() looks up every element of `labels` in `key`, a named
# numeric vector whose names are the labels and whose values are the
# multipliers. It returns a plain (unnamed) numeric vector of the same length
# as `labels`. Any label that has no entry in `key` -- this includes the empty
# string, which cannot be used as an element name in c() -- falls back to a
# multiplier of 1.
exp_multiplier <- function(labels, key) {
  multiplier <- unname(key[match(labels, names(key))])
  multiplier[is.na(multiplier)] <- 10^0
  multiplier
}

# Property damage: symbols and "0" count as 1, a digit d means 10^d, and the
# letters H/K/M/B mean 10^2 / 10^3 / 10^6 / 10^9.
propdmgKey <- c("-" = 10^0,
                "+" = 10^0,
                "0" = 10^0,
                "1" = 10^1,
                "2" = 10^2,
                "3" = 10^3,
                "4" = 10^4,
                "5" = 10^5,
                "6" = 10^6,
                "7" = 10^7,
                "8" = 10^8,
                "9" = 10^9,
                "H" = 10^2,
                "K" = 10^3,
                "M" = 10^6,
                "B" = 10^9)
data$propdmgexp <- exp_multiplier(data$propdmgexp, propdmgKey)

# Do the same for crop damage exponent labels. Only the labels listed here
# are scaled; every other label counts as 1.
cropdmgKey <- c("?" = 10^0,
                "0" = 10^0,
                "K" = 10^3,
                "M" = 10^6,
                "B" = 10^9)
data$cropdmgexp <- exp_multiplier(data$cropdmgexp, cropdmgKey)
Now consolidate similar event types into broader categories, and then total the fatalities, injuries, property damage and crop damage for each category.
# Collapse the free-text event types into broad categories.
#
# Each gsub() replaces the WHOLE label with the category name as soon as the
# keyword appears anywhere in it. The order of the calls matters: a label
# that has already been collapsed no longer contains the later keywords, so
# e.g. "THUNDERSTORM WIND" becomes "STORM" (first rule) and is never seen by
# the "WIND" rule.
data$evtype <- gsub(".*STORM.*", "STORM", data$evtype)
data$evtype <- gsub(".*FLOOD.*", "FLOOD", data$evtype)
data$evtype <- gsub(".*WIND.*", "WIND", data$evtype)
data$evtype <- gsub(".*TORN.*", "TORNADO", data$evtype)
data$evtype <- gsub(".*HAIL.*", "HAIL", data$evtype)
data$evtype <- gsub(".*HURRICANE.*", "HURRICANE", data$evtype)
data$evtype <- gsub(".*RAIN.*", "RAIN", data$evtype)
data$evtype <- gsub(".*SNOW.*", "SNOW", data$evtype)
data$evtype <- gsub(".*COLD.*", "COLD", data$evtype)
data$evtype <- gsub(".*LOW.*TEMPER.*", "COLD", data$evtype)
data$evtype <- gsub(".*FROST.*", "COLD", data$evtype)
data$evtype <- gsub(".*HIGH.*TEMPER.*", "HEAT", data$evtype)
data$evtype <- gsub(".*HEAT.*", "HEAT", data$evtype)
data$evtype <- gsub(".*FIRE.*", "FIRE", data$evtype)

# Total the harm per event category.
#
# The damage columns hold only the mantissa of each amount; the matching
# *exp column holds the numeric multiplier. The two must be multiplied before
# summing, otherwise 25 "K" and 25 "B" would contribute the same amount.
# NOTE(review): the damage totals are now scaled amounts rather than sums of
# raw mantissas, so the damage rankings and any written conclusions based on
# them should be re-checked against the regenerated plots.
#
# The grouping is applied only for this summary, so `data` itself is left
# ungrouped.
data2 <- summarise(
  group_by(data, evtype),
  all_fatalities = sum(fatalities),
  all_injuries = sum(injuries),
  all_propdmg = sum(propdmg * propdmgexp),
  all_cropdmg = sum(cropdmg * cropdmgexp)
)
Separate out the data according to fatalities, injuries, property damage and crop damage. This will help us in plotting separate graphs side-by-side, which will allow us to compare the impact of weather conditions easily.
# Return the `n` event categories with the largest totals in `value_col`.
#
# totals:    per-category summary table with an `evtype` column.
# value_col: name (string) of the numeric column to rank by.
# n:         maximum number of rows to return; fewer rows come back when the
#            table has fewer than `n` categories (no NA padding).
# Returns a two-column table (evtype, <value_col>) sorted in descending order.
top_events <- function(totals, value_col, n = 10) {
  ranked <- totals[order(-totals[[value_col]]), c("evtype", value_col)]
  head(ranked, n)
}

# Draw one bar chart for a table produced by top_events().
#
# top:        table with `evtype` and `value_col` columns, already sorted.
# value_col:  name (string) of the column plotted on the y axis.
# y_label:    y-axis label.
# plot_title: plot title.
# title_size: font size passed to theme(title = ...).
# Returns a ggplot object.
plot_top_events <- function(top, value_col, y_label, plot_title, title_size) {
  # Fix the factor levels in ranked order so that the bars are drawn from the
  # largest to the smallest total instead of alphabetically.
  bars <- data.frame(
    evtype = factor(top$evtype, levels = top$evtype),
    total = top[[value_col]]
  )
  ggplot(bars, aes(x = evtype, y = total, fill = evtype)) +
    geom_bar(stat = "identity", show.legend = FALSE) +
    xlab("Weather Events") +
    ylab(y_label) +
    ggtitle(plot_title) +
    theme(axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
          title = element_text(size = title_size))
}

# Fatalities data and plot
data_fatalities <- top_events(data2, "all_fatalities")
fatalities_plot <- plot_top_events(
  data_fatalities, "all_fatalities",
  y_label = "No. of Fatalities",
  plot_title = "Top 10 Events Causing Fatalities",
  title_size = 12
)

# Injuries data and plot
data_injuries <- top_events(data2, "all_injuries")
injuries_plot <- plot_top_events(
  data_injuries, "all_injuries",
  y_label = "No. of Injuries",
  plot_title = "Top 10 Events Causing Injuries",
  title_size = 12
)

# Health impact: fatalities and injuries side by side
grid.arrange(fatalities_plot, injuries_plot, ncol = 2)

# Property damage data and plot
data_propdmg <- top_events(data2, "all_propdmg")
propdmg_plot <- plot_top_events(
  data_propdmg, "all_propdmg",
  y_label = "Total Property Damage",
  plot_title = "Top 10 Events Causing Prop Damages",
  title_size = 9
)

# Crop damage data and plot
data_cropdmg <- top_events(data2, "all_cropdmg")
cropdmg_plot <- plot_top_events(
  data_cropdmg, "all_cropdmg",
  y_label = "Total Crop Damage",
  plot_title = "Top 10 Events Causing Crop Damages",
  title_size = 9
)

# Economic impact: property and crop damage side by side
grid.arrange(propdmg_plot, cropdmg_plot, ncol = 2)
From the first set of graphs above, we can see that TORNADOs cause the highest number of fatalities, followed by HEAT. As for injuries, TORNADOs cause the most, followed by HEAT, FLOOD, and WIND. These last three - HEAT, FLOOD, and WIND - cause a roughly similar extent of damage.
From the second set of graphs, we can deduce that HAIL and FLOOD cause the most crop damage and TORNADOs cause the most property damage.