This report aims to assess the impact of severe weather conditions in the United States with respect to population health as well as economic consequences. We used the NOAA Storm database available at: https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2
First, we load the necessary libraries:
library(data.table)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
We read the data into a data table and extract the columns that contain information about the number of injuries and fatalities, as well as the cost of damaged properties and crops:
# Read the raw storm records; string columns are loaded as factors.
dt1 <- fread("data", stringsAsFactors = TRUE)
# Keep only the columns used below (event type, casualty counts, damage
# amounts and their multiplier codes) under shorter names.
dt2 <- with(
  dt1,
  data.table(
    events = EVTYPE,
    injuries = INJURIES,
    fatalities = FATALITIES,
    property_damage = PROPDMG,
    pd_multiplier = PROPDMGEXP,
    crop_damage = CROPDMG,
    cd_multiplier = CROPDMGEXP
  )
)
To assess the impact of severe weather conditions on population health, we sum the number of injuries and fatalities for each event type, arrange the events in descending order of their combined total, and list the first few lines of the resulting table:
## Question 1
# Sum injuries and fatalities for every event type.
population_data <- aggregate(
  cbind(injuries, fatalities) ~ events,
  data = dt2,
  FUN = sum,
  na.rm = TRUE
)
# Rank event types by combined casualties, largest first.
arranged_population_data <- population_data %>%
  arrange(desc(injuries + fatalities))
head(arranged_population_data)
## events injuries fatalities
## 1 TORNADO 91346 5633
## 2 EXCESSIVE HEAT 6525 1903
## 3 TSTM WIND 6957 504
## 4 FLOOD 6789 470
## 5 LIGHTNING 5230 816
## 6 HEAT 2100 937
It appears that tornadoes cause a considerably larger number of injuries and fatalities than other events. To visualize the data, we keep only the six events with the largest totals and categorize the rest of the events as “OTHERS”:
# Keep the top (n_other - 1) events and collapse every remaining event into a
# single "OTHERS" row, so the result has n_other rows (one per pie slice).
n_other <- 7
n_total <- nrow(arranged_population_data)
# head()/tail() replace `1:n_other-1`, which parses as (1:n_other) - 1 = 0:6
# and selected the right rows only because R silently drops a 0 index.
n_top <- n_other - 1
critical_population_data1 <- head(arranged_population_data, n_top)
# A negative n makes tail() return everything except the first n_top rows.
remaining_population_data <- tail(arranged_population_data, -n_top)
other_injuries <- sum(remaining_population_data$injuries)
other_fatalities <- sum(remaining_population_data$fatalities)
critical_population_data2 <- data.frame(
  events = "OTHERS",
  injuries = other_injuries,
  fatalities = other_fatalities
)
critical_population_data3 <- rbind(
  critical_population_data1,
  critical_population_data2
)
We convert the data to percentages:
# Express each row as a share of the column total, rounded to one decimal
# place, and format it as a label such as "12.3 %" for the pie charts.
critical_population_data3 <- mutate(
  critical_population_data3,
  injuries_percent = paste(
    round(injuries / sum(injuries) * 1000) / 10, "%"
  ),
  fatalities_percent = paste(
    round(fatalities / sum(fatalities) * 1000) / 10, "%"
  )
)
We visualize the data in percentages. Note that the total numbers of injuries and fatalities are 140528 and 15145, respectively:
# Two pies side by side with thin margins.
par(mfrow = c(1, 2), mar = c(1, 1, 1, 1))
pie(
  critical_population_data3$injuries,
  labels = critical_population_data3$injuries_percent,
  col = hcl.colors(n_other),
  main = "Injuries"
)
# One legend, added to the first panel; the second pie uses the same palette
# in the same row order.
legend(
  x = 0.31,
  y = 1.05,
  legend = critical_population_data3$events,
  fill = hcl.colors(n_other),
  cex = 0.6
)
pie(
  critical_population_data3$fatalities,
  labels = critical_population_data3$fatalities_percent,
  col = hcl.colors(n_other),
  main = "Fatalities"
)
Next, we assess the economic burden of severe weather conditions by examining the total property damage and crop damage costs. The documentation provided at https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf indicates that the costs are reported in different units. We first examine the reported units to ensure that we sum the costs correctly:
# Frequency of each property-damage multiplier code.
table(dt2[["pd_multiplier"]])
##
## + - 0 1 2 3 4 5 6
## 465934 5 1 216 25 13 4 4 28 4
## 7 8 ? B H K M h m
## 5 1 8 40 6 424665 11330 1 7
# Frequency of each crop-damage multiplier code.
table(dt2[["cd_multiplier"]])
##
## 0 2 ? B K M k m
## 618413 19 1 7 9 281832 1994 21 1
We then convert the units to multipliers:
# Translate the letter codes (either case) into numeric multipliers:
# H = hundred, K = thousand, M = million, B = billion.
# Any other code (blank, digits, "+", "-", "?") is not matched here, so its
# multiplier stays NA.
dt2$pd_multiplier_num[dt2$pd_multiplier %in% c("h", "H")] <- 1e2
dt2$pd_multiplier_num[dt2$pd_multiplier %in% c("k", "K")] <- 1e3
dt2$pd_multiplier_num[dt2$pd_multiplier %in% c("m", "M")] <- 1e6
dt2$pd_multiplier_num[dt2$pd_multiplier %in% c("b", "B")] <- 1e9
# Same mapping for the crop-damage codes.
dt2$cd_multiplier_num[dt2$cd_multiplier %in% c("h", "H")] <- 1e2
dt2$cd_multiplier_num[dt2$cd_multiplier %in% c("k", "K")] <- 1e3
dt2$cd_multiplier_num[dt2$cd_multiplier %in% c("m", "M")] <- 1e6
dt2$cd_multiplier_num[dt2$cd_multiplier %in% c("b", "B")] <- 1e9
Subsequently, we use the multipliers to compute the property damage and crop damage costs in dollars:
# Scale the reported amounts by their multipliers. The damage columns are
# overwritten in place, so running these two lines a second time would apply
# the multipliers twice.
dt2$property_damage <- with(dt2, property_damage * pd_multiplier_num)
dt2$crop_damage <- with(dt2, crop_damage * cd_multiplier_num)
Now we can sort the costs in descending order, similar to what we did earlier:
# Total property and crop damage per event type.
# na.action = na.pass is required: aggregate()'s formula method defaults to
# na.omit, which discards a whole record when either damage column is NA
# (amounts whose unit code was not converted to a multiplier). That would also
# throw away the valid amount in the other column. With na.pass the records are
# kept and sum(na.rm = TRUE) skips only the NA values themselves.
cost_data <- aggregate(
  cbind(property_damage, crop_damage) ~ events,
  data = dt2,
  FUN = sum,
  na.rm = TRUE,
  na.action = na.pass
)
# Rank event types by combined damage, largest first.
arranged_cost_data <- arrange(cost_data, desc(property_damage + crop_damage))
head(arranged_cost_data)
## events property_damage crop_damage
## 1 FLOOD 132836489050 5170955450
## 2 HURRICANE/TYPHOON 26740295000 2607872800
## 3 TORNADO 16166771690 353376460
## 4 HURRICANE 9716358000 2688910000
## 5 RIVER FLOOD 5079635000 5028734000
## 6 HAIL 7991783690 2028807900
A quick examination of the numbers suggests that the largest cost is associated with floods. Now we can rearrange the data to view only the damage costs associated with the six most costly events, and then categorize the rest of the events as “OTHERS”:
# Keep the (n_other - 1) costliest events and collapse every remaining event
# into a single "OTHERS" row.
# head()/tail() replace `1:n_other-1`, which parses as (1:n_other) - 1, and
# remove the dependency on a row count taken from the casualty table.
n_top_cost <- n_other - 1
critical_cost_data1 <- head(arranged_cost_data, n_top_cost)
# A negative n makes tail() return everything except the first n_top_cost rows.
remaining_cost_data <- tail(arranged_cost_data, -n_top_cost)
# Columns are addressed by name: the original summed the crop-damage column
# for both totals, so the "OTHERS" property figure was wrong.
other_property <- sum(remaining_cost_data$property_damage, na.rm = TRUE)
other_crop <- sum(remaining_cost_data$crop_damage, na.rm = TRUE)
critical_cost_data2 <- data.frame(
  events = "OTHERS",
  property_damage = other_property,
  crop_damage = other_crop
)
critical_cost_data3 <- rbind(critical_cost_data1, critical_cost_data2)
We convert all costs to billions of dollars:
# Format each cost in billions of dollars with one decimal place,
# e.g. "$12.3B", for use as pie-chart labels.
critical_cost_data3 <- mutate(
  critical_cost_data3,
  property_damage_amount = paste0(
    "$", round(property_damage / 1e8) / 10, "B"
  ),
  crop_damage_amount = paste0(
    "$", round(crop_damage / 1e8) / 10, "B"
  )
)
We visualize the total cost in billions of dollars. Note that the total damage costs for properties and crops are $211.3B and $30.7B, respectively:
# Two pies side by side with thin margins.
par(mfrow = c(1, 2), mar = c(1, 1, 1, 1))
pie(
  critical_cost_data3$property_damage,
  labels = critical_cost_data3$property_damage_amount,
  col = hcl.colors(n_other),
  main = "Property Damage"
)
# One legend, added to the first panel; the second pie uses the same palette
# in the same row order.
legend(
  x = -0.1,
  y = 1.05,
  legend = critical_cost_data3$events,
  fill = hcl.colors(n_other),
  cex = 0.6
)
pie(
  critical_cost_data3$crop_damage,
  labels = critical_cost_data3$crop_damage_amount,
  col = hcl.colors(n_other),
  main = "Crop Damage"
)