Weather events cause both economic loss and public health problems in the United States. Property and crop damage cost millions of dollars each year, and a great number of fatalities and injuries are produced. Using data to prevent some of these consequences is a possibility we will explore in this paper.
The National Oceanic and Atmospheric Administration’s storm database describes different types of events and its outcomes (economic cost, injuries and fatalities), including when and where they occur from 1950 to 2011. Here we examine the most harmful events and focus specially in the States affected by them.
First, we download and load the file.
library(utils)
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", destfile="Storm.csv.bz2")
Storm <- read.csv("Storm.csv.bz2", header=TRUE)
Exploring the name of the columns in the data-frame gives important information about the kind of data we are dealing with.
colnames(Storm)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
Storm$DATE <- as.Date(Storm$BGN_DATE, format = "%m/%d/%Y")
Storm$YEAR <- format(Storm$DATE,'%Y')
library(plyr)
library(dplyr)
# Two new columns named 'PROPDMGEXP.N' and 'CROPDMGEXP.N' are generated based on 'PROPDMGEXP' and 'CROPDMGEXP'. For instance, if 'PROPDMGEXP' value was "K", 'PROPDMGEXP.N' will contain '1000'.
Storm$PROPDMGEXP.N <- revalue(Storm$PROPDMGEXP, c("K"="1000", "M"="10^6", "m"="10^6", "B"="10^9"))
Storm$CROPDMGEXP.N <- revalue(Storm$CROPDMGEXP, c("K"="1000", "M"="10^6", "m"="10^6", "B"="10^9"))
# 'PROPDMGEXP' is multiplied by 'PROPDMGEXP.N', obtaining the total economic loss in property damage ('PROPDMG.T'). The same process is repeated to compute total crop damage cost ('CROPDMG.T').
Storm$PROPDMG.T <- as.numeric(Storm$PROPDMG) * as.numeric(as.character(Storm$PROPDMGEXP.N))
## Warning: NAs introduced by coercion
Storm$CROPDMG.T <- as.numeric(Storm$CROPDMG) * as.numeric(as.character(Storm$CROPDMGEXP.N))
## Warning: NAs introduced by coercion
Storm$EL <- rowSums(cbind(Storm$PROPDMG.T, Storm$CROPDMG.T), na.rm=TRUE)
Initially, and with the porpouse of exploring the data, we plot three relevant variables (Economic Loss, Fatalities and Injuries) put in relation with the State in which the weather event occurrs and the type of event. This allows us to see the State with more fatalities or the event type that caused more economic loss, for example.
# First, we create six new data.frames corresponding to the total sum of (EL, FATALITIES, INJURIES) by (EVTYPE, STATE).
ELpS <- ddply(Storm, .(STATE), summarize, sum = sum(EL, na.rm = TRUE))
ELpE <- ddply(Storm, .(EVTYPE), summarize, sum = sum(EL, na.rm = TRUE))
FpS <- ddply(Storm, .(STATE), summarize, sum = sum(FATALITIES, na.rm = TRUE))
IpS <- ddply(Storm, .(STATE), summarize, sum = sum(INJURIES, na.rm = TRUE))
FpE <- ddply(Storm, .(EVTYPE), summarize, sum = sum(FATALITIES, na.rm = TRUE))
IpE <- ddply(Storm, .(EVTYPE), summarize, sum = sum(INJURIES, na.rm = TRUE))
# Then, we plot the first six rows of those data.frames (arranged by the sum).
par(mfrow=c(3,2))
barplot(head(arrange(ELpS, desc(sum)))$sum, names.arg = head(arrange(ELpS, desc(sum)))$STATE, main="Economic Loss ~ State", las=2)
barplot(head(arrange(ELpE, desc(sum)))$sum, names.arg = head(arrange(ELpE, desc(sum)))$EVTYPE, main="Economic Loss ~ Event Type",las=2)
barplot(head(arrange(FpS, desc(sum)))$sum, names.arg = head(arrange(FpS, desc(sum)))$STATE, main="Fatalities ~ State", las=2)
barplot(head(arrange(FpE, desc(sum)))$sum, names.arg = head(arrange(FpE, desc(sum)))$EVTYPE, main="Fatalities ~ Event Type", las=2)
barplot(head(arrange(IpS, desc(sum)))$sum, names.arg = head(arrange(IpS, desc(sum)))$STATE, main="Injuries ~ State", las=2)
barplot(head(arrange(IpE, desc(sum)))$sum, names.arg = head(arrange(IpE, desc(sum)))$EVTYPE, main="Injuries ~ Event Type", las=2)
As we see in the three plots of the left, Texas is the most affected State in economic loss and injuries. In fatalities Illinois appears as the most striked territory.
Tornados are, by far, the most relevant weather event in economic loss, fatalities and injuries.
We are specially interested in the economic impact and fatalities ocurred in relation with the State the events occurred. We hope this will lead to more effective prevention measures and interventions.
But results may be biased by census numbers – it’s obvious that a State with a bigger population (like Texas) will have more economic loss and fatalities. For that reason we will weight both variables with the total census of each State. This population data is obtained from census.gov, and corresponds to 2010 census.
library(ggplot2)
library(maps)
library(mapproj)
# Create 'pop' data.frame with 'region' and 'CENSUS'
Total <- NULL
download.file("https://www.census.gov/popest/data/national/totals/2015/files/NST-EST2015-alldata.csv", destfile="NST-EST2015-alldata.csv")
population <- read.csv("NST-EST2015-alldata.csv", header=TRUE)
pop <- cbind(STATE=as.numeric(population$STATE), NAME=as.character(population$NAME), CENSUS=as.numeric(population$CENSUS2010POP))
pop <- as.data.frame(pop)
pop$region <- pop$NAME
pop <- pop[-c(1:5), ]
pop$region <- tolower(pop$region)
# Merge 'pop' dataframe with 'ELpS' (Economic Loss per State), and then divide economic loss by census in each State.
for (i in 1:nrow(ELpS)) {ELpS$STATENAME[i] <- state.name[match(ELpS$STATE[i],state.abb)]}
ELpS$STATENAME <- tolower(ELpS$STATENAME)
all_states <- map_data("state")
ELpS$region <- ELpS$STATENAME
Total <- merge(all_states, ELpS, by="region")
Total <- merge(pop, Total, by="region")
Total$STATE.x <- NULL
Total$STATE.y <- NULL
for (i in 1:nrow(Total)) Total$tot[i] <- (Total$sum[i]/as.numeric(as.character(Total$CENSUS[i])))
# Create Map.
p <- ggplot() + geom_polygon(data=Total, aes(x=long, y=lat, group = group, fill=Total$tot),colour="white"
) + scale_fill_continuous(low = "thistle2", high = "darkred", guide="colorbar")
p1 <- p + theme_bw() + labs(fill = "Economic Loss \nper Capita \nin USD", title = "Economic Impact of Natural Events (1950 - 2011)", x="", y="")
p1 + scale_y_continuous(breaks=c()) + scale_x_continuous(breaks=c()) + theme(panel.border = element_blank())
# The same process as above is repeated using FpS (Fatalities per State)
for (i in 1:nrow(FpS)) {FpS$STATENAME[i] <- state.name[match(FpS$STATE[i],state.abb)]}
FpS$STATENAME <- tolower(FpS$STATENAME)
all_states <- map_data("state")
FpS$region <- FpS$STATENAME
Total <- merge(all_states, FpS, by = "region")
Total <- merge(pop, Total, by = "region")
Total$STATE.x <- NULL
Total$STATE.y <- NULL
for (i in 1:nrow(Total)) Total$tot[i] <- (Total$sum[i]/as.numeric(as.character(Total$CENSUS[i])))
p <- ggplot() + geom_polygon(data=Total, aes(x=long, y=lat, group = group, fill = Total$tot),colour="white"
) + scale_fill_continuous(low = "thistle2", high = "darkred", guide="colorbar")
p1 <- p + theme_bw() + labs(fill = "Fatalities \nper Capita", title = "Fatalities caused by Natural Events (1950 - 2011)", x="", y="")
p1 + scale_y_continuous(breaks=c()) + scale_x_continuous(breaks=c()) + theme(panel.border = element_blank())