In this report we analyse how severe weather events affect population health and what economic consequences they have. To do so, we explore the NOAA Storm Database. The events in the database start in the year 1950 and end in November 2011.
We want to address the following questions:
- Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health (fatalities and injuries)?
- Across the United States, which types of events have the greatest economic consequences?
The following packages are needed for the analysis:
library(dplyr)
library(reshape2)
library(ggplot2)
Data can be downloaded from here
# Load the raw storm data; read.csv() reads the bzip2-compressed CSV directly.
DF <- read.csv(file = "repdata-data-StormData.csv.bz2")
After reading the file, we check its dimensions and inspect the first columns:
# Number of rows and columns in the raw data set.
c(nrow(DF), ncol(DF))
## [1] 902297 37
# Preview the first six rows of the first twelve columns.
head(DF[, seq_len(12)], n = 6L)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE
## 1 TORNADO 0
## 2 TORNADO 0
## 3 TORNADO 0
## 4 TORNADO 0
## 5 TORNADO 0
## 6 TORNADO 0
# Keep only the columns used in the analysis: event date and type, the
# health impact counts, and the damage amounts with their magnitude codes.
DF_clean <- DF[, c(
  "BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)]

# Drop rows that contain an NA in any of the selected columns.
DF_clean <- DF_clean[complete.cases(DF_clean), ]

# Collapse related event types into a single label. Each entry maps a search
# pattern (name) to the label (value) given to every event type containing
# it. The patterns are applied in order, and later patterns see the labels
# written by earlier ones (e.g. "THUNDERSTORM WIND" becomes "THUNDERSTORM"
# before the "WIND" pattern runs), so the order must be kept.
# Matching ignores case so that mixed-case entries such as "Heat Wave" are
# grouped together with their upper-case counterparts.
DF_clean$EVTYPE <- local({
  event_groups <- c(
    "HEAT" = "HEAT",
    "WARM" = "HEAT",
    "RIP CURRENT" = "RIP CURRENT",
    "FLOOD" = "FLOOD",
    "THUNDERSTORM" = "THUNDERSTORM",
    "WIND" = "WIND",
    "SNOW" = "SNOW",
    "FIRE" = "WILDFIRE",
    "HURRICANE" = "HURRICANE",
    "TYPHOON" = "HURRICANE"
  )
  evtype <- DF_clean$EVTYPE
  for (pattern in names(event_groups)) {
    hits <- grepl(pattern, evtype, ignore.case = TRUE)
    evtype[hits] <- event_groups[[pattern]]
  }
  evtype
})

# Rename "EVTYPE" to "events" (selected by name rather than by position).
names(DF_clean)[names(DF_clean) == "EVTYPE"] <- "events"
# Convert the property-damage magnitude code into a numeric power of ten.
# Letter codes are matched case-insensitively (H = 2, K = 3, M = 6, B = 9),
# a single digit is used as the exponent itself, and everything else
# (blank, "+", "-", "?", NA or any unrecognised symbol) becomes 0, so no NA
# can leak into values computed from this column. Working on the character
# representation makes the result independent of whether the column was
# read as a factor or as character.
DF_clean$PROPDMGEXP <- local({
  letter_power <- c(H = 2, K = 3, M = 6, B = 9)
  symbol <- toupper(as.character(DF_clean$PROPDMGEXP))
  power <- numeric(length(symbol))  # default exponent 0

  is_digit <- grepl("^[0-9]$", symbol)
  power[is_digit] <- as.numeric(symbol[is_digit])

  is_letter <- symbol %in% names(letter_power)
  power[is_letter] <- unname(letter_power[symbol[is_letter]])

  power
})
table(DF_clean$PROPDMGEXP)
##
## 0 1 2 3 4 5 6 7 8 9
## 466164 25 20 424669 4 28 11341 5 1 40
# Convert the crop-damage magnitude code into a numeric power of ten.
# Letter codes are matched case-insensitively (H = 2, K = 3, M = 6, B = 9),
# a single digit is used as the exponent itself, and everything else
# (blank, "+", "-", "?", NA or any unrecognised symbol) becomes 0, so no NA
# can leak into values computed from this column. Working on the character
# representation makes the result independent of whether the column was
# read as a factor or as character.
DF_clean$CROPDMGEXP <- local({
  letter_power <- c(H = 2, K = 3, M = 6, B = 9)
  symbol <- toupper(as.character(DF_clean$CROPDMGEXP))
  power <- numeric(length(symbol))  # default exponent 0

  is_digit <- grepl("^[0-9]$", symbol)
  power[is_digit] <- as.numeric(symbol[is_digit])

  is_letter <- symbol %in% names(letter_power)
  power[is_letter] <- unname(letter_power[symbol[is_letter]])

  power
})
table(DF_clean$CROPDMGEXP)
##
## 0 2 3 6 9
## 618439 1 281853 1995 9
# Total damage per event: property damage plus crop damage, each amount
# multiplied by ten raised to its exponent column.
DF_clean$Total_DM <- with(
  DF_clean,
  PROPDMG * 10^PROPDMGEXP + CROPDMG * 10^CROPDMGEXP
)
Now we collect all the results in one list: