Data from the NOAA Storm Database were analyzed to understand how weather events affect people and property in the United States. Our analysis identifies tornadoes as the most harmful weather event for people (considering both deaths and injuries), whereas floods pose the most serious risk to property and crops, with about 150 billion dollars lost over the period covered by the database (1950-2011).

### Data Processing

Let’s download the bz2 file.
# Location of the compressed storm data set
fileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# Fetch the archive into the working directory as "data.bz2" (curl method)
download.file(url = fileURL, destfile = "data.bz2", method = "curl")
Now let’s read the file and keep only the information relevant to our purpose. The events in the database start in 1950 and end in November 2011; however, we do not need the dates for this analysis.
# Load the downloaded archive into a data frame
data <- read.csv("data.bz2")
# Keep only the event type plus the casualty and damage columns
data <- data[, c(
  "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)]
# Show the structure of the reduced data set
str(data)
## 'data.frame': 902297 obs. of 7 variables:
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
Let’s organize the data on fatalities and injuries. First, we sum these values for each type of weather event; then we select the top 10 event types in each group.
# Fatalities: total per event type, tagged with its condition label,
# then reduced to the ten event types with the highest totals
fatalEvents <- aggregate(FATALITIES ~ EVTYPE, data = data, FUN = sum)
fatalEvents <- data.frame(
  fatalEvents,
  condition = rep("fatality", times = nrow(fatalEvents))
)
fatalEvents <- fatalEvents[order(-fatalEvents$FATALITIES)[1:10], ]

# Injuries: same steps as above, applied to the INJURIES column
injuriesEvents <- aggregate(INJURIES ~ EVTYPE, data = data, FUN = sum)
injuriesEvents <- data.frame(
  injuriesEvents,
  condition = rep("injuries", times = nrow(injuriesEvents))
)
injuriesEvents <- injuriesEvents[order(-injuriesEvents$INJURIES)[1:10], ]

# Preview the first rows of the fatalities ranking
head(fatalEvents, n = 6L)
## EVTYPE FATALITIES condition
## 834 TORNADO 5633 fatality
## 130 EXCESSIVE HEAT 1903 fatality
## 153 FLASH FLOOD 978 fatality
## 275 HEAT 937 fatality
## 464 LIGHTNING 816 fatality
## 856 TSTM WIND 504 fatality
# Preview the first rows of the injuries ranking
head(injuriesEvents, n = 6L)
## EVTYPE INJURIES condition
## 834 TORNADO 91346 injuries
## 856 TSTM WIND 6957 injuries
## 170 FLOOD 6789 injuries
## 130 EXCESSIVE HEAT 6525 injuries
## 464 LIGHTNING 5230 injuries
## 275 HEAT 2100 injuries
# Combine fatalities and injuries into a single object. rbind() needs
# matching column names, so the injuries table takes over the names of the
# fatalities table; after this the FATALITIES column holds the count for
# whichever condition the row belongs to.
names(injuriesEvents) <- names(fatalEvents)
damage <- rbind(fatalEvents, injuriesEvents)
# Keep only the rows with a non-zero count
damage <- damage[which(damage$FATALITIES != 0), ]
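Now, let’s convert the exponent codes for crop and property damage (CROPDMGEXP and PROPDMGEXP) into numeric multipliers. Remember that “H” (or “h”) stands for hundred (10^2), “K” (or “k”) for thousand (10^3), “M” (or “m”) for million (10^6), and “B” (or “b”) for billion (10^9). Numeric codes are used directly as powers of ten, and any other code leaves the damage value unscaled.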
# Lookup table from exponent letter to power of ten
# (H = hundred, K = thousand, M = million, B = billion; either case).
# Built inline so that base R's `letters` constant is not shadowed.
matriz <- data.frame(
  letters = c("k", "K", "m", "M", "B", "b", "H", "h"),
  values = c(3, 3, 6, 6, 9, 9, 2, 2)
)

# Convert a vector of exponent codes into numeric multipliers.
#   codes: vector of exponent codes, e.g. data$PROPDMGEXP.
# Returns a numeric vector of the same length as `codes`:
#   10^power for letters listed in `matriz`,
#   10^n     for codes that parse as a number n,
#   1        for anything else (empty strings, symbols, NA).
expToMultiplier <- function(codes) {
  codes <- as.character(codes)
  power <- matriz$values[match(codes, matriz$letters)]
  unmatched <- is.na(power)
  # Codes such as "+", "-" or "?" are expected and must become NA here, so
  # the coercion warning is silenced for this single conversion only.
  power[unmatched] <- suppressWarnings(as.numeric(codes[unmatched]))
  ifelse(is.na(power), 1, 10^power)
}

# Replace the exponent codes by their multipliers
data$CROPDMGEXP <- expToMultiplier(data$CROPDMGEXP)
data$PROPDMGEXP <- expToMultiplier(data$PROPDMGEXP)

# Add a column with the total value of property and crop damage
result <- (data$PROPDMG * data$PROPDMGEXP) + (data$CROPDMG * data$CROPDMGEXP)
data <- cbind(data, result)

# Just to see our data
head(data)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP result
## 1 TORNADO 0 15 25.0 1000 0 1 25000
## 2 TORNADO 0 0 2.5 1000 0 1 2500
## 3 TORNADO 0 2 25.0 1000 0 1 25000
## 4 TORNADO 0 2 2.5 1000 0 1 2500
## 5 TORNADO 0 2 2.5 1000 0 1 2500
## 6 TORNADO 0 6 2.5 1000 0 1 2500
According to our results, tornadoes are the weather event most harmful to population health.
# Sum the economic damage (the `result` column: property + crop) per event type
total <- aggregate(result ~ EVTYPE, data = data, FUN = sum)
# Rank by total damage, largest first, and keep the first ten event types
total <- total[order(-total$result)[1:10], ]
head(total, n = 6L)
## EVTYPE result
## 170 FLOOD 150319678257
## 411 HURRICANE/TYPHOON 71913712800
## 834 TORNADO 57362333946
## 670 STORM SURGE 43323541000
## 244 HAIL 18761221986
## 153 FLASH FLOOD 18243991078
# Plot the top event types for fatalities and injuries, one facet per
# condition. In `damage` the FATALITIES column holds the count for both the
# "fatality" and the "injuries" rows.
library("ggplot2")
g <- ggplot(damage, aes(x = EVTYPE, y = FATALITIES))
# Axis labels follow the aesthetic mapping: event types on x, counts on y
# (they were previously swapped). Free y scales let each facet use its own
# range, since injuries are far more numerous than fatalities.
g +
  geom_bar(stat = "identity") +
  facet_grid(condition ~ ., scales = "free_y") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Type of event") +
  ylab("Number of affected people")
According to our analysis, floods are the weather event with the greatest economic consequences.
# Bar chart of the total damage for the event types kept in `total`
g <- ggplot(total, aes(x = EVTYPE, y = result))
g +
  geom_bar(stat = "identity", fill = "green") +
  ggtitle("Top 10 Weather Events damaging properties and crop") +
  xlab("Type of event") +
  ylab("Total damage ($)") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))