For this project, we use the storm database from the National Oceanic and Atmospheric Administration (NOAA). First, we load the data into R using the read.csv() function.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the raw NOAA storm data; read.csv() reads the bz2-compressed CSV
# directly from the working directory.
df <- read.csv(file = "repdata_data_StormData.csv.bz2", header = TRUE)
The dplyr library helps us process the data.
The data has 37 columns, most of which are not relevant to our analysis, so we keep only EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG and CROPDMGEXP.
# Keep only the event type plus the health and damage columns used below.
df2 <- df[c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)]
To find out how much each event type affects the population, let's sum the injuries and fatalities for each event type.
# Total injuries and fatalities per event type (one row per EVTYPE).
health <- aggregate(
  df2[c("INJURIES", "FATALITIES")],
  by = list(EVTYPE = df2$EVTYPE),
  FUN = sum,
  na.rm = TRUE
)
# Give the two summed columns descriptive names; EVTYPE is already named.
names(health)[-1] <- c("Total_Injuries", "Total_Fatalities")
Now we have the totals for each event, so we also need the combined sum of injuries and fatalities.
# Combined casualty count (injuries + fatalities) per event type.
health[["Total"]] <- with(health, Total_Injuries + Total_Fatalities)
# Inspect the structure of the summary table.
str(health)
## 'data.frame': 985 obs. of 4 variables:
## $ EVTYPE : chr " HIGH SURF ADVISORY" " COASTAL FLOOD" " FLASH FLOOD" " LIGHTNING" ...
## $ Total_Injuries : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Total_Fatalities: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Total : num 0 0 0 0 0 0 0 0 0 0 ...
We can see that there are many rows with 0 for both injuries and fatalities.
Let’s eliminate those rows.
# Drop only the event types that caused neither injuries nor fatalities.
# The condition must be an OR: with `&`, an event with injuries but no
# fatalities (or the reverse) would also be discarded, which removes real
# casualties from the table and from the totals computed from it.
health <- filter(health, Total_Fatalities != 0 | Total_Injuries != 0)
# Inspect the structure of the reduced table.
str(health)
## 'data.frame': 106 obs. of 4 variables:
## $ EVTYPE : chr "AVALANCHE" "BLACK ICE" "BLIZZARD" "blowing snow" ...
## $ Total_Injuries : num 170 24 805 1 13 2 1 48 12 342 ...
## $ Total_Fatalities: num 224 1 101 1 1 3 3 35 95 18 ...
## $ Total : num 394 25 906 2 14 5 4 83 107 360 ...
Now we need to order the data. Negating the sort key (-Total, which is equivalent to desc(Total)) sorts it in decreasing order.
# Sort event types from the highest to the lowest combined casualty count.
health <- arrange(health, desc(Total))
Now we check which event is the most harmful.
# Rows holding the largest injury count and the largest fatality count.
top_injury_row <- which.max(health$Total_Injuries)
top_fatality_row <- which.max(health$Total_Fatalities)

# k / p: event types with the most injuries / fatalities.
k <- health[top_injury_row, "EVTYPE"]
p <- health[top_fatality_row, "EVTYPE"]

# i / o: share of all injuries / fatalities caused by those events.
# The shares are taken from the same rows as k and p rather than from a
# hard-coded event name, so the reported event and its share cannot diverge.
i <- health[top_injury_row, "Total_Injuries"] / sum(health$Total_Injuries)
o <- health[top_fatality_row, "Total_Fatalities"] /
  sum(health$Total_Fatalities)
Now we can see that the most harmful event in terms of injuries is TORNADO. For fatalities, TORNADO is also the most harmful.
It is important to know how large this effect is: the share of injuries caused by TORNADO over all events (as a proportion) is: 0.6516522
Likewise, for fatalities, the share over all events is: 0.3770667
In our data, the damage exponent columns contain different kinds of values, such as “M” or “B”. These are magnitude prefixes similar to those of SI units. To calculate the total economic consequences, we need to replace each letter with a numeric value (the power of ten it stands for).
# Convert damage exponent codes into numeric powers of ten.
#
# The same substitution rules are applied to both exponent columns, so they
# live in one helper instead of two copy-pasted sequences.
#
# codes: character vector of exponent codes as found in the raw data.
# Returns a numeric vector of the same length. H/K/M/B (either case) map to
# 2/3/6/9, "+" maps to 1, "?", "-" and a space map to 0, text that is already
# numeric is kept, and anything that still cannot be parsed as a number
# (including the empty string) becomes 0.
decode_exponent <- function(codes) {
  rules <- c(
    "[Hh]" = "2",
    "[Kk]" = "3",
    "[Mm]" = "6",
    "[Bb]" = "9",
    "\\+" = "1",
    "\\?|\\-|\\ " = "0"
  )
  for (pattern in names(rules)) {
    codes <- gsub(pattern, rules[[pattern]], codes)
  }
  power <- as.numeric(codes)
  # Empty or unrecognised codes carry no multiplier: treat them as 10^0.
  power[is.na(power)] <- 0
  power
}

df2$PROPDMGEXP <- decode_exponent(df2$PROPDMGEXP)
df2$CROPDMGEXP <- decode_exponent(df2$CROPDMGEXP)
Adding the final value (the damage multiplied by 10 raised to the power of the exponent)
# Damage amount = reported value * 10^exponent, for property and crops.
df2$PROPDMGTOTAL <- df2$PROPDMG * 10^df2$PROPDMGEXP
df2$CROPDMGTOTAL <- df2$CROPDMG * 10^df2$CROPDMGEXP
Now we can calculate the total economic consequences by event and also add up both kinds of damage for each event.
# Total property and crop damage per event type (one row per EVTYPE).
loss <- aggregate(
  df2[c("PROPDMGTOTAL", "CROPDMGTOTAL")],
  by = list(EVTYPE = df2$EVTYPE),
  FUN = sum,
  na.rm = TRUE
)
# Combined economic loss (property + crop) per event type.
loss[["TOTAL"]] <- with(loss, PROPDMGTOTAL + CROPDMGTOTAL)
Lastly, we order the data to make it easier to work with.
# Sort event types from the highest to the lowest combined economic loss.
loss <- arrange(loss, desc(TOTAL))
# Show the six costliest event types.
head(loss, n = 6L)
## EVTYPE PROPDMGTOTAL CROPDMGTOTAL TOTAL
## 1 FLOOD 144657709807 5661968450 150319678257
## 2 HURRICANE/TYPHOON 69305840000 2607872800 71913712800
## 3 TORNADO 56947381217 414953270 57362334487
## 4 STORM SURGE 43323536000 5000 43323541000
## 5 HAIL 15735267513 3025954473 18761221986
## 6 FLASH FLOOD 16822673979 1421317100 18243991079
After completing all the data processing, we can look at the results.
For the most damaging events, we show only the 10 most harmful.
# Two stacked panels: injuries on top, fatalities below, for the first ten
# rows of `health` (the table is sorted by combined casualties above).
top_rows <- seq_len(10)
top_events <- health$EVTYPE[top_rows]
par(mfrow = c(2, 1))
barplot(
  height = health$Total_Injuries[top_rows],
  names.arg = top_events,
  main = "Total injuries in US by event",
  col = "orange",
  cex.names = 0.7,
  las = 2
)
barplot(
  height = health$Total_Fatalities[top_rows],
  names.arg = top_events,
  main = "Total fatalities in US by event",
  col = "red",
  cex.names = 0.7,
  las = 2
)
Also, here are the top 10 events that had the greatest economic consequences.
# Two stacked panels: property damage on top, crop damage below, for the
# first ten rows of `loss` (the table is sorted by combined loss above).
costliest_rows <- seq_len(10)
costliest_events <- loss$EVTYPE[costliest_rows]
par(mfrow = c(2, 1))
barplot(
  height = loss$PROPDMGTOTAL[costliest_rows],
  names.arg = costliest_events,
  main = "Total propiety damages by event",
  col = "orange",
  cex.names = 0.7,
  las = 2
)
barplot(
  height = loss$CROPDMGTOTAL[costliest_rows],
  names.arg = costliest_events,
  main = "Total crop damages by event",
  col = "red",
  cex.names = 0.7,
  las = 2
)
As we can see, in the US the most harmful event for population health is the tornado. However, it does not cause the greatest economic consequences; floods do. At this point, we need to decide what is more important for a nation: the lives of its population or its economy.