This is a simple analysis of the NOAA Storm Database. The goal is to identify which types of events are most harmful to population health, and which types of events have the greatest economic consequences.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3
First, let’s load raw data into R, and perform the data transformation:
# Read the raw storm events table; the relative path is resolved against the
# current working directory.
storm_csv <- "StormData.csv"
stormdata <- read.csv(storm_csv)
# Normalise the free-text event types so that spelling variants of the same
# phenomenon are aggregated together. Each rule is a regular expression (the
# name) and its replacement (the value). Rules are applied in order, each one
# to the output of the previous one, so the order below is significant.
evtype_rules <- c(
  # "TSTM" is the NWS abbreviation for thunderstorm (e.g. "TSTM WIND"),
  # so it belongs with the thunderstorm events, not with tropical storms.
  "^tstm.*" = "thunderstorm",
  "tropical storm.*" = "tropical storm",
  "^tornado.*" = "tornado",
  "^thunder.*" = "thunderstorm",
  "^wild.*" = "wildfire",
  ".*flash flood.*" = "flash flood",
  "^heat.*" = "heat",
  "^hurricane.*" = "hurricane",
  "^rip.*" = "rip current",
  "winter weather.*" = "winter weather"
)

# Lower-case first so the patterns above only need to match one casing.
evtype <- tolower(stormdata$EVTYPE)
for (pattern in names(evtype_rules)) {
  evtype <- sub(pattern, evtype_rules[[pattern]], evtype)
}
stormdata$EVTYPE <- evtype
# Multiplier for each recognised damage-exponent code. Letter codes stand for
# hundred / thousand / million / billion; a digit d is read as 10^d. The
# digit "1" is included so that every digit from 1 to 9 follows the same
# rule ("0" would be 10^0 = 1, which is already the default below).
damage_multiplier <- c(
  "1" = 1e1,
  "h" = 1e2, "H" = 1e2, "2" = 1e2,
  "k" = 1e3, "K" = 1e3, "3" = 1e3,
  "4" = 1e4,
  "5" = 1e5,
  "m" = 1e6, "M" = 1e6, "6" = 1e6,
  "7" = 1e7,
  "8" = 1e8,
  "b" = 1e9, "B" = 1e9, "9" = 1e9
)

# Scale raw damage amounts by their exponent codes.
#   amount:   numeric vector of damage figures.
#   exponent: vector of exponent codes (character or factor), same length.
# Returns a numeric vector; amounts whose code is missing or not in
# damage_multiplier are returned unchanged.
scale_damage <- function(amount, exponent) {
  idx <- match(as.character(exponent), names(damage_multiplier))
  multiplier <- unname(damage_multiplier[idx])
  multiplier[is.na(multiplier)] <- 1
  amount * multiplier
}

# Overwrite the raw figures with dollar amounts. Because the columns are
# replaced in place, this step must be run only once per load of the data.
stormdata$PROPDMG <- scale_damage(stormdata$PROPDMG, stormdata$PROPDMGEXP)
stormdata$CROPDMG <- scale_damage(stormdata$CROPDMG, stormdata$CROPDMGEXP)
To identify which types of events are most harmful to population health, we calculate the sum of fatalities and injuries for each event type and plot the result:
# Total casualties (fatalities + injuries) per event type. aggregate() names
# the grouping column Group.1 and the summed column x.
healthdamage <- aggregate(
  stormdata$FATALITIES + stormdata$INJURIES,
  by = list(stormdata$EVTYPE),
  FUN = sum
)
sortDamage <- healthdamage[order(healthdamage$x, decreasing = TRUE), ]

# head() returns at most five rows; indexing with 1:5 would pad the result
# with NA rows if fewer than five event types were present.
top5 <- head(sortDamage, 5)

# ggplot() is used instead of the deprecated qplot(); the explicit x label
# replaces the default "Group.1".
ggplot(top5, aes(x = Group.1, y = x)) +
  geom_point() +
  xlab("Event Type") +
  ylab("Health Damage")
The storm type “tornado” clearly causes the most damage to population health.
Now let’s move on to the second analysis. By calculating the total of PROPDMG and CROPDMG for each event type, we can identify the storm type that causes the greatest economic consequences.
# Total economic damage (property + crop) per event type. aggregate() names
# the grouping column Group.1 and the summed column x.
concequence <- aggregate(
  stormdata$PROPDMG + stormdata$CROPDMG,
  by = list(stormdata$EVTYPE),
  FUN = sum
)
sortDamage2 <- concequence[order(concequence$x, decreasing = TRUE), ]

# head() returns at most five rows; indexing with 1:5 would pad the result
# with NA rows if fewer than five event types were present.
top52 <- head(sortDamage2, 5)

# ggplot() is used instead of the deprecated qplot(); the explicit x label
# replaces the default "Group.1".
ggplot(top52, aes(x = Group.1, y = x)) +
  geom_point() +
  xlab("Event Type") +
  ylab("Economic Consequence")
The plot suggests that “flood” events cause the greatest economic consequences.