StormDataAnalyses

Data processing

download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", "stormdata.bz2")
stormdata <- read.csv("stormdata.bz2")
dim(stormdata)

## [1] 902297     37

length(unique(stormdata$EVTYPE))

## [1] 985

After loading our data and having a quick look with the dim() and unique() functions we can see that we are dealing with a very large and messy dataset. Calling unique on the EVTYPE variable returns 985 different results.

To make our work easier we will remove any data variables not required for our analyses.

According to the NOAA full reporting of all 48 weather events only began in 1996. To get an accurate comparison of weather events we are only going to look at data from 1996 onwards:

stormdata$BGN_DATE <- as.Date(as.character(stormdata$BGN_DATE), format = "%m/%d/%Y %H:%M:%OS")
stormdata1 <- subset(stormdata, BGN_DATE >= "1996-01-01")
stormdata2 <- stormdata1[, c("BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES", "PROPDMG","PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]

Effects on population health

First we are going to look at which events cause the most fatalities. We subset the data so that we only have observations with one fatality or more and then we aggregate the total number of fatalities by event type.

fatalevent <- stormdata2[stormdata2$FATALITIES>0,]
fatalevent_bytype <- aggregate(fatalevent$FATALITIES, list(fatalevent$EVTYPE), sum)
names(fatalevent_bytype) <- c("EVTYPE", "FATALITIES")
dim(fatalevent_bytype)

## [1] 109   2

A quick look at our aggregated fatality data above reveals 109 unique weather events causing fatalities. This is over double the official 48 events. To deal with this we are going to take the 3rd quantile (ie events that caused 42 deaths or more) and manually group different EVTYPES which correspond to the same event.

thirdquantile <- subset(fatalevent_bytype, FATALITIES >= 42)
thirdquantile <- arrange(thirdquantile, desc(FATALITIES))

This narrows down our data to 28 different events, however it is obvious there is significant overlap with names like Excessive Heat and Heat, TSTM WIND and Thunderstorm wind, RIP CURRENT and RIP CURRENTS. We are therefore going to rename these obsevations to their official names and re-aggregate our data:

thirdquantile[7,1] <- "THUNDERSTORM WIND"
thirdquantile[8,1] <- "EXCESSIVE HEAT"
thirdquantile[11,1] <- "RIP CURRENT"
fatalities_sorted <- aggregate(thirdquantile$FATALITIES, list(thirdquantile$EVTYPE), sum)
names(fatalities_sorted) <- c("EVTYPE", "FATALITIES")
fatalities_sorted <- arrange(fatalities_sorted, desc(FATALITIES))
head(fatalities_sorted)

##           EVTYPE FATALITIES
## 1 EXCESSIVE HEAT       2034
## 2        TORNADO       1511
## 3    FLASH FLOOD        887
## 4      LIGHTNING        651
## 5    RIP CURRENT        542
## 6          FLOOD        414

We repeat the same process for injuries:

injuryevent <- stormdata2[stormdata2$INJURIES>0,]
injurybytype <- aggregate(injuryevent$INJURIES, list(injuryevent$EVTYPE), sum)
names(injurybytype) <- c("EVTYPE", "INJURIES")
dim(injurybytype)

## [1] 106   2

Like we did for fatality data we take the third quantile:

injury_3rdquant <- filter(injurybytype, INJURIES>=154) 
injury_3rdquant <- arrange(injury_3rdquant, desc(INJURIES))

Cleaning up our EVTYPE names:

injury_3rdquant[5,1] <- "THUNDERSTORM WIND"
injury_sorted <- aggregate(injury_3rdquant$INJURIES, list(injury_3rdquant$EVTYPE), sum)
names(injury_sorted) <- c("EVTYPE", "INJURIES")
injury_sorted <- arrange(injury_sorted, desc(INJURIES))
head(injury_sorted)

##              EVTYPE INJURIES
## 1           TORNADO    20667
## 2             FLOOD     6758
## 3    EXCESSIVE HEAT     6391
## 4 THUNDERSTORM WIND     5029
## 5         LIGHTNING     4141
## 6       FLASH FLOOD     1674

Economic effects

Looking at our data we can see that next to Cropdmg and Propdmg variables there is an exponent column specifying the size of the number.

table(stormdata2$CROPDMGEXP)

## 
##             ?      0      2      B      k      K      m      M 
## 373069      0      0      0      4      0 278686      0   1771

table(stormdata2$PROPDMGEXP)

## 
##             -      ?      +      0      1      2      3      4      5 
## 276185      0      0      0      1      0      0      0      0      0 
##      6      7      8      B      h      H      K      m      M 
##      0      0      0     32      0      0 369938      0   7374

From the above tables we can see that the majority of values are in B (billions) M (millions) and K (thousands). Since we want to know which events cause the greatest amount of economic damage we are only going to look at events that caused damage in the billions (B) and millions (M).

propertydmg <- stormdata2[stormdata2$PROPDMGEXP== "M" | stormdata2$PROPDMGEXP== "B",]
dim(propertydmg)

## [1] 7406    8

cropdmg <- stormdata2[stormdata2$CROPDMGEXP== "M" | stormdata2$CROPDMGEXP== "B",]
dim(cropdmg)

## [1] 1775    8

Next we convert observations with a “B” exponent into “millions” so that all PROPDMG and CROPDMG numbers are the same units:

propertydmg$millions <- with(propertydmg, ifelse(PROPDMGEXP=="B", PROPDMG*1000, PROPDMG))
cropdmg$millions <- with(cropdmg, ifelse(CROPDMGEXP=="B", CROPDMG*1000, CROPDMG))

Now that we have done away with the exponent column we can aggregate our data:

propertydmg_byevent <- aggregate(propertydmg$millions, list(propertydmg$EVTYPE), sum)
names(propertydmg_byevent) <- c("EVTYPE", "PROPERTY_DMG_MILLIONS")
propertydmg_byevent <- arrange(propertydmg_byevent, desc(PROPERTY_DMG_MILLIONS))
head(propertydmg_byevent)

##              EVTYPE PROPERTY_DMG_MILLIONS
## 1             FLOOD             143140.66
## 2 HURRICANE/TYPHOON              69303.87
## 3       STORM SURGE              43174.93
## 4           TORNADO              23447.22
## 5              HAIL              14032.06
## 6       FLASH FLOOD              13987.63

Repeat the same process for crop damage:

cropdmg_byevent <- aggregate(cropdmg$millions, list(cropdmg$EVTYPE), sum)
names(cropdmg_byevent) <- c("EVTYPE", "CROP_DMG_MILLIONS")
cropdmg_byevent <- arrange(cropdmg_byevent, desc(CROP_DMG_MILLIONS))
head(cropdmg_byevent)

##              EVTYPE CROP_DMG_MILLIONS
## 1           DROUGHT          13346.62
## 2             FLOOD           4827.78
## 3         HURRICANE           2739.31
## 4 HURRICANE/TYPHOON           2604.17
## 5              HAIL           1979.67
## 6      EXTREME COLD           1284.64

Results

Plotting our results for population health analyses:

fatalitieshead <- head(fatalities_sorted)
par(mfrow=c(2,1))
barplot(fatalitieshead$FATALITIES, main = "Top 6 causes of fatalities", xlab = "EVTYPE", ylab = "FATALITIES", names.arg = fatalitieshead$EVTYPE, legend.text = TRUE, cex.names = 0.35, col = rainbow(5), cex.axis = 0.5, ylim = range(fatalitieshead$FATALITIES))
injuryhead <- head(injury_sorted)
barplot(injuryhead$INJURIES, main = "Top 6 causes of INJURY", xlab = "EVTYPE", ylab = "INJURIES", names.arg = injuryhead$EVTYPE, legend.text = TRUE, cex.names = 0.35, col = rainbow(5), cex.axis = 0.5, ylim = range(injuryhead$INJURIES))

Plotting our results for economic damage analysis:

headcropdmg <- head(cropdmg_byevent)
barplot(headcropdmg$CROP_DMG_MILLIONS, main = "Top 6 causes of crop damage", xlab = "EVTYPE", ylab = "Damage($millions)", names.arg = headcropdmg$EVTYPE, cex.names = 0.5, col = rainbow(5), ylim =  c(0, 13500))

damagevector <- propertydmg_byevent[1:6,2]
barplot(damagevector, main = "Top 6 causes of property damage", xlab = "EVTYPE", ylab = "Damage($millions)", names.arg = propertydmg_byevent$EVTYPE[1:6], col = rainbow(5), cex.names = 0.5, ylim = c(0, 143200))

StormDataAnalyses

Tim English

11 July 2019

Synopsis

Data processing

Effects on population health

Economic effects

Results