This report will explore the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database contains characteristics of major storms and weather events in the United States during 1950-2011. This report will find answer for two questions.
This report is part of a course project for Reproducible Research on Coursera.
This dataset can be downloaded from Storm Data
#workingDirectory <- "C:\\Users\\jack\\Desktop\\DataScience Specialization\\5 Reproducible Research\\week4"
#if(!file.exists(workingDirectory)) {
# dir.create(workingDirectory)
#}
#setwd(workingDirectory)
#url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
#download.file(url, destfile = "StormData.csv.bz2", mode = "wb")
data.raw <- read.table("StormData.csv.bz2", comment.char = "#", header = TRUE, sep = ",", na.strings = "")
Data are filtered to have only necessary columns so that it is faster when doing a process. We select columns “FATALITIES” and “INJURIES” to address “population health”.
# data for question 1
data.q1 <- data.raw[data.raw$FATALITIES > 0 | data.raw$INJURIES > 0, c("EVTYPE","FATALITIES","INJURIES")]
We found that data on column “EVTYPE” (Event Type) does not consistent. For example, for event type “Huricane”, some row is “HURRICANE ERIN” and some row is “HURRICANE EDOUARD”. So we need to map these unofficial event type to be official event type. The official event type comes from Storm Data Documentation page 6. There are 48 event types officially.
# official event type
official_ev <- c("Astronomical Low Tide", "Avalanche", "Blizzard", "Coastal Flood", "Cold/Wind Chill",
"Debris Flow", "Dense Fog", "Dense Smoke", "Drought", "Dust Devil", "Dust Storm",
"Excessive Heat", "Extreme Cold/Wind Chill", "Flash Flood", "Flood", "Freezing Fog",
"Frost/Freeze", "Funnel Cloud", "Hail", "Heat", "Heavy Rain", "Heavy Snow", "High Surf",
"High Wind", "Hurricane/Typhoon", "Ice Storm", "Lakeshore Flood", "Lake-Effect Snow",
"Lightning", "Marine Hail", "Marine High Wind", "Marine Strong Wind", "Marine Thunderstorm Wind",
"Rip Current", "Seiche", "Sleet", "Storm Tide", "Strong Wind", "Thunderstorm Wind",
"Tornado", "Tropical Depression", "Tropical Storm", "Tsunami", "Volcanic Ash", "Waterspout",
"Wildfire", "Winter Storm", "Winter Weather")
Then we map EVTYPE to be official_ev. We use function ‘amatch’ to do mapping.
library(stringdist)
## Warning: package 'stringdist' was built under R version 3.3.2
unofficial_ev_q1 <- unique(as.character(toupper(data.q1$EVTYPE)))
compare_ev_q1 <- cbind(unofficial_ev_q1, official_ev[amatch(unofficial_ev_q1, toupper(official_ev), maxDist = 2)])
compare_ev_q1 <- as.data.frame(compare_ev_q1)
names(compare_ev_q1) <- c("unoffice_ev", "official_ev")
data.q1$official_ev <- NULL
for (i in unique(data.q1$EVTYPE)) {
data.q1[data.q1$EVTYPE == i,c("official_ev")] = as.character(compare_ev_q1[compare_ev_q1$unoffice_ev == as.character(toupper(i)),2])
}
Now we have official_ev for each EVTYPE but not all of EVTYPE can be mapped. So We need to map them manually. We will aggregate FATALITIES and INJURIES based on each unmapped EVTYPE and then sorting them to find which unmapped EVTYPE should be mapped manually.
data.q1[is.na(data.q1$official_ev), c("official_ev")] = "unmapped"
data.q1.unmapped <- data.q1[data.q1$official_ev == "unmapped",]
tmp <- aggregate(data.q1.unmapped$FATALITIES ~ data.q1.unmapped$EVTYPE, FUN = sum )
tail(tmp[order(tmp[,2]),], n=10)
## data.q1.unmapped$EVTYPE data.q1.unmapped$FATALITIES
## 153 UNSEASONABLY WARM AND DRY 29
## 12 COLD 35
## 93 LANDSLIDE 38
## 64 HEAVY SURF/HIGH SURF 42
## 76 HURRICANE 61
## 37 FOG 62
## 27 EXTREME HEAT 96
## 26 EXTREME COLD 160
## 53 HEAT WAVE 172
## 145 TSTM WIND 504
tmp <- aggregate(data.q1.unmapped$INJURIES ~ data.q1.unmapped$EVTYPE, FUN = sum )
tail(tmp[order(tmp[,2]),], n=10)
## data.q1.unmapped$EVTYPE data.q1.unmapped$INJURIES
## 161 WIND 86
## 149 TSTM WIND/HAIL 95
## 88 ICE 137
## 27 EXTREME HEAT 155
## 45 GLAZE 216
## 26 EXTREME COLD 231
## 53 HEAT WAVE 309
## 160 WILD/FOREST FIRE 545
## 37 FOG 734
## 145 TSTM WIND 6957
So we will map these EVTYPE manually based on information from Storm Data Documentation (We are not sure that we map it properly).
data.q1[data.q1$EVTYPE == "TSTM WIND", c("official_ev")] = "Marine Thunderstorm Wind"
data.q1[data.q1$EVTYPE == "HEAT WAVE", c("official_ev")] = "Excessive Heat"
data.q1[data.q1$EVTYPE == "EXTREME COLD", c("official_ev")] = "Extreme Cold/Wind Chill"
data.q1[data.q1$EVTYPE == "EXTREME HEAT", c("official_ev")] = "Excessive Heat"
data.q1[data.q1$EVTYPE == "FOG", c("official_ev")] = "Dense Fog"
data.q1[data.q1$EVTYPE == "HURRICANE", c("official_ev")] = "Hurricane/Typhoon"
data.q1[data.q1$EVTYPE == "HEAVY SURF/HIGH SURF", c("official_ev")] = "High Surf"
data.q1[data.q1$EVTYPE == "COLD", c("official_ev")] = "Cold/Wind Chill"
data.q1[data.q1$EVTYPE == "UNSEASONABLY WARM AND DRY", c("official_ev")] = "Heat"
data.q1[data.q1$EVTYPE == "WILD/FOREST FIRE", c("official_ev")] = "Wildfire"
data.q1[data.q1$EVTYPE == "GLAZE", c("official_ev")] = "Freezing Fog"
data.q1[data.q1$EVTYPE == "ICE", c("official_ev")] = "Ice Storm"
data.q1[data.q1$EVTYPE == "TSTM WIND/HAIL", c("official_ev")] = "Marine Thunderstorm Wind"
data.q1[data.q1$EVTYPE == "WIND", c("official_ev")] = "Strong Wind"
Data are filtered to have only necessary columns for faster processing. We select columns “PROPDMG” (property damage) and “CROPDMG” (crop damage) to address “economic consequences”.
data.q2 <- data.raw[data.raw$PROPDMG > 0 | data.raw$CROPDMG > 0, c("EVTYPE","PROPDMG","PROPDMGEXP","CROPDMG","CROPDMGEXP")]
PROPDMGEXP is exponent value for PROPDMG and CROPDMGEXP is the same for CROPDMG. We will find TOTALDMG (value of total Damage in $) which is sum of PROPDMG and CROPDMG.
data.q2$TOTALPROPDMG <- 0
data.q2[is.na(data.q2$PROPDMGEXP),c("PROPDMGEXP")] = "?"
data.q2[data.q2$PROPDMGEXP == "B",c("TOTALPROPDMG")] = data.q2[data.q2$PROPDMGEXP == "B",c("PROPDMG")] * 1000000000
data.q2[data.q2$PROPDMGEXP == "h" | data.q2$PROPDMGEXP == "H",c("TOTALPROPDMG")] = data.q2[data.q2$PROPDMGEXP == "h" | data.q2$PROPDMGEXP == "H",c("PROPDMG")] * 100
data.q2[data.q2$PROPDMGEXP == "K",c("TOTALPROPDMG")] = data.q2[data.q2$PROPDMGEXP == "K",c("PROPDMG")] * 1000
data.q2[data.q2$PROPDMGEXP == "m" | data.q2$PROPDMGEXP == "M",c("TOTALPROPDMG")] = data.q2[data.q2$PROPDMGEXP == "m" | data.q2$PROPDMGEXP == "M",c("PROPDMG")] * 1000000
data.q2$TOTALCROPDMG <- 0
data.q2[is.na(data.q2$CROPDMGEXP),c("CROPDMGEXP")] = "?"
data.q2[data.q2$CROPDMGEXP == "B",c("TOTALCROPDMG")] = data.q2[data.q2$CROPDMGEXP == "B",c("CROPDMG")] * 1000000000
data.q2[data.q2$CROPDMGEXP == "k" | data.q2$CROPDMGEXP == "K", c("TOTALCROPDMG")] = data.q2[data.q2$CROPDMGEXP == "k" | data.q2$CROPDMGEXP == "K",c("CROPDMG")] * 1000
data.q2[data.q2$CROPDMGEXP == "m" | data.q2$CROPDMGEXP == "M", c("TOTALCROPDMG")] = data.q2[data.q2$CROPDMGEXP == "m" | data.q2$CROPDMGEXP == "M",c("CROPDMG")] * 1000000
data.q2$TOTALDMG <- data.q2$TOTALPROPDMG + data.q2$TOTALCROPDMG
Same as question 1, We need to map unofficial EVTYPE to be official_ev.
library(stringdist)
unofficial_ev_q2 <- unique(as.character(toupper(data.q2$EVTYPE)))
compare_ev_q2 <- cbind(unofficial_ev_q2, official_ev[amatch(unofficial_ev_q2, toupper(official_ev), maxDist = 2)])
compare_ev_q2 <- as.data.frame(compare_ev_q2)
names(compare_ev_q2) <- c("unoffice_ev", "official_ev")
data.q2$official_ev <- NULL
for (i in unique(data.q2$EVTYPE)) {
data.q2[data.q2$EVTYPE == i,c("official_ev")] = as.character(compare_ev_q2[compare_ev_q2$unoffice_ev == as.character(toupper(i)),2])
}
Since not all of EVTYPE can be mapped to official_ev. We will aggregate TOTALDMG based on each unmapped EVTYPE and then sorting them to find which unmapped EVTYPE should be mapped manually.
data.q2[is.na(data.q2$official_ev), c("official_ev")] = "unmapped"
data.q2.unmapped <- data.q2[data.q2$official_ev == "unmapped",]
tmp <- aggregate(data.q2.unmapped$TOTALDMG ~ data.q2.unmapped$EVTYPE, FUN = sum )
tail(tmp[order(tmp[,2]),], n=15)
## data.q2.unmapped$EVTYPE data.q2.unmapped$TOTALDMG
## 171 LANDSLIDE 344613000
## 156 HURRICANE ERIN 394110000
## 67 FREEZE 446430000
## 307 TYPHOON 601055000
## 215 SEVERE THUNDERSTORM 1205560000
## 40 EXTREME COLD 1360710400
## 284 TORNADOES, TSTM WIND, HAIL 1602500000
## 114 HEAVY RAIN/SEVERE WEATHER 2500000000
## 327 WILD/FOREST FIRE 3108626330
## 159 HURRICANE OPAL 3191846000
## 246 STORM SURGE/TIDE 4642038000
## 290 TSTM WIND 5038935790
## 209 RIVER FLOOD 10148404500
## 153 HURRICANE 14610229010
## 245 STORM SURGE 43323541000
So we will map these EVTYPE manually based on information from Storm Data Documentation (We are not sure that we map it properly).
data.q2[data.q2$EVTYPE == "TYPHOON", c("official_ev")] = "Hurricane/Typhoon"
data.q2[data.q2$EVTYPE == "SEVERE THUNDERSTORM", c("official_ev")] = "Thunderstorm Wind"
data.q2[data.q2$EVTYPE == "EXTREME COLD", c("official_ev")] = "Extreme Cold/Wind Chill"
data.q2[data.q2$EVTYPE == "TORNADOES, TSTM WIND, HAIL", c("official_ev")] = "Tornado"
data.q2[data.q2$EVTYPE == "HEAVY RAIN/SEVERE WEATHER", c("official_ev")] = "Heavy Rain"
data.q2[data.q2$EVTYPE == "WILD/FOREST FIRE", c("official_ev")] = "Wildfire"
data.q2[data.q2$EVTYPE == "HURRICANE OPAL", c("official_ev")] = "Hurricane/Typhoon"
data.q2[data.q2$EVTYPE == "STORM SURGE/TIDE", c("official_ev")] = "Storm Tide"
data.q2[data.q2$EVTYPE == "TSTM WIND", c("official_ev")] = "Marine Thunderstorm Wind"
data.q2[data.q2$EVTYPE == "RIVER FLOOD", c("official_ev")] = "Flash Flood"
data.q2[data.q2$EVTYPE == "HURRICANE", c("official_ev")] = "Hurricane/Typhoon"
data.q2[data.q2$EVTYPE == "STORM SURGE", c("official_ev")] = "Storm Tide"
We separate meaning of “population health”" into two terms “FATALITIES” and “INJURIES”.
Here are most FATALITIES events.
data.q1.plot1 <- aggregate(data.q1$FATALITIES ~ data.q1$official_ev, FUN = sum )
data.q1.plot1 <- data.q1.plot1[order(data.q1.plot1[,2]),]
names(data.q1.plot1) <- c("official_ev", "FATALITIES")
tail(data.q1.plot1, n=10)
## official_ev FATALITIES
## 10 Extreme Cold/Wind Chill 285
## 12 Flood 470
## 34 unmapped 491
## 26 Marine Thunderstorm Wind 519
## 27 Rip Current 572
## 23 Lightning 817
## 16 Heat 966
## 11 Flash Flood 980
## 9 Excessive Heat 2171
## 31 Tornado 5633
slices <- tail(data.q1.plot1,n=6)[,c("FATALITIES")]
lbls <- tail(data.q1.plot1,n=6)[,c("official_ev")]
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # add % to labels
lbls <- paste(lbls, "(", slices, ")") # add fatalities value
pie(slices,labels = lbls, col=rainbow(length(lbls)), main="Most Fatalities Events")
Here are most INJURIES events.
data.q1.plot2 <- aggregate(data.q1$INJURIES ~ data.q1$official_ev, FUN = sum )
data.q1.plot2 <- data.q1.plot2[order(data.q1.plot2[,2]),]
names(data.q1.plot2) <- c("official_ev", "INJURIES")
tail(data.q1.plot2, n=10)
## official_ev INJURIES
## 36 Wildfire 1606
## 11 Flash Flood 1777
## 16 Heat 2100
## 22 Ice Storm 2112
## 30 Thunderstorm Wind 2411
## 23 Lightning 5230
## 12 Flood 6789
## 9 Excessive Heat 6989
## 26 Marine Thunderstorm Wind 7078
## 31 Tornado 91346
slices <- tail(data.q1.plot2,n=6)[,c("INJURIES")]
lbls <- tail(data.q1.plot2,n=6)[,c("official_ev")]
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # add % to labels
lbls <- paste(lbls, "(", slices, ")") # add injuries value
pie(slices,labels = lbls, col=rainbow(length(lbls)), main="Most Injuries Events", cex = 0.8)
We use TOTALDMG (total damage in $) which is sum of PROPDMG (property damage) and CROPDMG (crop damage) to address economic consequences.
Here are greatest economic consequences events.
data.q2.plot <- aggregate(data.q2$TOTALDMG ~ data.q2$official_ev, FUN = sum )
data.q2.plot <- data.q2.plot[order(data.q2.plot[,2]),]
names(data.q2.plot) <- c("official_ev", "TOTALDMG")
tail(data.q2.plot, n=10)
## official_ev TOTALDMG
## 40 Tropical Storm 8382236550
## 45 Wildfire 8894313130
## 25 Ice Storm 8967041310
## 8 Drought 15018672000
## 18 Hail 18764271720
## 13 Flash Flood 27719811610
## 35 Storm Tide 47965579000
## 38 Tornado 58954615090
## 24 Hurricane/Typhoon 90316842810
## 14 Flood 150325728250
slices <- tail(data.q2.plot,n=8)[,c("TOTALDMG")]
lbls <- tail(data.q2.plot,n=8)[,c("official_ev")]
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # add % to labels
lbls <- paste(lbls, "($", round(slices/1000000000,digits = 2) , "billion)") # add total damage in $billion
pie(slices,labels = lbls, col=rainbow(length(lbls)), main="Greatest Economic Consequences Events", cex=0.8)