Here the influence of storm events on USA is estimated using the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. It was found that across the United States, excessive heat, tornadoes and floods affect public health most considerably. Floods and hurricanes have the greatest economic consequences.
library(knitr)
library(data.table)
if(!file.exists("storm.csv.bz")) {
URL<- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(URL, "storm.csv.bz")
bz<- bzfile("storm.csv.bz")
storm<- read.csv(bz)
unlink(bz)
dateDownloaded<- date()
}
The file was downloaded at Sat Mar 14 13:13:32 2015. In the code below, the variables BGN_DATE, EVTYPE, FATALITIES:CROPGMGEXP, and REFNUM are selected for the analysis. Other variables aren’t needed for the questions studied. Main damage entries are enough for the goal, so I exclude the remark column.
DT<- as.data.table(storm)
names(DT)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
DT<- DT[,.(BGN_DATE,EVTYPE,FATALITIES,INJURIES,PROPDMG,PROPDMGEXP,CROPDMG,CROPDMGEXP,REFNUM)]
Next is data cleaning. Fistly, variable names are converted to more convenient form.
suppressWarnings(names(DT)<- c("bg.date", "type", "fatalities", "injuries", "prop.damage", "prop.damage.exp", "crop.damage", "crop.damage.exp", "ref.num"))
Then pre-1996 rows are filtered out since they contain many poor and invalid entries.
DT<- DT[, bg.date := as.character(bg.date)]
DT<- DT[, bg.date := sapply(strsplit(bg.date, " "), "[", 1)]
DT<- DT[, bg.date := as.Date(bg.date, "%M/%d/%Y")]
DT<- DT<- DT[bg.date>as.Date("1996-01-01")]
str(DT)
## Classes 'data.table' and 'data.frame': 653530 obs. of 9 variables:
## $ bg.date : Date, format: "1996-03-06" "1996-03-11" ...
## $ type : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 972 834 856 856 856 244 359 856 856 856 ...
## $ fatalities : num 0 0 0 0 0 0 0 0 0 0 ...
## $ injuries : num 0 0 0 0 0 0 0 0 0 0 ...
## $ prop.damage : num 380 100 3 5 2 0 400 12 8 12 ...
## $ prop.damage.exp: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 1 17 17 17 17 ...
## $ crop.damage : num 38 0 0 0 0 0 0 0 0 0 ...
## $ crop.damage.exp: Factor w/ 9 levels "","?","0","2",..: 7 1 1 1 1 1 1 1 1 1 ...
## $ ref.num : num 248768 248769 248770 248771 248772 ...
## - attr(*, ".internal.selfref")=<externalptr>
There’s over 900 invalid types of events and many invalid exponents.
unique(DT$prop.damage.exp)
## [1] K M B 0
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
# Clean up this column. Only "", "B", "K", and "M" exponents are valid according to the codebook.
DT<- DT[, prop.damage.exp := as.character(prop.damage.exp)]
DT<- DT[prop.damage.exp %in% c("", "B", "K", "M")]
# `crop.damage.exp` too.
DT<- DT[, crop.damage.exp := as.character(crop.damage.exp)]
DT<- DT[crop.damage.exp %in% c("", "B", "K", "M")]
Exponent columns are needed to be transformed into computable form.
DT$prop.damage.exp[DT$prop.damage.exp==""] <- 1
DT$prop.damage.exp[DT$prop.damage.exp=="K"] <- 1000
DT$prop.damage.exp[DT$prop.damage.exp=="M"] <- 1000000
DT$prop.damage.exp[DT$prop.damage.exp=="B"] <- 1000000000
DT<- DT[, prop.damage.exp := as.numeric(prop.damage.exp)]
DT$crop.damage.exp[DT$crop.damage.exp==""] <- 1
DT$crop.damage.exp[DT$crop.damage.exp=="K"] <- 1000
DT$crop.damage.exp[DT$crop.damage.exp=="M"] <- 1000000
DT$crop.damage.exp[DT$crop.damage.exp=="B"] <- 1000000000
DT<- DT[, crop.damage.exp := as.numeric(crop.damage.exp)]
Overall damage is computed below.
DT$prop<- DT$prop.damage*DT$prop.damage.exp
DT$crop<- DT$crop.damage*DT$crop.damage.exp
DT$overall.damage<- DT$prop+DT$crop
Select the variables again
DT<- DT[,.(type, fatalities, injuries, ref.num, overall.damage)]
Filter out events without any damage
DT$any<- DT$fatalities+DT$injuries+DT$overall.damage
DT<- DT[any!=0]
DT$any<- NULL
DT$type<- tolower(as.character(DT$type))
Cleaning of the main misspellings
DT$type<- gsub("tstm", "thunderstorm", DT$type)
DT$type<- gsub("[ ][(][a-z][0-9][0-9][)]", "", DT$type)
DT$type<- gsub("[ ][(][0-9][0-9][)]|[ ][0-9][0-9]", "", DT$type)
DT$type<- gsub("[ ]$", "", DT$type)
DT$type<- gsub("small hail", "hail", DT$type)
DT$type<- gsub("winds", "wind", DT$type)
DT$type<- gsub("cold temperature", "cold", DT$type)
DT$type<- gsub("cold weather", "cold", DT$type)
DT$type<- gsub("rip currents", "rip current", DT$type)
nrow(DT)
## [1] 201318
length(unique(DT$type))
## [1] 166
Number of observations has reduced considerably, and number of event types either. A bulk of event types belongs to proper categories (run table(DT$type) to make sure).
dt_f <- DT[,. (f_sum=sum(fatalities)), by=type][order(-f_sum)][1:10]
head(dt_f, n=5)
## type f_sum
## 1: excessive heat 1797
## 2: tornado 1511
## 3: flash flood 887
## 4: lightning 651
## 5: rip current 542
dotchart(dt_f$f_sum, labels=dt_f$type, pch=19, main="Top 10 event types with highest fatality rates")
dt_i <- DT[,. (f_sum=sum(injuries)), by=type][order(-f_sum)][1:10]
head(dt_i, n=5)
## type f_sum
## 1: tornado 20667
## 2: flood 6758
## 3: excessive heat 6391
## 4: thunderstorm wind 5033
## 5: lightning 4141
dotchart(dt_i$f_sum, labels=dt_i$type, pch=19, main="Top 10 event types with highest injury rates")
dt_d <- DT[,. (f_sum=sum(overall.damage)), by=type][order(-f_sum)][1:10]
head(dt_d, n=5)
## type f_sum
## 1: flood 148919611950
## 2: hurricane/typhoon 71913712800
## 3: storm surge 43193541000
## 4: tornado 24900370720
## 5: hail 17092035870
dotchart(dt_d$f_sum, labels=dt_d$type, pch=19, main="Top 10 event types with highest property and crop damage rates")