This document explores the National Oceanic and Atmospheric Administration (NOAA) Storm Database to adress the severity of weather events between 1950 and November 2011. We begin by defining the questions, give short answers then focus on the the fully reproducible research below.
Answer: Tornados
Answer: Flood and Drought
We prefilter the data by making use of a mapping table for the exponential fields. We present the data separately and for the final results make use of a weighting logic, since fatalities are orders of magnitude more harmful than injuries, similar property vs crop damage.
The Data used for the analysis is downloaded directly from the course website Storm Data [47Mb]
# fetch data from URL directly into a data.frame
getCSVBz2DataFromUrl <- function(url){
dest <- tempfile(fileext = ".csv.bz2")
setInternet2(use = TRUE) # support for HTTPS
download.file(url, dest, mode = "wb")
read.csv(dest)
}
# define URL
url = 'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'
# retrieve storms data
storms = getCSVBz2DataFromUrl(url)
# some benchmarks
dim(storms)
## [1] 902297 37
colnames(storms)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
print(object.size(storms), units="Mb")
## 409.4 Mb
We prefilter the dataset and extract only the columns that contain the data that we need.
Variable Description
STATE the State this event took place
EVTYPE the type of weather event (full )
BGN_DATE Event beginn (trends / timeseries)
FATALITIES number of fatalities
INJURIES number of injuried people
PROPDMG Economic property loss
PROPDMGEXP Economic property loss multiplier value
CROPDMG Economic crop loss
CROPDMGEXP Economic crop loss multiplier
Next we use a powers of 10 mapping table for the exponential fields, merge it twice then normalize the fields.
opts = c('STATE', 'EVTYPE', 'BGN_DATE', 'FATALITIES', 'INJURIES', 'PROPDMG', 'PROPDMGEXP', 'CROPDMG', 'CROPDMGEXP')
ds = storms[, opts]
# converting the date
ds$BGN_DATE = strptime(as.character(ds$BGN_DATE),"%m/%e/%Y %H:%M:%S" )
# retain the year
ds$BGN_YEAR = format(ds$BGN_DATE, '%Y')
ds$BGN_DATE = NULL
#since the ninenties there is a constant increase in the number of recorded weather events
table(ds$BGN_YEAR)
##
## 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961
## 223 269 272 492 609 1413 1703 2184 2213 1813 1945 2246
## 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973
## 2389 1968 2348 2855 2388 2688 3312 2926 3215 3471 2168 4463
## 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985
## 5386 4975 3768 3728 3657 4279 6146 4517 7132 8322 7335 7979
## 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997
## 8726 7367 7257 10410 10946 12522 13534 12607 20631 27970 32270 28680
## 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
## 38128 31289 34471 34962 36293 39752 39363 39184 44034 43289 55663 45817
## 2010 2011
## 48161 62174
# mapping the exponents
mappingPowersOf10 = data.frame(
Exp = c("", "-", "?", "+", "0", "1", "2", "3", "4", "5", "6", "7", "8", "B", "h", "H", "K", "m", "M"),
Val = c(0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 2, 2, 3, 6, 6)
)
# merge the exponential markings from the source with the proper values
ds = merge(ds, mappingPowersOf10, by.x = "PROPDMGEXP", by.y = "Exp")
# merge the two fields into a normalised one
ds$PropertyDamage = ds$PROPDMG* (10^ds$Val)
ds$PROPDMG = NULL
ds$PROPDMGEXP = NULL
ds$Val = NULL
# merge the exponential markings from the source with the proper values
ds = merge(ds, mappingPowersOf10, by.x = "CROPDMGEXP", by.y = "Exp")
# merge the two fields into a normalised one
ds$CropDamage = ds$CROPDMG* (10^ds$Val)
ds$CROPDMG = NULL
ds$CROPDMGEXP = NULL
ds$Val = NULL
# defining the weight
weightFATALITIES = 10
# applying the weight
ds$COMBINED = weightFATALITIES*ds$FATALITIES + ds$INJURIES
# dropping two thirds of the data
ds = subset(ds, COMBINED>0 | PropertyDamage>0 | CropDamage > 0)
# aggregating by Event Type
aggHUMAN = merge (
aggregate(FATALITIES ~ EVTYPE, ds, sum),
aggregate(INJURIES ~ EVTYPE, ds, sum)
)
# applying the weight
aggHUMAN$COMBINED = weightFATALITIES*aggHUMAN$FATALITIES + aggHUMAN$INJURIES
# selecting just top 10, combining all remaining entities into another factor
toDrop = subset(aggHUMAN, COMBINED<3617)
aggHUMAN = head(aggHUMAN[order(- aggHUMAN$COMBINED),], 9)
aggHUMAN = rbind(aggHUMAN,
data.frame(EVTYPE='OTHERS', FATALITIES = sum(toDrop$FATALITIES), INJURIES = sum(toDrop$INJURIES), COMBINED = sum(toDrop$INJURIES)))
aggPROPERTY = aggregate(PropertyDamage ~ EVTYPE, ds, sum)
aggCROP = aggregate(CropDamage ~ EVTYPE, ds, sum)
aggPROPERTY = head(aggPROPERTY[order(- aggPROPERTY$PropertyDamage),], 10)
aggCROP = head(aggCROP[order(- aggCROP$CropDamage),], 10)
# checking dimensions
sapply(c(aggCROP, aggPROPERTY), length)
## EVTYPE CropDamage EVTYPE PropertyDamage
## 10 10 10 10
# plot the weather events most harmful to the human population
ggplot(data=aggHUMAN, aes(reorder(EVTYPE, -COMBINED), COMBINED, fill=EVTYPE, color=EVTYPE)) +
geom_bar(stat = "identity") +
geom_text(aes(label=EVTYPE), vjust=0.8, hjust=0, color="black", angle=10) +
ggtitle("Most harmful weather events") +
labs(x="",y="Combined damage to human population") +
theme(text = element_text(size = 8), axis.line=element_blank(),axis.text.x=element_blank(),
plot.title = element_text(color="#666666", face="bold", size=20) )
# plot the weather events with greatest impact to Propery
ggplot(data=aggPROPERTY, aes(reorder(EVTYPE, -PropertyDamage), PropertyDamage, fill=EVTYPE, color=EVTYPE)) +
geom_bar(stat = "identity") +
geom_text(aes(label=EVTYPE), vjust=0.8, hjust=0, color="black", angle=10) +
ggtitle("Events with greatest impact to Property") +
labs(x="",y="Normalized Damage to Property") +
theme(text = element_text(size = 8), axis.line=element_blank(),axis.text.x=element_blank(),
plot.title = element_text(color="#666666", face="bold", size=20) )
# plot the weather events with greatest impact to Crops
ggplot(data=aggCROP, aes(reorder(EVTYPE, -CropDamage), CropDamage, fill=EVTYPE, color=EVTYPE)) +
geom_bar(stat = "identity") +
geom_text(aes(label=EVTYPE), vjust=0.8, hjust=0, color="black", angle=10) +
ggtitle("Events with greatest impact to Crops") +
labs(x="",y="Normalized Damage to Crops") +
theme(text = element_text(size = 8), axis.line=element_blank(),axis.text.x=element_blank(),
plot.title = element_text(color="#666666", face="bold", size=20) )