Danilo Verdugo C
24-6-2016
Severe natural disasters have important and dangerous effects on people, property and public health. As an attempt to preparation and study of future disasters NOAA agency has prepared a historical database with severe weather events.
In the present study we will study such a database and answer two key questions:
Detailed description of the NOAA database can be found in code blook
Load libraries, set some parameter.
library(dplyr)
library(stringr)
library(maps)
library(choroplethr)
library(choroplethrMaps)
library(ggplot2)
library(gridExtra)
data("county.fips")
We load data from website, then unzip in local directory.
setwd("C:/Users/danilo.verdugo/Dropbox/personal/COURSERA DATA SCIENTIST/curso 5/semana 4/trabajo")
if(!file.exists("StormData.csv.bz2")){
download.file("http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", destfile = "StormData.csv.bz2")
}
stormdata <- read.csv(bzfile("StormData.csv.bz2"), sep = ",", header = TRUE, stringsAsFactors = FALSE)
str(stormdata)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
Select some columns that are relevant and discard the rest.
storm <- select(stormdata, COUNTYNAME, STATE, EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
#NOTE: I use counti.fips data to build column "state,county" for mapping.!!
storm$STATE <- state.name[match(storm$STATE,state.abb)]
storm <- mutate(storm,FIPSNAME=str_c(tolower(STATE),",",tolower(COUNTYNAME)))
#remove not used col.
storm[,"STATE"] <- NULL
storm[,"COUNTYNAME"] <- NULL
#remove from memory original data for preserve some memory.
rm(stormdata)
In the original database, some characters in PROPDMGEXP and CROPDMGEXP must be removed. Then convert alphanumeric exponents to numeric for the convenience of data aggregation.
storm$PROPDMGEXP[storm$PROPDMGEXP==""|storm$PROPDMGEXP=="-"|storm$PROPDMGEXP=="+"|storm$PROPDMGEXP=="?"] = 0
storm[(storm$PROPDMGEXP == "H" | storm$PROPDMGEXP == "h"), ]$PROPDMGEXP <- 2
storm[(storm$PROPDMGEXP == "K" | storm$PROPDMGEXP == "k"), ]$PROPDMGEXP <- 3
storm[(storm$PROPDMGEXP == "M" | storm$PROPDMGEXP == "m"), ]$PROPDMGEXP <- 6
storm[(storm$PROPDMGEXP == "B" | storm$PROPDMGEXP == "b"), ]$PROPDMGEXP <- 9
storm$CROPDMGEXP[storm$CROPDMGEXP==""|storm$CROPDMGEXP=="?"] = 0
storm[(storm$CROPDMGEXP == "K" | storm$CROPDMGEXP == "k"), ]$CROPDMGEXP <- 3
storm[(storm$CROPDMGEXP == "M" | storm$CROPDMGEXP == "m"), ]$CROPDMGEXP <- 6
storm[(storm$CROPDMGEXP == "B" | storm$CROPDMGEXP == "b"), ]$CROPDMGEXP <- 9
#set to numeric for next calculation
storm <- transform(storm,EVTYPE = as.character(EVTYPE), FATALITIES = as.numeric(FATALITIES), INJURIES = as.numeric(INJURIES), PROPDMG = as.numeric(PROPDMG), PROPDMGEXP = as.numeric(PROPDMGEXP), CROPDMG = as.numeric(CROPDMG), CROPDMGEXP = as.numeric(CROPDMGEXP))
#make a new column with total damage
storm <- mutate(storm, DMGTOT = PROPDMG * (10 ^ PROPDMGEXP) + CROPDMG * (10 ^ CROPDMGEXP))
#check exponent
table(storm$PROPDMGEXP)
##
## 0 1 2 3 4 5 6 7 8 9
## 466164 25 20 424669 4 28 11341 5 1 40
table(storm$CROPDMGEXP)
##
## 0 2 3 6 9
## 618439 1 281853 1995 9
#remove col.
storm[,"PROPDMG"] <- NULL
storm[,"CROPDMG"] <- NULL
storm[,"PROPDMGEXP"] <- NULL
storm[,"CROPDMGEXP"] <- NULL
#building index to mapping by county (we need FIPS county code)
storm <- merge(storm, county.fips, by.x = "FIPSNAME", by.y = "polyname")
storm[,"FIPSNAME"] <- NULL
Now we calculate the total of deaths and injuries by event type and exploratory graphs to answer question number one.
fatalities <- aggregate(FATALITIES ~ EVTYPE , data = storm, FUN = sum)
fatalities <- fatalities[order(fatalities$FATALITIES, decreasing = TRUE), ]
MaxFatalities <- fatalities[1:10, ]
MaxFatalities
## EVTYPE FATALITIES
## 347 TORNADO 5439
## 56 FLASH FLOOD 908
## 187 LIGHTNING 774
## 361 TSTM WIND 482
## 70 FLOOD 298
## 231 RIP CURRENTS 165
## 277 THUNDERSTORM WIND 128
## 230 RIP CURRENT 105
## 135 HEAT 80
## 137 HEAT WAVE 65
injuries <- aggregate(INJURIES ~ EVTYPE, data = storm, FUN = sum)
injuries <- injuries[order(injuries$INJURIES, decreasing = TRUE), ]
MaxInjuries <- injuries[1:10, ]
MaxInjuries
## EVTYPE INJURIES
## 347 TORNADO 87904
## 361 TSTM WIND 6746
## 187 LIGHTNING 5032
## 70 FLOOD 4716
## 56 FLASH FLOOD 1754
## 277 THUNDERSTORM WIND 1436
## 108 HAIL 1356
## 303 THUNDERSTORM WINDS 861
## 413 WILD/FOREST FIRE 530
## 231 RIP CURRENTS 249
par(mfrow = c(1, 2), mar = c(15, 4, 3, 2), mgp = c(3, 1, 0), cex = 0.8)
barplot(MaxFatalities$FATALITIES, las = 3, names.arg = MaxFatalities$EVTYPE, main = "Weather Events With\n The Top 10 Highest Fatalities", ylab = "Number of Fatalities", col = "grey")
barplot(MaxInjuries$INJURIES, las = 3, names.arg = MaxInjuries$EVTYPE, main = "Weather Events With\n The Top 10 Highest Injuries", ylab = "Number of Injuries", col = "grey")
Now we calculate the total of damage by event type and exploratory graphs to answer question number dos.
par(mfrow = c(1, 1))
propdmg <- aggregate(DMGTOT ~ EVTYPE, data = storm, FUN = sum)
propdmg <- propdmg[order(propdmg$DMGTOT, decreasing = TRUE), ]
propdmgMax <- propdmg[1:10, ]
propdmgMax <- mutate(propdmgMax,DMGTOT=DMGTOT/1e9)
propdmgMax
## EVTYPE DMGTOT
## 70 FLOOD 135.460561
## 347 TORNADO 56.028802
## 56 FLASH FLOOD 17.677727
## 108 HAIL 17.519106
## 361 TSTM WIND 4.975960
## 277 THUNDERSTORM WIND 3.807605
## 413 WILD/FOREST FIRE 3.084086
## 146 HEAVY RAIN/SEVERE WEATHER 2.500000
## 414 WILDFIRE 2.357301
## 303 THUNDERSTORM WINDS 1.849197
barplot(propdmgMax$DMGTOT, las = 3, names.arg = propdmgMax$EVTYPE, main = "Weather Events With\n The Top 10 Highest Economical Damage", ylab = "Damage (U$ Billion)", col = "grey")
Questions 1.
From the above tables and figure, we obtain Top 10 event types that led to most severe damages of population health, counts of fatalities and injuries. Among them, Tornado is the major one.
Questions 2.
From the above tables and figure, we obtain Top 10 event types are most expensive. Among them, Flood is the major one.
datamap <- select(storm,fips,EVTYPE,FATALITIES,DMGTOT)
tornadofatal <- filter(datamap,EVTYPE=="TORNADO")
trnfatal <- aggregate(FATALITIES ~ fips, data = tornadofatal, FUN = sum)
names(trnfatal)[names(trnfatal)=="fips"] <- "region"
names(trnfatal)[names(trnfatal)=="FATALITIES"] <- "value"
choro = CountyChoropleth$new(trnfatal)
choro$title = "Fatalities by Tornado"
choro$ggplot_scale = scale_fill_brewer(name="N° Fatalities", palette=3, drop=FALSE)
suppressWarnings(g1 <- choro$render())
flooddmg <- filter(datamap,EVTYPE=="FLOOD")
flooddmg <- mutate(datamap,DMGTOT= DMGTOT/1e9)
floodfatal <- aggregate(DMGTOT ~ fips, data = flooddmg, FUN = sum)
names(floodfatal)[names(floodfatal)=="fips"] <- "region"
names(floodfatal)[names(floodfatal)=="DMGTOT"] <- "value"
choro = CountyChoropleth$new(floodfatal)
choro$title = "National Damage by Flood"
choro$ggplot_scale = scale_fill_brewer(name="U$ Billions", palette=2, drop=FALSE)
suppressWarnings(g2 <- choro$render())
grid.arrange(g1, g2, ncol=1, nrow=2)