by Chan Chee-Foong on 20 Apr 2016
This analysis involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
The data used in this analysis can be downloaded from here.
This analysis will address the following questions:
Q1) Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
Q2) Across the United States, which types of events have the greatest economic consequences?
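The analysis relies on a handful of add-on packages. The setup chunk is not reproduced here, so the list below is inferred from the functions called throughout the report:
# Packages inferred from the function calls used below (assumed setup chunk)
library(R.utils)     # bunzip2()
library(dplyr)       # select(), filter(), group_by(), summarise(), joins
library(tidyr)       # complete()
library(reshape2)    # melt()
library(lubridate)   # mdy(), year(), month()
library(ggplot2)     # bar charts and heatmap
library(grid)        # unit() used in the theme settings
library(DT)          # datatable()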
Download the compressed data file if it is not already found in the data directory, and decompress it for analysis.
datadir <- "./stormdata"
datafile <- "repdata_data_StormData.csv"
zipfile <- "repdata_data_StormData.csv.bz2"
datadirfile <- paste(datadir, datafile, sep="/")
zipdirfile <- paste(datadir, zipfile, sep="/")
if (!file.exists(datadirfile)) {
    if (!file.exists(datadir)) {dir.create(datadir)}
    url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
    download.file(url, destfile = zipdirfile, mode = 'wb')
    # bunzip2() from the R.utils package writes the decompressed csv next to the
    # .bz2 file, i.e. to datadirfile
    bunzip2(filename = zipdirfile, remove = FALSE, skip = TRUE)
}
Load the csv file into the myStormData data frame.
myStormData <- read.csv(datadirfile, header = TRUE, na.strings = 'NA')
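read.csv is slow on a file of this size (just over 900,000 rows). As an optional alternative of my own, not used in the rest of this report, data.table::fread reads the decompressed csv considerably faster, though it returns character columns rather than factors, so it is not a drop-in replacement:
# Optional faster read (assumed alternative; kept commented out so read.csv remains the source of myStormData)
# library(data.table)
# myStormData <- as.data.frame(fread(datadirfile, na.strings = 'NA'))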
dim(myStormData)
## [1] 902297 37
str(myStormData)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ BGN_TIME : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_LOCATI: Factor w/ 54429 levels "","- 1 N Albion",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_DATE : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_TIME : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_LOCATI: Factor w/ 34506 levels "","- .5 NNW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WFO : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ZONENAMES : Factor w/ 25112 levels ""," "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : Factor w/ 436774 levels "","-2 at Deer Park\n",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
The following fields will be used for the purpose of this analysis: EVTYPE, BGN_DATE, END_DATE, COUNTYNAME, STATE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG and CROPDMGEXP.
Observe that the base-10 exponent indicators (PROPDMGEXP and CROPDMGEXP) will need some cleaning up. In the summaries below, the large unlabelled first count corresponds to the empty-string level.
summary(myStormData$PROPDMGEXP)
##             -      ?      +      0      1      2      3      4      5
## 465934      1      8      5    216     25     13      4      4     28
##      6      7      8      B      h      H      K      m      M
##      4      5      1     40      1      6 424665      7  11330
summary(myStormData$CROPDMGEXP)
##             ?      0      2      B      k      K      m      M
## 618413      7     19      1      9     21 281832      1   1994
Subsetting the whole database to only the required fields, then preparing the data for analysis by creating new fields and setting all fields to the correct classes for ease of use.
myAnalysisData <- select(myStormData, EVTYPE, BGN_DATE, END_DATE, COUNTYNAME, STATE,
                         FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
Processing the base-10 exponent indicators: 'H'/'h' map to 10^2, 'K'/'k' to 10^3, 'M'/'m' to 10^6, 'B'/'b' to 10^9, the digits 0-9 map to the corresponding power of 10, and any other value (blank, '?', '+', '-') is treated as 10^0.
myAnalysisData$PROPDMGEXP <- as.character(myAnalysisData$PROPDMGEXP)
myAnalysisData$CROPDMGEXP <- as.character(myAnalysisData$CROPDMGEXP)
EXPCHAR <- data.frame(abbr = as.character(c('H','h','K','k','M','m','B','b')),
                      value = c(2,2,3,3,6,6,9,9), stringsAsFactors = FALSE)
EXPNUM <- data.frame(abbr = as.character(c(0:9)), value = c(0:9), stringsAsFactors = FALSE)
EXP <- rbind(EXPCHAR, EXPNUM)
myAnalysisData <- left_join(x=myAnalysisData, y=EXP, by = c('PROPDMGEXP' = 'abbr'))
colnames(myAnalysisData)[which(names(myAnalysisData) == 'value')] <- 'PROPDMGEXPCLEAN'
myAnalysisData <- left_join(x=myAnalysisData, y=EXP, by = c('CROPDMGEXP' = 'abbr'))
colnames(myAnalysisData)[which(names(myAnalysisData) == 'value')] <- 'CROPDMGEXPCLEAN'
myAnalysisData[is.na(myAnalysisData$PROPDMGEXPCLEAN),'PROPDMGEXPCLEAN'] <- 0
myAnalysisData[is.na(myAnalysisData$CROPDMGEXPCLEAN),'CROPDMGEXPCLEAN'] <- 0
myAnalysisData$PROPDMGEXPCLEAN <- 10^as.numeric(myAnalysisData$PROPDMGEXPCLEAN)
myAnalysisData$CROPDMGEXPCLEAN <- 10^as.numeric(myAnalysisData$CROPDMGEXPCLEAN)
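As a quick sanity check of the conversion, an amount of 2.5 recorded with exponent 'K' should resolve to 2,500, 1.5 with 'B' to 1,500,000,000, and a blank exponent should leave the amount unchanged. The helper and values below are purely illustrative and not part of the original analysis:
# Illustrative check of the exponent mapping (hypothetical helper, not in the original)
checkDmg <- function(dmg, exp) {
    e <- EXP$value[match(exp, EXP$abbr)]
    e[is.na(e)] <- 0                     # blank, '?', '+', '-' are treated as 10^0
    dmg * 10^e
}
checkDmg(c(2.5, 1.5, 25), c('K', 'B', ''))   # returns 2500, 1.5e+09, 25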
Calculate the property, crop and total damage values, and the total number of lives affected (fatalities + injuries).
myAnalysisData$PROPDMGVALUE <- (myAnalysisData$PROPDMG * myAnalysisData$PROPDMGEXPCLEAN)
myAnalysisData$CROPDMGVALUE <- (myAnalysisData$CROPDMG * myAnalysisData$CROPDMGEXPCLEAN)
myAnalysisData$TOTALDMGVALUE <- myAnalysisData$PROPDMGVALUE + myAnalysisData$CROPDMGVALUE
myAnalysisData$TOTALLIFEAFFECTED <- myAnalysisData$FATALITIES + myAnalysisData$INJURIES
Determine the Month and Year for each event recorded
# BGN_DATE has the form 'm/d/yyyy h:mm:ss'; keep only the date part before parsing with mdy()
myAnalysisData$DATE <- mdy(sapply(strsplit(as.character(myAnalysisData$BGN_DATE), ' '), function(x) {x[1]}))
myAnalysisData$YEAR <- as.factor(year(myAnalysisData$DATE))
myAnalysisData$MONTH <- as.factor(lubridate::month(myAnalysisData$DATE, label = TRUE, abbr = TRUE))
For this analysis, I chose to analyse data from 2005 up to 2011, the latest year available, which gives roughly the last seven years of data. Data from the earlier years were found to be less clean; for example, 'EVTYPE' contained invalid and inconsistent entries. Entries in the earlier years could also be biased, giving more emphasis to the events that happened to be reported or observed; only 'TORNADO', 'TSTM WIND' and 'HAIL' are recorded before 1990.
Choosing the most recent data is also more reflective of current environmental changes and of the technological advances that allow data to be captured more consistently. Events like 'Excessive Heat' and 'Heat' are more pronounced in the recent years.
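This can be checked directly. The sketch below is a diagnostic of my own rather than part of the original report; it counts the distinct event types recorded each year, and the early decades show far fewer distinct types than the recent years:
# Distinct EVTYPE values recorded per year (diagnostic sketch, not in the original)
myAnalysisData %>%
    group_by(YEAR) %>%
    summarise(nEventTypes = n_distinct(EVTYPE), nRecords = n())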
However, the analysis can be repeated by setting the start and end year in the code below.
## Set Start Year and End Year for Analysis
startYear <- 2005
endYear <- 2011
Grouping Data by Event Type
myDatabyGrp <- myAnalysisData %>%
    filter(as.numeric(as.character(YEAR)) >= startYear & as.numeric(as.character(YEAR)) <= endYear) %>%
    group_by(EVTYPE)
Calculating the fatalities and injuries by Event Type
mostLifeAffected <- myDatabyGrp %>% summarise(NumOfLifeAffected = sum(TOTALLIFEAFFECTED)) %>%
    arrange(desc(NumOfLifeAffected))
mostFatalities <- myDatabyGrp %>% summarise(NumOfFatalities = sum(FATALITIES)) %>%
    arrange(desc(NumOfFatalities))
mostInjuries <- myDatabyGrp %>% summarise(NumOfInjuries = sum(INJURIES)) %>%
    arrange(desc(NumOfInjuries))
mostLifeAffected <- full_join(mostLifeAffected, mostFatalities, by = 'EVTYPE')
mostLifeAffected <- full_join(mostLifeAffected, mostInjuries, by = 'EVTYPE')
datatable(mostLifeAffected,
          options = list(pageLength = 5),
          colnames = c('EVENTS', 'TOTAL', 'FATALITIES', 'INJURIES'),
          caption = 'Table 1: Top Harmful Events resulting in Fatalities and Injuries.')
Calculating total damage by value by Event Type
mostDMG <- myDatabyGrp %>% summarise(TotalDmgValue = sum(TOTALDMGVALUE)) %>%
    arrange(desc(TotalDmgValue))
mostPROPDMG <- myDatabyGrp %>% summarise(PropDmgValue = sum(PROPDMGVALUE)) %>%
    arrange(desc(PropDmgValue))
mostCROPDMG <- myDatabyGrp %>% summarise(CropDmgValue = sum(CROPDMGVALUE)) %>%
    arrange(desc(CropDmgValue))
mostDMG <- full_join(mostDMG, mostPROPDMG, by = 'EVTYPE')
mostDMG <- full_join(mostDMG, mostCROPDMG, by = 'EVTYPE')
datatable(mostDMG,
          options = list(pageLength = 5),
          colnames = c('EVENTS', 'TOTAL DAMAGE', 'PROPERTY DAMAGE', 'CROP DAMAGE'),
          caption = 'Table 2: Top Harmful Events resulting in Damage by Value.')
Set the number of top most harmful events for analysis
topNumEVENT <- 5
Q1) Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
A1) Tornado is the most harmful event, causing the most deaths and injuries in recent years. It is followed by excessive heat and lightning.
dataPlot1 <- arrange(mostLifeAffected, desc(NumOfLifeAffected))[c(1:topNumEVENT),]
dataPlot1 <- dataPlot1[,c(1,3,4)]
dataPlot1 <- melt(dataPlot1, id=c("EVTYPE"), measure.vars = c("NumOfFatalities", "NumOfInjuries"))
colnames(dataPlot1) <- c('EVENT.TYPE','TYPE','COUNT')
dataPlot1$TYPE <- as.character(dataPlot1$TYPE)
dataPlot1[dataPlot1$TYPE == 'NumOfFatalities',]$TYPE <- 'FATALITIES'
dataPlot1[dataPlot1$TYPE == 'NumOfInjuries',]$TYPE <- 'INJURIES'
g1 <- ggplot(data=dataPlot1, aes(x=reorder(EVENT.TYPE,COUNT), y=COUNT, fill=TYPE)) +
geom_bar(stat="identity") +
ylab('Number of Lives Affected (Fatalities + Injuries)') +
xlab('') +
labs(title = 'Top events resulting in the most fatalities and injuries') +
theme(legend.title = element_blank()) +
theme(legend.position = 'top') +
scale_fill_brewer(palette="Paired") +
coord_flip()
g1
Q2) Across the United States, which types of events have the greatest economic consequences?
A2) Flood causes the most damage by value, with a total of more than $100 billion in recent years. It is followed by hurricane/typhoon and storm surge.
dataPlot2 <- arrange(mostDMG, desc(TotalDmgValue))[c(1:topNumEVENT),]
dataPlot2 <- dataPlot2[,c(1,3,4)]
dataPlot2 <- melt(dataPlot2, id=c("EVTYPE"), measure.vars = c("PropDmgValue", "CropDmgValue"))
colnames(dataPlot2) <- c('EVENT.TYPE','TYPE','VALUE')
dataPlot2$TYPE <- as.character(dataPlot2$TYPE)
dataPlot2[dataPlot2$TYPE == 'PropDmgValue',]$TYPE <- 'PROPERTY DAMAGE'
dataPlot2[dataPlot2$TYPE == 'CropDmgValue',]$TYPE <- 'CROP DAMAGE'
g2 <- ggplot(data=dataPlot2, aes(x=reorder(EVENT.TYPE,VALUE), y=VALUE/10^9, fill=TYPE)) +
geom_bar(stat="identity") +
ylab('Damage Value (Billions)') +
xlab('') +
labs(title = 'Top events resulting in the most damage by value') +
theme(legend.title = element_blank()) +
theme(legend.position = 'top') +
scale_fill_brewer(palette="Paired") +
coord_flip()
g2
Since tornado is one of the most harmful events, this analysis also attempts to study the frequency of its occurrence by month in the top 20 states most prone to tornadoes.
topEVENT <- c('TORNADO')
numOfState <- 20
Grouping the data by event type, month and state, and counting the number of occurrences. The complete() call below fills in a zero count for any month/state combination with no recorded events, so that the heatmap has no missing tiles.
myDatabyGrp2 <- myAnalysisData %>% filter(EVTYPE %in% topEVENT) %>% group_by(EVTYPE, MONTH, STATE)
dataPlot3 <- myDatabyGrp2 %>% summarise(FREQUENCY = n()) %>% arrange(desc(FREQUENCY))
dataPlot4 <- dataPlot3 %>% ungroup()
dataPlot4$EVTYPE <- as.factor(as.character(dataPlot4$EVTYPE))
dataPlot4$STATE <- as.factor(as.character(dataPlot4$STATE))
dataPlot4 <- complete(dataPlot4, EVTYPE, MONTH, STATE, fill = list(FREQUENCY=0))
Determine the top risky states, i.e. those with the most frequent occurrences of the top harmful event.
myDatabyGrp3 <- myAnalysisData %>% filter(EVTYPE %in% topEVENT) %>% group_by(STATE)
riskyState <- myDatabyGrp3 %>% summarise(FREQUENCY = n()) %>% arrange(desc(FREQUENCY))
riskyState <- as.character(riskyState$STATE[1:numOfState])
Plotting the results in a heatmap, it is observed that tornadoes occur most frequently in Texas, Kansas and Oklahoma, in the months of April to June. For most states, tornadoes seem to occur most often in the late spring and summer months.
gg1 <- ggplot(filter(dataPlot4, STATE %in% riskyState),
              aes(x=reorder(STATE, desc(FREQUENCY)), y=MONTH, fill=FREQUENCY))
gg1 <- gg1 + geom_tile(color="white", size=0.1)
gg1 <- gg1 + scale_x_discrete(expand=c(0,0))
gg1 <- gg1 + scale_y_discrete(expand=c(0,0))
gg1 <- gg1 + scale_fill_gradient(low = "lightyellow", high = "red")
gg1 <- gg1 + coord_equal()
gg1 <- gg1 + labs(x='', y='', title='Number of Tornado Occurance by Month in Tornado-prone States')
gg1 <- gg1 + theme(axis.ticks=element_blank())
gg1 <- gg1 + theme(axis.text=element_text(size=8))
gg1 <- gg1 + theme(panel.border=element_blank())
gg1 <- gg1 + theme(plot.title=element_text(hjust=0.5, size=12))
gg1 <- gg1 + theme(panel.margin.x=unit(0.5, "cm"))
gg1 <- gg1 + theme(panel.margin.y=unit(0.5, "cm"))
gg1 <- gg1 + theme(legend.title=element_text(size=8))
gg1 <- gg1 + theme(legend.title.align=1)
gg1 <- gg1 + theme(legend.text=element_text(size=8))
gg1 <- gg1 + theme(legend.position="bottom")
gg1 <- gg1 + theme(legend.key.size=unit(0.2, "cm"))
gg1 <- gg1 + theme(legend.key.width=unit(1, "cm"))
gg1