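The analysis aims to answer two questions: which types of weather events (as indicated in the EVTYPE variable) are most harmful with respect to population health, and which types of events have the greatest economic consequences?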
# NOTE(review): this only assigns a variable named `echo`; confirm whether a
# knitr chunk option was intended instead.
echo <- TRUE # Always make code visible
options(scipen = 1) # Bias number printing toward fixed (non-scientific) notation
library(utils)
library(ggplot2)
library(plyr)
# gridExtra is treated as an optional dependency: no call in the visible
# analysis uses it. Attach it only when it is installed, instead of relying on
# require(), which warns and returns FALSE when the package is missing.
if (requireNamespace("gridExtra", quietly = TRUE)) {
  library(gridExtra)
}
library(reshape2)
Download the storm data archive (.bz2) and extract it into the data folder, then read the CSV file using a path relative to the working directory.
# Load the extracted storm events CSV; text columns stay as character vectors
# rather than being converted to factors.
rawdata <- read.csv(
  file = "data/repdata-data-StormData.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Report the number of rows and columns that were read.
dim(rawdata)
## [1] 902297 37
There are 902297 rows and 37 columns in total.
# Derive the calendar year of each event from BGN_DATE.
# The guard tests for the `year` column itself rather than for an exact column
# count of 37: re-running the chunk still skips the work, and a table with a
# different number of columns no longer silently ends up without `year`
# (which would make the histogram below fail).
if (!"year" %in% names(rawdata)) {
  rawdata$year <- as.numeric(
    format(as.Date(rawdata$BGN_DATE, format = "%m/%d/%Y %H:%M:%S"), "%Y")
  )
}
# Histogram of the number of recorded events per year.
hist(rawdata$year,
     breaks = 30,
     col = "purple")
Convert the column names, the event types, and the damage exponent codes to lower case.
# Lower-case the column names and the text fields that are matched below.
names(rawdata) <- tolower(names(rawdata))
rawdata$evtype <- tolower(rawdata$evtype)
rawdata$cropdmgexp <- tolower(rawdata$cropdmgexp)
rawdata$propdmgexp <- tolower(rawdata$propdmgexp)

# Multiplier for each recognised exponent code; every other code
# (including the empty string) keeps a multiplier of 1.
dmg_multiplier <- c(h = 1e2, k = 1e3, m = 1e6, b = 1e9)

# Look up the crop multiplier; codes that do not match come back as NA
# and are then reset to 1.
rawdata$cropdmgmag <- unname(
  dmg_multiplier[match(rawdata$cropdmgexp, names(dmg_multiplier))]
)
rawdata$cropdmgmag[is.na(rawdata$cropdmgmag)] <- 1

# Same lookup for the property multiplier.
rawdata$propdmgmag <- unname(
  dmg_multiplier[match(rawdata$propdmgexp, names(dmg_multiplier))]
)
rawdata$propdmgmag[is.na(rawdata$propdmgmag)] <- 1

# Damage value = reported amount scaled by its multiplier.
rawdata$cropdmgval <- rawdata$cropdmgmag * rawdata$cropdmg
rawdata$propdmgval <- rawdata$propdmgmag * rawdata$propdmg
The first 25 unique event types in the dataset.
# Show the first 25 distinct event type labels.
unique(rawdata$evtype)[seq_len(25)]
## [1] "tornado" "tstm wind"
## [3] "hail" "freezing rain"
## [5] "snow" "ice storm/flash flood"
## [7] "snow/ice" "winter storm"
## [9] "hurricane opal/high winds" "thunderstorm winds"
## [11] "record cold" "hurricane erin"
## [13] "hurricane opal" "heavy rain"
## [15] "lightning" "thunderstorm wind"
## [17] "dense fog" "rip current"
## [19] "thunderstorm wins" "flash flood"
## [21] "flash flooding" "high winds"
## [23] "funnel cloud" "tornado f0"
## [25] "thunderstorm winds lightning"
# summary(rawdata) # Uncomment the summary output
# Preview the first three rows of the processed table.
head(x = rawdata, n = 3L)
## state__ bgn_date bgn_time time_zone county countyname state
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## evtype bgn_range bgn_azi bgn_locati end_date end_time county_end
## 1 tornado 0 0
## 2 tornado 0 0
## 3 tornado 0 0
## countyendn end_range end_azi end_locati length width f mag fatalities
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## injuries propdmg propdmgexp cropdmg cropdmgexp wfo stateoffic zonenames
## 1 15 25.0 k 0
## 2 0 2.5 k 0
## 3 2 25.0 k 0
## latitude longitude latitude_e longitude_ remarks refnum year cropdmgmag
## 1 3040 8812 3051 8806 1 1950 1
## 2 3042 8755 0 0 2 1950 1
## 3 3340 8742 0 0 3 1951 1
## propdmgmag cropdmgval propdmgval
## 1 1000 0 25000
## 2 1000 0 2500
## 3 1000 0 25000
Aggregate injuries, fatalities, and crop and property damage values by event type.
# Total injuries, fatalities and damage values for each event type.
newStormData <- aggregate(
  cbind(injuries, fatalities, cropdmgval, propdmgval) ~ evtype,
  data = rawdata,
  FUN = sum
)

# Health impact: injuries plus fatalities, ranked from highest to lowest.
newStormData$hlthdmg <- with(newStormData, injuries + fatalities)
health_rank <- order(newStormData$hlthdmg, decreasing = TRUE)
public_health_impact_data <- newStormData[
  health_rank,
  c("evtype", "injuries", "fatalities", "hlthdmg")
]

# Economic impact: crop plus property damage, ranked from highest to lowest.
newStormData$econdmg <- with(newStormData, cropdmgval + propdmgval)
economic_rank <- order(newStormData$econdmg, decreasing = TRUE)
economic_impact_data <- newStormData[
  economic_rank,
  c("evtype", "cropdmgval", "propdmgval", "econdmg")
]

# Keep the ten highest-ranked event types, drop the combined total column and
# reshape to long form: one row per (evtype, damage category) pair.
major_health_impact <- melt(
  public_health_impact_data[1:10, c("evtype", "injuries", "fatalities")],
  id.vars = "evtype",
  variable.name = "dmgctgry"
)
major_economic_impact <- melt(
  economic_impact_data[1:10, c("evtype", "cropdmgval", "propdmgval")],
  id.vars = "evtype",
  variable.name = "dmgctgry"
)
View the Health Impact Data
# Inspect the structure of the long-format health impact table.
str(object = major_health_impact)
## 'data.frame': 20 obs. of 3 variables:
## $ evtype : chr "tornado" "excessive heat" "tstm wind" "flood" ...
## $ dmgctgry: Factor w/ 2 levels "injuries","fatalities": 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 91346 6525 6957 6789 5230 ...
View the Economic Impact Data
# Inspect the structure of the long-format economic impact table.
str(object = major_economic_impact)
## 'data.frame': 20 obs. of 3 variables:
## $ evtype : chr "flood" "hurricane/typhoon" "tornado" "storm surge" ...
## $ dmgctgry: Factor w/ 2 levels "cropdmgval","propdmgval": 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 5.66e+09 2.61e+09 4.15e+08 5.00e+03 3.03e+09 ...
Events (as indicated in the EVTYPE variable) most harmful with respect to population health.
# Stacked horizontal bars of injuries and fatalities per event type.
# The y values are raw person counts, so the axis label no longer claims
# "in thousands". The `color = "black"` entry was removed from labs(): it only
# set a legend label for a colour aesthetic that this plot does not map.
ggplot(major_health_impact,
       aes(x = reorder(evtype, value), y = value, fill = dmgctgry)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Health Impacting Weather Events",
       x = "Weather Events",
       y = "Most Health Impact | Health Damage (number of people)") +
  coord_flip()
Events that have the greatest economic consequences.
# Stacked horizontal bars of crop and property damage per event type.
# Fixes the title, which had been copied from the health plot, and the axis
# label: the plotted values are unscaled USD amounts, not thousands. The
# `color = "black"` entry was removed from labs(): it only set a legend label
# for a colour aesthetic that this plot does not map.
ggplot(major_economic_impact,
       aes(x = reorder(evtype, value), y = value, fill = dmgctgry)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Economic Impacting Weather Events",
       x = "Weather Events",
       y = "Most Economic Impact | Property and Crop Damage (USD)") +
  coord_flip()
# Show the five event types with the largest combined economic damage.
economic_impact_data[seq_len(5), ]
## evtype cropdmgval propdmgval econdmg
## 148 flood 5661968450 144657709807 150319678257
## 367 hurricane/typhoon 2607872800 69305840000 71913712800
## 754 tornado 414953270 56937160779 57352114049
## 595 storm surge 5000 43323536000 43323541000
## 206 hail 3025954473 15732267543 18758222016
Floods cause the greatest economic damage of all weather events, accounting for a total of 150,319,678,257 USD.
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.