The goal of this analysis is to determine which type of weather events across the US are the most harmful to public health and which have the greatest economic consequences. I used the NOAA Storm Database which tracked weather events from 1950 until 2011. I determined that tornados are the most significant storm event with respect to harming people and causing economic damage.
if( !file.exists('data')){
dir.create('data')
fileUrl <- 'http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'
download.file(fileUrl, destfile = 'data/sampledata.csv.bz2')
}
data <- read.csv('data/sampledata.csv.bz2', header=TRUE)
#head(data, 5) # display the first 5 rows
I am interested in 5 columns from the whole data set:
EVTYPE : Weather Event type
FATALITIES : no. of deaths
INJURIES : no. of injured
CROPDMG : no. of crop damage
PROPDMG : no. of property damage
sapply(data, class) # Retrieve name and class of all columns in the data.frame called "data"
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME
## "numeric" "factor" "factor" "factor" "numeric" "factor"
## STATE EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE
## "factor" "factor" "numeric" "factor" "factor" "factor"
## END_TIME COUNTY_END COUNTYENDN END_RANGE END_AZI END_LOCATI
## "factor" "numeric" "logical" "numeric" "factor" "factor"
## LENGTH WIDTH F MAG FATALITIES INJURIES
## "numeric" "numeric" "integer" "numeric" "numeric" "numeric"
## PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC
## "numeric" "factor" "numeric" "factor" "factor" "factor"
## ZONENAMES LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS
## "factor" "numeric" "numeric" "numeric" "numeric" "factor"
## REFNUM
## "numeric"
Apparently, required data are technically in the correct form.
is.special <- function(x){
if(is.numeric(x)) !is.finite(x) else { is.na(x) || is.null(x)}
}
test <- sapply(data$FATALITIES, is.special)
test <- factor(test) # coerce test to factor
levels(test) # return FALSE only indicating no NA, INF, NaN or NULL values exist
## [1] "FALSE"
# levels(data$EVTYPE) # Lots of inconsistent levels appear
data$EVTYPE <- toupper(data$EVTYPE)
data$EVTYPE <- gsub('.*TSTM.*', 'THUNDERSTORM', data$EVTYPE)
data$EVTYPE <- gsub('.*THUNDERSTORM.*','THUNDERSTORM', data$EVTYPE)
data$EVTYPE <- gsub('.*HEAT.*', 'HEAT', data$EVTYPE)
data$EVTYPE <- gsub('.*FLOOD.*', 'FLOOD', data$EVTYPE)
data$EVTYPE <- gsub('.*FLD.*', 'FLOOD', data$EVTYPE)
data$EVTYPE <- gsub('.*LIGHTNING.*', 'LIGHTNING', data$EVTYPE)
data$EVTYPE <- gsub('.*TROPICAL STORM.*', 'TROPICAL STORM', data$EVTYPE)
data$EVTYPE <- gsub('.*HURRICANE.*', 'TROPICAL STORM', data$EVTYPE)
data$EVTYPE <- gsub('.*CYCLONE.*', 'TROPICAL STORM', data$EVTYPE)
data$EVTYPE <- gsub('.*TROPICAL DEPRESSION.*', 'TROPICAL STORM', data$EVTYPE)
data$EVTYPE <- gsub('.*TYPHOON.*', 'TROPICAL STORM', data$EVTYPE)
data$EVTYPE <- gsub('.*STORM SURGE.*', 'TROPICAL STORM', data$EVTYPE)
data$EVTYPE <- gsub('.*TORNADO.*', 'TORNADO', data$EVTYPE)
data$EVTYPE <- gsub('.*SNOW.*', 'SNOW', data$EVTYPE)
data$EVTYPE <- gsub('.*WINTER.*', 'WINTER', data$EVTYPE)
data$EVTYPE <- gsub('.*ICE.*', 'ICE', data$EVTYPE)
data$EVTYPE <- gsub('.*BLIZZARD.*', 'BLIZZARD', data$EVTYPE)
data$EVTYPE <- gsub('.*COLD.*', 'COLD', data$EVTYPE)
data$EVTYPE <- gsub('.*ICY.*', 'ICE', data$EVTYPE)
data$EVTYPE <- gsub('.*FREEZE.*', 'FREEZE', data$EVTYPE)
data$EVTYPE <- gsub('.*WIND.*', 'WINDS', data$EVTYPE)
data$EVTYPE <- gsub('.*RAIN.*', 'RAIN', data$EVTYPE)
data$EVTYPE <- gsub('.*SHOWER.*', 'HEAVY RAIN', data$EVTYPE)
data$EVTYPE <- gsub('.*FIRE.*', 'FIRE', data$EVTYPE)
data$EVTYPE <- gsub('.*SLIDE.*', 'LANDSLIDE', data$EVTYPE)
data$EVTYPE <- gsub('.*FOG.*', 'FOG', data$EVTYPE)
data$EVTYPE <- gsub('.*HAIL.*', 'HAIL', data$EVTYPE)
Now, data is ready for analysis…
library(data.table)
# select required data
health <- data.table(subset(data, select = c('EVTYPE', 'FATALITIES', 'INJURIES')))
# calculate sum of dead and injured people grouped by EVTYPE
health.totals <- health[,list(FATALITIES=sum(FATALITIES), INJURIES=sum(INJURIES)), by='EVTYPE']
# add a new column called HUMAN_HARM that stores the sum of dead and injured people
health.totals$HUMAN_HARM <- health.totals$FATALITIES + health.totals$INJURIES
# sort the data descendingly and store the top 5 rows
health.top <- head(subset(health.totals[order(health.totals$HUMAN_HARM, decreasing=T)]), 5)
Now, we plot the sorted data (health.top)
# convert data.table to data.frame
health.top <- data.frame(health.top)
# plot bar chart
library(ggplot2)
ggplot(data=health.top, aes(x=EVTYPE, y = HUMAN_HARM)) +
geom_bar(color="black", fill="red", width=0.7, stat="identity") +
xlab("Event Type") + ylab("Human Harm") +
ggtitle("Human Harm vs. Event Type")
# store the worst event
event <- toString(with(health.top, EVTYPE[1]))
event
## [1] "TORNADO"
Apparently, TORNADO is the worst event. Now, the break down of the human harm needs to be checked…
## draw a bar chart for 3 quantitative data
library(data.table)
names(health.top) <- c("Event", "Deaths", "Injuries", "Human_Harm")
library(reshape2)
melted <- melt(health.top ,id.vars = 'Event')
ggplot(melted, aes(x = Event , y = value)) +
geom_bar(aes(fill = variable ),position = "dodge",stat="identity", width=0.7) +
labs(x ='EVENT TYPE' , y = 'TOTAL',title = 'TOP 5 Fatalities/Injuries Count by Event Type')
From the above plots, we may conclude that the most harmful event is TORNADO because it causes the maximum number of injuries is 9.1407 × 104 and the maximum number of death is 5636. The sum of all the aforementioned human harms is 9.7043 × 104.
# select required data
library(data.table)
property <- data.table(subset(data, select=c('EVTYPE', 'CROPDMG', 'PROPDMG')))
# calculate sum of crop and property damages grouped by EVTYPE
property.totals <- property[,list(CROPDMG=sum(CROPDMG), PROPDMG=sum(PROPDMG)), by='EVTYPE']
# add a new column called HUMAN_HARM that stores the sum of crop and property damages
property.totals$ECONOMIC_HARM <- property.totals$CROPDMG + property.totals$PROPDMG
# sort the data descendingly and store the top 5 rows
property.top <- head(subset(property.totals[order(property.totals$ECONOMIC_HARM, decreasing=T)]), 5)
Now, we plot the sorted data (property.top)
# convert data.table to data.frame
property.top <- data.frame(property.top)
# plot bar chart
library(ggplot2)
names(property.top) <- c("Event", "Crop_Damage", "Property_Damage", "Economic_Harm")
library(reshape2)
melted <- melt(property.top ,id.vars = 'Event')
# plot bar chart (Note: divide y by 10^6 to scale to millions)
ggplot(melted, aes(x = Event , y = value/10**6)) +
geom_bar(aes(fill = variable ),position = "dodge",stat="identity", width=0.7) +
labs(x ='EVENT TYPE' , y = 'TOTAL Dollars Lost (millions)', title = 'TOP 5 Crop/Prop Damages by Event Type')
# store the worst event
event1 <- toString(with(property.top, Event[1]))
event1
## [1] "TORNADO"
From the above plot, we may conclude that the most harmful event is TORNADO because it causes the maximum economical loss : 3.3158 million dollars..
sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: x86_64-apple-darwin13.1.0 (64-bit)
##
## locale:
## [1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.4 evaluate_0.5.5 formatR_1.0 htmltools_0.2.4
## [5] knitr_1.6 rmarkdown_0.2.68 stringr_0.6.2 tools_3.1.1
## [9] yaml_2.1.13