This analysis aims to find the environmental events that have the most negative effects on population health and the economy. The data used in this analysis is the Storm Data from the National Oceanic and Atmospheric Administration (NOAA). The data covers the period from the start of 1950 to November 2011. This report contains two more sections. The first, Data Processing, shows how the data was read and preprocessed to reach the results. The second, Results, presents the findings of this analysis.
# ---- Acquire the raw data ----
# Source locations of the compressed storm data set and of its codebook.
file1url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
file2url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf"

# Download each file only when it is not already on disk, so re-running the
# report does not fetch the large archive again. `mode = "wb"` keeps the bz2
# archive and the PDF byte-exact on Windows; leaving `method` at its default
# removes the dependency on an external curl executable.
if (!file.exists("NOAA.csv.bz2")) {
  download.file(file1url, destfile = "NOAA.csv.bz2", mode = "wb")
}
if (!file.exists("NOAAcodebook.pdf")) {
  download.file(file2url, destfile = "NOAAcodebook.pdf", mode = "wb")
}

# Time the data file was written, in the same layout `date()` produces. Taking
# it from the file keeps the stamp correct when the cached copy is reused.
Downloadtime <- format(file.mtime("NOAA.csv.bz2"), "%a %b %d %H:%M:%S %Y")

# read.csv() decompresses the .bz2 file transparently.
noaa <- read.csv("NOAA.csv.bz2")

# Quick structural look at the raw data.
str(noaa)
head(noaa)
tail(noaa)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(reshape2)
library(knitr)
# Keep only the columns needed for the health and economic analysis, giving
# each one a descriptive lower-case name as it is selected.
dataf <- select(
  noaa,
  events = EVTYPE,
  fatalities = FATALITIES,
  injuries = INJURIES,
  property.damage = PROPDMG,
  property.damage.exponent = PROPDMGEXP,
  crop.damage = CROPDMG,
  crop.damage.exponent = CROPDMGEXP
)
It was found that the events column contains typos and stray symbols, which make it hard to group identical events together.
# ---- Normalise event names ----
# Upper-case everything first so the fixes below only need one spelling.
dataf$events <- toupper(as.character(dataf$events))

# Pattern -> replacement pairs (names are regular expressions). The pairs are
# applied one after another to the result of the previous pair, so ORDER
# MATTERS: later rules see the output of earlier ones.
#
# Changes relative to the previous version of this table:
#   * "FLOODIN" = "FLOOD" became "FLOODING?" = "FLOOD". The old rule turned
#     "FLOODING" into "FLOODG" (the trailing "G" survived and the later "ING$"
#     rule could no longer match), so those rows were not grouped with
#     FLOOD / FLASH FLOOD. Totals for the flood groups will shift slightly;
#     re-knit the report so the published tables reflect it.
#   * Exact duplicate pairs for AVALANCE and the HAIL sizes were dropped; once
#     the first occurrence has run, the pattern can no longer be present.
event_fixes <- c(
  "AVALANCE" = "AVALANCHE",
  "HAIL 0 75" = "HAIL 075",
  "HAIL 75" = "HAIL 075",
  "HAIL 1 00" = "HAIL 100",
  "HAIL 1 75" = "HAIL 175",
  "HAIL 0 88" = "HAIL 088",
  "HAIL 88" = "HAIL 088",
  "WINTERY" = "WINTER",
  "WINTRY" = "WINTER",
  # NOTE(review): "LIGHTN" also matches inside a correct "LIGHTNING", giving
  # "LIGHTNINGING"; the "ING$" rule below undoes that only at the end of the
  # name. Left unchanged because removing it would let "ING$" truncate
  # "LIGHTNING" - the two rules need to be redesigned together.
  "LIGHTN" = "LIGHTNING",
  "MIRCOBURST" = "MICROBURST",
  "FLOODING?" = "FLOOD",
  "SML" = "SMALL",
  "FLD" = "FLOOD",
  "TSTMW" = "TSTM WIND",
  "TORNADOE" = "TORNADO",
  "TORNDAO" = "TORNADO",
  "VOG" = "FOG",
  "THUNDERSTORM" = "TSTM",
  # NOTE(review): the rule above can create new "TSTMW..." text (from
  # "THUNDERSTORMW..."); this rule then strips the "W", which means the third
  # "TSTMW" rule two lines below can never match. Kept as-is to preserve the
  # current grouping - confirm the intended outcome before changing.
  "TSTMW" = "TSTM",
  "WINDS" = "WIND",
  "TSTMW" = "TSTM WIND",
  "WIND WIND" = "WIND",
  "TSTMS" = "TSTM",
  "TSTM W IND" = "TSTM WIND",
  "TUNDERSTORM" = "TSTM",
  "WAYTER" = "WATER",
  "WATER SPOUT" = "WATERSPOUT",
  "WILDFIRE" = "WILD FIRE",
  "UNSEASONAL" = "UNSEASONABLY",
  "UNSEASONABLE" = "UNSEASONABLY",
  # Trailing-suffix clean-up: plural "S", "ING", whitespace, dangling "AND",
  # truncated "WIN", and speed units.
  "S$" = "",
  "ING$" = "",
  "\\s$" = "",
  "AND$" = "",
  "WIN$" = "WIND",
  "MPH$" = "",
  # Second whitespace trim: removing "MPH" can expose a new trailing space.
  "\\s$" = "",
  " G$" = "",
  # Punctuation becomes a space so "A/B" and "A B" group together.
  "[[:punct:]]" = " ",
  "WND" = "WIND"
)
dataf$events <- str_replace_all(dataf$events, event_fixes)
The property.damage.exponent and crop.damage.exponent columns were found to contain a mix of different symbols.
From NOAAcodebook.pdf, these symbols can be understood as:
H,h = hundreds = 100
K,k = kilos = thousands = 1,000
M,m = millions = 1,000,000
B,b = billions = 1,000,000,000
Other symbols and numbers:
"+", "-", "?" = 0: ambiguous symbols. No source was found that reveals the meaning behind these symbols; none of the links that could be accessed on the NOAA site give any explanation.
blank/empty character = 0
numeric 0,1,2,3,4,5,6,7,8 = will be kept as is
# ---- Convert damage exponent symbols to numeric multipliers ----

# Map a vector of exponent symbols to the numeric multiplier used by this
# report.
#
# symbol: character (or factor) vector of exponent codes.
# Returns a numeric vector of the same length:
#   "-", "?", "+" and blank  -> 0
#   "h"/"H" -> 100, "k"/"K" -> 1,000, "m"/"M" -> 1,000,000, "b"/"B" -> 1e9
#   digit strings            -> the number itself (e.g. "5" -> 5)
#   anything else, or NA     -> NA
#
# Whole values are looked up instead of running regex replacements on
# substrings, and every multiplier is an explicit number. The earlier code mixed
# a bare numeric 1000000000 into a character vector, which R stored as the
# string "1e+09". It also left scratch variables named `c`, `d` and `e` behind.
#
# NOTE(review): digit symbols are used as literal multipliers, following the
# approach stated in this report ("will be kept as is"); confirm against the
# codebook whether they should instead be powers of ten.
exponent_multiplier <- function(symbol) {
  symbol <- as.character(symbol)
  lookup <- c(
    "-" = 0, "?" = 0, "+" = 0,
    "h" = 1e2, "H" = 1e2,
    "k" = 1e3, "K" = 1e3,
    "m" = 1e6, "M" = 1e6,
    "b" = 1e9, "B" = 1e9
  )

  # match() yields NA for symbols that are not in the table (and for NA input).
  multiplier <- unname(lookup[match(symbol, names(lookup))])

  # Blank entries count as zero.
  is_blank <- !is.na(symbol) & !nzchar(symbol)
  multiplier[is_blank] <- 0

  # Purely numeric symbols are kept as their own value.
  is_digits <- !is.na(symbol) & grepl("^[0-9]+$", symbol)
  multiplier[is_digits] <- as.numeric(symbol[is_digits])

  multiplier
}

# Both damage types share one mapping so they cannot drift apart.
dataf$property.damage.exponent <- exponent_multiplier(dataf$property.damage.exponent)
dataf$crop.damage.exponent <- exponent_multiplier(dataf$crop.damage.exponent)
# ---- Population health: fatalities and injuries per event ----
# Sum fatalities and injuries for every event type.
health_by_event <- group_by(select(dataf, events, fatalities, injuries), events)
health_totals <- summarise(
  health_by_event,
  total.fatalities = sum(fatalities),
  total.injuries = sum(injuries)
)

# Ascending order (fatalities first, injuries as tie-breaker): the most harmful
# events end up in the last rows.
fainjdata <- arrange(health_totals, total.fatalities, total.injuries)

# The ten most harmful events; row 10 is the single worst one.
top_health <- tail(fainjdata, 10)
max.event.pop <- as.character(top_health[10, 1])
max.fatalities <- as.character(top_health[10, 2])
max.injuries <- as.character(top_health[10, 3])

# Markdown table of the top ten, for inclusion in the report.
table1 <- kable(top_health)
| events | total.fatalities | total.injuries |
|---|---|---|
| AVALANCHE | 225 | 170 |
| HIGH WIND | 283 | 1439 |
| FLOOD | 470 | 6789 |
| RIP CURRENT | 572 | 529 |
| TSTM WIND | 701 | 9358 |
| LIGHTNING | 816 | 5230 |
| HEAT | 937 | 2100 |
| FLASH FLOOD | 980 | 1777 |
| EXCESSIVE HEAT | 1903 | 6525 |
| TORNADO | 5633 | 91346 |
# Reshape the top ten to long form (one row per event and measure) so that
# fatalities and injuries can share one plot.
top_health_long <- melt(tail(fainjdata, 10), id.vars = "events")
g1 <- ggplot(top_health_long, aes(value, variable, colour = events))

# One point per event and measure; colour identifies the event.
health_title <- "Top 10 events harmful to population health\nfrom 1950 to November 2011"
g1 +
  geom_point(size = 4) +
  labs(
    x = "Number of incidents",
    y = "Injuries and Fatalities",
    title = health_title
  ) +
  theme(plot.title = element_text(lineheight = .8, face = "bold"))
It was found that the event most harmful to population health, in terms of both fatalities and injuries, is TORNADO, with 5633 fatalities and 91346 injuries over the time period of the analysis.
Two types of damage data are collected: property damage and crop damage. For each there are two columns: the damage column and the damage exponent column. The approach is to multiply the damage column by the damage exponent column, then sum the property and crop damage for each event.
# ---- Economic consequences: property + crop damage per event ----
# Per-row damage is a plain element-wise product, so it is computed before
# grouping (grouping first only made mutate() do the same work group by group).
dmgdata <- dataf %>%
  select(events, property.damage, property.damage.exponent,
         crop.damage, crop.damage.exponent) %>%
  mutate(
    prop.damage.total = property.damage * property.damage.exponent,
    crop.damage.total = crop.damage * crop.damage.exponent,
    damage = prop.damage.total + crop.damage.total
  ) %>%
  group_by(events) %>%
  summarise(total.damage = sum(damage)) %>%
  # Drop events whose total is NA. filter() replaces the earlier row subsetting
  # with a one-column logical matrix, which depended on tibble-specific
  # indexing behaviour.
  filter(!is.na(total.damage)) %>%
  # Ascending order: the costliest events end up in the last rows.
  arrange(total.damage) %>%
  mutate(total.damage.in.billions = total.damage / 1000000000) %>%
  select(events, total.damage.in.billions)

# Keep the eight costliest events.
dmgdata <- tail(dmgdata, 8)

# The last row is the single costliest event; indexing by nrow() instead of a
# literal 8 stays correct even if fewer than eight events remain.
worst_row <- nrow(dmgdata)
max.event.dmg <- as.character(dmgdata[worst_row, 1])
max.damage <- as.character(dmgdata[worst_row, 2])

# Markdown table of the top eight, for inclusion in the report.
table2 <- kable(dmgdata)
| events | total.damage.in.billions |
|---|---|
| HURRICANE | 14.61023 |
| DROUGHT | 15.01867 |
| FLASH FLOOD | 17.57086 |
| HAIL | 18.75822 |
| STORM SURGE | 43.32354 |
| TORNADO | 57.35212 |
| HURRICANE TYPHOON | 71.91371 |
| FLOOD | 150.32573 |
# Dot plot of total damage (in billions) for the costliest events.
g2 <- ggplot(dmgdata, aes(total.damage.in.billions, events, colour = events))

# Axis ticks every 20 billion, starting at 10.
damage_breaks <- seq(from = 10, to = 200, by = 20)
damage_title <- "Top 8 events have the greatest economic consequences\nfrom 1950 to November 2011"
g2 +
  geom_point(size = 4) +
  scale_x_continuous(breaks = damage_breaks) +
  labs(x = "Total damage in Billions", y = "Events", title = damage_title) +
  theme(plot.title = element_text(lineheight = .8, face = "bold"))
It was found that the event with the greatest economic consequences is FLOOD, with 150.32572825 billion USD in damage over the time period of the analysis.