This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
In conclusion, we found that tornadoes are the most harmful events for population health, causing 5633 fatalities and 91346 injuries in the USA during 1950-2011. In terms of economic consequences, floods cause the most damage, measured as the combined loss from crop and property damage.
The data for this assignment come in the form of a comma-separated-value (CSV) file compressed via the bzip2 algorithm to reduce its size. The data are provided by the U.S. National Oceanic and Atmospheric Administration (NOAA). URL: https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2
# Download the compressed data set unless a local copy already exists.
# The archive is fetched over HTTPS (the documented source URL) and written in
# binary mode so the bzip2 file is never altered by text-mode line-ending
# translation.
data_file <- "repdata-data-StormData.csv.bz2"
if (!file.exists(data_file)) {
  download.file(
    "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
    destfile = data_file,
    mode = "wb"
  )
}
# Stop with a clear message when the archive is missing, instead of skipping
# the read and failing later with "object 'StormData' not found".
if (!file.exists(data_file)) {
  stop("Storm data archive not found: ", data_file, call. = FALSE)
}
# The bzfile() connection lets read.csv() read the compressed file directly,
# so no separate decompression step is needed.
StormData <- read.csv(bzfile(data_file), header = TRUE)
# Compact overview of every column and its type.
str(StormData)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ BGN_TIME : Factor w/ 3608 levels "000","0000","0001",..: 152 167 2645 1563 2524 3126 122 1563 3126 3126 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 826 826 826 826 826 826 826 826 826 826 ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_LOCATI: Factor w/ 54429 levels ""," Christiansburg",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_DATE : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_TIME : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_LOCATI: Factor w/ 34506 levels ""," CANTON"," TULIA",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","+","-","0",..: 16 16 16 16 16 16 16 16 16 16 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","0","2","?",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WFO : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ZONENAMES : Factor w/ 25112 levels ""," "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : Factor w/ 436781 levels "","\t","\t\t",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
# Number of rows (observations) and columns (variables) in the raw data.
dim(StormData)
## [1] 902297 37
# Column names available in the raw data.
names(StormData)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# Preview the first three records.
head(StormData,3)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 0
## 2 TORNADO 0 0
## 3 TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0
## 2 0 2.5 K 0
## 3 2 25.0 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
These are the 7 columns we need from the data set:
# Keep only the event type plus the casualty and damage columns.
NewStormData <- StormData[c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)]
# Total fatalities and total injuries recorded for each event type.
sumofdead <- aggregate(FATALITIES ~ EVTYPE, data = NewStormData, FUN = sum)
sumofinjuries <- aggregate(INJURIES ~ EVTYPE, data = NewStormData, FUN = sum)
# Rank the event types from most to fewest casualties and keep the first ten.
top10sumofdead <- sumofdead[order(-sumofdead[["FATALITIES"]])[1:10], ]
top10sumofinjuries <- sumofinjuries[order(-sumofinjuries[["INJURIES"]])[1:10], ]
print(top10sumofdead)
## EVTYPE FATALITIES
## 826 TORNADO 5633
## 124 EXCESSIVE HEAT 1903
## 151 FLASH FLOOD 978
## 271 HEAT 937
## 453 LIGHTNING 816
## 846 TSTM WIND 504
## 167 FLOOD 470
## 572 RIP CURRENT 368
## 343 HIGH WIND 248
## 19 AVALANCHE 224
# Show the ten event types with the highest injury totals.
top10sumofinjuries
## EVTYPE INJURIES
## 826 TORNADO 91346
## 846 TSTM WIND 6957
## 167 FLOOD 6789
## 124 EXCESSIVE HEAT 6525
## 453 LIGHTNING 5230
## 271 HEAT 2100
## 422 ICE STORM 1975
## 151 FLASH FLOOD 1777
## 753 THUNDERSTORM WIND 1488
## 241 HAIL 1361
# Draw the two casualty rankings side by side as bar charts.
# las = 2 draws the axis labels perpendicular to the axes, and the large bottom
# margin leaves room for the event names under the bars. The previous
# graphical settings are saved so they can be restored afterwards.
old_par <- par(mfrow = c(1, 2), mar = c(10, 3, 10, 3), las = 2, cex = 0.8)
# Top 10 events causing fatalities.
# Colours come from an explicit palette: passing the event names themselves as
# `col` is an error when EVTYPE is a character column, because event names are
# not valid colour names.
barplot(top10sumofdead$FATALITIES,
  names.arg = top10sumofdead$EVTYPE,
  col = rainbow(nrow(top10sumofdead)),
  main = "Top 10 events causing fatalities",
  ylab = "number of fatalities"
)
# Top 10 events causing injuries.
barplot(top10sumofinjuries$INJURIES,
  names.arg = top10sumofinjuries$EVTYPE,
  col = rainbow(nrow(top10sumofinjuries)),
  main = "Top 10 events causing injuries",
  ylab = "number of injuries"
)
# Restore the graphical settings that were active before this figure.
par(old_par)
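Alphabetical characters used to signify magnitude include “H” for hundreds, “K” for thousands, “M” for millions, and “B” for billions. The code below converts each of these codes to the corresponding power of ten and multiplies the damage figure by it; any symbol that cannot be interpreted as a number is treated as a multiplier of 1.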
# Convert damage-magnitude codes into powers of ten.
#
# code: vector of magnitude codes (factor or character).
# Returns a numeric vector of the same length holding the exponent to apply.
#
# The letters H/K/M/B (either case) become 2/3/6/9. A code that is already a
# number is used as the exponent itself. Every other code (empty string,
# symbols, NA) becomes 0, so the damage figure is multiplied by 1.
exponent_power <- function(code) {
  digits <- chartr("HhKkMmBb", "22336699", as.character(code))
  # Codes that are not numbers turn into NA here. That is expected and handled
  # on the next line, so the coercion warning is silenced for this call only.
  power <- suppressWarnings(as.numeric(digits))
  power[is.na(power)] <- 0
  power
}

NewStormData$PROPDMGEXP <- exponent_power(NewStormData$PROPDMGEXP)
NewStormData$CROPDMGEXP <- exponent_power(NewStormData$CROPDMGEXP)
# Scale the raw damage figures by their magnitude.
NewStormData$NewPropDMG <- NewStormData$PROPDMG * 10^NewStormData$PROPDMGEXP
NewStormData$NewCropDMG <- NewStormData$CROPDMG * 10^NewStormData$CROPDMGEXP
# Total property damage per event type; the ten largest totals are kept.
sumofpropertyDMG <- aggregate(NewPropDMG ~ EVTYPE, data = NewStormData, FUN = sum)
top10sumofpropmg <- sumofpropertyDMG[order(-sumofpropertyDMG[["NewPropDMG"]])[1:10], ]
print(top10sumofpropmg)
## EVTYPE NewPropDMG
## 167 FLOOD 144657709807
## 393 HURRICANE/TYPHOON 69305840000
## 826 TORNADO 56947380676
## 656 STORM SURGE 43323536000
## 151 FLASH FLOOD 16822673978
## 241 HAIL 15735267513
## 385 HURRICANE 11868319010
## 839 TROPICAL STORM 7703890550
## 962 WINTER STORM 6688497251
## 343 HIGH WIND 5270046295
# Total crop damage per event type; the ten largest totals are kept.
sumofcropDMG <- aggregate(NewCropDMG ~ EVTYPE, data = NewStormData, FUN = sum)
top10sumofcropmg <- sumofcropDMG[order(-sumofcropDMG[["NewCropDMG"]])[1:10], ]
print(top10sumofcropmg)
## EVTYPE NewCropDMG
## 91 DROUGHT 13972566000
## 167 FLOOD 5661968450
## 577 RIVER FLOOD 5029459000
## 422 ICE STORM 5022113500
## 241 HAIL 3025954473
## 385 HURRICANE 2741910000
## 393 HURRICANE/TYPHOON 2607872800
## 151 FLASH FLOOD 1421317100
## 132 EXTREME COLD 1292973000
## 198 FROST/FREEZE 1094086000
# Combined economic damage of each record: property damage plus crop damage.
NewStormData$sumofDamage <- with(NewStormData, NewPropDMG + NewCropDMG)
# Total combined damage per event type; the ten largest totals are kept.
Damages <- aggregate(sumofDamage ~ EVTYPE, data = NewStormData, FUN = sum)
top10damage <- Damages[order(-Damages[["sumofDamage"]])[1:10], ]
print(top10damage)
## EVTYPE sumofDamage
## 167 FLOOD 150319678257
## 393 HURRICANE/TYPHOON 71913712800
## 826 TORNADO 57362333946
## 656 STORM SURGE 43323541000
## 241 HAIL 18761221986
## 151 FLASH FLOOD 18243991078
## 91 DROUGHT 15018672000
## 385 HURRICANE 14610229010
## 577 RIVER FLOOD 10148404500
## 422 ICE STORM 8967041360
# Draw the three damage rankings side by side as bar charts.
# `title` is not a graphical parameter, so it is no longer passed to par()
# (it only produced a warning). The previous graphical settings are saved so
# they can be restored afterwards.
old_par <- par(mfrow = c(1, 3), mar = c(10, 3, 10, 3), las = 2, cex = 0.5)
# Top events causing Property Damage.
# Each plot takes its colours from an explicit palette sized to its own data;
# event names are not valid colour names.
barplot(top10sumofpropmg$NewPropDMG,
  names.arg = top10sumofpropmg$EVTYPE,
  col = rainbow(nrow(top10sumofpropmg)),
  main = "Top events causing Property Damage",
  ylab = "property damage"
)
# Top events causing Crop Damage.
barplot(top10sumofcropmg$NewCropDMG,
  names.arg = top10sumofcropmg$EVTYPE,
  col = rainbow(nrow(top10sumofcropmg)),
  main = "Top events causing Crop Damage",
  ylab = "crop damage"
)
# Top events causing Most Economic impact (property and crop damage combined).
barplot(top10damage$sumofDamage,
  names.arg = top10damage$EVTYPE,
  col = rainbow(nrow(top10damage)),
  main = "Top events causing Most Economic impact",
  ylab = "total damage (property + crop)"
)
# Restore the graphical settings that were active before this figure.
par(old_par)
In conclusion, we found that tornadoes are the most harmful events for population health, causing 5633 fatalities and 91346 injuries in the USA during 1950-2011. In terms of economic consequences, floods cause the most damage, measured as the combined loss from crop and property damage.