Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.
This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
Exploratory Analysis on NOAA Storm Database from 1950 to 2011.
The data for this assignment come in the form of a comma-separated-value file compressed via the bzip2 algorithm to reduce its size. The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
In this section I will load data into a dataframe and then briefly describe its columns.
Load data:
# Read the storm events CSV from the working directory with data.table's fread().
StormData <- fread(file = "repdata_data_StormData.csv")
Dataframe dimensions:
# Number of rows, then number of columns, of the loaded table.
c(nrow(StormData), ncol(StormData))
## [1] 902297 37
Dataframe header:
# Preview the first six rows of the raw table.
head(StormData, n = 6L)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1: 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2: 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3: 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4: 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5: 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6: 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1: TORNADO 0 0 NA
## 2: TORNADO 0 0 NA
## 3: TORNADO 0 0 NA
## 4: TORNADO 0 0 NA
## 5: TORNADO 0 0 NA
## 6: TORNADO 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1: 0 14.0 100 3 0 0 15 25.0
## 2: 0 2.0 150 2 0 0 0 2.5
## 3: 0 0.1 123 2 0 0 2 25.0
## 4: 0 0.0 100 2 0 0 2 2.5
## 5: 0 0.0 150 2 0 0 2 2.5
## 6: 0 1.5 177 2 0 0 6 2.5
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1: K 0 3040 8812
## 2: K 0 3042 8755
## 3: K 0 3340 8742
## 4: K 0 3458 8626
## 5: K 0 3412 8642
## 6: K 0 3450 8748
## LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1: 3051 8806 1
## 2: 0 0 2
## 3: 0 0 3
## 4: 0 0 4
## 5: 0 0 5
## 6: 0 0 6
Since this is quite a large data frame, I will check the column names and keep only the relevant ones.
# List every column name so the relevant ones can be picked out.
colnames(StormData)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# Keep only the columns used in the analysis: event type, casualties, and the
# damage amounts with their magnitude codes. Selecting by name rather than by
# position (8, 23:28) keeps the subset correct even if the column order of the
# input file changes.
relevant_columns <- c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)
# with = FALSE makes data.table treat the character vector as column names.
data <- StormData[, relevant_columns, with = FALSE]
head(data)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1: TORNADO 0 15 25.0 K 0
## 2: TORNADO 0 0 2.5 K 0
## 3: TORNADO 0 2 25.0 K 0
## 4: TORNADO 0 2 2.5 K 0
## 5: TORNADO 0 2 2.5 K 0
## 6: TORNADO 0 6 2.5 K 0
Now I will briefly analyze the dataset using the summary function:
# Per-column summary statistics of the reduced dataset.
summary(object = data)
## EVTYPE FATALITIES INJURIES PROPDMG
## Length:902297 Min. : 0.0000 Min. : 0.0000 Min. : 0.00
## Class :character 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00
## Mode :character Median : 0.0000 Median : 0.0000 Median : 0.00
## Mean : 0.0168 Mean : 0.1557 Mean : 12.06
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.50
## Max. :583.0000 Max. :1700.0000 Max. :5000.00
## PROPDMGEXP CROPDMG CROPDMGEXP
## Length:902297 Min. : 0.000 Length:902297
## Class :character 1st Qu.: 0.000 Class :character
## Mode :character Median : 0.000 Mode :character
## Mean : 1.527
## 3rd Qu.: 0.000
## Max. :990.000
# Distribution of fatalities per recorded event.
summary(data[["FATALITIES"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0168 0.0000 583.0000
# Distribution of injuries per recorded event.
summary(data[["INJURIES"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1557 0.0000 1700.0000
Now let’s total the injuries and the fatalities for each event type, keep the top 20 event types of each, and merge the two results together in the total_dataset.
# Injuries aggregation: total injuries per event type, largest first,
# keeping at most the 20 highest event types.
total_injuries <- aggregate(INJURIES ~ EVTYPE, data, sum)
total_injuries <- arrange(total_injuries, desc(INJURIES))
# head() instead of [1:20, ]: no NA padding rows if there are fewer than 20.
total_injuries <- head(total_injuries, 20)

# Fatalities aggregation: same procedure for fatalities.
total_fatalities <- aggregate(FATALITIES ~ EVTYPE, data, sum)
total_fatalities <- arrange(total_fatalities, desc(FATALITIES))
total_fatalities <- head(total_fatalities, 20)

# Final dataset: merge() performs an inner join here, so only event types
# present in BOTH top-20 lists are kept.
total_dataset <- merge(total_injuries, total_fatalities, by = "EVTYPE")
# desc() accepts a single vector, so each sort key gets its own desc() call:
# order by injuries, breaking ties by fatalities.
total_dataset <- arrange(total_dataset, desc(INJURIES), desc(FATALITIES))
total_dataset
## EVTYPE INJURIES FATALITIES
## 1 TORNADO 91346 5633
## 2 TSTM WIND 6957 504
## 3 FLOOD 6789 470
## 4 EXCESSIVE HEAT 6525 1903
## 5 LIGHTNING 5230 816
## 6 HEAT 2100 937
## 7 FLASH FLOOD 1777 978
## 8 THUNDERSTORM WIND 1488 133
## 9 WINTER STORM 1321 206
## 10 HIGH WIND 1137 248
## 11 HEAVY SNOW 1021 127
## 12 BLIZZARD 805 101
# Grouped bar chart: one group per event type, one bar per harm measure.
events <- total_dataset$EVTYPE
harm_columns <- c("INJURIES", "FATALITIES")
harm_colours <- c("#FF6633", "#3399FF")
# After transposing, each row is one bar within a group, so colours are
# applied in harm_columns order: injuries first, then fatalities.
harm_matrix <- t(as.matrix(total_dataset[, harm_columns]))
barplot(
  harm_matrix,
  names.arg = events,
  beside = TRUE,
  ylim = c(0, 95000),
  cex.names = 0.8,
  las = 2,
  col = harm_colours,
  main = "Most Harmful Events"
)
# The legend labels must follow the same order as the matrix rows and colours;
# listing "Fatalities" first would attach it to the injuries colour.
legend("topright", c("Injuries", "Fatalities"), fill = harm_colours, bty = "n")
As we can see from the plot above, tornadoes, thunderstorm winds, floods, excessive heat and lightning are the most harmful events.
In order to see the results, we first need to convert the property and crop damage values into numbers. We then combine crop and property damage into a single variable and sort the dataset from the highest total to the lowest.
# Number conversion: PROPDMGEXP / CROPDMGEXP hold a magnitude code for the
# amounts in PROPDMG / CROPDMG (H = 10^2, K = 10^3, M = 10^6, B = 10^9).
damage_multiplier <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)

# Convert a coded damage amount to a plain number.
#   amount:   numeric vector of damage amounts
#   exp_code: character vector of magnitude codes, same length as amount
# Returns amount * multiplier; rows whose code is not H/K/M/B (for example a
# blank code) get 0. Codes are upper-cased before the lookup so that "k" and
# "K" are treated alike instead of the lower-case form being valued at 0.
damage_value <- function(amount, exp_code) {
  multiplier <- unname(damage_multiplier[toupper(exp_code)])
  ifelse(is.na(multiplier), 0, amount * multiplier)
}

data$PROPDAMAGE <- damage_value(data$PROPDMG, data$PROPDMGEXP)
data$CROPDAMAGE <- damage_value(data$CROPDMG, data$CROPDMGEXP)

# Final sorted dataset: total property + crop damage per event type,
# largest first, keeping at most the 20 costliest event types.
damage_final_total <- aggregate(PROPDAMAGE + CROPDAMAGE ~ EVTYPE, data, sum)
names(damage_final_total) <- c("EVENT_TYPE", "TOTAL_DAMAGE")
damage_final_total <- arrange(damage_final_total, desc(TOTAL_DAMAGE))
# head() instead of [1:20, ]: no NA padding rows if there are fewer than 20.
damage_final_total <- head(damage_final_total, 20)
# Express the totals in billions.
damage_final_total$TOTAL_DAMAGE <- damage_final_total$TOTAL_DAMAGE / 1e9
# Fix the factor levels to the current (sorted) order so that anything using
# the level order keeps the ranking by damage rather than alphabetical order.
damage_final_total$EVENT_TYPE <- factor(
  damage_final_total$EVENT_TYPE,
  levels = damage_final_total$EVENT_TYPE
)
head(damage_final_total)
## EVENT_TYPE TOTAL_DAMAGE
## 1 FLOOD 150.31968
## 2 HURRICANE/TYPHOON 71.91371
## 3 TORNADO 57.34061
## 4 STORM SURGE 43.32354
## 5 HAIL 18.75290
## 6 FLASH FLOOD 17.56213
And by plotting the resulting dataset:
# Bar chart of total (property + crop) damage for the costliest event types.
# TOTAL_DAMAGE has already been divided by 10^9, hence the "$Billions" label.
with(
  damage_final_total,
  barplot(
    TOTAL_DAMAGE,
    names.arg = EVENT_TYPE,
    # TRUE spelled out: T is an ordinary variable and can be reassigned.
    beside = TRUE,
    cex.names = 0.8,
    las = 2,
    col = "#003366",
    main = "Top Property and Crop Damage",
    ylab = "Total Damage in $Billions"
  )
)
As we can see, floods, hurricanes/typhoons, tornadoes, storm surges, hail and flash floods have the biggest impact on crops and property.