In this report we aim to identify the type of severe weather events that were the most harmful and with the greatest economic consequences in USA from 1950 to 2011. Our overall hypothesis is that a small number of events represent more than 80% of the harm with respect to the population health and more than 80% of the economic consequences. To investigate this hypothesis we used data from the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. From these data we found that 10% of events caused 90% of fatalities and 96% of injuries. With regard to economic consequences, 97% were caused by 5% of the events.
We downloaded the data file from the coursera course website. The data were in the form of a comma-separated-value file compressed via the bzip2 algorithm to reduce its size. After reading in the data file we checked the first few row in the dataset and the structure.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.6.1
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
data<-read.csv("repdata_data_StormData.csv.bz2", header = TRUE, sep = ",",stringsAsFactors = FALSE)
dim(data)
## [1] 902297 37
str(data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
Then we created a subset of database including only variables that will be nused in the analysis. We changed the variable date from character format to date format and also checked for missiong values
data_1<-select(data, BGN_DATE, COUNTYNAME, STATE, EVTYPE, FATALITIES, INJURIES, PROPDMG, CROPDMG )
data_1$date2<-mdy_hms(data_1$BGN_DATE)
We looked at the summary of the variables of interest to have an idea of their distributions
head(data_1)
## BGN_DATE COUNTYNAME STATE EVTYPE FATALITIES INJURIES PROPDMG
## 1 4/18/1950 0:00:00 MOBILE AL TORNADO 0 15 25.0
## 2 4/18/1950 0:00:00 BALDWIN AL TORNADO 0 0 2.5
## 3 2/20/1951 0:00:00 FAYETTE AL TORNADO 0 2 25.0
## 4 6/8/1951 0:00:00 MADISON AL TORNADO 0 2 2.5
## 5 11/15/1951 0:00:00 CULLMAN AL TORNADO 0 2 2.5
## 6 11/15/1951 0:00:00 LAUDERDALE AL TORNADO 0 6 2.5
## CROPDMG date2
## 1 0 1950-04-18
## 2 0 1950-04-18
## 3 0 1951-02-20
## 4 0 1951-06-08
## 5 0 1951-11-15
## 6 0 1951-11-15
summary(data_1)
## BGN_DATE COUNTYNAME STATE
## Length:902297 Length:902297 Length:902297
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## EVTYPE FATALITIES INJURIES
## Length:902297 Min. : 0.0000 Min. : 0.0000
## Class :character 1st Qu.: 0.0000 1st Qu.: 0.0000
## Mode :character Median : 0.0000 Median : 0.0000
## Mean : 0.0168 Mean : 0.1557
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :583.0000 Max. :1700.0000
## PROPDMG CROPDMG date2
## Min. : 0.00 Min. : 0.000 Min. :1950-01-03 00:00:00
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.:1995-04-20 00:00:00
## Median : 0.00 Median : 0.000 Median :2002-03-18 00:00:00
## Mean : 12.06 Mean : 1.527 Mean :1998-12-27 23:37:48
## 3rd Qu.: 0.50 3rd Qu.: 0.000 3rd Qu.:2007-07-28 00:00:00
## Max. :5000.00 Max. :990.000 Max. :2011-11-30 00:00:00
We calculated the total of fatalities, injuries, property and crop damages per event type.
tot_evtype<-data_1%>%group_by(EVTYPE)%>%
summarise(TotalFatalities= sum(FATALITIES, na.rm = TRUE),
TotalInjuries= sum(INJURIES, na.rm = TRUE),
TotalPropdmg= sum(PROPDMG, na.rm = TRUE),
TotalCropdmg= sum(CROPDMG, na.rm = TRUE))
We selected events with at least one injury or one death and calculated total injuries and total deaths and we calculated summaries and totals for fatalities and injuries
harmfulEvent<- tot_evtype%>% filter(TotalFatalities>0|TotalInjuries>0)
apply(harmfulEvent [,2:3],2,sum,na.rm=TRUE)
## TotalFatalities TotalInjuries
## 15145 140528
summary(harmfulEvent[,2:3])
## TotalFatalities TotalInjuries
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.00 1st Qu.: 0.00
## Median : 2.00 Median : 2.00
## Mean : 68.84 Mean : 638.76
## 3rd Qu.: 10.25 3rd Qu.: 35.25
## Max. :5633.00 Max. :91346.00
The results from the summary show that 10% of events caused 90% of fatalities.
top_fatalities<-harmfulEvent[,1:2]%>% filter(TotalFatalities>quantile(harmfulEvent$TotalFatalities,0.90))
head(arrange(top_fatalities,-TotalFatalities))
## # A tibble: 6 x 2
## EVTYPE TotalFatalities
## <chr> <dbl>
## 1 TORNADO 5633
## 2 EXCESSIVE HEAT 1903
## 3 FLASH FLOOD 978
## 4 HEAT 937
## 5 LIGHTNING 816
## 6 TSTM WIND 504
prop_top_fatalities<-sum(top_fatalities$TotalFatalities)/sum(harmfulEvent$TotalFatalities)
prop_top_fatalities
## [1] 0.9050512
The graph below show the top 10% events with the most fatalities
top_fatal_desc<-arrange(top_fatalities, (TotalFatalities))
par(mar=c(2,10,2,2))
barplot(top_fatal_desc$TotalFatalities, width = 1, horiz = TRUE, names.arg = top_fatal_desc$EVTYPE, las=1, main = "Top 10% Events with the Highest Number of Fatalities, USA, 1950-2011", xlab = "Total Number of Deaths")
top_injuries<-harmfulEvent[,c(1,3)]%>% filter(TotalInjuries>quantile(harmfulEvent$TotalInjuries,0.90))
prop_top_injuries<-sum(top_injuries$TotalInjuries)/sum(harmfulEvent$TotalInjuries)
prop_top_injuries
## [1] 0.9634023
head(arrange(top_injuries, -TotalInjuries))
## # A tibble: 6 x 2
## EVTYPE TotalInjuries
## <chr> <dbl>
## 1 TORNADO 91346
## 2 TSTM WIND 6957
## 3 FLOOD 6789
## 4 EXCESSIVE HEAT 6525
## 5 LIGHTNING 5230
## 6 HEAT 2100
The graph below show that 10%of events that caused 96% of injuries
top_injuries_desc<-arrange(top_injuries, (TotalInjuries))
par(mar=c(2,10,2,2))
barplot(top_injuries_desc$TotalInjuries, width = 1, horiz = TRUE, names.arg = top_injuries_desc$EVTYPE, las=1, main = "Top 10% Events with the Highest Number of Injuries, USA, 1950-2011", xlab = "Total Number of Injuries")
We created a subset of data with events that had at least one property damage or one crop damage. Then we created a variable called total damages by adding crop and property damages. We checked the first few variables, the summary and total of all the damages.
Econ_cnsq<-tot_evtype%>% filter(TotalPropdmg>0|TotalCropdmg>0)
Econ_cnsq$tot_dmg<-Econ_cnsq$TotalPropdmg+Econ_cnsq$TotalCropdmg
head(Econ_cnsq)
## # A tibble: 6 x 6
## EVTYPE TotalFatalities TotalInjuries TotalPropdmg TotalCropdmg tot_dmg
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 " HIGH~ 0 0 200 0 200
## 2 " FLASH ~ 0 0 50 0 50
## 3 " TSTM W~ 0 0 108 0 108
## 4 " TSTM W~ 0 0 8 0 8
## 5 ? 0 0 5 0 5
## 6 AGRICULT~ 0 0 0 28.8 28.8
sum(Econ_cnsq$tot_dmg)
## [1] 12262327
summary(Econ_cnsq$tot_dmg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 10 59 28451 601 3312277
top_Econ_cnsq<-Econ_cnsq[,c(1,6)]%>% filter(tot_dmg>quantile(Econ_cnsq$tot_dmg,0.95))
Prop_top_dmg<-sum(top_Econ_cnsq$tot_dmg)/sum(Econ_cnsq$tot_dmg)
Prop_top_dmg
## [1] 0.9683318
head(arrange(top_Econ_cnsq,-tot_dmg))
## # A tibble: 6 x 2
## EVTYPE tot_dmg
## <chr> <dbl>
## 1 TORNADO 3312277.
## 2 FLASH FLOOD 1599325.
## 3 TSTM WIND 1445168.
## 4 HAIL 1268290.
## 5 FLOOD 1067976.
## 6 THUNDERSTORM WIND 943636.
The results above show that 5% of events caused 97% of the damages. The graph below show the top 5% of events with the most damages.
top_Econ_des<-arrange(top_Econ_cnsq,(tot_dmg))
par(mar=c(2,10,2,2))
barplot(top_Econ_des$tot_dmg, width = 1, horiz = TRUE, names.arg = top_Econ_des$EVTYPE, las=1, main = "Top 5% Events with the Highest Economic Consequences, USA, 1950-2011", xlab = "Total Damages in $")