We carry out a statistical analysis of natural disasters recorded in “repdata_data_StormData.csv”, a dataset of events that have occurred in the USA from 1950 to the present. Our main purpose is to identify the events with the worst impact not only on human health but also on the local economy. We use only a few of the 37 columns that comprise the dataset collected by the National Weather Office. The analysis uses all records in the date range to compose an indicator of the strength of the impact on human beings (fatalities and injuries) as well as on the economy (property damage and crop damage). This report should be considered a first attempt at evaluating the full scale of natural events. Further work should go in the direction of standardizing the event types.
We first load the data and examine its structure and dimensions.
# Read the raw storm-event table from the CSV file in the working directory
# and print its structure (column names, types and the first few values).
# The workspace is intentionally NOT cleared with rm(list = ls()): a rendered
# report already starts from a fresh session, and wiping the global
# environment would destroy the objects of anyone sourcing this code
# interactively.
storm <- read.csv("repdata_data_StormData.csv", header = TRUE)
str(storm)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
Since not all columns are useful for our analysis we select the columns that will be needed.
# List every column name of the raw table so the needed ones can be picked.
names(storm)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
library(dplyr)

# Columns kept for the analysis: the event label, the two human-harm counts
# and the two damage amounts.
# NOTE(review): the raw table also has PROPDMGEXP / CROPDMGEXP columns, which
# look like magnitude suffixes (e.g. "K") for PROPDMG / CROPDMG. They are not
# carried over, so damage is later summed as raw numbers -- confirm that this
# is intended.
colnam <- c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "CROPDMG")
storm_tidy <- storm %>% select(all_of(colnam))

# Preview the first ten rows of the reduced table.
head(storm_tidy, n = 10)
## EVTYPE FATALITIES INJURIES PROPDMG CROPDMG
## 1 TORNADO 0 15 25.0 0
## 2 TORNADO 0 0 2.5 0
## 3 TORNADO 0 2 25.0 0
## 4 TORNADO 0 2 2.5 0
## 5 TORNADO 0 2 2.5 0
## 6 TORNADO 0 6 2.5 0
## 7 TORNADO 0 1 2.5 0
## 8 TORNADO 0 0 2.5 0
## 9 TORNADO 1 14 25.0 0
## 10 TORNADO 0 0 25.0 0
For the analysis we create two data frames: one that contains the columns describing harm to human beings by event type, and a second one that contains the columns describing economic damage by event type. In both data frames the input columns are standardized with the scale() function and then summed to build an indicator of the strength of each event type's impact.
First, the dataframe for health harm:
# The columns of storm_tidy are placed on the search path; this call is kept
# because code further down may refer to the bare column names.
attach(storm_tidy)

# Total fatalities and injuries per event type. Event labels are upper-cased
# and stripped of surrounding whitespace before grouping.
evtype_hlt <- with(
  storm_tidy,
  aggregate(
    x = cbind(FATALITIES, INJURIES),
    by = list(EVTYPE = trimws(toupper(EVTYPE))),
    FUN = sum
  )
)
row_evtype <- evtype_hlt[["EVTYPE"]]

# Standardize both totals (centre and scale each column) and add them up
# into a single "impact" score per event type.
evtype_hlt_scal <- as.data.frame(scale(evtype_hlt[c("FATALITIES", "INJURIES")]))
evtype_hlt_scal$impact <- rowSums(evtype_hlt_scal)

# Use the event type as the row label.
rownames(evtype_hlt_scal) <- row_evtype
head(evtype_hlt_scal)
## FATALITIES INJURIES impact
## ? -0.08164964 -0.05102759 -0.1326772
## ABNORMAL WARMTH -0.08164964 -0.05102759 -0.1326772
## ABNORMALLY DRY -0.08164964 -0.05102759 -0.1326772
## ABNORMALLY WET -0.08164964 -0.05102759 -0.1326772
## ACCUMULATED SNOWFALL -0.08164964 -0.05102759 -0.1326772
## AGRICULTURAL FREEZE -0.08164964 -0.05102759 -0.1326772
Now, the dataframe for economic impact:
# Total property and crop damage per event type. Event labels are upper-cased
# and stripped of surrounding whitespace before grouping.
# The columns are read from storm_tidy explicitly via with(), so this step
# does not depend on an earlier attach() having put them on the search path.
# NOTE(review): PROPDMG / CROPDMG are summed as raw numbers; the raw table's
# PROPDMGEXP / CROPDMGEXP columns (which look like magnitude suffixes such as
# "K") are not applied -- confirm that this is intended.
evtype_dmg <- with(
  storm_tidy,
  aggregate(
    x = cbind(PROPDMG, CROPDMG),
    by = list(EVTYPE = trimws(toupper(EVTYPE))),
    FUN = sum
  )
)
row_evtype <- evtype_dmg[, 1]

# Standardize both totals (centre and scale each column) and add them up
# into a single "impact" score per event type.
evtype_dmg_scal <- as.data.frame(scale(evtype_dmg[, 2:3]))
evtype_dmg_scal$impact <- rowSums(evtype_dmg_scal)

# Use the event type as the row label.
rownames(evtype_dmg_scal) <- row_evtype
head(evtype_dmg_scal)
## PROPDMG CROPDMG impact
## ? -0.0890019 -0.07098853 -0.1599904
## ABNORMAL WARMTH -0.0890383 -0.07098853 -0.1600268
## ABNORMALLY DRY -0.0890383 -0.07098853 -0.1600268
## ABNORMALLY WET -0.0890383 -0.07098853 -0.1600268
## ACCUMULATED SNOWFALL -0.0890383 -0.07098853 -0.1600268
## AGRICULTURAL FREEZE -0.0890383 -0.06966699 -0.1587053
# Rank the event types by health impact score, highest first.
df.plot1 <- arrange(evtype_hlt_scal, desc(impact))
lablist.x <- rownames(df.plot1)[1:10]

# Bar chart of the ten highest scores. The built-in x axis is suppressed
# (xaxt = "n") so that a custom, rotated axis can be drawn below.
# `mar` and `oma` are not passed to barplot(): they can only be set through
# par(), so supplying them here had no effect other than raising
# "not a graphical parameter" warnings.
bar_mids <- barplot(df.plot1$impact[1:10], ylim = c(0, 60),
                    main = "Harm on Health", ylab = "Impact", xaxt = "n")

# barplot() returns the x coordinates of the bar midpoints; using them keeps
# every label centred under its bar without hand-computed tick positions.
axis(side = 1, at = bar_mids, labels = lablist.x, tick = TRUE, las = 2,
     cex.axis = 0.5)
Fig. 1. Health Harm by Event Type
# Rank the event types by economic impact score, highest first.
df.plot1 <- arrange(evtype_dmg_scal, desc(impact))
lablist.x <- rownames(df.plot1)[1:10]

# Bar chart of the ten highest scores. The built-in x axis is suppressed
# (xaxt = "n") so that a custom, rotated axis can be drawn below.
# `mar` and `oma` are not passed to barplot(): they can only be set through
# par(), so supplying them here had no effect other than raising
# "not a graphical parameter" warnings.
bar_mids <- barplot(df.plot1$impact[1:10], ylim = c(0, 35),
                    main = "Economic Impact", ylab = "Impact", xaxt = "n")

# barplot() returns the x coordinates of the bar midpoints; using them keeps
# every label centred under its bar without hand-computed tick positions.
axis(side = 1, at = bar_mids, labels = lablist.x, tick = TRUE, las = 2,
     cex.axis = 0.5)
Fig. 2. Economic Impact by Event Type