Synopsis

We carry out a statistical analysis of natural disasters recorded in “repdata_data_StormData.csv”, a dataset of events that have occurred in the USA from 1950 to the present. Our main purpose is to identify the events with the worst impact not only on human health but also on the local economy. We use only a few of the 37 columns that comprise the dataset collected by the National Weather Office. The analysis uses all records in the date range to compose an indicator of the strength of the impact on human beings (fatalities and injuries) as well as on the economy (property damage and crop damage). This report should be considered a first attempt at evaluating the full scale of natural events. Further work should go in the direction of standardizing the event types.

Data Processing

We first load the data and examine its structure and dimensions.

# Read the raw storm-event CSV and inspect its column types and dimensions.
# The workspace is deliberately not cleared first: wiping the global
# environment from inside a script also destroys the caller's own objects.
storm <- read.csv("repdata_data_StormData.csv", header = TRUE)
str(storm)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

Since not all columns are useful for our analysis, we select only the columns that will be needed.

# List every column name available in the raw data frame.
names(storm)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
library(dplyr)

# Keep the event label and the four casualty/damage measures; every other
# column is left out of the working copy.
colnam <- c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "CROPDMG")
storm_tidy <- storm %>% select(all_of(colnam))
head(storm_tidy, n = 10)
##     EVTYPE FATALITIES INJURIES PROPDMG CROPDMG
## 1  TORNADO          0       15    25.0       0
## 2  TORNADO          0        0     2.5       0
## 3  TORNADO          0        2    25.0       0
## 4  TORNADO          0        2     2.5       0
## 5  TORNADO          0        2     2.5       0
## 6  TORNADO          0        6     2.5       0
## 7  TORNADO          0        1     2.5       0
## 8  TORNADO          0        0     2.5       0
## 9  TORNADO          1       14    25.0       0
## 10 TORNADO          0        0    25.0       0

For the analysis we create two dataframes: one that contains the columns that imply harm to human beings by event type, and a second one that contains the columns that signal economic damage by event type. In both dataframes the input columns are normalized with the scale() function before constructing an indicator of the strength of the event type's impact.
First, the dataframe for health harm:

# NOTE(review): attach() puts the columns of storm_tidy on the search path so
# they can be referenced by bare name; no matching detach() is visible here.
attach(storm_tidy)

# Total fatalities and injuries per event type. Event labels are upper-cased
# and trimmed so that spellings differing only in case or surrounding
# whitespace fall into the same group. Columns are taken from storm_tidy
# explicitly instead of being resolved by bare name through the search path,
# where a same-named global variable could silently take precedence.
evtype_hlt <- aggregate(
  x = storm_tidy[c("FATALITIES", "INJURIES")],
  by = list(EVTYPE = trimws(toupper(storm_tidy$EVTYPE))),
  FUN = sum
)
row_evtype <- evtype_hlt$EVTYPE

# Standardise each measure (centre on its mean, divide by its standard
# deviation) so the two can be added into a single impact score.
evtype_hlt_scal <- as.data.frame(
  scale(evtype_hlt[, c("FATALITIES", "INJURIES")])
)
evtype_hlt_scal$impact <- rowSums(evtype_hlt_scal)
rownames(evtype_hlt_scal) <- row_evtype
head(evtype_hlt_scal)
##                       FATALITIES    INJURIES     impact
## ?                    -0.08164964 -0.05102759 -0.1326772
## ABNORMAL WARMTH      -0.08164964 -0.05102759 -0.1326772
## ABNORMALLY DRY       -0.08164964 -0.05102759 -0.1326772
## ABNORMALLY WET       -0.08164964 -0.05102759 -0.1326772
## ACCUMULATED SNOWFALL -0.08164964 -0.05102759 -0.1326772
## AGRICULTURAL FREEZE  -0.08164964 -0.05102759 -0.1326772

Now, the dataframe for economic damage:

# Total property and crop damage per event type. Event labels are upper-cased
# and trimmed so that spellings differing only in case or surrounding
# whitespace fall into the same group. Columns are taken from storm_tidy
# explicitly instead of being resolved by bare name through the search path,
# where a same-named global variable could silently take precedence.
# NOTE(review): PROPDMG and CROPDMG are summed as stored; the PROPDMGEXP and
# CROPDMGEXP columns present in `storm` are not applied. Confirm whether the
# amounts need to be scaled by those columns before summing.
evtype_dmg <- aggregate(
  x = storm_tidy[c("PROPDMG", "CROPDMG")],
  by = list(EVTYPE = trimws(toupper(storm_tidy$EVTYPE))),
  FUN = sum
)
row_evtype <- evtype_dmg$EVTYPE

# Standardise each measure (centre on its mean, divide by its standard
# deviation) so the two can be added into a single impact score.
evtype_dmg_scal <- as.data.frame(
  scale(evtype_dmg[, c("PROPDMG", "CROPDMG")])
)
evtype_dmg_scal$impact <- rowSums(evtype_dmg_scal)
rownames(evtype_dmg_scal) <- row_evtype
head(evtype_dmg_scal)
##                         PROPDMG     CROPDMG     impact
## ?                    -0.0890019 -0.07098853 -0.1599904
## ABNORMAL WARMTH      -0.0890383 -0.07098853 -0.1600268
## ABNORMALLY DRY       -0.0890383 -0.07098853 -0.1600268
## ABNORMALLY WET       -0.0890383 -0.07098853 -0.1600268
## ACCUMULATED SNOWFALL -0.0890383 -0.07098853 -0.1600268
## AGRICULTURAL FREEZE  -0.0890383 -0.06966699 -0.1587053

Results

# Rank event types by health impact, highest first. Base indexing is used so
# the row names, which hold the event-type labels, travel with the rows.
df.plot1 <- evtype_hlt_scal[
  order(evtype_hlt_scal$impact, decreasing = TRUE),
]
{
  # head() returns at most ten rows and never pads with NA rows.
  top_health <- head(df.plot1, 10)

  # barplot() returns the bar midpoints; using them for the axis ticks keeps
  # each label centred under its bar whatever the bar count or spacing.
  # mar/oma are not passed here: they can only be set through par(), so
  # supplying them to barplot() had no effect beyond raising warnings.
  bar_mids <- barplot(top_health$impact, ylim = c(0, 60),
                      main = "Harm on Health", ylab = "Impact", xaxt = "n")

  lablist.x <- rownames(top_health)
  axis(side = 1, at = bar_mids, labels = lablist.x, tick = TRUE, las = 2,
       cex.axis = 0.5)
}
Fig. 1. Health Harm by Event Type

Fig. 1. Health Harm by Event Type

# Rank event types by economic impact, highest first. Base indexing is used
# so the row names, which hold the event-type labels, travel with the rows.
df.plot1 <- evtype_dmg_scal[
  order(evtype_dmg_scal$impact, decreasing = TRUE),
]
# head() returns at most ten rows and never pads with NA rows.
top_damage <- head(df.plot1, 10)

# barplot() returns the bar midpoints; using them for the axis ticks keeps
# each label centred under its bar whatever the bar count or spacing.
# mar/oma are not passed here: they can only be set through par(), so
# supplying them to barplot() had no effect beyond raising warnings.
bar_mids <- barplot(top_damage$impact, ylim = c(0, 35),
                    main = "Economic Impact", ylab = "Impact", xaxt = "n")

lablist.x <- rownames(top_damage)
axis(side = 1, at = bar_mids, labels = lablist.x, tick = TRUE, las = 2,
     cex.axis = 0.5)
Fig. 2. Economic Impact by Event Type

Fig. 2. Economic Impact by Event Type