library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, last

1. Summary

This is an analysis of “Storm Data”" database. Firstly, I’ve recovered the raw information from the web and I’ve named the data frame as “STORM”. Secondly, I’ve cleaned the variable EVTYPE and I’ve created a smaller dataframe “Stormsummary” grouping by EVTYPE. The columns are: Event type(EVTYPE), Injuries (INJURIES) ,Fatalities (FATALITIES), Crop Damage (CROPDMG), Property damage (PROPDMG). Later, I’ve created some table reports and graphics for the exploratory analysis. The main conclusions are that Tornado is the main category for fatalities and injuries. Additionally, Tornados represent the main the highest cost for property damage and Hail represent the highest cost for crop damage.

2. Data processing

2.1. Extracting data

download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", dest="tmp.bz2")
           
data <- read.csv(bzfile("tmp.bz2"), 
                 header=TRUE,
                 sep=",",
                 stringsAsFactors=FALSE)

I’ve converted the data frame to a data table which have a faster process. Later, I did a general recognition of the information.

storm<-data.table(data)
str(storm)
## Classes 'data.table' and 'data.frame':   902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...
##  - attr(*, ".internal.selfref")=<externalptr>
summary(storm)
##     STATE__       BGN_DATE           BGN_TIME          TIME_ZONE        
##  Min.   : 1.0   Length:902297      Length:902297      Length:902297     
##  1st Qu.:19.0   Class :character   Class :character   Class :character  
##  Median :30.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :31.2                                                           
##  3rd Qu.:45.0                                                           
##  Max.   :95.0                                                           
##                                                                         
##      COUNTY       COUNTYNAME           STATE              EVTYPE         
##  Min.   :  0.0   Length:902297      Length:902297      Length:902297     
##  1st Qu.: 31.0   Class :character   Class :character   Class :character  
##  Median : 75.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :100.6                                                           
##  3rd Qu.:131.0                                                           
##  Max.   :873.0                                                           
##                                                                          
##    BGN_RANGE          BGN_AZI           BGN_LOCATI       
##  Min.   :   0.000   Length:902297      Length:902297     
##  1st Qu.:   0.000   Class :character   Class :character  
##  Median :   0.000   Mode  :character   Mode  :character  
##  Mean   :   1.484                                        
##  3rd Qu.:   1.000                                        
##  Max.   :3749.000                                        
##                                                          
##    END_DATE           END_TIME           COUNTY_END COUNTYENDN    
##  Length:902297      Length:902297      Min.   :0    Mode:logical  
##  Class :character   Class :character   1st Qu.:0    NA's:902297   
##  Mode  :character   Mode  :character   Median :0                  
##                                        Mean   :0                  
##                                        3rd Qu.:0                  
##                                        Max.   :0                  
##                                                                   
##    END_RANGE          END_AZI           END_LOCATI       
##  Min.   :  0.0000   Length:902297      Length:902297     
##  1st Qu.:  0.0000   Class :character   Class :character  
##  Median :  0.0000   Mode  :character   Mode  :character  
##  Mean   :  0.9862                                        
##  3rd Qu.:  0.0000                                        
##  Max.   :925.0000                                        
##                                                          
##      LENGTH              WIDTH                F               MAG         
##  Min.   :   0.0000   Min.   :   0.000   Min.   :0.0      Min.   :    0.0  
##  1st Qu.:   0.0000   1st Qu.:   0.000   1st Qu.:0.0      1st Qu.:    0.0  
##  Median :   0.0000   Median :   0.000   Median :1.0      Median :   50.0  
##  Mean   :   0.2301   Mean   :   7.503   Mean   :0.9      Mean   :   46.9  
##  3rd Qu.:   0.0000   3rd Qu.:   0.000   3rd Qu.:1.0      3rd Qu.:   75.0  
##  Max.   :2315.0000   Max.   :4400.000   Max.   :5.0      Max.   :22000.0  
##                                         NA's   :843563                    
##    FATALITIES          INJURIES            PROPDMG       
##  Min.   :  0.0000   Min.   :   0.0000   Min.   :   0.00  
##  1st Qu.:  0.0000   1st Qu.:   0.0000   1st Qu.:   0.00  
##  Median :  0.0000   Median :   0.0000   Median :   0.00  
##  Mean   :  0.0168   Mean   :   0.1557   Mean   :  12.06  
##  3rd Qu.:  0.0000   3rd Qu.:   0.0000   3rd Qu.:   0.50  
##  Max.   :583.0000   Max.   :1700.0000   Max.   :5000.00  
##                                                          
##   PROPDMGEXP           CROPDMG         CROPDMGEXP       
##  Length:902297      Min.   :  0.000   Length:902297     
##  Class :character   1st Qu.:  0.000   Class :character  
##  Mode  :character   Median :  0.000   Mode  :character  
##                     Mean   :  1.527                     
##                     3rd Qu.:  0.000                     
##                     Max.   :990.000                     
##                                                         
##      WFO             STATEOFFIC         ZONENAMES            LATITUDE   
##  Length:902297      Length:902297      Length:902297      Min.   :   0  
##  Class :character   Class :character   Class :character   1st Qu.:2802  
##  Mode  :character   Mode  :character   Mode  :character   Median :3540  
##                                                           Mean   :2875  
##                                                           3rd Qu.:4019  
##                                                           Max.   :9706  
##                                                           NA's   :47    
##    LONGITUDE        LATITUDE_E     LONGITUDE_       REMARKS         
##  Min.   :-14451   Min.   :   0   Min.   :-14455   Length:902297     
##  1st Qu.:  7247   1st Qu.:   0   1st Qu.:     0   Class :character  
##  Median :  8707   Median :   0   Median :     0   Mode  :character  
##  Mean   :  6940   Mean   :1452   Mean   :  3509                     
##  3rd Qu.:  9605   3rd Qu.:3549   3rd Qu.:  8735                     
##  Max.   : 17124   Max.   :9706   Max.   :106220                     
##                   NA's   :40                                        
##      REFNUM      
##  Min.   :     1  
##  1st Qu.:225575  
##  Median :451149  
##  Mean   :451149  
##  3rd Qu.:676723  
##  Max.   :902297  
## 

2.2. Cleaning the data and subsetting a dataframe

After checking the dataframe, we had the objective to analyze which events were the most costly and dangerous.

2.2.1. Cleaning the data

I made some corrections in the column EVTYPE. Mainly, I tried to reduce the categories or unique elements in this variable. Initially it had 985 elements

evtypecheck1<-as.data.frame(unique(storm$EVTYPE))
length(unique(evtypecheck1$`unique(storm$EVTYPE)`))
## [1] 985

After cleaning the data, the categories reduced to 309 elements

storm$EVTYPE<-toupper(storm$EVTYPE) #Converting all variables upper case

storm$EVTYPE <- gsub('.*STORM.*', 'STORM', storm$EVTYPE)
storm$EVTYPE <- gsub('.*FLOOD.*', 'FLOOD', storm$EVTYPE)
storm$EVTYPE <- gsub('.*WIND.*', 'WIND', storm$EVTYPE)
storm$EVTYPE <- gsub('.*TORN.*', 'TORNADO', storm$EVTYPE)
storm$EVTYPE <- gsub('.*HAIL.*', 'HAIL', storm$EVTYPE)
storm$EVTYPE <- gsub('.*HURRICANE.*', 'HURRICANE', storm$EVTYPE)
storm$EVTYPE <- gsub('.*RAIN.*', 'RAIN', storm$EVTYPE)
storm$EVTYPE <- gsub('.*SNOW.*', 'SNOW', storm$EVTYPE)
storm$EVTYPE <- gsub('.*COLD.*', 'COLD', storm$EVTYPE)
storm$EVTYPE <- gsub('.*LOW.*TEMPER.*', 'COLD', storm$EVTYPE)
storm$EVTYPE <- gsub('.*FROST.*', 'COLD', storm$EVTYPE)
storm$EVTYPE <- gsub('.*HIGH.*TEMPER.*', 'HEAT', storm$EVTYPE)
storm$EVTYPE <- gsub('.*HEAT.*', 'HEAT', storm$EVTYPE)
storm$EVTYPE <- gsub('.*FIRE.*', 'FIRE', storm$EVTYPE)
storm$EVTYPE <- gsub('AVALANCE', 'AVALANCHE', storm$EVTYPE)
storm$EVTYPE <- gsub('BEACH EROSIN', 'BEACH EROSION', storm$EVTYPE)
storm$EVTYPE <- gsub('FUNNEL CLOUD.*', 'FUNNEL CLOUD', storm$EVTYPE)
storm$EVTYPE <- gsub('HEAVY PRECIP.*', 'HEAVY PRECIPITATION', storm$EVTYPE)


evtypecheck2<-as.data.frame(unique(storm$EVTYPE))

length(unique(evtypecheck2$`unique(storm$EVTYPE)`))
## [1] 312
2.2.2. Subsetting a dataframe
zz<-group_by(storm, EVTYPE)
stormsummary<-summarise(zz, allfatalities=sum(FATALITIES), allinjuries=sum(INJURIES), 
                        allpropdmg=sum(PROPDMG), allcropdmg=sum(CROPDMG))

3. Results: Data analysis with graphics and tables

To simplify the analysis, I categorized the 10 main categories in the variable EVTYPE.

In that sense, I’ve created four additional dataframes

#Fatalities and injuries
mainfatalities<-arrange(stormsummary, desc(allfatalities))[1:10 ,]
maininjuries<-arrange(stormsummary, desc(allinjuries))[1:10,]

#Property and crop damage
mainpropdamage<-arrange(stormsummary, desc(allpropdmg))[1:10,]
maincropdamage<-arrange(stormsummary, desc(allcropdmg))[1:10,]

I’ve prepared an analysis comparing injuries and damages in terms of occurrence and the economic cost of crop and property damages.

par(mfrow=c(1,2), mar=c(7,3,1,1))

First, let’s see the injuries and damages graph and their respective tables

barplot(mainfatalities$allfatalities, names.arg = mainfatalities$EVTYPE, las=2, cex.names = 0.7, cex.axis=0.7, main="fatalities") 

select(mainfatalities, EVTYPE, allfatalities)
## Source: local data table [10 x 2]
## 
##          EVTYPE allfatalities
##           (chr)         (dbl)
## 1       TORNADO          5636
## 2          HEAT          3138
## 3         FLOOD          1525
## 4          WIND          1240
## 5     LIGHTNING           816
## 6         STORM           633
## 7   RIP CURRENT           368
## 8     AVALANCHE           225
## 9          COLD           223
## 10 RIP CURRENTS           204
barplot(maininjuries$allinjuries, names.arg = maininjuries$EVTYPE, las=2, cex.names = 0.7, cex.axis=0.7, main="injuries")

select(maininjuries, EVTYPE, allinjuries)
## Source: local data table [10 x 2]
## 
##       EVTYPE allinjuries
##        (chr)       (dbl)
## 1    TORNADO       91407
## 2       HEAT        9224
## 3       WIND        9044
## 4      FLOOD        8602
## 5      STORM        6692
## 6  LIGHTNING        5230
## 7       FIRE        1608
## 8       HAIL        1371
## 9  HURRICANE        1328
## 10      SNOW        1121

Now, let’s see the crop and property cost

barplot(mainpropdamage$allpropdmg, names.arg =mainpropdamage$EVTYPE, las=2, cex.names =0.7, cex.axis = 0.5, main = "Property") 

select(mainpropdamage, EVTYPE, allpropdmg)
## Source: local data table [10 x 2]
## 
##                  EVTYPE allpropdmg
##                   (chr)      (dbl)
## 1               TORNADO 3215748.41
## 2                 FLOOD 2435606.51
## 3                  WIND 1806049.95
## 4                 STORM 1613789.63
## 5                  HAIL  689069.78
## 6             LIGHTNING  603351.78
## 7                  SNOW  150127.16
## 8                  FIRE  125223.29
## 9                  RAIN   57871.71
## 10 URBAN/SML STREAM FLD   26051.94
barplot(maincropdamage$allcropdmg, names.arg = maincropdamage$EVTYPE, las=2, cex.names = 0.7, cex.axis = 0.5, main="Crop")

select(maincropdamage, EVTYPE, allcropdmg)
## Source: local data table [10 x 2]
## 
##       EVTYPE allcropdmg
##        (chr)      (dbl)
## 1       HAIL  581418.36
## 2      FLOOD  364476.73
## 3       WIND  137539.11
## 4    TORNADO  100026.77
## 5      STORM   99430.69
## 6    DROUGHT   33898.62
## 7       COLD   14520.88
## 8       RAIN   12447.30
## 9  HURRICANE   10802.79
## 10      FIRE    9565.74

5. Conclusions

The analysis shows that “Tornado”" is the event that causes the most injuries and fatalities across United States.

In terms of property damage, “Tornado” impplies the maximum cost. On the other hand, “Hail” represents the highest cost for crops.