Data Processing

Loading Data

To load the data, I passed a bzfile() connection to the function read.csv() and stored the result in an object called ‘storm’.

# Load the raw storm data from the bzip2-compressed CSV.
# The bzfile() connection is handed over unopened so that read.csv() opens it
# and closes it again when it has finished reading; a connection that the
# caller opens itself (open = "r") is left open after the call.
storm <- read.csv(bzfile("~/storm/repdata-data-StormData.csv.bz2"),
                  stringsAsFactors = FALSE)

I check the variable names to see which ones I need for my analysis.

colnames(storm)

##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

Selecting Variables

I then select the variables I need. I am going to use the packages dplyr, tidyr and lubridate.

# loading packages
require(tidyr)
require(dplyr)
require(lubridate)

# Keep only the eight columns used in the analysis, then give them
# shorter lower-case names.
stormSubset <- storm %>%
  select(BGN_DATE, EVTYPE, FATALITIES, INJURIES,
         PROPDMG, CROPDMG, PROPDMGEXP, CROPDMGEXP) %>%
  rename(date = BGN_DATE,
         evtype = EVTYPE,
         fatalities = FATALITIES,
         injuries = INJURIES,
         property = PROPDMG,
         crop = CROPDMG,
         exp1 = PROPDMGEXP,
         exp2 = CROPDMGEXP)

Basic Explorations

I run some explorations on my subset

# class of every column in the subset
sapply(stormSubset, class)

##        date      evtype  fatalities    injuries    property        crop 
## "character" "character"   "integer"   "integer"   "numeric"   "numeric" 
##        exp1        exp2 
## "character" "character"

# most frequent values of 'fatalities', largest count first
stormSubset %>%
  count(fatalities, sort = TRUE) %>%
  top_n(n = 20)

## Selecting by n

## Source: local data frame [21 x 2]
## 
##    fatalities      n
## 1           0 895323
## 2           1   5010
## 3           2    996
## 4           3    314
## 5           4    166
## 6           5    114
## 7           6     71
## 8           7     53
## 9           8     33
## 10          9     30
## ..        ...    ...

# most frequent values of 'injuries', largest count first
stormSubset %>%
  count(injuries, sort = TRUE) %>%
  top_n(n = 20)

## Selecting by n

## Source: local data frame [20 x 2]
## 
##    injuries      n
## 1         0 884693
## 2         1   7756
## 3         2   3134
## 4         3   1552
## 5         4    931
## 6         5    709
## 7         6    529
## 8         7    280
## 9        10    271
## 10        8    255
## 11        9    186
## 12       12    181
## 13       15    138
## 14       20    130
## 15       11    109
## 16       13     84
## 17       14     84
## 18       30     67
## 19       25     65
## 20       50     58

# most frequent values of 'property' (damage mantissa), largest count first
stormSubset %>%
  count(property, sort = TRUE) %>%
  top_n(n = 20)

## Selecting by n

## Source: local data frame [20 x 2]
## 
##    property      n
## 1       0.0 663123
## 2       5.0  32655
## 3      10.0  22018
## 4       1.0  19069
## 5       2.0  17872
## 6      25.0  17696
## 7      50.0  13793
## 8       3.0  10745
## 9      20.0   9307
## 10     15.0   8735
## 11    250.0   8476
## 12      2.5   8220
## 13      0.5   6790
## 14    100.0   6269
## 15     30.0   4443
## 16    500.0   3929
## 17      4.0   3420
## 18      8.0   2967
## 19     75.0   2442
## 20     40.0   2412

# codes found in 'exp1' (the property-damage exponent), largest count first
stormSubset %>%
  count(exp1, sort = TRUE) %>%
  top_n(n = 20)

## Selecting by n

## Source: local data frame [19 x 2]
## 
##    exp1      n
## 1       465934
## 2     K 424665
## 3     M  11330
## 4     0    216
## 5     B     40
## 6     5     28
## 7     1     25
## 8     2     13
## 9     ?      8
## 10    m      7
## 11    H      6
## 12    +      5
## 13    7      5
## 14    3      4
## 15    4      4
## 16    6      4
## 17    -      1
## 18    8      1
## 19    h      1

# most frequent values of 'crop' (damage mantissa), largest count first
stormSubset %>%
  count(crop, sort = TRUE) %>%
  top_n(n = 20)

## Selecting by n

## Source: local data frame [20 x 2]
## 
##     crop      n
## 1    0.0 880198
## 2    5.0   4276
## 3   10.0   2381
## 4   50.0   2011
## 5    1.0   1404
## 6  100.0   1237
## 7    2.0   1160
## 8   25.0    846
## 9   20.0    834
## 10 500.0    720
## 11   3.0    655
## 12  15.0    630
## 13   0.5    575
## 14 250.0    515
## 15 200.0    476
## 16  30.0    331
## 17   4.0    301
## 18  75.0    290
## 19 150.0    270
## 20 300.0    249

# codes found in 'exp2' (the crop-damage exponent), largest count first
stormSubset %>%
  count(exp2, sort = TRUE) %>%
  top_n(n = 20)

## Selecting by n

## Source: local data frame [9 x 2]
## 
##   exp2      n
## 1      618413
## 2    K 281832
## 3    M   1994
## 4    k     21
## 5    0     19
## 6    B      9
## 7    ?      7
## 8    2      1
## 9    m      1

# the 20 most frequent event types
stormSubset %>%
  count(evtype, sort = TRUE) %>%
  top_n(n = 20)

## Selecting by n

## Source: local data frame [20 x 2]
## 
##                      evtype      n
## 1                      HAIL 288661
## 2                 TSTM WIND 219940
## 3         THUNDERSTORM WIND  82563
## 4                   TORNADO  60652
## 5               FLASH FLOOD  54277
## 6                     FLOOD  25326
## 7        THUNDERSTORM WINDS  20843
## 8                 HIGH WIND  20212
## 9                 LIGHTNING  15754
## 10               HEAVY SNOW  15708
## 11               HEAVY RAIN  11723
## 12             WINTER STORM  11433
## 13           WINTER WEATHER   7026
## 14             FUNNEL CLOUD   6839
## 15         MARINE TSTM WIND   6175
## 16 MARINE THUNDERSTORM WIND   5812
## 17               WATERSPOUT   3796
## 18              STRONG WIND   3566
## 19     URBAN/SML STREAM FLD   3392
## 20                 WILDFIRE   2761

# the 20 dates with the most recorded events
stormSubset %>%
  count(date, sort = TRUE) %>%
  top_n(n = 20)

## Selecting by n

## Source: local data frame [20 x 2]
## 
##                 date    n
## 1  5/25/2011 0:00:00 1202
## 2  4/27/2011 0:00:00 1193
## 3   6/9/2011 0:00:00 1030
## 4  5/30/2004 0:00:00 1016
## 5   4/4/2011 0:00:00 1009
## 6   4/2/2006 0:00:00  981
## 7   4/7/2006 0:00:00  973
## 8  4/19/2011 0:00:00  970
## 9  5/31/1998 0:00:00  933
## 10 5/22/2011 0:00:00  919
## 11  6/4/2008 0:00:00  900
## 12 5/21/2004 0:00:00  899
## 13 5/26/2011 0:00:00  885
## 14 3/12/2006 0:00:00  883
## 15 6/21/2011 0:00:00  876
## 16  5/6/2003 0:00:00  830
## 17  4/3/2007 0:00:00  801
## 18 5/24/2004 0:00:00  798
## 19 6/10/2008 0:00:00  782
## 20 5/25/2008 0:00:00  779

Basic Data Munging

I need to perform some basic data munging

Each variable forms a column

The categories ‘exp1’ and ‘exp2’ have to be gathered under one variable, with their values stored in another one. First, I need to normalize them. This also requires the package ‘magrittr’.

# loading magrittr
require(magrittr)

# Upper-case the 'm' codes in exp1 and the 'k'/'m' codes in exp2, then stack
# both exponent columns: 'exp' records the source column, 'amount' the code.
stormSubset <- stormSubset %>%
  mutate(exp1 = chartr('m', 'M', exp1),
         exp2 = chartr('km', 'KM', exp2)) %>%
  gather(exp, amount, exp1, exp2)

# quick look at the structure after stacking the exponents
stormSubset %>% glimpse()

## Observations: 1804594
## Variables:
## $ date       (chr) "4/18/1950 0:00:00", "4/18/1950 0:00:00", "2/20/195...
## $ evtype     (chr) "TORNADO", "TORNADO", "TORNADO", "TORNADO", "TORNAD...
## $ fatalities (int) 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 4, 0, ...
## $ injuries   (int) 15, 0, 2, 2, 2, 6, 1, 0, 14, 0, 3, 3, 26, 12, 6, 50...
## $ property   (dbl) 25.0, 2.5, 25.0, 2.5, 2.5, 2.5, 2.5, 2.5, 25.0, 25....
## $ crop       (dbl) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ exp        (fctr) exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, ex...
## $ amount     (chr) "K", "K", "K", "K", "K", "K", "K", "K", "K", "K", "...

# exponent codes present after stacking, largest count first
stormSubset %>%
  count(amount, sort = TRUE) %>%
  top_n(n = 20)

## Source: local data frame [18 x 2]
## 
##    amount       n
## 1         1084347
## 2       K  706518
## 3       M   13332
## 4       0     235
## 5       B      49
## 6       5      28
## 7       1      25
## 8       ?      15
## 9       2      14
## 10      H       6
## 11      +       5
## 12      7       5
## 13      3       4
## 14      4       4
## 15      6       4
## 16      -       1
## 17      8       1
## 18      h       1

# keep only rows whose exponent code is 'K', 'M' or 'B'

stormSubset <- stormSubset %>%
  filter(amount %in% c('K', 'B', 'M'))

# exponent codes left after the filter
stormSubset %>%
  count(amount, sort = TRUE) %>%
  top_n(n = 20)

## Source: local data frame [3 x 2]
## 
##   amount      n
## 1      K 706518
## 2      M  13332
## 3      B     49

# quick look at the structure after filtering the exponents
stormSubset %>% glimpse()

## Observations: 719899
## Variables:
## $ date       (chr) "4/18/1950 0:00:00", "4/18/1950 0:00:00", "2/20/195...
## $ evtype     (chr) "TORNADO", "TORNADO", "TORNADO", "TORNADO", "TORNAD...
## $ fatalities (int) 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 4, 0, ...
## $ injuries   (int) 15, 0, 2, 2, 2, 6, 1, 0, 14, 0, 3, 3, 26, 12, 6, 50...
## $ property   (dbl) 25.0, 2.5, 25.0, 2.5, 2.5, 2.5, 2.5, 2.5, 25.0, 25....
## $ crop       (dbl) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ exp        (fctr) exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, ex...
## $ amount     (chr) "K", "K", "K", "K", "K", "K", "K", "K", "K", "K", "...

Property and crop damage are two categories of one single variable. Let’s gather them together and filter out zero damage costs. I also need to translate the amount characters into their numeric counterparts.

# Reshape 'property' and 'crop' into long form: 'damage' holds the damage
# type and 'cost' its value; zero costs are dropped.
# After the earlier gather each row carries a single exponent ('exp1' came
# from PROPDMGEXP, 'exp2' from CROPDMGEXP), so a row is kept only when its
# exponent belongs to its damage type. Without this condition property values
# are also scaled by the crop exponent and crop values by the property one.
# NOTE(review): the row counts printed further down were produced before this
# condition was added and will be lower once the document is knitted again.
stormSubset %<>%
        gather(damage, cost, property:crop) %>%
        filter(cost > 0,
               (exp == 'exp1' & damage == 'property') |
               (exp == 'exp2' & damage == 'crop'))


# Replace each exponent letter by the multiplier it stands for. The result is
# still text here; it is converted to numeric in the following step.
stormSubset %<>%
        mutate(amount = gsub('K', '1000', amount),
               amount = gsub('M', '1000000', amount),
               amount = gsub('B', '1000000000', amount))

# quick look at the structure after reshaping the damage columns
stormSubset %>% glimpse()

## Observations: 372852
## Variables:
## $ date       (chr) "4/18/1950 0:00:00", "4/18/1950 0:00:00", "2/20/195...
## $ evtype     (chr) "TORNADO", "TORNADO", "TORNADO", "TORNADO", "TORNAD...
## $ fatalities (int) 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 4, 0, 0, ...
## $ injuries   (int) 15, 0, 2, 2, 2, 6, 1, 0, 14, 0, 3, 3, 26, 6, 50, 2,...
## $ exp        (fctr) exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, ex...
## $ amount     (chr) "1000", "1000", "1000", "1000", "1000", "1000", "10...
## $ damage     (fctr) property, property, property, property, property, ...
## $ cost       (dbl) 25.0, 2.5, 25.0, 2.5, 2.5, 2.5, 2.5, 2.5, 25.0, 25....

# convert the multiplier stored in 'amount' from text to a number
stormSubset[["amount"]] <- as.numeric(stormSubset[["amount"]])
class(stormSubset[["amount"]])

## [1] "numeric"

# Scale each cost by its multiplier, express it in billions, and drop the
# multiplier column, which is no longer needed.

stormSubset <- stormSubset %>%
  mutate(cost = (amount * cost) / 1e9) %>%
  select(-amount)

Now gathering ‘fatalities’ and ‘injuries’ under a common variable, and creating a new one to store their values

# Stack 'fatalities' and 'injuries': 'harm' names the kind of harm and
# 'harmImpact' holds the number of people; rows with no victims are dropped.
stormSubset <- stormSubset %>%
  gather(harm, harmImpact, fatalities, injuries) %>%
  filter(harmImpact > 0)

# quick look at the structure after stacking the harm columns
stormSubset %>% glimpse()

## Observations: 18290
## Variables:
## $ date       (chr) "2/13/1952 0:00:00", "2/13/1952 0:00:00", "3/22/195...
## $ evtype     (chr) "TORNADO", "TORNADO", "TORNADO", "TORNADO", "TORNAD...
## $ exp        (fctr) exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, ex...
## $ damage     (fctr) property, property, property, property, property, ...
## $ cost       (dbl) 2.5e-05, 2.5e-04, 2.5e-05, 2.5e-05, 2.5e-03, 2.5e-0...
## $ harm       (fctr) fatalities, fatalities, fatalities, fatalities, fa...
## $ harmImpact (int) 1, 1, 4, 1, 6, 7, 2, 5, 25, 2, 4, 1, 3, 1, 2, 1, 10...

Fixing Variable ‘date’

I now fix the variable date

# Split each date string at the blank so the time-of-day part can be dropped.
dtes <- strsplit(as.character(stormSubset$date), " ")

# Parse the date part once, lower-case the event types, and keep the events
# from 1 January 1990 onwards. The comparison is inclusive so that events
# dated exactly 1/1/1990 are part of the "1990 up" subset.
# vapply() is used so the extracted date part is always a character vector.
stormSubset %<>%
        mutate(date = mdy(vapply(dtes, '[', character(1), 1)),
               evtype = tolower(evtype)) %>%
        filter(date >= mdy('1/1/1990'))


stormSubset %>% glimpse()

## Observations: 12483
## Variables:
## $ date       (time) 1991-03-29, 1992-03-10, 1990-03-14, 1992-10-03, 19...
## $ evtype     (chr) "tornado", "tornado", "tornado", "tornado", "tornad...
## $ exp        (fctr) exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, ex...
## $ damage     (fctr) property, property, property, property, property, ...
## $ cost       (dbl) 0.00025, 0.00025, 0.00025, 0.00250, 0.02500, 0.0025...
## $ harm       (fctr) fatalities, fatalities, fatalities, fatalities, fa...
## $ harmImpact (int) 5, 2, 1, 1, 3, 1, 4, 1, 1, 29, 1, 1, 6, 1, 1, 1, 4,...

Fixing Category Names

Each variable forms a column and each observation forms a row. Now we need to fix some categories within the “evtype” variable. Let’s take a visual approach to the cleaning that is needed, using exploratory graphs

library(ggplot2)
library(gridExtra)

## Loading required package: grid

# Exploratory scatter plots (day of year against event type) used to spot
# badly spelled event names.
# g1: rows with more than 50 victims, one panel per kind of harm.
g1 <- ggplot(subset(stormSubset, harmImpact > 50),
             aes(yday(date), evtype, size = harmImpact)) +
  geom_point() +
  facet_wrap(~harm)

# g2: rows whose cost exceeds 5e-4 (billions), one panel per damage type.
g2 <- ggplot(subset(stormSubset, cost > 5.0e-4),
             aes(yday(date), evtype, size = cost)) +
  geom_point() +
  facet_wrap(~damage)

grid.arrange(g1, g2, ncol=2, main="Searching malformed names")

# the 20 most frequent event types in the reshaped data
stormSubset %>%
  count(evtype, sort = TRUE) %>%
  top_n(n = 20)

## Selecting by n

## Source: local data frame [20 x 2]
## 
##                evtype    n
## 1             tornado 4242
## 2           tstm wind 1468
## 3         flash flood 1128
## 4   thunderstorm wind 1047
## 5               flood  697
## 6           high wind  575
## 7           lightning  469
## 8  thunderstorm winds  369
## 9            wildfire  248
## 10               hail  237
## 11        strong wind  232
## 12       winter storm  177
## 13         heavy snow  139
## 14          ice storm  101
## 15           blizzard   87
## 16   wild/forest fire   87
## 17     tropical storm   86
## 18         heavy rain   84
## 19          hurricane   81
## 20         high winds   73

# list event names containing a run of two or more blanks/tabs
grep('[ \t]{2,}', stormSubset$evtype, value = TRUE, perl = TRUE)

## character(0)

grep('\\t+\\s+$', stormSubset$evtype, value = TRUE, perl = TRUE)

## character(0)

# Collapse the different spellings of thunderstorm-wind events into the single
# category 'thunderstorm wind'. The rules are applied in the order listed.
# Two of the original patterns ended in a stray ')' and therefore only matched
# names containing a literal closing parenthesis; they are written here as
# their comments describe them.
stormSubset$evtype <- local({
        ev <- stormSubset$evtype

        # 'tstm wind' into 'thunderstorm wind'
        ev <- gsub('tstm wind', 'thunderstorm wind', ev)

        # leading blanks removed; anchored at the start so that names such as
        # 'marine thunderstorm wind' are left alone
        ev <- gsub('^ +thunderstorm wind', 'thunderstorm wind', ev)

        # 'thunderstorm wind/<word>' into 'thunderstorm wind'
        ev <- gsub('^thunderstorm wind/\\w+', 'thunderstorm wind', ev)

        # 'thunderstorm winds' into 'thunderstorm wind'
        ev <- gsub('thunderstorm winds', 'thunderstorm wind', ev)

        # whitespace directly after 'thunderstorm wind' removed
        ev <- gsub('^thunderstorm wind\\s+', 'thunderstorm wind', ev,
                   perl = TRUE)

        # 'thunderstorm wind(<word>)' into 'thunderstorm wind'; the closing
        # parenthesis is consumed too so that no 'thunderstorm wind)' is left
        ev <- gsub('^thunderstorm\\s+wind\\(\\w+\\)?', 'thunderstorm wind', ev,
                   perl = TRUE)

        # 'thunderstorm <word>/' prefix into 'thunderstorm wind'
        ev <- gsub('^thunderstorm\\s+\\w+/', 'thunderstorm wind', ev,
                   perl = TRUE)

        ev
})

# Normalise winter-weather, winter-storm, strong-wind and high-wind names.
# Fixes relative to the earlier version: the first rule now does what its
# comment says (it used to repeat the 'winter storms' rule), the replacement
# no longer starts with a blank (which created a separate ' winter storm'
# category), and the strong-wind rule turns the plural into the singular
# instead of the other way round (which produced 'strong windss').
stormSubset$evtype <- local({
        ev <- stormSubset$evtype

        # 'winter weather/mix' into 'winter weather'
        ev <- gsub('winter weather/mix', 'winter weather', ev)

        # 'winter storms' into 'winter storm'
        ev <- gsub('winter storms', 'winter storm', ev)

        # 'strong winds' into 'strong wind'
        ev <- gsub('strong winds', 'strong wind', ev)

        # 'high winds' into 'high wind'
        ev <- gsub('high winds', 'high wind', ev)

        ev
})

# Merge the wildfire spellings into 'wildfire'; the rules run in order.
stormSubset$evtype <- local({
  ev <- stormSubset$evtype
  # 'wild', whitespace, 'fire' plus trailing word characters
  ev <- gsub('wild\\s+fire\\w+', 'wildfire', ev, perl = TRUE)
  # 'wild/<word>'
  ev <- gsub('wild/\\w+', 'wildfire', ev, perl = TRUE)
  # 'wildfire' followed by whitespace and one more word
  ev <- gsub('wildfire\\s+\\w+', 'wildfire', ev, perl = TRUE)
  ev
})


# Names starting with 'coastal <word>' become 'coastal flood'; a second pass
# handles what is left of the form 'coastal <word>/<word>'.
stormSubset$evtype <- local({
  ev <- stormSubset$evtype
  ev <- gsub('^coastal\\s+\\w+', 'coastal flood', ev, perl = TRUE)
  ev <- gsub('^coastal\\s+\\w+/\\w+', 'coastal flood', ev, perl = TRUE)
  ev
})

# Reduce tornado variants to 'tornado'; the rules run in order.
stormSubset$evtype <- local({
  ev <- stormSubset$evtype
  # 'tornadoes, <word>'
  ev <- gsub('^tornadoes,\\s+\\w+', 'tornado', ev, perl = TRUE)
  # 'tornado <word>' -- applied twice, each pass strips one trailing word
  for (pass in seq_len(2)) {
    ev <- gsub('^tornado\\s+\\w+', 'tornado', ev, perl = TRUE)
  }
  # 'tornado, <word>'
  ev <- gsub('^tornado,\\s+\\w+', 'tornado', ev, perl = TRUE)
  ev
})


# Storm-surge, hurricane and heat names; the rules run in order.
stormSubset$evtype <- local({
  ev <- stormSubset$evtype
  # a leading 'storm <word>/' is replaced by 'storm surge'
  ev <- gsub('^storm\\s+\\w+/', 'storm surge', ev, perl = TRUE)
  # a bare 'hurricane' becomes 'hurricane/typhoon'
  ev <- gsub('^hurricane$', 'hurricane/typhoon', ev, perl = TRUE)
  # 'heat <word>' becomes 'heat' -- applied twice, one trailing word per pass
  for (pass in seq_len(2)) {
    ev <- gsub('^heat\\s+\\w+', 'heat', ev, perl = TRUE)
  }
  ev
})

# Normalise the flood and flash-flood spellings. Each rule is a
# (pattern, replacement) pair; they are applied strictly in this order because
# later rules clean up what earlier ones produce.
stormSubset$evtype <- local({
  flood_rules <- list(
    c('flash flooding',              'flash flood'),
    c('(flash flood/|flood/flash)',  'flash flood'),
    c('\\w+flood\\.?',               'flash flood'),
    c('flash flash flood',           'flash flood'),
    c('flash flood flood',           'flash flood'),
    c('flooding',                    'flood')
  )
  ev <- stormSubset$evtype
  for (rule in flood_rules) {
    ev <- gsub(rule[1], rule[2], ev, perl = TRUE)
  }
  ev
})

# Storm-surge and winter names; the rules run in order.
stormSubset$evtype <- local({
  ev <- stormSubset$evtype
  # 'storm surgetide' or 'storm surge' into 'storm surge/tide'
  ev <- gsub('(storm surgetide|storm surge)', 'storm surge/tide', ev,
             perl = TRUE)
  # drop the slash after 'winter weather'
  ev <- gsub('winter weather/', 'winter weather', ev, perl = TRUE)
  # 'winter weathermix' into 'winter weather'
  ev <- gsub('winter weathermix', 'winter weather', ev, perl = TRUE)
  # remove whitespace directly after 'winter storm'
  ev <- gsub('winter storm\\s+', 'winter storm', ev, perl = TRUE)
  # 'winter stormhigh' into 'winter storm'
  ev <- gsub('winter stormhigh', 'winter storm', ev, perl = TRUE)
  ev
})


# Remaining single-rule clean-ups; the rules run in order.
stormSubset$evtype <- local({
  ev <- stormSubset$evtype
  # names ending in 'ice' get ' storm' appended
  ev <- gsub('ice$', 'ice storm', ev, perl = TRUE)
  # 'small hail' into 'hail'
  ev <- gsub('small hail', 'hail', ev, perl = TRUE)
  # 'hurricane <word>' into 'hurricane/typhoon'
  ev <- gsub('^hurricane\\s+\\w+', 'hurricane/typhoon', ev, perl = TRUE)
  # a bare 'fog' into 'dense fog'
  ev <- gsub('^fog$', 'dense fog', ev, perl = TRUE)
  ev
})

Reshaping dataset

# Total victims per event type and kind of harm, keeping totals above 50.
# NOTE(review): the earlier gathers appear to repeat each event once per
# damage type and once per kind of harm, so these sums may count the same
# event more than once -- confirm before relying on the totals.
stormDf <- stormSubset %>%
  group_by(evtype, harm) %>%
  summarise(harmImpact = sum(harmImpact)) %>%
  filter(harmImpact > 50) %>%
  arrange(desc(harmImpact))


# Total cost (billions) per event type and damage type, keeping totals
# above 5e-4.
stormDF <- stormSubset %>%
  group_by(evtype, damage) %>%
  summarise(costs = sum(cost)) %>%
  filter(costs > 5.0e-4) %>%
  arrange(desc(costs))


# Victims and costs per date, event type, kind of harm and damage type,
# keeping only the combinations that pass both thresholds.
dF <- stormSubset %>%
  group_by(date, evtype, harm, damage) %>%
  summarise(victims = sum(harmImpact), costs = sum(cost)) %>%
  filter(victims > 50, costs > 5.0e-4)
        
# how often each event type appears in dF, most frequent first
dF %>%
  count(evtype) %>%
  arrange(desc(n)) %>%
  top_n(n = 20)

## Source: local data frame [20 x 2]
## 
##                   evtype   n
## 1                tornado 143
## 2                  flood  12
## 3            flash flood  10
## 4      thunderstorm wind  10
## 5               wildfire   6
## 6      hurricane/typhoon   5
## 7              high wind   4
## 8              ice storm   4
## 9         tropical storm   4
## 10               tsunami   4
## 11              blizzard   3
## 12                  heat   3
## 13          winter storm   3
## 14             dense fog   2
## 15        excessive heat   2
## 16                  hail   2
## 17            heavy snow   2
## 18 tropical storm gordon   2
## 19          extreme cold   1
## 20        winter weather   1

Results

In general, across the United States, tornadoes have been the most harmful weather event with respect to population health. They have caused many more fatalities and injuries than any other event. Tornadoes are also the type of weather event with the second greatest economic consequences; hurricanes/typhoons rank first.

# g1: total victims per event type, one panel per kind of harm; event types
# are ordered by their number of victims.
g1 <- ggplot(data = stormDf, aes(harmImpact, reorder(evtype, harmImpact))) +
        geom_point() +
        facet_wrap(~harm) +
        ggtitle('Type of Event by Number of Victims') +
        xlab('Number of Victims') + ylab('Type of Event')


# g2: total cost per event type, one panel per damage type; event types are
# ordered by cost. The x axis shows 'costs', so it is labelled as costs (it
# used to read 'day of the year'). The size legend label was dropped because
# this plot maps nothing to size.
g2 <- ggplot(data = stormDF, aes(costs, reorder(evtype, costs))) +
        geom_point() +
        facet_wrap(~damage) +
        ggtitle('Type of Event by Damage (billions of dollars)') +
        xlab('Costs (billions of dollars)') + ylab('Type of Event')


# stack both plots in one column under a common title
grid.arrange(g1, g2, ncol=1, main="America. Weather Events")

Finally, while other weather events seem to be associated with certain seasons of the year, tornadoes occur in almost any of them:

# g3: day of the year against event type, point size = number of victims,
# with one row of panels per kind of harm and one column per damage type.
g3 <- ggplot(dF, aes(x = yday(date), y = evtype, size = victims)) +
  geom_point() +
  facet_grid(harm ~ damage, scales = 'free_y') +
  labs(title = 'Event type by date and harm impact',
       x = 'day of the year',
       y = 'type of event',
       size = 'Number of\nVictims')

# g4: same layout, point size = cost in billions of dollars.
g4 <- ggplot(dF, aes(x = yday(date), y = evtype, size = costs)) +
  geom_point() +
  facet_grid(harm ~ damage, scales = 'free_y') +
  labs(title = 'Event type by date and damage (in billions of dollars)',
       x = 'day of the year',
       y = 'type of event',
       size = 'Costs\n(billions of Dollars)')


grid.arrange(g3, g4, ncol=1, main="Weather Events")

NOAA Storm Database. Some Explorations and Analysis

Beatriz Valdez

Sunday, May 17, 2015

Synopsis