1. Load required libraries

suppressPackageStartupMessages({
    library(data.table)
    library(dplyr)
    library(ggplot2)})
  1. Read in the data
# Download the compressed storm data once; later runs reuse the local copy.
dataFile <- 'data2.csv.bz2'
if (!file.exists(dataFile)) {
  # mode = 'wb' requests a binary transfer so the archive is written unchanged;
  # the default download method is used so no external curl binary is required.
  download.file('https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2',
                destfile = dataFile,
                mode = 'wb',
                quiet = TRUE)
}
# Read the compressed CSV directly, keeping text columns as character vectors.
raw <- read.csv(dataFile, stringsAsFactors = FALSE)
  1. Inspect the data
# List every column available in the raw table.
colnames(raw)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
  1. Select data
    Since we’re only interested in how event types relate, nation-wide, to health and economic consequences, only the following columns will be selected for further analysis:
# Columns kept for the analysis: event type, casualty counts, and the damage
# figures together with their exponent codes.
selectCol <- c(
  'EVTYPE',
  'FATALITIES', 'INJURIES',
  'PROPDMG', 'PROPDMGEXP',
  'CROPDMG', 'CROPDMGEXP'
)
rawSel <- raw[selectCol]
summary(rawSel)
##     EVTYPE            FATALITIES          INJURIES            PROPDMG       
##  Length:902297      Min.   :  0.0000   Min.   :   0.0000   Min.   :   0.00  
##  Class :character   1st Qu.:  0.0000   1st Qu.:   0.0000   1st Qu.:   0.00  
##  Mode  :character   Median :  0.0000   Median :   0.0000   Median :   0.00  
##                     Mean   :  0.0168   Mean   :   0.1557   Mean   :  12.06  
##                     3rd Qu.:  0.0000   3rd Qu.:   0.0000   3rd Qu.:   0.50  
##                     Max.   :583.0000   Max.   :1700.0000   Max.   :5000.00  
##   PROPDMGEXP           CROPDMG         CROPDMGEXP       
##  Length:902297      Min.   :  0.000   Length:902297     
##  Class :character   1st Qu.:  0.000   Class :character  
##  Mode  :character   Median :  0.000   Mode  :character  
##                     Mean   :  1.527                     
##                     3rd Qu.:  0.000                     
##                     Max.   :990.000
# Total fatalities and injuries per event type.
# Columns are selected by name so the result does not depend on column order,
# and across() replaces the superseded summarise_all().
Q1data <- rawSel[, c('EVTYPE', 'FATALITIES', 'INJURIES')] %>%
  group_by(EVTYPE) %>%
  summarise(across(everything(), sum))
summary(Q1data)
##     EVTYPE            FATALITIES         INJURIES      
##  Length:985         Min.   :   0.00   Min.   :    0.0  
##  Class :character   1st Qu.:   0.00   1st Qu.:    0.0  
##  Mode  :character   Median :   0.00   Median :    0.0  
##                     Mean   :  15.38   Mean   :  142.7  
##                     3rd Qu.:   0.00   3rd Qu.:    0.0  
##                     Max.   :5633.00   Max.   :91346.0
# Rank event types by fatalities and by injuries separately.
topFat <- Q1data[order(Q1data$FATALITIES, decreasing = TRUE), ]
topInj <- Q1data[order(Q1data$INJURIES, decreasing = TRUE), ]
# Combined casualty count, computed from the named columns rather than from
# column positions, then used for the overall ranking.
Q1data$total <- Q1data$FATALITIES + Q1data$INJURIES
topHealth <- Q1data[order(Q1data$total, decreasing = TRUE), ]

Find types of events that have the greatest economic consequences

The magnitude of each damage value is encoded in the corresponding ‘EXP’ column for property and crop damage

# Show every distinct exponent code used for property damage.
unique(rawSel[['PROPDMGEXP']])
##  [1] "K" "M" ""  "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-" "1" "8"

Note that digits, symbols, and lower- and upper-case letters are all mixed together (all stored as character strings)

# Decode damage exponent codes into powers of ten.
#
# expType: a vector (or single value) of exponent codes. Letters h/k/m/b in
#          either case map to 2/3/6/9; values that parse as numbers are
#          used as the exponent directly; anything else (including '', '+',
#          '-', '?' and NA) maps to 0.
# Returns a numeric vector of exponents with the same length as expType.
#
# The function is vectorised, so it can be applied to a whole column at once
# as well as to a single code.
getVal <- function(expType) {
  # Factors must be decoded by label, not by their internal integer codes.
  if (is.factor(expType)) {
    expType <- as.character(expType)
  }

  # Numeric codes: non-numeric strings produce NA (warning silenced on purpose)
  # and fall back to an exponent of 0.
  exponent <- suppressWarnings(as.numeric(expType))
  exponent[is.na(exponent)] <- 0

  # Letter codes, matched case-insensitively.
  suffix <- c('H', 'K', 'M', 'B')
  power <- c(2, 3, 6, 9)
  hit <- match(toupper(expType), suffix)
  known <- !is.na(hit)
  exponent[known] <- power[hit[known]]

  exponent
}
# Quick check of the decoder: a letter code, a number, and an unknown symbol.
10^c(getVal('h'), getVal(4), getVal('B'), getVal('?'))
## [1] 1e+02 1e+04 1e+09 1e+00
# Compute the actual damage amounts: value * 10^exponent, where the exponent
# is decoded from the matching *EXP column by getVal().
# vapply() applies getVal() to each code, which avoids per-row rowwise()
# evaluation and leaves the result as an ordinary (ungrouped) tibble.
Q2data <- rawSel[, c('EVTYPE', 'PROPDMG', 'PROPDMGEXP', 'CROPDMG', 'CROPDMGEXP')] %>%
  as_tibble() %>%
  mutate(PROP = PROPDMG * 10^vapply(PROPDMGEXP, getVal, numeric(1), USE.NAMES = FALSE),
         CROP = CROPDMG * 10^vapply(CROPDMGEXP, getVal, numeric(1), USE.NAMES = FALSE))
head(Q2data)
## # A tibble: 6 × 7
## # Rowwise: 
##   EVTYPE  PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP  PROP  CROP
##   <chr>     <dbl> <chr>        <dbl> <chr>      <dbl> <dbl>
## 1 TORNADO    25   K                0 ""         25000     0
## 2 TORNADO     2.5 K                0 ""          2500     0
## 3 TORNADO    25   K                0 ""         25000     0
## 4 TORNADO     2.5 K                0 ""          2500     0
## 5 TORNADO     2.5 K                0 ""          2500     0
## 6 TORNADO     2.5 K                0 ""          2500     0
# Total property and crop damage per event type.
# Columns are selected by name so the result does not depend on column order,
# and across() replaces the superseded summarise_all().
Q2dataSum <- Q2data[, c('EVTYPE', 'PROP', 'CROP')] %>%
  group_by(EVTYPE) %>%
  summarise(across(everything(), sum))
summary(Q2dataSum)
##     EVTYPE               PROP                CROP          
##  Length:985         Min.   :0.000e+00   Min.   :0.000e+00  
##  Class :character   1st Qu.:0.000e+00   1st Qu.:0.000e+00  
##  Mode  :character   Median :0.000e+00   Median :0.000e+00  
##                     Mean   :4.347e+08   Mean   :4.985e+07  
##                     3rd Qu.:5.105e+04   3rd Qu.:0.000e+00  
##                     Max.   :1.447e+11   Max.   :1.397e+10
# Rank event types by property damage and by crop damage separately.
topProp <- Q2dataSum[order(Q2dataSum$PROP, decreasing = TRUE), ]
topCrop <- Q2dataSum[order(Q2dataSum$CROP, decreasing = TRUE), ]
# Combined damage, computed from the named columns rather than from column
# positions, then used for the overall ranking.
Q2dataSum$total <- Q2dataSum$PROP + Q2dataSum$CROP
topEcon <- Q2dataSum[order(Q2dataSum$total, decreasing = TRUE), ]

Results

Find types of events that are most harmful with respect to population health

Ranked separately by fatalities and by injuries, the top 10 event types that cause the most

  • fatalities
# First 10 rows of the fatality ranking; head() returns at most 10 rows and
# does not assume the table has that many.
head(topFat, 10)
## # A tibble: 10 × 3
##    EVTYPE         FATALITIES INJURIES
##    <chr>               <dbl>    <dbl>
##  1 TORNADO              5633    91346
##  2 EXCESSIVE HEAT       1903     6525
##  3 FLASH FLOOD           978     1777
##  4 HEAT                  937     2100
##  5 LIGHTNING             816     5230
##  6 TSTM WIND             504     6957
##  7 FLOOD                 470     6789
##  8 RIP CURRENT           368      232
##  9 HIGH WIND             248     1137
## 10 AVALANCHE             224      170
  • injuries
# First 10 rows of the injury ranking; head() returns at most 10 rows and
# does not assume the table has that many.
head(topInj, 10)
## # A tibble: 10 × 3
##    EVTYPE            FATALITIES INJURIES
##    <chr>                  <dbl>    <dbl>
##  1 TORNADO                 5633    91346
##  2 TSTM WIND                504     6957
##  3 FLOOD                    470     6789
##  4 EXCESSIVE HEAT          1903     6525
##  5 LIGHTNING                816     5230
##  6 HEAT                     937     2100
##  7 ICE STORM                 89     1975
##  8 FLASH FLOOD              978     1777
##  9 THUNDERSTORM WIND        133     1488
## 10 HAIL                      15     1361

When fatalities and injuries are added together and ranked by the total, the top 10 event types that cause the most harm to population health are as follows

# First 10 rows of the combined casualty ranking; head() returns at most 10
# rows and does not assume the table has that many.
head(topHealth, 10)
## # A tibble: 10 × 4
##    EVTYPE            FATALITIES INJURIES total
##    <chr>                  <dbl>    <dbl> <dbl>
##  1 TORNADO                 5633    91346 96979
##  2 EXCESSIVE HEAT          1903     6525  8428
##  3 TSTM WIND                504     6957  7461
##  4 FLOOD                    470     6789  7259
##  5 LIGHTNING                816     5230  6046
##  6 HEAT                     937     2100  3037
##  7 FLASH FLOOD              978     1777  2755
##  8 ICE STORM                 89     1975  2064
##  9 THUNDERSTORM WIND        133     1488  1621
## 10 WINTER STORM             206     1321  1527

The following figure depicts the top 10 event types that cause population health hazards (sum of fatalities and injuries)

# Horizontal bar chart of the 10 event types with the most casualties.
# reorder() sorts the bars by total; without it the event types would be
# drawn in alphabetical order.
ggplot(topHealth[1:10, ], aes(x = reorder(EVTYPE, total), y = total)) +
  geom_col() +
  coord_flip() +
  labs(x = 'Event type',
       y = 'Total injuries and fatalities',
       title = 'Top 10 weather events that causes population health hazards') +
  theme_classic()

As the figure shows, tornadoes are by far the most dangerous weather event in terms of injuries and fatalities

Find types of events that have the greatest economic consequences

Ranked separately by property damage and by crop damage, the top 10 event types that cause the most

  • property damage
# First 10 rows of the property damage ranking; head() returns at most 10
# rows and does not assume the table has that many.
head(topProp, 10)
## # A tibble: 10 × 3
##    EVTYPE                     PROP       CROP
##    <chr>                     <dbl>      <dbl>
##  1 FLOOD             144657709807  5661968450
##  2 HURRICANE/TYPHOON  69305840000  2607872800
##  3 TORNADO            56947380676.  414953270
##  4 STORM SURGE        43323536000        5000
##  5 FLASH FLOOD        16822673978. 1421317100
##  6 HAIL               15735267513. 3025954473
##  7 HURRICANE          11868319010  2741910000
##  8 TROPICAL STORM      7703890550   678346000
##  9 WINTER STORM        6688497251    26944000
## 10 HIGH WIND           5270046295   638571300
  • crop damage
# First 10 rows of the crop damage ranking; head() returns at most 10 rows
# and does not assume the table has that many.
head(topCrop, 10)
## # A tibble: 10 × 3
##    EVTYPE                     PROP        CROP
##    <chr>                     <dbl>       <dbl>
##  1 DROUGHT             1046106000  13972566000
##  2 FLOOD             144657709807   5661968450
##  3 RIVER FLOOD         5118945500   5029459000
##  4 ICE STORM           3944927860   5022113500
##  5 HAIL               15735267513.  3025954473
##  6 HURRICANE          11868319010   2741910000
##  7 HURRICANE/TYPHOON  69305840000   2607872800
##  8 FLASH FLOOD        16822673978.  1421317100
##  9 EXTREME COLD          67737400   1292973000
## 10 FROST/FREEZE           9480000   1094086000

When the property and crop damage amounts are combined and ranked by the total, the top 10 event types that cause the most economic damage are

# Horizontal bar chart of the 10 event types with the largest combined damage,
# drawn on a log10 axis.
# reorder() sorts the bars by total; without it the event types would be
# drawn in alphabetical order.
ggplot(topEcon[1:10, ], aes(x = reorder(EVTYPE, total), y = total)) +
  geom_col() +
  coord_flip() +
  scale_y_log10() +
  labs(x = 'Event type',
       y = 'Total property and crop damages (log10)',
       title = 'Top 10 weather events that causes economic hazards') +
  theme_classic()

As the figure shows, floods cause by far the largest combined property and crop damage. It should be noted that property damage is considerably larger than crop damage, as shown in the summary below: the mean and maximum values for property loss are roughly 10 times those for crops.

# Summarise the per-event damage columns, leaving out the combined total.
summary(topEcon[, c('EVTYPE', 'PROP', 'CROP')])
##     EVTYPE               PROP                CROP          
##  Length:985         Min.   :0.000e+00   Min.   :0.000e+00  
##  Class :character   1st Qu.:0.000e+00   1st Qu.:0.000e+00  
##  Mode  :character   Median :0.000e+00   Median :0.000e+00  
##                     Mean   :4.347e+08   Mean   :4.985e+07  
##                     3rd Qu.:5.105e+04   3rd Qu.:0.000e+00  
##                     Max.   :1.447e+11   Max.   :1.397e+10