Synopsis

This study was written based on the data collect from the U.S. National Oceanic and Atmospheric Administration’s (NOAA). As per our analysis we can conclude that:

  1. The injuries and fatalities caused by Tornado is the most harmful event types according our analyses, causing close to 100k of health damages including injuries and fatalities.

  2. The highest property damages is represented by FLOOD with more than USD 1500B in damages, and represents close to 30% of the total loose.

Data processing

Getting Data

Loading the data

temp <- tempfile()
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", temp)
weather <- read.csv(temp)
unlink(temp)
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf", "curr.pdf", method = "curl")
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2FNCDC%20Storm%20Events-FAQ%20Page.pdf", method = "curl", "Events-FAQ20Page.pdf")

str(weather)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

Cleaning Data

Selecting only the data needed

weather <- weather %>%
        select(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
dim(weather)
## [1] 902297      7
head(weather)
##    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO          0       15    25.0          K       0           
## 2 TORNADO          0        0     2.5          K       0           
## 3 TORNADO          0        2    25.0          K       0           
## 4 TORNADO          0        2     2.5          K       0           
## 5 TORNADO          0        2     2.5          K       0           
## 6 TORNADO          0        6     2.5          K       0

checking if there are some NA values.

table(is.na(weather))
## 
##   FALSE 
## 6316079

There are not any NA values, we can continue

According to NATIONAL WEATHER SERVICE INSTRUCTION 10-1605 there are 48 event types. Wee need to check if our data is the same

len <- length(unique(weather$EVTYPE))
print(len)
## [1] 0

There are 0 data types, maybe we need to clean up if our questions were not answered correctly.

Now, we need to work in the amounts of damages.

unique(str_to_upper(weather$PROPDMGEXP))
##  [1] "K" "M" ""  "B" "+" "0" "5" "6" "?" "4" "2" "3" "H" "7" "-" "1" "8"
unique(str_to_upper(weather$CROPDMGEXP))
## [1] ""  "M" "K" "B" "?" "0" "2"

As we can see, the types into CROPDMGEXP is included into PROPDMGEXP

# convert to upper letters
weather$PROPDMGEXP <- str_to_upper(weather$PROPDMGEXP)
weather$CROPDMGEXP <- str_to_upper(weather$CROPDMGEXP)

#creating a new dataframe
expn <- weather %>%
        select(PROPDMGEXP) %>%
        unique() %>%
        arrange(PROPDMGEXP) %>%
        mutate(EXP = c(rep(1, 5), rep(10, 9)^seq(1, 9), 100, 1000, 1000000))
# Checking Data types and values        
str(expn)        
## 'data.frame':    17 obs. of  2 variables:
##  $ PROPDMGEXP: chr  "" "-" "?" "+" ...
##  $ EXP       : num  1e+00 1e+00 1e+00 1e+00 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 ...
print(expn)
##    PROPDMGEXP   EXP
## 1             1e+00
## 2           - 1e+00
## 3           ? 1e+00
## 4           + 1e+00
## 5           0 1e+00
## 6           1 1e+01
## 7           2 1e+02
## 8           3 1e+03
## 9           4 1e+04
## 10          5 1e+05
## 11          6 1e+06
## 12          7 1e+07
## 13          8 1e+08
## 14          B 1e+09
## 15          H 1e+02
## 16          K 1e+03
## 17          M 1e+06
weather <- weather %>%
        merge(expn, by.x = "PROPDMGEXP", by.y = "PROPDMGEXP", all.x = TRUE) %>%
        rename(PEXP = EXP) %>%
        merge(expn, by.x = "CROPDMGEXP", by.y = "PROPDMGEXP", all.x = TRUE) %>%
        rename(CEXP = EXP) %>%
        mutate(PROPDMGMount = PROPDMG * PEXP, CROPDMGMount = CROPDMG * CEXP)
head(weather)
##   CROPDMGEXP PROPDMGEXP    EVTYPE FATALITIES INJURIES PROPDMG CROPDMG PEXP CEXP
## 1                       TSTM WIND          0        0       0       0    1    1
## 2                       TSTM WIND          0        0       0       0    1    1
## 3                       TSTM WIND          0        0       0       0    1    1
## 4                       TSTM WIND          0        0       0       0    1    1
## 5                       TSTM WIND          0        0       0       0    1    1
## 6                       TSTM WIND          0        0       0       0    1    1
##   PROPDMGMount CROPDMGMount
## 1            0            0
## 2            0            0
## 3            0            0
## 4            0            0
## 5            0            0
## 6            0            0

Across the United States, which types of events are most harmful with respect to population health?

Summarizing all the event types in descending order to identify how long is the data and what values are highest.

harmful <- weather %>%
        group_by(EVTYPE) %>%
        summarise(injuries = sum(INJURIES), fatalities = sum(FATALITIES)) %>%
        arrange(desc(injuries)) %>%
        print()
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 985 x 3
##    EVTYPE            injuries fatalities
##    <chr>                <dbl>      <dbl>
##  1 TORNADO              91346       5633
##  2 TSTM WIND             6957        504
##  3 FLOOD                 6789        470
##  4 EXCESSIVE HEAT        6525       1903
##  5 LIGHTNING             5230        816
##  6 HEAT                  2100        937
##  7 ICE STORM             1975         89
##  8 FLASH FLOOD           1777        978
##  9 THUNDERSTORM WIND     1488        133
## 10 HAIL                  1361         15
## # ... with 975 more rows

Summarizing in OTHERS the less injuries from position 10 to the end of the list.

othersh <- harmful %>%
        filter(between(row_number(), 10, n())) %>%
        summarise(EVTYPE = "OTHERS(976 EVTYPES)", injuries = sum(injuries), fatalities = sum(fatalities)) %>%
        print()
## # A tibble: 1 x 3
##   EVTYPE              injuries fatalities
##   <chr>                  <dbl>      <dbl>
## 1 OTHERS(976 EVTYPES)    16341       3682

Summarizing the tables in only 10 rows in order to identify the most harmful.

harmful <- harmful %>%
        filter(between(row_number(), 1, 9)) %>%
        full_join(othersh) %>%
        mutate(total = injuries + fatalities) %>%
        print()
## Joining, by = c("EVTYPE", "injuries", "fatalities")
## # A tibble: 10 x 4
##    EVTYPE              injuries fatalities total
##    <chr>                  <dbl>      <dbl> <dbl>
##  1 TORNADO                91346       5633 96979
##  2 TSTM WIND               6957        504  7461
##  3 FLOOD                   6789        470  7259
##  4 EXCESSIVE HEAT          6525       1903  8428
##  5 LIGHTNING               5230        816  6046
##  6 HEAT                    2100        937  3037
##  7 ICE STORM               1975         89  2064
##  8 FLASH FLOOD             1777        978  2755
##  9 THUNDERSTORM WIND       1488        133  1621
## 10 OTHERS(976 EVTYPES)    16341       3682 20023
f <- ggplot(harmful, aes(injuries, EVTYPE))
print(f + geom_bar(stat = "identity", fill = "blue") + labs(title = "Injuries Accross United states", y = "Event Type", x = "Injuries"))

f <- ggplot(harmful, aes(fatalities, EVTYPE))
print(f + geom_bar(stat = "identity", fill = "blue") + labs(title = "Fatalities Accross United states", y = "Event Type", x = "Fatalities"))

Results

According the table and plot the most harmful is TORNADO with 91346 injuries and 5633 fatalities.

Across the United States, which types of events have the greatest economic consequences?

Summarizing all the event types in descending order to identify which event type is the highest.

dmg <- weather %>%
        group_by(EVTYPE) %>%
        summarise(damages = sum(PROPDMGMount) + sum(CROPDMGMount)) %>%
        arrange(desc(damages)) %>%
        print()
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 985 x 2
##    EVTYPE                  damages
##    <chr>                     <dbl>
##  1 FLOOD             150319678257 
##  2 HURRICANE/TYPHOON  71913712800 
##  3 TORNADO            57362333946.
##  4 STORM SURGE        43323541000 
##  5 HAIL               18761221986.
##  6 FLASH FLOOD        18243991078.
##  7 DROUGHT            15018672000 
##  8 HURRICANE          14610229010 
##  9 RIVER FLOOD        10148404500 
## 10 ICE STORM           8967041360 
## # ... with 975 more rows

Summarizing in OTHERS the less damages from position 10 to the end of the list.

othersd <- dmg %>%
        filter(between(row_number(), 10, n())) %>%
        summarise(EVTYPE = "OTHERS(976 EVTYPES)", damages = sum(damages)) %>%
        print()
## # A tibble: 1 x 2
##   EVTYPE               damages
##   <chr>                  <dbl>
## 1 OTHERS(976 EVTYPES) 1076299.

Summarizing the tables in only 10 rows in order to identify the most harmful.

dmg <- dmg %>%
        filter(between(row_number(), 1, 9)) %>%
        full_join(othersd) %>%
        mutate(dmgM = damages / 100000000) %>%
        mutate(percent = paste(format(100 * damages/sum(damages), scientific = FALSE), "%", sep = "")) %>%
        print()
## Joining, by = c("EVTYPE", "damages")
## # A tibble: 10 x 4
##    EVTYPE                    damages      dmgM percent         
##    <chr>                       <dbl>     <dbl> <chr>           
##  1 FLOOD               150319678257  1503.     "37.6078564780%"
##  2 HURRICANE/TYPHOON    71913712800   719.     "17.9917933643%"
##  3 TORNADO              57362333946.  574.     "14.3512442770%"
##  4 STORM SURGE          43323541000   433.     "10.8389369305%"
##  5 HAIL                 18761221986.  188.     " 4.6937922697%"
##  6 FLASH FLOOD          18243991078.  182.     " 4.5643884156%"
##  7 DROUGHT              15018672000   150.     " 3.7574592203%"
##  8 HURRICANE            14610229010   146.     " 3.6552725637%"
##  9 RIVER FLOOD          10148404500   101.     " 2.5389872061%"
## 10 OTHERS(976 EVTYPES)      1076299.    0.0108 " 0.0002692748%"
f <- ggplot(dmg, aes(dmgM, EVTYPE))
print(f + geom_bar(stat = "identity", fill = "blue") + labs(title = "Damages Accross United states", y = "Event Type", x = "Damages (B USD)"))

Results

According the table and plot the most harmful is FLOOD with 150319678257 USD in damages.