Synopsis

The goal of this analysis is to explore the US National Oceanic and Atmospheric Administration’s (NOAA) storm database. Specifically, the analysis answers two questions: which types of events are most harmful to population health in the US, and which types of events have the greatest economic consequences in the US. The approach to each question is shown in detail below. The conclusions are that tornadoes are the most harmful events to population health and that floods have the greatest economic consequences in the US.

Data Processing

library(tidyverse)
# NOTE: this path is specific to the author's machine; point it at the folder
# that holds the extracted storm data before running.
setwd("E:/Coursera Data Scientist/Reproducible Research/Week4 Project")
# Load the raw storm data from the extracted CSV. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
stormData <- read.csv("repdata_data_StormData.csv", header = TRUE, sep = ",")

We first set the working directory to the folder where the data is stored. The data was originally distributed as a compressed csv.bz2 file; after decompressing it, we load it as a csv file with the read.csv() function.

# Report the number of rows and columns in the raw data.
dim(stormData)

## [1] 902297     37

# List the column names to see which variables are available.
colnames(stormData)

##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

By checking the dimension of the dataset, we can see that it has 902297 rows of observations and 37 variables. Since there are 37 variables and many of them are not related to our case of study, we will pick the variables which are relevant to our analysis.

# Keep only the event type plus the health and damage columns.
stormData1 <- select(
  stormData,
  EVTYPE, FATALITIES, INJURIES,
  PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
)

Now, our data after selecting columns has the variables EVTYPE (event type), FATALITIES, INJURIES, PROPDMG (property damage), PROPDMGEXP (the symbol giving the magnitude of the property damage figure), CROPDMG (crop damage), and CROPDMGEXP (the symbol giving the magnitude of the crop damage figure).

Question1

# Total injuries per event type, sorted from highest to lowest, keeping the
# ten largest totals.
injuries_bytype <- aggregate(INJURIES ~ EVTYPE, data = stormData1, FUN = sum) %>%
  arrange(desc(INJURIES)) %>%
  top_n(10, INJURIES)

# Same summary for fatalities.
fatalities_bytype <- aggregate(FATALITIES ~ EVTYPE, data = stormData1, FUN = sum) %>%
  arrange(desc(FATALITIES)) %>%
  top_n(10, FATALITIES)

# Display the event types with the highest injury totals.
injuries_bytype

##               EVTYPE INJURIES
## 1            TORNADO    91346
## 2          TSTM WIND     6957
## 3              FLOOD     6789
## 4     EXCESSIVE HEAT     6525
## 5          LIGHTNING     5230
## 6               HEAT     2100
## 7          ICE STORM     1975
## 8        FLASH FLOOD     1777
## 9  THUNDERSTORM WIND     1488
## 10              HAIL     1361

# Display the event types with the highest fatality totals.
fatalities_bytype

##            EVTYPE FATALITIES
## 1         TORNADO       5633
## 2  EXCESSIVE HEAT       1903
## 3     FLASH FLOOD        978
## 4            HEAT        937
## 5       LIGHTNING        816
## 6       TSTM WIND        504
## 7           FLOOD        470
## 8     RIP CURRENT        368
## 9       HIGH WIND        248
## 10      AVALANCHE        224

The tables above list the 10 event types with the most injuries and the 10 event types with the most fatalities. Bar plots of both are included under the Results section.

Question2

# Count how often each distinct PROPDMGEXP symbol occurs.
table(stormData1$PROPDMGEXP)

## 
##             -      ?      +      0      1      2      3      4      5 
## 465934      1      8      5    216     25     13      4      4     28 
##      6      7      8      B      h      H      K      m      M 
##      4      5      1     40      1      6 424665      7  11330

# Count how often each distinct CROPDMGEXP symbol occurs.
table(stormData1$CROPDMGEXP)

## 
##             ?      0      2      B      k      K      m      M 
## 618413      7     19      1      9     21 281832      1   1994

Here, we see that the symbols H, K, M, B stand for hundreds, thousands, millions, and billions. Then, we will need to convert them to the actual numbers to do calculations.

# Convert the damage-magnitude symbols into numeric multipliers.
# Each symbol is looked up as a whole value (case-insensitively) in a fixed
# table. Anything unrecognised -- blanks, digits, "+", "-", "?" -- maps to 0,
# so those rows contribute no damage. A lookup is used instead of chained
# regex substitutions because a pattern such as "[^(100,1000)]" is a negated
# character class (it matches single characters), not a list of allowed
# values.
exp_multiplier <- c(H = 100, K = 1000, M = 1000000, B = 1000000000)

symbol_to_multiplier <- function(symbols) {
  # as.character() guards against the column being a factor.
  multiplier <- unname(exp_multiplier[toupper(as.character(symbols))])
  # Names that are not in the table (including "") index as NA.
  multiplier[is.na(multiplier)] <- 0
  multiplier
}

stormData1$PROPDMGEXP <- symbol_to_multiplier(stormData1$PROPDMGEXP)
stormData1$CROPDMGEXP <- symbol_to_multiplier(stormData1$CROPDMGEXP)

Now, we can calculate the actual damage amounts by multiplying each damage figure by its multiplier.

# Make the multiplier columns numeric, then derive the damage amounts.
# A missing product is replaced with 0 so the sums below are defined.
stormData1 <- stormData1 %>%
  mutate(
    PROPDMGEXP = as.numeric(PROPDMGEXP),
    CROPDMGEXP = as.numeric(CROPDMGEXP),
    Property_Dmg = replace_na(PROPDMG * PROPDMGEXP, 0),
    Crop_Dmg = replace_na(CROPDMG * CROPDMGEXP, 0)
  )

# Sum property and crop damage per event type and add the two together.
Total_Dmg <- stormData1 %>%
  group_by(EVTYPE) %>%
  summarise(
    Total_Prop_Dmg = sum(Property_Dmg),
    Total_Crop_Dmg = sum(Crop_Dmg)
  ) %>%
  mutate(Total_Econ_Dmg = Total_Prop_Dmg + Total_Crop_Dmg)

# Keep the ten largest combined totals, ordered from highest to lowest.
Top10_Econ_Dmg <- Total_Dmg %>%
  top_n(10, Total_Econ_Dmg) %>%
  arrange(desc(Total_Econ_Dmg))

Top10_Econ_Dmg

## # A tibble: 10 x 4
##    EVTYPE            Total_Prop_Dmg Total_Crop_Dmg Total_Econ_Dmg
##    <fct>                      <dbl>          <dbl>          <dbl>
##  1 FLOOD               144657709800     5661968450   150319678250
##  2 HURRICANE/TYPHOON    69305840000     2607872800    71913712800
##  3 TORNADO              56937160480      414953110    57352113590
##  4 STORM SURGE          43323536000           5000    43323541000
##  5 HAIL                 15732267220     3025954450    18758221670
##  6 FLASH FLOOD          16140811510     1421317100    17562128610
##  7 DROUGHT               1046106000    13972566000    15018672000
##  8 HURRICANE            11868319010     2741910000    14610229010
##  9 RIVER FLOOD           5118945500     5029459000    10148404500
## 10 ICE STORM             3944927810     5022113500     8967041310

Here, we can see the 10 event types with the greatest total economic damage, obtained by adding property damage and crop damage together. The corresponding bar plot is shown under the Results section.

Results

Question1

# Draw the injuries and fatalities bar charts side by side.
# mfrow puts the two panels in one row; the large bottom margin leaves room
# for the vertical (las = 2) event-type labels.
par(mfrow = c(1, 2), mar = c(10, 5, 6, 3))

# Each title is built as one string with a single "\n": a literal line break
# inside the quotes would add a blank line plus the indentation spaces to the
# rendered title.
barplot(injuries_bytype$INJURIES, names.arg = injuries_bytype$EVTYPE, las = 2,
        ylim = c(0, 100000), ylab = "Count", cex.axis = 0.85, cex.names = 0.85,
        main = paste0("Top 10 Events of Most Injuries With Respect\n",
                      "to Population Health Across the US"))

barplot(fatalities_bytype$FATALITIES, names.arg = fatalities_bytype$EVTYPE,
        las = 2, ylim = c(0, 6000), ylab = "Count", cex.axis = 0.85,
        cex.names = 0.85,
        main = paste0("Top 10 Events of Most Fatalities With Respect\n",
                      "to Population Health Across the US"))

The above two graphs show the top 10 types of events that cause the most injuries and the most fatalities across the US. Clearly, tornadoes are the most harmful events to population health, since they cause the most injuries and the most fatalities.

Question2

# Set the axis-label placement (mgp) and the plot margins (mar) in one call,
# then draw the bar chart of total economic damage per event type.
par(mgp = c(4, 1, 0), mar = c(10, 5, 6, 3))
barplot(
  height = Top10_Econ_Dmg$Total_Econ_Dmg,
  names.arg = Top10_Econ_Dmg$EVTYPE,
  las = 2,
  ylab = "Cost of Damages",
  cex.axis = 0.85,
  cex.names = 0.85,
  main = "Top 10 Events With the Greatest \n Economic Consequences Across the US"
)

From the graph, we can see that floods have the greatest economic consequences across the US, since they account for the largest total cost of damages.

Storms and Severe Weather Events Data Exploration and Analysis

Jun Zhang

September 27, 2019

Synopsis

Data Processing

Question1

Question2

Results

Question1

Question2