Explore the NOAA Storm Database

Explore the NOAA Storm Database

Synopsis

This project is going to answer some basic questions about severe weather events. Using data collected by the National Oceanic and Atmospheric Administration (NOAA), we will examine the public health and financial impacts of weather events. These data will quantify the top ten most expensive events (in terms of dollars) and the top ten human casualty events (defined as deaths and injuries.) The NOAA storm database will be downloaded, pre-processed, analyzed and reported using R and will include interpretation of estimates, assumptions and other data sources.

Loading and Processing the Raw Data

From the NOAA Storm Database, we obtain the storm data from 1950 to 2011. After reading in the strom data in the bz2 archive, we download and read the data, then store it in Storm_data.

## loading package
library(ggplot2)
library(dplyr)
## download the storm data, then open and store it
URL <-
        "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
stormfil <- basename(URL)
if (!file.exists(stormfil))
        download.file(URL, stormfil)
file_temp <- bzfile(stormfil)
Storm_data <- read.csv(file_temp, sep = ",", header = TRUE)

Display the data summary.

names(Storm_data)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"     "COUNTYNAME" "STATE"     
##  [8] "EVTYPE"     "BGN_RANGE"  "BGN_AZI"    "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END"
## [15] "COUNTYENDN" "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"      "F"         
## [22] "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"    "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP"
## [29] "WFO"        "STATEOFFIC" "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
head(Storm_data)
##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE  EVTYPE BGN_RANGE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL TORNADO         0
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL TORNADO         0
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL TORNADO         0
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL TORNADO         0
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL TORNADO         0
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL TORNADO         0
##   BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH
## 1                                               0         NA         0                      14.0
## 2                                               0         NA         0                       2.0
## 3                                               0         NA         0                       0.1
## 4                                               0         NA         0                       0.0
## 5                                               0         NA         0                       0.0
## 6                                               0         NA         0                       1.5
##   WIDTH F MAG FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1   100 3   0          0       15    25.0          K       0                                    
## 2   150 2   0          0        0     2.5          K       0                                    
## 3   123 2   0          0        2    25.0          K       0                                    
## 4   100 2   0          0        2     2.5          K       0                                    
## 5   150 2   0          0        2     2.5          K       0                                    
## 6   177 2   0          0        6     2.5          K       0                                    
##   LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1     3040      8812       3051       8806              1
## 2     3042      8755          0          0              2
## 3     3340      8742          0          0              3
## 4     3458      8626          0          0              4
## 5     3412      8642          0          0              5
## 6     3450      8748          0          0              6
str(Storm_data)
## 'data.frame': 902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...
summary(Storm_data)
##     STATE__       BGN_DATE           BGN_TIME          TIME_ZONE             COUNTY     
##  Min.   : 1.0   Length:902297      Length:902297      Length:902297      Min.   :  0.0  
##  1st Qu.:19.0   Class :character   Class :character   Class :character   1st Qu.: 31.0  
##  Median :30.0   Mode  :character   Mode  :character   Mode  :character   Median : 75.0  
##  Mean   :31.2                                                            Mean   :100.6  
##  3rd Qu.:45.0                                                            3rd Qu.:131.0  
##  Max.   :95.0                                                            Max.   :873.0  
##                                                                                         
##   COUNTYNAME           STATE              EVTYPE            BGN_RANGE          BGN_AZI         
##  Length:902297      Length:902297      Length:902297      Min.   :   0.000   Length:902297     
##  Class :character   Class :character   Class :character   1st Qu.:   0.000   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Median :   0.000   Mode  :character  
##                                                           Mean   :   1.484                     
##                                                           3rd Qu.:   1.000                     
##                                                           Max.   :3749.000                     
##                                                                                                
##   BGN_LOCATI          END_DATE           END_TIME           COUNTY_END COUNTYENDN    
##  Length:902297      Length:902297      Length:902297      Min.   :0    Mode:logical  
##  Class :character   Class :character   Class :character   1st Qu.:0    NA's:902297   
##  Mode  :character   Mode  :character   Mode  :character   Median :0                  
##                                                           Mean   :0                  
##                                                           3rd Qu.:0                  
##                                                           Max.   :0                  
##                                                                                      
##    END_RANGE          END_AZI           END_LOCATI            LENGTH              WIDTH         
##  Min.   :  0.0000   Length:902297      Length:902297      Min.   :   0.0000   Min.   :   0.000  
##  1st Qu.:  0.0000   Class :character   Class :character   1st Qu.:   0.0000   1st Qu.:   0.000  
##  Median :  0.0000   Mode  :character   Mode  :character   Median :   0.0000   Median :   0.000  
##  Mean   :  0.9862                                         Mean   :   0.2301   Mean   :   7.503  
##  3rd Qu.:  0.0000                                         3rd Qu.:   0.0000   3rd Qu.:   0.000  
##  Max.   :925.0000                                         Max.   :2315.0000   Max.   :4400.000  
##                                                                                                 
##        F               MAG            FATALITIES          INJURIES            PROPDMG       
##  Min.   :0.0      Min.   :    0.0   Min.   :  0.0000   Min.   :   0.0000   Min.   :   0.00  
##  1st Qu.:0.0      1st Qu.:    0.0   1st Qu.:  0.0000   1st Qu.:   0.0000   1st Qu.:   0.00  
##  Median :1.0      Median :   50.0   Median :  0.0000   Median :   0.0000   Median :   0.00  
##  Mean   :0.9      Mean   :   46.9   Mean   :  0.0168   Mean   :   0.1557   Mean   :  12.06  
##  3rd Qu.:1.0      3rd Qu.:   75.0   3rd Qu.:  0.0000   3rd Qu.:   0.0000   3rd Qu.:   0.50  
##  Max.   :5.0      Max.   :22000.0   Max.   :583.0000   Max.   :1700.0000   Max.   :5000.00  
##  NA's   :843563                                                                             
##   PROPDMGEXP           CROPDMG         CROPDMGEXP            WFO             STATEOFFIC       
##  Length:902297      Min.   :  0.000   Length:902297      Length:902297      Length:902297     
##  Class :character   1st Qu.:  0.000   Class :character   Class :character   Class :character  
##  Mode  :character   Median :  0.000   Mode  :character   Mode  :character   Mode  :character  
##                     Mean   :  1.527                                                           
##                     3rd Qu.:  0.000                                                           
##                     Max.   :990.000                                                           
##                                                                                               
##   ZONENAMES            LATITUDE      LONGITUDE        LATITUDE_E     LONGITUDE_    
##  Length:902297      Min.   :   0   Min.   :-14451   Min.   :   0   Min.   :-14455  
##  Class :character   1st Qu.:2802   1st Qu.:  7247   1st Qu.:   0   1st Qu.:     0  
##  Mode  :character   Median :3540   Median :  8707   Median :   0   Median :     0  
##                     Mean   :2875   Mean   :  6940   Mean   :1452   Mean   :  3509  
##                     3rd Qu.:4019   3rd Qu.:  9605   3rd Qu.:3549   3rd Qu.:  8735  
##                     Max.   :9706   Max.   : 17124   Max.   :9706   Max.   :106220  
##                     NA's   :47                      NA's   :40                     
##    REMARKS              REFNUM      
##  Length:902297      Min.   :     1  
##  Class :character   1st Qu.:225575  
##  Mode  :character   Median :451149  
##                     Mean   :451149  
##                     3rd Qu.:676723  
##                     Max.   :902297  
## 

We are going to subset the raw data to focus on the important variables relating to this research. The variables we need are the type of events (including the state and time of the event), damage numbers (including Unit multiplier).

StormData_Clean <-
        subset(
                Storm_data,
                EVTYPE != "?" & (FATALITIES > 0 |
                                         INJURIES > 0 |
                                         PROPDMG > 0 |
                                         CROPDMG > 0),
                select = c(
                        "EVTYPE",
                        "FATALITIES",
                        "INJURIES",
                        "PROPDMG",
                        "PROPDMGEXP",
                        "CROPDMG",
                        "CROPDMGEXP",
                        "BGN_DATE",
                        "END_DATE"
                )
        )
num_obser <- nrow(StormData_Clean)
num_variable <- ncol(StormData_Clean)

Display the summary of subset data.

names(StormData_Clean)
## [1] "EVTYPE"     "FATALITIES" "INJURIES"   "PROPDMG"    "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP"
## [8] "BGN_DATE"   "END_DATE"
head(StormData_Clean)
##    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP           BGN_DATE END_DATE
## 1 TORNADO          0       15    25.0          K       0             4/18/1950 0:00:00         
## 2 TORNADO          0        0     2.5          K       0             4/18/1950 0:00:00         
## 3 TORNADO          0        2    25.0          K       0             2/20/1951 0:00:00         
## 4 TORNADO          0        2     2.5          K       0              6/8/1951 0:00:00         
## 5 TORNADO          0        2     2.5          K       0            11/15/1951 0:00:00         
## 6 TORNADO          0        6     2.5          K       0            11/15/1951 0:00:00
str(StormData_Clean)
## 'data.frame': 254632 obs. of  9 variables:
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ END_DATE  : chr  "" "" "" "" ...
summary(StormData_Clean)
##     EVTYPE            FATALITIES          INJURIES            PROPDMG         PROPDMGEXP       
##  Length:254632      Min.   :  0.0000   Min.   :   0.0000   Min.   :   0.00   Length:254632     
##  Class :character   1st Qu.:  0.0000   1st Qu.:   0.0000   1st Qu.:   2.00   Class :character  
##  Mode  :character   Median :  0.0000   Median :   0.0000   Median :   5.00   Mode  :character  
##                     Mean   :  0.0595   Mean   :   0.5519   Mean   :  42.75                     
##                     3rd Qu.:  0.0000   3rd Qu.:   0.0000   3rd Qu.:  25.00                     
##                     Max.   :583.0000   Max.   :1700.0000   Max.   :5000.00                     
##     CROPDMG         CROPDMGEXP          BGN_DATE           END_DATE        
##  Min.   :  0.000   Length:254632      Length:254632      Length:254632     
##  1st Qu.:  0.000   Class :character   Class :character   Class :character  
##  Median :  0.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :  5.411                                                           
##  3rd Qu.:  0.000                                                           
##  Max.   :990.000

The subset data for analysing contains 254632 observations and 9 variables. Next, we transform the class of BGN_DATE and END_DATE into time format. Then, we create two new variable to identify the year and duration of events.

StormData_Clean$BGN_DATE <-
        as.Date(StormData_Clean$BGN_DATE, format = "%m/%d/%Y")
StormData_Clean$END_DATE <-
        as.Date(StormData_Clean$END_DATE, format = "%m/%d/%Y")
StormData_Clean$YEAR <-
        as.integer(format(StormData_Clean$BGN_DATE, "%Y"))
StormData_Clean$DURATION <-
        as.numeric(StormData_Clean$END_DATE - StormData_Clean$BGN_DATE) / 3600

## modify the name of event type
StormData_Clean$EVTYPE <- toupper(StormData_Clean$EVTYPE)

To get a better overall picture of the data, we want to plot the event number per year.

StormYear_Type <- StormData_Clean %>%
        subset(select = c("EVTYPE", "YEAR")) %>%
        group_by(YEAR) %>%
        summarise(by = length(unique(EVTYPE)))
plot(
        x = StormYear_Type$YEAR,
        y = StormYear_Type$by,
        xlab = "Year",
        ylab = "Numbers of event types",
        main = "Numbers of Event Types per Year"
)

plot of chunk events-year

According to the figure, only one type of event was observed before 1993, and there are too many observed types during 1994-1995.So we need to subset the data again, focusing on the data between 1996 and 2011.

StormData_Clean <- subset(StormData_Clean, YEAR >= 1996)
num_event <- length(unique(StormData_Clean$EVTYPE))

The total number of unique event type is 186.

In order to identify accurate damage number, we need to create a function to calculate it, then store it in new variables. The calculation method is based on the National Weather Service Storm Data Documentation. In addition, we need to identify the damage level of event influencing population health.

## create function to calculate the damage number of economic consequences
damage_num <- function (x) {
        if (x == "")
                return (10 ^ 0)
        
        if (x == "-")
                return (10 ^ 0)
        
        if (x == "?")
                return (10 ^ 0)
        
        if (x == "+")
                return (10 ^ 0)
        
        if (x == "0")
                return (10 ^ 0)
        
        if (x == "1")
                return (10 ^ 1)
        
        if (x == "2")
                return (10 ^ 2)
        
        if (x == "3")
                return (10 ^ 3)
        
        if (x == "4")
                return (10 ^ 4)
        
        if (x == "5")
                return (10 ^ 5)
        
        if (x == "6")
                return (10 ^ 6)
        
        if (x == "7")
                return (10 ^ 7)
        
        if (x == "8")
                return (10 ^ 8)
        
        if (x == "9")
                return (10 ^ 9)
        
        if (x == "H")
                return (10 ^ 2)
        
        if (x == "K")
                return (10 ^ 3)
        
        if (x == "M")
                return (10 ^ 6)
        
        if (x == "B")
                return (10 ^ 9)
        
        return (NA)
        
}

## store the damage number into new variable
StormData_Clean$PROPDAMA_NUM <-
        with(StormData_Clean,
             as.numeric(PROPDMG) * sapply(PROPDMGEXP, damage_num)) / 10 ^ 3
StormData_Clean$CROPDAMA_NUM <-
        with(StormData_Clean,
             as.numeric(CROPDMG) * sapply(CROPDMGEXP, damage_num)) / 10 ^ 3

## calculate the damage level of population health
StormData_Clean$HEALTHDAMA_NUM <-
        StormData_Clean$FATALITIES + StormData_Clean$INJURIES

Results

| Summarize health damage data (fatalities + injuries) and economic damage data (property + crop). | First step is subsetting clean data, and create Health_damage. Then plot bar graphic to indentify top 10 extreme weather which result in the most amout of health damage.

Health_damage <-
        StormData_Clean %>%
        group_by(EVTYPE) %>%
        summarise(healthdam_total = sum(HEALTHDAMA_NUM)) %>%
        arrange(desc(healthdam_total))
## plot Health_damage[1:10]
gh <-
        ggplot(Health_damage[1:10, ],
               aes(x = reorder(EVTYPE, healthdam_total), y = healthdam_total),
               color = EVTYPE)
chart1 <-
        gh + geom_bar(stat = "identity") + xlab("Event type") + ylab("Damage to population health (fatalities and injuries)") + ggtitle("Population health risks of extreme weather (top 10 events from 1996-2011)") + theme(plot.title = element_text(size = 14, hjust = 0.5)) + coord_flip() + theme_light()
print(chart1)

plot of chunk health-damage

Second step is subsetting clean data, and create Economic_damage. Then plot bar graphic to indentify top 10 extreme weather which result in the most amout of economic damage.

StormData_Clean$ECONOMIC_SUM <-
        StormData_Clean$PROPDAMA_NUM + StormData_Clean$CROPDAMA_NUM
Economic_damage <-
        StormData_Clean %>%
        group_by(EVTYPE) %>%
        summarise(economicdam_total = sum(ECONOMIC_SUM)) %>%
        arrange(desc(economicdam_total))
        
## plot Health_damage[1:10]
gh <-
        ggplot(Economic_damage[1:10, ],
               aes(x = reorder(EVTYPE, economicdam_total), y = economicdam_total),
               color = EVTYPE)
chart2 <-
        gh + geom_bar(stat = "identity") + xlab("Event type") + ylab("Economic consequences of extreme weacher (corp damage and property damage)") + ggtitle("Economic consequences of extreme weather (top 10 events from 1996-2011)") + theme(plot.title = element_text(size = 15, hjust = 1)) + coord_flip() + theme_light()
print(chart2)

plot of chunk economic-damage