US storms data analysis to determine the most health/property damage causing weather events

Data Processing

1. Download dataset, if not already done, and load into R.

The data for the assignment can be downloaded from the URL stored in fileURL below. The download needs to be done only once; on subsequent runs we check whether the data file is already available on disk.

Initializations: Load required libraries. Set filename variables.

library("data.table")
library("ggplot2")

# Local paths of the compressed download and of the extracted CSV file
bizfile <- "./repdata_data_StormData.csv.bz2"
csvfile <- "./repdata_data_StormData.csv"

# Remote location of the compressed storm data set
fileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# Scratch file name in the session's temporary directory. The previous version
# pinned this to one user's Windows folder, which does not exist elsewhere.
# NOTE(review): `temp` is not used in the visible part of this document.
temp <- tempfile("StormData.csv", fileext = ".bz2")

Check if the data has already been read in R.

# Load the extracted CSV only when the data is not in memory yet AND the file
# is already on disk. Without the file.exists() guard the very first run would
# stop here with a "cannot open file" error before any download could happen.
if (!exists("stormData") && file.exists(csvfile)) {
  stormData <- read.csv(csvfile)
}

Check if the datafile has already been downloaded to disk

# Make sure `stormData` is available, doing as little work as possible:
#   1. already in memory         -> nothing to do
#   2. extracted CSV on disk     -> read it
#   3. compressed file on disk   -> read it directly
#   4. nothing on disk           -> download the compressed file, then read it
if (!exists("stormData")) {
  if (file.exists(csvfile)) {
    stormData <- read.csv(csvfile)
  } else {
    if (!file.exists(bizfile)) {
      # mode = "wb" keeps the bzip2 archive byte-exact on every platform
      download.file(fileURL, bizfile, mode = "wb")
    }
    # read.csv() opens the path through file(), which detects bzip2
    # compression on its own, so no external unpacker (7za) is required.
    stormData <- read.csv(bizfile)
  }
}

Take a look at some data records

# Preview the first three raw records
utils::head(x = stormData, n = 3L)

##   STATE__          BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1       1 4/18/1950 0:00:00     0130       CST     97     MOBILE    AL
## 2       1 4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL
## 3       1 2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL
##    EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO         0                                               0
## 2 TORNADO         0                                               0
## 3 TORNADO         0                                               0
##   COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1         NA         0                      14.0   100 3   0          0
## 2         NA         0                       2.0   150 2   0          0
## 3         NA         0                       0.1   123 2   0          0
##   INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1       15    25.0          K       0                                    
## 2        0     2.5          K       0                                    
## 3        2    25.0          K       0                                    
##   LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1     3040      8812       3051       8806              1
## 2     3042      8755          0          0              2
## 3     3340      8742          0          0              3

Convert data frame to data table, examine column names

# Build the data.table version once; later runs in the same session reuse it
if (!exists("stormDT")) {
  stormDT <- data.table::as.data.table(stormData)
}
# List the available columns
names(stormDT)

##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

2. Extract relevant columns and rows

Extract columns related to our health and economic consequences analysis, namely, EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP

# Columns needed for the health and economic impact analysis
myVector <- c(
  "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)

# Keep only those columns, in the order listed above
mystormDT0 <- stormDT[, .SD, .SDcols = myVector]

Extract data rows where fatalities, injuries, or damages are non-zero

# First drop the placeholder event type "?", then keep only the rows that
# report at least one injury, fatality, property loss or crop loss.
mystormDT <- mystormDT0[EVTYPE != "?"][
  INJURIES > 0 | FATALITIES > 0 | PROPDMG > 0 | CROPDMG > 0
]

Look at a few records of the relevant dataset

# Preview the first five rows of the filtered data
utils::head(x = mystormDT, n = 5L)

##     EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1: TORNADO          0       15    25.0          K       0           
## 2: TORNADO          0        0     2.5          K       0           
## 3: TORNADO          0        2    25.0          K       0           
## 4: TORNADO          0        2     2.5          K       0           
## 5: TORNADO          0        2     2.5          K       0

3. Convert alphabetic units column to equivalent numeric multipliers

In the exponent columns PROPDMGEXP and CROPDMGEXP, replace the alphabetic unit codes with their equivalent numeric multipliers

# Change both damage exponents to uppercase so that e.g. "k" and "K" share
# one code
cols <- c("PROPDMGEXP", "CROPDMGEXP")
mystormDT[, (cols) := lapply(.SD, toupper), .SDcols = cols]

# Convert PROPDMGEXP column to numeric
# Any code that is absent from the table (a blank exponent included) looks up
# as NA and is given a multiplier of 1 right after the lookup. The former
# "\"\"" entry was a literal pair of quote characters, so it never matched a
# blank value and has been dropped; the results are unchanged.
propDmgMultiplier <- c(
  "-" = 10^0, "+" = 10^0,
  "0" = 10^0, "1" = 10^1, "2" = 10^2, "3" = 10^3, "4" = 10^4,
  "5" = 10^5, "6" = 10^6, "7" = 10^7, "8" = 10^8, "9" = 10^9,
  "H" = 10^2, "K" = 10^3, "M" = 10^6, "B" = 10^9
)
# unname() keeps the lookup names from being carried into the column
mystormDT[, PROPDMGEXP := unname(propDmgMultiplier[PROPDMGEXP])]
mystormDT[is.na(PROPDMGEXP), PROPDMGEXP := 10^0]

# Convert CROPDMGEXP column to numeric (same NA -> 1 fallback as above)
cropDmgMultiplier <- c(
  "?" = 10^0, "0" = 10^0,
  "K" = 10^3, "M" = 10^6, "B" = 10^9
)

mystormDT[, CROPDMGEXP := unname(cropDmgMultiplier[CROPDMGEXP])]
mystormDT[is.na(CROPDMGEXP), CROPDMGEXP := 10^0]

4. Get values of damage in property and health sub-categories and their totals

Create columns Property Damage Value, Crop Damage Value, and their Totals

# Rebuild the table with the dollar value of property and crop damage
# (reported amount times its multiplier) placed next to the source columns
mystormDT <- mystormDT[, .(
  EVTYPE, FATALITIES, INJURIES,
  PROPDMG, PROPDMGEXP, propDamage = PROPDMG * PROPDMGEXP,
  CROPDMG, CROPDMGEXP, cropDamage = CROPDMG * CROPDMGEXP
)]

# Add the Property Damage and Crop Damage to get Total Loss Value
totalDamageDT <- mystormDT[, .(
  propDamage   = sum(propDamage),
  cropDamage   = sum(cropDamage),
  Total_Damage = sum(propDamage) + sum(cropDamage)
), by = .(EVTYPE)]
totalDamageDT <- totalDamageDT[order(-Total_Damage)]
# head() instead of [1:10, ]: with fewer than ten event types the index form
# would pad the result with all-NA rows, head() simply returns what exists.
totalDamageDT <- head(totalDamageDT, 10)

# head(totalDamageDT, 5)

Calculate totals of injuries and fatalities -> total incidents

# Sum fatalities and injuries per event type; TOTALS is the two combined
totalIncidentsDT <- mystormDT[, .(
  FATALITIES = sum(FATALITIES),
  INJURIES   = sum(INJURIES),
  TOTALS     = sum(FATALITIES) + sum(INJURIES)
), by = .(EVTYPE)]
# Rank by number of fatalities, highest first
totalIncidentsDT <- totalIncidentsDT[order(-FATALITIES)]
# head() instead of [1:10, ]: with fewer than ten event types the index form
# would pad the result with all-NA rows, head() simply returns what exists.
totalIncidentsDT <- head(totalIncidentsDT, 10)

# head(totalIncidentsDT, 5)

5. Chart the events that are most harmful to population health

# Reshape the incident totals to long form (one row per event type and
# incident type), which is the layout ggplot needs for grouped bars
incidents <- melt(
  totalIncidentsDT,
  id.vars       = "EVTYPE",
  variable.name = "Incident_Type"
)
head(incidents, n = 5L)

##            EVTYPE Incident_Type value
## 1:        TORNADO    FATALITIES  5633
## 2: EXCESSIVE HEAT    FATALITIES  1903
## 3:    FLASH FLOOD    FATALITIES   978
## 4:           HEAT    FATALITIES   937
## 5:      LIGHTNING    FATALITIES   816

# Horizontal grouped bar chart of fatalities, injuries and their total for
# each event type in `incidents`
chart1 <- ggplot(incidents, aes(x = reorder(EVTYPE, value), y = value)) +
  geom_bar(aes(fill = Incident_Type), stat = "identity", position = "dodge") +
  coord_flip() +
  labs(
    x     = "Event Type",
    y     = "Total number of incidents (fatalities, injuries, totals)",
    title = "Top 10 Harmful US Weather Events"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title  = element_text(hjust = 0.5)
  )
chart1

6. Chart the events that have the greatest economic consequences

# Reshape the damage totals to long form (one row per event type and damage
# type), which is the layout ggplot needs for grouped bars
losses <- melt(
  totalDamageDT,
  id.vars       = "EVTYPE",
  variable.name = "Damage_Type"
)
head(losses, n = 5L)

##               EVTYPE Damage_Type        value
## 1:             FLOOD  propDamage 144657709807
## 2: HURRICANE/TYPHOON  propDamage  69305840000
## 3:           TORNADO  propDamage  56947380677
## 4:       STORM SURGE  propDamage  43323536000
## 5:              HAIL  propDamage  15735267513

# Horizontal grouped bar chart of property, crop and total damage for each
# event type in `losses`, expressed in billions of dollars
chart2 <- ggplot(losses, aes(x = reorder(EVTYPE, value), y = (value / 10^9))) +
  geom_bar(aes(fill = Damage_Type), stat = "identity", position = "dodge") +
  coord_flip() +
  labs(
    x     = "Event Type",
    y     = "Cost in billion dollars",
    title = "Top 10 US Storm Events causing Economic Consequences"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title  = element_text(hjust = 0.5)
  )

chart2

US storms data analysis to determine the most health/property damage causing weather events

A. Swarup

April 1, 2018

Synopsis