# download and read data into R

# define working names for URL and filename
        url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
        filename <- "repdata_data_StormData.csv.bz2"
        
# download file from url
        download.file(url, filename)
        
# read data in to R dataframe
        stormdata <- read.csv(filename)
        
# check dataframe characteristics
        dim(stormdata)
## [1] 902297     37
        str(stormdata)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
##  $ BGN_TIME  : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
##  $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
##  $ STATE     : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ EVTYPE    : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : Factor w/ 35 levels "","  N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_LOCATI: Factor w/ 54429 levels "","- 1 N Albion",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_DATE  : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_TIME  : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_LOCATI: Factor w/ 34506 levels "","- .5 NNW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ WFO       : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ZONENAMES : Factor w/ 25112 levels "","                                                                                                                               "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : Factor w/ 436781 levels "","-2 at Deer Park\n",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...
# extract relevant fields

# load dplyr package
        library(dplyr)

# define and select relevant effect fields
        effect <- select(stormdata, EVTYPE, FATALITIES:CROPDMGEXP)
        
# check dataframe characteristics
        dim(effect)
## [1] 902297      7
        str(effect)
## 'data.frame':    902297 obs. of  7 variables:
##  $ EVTYPE    : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
# check the possible values for each of these fields
        unique(effect$PROPDMGEXP)
##  [1] K M   B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels:  - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
        unique(effect$CROPDMGEXP)
## [1]   M K m B ? 0 k 2
## Levels:  ? 0 2 B k K m M
# convert PROPDMGEXP character strings to multipliers

# assume numbers 0 - 8 are powers of 10
        effect$PROPDMGMULT[effect$PROPDMGEXP == "0"] <- 10^0
        effect$PROPDMGMULT[effect$PROPDMGEXP == "1"] <- 10^1
        effect$PROPDMGMULT[effect$PROPDMGEXP == "2"] <- 10^2
        effect$PROPDMGMULT[effect$PROPDMGEXP == "3"] <- 10^3
        effect$PROPDMGMULT[effect$PROPDMGEXP == "4"] <- 10^4
        effect$PROPDMGMULT[effect$PROPDMGEXP == "5"] <- 10^5
        effect$PROPDMGMULT[effect$PROPDMGEXP == "6"] <- 10^6
        effect$PROPDMGMULT[effect$PROPDMGEXP == "7"] <- 10^7
        effect$PROPDMGMULT[effect$PROPDMGEXP == "8"] <- 10^8
        
# assume h and H are hundred
        effect$PROPDMGMULT[effect$PROPDMGEXP == "h"] <- 10^2
        effect$PROPDMGMULT[effect$PROPDMGEXP == "H"] <- 10^2
        
# assume K is thousand
        effect$PROPDMGMULT[effect$PROPDMGEXP == "K"] <- 10^3
        
# assume m and M are million
        effect$PROPDMGMULT[effect$PROPDMGEXP == "m"] <- 10^6
        effect$PROPDMGMULT[effect$PROPDMGEXP == "M"] <- 10^6
        
# assume B is billion
        effect$PROPDMGMULT[effect$PROPDMGEXP == "B"] <- 10^9
        
# assume blank, -, +, and ? are one
        effect$PROPDMGMULT[effect$PROPDMGEXP == ""] <- 1
        effect$PROPDMGMULT[effect$PROPDMGEXP == "-"] <- 1
        effect$PROPDMGMULT[effect$PROPDMGEXP == "+"] <- 1
        effect$PROPDMGMULT[effect$PROPDMGEXP == "?"] <- 1

# define property damage cost field
        effect$PROPDMGCOST <- effect$PROPDMG * effect$PROPDMGMULT / 10^9 # express in billions
# convert CROPDMGEXP character strings to multipliers

# assume numbers 0 and 2 are powers of 10
        effect$CROPDMGMULT[effect$CROPDMGEXP == "0"] <- 10^0
        effect$CROPDMGMULT[effect$CROPDMGEXP == "2"] <- 10^2
        
# assume k and K are thousand
        effect$CROPDMGMULT[effect$CROPDMGEXP == "k"] <- 10^3
        effect$CROPDMGMULT[effect$CROPDMGEXP == "K"] <- 10^3
        
# assume m and M are million
        effect$CROPDMGMULT[effect$CROPDMGEXP == "m"] <- 10^6
        effect$CROPDMGMULT[effect$CROPDMGEXP == "M"] <- 10^6
        
# assume B is billion
        effect$CROPDMGMULT[effect$CROPDMGEXP == "B"] <- 10^9
        
# assume blank and ? are one
        effect$CROPDMGMULT[effect$CROPDMGEXP == ""] <- 1
        effect$CROPDMGMULT[effect$CROPDMGEXP == "?"] <- 1

# define crop damage cost field
        effect$CROPDMGCOST <- effect$CROPDMG * effect$CROPDMGMULT / 10^9 # express in billions
        
# check new fields added
        dim(effect)
## [1] 902297     11
        str(effect)
## 'data.frame':    902297 obs. of  11 variables:
##  $ EVTYPE     : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
##  $ FATALITIES : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES   : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG    : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP : Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
##  $ CROPDMG    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP : Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ PROPDMGMULT: num  1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
##  $ PROPDMGCOST: num  2.5e-05 2.5e-06 2.5e-05 2.5e-06 2.5e-06 2.5e-06 2.5e-06 2.5e-06 2.5e-05 2.5e-05 ...
##  $ CROPDMGMULT: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ CROPDMGCOST: num  0 0 0 0 0 0 0 0 0 0 ...
# group and calculate totals

# total fatalities
        totalfatalities <- aggregate(FATALITIES ~ EVTYPE, effect, FUN = sum)
        totalfatalities <- arrange(totalfatalities, desc(FATALITIES)) # rank high to low
        top10fatalities <- totalfatalities[1:10,] # top 10
        top10fatalities
##            EVTYPE FATALITIES
## 1         TORNADO       5633
## 2  EXCESSIVE HEAT       1903
## 3     FLASH FLOOD        978
## 4            HEAT        937
## 5       LIGHTNING        816
## 6       TSTM WIND        504
## 7           FLOOD        470
## 8     RIP CURRENT        368
## 9       HIGH WIND        248
## 10      AVALANCHE        224
# total injuries
        totalinjuries <- aggregate(INJURIES ~ EVTYPE, effect, FUN = sum)
        totalinjuries <- arrange(totalinjuries, desc(INJURIES)) # rank high to low
        top10injuries <- totalinjuries[1:10,] # top 10
        top10injuries
##               EVTYPE INJURIES
## 1            TORNADO    91346
## 2          TSTM WIND     6957
## 3              FLOOD     6789
## 4     EXCESSIVE HEAT     6525
## 5          LIGHTNING     5230
## 6               HEAT     2100
## 7          ICE STORM     1975
## 8        FLASH FLOOD     1777
## 9  THUNDERSTORM WIND     1488
## 10              HAIL     1361
# total property damage
        totalpropdmg <- aggregate(PROPDMGCOST ~ EVTYPE, effect, FUN = sum)
        totalpropdmg <- arrange(totalpropdmg, desc(PROPDMGCOST)) # rank high to low
        top10propdmg <- totalpropdmg[1:10,] # top 10
        top10propdmg
##               EVTYPE PROPDMGCOST
## 1              FLOOD  144.657710
## 2  HURRICANE/TYPHOON   69.305840
## 3            TORNADO   56.947381
## 4        STORM SURGE   43.323536
## 5        FLASH FLOOD   16.822674
## 6               HAIL   15.735268
## 7          HURRICANE   11.868319
## 8     TROPICAL STORM    7.703891
## 9       WINTER STORM    6.688497
## 10         HIGH WIND    5.270046
# total crop damage
        totalcropdmg <- aggregate(CROPDMGCOST ~ EVTYPE, effect, FUN = sum)
        totalcropdmg <- arrange(totalcropdmg, desc(CROPDMGCOST)) # rank high to low
        top10cropdmg <- totalcropdmg[1:10,] # top 10
        top10cropdmg
##               EVTYPE CROPDMGCOST
## 1            DROUGHT   13.972566
## 2              FLOOD    5.661968
## 3        RIVER FLOOD    5.029459
## 4          ICE STORM    5.022113
## 5               HAIL    3.025954
## 6          HURRICANE    2.741910
## 7  HURRICANE/TYPHOON    2.607873
## 8        FLASH FLOOD    1.421317
## 9       EXTREME COLD    1.292973
## 10      FROST/FREEZE    1.094086
# set up a 1x2 view of two graphs
        par(mfrow = c(1,2), mar=c(12, 6, 3, 3), mgp=c(4, 1, 0), las=2, cex = 0.8)

# force standard notation for y axis (instead of scientific notation)
        options("scipen" = 100)

# fatalities bar chart
        barplot(top10fatalities$FATALITIES,
                names.arg = top10fatalities$EVTYPE, 
                main = "Top 10 Fatality Event Types", 
                ylab = "Number of fatalities", 
                ylim = c(0, 6000), 
                col = "black")
        
# injuries bar chart
        barplot(top10injuries$INJURIES, 
                names.arg = top10injuries$EVTYPE, 
                main = "Top 10 Injury Event Types", 
                ylab = "Number of injuries", 
                ylim = c(0, 100000), 
                col = "light grey")

# set up a 1x2 view of two graphs
        par(mfrow = c(1,2), mar=c(12, 6, 3, 3), mgp=c(4, 1, 0), las=2, cex = 0.8)

# force standard notation for y axis (instead of scientific notation)
        options("scipen" = 100)

# property damage bar chart
        barplot(top10propdmg$PROPDMGCOST,
                names.arg = top10propdmg$EVTYPE, 
                main = "Top 10 Property Damage Event Types", 
                ylab = "Property damage ($ billion)", 
                ylim = c(0, 150), 
                col = "green")
        
# crop damage bar chart
        barplot(top10cropdmg$CROPDMGCOST,
                names.arg = top10cropdmg$EVTYPE, 
                main = "Top 10 Crop Damage Event Types", 
                ylab = "Crop damage ($ billion)", 
                ylim = c(0, 15), 
                col = "wheat")