# download and read data into R
# define working names for URL and filename
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
filename <- "repdata_data_StormData.csv.bz2"
# download file from url
download.file(url, filename)
# read data in to R dataframe
stormdata <- read.csv(filename)
# check dataframe characteristics
dim(stormdata)
## [1] 902297 37
str(stormdata)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ BGN_TIME : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_LOCATI: Factor w/ 54429 levels "","- 1 N Albion",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_DATE : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_TIME : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_LOCATI: Factor w/ 34506 levels "","- .5 NNW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WFO : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ZONENAMES : Factor w/ 25112 levels ""," "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : Factor w/ 436781 levels "","-2 at Deer Park\n",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
# extract relevant fields
# load dplyr package
library(dplyr)
# define and select relevant effect fields
effect <- select(stormdata, EVTYPE, FATALITIES:CROPDMGEXP)
# check dataframe characteristics
dim(effect)
## [1] 902297 7
str(effect)
## 'data.frame': 902297 obs. of 7 variables:
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
# check the possible values for each of these fields
unique(effect$PROPDMGEXP)
## [1] K M B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
unique(effect$CROPDMGEXP)
## [1] M K m B ? 0 k 2
## Levels: ? 0 2 B k K m M
# convert PROPDMGEXP character strings to multipliers
# assume numbers 0 - 8 are powers of 10
effect$PROPDMGMULT[effect$PROPDMGEXP == "0"] <- 10^0
effect$PROPDMGMULT[effect$PROPDMGEXP == "1"] <- 10^1
effect$PROPDMGMULT[effect$PROPDMGEXP == "2"] <- 10^2
effect$PROPDMGMULT[effect$PROPDMGEXP == "3"] <- 10^3
effect$PROPDMGMULT[effect$PROPDMGEXP == "4"] <- 10^4
effect$PROPDMGMULT[effect$PROPDMGEXP == "5"] <- 10^5
effect$PROPDMGMULT[effect$PROPDMGEXP == "6"] <- 10^6
effect$PROPDMGMULT[effect$PROPDMGEXP == "7"] <- 10^7
effect$PROPDMGMULT[effect$PROPDMGEXP == "8"] <- 10^8
# assume h and H are hundred
effect$PROPDMGMULT[effect$PROPDMGEXP == "h"] <- 10^2
effect$PROPDMGMULT[effect$PROPDMGEXP == "H"] <- 10^2
# assume K is thousand
effect$PROPDMGMULT[effect$PROPDMGEXP == "K"] <- 10^3
# assume m and M are million
effect$PROPDMGMULT[effect$PROPDMGEXP == "m"] <- 10^6
effect$PROPDMGMULT[effect$PROPDMGEXP == "M"] <- 10^6
# assume B is billion
effect$PROPDMGMULT[effect$PROPDMGEXP == "B"] <- 10^9
# assume blank, -, +, and ? are one
effect$PROPDMGMULT[effect$PROPDMGEXP == ""] <- 1
effect$PROPDMGMULT[effect$PROPDMGEXP == "-"] <- 1
effect$PROPDMGMULT[effect$PROPDMGEXP == "+"] <- 1
effect$PROPDMGMULT[effect$PROPDMGEXP == "?"] <- 1
# define property damage cost field
effect$PROPDMGCOST <- effect$PROPDMG * effect$PROPDMGMULT / 10^9 # express in billions
# convert CROPDMGEXP character strings to multipliers
# assume numbers 0 and 2 are powers of 10
effect$CROPDMGMULT[effect$CROPDMGEXP == "0"] <- 10^0
effect$CROPDMGMULT[effect$CROPDMGEXP == "2"] <- 10^2
# assume k and K are thousand
effect$CROPDMGMULT[effect$CROPDMGEXP == "k"] <- 10^3
effect$CROPDMGMULT[effect$CROPDMGEXP == "K"] <- 10^3
# assume m and M are million
effect$CROPDMGMULT[effect$CROPDMGEXP == "m"] <- 10^6
effect$CROPDMGMULT[effect$CROPDMGEXP == "M"] <- 10^6
# assume B is billion
effect$CROPDMGMULT[effect$CROPDMGEXP == "B"] <- 10^9
# assume blank and ? are one
effect$CROPDMGMULT[effect$CROPDMGEXP == ""] <- 1
effect$CROPDMGMULT[effect$CROPDMGEXP == "?"] <- 1
# define crop damage cost field
effect$CROPDMGCOST <- effect$CROPDMG * effect$CROPDMGMULT / 10^9 # express in billions
# check new fields added
dim(effect)
## [1] 902297 11
str(effect)
## 'data.frame': 902297 obs. of 11 variables:
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ FATALITIES : num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP : Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP : Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ PROPDMGMULT: num 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
## $ PROPDMGCOST: num 2.5e-05 2.5e-06 2.5e-05 2.5e-06 2.5e-06 2.5e-06 2.5e-06 2.5e-06 2.5e-05 2.5e-05 ...
## $ CROPDMGMULT: num 1 1 1 1 1 1 1 1 1 1 ...
## $ CROPDMGCOST: num 0 0 0 0 0 0 0 0 0 0 ...
# group and calculate totals
# total fatalities
totalfatalities <- aggregate(FATALITIES ~ EVTYPE, effect, FUN = sum)
totalfatalities <- arrange(totalfatalities, desc(FATALITIES)) # rank high to low
top10fatalities <- totalfatalities[1:10,] # top 10
top10fatalities
## EVTYPE FATALITIES
## 1 TORNADO 5633
## 2 EXCESSIVE HEAT 1903
## 3 FLASH FLOOD 978
## 4 HEAT 937
## 5 LIGHTNING 816
## 6 TSTM WIND 504
## 7 FLOOD 470
## 8 RIP CURRENT 368
## 9 HIGH WIND 248
## 10 AVALANCHE 224
# total injuries
totalinjuries <- aggregate(INJURIES ~ EVTYPE, effect, FUN = sum)
totalinjuries <- arrange(totalinjuries, desc(INJURIES)) # rank high to low
top10injuries <- totalinjuries[1:10,] # top 10
top10injuries
## EVTYPE INJURIES
## 1 TORNADO 91346
## 2 TSTM WIND 6957
## 3 FLOOD 6789
## 4 EXCESSIVE HEAT 6525
## 5 LIGHTNING 5230
## 6 HEAT 2100
## 7 ICE STORM 1975
## 8 FLASH FLOOD 1777
## 9 THUNDERSTORM WIND 1488
## 10 HAIL 1361
# total property damage
totalpropdmg <- aggregate(PROPDMGCOST ~ EVTYPE, effect, FUN = sum)
totalpropdmg <- arrange(totalpropdmg, desc(PROPDMGCOST)) # rank high to low
top10propdmg <- totalpropdmg[1:10,] # top 10
top10propdmg
## EVTYPE PROPDMGCOST
## 1 FLOOD 144.657710
## 2 HURRICANE/TYPHOON 69.305840
## 3 TORNADO 56.947381
## 4 STORM SURGE 43.323536
## 5 FLASH FLOOD 16.822674
## 6 HAIL 15.735268
## 7 HURRICANE 11.868319
## 8 TROPICAL STORM 7.703891
## 9 WINTER STORM 6.688497
## 10 HIGH WIND 5.270046
# total crop damage
totalcropdmg <- aggregate(CROPDMGCOST ~ EVTYPE, effect, FUN = sum)
totalcropdmg <- arrange(totalcropdmg, desc(CROPDMGCOST)) # rank high to low
top10cropdmg <- totalcropdmg[1:10,] # top 10
top10cropdmg
## EVTYPE CROPDMGCOST
## 1 DROUGHT 13.972566
## 2 FLOOD 5.661968
## 3 RIVER FLOOD 5.029459
## 4 ICE STORM 5.022113
## 5 HAIL 3.025954
## 6 HURRICANE 2.741910
## 7 HURRICANE/TYPHOON 2.607873
## 8 FLASH FLOOD 1.421317
## 9 EXTREME COLD 1.292973
## 10 FROST/FREEZE 1.094086
# set up a 1x2 view of two graphs
par(mfrow = c(1,2), mar=c(12, 6, 3, 3), mgp=c(4, 1, 0), las=2, cex = 0.8)
# force standard notation for y axis (instead of scientific notation)
options("scipen" = 100)
# fatalities bar chart
barplot(top10fatalities$FATALITIES,
names.arg = top10fatalities$EVTYPE,
main = "Top 10 Fatality Event Types",
ylab = "Number of fatalities",
ylim = c(0, 6000),
col = "black")
# injuries bar chart
barplot(top10injuries$INJURIES,
names.arg = top10injuries$EVTYPE,
main = "Top 10 Injury Event Types",
ylab = "Number of injuries",
ylim = c(0, 100000),
col = "light grey")

# set up a 1x2 view of two graphs
par(mfrow = c(1,2), mar=c(12, 6, 3, 3), mgp=c(4, 1, 0), las=2, cex = 0.8)
# force standard notation for y axis (instead of scientific notation)
options("scipen" = 100)
# property damage bar chart
barplot(top10propdmg$PROPDMGCOST,
names.arg = top10propdmg$EVTYPE,
main = "Top 10 Property Damage Event Types",
ylab = "Property damage ($ billion)",
ylim = c(0, 150),
col = "green")
# crop damage bar chart
barplot(top10cropdmg$CROPDMGCOST,
names.arg = top10cropdmg$EVTYPE,
main = "Top 10 Crop Damage Event Types",
ylab = "Crop damage ($ billion)",
ylim = c(0, 15),
col = "wheat")
