2. Loading and processing data
setwd("C:\\Users\\Windows\\Documents\\JHU_Data_Science\\Course_5\\Project_2") # Set Project_2 as the working directory
Set the database url, destination filename in the Project_2 folder and then load the data.
URL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2" # Database URL
destfile <- "storm_data.csv.bz2" # destination file name
download.file(URL, destfile) # download file
Read data from the data file.
storm_data <- read.csv(destfile) # Read in csv format data
Change column names to lower case.
names(storm_data) <- tolower(names(storm_data)) # Change all field names to lowercase
Details of the data frame: dimensions, variable names, structure etc.
dim(storm_data) # dimensions of storm data
## [1] 902297 37
names(storm_data) # column names of storm data
## [1] "state__" "bgn_date" "bgn_time" "time_zone" "county"
## [6] "countyname" "state" "evtype" "bgn_range" "bgn_azi"
## [11] "bgn_locati" "end_date" "end_time" "county_end" "countyendn"
## [16] "end_range" "end_azi" "end_locati" "length" "width"
## [21] "f" "mag" "fatalities" "injuries" "propdmg"
## [26] "propdmgexp" "cropdmg" "cropdmgexp" "wfo" "stateoffic"
## [31] "zonenames" "latitude" "longitude" "latitude_e" "longitude_"
## [36] "remarks" "refnum"
str(storm_data) # structure of storm data
## 'data.frame': 902297 obs. of 37 variables:
## $ state__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ bgn_date : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ bgn_time : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
## $ time_zone : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ county : num 97 3 57 89 43 77 9 123 125 57 ...
## $ countyname: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ state : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ evtype : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ bgn_range : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bgn_azi : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ bgn_locati: Factor w/ 54429 levels "","- 1 N Albion",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ end_date : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ end_time : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county_end: num 0 0 0 0 0 0 0 0 0 0 ...
## $ countyendn: logi NA NA NA NA NA NA ...
## $ end_range : num 0 0 0 0 0 0 0 0 0 0 ...
## $ end_azi : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ end_locati: Factor w/ 34506 levels "","- .5 NNW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ length : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ width : num 100 150 123 100 150 177 33 33 100 100 ...
## $ f : int 3 2 2 2 2 2 2 1 3 3 ...
## $ mag : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fatalities: num 0 0 0 0 0 0 0 0 1 0 ...
## $ injuries : num 15 0 2 2 2 6 1 0 14 0 ...
## $ propdmg : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ propdmgexp: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ cropdmg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cropdmgexp: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ wfo : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ stateoffic: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ zonenames : Factor w/ 25112 levels ""," "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ latitude : num 3040 3042 3340 3458 3412 ...
## $ longitude : num 8812 8755 8742 8626 8642 ...
## $ latitude_e: num 3051 0 0 0 0 ...
## $ longitude_: num 8806 0 0 0 0 ...
## $ remarks : Factor w/ 436781 levels "","-2 at Deer Park\n",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ refnum : num 1 2 3 4 5 6 7 8 9 10 ...
In this analysis, we are interested in three types of variables: variables indicating severe event type, economic impact and public health impact. We remove any other variables that are not relevant. Information about these variables and the data in general come from the National Climatic Data Center Storm Events FAQ and the National Weather Service storm data documentation.
The following three variable categories are of interest. We remove all other columns of data.
Variables names related to storm type: evtype Variable names related to public health: fatalities, injuries Variable names related to property/crop damage: propdmg, propdmgexp, cropdmg, and cropdmgexp. The first three significant digits are given by propdmg and cropdmg. The exponents are given by propdmgexp and cropdmgexp.
library(dplyr)
# Keep public health and economic (property/crops) impact data
data <- select(storm_data, state__, county, evtype, fatalities,
injuries, propdmg, propdmgexp, cropdmg, cropdmgexp)
# Thunderstorm winds and thunderstorm wind are the same thing
data$evtype[which(data$evtype == "THUNDERSTORM WINDS")] <- "THUNDERSTORM WIND"
str(data) # structure of data set
## 'data.frame': 902297 obs. of 9 variables:
## $ state__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ county : num 97 3 57 89 43 77 9 123 125 57 ...
## $ evtype : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ fatalities: num 0 0 0 0 0 0 0 0 1 0 ...
## $ injuries : num 15 0 2 2 2 6 1 0 14 0 ...
## $ propdmg : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ propdmgexp: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ cropdmg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cropdmgexp: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
levels(factor(data$cropdmgexp)) # Set of values taken by cropdmgexp
## [1] "" "?" "0" "2" "B" "k" "K" "m" "M"
levels(factor(data$propdmgexp)) # Set of values taken by propdmgexp
## [1] "" "-" "?" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K"
## [18] "m" "M"
Only one data point takes on the value of -. We replace it with NA.
Replace exponents of property and crop damage characters by equivalent numeric value.
# Replace exponents by characters by equivalent numbers
data$cropdmgexp <- sub("^1$", "10", data$cropdmgexp)
data$cropdmgexp <- sub("^2$", "100", data$cropdmgexp)
data$cropdmgexp <- sub("^3$", "1000", data$cropdmgexp)
data$cropdmgexp <- sub("^4$", "10000", data$cropdmgexp)
data$cropdmgexp <- sub("^5$", "100000", data$cropdmgexp)
data$cropdmgexp <- sub("^6$", "1000000", data$cropdmgexp)
data$cropdmgexp <- sub("^0$", "1", data$cropdmgexp)
data$cropdmgexp <- sub("^?$", "1", data$cropdmgexp)
data$cropdmgexp <- sub("^$", "1", data$cropdmgexp)
data$cropdmgexp <- sub("^[Kk]$", "100000", data$cropdmgexp)
data$cropdmgexp <- sub("^[Mm]$", "1000000", data$cropdmgexp)
data$cropdmgexp <- sub("^[B]$", "1000000000", data$cropdmgexp)
data$propdmgexp <- sub("^1$", "10", data$propdmgexp)
data$propdmgexp <- sub("^2$", "100", data$propdmgexp)
data$propdmgexp <- sub("^3$", "1000", data$propdmgexp)
data$propdmgexp <- sub("^4$", "10000", data$propdmgexp)
data$propdmgexp <- sub("^5$", "100000", data$propdmgexp)
data$propdmgexp <- sub("^6$", "1000000", data$propdmgexp)
data$propdmgexp <- sub("^7$", "10000000", data$propdmgexp)
data$propdmgexp <- sub("^8$", "100000000", data$propdmgexp)
data$propdmgexp <- sub("^0$", "1", data$propdmgexp)
data$propdmgexp <- sub("^?$", "1", data$propdmgexp)
data$propdmgexp <- sub("^$", "1", data$propdmgexp)
data$propdmgexp <- sub("^[Kk]$", "100000", data$propdmgexp)
data$propdmgexp <- sub("^[Mm]$", "1000000", data$propdmgexp)
data$propdmgexp <- sub("^[B]$", "1000000000", data$propdmgexp)
The two characters + and - in propdmgexp are ambiguous. So we represent them as NA.
data[data == "-" | data == "+"] <- NA # Replace + and - with NA
sum(!complete.cases(data)) # Number of rows with NA values
## [1] 6
Convert propdmgexp, cropdmgexp into numeric values.
data$propdmgexp <- suppressWarnings(as.numeric(data$propdmgexp))
data$cropdmgexp <- suppressWarnings(as.numeric(data$cropdmgexp))
data$propdmg <- data$propdmg * data$propdmgexp # Property damage in dollars
data$cropdmg <- data$cropdmg * data$cropdmgexp # Crop damage in dollars
# Removing prop and crop dmg exp since it is redundant and might be confusing
data <- select(data, state__, county, evtype, fatalities, injuries, propdmg, cropdmg)
str(data) # Structure of processed data
## 'data.frame': 902297 obs. of 7 variables:
## $ state__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ county : num 97 3 57 89 43 77 9 123 125 57 ...
## $ evtype : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ fatalities: num 0 0 0 0 0 0 0 0 1 0 ...
## $ injuries : num 15 0 2 2 2 6 1 0 14 0 ...
## $ propdmg : num 2500000 250000 2500000 250000 250000 250000 250000 250000 2500000 2500000 ...
## $ cropdmg : num 0 0 0 0 0 0 0 0 0 0 ...
The final processed data is represented by the data frame `data’.
3. Main Results
Total damage in terms of dollars, fatalities and injuries
We aggregate the total amount of damage done for crops, properties, injuries and fatalities per severe event type.
fatalitiesdata <- arrange(aggregate(fatalities ~ evtype, data = data, FUN = sum, na.rm =
TRUE), desc(fatalities))
head(fatalitiesdata, 10)
## evtype fatalities
## 1 TORNADO 5633
## 2 EXCESSIVE HEAT 1903
## 3 FLASH FLOOD 978
## 4 HEAT 937
## 5 LIGHTNING 816
## 6 TSTM WIND 504
## 7 FLOOD 470
## 8 RIP CURRENT 368
## 9 HIGH WIND 248
## 10 AVALANCHE 224
injuriesdata <- arrange(aggregate(injuries ~ evtype, data = data, FUN = sum, na.rm =
TRUE), desc(injuries))
head(injuriesdata, 10)
## evtype injuries
## 1 TORNADO 91346
## 2 TSTM WIND 6957
## 3 FLOOD 6789
## 4 EXCESSIVE HEAT 6525
## 5 LIGHTNING 5230
## 6 THUNDERSTORM WIND 2396
## 7 HEAT 2100
## 8 ICE STORM 1975
## 9 FLASH FLOOD 1777
## 10 HAIL 1361
propertydata <- arrange(aggregate(propdmg ~ evtype, data = data, FUN = sum, na.rm =
TRUE), desc(propdmg))
head(propertydata, 10)
## evtype propdmg
## 1 TORNADO 370131948137
## 2 FLOOD 231632160007
## 3 FLASH FLOOD 155999993469
## 4 TSTM WIND 136428014055
## 5 THUNDERSTORM WIND 135406196341
## 6 HAIL 82570832293
## 7 HURRICANE/TYPHOON 69500870000
## 8 LIGHTNING 60613448150
## 9 STORM SURGE 45165530000
## 10 HIGH WIND 37053626000
cropdata <- arrange(aggregate(cropdmg ~ evtype, data = data, FUN = sum, na.rm = TRUE),
desc(cropdmg))
head(cropdata, 10)
## evtype cropdmg
## 1 HAIL 60161275023
## 2 FLOOD 21753275000
## 3 FLASH FLOOD 19039070000
## 4 DROUGHT 16095720000
## 5 TSTM WIND 11320985000
## 6 TORNADO 10269721160
## 7 THUNDERSTORM WIND 9007405088
## 8 RIVER FLOOD 5371900000
## 9 ICE STORM 5186800000
## 10 HURRICANE 2999310000
Economic Impact
Let us look at a subset of the columns of data related to severe event type and the correspnding economic cost (in dollars).
propertydamagetop10 <- propertydata[1:10, ]
cropdamagetop10 <- cropdata[1:10, ]
par(mfrow = c(1, 2), mar = c(10, 3, 3, 2))
barplot(propertydamagetop10$propdmg/10^9, las = 3, names.arg = propertydamagetop10$evtype, col = "steelblue", main = "Top 10 causes for property damage", cex.names = 0.8, ylab = "Property damage (billion dollars)")
barplot(cropdamagetop10$cropdmg/10^9, las = 3, names.arg = cropdamagetop10$evtype, col = "steelblue", cex.names = 0.8, main = "Top 10 causes for crop damage", ylab = "Crop damage (billion dollars)")

Public Health Impact
Let us look at the impact of severe events on fatalities and injuries.
fatalitiestop10 <- fatalitiesdata[1:10, ]
injuriestop10 <- injuriesdata[1:10, ]
par(mfrow = c(1, 2), mar = c(10, 3, 3, 2))
barplot(fatalitiestop10$fatalities, las = 3, cex.names = 0.8, names.arg = fatalitiestop10$evtype, col = "red", main = "Top 10 causes for fatalities", ylab = "Number of fatalities")
barplot(injuriestop10$injuries, las = 3, cex.names = 0.8, names.arg = injuriestop10$evtype, col = "red", main = "Top 10 causes for injuries", ylab = "Number of injuries")
