SYNOPSIS

This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s storm database collected betwween 1950 and 2011. This database tracks characteristics of major storms and weather events in the United States, including estimates of any fatalities, injuries, and property damage.

The original dataset was downloaded here. There is also some documentation of the database available on the National Weather Service Storm Data Documentation and National Climatic Data Center Storm Events FAQ.

The preliminary data processing required extensive data cleaning as described below.

The analysis below identifies (a) the most frequent severe weather events in the US, (b) events causing most fatalities and injuries, and (c) events causing the biggest property and crop damage.

DATA PROCESSING

Downloading

Processing data starts from the original dataset from the above link:

setwd("C:/Users/Admin/Documents/DS from Coursera")
if(!file.exists("RR_PA2")) dir.create("RR_PA2")
setwd("C:/Users/Admin/Documents/DS from Coursera/RR_PA2")
storm_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if(!file.exists("StormData.csv")) download.file(storm_url, destfile = "StormData.csv")

Reading in

In order to ensure faster reading of the data frame, only the columns needed in further analysis are read in, and their restecpive classes are specified:

my.colclass <- c(rep("NULL", 7), rep("factor", 1), rep("NULL", 14), 
               rep("numeric", 3), "character", "numeric", "character", rep("NULL", 9))
storms <- read.csv("StormData.csv", header = TRUE, colClasses = my.colclass)
"The 'storms' data frame has the following column names:"
## [1] "The 'storms' data frame has the following column names:"
colnames(storms)
## [1] "EVTYPE"     "FATALITIES" "INJURIES"   "PROPDMG"    "PROPDMGEXP"
## [6] "CROPDMG"    "CROPDMGEXP"
paste ("It contains ", nrow(storms), "observations")
## [1] "It contains  902297 observations"

Unique Event Types Cleaning

evtypes_be4 <- length(unique(storms$EVTYPE))
paste("The raw data has ", evtypes_be4, "unique event types.")
## [1] "The raw data has  985 unique event types."

Such a large range of unique event types is due to the fact that the dataset contrains observations recorded in pre-computer era. The following code cleans the EVTYPE column:

storms$EVTYPE <- toupper(storms$EVTYPE)
storms$EVTYPE <- gsub("^SUMMARY.*", NA, storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("TIDES", "TIDE", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("FLOODING", "FLOOD", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("COASTALFLOOD", "COASTAL FLOOD", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("FUNNELS", "FUNNEL", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("TEMPERATURES", "TEMPERATURE", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("DRIEST MONTH", "DRY", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("DRY MICROBURST", "DRY", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("DEVEL", "DEVIL", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("DUSTSTORM", "DUST STORM", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("^EXCESSIVE$", NA, storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("PRECIPITATION", "RAIN", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("RAINFALL", "RAIN", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("CHILLS", "CHILL", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("WINDCHILL", "WIND CHILL", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("FLASH FLOOD/", "FLASH FLOOD", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("WINDS", "WIND", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("FLOODS", "FLOOD", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("\\", "/", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("CLOUD.", "CLOUD", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("CLOUDS", "CLOUD", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("RAINS", "RAIN", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("^HIGH$", NA, storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("SNOWFALL", "SNOW", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("PRECIP", "RAIN", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("MONTHLY ", "", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("FLOODIN", "FLOOD", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("TSTM", "THUNDERSTORM", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("THUNDERSTORM WIND", "THUNDERSTORM", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("HURRICANE/TYPHOON", "HURRICANE", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("SLIDES", "SLIDE", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub(" (HEAVY)", "", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("CURRENTS", "CURRENT", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("SQUALLS", "SQUALL", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub(" DAMAGE TO", "", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub(" DAMAGE", "", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("STORMS", "STORM", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("^DRY.*", "DRY", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^FLASH FLOOD.*", "FLASH FLOOD", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^HAIL.*", "HAIL", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^HEAVY RAIN.*", "HEAVY RAIN", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^HEAVY SNOW.*", "HEAVY SNOW", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^HIGH WIND.*", "HIGH WIND", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^LIGHTNING.*", "LIGHTNING", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^THUNDERSTORM.*", "THUNDERSTORM", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^WATERSPOUT.*", "WATERSPOUT", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^BLIZZARD.*", "BLIZZARD", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^SNOW.*", "SNOW", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- as.factor(storms$EVTYPE)
evtypes_after <- length(unique(storms$EVTYPE))

paste("The above code has reduced the number of unique event types by", round((evtypes_be4 - evtypes_after)/evtypes_be4*100, 0), "percent.")
## [1] "The above code has reduced the number of unique event types by 51 percent."
paste("Now the dataset has ", evtypes_after, "unique event types.")
## [1] "Now the dataset has  479 unique event types."

Product/Crop Damage Cleaning

The Product and Crop Damage columns also require cleaning. The PROPDMEXP/CROPDMEXP columns originally contain multiplicators for the PROPDM/CROPDM columns:

unique(storms$PROPDMGEXP)
##  [1] "K" "M" ""  "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-"
## [18] "1" "8"
unique(storms$CROPDMGEXP)
## [1] ""  "M" "K" "m" "B" "?" "0" "k" "2"

The following code cleans these columns:

storms$PROPDMGEXP <- toupper(storms$PROPDMGEXP)
for (i in 1:8) storms$PROPDMGEXP[storms$PROPDMGEXP==i] <- 10^i
storms$PROPDMGEXP[storms$PROPDMGEXP==0|storms$PROPDMGEXP=="+"|storms$PROPDMGEXP=="-"] <- 1
storms$PROPDMGEXP[nchar(storms$PROPDMGEXP)==0] <- 1
storms$PROPDMGEXP <- gsub("H", 10^2, storms$PROPDMGEXP, fixed = TRUE)
storms$PROPDMGEXP <- gsub("K", 10^3, storms$PROPDMGEXP, fixed = TRUE)
storms$PROPDMGEXP <- gsub("M", 10^6, storms$PROPDMGEXP, fixed = TRUE)
storms$PROPDMGEXP <- gsub("B", 10^9, storms$PROPDMGEXP, fixed = TRUE)

storms$CROPDMGEXP <- toupper(storms$CROPDMGEXP)
storms$CROPDMGEXP[storms$CROPDMGEXP==2] <- 10^2
storms$CROPDMGEXP[storms$CROPDMGEXP==0] <- 1
storms$CROPDMGEXP[nchar(storms$CROPDMGEXP)==0] <- 1
storms$CROPDMGEXP <- gsub("K", 10^3, storms$CROPDMGEXP, fixed = TRUE)
storms$CROPDMGEXP <- gsub("M", 10^6, storms$CROPDMGEXP, fixed = TRUE)
storms$CROPDMGEXP <- gsub("B", 10^9, storms$CROPDMGEXP, fixed = TRUE)

Then, the PROPDM/CROPDM columns are updated to include the full sums of damages, and the ‘EXP’ columns are dropped:

storms$PROPDMG <- storms$PROPDMG * as.numeric(storms$PROPDMGEXP)
## Warning: в результате преобразования созданы NA
storms$CROPDMG <- storms$CROPDMG * as.numeric(storms$CROPDMGEXP)
## Warning: в результате преобразования созданы NA
storms <- storms[, -c(5, 7)]

Clean Dataset Summary

The clean dataset looks like this:

summary(storms)
##           EVTYPE         FATALITIES          INJURIES        
##  THUNDERSTORM:324759   Min.   :  0.0000   Min.   :   0.0000  
##  HAIL        :288776   1st Qu.:  0.0000   1st Qu.:   0.0000  
##  TORNADO     : 60652   Median :  0.0000   Median :   0.0000  
##  FLASH FLOOD : 55037   Mean   :  0.0168   Mean   :   0.1557  
##  FLOOD       : 25450   3rd Qu.:  0.0000   3rd Qu.:   0.0000  
##  (Other)     :147546   Max.   :583.0000   Max.   :1700.0000  
##  NA's        :    77                                         
##     PROPDMG             CROPDMG         
##  Min.   :0.000e+00   Min.   :0.000e+00  
##  1st Qu.:0.000e+00   1st Qu.:0.000e+00  
##  Median :0.000e+00   Median :0.000e+00  
##  Mean   :4.746e+05   Mean   :5.442e+04  
##  3rd Qu.:5.000e+02   3rd Qu.:0.000e+00  
##  Max.   :1.150e+11   Max.   :5.000e+09  
##  NA's   :8           NA's   :7

Aggregation

The following aggregation is performed before plotting:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked _by_ '.GlobalEnv':
## 
##     storms
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#rank event types by occurance
storms_occ <- data.frame(storms$EVTYPE, rep(1, nrow(storms)))
colnames(storms_occ) <- c("EVTYPE", "OCC")
occurance <- aggregate(OCC ~ EVTYPE, data = storms_occ, FUN = sum)
occurance <- occurance[order(occurance$OCC, decreasing = TRUE), ]

#rank event types by fatalities
fatality <- aggregate(FATALITIES ~ EVTYPE, data = storms, FUN = sum)
fatality <- fatality[order(fatality$FATALITIES, decreasing=TRUE),]

#rank event types by injuries
injury <- aggregate(INJURIES ~ EVTYPE, data = storms, FUN = sum)
injury <- injury[order(injury$INJURIES, decreasing=TRUE),]

#rank event types by property damage
property_damage <- aggregate(PROPDMG ~ EVTYPE, data = storms, FUN = sum)
property_damage <- property_damage[order(property_damage$PROPDMG, decreasing=TRUE),]

#rank event types by crop damage
crop_damage <- aggregate(CROPDMG ~ EVTYPE, data = storms, FUN = sum)
crop_damage <- crop_damage[order(crop_damage$CROPDMG, decreasing=TRUE),]

RESULTS

Comparing Severe Weather Events by Occurance

par(mfrow = c(1, 1), mar = c(2, 2, 2, 2), cex = 0.9)
pie(occurance$OCC, labels = occurance$EVTYPE[1:10], main="Events Frequency")

paste("The most frequent severe weather event across the US is", occurance$EVTYPE[1])
## [1] "The most frequent severe weather event across the US is THUNDERSTORM"

Types of Events Most Harmful to Population Health

par(mfrow = c(1, 2), mar = c(10, 4, 4, 2), cex = 0.9)
barplot(height = fatality$FATALITIES[1:10]/1000, names.arg = fatality$EVTYPE[1:10],  
        ylim= c(0,6), col="blue", main="Fatalities", las = 2)
barplot(height = injury$INJURIES[1:10]/1000, names.arg = injury$EVTYPE[1:10],  
        ylim= c(0,100), col="blue", main="Injuries", las = 2)
title("Top 10 Events Causing Fatalities and Injuries, in thousand cases", line = -1, outer=TRUE) 

paste("The top causes of fatalities and injuries are ", fatality$EVTYPE[1], "and", injury$EVTYPE[1], "respectively")
## [1] "The top causes of fatalities and injuries are  TORNADO and TORNADO respectively"

Types of Events with the Greatest Economic Consequences

par(mfrow = c(1, 2), mar = c(10, 4, 4, 2), cex = 0.9)
barplot(height = property_damage$PROPDMG[1:10]/10^9, names.arg = property_damage$EVTYPE[1:10],
        ylim= c(0,160), col="blue", main="Property Damage", las = 2)
barplot(height = crop_damage$CROPDMG[1:10]/10^9, names.arg = crop_damage$EVTYPE[1:10],  
        ylim= c(0,15), col="blue", main="Crop Damage", las = 2)
title("Top 10 Events Causing Property and Crop Damage, in $ Bn", 
      line = -1, outer=TRUE)

paste("The top causes of property and crop damage are", property_damage$EVTYPE[1], "and", 
crop_damage$EVTYPE[1], "respectively")
## [1] "The top causes of property and crop damage are FLOOD and DROUGHT respectively"