This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s storm database collected betwween 1950 and 2011. This database tracks characteristics of major storms and weather events in the United States, including estimates of any fatalities, injuries, and property damage.
The original dataset was downloaded here. There is also some documentation of the database available on the National Weather Service Storm Data Documentation and National Climatic Data Center Storm Events FAQ.
The preliminary data processing required extensive data cleaning as described below.
The analysis below identifies (a) the most frequent severe weather events in the US, (b) events causing most fatalities and injuries, and (c) events causing the biggest property and crop damage.
Processing data starts from the original dataset from the above link:
setwd("C:/Users/Admin/Documents/DS from Coursera")
if(!file.exists("RR_PA2")) dir.create("RR_PA2")
setwd("C:/Users/Admin/Documents/DS from Coursera/RR_PA2")
storm_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if(!file.exists("StormData.csv")) download.file(storm_url, destfile = "StormData.csv")
In order to ensure faster reading of the data frame, only the columns needed in further analysis are read in, and their restecpive classes are specified:
my.colclass <- c(rep("NULL", 7), rep("factor", 1), rep("NULL", 14),
rep("numeric", 3), "character", "numeric", "character", rep("NULL", 9))
storms <- read.csv("StormData.csv", header = TRUE, colClasses = my.colclass)
"The 'storms' data frame has the following column names:"
## [1] "The 'storms' data frame has the following column names:"
colnames(storms)
## [1] "EVTYPE" "FATALITIES" "INJURIES" "PROPDMG" "PROPDMGEXP"
## [6] "CROPDMG" "CROPDMGEXP"
paste ("It contains ", nrow(storms), "observations")
## [1] "It contains 902297 observations"
evtypes_be4 <- length(unique(storms$EVTYPE))
paste("The raw data has ", evtypes_be4, "unique event types.")
## [1] "The raw data has 985 unique event types."
Such a large range of unique event types is due to the fact that the dataset contrains observations recorded in pre-computer era. The following code cleans the EVTYPE column:
storms$EVTYPE <- toupper(storms$EVTYPE)
storms$EVTYPE <- gsub("^SUMMARY.*", NA, storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("TIDES", "TIDE", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("FLOODING", "FLOOD", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("COASTALFLOOD", "COASTAL FLOOD", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("FUNNELS", "FUNNEL", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("TEMPERATURES", "TEMPERATURE", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("DRIEST MONTH", "DRY", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("DRY MICROBURST", "DRY", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("DEVEL", "DEVIL", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("DUSTSTORM", "DUST STORM", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("^EXCESSIVE$", NA, storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("PRECIPITATION", "RAIN", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("RAINFALL", "RAIN", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("CHILLS", "CHILL", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("WINDCHILL", "WIND CHILL", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("FLASH FLOOD/", "FLASH FLOOD", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("WINDS", "WIND", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("FLOODS", "FLOOD", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("\\", "/", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("CLOUD.", "CLOUD", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("CLOUDS", "CLOUD", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("RAINS", "RAIN", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("^HIGH$", NA, storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("SNOWFALL", "SNOW", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("PRECIP", "RAIN", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("MONTHLY ", "", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("FLOODIN", "FLOOD", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("TSTM", "THUNDERSTORM", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("THUNDERSTORM WIND", "THUNDERSTORM", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("HURRICANE/TYPHOON", "HURRICANE", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("SLIDES", "SLIDE", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub(" (HEAVY)", "", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("CURRENTS", "CURRENT", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("SQUALLS", "SQUALL", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub(" DAMAGE TO", "", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub(" DAMAGE", "", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("STORMS", "STORM", storms$EVTYPE, fixed = TRUE)
storms$EVTYPE <- gsub("^DRY.*", "DRY", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^FLASH FLOOD.*", "FLASH FLOOD", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^HAIL.*", "HAIL", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^HEAVY RAIN.*", "HEAVY RAIN", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^HEAVY SNOW.*", "HEAVY SNOW", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^HIGH WIND.*", "HIGH WIND", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^LIGHTNING.*", "LIGHTNING", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^THUNDERSTORM.*", "THUNDERSTORM", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^WATERSPOUT.*", "WATERSPOUT", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^BLIZZARD.*", "BLIZZARD", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- gsub("^SNOW.*", "SNOW", storms$EVTYPE, fixed = FALSE)
storms$EVTYPE <- as.factor(storms$EVTYPE)
evtypes_after <- length(unique(storms$EVTYPE))
paste("The above code has reduced the number of unique event types by", round((evtypes_be4 - evtypes_after)/evtypes_be4*100, 0), "percent.")
## [1] "The above code has reduced the number of unique event types by 51 percent."
paste("Now the dataset has ", evtypes_after, "unique event types.")
## [1] "Now the dataset has 479 unique event types."
The Product and Crop Damage columns also require cleaning. The PROPDMEXP/CROPDMEXP columns originally contain multiplicators for the PROPDM/CROPDM columns:
unique(storms$PROPDMGEXP)
## [1] "K" "M" "" "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-"
## [18] "1" "8"
unique(storms$CROPDMGEXP)
## [1] "" "M" "K" "m" "B" "?" "0" "k" "2"
The following code cleans these columns:
storms$PROPDMGEXP <- toupper(storms$PROPDMGEXP)
for (i in 1:8) storms$PROPDMGEXP[storms$PROPDMGEXP==i] <- 10^i
storms$PROPDMGEXP[storms$PROPDMGEXP==0|storms$PROPDMGEXP=="+"|storms$PROPDMGEXP=="-"] <- 1
storms$PROPDMGEXP[nchar(storms$PROPDMGEXP)==0] <- 1
storms$PROPDMGEXP <- gsub("H", 10^2, storms$PROPDMGEXP, fixed = TRUE)
storms$PROPDMGEXP <- gsub("K", 10^3, storms$PROPDMGEXP, fixed = TRUE)
storms$PROPDMGEXP <- gsub("M", 10^6, storms$PROPDMGEXP, fixed = TRUE)
storms$PROPDMGEXP <- gsub("B", 10^9, storms$PROPDMGEXP, fixed = TRUE)
storms$CROPDMGEXP <- toupper(storms$CROPDMGEXP)
storms$CROPDMGEXP[storms$CROPDMGEXP==2] <- 10^2
storms$CROPDMGEXP[storms$CROPDMGEXP==0] <- 1
storms$CROPDMGEXP[nchar(storms$CROPDMGEXP)==0] <- 1
storms$CROPDMGEXP <- gsub("K", 10^3, storms$CROPDMGEXP, fixed = TRUE)
storms$CROPDMGEXP <- gsub("M", 10^6, storms$CROPDMGEXP, fixed = TRUE)
storms$CROPDMGEXP <- gsub("B", 10^9, storms$CROPDMGEXP, fixed = TRUE)
Then, the PROPDM/CROPDM columns are updated to include the full sums of damages, and the ‘EXP’ columns are dropped:
storms$PROPDMG <- storms$PROPDMG * as.numeric(storms$PROPDMGEXP)
## Warning: в результате преобразования созданы NA
storms$CROPDMG <- storms$CROPDMG * as.numeric(storms$CROPDMGEXP)
## Warning: в результате преобразования созданы NA
storms <- storms[, -c(5, 7)]
The clean dataset looks like this:
summary(storms)
## EVTYPE FATALITIES INJURIES
## THUNDERSTORM:324759 Min. : 0.0000 Min. : 0.0000
## HAIL :288776 1st Qu.: 0.0000 1st Qu.: 0.0000
## TORNADO : 60652 Median : 0.0000 Median : 0.0000
## FLASH FLOOD : 55037 Mean : 0.0168 Mean : 0.1557
## FLOOD : 25450 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## (Other) :147546 Max. :583.0000 Max. :1700.0000
## NA's : 77
## PROPDMG CROPDMG
## Min. :0.000e+00 Min. :0.000e+00
## 1st Qu.:0.000e+00 1st Qu.:0.000e+00
## Median :0.000e+00 Median :0.000e+00
## Mean :4.746e+05 Mean :5.442e+04
## 3rd Qu.:5.000e+02 3rd Qu.:0.000e+00
## Max. :1.150e+11 Max. :5.000e+09
## NA's :8 NA's :7
The following aggregation is performed before plotting:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked _by_ '.GlobalEnv':
##
## storms
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#rank event types by occurance
storms_occ <- data.frame(storms$EVTYPE, rep(1, nrow(storms)))
colnames(storms_occ) <- c("EVTYPE", "OCC")
occurance <- aggregate(OCC ~ EVTYPE, data = storms_occ, FUN = sum)
occurance <- occurance[order(occurance$OCC, decreasing = TRUE), ]
#rank event types by fatalities
fatality <- aggregate(FATALITIES ~ EVTYPE, data = storms, FUN = sum)
fatality <- fatality[order(fatality$FATALITIES, decreasing=TRUE),]
#rank event types by injuries
injury <- aggregate(INJURIES ~ EVTYPE, data = storms, FUN = sum)
injury <- injury[order(injury$INJURIES, decreasing=TRUE),]
#rank event types by property damage
property_damage <- aggregate(PROPDMG ~ EVTYPE, data = storms, FUN = sum)
property_damage <- property_damage[order(property_damage$PROPDMG, decreasing=TRUE),]
#rank event types by crop damage
crop_damage <- aggregate(CROPDMG ~ EVTYPE, data = storms, FUN = sum)
crop_damage <- crop_damage[order(crop_damage$CROPDMG, decreasing=TRUE),]
par(mfrow = c(1, 1), mar = c(2, 2, 2, 2), cex = 0.9)
pie(occurance$OCC, labels = occurance$EVTYPE[1:10], main="Events Frequency")
paste("The most frequent severe weather event across the US is", occurance$EVTYPE[1])
## [1] "The most frequent severe weather event across the US is THUNDERSTORM"
par(mfrow = c(1, 2), mar = c(10, 4, 4, 2), cex = 0.9)
barplot(height = fatality$FATALITIES[1:10]/1000, names.arg = fatality$EVTYPE[1:10],
ylim= c(0,6), col="blue", main="Fatalities", las = 2)
barplot(height = injury$INJURIES[1:10]/1000, names.arg = injury$EVTYPE[1:10],
ylim= c(0,100), col="blue", main="Injuries", las = 2)
title("Top 10 Events Causing Fatalities and Injuries, in thousand cases", line = -1, outer=TRUE)
paste("The top causes of fatalities and injuries are ", fatality$EVTYPE[1], "and", injury$EVTYPE[1], "respectively")
## [1] "The top causes of fatalities and injuries are TORNADO and TORNADO respectively"
par(mfrow = c(1, 2), mar = c(10, 4, 4, 2), cex = 0.9)
barplot(height = property_damage$PROPDMG[1:10]/10^9, names.arg = property_damage$EVTYPE[1:10],
ylim= c(0,160), col="blue", main="Property Damage", las = 2)
barplot(height = crop_damage$CROPDMG[1:10]/10^9, names.arg = crop_damage$EVTYPE[1:10],
ylim= c(0,15), col="blue", main="Crop Damage", las = 2)
title("Top 10 Events Causing Property and Crop Damage, in $ Bn",
line = -1, outer=TRUE)
paste("The top causes of property and crop damage are", property_damage$EVTYPE[1], "and",
crop_damage$EVTYPE[1], "respectively")
## [1] "The top causes of property and crop damage are FLOOD and DROUGHT respectively"