Analysis performed on Storm Data from the NOAA Storm Database with the goal of detrermining which event types are most detrimental to population health, and similarly which event types are more detrimental to overall econmic environment.
Firstly, we need to download the data and load it into R.
if (!file.exists("StormData.csv.bz2")) {
download.file(url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", destfile = "StormData.csv.bz2", method = "curl")}
if (!exists("StormData")) {
StormData <- read.csv("StormData.csv.bz2")
}
## Let's summarise the data to see what we're working with!
str(StormData)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ BGN_TIME : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_LOCATI: Factor w/ 54429 levels ""," Christiansburg",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_DATE : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_TIME : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_LOCATI: Factor w/ 34506 levels ""," CANTON"," TULIA",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WFO : Factor w/ 542 levels ""," CI","%SD",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ZONENAMES : Factor w/ 25112 levels ""," "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : Factor w/ 436781 levels "","\t","\t\t",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
Load any required packages into R for analyis
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(ggplot2)
First we want to format dates in the correct way and then group data by EVTYPE and YEAR in order to check population health impact, take total of FATALITIES and INJURIES fields. We will also check for the number of missing values in both of these fields within the dataset to identify if missing values are a problem.
StormData_Fix = mutate(StormData, BGN_DATE = as.POSIXct(BGN_DATE, format = "%m/%d/%Y %H:%M:%S"),
BGN_YEAR = year(BGN_DATE))
Great, there are no missing values.
Now we want to cleanse the PROPDMG and CROPDMG fields as we know that these are stored across two different fields, one with the multiplier and one with the power of the value.
unique(StormData_Fix$PROPDMGEXP)
## [1] K M B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == ""] <- 1
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "0"] <- 1
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "1"] <- 10
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "2"] <- 100
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "3"] <- 1000
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "4"] <- 10000
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "5"] <- 100000
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "6"] <- 1000000
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "7"] <- 10000000
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "B"] <- 1000000000
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "H"] <- 100
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "h"] <- 100
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "K"] <- 1000
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "M"] <- 1000000
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "m"] <- 1000000
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "+"] <- 0
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "-"] <- 0
StormData_Fix$PROPEXP[StormData_Fix$PROPDMGEXP == "?"] <- 0
unique(StormData_Fix$PROPEXP)
## [1] 1e+03 1e+06 1e+00 1e+09 0e+00 1e+05 1e+04 1e+02 1e+07 1e+01 NA
unique(StormData_Fix$CROPDMGEXP)
## [1] M K m B ? 0 k 2
## Levels: ? 0 2 B k K m M
StormData_Fix$CROPEXP[StormData_Fix$CROPDMGEXP == ""] <- 1
StormData_Fix$CROPEXP[StormData_Fix$CROPDMGEXP == "0"] <- 1
StormData_Fix$CROPEXP[StormData_Fix$CROPDMGEXP == "2"] <- 100
StormData_Fix$CROPEXP[StormData_Fix$CROPDMGEXP == "B"] <- 1000000000
StormData_Fix$CROPEXP[StormData_Fix$CROPDMGEXP == "k"] <- 1000
StormData_Fix$CROPEXP[StormData_Fix$CROPDMGEXP == "K"] <- 1000
StormData_Fix$CROPEXP[StormData_Fix$CROPDMGEXP == "m"] <- 1000000
StormData_Fix$CROPEXP[StormData_Fix$CROPDMGEXP == "M"] <- 1000000
StormData_Fix$CROPEXP[StormData_Fix$CROPDMGEXP == "?"] <- 0
unique(StormData_Fix$CROPEXP)
## [1] 1e+00 1e+06 1e+03 1e+09 0e+00 1e+02
StormData_Fix <- mutate(StormData_Fix, TOTAL_PROPDMG = PROPDMG*PROPEXP)
StormData_Fix <- mutate(StormData_Fix, TOTAL_CROPDMG = CROPDMG*CROPEXP)
StormData_Type <- StormData_Fix %>%
group_by(EVTYPE) %>%
summarise(Fatalities_Sum = sum(FATALITIES,na.rm=TRUE),
Injuries_Sum = sum(INJURIES,na.rm=TRUE))
Fatalities <- StormData_Fix$FATALITIES;
Injuries <- StormData_Fix$INJURIES
## Top 10 for fatalities
Top10_Fatalities <- StormData_Type[order(-StormData_Type$Fatalities_Sum), ][1:10, ]
## Top 10 for injuries
Top10_Injuries <- StormData_Type[order(-StormData_Type$Injuries_Sum), ][1:10, ]
## Plot both of these charges
par(mfrow = c(2,1))
barplot(Top10_Fatalities$Fatalities_Sum, names.arg = Top10_Fatalities$EVTYPE, las =3, main = "Top 10 events for fatalities")
barplot(Top10_Injuries$Injuries_Sum, names.arg = Top10_Injuries$EVTYPE, las = 3, main = "Top 10 events for crop injuries")
Max_Fat_Sum <- StormData_Type[which.max(StormData_Type$Fatalities_Sum),]
Max_Inj_Sum <- StormData_Type[which.max(StormData_Type$Injuries_Sum),]
The event type which causes the most fatalities is TORNADO and which causes the most injuries is TORNADO. Co-incidentally, these are the same event type.
Now, looking at the event type most detrimental to overall economy.
StormData_Damages <- StormData_Fix %>%
group_by(EVTYPE) %>%
summarise(PROPDMG_Sum = sum(TOTAL_PROPDMG,na.rm=TRUE),
CROPDMG_Sum = sum(TOTAL_CROPDMG, na.rm = TRUE))
TOTAL_PROPDMG <- StormData_Fix$TOTAL_PROPDMG;
TOTAL_CROPDMG <- StormData_Fix$TOTAL_CROPDMG
## Top 10 for fatalities
Top10_PropDmg <- StormData_Damages[order(-StormData_Damages$PROPDMG_Sum), ][1:10, ]
## Top 10 for injuries
Top10_CropDmg <- StormData_Damages[order(-StormData_Damages$CROPDMG_Sum), ][1:10, ]
## Plot both of these charges
par(mfrow = c(2,1))
barplot(Top10_PropDmg$PROPDMG_Sum, names.arg = Top10_PropDmg$EVTYPE, las =3, main = "Top 10 events for property damage")
barplot(Top10_CropDmg$CROPDMG_Sum, names.arg = Top10_CropDmg$EVTYPE, las = 3, main = "Top 10 events for crop damage")
Max_PROPDMG <- StormData_Damages[which.max(StormData_Damages$PROPDMG_Sum),]
Max_CROPDMG <- StormData_Damages[which.max(StormData_Damages$CROPDMG_Sum),]
The event type which causes the most economic damage is FLOOD for property damages and DROUGHT for crop damages.