In this report, we explore the U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database. This exploration will help us determine which weather events pose the greatest threats to U.S. public health and the economy.
#Prep the environment
library(data.table)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
First we will need to process the data and gather insights.
We will extract the data from our copy of the NOAA Storm Database.
## Load the raw storm events CSV into a data.table.
stormDataPath <- "data/repdata_data_StormData.csv"
noaaData <- fread(stormDataPath)
We will examine the features of the database to determine which ones are necessary for our experiment. This will also help us understand what the data contains and what we might need to do with it.
## List the column names of the raw table.
names(noaaData)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
## Show each column's type together with its first few values.
str(noaaData)
## Classes 'data.table' and 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, ".internal.selfref")=<externalptr>
## Print per-column summaries (quantiles and NA counts for numeric columns).
summary(noaaData)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE
## Min. : 1.0 Length:902297 Length:902297 Length:902297
## 1st Qu.:19.0 Class :character Class :character Class :character
## Median :30.0 Mode :character Mode :character Mode :character
## Mean :31.2
## 3rd Qu.:45.0
## Max. :95.0
##
## COUNTY COUNTYNAME STATE EVTYPE
## Min. : 0.0 Length:902297 Length:902297 Length:902297
## 1st Qu.: 31.0 Class :character Class :character Class :character
## Median : 75.0 Mode :character Mode :character Mode :character
## Mean :100.6
## 3rd Qu.:131.0
## Max. :873.0
##
## BGN_RANGE BGN_AZI BGN_LOCATI
## Min. : 0.000 Length:902297 Length:902297
## 1st Qu.: 0.000 Class :character Class :character
## Median : 0.000 Mode :character Mode :character
## Mean : 1.484
## 3rd Qu.: 1.000
## Max. :3749.000
##
## END_DATE END_TIME COUNTY_END COUNTYENDN
## Length:902297 Length:902297 Min. :0 Mode:logical
## Class :character Class :character 1st Qu.:0 NA's:902297
## Mode :character Mode :character Median :0
## Mean :0
## 3rd Qu.:0
## Max. :0
##
## END_RANGE END_AZI END_LOCATI
## Min. : 0.0000 Length:902297 Length:902297
## 1st Qu.: 0.0000 Class :character Class :character
## Median : 0.0000 Mode :character Mode :character
## Mean : 0.9862
## 3rd Qu.: 0.0000
## Max. :925.0000
##
## LENGTH WIDTH F MAG
## Min. : 0.0000 Min. : 0.000 Min. :0.0 Min. : 0.0
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.:0.0 1st Qu.: 0.0
## Median : 0.0000 Median : 0.000 Median :1.0 Median : 50.0
## Mean : 0.2301 Mean : 7.503 Mean :0.9 Mean : 46.9
## 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.:1.0 3rd Qu.: 75.0
## Max. :2315.0000 Max. :4400.000 Max. :5.0 Max. :22000.0
## NA's :843563
## FATALITIES INJURIES PROPDMG
## Min. : 0.0000 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00
## Median : 0.0000 Median : 0.0000 Median : 0.00
## Mean : 0.0168 Mean : 0.1557 Mean : 12.06
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.50
## Max. :583.0000 Max. :1700.0000 Max. :5000.00
##
## PROPDMGEXP CROPDMG CROPDMGEXP
## Length:902297 Min. : 0.000 Length:902297
## Class :character 1st Qu.: 0.000 Class :character
## Mode :character Median : 0.000 Mode :character
## Mean : 1.527
## 3rd Qu.: 0.000
## Max. :990.000
##
## WFO STATEOFFIC ZONENAMES LATITUDE
## Length:902297 Length:902297 Length:902297 Min. : 0
## Class :character Class :character Class :character 1st Qu.:2802
## Mode :character Mode :character Mode :character Median :3540
## Mean :2875
## 3rd Qu.:4019
## Max. :9706
## NA's :47
## LONGITUDE LATITUDE_E LONGITUDE_ REMARKS
## Min. :-14451 Min. : 0 Min. :-14455 Length:902297
## 1st Qu.: 7247 1st Qu.: 0 1st Qu.: 0 Class :character
## Median : 8707 Median : 0 Median : 0 Mode :character
## Mean : 6940 Mean :1452 Mean : 3509
## 3rd Qu.: 9605 3rd Qu.:3549 3rd Qu.: 8735
## Max. : 17124 Max. :9706 Max. :106220
## NA's :40
## REFNUM
## Min. : 1
## 1st Qu.:225575
## Median :451149
## Mean :451149
## 3rd Qu.:676723
## Max. :902297
##
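From our data dictionary, we can see that the only columns we need for this experiment are the event type, the fatality and injury counts, and the property and crop damage amounts with their exponent codes: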
## Keep only the event type plus the casualty and damage columns.
noaaData <- noaaData %>%
  select(EVTYPE,
         FATALITIES, INJURIES,
         PROPDMG, PROPDMGEXP,
         CROPDMG, CROPDMGEXP)
We only need the records where there is an occurrence of damage, death, or injury.
## Keep every event that caused at least one injury, at least one fatality, or
## some property or crop damage. The conditions are combined with OR: chaining
## separate filter() calls ANDs them, which would keep only the events that
## had all four kinds of harm at once and discard nearly everything else.
noaaData <- noaaData %>%
  filter(INJURIES > 0 | FATALITIES > 0 | PROPDMG > 0 | CROPDMG > 0)
## Make sure the result is a data.table so by-reference `:=` updates work on it.
noaaData <- as.data.table(noaaData)
Since the damage costs and their respective exponent keys are stored in separate columns, we will need to convert the exponent columns to numeric multipliers before we can compute the costs properly.
## Change all damage exponents to uppercase so that codes such as "k" and "K"
## are looked up under the same key.
cols <- c("PROPDMGEXP", "CROPDMGEXP")
noaaData[, (cols) := lapply(.SD, toupper), .SDcols = cols]
## Map property damage alphanumeric exponents to numeric multipliers.
propDmgKey <- c("-" = 10^0,
                "+" = 10^0,
                "0" = 10^0,
                "1" = 10^1,
                "2" = 10^2,
                "3" = 10^3,
                "4" = 10^4,
                "5" = 10^5,
                "6" = 10^6,
                "7" = 10^7,
                "8" = 10^8,
                "9" = 10^9,
                "H" = 10^2,
                "K" = 10^3,
                "M" = 10^6,
                "B" = 10^9)
## Map crop damage alphanumeric exponents to numeric multipliers.
## NOTE(review): unlike propDmgKey, the digits 1-9 and "H" are not listed here,
## so they fall back to a multiplier of 1 -- confirm this is intended.
cropDmgKey <- c("?" = 10^0,
                "0" = 10^0,
                "K" = 10^3,
                "M" = 10^6,
                "B" = 10^9)
## Translate a vector of exponent codes into numeric multipliers using `key`.
## Codes that are not in the key get a multiplier of 1. This includes blank
## codes: an empty string never matches a name when indexing by character, so
## blanks are handled by this fallback rather than by an entry in the key.
expToMultiplier <- function(codes, key) {
  multipliers <- unname(key[as.character(codes)])
  multipliers[is.na(multipliers)] <- 10^0
  multipliers
}
## Replace the character exponent codes with their numeric multipliers.
noaaData[, PROPDMGEXP := expToMultiplier(PROPDMGEXP, propDmgKey)]
noaaData[, CROPDMGEXP := expToMultiplier(CROPDMGEXP, cropDmgKey)]
After formatting the exponent data, we can now compute the actual costs of the damages. This also gives us the big picture of which event types incur the greatest economic costs.
## Multiply each damage figure by its multiplier to get the cost per event.
noaaData <- mutate(noaaData,
                   PROPCOST = PROPDMG * PROPDMGEXP,
                   CROPCOST = CROPDMG * CROPDMGEXP)
## Sum crop and property costs per event type, add them into a combined total,
## and keep the ten event types with the highest combined total.
totalCosts <- noaaData %>%
  group_by(EVTYPE) %>%
  summarise(cropSum = sum(CROPCOST), propSum = sum(PROPCOST)) %>%
  mutate(totalSum = cropSum + propSum) %>%
  arrange(desc(totalSum)) %>%
  slice(seq_len(10))
We can get the greatest health risks without additional formatting. We just need a few calculations.
## Sum fatalities and injuries per event type, add them into a combined
## casualty count, and keep the ten event types with the highest count.
totalHealthDMG <- noaaData %>%
  group_by(EVTYPE) %>%
  summarise(totalDead = sum(FATALITIES), totalInjured = sum(INJURIES)) %>%
  mutate(totalLoss = totalDead + totalInjured) %>%
  arrange(desc(totalLoss)) %>%
  slice(seq_len(10))
From our calculated tables, we can reshape and plot the data so that it is easy to understand.
## Melt to long format (one row per event type and measure) for plotting.
## totalHealthDMG is built with dplyr's group_by()/summarise(), so it is not a
## data.table; convert it explicitly, because data.table's melt() only has a
## method for data.tables and its fallback to reshape2 for other inputs is
## deprecated.
lossTable <- melt(as.data.table(totalHealthDMG),
                  id.vars = "EVTYPE", variable.name = "loss_type")
## Dodged bar chart of every measure per event type; event types are ordered
## along the x axis by descending mean value.
healthChart <- ggplot(lossTable, aes(x = reorder(EVTYPE, -value), y = value)) +
  geom_col(aes(fill = loss_type), position = "dodge") +
  ylab("Count") +
  xlab("Danger Type") +
  ggtitle("Greatest Health Hazards in the US") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5))
healthChart
## Melt to long format (one row per event type and cost measure) for plotting.
## totalCosts is built with dplyr's group_by()/summarise(), so it is not a
## data.table; convert it explicitly, because data.table's melt() only has a
## method for data.tables and its fallback to reshape2 for other inputs is
## deprecated.
costTable <- melt(as.data.table(totalCosts),
                  id.vars = "EVTYPE", variable.name = "cost_type")
## Dodged bar chart of every cost measure per event type; event types are
## ordered along the x axis by descending mean value.
econChart <- ggplot(costTable, aes(x = reorder(EVTYPE, -value), y = value)) +
  geom_col(aes(fill = cost_type), position = "dodge") +
  ylab("Cost ($)") +
  xlab("Danger Type") +
  ggtitle("Greatest Environmental Threats to Economy in the US") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5))
econChart