Utilising data from the storm database collected by the U.S. National Oceanic and Atmospheric Administration between the years of 1950 and 2011, the objective of this report was to answer two specific questions:
Processing the data and reporting the insight we can conclude the answers are as follows:
The below contents will guide the reader through the processing of the data set and the subsequent results which have led to the conclusions detailed.
The data source can be obtained from the link provided here: https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Fdata_sub.csv.bz2
Load the applicable packages utilised in the report:
library(dplyr)
library(stats)
library(ggplot2)
Load the data into R and carry out inital checks of the data being processed:
st_data <- read.csv(file = "repdata_data_StormData.csv.bz2", header = TRUE, sep = ",")
dim(st_data)
## [1] 902297 37
head(st_data)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 0
## 2 TORNADO 0 0
## 3 TORNADO 0 0
## 4 TORNADO 0 0
## 5 TORNADO 0 0
## 6 TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## 4 NA 0 0.0 100 2 0 0
## 5 NA 0 0.0 150 2 0 0
## 6 NA 0 1.5 177 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0
## 2 0 2.5 K 0
## 3 2 25.0 K 0
## 4 2 2.5 K 0
## 5 2 2.5 K 0
## 6 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
## 4 3458 8626 0 0 4
## 5 3412 8642 0 0 5
## 6 3450 8748 0 0 6
There is a lot of data which is not required for this particular study so we will extract the data which is relevant:
required <- c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG",
"CROPDMGEXP")
sub_st <- st_data[required]
dim(sub_st)
## [1] 902297 7
head(sub_st)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 0 15 25.0 K 0
## 2 TORNADO 0 0 2.5 K 0
## 3 TORNADO 0 2 25.0 K 0
## 4 TORNADO 0 2 2.5 K 0
## 5 TORNADO 0 2 2.5 K 0
## 6 TORNADO 0 6 2.5 K 0
Looking at the impact to people first, we will sum the total by event type for both fatalities and injuries and combine these with a basic addition sum in order to calculate the total impact:
health_imp <- aggregate(cbind(FATALITIES, INJURIES) ~ EVTYPE, data = sub_st, FUN = sum)
health_imp$PEOP_TOTAL <- health_imp$FATALITIES + health_imp$INJURIES
head(health_imp)
## EVTYPE FATALITIES INJURIES PEOP_TOTAL
## 1 HIGH SURF ADVISORY 0 0 0
## 2 COASTAL FLOOD 0 0 0
## 3 FLASH FLOOD 0 0 0
## 4 LIGHTNING 0 0 0
## 5 TSTM WIND 0 0 0
## 6 TSTM WIND (G45) 0 0 0
The next step is to do the same for property damage and crop damage in order to understand the economic impact. Before we can do this we need to process the data in order to get the correct values. Currently the data holds an alphanumeric value split across two columns for each, the first with the number and the second with the exponent value, for example ‘k’ = 1000. As we need to aggregate the total cost, we will need these values to be entirely numeric. First we will identify the various values existing within the exponent column:
unique(sub_st$PROPDMGEXP)
## [1] K M B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
unique(sub_st$CROPDMGEXP)
## [1] M K m B ? 0 k 2
## Levels: ? 0 2 B k K m M
Now we can apply the numeric value for the exponent data enabling the calculation to be carried out into the column “PROPCALC” for property and “CROPCALC” for crops:
sub_st$PROPEXP[sub_st$PROPDMGEXP == "K"] <- 1000
sub_st$PROPEXP[sub_st$PROPDMGEXP == "M"] <- 1e+06
sub_st$PROPEXP[sub_st$PROPDMGEXP == ""] <- 1
sub_st$PROPEXP[sub_st$PROPDMGEXP == "B"] <- 1e+09
sub_st$PROPEXP[sub_st$PROPDMGEXP == "m"] <- 1e+06
sub_st$PROPEXP[sub_st$PROPDMGEXP == "0"] <- 1
sub_st$PROPEXP[sub_st$PROPDMGEXP == "5"] <- 1e+05
sub_st$PROPEXP[sub_st$PROPDMGEXP == "6"] <- 1e+06
sub_st$PROPEXP[sub_st$PROPDMGEXP == "4"] <- 10000
sub_st$PROPEXP[sub_st$PROPDMGEXP == "2"] <- 100
sub_st$PROPEXP[sub_st$PROPDMGEXP == "3"] <- 1000
sub_st$PROPEXP[sub_st$PROPDMGEXP == "h"] <- 100
sub_st$PROPEXP[sub_st$PROPDMGEXP == "7"] <- 1e+07
sub_st$PROPEXP[sub_st$PROPDMGEXP == "H"] <- 100
sub_st$PROPEXP[sub_st$PROPDMGEXP == "1"] <- 10
sub_st$PROPEXP[sub_st$PROPDMGEXP == "8"] <- 1e+08
sub_st$PROPEXP[sub_st$PROPDMGEXP == "+"] <- 0
sub_st$PROPEXP[sub_st$PROPDMGEXP == "-"] <- 0
sub_st$PROPEXP[sub_st$PROPDMGEXP == "?"] <- 0
sub_st$CROPEXP[sub_st$CROPDMGEXP == "M"] <- 1e+06
sub_st$CROPEXP[sub_st$CROPDMGEXP == "K"] <- 1000
sub_st$CROPEXP[sub_st$CROPDMGEXP == "m"] <- 1e+06
sub_st$CROPEXP[sub_st$CROPDMGEXP == "B"] <- 1e+09
sub_st$CROPEXP[sub_st$CROPDMGEXP == "0"] <- 1
sub_st$CROPEXP[sub_st$CROPDMGEXP == "k"] <- 1000
sub_st$CROPEXP[sub_st$CROPDMGEXP == "2"] <- 100
sub_st$CROPEXP[sub_st$CROPDMGEXP == ""] <- 1
sub_st$CROPEXP[sub_st$CROPDMGEXP == "?"] <- 0
sub_st$PROPCALC <- sub_st$PROPDMG * sub_st$PROPEXP
sub_st$CROPCALC <- sub_st$CROPDMG * sub_st$CROPEXP
The data associated to economic impact is now ready to aggregate:
eco_imp <- aggregate(cbind(PROPCALC, CROPCALC) ~ EVTYPE, data = sub_st, FUN = sum)
eco_imp$ECO_TOTAL <- eco_imp$PROPCALC + eco_imp$CROPCALC
head(eco_imp)
## EVTYPE PROPCALC CROPCALC ECO_TOTAL
## 1 HIGH SURF ADVISORY 200000 0 200000
## 2 COASTAL FLOOD 0 0 0
## 3 FLASH FLOOD 50000 0 50000
## 4 LIGHTNING 0 0 0
## 5 TSTM WIND 8100000 0 8100000
## 6 TSTM WIND (G45) 8000 0 8000
With the data processing complete it is now time to review the results.
The processed data still contains a large variety of event types, as we are only interested in the event types which have the most impact we will order the data by decreasing value and extract the top 10:
health_imp <- health_imp[order(health_imp$PEOP_TOTAL, decreasing = TRUE), ]
highest_health <- health_imp[1:10,]
print(highest_health)
## EVTYPE FATALITIES INJURIES PEOP_TOTAL
## 834 TORNADO 5633 91346 96979
## 130 EXCESSIVE HEAT 1903 6525 8428
## 856 TSTM WIND 504 6957 7461
## 170 FLOOD 470 6789 7259
## 464 LIGHTNING 816 5230 6046
## 275 HEAT 937 2100 3037
## 153 FLASH FLOOD 978 1777 2755
## 427 ICE STORM 89 1975 2064
## 760 THUNDERSTORM WIND 133 1488 1621
## 972 WINTER STORM 206 1321 1527
The following chart can be created to provide a visual summary:
ggplot(data = highest_health,
aes(x = reorder(EVTYPE, PEOP_TOTAL),
y = PEOP_TOTAL)
) +
geom_bar(stat = "identity", color = "black") +
labs(x = "Event Type",
y = "Sum of Fatalities and Injuries",
title = "Weather Events Most Harmful to Population Health") +
coord_flip()
When we next look at the economic impact, again we will extract the event types which have the highest impact on the economy using the same method of ordering the data:
eco_imp <- eco_imp[order(eco_imp$ECO_TOTAL, decreasing = TRUE), ]
highest_eco <- eco_imp[1:10,]
print(highest_eco)
## EVTYPE PROPCALC CROPCALC ECO_TOTAL
## 170 FLOOD 144657709807 5661968450 150319678257
## 411 HURRICANE/TYPHOON 69305840000 2607872800 71913712800
## 834 TORNADO 56947380617 414953270 57362333887
## 670 STORM SURGE 43323536000 5000 43323541000
## 244 HAIL 15735267513 3025954473 18761221986
## 153 FLASH FLOOD 16822673979 1421317100 18243991079
## 95 DROUGHT 1046106000 13972566000 15018672000
## 402 HURRICANE 11868319010 2741910000 14610229010
## 590 RIVER FLOOD 5118945500 5029459000 10148404500
## 427 ICE STORM 3944927860 5022113500 8967041360
Economic impact will also be visualised in the same method:
ggplot(data = highest_eco,
aes(x = reorder(EVTYPE, ECO_TOTAL),
y = ECO_TOTAL)
) +
geom_bar(stat = "identity", color = "black") +
labs(x = "Event Type",
y = "Sum of Property and Crop Damage",
title = "Weather Events with the Highest Economic Consequences") +
coord_flip()