This project explores the U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database, covering events from 1950-01-03 to 2011-11-30 (UTC). The results show that, across the United States, tornadoes are the most harmful event type with respect to population health (injuries and fatalities). Floods, on the other hand, are the most harmful to the economy in terms of property and crop damage. We explore this in more detail below.
library(dplyr)
library(ggplot2)
library(ggrepel)
library(stringr)
library(lubridate)
library(knitr)
# Load the raw NOAA storm data from the bzip2-compressed CSV file.
# NOTE(review): this is an absolute, machine-specific path; anyone re-running
# the analysis elsewhere must point it at their own copy of the file.
df <- read.csv(
  file = "D:/Documents Ddrive/R/ReproducibleResearch/repdata_data_StormData.csv.bz2"
)
# Earliest event start date in the data set.
min(mdy_hms(df[["BGN_DATE"]]))
## [1] "1950-01-03 UTC"
# Latest event end date in the data set; end dates that fail to parse are
# ignored.
max(mdy_hms(df[["END_DATE"]]), na.rm = TRUE)
## [1] "2011-11-30 UTC"
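We group the data by event type, total the injuries and fatalities for each type, keep the three event types with the most injuries, and plot the result.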
# Total injuries and fatalities per event type, keeping the three event types
# with the most injuries.
# slice_max() replaces the superseded top_n(); it returns the rows ordered from
# largest to smallest total_injuries, so no separate arrange() step is needed.
df_grp_ev <- df %>%
  group_by(EVTYPE) %>%
  summarise(
    total_injuries = sum(INJURIES),
    total_fatalities = sum(FATALITIES),
    .groups = "drop"
  ) %>%
  slice_max(total_injuries, n = 3)
# Render the summary as a table in the report.
kable(df_grp_ev)
| EVTYPE | total_injuries | total_fatalities |
|---|---|---|
| TORNADO | 91346 | 5633 |
| TSTM WIND | 6957 | 504 |
| FLOOD | 6789 | 470 |
# Scatter plot of the top event types: total injuries on the x axis against
# total fatalities on the y axis, with each point labelled by its event type.
p1 <- ggplot(
  df_grp_ev,
  aes(x = total_injuries, y = total_fatalities, label = EVTYPE)
) +
  geom_point() +
  geom_text_repel()
p1
We can see that tornadoes cause by far the most injuries and fatalities among the three event types with the most injuries.
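First, we clean up the property and crop damage data. The damage exponent codes (for example K, M and B) are stored as characters, so we convert them to numeric multipliers in order to compute the damage amounts for graphing.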
# Keep only the columns needed for the economic-damage analysis. Columns are
# selected by name so the code does not depend on their position in the raw
# file.
df2 <- df[, c(
  "STATE", "EVTYPE",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)]

# Numeric multiplier for each damage exponent code. One lookup table replaces
# the chain of sequential sub() calls, so no replacement can accidentally act
# on the output of an earlier one.
# NOTE(review): code "0" multiplies the amount by zero, whereas the other digit
# codes are treated as a multiplier of 1. This is kept as-is so the reported
# totals stay the same; confirm it is intended.
dmg_exp_multiplier <- c(
  "K" = 1e3, "k" = 1e3,
  "M" = 1e6, "m" = 1e6,
  "B" = 1e9, "b" = 1e9,
  "H" = 100, "h" = 100,
  "-" = 1, "?" = 1, "+" = 1,
  "0" = 0, "1" = 1,
  "2" = 1, "3" = 1, "4" = 1, "5" = 1, "6" = 1, "7" = 1, "8" = 1
)

# Translate a vector of exponent codes into numeric multipliers.
# Codes not in the table (including the empty string) yield NA.
exp_to_multiplier <- function(code) {
  unname(dmg_exp_multiplier[as.character(code)])
}

# Clean propdmg: multiplier (kmd) and property damage amount (kmdp).
df2$kmd <- exp_to_multiplier(df2$PROPDMGEXP)
df2$kmdp <- df2$PROPDMG * df2$kmd

# Clean cropdmg: crop damage amount (kmdc). The same lookup table is used, so
# any code it lists is translated for crops as well.
df2$kmdc <- df2$CROPDMG * exp_to_multiplier(df2$CROPDMGEXP)

colnames(df2)
## [1] "STATE" "EVTYPE" "PROPDMG" "PROPDMGEXP" "CROPDMG"
## [6] "CROPDMGEXP" "kmd" "kmdp" "kmdc"
# Build the analysis table: state and event type plus the computed property
# and crop damage amounts.
df3 <- df2[, c("STATE", "EVTYPE")]
df3[["PROPDMG"]] <- df2[["kmdp"]]
df3[["CROPDMG"]] <- df2[["kmdc"]]
# Treat any missing value (e.g. a damage amount whose exponent code did not
# convert to a number) as zero.
df3[is.na(df3)] <- 0
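Next, we group by event type, total the property and crop damage for each type, keep the three event types with the most property damage, and plot the result.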
# Total property and crop damage per event type, keeping the three event types
# with the most property damage.
# slice_max() replaces the superseded top_n(); it returns the rows ordered from
# largest to smallest total_propdamage, so no separate arrange() step is
# needed.
df_grp_ev_dmg <- df3 %>%
  group_by(EVTYPE) %>%
  summarise(
    total_propdamage = sum(PROPDMG),
    total_cropdamage = sum(CROPDMG),
    .groups = "drop"
  ) %>%
  slice_max(total_propdamage, n = 3)
# Render the summary as a table in the report.
kable(df_grp_ev_dmg)
| EVTYPE | total_propdamage | total_cropdamage |
|---|---|---|
| FLOOD | 144657709800 | 5661968450 |
| HURRICANE/TYPHOON | 69305840000 | 2607872800 |
| TORNADO | 56937160642 | 414953110 |
# Scatter plot of the top event types: total property damage on the x axis
# against total crop damage on the y axis, with each point labelled by its
# event type.
p2 <- ggplot(
  df_grp_ev_dmg,
  aes(x = total_propdamage, y = total_cropdamage, label = EVTYPE)
) +
  geom_point() +
  geom_text_repel()
p2
We can see that floods cause the most property damage and, among the three event types shown, also the most crop damage.
In summary, tornadoes cause the most injuries and fatalities, and floods cause the most property and crop damage.