Synopsis
This project aims to analyze the storm data provided. We will be looking at the most damaging natural disasters in the US in terms of population health and property damages. In summary tornados seemed to be the most deadly, and floods were the most expensive disasters.
Data Processing
Getting the data
# create data directory
dir.create("./data")
# download the file
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
destfile = "./data/stormdata.bz2")
# unzip the file
unzip("./data/stormdata.bz2")
# load in the data
my_data <- read.csv("./data/stormdata.bz2")
Question 1
- Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
# create data to answer question, my idea is to have sums of fatalities and
# injuries by type of storm. there are a lot so only take the top x that
# appear in both top 10 fatalities and top 10 injuries
fat_data <- my_data %>%
group_by(EVTYPE) %>%
summarise("Total Fatalities"=sum(FATALITIES)) %>%
arrange(-`Total Fatalities`) %>%
head(.,10) # finding the top 10 fatality disasters
inj_data <- my_data %>%
group_by(EVTYPE) %>%
summarise("Total Injuries"=sum(INJURIES)) %>%
arrange(-`Total Injuries`) %>%
head(.,10) # finding the top 10 injury disasters
# finding the intersection of both top 10s
topx_data <- inner_join(fat_data,inj_data,by="EVTYPE")
# creating a dataset of 3 columns evtype, total affected, and type of health
# risk to be able to use the type of health risk in the plot
topx_fat <- topx_data[,1:2] %>%
rename("Total" = `Total Fatalities`) %>%
mutate("Health" = "Fatalities")
topx_inj <- topx_data[,c(1,3)] %>%
rename("Total" = `Total Injuries`) %>%
mutate("Health" = "Injuries")
topx_fixed <- rbind(topx_fat,topx_inj)
Result 1/Figure 1
ggplot(topx_fixed)+
geom_bar(aes(x=reorder(EVTYPE,-Total),y=Total/1000,fill=Health),
stat = "identity", position = "dodge")+
labs(x="Disaster", title = "Total Fatalities and Injuries in the US by type of Disaster", y = "Total (in Thousands)")+
theme(axis.text.x = element_text(angle = 45,
vjust = 1,
hjust=1))+
scale_fill_manual(values = c("#F87070", "#FCAB60"))
Clearly Tornado has the most injuries by far and noticeably more
fatalities. These are the top 7 disasters that appeared in both the top
10 total fatalities and the top 10 total injuries in the US.
Question 2
Across the United States, which types of events have the greatest economic consequences?
# create the dataset that has the sum of property damage by type of disaster,
# but first need to convert them to their actual value, for now will just
# change M to millions and B to Billions and K to thousands, documentation
# has nothing to help with what the rest of them might mean, i will show the rest
my_data %>%
mutate(PROPDMGEXP = as.factor(PROPDMGEXP)) %>%
group_by(PROPDMGEXP) %>%
summarise(TotalExp=n()) %>%
kbl() %>%
kable_styling()
| PROPDMGEXP | TotalExp |
|---|---|
| 465934 | |
|
|
1 |
| ? | 8 |
|
|
5 |
| 0 | 216 |
| 1 | 25 |
| 2 | 13 |
| 3 | 4 |
| 4 | 4 |
| 5 | 28 |
| 6 | 4 |
| 7 | 5 |
| 8 | 1 |
| B | 40 |
| h | 1 |
| H | 6 |
| K | 424665 |
| m | 7 |
| M | 11330 |
my_data %>%
mutate(CROPDMGEXP = as.factor(CROPDMGEXP)) %>%
group_by(CROPDMGEXP) %>%
summarise(TotalExp=n()) %>%
kbl() %>%
kable_styling()
| CROPDMGEXP | TotalExp |
|---|---|
| 618413 | |
| ? | 7 |
| 0 | 19 |
| 2 | 1 |
| B | 9 |
| k | 21 |
| K | 281832 |
| m | 1 |
| M | 1994 |
Less categories in CROPDMGEXP but still only know K,M,B. The rest will not be changed.
# subset the data we are going to work with/modify
dmg_data <- my_data %>% select(EVTYPE,PROPDMG,PROPDMGEXP,CROPDMG,CROPDMGEXP)
# multiply the value of property damage according to its PROPDMGEXP
dmg_data$PROPDMG <- if_else(dmg_data$PROPDMGEXP=="K",
dmg_data$PROPDMG*1000,
if_else(dmg_data$PROPDMGEXP=="M",
dmg_data$PROPDMG*1000000,
if_else(dmg_data$PROPDMGEXP=="B",
dmg_data$PROPDMG*1000000000,
dmg_data$PROPDMG)))
# similarly for crop damage
dmg_data$CROPDMG <- if_else(dmg_data$CROPDMGEXP=="K",
dmg_data$CROPDMG*1000,
if_else(dmg_data$CROPDMGEXP=="M",
dmg_data$CROPDMG*1000000,
if_else(dmg_data$CROPDMGEXP=="B",
dmg_data$CROPDMG*1000000000,
dmg_data$CROPDMG)))
# reformat the data to have 3 columns evtype,damage value, damage type, im
# sure this can be easily done another way pivot/unpivot or something i
# just did it this way
dmg_data <- rbind(dmg_data %>% select(EVTYPE,PROPDMG) %>%
rename("Disaster"=EVTYPE,
"Value"=PROPDMG) %>%
mutate("Damage Type" = "Property"),
dmg_data %>% select(EVTYPE,CROPDMG) %>%
rename("Disaster"=EVTYPE,
"Value"=CROPDMG) %>%
mutate("Damage Type" = "Crop"))
# check the top 10 for the sum of damage but keeping both damage values
dmg_data <- dmg_data %>%
group_by(Disaster,`Damage Type`) %>%
summarise(TotalDamage=as.numeric(sum(Value))) %>%
left_join(., dmg_data %>%
group_by(Disaster) %>%
summarise(Total=as.numeric(sum(Value))),
by= "Disaster") %>%
arrange(-Total) %>%
head(.,20)
Result 2/Figure 2
ggplot(dmg_data)+
geom_bar(aes(x=reorder(Disaster,-Total), y=TotalDamage/1000000000,fill=`Damage Type`), stat = "identity")+
scale_fill_manual(values = c("#597143", "#2C29A2"))+
theme(axis.text.x = element_text(angle = 45,
vjust = 1,
hjust=1))+
labs(x="Disaster", y="Total Damage (in Billions)", title="Top 10 Most Damaging Disasters in US")+
scale_y_continuous(breaks = seq(0,150,25))+
theme(panel.grid.major.y = element_line(color = "red",
size = 0.5,
linetype = 2))
The Flood disaster has the most damage, around double that of second place, at 150 Billions dollars in damages with almost all of it being property damage.