Preparations:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(ggpubr)
library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.8.2 (2022-06-13 22:00:14 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.25.0 (2022-06-12 02:20:02 UTC) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
## The following object is masked from 'package:R.methodsS3':
##
## throw
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
## The following objects are masked from 'package:base':
##
## attach, detach, load, save
## R.utils v2.12.0 (2022-06-28 03:20:05 UTC) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:utils':
##
## timestamp
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, isOpen, nullfile, parse, warnings
###
getwd()
## [1] "/Users/pollyhung/Desktop/Summer school/Data science [Specialization]/[5] Reproducible Research/Projects/Week 4"
setwd("/Users/pollyhung/data")
# Download the files
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(fileUrl, destfile = "/Users/pollyhung/data/storm_data.csv.bz2")
stormData <- bunzip2( filename = "storm_data.csv.bz2", destname = "storm_data.csv")
storm <- read.csv("storm_data.csv")
# See what variables are in the dataset?
names(storm)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
Which type of events are most harmful with respect to population
health?
#Create a new table to collect only the columns we need
storm_event <- storm[c("BGN_DATE", "STATE" ,"EVTYPE", "LENGTH", "WIDTH", "FATALITIES", "INJURIES")]
colnames(storm_event) <- c("date", "state", "event", "length", "width", "fatalities", "injuries")
#There are repetitive evtnames written in both capital and lower cases
storm_event$event <- tolower(storm_event$event)
#Calculate the sum of death and injuries in each type of event, drop the event with 0 in both entry
death_injury <- storm_event %>% group_by(event) %>%
summarise(sum_death = sum(fatalities), sum_injuries = sum(injuries)) %>%
filter(!(sum_death == 0 & sum_injuries == 0))
summary(death_injury)
## event sum_death sum_injuries
## Length:205 Min. : 0.00 Min. : 0.0
## Class :character 1st Qu.: 1.00 1st Qu.: 0.0
## Mode :character Median : 2.00 Median : 3.0
## Mean : 73.88 Mean : 685.5
## 3rd Qu.: 13.00 3rd Qu.: 40.0
## Max. :5633.00 Max. :91346.0
head(death_injury, n = 10L)
## # A tibble: 10 × 3
## event sum_death sum_injuries
## <chr> <dbl> <dbl>
## 1 avalance 1 0
## 2 avalanche 224 170
## 3 black ice 1 24
## 4 blizzard 101 805
## 5 blowing snow 2 14
## 6 brush fire 0 2
## 7 coastal flood 3 2
## 8 coastal flooding 3 0
## 9 coastal flooding/erosion 0 5
## 10 coastal storm 3 2
#A total version...
death_injury_total <- death_injury
death_injury_total$pop_health_dmg <- rowSums(death_injury_total[, c(2, 3)])
#Find out the most harmful event (i.e., the event with maximum death and/or injury number)
death_injury$event[which.max(death_injury$sum_death)]
## [1] "tornado"
death_injury$event[which.max(death_injury$sum_injuries)]
## [1] "tornado"
death_injury_total$event[which.max(death_injury_total$pop_health_dmg)]
## [1] "tornado"
Scatterplot that shows the storm event that causes highest total
fatility/injuries
#Use scatterplot x=death, y=injury
plot <- ggplot(death_injury, aes(x = sum_death,
y = sum_injuries)) +
geom_point(size=1, shape=23) +
geom_label(
data = death_injury %>% filter(sum_death > 3000 & sum_injuries > 3000),
aes(label = event),
label.size = 0,
label.padding = unit(0.01, "lines"),
color = "black",
fill = "#c6eb34",
size = 3,
nudge_y = 3000) +
scale_x_continuous(breaks = seq(0, max(death_injury$sum_death) + 400, by = 600)) +
scale_y_continuous(breaks = seq(0, max(death_injury$sum_injuries) + 400, by = 27000)) +
xlab("total death") +
ylab("total injuries") +
ggtitle("total death and injuries of all
storm event types in US from 1950 to 2011")
#Since the data is highly skewed, let's make a log scatterplot
death_injury_log <- death_injury
death_injury_log$sum_death <- log(death_injury_log$sum_death)
death_injury_log$sum_injuries <- log(death_injury_log$sum_injuries)
#Replace all the -Inf with 0
death_injury_log <- replace(death_injury_log, death_injury_log == -Inf, 0)
#Summary check~
summary(death_injury_log)
#Use scatterplot x=death, y=injury
log_plot <- ggplot(death_injury_log, aes(x = sum_death,
y = sum_injuries)) +
geom_point(size=1, shape=23) +
geom_label(
data = death_injury_log %>% filter(sum_death > 7 & sum_injuries > 8),
aes(label = event),
label.size = 0,
label.padding = unit(0.01, "lines"),
color = "black",
fill = "#c6eb34",
size = 3,
nudge_y = 0.3) +
scale_x_continuous(breaks = seq(0, ceiling(max(death_injury_log$sum_death)), by = 1)) +
scale_y_continuous(breaks = seq(0, ceiling(max(death_injury_log$sum_injuries)), by = 3)) +
xlab("log(total death)") +
ylab("log(total injuries)") +
ggtitle("log(total death and injuries) of all
storm event types in US from 1950 to 2011")
Scatterplot that shows the storm event that causes highest total
fatility/injuries in each state
#Which event type is most harmful by state
storm_event$state <- toupper(storm_event$state)
#Calculate the sum of death and injuries in each type of event, drop the event with 0 in both entry
death_injury_state <- storm_event %>% group_by(state, event) %>%
summarise(sum_death = sum(fatalities), sum_injuries = sum(injuries)) %>%
filter(!(sum_death == 0 & sum_injuries == 0))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
x <- length(unique(death_injury_state$state))
#Now why the fuck are there 63 state????
#state codes:
codes <- state.abb
#remove the list of unexplained states that doesn't represent anything
death_injury_state <- death_injury_state %>% filter(state %in% codes)
#create a log version...
death_injury_state_log <- death_injury_state
death_injury_state_log$sum_death <- log(death_injury_state$sum_death)
death_injury_state_log$sum_injuries <- log(death_injury_state$sum_injuries)
death_injury_state_log <- replace(death_injury_state_log, death_injury_state_log == -Inf, 0)
#remove all negative number
death_injury_state_log <- death_injury_state_log %>% filter_all( all_vars(. >0))
#use facet wrap to create multiple plots for normal values
plot_2 <- ggplot(death_injury_state_log, aes(x = sum_death, y = sum_injuries)) +
facet_wrap(state~., nrow = 17, ncol = 3) +
geom_point(size=1, shape=23) +
geom_label(
data = death_injury_state_log %>%
filter(sum_death == max(sum_death) | sum_injuries == max(sum_injuries)),
aes(label = event),
label.size = 0,
label.padding = unit(0.01, "lines"),
color = "black",
fill = "#FFFFFF",
size = 3,
nudge_x = -1,
nudge_y = 1) +
scale_x_continuous(breaks = seq(0, ceiling(max(death_injury_state_log$sum_death)), by = 1)) +
scale_y_continuous(breaks = seq(0, ceiling(max(death_injury_state_log$sum_injuries)), by = 3)) +
xlab("log(total death)") +
ylab("log(total injuries)") +
ggtitle("log(total death and injuries) of all storm event types in US from 1950 to 2011")
ggsave(plot = plot_2, height = 40, width = 12, device = jpeg,
filename = "death and injury of storm events by states",
path = "/Users/pollyhung/data")
Scatterplot that shows the storm event that causes highest
population health damage
# only plot the 9th quantile data
quantile(death_injury_total$pop_health_dmg, probs = seq(0, 1, 0.1), type = 9)
## 0% 10% 20% 30% 40% 50% 60% 70%
## 1.000 1.000 1.000 2.000 3.475 6.000 16.000 32.200
## 80% 90% 100%
## 107.000 551.600 96979.000
death_injury_total <- death_injury_total %>% filter(pop_health_dmg > 551.600)
plot_5 <- ggplot(death_injury_total, aes(x = event,
y = pop_health_dmg)) +
geom_point(size=1, shape=23) +
scale_y_continuous(breaks = seq(0, max(death_injury_total$pop_health_dmg) + 400, by = 10000)) +
xlab("state codes") +
ylab("total population health damage (unit: cases)") +
ggtitle("total population health damage of all
storm event types in US from 1950 to 2011") +
theme(axis.text.x = element_text(angle = 90))
Across the United States, which types of events have the greatest
economic consequences?
### Across the United States, which types of events have the greatest economic consequences?
economy_loss <- storm[c("STATE", "EVTYPE", "PROPDMG", "CROPDMG")]
economy_loss$EVTYPE <- tolower(economy_loss$EVTYPE)
colnames(economy_loss) <- c("state", "event", "property_dmg", "crop_dmg")
## Calculate the sum of property damage and crop damage in each type of event, drop the event with 0 in both entry
loss <- economy_loss %>% group_by(event) %>%
summarise(sum_property = sum(property_dmg),
sum_crop = sum(crop_dmg)) %>%
filter(!(sum_property == 0 & sum_crop == 0))
loss <- loss %>% filter(!(event == "?"))
summary(loss)
## event sum_property sum_crop
## Length:396 Min. : 0 Min. : 0
## Class :character 1st Qu.: 5 1st Qu.: 0
## Mode :character Median : 55 Median : 0
## Mean : 27486 Mean : 3479
## 3rd Qu.: 578 3rd Qu.: 10
## Max. :3212258 Max. :579596
## log-transform the data
loss_log <- loss
loss_log$sum_property <- log(loss_log$sum_property)
loss_log$sum_crop <- log(loss_log$sum_crop)
loss_log <- replace(loss_log, loss_log == -Inf, 0)
## Total
loss_total <- loss
loss_total$econ_loss <- rowSums(loss_total[, c(2, 3)])
## Find the maximum!
loss$event[which.max(loss$sum_property)]
## [1] "tornado"
loss$event[which.max(loss$sum_crop)]
## [1] "hail"
loss_total$event[which.max(loss_total$econ_loss)]
## [1] "tornado"
Scatterplot that shows which storm event type cause most property/
crop loss
## Make individual plots
plot_3 <- ggplot(loss, aes(x = sum_property,
y = sum_crop)) +
geom_point(size=1, shape=23) +
geom_label(
data = loss %>% filter(sum_property == max(sum_property) | sum_crop == max(sum_crop)),
aes(label = event),
label.size = 0,
label.padding = unit(0.01, "lines"),
color = "black",
fill = "#c6eb34",
size = 3,
nudge_y = 20000,
nudge_x = -100000) +
scale_x_continuous(breaks = seq(0, signif(max(loss$sum_property), digits = 3),
by = (signif(max(loss$sum_property), digits = 3))/10)) +
scale_y_continuous(breaks = seq(0, signif(max(loss$sum_crop), digits = 3),
by = (signif(max(loss$sum_crop), digits = 3))/10 )) +
xlab("Total property loss (unit = dollars)") +
ylab("Total crop loss (unit = dollars)") +
ggtitle("Total property and crops loss
during storms in US from 1950 to 2011") +
theme(axis.text.x = element_text(angle = 90))
## Make a log plot
loss_log <- loss
loss_log$sum_property <- log(loss_log$sum_property)
loss_log$sum_crop <- log(loss_log$sum_crop)
loss_log <- replace(loss_log, loss_log == -Inf, 0)
log_plot_3 <- ggplot(loss_log, aes(x = sum_property,
y = sum_crop)) +
geom_point(size=1, shape=23) +
geom_label(
data = loss_log %>% filter(sum_property == max(sum_property) | sum_crop == max(sum_crop)),
aes(label = event),
label.size = 0,
label.padding = unit(0.01, "lines"),
color = "black",
fill = "#c6eb34",
size = 3) +
scale_x_continuous(breaks = seq(0, signif(max(loss_log$sum_property), digits = 3),
by = (signif(max(loss_log$sum_property), digits = 3))/10)) +
scale_y_continuous(breaks = seq(0, signif(max(loss_log$sum_crop), digits = 3),
by = (signif(max(loss_log$sum_crop), digits = 3))/5 )) +
xlab("log total property loss (unit = dollars)") +
ylab("log total crop loss (unit = dollars)") +
ggtitle("log total property and crops loss
during storms in US from 1950 to 2011")
Scatterplot that shows the storm event that causes highest total
property/ crop loss in each state
# Which event type is most harmful by state
economy_loss$state <- toupper(economy_loss$state)
# Calculate the sum of death and injuries in each type of event, drop the event with 0 in both entry
loss_by_state <- economy_loss %>% group_by(state, event) %>%
summarise(sum_property = sum(property_dmg), sum_crop = sum(crop_dmg)) %>%
filter(!(sum_property == 0 & sum_crop == 0))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
x <- length(unique(loss_by_state$state))
#state codes:
codes <- state.abb
#remove the list of unexplained states that doesn't represent anything
loss_by_state <- loss_by_state %>% filter(state %in% codes)
#create a log version
loss_by_state$sum_property <- log(loss_by_state$sum_property)
loss_by_state$sum_crop <- log(loss_by_state$sum_crop)
loss_by_state <- replace(loss_by_state, loss_by_state == -Inf, 0)
#remove all negative number
loss_by_state <- loss_by_state %>% filter_all( all_vars(. >= 0))
#use facet wrap to create multiple plots for normal values
plot_4 <- ggplot(loss_by_state, aes(x = sum_property, y = sum_crop)) +
facet_wrap(state~., nrow = 17, ncol = 3) +
geom_point(size=1, shape=23) +
geom_label(
data = loss_by_state %>%
filter(sum_property == max(sum_property) | sum_crop == max(sum_crop)),
aes(label = event),
label.size = 0,
label.padding = unit(0.01, "lines"),
color = "black",
fill = "#FFFFFF",
size = 3) +
scale_x_continuous(breaks = seq(0, signif(max(loss_by_state$sum_property), digits = 3),
by = (signif(max(loss_by_state$sum_property), digits = 3))/5)) +
scale_y_continuous(breaks = seq(0, signif(max(loss_by_state$sum_crop), digits = 3),
by = (signif(max(loss_by_state$sum_crop), digits = 3))/5 )) +
xlab("log total property loss (unit = dollars)") +
ylab("log total crop loss (unit = dollars)") +
ggtitle("log total property and crops loss during storms in US from 1950 to 2011")
ggsave(plot = plot_4, height = 40, width = 12, device = jpeg,
filename = "plot_4",
path = "/Users/pollyhung/data")
Scatterplot that shows the storm event that causes highest economy
loss
# only plot the 9th quantile data
quantile(loss_total$econ_loss, probs = seq(0, 1, 0.1), type = 9)
## 0% 10% 20% 30% 40% 50%
## 0.0500 2.0000 5.0000 12.6250 47.1875 72.5000
## 60% 70% 80% 90% 100%
## 204.8125 504.7500 1055.0187 7660.0000 3312276.6800
loss_1 <- loss_total %>% filter(econ_loss > 7660)
plot_6 <- ggplot(loss_1, aes(x = event,
y = econ_loss)) +
geom_point(size=1, shape=23) +
scale_y_continuous(breaks = seq(0, max(loss_total$econ_loss) + 400, by = 300000)) +
xlab("state codes") +
ylab("total economy loss (unit: dollars)") +
ggtitle("total economy loss of all storm event
types in US from 1950 to 2011") +
theme(axis.text.x = element_text(angle = 90))
Table showing which storm event type causes greatest fatalities/
injuries/ property loss/ crop loss across 50 states in US
codes <- as.list(unique(death_injury_state$state))
#injury
table_injury <- rep(NA, 51)
for (i in codes){
temporary <- death_injury_state %>% filter(state == i)
temporary_1 <- temporary$event[which.max(temporary$sum_injuries)]
temporary_2 <- as.data.frame(temporary_1)
table_injury <- rbind(temporary_2, table_injury)
}
table_injury <- as.data.frame(rev(table_injury$temporary_1))
table_injury <- as.data.frame(table_injury$`rev(table_injury$temporary_1)`[2:51])
#death
table_death <- rep(NA, 51)
for(i in codes){
temporary <- death_injury_state %>% filter(state == i)
temporary_1 <- temporary$event[which.max(temporary$sum_death)]
temporary_2 <- as.data.frame(temporary_1)
table_death <- rbind(temporary_2, table_death)
}
table_death <- as.data.frame(rev(table_death$temporary_1))
table_death <- as.data.frame(table_death$`rev(table_death$temporary_1)`[2:51])
#property loss
table_property <- rep(NA, 51)
for(i in codes){
temporary <- loss_by_state %>% filter(state == i)
temporary_1 <- temporary$event[which.max(temporary$sum_property)]
temporary_2 <- as.data.frame(temporary_1)
table_property <- rbind(temporary_2, table_property)
}
table_property <- as.data.frame(rev(table_property$temporary_1))
table_property <- as.data.frame(table_property$`rev(table_property$temporary_1)`[2:51])
#crop loss
table_crop <- rep(NA, 51)
for(i in codes){
temporary <- loss_by_state %>% filter(state == i)
temporary_1 <- temporary$event[which.max(temporary$sum_crop)]
temporary_2 <- as.data.frame(temporary_1)
table_crop <- rbind(temporary_2, table_crop)
}
table_crop <- as.data.frame(rev(table_crop$temporary_1)) #the original dataframe is upside down, so
table_crop <- as.data.frame(table_crop$`rev(table_crop$temporary_1)`[2:51])
#state names
codes <- as.list(unique(death_injury_state$state))
state_name <- t(as.data.frame(codes))
#accumulate tables
state_max <- cbind(state_name, table_death, table_injury, table_property, table_crop)
colnames(state_max) <- c("state code", "Evt deaths",
"Evt injuries",
"Evt property loss",
"Evt crop loss")
Combine all plots
#Combine all plots
overall_plot <- ggarrange(plot, log_plot, plot_3, log_plot_3, plot_5, plot_6,
ncol = 2, nrow = 3)
ggsave(plot = overall_plot, height = 18, width = 12, device = jpeg,
filename = "overall_plot",
path = "/Users/pollyhung/data")