The NOAA data is downloaded from it’s website and loaded using “read.csv” function. Downloading the data directly from the website makes this effort reproducible
#Download the data from the URL
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
download.file(fileUrl, "./noaa_storm_data.bz2")
#Read the data and load it into a dataframe
dataset <- read.csv("./noaa_storm_data.bz2")
There isn’t whole lot of preprocessing required for this dataset. The dataset is well structured and doesn’t need any cleanup or tidying. Before further processing, a quick look into a subset of data and their info would help
head(dataset)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 0
## 2 TORNADO 0 0
## 3 TORNADO 0 0
## 4 TORNADO 0 0
## 5 TORNADO 0 0
## 6 TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## 4 NA 0 0.0 100 2 0 0
## 5 NA 0 0.0 150 2 0 0
## 6 NA 0 1.5 177 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0
## 2 0 2.5 K 0
## 3 2 25.0 K 0
## 4 2 2.5 K 0
## 5 2 2.5 K 0
## 6 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
## 4 3458 8626 0 0 4
## 5 3412 8642 0 0 5
## 6 3450 8748 0 0 6
str(dataset)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ BGN_TIME : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_LOCATI: Factor w/ 54429 levels ""," Christiansburg",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_DATE : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_TIME : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_LOCATI: Factor w/ 34506 levels ""," CANTON"," TULIA",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WFO : Factor w/ 542 levels ""," CI","%SD",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ZONENAMES : Factor w/ 25112 levels ""," "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : Factor w/ 436781 levels "","\t","\t\t",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
plot_human_impact_events <- function(top_10_most_harmful_events) {
harmful_events_melted <- melt(top_10_most_harmful_events, id.vars = 'EVTYPE')
harmful_events_melted <- harmful_events_melted %>% mutate(row = row_number())
harmful_events_plot <- ggplot(harmful_events_melted, aes(x = reorder(EVTYPE,row), y = value, fill = variable)) +
geom_bar(aes(fill = variable), position = 'dodge', stat = "identity") +
geom_text(aes(label = value), position = position_dodge(width = 0.9), vjust = -0.25, size = 3) +
theme( legend.title = element_blank(), axis.title.x = element_blank(),
axis.title.y = element_blank() ) +
labs( x = "Event", y = "Total Injuries + Fatalities") +
labs(title = "Top 10 Events causing most Human Impact") +
theme(plot.title = element_text(hjust = 0.5))
print(harmful_events_plot)
}
#Group By Event Type and find the sum of Fatalities and Injuries
human_impact_dataset <- dataset %>% group_by(EVTYPE) %>%
summarise(totalFatalities = sum(FATALITIES), totalInjuries = sum(INJURIES),
totalHumanImpact = sum(totalFatalities, totalInjuries))
top_10_most_harmful_events <- human_impact_dataset %>% arrange(desc(totalHumanImpact)) %>% slice(1:10)
plot_human_impact_events(top_10_most_harmful_events)
plot_economic_impact_events <- function(top_10_economic_impactful_events) {
library(ggplot2)
library(reshape2)
economic_impactful_events_melted <- melt(top_10_economic_impactful_events, id.vars = 'EVTYPE')
economic_impactful_events_melted <- economic_impactful_events_melted %>% mutate(row = row_number())
#View(economic_impactful_events_melted)
economic_impactful_events_plot <- ggplot(economic_impactful_events_melted, aes(x = reorder(EVTYPE,row), y = value, fill = variable)) +
geom_bar(aes(fill = variable), position = 'dodge', stat = "identity") +
geom_text(aes(label = value), position = position_dodge(width = 0.9), vjust = -0.25, size = 3) +
theme( legend.title = element_blank(), axis.title.x = element_blank(),
axis.title.y = element_blank() ) +
labs( x = "Event", y = "Total Property + Crop Damages") +
labs(title = "Top 10 Events causing most Economic Impact in Billions of Dollars") +
theme(plot.title = element_text(hjust = 0.5))
print(economic_impactful_events_plot)
}
#Replace the values of PROPDMGEXP and CROPDMGEXP with their letter interpretations
#Letters in PROPDMGEXP and CROPDMGEXP follows below guidelines
# 'K' - Thousands
# 'M' - Millions
# 'B' - Billions
dataset <- dataset %>% mutate(PROPDMGEXP_VALUE = case_when(
PROPDMGEXP == 'K' ~ 1000,
PROPDMGEXP == 'M' ~ 1000000,
PROPDMGEXP == 'B' ~ 1000000000,
TRUE ~ 0))
dataset <- dataset %>% mutate(CROPDMGEXP_VALUE = case_when(
CROPDMGEXP == 'K' ~ 1000,
CROPDMGEXP == 'M' ~ 1000000,
CROPDMGEXP == 'B' ~ 1000000000,
TRUE ~ 0))
#Normalize columns PROPDMG and CROPDMG with PROPDMGEXP_VALUE and CROPDMGEXP_VALUE
dataset <- dataset %>% mutate(PROPDMG_NORM = PROPDMG * PROPDMGEXP_VALUE)
dataset <- dataset %>% mutate(CROPDMG_NORM = CROPDMG * CROPDMGEXP_VALUE)
#Group By Event Type and find the sum of PROPDMG_NORM and CROPDMG_NORM
economic_impact_dataset <- dataset %>% group_by(EVTYPE) %>%
summarise(totalPropertyDamage = sum(PROPDMG_NORM),
totalCropDamage = sum(CROPDMG_NORM),
totalEconomicImpact = sum(totalPropertyDamage, totalCropDamage))
#Represent totalPropertyDamage, totalCropDamage, totalEconomicImpact in Billions of Dollars
#Also Round them off to just 2 decimals
economic_impact_dataset <- economic_impact_dataset %>%
mutate(totalPropertyDamage = round(totalPropertyDamage / 1000000000, 2)) %>%
mutate(totalCropDamage = round(totalCropDamage / 1000000000, 2)) %>%
mutate(totalEconomicImpact = round(totalEconomicImpact / 1000000000, 2))
top_10_economic_impactful_events <- economic_impact_dataset %>% arrange(desc(totalEconomicImpact)) %>% slice(1:10)
plot_economic_impact_events(top_10_economic_impactful_events)
plot_fatal_events <- function(top_10_fatal_events) {
library(ggplot2)
fatalites_plot <- ggplot(top_10_fatal_events, aes(x = reorder(EVTYPE, row), y = totalFatalities, fill = EVTYPE)) +
geom_bar(stat = "identity") +
geom_text(aes(x = EVTYPE, y = totalFatalities + 100 , label = totalFatalities), size = 3) +
labs( x = "Event", y = "Total Fatalities") +
labs(title = "Top 10 Events causing most fatalities") +
theme(plot.title = element_text(hjust = 0.5))
fatalites_plot
}
plot_injury_prone_events <- function(top_10_injury_prone_events) {
library(ggplot2)
injuries_plot <- ggplot(top_10_injury_prone_events, aes(x = reorder(EVTYPE, row), y = totalInjuries, fill = EVTYPE)) +
geom_bar(stat = "identity") +
geom_text(aes(x = EVTYPE, y = totalInjuries + 100 , label = totalInjuries), size = 3) +
labs( x = "Event", y = "Total Injuries") +
labs(title = "Top 10 Events causing most injuries") +
theme(plot.title = element_text(hjust = 0.5))
injuries_plot
}
plot_prop_damage_events <- function(top_10_prop_damage_events) {
library(ggplot2)
prop_damages_plot <- ggplot(top_10_prop_damage_events, aes(x = reorder(EVTYPE, row), y = totalPropertyDamage,
fill = EVTYPE)) +
geom_bar(stat = "identity") +
geom_text(aes(x = EVTYPE, y = totalPropertyDamage + 1 ,
label = totalPropertyDamage), size = 3) +
labs( x = "Event", y = "Total Property Damages in Billions of Dollars") +
labs(title = "Top 10 Events causing most property damages") +
theme(plot.title = element_text(hjust = 0.5))
prop_damages_plot
}
plot_crop_damage_events <- function(top_10_crop_damage_events) {
library(ggplot2)
crop_damages_plot <- ggplot(top_10_crop_damage_events, aes(x = reorder(EVTYPE, row), y = totalCropDamage,
fill = EVTYPE)) +
geom_bar(stat = "identity") +
geom_text(aes(x = EVTYPE, y = totalCropDamage + 0.25 , label = totalCropDamage), size = 3) +
labs( x = "Event", y = "Total Crop Damages in Billions of Dollars") +
labs(title = "Top 10 Events causing most crop damages") +
theme(plot.title = element_text(hjust = 0.5))
crop_damages_plot
}
################################## Human Impact Plots ##################################
top_10_fatal_events <- human_impact_dataset %>% arrange(desc(totalFatalities)) %>% slice(1:10) %>%
mutate(row = row_number())
fatalites_plot <- plot_fatal_events(top_10_fatal_events)
top_10_injury_prone_events <- human_impact_dataset %>% arrange(desc(totalInjuries)) %>% slice(1:10) %>%
mutate(row = row_number())
injuries_plot <- plot_injury_prone_events(top_10_injury_prone_events)
human_impact_plot_list <- list(fatalites_plot, injuries_plot)
#########################################################################################
################################## Economic Impact Plots ###################################
top_10_prop_damage_events <- economic_impact_dataset %>% select(EVTYPE, totalPropertyDamage) %>%
arrange(desc(totalPropertyDamage)) %>%
slice(1:10) %>%
mutate(row = row_number())
prop_damages_plot <- plot_prop_damage_events(top_10_prop_damage_events)
top_10_crop_damage_events <- economic_impact_dataset %>% select(EVTYPE, totalCropDamage) %>%
arrange(desc(totalCropDamage)) %>%
slice(1:10) %>%
mutate(row = row_number())
crop_damages_plot <- plot_crop_damage_events(top_10_crop_damage_events)
economic_impact_plot_list <- list(prop_damages_plot, crop_damages_plot)
############################################################################################
#Plot Human and Economic Plots in 2 x 2 panels
require(gridExtra)
grid.arrange(grobs = c(human_impact_plot_list, economic_impact_plot_list), ncol = 2)