The data for this assignment come from the U.S. National Oceanic and Atmospheric Administration's (NOAA) storm database, which records when each storm event occurred along with estimates of the resulting fatalities, injuries, and property damage.
The data set can be downloaded from the dataset link.
For a detailed explanation of the individual parameters, please refer to the documentation link.
Obtain the raw data file and load it into a data frame for filtering.
## load necessary packages
library(data.table)
library(ggplot2)
library(dplyr)
# Download the compressed storm data archive (only if it is not already on
# disk), read it, and convert it to a data.table for the later steps.
PA02 <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
stormFile <- "StormData.csv.bz2"
# Skip the download when the archive already exists locally, so re-running
# the analysis does not fetch the same file again.
if (!file.exists(stormFile)) {
  # mode = "wb" requests a binary transfer so the archive is written
  # byte-for-byte regardless of platform.
  download.file(PA02, stormFile, mode = "wb")
}
# read.csv() reads the bzip2-compressed file directly.
stormF <- read.csv(stormFile)
# Converting data.frame to data.table
stormT <- as.data.table(stormF)
# Keep only the first 547362 records, as the original analysis does.
# NOTE(review): the reason for this exact cut-off is not visible here; confirm.
# head() returns at most that many rows, so a shorter table is not padded
# with all-NA rows the way an explicit 1:547362 index would be.
stormT <- head(stormT, 547362L)
# Convert the damage and casualty columns to numeric values.
# as.numeric() has no na.rm argument (it was silently ignored before), so it
# is dropped; values that cannot be parsed still become NA.
numCols <- c("PROPDMG", "CROPDMG", "FATALITIES", "INJURIES")
stormT[, (numCols) := lapply(.SD, as.numeric), .SDcols = numCols]
str(stormT)
## Classes 'data.table' and 'data.frame': 547362 obs. of 37 variables:
## $ STATE__ : chr "1.00" "1.00" "1.00" "1.00" ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : chr "97.00" "3.00" "57.00" "89.00" ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : chr "0.00" "0.00" "0.00" "0.00" ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: chr "0.00" "0.00" "0.00" "0.00" ...
## $ COUNTYENDN: chr "" "" "" "" ...
## $ END_RANGE : chr "0.00" "0.00" "0.00" "0.00" ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : chr "14.00" "2.00" "0.10" "0.00" ...
## $ WIDTH : chr "100.00" "150.00" "123.00" "100.00" ...
## $ F : chr "3" "2" "2" "2" ...
## $ MAG : chr "0.00" "0.00" "0.00" "0.00" ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : chr "3040.00" "3042.00" "3340.00" "3458.00" ...
## $ LONGITUDE : chr "8812.00" "8755.00" "8742.00" "8626.00" ...
## $ LATITUDE_E: chr "3051.00" "0.00" "0.00" "0.00" ...
## $ LONGITUDE_: chr "8806.00" "0.00" "0.00" "0.00" ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : chr "1.00" "2.00" "3.00" "4.00" ...
## - attr(*, ".internal.selfref")=<externalptr>
Subset the data to the parameters of interest only: the event type, fatalities, injuries, and property and crop damage (with their exponent columns).
# Reduce the table with dplyr: keep the event type plus the casualty and
# damage columns, drop the placeholder event type "?", and keep only rows
# that report at least one injury, fatality, or non-zero damage figure.
stormT <- filter(
  select(
    stormT,
    EVTYPE, FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
  ),
  EVTYPE != "?",
  INJURIES > 0 | FATALITIES > 0 | PROPDMG > 0 | CROPDMG > 0
)
# Normalise both damage-exponent columns to upper case (in place) so that,
# for example, "k" and "K" are treated as the same code.
exp <- c("PROPDMGEXP", "CROPDMGEXP")
stormT[, (exp) := lapply(.SD, toupper), .SDcols = exp]
# Multiplier lookup for the property damage exponent codes.
# The first key is the two-character string of double quotes, not the empty
# string; blank exponent values therefore do not match it and are handled
# by the NA default applied after the lookup.
propDmgExp <- c(
  # Symbols and "0" leave the damage figure unscaled.
  setNames(rep(10^0, 4), c("\"\"", "-", "+", "0")),
  # Digits 1-9 are powers of ten.
  setNames(10^(1:9), as.character(1:9)),
  # Letter codes: hundred, thousand, million, billion.
  setNames(10^c(2, 3, 6, 9), c("H", "K", "M", "B"))
)
# Multiplier lookup for the crop damage exponent codes.
cropDmgExp <- setNames(
  10^c(0, 0, 0, 3, 6, 9),
  c("\"\"", "?", "0", "K", "M", "B")
)
# Replace each exponent code with its numeric multiplier by looking the code
# up (by name) in the corresponding multiplier vector.
stormT[, PROPDMGEXP := propDmgExp[as.character(PROPDMGEXP)]]
stormT[, CROPDMGEXP := cropDmgExp[as.character(CROPDMGEXP)]]
# Codes with no entry in the lookup (blank values included) come back as NA;
# give them the neutral multiplier 1 so the damage figure is left unscaled.
stormT[is.na(PROPDMGEXP), PROPDMGEXP := 1]
stormT[is.na(CROPDMGEXP), CROPDMGEXP := 1]
# Add the cost columns: damage figure multiplied by its numeric multiplier.
# PROPDMG and CROPDMG were already converted with as.numeric() earlier, so
# no further conversion is needed here.
stormT <- mutate(
  stormT,
  Cost_Prop = PROPDMG * PROPDMGEXP,
  Cost_Crop = CROPDMG * CROPDMGEXP
)
# Summarise cost per event type: total property cost, total crop cost, and
# their combined total.
Cost_Total <- stormT[, .(
  Cost_Prop = sum(Cost_Prop),
  Cost_Crop = sum(Cost_Crop),
  Cost_Total = sum(Cost_Prop + Cost_Crop)
), by = .(EVTYPE)]
# Rank event types from highest to lowest combined cost.
Cost_Total <- Cost_Total %>%
  arrange(desc(Cost_Total))
# Keep the ten costliest event types. head() is used instead of [1:10, ] so
# that a table with fewer than ten event types is not padded with all-NA rows
# (which would otherwise show up as an NA category in the chart).
Cost_Total <- head(Cost_Total, 10)
# Show the five costliest event types.
head(Cost_Total, 5)
## EVTYPE Cost_Prop Cost_Crop Cost_Total
## 1: TORNADO 40987926487 216014370 41203940857
## 2: HURRICANE/TYPHOON 19403415000 586770800 19990185800
## 3: FLOOD 12338106477 2333949550 14672056027
## 4: HURRICANE 9400719010 2561400000 11962119010
## 5: DROUGHT 845298000 9860245000 10705543000
# Summarise casualties per event type: fatalities, injuries, and their
# combined count (stored as TOTAL_INJURIES).
Inj_Tot <- stormT[, .(
  FATALITIES = sum(FATALITIES),
  INJURIES = sum(INJURIES),
  TOTAL_INJURIES = sum(FATALITIES + INJURIES)
), by = .(EVTYPE)]
# Rank event types from highest to lowest fatality count.
Inj_Tot <- Inj_Tot[order(-FATALITIES), ]
# Keep the ten deadliest event types. head() is used instead of [1:10, ] so
# that a table with fewer than ten event types is not padded with all-NA rows.
Inj_Tot <- head(Inj_Tot, 10)
head(Inj_Tot, 5)
## EVTYPE FATALITIES INJURIES TOTAL_INJURIES
## 1: TORNADO 4658 80084 84742
## 2: EXCESSIVE HEAT 1416 4354 5770
## 3: HEAT 708 878 1586
## 4: LIGHTNING 562 3628 4190
## 5: FLASH FLOOD 559 1407 1966
This chart ranks the events that caused the greatest economic impact in terms of crop and property damage.
# Reshape the cost summary to long format (one row per event type and cost
# measure) so the measures can be drawn as grouped bars.
eimpact <- melt(
  Cost_Total,
  id.vars = "EVTYPE",
  measure.vars = c("Cost_Prop", "Cost_Crop", "Cost_Total"),
  variable.name = "Damage_Type"
)
head(eimpact, 5)
## EVTYPE Damage_Type value
## 1: TORNADO Cost_Prop 40987926487
## 2: HURRICANE/TYPHOON Cost_Prop 19403415000
## 3: FLOOD Cost_Prop 12338106477
## 4: HURRICANE Cost_Prop 9400719010
## 5: DROUGHT Cost_Prop 845298000
# Grouped bar chart of property, crop, and total cost for the top event
# types, with events ordered along the x axis by descending value.
# The y axis shows damage cost, not a count, so it is labelled accordingly.
# NOTE(review): the unit is assumed to be US dollars, per the NOAA storm data
# documentation - confirm.
econChart <- ggplot(eimpact, aes(x = reorder(EVTYPE, -value), y = value)) +
  # geom_col() draws bar heights straight from the y values.
  geom_col(aes(fill = Damage_Type), position = "dodge") +
  labs(
    x = "Event Types",
    y = "Estimated Damage (USD)",
    title = "Top 10 US Storm Events with Strong Economic Impact"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    axis.text.y = element_text(size = 8),
    axis.title = element_text(size = 8),
    plot.title = element_text(hjust = 0.5, size = 11),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7)
  ) +
  # One colour per cost measure.
  scale_fill_manual(values = c("#45B39D", "#8A14B3", "#E02A5F"))
econChart
This chart ranks the events that caused the most fatalities and injuries among the population.
# Reshape the casualty summary to long format (one row per event type and
# casualty measure) so the measures can be drawn as grouped bars.
chaos <- melt(
  Inj_Tot,
  id.vars = "EVTYPE",
  measure.vars = c("FATALITIES", "INJURIES", "TOTAL_INJURIES"),
  variable.name = "Chaos_Level"
)
head(chaos, 5)
## EVTYPE Chaos_Level value
## 1: TORNADO FATALITIES 4658
## 2: EXCESSIVE HEAT FATALITIES 1416
## 3: HEAT FATALITIES 708
## 4: LIGHTNING FATALITIES 562
## 5: FLASH FLOOD FATALITIES 559
# Grouped bar chart of fatalities, injuries, and their combined count for the
# top event types, with events ordered along the x axis by descending value.
InjuryChart <- ggplot(chaos, aes(x = reorder(EVTYPE, -value), y = value)) +
  # geom_col() is the shorthand for geom_bar(stat = "identity").
  geom_col(aes(fill = Chaos_Level), position = "dodge") +
  labs(
    x = "Event Types",
    y = "Frequency",
    title = "Top 10 US Fatalities Events"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 10),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    axis.text.y = element_text(size = 8),
    axis.title = element_text(size = 8),
    legend.title = element_text(size = 7),
    legend.text = element_text(size = 7)
  ) +
  # One colour per casualty measure.
  scale_fill_manual(values = c("#45B39D", "#8A14B3", "#E02A5F"))
InjuryChart