This reproducible research aims at exploring the NOAA Storm Database and answer some basic questions about severe weather events:
National Weather Service Storm Data Documentation, which shows us how some of the variables are constructed/defined. (https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf)
setwd("C:/Users/Anhuynh/Desktop/Data Science_Cousera/Reproducible Research/Assignment/data 2")
library(readr)
dataStorm <- read.csv("repdata_data_StormData.csv.bz2", header = TRUE, sep = ",", quote = "\"")
str(dataStorm)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
library(plyr)
# Count on number of fatalities and figure out the most harmful event
subgroup_fat <- ddply(dataStorm, ~EVTYPE, summarise, num_fatalities = sum(FATALITIES))
data.frame(subgroup_fat[which.max(subgroup_fat$num_fat), ])
## EVTYPE num_fatalities
## 834 TORNADO 5633
# Count on number of injuries and figure out the most harmful event
subgroup_inj <- ddply(dataStorm, ~EVTYPE, summarise, num_injuries = sum(INJURIES))
data.frame(subgroup_inj[which.max(subgroup_inj$num_injuries), ])
## EVTYPE num_injuries
## 834 TORNADO 91346
The below graphs demonstrate the serious influences of Tornado on number of fatalities and injuries in comparison with the second and the third harmful ones, Excessive Heat and Flood respectively.
library(ggplot2)
# Create subsets for comparison of events
subA <- grep("^[A-Ia-i]", dataStorm$EVTYPE, value=TRUE)
subB <- grep("^[L-Wl-w]", dataStorm$EVTYPE, value=TRUE)
subgroupA <- subset(dataStorm, EVTYPE %in% subA)
subgroupB <- subset(dataStorm, EVTYPE %in% subB)
# Figure out the 1st and 2nd harmful event on group of fatalities
subgroupA_fat <- ddply(subgroupA, ~EVTYPE, summarise, num_fatalities = sum(FATALITIES, na.rm=TRUE))
A <- data.frame(subgroupA_fat[which.max(subgroupA_fat$num_fatalities), ])
subgroupB_fat <- ddply(subgroupB, ~EVTYPE, summarise, num_fatalities = sum(FATALITIES, na.rm=TRUE))
B <- data.frame(subgroupB_fat[which.max(subgroupB_fat$num_fatalities), ])
AB <- rbind(A, B)
AB
## EVTYPE num_fatalities
## 121 EXCESSIVE HEAT 1903
## 399 TORNADO 5633
# Figure out the 1st and 2nd harmful event on group of injuries
subgroupC_inj <- ddply(subgroupA, ~EVTYPE, summarise, num_injuries = sum(INJURIES, na.rm=TRUE))
C <- data.frame(subgroupC_inj[which.max(subgroupC_inj$num_injuries), ])
subgroupD_inj <- ddply(subgroupB, ~EVTYPE, summarise, num_injuries = sum(INJURIES, na.rm=TRUE))
D <- data.frame(subgroupD_inj[which.max(subgroupD_inj$num_injuries), ])
CD <- rbind(C, D)
CD
## EVTYPE num_injuries
## 161 FLOOD 6789
## 399 TORNADO 91346
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
colnames(AB) <- c("EVTYPE", "num_event")
colnames(CD) <- c("EVTYPE", "num_event")
com_ABCD <- rbind(AB, CD)
com_ABCD <- tapply(com_ABCD$num_event, com_ABCD$EVTYPE, FUN=sum)
com_ABCD = data.frame(EVTYPE = c("EXCESSIVE HEAT", " FLOOD ", "TORNADO "),
num_event = c(1903, 6789, 96979) )
com_ABCD <- com_ABCD %>%
group_by(EVTYPE) %>%
mutate(cum_ev = cumsum(num_event)) %>%
mutate(sum_ev = sum(num_event) )
ggplot(data=com_ABCD, aes(x = reorder(EVTYPE, -sum_ev), y=num_event, fill=EVTYPE)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=cum_ev, label=num_event), vjust=1, color="black", size=3) +
labs(title="Top 3 Events impacting Population Health") +
labs(x="Event Type",y="Total number of Fatalities and Injuries (cases)") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1))
Figure 01: Top 3 Events impacting Population Health
# Create new subset for calculating economic damages.
sub_dataStorm <- subset(dataStorm, select = c("STATE",
"EVTYPE",
"FATALITIES",
"INJURIES",
"PROPDMG",
"PROPDMGEXP",
"CROPDMG",
"CROPDMGEXP"))
# Replace numeric value to PROPDMGEXP
pEXP <- replace(sub_dataStorm$PROPDMGEXP, sub_dataStorm$PROPDMGEXP %in% c("\"\"",
"-",
"?",
"+",
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"B",
"h",
"H",
"K",
"m",
"M"), c(0,
0,
0,
10^0,
10^1,
10^1,
10^1,
10^1,
10^1,
10^1,
10^1,
10^1,
10^1,
10^9,
10^2,
10^2,
10^3,
10^6,
10^6))
## Warning in x[list] <- values: number of items to replace is not a multiple of
## replacement length
# Replace numeric value to CROPDMGEXP
cEXP <- replace(sub_dataStorm$CROPDMGEXP, sub_dataStorm$CROPDMGEXP %in% c("\"\"",
"?",
"0",
"2",
"B",
"k",
"K",
"m",
"M"), c(0,
0,
10^1,
10^1,
10^9,
10^3,
10^3,
10^6,
10^6))
## Warning in x[list] <- values: number of items to replace is not a multiple of
## replacement length
# Create data with new values to PROPDMGEXP (Property) and CROPDMGEXP (Crop)
final_dataStorm <- mutate(sub_dataStorm, PROPDMGEXP = pEXP, CROPDMGEXP = cEXP)
final_dataStorm <- mutate(final_dataStorm, valuePROP = PROPDMG * as.numeric(as.character(PROPDMGEXP)))
final_dataStorm <- mutate(final_dataStorm, valueCROP = CROPDMG * as.numeric(as.character(CROPDMGEXP)))
final_dataStorm <- final_dataStorm[which(final_dataStorm$CROPDMG >0 & final_dataStorm$PROPDMG >0), ]
head(final_dataStorm)
## STATE EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP
## 187566 AL HURRICANE OPAL/HIGH WINDS 2 0 0.1 100
## 187571 AL THUNDERSTORM WINDS 0 0 5.0 1000
## 187581 AL HURRICANE ERIN 0 0 25.0 0
## 187583 AL HURRICANE OPAL 0 0 48.0 0
## 187584 AL HURRICANE OPAL 0 0 20.0 1
## 187653 AL THUNDERSTORM WINDS 0 0 50.0 10
## CROPDMG CROPDMGEXP valuePROP valueCROP
## 187566 10 0 10 0e+00
## 187571 500 0 5000 0e+00
## 187581 1 10 0 1e+01
## 187583 4 10 0 4e+01
## 187584 10 1e+09 20 1e+10
## 187653 50 1000 500 5e+04
library(dplyr)
require(ggplot2)
require(ggmap)
## Loading required package: ggmap
## Warning: package 'ggmap' was built under R version 4.0.3
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
require(maps)
## Loading required package: maps
##
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
##
## ozone
library(usdata)
## Warning: package 'usdata' was built under R version 4.0.3
prop <- aggregate(valuePROP ~ EVTYPE, final_dataStorm, sum, na.rm=TRUE)
prop_ev = prop[order(prop$valuePROP, decreasing = TRUE), ]
sort_prop_ev <- head(prop_ev, 10)
sort_prop_ev
## EVTYPE valuePROP
## 10 FLASH FLOOD 1.173593e+13
## 25 HAIL 1.119754e+13
## 74 TORNADO 9.764030e+12
## 14 FLOOD 7.623682e+12
## 81 TSTM WIND 6.640148e+12
## 65 THUNDERSTORM WINDS 4.613926e+12
## 64 THUNDERSTORM WIND 3.176501e+12
## 77 TROPICAL STORM 7.009258e+11
## 85 URBAN FLOOD 6.001702e+11
## 86 URBAN FLOODING 5.052504e+11
crop <- aggregate(valueCROP ~ EVTYPE, final_dataStorm, sum, na.rm=TRUE)
crop_ev = crop[order(crop$valueCROP, decreasing = TRUE), ]
sort_crop_ev <- head(crop_ev, 10)
sort_crop_ev
## EVTYPE valueCROP
## 25 HAIL 4.234878e+13
## 10 FLASH FLOOD 2.174994e+13
## 14 FLOOD 1.731630e+13
## 74 TORNADO 1.188096e+13
## 64 THUNDERSTORM WIND 9.637671e+12
## 81 TSTM WIND 9.091257e+12
## 38 HIGH WIND 1.759873e+12
## 65 THUNDERSTORM WINDS 1.753841e+12
## 82 TSTM WIND/HAIL 1.466429e+12
## 12 FLASH FLOODING 1.165821e+12
subgroup_ev <- ddply(final_dataStorm, .(EVTYPE), summarise,
damage_val = sum(valuePROP + valueCROP)/1000000)
data.frame(subgroup_ev[which.max(subgroup_ev$damage_val), ])
## EVTYPE damage_val
## 14 FLOOD 24939981
# Sorting data for creating graph chart.
subgroup_ev = subgroup_ev[order(subgroup_ev$damage_val, decreasing = TRUE), ]
sort_subgroup_ev <- head(subgroup_ev, 10)
# Data visualization
sort_subgroup_ev$value <- round(sort_subgroup_ev$damage_val)
sort_subgroup_ev <- sort_subgroup_ev %>%
group_by(EVTYPE) %>%
mutate(cum_val = cumsum(damage_val)) %>%
mutate(sum_val = sum(damage_val) )
ggplot(data=sort_subgroup_ev, aes(x = reorder(EVTYPE, -sum_val), y=damage_val, fill=EVTYPE)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=cum_val, label=""), vjust=1, color="black", size=3) +
labs(title="Top 10 Events impacting Economy in the US") +
labs(x="Event Type",y="Total amount of property and crop damages (millions)") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1))
Figure 02: Top 10 Events impacting the economy (Properties and Crops) in the US
subgroup_ev <- ddply(final_dataStorm, .(EVTYPE, STATE), summarise,
damage_val = sum(valuePROP + valueCROP)/1000000)
data.frame(subgroup_ev[which.max(subgroup_ev$damage_val), ])
## EVTYPE STATE damage_val
## 159 HAIL NE 23093439
Among the states, Nebraska was the most suffering from severe Hail when damage values reached to 23,093,439 (US Dollar).
states_map <- map_data("state")
df <- data.frame(state = tolower(abbr2state(subgroup_ev$STATE)), subgroup_ev$damage_val)
names(df) <- c("region", "value")
join_df <- left_join(df, states_map, by = "region")
mp <- ggplot() +
geom_polygon(data=states_map, aes(x=long, y=lat, group = group), fill="grey92" ) + ggtitle("Economic Damages States Map") +
geom_point(data=join_df, aes(x=long, y=lat, size = value, color = value)) +
guides(size=guide_legend("Event Index")) +
theme_void()
mp
## Warning: Removed 3991 rows containing missing values (geom_point).
Figure 03: Economic Damages (Properties and Crops loss) States Map