This study was written based on the data collect from the U.S. National Oceanic and Atmospheric Administration’s (NOAA). As per our analysis we can conclude that:
The injuries and fatalities caused by Tornado is the most harmful event types according our analyses, causing close to 100k of health damages including injuries and fatalities.
The highest property damages is represented by FLOOD with more than USD 1500B in damages, and represents close to 30% of the total loose.
Loading the data
temp <- tempfile()
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", temp)
weather <- read.csv(temp)
unlink(temp)
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf", "curr.pdf", method = "curl")
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2FNCDC%20Storm%20Events-FAQ%20Page.pdf", method = "curl", "Events-FAQ20Page.pdf")
str(weather)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
Selecting only the data needed
weather <- weather %>%
select(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
dim(weather)
## [1] 902297 7
head(weather)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 0 15 25.0 K 0
## 2 TORNADO 0 0 2.5 K 0
## 3 TORNADO 0 2 25.0 K 0
## 4 TORNADO 0 2 2.5 K 0
## 5 TORNADO 0 2 2.5 K 0
## 6 TORNADO 0 6 2.5 K 0
checking if there are some NA values.
table(is.na(weather))
##
## FALSE
## 6316079
There are not any NA values, we can continue
According to NATIONAL WEATHER SERVICE INSTRUCTION 10-1605 there are 48 event types. Wee need to check if our data is the same
len <- length(unique(weather$EVTYPE))
print(len)
## [1] 0
There are 0 data types, maybe we need to clean up if our questions were not answered correctly.
Now, we need to work in the amounts of damages.
unique(str_to_upper(weather$PROPDMGEXP))
## [1] "K" "M" "" "B" "+" "0" "5" "6" "?" "4" "2" "3" "H" "7" "-" "1" "8"
unique(str_to_upper(weather$CROPDMGEXP))
## [1] "" "M" "K" "B" "?" "0" "2"
As we can see, the types into CROPDMGEXP is included into PROPDMGEXP
# convert to upper letters
weather$PROPDMGEXP <- str_to_upper(weather$PROPDMGEXP)
weather$CROPDMGEXP <- str_to_upper(weather$CROPDMGEXP)
#creating a new dataframe
expn <- weather %>%
select(PROPDMGEXP) %>%
unique() %>%
arrange(PROPDMGEXP) %>%
mutate(EXP = c(rep(1, 5), rep(10, 9)^seq(1, 9), 100, 1000, 1000000))
# Checking Data types and values
str(expn)
## 'data.frame': 17 obs. of 2 variables:
## $ PROPDMGEXP: chr "" "-" "?" "+" ...
## $ EXP : num 1e+00 1e+00 1e+00 1e+00 1e+00 1e+01 1e+02 1e+03 1e+04 1e+05 ...
print(expn)
## PROPDMGEXP EXP
## 1 1e+00
## 2 - 1e+00
## 3 ? 1e+00
## 4 + 1e+00
## 5 0 1e+00
## 6 1 1e+01
## 7 2 1e+02
## 8 3 1e+03
## 9 4 1e+04
## 10 5 1e+05
## 11 6 1e+06
## 12 7 1e+07
## 13 8 1e+08
## 14 B 1e+09
## 15 H 1e+02
## 16 K 1e+03
## 17 M 1e+06
weather <- weather %>%
merge(expn, by.x = "PROPDMGEXP", by.y = "PROPDMGEXP", all.x = TRUE) %>%
rename(PEXP = EXP) %>%
merge(expn, by.x = "CROPDMGEXP", by.y = "PROPDMGEXP", all.x = TRUE) %>%
rename(CEXP = EXP) %>%
mutate(PROPDMGMount = PROPDMG * PEXP, CROPDMGMount = CROPDMG * CEXP)
head(weather)
## CROPDMGEXP PROPDMGEXP EVTYPE FATALITIES INJURIES PROPDMG CROPDMG PEXP CEXP
## 1 TSTM WIND 0 0 0 0 1 1
## 2 TSTM WIND 0 0 0 0 1 1
## 3 TSTM WIND 0 0 0 0 1 1
## 4 TSTM WIND 0 0 0 0 1 1
## 5 TSTM WIND 0 0 0 0 1 1
## 6 TSTM WIND 0 0 0 0 1 1
## PROPDMGMount CROPDMGMount
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
Summarizing all the event types in descending order to identify how long is the data and what values are highest.
harmful <- weather %>%
group_by(EVTYPE) %>%
summarise(injuries = sum(INJURIES), fatalities = sum(FATALITIES)) %>%
arrange(desc(injuries)) %>%
print()
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 985 x 3
## EVTYPE injuries fatalities
## <chr> <dbl> <dbl>
## 1 TORNADO 91346 5633
## 2 TSTM WIND 6957 504
## 3 FLOOD 6789 470
## 4 EXCESSIVE HEAT 6525 1903
## 5 LIGHTNING 5230 816
## 6 HEAT 2100 937
## 7 ICE STORM 1975 89
## 8 FLASH FLOOD 1777 978
## 9 THUNDERSTORM WIND 1488 133
## 10 HAIL 1361 15
## # ... with 975 more rows
Summarizing in OTHERS the less injuries from position 10 to the end of the list.
othersh <- harmful %>%
filter(between(row_number(), 10, n())) %>%
summarise(EVTYPE = "OTHERS(976 EVTYPES)", injuries = sum(injuries), fatalities = sum(fatalities)) %>%
print()
## # A tibble: 1 x 3
## EVTYPE injuries fatalities
## <chr> <dbl> <dbl>
## 1 OTHERS(976 EVTYPES) 16341 3682
Summarizing the tables in only 10 rows in order to identify the most harmful.
harmful <- harmful %>%
filter(between(row_number(), 1, 9)) %>%
full_join(othersh) %>%
mutate(total = injuries + fatalities) %>%
print()
## Joining, by = c("EVTYPE", "injuries", "fatalities")
## # A tibble: 10 x 4
## EVTYPE injuries fatalities total
## <chr> <dbl> <dbl> <dbl>
## 1 TORNADO 91346 5633 96979
## 2 TSTM WIND 6957 504 7461
## 3 FLOOD 6789 470 7259
## 4 EXCESSIVE HEAT 6525 1903 8428
## 5 LIGHTNING 5230 816 6046
## 6 HEAT 2100 937 3037
## 7 ICE STORM 1975 89 2064
## 8 FLASH FLOOD 1777 978 2755
## 9 THUNDERSTORM WIND 1488 133 1621
## 10 OTHERS(976 EVTYPES) 16341 3682 20023
f <- ggplot(harmful, aes(injuries, EVTYPE))
print(f + geom_bar(stat = "identity", fill = "blue") + labs(title = "Injuries Accross United states", y = "Event Type", x = "Injuries"))
f <- ggplot(harmful, aes(fatalities, EVTYPE))
print(f + geom_bar(stat = "identity", fill = "blue") + labs(title = "Fatalities Accross United states", y = "Event Type", x = "Fatalities"))
According the table and plot the most harmful is TORNADO with 91346 injuries and 5633 fatalities.
Summarizing all the event types in descending order to identify which event type is the highest.
dmg <- weather %>%
group_by(EVTYPE) %>%
summarise(damages = sum(PROPDMGMount) + sum(CROPDMGMount)) %>%
arrange(desc(damages)) %>%
print()
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 985 x 2
## EVTYPE damages
## <chr> <dbl>
## 1 FLOOD 150319678257
## 2 HURRICANE/TYPHOON 71913712800
## 3 TORNADO 57362333946.
## 4 STORM SURGE 43323541000
## 5 HAIL 18761221986.
## 6 FLASH FLOOD 18243991078.
## 7 DROUGHT 15018672000
## 8 HURRICANE 14610229010
## 9 RIVER FLOOD 10148404500
## 10 ICE STORM 8967041360
## # ... with 975 more rows
Summarizing in OTHERS the less damages from position 10 to the end of the list.
othersd <- dmg %>%
filter(between(row_number(), 10, n())) %>%
summarise(EVTYPE = "OTHERS(976 EVTYPES)", damages = sum(damages)) %>%
print()
## # A tibble: 1 x 2
## EVTYPE damages
## <chr> <dbl>
## 1 OTHERS(976 EVTYPES) 1076299.
Summarizing the tables in only 10 rows in order to identify the most harmful.
dmg <- dmg %>%
filter(between(row_number(), 1, 9)) %>%
full_join(othersd) %>%
mutate(dmgM = damages / 100000000) %>%
mutate(percent = paste(format(100 * damages/sum(damages), scientific = FALSE), "%", sep = "")) %>%
print()
## Joining, by = c("EVTYPE", "damages")
## # A tibble: 10 x 4
## EVTYPE damages dmgM percent
## <chr> <dbl> <dbl> <chr>
## 1 FLOOD 150319678257 1503. "37.6078564780%"
## 2 HURRICANE/TYPHOON 71913712800 719. "17.9917933643%"
## 3 TORNADO 57362333946. 574. "14.3512442770%"
## 4 STORM SURGE 43323541000 433. "10.8389369305%"
## 5 HAIL 18761221986. 188. " 4.6937922697%"
## 6 FLASH FLOOD 18243991078. 182. " 4.5643884156%"
## 7 DROUGHT 15018672000 150. " 3.7574592203%"
## 8 HURRICANE 14610229010 146. " 3.6552725637%"
## 9 RIVER FLOOD 10148404500 101. " 2.5389872061%"
## 10 OTHERS(976 EVTYPES) 1076299. 0.0108 " 0.0002692748%"
f <- ggplot(dmg, aes(dmgM, EVTYPE))
print(f + geom_bar(stat = "identity", fill = "blue") + labs(title = "Damages Accross United states", y = "Event Type", x = "Damages (B USD)"))
According the table and plot the most harmful is FLOOD with 150319678257 USD in damages.