Synopsis:
Data can be downloaded into the working directory from here
# Path of the bzip2-compressed storm data CSV, relative to the working
# directory.
file <- "repdata_data_StormData.csv.bz2"
# Load the whole data set; the .bz2 path is handed to read.csv as-is.
NOAA <- read.csv(file = file)
The EVTYPE variable contains meaningless values such as “Summary of August 10”, which seem out of place in the data. Create a data.frame without these rows.
# Drop the "Summary ..." pseudo-events from the raw data.
# The match is case-insensitive so that upper-case variants such as
# "SUMMARY OF MARCH 29" are removed along with "Summary of August 10".
# A logical mask from grepl() is used instead of a negative grep() index:
# NOAA[-integer(0), ] would select zero rows if nothing matched, whereas the
# mask keeps every row in that case.
subset <- NOAA[!grepl("summary", NOAA$EVTYPE, ignore.case = TRUE), ]
Clean up the EVTYPE variable by changing all values to upper case and view them in table form.
# Normalise the raw event labels before pattern matching: convert to upper
# case and strip leading/trailing whitespace. Without the trim, labels such as
# " WATERSPOUT" or " TSTM WIND" cannot match the rules anchored with ^ or $
# further down and may be left uncleaned.
evtype_list <- gsub("^[[:space:]]+|[[:space:]]+$", "", toupper(subset$EVTYPE))
head(evtype_list)
## [1] "TORNADO" "TORNADO" "TORNADO" "TORNADO" "TORNADO" "TORNADO"
The uncleaned EVTYPE are all in upper case now
# Distinct event labels in ascending order, then a peek at the first six.
evtype_list_sorted <- sort(unique(evtype_list), decreasing = FALSE)
head(evtype_list_sorted, n = 6L)
## [1] " HIGH SURF ADVISORY" " COASTAL FLOOD" " FLASH FLOOD"
## [4] " LIGHTNING" " TSTM WIND" " TSTM WIND (G45)"
And the uncleaned EVTYPE are sorted in order.
Many different event types exist, and they need cleaning up to categorise into any of the 48 allowable events
# Ordered rewrite rules mapping raw (upper-case) event labels onto the
# official event names. Each row is c(pattern, replacement).
#
# ORDER MATTERS: the rules are applied top to bottom and earlier, more
# specific rules shield labels from later, broader ones (e.g. the MARINE rules
# run before the generic THUNDERSTORM / HAIL / WIND rules). The patterns only
# contain upper-case letters while the replacements are mixed case, so a label
# that has already been rewritten is not picked up again by a later rule.
evtype_rules <- matrix(
  c(
    "(.*)LAKE(.*)FLOOD(.*)", "Lakeshore Flood",
    "(.*)MARINE HA(.*)", "Marine Hail",
    "(.*)MARINE HI(.*)", "Marine High Wind",
    "(.*)MARINE ST(.*)", "Marine Strong Wind",
    "(.*)MARINE(.*)ST(.*)", "Marine Thunderstorm Wind",
    "(.*)THUNDERSTORM(.*)", "Thunderstorm Wind",
    "^TSTM(.*)", "Thunderstorm Wind",
    "(.*)NON(.*)TSTM(.*)", "Strong Wind",
    "^HEAVY ( ?)RA(.*)", "Heavy Rain",
    "(.*)RAIN(.*)SLEET(.*)", "Winter Storm",
    "(.*)SLEET(.*)RAIN(.*)", "Winter Storm",
    "(.*)SNOW(.*)SLEET(.*)", "Winter Storm",
    "(.*)SLEET(.*)", "Sleet",
    "(.*)SURGE(.*)", "Storm Surge/Tide",
    "(.*)RIP(.*)", "Rip Current",
    "(.*)WINTER STORM(.*)", "Winter Storm",
    "(.*)WIN(.*)(WEATHER|(MIX$))(.*)", "Winter Weather",
    "(.*)EXTREME(.*)C(.*)", "Extreme Cold/Wind Chill",
    "(.*)FIRE(.*)", "Wildfire",
    "^W(.*){4,5}SPOUT(.*){1}$", "Waterspout",
    "(.*)VOL(.*)", "Volcanic Ash",
    "(.*)ASTRO(.*)LOW(.*)", "Astronomical Low Tide",
    "(.*)FLASH(.*)", "Flash Flood",
    "(^BLIZZARD(.*))|(^GROUND(.*))", "Blizzard",
    "^AV(.*)", "Avalanche",
    "^HIGH WI(.*)", "High Wind",
    "(.*)COA(.*)", "Coastal Flood",
    "(.*)SLIDE(.*)", "Debris Flow",
    "(.*)SMOKE(.*)", "Dense Smoke",
    "((.*)DENSE FOG(.*))|^FOG$", "Dense Fog",
    "(.*)FOG(.*)", "Freezing Fog",
    "(.*)EXCESSIVE(.*)H(.*)", "Excessive Heat",
    "(.*)DROUGHT$", "Drought",
    "(.*)DEV(.*)", "Dust Devil",
    "(.*)DU(.*)", "Dust Storm",
    "(.*)EXTREME H(.*)", "Excessive Heat",
    "(.*)EX(.*) C(.*)", "Extreme Cold/Wind Chill",
    "^H(.*)SN(.*)", "Heavy Snow",
    "(.*)FLOOD(.*)", "Flood",
    "((.*)FROST(.*))|((.*)FREEZE(.*))", "Frost/Freeze",
    "(.*)FUNNEL(.*)", "Funnel Cloud",
    "(.*)TORN(.*)", "Tornado",
    "(.*)HAIL(.*)", "Hail",
    "(.*)HEAT(.*)", "Heat",
    "(.*)LIGHTN(.*)", "Lightning",
    "(.*)HEAVY R(.*)", "Heavy Rain",
    "(.*)HEAVY SN(.*)", "Heavy Snow",
    "(.*)SURF(.*)", "High Surf",
    "((.*)HUR(.*))|((.*)TYPHOON(.*))", "Hurricane (Typhoon)",
    "(.*)ICE STORM(.*)", "Ice Storm",
    "(.*)LAKE(.*)SNOW(.*)", "Lake-Effect Snow",
    "(.*)(RECORD|SEVERE|PROLONG|BITTER) COLD(.*)", "Extreme Cold/Wind Chill",
    "(.*)RECORD(.*)SNOW(.*)", "Heavy Snow",
    "(.*)SEICHE(.*)", "Seiche",
    "(.*)STRONG W(.*)", "Strong Wind",
    "(.*)DEPRESSION(.*)", "Tropical Depression",
    "(.*)TSUNAMI(.*)", "Tsunami",
    "(.*)MIX(.*)", "Sleet",
    "(.*)WIND CHILL(.*)", "Extreme Cold/Wind Chill",
    "((.*)HIGH WIND(.*))|((.*)HIGH WIND(.*))", "High Wind",
    "(.*)TSTM WIND(.*)", "Thunderstorm Wind",
    "(.*)THUNDER(.*)WIND(.*)", "Thunderstorm Wind",
    "(.*)T(.*)STORM WIND(.*)", "Thunderstorm Wind",
    "(.*)DRY(.*)", "Drought",
    "(.*)URBAN(.*)", "Flash Flood",
    "(.*)(RECORD|UNUSUALLY)(.*)COLD(.*)", "Extreme Cold/Wind Chill",
    "(.*)THUNDER(.*)", "Heavy Snow",
    "(.*)COLD(.*)", "Cold/Wind Chill",
    # NOTE(review): the broader THUNDER rule two rows up already rewrites every
    # label this pattern could match, so this row appears to be a no-op; it is
    # kept so the rule list stays identical to the original sequence.
    "(.*)THUNDER(.*)SNOW(.*)", "Heavy Snow",
    "(.*)SNOW(.*)STORM(.*)", "Heavy Snow",
    "(.*)EX(.*)SNOW(.*)", "Heavy Snow",
    "(.*)TROPICAL(.*)", "Tropical Storm",
    "(.*)BURST(.*)", "High Wind",
    "(.*)FREE(.*)", "Frost/Freeze",
    "(.*)FLD(.*)", "Flash Flood",
    "(.*)WIND(.*)|WND", "Strong Wind",
    "^LACK(.*)", "Drought",
    "(.*)LOW TEMP(.*)", "Cold/Wind Chill",
    "(.*)SNOW(.*)", "Heavy Snow"
  ),
  ncol = 2,
  byrow = TRUE
)

# Apply every rule in sequence to the whole label vector.
for (rule_idx in seq_len(nrow(evtype_rules))) {
  evtype_list <- gsub(
    evtype_rules[rule_idx, 1],
    evtype_rules[rule_idx, 2],
    evtype_list
  )
}

# Show the distinct labels that remain, in reverse alphabetical order.
sort(unique(evtype_list), decreasing = TRUE)
## [1] "Winter Weather" "Winter Storm"
## [3] "Wildfire" "WET YEAR"
## [5] "WET WEATHER" "WET MONTH"
## [7] "Waterspout" "WARM WEATHER"
## [9] "WALL CLOUD" "Volcanic Ash"
## [11] "VOG" "VERY WARM"
## [13] "UNUSUALLY WARM" "UNUSUAL/RECORD WARMTH"
## [15] "UNUSUAL WARMTH" "UNSEASONAL RAIN"
## [17] "UNSEASONABLY WET" "UNSEASONABLY WARM/WET"
## [19] "UNSEASONABLY WARM YEAR" "UNSEASONABLY WARM & WET"
## [21] "UNSEASONABLY WARM" "UNSEASONABLY HOT"
## [23] "UNSEASONABLY COOL & WET" "UNSEASONABLY COOL"
## [25] "Tsunami" "Tropical Storm"
## [27] "Tropical Depression" "TORRENTIAL RAINFALL"
## [29] "TORRENTIAL RAIN" "Tornado"
## [31] "Thunderstorm Wind" "TEMPERATURE RECORD"
## [33] "SUMMARY OF MARCH 29" "SUMMARY OF MARCH 27"
## [35] "SUMMARY OF MARCH 24-25" "Strong Wind"
## [37] "Storm Surge/Tide" "SOUTHEAST"
## [39] "SMALL STREAM AND" "SMALL STREAM"
## [41] "Sleet" "SEVERE TURBULENCE"
## [43] "Seiche" "ROUGH SEAS"
## [45] "ROTATING WALL CLOUD" "ROGUE WAVE"
## [47] "Rip Current" "REMNANTS OF FLOYD"
## [49] "RED FLAG CRITERIA" "RECORD/EXCESSIVE RAINFALL"
## [51] "RECORD WARMTH" "RECORD WARM TEMPS."
## [53] "RECORD WARM" "RECORD TEMPERATURES"
## [55] "RECORD TEMPERATURE" "RECORD RAINFALL"
## [57] "RECORD PRECIPITATION" "RECORD LOW RAINFALL"
## [59] "RECORD LOW" "RECORD HIGH TEMPERATURES"
## [61] "RECORD HIGH TEMPERATURE" "RECORD HIGH"
## [63] "RECORD COOL" "RAPIDLY RISING WATER"
## [65] "RAINSTORM" "RAIN DAMAGE"
## [67] "RAIN (HEAVY)" "RAIN"
## [69] "PROLONGED RAIN" "PROLONG WARMTH"
## [71] "PATCHY ICE" "OTHER"
## [73] "NORTHERN LIGHTS" "NORMAL PRECIPITATION"
## [75] "NONE" "NO SEVERE WEATHER"
## [77] "MONTHLY TEMPERATURE" "MONTHLY RAINFALL"
## [79] "MONTHLY PRECIPITATION" "MILD PATTERN"
## [81] "METRO STORM, MAY 26" "Marine Thunderstorm Wind"
## [83] "Marine Strong Wind" "MARINE MISHAP"
## [85] "Marine High Wind" "Marine Hail"
## [87] "MARINE ACCIDENT" "LIGNTNING"
## [89] "Lightning" "LIGHTING"
## [91] "LARGE WALL CLOUD" "LANDSPOUT"
## [93] "LANDSLUMP" "Lakeshore Flood"
## [95] "Lake-Effect Snow" "ICY ROADS"
## [97] "ICESTORM/BLIZZARD" "Ice Storm"
## [99] "ICE ROADS" "ICE PELLETS"
## [101] "ICE ON ROAD" "ICE JAM"
## [103] "ICE FLOES" "ICE"
## [105] "HYPOTHERMIA/EXPOSURE" "HYPOTHERMIA"
## [107] "HYPERTHERMIA/EXPOSURE" "HVY RAIN"
## [109] "Hurricane (Typhoon)" "HOT WEATHER"
## [111] "HOT SPELL" "HOT PATTERN"
## [113] "High Wind" "HIGH WAVES"
## [115] "HIGH WATER" "HIGH TIDES"
## [117] "HIGH TEMPERATURE RECORD" "HIGH SWELLS"
## [119] "High Surf" "HIGH SEAS"
## [121] "HIGH SWELLS" "HIGH"
## [123] "HEAVY SWELLS" "Heavy Snow"
## [125] "HEAVY SHOWERS" "HEAVY SHOWER"
## [127] "HEAVY SEAS" "Heavy Rain"
## [129] "HEAVY PRECIPITATION" "HEAVY PRECIPATATION"
## [131] "Heat" "Hail"
## [133] "GUSTNADO AND" "GUSTNADO"
## [135] "GLAZE ICE" "GLAZE"
## [137] "Funnel Cloud" "Frost/Freeze"
## [139] "Freezing Fog" "Flood"
## [141] "Flash Flood" "EXTREMELY WET"
## [143] "Extreme Cold/Wind Chill" "EXCESSIVE WETNESS"
## [145] "EXCESSIVE RAINFALL" "EXCESSIVE RAIN"
## [147] "EXCESSIVE PRECIPITATION" "Excessive Heat"
## [149] "EXCESSIVE" "EARLY RAIN"
## [151] "Dust Storm" "Dust Devil"
## [153] "DROWNING" "Drought"
## [155] "DRIEST MONTH" "Dense Smoke"
## [157] "Dense Fog" "Debris Flow"
## [159] "DAM FAILURE" "DAM BREAK"
## [161] "COOL SPELL" "COOL AND WET"
## [163] "Cold/Wind Chill" "Coastal Flood"
## [165] "BLOW-OUT TIDES" "BLOW-OUT TIDE"
## [167] "Blizzard" "BLACK ICE"
## [169] "BELOW NORMAL PRECIPITATION" "BEACH EROSION"
## [171] "BEACH EROSIN" "Avalanche"
## [173] "Astronomical Low Tide" "ASTRONOMICAL HIGH TIDE"
## [175] "APACHE COUNTY" "ABNORMALLY WET"
## [177] "ABNORMAL WARMTH" "?"
## [179] " WATERSPOUT"
At this point, the event types have only been partially cleaned, but few unclean event types remain, so they should not affect the results much. Clean event types are distinguished by having a capitalized first letter followed by a lowercase second letter.
# Rewritten labels are mixed case (e.g. "Tornado") while untouched raw labels
# are still all upper case, so an upper-case letter immediately followed by a
# lower-case letter marks a cleaned row.
clean_evtype_rows <- grep("[A-Z][a-z]", evtype_list)
# number of rows with unclean EVTYPE
# Counted by difference rather than via evtype_list[-clean_evtype_rows]:
# a negative index built from an empty match vector selects nothing, which
# would report zero unclean rows exactly when no row is clean.
unclean_EVTYPE_rows_num <- length(evtype_list) - length(clean_evtype_rows)
# Row count of the 10th most frequent event label, used as the yardstick.
number10_EVTYPE <- sort(table(evtype_list), decreasing = TRUE)[10]
library(scales)
# Unclean rows expressed as a percentage of that 10th-ranked count.
unclean_over_10th_event <- percent(unclean_EVTYPE_rows_num / number10_EVTYPE)
unclean_over_10th_event
## [1] "8.57%"
The number of unclean rows is only 8.57% of the number of rows for the 10th most frequent event type, so leaving these rows uncleaned is unlikely to affect which event types are found to cause the greatest population harm or economic consequences.
Pass the partially cleaned EVTYPE into “subset$EVTYPE”.
# Store the partially cleaned labels back in the EVTYPE column.
subset[["EVTYPE"]] <- evtype_list
Next up, clean the magnitude of the property and crop damages.
# Tabulate the property-damage exponent codes to see which values occur
table(subset[["PROPDMGEXP"]])
##
## - ? + 0 1 2 3 4 5
## 465861 1 8 5 216 25 13 4 4 28
## 6 7 8 B h H K m M
## 4 5 1 40 1 6 424665 7 11330
# Same tabulation for the crop-damage exponent codes
table(subset[["CROPDMGEXP"]])
##
## ? 0 2 B k K m M
## 618340 7 19 1 9 21 281832 1 1994
# The exponent columns hold more than the expected "B", "M", "K" and "H"
# codes, including lower-case duplicates of some of them.
# Fold the lower-case variants into their upper-case form:
# h/m for property damage, k/m for crop damage. Other values are untouched.
subset$PROPDMGEXP <- chartr("hm", "HM", subset$PROPDMGEXP)
subset$CROPDMGEXP <- chartr("km", "KM", subset$CROPDMGEXP)
Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
Fatality and injury counts are in columns 23 and 24 of “subset”.
# Column positions of the two population-health measures
# (the summary output labels them FATALITIES and INJURIES).
pop_health <- 23:24
summary(subset[, pop_health])
## FATALITIES INJURIES
## Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000
## Mean : 0.0168 Mean : 0.1558
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :583.0000 Max. :1700.0000
Most of the events do not have any injuries or fatalities. Subset the events that involve injuries or fatalities.
library(dplyr)

# Keep only the events with at least one fatality (column 23) or injury
# (column 24).
has_casualties <- subset[, 23] > 0 | subset[, 24] > 0
subset_harm <- subset[has_casualties, ]

# Find the total number of injuries/fatalities per row
subset_harm <- mutate(
  subset_harm,
  total_harmed = subset_harm[, 23] + subset_harm[, 24]
)

# Total people harmed per event type, largest first.
group_by_EVTYPE <- group_by(subset_harm, EVTYPE)
harm_by_evtype <- summarize(group_by_EVTYPE, harmed_total = sum(total_harmed))
most_harm <- arrange(harm_by_evtype, desc(harmed_total))
most_harm[1:2]
## Source: local data frame [66 x 2]
##
## EVTYPE harmed_total
## 1 Tornado 97022
## 2 Thunderstorm Wind 10220
## 3 Excessive Heat 8698
## 4 Flood 7279
## 5 Lightning 6048
## 6 Heat 3645
## 7 Flash Flood 2944
## 8 Ice Storm 2079
## 9 High Wind 1800
## 10 Wildfire 1698
## .. ... ...
The top three most harmful events are Tornado, Thunderstorm Wind and Excessive Heat.