###1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health? ###2. Across the United States, which types of events have the greatest economic consequences?
# Download the NOAA storm database (once) and load it into `data`.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# Relative path, so the script runs on any machine; the .bz2 extension is kept
# because the download is a bzip2 archive, which read.csv() reads directly.
destfile <- "StormData.csv.bz2"
# Skip the download when a local copy already exists.
if (!file.exists(destfile)) {
  # mode = "wb" writes the archive as binary so it is not corrupted on Windows.
  download.file(url, destfile, mode = "wb")
}
data <- read.csv(destfile)
head(data)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1 0 0 NA
## 2 0 0 NA
## 3 0 0 NA
## 4 0 0 NA
## 5 0 0 NA
## 6 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1 0 14.0 100 3 0 0 15 25.0
## 2 0 2.0 150 2 0 0 0 2.5
## 3 0 0.1 123 2 0 0 2 25.0
## 4 0 0.0 100 2 0 0 2 2.5
## 5 0 0.0 150 2 0 0 2 2.5
## 6 0 1.5 177 2 0 0 6 2.5
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1 K 0 3040 8812
## 2 K 0 3042 8755
## 3 K 0 3340 8742
## 4 K 0 3458 8626
## 5 K 0 3412 8642
## 6 K 0 3450 8748
## LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3051 8806 1
## 2 0 0 2
## 3 0 0 3
## 4 0 0 4
## 5 0 0 5
## 6 0 0 6
# List the column names of the storm data set.
colnames(data)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# Per-column summary of the whole data set.
summary(object = data)
## STATE__ BGN_DATE BGN_TIME
## Min. : 1.0 5/25/2011 0:00:00: 1202 12:00:00 AM: 10163
## 1st Qu.:19.0 4/27/2011 0:00:00: 1193 06:00:00 PM: 7350
## Median :30.0 6/9/2011 0:00:00 : 1030 04:00:00 PM: 7261
## Mean :31.2 5/30/2004 0:00:00: 1016 05:00:00 PM: 6891
## 3rd Qu.:45.0 4/4/2011 0:00:00 : 1009 12:00:00 PM: 6703
## Max. :95.0 4/2/2006 0:00:00 : 981 03:00:00 PM: 6700
## (Other) :895866 (Other) :857229
## TIME_ZONE COUNTY COUNTYNAME STATE
## CST :547493 Min. : 0.0 JEFFERSON : 7840 TX : 83728
## EST :245558 1st Qu.: 31.0 WASHINGTON: 7603 KS : 53440
## MST : 68390 Median : 75.0 JACKSON : 6660 OK : 46802
## PST : 28302 Mean :100.6 FRANKLIN : 6256 MO : 35648
## AST : 6360 3rd Qu.:131.0 LINCOLN : 5937 IA : 31069
## HST : 2563 Max. :873.0 MADISON : 5632 NE : 30271
## (Other): 3631 (Other) :862369 (Other):621339
## EVTYPE BGN_RANGE BGN_AZI
## HAIL :288661 Min. : 0.000 :547332
## TSTM WIND :219940 1st Qu.: 0.000 N : 86752
## THUNDERSTORM WIND: 82563 Median : 0.000 W : 38446
## TORNADO : 60652 Mean : 1.484 S : 37558
## FLASH FLOOD : 54277 3rd Qu.: 1.000 E : 33178
## FLOOD : 25326 Max. :3749.000 NW : 24041
## (Other) :170878 (Other):134990
## BGN_LOCATI END_DATE END_TIME
## :287743 :243411 :238978
## COUNTYWIDE : 19680 4/27/2011 0:00:00: 1214 06:00:00 PM: 9802
## Countywide : 993 5/25/2011 0:00:00: 1196 05:00:00 PM: 8314
## SPRINGFIELD : 843 6/9/2011 0:00:00 : 1021 04:00:00 PM: 8104
## SOUTH PORTION: 810 4/4/2011 0:00:00 : 1007 12:00:00 PM: 7483
## NORTH PORTION: 784 5/30/2004 0:00:00: 998 11:59:00 PM: 7184
## (Other) :591444 (Other) :653450 (Other) :622432
## COUNTY_END COUNTYENDN END_RANGE END_AZI
## Min. :0 Mode:logical Min. : 0.0000 :724837
## 1st Qu.:0 NA's:902297 1st Qu.: 0.0000 N : 28082
## Median :0 Median : 0.0000 S : 22510
## Mean :0 Mean : 0.9862 W : 20119
## 3rd Qu.:0 3rd Qu.: 0.0000 E : 20047
## Max. :0 Max. :925.0000 NE : 14606
## (Other): 72096
## END_LOCATI LENGTH WIDTH
## :499225 Min. : 0.0000 Min. : 0.000
## COUNTYWIDE : 19731 1st Qu.: 0.0000 1st Qu.: 0.000
## SOUTH PORTION : 833 Median : 0.0000 Median : 0.000
## NORTH PORTION : 780 Mean : 0.2301 Mean : 7.503
## CENTRAL PORTION: 617 3rd Qu.: 0.0000 3rd Qu.: 0.000
## SPRINGFIELD : 575 Max. :2315.0000 Max. :4400.000
## (Other) :380536
## F MAG FATALITIES INJURIES
## Min. :0.0 Min. : 0.0 Min. : 0.0000 Min. : 0.0000
## 1st Qu.:0.0 1st Qu.: 0.0 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median :1.0 Median : 50.0 Median : 0.0000 Median : 0.0000
## Mean :0.9 Mean : 46.9 Mean : 0.0168 Mean : 0.1557
## 3rd Qu.:1.0 3rd Qu.: 75.0 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :5.0 Max. :22000.0 Max. :583.0000 Max. :1700.0000
## NA's :843563
## PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## Min. : 0.00 :465934 Min. : 0.000 :618413
## 1st Qu.: 0.00 K :424665 1st Qu.: 0.000 K :281832
## Median : 0.00 M : 11330 Median : 0.000 M : 1994
## Mean : 12.06 0 : 216 Mean : 1.527 k : 21
## 3rd Qu.: 0.50 B : 40 3rd Qu.: 0.000 0 : 19
## Max. :5000.00 5 : 28 Max. :990.000 B : 9
## (Other): 84 (Other): 9
## WFO STATEOFFIC
## :142069 :248769
## OUN : 17393 TEXAS, North : 12193
## JAN : 13889 ARKANSAS, Central and North Central: 11738
## LWX : 13174 IOWA, Central : 11345
## PHI : 12551 KANSAS, Southwest : 11212
## TSA : 12483 GEORGIA, North and Central : 11120
## (Other):690738 (Other) :595920
## ZONENAMES
## :594029
## :205988
## GREATER RENO / CARSON CITY / M - GREATER RENO / CARSON CITY / M : 639
## GREATER LAKE TAHOE AREA - GREATER LAKE TAHOE AREA : 592
## JEFFERSON - JEFFERSON : 303
## MADISON - MADISON : 302
## (Other) :100444
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_
## Min. : 0 Min. :-14451 Min. : 0 Min. :-14455
## 1st Qu.:2802 1st Qu.: 7247 1st Qu.: 0 1st Qu.: 0
## Median :3540 Median : 8707 Median : 0 Median : 0
## Mean :2875 Mean : 6940 Mean :1452 Mean : 3509
## 3rd Qu.:4019 3rd Qu.: 9605 3rd Qu.:3549 3rd Qu.: 8735
## Max. :9706 Max. : 17124 Max. :9706 Max. :106220
## NA's :47 NA's :40
## REMARKS REFNUM
## :287433 Min. : 1
## : 24013 1st Qu.:225575
## Trees down.\n : 1110 Median :451149
## Several trees were blown down.\n : 568 Mean :451149
## Trees were downed.\n : 446 3rd Qu.:676723
## Large trees and power lines were blown down.\n: 432 Max. :902297
## (Other) :588295
# EVTYPE is the variable of interest, so look at its distribution first.
summary(data[["EVTYPE"]])
## HAIL TSTM WIND THUNDERSTORM WIND
## 288661 219940 82563
## TORNADO FLASH FLOOD FLOOD
## 60652 54277 25326
## THUNDERSTORM WINDS HIGH WIND LIGHTNING
## 20843 20212 15754
## HEAVY SNOW HEAVY RAIN WINTER STORM
## 15708 11723 11433
## WINTER WEATHER FUNNEL CLOUD MARINE TSTM WIND
## 7026 6839 6175
## MARINE THUNDERSTORM WIND WATERSPOUT STRONG WIND
## 5812 3796 3566
## URBAN/SML STREAM FLD WILDFIRE BLIZZARD
## 3392 2761 2719
## DROUGHT ICE STORM EXCESSIVE HEAT
## 2488 2006 1678
## HIGH WINDS WILD/FOREST FIRE FROST/FREEZE
## 1533 1457 1342
## DENSE FOG WINTER WEATHER/MIX TSTM WIND/HAIL
## 1293 1104 1028
## EXTREME COLD/WIND CHILL HEAT HIGH SURF
## 1002 767 725
## TROPICAL STORM FLASH FLOODING EXTREME COLD
## 690 682 655
## COASTAL FLOOD LAKE-EFFECT SNOW FLOOD/FLASH FLOOD
## 650 636 624
## LANDSLIDE SNOW COLD/WIND CHILL
## 600 587 539
## FOG RIP CURRENT MARINE HAIL
## 538 470 442
## DUST STORM AVALANCHE WIND
## 427 386 340
## RIP CURRENTS STORM SURGE FREEZING RAIN
## 304 261 250
## URBAN FLOOD HEAVY SURF/HIGH SURF EXTREME WINDCHILL
## 249 228 204
## STRONG WINDS DRY MICROBURST ASTRONOMICAL LOW TIDE
## 196 186 174
## HURRICANE RIVER FLOOD LIGHT SNOW
## 174 173 154
## STORM SURGE/TIDE RECORD WARMTH COASTAL FLOODING
## 148 146 143
## DUST DEVIL MARINE HIGH WIND UNSEASONABLY WARM
## 141 135 126
## FLOODING ASTRONOMICAL HIGH TIDE MODERATE SNOWFALL
## 120 103 101
## URBAN FLOODING WINTRY MIX HURRICANE/TYPHOON
## 98 90 88
## FUNNEL CLOUDS HEAVY SURF RECORD HEAT
## 87 84 81
## FREEZE HEAT WAVE COLD
## 74 74 72
## RECORD COLD ICE THUNDERSTORM WINDS HAIL
## 64 61 61
## TROPICAL DEPRESSION SLEET UNSEASONABLY DRY
## 60 59 56
## FROST GUSTY WINDS THUNDERSTORM WINDSS
## 53 53 51
## MARINE STRONG WIND OTHER SMALL HAIL
## 48 48 47
## FUNNEL FREEZING FOG THUNDERSTORM
## 46 45 45
## Temperature record TSTM WIND (G45) Coastal Flooding
## 43 39 38
## WATERSPOUTS MONTHLY PRECIPITATION WINDS
## 37 36 36
## (Other)
## 2940
## [1] "from here, we can tell that there are some typos and inconsistent terms for example 'THUNDERSTORM WIND' and 'THUNDERSTORM WINDSS'. So we have to scrub the text data before we can do any analysis."
###1. For the event types
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## <U+2713> ggplot2 3.3.0 <U+2713> purrr 0.3.3
## <U+2713> tibble 3.0.0 <U+2713> dplyr 0.8.5
## <U+2713> tidyr 1.0.2 <U+2713> stringr 1.4.0
## <U+2713> readr 1.3.1 <U+2713> forcats 0.5.0
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Standardise EVTYPE. The field was keyed in as free text, so the same kind of
# event appears under many spellings; the steps below collapse them into a
# small set of categories.
# 1. Work on character values rather than a factor.
data$EVTYPE <- as.character(data$EVTYPE)
# 2. Lower-case everything so the patterns below only need lower-case forms.
data$EVTYPE <- tolower(data$EVTYPE)
# 3. Recode by pattern. ORDER MATTERS: every statement sees the labels written
#    by the statements before it, and several labels are deliberately (or at
#    least effectively) re-matched further down:
#      "downbursts" -> matched by "downburst" in the tornado rule
#      "high winds" -> matched by "high winds.*" in the tornado rule
#      "hail"       -> matched by "hail" in the first winter-precipitation rule
# NOTE(review): the bare `?` inside `(n|?)`, `(y|?)` and `(/|?|\\s)` is kept
# exactly as found; it looks like a placeholder for a character lost in
# transcription - confirm against the original source.
data$EVTYPE[grepl("(thunder|tstm.*)|(thu(n|?).*)|tun.*", data$EVTYPE)] <- "thunderstorm"
data$EVTYPE[grepl("microburst|downburst(\\s?).*", data$EVTYPE)] <- "downbursts"
data$EVTYPE[grepl("gusty|high wind|wind|wnd.*", data$EVTYPE)] <- "high winds"
data$EVTYPE[grepl("funnel|torn(a|d)(d|a)o|landspout|wa(y|?)ter(\\s?)spout|gustnado|downburst|dust|high winds.*", data$EVTYPE)] <- "tornado, whirlwind, and high winds"
# The values are already lower-case here, so only lower-case alternatives are
# needed (the former upper-case "TROPICAL.STORM" alternative could never match).
data$EVTYPE[grepl("(tropical.*)|(hurri|opal)|typhoon|remnants", data$EVTYPE)] <- "tropical cyclone"
data$EVTYPE[grepl("volc", data$EVTYPE)] <- "volcanic eruption"
data$EVTYPE[grepl("ic(e|y)(\\s?)storm|snow(\\s?)storm|winter storm|blizzard.*", data$EVTYPE)] <- "winter storm"
data$EVTYPE[grepl("rip current.*", data$EVTYPE)] <- "rip currents"
data$EVTYPE[grepl("fire|smoke", data$EVTYPE)] <- "firestorm"
data$EVTYPE[grepl("light(n|?)ing.*", data$EVTYPE)] <- "lightning"
data$EVTYPE[grepl("urban|urban(/|?|\\s)sm|flood.*", data$EVTYPE)] <- "flood"
data$EVTYPE[grepl("hail.*", data$EVTYPE)] <- "hail"
data$EVTYPE[grepl("(unseasonably cool|dry)|(unusually cold|snow)|(winter(y|?) weather|mix)|hail", data$EVTYPE)] <- "winter precipitation"
data$EVTYPE[grepl("heat|warm|drought|high temperature|hot.*", data$EVTYPE)] <- "heat and drought"
data$EVTYPE[grepl("frost|cold|freeze|glaze|ice|fog.*", data$EVTYPE)] <- "frost and freeze"
data$EVTYPE[grepl("rain|wet|shower.*", data$EVTYPE)] <- "rain"
data$EVTYPE[grepl("sleet.*|freezing drizzle|freezing spray", data$EVTYPE)] <- "winter precipitation"
# "aval" (not "aval*"): as a regex, "aval*" means "ava" followed by any number
# of "l", which would match every value containing "ava". The label spelling
# is corrected from "avalance" to "avalanche".
data$EVTYPE[grepl("aval|slide|landslump", data$EVTYPE)] <- "avalanche and landslide"
data$EVTYPE[grepl("tsunami|wave|surge|coastal|swell|marine|surf|tide", data$EVTYPE)] <- "tsunami, waves, and tides"
###2. For property and crop damages. To find out which types of events have the greatest economic consequences, we need to analyse and sum up property damages (PROPDMG) and crop damages (CROPDMG).
###We also have to take note of their corresponding units in PROPDMGEXP and CROPDMGEXP, which indicate whether the damages are in hundreds, thousands, millions, or billions.
###Let's start with examining what's in PROPDMGEXP and CROPDMGEXP.
# Frequency of each raw magnitude code for property damage.
table(data[["PROPDMGEXP"]])
##
## - ? + 0 1 2 3 4 5 6
## 465934 1 8 5 216 25 13 4 4 28 4
## 7 8 B h H K m M
## 5 1 40 1 6 424665 7 11330
# Frequency of each raw magnitude code for crop damage.
table(data[["CROPDMGEXP"]])
##
## ? 0 2 B k K m M
## 618413 7 19 1 9 21 281832 1 1994
###We can see that there is a mix of numbers and letters in both PROPDMGEXP and CROPDMGEXP. The numbers represent powers of ten (10^n, where n is the number). We will have to convert the letters to numbers and multiply these numbers with PROPDMG and CROPDMG respectively.
# Clean PROPDMGEXP and CROPDMGEXP: translate every magnitude code into a
# numeric exponent. Both columns use the same coding, so a single helper
# replaces the two copies of the same substitution chain.

# Pattern -> replacement digit, applied in this order.
exponent_codes <- c(
  "[Hh]" = "2",
  "[Kk]" = "3",
  "[Mm]" = "6",
  "[Bb]" = "9",
  "\\+" = "1",
  "\\?|\\-|\\ " = "0"
)

# Convert a vector of magnitude codes to numeric exponents.
#   codes:   character (or factor) vector of raw codes.
#   mapping: named character vector, names = regex, values = replacement.
# Digits already present pass through unchanged. An empty code matches none of
# the patterns and becomes NA in the as.numeric() step.
decode_exponent <- function(codes, mapping = exponent_codes) {
  decoded <- as.character(codes)
  for (pattern in names(mapping)) {
    decoded <- gsub(pattern, mapping[[pattern]], decoded)
  }
  as.numeric(decoded)
}

# 1. Cleaning the data in PROPDMGEXP
data$PROPDMGEXP <- decode_exponent(data$PROPDMGEXP)
# 2. Cleaning the data in CROPDMGEXP
data$CROPDMGEXP <- decode_exponent(data$CROPDMGEXP)
###3. Examine PROPDMGEXP and CROPDMGEXP to see if the data processing is successful.
# Check the cleaned property-damage exponents.
table(data[["PROPDMGEXP"]])
##
## 0 1 2 3 4 5 6 7 8 9
## 225 30 20 424669 4 28 11341 5 1 40
# Check the cleaned crop-damage exponents.
table(data[["CROPDMGEXP"]])
##
## 0 2 3 6 9
## 26 1 281853 1995 9
##1. Types of events (as indicated in the EVTYPE variable) that are most harmful with respect to population health.
#From the col names, we can see that the variables of interest are INJURIES and FATALITIES. We shall use these two parameters - sorted first by INJURIES and then FATALITIES - to identify top 10 events that are most harmful to population health.
# Total injuries and fatalities per event type, worst first
# (ranked by injuries, ties broken by fatalities).
harmful_events <- data %>%
  group_by(EVTYPE) %>%
  summarise(
    INJURIES = sum(INJURIES),
    FATALITIES = sum(FATALITIES)
  ) %>%
  arrange(desc(INJURIES), desc(FATALITIES))
# Ten most harmful event types
harmful_events %>% head(n = 10)
## # A tibble: 10 x 3
## EVTYPE INJURIES FATALITIES
## <chr> <dbl> <dbl>
## 1 tornado, whirlwind, and high winds 93944 6363
## 2 thunderstorm 9545 756
## 3 heat and drought 9247 3149
## 4 flood 8681 1553
## 5 lightning 5231 817
## 6 winter storm 4136 406
## 7 winter precipitation 3148 276
## 8 frost and freeze 1737 312
## 9 tropical cyclone 1716 199
## 10 firestorm 1608 90
# Plot the ten most harmful event types as overlaid horizontal bars:
# injuries in light blue, with fatalities in light coral drawn on top.
# The colours are constants, so they are set outside aes(); inside aes() a
# colour name is treated as a data label and ggplot2 picks its own palette.
# The subtitle replaces the legend. The data are not filtered by date and the
# records start in 1950, so the title states that period.
harmful_events %>%
  head(10) %>%
  ggplot(aes(x = reorder(EVTYPE, INJURIES))) +
  geom_col(aes(y = INJURIES), fill = "lightblue") +
  geom_col(aes(y = FATALITIES), fill = "lightcoral") +
  coord_flip() +
  labs(
    y = "Number of fatalities and injuries",
    x = "Event Type",
    title = "Total people loss in USA by weather events in 1950-2011",
    subtitle = "Light blue: injuries; light coral: fatalities"
  )
##2. Types of events that have the greatest economic consequences.
# Economic damage per event type, from PROPDMG, PROPDMGEXP, CROPDMG and
# CROPDMGEXP.
# The *EXP columns hold powers of ten, so each amount is DMG * 10^EXP
# (multiplying by the exponent itself understates damage by orders of
# magnitude). A missing exponent is treated as 10^0 instead of dropping the
# row: most records have an exponent for only one of the two damage types, and
# dropping them would discard the damage that is recorded.
# Steps: 1. property damage, 2. crop damage, 3. sum per event type.
# NOTE: totals rendered before this fix were computed with the old formula and
# must be regenerated.
events_econs_dmg <- data %>%
  mutate(
    TOTALPROPDMG = PROPDMG * 10^coalesce(PROPDMGEXP, 0),
    TOTALCROPDMG = CROPDMG * 10^coalesce(CROPDMGEXP, 0),
    TOTALDMG = TOTALPROPDMG + TOTALCROPDMG
  ) %>%
  group_by(EVTYPE) %>%
  summarise(TOTALDMG = sum(TOTALDMG, na.rm = TRUE)) %>%
  arrange(desc(TOTALDMG))
# Top 10 types of events with the greatest economic consequences
head(events_econs_dmg, 10)
## # A tibble: 10 x 2
## EVTYPE TOTALDMG
## <chr> <dbl>
## 1 flood 4923575.
## 2 thunderstorm 3830725.
## 3 tornado, whirlwind, and high winds 2852939.
## 4 winter precipitation 2528811.
## 5 lightning 604725.
## 6 winter storm 304260.
## 7 tropical cyclone 224874.
## 8 firestorm 201470.
## 9 rain 79560.
## 10 heat and drought 79539.
# Horizontal bar chart of the ten costliest event types, largest at the top.
# The data are not filtered by date and the records start in 1950, so the
# title states that period.
events_econs_dmg %>%
  head(10) %>%
  ggplot(aes(x = reorder(EVTYPE, TOTALDMG), y = TOTALDMG)) +
  geom_col() +
  coord_flip() +
  labs(
    y = "Size of property and crop loss",
    x = "Event Type",
    title = "Total economic loss in USA by weather events in 1950-2011"
  )