Storms and other severe weather conditions can cause both public health and economic problems. Severe events can result in injuries, fatalities, and damages such as property and crop damages. In this report, I will explore the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to find useful information about the severe events and their effects. I also escribe the steps taken to answer the questions as well as the code and the results. The aim of this project is to explore the NOAA Storm Database and answer the questions about severe weather events: Q1 - Across the United States, which types of events are most harmful with respect to population health? and Q2 - Across the United States, which types of events have the greatest economic consequences?
# install.packages("dplyr", repos="http://cran.ca.r-project.org") # uncomment if needed: cran has to be set to install the library
library(data.table)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.3
library(ggplot2)
The data are loaded using the following command from the local .zip file in the cloned repository:
#unzip("activity.zip") # Uncomment if needed (so it does not unzip the data automatically)
dat <- data.table(read.csv("repdata%2Fdata%2FStormData.csv"))
summary(dat)
## STATE__ BGN_DATE BGN_TIME
## Min. : 1.0 5/25/2011 0:00:00: 1202 12:00:00 AM: 10163
## 1st Qu.:19.0 4/27/2011 0:00:00: 1193 06:00:00 PM: 7350
## Median :30.0 6/9/2011 0:00:00 : 1030 04:00:00 PM: 7261
## Mean :31.2 5/30/2004 0:00:00: 1016 05:00:00 PM: 6891
## 3rd Qu.:45.0 4/4/2011 0:00:00 : 1009 12:00:00 PM: 6703
## Max. :95.0 4/2/2006 0:00:00 : 981 03:00:00 PM: 6700
## (Other) :895866 (Other) :857229
## TIME_ZONE COUNTY COUNTYNAME STATE
## CST :547493 Min. : 0.0 JEFFERSON : 7840 TX : 83728
## EST :245558 1st Qu.: 31.0 WASHINGTON: 7603 KS : 53440
## MST : 68390 Median : 75.0 JACKSON : 6660 OK : 46802
## PST : 28302 Mean :100.6 FRANKLIN : 6256 MO : 35648
## AST : 6360 3rd Qu.:131.0 LINCOLN : 5937 IA : 31069
## HST : 2563 Max. :873.0 MADISON : 5632 NE : 30271
## (Other): 3631 (Other) :862369 (Other):621339
## EVTYPE BGN_RANGE BGN_AZI
## HAIL :288661 Min. : 0.000 :547332
## TSTM WIND :219940 1st Qu.: 0.000 N : 86752
## THUNDERSTORM WIND: 82563 Median : 0.000 W : 38446
## TORNADO : 60652 Mean : 1.484 S : 37558
## FLASH FLOOD : 54277 3rd Qu.: 1.000 E : 33178
## FLOOD : 25326 Max. :3749.000 NW : 24041
## (Other) :170878 (Other):134990
## BGN_LOCATI END_DATE END_TIME
## :287743 :243411 :238978
## COUNTYWIDE : 19680 4/27/2011 0:00:00: 1214 06:00:00 PM: 9802
## Countywide : 993 5/25/2011 0:00:00: 1196 05:00:00 PM: 8314
## SPRINGFIELD : 843 6/9/2011 0:00:00 : 1021 04:00:00 PM: 8104
## SOUTH PORTION: 810 4/4/2011 0:00:00 : 1007 12:00:00 PM: 7483
## NORTH PORTION: 784 5/30/2004 0:00:00: 998 11:59:00 PM: 7184
## (Other) :591444 (Other) :653450 (Other) :622432
## COUNTY_END COUNTYENDN END_RANGE END_AZI
## Min. :0 Mode:logical Min. : 0.0000 :724837
## 1st Qu.:0 NA's:902297 1st Qu.: 0.0000 N : 28082
## Median :0 Median : 0.0000 S : 22510
## Mean :0 Mean : 0.9862 W : 20119
## 3rd Qu.:0 3rd Qu.: 0.0000 E : 20047
## Max. :0 Max. :925.0000 NE : 14606
## (Other): 72096
## END_LOCATI LENGTH WIDTH
## :499225 Min. : 0.0000 Min. : 0.000
## COUNTYWIDE : 19731 1st Qu.: 0.0000 1st Qu.: 0.000
## SOUTH PORTION : 833 Median : 0.0000 Median : 0.000
## NORTH PORTION : 780 Mean : 0.2301 Mean : 7.503
## CENTRAL PORTION: 617 3rd Qu.: 0.0000 3rd Qu.: 0.000
## SPRINGFIELD : 575 Max. :2315.0000 Max. :4400.000
## (Other) :380536
## F MAG FATALITIES INJURIES
## Min. :0.0 Min. : 0.0 Min. : 0.0000 Min. : 0.0000
## 1st Qu.:0.0 1st Qu.: 0.0 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median :1.0 Median : 50.0 Median : 0.0000 Median : 0.0000
## Mean :0.9 Mean : 46.9 Mean : 0.0168 Mean : 0.1557
## 3rd Qu.:1.0 3rd Qu.: 75.0 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :5.0 Max. :22000.0 Max. :583.0000 Max. :1700.0000
## NA's :843563
## PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## Min. : 0.00 :465934 Min. : 0.000 :618413
## 1st Qu.: 0.00 K :424665 1st Qu.: 0.000 K :281832
## Median : 0.00 M : 11330 Median : 0.000 M : 1994
## Mean : 12.06 0 : 216 Mean : 1.527 k : 21
## 3rd Qu.: 0.50 B : 40 3rd Qu.: 0.000 0 : 19
## Max. :5000.00 5 : 28 Max. :990.000 B : 9
## (Other): 84 (Other): 9
## WFO STATEOFFIC
## :142069 :248769
## OUN : 17393 TEXAS, North : 12193
## JAN : 13889 ARKANSAS, Central and North Central: 11738
## LWX : 13174 IOWA, Central : 11345
## PHI : 12551 KANSAS, Southwest : 11212
## TSA : 12483 GEORGIA, North and Central : 11120
## (Other):690738 (Other) :595920
## ZONENAMES
## :594029
## :205988
## GREATER RENO / CARSON CITY / M - GREATER RENO / CARSON CITY / M : 639
## GREATER LAKE TAHOE AREA - GREATER LAKE TAHOE AREA : 592
## JEFFERSON - JEFFERSON : 303
## MADISON - MADISON : 302
## (Other) :100444
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_
## Min. : 0 Min. :-14451 Min. : 0 Min. :-14455
## 1st Qu.:2802 1st Qu.: 7247 1st Qu.: 0 1st Qu.: 0
## Median :3540 Median : 8707 Median : 0 Median : 0
## Mean :2875 Mean : 6940 Mean :1452 Mean : 3509
## 3rd Qu.:4019 3rd Qu.: 9605 3rd Qu.:3549 3rd Qu.: 8735
## Max. :9706 Max. : 17124 Max. :9706 Max. :106220
## NA's :47 NA's :40
## REMARKS REFNUM
## :287433 Min. : 1
## : 24013 1st Qu.:225575
## Trees down.\n : 1110 Median :451149
## Several trees were blown down.\n : 569 Mean :451149
## Trees were downed.\n : 446 3rd Qu.:676723
## Large trees and power lines were blown down.\n: 432 Max. :902297
## (Other) :588294
dim(dat)
## [1] 902297 37
# names(dat)
Note that to answer this question we are interested in three columns of the data: EVTYPE, INJURIES and FATALITIES. The idea here is to see what event type had more injuries and fatalities.
# get the summary of the event type, injuries and fatalities
summary(dat$EVTYPE)
## HAIL TSTM WIND THUNDERSTORM WIND
## 288661 219940 82563
## TORNADO FLASH FLOOD FLOOD
## 60652 54277 25326
## THUNDERSTORM WINDS HIGH WIND LIGHTNING
## 20843 20212 15754
## HEAVY SNOW HEAVY RAIN WINTER STORM
## 15708 11723 11433
## WINTER WEATHER FUNNEL CLOUD MARINE TSTM WIND
## 7026 6839 6175
## MARINE THUNDERSTORM WIND WATERSPOUT STRONG WIND
## 5812 3796 3566
## URBAN/SML STREAM FLD WILDFIRE BLIZZARD
## 3392 2761 2719
## DROUGHT ICE STORM EXCESSIVE HEAT
## 2488 2006 1678
## HIGH WINDS WILD/FOREST FIRE FROST/FREEZE
## 1533 1457 1342
## DENSE FOG WINTER WEATHER/MIX TSTM WIND/HAIL
## 1293 1104 1028
## EXTREME COLD/WIND CHILL HEAT HIGH SURF
## 1002 767 725
## TROPICAL STORM FLASH FLOODING EXTREME COLD
## 690 682 655
## COASTAL FLOOD LAKE-EFFECT SNOW FLOOD/FLASH FLOOD
## 650 636 624
## LANDSLIDE SNOW COLD/WIND CHILL
## 600 587 539
## FOG RIP CURRENT MARINE HAIL
## 538 470 442
## DUST STORM AVALANCHE WIND
## 427 386 340
## RIP CURRENTS STORM SURGE FREEZING RAIN
## 304 261 250
## URBAN FLOOD HEAVY SURF/HIGH SURF EXTREME WINDCHILL
## 249 228 204
## STRONG WINDS DRY MICROBURST ASTRONOMICAL LOW TIDE
## 196 186 174
## HURRICANE RIVER FLOOD LIGHT SNOW
## 174 173 154
## STORM SURGE/TIDE RECORD WARMTH COASTAL FLOODING
## 148 146 143
## DUST DEVIL MARINE HIGH WIND UNSEASONABLY WARM
## 141 135 126
## FLOODING ASTRONOMICAL HIGH TIDE MODERATE SNOWFALL
## 120 103 101
## URBAN FLOODING WINTRY MIX HURRICANE/TYPHOON
## 98 90 88
## FUNNEL CLOUDS HEAVY SURF RECORD HEAT
## 87 84 81
## FREEZE HEAT WAVE COLD
## 74 74 72
## RECORD COLD ICE THUNDERSTORM WINDS HAIL
## 64 61 61
## TROPICAL DEPRESSION SLEET UNSEASONABLY DRY
## 60 59 56
## FROST GUSTY WINDS THUNDERSTORM WINDSS
## 53 53 51
## MARINE STRONG WIND OTHER SMALL HAIL
## 48 48 47
## FUNNEL FREEZING FOG THUNDERSTORM
## 46 45 45
## Temperature record TSTM WIND (G45) Coastal Flooding
## 43 39 38
## WATERSPOUTS MONTHLY PRECIPITATION WINDS
## 37 36 36
## (Other)
## 2940
summary(dat$INJURIES)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1557 0.0000 1700.0000
summary(dat$FATALITIES)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0168 0.0000 583.0000
## Aggregate function is used to compute the sum of injuries and fatalities based on event type
#ev.by.harm <- aggregate(INJURIES + FATALITIES ~ EVTYPE, dat, FUN = sum)
#ev.by.harm <- ev.by.harm[order(-ev.by.harm[, 2]),] # to sort the total harms in descending order
#summary(ev.by.harm)
## showing top 6 (most harmful) events
#Harmful <- head(ev.by.harm)
#names(Harmful) <- c("Event_type", "Total_harms")
#barplot(Harmful$Total_harms, names.arg = Harmful$Event_type, main = "Barplot of the most harmful events", xlab = "Events", ylab = "Total number of harms")
# Another way of plotting the same plot by disciminating the types of injuries
dat_inj <- mutate(summarise(group_by(dat[dat$INJURIES > 0,] , EVTYPE), HARMVAL = sum(INJURIES)), HARMTYPE = "INJURIES")
dat_fat <- mutate(summarise(group_by(dat[dat$FATALITIES > 0,], EVTYPE), HARMVAL = sum(FATALITIES)), HARMTYPE = "FATALITIES")
dat_tot <- rbind(dat_inj, dat_fat)
head(dat_tot)
## # A tibble: 6 × 3
## EVTYPE HARMVAL HARMTYPE
## <fctr> <dbl> <chr>
## 1 AVALANCHE 170 INJURIES
## 2 BLACK ICE 24 INJURIES
## 3 BLIZZARD 805 INJURIES
## 4 blowing snow 1 INJURIES
## 5 BLOWING SNOW 13 INJURIES
## 6 BRUSH FIRE 2 INJURIES
# Spread HARMTYPE-HARMVAL accorss multiple columns
dat_tot <- spread(dat_tot, HARMTYPE, HARMVAL)
head(dat_tot)
## # A tibble: 6 × 3
## EVTYPE FATALITIES INJURIES
## <fctr> <dbl> <dbl>
## 1 AVALANCE 1 NA
## 2 AVALANCHE 224 170
## 3 BLACK ICE 1 24
## 4 BLIZZARD 101 805
## 5 blowing snow 1 1
## 6 BLOWING SNOW 1 13
# getting rid of NAs
dat_tot[is.na(dat_tot$INJURIES), "INJURIES"] <- 0
dat_tot[is.na(dat_tot$FATALITIES), "FATALITIES"] <- 0
head(dat_tot)
## # A tibble: 6 × 3
## EVTYPE FATALITIES INJURIES
## <fctr> <dbl> <dbl>
## 1 AVALANCE 1 0
## 2 AVALANCHE 224 170
## 3 BLACK ICE 1 24
## 4 BLIZZARD 101 805
## 5 blowing snow 1 1
## 6 BLOWING SNOW 1 13
# Now we compute the total economic damages as the summation of PROPDMGEXP and CROPDMGEXP components of dat_econ, and sort it in descending order:
dat_tot <- arrange(dat_tot, desc(INJURIES + FATALITIES))
dat_tot <- head(dat_tot)
# Take multiple columns and collapse them into HARMTYPE-HARMVAL pairs
dat_tot <- gather(dat_tot, HARMTYPE, HARMVAL, INJURIES:FATALITIES)
# Answer to Q1, Harmful events in descending order for each category of injuries:
dat_tot
## # A tibble: 12 × 3
## EVTYPE HARMTYPE HARMVAL
## <fctr> <chr> <dbl>
## 1 TORNADO INJURIES 91346
## 2 EXCESSIVE HEAT INJURIES 6525
## 3 TSTM WIND INJURIES 6957
## 4 FLOOD INJURIES 6789
## 5 LIGHTNING INJURIES 5230
## 6 HEAT INJURIES 2100
## 7 TORNADO FATALITIES 5633
## 8 EXCESSIVE HEAT FATALITIES 1903
## 9 TSTM WIND FATALITIES 504
## 10 FLOOD FATALITIES 470
## 11 LIGHTNING FATALITIES 816
## 12 HEAT FATALITIES 937
# plotting
ggplot(dat_tot, aes(x = reorder(EVTYPE, -HARMVAL ), y = HARMVAL, fill = HARMTYPE)) +
geom_bar(stat = "identity") +
labs(fill = "Type", title = "Barplot of the events with most economic consequences", x = "Events", y = "Total economic consequences") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Note that to answer this question we are interested in 5 columns of the data: EVTYPE, which is the event type and the grouping parameter, and CROPDMGEXP/CROPDMG and PROPDMGEXP/PROPDMG for crop/property damages which could have economic consequences.
summary(dat$EVTYPE)
## HAIL TSTM WIND THUNDERSTORM WIND
## 288661 219940 82563
## TORNADO FLASH FLOOD FLOOD
## 60652 54277 25326
## THUNDERSTORM WINDS HIGH WIND LIGHTNING
## 20843 20212 15754
## HEAVY SNOW HEAVY RAIN WINTER STORM
## 15708 11723 11433
## WINTER WEATHER FUNNEL CLOUD MARINE TSTM WIND
## 7026 6839 6175
## MARINE THUNDERSTORM WIND WATERSPOUT STRONG WIND
## 5812 3796 3566
## URBAN/SML STREAM FLD WILDFIRE BLIZZARD
## 3392 2761 2719
## DROUGHT ICE STORM EXCESSIVE HEAT
## 2488 2006 1678
## HIGH WINDS WILD/FOREST FIRE FROST/FREEZE
## 1533 1457 1342
## DENSE FOG WINTER WEATHER/MIX TSTM WIND/HAIL
## 1293 1104 1028
## EXTREME COLD/WIND CHILL HEAT HIGH SURF
## 1002 767 725
## TROPICAL STORM FLASH FLOODING EXTREME COLD
## 690 682 655
## COASTAL FLOOD LAKE-EFFECT SNOW FLOOD/FLASH FLOOD
## 650 636 624
## LANDSLIDE SNOW COLD/WIND CHILL
## 600 587 539
## FOG RIP CURRENT MARINE HAIL
## 538 470 442
## DUST STORM AVALANCHE WIND
## 427 386 340
## RIP CURRENTS STORM SURGE FREEZING RAIN
## 304 261 250
## URBAN FLOOD HEAVY SURF/HIGH SURF EXTREME WINDCHILL
## 249 228 204
## STRONG WINDS DRY MICROBURST ASTRONOMICAL LOW TIDE
## 196 186 174
## HURRICANE RIVER FLOOD LIGHT SNOW
## 174 173 154
## STORM SURGE/TIDE RECORD WARMTH COASTAL FLOODING
## 148 146 143
## DUST DEVIL MARINE HIGH WIND UNSEASONABLY WARM
## 141 135 126
## FLOODING ASTRONOMICAL HIGH TIDE MODERATE SNOWFALL
## 120 103 101
## URBAN FLOODING WINTRY MIX HURRICANE/TYPHOON
## 98 90 88
## FUNNEL CLOUDS HEAVY SURF RECORD HEAT
## 87 84 81
## FREEZE HEAT WAVE COLD
## 74 74 72
## RECORD COLD ICE THUNDERSTORM WINDS HAIL
## 64 61 61
## TROPICAL DEPRESSION SLEET UNSEASONABLY DRY
## 60 59 56
## FROST GUSTY WINDS THUNDERSTORM WINDSS
## 53 53 51
## MARINE STRONG WIND OTHER SMALL HAIL
## 48 48 47
## FUNNEL FREEZING FOG THUNDERSTORM
## 46 45 45
## Temperature record TSTM WIND (G45) Coastal Flooding
## 43 39 38
## WATERSPOUTS MONTHLY PRECIPITATION WINDS
## 37 36 36
## (Other)
## 2940
summary(dat$PROPDMGEXP)
## - ? + 0 1 2 3 4 5
## 465934 1 8 5 216 25 13 4 4 28
## 6 7 8 B h H K m M
## 4 5 1 40 1 6 424665 7 11330
summary(dat$CROPDMGEXP)
## ? 0 2 B k K m M
## 618413 7 19 1 9 21 281832 1 1994
# Alphabetical characters are used to signify amiplitude including "k", "M" and "B" for thousands, millions and billions
dat_econ <- dat[, c("EVTYPE", "PROPDMGEXP", "PROPDMG" , "CROPDMGEXP", "CROPDMG")]
# We therefore extract components of CROPDMGEXP and PROPDMGEXP with these three values:
dat_econ <- subset(dat_econ, dat_econ$CROPDMGEXP == "K" | dat_econ$CROPDMGEXP == "M" | dat_econ$CROPDMGEXP == "B" | dat_econ$PROPDMGEXP == "k" | dat_econ$PROPDMGEXP == "M" | dat_econ$PROPDMGEXP == "B")
summary(dat_econ)
## EVTYPE PROPDMGEXP PROPDMG
## HAIL :82786 K :275557 Min. : 0.00
## THUNDERSTORM WIND:81431 M : 11330 1st Qu.: 0.00
## FLASH FLOOD :22505 : 4312 Median : 0.00
## FLOOD :14240 B : 40 Mean : 14.15
## TORNADO :13275 0 : 5 3rd Qu.: 3.00
## HIGH WIND :11644 5 : 2 Max. :5000.00
## (Other) :65366 (Other): 1
## CROPDMGEXP CROPDMG
## K :281832 Min. : 0.000
## : 7411 1st Qu.: 0.000
## M : 1994 Median : 0.000
## B : 9 Mean : 4.728
## ? : 1 3rd Qu.: 0.000
## 0 : 0 Max. :990.000
## (Other): 0
head(dat_econ)
## EVTYPE PROPDMGEXP PROPDMG CROPDMGEXP CROPDMG
## 1: TORNADO M 2.5 0
## 2: TORNADO M 2.5 0
## 3: TORNADO M 2.5 0
## 4: TORNADO M 2.5 0
## 5: TORNADO M 2.5 0
## 6: TORNADO M 2.5 0
dim(dat_econ)
## [1] 291247 5
# Now according to the values ok K, M and B and also the corresponding multiplier in PROPDMG and CROPDMG, we replace the components with corresponding costs:
dat_econ$PROPDMG <- as.numeric(dat_econ$PROPDMGEXP == "K") * 1000 * dat_econ$PROPDMG + (dat_econ$PROPDMGEXP == "M") * 1000000 * dat_econ$PROPDMG + (dat_econ$PROPDMGEXP == "B") * 1000000000 * dat_econ$PROPDMG
dat_econ$CROPDMG <- as.numeric(dat_econ$CROPDMGEXP == "K") * 1000 * dat_econ$CROPDMG + (dat_econ$CROPDMGEXP == "M") * 1000000 * dat_econ$CROPDMG + (dat_econ$CROPDMGEXP == "B") * 1000000000 * dat_econ$CROPDMG
dat_econ_pr <- mutate(summarise(group_by(dat_econ[dat_econ$PROPDMG > 0,] , EVTYPE), ECONDMG = sum(PROPDMG)), ECONTYPE = "PROPDMG")
dat_econ_cr <- mutate(summarise(group_by(dat_econ[dat_econ$CROPDMG > 0,], EVTYPE), ECONDMG = sum(CROPDMG)), ECONTYPE = "CROPDMG")
dat_econ_tot <- rbind(dat_econ_pr, dat_econ_cr)
dat_econ_tot <- spread(dat_econ_tot, ECONTYPE, ECONDMG)
dat_econ_tot[is.na(dat_econ_tot$CROPDMG), "CROPDMG"] <- 0
dat_econ_tot[is.na(dat_econ_tot$PROPDMG), "PROPDMG"] <- 0
# Now we compute the total economic damages as the summation of PROPDMGEXP and CROPDMGEXP components of dat_econ, and sort it in descending order:
dat_econ_tot <- arrange(dat_econ_tot, desc(PROPDMG + CROPDMG))
dat_econ_tot <- head(dat_econ_tot)
dat_econ_tot <- gather(dat_econ_tot, ECONTYPE, DMGVAL, CROPDMG:PROPDMG)
# Answer to Q2, economic damages in descending order, for each category of damages:
dat_econ_tot
## # A tibble: 12 × 3
## EVTYPE ECONTYPE DMGVAL
## <fctr> <chr> <dbl>
## 1 FLOOD CROPDMG 5661968450
## 2 HURRICANE/TYPHOON CROPDMG 2607872800
## 3 TORNADO CROPDMG 414953110
## 4 STORM SURGE CROPDMG 5000
## 5 HAIL CROPDMG 3025537450
## 6 FLASH FLOOD CROPDMG 1421317100
## 7 FLOOD PROPDMG 144306779050
## 8 HURRICANE/TYPHOON PROPDMG 69305405000
## 9 TORNADO PROPDMG 54380821690
## 10 STORM SURGE PROPDMG 43305235000
## 11 HAIL PROPDMG 15412627690
## 12 FLASH FLOOD PROPDMG 15400406080
# plotting
ggplot(dat_econ_tot, aes(x = reorder(EVTYPE, -DMGVAL ), y = DMGVAL, fill = ECONTYPE)) +
geom_bar(stat = "identity") +
labs(fill = "Type", title = "Barplot of the events with most economic consequences", x = "Events", y = "Total economic consequences") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))