Synopsis

For this project, we would like to use the NOAA database to examine the impact of weather events to population health and economy of US. We found that torano has higest impact to health, and flood causes the highest damage to economy.

Load libraries we need for this project

library(tidyr)
library(lubridate)
library(ggplot2)
library(data.table)
library(dplyr)

Data Processing

data_raw<-fread("repdata_data_StormData.csv.bz2")

Show structure

str(data_raw)
## Classes 'data.table' and 'data.frame':   902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...
##  - attr(*, ".internal.selfref")=<externalptr>

There are 902297 observations of 37 variables in this dataset. Accoding to the [NOAA] (https://www.ncdc.noaa.gov/stormevents/details.jsp): from 1950 through 1954, only tornado events were recorded.From 1955 through 1992, only tornado, thunderstorm wind and hail events were keyed from the paper publications into digital data. From 1993 to 1995, only tornado, thunderstorm wind and hail events have been extracted. From 1996 to present, 48 event types are recorded. Therefore we ignore the data prior to 1996 for this analysis to reduce (unnecessary) complexibility.

data=data_raw
data$BGN_DATE <- as.Date(data$BGN_DATE, "%m/%d/%Y")
data$YEAR<-year(data$BGN_DATE)
# subset the data: starting from 1996 
data<- data[data$YEAR>1996,]
# take only relevent variables into account
data<-data[,c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]
dim(data)
## [1] 621260      7

Now we have 653530 observations left for our analysis.

summary(data$EVTYPE)
##    Length     Class      Mode 
##    621260 character character
str(data$EVTYPE)
##  chr [1:621260] "TSTM WIND" "TSTM WIND" "TSTM WIND" "TSTM WIND" "TSTM WIND" ...

There are 985 types of events, which is far more than the 48 types claimed by NOAA website. Some of the differences are caused by upper and lower cases, and some caused by typos or others. We take the following steps to reduce the levels.

  1. Subset events with economic or health impacts.
data <- filter(data, PROPDMG > 0 | CROPDMG > 0 | FATALITIES > 0 | INJURIES > 0)
dim(data)
## [1] 191278      7
nlevels(data$EVTYPE)
## [1] 0

There is no improvement (still 985 levels), but the number of observations has been reduced to 201318.

  1. Convert all events to upper cases
data$EVTYPE<-as.factor(toupper(data$EVTYPE))
nlevels(data$EVTYPE)
## [1] 161

There are 186 levels.

  1. Use regular expression to replace some event names to further clean the types
# create a new variable EVENT to transform variable EVTYPE in groups
data$EVENT <- "OTHER"
data$EVENT[grep("HAIL", data$EVTYPE, ignore.case = TRUE)] <- "HAIL"
data$EVENT[grep("HEAT", data$EVTYPE, ignore.case = TRUE)] <- "HEAT"
data$EVENT[grep("FLOOD", data$EVTYPE, ignore.case = TRUE)] <- "FLOOD"
data$EVENT[grep("WIND", data$EVTYPE, ignore.case = TRUE)] <- "WIND"
data$EVENT[grep("STORM", data$EVTYPE, ignore.case = TRUE)] <- "STORM"
data$EVENT[grep("SNOW", data$EVTYPE, ignore.case = TRUE)] <- "SNOW"
data$EVENT[grep("TORNADO", data$EVTYPE, ignore.case = TRUE)] <- "TORNADO"
data$EVENT[grep("WINTER", data$EVTYPE, ignore.case = TRUE)] <- "WINTER"
data$EVENT[grep("RAIN", data$EVTYPE, ignore.case = TRUE)] <- "RAIN"
summary(as.factor(data$EVENT))
##   FLOOD    HAIL    HEAT   OTHER    RAIN    SNOW   STORM TORNADO    WIND  WINTER 
##   27231   21612     828   14919    1051    1273   44391   11750   66329    1894

Weather Effect on Population Health

wide_health<- data[,c("FATALITIES","INJURIES","EVENT")]
# with tidyr_1.0.0 we can use pivot longer to reshape our dataframe to long
long_health <- pivot_longer(wide_health, -c(EVENT), values_to = "VALUE", names_to = "FATALITIES_INJURIES")

# plot FATALITIES and INJURIES by EVENT
ggplot(long_health, aes(x = EVENT, y = VALUE, fill = FATALITIES_INJURIES)) + geom_bar(stat = "identity") +
  xlab("Event Type") + 
  ylab("Total number of fatalities or injuries") +
  ggtitle("Weather Impact on Health")+
  scale_fill_manual(values = c("red", "steelblue"))

Weather Effect on Economic Situation

These are possible values of CROPDMGEXP and PROPDMGEXP: - H,h,K,k,M,m,B,b,+,-,?,0,1,2,3,4,5,6,7,8, and blank-character - H,h = hundreds = 100 - K,k = kilos = thousands = 1,000 - M,m = millions = 1,000,000 - B,b = billions = 1,000,000,000 - (+) = 1 - (-) = 0 - (?) = 0 - black/empty character = 0 - numeric 0..8 = 10

data$PROPDMGEXP <- as.character(data$PROPDMGEXP)
data$PROPDMGEXP[is.na(data$PROPDMGEXP)] <- 0
data$PROPDMGEXP[grep("H", data$PROPDMGEXP, ignore.case = TRUE)] <- "2"
data$PROPDMGEXP[grep("K", data$PROPDMGEXP, ignore.case = TRUE)] <- "3"
data$PROPDMGEXP[grep("M", data$PROPDMGEXP, ignore.case = TRUE)] <- "6"
data$PROPDMGEXP[grep("B", data$PROPDMGEXP, ignore.case = TRUE)] <- "9"
data$PROPDMGEXP[grep("+", data$PROPDMGEXP)] <- "1"
data$PROPDMGEXP[grepl("-|?", data$PROPDMGEXP)] <- "0"
data$PROPDMGEXP[grep("[0-8]", data$PROPDMGEXP)] <- "10"
data$PROPDMGEXP <- as.numeric(as.character(data$PROPDMGEXP))
data$PROP<- data$PROPDMG * 10^data$PROPDMGEXP

data$CROPDMGEXP <- as.character(data$CROPDMGEXP)
data$CROPDMGEXP[is.na(data$CROPDMGEXP)] <- 0 
data$CROPDMGEXP[!grepl("H|K|M|B", data$CROPDMGEXP, ignore.case = TRUE)] <- 0 
data$CROPDMGEXP[grep("H", data$CROPDMGEXP, ignore.case = TRUE)] <- "2"
data$CROPDMGEXP[grep("K", data$CROPDMGEXP, ignore.case = TRUE)] <- "3"
data$CROPDMGEXP[grep("M", data$CROPDMGEXP, ignore.case = TRUE)] <- "6"
data$CROPDMGEXP[grep("B", data$CROPDMGEXP, ignore.case = TRUE)] <- "9"
data$CROPDMGEXP[grep("+", data$CROPDMGEXP)] <- "1"
data$CROPDMGEXP[grepl("-|?", data$CROPDMGEXP)] <- "0"
data$CROPDMGEXP[grep("[0-8]", data$CROPDMGEXP)] <- "10"
data$CROPDMGEXP <- as.numeric(as.character(data$CROPDMGEXP))
data$CROP<- data$CROPDMG * 10^data$CROPDMGEXP
wide_dmg<- data[,c("PROP","CROP","EVENT")]
# with tidyr_1.0.0 we can use pivot longer to reshape our dataframe to long
long_dmg<- pivot_longer(wide_dmg, -c(EVENT), values_to = "VALUE", names_to = "PROP_OR_CROP_DAMAGE")

# plot FATALITIES and INJURIES by EVENT
ggplot(long_dmg, aes(x = EVENT, y = VALUE, fill = PROP_OR_CROP_DAMAGE)) + geom_bar(stat = "identity") +
  xlab("Event Type") + 
  ylab("Total dollar of damage") +
  ggtitle("Weather Impact on Economy")+
  scale_fill_manual(values = c("red", "steelblue"))

Result

From the above figures we can find that for population health in US since 1996, tonado has a greatest impact, expecially the number of injuries. On the other hand, for US economy since 1996, Flood results in the highest loss of value on properties, and hail causes the highest damage on crops.