For this project, we would like to use the NOAA database to examine the impact of weather events to population health and economy of US. We found that torano has higest impact to health, and flood causes the highest damage to economy.
Load libraries we need for this project
library(tidyr)
library(lubridate)
library(ggplot2)
library(data.table)
library(dplyr)
data_raw<-fread("repdata_data_StormData.csv.bz2")
Show structure
str(data_raw)
## Classes 'data.table' and 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, ".internal.selfref")=<externalptr>
There are 902297 observations of 37 variables in this dataset. Accoding to the [NOAA] (https://www.ncdc.noaa.gov/stormevents/details.jsp): from 1950 through 1954, only tornado events were recorded.From 1955 through 1992, only tornado, thunderstorm wind and hail events were keyed from the paper publications into digital data. From 1993 to 1995, only tornado, thunderstorm wind and hail events have been extracted. From 1996 to present, 48 event types are recorded. Therefore we ignore the data prior to 1996 for this analysis to reduce (unnecessary) complexibility.
data=data_raw
data$BGN_DATE <- as.Date(data$BGN_DATE, "%m/%d/%Y")
data$YEAR<-year(data$BGN_DATE)
# subset the data: starting from 1996
data<- data[data$YEAR>1996,]
# take only relevent variables into account
data<-data[,c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]
dim(data)
## [1] 621260 7
Now we have 653530 observations left for our analysis.
summary(data$EVTYPE)
## Length Class Mode
## 621260 character character
str(data$EVTYPE)
## chr [1:621260] "TSTM WIND" "TSTM WIND" "TSTM WIND" "TSTM WIND" "TSTM WIND" ...
There are 985 types of events, which is far more than the 48 types claimed by NOAA website. Some of the differences are caused by upper and lower cases, and some caused by typos or others. We take the following steps to reduce the levels.
data <- filter(data, PROPDMG > 0 | CROPDMG > 0 | FATALITIES > 0 | INJURIES > 0)
dim(data)
## [1] 191278 7
nlevels(data$EVTYPE)
## [1] 0
There is no improvement (still 985 levels), but the number of observations has been reduced to 201318.
data$EVTYPE<-as.factor(toupper(data$EVTYPE))
nlevels(data$EVTYPE)
## [1] 161
There are 186 levels.
# create a new variable EVENT to transform variable EVTYPE in groups
data$EVENT <- "OTHER"
data$EVENT[grep("HAIL", data$EVTYPE, ignore.case = TRUE)] <- "HAIL"
data$EVENT[grep("HEAT", data$EVTYPE, ignore.case = TRUE)] <- "HEAT"
data$EVENT[grep("FLOOD", data$EVTYPE, ignore.case = TRUE)] <- "FLOOD"
data$EVENT[grep("WIND", data$EVTYPE, ignore.case = TRUE)] <- "WIND"
data$EVENT[grep("STORM", data$EVTYPE, ignore.case = TRUE)] <- "STORM"
data$EVENT[grep("SNOW", data$EVTYPE, ignore.case = TRUE)] <- "SNOW"
data$EVENT[grep("TORNADO", data$EVTYPE, ignore.case = TRUE)] <- "TORNADO"
data$EVENT[grep("WINTER", data$EVTYPE, ignore.case = TRUE)] <- "WINTER"
data$EVENT[grep("RAIN", data$EVTYPE, ignore.case = TRUE)] <- "RAIN"
summary(as.factor(data$EVENT))
## FLOOD HAIL HEAT OTHER RAIN SNOW STORM TORNADO WIND WINTER
## 27231 21612 828 14919 1051 1273 44391 11750 66329 1894
wide_health<- data[,c("FATALITIES","INJURIES","EVENT")]
# with tidyr_1.0.0 we can use pivot longer to reshape our dataframe to long
long_health <- pivot_longer(wide_health, -c(EVENT), values_to = "VALUE", names_to = "FATALITIES_INJURIES")
# plot FATALITIES and INJURIES by EVENT
ggplot(long_health, aes(x = EVENT, y = VALUE, fill = FATALITIES_INJURIES)) + geom_bar(stat = "identity") +
xlab("Event Type") +
ylab("Total number of fatalities or injuries") +
ggtitle("Weather Impact on Health")+
scale_fill_manual(values = c("red", "steelblue"))
These are possible values of CROPDMGEXP and PROPDMGEXP: - H,h,K,k,M,m,B,b,+,-,?,0,1,2,3,4,5,6,7,8, and blank-character - H,h = hundreds = 100 - K,k = kilos = thousands = 1,000 - M,m = millions = 1,000,000 - B,b = billions = 1,000,000,000 - (+) = 1 - (-) = 0 - (?) = 0 - black/empty character = 0 - numeric 0..8 = 10
data$PROPDMGEXP <- as.character(data$PROPDMGEXP)
data$PROPDMGEXP[is.na(data$PROPDMGEXP)] <- 0
data$PROPDMGEXP[grep("H", data$PROPDMGEXP, ignore.case = TRUE)] <- "2"
data$PROPDMGEXP[grep("K", data$PROPDMGEXP, ignore.case = TRUE)] <- "3"
data$PROPDMGEXP[grep("M", data$PROPDMGEXP, ignore.case = TRUE)] <- "6"
data$PROPDMGEXP[grep("B", data$PROPDMGEXP, ignore.case = TRUE)] <- "9"
data$PROPDMGEXP[grep("+", data$PROPDMGEXP)] <- "1"
data$PROPDMGEXP[grepl("-|?", data$PROPDMGEXP)] <- "0"
data$PROPDMGEXP[grep("[0-8]", data$PROPDMGEXP)] <- "10"
data$PROPDMGEXP <- as.numeric(as.character(data$PROPDMGEXP))
data$PROP<- data$PROPDMG * 10^data$PROPDMGEXP
data$CROPDMGEXP <- as.character(data$CROPDMGEXP)
data$CROPDMGEXP[is.na(data$CROPDMGEXP)] <- 0
data$CROPDMGEXP[!grepl("H|K|M|B", data$CROPDMGEXP, ignore.case = TRUE)] <- 0
data$CROPDMGEXP[grep("H", data$CROPDMGEXP, ignore.case = TRUE)] <- "2"
data$CROPDMGEXP[grep("K", data$CROPDMGEXP, ignore.case = TRUE)] <- "3"
data$CROPDMGEXP[grep("M", data$CROPDMGEXP, ignore.case = TRUE)] <- "6"
data$CROPDMGEXP[grep("B", data$CROPDMGEXP, ignore.case = TRUE)] <- "9"
data$CROPDMGEXP[grep("+", data$CROPDMGEXP)] <- "1"
data$CROPDMGEXP[grepl("-|?", data$CROPDMGEXP)] <- "0"
data$CROPDMGEXP[grep("[0-8]", data$CROPDMGEXP)] <- "10"
data$CROPDMGEXP <- as.numeric(as.character(data$CROPDMGEXP))
data$CROP<- data$CROPDMG * 10^data$CROPDMGEXP
wide_dmg<- data[,c("PROP","CROP","EVENT")]
# with tidyr_1.0.0 we can use pivot longer to reshape our dataframe to long
long_dmg<- pivot_longer(wide_dmg, -c(EVENT), values_to = "VALUE", names_to = "PROP_OR_CROP_DAMAGE")
# plot FATALITIES and INJURIES by EVENT
ggplot(long_dmg, aes(x = EVENT, y = VALUE, fill = PROP_OR_CROP_DAMAGE)) + geom_bar(stat = "identity") +
xlab("Event Type") +
ylab("Total dollar of damage") +
ggtitle("Weather Impact on Economy")+
scale_fill_manual(values = c("red", "steelblue"))
From the above figures we can find that for population health in US since 1996, tonado has a greatest impact, expecially the number of injuries. On the other hand, for US economy since 1996, Flood results in the highest loss of value on properties, and hail causes the highest damage on crops.