This project is going to answer some basic questions about severe weather events. Using data collected by the National Oceanic and Atmospheric Administration (NOAA), we will examine the public health and financial impacts of weather events. These data will quantify the top ten most expensive events (in terms of dollars) and the top ten human casualty events (defined as deaths and injuries.) The NOAA storm database will be downloaded, pre-processed, analyzed and reported using R and will include interpretation of estimates, assumptions and other data sources.
From the NOAA Storm Database, we obtain the storm data from 1950 to 2011. After reading in the strom data in the bz2 archive, we download and read the data, then store it in Storm_data.
## loading package
library(ggplot2)
library(dplyr)
## download the storm data, then open and store it
URL <-
"https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
stormfil <- basename(URL)
if (!file.exists(stormfil))
download.file(URL, stormfil)
file_temp <- bzfile(stormfil)
Storm_data <- read.csv(file_temp, sep = ",", header = TRUE)
Display the data summary.
names(Storm_data)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY" "COUNTYNAME" "STATE"
## [8] "EVTYPE" "BGN_RANGE" "BGN_AZI" "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END"
## [15] "COUNTYENDN" "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH" "F"
## [22] "MAG" "FATALITIES" "INJURIES" "PROPDMG" "PROPDMGEXP" "CROPDMG" "CROPDMGEXP"
## [29] "WFO" "STATEOFFIC" "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
head(Storm_data)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE BGN_RANGE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO 0
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO 0
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO 0
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO 0
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO 0
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO 0
## BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH
## 1 0 NA 0 14.0
## 2 0 NA 0 2.0
## 3 0 NA 0 0.1
## 4 0 NA 0 0.0
## 5 0 NA 0 0.0
## 6 0 NA 0 1.5
## WIDTH F MAG FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 100 3 0 0 15 25.0 K 0
## 2 150 2 0 0 0 2.5 K 0
## 3 123 2 0 0 2 25.0 K 0
## 4 100 2 0 0 2 2.5 K 0
## 5 150 2 0 0 2 2.5 K 0
## 6 177 2 0 0 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
## 4 3458 8626 0 0 4
## 5 3412 8642 0 0 5
## 6 3450 8748 0 0 6
str(Storm_data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
summary(Storm_data)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY
## Min. : 1.0 Length:902297 Length:902297 Length:902297 Min. : 0.0
## 1st Qu.:19.0 Class :character Class :character Class :character 1st Qu.: 31.0
## Median :30.0 Mode :character Mode :character Mode :character Median : 75.0
## Mean :31.2 Mean :100.6
## 3rd Qu.:45.0 3rd Qu.:131.0
## Max. :95.0 Max. :873.0
##
## COUNTYNAME STATE EVTYPE BGN_RANGE BGN_AZI
## Length:902297 Length:902297 Length:902297 Min. : 0.000 Length:902297
## Class :character Class :character Class :character 1st Qu.: 0.000 Class :character
## Mode :character Mode :character Mode :character Median : 0.000 Mode :character
## Mean : 1.484
## 3rd Qu.: 1.000
## Max. :3749.000
##
## BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## Length:902297 Length:902297 Length:902297 Min. :0 Mode:logical
## Class :character Class :character Class :character 1st Qu.:0 NA's:902297
## Mode :character Mode :character Mode :character Median :0
## Mean :0
## 3rd Qu.:0
## Max. :0
##
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH
## Min. : 0.0000 Length:902297 Length:902297 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0000 Class :character Class :character 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 0.0000 Mode :character Mode :character Median : 0.0000 Median : 0.000
## Mean : 0.9862 Mean : 0.2301 Mean : 7.503
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.000
## Max. :925.0000 Max. :2315.0000 Max. :4400.000
##
## F MAG FATALITIES INJURIES PROPDMG
## Min. :0.0 Min. : 0.0 Min. : 0.0000 Min. : 0.0000 Min. : 0.00
## 1st Qu.:0.0 1st Qu.: 0.0 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00
## Median :1.0 Median : 50.0 Median : 0.0000 Median : 0.0000 Median : 0.00
## Mean :0.9 Mean : 46.9 Mean : 0.0168 Mean : 0.1557 Mean : 12.06
## 3rd Qu.:1.0 3rd Qu.: 75.0 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.50
## Max. :5.0 Max. :22000.0 Max. :583.0000 Max. :1700.0000 Max. :5000.00
## NA's :843563
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC
## Length:902297 Min. : 0.000 Length:902297 Length:902297 Length:902297
## Class :character 1st Qu.: 0.000 Class :character Class :character Class :character
## Mode :character Median : 0.000 Mode :character Mode :character Mode :character
## Mean : 1.527
## 3rd Qu.: 0.000
## Max. :990.000
##
## ZONENAMES LATITUDE LONGITUDE LATITUDE_E LONGITUDE_
## Length:902297 Min. : 0 Min. :-14451 Min. : 0 Min. :-14455
## Class :character 1st Qu.:2802 1st Qu.: 7247 1st Qu.: 0 1st Qu.: 0
## Mode :character Median :3540 Median : 8707 Median : 0 Median : 0
## Mean :2875 Mean : 6940 Mean :1452 Mean : 3509
## 3rd Qu.:4019 3rd Qu.: 9605 3rd Qu.:3549 3rd Qu.: 8735
## Max. :9706 Max. : 17124 Max. :9706 Max. :106220
## NA's :47 NA's :40
## REMARKS REFNUM
## Length:902297 Min. : 1
## Class :character 1st Qu.:225575
## Mode :character Median :451149
## Mean :451149
## 3rd Qu.:676723
## Max. :902297
##
We are going to subset the raw data to focus on the important variables relating to this research. The variables we need are the type of events (including the state and time of the event), damage numbers (including Unit multiplier).
StormData_Clean <-
subset(
Storm_data,
EVTYPE != "?" & (FATALITIES > 0 |
INJURIES > 0 |
PROPDMG > 0 |
CROPDMG > 0),
select = c(
"EVTYPE",
"FATALITIES",
"INJURIES",
"PROPDMG",
"PROPDMGEXP",
"CROPDMG",
"CROPDMGEXP",
"BGN_DATE",
"END_DATE"
)
)
num_obser <- nrow(StormData_Clean)
num_variable <- ncol(StormData_Clean)
Display the summary of subset data.
names(StormData_Clean)
## [1] "EVTYPE" "FATALITIES" "INJURIES" "PROPDMG" "PROPDMGEXP" "CROPDMG" "CROPDMGEXP"
## [8] "BGN_DATE" "END_DATE"
head(StormData_Clean)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP BGN_DATE END_DATE
## 1 TORNADO 0 15 25.0 K 0 4/18/1950 0:00:00
## 2 TORNADO 0 0 2.5 K 0 4/18/1950 0:00:00
## 3 TORNADO 0 2 25.0 K 0 2/20/1951 0:00:00
## 4 TORNADO 0 2 2.5 K 0 6/8/1951 0:00:00
## 5 TORNADO 0 2 2.5 K 0 11/15/1951 0:00:00
## 6 TORNADO 0 6 2.5 K 0 11/15/1951 0:00:00
str(StormData_Clean)
## 'data.frame': 254632 obs. of 9 variables:
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ END_DATE : chr "" "" "" "" ...
summary(StormData_Clean)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP
## Length:254632 Min. : 0.0000 Min. : 0.0000 Min. : 0.00 Length:254632
## Class :character 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 2.00 Class :character
## Mode :character Median : 0.0000 Median : 0.0000 Median : 5.00 Mode :character
## Mean : 0.0595 Mean : 0.5519 Mean : 42.75
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 25.00
## Max. :583.0000 Max. :1700.0000 Max. :5000.00
## CROPDMG CROPDMGEXP BGN_DATE END_DATE
## Min. : 0.000 Length:254632 Length:254632 Length:254632
## 1st Qu.: 0.000 Class :character Class :character Class :character
## Median : 0.000 Mode :character Mode :character Mode :character
## Mean : 5.411
## 3rd Qu.: 0.000
## Max. :990.000
The subset data for analysing contains 254632 observations and 9 variables. Next, we transform the class of BGN_DATE and END_DATE into time format. Then, we create two new variable to identify the year and duration of events.
StormData_Clean$BGN_DATE <-
as.Date(StormData_Clean$BGN_DATE, format = "%m/%d/%Y")
StormData_Clean$END_DATE <-
as.Date(StormData_Clean$END_DATE, format = "%m/%d/%Y")
StormData_Clean$YEAR <-
as.integer(format(StormData_Clean$BGN_DATE, "%Y"))
StormData_Clean$DURATION <-
as.numeric(StormData_Clean$END_DATE - StormData_Clean$BGN_DATE) / 3600
## modify the name of event type
StormData_Clean$EVTYPE <- toupper(StormData_Clean$EVTYPE)
To get a better overall picture of the data, we want to plot the event number per year.
StormYear_Type <- StormData_Clean %>%
subset(select = c("EVTYPE", "YEAR")) %>%
group_by(YEAR) %>%
summarise(by = length(unique(EVTYPE)))
plot(
x = StormYear_Type$YEAR,
y = StormYear_Type$by,
xlab = "Year",
ylab = "Numbers of event types",
main = "Numbers of Event Types per Year"
)
According to the figure, only one type of event was observed before 1993, and there are too many observed types during 1994-1995.So we need to subset the data again, focusing on the data between 1996 and 2011.
StormData_Clean <- subset(StormData_Clean, YEAR >= 1996)
num_event <- length(unique(StormData_Clean$EVTYPE))
The total number of unique event type is 186.
In order to identify accurate damage number, we need to create a function to calculate it, then store it in new variables. The calculation method is based on the National Weather Service Storm Data Documentation. In addition, we need to identify the damage level of event influencing population health.
## create function to calculate the damage number of economic consequences
damage_num <- function (x) {
if (x == "")
return (10 ^ 0)
if (x == "-")
return (10 ^ 0)
if (x == "?")
return (10 ^ 0)
if (x == "+")
return (10 ^ 0)
if (x == "0")
return (10 ^ 0)
if (x == "1")
return (10 ^ 1)
if (x == "2")
return (10 ^ 2)
if (x == "3")
return (10 ^ 3)
if (x == "4")
return (10 ^ 4)
if (x == "5")
return (10 ^ 5)
if (x == "6")
return (10 ^ 6)
if (x == "7")
return (10 ^ 7)
if (x == "8")
return (10 ^ 8)
if (x == "9")
return (10 ^ 9)
if (x == "H")
return (10 ^ 2)
if (x == "K")
return (10 ^ 3)
if (x == "M")
return (10 ^ 6)
if (x == "B")
return (10 ^ 9)
return (NA)
}
## store the damage number into new variable
StormData_Clean$PROPDAMA_NUM <-
with(StormData_Clean,
as.numeric(PROPDMG) * sapply(PROPDMGEXP, damage_num)) / 10 ^ 3
StormData_Clean$CROPDAMA_NUM <-
with(StormData_Clean,
as.numeric(CROPDMG) * sapply(CROPDMGEXP, damage_num)) / 10 ^ 3
## calculate the damage level of population health
StormData_Clean$HEALTHDAMA_NUM <-
StormData_Clean$FATALITIES + StormData_Clean$INJURIES
| Summarize health damage data (fatalities + injuries) and economic damage data (property + crop). | First step is subsetting clean data, and create Health_damage. Then plot bar graphic to indentify top 10 extreme weather which result in the most amout of health damage.
Health_damage <-
StormData_Clean %>%
group_by(EVTYPE) %>%
summarise(healthdam_total = sum(HEALTHDAMA_NUM)) %>%
arrange(desc(healthdam_total))
## plot Health_damage[1:10]
gh <-
ggplot(Health_damage[1:10, ],
aes(x = reorder(EVTYPE, healthdam_total), y = healthdam_total),
color = EVTYPE)
chart1 <-
gh + geom_bar(stat = "identity") + xlab("Event type") + ylab("Damage to population health (fatalities and injuries)") + ggtitle("Population health risks of extreme weather (top 10 events from 1996-2011)") + theme(plot.title = element_text(size = 14, hjust = 0.5)) + coord_flip() + theme_light()
print(chart1)
Second step is subsetting clean data, and create Economic_damage. Then plot bar graphic to indentify top 10 extreme weather which result in the most amout of economic damage.
StormData_Clean$ECONOMIC_SUM <-
StormData_Clean$PROPDAMA_NUM + StormData_Clean$CROPDAMA_NUM
Economic_damage <-
StormData_Clean %>%
group_by(EVTYPE) %>%
summarise(economicdam_total = sum(ECONOMIC_SUM)) %>%
arrange(desc(economicdam_total))
## plot Health_damage[1:10]
gh <-
ggplot(Economic_damage[1:10, ],
aes(x = reorder(EVTYPE, economicdam_total), y = economicdam_total),
color = EVTYPE)
chart2 <-
gh + geom_bar(stat = "identity") + xlab("Event type") + ylab("Economic consequences of extreme weacher (corp damage and property damage)") + ggtitle("Economic consequences of extreme weather (top 10 events from 1996-2011)") + theme(plot.title = element_text(size = 15, hjust = 1)) + coord_flip() + theme_light()
print(chart2)