Load packages used in this analysis.
if (!require(ggplot2)) {
install.packages("ggplot2")
library(ggplot2)
}
## 필요한 패키지를 로딩중입니다: ggplot2
## Warning: 패키지 'ggplot2'는 R 버전 4.4.2에서 작성되었습니다
if (!require(dplyr)) {
install.packages("dplyr")
library(dplyr, warn.conflicts = FALSE)
}
## 필요한 패키지를 로딩중입니다: dplyr
## Warning: 패키지 'dplyr'는 R 버전 4.4.2에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if (!require(xtable)) {
install.packages("xtable")
library(xtable, warn.conflicts = FALSE)
}
## 필요한 패키지를 로딩중입니다: xtable
## Warning: 패키지 'xtable'는 R 버전 4.4.2에서 작성되었습니다
Display session information.
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=Korean_Korea.utf8 LC_CTYPE=Korean_Korea.utf8
## [3] LC_MONETARY=Korean_Korea.utf8 LC_NUMERIC=C
## [5] LC_TIME=Korean_Korea.utf8
##
## time zone: Etc/GMT-9
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] xtable_1.8-4 dplyr_1.1.4 ggplot2_3.5.1
##
## loaded via a namespace (and not attached):
## [1] vctrs_0.6.5 cli_3.6.3 knitr_1.48 rlang_1.1.4
## [5] xfun_0.48 generics_0.1.3 jsonlite_1.8.9 glue_1.8.0
## [9] colorspace_2.1-1 htmltools_0.5.8.1 sass_0.4.9 fansi_1.0.6
## [13] scales_1.3.0 rmarkdown_2.28 grid_4.4.1 evaluate_1.0.1
## [17] munsell_0.5.1 jquerylib_0.1.4 tibble_3.2.1 fastmap_1.2.0
## [21] yaml_2.3.10 lifecycle_1.0.4 compiler_4.4.1 pkgconfig_2.0.3
## [25] rstudioapi_0.17.1 digest_0.6.37 R6_2.5.1 tidyselect_1.2.1
## [29] utf8_1.2.4 pillar_1.9.0 magrittr_2.0.3 bslib_0.8.0
## [33] withr_3.0.1 tools_4.4.1 gtable_0.3.5 cachem_1.1.0
Download the compressed data file from the source URL (if not found
locally) and then load the compressed data file via
read.csv. Prior to processing the data, validate the
downloaded data file and loaded dataset by checking the file size and
dimensions respectively.
stormDataFileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
stormDataFile <- "data/storm-data.csv.bz2"
if (!file.exists('data')) {
dir.create('data')
}
if (!file.exists(stormDataFile)) {
download.file(url = stormDataFileURL, destfile = stormDataFile)
}
stormData <- read.csv(stormDataFile, sep = ",", header = TRUE)
stopifnot(file.size(stormDataFile) == 49177144)
stopifnot(dim(stormData) == c(902297,37))
Display dataset summary
names(stormData)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
str(stormData)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
head(stormData)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1 0 0 NA
## 2 0 0 NA
## 3 0 0 NA
## 4 0 0 NA
## 5 0 0 NA
## 6 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1 0 14.0 100 3 0 0 15 25.0
## 2 0 2.0 150 2 0 0 0 2.5
## 3 0 0.1 123 2 0 0 2 25.0
## 4 0 0.0 100 2 0 0 2 2.5
## 5 0 0.0 150 2 0 0 2 2.5
## 6 0 1.5 177 2 0 0 6 2.5
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1 K 0 3040 8812
## 2 K 0 3042 8755
## 3 K 0 3340 8742
## 4 K 0 3458 8626
## 5 K 0 3412 8642
## 6 K 0 3450 8748
## LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3051 8806 1
## 2 0 0 2
## 3 0 0 3
## 4 0 0 4
## 5 0 0 5
## 6 0 0 6
| Variable | Description |
|---|---|
| EVTYPE | Event type (Flood, Heat, Hurricane, Tornado, …) |
| FATALITIES | Number of fatalities resulting from event |
| INJURIES | Number of injuries resulting from event |
| PROPDMG | Property damage in USD |
| PROPDMGEXP | Unit multiplier for property damage (K, M, or B) |
| CROPDMG | Crop damage in USD |
| CROPDMGEXP | Unit multiplier for property damage (K, M, or B) |
| BGN_DATE | Begin date of the event |
| END_DATE | End date of the event |
| STATE | State where the event occurred |
stormDataNeat <- subset(stormData, EVTYPE != "?"
&
(FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0),
select = c("EVTYPE",
"FATALITIES",
"INJURIES",
"PROPDMG",
"PROPDMGEXP",
"CROPDMG",
"CROPDMGEXP",
"BGN_DATE",
"END_DATE",
"STATE"))
dim(stormDataNeat)
## [1] 254632 10
sum(is.na(stormDataNeat))
## [1] 0
The working (Neat) dataset contains 254632 observations, 10 variables and no missing values.
There are a total of 487 unique Event Type values in the current Neat dataset.
length(unique(stormDataNeat$EVTYPE))
## [1] 487
Exploring the Event Type data revealed many values that appeared to
be similar; however, some of the events entered with different
spellings, pluralization, mixed case and even misspellings. For example,
Strong Wind, STRONG WIND,
Strong Winds, and STRONG WINDS.
The dataset was normalized by converting all Event Type values to uppercase and combining similar Event Type values into unique categories.
stormDataNeat$EVTYPE <- toupper(stormDataNeat$EVTYPE)
# AVALANCHE
stormDataNeat$EVTYPE <- gsub('.*AVALANCE.*', 'AVALANCHE', stormDataNeat$EVTYPE)
# BLIZZARD
stormDataNeat$EVTYPE <- gsub('.*BLIZZARD.*', 'BLIZZARD', stormDataNeat$EVTYPE)
# CLOUD
stormDataNeat$EVTYPE <- gsub('.*CLOUD.*', 'CLOUD', stormDataNeat$EVTYPE)
# COLD
stormDataNeat$EVTYPE <- gsub('.*COLD.*', 'COLD', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('.*FREEZ.*', 'COLD', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('.*FROST.*', 'COLD', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('.*ICE.*', 'COLD', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('.*LOW TEMPERATURE RECORD.*', 'COLD', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('.*LO.*TEMP.*', 'COLD', stormDataNeat$EVTYPE)
# DRY
stormDataNeat$EVTYPE <- gsub('.*DRY.*', 'DRY', stormDataNeat$EVTYPE)
# DUST
stormDataNeat$EVTYPE <- gsub('.*DUST.*', 'DUST', stormDataNeat$EVTYPE)
# FIRE
stormDataNeat$EVTYPE <- gsub('.*FIRE.*', 'FIRE', stormDataNeat$EVTYPE)
# FLOOD
stormDataNeat$EVTYPE <- gsub('.*FLOOD.*', 'FLOOD', stormDataNeat$EVTYPE)
# FOG
stormDataNeat$EVTYPE <- gsub('.*FOG.*', 'FOG', stormDataNeat$EVTYPE)
# HAIL
stormDataNeat$EVTYPE <- gsub('.*HAIL.*', 'HAIL', stormDataNeat$EVTYPE)
# HEAT
stormDataNeat$EVTYPE <- gsub('.*HEAT.*', 'HEAT', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('.*WARM.*', 'HEAT', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('.*HIGH.*TEMP.*', 'HEAT', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('.*RECORD HIGH TEMPERATURES.*', 'HEAT', stormDataNeat$EVTYPE)
# HYPOTHERMIA/EXPOSURE
stormDataNeat$EVTYPE <- gsub('.*HYPOTHERMIA.*', 'HYPOTHERMIA/EXPOSURE', stormDataNeat$EVTYPE)
# LANDSLIDE
stormDataNeat$EVTYPE <- gsub('.*LANDSLIDE.*', 'LANDSLIDE', stormDataNeat$EVTYPE)
# LIGHTNING
stormDataNeat$EVTYPE <- gsub('^LIGHTNING.*', 'LIGHTNING', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('^LIGNTNING.*', 'LIGHTNING', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('^LIGHTING.*', 'LIGHTNING', stormDataNeat$EVTYPE)
# MICROBURST
stormDataNeat$EVTYPE <- gsub('.*MICROBURST.*', 'MICROBURST', stormDataNeat$EVTYPE)
# MUDSLIDE
stormDataNeat$EVTYPE <- gsub('.*MUDSLIDE.*', 'MUDSLIDE', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('.*MUD SLIDE.*', 'MUDSLIDE', stormDataNeat$EVTYPE)
# RAIN
stormDataNeat$EVTYPE <- gsub('.*RAIN.*', 'RAIN', stormDataNeat$EVTYPE)
# RIP CURRENT
stormDataNeat$EVTYPE <- gsub('.*RIP CURRENT.*', 'RIP CURRENT', stormDataNeat$EVTYPE)
# STORM
stormDataNeat$EVTYPE <- gsub('.*STORM.*', 'STORM', stormDataNeat$EVTYPE)
# SUMMARY
stormDataNeat$EVTYPE <- gsub('.*SUMMARY.*', 'SUMMARY', stormDataNeat$EVTYPE)
# TORNADO
stormDataNeat$EVTYPE <- gsub('.*TORNADO.*', 'TORNADO', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('.*TORNDAO.*', 'TORNADO', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('.*LANDSPOUT.*', 'TORNADO', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('.*WATERSPOUT.*', 'TORNADO', stormDataNeat$EVTYPE)
# SURF
stormDataNeat$EVTYPE <- gsub('.*SURF.*', 'SURF', stormDataNeat$EVTYPE)
# VOLCANIC
stormDataNeat$EVTYPE <- gsub('.*VOLCANIC.*', 'VOLCANIC', stormDataNeat$EVTYPE)
# WET
stormDataNeat$EVTYPE <- gsub('.*WET.*', 'WET', stormDataNeat$EVTYPE)
# WIND
stormDataNeat$EVTYPE <- gsub('.*WIND.*', 'WIND', stormDataNeat$EVTYPE)
# WINTER
stormDataNeat$EVTYPE <- gsub('.*WINTER.*', 'WINTER', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('.*WINTRY.*', 'WINTER', stormDataNeat$EVTYPE)
stormDataNeat$EVTYPE <- gsub('.*SNOW.*', 'WINTER', stormDataNeat$EVTYPE)
After Neating the dataset, the number of unique Event Type values were reduced to 81
length(unique(stormDataNeat$EVTYPE))
## [1] 81
Format date variables for any type of optional reporting or further analysis.
In the raw dataset, the BNG_START and
END_DATE variables are stored as factors which should be
made available as actual date types that can be manipulated and
reported on. For now, time variables will be ignored.
Create four new variables based on date variables in the Neat dataset:
| Variable | Description |
|---|---|
| DATE_START | Begin date of the event stored as a date type |
| DATE_END | End date of the event stored as a date type |
| YEAR | Year the event started |
| DURATION | Duration (in hours) of the event |
stormDataNeat$DATE_START <- as.Date(stormDataNeat$BGN_DATE, format = "%m/%d/%Y")
stormDataNeat$DATE_END <- as.Date(stormDataNeat$END_DATE, format = "%m/%d/%Y")
stormDataNeat$YEAR <- as.integer(format(stormDataNeat$DATE_START, "%Y"))
stormDataNeat$DURATION <- as.numeric(stormDataNeat$DATE_END - stormDataNeat$DATE_START)/3600
According to the “National Weather Service Storm
Data Documentation” (page 12), information about Property Damage is
logged using two variables: PROPDMG and
PROPDMGEXP. PROPDMG is the mantissa (the
significand) rounded to three significant digits and
PROPDMGEXP is the exponent (the multiplier). The same
approach is used for Crop Damage where the CROPDMG variable
is encoded by the CROPDMGEXP variable.
The documentation also specifies that the PROPDMGEXP and
CROPDMGEXP are supposed to contain an alphabetical
character used to signify magnitude and logs “K” for thousands, “M” for
millions, and “B” for billions. A quick review of the data, however,
shows that there are several other characters being logged.
table(toupper(stormDataNeat$PROPDMGEXP))
##
## - + 0 2 3 4 5 6 7 B
## 11585 1 5 210 1 1 4 18 3 3 40
## H K M
## 7 231427 11327
table(toupper(stormDataNeat$CROPDMGEXP))
##
## ? 0 B K M
## 152663 6 17 7 99953 1986
In order to calculate costs, the PROPDMGEXP and
CROPDMGEXP variables will be mapped to a multiplier factor
which will then be used to calculate the actual costs for both property
and crop damage. Two new variables will be created to store damage
costs:
# function to get multiplier factor
getMultiplier <- function(exp) {
exp <- toupper(exp);
if (exp == "") return (10^0);
if (exp == "-") return (10^0);
if (exp == "?") return (10^0);
if (exp == "+") return (10^0);
if (exp == "0") return (10^0);
if (exp == "1") return (10^1);
if (exp == "2") return (10^2);
if (exp == "3") return (10^3);
if (exp == "4") return (10^4);
if (exp == "5") return (10^5);
if (exp == "6") return (10^6);
if (exp == "7") return (10^7);
if (exp == "8") return (10^8);
if (exp == "9") return (10^9);
if (exp == "H") return (10^2);
if (exp == "K") return (10^3);
if (exp == "M") return (10^6);
if (exp == "B") return (10^9);
return (NA);
}
# calculate property damage and crop damage costs (in billions)
stormDataNeat$PROP_COST <- with(stormDataNeat, as.numeric(PROPDMG) * sapply(PROPDMGEXP, getMultiplier))/10^9
stormDataNeat$CROP_COST <- with(stormDataNeat, as.numeric(CROPDMG) * sapply(CROPDMGEXP, getMultiplier))/10^9
Create a summarized dataset of health impact data (fatalities + injuries). Sort the results in descending order by health impact.
healthImpactData <- aggregate(x = list(HEALTH_IMPACT = stormDataNeat$FATALITIES + stormDataNeat$INJURIES),
by = list(EVENT_TYPE = stormDataNeat$EVTYPE),
FUN = sum,
na.rm = TRUE)
healthImpactData <- healthImpactData[order(healthImpactData$HEALTH_IMPACT, decreasing = TRUE),]
Create a summarized dataset of damage impact costs (property damage + crop damage). Sort the results in descending order by damage cost.
damageCostImpactData <- aggregate(x = list(DAMAGE_IMPACT = stormDataNeat$PROP_COST + stormDataNeat$CROP_COST),
by = list(EVENT_TYPE = stormDataNeat$EVTYPE),
FUN = sum,
na.rm = TRUE)
damageCostImpactData <- damageCostImpactData[order(damageCostImpactData$DAMAGE_IMPACT, decreasing = TRUE),]
Fatalities and injuries have the most harmful impact on population health. The results below display the 10 most harmful weather events in terms of population health in the U.S.
print(xtable(head(healthImpactData, 10),
caption = "Top 10 Weather Events Most Harmful to Population Health"),
caption.placement = 'top',
type = "html",
include.rownames = FALSE,
html.table.attributes='class="table-bordered", width="100%"')
| EVENT_TYPE | HEALTH_IMPACT |
|---|---|
| TORNADO | 97075.00 |
| HEAT | 12392.00 |
| FLOOD | 10127.00 |
| WIND | 9893.00 |
| LIGHTNING | 6049.00 |
| STORM | 4780.00 |
| COLD | 3100.00 |
| WINTER | 1924.00 |
| FIRE | 1698.00 |
| HAIL | 1512.00 |
healthImpactChart <- ggplot(head(healthImpactData, 10),
aes(x = reorder(EVENT_TYPE, HEALTH_IMPACT), y = HEALTH_IMPACT, fill = EVENT_TYPE)) +
coord_flip() +
geom_bar(stat = "identity") +
xlab("Event Type") +
ylab("Total Fatalities and Injures") +
theme(plot.title = element_text(size = 14, hjust = 0.5)) +
ggtitle("Top 10 Weather Events Most Harmful to\nPopulation Health")
print(healthImpactChart)
Property and crop damage have the most harmful impact on the economy. The results below display the 10 most harmful weather events in terms economic consequences in the U.S.
print(xtable(head(damageCostImpactData, 10),
caption = "Top 10 Weather Events with Greatest Economic Consequences"),
caption.placement = 'top',
type = "html",
include.rownames = FALSE,
html.table.attributes='class="table-bordered", width="100%"')
| EVENT_TYPE | DAMAGE_IMPACT |
|---|---|
| FLOOD | 180.58 |
| HURRICANE/TYPHOON | 71.91 |
| STORM | 70.45 |
| TORNADO | 57.43 |
| HAIL | 20.74 |
| DROUGHT | 15.02 |
| HURRICANE | 14.61 |
| COLD | 12.70 |
| WIND | 12.01 |
| FIRE | 8.90 |
damageCostImpactChart <- ggplot(head(damageCostImpactData, 10),
aes(x = reorder(EVENT_TYPE, DAMAGE_IMPACT), y = DAMAGE_IMPACT, fill = EVENT_TYPE)) +
coord_flip() +
geom_bar(stat = "identity") +
xlab("Event Type") +
ylab("Total Property / Crop Damage Cost\n(in Billions)") +
theme(plot.title = element_text(size = 14, hjust = 0.5)) +
ggtitle("Top 10 Weather Events with\nGreatest Economic Consequences")
print(damageCostImpactChart)
Based on the evidence all the data and graphs, the following conclusions can be drawn:
Which types of weather events are most harmful to population health?
Tornadoes are responsible for the greatest number of fatalities and injuries.
Which types of weather events have the greatest economic consequences?
Floods are responsible for causing the most property damage and crop damage costs.