if (!require(ggplot2)) {
install.packages("ggplot2")
library(ggplot2)
}
## Loading required package: ggplot2
if (!require(dplyr)) {
install.packages("dplyr")
library(dplyr, warn.conflicts = FALSE)
}
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if (!require(xtable)) {
install.packages("xtable")
library(xtable, warn.conflicts = FALSE)
}
## Loading required package: xtable
## Warning: package 'xtable' was built under R version 4.3.3
stormDataFileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
stormDataFile <- "data/storm-data.csv.bz2"
if (!file.exists('data')) {
dir.create('data')
}
if (!file.exists(stormDataFile)) {
download.file(url = stormDataFileURL, destfile = stormDataFile)
}
stormData <- read.csv(stormDataFile, sep = ",", header = TRUE)
stopifnot(file.size(stormDataFile) == 49177144)
stopifnot(dim(stormData) == c(902297,37))
names(stormData)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
str(stormData)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
head(stormData)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1 0 0 NA
## 2 0 0 NA
## 3 0 0 NA
## 4 0 0 NA
## 5 0 0 NA
## 6 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1 0 14.0 100 3 0 0 15 25.0
## 2 0 2.0 150 2 0 0 0 2.5
## 3 0 0.1 123 2 0 0 2 25.0
## 4 0 0.0 100 2 0 0 2 2.5
## 5 0 0.0 150 2 0 0 2 2.5
## 6 0 1.5 177 2 0 0 6 2.5
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1 K 0 3040 8812
## 2 K 0 3042 8755
## 3 K 0 3340 8742
## 4 K 0 3458 8626
## 5 K 0 3412 8642
## 6 K 0 3450 8748
## LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3051 8806 1
## 2 0 0 2
## 3 0 0 3
## 4 0 0 4
## 5 0 0 5
## 6 0 0 6
stormDataTidy <- subset(stormData, EVTYPE != "?"
&
(FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0),
select = c("EVTYPE",
"FATALITIES",
"INJURIES",
"PROPDMG",
"PROPDMGEXP",
"CROPDMG",
"CROPDMGEXP",
"BGN_DATE",
"END_DATE",
"STATE"))
dim(stormDataTidy)
## [1] 254632 10
sum(is.na(stormDataTidy))
## [1] 0
length(unique(stormDataTidy$EVTYPE))
## [1] 487
stormDataTidy$EVTYPE <- toupper(stormDataTidy$EVTYPE)
# AVALANCHE
stormDataTidy$EVTYPE <- gsub('.*AVALANCE.*', 'AVALANCHE', stormDataTidy$EVTYPE)
# BLIZZARD
stormDataTidy$EVTYPE <- gsub('.*BLIZZARD.*', 'BLIZZARD', stormDataTidy$EVTYPE)
# CLOUD
stormDataTidy$EVTYPE <- gsub('.*CLOUD.*', 'CLOUD', stormDataTidy$EVTYPE)
# COLD
stormDataTidy$EVTYPE <- gsub('.*COLD.*', 'COLD', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*FREEZ.*', 'COLD', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*FROST.*', 'COLD', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*ICE.*', 'COLD', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*LOW TEMPERATURE RECORD.*', 'COLD', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*LO.*TEMP.*', 'COLD', stormDataTidy$EVTYPE)
# DRY
stormDataTidy$EVTYPE <- gsub('.*DRY.*', 'DRY', stormDataTidy$EVTYPE)
# DUST
stormDataTidy$EVTYPE <- gsub('.*DUST.*', 'DUST', stormDataTidy$EVTYPE)
# FIRE
stormDataTidy$EVTYPE <- gsub('.*FIRE.*', 'FIRE', stormDataTidy$EVTYPE)
# FLOOD
stormDataTidy$EVTYPE <- gsub('.*FLOOD.*', 'FLOOD', stormDataTidy$EVTYPE)
# FOG
stormDataTidy$EVTYPE <- gsub('.*FOG.*', 'FOG', stormDataTidy$EVTYPE)
# HAIL
stormDataTidy$EVTYPE <- gsub('.*HAIL.*', 'HAIL', stormDataTidy$EVTYPE)
# HEAT
stormDataTidy$EVTYPE <- gsub('.*HEAT.*', 'HEAT', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*WARM.*', 'HEAT', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*HIGH.*TEMP.*', 'HEAT', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*RECORD HIGH TEMPERATURES.*', 'HEAT', stormDataTidy$EVTYPE)
# HYPOTHERMIA/EXPOSURE
stormDataTidy$EVTYPE <- gsub('.*HYPOTHERMIA.*', 'HYPOTHERMIA/EXPOSURE', stormDataTidy$EVTYPE)
# LANDSLIDE
stormDataTidy$EVTYPE <- gsub('.*LANDSLIDE.*', 'LANDSLIDE', stormDataTidy$EVTYPE)
# LIGHTNING
stormDataTidy$EVTYPE <- gsub('^LIGHTNING.*', 'LIGHTNING', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('^LIGNTNING.*', 'LIGHTNING', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('^LIGHTING.*', 'LIGHTNING', stormDataTidy$EVTYPE)
# MICROBURST
stormDataTidy$EVTYPE <- gsub('.*MICROBURST.*', 'MICROBURST', stormDataTidy$EVTYPE)
# MUDSLIDE
stormDataTidy$EVTYPE <- gsub('.*MUDSLIDE.*', 'MUDSLIDE', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*MUD SLIDE.*', 'MUDSLIDE', stormDataTidy$EVTYPE)
# RAIN
stormDataTidy$EVTYPE <- gsub('.*RAIN.*', 'RAIN', stormDataTidy$EVTYPE)
# RIP CURRENT
stormDataTidy$EVTYPE <- gsub('.*RIP CURRENT.*', 'RIP CURRENT', stormDataTidy$EVTYPE)
# STORM
stormDataTidy$EVTYPE <- gsub('.*STORM.*', 'STORM', stormDataTidy$EVTYPE)
# SUMMARY
stormDataTidy$EVTYPE <- gsub('.*SUMMARY.*', 'SUMMARY', stormDataTidy$EVTYPE)
# TORNADO
stormDataTidy$EVTYPE <- gsub('.*TORNADO.*', 'TORNADO', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*TORNDAO.*', 'TORNADO', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*LANDSPOUT.*', 'TORNADO', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*WATERSPOUT.*', 'TORNADO', stormDataTidy$EVTYPE)
# SURF
stormDataTidy$EVTYPE <- gsub('.*SURF.*', 'SURF', stormDataTidy$EVTYPE)
# VOLCANIC
stormDataTidy$EVTYPE <- gsub('.*VOLCANIC.*', 'VOLCANIC', stormDataTidy$EVTYPE)
# WET
stormDataTidy$EVTYPE <- gsub('.*WET.*', 'WET', stormDataTidy$EVTYPE)
# WIND
stormDataTidy$EVTYPE <- gsub('.*WIND.*', 'WIND', stormDataTidy$EVTYPE)
# WINTER
stormDataTidy$EVTYPE <- gsub('.*WINTER.*', 'WINTER', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*WINTRY.*', 'WINTER', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*SNOW.*', 'WINTER', stormDataTidy$EVTYPE)
length(unique(stormDataTidy$EVTYPE))
## [1] 81
stormDataTidy$DATE_START <- as.Date(stormDataTidy$BGN_DATE, format = "%m/%d/%Y")
stormDataTidy$DATE_END <- as.Date(stormDataTidy$END_DATE, format = "%m/%d/%Y")
stormDataTidy$YEAR <- as.integer(format(stormDataTidy$DATE_START, "%Y"))
stormDataTidy$DURATION <- as.numeric(stormDataTidy$DATE_END - stormDataTidy$DATE_START)/3600
table(toupper(stormDataTidy$PROPDMGEXP))
##
## - + 0 2 3 4 5 6 7 B
## 11585 1 5 210 1 1 4 18 3 3 40
## H K M
## 7 231427 11327
table(toupper(stormDataTidy$CROPDMGEXP))
##
## ? 0 B K M
## 152663 6 17 7 99953 1986
# function to get multiplier factor
getMultiplier <- function(exp) {
exp <- toupper(exp);
if (exp == "") return (10^0);
if (exp == "-") return (10^0);
if (exp == "?") return (10^0);
if (exp == "+") return (10^0);
if (exp == "0") return (10^0);
if (exp == "1") return (10^1);
if (exp == "2") return (10^2);
if (exp == "3") return (10^3);
if (exp == "4") return (10^4);
if (exp == "5") return (10^5);
if (exp == "6") return (10^6);
if (exp == "7") return (10^7);
if (exp == "8") return (10^8);
if (exp == "9") return (10^9);
if (exp == "H") return (10^2);
if (exp == "K") return (10^3);
if (exp == "M") return (10^6);
if (exp == "B") return (10^9);
return (NA);
}
stormDataTidy$PROP_COST <- with(stormDataTidy, as.numeric(PROPDMG) * sapply(PROPDMGEXP, getMultiplier))/10^9
stormDataTidy$CROP_COST <- with(stormDataTidy, as.numeric(CROPDMG) * sapply(CROPDMGEXP, getMultiplier))/10^9
healthImpactData <- aggregate(x = list(HEALTH_IMPACT = stormDataTidy$FATALITIES + stormDataTidy$INJURIES),
by = list(EVENT_TYPE = stormDataTidy$EVTYPE),
FUN = sum,
na.rm = TRUE)
healthImpactData <- healthImpactData[order(healthImpactData$HEALTH_IMPACT, decreasing = TRUE),]
damageCostImpactData <- aggregate(x = list(DAMAGE_IMPACT = stormDataTidy$PROP_COST + stormDataTidy$CROP_COST),
by = list(EVENT_TYPE = stormDataTidy$EVTYPE),
FUN = sum,
na.rm = TRUE)
damageCostImpactData <- damageCostImpactData[order(damageCostImpactData$DAMAGE_IMPACT, decreasing = TRUE),]
print(xtable(head(healthImpactData, 10),
caption = "Top 10 Weather Events Most Harmful to Population Health"),
caption.placement = 'top',
type = "html",
include.rownames = FALSE,
html.table.attributes='class="table-bordered", width="100%"')
## <!-- html table generated in R 4.3.2 by xtable 1.8-4 package -->
## <!-- Mon Apr 15 22:33:51 2024 -->
## <table class="table-bordered", width="100%">
## <caption align="top"> Top 10 Weather Events Most Harmful to Population Health </caption>
## <tr> <th> EVENT_TYPE </th> <th> HEALTH_IMPACT </th> </tr>
## <tr> <td> TORNADO </td> <td align="right"> 97075.00 </td> </tr>
## <tr> <td> HEAT </td> <td align="right"> 12392.00 </td> </tr>
## <tr> <td> FLOOD </td> <td align="right"> 10127.00 </td> </tr>
## <tr> <td> WIND </td> <td align="right"> 9893.00 </td> </tr>
## <tr> <td> LIGHTNING </td> <td align="right"> 6049.00 </td> </tr>
## <tr> <td> STORM </td> <td align="right"> 4780.00 </td> </tr>
## <tr> <td> COLD </td> <td align="right"> 3100.00 </td> </tr>
## <tr> <td> WINTER </td> <td align="right"> 1924.00 </td> </tr>
## <tr> <td> FIRE </td> <td align="right"> 1698.00 </td> </tr>
## <tr> <td> HAIL </td> <td align="right"> 1512.00 </td> </tr>
## </table>
healthImpactChart <- ggplot(head(healthImpactData, 10),
aes(x = reorder(EVENT_TYPE, HEALTH_IMPACT), y = HEALTH_IMPACT, fill = EVENT_TYPE)) +
coord_flip() +
geom_bar(stat = "identity") +
xlab("Event Type") +
ylab("Total Fatalities and Injures") +
theme(plot.title = element_text(size = 14, hjust = 0.5)) +
ggtitle("Top 10 Weather Events Most Harmful to\nPopulation Health")
print(healthImpactChart)

print(xtable(head(damageCostImpactData, 10),
caption = "Top 10 Weather Events with Greatest Economic Consequences"),
caption.placement = 'top',
type = "html",
include.rownames = FALSE,
html.table.attributes='class="table-bordered", width="100%"')
## <!-- html table generated in R 4.3.2 by xtable 1.8-4 package -->
## <!-- Mon Apr 15 22:33:51 2024 -->
## <table class="table-bordered", width="100%">
## <caption align="top"> Top 10 Weather Events with Greatest Economic Consequences </caption>
## <tr> <th> EVENT_TYPE </th> <th> DAMAGE_IMPACT </th> </tr>
## <tr> <td> FLOOD </td> <td align="right"> 180.58 </td> </tr>
## <tr> <td> HURRICANE/TYPHOON </td> <td align="right"> 71.91 </td> </tr>
## <tr> <td> STORM </td> <td align="right"> 70.45 </td> </tr>
## <tr> <td> TORNADO </td> <td align="right"> 57.43 </td> </tr>
## <tr> <td> HAIL </td> <td align="right"> 20.74 </td> </tr>
## <tr> <td> DROUGHT </td> <td align="right"> 15.02 </td> </tr>
## <tr> <td> HURRICANE </td> <td align="right"> 14.61 </td> </tr>
## <tr> <td> COLD </td> <td align="right"> 12.70 </td> </tr>
## <tr> <td> WIND </td> <td align="right"> 12.01 </td> </tr>
## <tr> <td> FIRE </td> <td align="right"> 8.90 </td> </tr>
## </table>
damageCostImpactChart <- ggplot(head(damageCostImpactData, 10),
aes(x = reorder(EVENT_TYPE, DAMAGE_IMPACT), y = DAMAGE_IMPACT, fill = EVENT_TYPE)) +
coord_flip() +
geom_bar(stat = "identity") +
xlab("Event Type") +
ylab("Total Property / Crop Damage Cost\n(in Billions)") +
theme(plot.title = element_text(size = 14, hjust = 0.5)) +
ggtitle("Top 10 Weather Events with\nGreatest Economic Consequences")
print(damageCostImpactChart)
