if (!require(ggplot2)) {
  install.packages("ggplot2")
  library(ggplot2)
}
## Loading required package: ggplot2
if (!require(dplyr)) {
  install.packages("dplyr")
  library(dplyr, warn.conflicts = FALSE)
}
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
if (!require(xtable)) {
  install.packages("xtable")
  library(xtable, warn.conflicts = FALSE)
}
## Loading required package: xtable
## Warning: package 'xtable' was built under R version 4.3.3
stormDataFileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
stormDataFile <- "data/storm-data.csv.bz2"
if (!file.exists('data')) {
  dir.create('data')
}
if (!file.exists(stormDataFile)) {
  download.file(url = stormDataFileURL, destfile = stormDataFile)
}
stormData <- read.csv(stormDataFile, sep = ",", header = TRUE)
stopifnot(file.size(stormDataFile) == 49177144) 
stopifnot(dim(stormData) == c(902297,37))

names(stormData)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
str(stormData)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...
head(stormData)
##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE  EVTYPE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL TORNADO
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL TORNADO
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL TORNADO
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL TORNADO
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL TORNADO
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL TORNADO
##   BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1         0                                               0         NA
## 2         0                                               0         NA
## 3         0                                               0         NA
## 4         0                                               0         NA
## 5         0                                               0         NA
## 6         0                                               0         NA
##   END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1         0                      14.0   100 3   0          0       15    25.0
## 2         0                       2.0   150 2   0          0        0     2.5
## 3         0                       0.1   123 2   0          0        2    25.0
## 4         0                       0.0   100 2   0          0        2     2.5
## 5         0                       0.0   150 2   0          0        2     2.5
## 6         0                       1.5   177 2   0          0        6     2.5
##   PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1          K       0                                         3040      8812
## 2          K       0                                         3042      8755
## 3          K       0                                         3340      8742
## 4          K       0                                         3458      8626
## 5          K       0                                         3412      8642
## 6          K       0                                         3450      8748
##   LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1       3051       8806              1
## 2          0          0              2
## 3          0          0              3
## 4          0          0              4
## 5          0          0              5
## 6          0          0              6
stormDataTidy <- subset(stormData, EVTYPE != "?"
                        &
                          (FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0),
                        select = c("EVTYPE",
                                   "FATALITIES",
                                   "INJURIES", 
                                   "PROPDMG",
                                   "PROPDMGEXP",
                                   "CROPDMG",
                                   "CROPDMGEXP",
                                   "BGN_DATE",
                                   "END_DATE",
                                   "STATE"))
dim(stormDataTidy)
## [1] 254632     10
sum(is.na(stormDataTidy))
## [1] 0
length(unique(stormDataTidy$EVTYPE))
## [1] 487
stormDataTidy$EVTYPE <- toupper(stormDataTidy$EVTYPE)

# AVALANCHE
stormDataTidy$EVTYPE <- gsub('.*AVALANCE.*', 'AVALANCHE', stormDataTidy$EVTYPE)

# BLIZZARD
stormDataTidy$EVTYPE <- gsub('.*BLIZZARD.*', 'BLIZZARD', stormDataTidy$EVTYPE)

# CLOUD
stormDataTidy$EVTYPE <- gsub('.*CLOUD.*', 'CLOUD', stormDataTidy$EVTYPE)

# COLD
stormDataTidy$EVTYPE <- gsub('.*COLD.*', 'COLD', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*FREEZ.*', 'COLD', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*FROST.*', 'COLD', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*ICE.*', 'COLD', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*LOW TEMPERATURE RECORD.*', 'COLD', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*LO.*TEMP.*', 'COLD', stormDataTidy$EVTYPE)

# DRY
stormDataTidy$EVTYPE <- gsub('.*DRY.*', 'DRY', stormDataTidy$EVTYPE)

# DUST
stormDataTidy$EVTYPE <- gsub('.*DUST.*', 'DUST', stormDataTidy$EVTYPE)

# FIRE
stormDataTidy$EVTYPE <- gsub('.*FIRE.*', 'FIRE', stormDataTidy$EVTYPE)

# FLOOD
stormDataTidy$EVTYPE <- gsub('.*FLOOD.*', 'FLOOD', stormDataTidy$EVTYPE)

# FOG
stormDataTidy$EVTYPE <- gsub('.*FOG.*', 'FOG', stormDataTidy$EVTYPE)

# HAIL
stormDataTidy$EVTYPE <- gsub('.*HAIL.*', 'HAIL', stormDataTidy$EVTYPE)

# HEAT
stormDataTidy$EVTYPE <- gsub('.*HEAT.*', 'HEAT', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*WARM.*', 'HEAT', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*HIGH.*TEMP.*', 'HEAT', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*RECORD HIGH TEMPERATURES.*', 'HEAT', stormDataTidy$EVTYPE)

# HYPOTHERMIA/EXPOSURE
stormDataTidy$EVTYPE <- gsub('.*HYPOTHERMIA.*', 'HYPOTHERMIA/EXPOSURE', stormDataTidy$EVTYPE)

# LANDSLIDE
stormDataTidy$EVTYPE <- gsub('.*LANDSLIDE.*', 'LANDSLIDE', stormDataTidy$EVTYPE)

# LIGHTNING
stormDataTidy$EVTYPE <- gsub('^LIGHTNING.*', 'LIGHTNING', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('^LIGNTNING.*', 'LIGHTNING', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('^LIGHTING.*', 'LIGHTNING', stormDataTidy$EVTYPE)

# MICROBURST
stormDataTidy$EVTYPE <- gsub('.*MICROBURST.*', 'MICROBURST', stormDataTidy$EVTYPE)

# MUDSLIDE
stormDataTidy$EVTYPE <- gsub('.*MUDSLIDE.*', 'MUDSLIDE', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*MUD SLIDE.*', 'MUDSLIDE', stormDataTidy$EVTYPE)

# RAIN
stormDataTidy$EVTYPE <- gsub('.*RAIN.*', 'RAIN', stormDataTidy$EVTYPE)

# RIP CURRENT
stormDataTidy$EVTYPE <- gsub('.*RIP CURRENT.*', 'RIP CURRENT', stormDataTidy$EVTYPE)

# STORM
stormDataTidy$EVTYPE <- gsub('.*STORM.*', 'STORM', stormDataTidy$EVTYPE)

# SUMMARY
stormDataTidy$EVTYPE <- gsub('.*SUMMARY.*', 'SUMMARY', stormDataTidy$EVTYPE)

# TORNADO
stormDataTidy$EVTYPE <- gsub('.*TORNADO.*', 'TORNADO', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*TORNDAO.*', 'TORNADO', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*LANDSPOUT.*', 'TORNADO', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*WATERSPOUT.*', 'TORNADO', stormDataTidy$EVTYPE)

# SURF
stormDataTidy$EVTYPE <- gsub('.*SURF.*', 'SURF', stormDataTidy$EVTYPE)

# VOLCANIC
stormDataTidy$EVTYPE <- gsub('.*VOLCANIC.*', 'VOLCANIC', stormDataTidy$EVTYPE)

# WET
stormDataTidy$EVTYPE <- gsub('.*WET.*', 'WET', stormDataTidy$EVTYPE)

# WIND
stormDataTidy$EVTYPE <- gsub('.*WIND.*', 'WIND', stormDataTidy$EVTYPE)

# WINTER
stormDataTidy$EVTYPE <- gsub('.*WINTER.*', 'WINTER', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*WINTRY.*', 'WINTER', stormDataTidy$EVTYPE)
stormDataTidy$EVTYPE <- gsub('.*SNOW.*', 'WINTER', stormDataTidy$EVTYPE)

length(unique(stormDataTidy$EVTYPE))
## [1] 81
stormDataTidy$DATE_START <- as.Date(stormDataTidy$BGN_DATE, format = "%m/%d/%Y")
stormDataTidy$DATE_END <- as.Date(stormDataTidy$END_DATE, format = "%m/%d/%Y")
stormDataTidy$YEAR <- as.integer(format(stormDataTidy$DATE_START, "%Y"))
stormDataTidy$DURATION <- as.numeric(stormDataTidy$DATE_END - stormDataTidy$DATE_START)/3600

table(toupper(stormDataTidy$PROPDMGEXP))
## 
##             -      +      0      2      3      4      5      6      7      B 
##  11585      1      5    210      1      1      4     18      3      3     40 
##      H      K      M 
##      7 231427  11327
table(toupper(stormDataTidy$CROPDMGEXP))
## 
##             ?      0      B      K      M 
## 152663      6     17      7  99953   1986
# function to get multiplier factor
getMultiplier <- function(exp) {
  exp <- toupper(exp);
  if (exp == "")  return (10^0);
  if (exp == "-") return (10^0);
  if (exp == "?") return (10^0);
  if (exp == "+") return (10^0);
  if (exp == "0") return (10^0);
  if (exp == "1") return (10^1);
  if (exp == "2") return (10^2);
  if (exp == "3") return (10^3);
  if (exp == "4") return (10^4);
  if (exp == "5") return (10^5);
  if (exp == "6") return (10^6);
  if (exp == "7") return (10^7);
  if (exp == "8") return (10^8);
  if (exp == "9") return (10^9);
  if (exp == "H") return (10^2);
  if (exp == "K") return (10^3);
  if (exp == "M") return (10^6);
  if (exp == "B") return (10^9);
  return (NA);
}
stormDataTidy$PROP_COST <- with(stormDataTidy, as.numeric(PROPDMG) * sapply(PROPDMGEXP, getMultiplier))/10^9
stormDataTidy$CROP_COST <- with(stormDataTidy, as.numeric(CROPDMG) * sapply(CROPDMGEXP, getMultiplier))/10^9

healthImpactData <- aggregate(x = list(HEALTH_IMPACT = stormDataTidy$FATALITIES + stormDataTidy$INJURIES), 
                              by = list(EVENT_TYPE = stormDataTidy$EVTYPE), 
                              FUN = sum,
                              na.rm = TRUE)
healthImpactData <- healthImpactData[order(healthImpactData$HEALTH_IMPACT, decreasing = TRUE),]

damageCostImpactData <- aggregate(x = list(DAMAGE_IMPACT = stormDataTidy$PROP_COST + stormDataTidy$CROP_COST), 
                                  by = list(EVENT_TYPE = stormDataTidy$EVTYPE), 
                                  FUN = sum,
                                  na.rm = TRUE)
damageCostImpactData <- damageCostImpactData[order(damageCostImpactData$DAMAGE_IMPACT, decreasing = TRUE),]

print(xtable(head(healthImpactData, 10),
             caption = "Top 10 Weather Events Most Harmful to Population Health"),
      caption.placement = 'top',
      type = "html",
      include.rownames = FALSE,
      html.table.attributes='class="table-bordered", width="100%"')
## <!-- html table generated in R 4.3.2 by xtable 1.8-4 package -->
## <!-- Mon Apr 15 22:33:51 2024 -->
## <table class="table-bordered", width="100%">
## <caption align="top"> Top 10 Weather Events Most Harmful to Population Health </caption>
## <tr> <th> EVENT_TYPE </th> <th> HEALTH_IMPACT </th>  </tr>
##   <tr> <td> TORNADO </td> <td align="right"> 97075.00 </td> </tr>
##   <tr> <td> HEAT </td> <td align="right"> 12392.00 </td> </tr>
##   <tr> <td> FLOOD </td> <td align="right"> 10127.00 </td> </tr>
##   <tr> <td> WIND </td> <td align="right"> 9893.00 </td> </tr>
##   <tr> <td> LIGHTNING </td> <td align="right"> 6049.00 </td> </tr>
##   <tr> <td> STORM </td> <td align="right"> 4780.00 </td> </tr>
##   <tr> <td> COLD </td> <td align="right"> 3100.00 </td> </tr>
##   <tr> <td> WINTER </td> <td align="right"> 1924.00 </td> </tr>
##   <tr> <td> FIRE </td> <td align="right"> 1698.00 </td> </tr>
##   <tr> <td> HAIL </td> <td align="right"> 1512.00 </td> </tr>
##    </table>
healthImpactChart <- ggplot(head(healthImpactData, 10),
                            aes(x = reorder(EVENT_TYPE, HEALTH_IMPACT), y = HEALTH_IMPACT, fill = EVENT_TYPE)) +
  coord_flip() +
  geom_bar(stat = "identity") + 
  xlab("Event Type") +
  ylab("Total Fatalities and Injures") +
  theme(plot.title = element_text(size = 14, hjust = 0.5)) +
  ggtitle("Top 10 Weather Events Most Harmful to\nPopulation Health")
print(healthImpactChart)

print(xtable(head(damageCostImpactData, 10),
             caption = "Top 10 Weather Events with Greatest Economic Consequences"),
      caption.placement = 'top',
      type = "html",
      include.rownames = FALSE,
      html.table.attributes='class="table-bordered", width="100%"')
## <!-- html table generated in R 4.3.2 by xtable 1.8-4 package -->
## <!-- Mon Apr 15 22:33:51 2024 -->
## <table class="table-bordered", width="100%">
## <caption align="top"> Top 10 Weather Events with Greatest Economic Consequences </caption>
## <tr> <th> EVENT_TYPE </th> <th> DAMAGE_IMPACT </th>  </tr>
##   <tr> <td> FLOOD </td> <td align="right"> 180.58 </td> </tr>
##   <tr> <td> HURRICANE/TYPHOON </td> <td align="right"> 71.91 </td> </tr>
##   <tr> <td> STORM </td> <td align="right"> 70.45 </td> </tr>
##   <tr> <td> TORNADO </td> <td align="right"> 57.43 </td> </tr>
##   <tr> <td> HAIL </td> <td align="right"> 20.74 </td> </tr>
##   <tr> <td> DROUGHT </td> <td align="right"> 15.02 </td> </tr>
##   <tr> <td> HURRICANE </td> <td align="right"> 14.61 </td> </tr>
##   <tr> <td> COLD </td> <td align="right"> 12.70 </td> </tr>
##   <tr> <td> WIND </td> <td align="right"> 12.01 </td> </tr>
##   <tr> <td> FIRE </td> <td align="right"> 8.90 </td> </tr>
##    </table>
damageCostImpactChart <- ggplot(head(damageCostImpactData, 10),
                                aes(x = reorder(EVENT_TYPE, DAMAGE_IMPACT), y = DAMAGE_IMPACT, fill = EVENT_TYPE)) +
  coord_flip() +
  geom_bar(stat = "identity") + 
  xlab("Event Type") +
  ylab("Total Property / Crop Damage Cost\n(in Billions)") +
  theme(plot.title = element_text(size = 14, hjust = 0.5)) +
  ggtitle("Top 10 Weather Events with\nGreatest Economic Consequences")
print(damageCostImpactChart)