Synopsis

This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. To examine which weather events are the most most harmful with respect to population health, this analysis looks at the rates of injury, death, and property damage for all events since 1980, limited to those for which there are at least 500 occurences since 1980.

Variables description

Using Storm Data Documentation we can check that:

Events variable:

Data Processing

First of all , we are going to use the download.file() function to extract our dataset:

#download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2","storms.csv.bz2")

The bz2 format is a zip format, but we can read it using read.cvs(), it doesn’t works using unz().

Storm_data<-read.csv('repdata_data_StormData.csv.bz2',stringsAsFactors=F)

Data exploring

str(Storm_data)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

As we are interested in variables related to health and the economy in relation to storms, we are going to carry out transformations in our data set:

Storm_data$BGN_DATE <- as.Date(Storm_data$BGN_DATE,format="%m/%d/%Y")

And now we can subsetting from 1980:

library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
Storm_data_sub <- Storm_data[year(Storm_data$BGN_DATE)>=1980,]
vars <- c( "EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
storm <- Storm_data_sub[, vars]
dim(storm)
## [1] 826931      7

We can check the variables PROPDMGEXP and CROPDMGEXP, for Checkingthe values for variables that represent units od dollars.

sort(table(storm$PROPDMGEXP), decreasing = TRUE)[1:10]
## 
##             K      M      0      B      5      1      2      ?      m 
## 412447 404025  10091    216     40     28     25     13      8      7
sort(table(storm$CROPDMGEXP), decreasing = TRUE)[1:10]
## 
##             K      M      k      0      B      ?      2      m   <NA> 
## 543047 281832   1994     21     19      9      7      1      1

And we can check if we have any NA value:

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
summarise_all(storm, funs(sum(is.na(.))))
## Warning: `funs()` is deprecated as of dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
##   EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1      0          0        0       0          0       0          0

Data transformation

First of all, we can put together some values in EVTYPE:

storm$EVTYPE[grep("BLIZZARD|WINTER",storm$EVTYPE)] <- "BLIZZARD"
storm$EVTYPE[grep("FLOOD|FLDG|FLD|URBAN|FLOOO",storm$EVTYPE)]<-"FLOOD"
storm$EVTYPE[grep("FREEZING RAIN|RAIN",storm$EVTYPE)] <- "RAIN"
storm$EVTYPE[grep("ICE|ICY",storm$EVTYPE)]<- "ICE"
storm$EVTYPE[grep("HURRICANE|TYPHOON",storm$EVTYPE)] <- "HURRICANE"
storm$EVTYPE[grep("THUNDER|TSTM",storm$EVTYPE)] <- "THUNDER STORM"
storm$EVTYPE[grep("LIGHTN|LIGHTI|LIGNT",storm$EVTYPE)] <- "LIGHTNING"
storm$EVTYPE[grep("WIND|WND",storm$EVTYPE)] <- "WIND"
storm$EVTYPE[grep("TORN|FUNNEL",storm$EVTYPE)] <- "TORNADO"
storm$EVTYPE[grep("HEAT|HOT|HIGH TEMP",storm$EVTYPE)] <- "HEATWAVE"
storm$EVTYPE[grep("SWELL|SEA|TIDE",storm$EVTYPE)] <- "SEAS"
storm$EVTYPE<-as.factor(storm$EVTYPE)

There is some mess in units, so we transform those variables in one unit (dollar) variable by the rule following here Storm Data Documentation :

storm$PROPDMGEXP <- as.character(storm$PROPDMGEXP)
storm$PROPDMGEXP[is.na(storm$PROPDMGEXP)] <- 0 # NA's considered as dollars
storm$PROPDMGEXP[!grepl("K|M|B", storm$PROPDMGEXP, ignore.case = TRUE)] <- 0 # everything exept K,M,B
storm$PROPDMGEXP[grep("K", storm$PROPDMGEXP, ignore.case = TRUE)] <- "3"
storm$PROPDMGEXP[grep("M", storm$PROPDMGEXP, ignore.case = TRUE)] <- "6"
storm$PROPDMGEXP[grep("B", storm$PROPDMGEXP, ignore.case = TRUE)] <- "9"
storm$PROPDMGEXP <- as.numeric(as.character(storm$PROPDMGEXP))
storm$property.damage <- storm$PROPDMG * 10^storm$PROPDMGEXP

storm$CROPDMGEXP <- as.character(storm$CROPDMGEXP)
storm$CROPDMGEXP[is.na(storm$CROPDMGEXP)] <- 0 # NA's considered as dollars
storm$CROPDMGEXP[!grepl("K|M|B", storm$CROPDMGEXP, ignore.case = TRUE)] <- 0 # everything exept K,M,B is dollar
storm$CROPDMGEXP[grep("K", storm$CROPDMGEXP, ignore.case = TRUE)] <- "3"
storm$CROPDMGEXP[grep("M", storm$CROPDMGEXP, ignore.case = TRUE)] <- "6"
storm$CROPDMGEXP[grep("B", storm$CROPDMGEXP, ignore.case = TRUE)] <- "9"
storm$CROPDMGEXP <- as.numeric(as.character(storm$CROPDMGEXP))
storm$crop.damage <- storm$CROPDMG * 10^storm$CROPDMGEXP

We transform the values PROPDMGEXP and PROPDMGEXP because they have the value n in \(x^n\) , so we change the leters for the values value to \(3\), \(6\) and \(9\) respectively. And know, we can see the results table:

sort(table(storm$property.damage), decreasing = TRUE)[1:5]
## 
##      0   5000  10000   1000   2000 
## 604315  31731  21787  17544  17186
sort(table(storm$crop.damage), decreasing = TRUE)[1:5]
## 
##      0   5000  10000  50000  1e+05 
## 804832   4097   2349   1984   1233

Results

Finally we can join fatalities and injuries in one table:

library(dplyr)
library(plyr)
## Warning: package 'plyr' was built under R version 4.0.2
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
# aggregate FATALITIES and INJURIES by type of EVTYPE
agg.fatalites.and.injuries <- ddply(storm, .(EVTYPE), summarize, Total = sum(FATALITIES + INJURIES,  na.rm = TRUE))
agg.fatalites.and.injuries$type <- "fatalities and injuries"
  
# aggregate FATALITIES by type of EVTYPE
agg.fatalities <- ddply(storm, .(EVTYPE), summarize, Total = sum(FATALITIES, na.rm = TRUE))
agg.fatalities$type <- "fatalities"

# aggregate INJURIES by type of EVTYPE
agg.injuries <- ddply(storm, .(EVTYPE), summarize, Total = sum(INJURIES, na.rm = TRUE))
agg.injuries$type <- "injuries"

# combine all
agg.health <- rbind(agg.fatalities, agg.injuries)
agg.health_top<-head(agg.health[order(agg.health$Total, decreasing = TRUE),],n =20L)

health.by.EVTYPE <- join (agg.fatalities, agg.injuries, by="EVTYPE", type="inner")
health_top<-head(health.by.EVTYPE[order(health.by.EVTYPE$Total, decreasing = TRUE),], n =20L)
health_top
##             EVTYPE Total       type Total     type
## 159       HEATWAVE  3138 fatalities  9154 injuries
## 432        TORNADO  2277 fatalities 38035 injuries
## 96           FLOOD  1551 fatalities  8682 injuries
## 229      LIGHTNING   817 fatalities  5231 injuries
## 428  THUNDER STORM   756 fatalities  9545 injuries
## 481           WIND   688 fatalities  1976 injuries
## 17        BLIZZARD   379 fatalities  2697 injuries
## 305    RIP CURRENT   368 fatalities   232 injuries
## 11       AVALANCHE   224 fatalities   170 injuries
## 306   RIP CURRENTS   204 fatalities   297 injuries
## 90    EXTREME COLD   160 fatalities   231 injuries
## 196      HURRICANE   135 fatalities  1331 injuries
## 171     HEAVY SNOW   127 fatalities  1021 injuries
## 274           RAIN   113 fatalities   301 injuries
## 202            ICE   107 fatalities  2195 injuries
## 189      HIGH SURF   101 fatalities   152 injuries
## 478       WILDFIRE    75 fatalities   911 injuries
## 316           SEAS    70 fatalities    35 injuries
## 99             FOG    62 fatalities   734 injuries
## 435 TROPICAL STORM    58 fatalities   340 injuries

And doing the same with the economic variables:

# aggregate PropDamage and CropDamage by type of EVTYPE
agg.propdmg.and.cropdmg <- ddply(storm, .(EVTYPE), summarize, Total = sum(property.damage + crop.damage,  na.rm = TRUE))
agg.propdmg.and.cropdmg$type <- "property and crop damage"

# aggregate PropDamage by type of EVTYPE
agg.prop <- ddply(storm, .(EVTYPE), summarize, Total = sum(property.damage, na.rm = TRUE))
agg.prop$type <- "property"

# aggregate INJURIES by type of EVTYPE
agg.crop <- ddply(storm, .(EVTYPE), summarize, Total = sum(crop.damage, na.rm = TRUE))
agg.crop$type <- "crop"

# combine all
agg.economic <- rbind(agg.prop, agg.crop)
agg.economic_top<-head(agg.economic[order(agg.economic$Total, decreasing = TRUE),],n =20L)


economic.by.EVTYPE <- join (agg.prop, agg.crop, by="EVTYPE", type="inner")
economic_top<-head(economic.by.EVTYPE[order(economic.by.EVTYPE$Total, decreasing = TRUE),],n =20L)
economic_top
##               EVTYPE        Total     type       Total type
## 96             FLOOD 167437469632 property 12360577200 crop
## 196        HURRICANE  85356410010 property  5516117800 crop
## 432          TORNADO  43694708799 property   414961520 crop
## 357      STORM SURGE  43323536000 property        5000 crop
## 130             HAIL  15732267048 property  3025954473 crop
## 428    THUNDER STORM  12576227478 property  1274178988 crop
## 435   TROPICAL STORM   7703890550 property   678346000 crop
## 17          BLIZZARD   7441709201 property   159504000 crop
## 481             WIND   6196146123 property   775520550 crop
## 478         WILDFIRE   4765114000 property   295472800 crop
## 316             SEAS   4651148650 property    25902500 crop
## 202              ICE   3968635610 property  5022114300 crop
## 274             RAIN   3241216190 property   804662800 crop
## 476 WILD/FOREST FIRE   3001829500 property   106796830 crop
## 55           DROUGHT   1046106000 property 13972566000 crop
## 229        LIGHTNING    933699447 property    12092090 crop
## 171       HEAVY SNOW    932589142 property   134653100 crop
## 475       WILD FIRES    624100000 property           0 crop
## 212        LANDSLIDE    324596000 property    20017000 crop
## 153        HAILSTORM    241000000 property           0 crop

Answering the questions

Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.2
health.plot <- ggplot(agg.health_top, aes(x = EVTYPE, y = Total, fill = type)) + geom_bar(stat = "identity") + coord_flip() + xlab("Event Type") + ylab("Number of health impact") +
  ggtitle("Weather event types impact") + scale_fill_manual(values=c("red","black")) + theme_economist()
 
print(health.plot)  

Tornadoes are the most damaging health events for our data set

Across the United States, which types of events have the greatest economic consequences?

economic.plot <- ggplot(agg.economic_top, aes(x = EVTYPE, y = Total, fill = type)) + geom_bar(stat = "identity") + coord_flip() + xlab("Event Type") + ylab("Number of economic impact") +
  ggtitle("Weather event types impact") + scale_fill_manual(values=c("red","black")) + theme_economist()
 
print(economic.plot) 

Thunder Storms and Floods are the most economically damaging events for our data set

Francisco Javier Carela Ferrer