Synopsis

The following documents will look into the effects of various weather events in the United States. The dataset we used is from http://www.nws.noaa.gov/directives/. There are 2 questions we want to answer: effects of these weather events on population health and the economic consequences.

Loading the data and packages

We are using the data StormData from.

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.1
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.4.1
## Loading required package: magrittr
## Warning: package 'magrittr' was built under R version 3.4.1
library(statsr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.1

We are using the data StormData from…

StormData <- read.csv("StormData.csv")
dim(StormData)
## [1] 902297     37
names(StormData)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
head(StormData)
##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL
##    EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO         0                                               0
## 2 TORNADO         0                                               0
## 3 TORNADO         0                                               0
## 4 TORNADO         0                                               0
## 5 TORNADO         0                                               0
## 6 TORNADO         0                                               0
##   COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1         NA         0                      14.0   100 3   0          0
## 2         NA         0                       2.0   150 2   0          0
## 3         NA         0                       0.1   123 2   0          0
## 4         NA         0                       0.0   100 2   0          0
## 5         NA         0                       0.0   150 2   0          0
## 6         NA         0                       1.5   177 2   0          0
##   INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1       15    25.0          K       0                                    
## 2        0     2.5          K       0                                    
## 3        2    25.0          K       0                                    
## 4        2     2.5          K       0                                    
## 5        2     2.5          K       0                                    
## 6        6     2.5          K       0                                    
##   LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1     3040      8812       3051       8806              1
## 2     3042      8755          0          0              2
## 3     3340      8742          0          0              3
## 4     3458      8626          0          0              4
## 5     3412      8642          0          0              5
## 6     3450      8748          0          0              6

1) Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

Extracting from the data 3 variables EVTYPE, FATALITIES and INJURIES to determined the effects of these events on population health. We will focus on the top 10 events for both categories and then put them both together. (fatalities and injuries does not correllated for all events.)

popdata <- StormData %>%
        select(EVTYPE, FATALITIES, INJURIES) %>%
        group_by(EVTYPE) %>%
        summarize(total_injuries = sum(INJURIES), total_death = sum(FATALITIES)) %>%
        filter(total_injuries!=0 | total_death !=0) %>%
        arrange(desc(total_injuries))
## Warning: package 'bindrcpp' was built under R version 3.4.1
death10 <- arrange(popdata,desc(total_death))[1:10,-2]
injuries10 <- arrange(popdata,desc(total_injuries))[1:10,-3]
death10 ; injuries10
## # A tibble: 10 x 2
##            EVTYPE total_death
##            <fctr>       <dbl>
##  1        TORNADO        5633
##  2 EXCESSIVE HEAT        1903
##  3    FLASH FLOOD         978
##  4           HEAT         937
##  5      LIGHTNING         816
##  6      TSTM WIND         504
##  7          FLOOD         470
##  8    RIP CURRENT         368
##  9      HIGH WIND         248
## 10      AVALANCHE         224
## # A tibble: 10 x 2
##               EVTYPE total_injuries
##               <fctr>          <dbl>
##  1           TORNADO          91346
##  2         TSTM WIND           6957
##  3             FLOOD           6789
##  4    EXCESSIVE HEAT           6525
##  5         LIGHTNING           5230
##  6              HEAT           2100
##  7         ICE STORM           1975
##  8       FLASH FLOOD           1777
##  9 THUNDERSTORM WIND           1488
## 10              HAIL           1361
ggplot(popdata, aes(total_injuries, total_death)) + geom_point()+
        stat_smooth(method="lm") + labs(title = "With Tornado")

ggplot(popdata[-1,], aes(total_injuries, total_death)) + geom_point()+
        stat_smooth(method="lm") +labs(title ="Without Tornado")

Now we want to see the plot of leading causes of injuries and death

par(mfrow=c(1,2), mar = c(12, 4, 1, 1))
barplot(death10$total_death,las=3,names.arg = death10$EVTYPE, col="cadetblue2",
        ylab = "Total Deaths")
barplot(injuries10$total_injuries,las=3, names.arg = injuries10$EVTYPE,
        col="coral1",ylab = "Total Injuries")

popdata_melt <- melt(popdata, id.vars = "EVTYPE")
popdata_melt <- popdata_melt %>%
        filter(EVTYPE %in% c(as.character(death10$EVTYPE),as.character(injuries10$EVTYPE))) %>%
        group_by(EVTYPE) %>%
        filter(value!=0) 
print(popdata_melt)
## # A tibble: 26 x 3
## # Groups:   EVTYPE [13]
##               EVTYPE       variable value
##               <fctr>         <fctr> <dbl>
##  1           TORNADO total_injuries 91346
##  2         TSTM WIND total_injuries  6957
##  3             FLOOD total_injuries  6789
##  4    EXCESSIVE HEAT total_injuries  6525
##  5         LIGHTNING total_injuries  5230
##  6              HEAT total_injuries  2100
##  7         ICE STORM total_injuries  1975
##  8       FLASH FLOOD total_injuries  1777
##  9 THUNDERSTORM WIND total_injuries  1488
## 10              HAIL total_injuries  1361
## # ... with 16 more rows
ggplot(popdata_melt, aes(x=EVTYPE,y=value,fill=factor(variable)))+
        geom_col(position = "dodge")+
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
        coord_cartesian(ylim = c(0,10000)) +
        labs(title = "Effects of most danderous events", x="Events",y="Total")

We purposely limit the number of injuries to 10000 since tornado is the outliner here. Nevertheless, we can conclude that tornado is the most harmful to population health in America.

2)Across the United States, which types of events have the greatest economic consequences?

We select 4 variables to answer this problem: PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP and EVTYPE. We also do cleaning up the data to get the right dollar value (renaming all the factor levels with “+”,“?”,“-” are 0 and “” is 1. The rest of the levels are recorded in the instruction)

econdata <- StormData %>%
        select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP) 
unique(econdata$PROPDMGEXP)
##  [1] K M   B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels:  - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
unique(econdata$CROPDMGEXP)
## [1]   M K m B ? 0 k 2
## Levels:  ? 0 2 B k K m M
econdata$PROPDMGEXP <- as.character(econdata$PROPDMGEXP)
econdata$CROPDMGEXP <- as.character(econdata$CROPDMGEXP)

##relevels
econdata$PROPDMGEXP[econdata$PROPDMGEXP == ""] <- 1
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "-"] <- 0
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "?"] <- 0
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "+"] <- 0
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "0"] <- 0
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "1"] <- 10
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "2"] <- 10^2
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "3"] <- 10^3
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "4"] <- 10^4
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "5"] <- 10^5
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "6"] <- 10^6
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "7"] <- 10^7
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "8"] <- 10^8
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "B"] <- 10^9
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "h"] <- 10^2
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "H"] <- 10^2
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "K"] <- 10^3
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "m"] <- 10^6
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "M"] <- 10^6


econdata$CROPDMGEXP[econdata$CROPDMGEXP == ""] <- 1
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "?"] <- 0
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "0"] <- 1
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "2"] <- 10^2
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "B"] <- 10^9
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "k"] <- 10^3
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "K"] <- 10^3
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "m"] <- 10^6
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "M"] <- 10^6

econdata <- econdata %>%
        mutate(propdmgtemp = as.numeric(PROPDMGEXP) * PROPDMG,
               cropdmgtemp = as.numeric(CROPDMGEXP) * CROPDMG) %>%
        group_by(EVTYPE) %>%
        summarise(propdmgdollars = sum(propdmgtemp),
                  cropdmgdollars = sum(cropdmgtemp)) %>%
        arrange(desc(propdmgdollars)) %>%
        print()
## # A tibble: 985 x 3
##               EVTYPE propdmgdollars cropdmgdollars
##               <fctr>          <dbl>          <dbl>
##  1             FLOOD   144657709870     5661968450
##  2 HURRICANE/TYPHOON    69305840000     2607872800
##  3           TORNADO    56947380510      414953270
##  4       STORM SURGE    43323536000           5000
##  5       FLASH FLOOD    16822675580     1421317100
##  6              HAIL    15735267790     3025954473
##  7         HURRICANE    11868319010     2741910000
##  8    TROPICAL STORM     7703890550      678346000
##  9      WINTER STORM     6688497250       26944000
## 10         HIGH WIND     5270046260      638571300
## # ... with 975 more rows
summary(econdata$propdmgdollars, na.rm=T)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 4.347e+08 5.105e+04 1.447e+11
summary(econdata$cropdmgdollars, na.rm=T)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 4.985e+07 0.000e+00 1.397e+10

Like Q1, we will find the top 10 events for each category and graph them.

prop10 <- arrange(econdata, desc(econdata$propdmgdollars))[1:10,c(1,2)]
crop10 <- arrange(econdata, desc(econdata$cropdmgdollars))[1:10,c(1,3)]
prop10;crop10
## # A tibble: 10 x 2
##               EVTYPE propdmgdollars
##               <fctr>          <dbl>
##  1             FLOOD   144657709870
##  2 HURRICANE/TYPHOON    69305840000
##  3           TORNADO    56947380510
##  4       STORM SURGE    43323536000
##  5       FLASH FLOOD    16822675580
##  6              HAIL    15735267790
##  7         HURRICANE    11868319010
##  8    TROPICAL STORM     7703890550
##  9      WINTER STORM     6688497250
## 10         HIGH WIND     5270046260
## # A tibble: 10 x 2
##               EVTYPE cropdmgdollars
##               <fctr>          <dbl>
##  1           DROUGHT    13972566000
##  2             FLOOD     5661968450
##  3       RIVER FLOOD     5029459000
##  4         ICE STORM     5022113500
##  5              HAIL     3025954473
##  6         HURRICANE     2741910000
##  7 HURRICANE/TYPHOON     2607872800
##  8       FLASH FLOOD     1421317100
##  9      EXTREME COLD     1292973000
## 10      FROST/FREEZE     1094086000
par(mfrow=c(1,2), mar = c(12, 4, 1, 1))
barplot(prop10$propdmgdollars/(10^9),las=3,names.arg = prop10$EVTYPE,
        col="cadetblue2",ylab = "Property Damage (billions)")
barplot(crop10$cropdmgdollars/(10^9),las=3, names.arg = crop10$EVTYPE,
        col="coral1",ylab = "Crop Damage(billions)")

Now we want the plot to combine the effects of these events for both property and crop damage

econdata_melt <- melt(econdata, id.vars = "EVTYPE")
econdata_melt <- econdata_melt %>%
        filter(EVTYPE %in% c(as.character(crop10$EVTYPE),
                             as.character(prop10$EVTYPE))) %>%
        group_by(EVTYPE) %>%
        filter(value!=0)
levels(econdata_melt$variable) <- c("Property Damage", "Crop Damage")
print(econdata_melt)
## # A tibble: 30 x 3
## # Groups:   EVTYPE [15]
##               EVTYPE        variable        value
##               <fctr>          <fctr>        <dbl>
##  1             FLOOD Property Damage 144657709870
##  2 HURRICANE/TYPHOON Property Damage  69305840000
##  3           TORNADO Property Damage  56947380510
##  4       STORM SURGE Property Damage  43323536000
##  5       FLASH FLOOD Property Damage  16822675580
##  6              HAIL Property Damage  15735267790
##  7         HURRICANE Property Damage  11868319010
##  8    TROPICAL STORM Property Damage   7703890550
##  9      WINTER STORM Property Damage   6688497250
## 10         HIGH WIND Property Damage   5270046260
## # ... with 20 more rows
ggplot(econdata_melt, aes(x=EVTYPE,y=value/10^9,fill=variable))+
        geom_col(position = "dodge")+
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
        coord_cartesian(ylim = c(0,140)) +
        labs(title = "Effects on crops and property", x="Events",y="Total") 

Results

1.Based on this data, we found that tornadoes and excessive heat causes the most population health damages in the United States. 2.We also figures out that flood causes the biggest damage on property and drought causes the most damage on crop