The following documents will look into the effects of various weather events in the United States. The dataset we used is from http://www.nws.noaa.gov/directives/. There are 2 questions we want to answer: effects of these weather events on population health and the economic consequences.
We are using the data StormData from.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.1
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.4.1
## Loading required package: magrittr
## Warning: package 'magrittr' was built under R version 3.4.1
library(statsr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.1
We are using the data StormData from…
StormData <- read.csv("StormData.csv")
dim(StormData)
## [1] 902297 37
names(StormData)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
head(StormData)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 0
## 2 TORNADO 0 0
## 3 TORNADO 0 0
## 4 TORNADO 0 0
## 5 TORNADO 0 0
## 6 TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## 4 NA 0 0.0 100 2 0 0
## 5 NA 0 0.0 150 2 0 0
## 6 NA 0 1.5 177 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0
## 2 0 2.5 K 0
## 3 2 25.0 K 0
## 4 2 2.5 K 0
## 5 2 2.5 K 0
## 6 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
## 4 3458 8626 0 0 4
## 5 3412 8642 0 0 5
## 6 3450 8748 0 0 6
Extracting from the data 3 variables EVTYPE, FATALITIES and INJURIES to determined the effects of these events on population health. We will focus on the top 10 events for both categories and then put them both together. (fatalities and injuries does not correllated for all events.)
popdata <- StormData %>%
select(EVTYPE, FATALITIES, INJURIES) %>%
group_by(EVTYPE) %>%
summarize(total_injuries = sum(INJURIES), total_death = sum(FATALITIES)) %>%
filter(total_injuries!=0 | total_death !=0) %>%
arrange(desc(total_injuries))
## Warning: package 'bindrcpp' was built under R version 3.4.1
death10 <- arrange(popdata,desc(total_death))[1:10,-2]
injuries10 <- arrange(popdata,desc(total_injuries))[1:10,-3]
death10 ; injuries10
## # A tibble: 10 x 2
## EVTYPE total_death
## <fctr> <dbl>
## 1 TORNADO 5633
## 2 EXCESSIVE HEAT 1903
## 3 FLASH FLOOD 978
## 4 HEAT 937
## 5 LIGHTNING 816
## 6 TSTM WIND 504
## 7 FLOOD 470
## 8 RIP CURRENT 368
## 9 HIGH WIND 248
## 10 AVALANCHE 224
## # A tibble: 10 x 2
## EVTYPE total_injuries
## <fctr> <dbl>
## 1 TORNADO 91346
## 2 TSTM WIND 6957
## 3 FLOOD 6789
## 4 EXCESSIVE HEAT 6525
## 5 LIGHTNING 5230
## 6 HEAT 2100
## 7 ICE STORM 1975
## 8 FLASH FLOOD 1777
## 9 THUNDERSTORM WIND 1488
## 10 HAIL 1361
ggplot(popdata, aes(total_injuries, total_death)) + geom_point()+
stat_smooth(method="lm") + labs(title = "With Tornado")
ggplot(popdata[-1,], aes(total_injuries, total_death)) + geom_point()+
stat_smooth(method="lm") +labs(title ="Without Tornado")
Now we want to see the plot of leading causes of injuries and death
par(mfrow=c(1,2), mar = c(12, 4, 1, 1))
barplot(death10$total_death,las=3,names.arg = death10$EVTYPE, col="cadetblue2",
ylab = "Total Deaths")
barplot(injuries10$total_injuries,las=3, names.arg = injuries10$EVTYPE,
col="coral1",ylab = "Total Injuries")
popdata_melt <- melt(popdata, id.vars = "EVTYPE")
popdata_melt <- popdata_melt %>%
filter(EVTYPE %in% c(as.character(death10$EVTYPE),as.character(injuries10$EVTYPE))) %>%
group_by(EVTYPE) %>%
filter(value!=0)
print(popdata_melt)
## # A tibble: 26 x 3
## # Groups: EVTYPE [13]
## EVTYPE variable value
## <fctr> <fctr> <dbl>
## 1 TORNADO total_injuries 91346
## 2 TSTM WIND total_injuries 6957
## 3 FLOOD total_injuries 6789
## 4 EXCESSIVE HEAT total_injuries 6525
## 5 LIGHTNING total_injuries 5230
## 6 HEAT total_injuries 2100
## 7 ICE STORM total_injuries 1975
## 8 FLASH FLOOD total_injuries 1777
## 9 THUNDERSTORM WIND total_injuries 1488
## 10 HAIL total_injuries 1361
## # ... with 16 more rows
ggplot(popdata_melt, aes(x=EVTYPE,y=value,fill=factor(variable)))+
geom_col(position = "dodge")+
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_cartesian(ylim = c(0,10000)) +
labs(title = "Effects of most danderous events", x="Events",y="Total")
We purposely limit the number of injuries to 10000 since tornado is the outliner here. Nevertheless, we can conclude that tornado is the most harmful to population health in America.
We select 4 variables to answer this problem: PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP and EVTYPE. We also do cleaning up the data to get the right dollar value (renaming all the factor levels with “+”,“?”,“-” are 0 and “” is 1. The rest of the levels are recorded in the instruction)
econdata <- StormData %>%
select(EVTYPE, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP)
unique(econdata$PROPDMGEXP)
## [1] K M B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
unique(econdata$CROPDMGEXP)
## [1] M K m B ? 0 k 2
## Levels: ? 0 2 B k K m M
econdata$PROPDMGEXP <- as.character(econdata$PROPDMGEXP)
econdata$CROPDMGEXP <- as.character(econdata$CROPDMGEXP)
##relevels
econdata$PROPDMGEXP[econdata$PROPDMGEXP == ""] <- 1
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "-"] <- 0
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "?"] <- 0
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "+"] <- 0
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "0"] <- 0
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "1"] <- 10
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "2"] <- 10^2
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "3"] <- 10^3
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "4"] <- 10^4
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "5"] <- 10^5
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "6"] <- 10^6
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "7"] <- 10^7
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "8"] <- 10^8
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "B"] <- 10^9
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "h"] <- 10^2
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "H"] <- 10^2
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "K"] <- 10^3
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "m"] <- 10^6
econdata$PROPDMGEXP[econdata$PROPDMGEXP == "M"] <- 10^6
econdata$CROPDMGEXP[econdata$CROPDMGEXP == ""] <- 1
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "?"] <- 0
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "0"] <- 1
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "2"] <- 10^2
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "B"] <- 10^9
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "k"] <- 10^3
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "K"] <- 10^3
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "m"] <- 10^6
econdata$CROPDMGEXP[econdata$CROPDMGEXP == "M"] <- 10^6
econdata <- econdata %>%
mutate(propdmgtemp = as.numeric(PROPDMGEXP) * PROPDMG,
cropdmgtemp = as.numeric(CROPDMGEXP) * CROPDMG) %>%
group_by(EVTYPE) %>%
summarise(propdmgdollars = sum(propdmgtemp),
cropdmgdollars = sum(cropdmgtemp)) %>%
arrange(desc(propdmgdollars)) %>%
print()
## # A tibble: 985 x 3
## EVTYPE propdmgdollars cropdmgdollars
## <fctr> <dbl> <dbl>
## 1 FLOOD 144657709870 5661968450
## 2 HURRICANE/TYPHOON 69305840000 2607872800
## 3 TORNADO 56947380510 414953270
## 4 STORM SURGE 43323536000 5000
## 5 FLASH FLOOD 16822675580 1421317100
## 6 HAIL 15735267790 3025954473
## 7 HURRICANE 11868319010 2741910000
## 8 TROPICAL STORM 7703890550 678346000
## 9 WINTER STORM 6688497250 26944000
## 10 HIGH WIND 5270046260 638571300
## # ... with 975 more rows
summary(econdata$propdmgdollars, na.rm=T)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 0.000e+00 0.000e+00 4.347e+08 5.105e+04 1.447e+11
summary(econdata$cropdmgdollars, na.rm=T)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 0.000e+00 0.000e+00 4.985e+07 0.000e+00 1.397e+10
Like Q1, we will find the top 10 events for each category and graph them.
prop10 <- arrange(econdata, desc(econdata$propdmgdollars))[1:10,c(1,2)]
crop10 <- arrange(econdata, desc(econdata$cropdmgdollars))[1:10,c(1,3)]
prop10;crop10
## # A tibble: 10 x 2
## EVTYPE propdmgdollars
## <fctr> <dbl>
## 1 FLOOD 144657709870
## 2 HURRICANE/TYPHOON 69305840000
## 3 TORNADO 56947380510
## 4 STORM SURGE 43323536000
## 5 FLASH FLOOD 16822675580
## 6 HAIL 15735267790
## 7 HURRICANE 11868319010
## 8 TROPICAL STORM 7703890550
## 9 WINTER STORM 6688497250
## 10 HIGH WIND 5270046260
## # A tibble: 10 x 2
## EVTYPE cropdmgdollars
## <fctr> <dbl>
## 1 DROUGHT 13972566000
## 2 FLOOD 5661968450
## 3 RIVER FLOOD 5029459000
## 4 ICE STORM 5022113500
## 5 HAIL 3025954473
## 6 HURRICANE 2741910000
## 7 HURRICANE/TYPHOON 2607872800
## 8 FLASH FLOOD 1421317100
## 9 EXTREME COLD 1292973000
## 10 FROST/FREEZE 1094086000
par(mfrow=c(1,2), mar = c(12, 4, 1, 1))
barplot(prop10$propdmgdollars/(10^9),las=3,names.arg = prop10$EVTYPE,
col="cadetblue2",ylab = "Property Damage (billions)")
barplot(crop10$cropdmgdollars/(10^9),las=3, names.arg = crop10$EVTYPE,
col="coral1",ylab = "Crop Damage(billions)")
Now we want the plot to combine the effects of these events for both property and crop damage
econdata_melt <- melt(econdata, id.vars = "EVTYPE")
econdata_melt <- econdata_melt %>%
filter(EVTYPE %in% c(as.character(crop10$EVTYPE),
as.character(prop10$EVTYPE))) %>%
group_by(EVTYPE) %>%
filter(value!=0)
levels(econdata_melt$variable) <- c("Property Damage", "Crop Damage")
print(econdata_melt)
## # A tibble: 30 x 3
## # Groups: EVTYPE [15]
## EVTYPE variable value
## <fctr> <fctr> <dbl>
## 1 FLOOD Property Damage 144657709870
## 2 HURRICANE/TYPHOON Property Damage 69305840000
## 3 TORNADO Property Damage 56947380510
## 4 STORM SURGE Property Damage 43323536000
## 5 FLASH FLOOD Property Damage 16822675580
## 6 HAIL Property Damage 15735267790
## 7 HURRICANE Property Damage 11868319010
## 8 TROPICAL STORM Property Damage 7703890550
## 9 WINTER STORM Property Damage 6688497250
## 10 HIGH WIND Property Damage 5270046260
## # ... with 20 more rows
ggplot(econdata_melt, aes(x=EVTYPE,y=value/10^9,fill=variable))+
geom_col(position = "dodge")+
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_cartesian(ylim = c(0,140)) +
labs(title = "Effects on crops and property", x="Events",y="Total")
1.Based on this data, we found that tornadoes and excessive heat causes the most population health damages in the United States. 2.We also figures out that flood causes the biggest damage on property and drought causes the most damage on crop