library(R.utils)
## Warning: package 'R.utils' was built under R version 3.1.3
## Loading required package: R.oo
## Warning: package 'R.oo' was built under R version 3.1.3
## Loading required package: R.methodsS3
## Warning: package 'R.methodsS3' was built under R version 3.1.3
## R.methodsS3 v1.7.0 (2015-02-19) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.19.0 (2015-02-27) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
##
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
##
## The following objects are masked from 'package:base':
##
## attach, detach, gc, load, save
##
## R.utils v2.0.0 (2015-02-28) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
##
## The following object is masked from 'package:utils':
##
## timestamp
##
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, inherits, isOpen, parse, warnings
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3
This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
For this analysis, I am interested in fatalities, injuries, property damage and crop damage caused by severe weather in the U.S. Therefore, I begin by subsetting the data to include only those four things.
Then I look at the health effects and economic effects seperately.
This is how I uploaded the NOAA data:
bunzip2("repdata-data-StormData.csv.bz2","StormData2.csv",remove=FALSE)
data <- read.csv("StormData.csv",as.is=TRUE,comment.char="")
This is a very large dataset, so the first thing I decided to do was subset it and select the rows that recorded positive values for economic or health damage:
q <- (data$FATALITIES > 0) | (data$INJURIES > 0) | (data$PROPDMG > 0) | (data$CROPDMG > 0)
data <- data[q,]
data$EVTYPE <- toupper(x = data$EVTYPE)
data$EVTYPE <- gsub(" +",
" ",
data$EVTYPE)
data$EVTYPE <- gsub(" $",
"",
data$EVTYPE)
There are over 400 event types in the data; however, the National Weather Service Storm Data Documentation Table 2.1.1 only has around 100. This is the code that tries to bring these two numbers closer together and clean up the event types:
data$EVTYPE[grep("NON-THUNDERSTORM WIND|NON THUNDERSTORM WIND|GUSTY WIND|^WIND|^WINDS|HIGH WIND.|WIND DAMAGE",data$EVTYPE)] <- "HIGH WIND"
data$EVTYPE[grep("STRONG WINDS|STRONG WIND",data$EVTYPE)]<-"STRONG WIND"
data$EVTYPE[grep("THUNDEERSTORM|THUNDERESTORM|THUNDERSTROM|THUDERSTORM|THUNERSTORM|THUNDERTORM|TUNDERSTORM|TSTM WIND|STORM FORCE WINDS|THUNDERSTORMW|THUNDERSTORMWINDS|THUNDERSTORM WINS|TSTMW|THUNDERSTORMS WIND|THUNDERSTORM WINDS|THUNDERSTORM WIND/HAIL|THUNDERSTORM WINDS HAIL|THUNDERSTORM WINDSS|SEVERE THUNDERSTORM|THUNDERSTORM|THUNDERSTORM WIND......*",
data$EVTYPE)] <- "THUNDERSTORM WIND"
data$EVTYPE[grep("URBAN/SML STREAM FLD|FLASH FLOODING|FLOOD/FLASH FLOOD|RIVER FLOOD|URBAN FLOODING|URBAN/SMALL STREAM FLOOD|FLASH FLOODING/FLOOD|FLASH FLOODS|URBAN FLOOD|FLASH FLOOD/FLOOD|FLASH",
data$EVTYPE)] <-"FLASH FLOOD"
data$EVTYPE[grep("HEAVY RAINS/FLOODING|^FLOODING|^FLOOD.",
data$EVTYPE)]<-"FLOOD"
data$EVTYPE[grep("LAKE-EFFECT SNOW|EXCESSIVE SNOW|SQUAL|HEAVY SNOW",
data$EVTYPE)]<-"HEAVY SNOW"
data$EVTYPE[grep("HEAVY RAINS|^RAIN|PRECIPITATION",
data$EVTYPE)]<-"HEAVY RAIN"
data$EVTYPE[grep("HEAVY MIX|RECORD SNOW",
data$EVTYPE)]<-"WINTER STORM"
data$EVTYPE[grep("WINTER WEATHER|LIGHT SNOW|^SNOW|FREEZING RAIN|LIGHT FREEZING RAIN|SNOW FREEZING RAIN|FREEZING DRIZZLE|WINTRY|LAKE EFFECT SNOW|ICY ROADS|^ICE$",
data$EVTYPE)] <-"WINTER WEATHER"
data$EVTYPE[grep("FALLING SNOW/ICE|GLAZE ICE|HARD FREEZE|ICE FLOES",
data$EVTYPE)]<-"ICE STORM"
data$EVTYPE[grep("FIRE",
data$EVTYPE)]<-"WILDFIRE"
data$EVTYPE[grep("SLIDE",
data$EVTYPE)]<-"LANDSLIDE"
data$EVTYPE[grep("TORNADO",
data$EVTYPE)]<-"TORNADO"
data$EVTYPE[grep("HURRICANE|TYPHOON",
data$EVTYPE)]<-"HURRICANE"
data$EVTYPE[grep("UNSEASONABLY COLD|^COLD|EXTENDED COLD|UNSEASONABLE COLD|LOW TEMPERATURE",
data$EVTYPE)]<-"COLD/WIND CHILL"
data$EVTYPE[grep("EXTREME COLD/WIND CHILL|RECORD COLD|EXTREME WINDCHILL|HYPOTHERMIA/EXPOSURE",
data$EVTYPE)]<-"EXTREME COLD"
data$EVTYPE[grep("FROST|FREEZE",
data$EVTYPE)]<-"FROST/FREEZE"
data$EVTYPE[grep("^FOG",
data$EVTYPE)]<-"DENSE FOG"
data$EVTYPE[grep("BLIZZARD",
data$EVTYPE)]<-"BLIZZARD"
data$EVTYPE[grep("HAIL",
data$EVTYPE)]<-"HAIL"
data$EVTYPE[grep("RIP",
data$EVTYPE)]<-"RIP CURRENT"
data$EVTYPE[grep("SURF|HIGH WATER|HIGH SWELLS|HIGH TIDE",
data$EVTYPE)]<-"HIGH SURF"
data$EVTYPE[grep("HEAT",
data$EVTYPE)]<-"HEAT"
That is probably as good as it is going to get. Now, I will aggregate the event types with a smaller count than “OTHER” into the “OTHER” event type.
event.table <- sort(table(data$EVTYPE), decreasing = TRUE)
event.types.under.OTHER <- names(event.table[event.table < 34])
data$EVTYPE[data$EVTYPE %in% event.types.under.OTHER] <- "OTHER"
data$BGN_DATE <- as.Date(data$BGN_DATE,format="%m/%d/%Y %H:%M:%S")
data$BGN_DATE <- format(data$BGN_DATE, format="%Y")
data$BGN_DATE <- as.factor(data$BGN_DATE)
data$EVTYPE <- as.factor(data$EVTYPE)
My final event types table looks like this:
sort(table(data$EVTYPE),decreasing=TRUE)
##
## THUNDERSTORM WIND TORNADO HAIL
## 119822 39969 26164
## FLASH FLOOD LIGHTNING FLOOD
## 22662 13293 10253
## HIGH WIND STRONG WIND HEAVY SNOW
## 6372 3472 1640
## WINTER STORM WILDFIRE HEAVY RAIN
## 1518 1259 1154
## HEAT WINTER WEATHER ICE STORM
## 980 902 716
## RIP CURRENT TROPICAL STORM EXTREME COLD
## 643 416 340
## OTHER AVALANCHE DROUGHT
## 290 268 266
## BLIZZARD HIGH SURF HURRICANE
## 256 240 232
## LANDSLIDE DENSE FOG STORM SURGE
## 209 182 177
## COASTAL FLOOD FROST/FREEZE COLD/WIND CHILL
## 168 153 151
## DUST STORM DUST DEVIL DRY MICROBURST
## 103 95 78
## COASTAL FLOODING STORM SURGE/TIDE WATERSPOUT
## 61 47 47
## TROPICAL DEPRESSION
## 35
#Property Damage = PROP
valid.letter <- c("H", "K", "M", "B")
multiplier <- c(100, 1000, 1e+06, 1e+09)
names(multiplier) <- valid.letter
data$PROPDMGEXP <- toupper(data$PROPDMGEXP)
valid <- data$PROPDMGEXP %in% valid.letter
data$PROPDMGEXP[!valid] <- ""
data$PROP[valid] <- data$PROPDMG[valid] * multiplier[data$PROPDMGEXP[valid]]
#Crop Damage = CROP
data$CROPDMGEXP <- toupper(data$CROPDMGEXP)
valid2 <- data$CROPDMGEXP %in% valid.letter
data$CROPDMGEXP[!valid2] <- ""
data$CROP[valid2] <- data$CROPDMG[valid2] * multiplier[data$CROPDMGEXP[valid2]]
F <- data %>% group_by(EVTYPE) %>%
summarize(total.fatality = sum(FATALITIES),
mean.fatality = mean(FATALITIES),
total.injury = sum(INJURIES),
mean.injury = mean(INJURIES),
total = n(),na.rm=TRUE) %>%
filter(mean.fatality > .1|total.fatality>500)%>%
arrange(desc(total.fatality))
ggplot(F, aes(x = reorder(EVTYPE,total.fatality), y = total))+
geom_point(aes(size = F$total.injury),color="red",alpha=.35)+
geom_point(aes(color=F$mean.fatality, size = F$total.fatality),alpha=.85)+
labs(x="Weather Event",y="Total Events",title="Fatal Weather Events Since 1950",size="Total Fatalities", color="Mean Fatality Rate")+
scale_size_continuous(range=c(1,15), limit =c(10,2000),na.value=16)+
theme(axis.text.x = element_text(face="bold", angle = 60, hjust = 1))
This scatterplot displays the weather events with a mean fatality rate above 10% or a total fatality count of 500 or more. The blue represents the number of fatalities, and the red represents the number of injuries.
D <- data %>% group_by(EVTYPE)%>%
summarize(total = n(),
total.crop.damage = sum(CROP,
na.rm=TRUE),
mean.crop.damage = mean(CROP,
na.rm=TRUE),
total.property.damage = sum(PROP,
na.rm=TRUE),
mean.property.damage = mean(PROP,
na.rm=TRUE)) %>%
filter(total.crop.damage>10000000 & total.property.damage>10000000) %>%
arrange(desc(total.property.damage))
ggplot(D, aes(x = reorder(EVTYPE,total.property.damage), y = total))+
geom_point(aes(size = D$total.crop.damage),color="dark green",alpha=.35)+
geom_point(aes(color=D$mean.property.damage, size = D$total.property.damage),alpha=.45)+
labs(x="Weather Event",y="Total Events",title="Crop/Property Damage Caused by Weather Events Since 1950",size="Total Damage", color="Mean Damage per Event")+
scale_size_continuous(range=c(1,20), limit =c(100000,6600000000),na.value=21)+
theme(axis.text.x = element_text(face="bold", angle = 60, hjust = 1))
This plot shows the weather events that have caused more than $10 million in both crop and property damage since 1950. The green points represent crop damage and the blue points represent property damage.