library(knitr)
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:data.table':
##
## between, last
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(scales)
library(ggplot2)
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:data.table':
##
## hour, mday, month, quarter, wday, week, yday, year
library(maps)
library(maptools)
## Loading required package: sp
## Checking rgeos availability: FALSE
## Note: when rgeos is not available, polygon geometry computations in maptools depend on gpclib,
## which has a restricted licence. It is disabled by default;
## to enable gpclib, type gpclibPermit()
getData <- function(n = 902297, type = 'all'){
  # I could not find a clean way to query the total number of entries without
  # loading the data first (which defeats the purpose of the n argument), so I
  # loaded the full file once, took nrow(), and set that as the default.
  stateData <- data.table(read.csv('/repos/data/states.csv', stringsAsFactors = FALSE, header = FALSE))
  setnames(stateData, old = c('V1', 'V2'), new = c('statename', 'sabrev'))
  path <- '/repos/data/reproducableResearch/peerAssesment2'
  compressedFilename <- paste0(path, '/stormData.csv.bz2')
  filename <- paste0(path, '/stormData.csv')
  # valid types: 'all', 'alldamage', 'humandamage', 'economicdamage'
  # download and decompress only if the csv is not already present
  if(!any(grepl('stormData\\.csv', list.files(path)))){
    download.file('https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2', compressedFilename, method = 'curl')
    R.utils::bunzip2(filename = compressedFilename, destname = filename)
  }
  dat <- read.csv(file = filename, nrows = n, stringsAsFactors = FALSE)
  # set dates: BGN_DATE holds the date and BGN_TIME the hour, so combine and
  # parse them, then derive the year and the quarter of the year
  dat$year <- as.numeric(format(strptime(paste0(gsub('^([0-9/]+)(.+)', '\\1', dat$BGN_DATE), ':', dat$BGN_TIME), format = '%m/%d/%Y:%H'), format = '%Y'))
  dat$quarter <- as.numeric(cut(as.numeric(format(strptime(paste0(gsub('^([0-9/]+)(.+)', '\\1', dat$BGN_DATE), ':', dat$BGN_TIME), format = '%m/%d/%Y:%H'), format = '%m')), breaks = 4, labels = FALSE))
  # set factors and classes of the other measurements
  dat$county <- factor(dat$COUNTY)
  dat$countyName <- factor(dat$COUNTYNAME)
  dat$sabrev <- as.character(dat$STATE)
  dat$state <- as.character(dat$STATE)
  dat$event <- factor(dat$EVTYPE)
  dat$beginLocation <- factor(dat$BGN_LOCATI)
  dat$endLocation <- factor(dat$END_LOCATI)
  dat$countyEnd <- factor(dat$COUNTY_END)
  dat$countyEndDN <- factor(dat$COUNTYENDN)
  dat$f <- factor(dat$F)       # tornado F-scale
  dat$mag <- factor(dat$MAG)
  dat$wfo <- factor(dat$WFO)
  dat$beginRange <- as.numeric(dat$BGN_RANGE)
  dat$endRange <- as.numeric(dat$END_RANGE)
  dat$length <- as.numeric(dat$LENGTH)
  dat$width <- as.numeric(dat$WIDTH)
  # damage and injuries
  dat$propDmg <- as.double(dat$PROPDMG)
  dat$cropDmg <- as.double(dat$CROPDMG)
  dat$dmg <- as.double(dat$propDmg + dat$cropDmg)
  dat$fatalities <- as.numeric(dat$FATALITIES)
  dat$injuries <- as.numeric(dat$INJURIES)
  dat$cropDmgExp <- as.character(dat$CROPDMGEXP)
  dat$propDmgExp <- as.character(dat$PROPDMGEXP)
  # location
  dat$lat <- as.numeric(dat$LATITUDE)
  dat$long <- as.numeric(dat$LONGITUDE)
  dat$lat2 <- as.numeric(dat$LATITUDE_E)
  dat$long2 <- as.numeric(dat$LONGITUDE_)
  dat$office <- factor(dat$STATEOFFIC)
  dat$zone <- factor(dat$ZONENAMES)
  # other
  dat$ref <- factor(dat$REFNUM)
  dat$remarks <- as.character(dat$REMARKS)
  dat$beginAzimuth <- as.character(dat$BGN_AZI)
  dat$endAzimuth <- as.character(dat$END_AZI)
  # attach full state names via the abbreviation lookup
  dat <- merge(x = dat, y = stateData, by = 'sabrev')
  # variables dropped below, listed here for reference; the actual selection
  # is done with returnRows
  removedVariables <- c('beginLocation', 'endLocation', 'countyEnd', 'countyEndDN',
                        'wfo', 'office', 'ref', 'beginRange', 'endRange', 'length',
                        'width', 'propDmgExp', 'cropDmgExp', 'remarks',
                        'beginAzimuth', 'endAzimuth', 'county', 'zone')
  # I only kept the variables that seemed absolutely relevant to answering the
  # questions and that could not be derived as a function of other variables.
  returnRows <- c('year', 'quarter', 'countyName', 'sabrev', 'event', 'f', 'mag',
                  'fatalities', 'injuries', 'lat', 'long', 'lat2', 'long2',
                  'dmg', 'statename')
  dat <- data.table(dat[, returnRows])
  dat <- dat[!is.na(dat$year), ]
  if(tolower(type) == 'all') dat <- dat
  if(tolower(type) == 'alldamage') dat <- dat %>% filter(fatalities > 0 | injuries > 0 | dmg > 0)
  if(tolower(type) == 'humandamage') dat <- dat %>% filter(fatalities > 0 | injuries > 0)
  if(tolower(type) == 'economicdamage') dat <- dat %>% filter(dmg > 0)
  dat
}
choloPlot <- function(x){
  # draw a choropleth of a per-state measure; x must have an 'lname' column
  # (lower-case state name) and a 'count' column to fill by
  state <- map_data('state')
  x <- merge(x = state, y = x, by.x = 'region', by.y = 'lname')
  x <- x[order(x$order), ]   # merge scrambles the polygon drawing order
  stripChartJunk <- theme(axis.title = element_blank(),
                          axis.text = element_blank(),
                          axis.ticks = element_blank(),
                          panel.grid = element_blank(),
                          panel.background = element_blank())
  divergingColors <- c('#A6611A', '#DFC27D', '#F5F5F5', '#80CDC1', '#018571')
  plt <- ggplot(data = x, aes(x = long, y = lat, group = group, fill = count))
  # coord_map's polyconic projection requires the mapproj package
  plt <- plt + geom_polygon(colour = 'black') + coord_map(projection = 'polyconic')
  colorScheme <- scale_fill_gradient2(low = divergingColors[5], mid = divergingColors[3],
                                      high = divergingColors[1], midpoint = mean(x$count))
  plt + colorScheme + stripChartJunk
}
printCurrency <- function(x){
  # format a numeric vector as a comma-separated dollar string
  paste0('$', formatC(as.numeric(x), format = 'f', digits = 2, big.mark = ','))
}
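For example:
printCurrency(1234.5)   # returns "$1,234.50"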
econWeather<-getData(type = 'economicdamage')
humanWeather<-getData(type = 'humandamage')
topEcon<- econWeather %>% group_by( event ) %>% summarise(n=sum(dmg)) %>% arrange(desc(n))
# rank events by total human impact; this sums fatalities and injuries rather
# than dmg, to match the 'Injuries & Deaths' label used below
topHuman<- humanWeather %>% group_by( event ) %>% summarise(n=sum(fatalities + injuries)) %>% arrange(desc(n))
topEcon<-topEcon[1:10,]
topHuman<-topHuman[1:10,]
topEcon$price<-printCurrency(topEcon$n)
econWeather %>% group_by(sabrev, statename, event) %>% summarise(sum(dmg))
## Source: local data table [2,062 x 4]
## Groups: sabrev, statename
##
## sabrev statename event sum(dmg)
## 1 AK Alaska AVALANCHE 869.00
## 2 AK Alaska FLOOD 10816.41
## 3 AK Alaska WILD/FOREST FIRE 1975.00
## 4 AK Alaska WILDFIRE 1736.50
## 5 AK Alaska HIGH WIND 7649.50
## 6 AK Alaska STRONG WIND 221.80
## 7 AK Alaska HEAVY SNOW 1167.75
## 8 AK Alaska BLIZZARD 201.00
## 9 AK Alaska STORM SURGE 1380.59
## 10 AK Alaska HEAVY RAIN 1869.50
## .. ... ... ... ...
humanWeather %>% group_by(sabrev, statename, event) %>% summarise(sum(dmg))
## Source: local data table [1,417 x 4]
## Groups: sabrev, statename
##
## sabrev statename event sum(dmg)
## 1 AK Alaska HEAVY SNOW 361.00
## 2 AK Alaska AVALANCHE 653.00
## 3 AK Alaska HIGH WIND 537.65
## 4 AK Alaska HIGH WINDS 5.00
## 5 AK Alaska BLIZZARD 0.00
## 6 AK Alaska WINTER WEATHER 10.00
## 7 AK Alaska LIGHTNING 1.00
## 8 AK Alaska EXTREME COLD 250.00
## 9 AK Alaska ICE STORM 199.00
## 10 AK Alaska COASTAL FLOOD 0.00
## .. ... ... ... ...
econWeatherDamage <- econWeather %>% group_by(sabrev, statename, event) %>% summarise(economicsum=sum(dmg))
humanWeatherDamage <- humanWeather %>% group_by(sabrev, statename, event) %>% summarise(human=round(sum(injuries + fatalities)))
econ<-econWeatherDamage[econWeatherDamage$event %in% topEcon$event,]
human<-humanWeatherDamage[humanWeatherDamage$event %in% topHuman$event,]
totalEconCost<-econ %>% group_by(sabrev, statename) %>% summarise(sum(economicsum))
totalEconCost$lname <- tolower(totalEconCost$statename)
setnames(totalEconCost, old = 'sum(economicsum)', new='count')
totalHumanCost<- human %>% group_by(sabrev, statename) %>% summarise(round(sum(human)))
totalHumanCost$lname <- tolower(totalHumanCost$statename)
setnames(totalHumanCost, old = 'round(sum(human))', new='count')
This is an analysis of the human and economic costs of severe weather events in the United States.
Data processing began by downloading and bunzipping the NOAA file. (I am aware that read.csv can natively open gzip- and bzip2-compressed files; it is a personal preference of mine to decompress first and then read the uncompressed file.)
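For reference, a minimal sketch of the direct route; the nrows value here is illustrative:
# illustrative only: read.csv reads the bzip2 file transparently through a connection
sampleDat <- read.csv('/repos/data/reproducableResearch/peerAssesment2/stormData.csv.bz2',
                      nrows = 1000, stringsAsFactors = FALSE)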
I found some oddities, both in the choice of storing the date, time, and time zone in separate fields and in an awkward column-naming convention. To simplify the processing, I created a function called getData() which (due to the large amount of data) allows me to specify a subset to work with. Inside this function the raw columns are parsed into dates, factors, and numeric classes and renamed to something more readable.
The first step after I read the data through the getData(numberOfRows) function is to split it up into appropriate buckets and summarize.
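A typical workflow, with illustrative argument values:
# illustrative only: a small sample for fast iteration, then the two subsets used below
sampleDat <- getData(n = 10000, type = 'all')
econWeather <- getData(type = 'economicdamage')   # rows with economic damage
humanWeather <- getData(type = 'humandamage')     # rows with fatalities or injuries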
Looking at the data, the primary categorical variables are going to be the event type (event), state, county name, F-scale (f), magnitude (mag), year, and quarter, with respect to their effects on the following numerical variables: fatalities, injuries, and combined property and crop damage (dmg).
Because the state information is included only as an abbreviation, I downloaded a table of abbreviation-to-state-name mappings and merged it into the data, as sketched below.
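A minimal sketch of that lookup merge (the same steps getData() performs; V1 and V2 are the default names read.csv assigns when header = FALSE):
stateData <- data.table(read.csv('/repos/data/states.csv', header = FALSE, stringsAsFactors = FALSE))
setnames(stateData, old = c('V1', 'V2'), new = c('statename', 'sabrev'))
dat <- merge(x = dat, y = stateData, by = 'sabrev')   # adds full state names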
I want to visualize this data as choropleths, so I am also taking the per-state totals.
We were asked to answer the following questions with this assignment:
1. Across the United States, which types of events are most harmful with respect to population health?
2. Across the United States, which types of events have the greatest economic consequences?
I was most interested in visualizing this as a choropleth map of the United States, but first I wanted to look at the costs year over year per state. To make this viewable, I took the top 200 entries, mapped the human cost to point size and the economic cost to color, and then plotted year on the y-axis against state on the x-axis.
preferedTheme<-theme(axis.text.x = element_text(size=11, angle=45, hjust=1, vjust=1))
eventByYear<-rbind(econWeather, humanWeather) %>% group_by( year, statename, event ) %>% summarise(humanCost=sum(fatalities, injuries), economicCost=sum(dmg))
eventByYear<-eventByYear[eventByYear$event %in% unique(c(as.character(topEcon$event), as.character(topHuman$event))),]
eventByYear<-eventByYear %>% arrange(year)
yearByState<-eventByYear %>% group_by(year,statename) %>% summarise(totalEconomicCost=sum(economicCost, na.rm=TRUE), totalHumanCost=sum(humanCost, na.rm=TRUE))
ybs <- (yearByState %>% ungroup() %>% arrange(desc(totalEconomicCost*totalHumanCost)))[1:200, ]
ggplot(ybs) + aes(y=as.factor(year), x=statename, size=totalHumanCost, colour=totalEconomicCost) + geom_point() + preferedTheme
For Economic Data:
The ten costliest event types are listed in the table below.
kable(data.frame(topEcon %>% select(event, 'Cost'=price)))
| event | Cost |
|---|---|
| TORNADO | $3,311,805.68 |
| FLASH FLOOD | $1,579,015.50 |
| TSTM WIND | $1,444,858.21 |
| HAIL | $1,268,283.16 |
| FLOOD | $1,063,924.86 |
| THUNDERSTORM WIND | $942,445.81 |
| LIGHTNING | $605,923.89 |
| THUNDERSTORM WINDS | $464,390.56 |
| HIGH WIND | $340,814.77 |
| WINTER STORM | $134,699.58 |
For Human Cost Data:
The ten event types with the greatest human impact are listed in the table below.
kable(data.frame(topHuman %>% transmute(event, `Injuries & Deaths` = round(n, 0)), check.names = FALSE))
| event | Injuries & Deaths |
|---|---|
| TORNADO | 903839.01 |
| TSTM WIND | 110934.03 |
| FLASH FLOOD | 68211.79 |
| HIGH WIND | 42164.30 |
| THUNDERSTORM WIND | 39896.66 |
| FLOOD | 39409.22 |
| THUNDERSTORM WINDS | 33095.48 |
| WILDFIRE | 20595.70 |
| LIGHTNING | 20334.35 |
| WINTER STORM | 15256.91 |
Since the questions are general in nature, I will be combining the economic indicators (property and crop damage) into a single ‘Damages’ measure.
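That combination is a single line inside getData():
dat$dmg <- as.double(dat$propDmg + dat$cropDmg)   # property plus crop damage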
First, looking at a plot of the total cost of all economically damaging events for all time, we see that Texas, Florida, and California have seen the highest totals.
choloPlot(totalEconCost)
Comparing that visually with the totals for all health-damaging events for all time:
choloPlot(totalHumanCost)