library(knitr)
library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:data.table':
## 
##     between, last
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(scales)
library(ggplot2)
library(zoo)
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(lubridate)
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:data.table':
## 
##     hour, mday, month, quarter, wday, week, yday, year
library(maps)
library(maptools)
## Loading required package: sp
## Checking rgeos availability: FALSE
##      Note: when rgeos is not available, polygon geometry computations in maptools depend on gpclib,
##      which has a restricted licence. It is disabled by default;
##      to enable gpclib, type gpclibPermit()

getData<-function(n=902297, type='all'){
    # n: number of rows to read; type: one of 'all', 'alldamage', 'humandamage', 'economicdamage'.
    # I could not find a way to cleanly query the total number of entries without
    # loading the data first, which would defeat the purpose of the n argument;
    # as a workaround I loaded all of the data once, took nrow, and made that the default.

    stateData<-data.table(read.csv('/repos/data/states.csv', stringsAsFactors=FALSE, header=FALSE))
    setnames(stateData, old=c('V1','V2'), new=c('statename','sabrev'))
    path<-'/repos/data/reproducableResearch/peerAssesment2'
    compressedFilename<-paste0(path, '/stormData.csv.bz2')
    filename<-paste0(path, '/stormData.csv')
    # download and decompress the raw file only if it is not already present
    if(!any(grepl('stormData\\.csv', list.files(path)))){
        download.file('https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2', compressedFilename, method='curl')
        R.utils::bunzip2(filename=compressedFilename, destname=filename)
    }
    dat<-read.csv(file=filename, nrows=n, stringsAsFactors=FALSE)
    ## Dates: BGN_DATE carries a redundant midnight timestamp, so keep only its
    ## date portion and append the hour from BGN_TIME before parsing.
    dat$year<-as.numeric(format(strptime(paste0(gsub('^([0-9/]+)(.+)','\\1',dat$BGN_DATE), ':',dat$BGN_TIME), format='%m/%d/%Y:%H'), format='%Y'))
    ## Quarter: cut the month number into four equal bins (calendar quarters).
    dat$quarter<-as.numeric(cut(as.numeric(format(strptime(paste0(gsub('^([0-9/]+)(.+)','\\1',dat$BGN_DATE), ':',dat$BGN_TIME), format='%m/%d/%Y:%H'), format='%m')), breaks = 4, labels=FALSE))

    # Set factors and classes of other measurements
    dat$county<-factor(dat$COUNTY)
    dat$countyName<-factor(dat$COUNTYNAME)
    dat$sabrev<-as.character(dat$STATE)
    dat$state<-as.character(dat$STATE)
    dat$event<-factor(dat$EVTYPE)
    dat$beginLocation<-factor(dat$BGN_LOCATI)
    dat$endLocation<-factor(dat$END_LOCATI)
    dat$countyEnd<-factor(dat$COUNTY_END)
    dat$countyEndDN<-factor(dat$COUNTYENDN)
    dat$f<-factor(dat$F)
    dat$mag<-factor(dat$MAG)

    dat$wfo<-factor(dat$WFO)

    dat$beginRange<-as.numeric(dat$BGN_RANGE)
    dat$endRange<-as.numeric(dat$END_RANGE)
    dat$length<-as.numeric(dat$LENGTH)
    dat$width<-as.numeric(dat$WIDTH)

    # Damage and Injuries
    dat$propDmg<-as.double(dat$PROPDMG)
    dat$cropDmg<-as.double(dat$CROPDMG)
    dat$dmg<-as.double(dat$propDmg+dat$cropDmg)
    dat$fatalities<-as.numeric(dat$FATALITIES)
    dat$injuries<-as.numeric(dat$INJURIES)

    dat$cropDmgExp<-as.character(dat$CROPDMGEXP)
    dat$propDmgExp<-as.character(dat$PROPDMGEXP)

    # Location
    dat$lat<-as.numeric(dat$LATITUDE)
    dat$long<-as.numeric(dat$LONGITUDE)
    dat$lat2<-as.numeric(dat$LATITUDE_E)
    dat$long2<-as.numeric(dat$LONGITUDE_)
    dat$office<-factor(dat$STATEOFFIC)
    dat$zone<-factor(dat$ZONENAMES)

    # Other
    dat$ref<-factor(dat$REFNUM)
    dat$remarks<-as.character(dat$REMARKS)
    dat$beginAzimuth<-as.character(dat$BGN_AZI)
    dat$endAzimuth<-as.character(dat$END_AZI)

    dat<-merge(x=dat, y=stateData, by='sabrev')

    # Keep only the renamed variables and return a data.table.
    # I only kept the variables that seemed absolutely relevant to the questions
    # and that could not be derived as a function of other variables; the raw
    # columns and the location/range/remark fields are dropped.
    returnColumns<-c('year', 'quarter', 'countyName', 'sabrev', 'event', 'f','mag', 'fatalities', 'injuries',
                  'lat', 'long', 'lat2', 'long2', 'dmg', 'statename')
    dat<-data.table(dat[,returnColumns])
    dat<-dat[!is.na(dat$year),]
    # type 'all' falls through untouched; the other types filter to damaging rows
    type<-tolower(type)
    if(type=='alldamage')dat<- dat %>% filter(fatalities>0 | injuries > 0 | dmg > 0)
    if(type=='humandamage')dat<- dat %>% filter(fatalities>0 | injuries > 0 )
    if(type=='economicdamage')dat<- dat %>% filter(dmg > 0)
    dat
}

choroPlot <- function(x){
    # x must carry an 'lname' column (lower-case state name) and a 'count' column
    state<-map_data('state')
    x<-merge(x=state, y=x, by.x='region', by.y='lname')
    x<-x[order(x$order),]   # restore the polygon drawing order lost in the merge
    stripChartJunk<-theme(axis.title=element_blank(),
                          axis.text=element_blank(),
                          axis.ticks=element_blank(),
                          panel.grid=element_blank(),
                          panel.background=element_blank())

    divergingColors<-c('#A6611A','#DFC27D','#F5F5F5','#80CDC1','#018571')
    plt<-ggplot(data=x, aes(x=long, y=lat, group=group, fill=count))
    plt<-plt + geom_polygon(colour='black')+coord_map(projection = 'polyconic')

    colorScheme <- scale_fill_gradient2(low=divergingColors[5], mid=divergingColors[3],
                              high=divergingColors[1], midpoint=mean(x$count))
    plt + colorScheme + stripChartJunk
}
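
choroPlot() assumes its argument has an lname column (the lower-case state name, matching map_data('state')$region) and a numeric count column to fill by. A minimal sketch with hypothetical data:

demoCounts<-data.frame(statename=c('Texas','Florida'), count=c(10, 5))
demoCounts$lname<-tolower(demoCounts$statename)
# choroPlot(demoCounts)   # would shade just these two states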

printCurrency<-function(x){
    paste0("$", formatC(as.numeric(x), format="f", digits=2, big.mark=","))
}
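
A quick sanity check of printCurrency() on an arbitrary value:

printCurrency(1234567.891)
## [1] "$1,234,567.89"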
econWeather<-getData(type = 'economicdamage')
humanWeather<-getData(type = 'humandamage')
topEcon<-  econWeather %>% group_by( event ) %>% summarise(n=sum(dmg)) %>% arrange(desc(n))
topHuman<-  humanWeather %>% group_by( event ) %>% summarise(n=sum(fatalities + injuries)) %>% arrange(desc(n))

topEcon<-topEcon[1:10,]
topHuman<-topHuman[1:10,]
topEcon$price<-printCurrency(topEcon$n)

econWeather %>% group_by(sabrev, statename, event) %>% summarise(sum(dmg))
## Source: local data table [2,062 x 4]
## Groups: sabrev, statename
## 
##    sabrev statename            event sum(dmg)
## 1      AK    Alaska        AVALANCHE   869.00
## 2      AK    Alaska            FLOOD 10816.41
## 3      AK    Alaska WILD/FOREST FIRE  1975.00
## 4      AK    Alaska         WILDFIRE  1736.50
## 5      AK    Alaska        HIGH WIND  7649.50
## 6      AK    Alaska      STRONG WIND   221.80
## 7      AK    Alaska       HEAVY SNOW  1167.75
## 8      AK    Alaska         BLIZZARD   201.00
## 9      AK    Alaska      STORM SURGE  1380.59
## 10     AK    Alaska       HEAVY RAIN  1869.50
## ..    ...       ...              ...      ...
humanWeather %>% group_by(sabrev, statename, event) %>% summarise(sum(dmg))
## Source: local data table [1,417 x 4]
## Groups: sabrev, statename
## 
##    sabrev statename          event sum(dmg)
## 1      AK    Alaska     HEAVY SNOW   361.00
## 2      AK    Alaska      AVALANCHE   653.00
## 3      AK    Alaska      HIGH WIND   537.65
## 4      AK    Alaska     HIGH WINDS     5.00
## 5      AK    Alaska       BLIZZARD     0.00
## 6      AK    Alaska WINTER WEATHER    10.00
## 7      AK    Alaska      LIGHTNING     1.00
## 8      AK    Alaska   EXTREME COLD   250.00
## 9      AK    Alaska      ICE STORM   199.00
## 10     AK    Alaska  COASTAL FLOOD     0.00
## ..    ...       ...            ...      ...
econWeatherDamage <- econWeather %>% group_by(sabrev, statename, event) %>% summarise(economicsum=sum(dmg))
humanWeatherDamage <- humanWeather %>% group_by(sabrev, statename, event) %>% summarise(human=round(sum(injuries + fatalities)))

econ<-econWeatherDamage[econWeatherDamage$event %in% topEcon$event,]
human<-humanWeatherDamage[humanWeatherDamage$event %in% topHuman$event,]

totalEconCost<-econ %>% group_by(sabrev, statename) %>%  summarise(count=sum(economicsum))
totalEconCost$lname <- tolower(totalEconCost$statename)

totalHumanCost<- human %>% group_by(sabrev, statename) %>%  summarise(count=round(sum(human)))
totalHumanCost$lname <- tolower(totalHumanCost$statename)

Severe Weather Analysis (Health and Money)

Synopsis

This is an analysis of the human and economic costs of severe weather events in the United States, using the NOAA storm database. It identifies the event types most harmful to population health and those with the greatest economic consequences, and maps both kinds of cost by state.

Data Processing

Data processing began by downloading and bunzipping the NOAA file. (I am aware that read.csv can natively read gzip- and bzip2-compressed files; however, it is a personal preference of mine to uncompress first and then open the uncompressed file.)
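
For reference, a minimal sketch of the direct-read alternative; the path below is the same one assembled inside getData() and is an assumption about where the archive lives:

compressedFilename<-'/repos/data/reproducableResearch/peerAssesment2/stormData.csv.bz2'
dat<-read.csv(bzfile(compressedFilename), stringsAsFactors=FALSE)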

I found some oddities in the raw data: date, time, and time zone are stored in separate fields, and the column naming convention is awkward. To simplify the processing, I created a function called getData() which (given the large amount of data) lets me work with a subset of rows. Inside this function:

  • The date fields are cleaned up and converted to appropriate data structures.
  • The date fields are then converted to year and quarter fields.
  • A more appealing set of column names is added.
  • The data is converted to a data.table for the speed increase.
  • Non-relevant fields are dropped.
  • A row-count filter (n) is added.
  • A type filter is added (see the examples after this list):
    • all : returns all data
    • alldamage : returns all rows with positive damage of any kind
    • humandamage : returns all rows with some human cost, in injuries or fatalities
    • economicdamage : returns all rows with positive economic damage, either crop or property.
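
For example, assuming the raw file is already on disk:

first1k<-getData(n=1000, type='all')       # first 1,000 rows, unfiltered
costly<-getData(type='economicdamage')     # only rows with property or crop damage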

The first step after reading the data through getData() is to split it up into appropriate buckets and summarize.

  • topEcon and topHuman pull the ten most costly events under each of these filters.
  • totalEconCost and totalHumanCost are calculated for each state.
  • totalEconCost and totalHumanCost are passed through a choropleth function, which maps the per-state numeric totals onto the states.

Looking at the data, the primary categorical variables are going to be

  • year/quarter
  • state
  • event

These will be examined with respect to their effects on the following numerical variables:

  • humanCost (fatalities + injuries)
  • totalDamage

Because the state information is included only as an abbreviation, I downloaded a table of abbreviations to state names and merged the two together.

I want to visualize this data as choropleths, so I'm also taking the count by state.

Results

We were asked to answer the following questions with this assignment:

  • Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?
  • Across the United States, which types of events have the greatest economic consequences?

I was most interested in visualizing this as a choropleth map of the United States, but first I wanted to look at the costs year over year per state. To keep this viewable, I took the top 200 entries, mapped human cost to point size and economic cost to color, and plotted year on the y axis against state on the x axis.

preferredTheme<-theme(axis.text.x = element_text(size=11, angle=45, hjust=1, vjust=1))

eventByYear<-rbind(econWeather, humanWeather) %>% group_by( year, statename, event ) %>% summarise(humanCost=sum(fatalities, injuries), economicCost=sum(dmg))
eventByYear<-eventByYear[eventByYear$event %in% unique(c(as.character(topEcon$event), as.character(topHuman$event))),]
eventByYear<-eventByYear %>% arrange(year)
yearByState<-eventByYear %>% group_by(year,statename) %>% summarise(totalEconomicCost=sum(economicCost, na.rm=TRUE), totalHumanCost=sum(humanCost, na.rm=TRUE))

ybs <- head(yearByState %>% ungroup() %>% arrange(desc(totalEconomicCost*totalHumanCost)), 200)
ggplot(ybs) + aes(y=as.factor(year), x=statename, size=totalHumanCost, colour=totalEconomicCost) + geom_point() + preferredTheme

For Economic Data:

  • I took the top ten events in terms of the sum of the economic costs.
  • For the choropleth, I then calculated the total cost per state.

This data is listed in the table below.

kable(data.frame(topEcon %>% select(event, 'Cost'=price)))
event               Cost
TORNADO             $3,311,805.68
FLASH FLOOD         $1,579,015.50
TSTM WIND           $1,444,858.21
HAIL                $1,268,283.16
FLOOD               $1,063,924.86
THUNDERSTORM WIND   $942,445.81
LIGHTNING           $605,923.89
THUNDERSTORM WINDS  $464,390.56
HIGH WIND           $340,814.77
WINTER STORM        $134,699.58

For Human Cost Data:

  • I took the top ten events in terms of the sum of the injuries and fatality counts.
  • For the choropleth, I then calculated the total count per state.

This data is listed in the table below.

kable(data.frame(topHuman %>% transmute(event, 'Injuries & Deaths'=round(n, 0)), check.names=FALSE))
event               Injuries & Deaths
TORNADO             903839.01
TSTM WIND           110934.03
FLASH FLOOD         68211.79
HIGH WIND           42164.30
THUNDERSTORM WIND   39896.66
FLOOD               39409.22
THUNDERSTORM WINDS  33095.48
WILDFIRE            20595.70
LIGHTNING           20334.35
WINTER STORM        15256.91

Since the questions are general in nature, I combine the two economic indicators (property and crop damage) into a single 'Damages' measure.
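
Concretely, this is the dmg column already built inside getData():

dat$dmg<-as.double(dat$propDmg+dat$cropDmg)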

First, looking at a map of the total economic damage per state for all time, we see that Texas, Florida, and California stand out.

choroPlot(totalEconCost)

Comparing that visually with the total human cost (injuries plus fatalities) per state for all time:

choroPlot(totalHumanCost)