NOAA storm data was read, processed to extract the variables we need, and related to human health, property, and agriculture effects for the respective years. Below I lay out the assumptions made when prioritising the data, so that the greatest effects can be identified efficiently.
#install.packages("R.utils")
library("R.utils")
library("data.table")
# Unzip the original file only if the CSV is not already present.
# remove = FALSE keeps the compressed original, so it can be reused later.
if (!file.exists("repdata-data-StormData.csv")) {
  R.utils::bunzip2("repdata-data-StormData.csv.bz2", remove = FALSE)
}
# By setting data.table = FALSE you get a plain data.frame
df <- data.table::fread("repdata-data-StormData.csv", data.table = FALSE)
## Read 902297 rows and 37 (of 37) columns from 0.523 GB file in 00:00:18
## Warning in data.table::fread("repdata-data-StormData.csv", data.table =
## FALSE): Read less rows (902297) than were allocated (967216). Run again
## with verbose=TRUE and please report.
# Keep only the columns needed to answer both questions
# Justification: subsetting early reduces the memory footprint and speeds up later operations
df <- df[, c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]
library("ggplot2")
qplot(EVTYPE, FATALITIES, data = fat) + geom_bar(stat = "identity") +
  ylab("No. of Fatalities") + xlab("Environmental Disaster Type") +
  ggtitle("Estimates of Fatalities") +
  theme(axis.text.x = element_text(angle = 80, hjust = 1))
table(df$PROPDMGEXP)
##
##             -      ?      +      0      1      2      3      4      5
## 465934      1      8      5    216     25     13      4      4     28
##      6      7      8      B      h      H      K      m      M
##      4      5      1     40      1      6 424665      7  11330
table(df$CROPDMGEXP)
##
##             ?      0      2      B      k      K      m      M
## 618413      7     19      1      9     21 281832      1   1994
# The following counts the rows whose exponent symbols carry a significant multiplier
# I treat this subset as insignificant: c("", "?", "-", "0")
x <- setdiff(df$PROPDMGEXP, c("", "?", "-", "0"))
sum(df$PROPDMGEXP %in% x)
## [1] 436138
y <- setdiff(df$CROPDMGEXP, c("", "?", "-", "0"))
sum(df$CROPDMGEXP %in% y)
## [1] 283858
library("reshape2")
fmelt <- melt(lazydf, id = c("EVTYPE"), measure.vars = c("SUMPROPDMG", "SUMCROPDMG"))
fcast <- dcast(fmelt, EVTYPE ~ variable, sum, na.rm = TRUE)
# Top 15 event types by total property damage and by total crop damage
property <- head(fcast[order(fcast$SUMPROPDMG, decreasing = TRUE), ], n = 15)
crop <- head(fcast[order(fcast$SUMCROPDMG, decreasing = TRUE), ], n = 15)
library("ggplot2")
qplot(EVTYPE, SUMPROPDMG, data = property) + geom_bar(stat = "identity") +
  ylab("Property Damage Amount $") + xlab("Environmental Disaster Type") +
  ggtitle("Estimates of Property Damage") +
  theme(axis.text.x = element_text(angle = 80, hjust = 1))
qplot(EVTYPE, SUMCROPDMG, data = crop) + geom_bar(stat = "identity") +
  ylab("Crop Damage Amount $") + xlab("Environmental Disaster Type") +
  ggtitle("Estimates of Crop Damage") +
  theme(axis.text.x = element_text(angle = 80, hjust = 1))