Title: Reproducible Project 2
Author: Scott Semel
Date: October 23, 2015
# The csv file can be found here:
# https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2
# We are going to need these libraries.
library(data.table)
library(dplyr)
library(knitr)
require(data.table)
First, check what the column headers are. The columns containing long free text can be dropped while reading (see the commented-out call) for easier reading, but this is not necessary:
# dat=fread("repdata_data_StormData.csv", verbose=TRUE, drop = c("REMARKS","STATEOFFIC","COUNTYNAME","ZONENAMES"))
# Read only the first two rows so the column names and types can be inspected.
fread(input = "repdata_data_StormData.csv", nrows = 2L)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1: 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2: 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1: TORNADO 0 NA NA NA NA 0
## 2: TORNADO 0 NA NA NA NA 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1: NA 0 NA NA 14 100 3 0 0
## 2: NA 0 NA NA 2 150 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1: 15 25.0 K 0 NA NA NA NA
## 2: 0 2.5 K 0 NA NA NA NA
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1: 3040 8812 3051 8806 NA 1
## 2: 3042 8755 0 0 NA 2
# Load the complete storm data set.
dat <- fread(input = "repdata_data_StormData.csv")
##
Read 43.4% of 967216 rows
Read 69.3% of 967216 rows
Read 89.9% of 967216 rows
Read 902297 rows and 37 (of 37) columns from 0.523 GB file in 00:00:05
## Warning in fread("repdata_data_StormData.csv"): Read less rows (902297)
## than were allocated (967216). Run again with verbose=TRUE and please
## report.
# Work with a plain data.frame from here on.
dat <- as.data.frame(dat)
# Largest fatality and injury counts recorded for a single row.
max(dat[["FATALITIES"]])
## [1] 583
max(dat[["INJURIES"]])
## [1] 1700
# Row indices of the records at or above the given thresholds.
which(dat[["FATALITIES"]] >= 570)
## [1] 198704
#dat[198704,]
which(dat[["INJURIES"]] >= 1600)
## [1] 157885
#dat[157885,]
# Total fatalities (a) and injuries (b) per event type, largest first,
# limited to the ten highest totals.
#
# head() is used instead of `[1:10, ]`: indexing rows 1:10 of a table with
# fewer than ten rows pads the result with all-NA rows, whereas head()
# simply returns the rows that exist.
by_type <- group_by(dat, EVTYPE)

a <- by_type %>%
  summarize(TOTALFATAL = sum(FATALITIES)) %>%
  rename(EVENT = EVTYPE) %>%
  arrange(desc(TOTALFATAL)) %>%
  head(10)

b <- by_type %>%
  summarize(TOTALINJURIES = sum(INJURIES)) %>%
  rename(EVENT = EVTYPE) %>%
  arrange(desc(TOTALINJURIES)) %>%
  head(10)
# List the distinct exponent codes that occur in the property-damage and
# crop-damage exponent columns.
PROPDMGEXP <- as.factor(dat[["PROPDMGEXP"]])
levels(PROPDMGEXP)
## [1] "" "-" "?" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K"
## [18] "m" "M"
CROPDMGEXP <- as.factor(dat[["CROPDMGEXP"]])
levels(CROPDMGEXP)
## [1] "" "?" "0" "2" "B" "k" "K" "m" "M"
# Property damage per event type, restricted to rows whose exponent code is
# "B" or "b", largest total first, limited to the ten highest totals.
#
# NOTE(review): rows with other exponent codes (e.g. "K", "M") are excluded
# from these totals, so damage recorded in thousands or millions is not
# counted here -- confirm this is intended.
propdat <- dat[dat$PROPDMGEXP %in% c("B", "b"), ]
propdat$PROPDMG
## [1] 5.00 0.10 2.10 1.60 1.00 5.00 2.50 1.20 3.00 1.70
## [11] 3.00 1.50 5.15 1.00 1.04 2.50 5.42 1.30 4.83 4.00
## [21] 1.00 1.50 10.00 16.93 31.30 4.00 7.35 11.26 5.88 2.09
## [31] 115.00 1.00 4.00 1.50 1.80 1.00 1.50 2.80 1.00 2.00
by_type <- group_by(propdat, EVTYPE)
# The result is stored in `c`, which shadows base::c as a variable; the name
# is kept because the plotting code reads c$TOTALPROPCOST and c$EVENT.
# head() avoids the all-NA padding rows that `[1:10, ]` produces when there
# are fewer than ten event types.
c <- by_type %>%
  summarize(TOTALPROPCOST = sum(PROPDMG)) %>%
  rename(EVENT = EVTYPE) %>%
  arrange(desc(TOTALPROPCOST)) %>%
  head(10)
# Crop damage per event type, restricted to rows whose exponent code is
# "B" or "b", largest total first, limited to at most ten event types.
#
# NOTE(review): rows with other exponent codes (e.g. "K", "M") are excluded
# from these totals -- confirm this is intended.
cropdat <- dat[dat$CROPDMGEXP %in% c("B", "b"), ]
cropdat$CROPDMG
## [1] 0.40 5.00 0.50 0.20 5.00 1.51 1.00 0.00 0.00
by_type <- group_by(cropdat, EVTYPE)
# head() is used instead of `[1:10, ]`: only six event types have a "B"
# exponent, and indexing rows 1:10 padded the table with four all-NA rows.
d <- by_type %>%
  summarize(TOTALCROPCOST = sum(CROPDMG)) %>%
  rename(EVENT = EVTYPE) %>%
  arrange(desc(TOTALCROPCOST)) %>%
  head(10)
print(d)
## Source: local data frame [6 x 2]
##
## EVENT TOTALCROPCOST
## 1 ICE STORM 5.00
## 2 RIVER FLOOD 5.00
## 3 HURRICANE/TYPHOON 1.51
## 4 DROUGHT 1.50
## 5 HEAT 0.40
## 6 FREEZE 0.20
# Bar charts of the top event types by fatalities, injuries and property
# damage. Bar labels are drawn perpendicular to the axis (las = 2) and each
# plot is framed with box().
barplot(
  a$TOTALFATAL,
  names.arg = as.character(a$EVENT),
  col = "red",
  las = 2,
  main = "Event Type vs. Number of Deaths",
  ylab = "Total Deaths"
)
box()

barplot(
  b$TOTALINJURIES,
  names.arg = as.character(b$EVENT),
  col = "orange",
  las = 2,
  main = "Event Type vs. Number of Injuries",
  ylab = "Total Injuries"
)
box()

barplot(
  c$TOTALPROPCOST,
  names.arg = as.character(c$EVENT),
  col = "blue",
  las = 2,
  main = "Event Type vs Property Damage",
  ylab = "Billions $"
)
box()
#barplot(d$TOTALCROPCOST,names=as.character(d$EVENT),col="green",las=2,main="Event Type vs Crop Damage",ylab="Billions $")
#box()