## Load the libraries and packages needed to analyze and visualize the storm
## data, then load and process the data for analysis. The data are analyzed by
## unit and by type in order to answer two questions: which types of events are
## most harmful to population health, and which types of events have the
## greatest economic consequences. Finally, the results are visualized section
## by section.
library(data.table)
library(ggplot2)
library(grid)
library(gridExtra)
library(knitr)
# Switch to the author's project directory when it exists on this machine.
# The path is machine-specific, so on any other computer the script keeps the
# current working directory instead of aborting with
# "cannot change working directory"; data.csv is read relative to whichever
# directory ends up being current.
projectDir <- path.expand("~/Desktop/ReproducibleResearchAssignment2")
if (isTRUE(file.info(projectDir)$isdir)) {
  setwd(projectDir)
}
# Record the R version and the attached packages used for this analysis.
sessionInfo()
## R version 3.1.0 (2014-04-10)
## Platform: x86_64-apple-darwin10.8.0 (64-bit)
##
## locale:
## [1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] gridExtra_0.9.1 ggplot2_1.0.0 data.table_1.9.2 knitr_1.6
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.2-4 digest_0.6.4 evaluate_0.5.5 formatR_0.10
## [5] gtable_0.1.2 MASS_7.3-33 munsell_0.4.2 plyr_1.8.1
## [9] proto_0.3-10 Rcpp_0.11.2 reshape2_1.4 scales_0.2.4
## [13] stringr_0.6.2 tools_3.1.0
# Read the raw storm records; text columns are kept as character vectors
# rather than being converted to factors.
data <- read.csv(
  file = "data.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Print the first rows as a quick check of the import.
head(data)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 0
## 2 TORNADO 0 0
## 3 TORNADO 0 0
## 4 TORNADO 0 0
## 5 TORNADO 0 0
## 6 TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## 4 NA 0 0.0 100 2 0 0
## 5 NA 0 0.0 150 2 0 0
## 6 NA 0 1.5 177 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0
## 2 0 2.5 K 0
## 3 2 25.0 K 0
## 4 2 2.5 K 0
## 5 2 2.5 K 0
## 6 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
## 4 3458 8626 0 0 4
## 5 3412 8642 0 0 5
## 6 3450 8748 0 0 6
# Convert the damage exponent codes (PROPDMGEXP / CROPDMGEXP) into numeric
# powers of ten and scale the raw damage figures by them. The results are
# stored in four new columns: numericalPROPDMGEXP, numericalPropDmg,
# numericalCROPDMGEXP and numericalCropDmg.

# Translate a vector of exponent codes into numeric powers of ten.
#   expCodes: vector of codes such as "K", "m", "5", "" or "?".
#   returns:  numeric vector of the same length as expCodes. Letter codes map
#             to H = 2 (hundred), K = 3 (kilo), M = 6 (million) and
#             B = 9 (billion); strings made only of digits map to their own
#             value; everything else (blank, symbols, NA) maps to 0.
damageExponent <- function(expCodes) {
  # Letters are matched case-insensitively: with a case-sensitive comparison,
  # lower-case codes such as "k" or "h" silently end up with exponent 0.
  codes <- toupper(as.character(expCodes))
  letterPowers <- c(H = 2, K = 3, M = 6, B = 9)

  powers <- rep(0, length(codes))

  # %in% never yields NA, so missing codes simply keep the default of 0.
  isLetter <- codes %in% names(letterPowers)
  powers[isLetter] <- unname(letterPowers[codes[isLetter]])

  # A code that is itself a number is used directly as the exponent. An
  # explicit pattern test replaces the coerce-and-suppress-warnings check.
  isDigit <- grepl("^[0-9]+$", codes)
  powers[isDigit] <- as.numeric(codes[isDigit])

  powers
}

# Property damage: exponent column, then the scaled damage amount.
data$numericalPROPDMGEXP <- damageExponent(data$PROPDMGEXP)
data$numericalPropDmg <- data$PROPDMG * 10^data$numericalPROPDMGEXP

# Crop damage: same treatment as property damage.
data$numericalCROPDMGEXP <- damageExponent(data$CROPDMGEXP)
data$numericalCropDmg <- data$CROPDMG * 10^data$numericalCROPDMGEXP
# Parse the event start date/time from its "month/day/year hour:min:sec" text
# form. strptime() returns POSIXlt, a list-based class that is awkward as a
# data-frame column, so the result is converted to POSIXct before being stored.
data$BGN_DATE <- as.POSIXct(strptime(data$BGN_DATE, "%m/%d/%Y %H:%M:%S"))
# Bar chart of total fatalities per event type (top 20 event types only).
agg <- aggregate(data$FATALITIES, by = list(data$EVTYPE), FUN = sum)
# Sort by descending total and keep at most 20 rows. head() is used instead of
# [1:20, ] so that no all-NA rows are added when the data contain fewer than
# 20 distinct event types.
agg <- head(agg[order(-agg$x), ], 20)
# Fix the factor levels in sorted order so the bars are drawn in descending
# order rather than alphabetically.
agg$Group.1 <- factor(agg$Group.1, levels = agg$Group.1)
# `las` is a base-graphics parameter that geom_bar() does not accept; the
# axis labels are rotated through theme() instead.
g1 <- ggplot(agg, aes(x = Group.1, y = x)) +
  geom_bar(stat = "identity", fill = "red") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Event Type") +
  ylab("Count") +
  ggtitle("Number of fatalities in each event type")
# Bar chart of total injuries per event type (top 20 event types only).
agg <- aggregate(data$INJURIES, by = list(data$EVTYPE), FUN = sum)
# Sort by descending total and keep at most 20 rows. head() is used instead of
# [1:20, ] so that no all-NA rows are added when the data contain fewer than
# 20 distinct event types.
agg <- head(agg[order(-agg$x), ], 20)
# Fix the factor levels in sorted order so the bars are drawn in descending
# order rather than alphabetically.
agg$Group.1 <- factor(agg$Group.1, levels = agg$Group.1)
# `las` is a base-graphics parameter that geom_bar() does not accept; the
# axis labels are rotated through theme() instead.
g2 <- ggplot(agg, aes(x = Group.1, y = x)) +
  geom_bar(stat = "identity", fill = "red") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Event Type") +
  ylab("Count") +
  ggtitle("Number of injures in each event type")
# Draw the fatalities chart above the injuries chart in a single column.
grid.arrange(g1, g2, nrow = 2, ncol = 1)
# Bar chart of total property damage per event type (top 20 event types only).
# The grouping variable must be the EVTYPE column itself. Wrapping it in
# `?`() invokes the help operator and raises an error, which leaves `agg`
# holding the totals from the previous plot.
agg <- aggregate(data$numericalPropDmg, by = list(data$EVTYPE), FUN = sum)
# Sort by descending total and keep at most 20 rows. head() is used instead of
# [1:20, ] so that no all-NA rows are added when the data contain fewer than
# 20 distinct event types.
agg <- head(agg[order(-agg$x), ], 20)
# Fix the factor levels in sorted order so the bars are drawn in descending
# order rather than alphabetically.
agg$Group.1 <- factor(agg$Group.1, levels = agg$Group.1)
# `las` is a base-graphics parameter that geom_bar() does not accept; the
# axis labels are rotated through theme() instead.
g1 <- ggplot(agg, aes(x = Group.1, y = x)) +
  geom_bar(stat = "identity", fill = "red") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Event Type") +
  ylab("Damage") +
  ggtitle("Amount of property damage in each event type")
# Bar chart of total crop damage per event type (top 20 event types only).
agg <- aggregate(data$numericalCropDmg, by = list(data$EVTYPE), FUN = sum)
# Sort by descending total and keep at most 20 rows. head() is used instead of
# [1:20, ] so that no all-NA rows are added when the data contain fewer than
# 20 distinct event types.
agg <- head(agg[order(-agg$x), ], 20)
# Fix the factor levels in sorted order so the bars are drawn in descending
# order rather than alphabetically.
agg$Group.1 <- factor(agg$Group.1, levels = agg$Group.1)
# `las` is a base-graphics parameter that geom_bar() does not accept; the
# axis labels are rotated through theme() instead.
g2 <- ggplot(agg, aes(x = Group.1, y = x)) +
  geom_bar(stat = "identity", fill = "red") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Event Type") +
  ylab("Damage") +
  ggtitle("Amount of crop damage in each event type")
# Draw the two damage charts stacked in a single column. For two plots this is
# the layout grid.arrange() picks by default; it is only spelled out here.
grid.arrange(g1, g2, ncol = 1)