library("ggplot2")
library("gridExtra")
## Warning: package 'gridExtra' was built under R version 3.5.3
library("R.utils")
## Warning: package 'R.utils' was built under R version 3.5.3
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.7.1 (2016-02-15) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.22.0 (2018-04-21) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
## The following objects are masked from 'package:base':
##
## attach, detach, gc, load, save
## R.utils v2.8.0 successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
## The following object is masked from 'package:utils':
##
## timestamp
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, inherits, isOpen, parse, warnings
# Read the storm data only if it is not already in the session. Use cache=TRUE
# on this code block so the large CSV is not re-read on every run.
if (!exists("stormData")) {
  # Extract the archive only when the CSV has not been extracted yet.
  # R.utils::bunzip2() removes the archive after extraction by default, so the
  # CSV must be usable on later runs even when the .bz2 is no longer present.
  if (!file.exists("repdata_data_StormData.csv") &&
      file.exists("repdata_data_StormData.csv.bz2")) {
    bunzip2("repdata_data_StormData.csv.bz2", overwrite = FALSE)
  }
  # Stop with a clear message instead of silently leaving stormData undefined.
  if (!file.exists("repdata_data_StormData.csv")) {
    stop("Storm data not found: expected repdata_data_StormData.csv or ",
         "repdata_data_StormData.csv.bz2 in the working directory.",
         call. = FALSE)
  }
  # Read data into the variable called stormData
  stormData <- read.csv("repdata_data_StormData.csv", sep = ",")
}
Take a quick look at the data available.
# Re-use the data frame when it is already in the session; parsing the CSV a
# second time is slow and defeats the exists() guard used when loading it.
if (!exists("stormData")) {
  stormData <- read.csv("repdata_data_StormData.csv", sep = ",")
}
# Per-column overview of the raw data
summary(stormData)
## STATE__ BGN_DATE BGN_TIME
## Min. : 1.0 5/25/2011 0:00:00: 1202 12:00:00 AM: 10163
## 1st Qu.:19.0 4/27/2011 0:00:00: 1193 06:00:00 PM: 7350
## Median :30.0 6/9/2011 0:00:00 : 1030 04:00:00 PM: 7261
## Mean :31.2 5/30/2004 0:00:00: 1016 05:00:00 PM: 6891
## 3rd Qu.:45.0 4/4/2011 0:00:00 : 1009 12:00:00 PM: 6703
## Max. :95.0 4/2/2006 0:00:00 : 981 03:00:00 PM: 6700
## (Other) :895866 (Other) :857229
## TIME_ZONE COUNTY COUNTYNAME STATE
## CST :547493 Min. : 0.0 JEFFERSON : 7840 TX : 83728
## EST :245558 1st Qu.: 31.0 WASHINGTON: 7603 KS : 53440
## MST : 68390 Median : 75.0 JACKSON : 6660 OK : 46802
## PST : 28302 Mean :100.6 FRANKLIN : 6256 MO : 35648
## AST : 6360 3rd Qu.:131.0 LINCOLN : 5937 IA : 31069
## HST : 2563 Max. :873.0 MADISON : 5632 NE : 30271
## (Other): 3631 (Other) :862369 (Other):621339
## EVTYPE BGN_RANGE BGN_AZI
## HAIL :288661 Min. : 0.000 :547332
## TSTM WIND :219940 1st Qu.: 0.000 N : 86752
## THUNDERSTORM WIND: 82563 Median : 0.000 W : 38446
## TORNADO : 60652 Mean : 1.484 S : 37558
## FLASH FLOOD : 54277 3rd Qu.: 1.000 E : 33178
## FLOOD : 25326 Max. :3749.000 NW : 24041
## (Other) :170878 (Other):134990
## BGN_LOCATI END_DATE END_TIME
## :287743 :243411 :238978
## COUNTYWIDE : 19680 4/27/2011 0:00:00: 1214 06:00:00 PM: 9802
## Countywide : 993 5/25/2011 0:00:00: 1196 05:00:00 PM: 8314
## SPRINGFIELD : 843 6/9/2011 0:00:00 : 1021 04:00:00 PM: 8104
## SOUTH PORTION: 810 4/4/2011 0:00:00 : 1007 12:00:00 PM: 7483
## NORTH PORTION: 784 5/30/2004 0:00:00: 998 11:59:00 PM: 7184
## (Other) :591444 (Other) :653450 (Other) :622432
## COUNTY_END COUNTYENDN END_RANGE END_AZI
## Min. :0 Mode:logical Min. : 0.0000 :724837
## 1st Qu.:0 NA's:902297 1st Qu.: 0.0000 N : 28082
## Median :0 Median : 0.0000 S : 22510
## Mean :0 Mean : 0.9862 W : 20119
## 3rd Qu.:0 3rd Qu.: 0.0000 E : 20047
## Max. :0 Max. :925.0000 NE : 14606
## (Other): 72096
## END_LOCATI LENGTH WIDTH
## :499225 Min. : 0.0000 Min. : 0.000
## COUNTYWIDE : 19731 1st Qu.: 0.0000 1st Qu.: 0.000
## SOUTH PORTION : 833 Median : 0.0000 Median : 0.000
## NORTH PORTION : 780 Mean : 0.2301 Mean : 7.503
## CENTRAL PORTION: 617 3rd Qu.: 0.0000 3rd Qu.: 0.000
## SPRINGFIELD : 575 Max. :2315.0000 Max. :4400.000
## (Other) :380536
## F MAG FATALITIES INJURIES
## Min. :0.0 Min. : 0.0 Min. : 0.0000 Min. : 0.0000
## 1st Qu.:0.0 1st Qu.: 0.0 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median :1.0 Median : 50.0 Median : 0.0000 Median : 0.0000
## Mean :0.9 Mean : 46.9 Mean : 0.0168 Mean : 0.1557
## 3rd Qu.:1.0 3rd Qu.: 75.0 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :5.0 Max. :22000.0 Max. :583.0000 Max. :1700.0000
## NA's :843563
## PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## Min. : 0.00 :465934 Min. : 0.000 :618413
## 1st Qu.: 0.00 K :424665 1st Qu.: 0.000 K :281832
## Median : 0.00 M : 11330 Median : 0.000 M : 1994
## Mean : 12.06 0 : 216 Mean : 1.527 k : 21
## 3rd Qu.: 0.50 B : 40 3rd Qu.: 0.000 0 : 19
## Max. :5000.00 5 : 28 Max. :990.000 B : 9
## (Other): 84 (Other): 9
## WFO STATEOFFIC
## :142069 :248769
## OUN : 17393 TEXAS, North : 12193
## JAN : 13889 ARKANSAS, Central and North Central: 11738
## LWX : 13174 IOWA, Central : 11345
## PHI : 12551 KANSAS, Southwest : 11212
## TSA : 12483 GEORGIA, North and Central : 11120
## (Other):690738 (Other) :595920
## ZONENAMES
## :594029
## :205988
## GREATER RENO / CARSON CITY / M - GREATER RENO / CARSON CITY / M : 639
## GREATER LAKE TAHOE AREA - GREATER LAKE TAHOE AREA : 592
## JEFFERSON - JEFFERSON : 303
## MADISON - MADISON : 302
## (Other) :100444
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_
## Min. : 0 Min. :-14451 Min. : 0 Min. :-14455
## 1st Qu.:2802 1st Qu.: 7247 1st Qu.: 0 1st Qu.: 0
## Median :3540 Median : 8707 Median : 0 Median : 0
## Mean :2875 Mean : 6940 Mean :1452 Mean : 3509
## 3rd Qu.:4019 3rd Qu.: 9605 3rd Qu.:3549 3rd Qu.: 8735
## Max. :9706 Max. : 17124 Max. :9706 Max. :106220
## NA's :47 NA's :40
## REMARKS REFNUM
## :287433 Min. : 1
## : 24013 1st Qu.:225575
## Trees down.\n : 1110 Median :451149
## Several trees were blown down.\n : 569 Mean :451149
## Trees were downed.\n : 446 3rd Qu.:676723
## Large trees and power lines were blown down.\n: 432 Max. :902297
## (Other) :588294
Data Processing This will prepare the required data to present the most harmful events with respect to population health.
# Trim the data set to the columns needed for the analysis
stormEvent <- stormData[, c("BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES",
                            "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")]

# Question 1: events that harmed people.
# A row is relevant when it reports fatalities OR injuries. Requiring both
# would drop, for example, fatal events without injuries and under-count the
# per-event totals.
eventHealth <- subset(stormEvent, FATALITIES != 0 | INJURIES != 0,
                      select = c(EVTYPE, FATALITIES, INJURIES))

# Question 2: events that caused economic damage.
# Same reasoning: keep rows with property damage OR crop damage.
eventEconomic <- subset(stormEvent, PROPDMG != 0 | CROPDMG != 0,
                        select = c(EVTYPE, PROPDMG, PROPDMGEXP,
                                   CROPDMG, CROPDMGEXP))

# Total fatalities per event type
eventHealth_Death <- aggregate(eventHealth$FATALITIES,
                               by = list(eventHealth$EVTYPE), FUN = sum)
colnames(eventHealth_Death) <- c("EVENTTYPE", "FATALITIES")

# Total injuries per event type
eventHealth_Inj <- aggregate(eventHealth$INJURIES,
                             by = list(eventHealth$EVTYPE), FUN = sum)
colnames(eventHealth_Inj) <- c("EVENTTYPE", "INJURIES")

# Rank both data sets and keep the top 5 event types. head() returns fewer
# rows when fewer than 5 event types exist, whereas [1:5, ] would pad with
# NA rows.
eventHealth_Death <- head(
  eventHealth_Death[order(eventHealth_Death$FATALITIES, decreasing = TRUE), ],
  5
)
eventHealth_Inj <- head(
  eventHealth_Inj[order(eventHealth_Inj$INJURIES, decreasing = TRUE), ],
  5
)
Results Show the top 5 major causes of both fatalities and injuries.
# Plot the top 5 event types for fatalities and for injuries.
# Bars are ordered from highest to lowest count (reorder() on the negated
# value) so the ranking is readable; each bar gets its own fill colour and the
# legend is suppressed because the x axis already names the event types.

# Fatalities plot, stored in Death_plot
Death_plot <- ggplot(
  eventHealth_Death,
  aes(x = reorder(EVENTTYPE, -FATALITIES), y = FATALITIES, fill = EVENTTYPE)
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  xlab("Harmful Events") +
  ylab("No. of fatalities") +
  ggtitle("Top 5 weather events causing fatalities") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Injuries plot, stored in Inj_plot
Inj_plot <- ggplot(
  eventHealth_Inj,
  aes(x = reorder(EVENTTYPE, -INJURIES), y = INJURIES, fill = EVENTTYPE)
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  xlab("Harmful Events") +
  ylab("No. of Injuries") +
  ggtitle("Top 5 weather events causing Injuries") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Draw the two plots side by side (two columns)
grid.arrange(Death_plot, Inj_plot, ncol = 2)
Data Processing This will prepare the required data to present the most harmful events with respect to the economic damages.
# Convert the economic damage figures to plain numbers.
# Multiplier for each recognised magnitude code (K = 1e3, M = 1e6, B = 1e9);
# codes are matched case-insensitively.
exp_multiplier <- c(K = 1e3, M = 1e6, B = 1e9)

# Returns amount * multiplier for each element. A zero amount yields 0
# whatever its code is; a non-zero amount with an unrecognised code yields NA.
damage_value <- function(amount, exp_code) {
  multiplier <- unname(exp_multiplier[toupper(as.character(exp_code))])
  ifelse(amount == 0, 0, amount * multiplier)
}

prop_value <- damage_value(eventEconomic$PROPDMG, eventEconomic$PROPDMGEXP)
crop_value <- damage_value(eventEconomic$CROPDMG, eventEconomic$CROPDMGEXP)

# Keep only rows where every non-zero damage figure has a recognised code
usable <- !is.na(prop_value) & !is.na(crop_value)
eventEconomic <- eventEconomic[usable, ]
eventEconomic$TOTALDMG <- crop_value[usable] + prop_value[usable]

# Then sum the damages by each event type
eventEconomic <- aggregate(eventEconomic$TOTALDMG,
                           by = list(eventEconomic$EVTYPE), FUN = sum)
colnames(eventEconomic) <- c("EVTYPE", "TOTALDMG")

# Rank the event types by highest damage cost and keep the top 5 rows.
# head() does not pad with NA rows when fewer than 5 event types remain.
eventEconomic <- head(
  eventEconomic[order(eventEconomic$TOTALDMG, decreasing = TRUE), ],
  5
)

# Now plot the graph, bars ordered from highest to lowest total damage
ggplot(
  eventEconomic,
  aes(x = reorder(EVTYPE, -TOTALDMG), y = TOTALDMG, fill = EVTYPE)
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
  xlab("Event Type") +
  ylab("Total Damage")