Synopsis

The National Weather Service storm data is analyzed to determine the types of storm events that have the most significant consequences. The data shows that:

  1. The largest source of property damage in the United States is tornadoes, followed by thunderstorm winds and flash floods.

  2. The highest contributor to injuries in the United States is tornadoes, followed by thunderstorm winds and excessive heat.

  3. The highest source of fatalities from weather events is tornadoes. The second-highest source of fatalities is excessive heat, followed by flash floods.

The steps for consolidation and analysis are described below.

The storm data is described in National Weather Service Instruction 10-1605. The events in the database start in the year 1950 and end in November 2011. Fatalities, injuries, and property damage (in dollars) are totalled over that time.

Data Processing

The following is the reading and initialization of the dataset:

# Load the raw storm dataset from CSV; the first row holds the column names.
storm.data <- read.csv("c:/repdata.csv", header = TRUE)

Select the columns that are relevant for our purpose.

# Keep only the event type and the three impact measures analysed below.
reduced.storm.data <- storm.data[
  c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG")
]

Normalize event names by replacing HEAT with EXCESSIVE HEAT, and both TSTM WIND and THUNDERSTORM WIND with THUNDERSTORM WINDS. This is done using the gsub function, which replaces any event type that exactly matches the given pattern.

# Map spelling variants of an event type onto one canonical name.
# Each pattern is anchored with ^...$, so only whole-string matches are
# rewritten; every other event name passes through unchanged.
evtype.replacements <- c(
  "^HEAT$" = "EXCESSIVE HEAT",
  "^TSTM WIND$" = "THUNDERSTORM WINDS",
  "^THUNDERSTORM WIND$" = "THUNDERSTORM WINDS"
)
for (evtype.pattern in names(evtype.replacements)) {
  reduced.storm.data$EVTYPE <- gsub(
    evtype.pattern,
    evtype.replacements[[evtype.pattern]],
    reduced.storm.data$EVTYPE
  )
}

Aggregate the data on fatalities and find which events are the top 10 causes of fatalities. This is done using the aggregate function and then keeping the top 10 items. The list is sorted in descending order before filtering, so that the first 10 rows are the 10 largest totals.

# Total the fatalities recorded for each event type. na.rm = TRUE is forwarded
# to sum(), so missing counts are skipped rather than turning a total into NA.
aggregate.fatalities.data <- setNames(
  aggregate(
    reduced.storm.data$FATALITIES,
    by = list(reduced.storm.data$EVTYPE),
    FUN = sum,
    na.rm = TRUE
  ),
  c("event.type", "fatality.total")
)

# Sort by descending total so the deadliest event types come first.
fatalities.sorted <- aggregate.fatalities.data[
  order(-aggregate.fatalities.data$fatality.total),
]

# head() returns at most 10 rows; indexing with 1:10 would pad the result with
# all-NA rows whenever fewer than 10 event types are present.
topten.fatalities <- head(fatalities.sorted, 10)

# Pin the factor levels to the sorted order so the ranking, not alphabetical
# order, is used when this column is plotted on an axis.
topten.fatalities$event.type <- factor(
  topten.fatalities$event.type,
  levels = topten.fatalities$event.type,
  ordered = TRUE
)
topten.fatalities
##             event.type fatality.total
## 832            TORNADO           5633
## 130     EXCESSIVE HEAT           2840
## 153        FLASH FLOOD            978
## 463          LIGHTNING            816
## 784 THUNDERSTORM WINDS            701
## 170              FLOOD            470
## 584        RIP CURRENT            368
## 358          HIGH WIND            248
## 19           AVALANCHE            224
## 969       WINTER STORM            206

Data aggregation for injuries, done in the same way as the fatalities aggregation above.

# Total the injuries recorded for each event type. na.rm = TRUE is forwarded
# to sum(), so missing counts are skipped rather than turning a total into NA.
aggregate.injuries.data <- setNames(
  aggregate(
    reduced.storm.data$INJURIES,
    by = list(reduced.storm.data$EVTYPE),
    FUN = sum,
    na.rm = TRUE
  ),
  c("event.type", "injury.total")
)

# Sort by descending total so the most injurious event types come first.
injuries.sorted <- aggregate.injuries.data[
  order(-aggregate.injuries.data$injury.total),
]

# head() returns at most 10 rows; indexing with 1:10 would pad the result with
# all-NA rows whenever fewer than 10 event types are present.
topten.injuries <- head(injuries.sorted, 10)

# Pin the factor levels to the sorted order so the ranking, not alphabetical
# order, is used when this column is plotted on an axis.
topten.injuries$event.type <- factor(
  topten.injuries$event.type,
  levels = topten.injuries$event.type,
  ordered = TRUE
)
topten.injuries
##             event.type injury.total
## 832            TORNADO        91346
## 784 THUNDERSTORM WINDS         9353
## 130     EXCESSIVE HEAT         8625
## 170              FLOOD         6789
## 463          LIGHTNING         5230
## 426          ICE STORM         1975
## 153        FLASH FLOOD         1777
## 244               HAIL         1361
## 969       WINTER STORM         1321
## 410  HURRICANE/TYPHOON         1275

Data aggregation for property damage, done in the same way as the fatalities and injuries aggregations above.

# Total the PROPDMG values recorded for each event type. na.rm = TRUE is
# forwarded to sum(), so missing values are skipped.
# NOTE(review): PROPDMG is summed exactly as stored. If the dataset carries a
# separate magnitude/exponent column for property damage (none is selected
# here), these totals are not dollar amounts -- confirm against the data docs.
aggregate.prop.dmg.data <- setNames(
  aggregate(
    reduced.storm.data$PROPDMG,
    by = list(reduced.storm.data$EVTYPE),
    FUN = sum,
    na.rm = TRUE
  ),
  c("event.type", "prop.dmg.total")
)

# Sort by descending total so the most damaging event types come first.
prop.dmg.sorted <- aggregate.prop.dmg.data[
  order(-aggregate.prop.dmg.data$prop.dmg.total),
]

# head() returns at most 10 rows; indexing with 1:10 would pad the result with
# all-NA rows whenever fewer than 10 event types are present.
topten.prop.dmg <- head(prop.dmg.sorted, 10)

# Pin the factor levels to the sorted order so the ranking, not alphabetical
# order, is used when this column is plotted on an axis.
topten.prop.dmg$event.type <- factor(
  topten.prop.dmg$event.type,
  levels = topten.prop.dmg$event.type,
  ordered = TRUE
)
topten.prop.dmg
##             event.type prop.dmg.total
## 832            TORNADO     3212258.16
## 784 THUNDERSTORM WINDS     2659102.96
## 153        FLASH FLOOD     1420124.59
## 170              FLOOD      899938.48
## 244               HAIL      688693.38
## 463          LIGHTNING      603351.78
## 358          HIGH WIND      324731.56
## 969       WINTER STORM      132720.59
## 309         HEAVY SNOW      122251.99
## 954           WILDFIRE       84459.34

Results

ggplot2 is used to draw the graphs. Other plotting methods can also be used.

a. Plotting the top 10 causes of fatalities.

library(ggplot2)

# Bar chart of total fatalities for the ten highest-ranked event types.
# stat = "identity" plots the precomputed totals instead of counting rows.
fatalities.plot <- ggplot(
  data = topten.fatalities,
  aes(x = event.type, y = fatality.total)
)
fatalities.plot +
  geom_bar(stat = "identity") +
  labs(
    x = "Event type",
    y = "Total fatalities",
    title = "Fatalities By Event Type"
  ) +
  # Tilt the event names so the long labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

b. Plotting the top 10 causes of injuries.

# Bar chart of total injuries for the ten highest-ranked event types.
# stat = "identity" plots the precomputed totals instead of counting rows.
injuries.plot <- ggplot(
  data = topten.injuries,
  aes(x = event.type, y = injury.total)
)
injuries.plot +
  geom_bar(stat = "identity") +
  labs(
    x = "Event type",
    y = "Total injuries",
    title = "Injuries By Event Type"
  ) +
  # Tilt the event names so the long labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

c. Plotting the top 10 causes of property damage.

# Bar chart of total property damage for the ten highest-ranked event types.
# stat = "identity" plots the precomputed totals instead of counting rows.
prop.dmg.plot <- ggplot(
  data = topten.prop.dmg,
  aes(x = event.type, y = prop.dmg.total)
)
prop.dmg.plot +
  geom_bar(stat = "identity") +
  labs(
    x = "Event type",
    y = "Total property damage",
    title = "Property Damage By Event Type"
  ) +
  # Tilt the event names so the long labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))