Load Packages
library(ggplot2)
library(plyr)
Read in data
# Read the bzip2-compressed storm CSV into `data` through an explicit
# bzfile connection.
data <- read.csv(
  file = bzfile(description = "repdata-data-StormData.csv.bz2")
)
Inspect the data and process it so that it is in the correct format for later analysis
# Print a compact overview of the data frame: its dimensions plus each
# column's type and first few values.
str(object = data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
# Parse the BGN_DATE strings (month/day/year hour:minute:second) into
# Date values, replacing the character column in place.
data[["BGN_DATE"]] <- as.Date(
  data[["BGN_DATE"]],
  format = "%m/%d/%Y %H:%M:%S"
)
Identifying factors related to health: Fatalities & Injuries, then grouping them by event.
# Total fatalities plus injuries for every event type.
most_harmful_event <- aggregate(
  FATALITIES + INJURIES ~ EVTYPE,
  data = data,
  FUN = sum
)
# Keep the six event types with the largest totals; column 2 of the
# aggregate result holds the summed `FATALITIES + INJURIES` value.
top_most_harmful_event <- head(
  most_harmful_event[order(-most_harmful_event[[2L]]), ],
  n = 6L
)
colnames(top_most_harmful_event) <- c("Event", "Count")
Plot to show the processed data.
# Bar chart of the combined fatality and injury count for the top events.
ggplot(data = top_most_harmful_event, mapping = aes(x = Event, y = Count)) +
  geom_col(fill = "dark blue") +
  ggtitle("Most Harmful Events to Health")
In conclusion, it can be seen that TORNADO, with a total of $9.6979 \times 10^{4}$ injuries and fatalities, is the event most harmful to health.
Looking through the data, property damage and crop damage are the variables related to economic consequences.
Process the data so that the aforementioned variables are normalized, i.e. each damage amount is multiplied by the factor encoded in its exponent column.
# List the distinct PROPDMGEXP codes that need a multiplier.
unique(data[["PROPDMGEXP"]])
## [1] "K" "M" "" "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-" "1" "8"
# Translate each PROPDMGEXP code into the multiplier applied to PROPDMG.
# Letter codes are treated the same in either case (h/H = 1e2, K = 1e3,
# m/M = 1e6, B = 1e9), digit codes are read as powers of ten, and the
# blank, "+", "-" and "?" codes leave the amount unscaled (multiplier 1).
# "h" must share the 1e2 multiplier of "H"; mapping it to 1 would
# under-scale those rows by a factor of 100.
tempPROPDMGEXP <- mapvalues(
  data$PROPDMGEXP,
  from = c("K", "M", "", "B", "m", "+", "0", "5", "6", "?",
           "4", "2", "3", "h", "7", "H", "-", "1", "8"),
  to = c(1e3, 1e6, 1, 1e9, 1e6, 1, 1, 1e5, 1e6, 1,
         1e4, 1e2, 1e3, 1e2, 1e7, 1e2, 1, 10, 1e8)
)
# List the distinct CROPDMGEXP codes that need a multiplier.
unique(data[["CROPDMGEXP"]])
## [1] "" "M" "K" "m" "B" "?" "0" "k" "2"
# Translate each CROPDMGEXP code into the multiplier applied to CROPDMG:
# k/K = 1e3, m/M = 1e6, B = 1e9, "2" = 1e2, and the blank, "?" and "0"
# codes leave the amount unscaled (multiplier 1).
tempCROPDMGEXP <- plyr::mapvalues(
  x = data[["CROPDMGEXP"]],
  from = c("", "M", "K", "m", "B", "?", "0", "k", "2"),
  to = c(1, 1e6, 1e3, 1e6, 1e9, 1, 1, 1e3, 1e2)
)
# Scale the reported property and crop damage by their multipliers; the
# mapped codes are converted to numeric before multiplying.
data[["PROPTOTALDMG"]] <- data[["PROPDMG"]] * as.numeric(tempPROPDMGEXP)
data[["CROPTOTALDMG"]] <- data[["CROPDMG"]] * as.numeric(tempCROPDMGEXP)
# Total property plus crop damage for every event type.
most_damage <- aggregate(
  PROPTOTALDMG + CROPTOTALDMG ~ EVTYPE,
  data = data,
  FUN = sum
)
# Keep the six event types with the largest totals; column 2 of the
# aggregate result holds the summed `PROPTOTALDMG + CROPTOTALDMG` value.
top_most_damage <- head(
  most_damage[order(-most_damage[[2L]]), ],
  n = 6L
)
colnames(top_most_damage) <- c("Event", "Total_Damage")
Plot to show which events have the greatest consequences to the economy.
# Bar chart of the combined property and crop damage for the top events.
ggplot(data = top_most_damage, mapping = aes(x = Event, y = Total_Damage)) +
  geom_col(fill = "dark blue") +
  ggtitle("Most Damage to the Economy by Event")
In conclusion, it can be seen that FLOOD, with a total of $1.5031968 \times 10^{11}$ in crop and property damage, is the event most harmful to the economy.