This document is for the Coursera course: Reproducible Research Project 2.
After setting up the R environment and loading the raw data, some preprocessing was done to prepare for exploratory plotting. Two questions were then addressed via plotting:
library(dplyr)
library(stringr)
library(ggplot2)
library(reshape2)
# Read the bz2-compressed storm CSV; read.csv() is handed the compressed
# file path directly.
data <- read.csv(file = "repdata_data_StormData.csv.bz2")
# Print the column names, types and first few values of the loaded table.
str(object = data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
# Lower-case the event labels in place so that labels differing only in
# capitalisation are treated as the same event type from here on.
data$EVTYPE <- str_to_lower(data$EVTYPE)
# Frequency of every distinct label, most frequent first. EVTYPE is already
# lower-cased above, so it is tabulated directly.
word_counts <- sort(table(data$EVTYPE), decreasing = TRUE)
# Show the 50 most common labels to pick the keywords used for grouping.
head(word_counts, n = 50)
##
## hail tstm wind thunderstorm wind
## 288661 219942 82564
## tornado flash flood flood
## 60652 54277 25327
## thunderstorm winds high wind lightning
## 20843 20214 15754
## heavy snow heavy rain winter storm
## 15708 11742 11433
## winter weather funnel cloud marine tstm wind
## 7045 6844 6175
## marine thunderstorm wind waterspout strong wind
## 5812 3796 3569
## urban/sml stream fld wildfire blizzard
## 3392 2761 2719
## drought ice storm excessive heat
## 2488 2006 1678
## high winds wild/forest fire frost/freeze
## 1533 1457 1343
## dense fog winter weather/mix tstm wind/hail
## 1293 1104 1028
## extreme cold/wind chill heat high surf
## 1002 767 734
## tropical storm flash flooding extreme cold
## 690 682 657
## coastal flood lake-effect snow flood/flash flood
## 656 636 625
## snow landslide cold/wind chill
## 617 600 539
## fog rip current marine hail
## 538 470 442
## dust storm avalanche wind
## 427 386 346
## rip currents storm surge
## 304 261
# Keywords used to collapse the raw EVTYPE labels into a handful of
# categories. Order matters: the loop below overwrites earlier assignments,
# so a label containing several keywords ends up in the category listed
# last (e.g. "thunderstorm wind" is first tagged "wind", then "storm").
common_envs <- c(
  "hail", "wind", "tornado", "flood", "lightning", "snow", "rain", "storm"
)
# Keep the original labels for matching; lower-case them here so the
# keyword search does not depend on the case of the stored labels.
envtype_backup <- str_to_lower(data$EVTYPE)
# Everything that matches no keyword stays in the catch-all category.
data$EVTYPE <- "other"
for (env in common_envs) {
  # Substring match: any label containing the keyword is assigned to it.
  data$EVTYPE[str_detect(envtype_backup, env)] <- env
}
# Number of records per resulting category.
table(data$EVTYPE)
##
## flood hail lightning other rain snow storm tornado
## 82690 289270 15762 44018 12238 17639 124624 60700
## wind
## 255356
# Keep only the event category and the two casualty counts.
data.health <- data %>%
  select(EVTYPE, FATALITIES, INJURIES)
# Total fatalities and injuries per event category; missing counts are
# skipped when summing.
data.health.sum <- data.health %>%
  group_by(EVTYPE) %>%
  summarise(
    FATALITIES = sum(FATALITIES, na.rm = TRUE),
    INJURIES = sum(INJURIES, na.rm = TRUE)
  )
# Reshape with EVTYPE as the identifier so both totals can be drawn as
# stacked bars from a single value column.
data.health.sum <- melt(data.health.sum, id.vars = "EVTYPE")
# Lower-case both damage-exponent columns so that upper- and lower-case
# variants of the same label are merged.
exp_cols <- c("PROPDMGEXP", "CROPDMGEXP")
data[exp_cols] <- lapply(data[exp_cols], str_to_lower)
# Count how often each property-damage exponent label occurs.
table(data[["PROPDMGEXP"]])
##
## - ? + 0 1 2 3 4 5 6
## 465934 1 8 5 216 25 13 4 4 28 4
## 7 8 b h k m
## 5 1 40 7 424665 11337
# Count how often each crop-damage exponent label occurs.
table(data[["CROPDMGEXP"]])
##
## ? 0 2 b k m
## 618413 7 19 1 9 281853 1995
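Only the exponent labels k, m and b (10^3, 10^6 and 10^9) are applied; the other, messy labels are ignored, so records carrying them keep their unscaled damage value.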
# Start from the recorded damage figures; they are scaled in place below.
data.econ <- select(data, EVTYPE, PROPDMG, CROPDMG)
# Power of ten associated with each recognised exponent label.
mgexps <- c("k" = 3, "m" = 6, "b" = 9)
for (i in seq_along(mgexps)) {
  label <- names(mgexps)[i]
  multiplier <- 10^mgexps[[i]]
  # Rows whose exponent column carries this label, for each damage type.
  is_prop <- data$PROPDMGEXP == label
  is_crop <- data$CROPDMGEXP == label
  # Scale only the matching rows; rows with any other label are untouched.
  data.econ$PROPDMG[is_prop] <- data.econ$PROPDMG[is_prop] * multiplier
  data.econ$CROPDMG[is_crop] <- data.econ$CROPDMG[is_crop] * multiplier
}
# Reshape with EVTYPE as the identifier: one row per record and damage type.
data.econ <- melt(data.econ, id.vars = "EVTYPE")
head(data.econ)
## EVTYPE variable value
## 1 tornado PROPDMG 25000
## 2 tornado PROPDMG 2500
## 3 tornado PROPDMG 25000
## 4 tornado PROPDMG 2500
## 5 tornado PROPDMG 2500
## 6 tornado PROPDMG 2500
According to the following graph, tornadoes are the most harmful type of event with respect to population health.
# Stacked bars: one bar per event category, split by casualty measure
# (the `variable` column produced by melt()).
ggplot(data.health.sum, aes(x = EVTYPE, y = value, fill = variable)) +
  geom_col() +
  labs(
    x = "event type",
    y = "total number of casualties",
    title = "Total Casualties by Different Severe Weathers"
  )
According to the following graph, floods have the greatest economic consequences.
# Stacked bars: one bar per event category, split by damage type
# (the `variable` column produced by melt()); bar height is the stacked
# sum of the individual record values.
ggplot(data.econ, aes(x = EVTYPE, y = value, fill = variable)) +
  geom_col() +
  labs(
    x = "event type",
    y = "damage value",
    title = "Severe Weather Damage on Properties(PROPDMG) and Crops(CROPDMG)"
  )