1. Description
The basic goal of this assignment is to explore the NOAA Storm Database and answer some basic questions about severe weather events. You must use the database to answer the questions below and show the code for your entire analysis. Your analysis can consist of tables, figures, or other summaries. You may use any R package you want to support your analysis.
2. Data Processing
2.1 Download and load data set
# Create the local data directory on first run.
if (!file.exists("./project2_data")) {
  dir.create("./project2_data")
}
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# Only fetch the archive when it is not already on disk, so re-running the
# analysis does not download it again. mode = "wb" explicitly requests a
# binary transfer for the compressed file.
if (!file.exists("./project2_data/StormData.bz2")) {
  download.file(fileUrl, destfile = "./project2_data/StormData.bz2", mode = "wb")
}
# read.csv() reads straight from the bzip2-compressed connection.
data <- read.csv(bzfile("./project2_data/StormData.bz2"))
2.2 Check data basics
# Number of rows and columns in the raw data set.
dim(data)
## [1] 902297 37
# Column names available for the analysis.
colnames(data)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# First ten row names.
rownames(data)[1:10]
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
## check which column(s) contains NA value
# vapply() tests each column in place; apply() would first coerce the whole
# data frame to a matrix before checking it.
list_na <- colnames(data)[vapply(data, anyNA, logical(1))]
list_na
## [1] "COUNTYENDN" "F" "LATITUDE" "LATITUDE_E"
## check vector of EVTYPE
# Counts for the first ten EVTYPE values in the frequency table.
table(data$EVTYPE)[1:10]
##
## HIGH SURF ADVISORY COASTAL FLOOD FLASH FLOOD
## 1 1 1
## LIGHTNING TSTM WIND TSTM WIND (G45)
## 1 4 1
## WATERSPOUT WIND ?
## 1 1 1
## ABNORMAL WARMTH
## 4
2.3 Subset meaningful columns
# Keep only the event type plus the health and damage columns.
data_sub <- data[, c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)]
## Keep only rows with a known event type where fatalities, injuries,
## property damage or crop damage are above 0.
data_sub <- data_sub[
  with(
    data_sub,
    EVTYPE != "?" &
      (FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0)
  ),
]
# Frequency of each property-damage exponent code in the subset.
table(data_sub$PROPDMGEXP)
##
## - ? + 0 1 2 3 4 5 6
## 11585 1 0 5 210 0 1 1 4 18 3
## 7 8 B h H K m M
## 3 0 40 1 6 231427 7 11320
# Frequency of each crop-damage exponent code in the subset.
table(data_sub$CROPDMGEXP)
##
## ? 0 2 B k K m M
## 152663 6 17 0 7 21 99932 1 1985
3. Results
3.1 Top 10 event types (EVTYPE) by total fatalities
# Sum fatalities per event type and sort from most to fewest.
fatality_data <- aggregate(FATALITIES ~ EVTYPE, data = data_sub, sum)
fatality_data <- fatality_data[order(fatality_data$FATALITIES, decreasing = TRUE), ]
# head() returns at most 10 rows and does not pad with NA rows when fewer
# than 10 event types are present.
fatality_data_top10 <- head(fatality_data, 10)
library(ggplot2)
# Bar chart of the top 10 event types, ordered by descending fatalities.
# The y-axis label must be passed as `y =`; an unnamed argument to labs() is
# not applied as the axis title.
ggplot(fatality_data_top10, aes(reorder(EVTYPE, -FATALITIES), FATALITIES)) +
  geom_bar(stat = "identity", aes(fill = EVTYPE)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Top 10 kills type", x = "EVTYPE", y = "People killed")

3.2 Top 10 event types (EVTYPE) by total crop loss
# aggregate() below needs a CROPLOSS column. Derive it from CROPDMG and its
# exponent code when data_sub does not already carry one.
# NOTE(review): the multiplier table assumes H/K/M/B (either case) mean
# hundreds/thousands/millions/billions and that every other code (blank, "?",
# digits) leaves CROPDMG unscaled -- confirm against the NOAA documentation.
if (!"CROPLOSS" %in% names(data_sub)) {
  crop_multiplier <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  # as.character() guards against CROPDMGEXP being a factor.
  crop_scale <- unname(
    crop_multiplier[toupper(as.character(data_sub$CROPDMGEXP))]
  )
  # Codes missing from the table index to NA; treat them as a multiplier of 1.
  crop_scale[is.na(crop_scale)] <- 1
  data_sub$CROPLOSS <- data_sub$CROPDMG * crop_scale
}
# Sum crop loss per event type and sort from largest to smallest.
crop_data <- aggregate(CROPLOSS ~ EVTYPE, data = data_sub, sum)
crop_data <- crop_data[order(crop_data$CROPLOSS, decreasing = TRUE), ]
# head() returns at most 10 rows and does not pad with NA rows.
crop_data_top10 <- head(crop_data, 10)
# Bar chart of the top 10 event types, ordered by descending crop loss.
ggplot(crop_data_top10, aes(reorder(EVTYPE, -CROPLOSS), CROPLOSS)) +
  geom_bar(stat = "identity", aes(fill = EVTYPE)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Top 10 crop loss type", x = "EVTYPE", y = "Loss in U.S. dollars")
