1. Description

The basic goal of this assignment is to explore the NOAA Storm Database and answer some basic questions about severe weather events. You must use the database to answer the questions below and show the code for your entire analysis. Your analysis can consist of tables, figures, or other summaries. You may use any R package you want to support your analysis.

2. Data Processing

2.1 Download and load data set

## Download the compressed NOAA storm data (once) and load it into `data`.
data_dir <- "./project2_data"
data_file <- file.path(data_dir, "StormData.bz2")
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

if (!dir.exists(data_dir)) {
  dir.create(data_dir)
}

## Skip the download when the archive is already on disk so that re-running
## the analysis does not fetch the file again. mode = "wb" writes the archive
## in binary mode, which keeps the .bz2 file intact on Windows.
if (!file.exists(data_file)) {
  download.file(fileUrl, destfile = data_file, mode = "wb")
}

## read.csv() reads through the bzfile() connection to the archive.
data <- read.csv(bzfile(data_file))

2.2 Check data basics

## Number of rows and columns in the loaded data set
dim(data)
## [1] 902297     37
## Names of all columns
colnames(data)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
## First ten row names
rownames(data)[1:10]
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
## Check which column(s) contain NA values.
## vapply() tests each column in place; apply() would first coerce the whole
## data frame to a matrix, which is wasteful for a table of this size.
list_na <- colnames(data)[vapply(data, anyNA, logical(1))]
list_na
## [1] "COUNTYENDN" "F"          "LATITUDE"   "LATITUDE_E"
## Peek at the first ten entries of the EVTYPE frequency table
table(data$EVTYPE)[1:10]
## 
##    HIGH SURF ADVISORY         COASTAL FLOOD           FLASH FLOOD 
##                     1                     1                     1 
##             LIGHTNING             TSTM WIND       TSTM WIND (G45) 
##                     1                     4                     1 
##            WATERSPOUT                  WIND                     ? 
##                     1                     1                     1 
##       ABNORMAL WARMTH 
##                     4

2.3 Subset meaningful columns

## Keep only the event type plus the health- and damage-related columns.
data_sub <- data[c("EVTYPE",
                   "FATALITIES", "INJURIES",
                   "PROPDMG", "PROPDMGEXP",
                   "CROPDMG", "CROPDMGEXP")]

## Retain rows whose event type is not "?" and that report at least one
## positive value among fatalities, injuries, property damage or crop damage.
data_sub <- data_sub[
  with(data_sub,
       EVTYPE != "?" &
         (FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0)),
]


## Frequency of each property-damage exponent code in the subset
table(data_sub$PROPDMGEXP)
## 
##             -      ?      +      0      1      2      3      4      5      6 
##  11585      1      0      5    210      0      1      1      4     18      3 
##      7      8      B      h      H      K      m      M 
##      3      0     40      1      6 231427      7  11320
## Frequency of each crop-damage exponent code in the subset
table(data_sub$CROPDMGEXP)
## 
##             ?      0      2      B      k      K      m      M 
## 152663      6     17      0      7     21  99932      1   1985

2.4 Transform the exponent codes for cropdmg and propdmg into numeric multipliers

## Translate the CROPDMGEXP codes into numeric multipliers via a lookup table.
## Only CROPDMGEXP is converted here; PROPDMGEXP is left untouched.
crop_multiplier <- c("2" = 10^2,
                     "B" = 10^9,
                     "k" = 10^3, "K" = 10^3,
                     "m" = 10^6, "M" = 10^6)

## as.character() matters: if the column is a factor, the lookup must use the
## labels rather than the underlying integer codes.
crop_exp_idx <- match(as.character(data_sub$CROPDMGEXP), names(crop_multiplier))
crop_exp_value <- unname(crop_multiplier)[crop_exp_idx]

## Any code that is not in the table (for example "", "?" or "0") gets a
## multiplier of 0; a missing code is treated the same way.
crop_exp_value[is.na(crop_exp_value)] <- 0
data_sub$CROPDMGEXP <- crop_exp_value

## add a column to calculate total crop damage loss
data_sub$CROPLOSS <- data_sub$CROPDMG * data_sub$CROPDMGEXP

3. Results

3.1 Calculate the top 10 event types (EVTYPE) by fatalities

## Sum fatalities per event type and sort from the largest total down.
fatality_data <- aggregate(FATALITIES ~ EVTYPE, data = data_sub, FUN = sum)
fatality_data <- fatality_data[order(fatality_data$FATALITIES, decreasing = TRUE), ]
## head() returns at most ten rows, so no all-NA padding rows appear when
## fewer than ten event types are present.
fatality_data_top10 <- head(fatality_data, 10)
library(ggplot2)
## Bar chart of the top ten, with bars ordered by descending fatalities.
## The y-axis label must be passed as `y =`; an unnamed argument to labs()
## is not applied to any axis.
ggplot(fatality_data_top10, aes(reorder(EVTYPE, -FATALITIES), FATALITIES)) +
  geom_bar(stat = "identity", aes(fill = EVTYPE)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Top 10 kills type", x = "EVTYPE", y = "People killed")

3.2 Calculate the top 10 event types (EVTYPE) by crop loss

## Sum the crop loss per event type and sort from the largest total down.
crop_data <- aggregate(CROPLOSS ~ EVTYPE, data = data_sub, FUN = sum)
crop_data <- crop_data[order(crop_data$CROPLOSS, decreasing = TRUE), ]
## head() returns at most ten rows, so no all-NA padding rows appear when
## fewer than ten event types are present.
crop_data_top10 <- head(crop_data, 10)

## Bar chart of the top ten, with bars ordered by descending crop loss.
ggplot(crop_data_top10, aes(reorder(EVTYPE, -CROPLOSS), CROPLOSS)) +
  geom_bar(stat = "identity", aes(fill = EVTYPE)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Top 10 crop loss type", x = "EVTYPE", y = "Loss in U.S. dollars")