This is prepared for Johns Hopkins’ Reproducible Research online class Course Project Reproducible Research Peer Assessment 2

Synopsis

Data Source: Raw Data

Data Processing

1. Load data

  1. Load appropriate libraries, helper functions, and initiate parameter options
library(dplyr)
library(ggplot2)
library(knitr)
opts_chunk$set(echo=TRUE, results="hide")
  1. Load the Storm Data to local file ‘StormData.csv.bz2’
# helper function
checkDir <- function ( thedir ) {
  # if datadir not exist, create datadir
  if (dir.exists( thedir ) == FALSE) {
    dir.create( thedir )
  }
  return(thedir)
}
downloadZipFile <- function( src.file.url, dest.dir, dest.file ) {
  dest.filepath <- file.path(dest.dir, dest.file)
  if (file.exists(dest.filepath)) {
  }  else { 
    download.file( src.file.url, destfile=dest.filepath, method="libcurl", mode="wb") 
  }
}

cur.dir <- getwd()
data.dir <- checkDir(file.path(cur.dir, "mydata"))
rawdata.filename <- "StormData.csv.bz2"
downloadZipFile(src.file.url="https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", 
                dest.dir=data.dir, dest.file=rawdata.filename)
# helper function
loadbzDataset <- function( data.file ) { # load one data set at one time  
  df.data <- read.csv( bzfile( data.file, "rt"), header = TRUE)
  return (df.data)
}

rawdata <- loadbzDataset( file.path(data.dir, rawdata.filename))
names(rawdata)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
data <- distinct( select(rawdata, EVTYPE, BGN_DATE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP,CROPDMG,CROPDMGEXP))

# FYI
summary(data)
## Warning: closing unused connection 5 (D:/2015/training/_coursera/
## 5.ReProducibleResearch.Jul2015/assignment.2/RepDataAssignment2/mydata/
## StormData.csv.bz2)
##                EVTYPE                   BGN_DATE        FATALITIES      
##  TSTM WIND        :38082   4/27/2011 0:00:00:   263   Min.   :  0.0000  
##  TORNADO          :33836   6/27/1998 0:00:00:   190   1st Qu.:  0.0000  
##  HAIL             :27231   6/4/2008 0:00:00 :   183   Median :  0.0000  
##  FLASH FLOOD      :18083   5/31/1998 0:00:00:   176   Mean   :  0.0692  
##  THUNDERSTORM WIND:15330   5/18/1995 0:00:00:   174   3rd Qu.:  0.0000  
##  LIGHTNING        :12343   4/15/2011 0:00:00:   164   Max.   :583.0000  
##  (Other)          :66889   (Other)          :210644                     
##     INJURIES            PROPDMG          PROPDMGEXP        CROPDMG       
##  Min.   :   0.0000   Min.   :   0.00   K      :143575   Min.   :  0.000  
##  1st Qu.:   0.0000   1st Qu.:   0.00          : 58188   1st Qu.:  0.000  
##  Median :   0.0000   Median :   2.00   M      :  9760   Median :  0.000  
##  Mean   :   0.6449   Mean   :  38.29   0      :   148   Mean   :  5.402  
##  3rd Qu.:   0.0000   3rd Qu.:  25.00   B      :    40   3rd Qu.:  0.000  
##  Max.   :1700.0000   Max.   :5000.00   5      :    22   Max.   :990.000  
##                                        (Other):    61                    
##    CROPDMGEXP    
##         :139136  
##  K      : 70816  
##  M      :  1795  
##  k      :    16  
##  0      :    15  
##  B      :     8  
##  (Other):     8
table(rawdata$PROPDMGEXP)   
## 
##             -      ?      +      0      1      2      3      4      5 
## 465934      1      8      5    216     25     13      4      4     28 
##      6      7      8      B      h      H      K      m      M 
##      4      5      1     40      1      6 424665      7  11330
table(rawdata$CROPDMGEXP)   
## 
##             ?      0      2      B      k      K      m      M 
## 618413      7     19      1      9     21 281832      1   1994

2. Process/transform the data into a format suitable/cleaned/tidy/ideal data for my analysis

# convert code value to uppercase
data$EVTYPE <- toupper( data$EVTYPE )
data$PROPDMGEXP <- toupper( data$PROPDMGEXP)
data$CROPDMGEXP <- toupper( data$CROPDMGEXP)
data <- mutate( data, EVYEARMO=format( as.POSIXlt( data$BGN_DATE, format="%m/%d/%Y %H:%M:%S"), "%Y-%m"))   
# trim leading/ending spaces, Correction: comma, typos ...
data$EVTYPE <- gsub('^ +', '', data$EVTYPE)
data$EVTYPE <- gsub(' +$', '', data$EVTYPE)
data$EVTYPE <- gsub('  ', ' ', data$EVTYPE)
data$EVTYPE <- gsub(' , ', ',', data$EVTYPE)
data$EVTYPE <- gsub('AVALANCE', 'AVALANCHE', data$EVTYPE)  
data$EVTYPE <- gsub('COASTALFLOOD', 'COASTAL FLOOD', data$EVTYPE)  
data$EVTYPE <- gsub('COASTALSTORM', 'COASTAL STORM', data$EVTYPE)  
data$EVTYPE <- gsub('TEMPERATURES', 'TEMPERATURE', data$EVTYPE)  
data$EVTYPE <- gsub('DUST DEVEL', 'DUST DEVIL', data$EVTYPE)  
data$EVTYPE <- gsub('PRECIPATATION', 'PRECIPITATION', data$EVTYPE)
# re-get unique data
data <- distinct( select(data, EVTYPE, BGN_DATE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP,CROPDMG,CROPDMGEXP) )
# convert to numbers, K= 1000, M=million, B= billion
data$PROPDMG <- as.numeric(data$PROPDMG) * as.numeric( 
  ifelse ((data$PROPDMGEXP == "K"), 1000, 
  ifelse ((data$PROPDMGEXP == "M"), 1000000,
  ifelse ((data$PROPDMGEXP == "B"), 1000000000, 
  ifelse (is.numeric(data$PROPDMGEXP), data$PROPDMGEXP, 1)))) 
  )
data$CROPDMG <- as.numeric(data$CROPDMG) * as.numeric( 
  ifelse ((data$CROPDMGEXP =="K"), 1000, 
  ifelse ((data$CROPDMGEXP =="M"), 1000000,
  ifelse ((data$CROPDMGEXP =="B"), 1000000000, 
  ifelse (is.numeric(data$CROPDMGEXP), data$CROPDMGEXP, 1))))
  )

3. Computation

# calculate sum and max value by event types
byevtype <- group_by( data, EVTYPE)
summarize.byevtype <- summarise( byevtype, count=n(), 
      FATALITIES_SUM=sum(FATALITIES, na.rm=TRUE ), FATALITIES_MAX = max(FATALITIES, na.rm=TRUE),
      INJURIES_SUM = sum(INJURIES, na.rm=TRUE), INJURIES_MAX = max(INJURIES, na.rm=TRUE),
      PROPDMG_SUM = sum(PROPDMG, na.rm = TRUE), PROPDMG_MAX = max(PROPDMG, na.rm=TRUE),
      CROPDMG_SUM = sum(CROPDMG, na.rm=TRUE),  CROPDMG_MAX = max(CROPDMG, na.rm=TRUE)
      )
summary(summarize.byevtype)
##     EVTYPE              count        FATALITIES_SUM   FATALITIES_MAX   
##  Length:46          Min.   :   1.0   Min.   :  0.00   Min.   :  0.000  
##  Class :character   1st Qu.:  18.0   1st Qu.:  0.00   1st Qu.:  0.000  
##  Mode  :character   Median :  59.0   Median :  1.00   Median :  1.000  
##                     Mean   : 258.6   Mean   : 19.98   Mean   :  4.935  
##                     3rd Qu.: 189.0   3rd Qu.:  8.25   3rd Qu.:  2.000  
##                     Max.   :3365.0   Max.   :547.00   Max.   :158.000  
##   INJURIES_SUM     INJURIES_MAX      PROPDMG_SUM       
##  Min.   :   0.0   Min.   :   0.00   Min.   :0.000e+00  
##  1st Qu.:   0.0   1st Qu.:   0.00   1st Qu.:7.970e+04  
##  Median :   0.5   Median :   0.50   Median :3.714e+06  
##  Mean   : 153.6   Mean   :  34.57   Mean   :4.070e+08  
##  3rd Qu.:  11.0   3rd Qu.:   5.50   3rd Qu.:2.501e+07  
##  Max.   :5478.0   Max.   :1150.00   Max.   :9.417e+09  
##   PROPDMG_MAX         CROPDMG_SUM         CROPDMG_MAX      
##  Min.   :0.000e+00   Min.   :        0   Min.   :       0  
##  1st Qu.:5.125e+04   1st Qu.:        0   1st Qu.:       0  
##  Median :1.250e+06   Median :        0   Median :       0  
##  Mean   :1.218e+08   Mean   : 13235065   Mean   : 5374935  
##  3rd Qu.:1.180e+07   3rd Qu.:  9450250   3rd Qu.: 4500000  
##  Max.   :2.800e+09   Max.   :136525000   Max.   :50000000
# order/sort computed results for sum, mean, max
# FATALITIES
cleandata.sum.1 <- arrange( summarize.byevtype, desc(FATALITIES_SUM))
data.1 <- cleandata.sum.1[1:10, c("EVTYPE","FATALITIES_SUM", "count")]
data.1
## Source: local data frame [10 x 3]
## 
##               EVTYPE FATALITIES_SUM count
## 1            TORNADO            547  1160
## 2               HEAT             60    71
## 3        FLASH FLOOD             57  1060
## 4  THUNDERSTORM WIND             51  3365
## 5              FLOOD             41  1062
## 6     EXCESSIVE HEAT             32    81
## 7        RIP CURRENT             28    44
## 8          LIGHTNING             25   637
## 9    COLD/WIND CHILL             19    35
## 10         HIGH SURF             11    58
fatality_sum_max <- round(max(cleandata.sum.1$FATALITIES_SUM, na.rm=TRUE))
fatality_sum_avg <- round(mean(cleandata.sum.1$FATALITIES_SUM, na.rm=TRUE))
#index/location of max value located
fatality_index_max <- which( round(cleandata.sum.1$FATALITIES_SUM) == fatality_sum_max)   
paste("FATALITIES: EVENT TYPE= ", cleandata.sum.1$EVTYPE[fatality_index_max],
      "  max value= ", fatality_sum_max, "  avg/mean value is ", fatality_sum_avg)
## [1] "FATALITIES: EVENT TYPE=  TORNADO   max value=  547   avg/mean value is  20"
# INJURIES
cleandata.sum.2 <- arrange( summarize.byevtype, desc(INJURIES_SUM))
data.2 <- cleandata.sum.2[1:10, c("EVTYPE","INJURIES_SUM", "count")]
data.2
## Source: local data frame [10 x 3]
## 
##               EVTYPE INJURIES_SUM count
## 1            TORNADO         5478  1160
## 2               HEAT          611    71
## 3  THUNDERSTORM WIND          355  3365
## 4          LIGHTNING          177   637
## 5     EXCESSIVE HEAT          138    81
## 6           WILDFIRE          110   486
## 7        STRONG WIND           32   277
## 8               HAIL           31  1128
## 9        FLASH FLOOD           30  1060
## 10       RIP CURRENT           27    44
injury_sum_avg <- round(mean(cleandata.sum.2$INJURIES_SUM, na.rm=TRUE))
injury_sum_max <- round(max(cleandata.sum.2$INJURIES_SUM, na.rm=TRUE))
injury_index_max <- which( round(cleandata.sum.2$INJURIES_SUM) == injury_sum_max)   
paste("INJURIES: EVENT TYPE= ", cleandata.sum.2$EVTYPE[injury_index_max],
      "  max value= ", injury_sum_max, "  avg/mean value is ", injury_sum_avg)
## [1] "INJURIES: EVENT TYPE=  TORNADO   max value=  5478   avg/mean value is  154"
# PROPDMG
cleandata.sum.3 <- arrange( summarize.byevtype, desc(PROPDMG_SUM))
data.3 <- cleandata.sum.3[1:10, c("EVTYPE", "PROPDMG_SUM", "count")]
data.3
## Source: local data frame [10 x 3]
## 
##               EVTYPE PROPDMG_SUM count
## 1            TORNADO  9417129900  1160
## 2              FLOOD  6309196950  1062
## 3        FLASH FLOOD  1244228700  1060
## 4           WILDFIRE   637917400   486
## 5               HAIL   428118800  1128
## 6  THUNDERSTORM WIND   270133560  3365
## 7     TROPICAL STORM    84112700    66
## 8            TSUNAMI    53554000     9
## 9          LIGHTNING    41343920   637
## 10  STORM SURGE/TIDE    40695000    13
propdmg_sum_avg <- round(mean(cleandata.sum.3$PROPDMG_SUM,na.rm=TRUE))
propdmg_sum_max <- round(max(cleandata.sum.3$PROPDMG_SUM,na.rm=TRUE))
propdmg_index_max <- which( round(cleandata.sum.3$PROPDMG_SUM) == propdmg_sum_max)   
paste("PROPDMG: EVENT TYPE= ", cleandata.sum.3$EVTYPE[propdmg_index_max],
      "  max value= ", propdmg_sum_max, "  avg/mean value is ", propdmg_sum_avg)
## [1] "PROPDMG: EVENT TYPE=  TORNADO   max value=  9417129900   avg/mean value is  406972765"
# CROPDMG
cleandata.sum.4 <- arrange( summarize.byevtype, desc(CROPDMG_SUM))
data.4 <- cleandata.sum.4[1:10, c("EVTYPE", "CROPDMG_SUM", "count")]
data.4
## Source: local data frame [10 x 3]
## 
##               EVTYPE CROPDMG_SUM count
## 1              FLOOD   136525000  1062
## 2  THUNDERSTORM WIND   130574000  3365
## 3        FLASH FLOOD    83648000  1060
## 4               HAIL    63578000  1128
## 5          HIGH WIND    44293000   488
## 6            DROUGHT    31269000    77
## 7            TORNADO    29602000  1160
## 8     TROPICAL STORM    24501000    66
## 9         HEAVY RAIN    20713000   246
## 10       STRONG WIND    15059000   277
cropdmg_sum_avg <- round(mean(cleandata.sum.4$CROPDMG_SUM, na.rm=TRUE))
cropdmg_sum_max <- round(max(cleandata.sum.4$CROPDMG_SUM, na.rm=TRUE))
cropdmg_index_max <- which( round(cleandata.sum.4$CROPDMG_SUM) == cropdmg_sum_max)   
paste("CROPDMG: EVENT TYPE= ", cleandata.sum.4$EVTYPE[cropdmg_index_max],
      "  max value= ", cropdmg_sum_max, "  avg/mean value is ", cropdmg_sum_avg)
## [1] "CROPDMG: EVENT TYPE=  FLOOD   max value=  136525000   avg/mean value is  13235065"

Results

Create exploratory plots

ggplot( data.1, aes(reorder(EVTYPE, -FATALITIES_SUM),FATALITIES_SUM , fill=EVTYPE)) + 
  geom_histogram(stat="identity") + labs(x="Event Type", y="Total Fatalities") + 
  ggtitle("1950-2011 U.S. Total Fatalities of Top 10 Weather-Related Events ")  +  
  theme(legend.position="bottom", axis.text.x=element_text(angle=25))

ggplot( data.3, aes(reorder(EVTYPE, -PROPDMG_SUM), PROPDMG_SUM, fill=EVTYPE)) + 
  geom_histogram(stat="identity") + geom_line() + labs(x="Event Type", y="Total Property Damage Estimate") + 
  ggtitle("1950-2011 U.S. Total Property Damage of Top 10 Weather-Related Events")  +  
  theme(legend.position="bottom", axis.text.x=element_text(angle=25))
## geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?

ggplot( data.4, aes(reorder(EVTYPE, -CROPDMG_SUM), CROPDMG_SUM, fill=EVTYPE)) + 
  geom_histogram(stat="identity") + labs(x="Event Type", y="Total Crop Damage Estimate") + 
  ggtitle("1950-2011 Total Crop Damage of Top 10 Weather-Related Events")  +  
  theme(legend.position="bottom", axis.text.x=element_text(angle=25))

Summary

Reference: