knitr::opts_chunk$set(echo = TRUE)

#Synopsis

The objective of this project is to explore the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database, which tracks characteristics of major weather events in the US from 1950 to 2011, in order to analyse the resulting health and econonmic impact.

The NOAA Storm database tracks 37 variables of which only the relevant 8 were selected for analysis. Data recorded prior 1989 were excluded due to poor quality and relatively fewer event recorded per year.

The events are grouped and summed by Event Type (EVTYPE) for Fatalitalies (FATALITIES) Injuries (INJURIES), Property Damage (PROPDMG) and Crop Damage(CROPDMG). Event types are then further grouped into wider categories for each of the 4 variables to make the analysis more meaningful.

Setting working directory and downloading data

setwd("C:/Users/jiameng.yu/Desktop/Statistics_course/Course 5 - Reproducible Research/Course project 2")
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2","storm_data.csv")

Initial exploration of data downloaded

rawData<- read.csv("storm_data.csv")
nrow(rawData)
## [1] 902297
ncol(rawData)
## [1] 37
colnames(rawData)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
### The dataset contains 902297 observation of 37 variables including extreme weather events recorded since 1955. 

### The relevant variables for the analysis are considered to be include: BGN_DATE(when the events began), EVTYPE (THE Type of weather event), FATALITIES (direct or indirect fatalities caused by the event), INJURIES (direct or indirect injuries caused by the event), PROPDMG (value of property damage), PROPDMGEXP (unit of the value for property damage estimated), CROPDMG (value of crop damage) CROPDMGEXP (unit of value for crop damage)

A subset of the data is created including the relevant variables only as liste above.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
relevantData <- select(rawData,c("BGN_DATE", "EVTYPE","FATALITIES","INJURIES","PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"))
head(relevantData,6)

Cleaning data

### Reformatting relevant columns 
#### The BGN_DATE variable is transformed into date format so that the periods with limited good records can be identified and eliminated. 
head(relevantData$BGN_DATE, 3) ##### format: mm/dd/yyyy
## [1] 4/18/1950 0:00:00 4/18/1950 0:00:00 2/20/1951 0:00:00
## 16335 Levels: 1/1/1966 0:00:00 1/1/1972 0:00:00 ... 9/9/2011 0:00:00
DATE <- as.Date(relevantData$BGN_DATE, "%m/%d/%Y")
YEAR<- as.integer(format(DATE, "%Y"))
relevantData1<- mutate(relevantData, YEAR)
relevantData1<- select(relevantData1, -BGN_DATE)

### Reducing noise by eliminating early data 
#### The number of entries per year is calculated. It is identified that prior to 1989, less than 10000 records per year were recorded.
VYear<- table((relevantData1$YEAR))>10000
VYear
## 
##  1950  1951  1952  1953  1954  1955  1956  1957  1958  1959  1960  1961  1962 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##  1963  1964  1965  1966  1967  1968  1969  1970  1971  1972  1973  1974  1975 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##  1976  1977  1978  1979  1980  1981  1982  1983  1984  1985  1986  1987  1988 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##  1989  1990  1991  1992  1993  1994  1995  1996  1997  1998  1999  2000  2001 
##  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE 
##  2002  2003  2004  2005  2006  2007  2008  2009  2010  2011 
##  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
#### Data prior to 1989 is removed from the dataset due to low reported rates. 
relevantData2<- filter(relevantData1,YEAR>1988)

### Checking for missing value
#### It has been verified that there are no missing values in key variables. 
sum(is.na(relevantData2$EVTYPE)) #####0
## [1] 0
sum(is.na(relevantData2$FATALITIES))#####0
## [1] 0
sum(is.na(relevantData2$INJURIES))####0
## [1] 0
sum(is.na(relevantData2$PROPDMG))#####0
## [1] 0
sum(is.na(relevantData2$CROPDMG))#####0
## [1] 0

Analysis

Relating to Fatalities

#### Not all events have caused injuries. Therefore, the events causing 0 injuries are filtered out. Injuries caused by the same event is then summed.
Fatalities <- filter(relevantData2, FATALITIES!=0)
summarised_data1<- as.data.frame(Fatalities %>% group_by(EVTYPE) %>% summarise(TFatalities = sum(FATALITIES,na.rm=TRUE)) %>% arrange(desc(TFatalities)))

#### There appears to be 164 types of events whereas there should only be 48 according to the Data Documentation. 
nrow(summarised_data1)
## [1] 168
#### All lower case texts in EVTYPE is now converted to upper case in order reduce duplicated event type entries. After this step, the number of event type is reduced to 156. 
summarised_data2<- as.data.frame(summarised_data1 %>% mutate(EVTYPE1 = toupper(EVTYPE)) %>% group_by(EVTYPE1) %>% summarise(TFatalities = sum(TFatalities,na.rm=TRUE)) %>% arrange(desc(TFatalities)))

#### Looking at the top 20 events with highest fatalities. The following list of key words are identified: "HEAT, TORNADO, WIND, FLOOD, LIGHTENING, AVALANCHE, STORM, COLD, CURRENT, SNOW, SURF and ICE". Events with these key words are filtered into subsets. 
summarised_data2$EVTYPE1[1:20]
##  [1] "EXCESSIVE HEAT"          "TORNADO"                
##  [3] "FLASH FLOOD"             "HEAT"                   
##  [5] "LIGHTNING"               "FLOOD"                  
##  [7] "RIP CURRENT"             "TSTM WIND"              
##  [9] "HIGH WIND"               "AVALANCHE"              
## [11] "WINTER STORM"            "RIP CURRENTS"           
## [13] "HEAT WAVE"               "EXTREME COLD"           
## [15] "THUNDERSTORM WIND"       "HEAVY SNOW"             
## [17] "EXTREME COLD/WIND CHILL" "HIGH SURF"              
## [19] "STRONG WIND"             "BLIZZARD"
HEATF <- grep("HEAT", summarised_data2$EVTYPE1)
TORNADOF <- grep("TORNADO", summarised_data2$EVTYPE1)
WINDF <- grep("WIND", summarised_data2$EVTYPE1)
FLOODF <- grep("FLOOD", summarised_data2$EVTYPE1)
LIGHTENINGF <- grep("LIGHTNING", summarised_data2$EVTYPE1)
AVALANCHEF<- grep("AVALANCHE", summarised_data2$EVTYPE1)
STORMF <- grep("STORM", summarised_data2$EVTYPE1)
COLDF<- grep("COLD", summarised_data2$EVTYPE1)
CURRENTF <- grep("CURRENT", summarised_data2$EVTYPE1)
SNOWF <- grep("SNOW", summarised_data2$EVTYPE1)
SURFF <- grep("SURF", summarised_data2$EVTYPE1)
ICEF <- grep("ICE", summarised_data2$EVTYPE1)

#### Duplicated entries (i.e. events including two or more key words) are identified. 
Foccurence<- data.frame(table(c(HEATF, TORNADOF, WINDF, FLOODF, LIGHTENINGF, AVALANCHEF, STORMF, COLDF, CURRENTF, SNOWF, SURFF, ICEF)))
DubF <- Foccurence %>% filter(Foccurence$Freq > 1) %>% mutate(TFatalities=summarised_data2$TFatalities[Var1]) %>% mutate(Dub_TFatalities=TFatalities*(Freq-1))

#### After removing duplicated entries, events selected using the above mentioned key words cover 91.4% of events. 
(sum(summarised_data2$TFatalities[c(HEATF, TORNADOF, WINDF, FLOODF, LIGHTENINGF, AVALANCHEF, STORMF, COLDF, CURRENTF, SNOWF, SURFF, ICEF)])-sum(DubF$Dub_TFatalities))/sum(summarised_data2$TFatalities) ##### 91.4%
## [1] 0.9142038
#### As a result, all event types are grouped into these categories except that (1) TORNADO, WIND and SURF are to be combined into WIND; (2) AVALANCHE, SNOW, COLD and ICE are to be grouped into WINTER; (3) events not containing the above key words are grouped in OTHER. The final categories are therefore, HEAT, WIND, FLOOD, LIGHTENING, WINTER, STORM, CURRENT and OTHER. Duplicated entries will be counted into both categories. 
EVTYPE_HEATF <- summarised_data2[HEATF,]
EVTYPE_WINDF <- summarised_data2[c(TORNADOF, WINDF, SURFF),]
EVTYPE_FLOODF <- summarised_data2[FLOODF,]
EVTYPE_LIGHTENINGF<- summarised_data2[LIGHTENINGF,]
EVTYPE_WINTERF <- summarised_data2[c(AVALANCHEF, SNOWF, COLDF, ICEF),]
EVTYPE_STORMF<- summarised_data2[STORMF,]
EVTYPE_CURRENTF<- summarised_data2[CURRENTF,]
EVTYPE_OTHERF <- summarised_data2[-c(HEATF, TORNADOF, WINDF, FLOODF, LIGHTENINGF, AVALANCHEF, STORMF, COLDF, CURRENTF, SNOWF, SURFF, ICEF),]

EVTYPE_HEATF1 <- mutate(EVTYPE_HEATF,Type="HEAT")
EVTYPE_WINDF1 <- mutate(EVTYPE_WINDF,Type="WIND")
EVTYPE_FLOODF1 <- mutate(EVTYPE_FLOODF,Type="FLOOD")
EVTYPE_LIGHTENINGF1 <- mutate(EVTYPE_LIGHTENINGF,Type="LIGHTENING")
EVTYPE_WINTERF1 <- mutate(EVTYPE_WINTERF,Type="WINTER")
EVTYPE_STORMF1 <- mutate(EVTYPE_STORMF,Type="STORM")
EVTYPE_CURRENTF1 <- mutate(EVTYPE_CURRENTF,Type="CURRENT")
EVTYPE_OTHERF1 <- mutate(EVTYPE_OTHERF, Type="OTHER")

#### The new combined data set has 180 rows, more than the 156 before, this is because certain events cross categories. For these cases, the number of fatalities will be counted into both catetories. 
Fatalities_data<- as.data.frame(rbind(EVTYPE_CURRENTF1, EVTYPE_FLOODF1, EVTYPE_WINTERF1, EVTYPE_WINDF1, EVTYPE_OTHERF1, EVTYPE_HEATF1,EVTYPE_LIGHTENINGF1, EVTYPE_STORMF1))
nrow(Fatalities_data)
## [1] 180
Fatalities_summary<- as.data.frame(Fatalities_data %>% group_by(Type) %>% summarise(TFatalities1 = sum(TFatalities,na.rm=TRUE)))

Relating to injuries

#### Not all events have caused fatalities. Therefore, the events causing 0 fatalities are filtered out. Fatalities caused by the same event is then summed.
Injuries <- filter(relevantData2, INJURIES!=0)
summarised_data3<- as.data.frame(Injuries %>% group_by(EVTYPE) %>% summarise(TInjuries = sum(INJURIES,na.rm=TRUE)) %>% arrange(desc(TInjuries)))

#### There appears to be 158 types of events whereas there should only be 48 according to the Data Documentation. 
nrow(summarised_data3)
## [1] 158
#### All lower case texts in EVTYPE is now converted to upper case in order reduce duplicated event type entries. After this step, the number of event type is reduced to 150. 
summarised_data4<- as.data.frame(summarised_data3 %>% mutate(EVTYPE1 = toupper(EVTYPE)) %>% group_by(EVTYPE1) %>% summarise(TInjuries = sum(TInjuries,na.rm=TRUE)) %>% arrange(desc(TInjuries)))
nrow(summarised_data4)
## [1] 150
#### Looking at the top 20 events with highest fatalities. The following list of key words are identified: "HEAT, TORNADO, WIND, FLOOD, LIGHTENING, HURRICANE, STORM, FIRE, HAIL, FOG, BLIZZARD and SNOW". Events containing these key words are organised into subsets.  
summarised_data4$EVTYPE1[1:20]
##  [1] "TORNADO"            "FLOOD"              "EXCESSIVE HEAT"    
##  [4] "TSTM WIND"          "LIGHTNING"          "HEAT"              
##  [7] "ICE STORM"          "FLASH FLOOD"        "THUNDERSTORM WIND" 
## [10] "WINTER STORM"       "HURRICANE/TYPHOON"  "HAIL"              
## [13] "HIGH WIND"          "HEAVY SNOW"         "WILDFIRE"          
## [16] "THUNDERSTORM WINDS" "BLIZZARD"           "FOG"               
## [19] "WILD/FOREST FIRE"   "DUST STORM"
HEATI <- grep("HEAT", summarised_data4$EVTYPE1)
TORNADOI <- grep("TORNADO", summarised_data4$EVTYPE1)
WINDI <- grep("WIND", summarised_data4$EVTYPE1)
FLOODI <- grep("FLOOD", summarised_data4$EVTYPE1)
LIGHTENINGI <- grep("LIGHTNING", summarised_data4$EVTYPE1)
HURRICANEI<- grep("HURRICANE", summarised_data4$EVTYPE1)
STORMI <- grep("STORM", summarised_data4$EVTYPE1)
FIREI<- grep("FIRE", summarised_data4$EVTYPE1)
HAILI <- grep("HAIL", summarised_data4$EVTYPE1)
SNOWI <- grep("SNOW", summarised_data4$EVTYPE1)
BLIZZARDI <- grep("BLIZZARD", summarised_data4$EVTYPE1)
FOGI <- grep("FOG", summarised_data4$EVTYPE1)

#### Duplicated entries (i.e. events including two or more key words) are identified. 
Ioccurence<- data.frame(table(c(HEATI, TORNADOI, WINDI, FLOODI, LIGHTENINGI, HURRICANEI, STORMI, FIREI, HAILI, FOGI, BLIZZARDI, SNOWI)))
DubI <- Ioccurence %>% filter(Ioccurence$Freq > 1) %>% mutate(TInjuries=summarised_data4$TInjuries[Var1]) %>% mutate(Dub_TInjuries=TInjuries*(Freq-1))

#### After removing duplicated entries, events selected using the above mentioned key words cover 95.4% of events. 
(sum(summarised_data4$TInjuries[c(HEATI, TORNADOI, WINDI, FLOODI, LIGHTENINGI, HURRICANEI, STORMI, FIREI, HAILI, SNOWI, BLIZZARDI, FOGI)])- sum(DubI$Dub_TInjuries))/sum(summarised_data4$TInjuries) ##### 95.4%
## [1] 0.9538302
#### As a result, all event types are grouped into these categories except that (1) TORNADO and WIND are to be combined into WIND; (2)BLIZZARD HAIL and SNOW are grouped into WINTER; (3) HURRICANE and STORM are grouped into STORM. (4)  The final categories are therefore, HEAT, WIND, FLOOD, LIGHTENING, WINTER, STORM , FOG and FIRE. 
EVTYPE_HEATI <- summarised_data4[HEATI,]
EVTYPE_WINDI <- summarised_data4[c(TORNADOI, WINDI),]
EVTYPE_FLOODI <- summarised_data4[FLOODI,]
EVTYPE_LIGHTENINGI<- summarised_data4[LIGHTENINGI,]
EVTYPE_WINTERI <- summarised_data4[c(BLIZZARDI, SNOWI, HAILI),]
EVTYPE_STORMI<- summarised_data4[c(STORMI,HURRICANEI),]
EVTYPE_FIREI<- summarised_data4[FIREI,]
EVTYPE_FOGI<- summarised_data4[FOGI,]
EVTYPE_OTHERI <- summarised_data4[-c(HEATI, TORNADOI, WINDI, FLOODI, LIGHTENINGI, HURRICANEI, STORMI, FIREI, HAILI, SNOWI, BLIZZARDI, FOGI),]

EVTYPE_HEATI1 <- mutate(EVTYPE_HEATI,Type="HEAT")
EVTYPE_WINDI1 <- mutate(EVTYPE_WINDI,Type="WIND")
EVTYPE_FLOODI1 <- mutate(EVTYPE_FLOODI,Type="FLOOD")
EVTYPE_LIGHTENINGI1 <- mutate(EVTYPE_LIGHTENINGI,Type="LIGHTENING")
EVTYPE_WINTERI1 <- mutate(EVTYPE_WINTERI,Type="WINTER")
EVTYPE_STORMI1 <- mutate(EVTYPE_STORMI,Type="STORM")
EVTYPE_FIREI1 <- mutate(EVTYPE_FIREI,Type="FIRE")
EVTYPE_FOGI1 <- mutate(EVTYPE_FOGI,Type="FIRE")
EVTYPE_OTHERI1 <- mutate(EVTYPE_OTHERI, Type="OTHER")

#### The new combined data set has 167 rows, more than the 150 before, this is because certain events cross categories. For these cases, the number of injuries will be counted into both catetories. 
Injuries_data<- as.data.frame(rbind(EVTYPE_FIREI1, EVTYPE_FLOODI1, EVTYPE_WINTERI1, EVTYPE_WINDI1, EVTYPE_OTHERI1, EVTYPE_HEATI1,EVTYPE_LIGHTENINGI1, EVTYPE_STORMI1, EVTYPE_FOGI1))
nrow(Injuries_data)
## [1] 167
Injuries_summary<- as.data.frame(Injuries_data %>% group_by(Type) %>% summarise(TInjuries1 = sum(TInjuries,na.rm=TRUE)))

Relating to economic impact

#### To ensure economic impacts can be analysed, units across all observations for PROPDMGEXP and CROPDMGEXP should be consistent which they are not. For the purpose of the remaining analysis, only events recorded in, $1, B, M and K (including lower cases) are taken into account. 
table(relevantData2$PROPDMGEXP)
## 
##             -      ?      +      0      1      2      3      4      5      6 
## 355754      1      8      5    216     25     13      4      4     28      4 
##      7      8      B      h      H      K      m      M 
##      5      1     40      1      6 396954      7   9074
table(relevantData2$CROPDMGEXP)
## 
##             ?      0      2      B      k      K      m      M 
## 478266      7     19      1      9     21 281832      1   1994
#### Turning lower case units in to upper case. 
relevantData2U<- as.data.frame(relevantData2 %>% mutate(PROPDMGEXPU = toupper(PROPDMGEXP), CROPDMGEXPU=toupper(CROPDMGEXP)))

#### The values in PROPDMGEXP are processed to align units of estimated damage value. 
##### subsetting tables by units of PROPDMGEXP. 

PROPk <- filter(relevantData2U, PROPDMGEXP == "K")
PROPm <- filter(relevantData2U, PROPDMGEXP == "M")
PROPb <- filter(relevantData2U, PROPDMGEXP == "B")
PROP <- filter(relevantData2U, PROPDMGEXP == "")

##### Times all PROPDMGEXP K entries with 1000 and M entries with 1000000 etc
PROPk1 <- PROPk %>% rename("OLD" = PROPDMG) %>% mutate(PROPDMG=OLD*1000) %>% select(-OLD)
PROPm1 <- PROPm %>% rename("OLD" = PROPDMG) %>% mutate(PROPDMG=OLD*1000000) %>% select(-OLD)
PROPb1 <- PROPb %>% rename("OLD" = PROPDMG) %>% mutate(PROPDMG=OLD*1000000000) %>% select(-OLD)

##### The subsets are mergered to form the full data set again with all values at consistent units. 
relevantData3 <- rbind(PROPk1,PROPm1, PROPb1, PROP)

#### The values in CROPDMGEXP are processed to align units of estimated damage value. 
##### subsetting tables by units of CROPDMGEXP. 
CROPk <- filter(relevantData2U, CROPDMGEXP == "K")
CROPm <- filter(relevantData2U, CROPDMGEXP == "M")
CROPb <- filter(relevantData2U, CROPDMGEXP == "B")
CROP <- filter(relevantData2U, CROPDMGEXP == "")

##### Times all CROPDMGEXP K entries with 1000 and M entries with 1000000 etc
CROPk1 <- CROPk %>% rename("OLD" = CROPDMG) %>% mutate(CROPDMG=OLD*1000) %>% select(-OLD)
CROPm1 <- CROPm %>% rename("OLD" = CROPDMG) %>% mutate(CROPDMG=OLD*1000000) %>% select(-OLD)
CROPb1 <- CROPb %>% rename("OLD" = CROPDMG) %>% mutate(CROPDMG=OLD*1000000000) %>% select(-OLD)

##### The subsets are mergered to form the full data set again with all values at consistent units. 
relevantData4 <- rbind(CROPk1,CROPm1, CROPb1, CROP)

#### For both property damage and crop damage, entries with zero value are removed, then damages relating to the same type of events are summed. 
summarised_data5 <- as.data.frame(relevantData3 %>% filter(PROPDMG!=0) %>% mutate(EVTYPE1=toupper(EVTYPE)) %>% group_by(EVTYPE1) %>% summarise(PROPDMGT=sum(PROPDMG)))
summarised_data6<- as.data.frame(relevantData4 %>% filter(CROPDMG!=0) %>% mutate(EVTYPE1 = toupper(EVTYPE)) %>% group_by(EVTYPE1) %>% summarise(CROPDMGT=sum(CROPDMG)))

#### TOP 20 events are identified for each variable
summarised_data7<- arrange(summarised_data5, desc(PROPDMGT))
summarised_data8<- arrange(summarised_data6, desc(CROPDMGT))
PTop20<- summarised_data7$EVTYPE1[1:20]
CTop20<- summarised_data8$EVTYPE1[1:20]

#### For property damage: 
##### Key events indicators for property damage are: STORM, FLOOD, WIND, HURRICANE, FIRE, FLASH, HAIL, ICE, SURGE, TORNADO. 
PTop20_split <- strsplit(PTop20," ")
P_keyword_table<- as.data.frame(table(PTop20_split1<- unlist(PTop20_split)))
print.data.frame(arrange(P_keyword_table, desc(Freq)))
##                 Var1 Freq
## 1              STORM    5
## 2              FLOOD    3
## 3               WIND    3
## 4          HURRICANE    2
## 5       THUNDERSTORM    2
## 6               FIRE    1
## 7              FLASH    1
## 8               HAIL    1
## 9              HEAVY    1
## 10              HIGH    1
## 11 HURRICANE/TYPHOON    1
## 12               ICE    1
## 13              OPAL    1
## 14       RAIN/SEVERE    1
## 15             RIVER    1
## 16             SURGE    1
## 17        SURGE/TIDE    1
## 18           TORNADO    1
## 19          TROPICAL    1
## 20              TSTM    1
## 21           WEATHER    1
## 22       WILD/FOREST    1
## 23          WILDFIRE    1
## 24             WINDS    1
## 25            WINTER    1
P_keywords<- c("STORM", "FLOOD", "WIND", "HURRICANE", "FIRE", "FLASH", "HAIL", "ICE", "SURGE", "TORNADO")


##### Each data sets are grouped into subsets by keyword for property damage. 
STORMP <- grep("STORM", summarised_data7$EVTYPE1)
FLOODP <- grep("FLOOD", summarised_data7$EVTYPE1)
WINDP <- grep("WIND", summarised_data7$EVTYPE1)
HURRICANEP <- grep("HURRICANE", summarised_data7$EVTYPE1)
FIREP <- grep("FIRE", summarised_data7$EVTYPE1)
FLASHP<- grep("FLASH", summarised_data7$EVTYPE1)
HAILP <- grep("HAIL", summarised_data7$EVTYPE1)
ICEP<- grep("ICE", summarised_data7$EVTYPE1)
SURGEP <- grep("SURGE", summarised_data7$EVTYPE1)
TORNADOP <- grep("TORNADO", summarised_data7$EVTYPE1)


##### Duplicated entries (i.e. events including two or more key words) are identified. 
Poccurence<- data.frame(table(c(STORMP, FLOODP, WINDP, HURRICANEP, FIREP, FLASHP, HAILP, HAILP, ICEP, SURGEP, TORNADOP)))
DubP <- Poccurence %>% filter(Poccurence$Freq > 1) %>% mutate(PROPDMGT=summarised_data7$PROPDMGT[Var1]) %>% mutate(Dub_PROPDMGT=PROPDMGT*(Freq-1))

##### After removing duplicated entries, events selected using the above mentioned key words cover 97.9% of events. 
(sum(summarised_data7$PROPDMGT[c(STORMP, FLOODP, WINDP, HURRICANEP, FIREP, FLASHP, HAILP, HAILP, ICEP, SURGEP, TORNADOP)])-sum(DubP$PROPDMGT))/sum(summarised_data7$PROPDMGT) ##### 97.9%
## [1] 0.9793332
##### As a result, all event types are grouped into these categories except STORM, HURRICANE, WIND AND TORNADO are grouped into WIND and Events not containing the above key words are grouped in OTHER. Therefore, the final categories are WIND, FLOOD, FIRE, FLASH, HAIL, ICE, RAIN and OTHER. 
EVTYPE_WINDP <- summarised_data7[c(STORMP, HURRICANEP, WINDP, TORNADOP),]
EVTYPE_FLOODP <- summarised_data7[FLOODP,]
EVTYPE_FIREP <- summarised_data7[FIREP,]
EVTYPE_FLASHP<- summarised_data7[FLASHP,]
EVTYPE_HAILP <- summarised_data7[HAILP,]
EVTYPE_ICEP<- summarised_data7[ICEP,]
EVTYPE_SURGEP<- summarised_data7[SURGEP,]
EVTYPE_OTHERP <- summarised_data7[-c(STORMP, FLOODP, WINDP, HURRICANEP, FIREP, FLASHP, HAILP, HAILP, ICEP, SURGEP, TORNADOP),]

EVTYPE_WINDP1 <- mutate(EVTYPE_WINDP,Type="WIND")
EVTYPE_FLOODP1 <- mutate(EVTYPE_FLOODP,Type="FLOOD")
EVTYPE_FIREP1 <- mutate(EVTYPE_FIREP,Type="FIRE")
EVTYPE_FLASHP1 <- mutate(EVTYPE_FLASHP,Type="FLASH")
EVTYPE_HAILP1 <- mutate(EVTYPE_HAILP,Type="HAIL")
EVTYPE_ICEP1 <- mutate(EVTYPE_ICEP,Type="ICE")
EVTYPE_SURGEP1 <- mutate(EVTYPE_SURGEP,Type="SURGE")
EVTYPE_OTHERP1 <- mutate(EVTYPE_OTHERP, Type="OTHER")

##### The new combined data set has 462 rows, more than the 371 before, this is because certain events cross categories. For these cases, the value of damaged will be counted into all catetories. 
PROPDMGT_data<- as.data.frame(rbind(EVTYPE_WINDP1, EVTYPE_FLOODP1, EVTYPE_FIREP1, EVTYPE_FLASHP1, EVTYPE_HAILP1, EVTYPE_ICEP1,EVTYPE_SURGEP1, EVTYPE_OTHERP1))
nrow(PROPDMGT_data)
## [1] 462
PROPDMG_summary<- PROPDMGT_data %>% group_by(Type) %>% summarise(PROPDMGT = sum(PROPDMGT,na.rm=TRUE))

#### For Crop damage:
##### Key events indicators for property damage are: FLOOD, WIND, HEAT, STORM, DROUGHT, FROST, HAIL, HURRICANE, ICE, RAIN, TORNADO. 
CTop20_split <- strsplit(CTop20," ")
C_keywords_table<- as.data.frame(table(CTop20_split1<- unlist(CTop20_split)))
print.data.frame(arrange(C_keywords_table, desc(Freq)))
##                 Var1 Freq
## 1              FLOOD    3
## 2               WIND    3
## 3             FREEZE    2
## 4               HEAT    2
## 5              STORM    2
## 6               COLD    1
## 7           DAMAGING    1
## 8            DROUGHT    1
## 9          EXCESSIVE    1
## 10           EXTREME    1
## 11             FLASH    1
## 12      FROST/FREEZE    1
## 13              HAIL    1
## 14             HEAVY    1
## 15              HIGH    1
## 16         HURRICANE    1
## 17 HURRICANE/TYPHOON    1
## 18               ICE    1
## 19              RAIN    1
## 20             RIVER    1
## 21      THUNDERSTORM    1
## 22           TORNADO    1
## 23          TROPICAL    1
## 24              TSTM    1
C_keywords<- c("FLOOD", "WIND", "HEAT", "STORM", "DROUGHT", "FROST", "HAIL", "HURRICANE", "ICE", "RAIN", "TORNADO")

##### Each data sets are grouped into subsets by keyword for property damage. 
FLOODC<- grep("FLOOD", summarised_data8$EVTYPE1)
WINDC <- grep("WIND", summarised_data8$EVTYPE1)
HEATC<- grep("HEAT", summarised_data8$EVTYPE1)
STORMC <- grep("STORM", summarised_data8$EVTYPE1)
DROUGHTC <- grep("DROUGHT", summarised_data8$EVTYPE1)
FROSTC <- grep("FROST", summarised_data8$EVTYPE1)
HAILC <- grep("HAIL", summarised_data8$EVTYPE1)
HURRICANEC <- grep("HURRICANE", summarised_data8$EVTYPE1)
ICEC<- grep("ICE", summarised_data8$EVTYPE1)
RAINC <- grep("RAIN", summarised_data8$EVTYPE1)
TORNADOC <- grep("TORNADO", summarised_data8$EVTYPE1)


##### Duplicated entries (i.e. events including two or more key words) are identified. 
Coccurence<- data.frame(table(c(FLOODC, WINDC, HEATC, STORMC, DROUGHTC, FROSTC, HAILC, HURRICANEC, ICEC, RAINC, TORNADOC)))
DubC <- Coccurence %>% filter(Coccurence$Freq > 1) %>% mutate(CROPDMGT=summarised_data8$CROPDMGT[Var1]) %>% mutate(Dub_CROPDMGT=CROPDMGT*(Freq-1))

##### After removing duplicated entries, events selected using the above mentioned key words cover 92.6% of events. 
(sum(summarised_data8$CROPDMGT[c(FLOODC, WINDC, HEATC, STORMC, DROUGHTC, FROSTC, HAILC, HURRICANEC, ICEC, RAINC, TORNADOC)])-sum(DubC$CROPDMGT))/sum(summarised_data8$CROPDMGT) ##### 92.6%
## [1] 0.9260617
##### As a result, all event types are grouped into these categories except STORM, HURRICANE, WIND AND TORNADO are grouped into WIND and Events not containing the above key words are grouped in OTHER. Therefore, the final categories are WIND, FLOOD, DROUGHT, FROST, HAIL, ICE, RAIN AND OTHER. 
EVTYPE_WINDC <- summarised_data8[c(STORMC, HURRICANEC, WINDC, TORNADOC),]
EVTYPE_HEATC <- summarised_data8[HEATC,]
EVTYPE_FLOODC <- summarised_data8[FLOODC,]
EVTYPE_DROUGHTC <- summarised_data8[DROUGHTC,]
EVTYPE_FROSTC<- summarised_data8[FROSTC,]
EVTYPE_HAILC <- summarised_data8[HAILC,]
EVTYPE_ICEC<- summarised_data8[ICEC,]
EVTYPE_RAINC<- summarised_data8[RAINC,]
EVTYPE_OTHERC <- summarised_data8[-c(FLOODC, WINDC, STORMC, DROUGHTC, FROSTC, HAILC, HURRICANEC, ICEC, RAINC, TORNADOC),]

EVTYPE_WINDC1 <- mutate(EVTYPE_WINDC,Type="WIND")
EVTYPE_HEATC1 <- mutate(EVTYPE_HEATC,Type="HEAT")
EVTYPE_FLOODC1 <- mutate(EVTYPE_FLOODC,Type="FLOOD")
EVTYPE_DROUGHTC1 <- mutate(EVTYPE_DROUGHTC,Type="DROUGHT")
EVTYPE_FROSTC1 <- mutate(EVTYPE_FROSTC,Type="FROST")
EVTYPE_HAILC1 <- mutate(EVTYPE_HAILC,Type="HAIL")
EVTYPE_ICEC1 <- mutate(EVTYPE_ICEC,Type="ICE")
EVTYPE_RAINC1 <- mutate(EVTYPE_RAINC,Type="RAIN")
EVTYPE_OTHERC1 <- mutate(EVTYPE_OTHERC, Type="OTHER")

##### The new combined data set has 165 rows, more than the 131 before, this is because certain events cross categories. For these cases, the number of fatalities will be counted into all catetories. 
CROPDMGT_data<- as.data.frame(rbind(EVTYPE_WINDC1, EVTYPE_FLOODC1, EVTYPE_DROUGHTC1, EVTYPE_FROSTC1, EVTYPE_HAILC1, EVTYPE_ICEC1,EVTYPE_RAINC1, EVTYPE_OTHERC1))
nrow(CROPDMGT_data)
## [1] 165
CROPDMG_summary<- CROPDMGT_data %>% group_by(Type) %>% summarise(CROPDMGT = sum(CROPDMGT,na.rm=TRUE))

#### Expressing economic damaged in the nearest billion
PROPDMG_summary_bn <- as.data.frame(PROPDMG_summary %>% mutate(PROPDMG_summary_bn=PROPDMGT/1000000000) %>% select(-PROPDMGT))
CROPDMG_summary_bn <- as.data.frame(CROPDMG_summary %>% mutate(CROPDMG_summary_bn=CROPDMGT/1000000000) %>% select(-CROPDMGT))

Results

With tidy dataset, the impact on health and economy across the US of severe weather events from 1989 to 2011 are summarised in graphic format.

##### Fatalities - Fatalities_summary
str(Fatalities_summary)
## 'data.frame':    8 obs. of  2 variables:
##  $ Type        : chr  "CURRENT" "FLOOD" "HEAT" "LIGHTENING" ...
##  $ TFatalities1: num  577 1525 3138 817 810 ...
colnames(Fatalities_data)
## [1] "EVTYPE1"     "TFatalities" "Type"
#####Injuries - Injuries_summary
##### Property damage - PROPDMG_summary_bn
##### Crop damage - CROPDMG_summary_bn

#### The impact on health 
##### Fatalities
print.data.frame(arrange(Fatalities_summary, desc(TFatalities1)))
##         Type TFatalities1
## 1       WIND         3299
## 2       HEAT         3138
## 3      FLOOD         1525
## 4     WINTER          946
## 5 LIGHTENING          817
## 6      OTHER          810
## 7      STORM          633
## 8    CURRENT          577
qplot(Type,TFatalities,data=Fatalities_data)

##### Injuries
print.data.frame(arrange(Injuries_summary,desc(TInjuries1)))
##         Type TInjuries1
## 1       WIND      37950
## 2       HEAT       9224
## 3      FLOOD       8604
## 4      STORM       8020
## 5 LIGHTENING       5232
## 6     WINTER       3240
## 7      OTHER       3016
## 8       FIRE       2685
qplot(Type,TInjuries,data=Injuries_data)

##### Economic loss caused
print.data.frame(arrange(PROPDMG_summary_bn,desc(PROPDMG_summary_bn)))
##    Type PROPDMG_summary_bn
## 1  WIND         209.391815
## 2 FLOOD         167.529741
## 3 SURGE          47.965224
## 4  HAIL          17.615091
## 5 FLASH          16.907928
## 6  FIRE           8.501629
## 7 OTHER           8.379430
## 8   ICE           3.974061
print.data.frame(arrange(CROPDMG_summary_bn,desc(CROPDMG_summary_bn)))
##      Type CROPDMG_summary_bn
## 1    WIND         14.4889644
## 2 DROUGHT         13.9726218
## 3   FLOOD         12.3800991
## 4     ICE          5.0271143
## 5   OTHER          3.9629119
## 6    HAIL          3.1137959
## 7   FROST          1.2021860
## 8    RAIN          0.9193158
qplot(Type,PROPDMGT,data=PROPDMGT_data)