knitr::opts_chunk$set(echo = TRUE)
#Synopsis
setwd("C:/Users/jiameng.yu/Desktop/Statistics_course/Course 5 - Reproducible Research/Course project 2")
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2","storm_data.csv")
rawData<- read.csv("storm_data.csv")
nrow(rawData)
## [1] 902297
ncol(rawData)
## [1] 37
colnames(rawData)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
### The dataset contains 902297 observation of 37 variables including extreme weather events recorded since 1955.
### The relevant variables for the analysis are considered to be include: BGN_DATE(when the events began), EVTYPE (THE Type of weather event), FATALITIES (direct or indirect fatalities caused by the event), INJURIES (direct or indirect injuries caused by the event), PROPDMG (value of property damage), PROPDMGEXP (unit of the value for property damage estimated), CROPDMG (value of crop damage) CROPDMGEXP (unit of value for crop damage)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
relevantData <- select(rawData,c("BGN_DATE", "EVTYPE","FATALITIES","INJURIES","PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"))
head(relevantData,6)
### Reformatting relevant columns
#### The BGN_DATE variable is transformed into date format so that the periods with limited good records can be identified and eliminated.
head(relevantData$BGN_DATE, 3) ##### format: mm/dd/yyyy
## [1] 4/18/1950 0:00:00 4/18/1950 0:00:00 2/20/1951 0:00:00
## 16335 Levels: 1/1/1966 0:00:00 1/1/1972 0:00:00 ... 9/9/2011 0:00:00
DATE <- as.Date(relevantData$BGN_DATE, "%m/%d/%Y")
YEAR<- as.integer(format(DATE, "%Y"))
relevantData1<- mutate(relevantData, YEAR)
relevantData1<- select(relevantData1, -BGN_DATE)
### Reducing noise by eliminating early data
#### The number of entries per year is calculated. It is identified that prior to 1989, less than 10000 records per year were recorded.
VYear<- table((relevantData1$YEAR))>10000
VYear
##
## 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#### Data prior to 1989 is removed from the dataset due to low reported rates.
relevantData2<- filter(relevantData1,YEAR>1988)
### Checking for missing value
#### It has been verified that there are no missing values in key variables.
sum(is.na(relevantData2$EVTYPE)) #####0
## [1] 0
sum(is.na(relevantData2$FATALITIES))#####0
## [1] 0
sum(is.na(relevantData2$INJURIES))####0
## [1] 0
sum(is.na(relevantData2$PROPDMG))#####0
## [1] 0
sum(is.na(relevantData2$CROPDMG))#####0
## [1] 0
#### Not all events have caused injuries. Therefore, the events causing 0 injuries are filtered out. Injuries caused by the same event is then summed.
Fatalities <- filter(relevantData2, FATALITIES!=0)
summarised_data1<- as.data.frame(Fatalities %>% group_by(EVTYPE) %>% summarise(TFatalities = sum(FATALITIES,na.rm=TRUE)) %>% arrange(desc(TFatalities)))
#### There appears to be 164 types of events whereas there should only be 48 according to the Data Documentation.
nrow(summarised_data1)
## [1] 168
#### All lower case texts in EVTYPE is now converted to upper case in order reduce duplicated event type entries. After this step, the number of event type is reduced to 156.
summarised_data2<- as.data.frame(summarised_data1 %>% mutate(EVTYPE1 = toupper(EVTYPE)) %>% group_by(EVTYPE1) %>% summarise(TFatalities = sum(TFatalities,na.rm=TRUE)) %>% arrange(desc(TFatalities)))
#### Looking at the top 20 events with highest fatalities. The following list of key words are identified: "HEAT, TORNADO, WIND, FLOOD, LIGHTENING, AVALANCHE, STORM, COLD, CURRENT, SNOW, SURF and ICE". Events with these key words are filtered into subsets.
summarised_data2$EVTYPE1[1:20]
## [1] "EXCESSIVE HEAT" "TORNADO"
## [3] "FLASH FLOOD" "HEAT"
## [5] "LIGHTNING" "FLOOD"
## [7] "RIP CURRENT" "TSTM WIND"
## [9] "HIGH WIND" "AVALANCHE"
## [11] "WINTER STORM" "RIP CURRENTS"
## [13] "HEAT WAVE" "EXTREME COLD"
## [15] "THUNDERSTORM WIND" "HEAVY SNOW"
## [17] "EXTREME COLD/WIND CHILL" "HIGH SURF"
## [19] "STRONG WIND" "BLIZZARD"
HEATF <- grep("HEAT", summarised_data2$EVTYPE1)
TORNADOF <- grep("TORNADO", summarised_data2$EVTYPE1)
WINDF <- grep("WIND", summarised_data2$EVTYPE1)
FLOODF <- grep("FLOOD", summarised_data2$EVTYPE1)
LIGHTENINGF <- grep("LIGHTNING", summarised_data2$EVTYPE1)
AVALANCHEF<- grep("AVALANCHE", summarised_data2$EVTYPE1)
STORMF <- grep("STORM", summarised_data2$EVTYPE1)
COLDF<- grep("COLD", summarised_data2$EVTYPE1)
CURRENTF <- grep("CURRENT", summarised_data2$EVTYPE1)
SNOWF <- grep("SNOW", summarised_data2$EVTYPE1)
SURFF <- grep("SURF", summarised_data2$EVTYPE1)
ICEF <- grep("ICE", summarised_data2$EVTYPE1)
#### Duplicated entries (i.e. events including two or more key words) are identified.
Foccurence<- data.frame(table(c(HEATF, TORNADOF, WINDF, FLOODF, LIGHTENINGF, AVALANCHEF, STORMF, COLDF, CURRENTF, SNOWF, SURFF, ICEF)))
DubF <- Foccurence %>% filter(Foccurence$Freq > 1) %>% mutate(TFatalities=summarised_data2$TFatalities[Var1]) %>% mutate(Dub_TFatalities=TFatalities*(Freq-1))
#### After removing duplicated entries, events selected using the above mentioned key words cover 91.4% of events.
(sum(summarised_data2$TFatalities[c(HEATF, TORNADOF, WINDF, FLOODF, LIGHTENINGF, AVALANCHEF, STORMF, COLDF, CURRENTF, SNOWF, SURFF, ICEF)])-sum(DubF$Dub_TFatalities))/sum(summarised_data2$TFatalities) ##### 91.4%
## [1] 0.9142038
#### As a result, all event types are grouped into these categories except that (1) TORNADO, WIND and SURF are to be combined into WIND; (2) AVALANCHE, SNOW, COLD and ICE are to be grouped into WINTER; (3) events not containing the above key words are grouped in OTHER. The final categories are therefore, HEAT, WIND, FLOOD, LIGHTENING, WINTER, STORM, CURRENT and OTHER. Duplicated entries will be counted into both categories.
EVTYPE_HEATF <- summarised_data2[HEATF,]
EVTYPE_WINDF <- summarised_data2[c(TORNADOF, WINDF, SURFF),]
EVTYPE_FLOODF <- summarised_data2[FLOODF,]
EVTYPE_LIGHTENINGF<- summarised_data2[LIGHTENINGF,]
EVTYPE_WINTERF <- summarised_data2[c(AVALANCHEF, SNOWF, COLDF, ICEF),]
EVTYPE_STORMF<- summarised_data2[STORMF,]
EVTYPE_CURRENTF<- summarised_data2[CURRENTF,]
EVTYPE_OTHERF <- summarised_data2[-c(HEATF, TORNADOF, WINDF, FLOODF, LIGHTENINGF, AVALANCHEF, STORMF, COLDF, CURRENTF, SNOWF, SURFF, ICEF),]
EVTYPE_HEATF1 <- mutate(EVTYPE_HEATF,Type="HEAT")
EVTYPE_WINDF1 <- mutate(EVTYPE_WINDF,Type="WIND")
EVTYPE_FLOODF1 <- mutate(EVTYPE_FLOODF,Type="FLOOD")
EVTYPE_LIGHTENINGF1 <- mutate(EVTYPE_LIGHTENINGF,Type="LIGHTENING")
EVTYPE_WINTERF1 <- mutate(EVTYPE_WINTERF,Type="WINTER")
EVTYPE_STORMF1 <- mutate(EVTYPE_STORMF,Type="STORM")
EVTYPE_CURRENTF1 <- mutate(EVTYPE_CURRENTF,Type="CURRENT")
EVTYPE_OTHERF1 <- mutate(EVTYPE_OTHERF, Type="OTHER")
#### The new combined data set has 180 rows, more than the 156 before, this is because certain events cross categories. For these cases, the number of fatalities will be counted into both catetories.
Fatalities_data<- as.data.frame(rbind(EVTYPE_CURRENTF1, EVTYPE_FLOODF1, EVTYPE_WINTERF1, EVTYPE_WINDF1, EVTYPE_OTHERF1, EVTYPE_HEATF1,EVTYPE_LIGHTENINGF1, EVTYPE_STORMF1))
nrow(Fatalities_data)
## [1] 180
Fatalities_summary<- as.data.frame(Fatalities_data %>% group_by(Type) %>% summarise(TFatalities1 = sum(TFatalities,na.rm=TRUE)))
#### Not all events have caused fatalities. Therefore, the events causing 0 fatalities are filtered out. Fatalities caused by the same event is then summed.
Injuries <- filter(relevantData2, INJURIES!=0)
summarised_data3<- as.data.frame(Injuries %>% group_by(EVTYPE) %>% summarise(TInjuries = sum(INJURIES,na.rm=TRUE)) %>% arrange(desc(TInjuries)))
#### There appears to be 158 types of events whereas there should only be 48 according to the Data Documentation.
nrow(summarised_data3)
## [1] 158
#### All lower case texts in EVTYPE is now converted to upper case in order reduce duplicated event type entries. After this step, the number of event type is reduced to 150.
summarised_data4<- as.data.frame(summarised_data3 %>% mutate(EVTYPE1 = toupper(EVTYPE)) %>% group_by(EVTYPE1) %>% summarise(TInjuries = sum(TInjuries,na.rm=TRUE)) %>% arrange(desc(TInjuries)))
nrow(summarised_data4)
## [1] 150
#### Looking at the top 20 events with highest fatalities. The following list of key words are identified: "HEAT, TORNADO, WIND, FLOOD, LIGHTENING, HURRICANE, STORM, FIRE, HAIL, FOG, BLIZZARD and SNOW". Events containing these key words are organised into subsets.
summarised_data4$EVTYPE1[1:20]
## [1] "TORNADO" "FLOOD" "EXCESSIVE HEAT"
## [4] "TSTM WIND" "LIGHTNING" "HEAT"
## [7] "ICE STORM" "FLASH FLOOD" "THUNDERSTORM WIND"
## [10] "WINTER STORM" "HURRICANE/TYPHOON" "HAIL"
## [13] "HIGH WIND" "HEAVY SNOW" "WILDFIRE"
## [16] "THUNDERSTORM WINDS" "BLIZZARD" "FOG"
## [19] "WILD/FOREST FIRE" "DUST STORM"
HEATI <- grep("HEAT", summarised_data4$EVTYPE1)
TORNADOI <- grep("TORNADO", summarised_data4$EVTYPE1)
WINDI <- grep("WIND", summarised_data4$EVTYPE1)
FLOODI <- grep("FLOOD", summarised_data4$EVTYPE1)
LIGHTENINGI <- grep("LIGHTNING", summarised_data4$EVTYPE1)
HURRICANEI<- grep("HURRICANE", summarised_data4$EVTYPE1)
STORMI <- grep("STORM", summarised_data4$EVTYPE1)
FIREI<- grep("FIRE", summarised_data4$EVTYPE1)
HAILI <- grep("HAIL", summarised_data4$EVTYPE1)
SNOWI <- grep("SNOW", summarised_data4$EVTYPE1)
BLIZZARDI <- grep("BLIZZARD", summarised_data4$EVTYPE1)
FOGI <- grep("FOG", summarised_data4$EVTYPE1)
#### Duplicated entries (i.e. events including two or more key words) are identified.
Ioccurence<- data.frame(table(c(HEATI, TORNADOI, WINDI, FLOODI, LIGHTENINGI, HURRICANEI, STORMI, FIREI, HAILI, FOGI, BLIZZARDI, SNOWI)))
DubI <- Ioccurence %>% filter(Ioccurence$Freq > 1) %>% mutate(TInjuries=summarised_data4$TInjuries[Var1]) %>% mutate(Dub_TInjuries=TInjuries*(Freq-1))
#### After removing duplicated entries, events selected using the above mentioned key words cover 95.4% of events.
(sum(summarised_data4$TInjuries[c(HEATI, TORNADOI, WINDI, FLOODI, LIGHTENINGI, HURRICANEI, STORMI, FIREI, HAILI, SNOWI, BLIZZARDI, FOGI)])- sum(DubI$Dub_TInjuries))/sum(summarised_data4$TInjuries) ##### 95.4%
## [1] 0.9538302
#### As a result, all event types are grouped into these categories except that (1) TORNADO and WIND are to be combined into WIND; (2)BLIZZARD HAIL and SNOW are grouped into WINTER; (3) HURRICANE and STORM are grouped into STORM. (4) The final categories are therefore, HEAT, WIND, FLOOD, LIGHTENING, WINTER, STORM , FOG and FIRE.
EVTYPE_HEATI <- summarised_data4[HEATI,]
EVTYPE_WINDI <- summarised_data4[c(TORNADOI, WINDI),]
EVTYPE_FLOODI <- summarised_data4[FLOODI,]
EVTYPE_LIGHTENINGI<- summarised_data4[LIGHTENINGI,]
EVTYPE_WINTERI <- summarised_data4[c(BLIZZARDI, SNOWI, HAILI),]
EVTYPE_STORMI<- summarised_data4[c(STORMI,HURRICANEI),]
EVTYPE_FIREI<- summarised_data4[FIREI,]
EVTYPE_FOGI<- summarised_data4[FOGI,]
EVTYPE_OTHERI <- summarised_data4[-c(HEATI, TORNADOI, WINDI, FLOODI, LIGHTENINGI, HURRICANEI, STORMI, FIREI, HAILI, SNOWI, BLIZZARDI, FOGI),]
EVTYPE_HEATI1 <- mutate(EVTYPE_HEATI,Type="HEAT")
EVTYPE_WINDI1 <- mutate(EVTYPE_WINDI,Type="WIND")
EVTYPE_FLOODI1 <- mutate(EVTYPE_FLOODI,Type="FLOOD")
EVTYPE_LIGHTENINGI1 <- mutate(EVTYPE_LIGHTENINGI,Type="LIGHTENING")
EVTYPE_WINTERI1 <- mutate(EVTYPE_WINTERI,Type="WINTER")
EVTYPE_STORMI1 <- mutate(EVTYPE_STORMI,Type="STORM")
EVTYPE_FIREI1 <- mutate(EVTYPE_FIREI,Type="FIRE")
EVTYPE_FOGI1 <- mutate(EVTYPE_FOGI,Type="FIRE")
EVTYPE_OTHERI1 <- mutate(EVTYPE_OTHERI, Type="OTHER")
#### The new combined data set has 167 rows, more than the 150 before, this is because certain events cross categories. For these cases, the number of injuries will be counted into both catetories.
Injuries_data<- as.data.frame(rbind(EVTYPE_FIREI1, EVTYPE_FLOODI1, EVTYPE_WINTERI1, EVTYPE_WINDI1, EVTYPE_OTHERI1, EVTYPE_HEATI1,EVTYPE_LIGHTENINGI1, EVTYPE_STORMI1, EVTYPE_FOGI1))
nrow(Injuries_data)
## [1] 167
Injuries_summary<- as.data.frame(Injuries_data %>% group_by(Type) %>% summarise(TInjuries1 = sum(TInjuries,na.rm=TRUE)))
#### To ensure economic impacts can be analysed, units across all observations for PROPDMGEXP and CROPDMGEXP should be consistent which they are not. For the purpose of the remaining analysis, only events recorded in, $1, B, M and K (including lower cases) are taken into account.
table(relevantData2$PROPDMGEXP)
##
## - ? + 0 1 2 3 4 5 6
## 355754 1 8 5 216 25 13 4 4 28 4
## 7 8 B h H K m M
## 5 1 40 1 6 396954 7 9074
table(relevantData2$CROPDMGEXP)
##
## ? 0 2 B k K m M
## 478266 7 19 1 9 21 281832 1 1994
#### Turning lower case units in to upper case.
relevantData2U<- as.data.frame(relevantData2 %>% mutate(PROPDMGEXPU = toupper(PROPDMGEXP), CROPDMGEXPU=toupper(CROPDMGEXP)))
#### The values in PROPDMGEXP are processed to align units of estimated damage value.
##### subsetting tables by units of PROPDMGEXP.
PROPk <- filter(relevantData2U, PROPDMGEXP == "K")
PROPm <- filter(relevantData2U, PROPDMGEXP == "M")
PROPb <- filter(relevantData2U, PROPDMGEXP == "B")
PROP <- filter(relevantData2U, PROPDMGEXP == "")
##### Times all PROPDMGEXP K entries with 1000 and M entries with 1000000 etc
PROPk1 <- PROPk %>% rename("OLD" = PROPDMG) %>% mutate(PROPDMG=OLD*1000) %>% select(-OLD)
PROPm1 <- PROPm %>% rename("OLD" = PROPDMG) %>% mutate(PROPDMG=OLD*1000000) %>% select(-OLD)
PROPb1 <- PROPb %>% rename("OLD" = PROPDMG) %>% mutate(PROPDMG=OLD*1000000000) %>% select(-OLD)
##### The subsets are mergered to form the full data set again with all values at consistent units.
relevantData3 <- rbind(PROPk1,PROPm1, PROPb1, PROP)
#### The values in CROPDMGEXP are processed to align units of estimated damage value.
##### subsetting tables by units of CROPDMGEXP.
CROPk <- filter(relevantData2U, CROPDMGEXP == "K")
CROPm <- filter(relevantData2U, CROPDMGEXP == "M")
CROPb <- filter(relevantData2U, CROPDMGEXP == "B")
CROP <- filter(relevantData2U, CROPDMGEXP == "")
##### Times all CROPDMGEXP K entries with 1000 and M entries with 1000000 etc
CROPk1 <- CROPk %>% rename("OLD" = CROPDMG) %>% mutate(CROPDMG=OLD*1000) %>% select(-OLD)
CROPm1 <- CROPm %>% rename("OLD" = CROPDMG) %>% mutate(CROPDMG=OLD*1000000) %>% select(-OLD)
CROPb1 <- CROPb %>% rename("OLD" = CROPDMG) %>% mutate(CROPDMG=OLD*1000000000) %>% select(-OLD)
##### The subsets are mergered to form the full data set again with all values at consistent units.
relevantData4 <- rbind(CROPk1,CROPm1, CROPb1, CROP)
#### For both property damage and crop damage, entries with zero value are removed, then damages relating to the same type of events are summed.
summarised_data5 <- as.data.frame(relevantData3 %>% filter(PROPDMG!=0) %>% mutate(EVTYPE1=toupper(EVTYPE)) %>% group_by(EVTYPE1) %>% summarise(PROPDMGT=sum(PROPDMG)))
summarised_data6<- as.data.frame(relevantData4 %>% filter(CROPDMG!=0) %>% mutate(EVTYPE1 = toupper(EVTYPE)) %>% group_by(EVTYPE1) %>% summarise(CROPDMGT=sum(CROPDMG)))
#### TOP 20 events are identified for each variable
summarised_data7<- arrange(summarised_data5, desc(PROPDMGT))
summarised_data8<- arrange(summarised_data6, desc(CROPDMGT))
PTop20<- summarised_data7$EVTYPE1[1:20]
CTop20<- summarised_data8$EVTYPE1[1:20]
#### For property damage:
##### Key events indicators for property damage are: STORM, FLOOD, WIND, HURRICANE, FIRE, FLASH, HAIL, ICE, SURGE, TORNADO.
PTop20_split <- strsplit(PTop20," ")
P_keyword_table<- as.data.frame(table(PTop20_split1<- unlist(PTop20_split)))
print.data.frame(arrange(P_keyword_table, desc(Freq)))
## Var1 Freq
## 1 STORM 5
## 2 FLOOD 3
## 3 WIND 3
## 4 HURRICANE 2
## 5 THUNDERSTORM 2
## 6 FIRE 1
## 7 FLASH 1
## 8 HAIL 1
## 9 HEAVY 1
## 10 HIGH 1
## 11 HURRICANE/TYPHOON 1
## 12 ICE 1
## 13 OPAL 1
## 14 RAIN/SEVERE 1
## 15 RIVER 1
## 16 SURGE 1
## 17 SURGE/TIDE 1
## 18 TORNADO 1
## 19 TROPICAL 1
## 20 TSTM 1
## 21 WEATHER 1
## 22 WILD/FOREST 1
## 23 WILDFIRE 1
## 24 WINDS 1
## 25 WINTER 1
P_keywords<- c("STORM", "FLOOD", "WIND", "HURRICANE", "FIRE", "FLASH", "HAIL", "ICE", "SURGE", "TORNADO")
##### Each data sets are grouped into subsets by keyword for property damage.
STORMP <- grep("STORM", summarised_data7$EVTYPE1)
FLOODP <- grep("FLOOD", summarised_data7$EVTYPE1)
WINDP <- grep("WIND", summarised_data7$EVTYPE1)
HURRICANEP <- grep("HURRICANE", summarised_data7$EVTYPE1)
FIREP <- grep("FIRE", summarised_data7$EVTYPE1)
FLASHP<- grep("FLASH", summarised_data7$EVTYPE1)
HAILP <- grep("HAIL", summarised_data7$EVTYPE1)
ICEP<- grep("ICE", summarised_data7$EVTYPE1)
SURGEP <- grep("SURGE", summarised_data7$EVTYPE1)
TORNADOP <- grep("TORNADO", summarised_data7$EVTYPE1)
##### Duplicated entries (i.e. events including two or more key words) are identified.
Poccurence<- data.frame(table(c(STORMP, FLOODP, WINDP, HURRICANEP, FIREP, FLASHP, HAILP, HAILP, ICEP, SURGEP, TORNADOP)))
DubP <- Poccurence %>% filter(Poccurence$Freq > 1) %>% mutate(PROPDMGT=summarised_data7$PROPDMGT[Var1]) %>% mutate(Dub_PROPDMGT=PROPDMGT*(Freq-1))
##### After removing duplicated entries, events selected using the above mentioned key words cover 97.9% of events.
(sum(summarised_data7$PROPDMGT[c(STORMP, FLOODP, WINDP, HURRICANEP, FIREP, FLASHP, HAILP, HAILP, ICEP, SURGEP, TORNADOP)])-sum(DubP$PROPDMGT))/sum(summarised_data7$PROPDMGT) ##### 97.9%
## [1] 0.9793332
##### As a result, all event types are grouped into these categories except STORM, HURRICANE, WIND AND TORNADO are grouped into WIND and Events not containing the above key words are grouped in OTHER. Therefore, the final categories are WIND, FLOOD, FIRE, FLASH, HAIL, ICE, RAIN and OTHER.
EVTYPE_WINDP <- summarised_data7[c(STORMP, HURRICANEP, WINDP, TORNADOP),]
EVTYPE_FLOODP <- summarised_data7[FLOODP,]
EVTYPE_FIREP <- summarised_data7[FIREP,]
EVTYPE_FLASHP<- summarised_data7[FLASHP,]
EVTYPE_HAILP <- summarised_data7[HAILP,]
EVTYPE_ICEP<- summarised_data7[ICEP,]
EVTYPE_SURGEP<- summarised_data7[SURGEP,]
EVTYPE_OTHERP <- summarised_data7[-c(STORMP, FLOODP, WINDP, HURRICANEP, FIREP, FLASHP, HAILP, HAILP, ICEP, SURGEP, TORNADOP),]
EVTYPE_WINDP1 <- mutate(EVTYPE_WINDP,Type="WIND")
EVTYPE_FLOODP1 <- mutate(EVTYPE_FLOODP,Type="FLOOD")
EVTYPE_FIREP1 <- mutate(EVTYPE_FIREP,Type="FIRE")
EVTYPE_FLASHP1 <- mutate(EVTYPE_FLASHP,Type="FLASH")
EVTYPE_HAILP1 <- mutate(EVTYPE_HAILP,Type="HAIL")
EVTYPE_ICEP1 <- mutate(EVTYPE_ICEP,Type="ICE")
EVTYPE_SURGEP1 <- mutate(EVTYPE_SURGEP,Type="SURGE")
EVTYPE_OTHERP1 <- mutate(EVTYPE_OTHERP, Type="OTHER")
##### The new combined data set has 462 rows, more than the 371 before, this is because certain events cross categories. For these cases, the value of damaged will be counted into all catetories.
PROPDMGT_data<- as.data.frame(rbind(EVTYPE_WINDP1, EVTYPE_FLOODP1, EVTYPE_FIREP1, EVTYPE_FLASHP1, EVTYPE_HAILP1, EVTYPE_ICEP1,EVTYPE_SURGEP1, EVTYPE_OTHERP1))
nrow(PROPDMGT_data)
## [1] 462
PROPDMG_summary<- PROPDMGT_data %>% group_by(Type) %>% summarise(PROPDMGT = sum(PROPDMGT,na.rm=TRUE))
#### For Crop damage:
##### Key events indicators for property damage are: FLOOD, WIND, HEAT, STORM, DROUGHT, FROST, HAIL, HURRICANE, ICE, RAIN, TORNADO.
CTop20_split <- strsplit(CTop20," ")
C_keywords_table<- as.data.frame(table(CTop20_split1<- unlist(CTop20_split)))
print.data.frame(arrange(C_keywords_table, desc(Freq)))
## Var1 Freq
## 1 FLOOD 3
## 2 WIND 3
## 3 FREEZE 2
## 4 HEAT 2
## 5 STORM 2
## 6 COLD 1
## 7 DAMAGING 1
## 8 DROUGHT 1
## 9 EXCESSIVE 1
## 10 EXTREME 1
## 11 FLASH 1
## 12 FROST/FREEZE 1
## 13 HAIL 1
## 14 HEAVY 1
## 15 HIGH 1
## 16 HURRICANE 1
## 17 HURRICANE/TYPHOON 1
## 18 ICE 1
## 19 RAIN 1
## 20 RIVER 1
## 21 THUNDERSTORM 1
## 22 TORNADO 1
## 23 TROPICAL 1
## 24 TSTM 1
C_keywords<- c("FLOOD", "WIND", "HEAT", "STORM", "DROUGHT", "FROST", "HAIL", "HURRICANE", "ICE", "RAIN", "TORNADO")
##### Each data sets are grouped into subsets by keyword for property damage.
FLOODC<- grep("FLOOD", summarised_data8$EVTYPE1)
WINDC <- grep("WIND", summarised_data8$EVTYPE1)
HEATC<- grep("HEAT", summarised_data8$EVTYPE1)
STORMC <- grep("STORM", summarised_data8$EVTYPE1)
DROUGHTC <- grep("DROUGHT", summarised_data8$EVTYPE1)
FROSTC <- grep("FROST", summarised_data8$EVTYPE1)
HAILC <- grep("HAIL", summarised_data8$EVTYPE1)
HURRICANEC <- grep("HURRICANE", summarised_data8$EVTYPE1)
ICEC<- grep("ICE", summarised_data8$EVTYPE1)
RAINC <- grep("RAIN", summarised_data8$EVTYPE1)
TORNADOC <- grep("TORNADO", summarised_data8$EVTYPE1)
##### Duplicated entries (i.e. events including two or more key words) are identified.
Coccurence<- data.frame(table(c(FLOODC, WINDC, HEATC, STORMC, DROUGHTC, FROSTC, HAILC, HURRICANEC, ICEC, RAINC, TORNADOC)))
DubC <- Coccurence %>% filter(Coccurence$Freq > 1) %>% mutate(CROPDMGT=summarised_data8$CROPDMGT[Var1]) %>% mutate(Dub_CROPDMGT=CROPDMGT*(Freq-1))
##### After removing duplicated entries, events selected using the above mentioned key words cover 92.6% of events.
(sum(summarised_data8$CROPDMGT[c(FLOODC, WINDC, HEATC, STORMC, DROUGHTC, FROSTC, HAILC, HURRICANEC, ICEC, RAINC, TORNADOC)])-sum(DubC$CROPDMGT))/sum(summarised_data8$CROPDMGT) ##### 92.6%
## [1] 0.9260617
##### As a result, all event types are grouped into these categories except STORM, HURRICANE, WIND AND TORNADO are grouped into WIND and Events not containing the above key words are grouped in OTHER. Therefore, the final categories are WIND, FLOOD, DROUGHT, FROST, HAIL, ICE, RAIN AND OTHER.
EVTYPE_WINDC <- summarised_data8[c(STORMC, HURRICANEC, WINDC, TORNADOC),]
EVTYPE_HEATC <- summarised_data8[HEATC,]
EVTYPE_FLOODC <- summarised_data8[FLOODC,]
EVTYPE_DROUGHTC <- summarised_data8[DROUGHTC,]
EVTYPE_FROSTC<- summarised_data8[FROSTC,]
EVTYPE_HAILC <- summarised_data8[HAILC,]
EVTYPE_ICEC<- summarised_data8[ICEC,]
EVTYPE_RAINC<- summarised_data8[RAINC,]
EVTYPE_OTHERC <- summarised_data8[-c(FLOODC, WINDC, STORMC, DROUGHTC, FROSTC, HAILC, HURRICANEC, ICEC, RAINC, TORNADOC),]
EVTYPE_WINDC1 <- mutate(EVTYPE_WINDC,Type="WIND")
EVTYPE_HEATC1 <- mutate(EVTYPE_HEATC,Type="HEAT")
EVTYPE_FLOODC1 <- mutate(EVTYPE_FLOODC,Type="FLOOD")
EVTYPE_DROUGHTC1 <- mutate(EVTYPE_DROUGHTC,Type="DROUGHT")
EVTYPE_FROSTC1 <- mutate(EVTYPE_FROSTC,Type="FROST")
EVTYPE_HAILC1 <- mutate(EVTYPE_HAILC,Type="HAIL")
EVTYPE_ICEC1 <- mutate(EVTYPE_ICEC,Type="ICE")
EVTYPE_RAINC1 <- mutate(EVTYPE_RAINC,Type="RAIN")
EVTYPE_OTHERC1 <- mutate(EVTYPE_OTHERC, Type="OTHER")
##### The new combined data set has 165 rows, more than the 131 before, this is because certain events cross categories. For these cases, the number of fatalities will be counted into all catetories.
CROPDMGT_data<- as.data.frame(rbind(EVTYPE_WINDC1, EVTYPE_FLOODC1, EVTYPE_DROUGHTC1, EVTYPE_FROSTC1, EVTYPE_HAILC1, EVTYPE_ICEC1,EVTYPE_RAINC1, EVTYPE_OTHERC1))
nrow(CROPDMGT_data)
## [1] 165
CROPDMG_summary<- CROPDMGT_data %>% group_by(Type) %>% summarise(CROPDMGT = sum(CROPDMGT,na.rm=TRUE))
#### Expressing economic damaged in the nearest billion
PROPDMG_summary_bn <- as.data.frame(PROPDMG_summary %>% mutate(PROPDMG_summary_bn=PROPDMGT/1000000000) %>% select(-PROPDMGT))
CROPDMG_summary_bn <- as.data.frame(CROPDMG_summary %>% mutate(CROPDMG_summary_bn=CROPDMGT/1000000000) %>% select(-CROPDMGT))
##### Fatalities - Fatalities_summary
str(Fatalities_summary)
## 'data.frame': 8 obs. of 2 variables:
## $ Type : chr "CURRENT" "FLOOD" "HEAT" "LIGHTENING" ...
## $ TFatalities1: num 577 1525 3138 817 810 ...
colnames(Fatalities_data)
## [1] "EVTYPE1" "TFatalities" "Type"
#####Injuries - Injuries_summary
##### Property damage - PROPDMG_summary_bn
##### Crop damage - CROPDMG_summary_bn
#### The impact on health
##### Fatalities
print.data.frame(arrange(Fatalities_summary, desc(TFatalities1)))
## Type TFatalities1
## 1 WIND 3299
## 2 HEAT 3138
## 3 FLOOD 1525
## 4 WINTER 946
## 5 LIGHTENING 817
## 6 OTHER 810
## 7 STORM 633
## 8 CURRENT 577
qplot(Type,TFatalities,data=Fatalities_data)
##### Injuries
print.data.frame(arrange(Injuries_summary,desc(TInjuries1)))
## Type TInjuries1
## 1 WIND 37950
## 2 HEAT 9224
## 3 FLOOD 8604
## 4 STORM 8020
## 5 LIGHTENING 5232
## 6 WINTER 3240
## 7 OTHER 3016
## 8 FIRE 2685
qplot(Type,TInjuries,data=Injuries_data)
##### Economic loss caused
print.data.frame(arrange(PROPDMG_summary_bn,desc(PROPDMG_summary_bn)))
## Type PROPDMG_summary_bn
## 1 WIND 209.391815
## 2 FLOOD 167.529741
## 3 SURGE 47.965224
## 4 HAIL 17.615091
## 5 FLASH 16.907928
## 6 FIRE 8.501629
## 7 OTHER 8.379430
## 8 ICE 3.974061
print.data.frame(arrange(CROPDMG_summary_bn,desc(CROPDMG_summary_bn)))
## Type CROPDMG_summary_bn
## 1 WIND 14.4889644
## 2 DROUGHT 13.9726218
## 3 FLOOD 12.3800991
## 4 ICE 5.0271143
## 5 OTHER 3.9629119
## 6 HAIL 3.1137959
## 7 FROST 1.2021860
## 8 RAIN 0.9193158
qplot(Type,PROPDMGT,data=PROPDMGT_data)