SYNOPSIS OF THE ANALYSIS: This study investigates severe weather events in the US based on time-series data from NOAA. The analysis indicates that tornadoes have the greatest effect on human health, whereas floods are the most expensive from an economic point of view.
Loading the libraries required.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
LOADING AND PREPROCESSING OF DATA
# Read the raw storm-event CSV (path is relative to the working directory)
# and print a compact overview of its columns and their types.
stormDataRaw <- read.csv(file = "repdata_data_StormData.csv")
str(object = stormDataRaw)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
There are a total of 902297 observations with 37 variables in the data.
Only the variables relevant to the analysis are kept.
The data contains many observations that carry no information relevant to the analysis in question, so those are dropped.
# Keep only the columns the analysis uses: event date and type, the damage
# amounts with their exponent codes, and the fatality and injury counts.
stormData <- stormDataRaw %>%
  select(
    BGN_DATE, EVTYPE,
    PROPDMG, PROPDMGEXP,
    CROPDMG, CROPDMGEXP,
    FATALITIES, INJURIES
  )
Formatting the BGN_DATE variable as date.
# Parse the month/day/year part of BGN_DATE into a Date (the trailing
# "0:00:00" clock text is ignored by this format), then derive the
# calendar year from the parsed date.
stormData <- mutate(
  stormData,
  BGN_DATE = as.Date(BGN_DATE, format = "%m/%d/%Y"),
  YEAR = year(BGN_DATE)
)
Only observations from 1996 onward are kept (the filter below applies to all event types, not only tornadoes).
# Keep events from 1996 onward, then drop rows that report no property
# damage, no crop damage, no fatalities and no injuries.
stormData <- stormData %>%
  filter(YEAR >= 1996) %>%
  filter(PROPDMG > 0 | CROPDMG > 0 | FATALITIES > 0 | INJURIES > 0)
Here we keep only the observations with a health impact or economic damage.
The variables PROPDMG and CROPDMG come with separate exponent columns (PROPDMGEXP and CROPDMGEXP), which must be converted to numeric multiplication factors.
# Frequency of each property-damage exponent code left after filtering.
table(stormData[["PROPDMGEXP"]])
##
##             B      K      M
##   8448     32 185474   7364
# Frequency of each crop-damage exponent code left after filtering.
table(stormData[["CROPDMGEXP"]])
##
##             B      K      M
## 102767      2  96787   1762
Both exponents are converted to upper case so that lower- and upper-case codes with the same meaning are treated alike.
# Normalise the exponent codes to upper case so that lower- and upper-case
# spellings of the same code share one factor.
stormData$PROPDMGEXP <- toupper(stormData$PROPDMGEXP)
stormData$CROPDMGEXP <- toupper(stormData$CROPDMGEXP)

# Translate each exponent code into a numeric multiplier with one lookup
# table per damage type. match() on parallel vectors is used instead of a
# named vector because the empty string "" cannot be looked up by name.
# Codes that are not listed yield NA.
cropExpCodes <- c("", "?", "0", "2", "K", "M", "B")
cropExpFactors <- c(10^0, 10^0, 10^0, 10^2, 10^3, 10^6, 10^9)
stormData$CROPDMGFACTOR <-
  cropExpFactors[match(stormData$CROPDMGEXP, cropExpCodes)]

# Symbol codes map to 1, digit codes "0".."8" map to 10^digit, and the letter
# codes H/K/M/B map to hundred/thousand/million/billion.
# NOTE(review): the earlier version listed "_" but not "-"; "_" looks like a
# typo for "-". Both are mapped to 1 here so that a "-" code no longer
# produces an NA factor (and hence an NA economic cost) - confirm against the
# raw data.
propExpCodes <- c(
  "", "_", "-", "?", "+",
  as.character(0:8),
  "H", "K", "M", "B"
)
propExpFactors <- c(
  rep(10^0, 5),
  10^(0:8),
  10^2, 10^3, 10^6, 10^9
)
stormData$PROPDMGFACTOR <-
  propExpFactors[match(stormData$PROPDMGEXP, propExpCodes)]
The distinction between fatalities and injuries is not significant for this analysis, so they are merged into one variable.
Similarly, crop and property damages are merged into one variable after multiplication by their corresponding factors.
# Derive the two outcome measures used in the results:
#   HEALTHIMP    - fatalities plus injuries
#   ECONOMICCOST - property plus crop damage, each scaled by its factor
stormData <- stormData %>%
  mutate(
    HEALTHIMP = FATALITIES + INJURIES,
    ECONOMICCOST = PROPDMG * PROPDMGFACTOR + CROPDMG * CROPDMGFACTOR
  )
The event types also require closer examination.
# Upper-case the event types so spellings that differ only in case collapse,
# then report the dimensions of the frequency table (rows = distinct types).
stormData$EVTYPE <- toupper(stormData$EVTYPE)
dim(as.data.frame(table(stormData$EVTYPE)))
## [1] 186 2
After converting variable EVTYPE to upper case there are still 186 different types of events listed.
But according to NOAA there should be only 48 events thus there are duplicates.
# List the distinct event types and show those containing "THUND" as an
# example of several labels describing closely related events.
evtypeUnique <- unique(stormData$EVTYPE)
grep("THUND", evtypeUnique, value = TRUE)
## [1] "THUNDERSTORM" "THUNDERSTORM WIND (G40)"
## [3] "THUNDERSTORM WIND" "MARINE THUNDERSTORM WIND"
Cleaning the entire data set for duplicates is tedious and unnecessary, so we only clean the event types that matter most for the HEALTHIMP variable studied here.
# Sum the health impact per event type, then show the event types whose
# total lies above the 95% quantile of those totals.
healthimpact <- aggregate(HEALTHIMP ~ EVTYPE, data = stormData, FUN = sum)
subset(healthimpact, HEALTHIMP > quantile(HEALTHIMP, probs = 0.95))
## EVTYPE HEALTHIMP
## 39 EXCESSIVE HEAT 8188
## 46 FLASH FLOOD 2561
## 48 FLOOD 7172
## 69 HEAT 1459
## 88 HURRICANE/TYPHOON 1339
## 107 LIGHTNING 4792
## 146 THUNDERSTORM WIND 1530
## 149 TORNADO 22178
## 153 TSTM WIND 3870
## 182 WINTER STORM 1483
Among the event types above the 95% quantile, only 2 are renamed below: TSTM WIND and HURRICANE/TYPHOON.
# Fold "TSTM WIND" into "THUNDERSTORM WIND" and relabel "HURRICANE/TYPHOON"
# as "HURRICANE(TYPHOON)".
stormData$EVTYPE <- replace(
  stormData$EVTYPE,
  stormData$EVTYPE == "TSTM WIND",
  "THUNDERSTORM WIND"
)
stormData$EVTYPE <- replace(
  stormData$EVTYPE,
  stormData$EVTYPE == "HURRICANE/TYPHOON",
  "HURRICANE(TYPHOON)"
)
The same procedure is applied to the ECONOMICCOST variable, i.e. the event types above the 95% quantile are cleaned.
# Sum the economic cost per event type, then show the event types whose
# total lies above the 95% quantile of those totals.
economicCost <- aggregate(ECONOMICCOST ~ EVTYPE, data = stormData, FUN = sum)
subset(economicCost, ECONOMICCOST > quantile(ECONOMICCOST, probs = 0.95))
## EVTYPE ECONOMICCOST
## 32 DROUGHT 14413667000
## 46 FLASH FLOOD 16557105610
## 48 FLOOD 148919611950
## 66 HAIL 17071172870
## 86 HURRICANE 14554229010
## 88 HURRICANE(TYPHOON) 71913712800
## 141 STORM SURGE 43193541000
## 146 THUNDERSTORM WIND 8812957230
## 149 TORNADO 24900370720
## 152 TROPICAL STORM 8320186550
Again, 2 event types are renamed.
# Fold "HURRICANE" into "HURRICANE(TYPHOON)" and relabel "STORM SURGE" as
# "STORM SURGE/TIDE".
stormData$EVTYPE <- replace(
  stormData$EVTYPE,
  stormData$EVTYPE == "HURRICANE",
  "HURRICANE(TYPHOON)"
)
stormData$EVTYPE <- replace(
  stormData$EVTYPE,
  stormData$EVTYPE == "STORM SURGE",
  "STORM SURGE/TIDE"
)
RESULTS
The cleaned-up data stormData is aggregated per EVTYPE and sorted in descending order into a new data frame, healthImpact.
# Total the combined fatalities and injuries per event type, largest first.
# `.groups = "drop"` states the ungrouped result explicitly, which also
# silences summarise()'s ungrouping message.
healthImpact <- stormData %>%
  group_by(EVTYPE) %>%
  summarise(HEALTHIMP = sum(HEALTHIMP), .groups = "drop") %>%
  arrange(desc(HEALTHIMP))

# Bar chart of the ten event types with the largest totals. head() is used
# rather than [1:10, ] so that fewer than ten event types would not add
# all-NA rows to the plot data.
g1 <- ggplot(
  head(healthImpact, 10),
  aes(x = reorder(EVTYPE, -HEALTHIMP), y = HEALTHIMP, color = EVTYPE)
) +
  geom_col(fill = "white") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.position = "none"
  ) +
  xlab("Event") +
  ylab("Number of fatalities and injuries") +
  ggtitle("Fatalities and injuries by severe weather")
g1
This shows that tornadoes cause the most fatalities and injuries combined.
The same analysis and plotting is done for the ECONOMICCOST variable.
# Total the economic cost per event type, largest first.
# `.groups = "drop"` states the ungrouped result explicitly, which also
# silences summarise()'s ungrouping message.
economicCost <- stormData %>%
  group_by(EVTYPE) %>%
  summarise(ECONOMICCOST = sum(ECONOMICCOST), .groups = "drop") %>%
  arrange(desc(ECONOMICCOST))

# Bar chart of the ten costliest event types. head() is used rather than
# [1:10, ] so that fewer than ten event types would not add all-NA rows to
# the plot data.
g1 <- ggplot(
  head(economicCost, 10),
  aes(x = reorder(EVTYPE, -ECONOMICCOST), y = ECONOMICCOST, color = EVTYPE)
) +
  geom_col(fill = "white") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.position = "none"
  ) +
  xlab("Event") +
  ylab("Economic cost in USD") +
  ggtitle("Economic cost in US caused by severe weather")
g1
This shows that floods caused the greatest economic loss.