This analysis of the public health and economic impact of extreme weather events in the USA looks for the answers to two questions. First, which types of events are most harmful with respect to population health? Second, which types of events have the greatest economic consequences? Based on an extensive analysis of the health and economic impacts, the analysis shows that FLOOD has the highest negative impact on the economy and TORNADO causes the most harm to population health. The top 10 extreme weather events by damage to population health and to the economy are plotted in detail at the end of the analysis.
The data set comes from the Coursera website as a bzip2-compressed CSV file. It is downloaded and saved in the project directory, which is then set as the working directory.
# Switch to the author's project directory only when it exists on this
# machine; otherwise keep the current working directory so the report can be
# run from any folder that already contains the data file.
project_dir <- "C:/Users/DOJOHN LOYD/Documents/Storm_Peer_Ass2"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
Loading the required libraries
require(lubridate)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
require(ggplot2)
## Loading required package: ggplot2
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Read the original csv.bz2 file using read.csv function.
# Read the compressed storm data set, skipping the slow read when an
# untouched copy is already in the workspace. The original guard tested the
# misspelled name "strom_data" and therefore never skipped anything.
# Later steps overwrite storm_data in place (BGN_DATE is parsed into
# date-times), so a copy whose BGN_DATE is no longer character is stale and
# is read again.
if (!exists("storm_data") || !is.character(storm_data$BGN_DATE)) {
  storm_data <- read.csv(
    "repdata%2Fdata%2FStormData.csv.bz2",
    stringsAsFactors = FALSE,
    sep = ","
  )
}
# Number of rows and columns that were read.
dim(storm_data)
## [1] 902297 37
Examine the storm data
# Print the structure of the raw data: column names, types and first values.
str(storm_data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
Parse the date from character into POSIXct form.
# Convert BGN_DATE from month/day/year hour:minute:second text into
# date-times with lubridate.
storm_data[["BGN_DATE"]] <- mdy_hms(storm_data[["BGN_DATE"]])

# Histogram of the number of recorded events per year.
hist(
  year(storm_data[["BGN_DATE"]]),
  breaks = 30,
  ylim = c(0, 130000),
  main = "No. of Disasters Every year",
  xlab = "years"
)
In the above plot, the number of events recorded from 1950 to 1980 is small compared with the period 1990-2011. To avoid a skewed analysis, only the observations from 1990 to 2011 are considered.
# Keep only the events that began in 1990 or later.
storm_data <- storm_data %>%
  filter(year(BGN_DATE) >= 1990)
# Rows and columns remaining after the filter.
dim(storm_data)
## [1] 751740 37
Reduce the data set further by keeping only the columns that are required for this analysis and omitting the rest. This is crucial for a large data set.
# Keep only the columns needed for the health and economic analysis.
storm_data_small <- storm_data %>%
  select(
    BGN_DATE, EVTYPE,
    FATALITIES, INJURIES,
    PROPDMG, PROPDMGEXP,
    CROPDMG, CROPDMGEXP
  )
The damage exponent codes are: B (billion), h or H (hundred), K (kilo, i.e. thousand), m or M (million).
# Translate damage-exponent symbols into powers of ten:
#   h/H = hundred (2), k/K = thousand (3), m/M = million (6), B = billion (9).
# Any other symbol (blank, "?", digits, ...) decodes to 0, so the damage figure
# is used as recorded. One shared lookup replaces the two separate recodings,
# which covered different symbol sets and, for property damage, relied on an
# as.numeric() coercion warning to zero out unknown symbols.
#
# symbol:  character vector of exponent codes
# returns: numeric vector of exponents, same length as `symbol`
decode_exponent <- function(symbol) {
  powers <- c(h = 2, H = 2, k = 3, K = 3, m = 6, M = 6, B = 9)
  exponent <- unname(powers[match(symbol, names(powers))])
  exponent[is.na(exponent)] <- 0
  exponent
}

# Property damage: numeric exponent, then total damage in dollars.
storm_data_small$PROPDMGEXP <- decode_exponent(storm_data_small$PROPDMGEXP)
property_damage <- storm_data_small$PROPDMG * 10^storm_data_small$PROPDMGEXP

# Crop damage: numeric exponent, then total damage in dollars.
storm_data_small$CROPDMGEXP <- decode_exponent(storm_data_small$CROPDMGEXP)
crop_damage <- storm_data_small$CROPDMG * 10^storm_data_small$CROPDMGEXP

# Final data: event type, health counts and the two dollar totals.
final_data <- storm_data_small %>%
  mutate(property_damage = property_damage, crop_damage = crop_damage) %>%
  select(EVTYPE, INJURIES, FATALITIES, property_damage, crop_damage)
Aggregate injuries, fatalities, property damage and crop damage by event type.
# Sum fatalities, injuries, property damage and crop damage per event type.
aggregate_data <- final_data %>%
  group_by(EVTYPE) %>%
  summarise(
    fatality = sum(FATALITIES),
    injury = sum(INJURIES),
    prop_dam = sum(property_damage),
    crop_dam = sum(crop_damage)
  )
Population health consists of both fatalities and injuries.
# Fatalities per event type in long form (EVTYPE, value, effect),
# largest total first.
fatality_data <- aggregate_data %>%
  select(EVTYPE, value = fatality) %>%
  mutate(effect = as.factor(c("fatality"))) %>%
  arrange(desc(value))

# Injuries per event type in the same long form, largest total first.
injury_data <- aggregate_data %>%
  select(EVTYPE, value = injury) %>%
  mutate(effect = as.factor(c("injury"))) %>%
  arrange(desc(value))
Total population health cost due to extreme weather events.
# Add injuries (value.x) and fatalities (value.y) per event type and rank the
# event types by the combined count.
total_health <- merge(injury_data, fatality_data, by = "EVTYPE") %>%
  mutate(total_value = value.x + value.y) %>%
  select(EVTYPE, total_value) %>%
  arrange(desc(total_value))
# Show the highest-ranked event types.
head(total_health)
## EVTYPE total_value
## 1 TORNADO 28426
## 2 EXCESSIVE HEAT 8428
## 3 FLOOD 7259
## 4 LIGHTNING 6046
## 5 TSTM WIND 5349
## 6 HEAT 3037
From the above table we can conclude that TORNADO causes the most harm to population health.
Economic damage consists of both property and crop damage.
# Crop damage per event type in long form (EVTYPE, value, effect),
# largest total first.
crop_data <- aggregate_data %>%
  select(EVTYPE, value = crop_dam) %>%
  mutate(effect = as.factor(c("crop damage"))) %>%
  arrange(desc(value))

# Property damage per event type in the same long form, largest total first.
property_data <- aggregate_data %>%
  select(EVTYPE, value = prop_dam) %>%
  mutate(effect = as.factor(c("property damage"))) %>%
  arrange(desc(value))
Total economic damages due to extreme weather events.
# Add crop damage (value.x) and property damage (value.y) per event type and
# rank the event types by the combined dollar amount.
total_economy <- merge(crop_data, property_data, by = "EVTYPE") %>%
  mutate(total_value = value.x + value.y) %>%
  select(EVTYPE, total_value) %>%
  arrange(desc(total_value))
# Show the highest-ranked event types.
head(total_economy)
## EVTYPE total_value
## 1 FLOOD 150319678257
## 2 HURRICANE/TYPHOON 71913712800
## 3 STORM SURGE 43323541000
## 4 TORNADO 30873468879
## 5 HAIL 18758222016
## 6 FLASH FLOOD 17562129167
From the above table we can conclude that FLOOD damages the economy most.
# Draw a horizontal bar chart of the first ten rows of `data`.
# Bars are ordered by value (largest at the top after coord_flip) instead of
# alphabetically by event type, so the chart reads as a ranking.
#
# data:    data frame with EVTYPE and value columns; the callers below pass
#          tables already sorted by descending value, so the first ten rows
#          are the top ten
# title:   plot title
# y_label: label of the value axis
# divisor: value is divided by this before plotting (1e+06 plots millions)
plot_top10 <- function(data, title, y_label, divisor = 1) {
  top10 <- head(data, 10)
  ggplot(
    data = top10,
    aes(x = reorder(EVTYPE, value), y = value / divisor, fill = EVTYPE)
  ) +
    geom_bar(stat = "identity") +
    coord_flip() +
    labs(title = title) +
    xlab("event type") +
    ylab(y_label)
}

plot_top10(fatality_data, "Top 10 Total fatalities", "count")
plot_top10(injury_data, "Top 10 Total injuries", "count")
plot_top10(crop_data, "Top 10 Total crop damages", "million in $", 1e+06)
plot_top10(property_data, "Top 10 Total Property Damages", "million $", 1e+06)