Synopsis

Based on NOAA (the National Oceanic & Atmospheric Administration) Storm Database, this report is aimed to find out the specified weather events, which are most harmful to ourhealth or have the largest influence on economy during 1950 to 2011. There are 3 parts in this report. The 1st part deals with data processing, involving with rearrange the label of “EVTYPE” ‘CROPDMGEXP’and ’PROPDMGEXP’. Then the 2cd part, I reconstructed 2 summary tables. One calculated the total damage on human body and ecomomy by each type of weather events, and the another calculated the average damage in the past 60 years in U.S. Finally, I have visualized all the results of my data anaysis in part 3 and made a brief conclusion.

Part 1. Data Processing

library(ggplot2);library(stats);library(knitr);library(base);
library(dplyr);library(reshape2);library(gridExtra);library(lubridate)

1.1 Load and explore the dataset

# Loading the raw data 
dat=read.csv('/Users/apple/Desktop/Cousera/Data Science/7. Reproducibale Research/StormData.csv.bz2')
# Explore the raw dataset.  
t=data.frame(Obs=dim(dat)[1], Features=dim(dat)[2], States=length(levels(dat$STATE)),
               EVTYPE=length(levels(dat$EVTYPE)),TimeZone=length(levels(dat$TIME_ZONE)),
               UFatal=max(dat$FATALITIES),UInjur=max(dat$INJURIES),
               UProperty=paste( max(dat$PROPDMG[which(dat$PROPDMGEXP=='B')]) ,"Billion"),
               UCrop=paste( max(dat$CROPDMG[which(dat$CROPDMGEXP=='B')]) ,"Billion")   )
rownames(t)=c('Total'); kable(t)
Obs Features States EVTYPE TimeZone UFatal UInjur UProperty UCrop
Total 902297 37 72 985 22 583 1700 115 Billion 5 Billion
# The prefix "U" means the maximum value.
dat$Year= as.factor( year( as.Date(dat$BGN_DATE,format='%m/%d/%Y') ) )
Freq=data.frame(table(dat$Year))    ;       names(Freq)=c('Year','freq')
dat=arrange(dat,Year)               ;      Y=merge(dat,Freq,by = 'Year')
g=ggplot(Y,aes(Year,fill=as.factor(freq),alpha=0.5))+geom_histogram()
g=g+theme(legend.position='none',axis.text.x=element_text(size=6,angle=50))
g=g+xlab('60 years through 1950 ~ 2011')+ylab('Number of Record')   ; g

1.2 Subsetting the dataset according to research goals.
Our goals are to figure out the specified weather events have largest harm on either human or economic health. Thus we should tailor raw data, with surplus information, to fit our research need. The follwing columns are chosen for further analysis: BGN_DATE, EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP.

var=c("EVTYPE","FATALITIES","INJURIES","PROPDMG","PROPDMGEXP","CROPDMG","CROPDMGEXP")
storm=dat[ , var]

1.3 Modifying the ‘storm’ dataset

table(storm$PROPDMGEXP)           #  -  ?  +  0  1  2  3  4  5  6  7  8  B  H  h  K  m  M
table(storm$CROPDMGEXP)           #  ?  0  2  B  k  K  m  M
storm$PROPDMGEXP=as.character(storm$PROPDMGEXP);storm$CROPDMGEXP=as.character(storm$CROPDMGEXP)
storm$PROPDMGEXP[which(storm$PROPDMGEXP %in% c('h','H'))]='2' # h/H is Hundred.
storm$PROPDMGEXP[which(storm$PROPDMGEXP %in% c('k','K'))]='3'
storm$CROPDMGEXP[which(storm$CROPDMGEXP %in% c('k','K'))]='3' # k/K is Thousand.
storm$PROPDMGEXP[which(storm$PROPDMGEXP %in% c('m','M'))]='6'
storm$CROPDMGEXP[which(storm$CROPDMGEXP %in% c('m','M'))]='6' # m/M is Million.
storm$PROPDMGEXP[which(storm$PROPDMGEXP %in% c('b','B'))]='9'
storm$CROPDMGEXP[which(storm$CROPDMGEXP %in% c('b','B'))]='9' # b/B is Billion.
storm$PROPDMGEXP[which(storm$PROPDMGEXP %in% c('-','+','','?'))]=''
storm$CROPDMGEXP[which(storm$CROPDMGEXP %in% c('-','+','','?'))]=''# missing/fault value. 
# now, the value for PROPDMGEXP and CROPDMGEXP is normal and tidy.
storm$EVTYPE=tolower(storm$EVTYPE)
storm$EVTYPE[which(grepl('^hail*',storm$EVTYPE))]='Hail'
storm$EVTYPE[which(grepl(('^wint(er*|*ry)'),storm$EVTYPE))]='winter'# winter storm/weather.
storm$EVTYPE[which(grepl(('(^tornado*)|(*wall.*cloud*)|(*torndao*)'),storm$EVTYPE))]='Tornado'
storm$EVTYPE[which(grepl('(*drought*)|(*dry*)',storm$EVTYPE))]='Drought'
storm$EVTYPE[which(grepl('*tide*',storm$EVTYPE))]='Tide'
storm$EVTYPE[which(grepl('(*fog*)|(*vog*)',storm$EVTYPE))]='Fog'
storm$EVTYPE[which(grepl(('(*snow)|(*blizzard*)|(*avalan*)'),storm$EVTYPE))]='Snow Disaster'
storm$EVTYPE[which(grepl(('(*water*sprout*)|(*water.*spout*)|(*wayter*spout*)')
                                                              ,storm$EVTYPE))]='Watersprout'
storm$EVTYPE[which(grepl('(*wild.*fire*)|(*brush*fire*)|(*grass|fire*)|(*forest*fire*)'
                                                                 ,storm$EVTYPE))]='Wildfire'
storm$EVTYPE[which(grepl('(*cold*)|(*chill*)|(*low.*temp*)|(*hypothermia*)|
                                            (*unseason.*cool*)',storm$EVTYPE))]='Cold/Chill'
storm$EVTYPE[which(grepl('(*flood*)|(*fld*)|(*high.*water*)|(*turbulence*)|
                                          (*excess.*precipitation*)',storm$EVTYPE))]='Flood'
storm$EVTYPE[which(grepl('*volcan*',storm$EVTYPE))]='Volcanic Ash'
storm$EVTYPE[which(grepl('(*thunder*)|(*tstm*)|(*gustnado*)',storm$EVTYPE))]='Thunderstorm'
storm$EVTYPE[which(grepl('(*wind*)|(*wnd*)',storm$EVTYPE))]='Wind Disaster'
storm$EVTYPE[which(grepl('(*surf*)|(*beach.*(erosion|erosin)*)|(*(high|rogue)*wave*)|
                                                      (*high*water*)',storm$EVTYPE))]='Surf'
storm$EVTYPE[which(grepl('*rip*',storm$EVTYPE))]='Rip Current'
storm$EVTYPE[which(grepl('*funnel*',storm$EVTYPE))]='Funnel Cloud'
storm$EVTYPE[which(grepl('(*frost*)|(*ice*)|(*freez*)|(*glaze*)',storm$EVTYPE))]='FrostFreezeIce'
storm$EVTYPE[which(grepl('(*rain*)|(*urban.*small*)|(*small.*stream*)',storm$EVTYPE))]='Rain'
storm$EVTYPE[which(grepl('(*light*)|(*ligntning*)',storm$EVTYPE))]='Lightening'
storm$EVTYPE[which(grepl('(*slide*)|(*land*(slump|spout))',storm$EVTYPE))]='Land/Rock Slide'
storm$EVTYPE[which(grepl('(*typhoon*)|(*coastal.*(storm|erosion))',storm$EVTYPE))]='Typhoon'
storm$EVTYPE[which(grepl('*tsunami*',storm$EVTYPE))]='Tsunami'
storm$EVTYPE[which(grepl('(*heat*)|(*hot*)|(*(unusual|unseasonably)*warm*)|(*high.*temper*)'
                                                                     ,storm$EVTYPE))]='Heat'
storm$EVTYPE[which(grepl('(*dust*)',storm$EVTYPE))]='Dust Devil'
storm$EVTYPE[which(grepl('(*wet*)|(*high*water*)',storm$EVTYPE))]='Wet'
storm$EVTYPE[which(grepl('(*smoke*)',storm$EVTYPE))]='Dense Smoke'
storm$EVTYPE[which(grepl('(*sleet*)',storm$EVTYPE))]='Sleet'
storm$EVTYPE[which(grepl('(*hail*)',storm$EVTYPE))]='Hail'
storm$EVTYPE[which(grepl('(^none*)|(^\\?)',storm$EVTYPE))]='Unknown'        
storm$EVTYPE[which(grepl('^[a-z]',storm$EVTYPE))]='Unknown'
kable(data.frame(table(storm$EVTYPE)))    # The frequency table for each type of weather Events.
Var1 Freq
Cold/Chill 2728
Dense Smoke 21
Drought 2847
Dust Devil 581
Flood 86106
Fog 3373
FrostFreezeIce 4765
Funnel Cloud 6980
Hail 288838
Heat 2929
Land/Rock Slide 650
Lightening 15764
Rain 11866
Rip Current 1520
Sleet 71
Snow Disaster 20822
Surf 1415
Thunderstorm 336821
Tide 459
Tornado 60699
Tsunami 20
Unknown 331
Watersprout 3862
Wet 41
Wildfire 2782
Wind Disaster 46006

Part 2 Data Transform

2.1 Total damages through all 60 years

storm=arrange(storm,EVTYPE)
storm$totPROP=0    ; storm$totCROP=0 
storm$totPROP[which(is.na(storm$PROPDMG)==T | is.na(storm$PROPDMGEXP)==T)]=0
storm$totCROP[which(is.na(storm$CROPDMG)==T | is.na(storm$CROPDMGEXP)==T)]=0
storm$totPROP[which(is.na(storm$PROPDMG)==F)]=storm$PROPDMG*(10^as.numeric(storm$PROPDMGEXP))
storm$totCROP[which(is.na(storm$CROPDMG)==F)]=storm$CROPDMG*(10^as.numeric(storm$CROPDMGEXP))
T1=aggregate(FATALITIES~EVTYPE,storm,sum)             ;names(T1)=c('EVTYPE','Fatal')
T2=aggregate(INJURIES~EVTYPE,storm,sum)             
T3=aggregate(totPROP~EVTYPE,storm,sum)              
T4=aggregate(totCROP~EVTYPE,storm,sum)             
T5=data.frame(T1$Fatal+T2$INJURIES)                   ;names(T5)=c("Bodyharm")              
T6=data.frame(T3$totPROP+T4$totCROP)                  ;names(T6)=c("Ecoharm")      
tot=data.frame(T1,Injur=T2$INJURIES,Property=T3$totPROP,Crop=T4$totCROP,Bodyharm=T5,
                                                           Ecoharm=T6,Index='Total')
kable(tot)
EVTYPE Fatal Injur Property Crop Bodyharm Ecoharm Index
Cold/Chill 467 324 246334450 1433765550 791 1680100000 Total
Dense Smoke 0 0 100000 0 0 100000 Total
Drought 41 63 1053183600 13972636780 104 15025820380 Total
Dust Devil 24 483 6267630 3100000 507 9367630 Total
Flood 1556 8682 168264617238 12388577200 10238 180653194438 Total
Fog 93 1622 3032561000 107308830 1715 3139869830 Total
FrostFreezeIce 185 2808 11704815610 7685251300 2993 19390066910 Total
Funnel Cloud 0 3 194600 0 3 194600 Total
Hail 15 1371 15977539956 3046887620 1386 19024427576 Total
Heat 2973 8856 612532200 899698500 11829 1512230700 Total
Land/Rock Slide 44 55 327403100 20017000 99 347420100 Total
Lightening 817 5231 930419426 12092090 6048 942511516 Total
Rain 100 280 3216931740 804652800 380 4021584540 Total
Rip Current 714 1864 84656397010 5534112800 2578 90190509810 Total
Sleet 2 0 0 0 2 0 Total
Snow Disaster 495 2141 1696605490 246743100 2636 1943348590 Total
Surf 355 665 43449571050 7055000 1020 43456626050 Total
Thunderstorm 730 9544 11184947523 1271680530 10274 12456628053 Total
Tide 11 6 4650946150 850000 17 4651796150 Total
Tornado 5658 91364 58552152314 417461470 97022 58969613784 Total
Tsunami 33 129 144062000 20000 162 144082000 Total
Unknown 15 44 1873000 1034400 59 2907400 Total
Watersprout 6 72 60730700 0 78 60730700 Total
Wet 1 0 5000 147000000 1 147005000 Total
Wildfire 78 1063 5494779000 295972800 1141 5790751800 Total
Wind Disaster 732 3858 12959898321 808274400 4590 13768172721 Total

3.2 Average damages through all the 60 years.

A1=aggregate(FATALITIES~EVTYPE,storm,mean)            ;names(A1)=c('EVTYPE','Fatal')
A2=aggregate(INJURIES~EVTYPE,storm,mean)              ;names(A2)=c('EVTYPE','Injur')
A3=aggregate(totPROP~EVTYPE,storm,mean)               ;names(A3)=c('EVTYPE','Property')
A4=aggregate(totCROP~EVTYPE,storm,mean)               ;names(A4)=c('EVTYPE','Crop')
A5=A1$Fatal+A2$Injur                                  ;names(A5)=c('Bodyharm')
A6=A3$Property+A4$Crop                                ;names(A6)=c('Ecoharm')
avg=data.frame(A1,Injur=A2$Injur,Property=A3$Property,Crop=A4$Crop,Bodyharm=A5,
                                                           Ecoharm=A6,Index='Average')
avg[,c(2:7)]=round(avg[,c(2:7)],2)
kable((avg))
EVTYPE Fatal Injur Property Crop Bodyharm Ecoharm Index
Cold/Chill 0.17 0.12 173719.64 1032972.30 0.29 1206691.94 Average
Dense Smoke 0.00 0.00 11111.11 0.00 0.00 11111.11 Average
Drought 0.01 0.02 710650.20 9096768.74 0.04 9807418.94 Average
Dust Devil 0.04 0.83 18219.85 12204.72 0.87 30424.58 Average
Flood 0.02 0.10 3187493.93 341461.85 0.12 3528955.78 Average
Fog 0.03 0.48 2205498.91 106562.89 0.51 2312061.80 Average
FrostFreezeIce 0.04 0.59 3772096.55 3080261.04 0.63 6852357.60 Average
Funnel Cloud 0.00 0.00 80.25 0.00 0.00 80.25 Average
Hail 0.00 0.00 173644.38 37010.03 0.00 210654.40 Average
Heat 1.02 3.02 443863.91 665950.04 4.04 1109813.95 Average
Land/Rock Slide 0.07 0.08 770360.24 62553.12 0.15 832913.36 Average
Lightening 0.05 0.33 84637.44 2868.14 0.38 87505.59 Average
Rain 0.01 0.02 554261.15 151278.96 0.03 705540.12 Average
Rip Current 0.47 1.23 119402534.57 9238919.53 1.70 128641454.10 Average
Sleet 0.03 0.00 0.00 0.00 0.03 0.00 Average
Snow Disaster 0.02 0.10 169254.34 28933.29 0.13 198187.63 Average
Surf 0.25 0.47 66948491.60 16877.99 0.72 66965369.59 Average
Thunderstorm 0.00 0.03 68523.88 13397.25 0.03 81921.13 Average
Tide 0.02 0.01 14051196.83 2733.12 0.04 14053929.95 Average
Tornado 0.09 1.51 1129020.89 43476.51 1.60 1172497.40 Average
Tsunami 1.65 6.45 7582210.53 1052.63 8.10 7583263.16 Average
Unknown 0.05 0.13 69370.37 34480.00 0.18 103850.37 Average
Watersprout 0.00 0.02 54127.18 0.00 0.02 54127.18 Average
Wet 0.02 0.00 5000.00 73500000.00 0.02 73505000.00 Average
Wildfire 0.03 0.38 2763973.34 164612.24 0.41 2928585.58 Average
Wind Disaster 0.02 0.08 395396.11 29231.29 0.10 424627.40 Average

Part 3 Date Visualization

3.1 Visualized the total damages and average damages through over 60 years.

summary=rbind(tot,avg)
s1=melt(summary,id.vars = c('EVTYPE','Property','Crop' ,'Bodyharm','Ecoharm','Index')) 
s2=melt(summary,id.vars = c('EVTYPE',  'Fatal' ,'Injur','Bodyharm','Ecoharm','Index'))
g1=ggplot(s1,aes(x=EVTYPE,y=value,color=variable,alpha=0.6))
g1=g1+geom_errorbar(aes(ymin=0,ymax=value),width=0.7,lwd=1.6)
g1=g1+facet_grid(Index~variable,scales='free')+theme(axis.text.x=element_text(angle=90))
g1=g1+ggtitle("Bodyharm from Weather Events")+xlab('Weather Events')+ylab('Harm');g1

g2=ggplot(s2,aes(x=EVTYPE,y=value,color=variable,alpha=0.6))
g2=g2+geom_errorbar(aes(ymin=0,ymax=value), width=0.7,lwd=1.6)
g2=g2+facet_grid(Index~variable,scales='free')+theme(axis.text.x=element_text(angle=90))
g2=g2+ggtitle("Ecoharm from Weather Events")+xlab('Weather Events')+ylab('Harm');g2

3.2 The highest harm in each weather events.

c1=head( arrange(tot,desc(Fatal))[,c(1,2)],3 )    ; c1=format(c1,scientific=T,digit=3)
c2=head( arrange(tot,desc(Injur))[,c(1,3)],3 )    ; c2=format(c2,scientific=T,digit=3)
c3=head( arrange(tot,desc(Property))[,c(1,4)],3 ) ; c3=format(c3,scientific=T,digit=3)
c4=head( arrange(tot,desc(Crop))[,c(1,5)],3 )     ; c4=format(c4,scientific=T,digit=3)
c5=head( arrange(tot,desc(Bodyharm))[,c(1,6)],3 ) ; c5=format(c5,scientific=T,digit=3)
c6=head( arrange(tot,desc(Ecoharm))[,c(1,7)],3 )  ; c6=format(c6,scientific=T,digit=3)

The table of Top 3 list of total damage through over 60 years.

kable(data.frame(c1,c2,c3,c4,c5,c6))  
EVTYPE Fatal EVTYPE.1 Injur EVTYPE.2 Property EVTYPE.3 Crop EVTYPE.4 Bodyharm EVTYPE.5 Ecoharm
Tornado 5.66e+03 Tornado 9.14e+04 Flood 1.68e+11 Drought 1.40e+10 Tornado 9.70e+04 Flood 1.81e+11
Heat 2.97e+03 Thunderstorm 9.54e+03 Rip Current 8.47e+10 Flood 1.24e+10 Heat 1.18e+04 Rip Current 9.02e+10
Flood 1.56e+03 Heat 8.86e+03 Tornado 5.86e+10 FrostFreezeIce 7.69e+09 Thunderstorm 1.03e+04 Tornado 5.90e+10
d1=head( arrange(avg,desc(Fatal))[,c(1,2)],3 )    ; d1=format(d1,scientific=T,digit=3)
d2=head( arrange(avg,desc(Injur))[,c(1,3)],3 )    ; d2=format(d2,scientific=T,digit=3)
d3=head( arrange(avg,desc(Property))[,c(1,4)],3 ) ; d3=format(d3,scientific=T,digit=3)
d4=head( arrange(avg,desc(Crop))[,c(1,5)],3 )     ; d4=format(d4,scientific=T,digit=3)
d5=head( arrange(avg,desc(Bodyharm))[,c(1,6)],3 ) ; d5=format(d5,scientific=T,digit=3)
d6=head( arrange(avg,desc(Ecoharm))[,c(1,7)],3 )  ; d6=format(d6,scientific=T,digit=3)

The table of Top 3 list of average damage through over 60 years.

kable(data.frame(d1,d2,d3,d4,d5,d6))
EVTYPE Fatal EVTYPE.1 Injur EVTYPE.2 Property EVTYPE.3 Crop EVTYPE.4 Bodyharm EVTYPE.5 Ecoharm
Tsunami 1.65e+00 Tsunami 6.45e+00 Rip Current 1.19e+08 Wet 7.35e+07 Tsunami 8.10e+00 Rip Current 1.29e+08
Heat 1.02e+00 Heat 3.02e+00 Surf 6.69e+07 Rip Current 9.24e+06 Heat 4.04e+00 Wet 7.35e+07
Rip Current 4.70e-01 Tornado 1.51e+00 Tide 1.41e+07 Drought 9.10e+06 Rip Current 1.70e+00 Surf 6.70e+07

3.3 Conclusion

In terms of the harm on human body:
- Tornadoes has caused the most deaths and the most injuries in the past 60 years.
- Although thunderstrom has caused the second most injuries, but it led to fewer dealths.
- Heat,with high temperatures and relative humidity, has still caused highest dealth and injuries.

In terms of the harm on economic cost:
- Floods has cost the most damage on property and the second most on crop.
- Floods has cost the most damage , considering the property and crop together.
- Drought has cost the most damage on crop, but near 0 on property.