Health and Economic Impact of Severe Weather Events in the United States

Synopsis

Data processing

Get data

Download the bzip2-compressed CSV file from the URL below and read it into a data frame named “df”.

# Location of the compressed storm data set.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# Only fetch the archive when it is not already in the working directory, so
# re-running the analysis does not download it again.
if (!file.exists("storm.bz2")) {
  # mode = "wb" keeps the compressed archive byte-exact on every platform.
  download.file(url, "storm.bz2", method = "libcurl", mode = "wb")
}
# read.csv() reads the bzip2-compressed file directly; the first row holds
# the column names.
df <- read.csv("storm.bz2", header = TRUE)
library(ggplot2)

Process and clean data

Keep only the columns describing the damage, time, and location of each disaster.

# Number of rows and columns in the raw data.
c(nrow(df), ncol(df))

## [1] 902297     37

# Column names of the raw data.
colnames(df)

##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

# Preview the first six rows of the raw data.
head(df, n = 6L)

##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL
##    EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO         0                                               0
## 2 TORNADO         0                                               0
## 3 TORNADO         0                                               0
## 4 TORNADO         0                                               0
## 5 TORNADO         0                                               0
## 6 TORNADO         0                                               0
##   COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1         NA         0                      14.0   100 3   0          0
## 2         NA         0                       2.0   150 2   0          0
## 3         NA         0                       0.1   123 2   0          0
## 4         NA         0                       0.0   100 2   0          0
## 5         NA         0                       0.0   150 2   0          0
## 6         NA         0                       1.5   177 2   0          0
##   INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1       15    25.0          K       0                                    
## 2        0     2.5          K       0                                    
## 3        2    25.0          K       0                                    
## 4        2     2.5          K       0                                    
## 5        2     2.5          K       0                                    
## 6        6     2.5          K       0                                    
##   LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1     3040      8812       3051       8806              1
## 2     3042      8755          0          0              2
## 3     3340      8742          0          0              3
## 4     3458      8626          0          0              4
## 5     3412      8642          0          0              5
## 6     3450      8748          0          0              6

# Keep only the columns needed for the damage analysis: where and when the
# event happened, its type, the casualty and damage figures, and the remarks.
# Columns are selected by name so that a change in column order raises an
# error instead of silently picking the wrong fields.
keep_cols <- c(
  "STATE__", "BGN_DATE", "BGN_TIME", "TIME_ZONE", "COUNTY", "COUNTYNAME",
  "STATE", "EVTYPE",
  "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP",
  "REMARKS"
)
df2 <- df[, keep_cols]
dim(df2)

## [1] 902297     15

# Preview the first six rows of the reduced data.
head(df2, n = 6L)

##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL
##    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO          0       15    25.0          K       0           
## 2 TORNADO          0        0     2.5          K       0           
## 3 TORNADO          0        2    25.0          K       0           
## 4 TORNADO          0        2     2.5          K       0           
## 5 TORNADO          0        2     2.5          K       0           
## 6 TORNADO          0        6     2.5          K       0           
##   REMARKS
## 1        
## 2        
## 3        
## 4        
## 5        
## 6

# Show the type and a sample of values for every retained column.
utils::str(df2)

## 'data.frame':    902297 obs. of  15 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : Factor w/ 16335 levels "10/10/1954 0:00:00",..: 6523 6523 4213 11116 1426 1426 1462 2873 3980 3980 ...
##  $ BGN_TIME  : Factor w/ 3608 levels "000","0000","00:00:00 AM",..: 212 257 2645 1563 2524 3126 122 1563 3126 3126 ...
##  $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
##  $ STATE     : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ EVTYPE    : Factor w/ 985 levels "?","ABNORMALLY DRY",..: 830 830 830 830 830 830 830 830 830 830 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ REMARKS   : Factor w/ 436781 levels ""," ","  ","   ",..: 1 1 1 1 1 1 1 1 1 1 ...

Some STATE values are associated with two different STATE__ codes: AK, MA, MD, ND, NJ, OH, SC.

# Collect the distinct STATE__ codes recorded for each STATE abbreviation.
state <- tapply(df2$STATE__, df2$STATE, unique)
# Keep the abbreviations that carry exactly two distinct codes.
n_codes <- vapply(state, length, integer(1))
distate <- state[n_codes == 2]
format(distate)

##       AK       MA       MD       ND       NJ       OH       SC 
##  "2, 72" "25, 26" "24, 11" "38, 39" "34, 35" "39, 24" "45, 46"

The reason for these duplicate codes is unknown.

Results

1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

We examine two measures of the threat to public health: “FATALITIES” and “INJURIES”.

# Pick the top 10 disasters based on FATALITIES and on INJURIES.

# Total fatalities per event type, the ten largest totals, and the rows
# belonging to those event types.
f <- with(df2, tapply(FATALITIES, EVTYPE, sum))
# head() returns at most 10 entries and never pads with NA.
topf <- head(sort(f, decreasing = TRUE), 10)
df2f <- df2[df2$EVTYPE %in% names(topf), ]

# Same three steps for injuries.
i <- with(df2, tapply(INJURIES, EVTYPE, sum))
topi <- head(sort(i, decreasing = TRUE), 10)
df2i <- df2[df2$EVTYPE %in% names(topi), ]

# Draw the two rankings side by side. The wide bottom margin leaves room for
# the vertical event-type labels; each panel is titled so the two plots can
# be told apart. The previous graphics settings are restored afterwards.
old_par <- par(mfrow = c(1, 2), mar = c(8, 4, 2, 2))
barplot(topf, las = 2, cex.names = 0.7, main = "Fatalities")
barplot(topi, las = 2, cex.names = 0.7, main = "Injuries")
par(old_par)

The top 10 regarding FATALITIES:
5633, 1903, 978, 937, 816, 504, 470, 368, 248, 224

The top 10 regarding INJURIES:
$9.1346 \times 10^{4}$, 6957, 6789, 6525, 5230, 2100, 1975, 1777, 1488, 1361

Considering both INJURIES and FATALITIES, take the intersection of the two lists and plot the total number of people killed or injured across the country.

# Event types that appear in both the fatalities and the injuries top-10 lists.
toph <- intersect(names(topf), names(topi))
df2h <- df2[df2$EVTYPE %in% toph, ]
# One bar per event type; each row contributes its fatalities plus injuries,
# so the bar height is the total number of people killed or injured.
# ggplot() is used directly because qplot() is deprecated in current ggplot2.
ggplot(df2h, aes(x = EVTYPE, weight = FATALITIES + INJURIES)) +
  geom_bar() +
  ylab("total fatalities and injuries") +
  # Tilt the event-type labels so that long names do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

So the top 7 disasters threatening public health are those shown in the figure above.

2. Across the United States, which types of events have the greatest economic consequences?

Compute the dollar amount of both property damage and crop damage by combining each damage figure with its exponent code (“EXP”). Two new columns are added for property and two for crops.

# Multipliers for the magnitude codes K (thousand), M (million) and
# B (billion).
exp_codes <- c(K = 1e3, M = 1e6, B = 1e9)

# Translate a vector of exponent codes into numeric multipliers.
# Matching is case-insensitive, so "k" and "m" count like "K" and "M".
# Blank and unrecognised codes (digits, "?", "+", "-", ...) get 0 rather than
# NA: an NA here would propagate through the per-event sums below and make
# the whole event type disappear from the rankings.
exp_to_multiplier <- function(code) {
  mult <- unname(exp_codes[toupper(as.character(code))])
  mult[is.na(mult)] <- 0
  mult
}

df2$PROPDMGEXP2 <- exp_to_multiplier(df2$PROPDMGEXP)
df2$CROPDMGEXP2 <- exp_to_multiplier(df2$CROPDMGEXP)
# Damage in dollars = reported figure times its multiplier.
df2$prop <- df2$PROPDMG * df2$PROPDMGEXP2
df2$crop <- df2$CROPDMG * df2$CROPDMGEXP2
str(df2$prop)

##  num [1:902297] 25000 2500 25000 2500 2500 2500 2500 2500 25000 25000 ...

# Type and leading values of the computed crop damage.
utils::str(df2[["crop"]])

##  num [1:902297] 0 0 0 0 0 0 0 0 0 0 ...

Find the top 10 disasters with the highest property damage and the highest crop damage, respectively.

# Total property damage per event type and the ten largest totals.
# na.rm = TRUE keeps one missing damage value from turning an event type's
# whole total into NA, which sort() would then silently drop.
tprop <- tapply(df2$prop, df2$EVTYPE, sum, na.rm = TRUE)
# head() returns at most 10 entries and never pads with NA.
topp <- head(sort(tprop, decreasing = TRUE), 10)
# Same for crop damage.
tcrop <- tapply(df2$crop, df2$EVTYPE, sum, na.rm = TRUE)
topc <- head(sort(tcrop, decreasing = TRUE), 10)

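The top 10 on property damage are: $6.930584 \times 10^{10}$, $4.3323536 \times 10^{10}$, $1.1868319 \times 10^{10}$, $7.7038905 \times 10^{9}$, $5.1189455 \times 10^{9}$, $4.765114 \times 10^{9}$, $4.641188 \times 10^{9}$, $3.0018295 \times 10^{9}$, $2.5 \times 10^{9}$, $1.6 \times 10^{9}$.

The top 10 on crop damage are: $5.6619684 \times 10^{9}$, $5.029459 \times 10^{9}$, $5.0221135 \times 10^{9}$, $2.74191 \times 10^{9}$, $2.6078728 \times 10^{9}$, $1.4213171 \times 10^{9}$, $1.292973 \times 10^{9}$, $1.094086 \times 10^{9}$, $7.333998 \times 10^{8}$, $6.78346 \times 10^{8}$.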

Plot the 3 disasters common to both top-10 lists, using their combined property and crop damage.

# Rows for the event types found in both the property and the crop top-10
# lists.
df2pc <- df2[df2$EVTYPE %in% intersect(names(topp), names(topc)), ]
# One bar per event type; each row contributes its property plus crop damage.
# ggplot() is used directly because qplot() is deprecated in current ggplot2.
ggplot(df2pc, aes(x = EVTYPE, weight = prop + crop)) +
  geom_bar() +
  ylab("total property and crop damage")

Conclusion

Natural disasters are highly detrimental to our lives and our property. According to the analysis, there is no overlap between the worst disasters threatening our lives and those damaging our crops and property. Area-specific susceptibility should therefore be taken into consideration.

RepData_assign2

li sun

06/09/2015