Download the csv.bz2 file from the source URL and read it into a data frame named “df”.
# Location of the bzip2-compressed storm data CSV.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# Fetch the archive only when no local copy exists, so re-running the script
# does not download it again. mode = "wb" writes the compressed file as binary.
if (!file.exists("storm.bz2")) {
  download.file(url, "storm.bz2", method = "libcurl", mode = "wb")
}
# Read the compressed CSV directly; the first row holds the column names.
df <- read.csv("storm.bz2", header = TRUE)
library(ggplot2)
Extract only the columns describing the damage, time, and location of each disaster.
# Report the number of rows and columns in the raw data frame.
dim(df)
## [1] 902297 37
# List the column names of the raw data frame.
names(df)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# Show the first rows of the raw data frame.
head(df)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 0
## 2 TORNADO 0 0
## 3 TORNADO 0 0
## 4 TORNADO 0 0
## 5 TORNADO 0 0
## 6 TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## 4 NA 0 0.0 100 2 0 0
## 5 NA 0 0.0 150 2 0 0
## 6 NA 0 1.5 177 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0
## 2 0 2.5 K 0
## 3 2 25.0 K 0
## 4 2 2.5 K 0
## 5 2 2.5 K 0
## 6 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
## 4 3458 8626 0 0 4
## 5 3412 8642 0 0 5
## 6 3450 8748 0 0 6
# Keep only the columns describing when and where each event happened, its
# type, the casualty and damage figures, and the free-text remarks.
# Columns are selected by name rather than by position so the subset does not
# silently change if the column order of the input ever differs.
keep_cols <- c(
  "STATE__", "BGN_DATE", "BGN_TIME", "TIME_ZONE", "COUNTY", "COUNTYNAME",
  "STATE", "EVTYPE",
  "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP",
  "REMARKS"
)
df2 <- df[, keep_cols]
dim(df2)
## [1] 902297 15
head(df2)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 0 15 25.0 K 0
## 2 TORNADO 0 0 2.5 K 0
## 3 TORNADO 0 2 25.0 K 0
## 4 TORNADO 0 2 2.5 K 0
## 5 TORNADO 0 2 2.5 K 0
## 6 TORNADO 0 6 2.5 K 0
## REMARKS
## 1
## 2
## 3
## 4
## 5
## 6
# Show the type and leading values of each retained column.
str(df2)
## 'data.frame': 902297 obs. of 15 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "10/10/1954 0:00:00",..: 6523 6523 4213 11116 1426 1426 1462 2873 3980 3980 ...
## $ BGN_TIME : Factor w/ 3608 levels "000","0000","00:00:00 AM",..: 212 257 2645 1563 2524 3126 122 1563 3126 3126 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels "?","ABNORMALLY DRY",..: 830 830 830 830 830 830 830 830 830 830 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REMARKS : Factor w/ 436781 levels ""," "," "," ",..: 1 1 1 1 1 1 1 1 1 1 ...
Some STATE abbreviations map to two different STATE__ codes: AK, MA, MD, ND, NJ, OH, SC
# Collect the distinct STATE__ codes recorded under each STATE abbreviation.
state <- tapply(df2$STATE__, df2$STATE, unique)
# Keep only the abbreviations that carry exactly two distinct codes.
distate <- state[vapply(state, length, integer(1)) == 2]
format(distate)
## AK MA MD ND NJ OH SC
## "2, 72" "25, 26" "24, 11" "38, 39" "34, 35" "39, 24" "45, 46"
The reason for these duplicate codes is unknown.
We examine two measures of the threat to public health: “FATALITIES” and “INJURIES”.
# Rank event types by total FATALITIES and by total INJURIES, keep the ten
# largest totals of each, and keep the rows belonging to those event types.
f <- tapply(df2$FATALITIES, df2$EVTYPE, sum)
topf <- sort(f, decreasing = TRUE)[seq_len(10)]
df2f <- subset(df2, EVTYPE %in% names(topf))
i <- tapply(df2$INJURIES, df2$EVTYPE, sum)
topi <- sort(i, decreasing = TRUE)[seq_len(10)]
df2i <- subset(df2, EVTYPE %in% names(topi))
# Two panels side by side; the enlarged bottom margin leaves room for the
# event-type labels, which las = 2 draws perpendicular to the axis.
par(mfrow = c(1, 2), mar = c(8, 4, 2, 2))
barplot(topf, cex.names = 0.7, las = 2)
barplot(topi, cex.names = 0.7, las = 2)
The top 10 totals regarding FATALITIES:
5633, 1903, 978, 937, 816, 504, 470, 368, 248, 224.
The top 10 totals regarding INJURIES:
$9.1346 \times 10^{4}$, 6957, 6789, 6525, 5230, 2100, 1975, 1777, 1488, 1361.
Considering both INJURIES and FATALITIES, take the intersection of the two lists and plot the total number of people killed or injured across the country.
# Event types that rank in the top ten for both fatalities and injuries.
toph <- intersect(names(topf), names(topi))
df2h <- df2[df2$EVTYPE %in% toph, ]
# One bar per event type; its height is the summed FATALITIES + INJURIES.
# ggplot() + geom_bar() replaces qplot(), which ggplot2 has deprecated.
ggplot(df2h, aes(x = EVTYPE, weight = FATALITIES + INJURIES)) +
  geom_bar()
So the top 7 disasters threatening public health are shown in the figure above.
Compute the amount of property damage and crop damage by combining each damage value with its exponent code, adding two new columns each for property and crop.
# Translate a damage exponent code into a numeric multiplier.
#   code: vector of exponent codes (factor or character).
# Returns a numeric vector of the same length: 1e3 for K, 1e6 for M, 1e9 for B,
# matched case-insensitively so that lower-case "k"/"m" are not lost.
# A blank code keeps the multiplier 0 this script has always used. Every other
# unrecognised code also gets 0 rather than NA: a single NA would turn the
# per-event sum() into NA, and sort() would then silently drop that event type
# from the damage ranking.
# NOTE(review): codes other than K/M/B (symbols, digits, "H") are treated as
# "no usable magnitude" here - confirm against the data documentation.
exp_to_multiplier <- function(code) {
  known <- c(K = 1e3, M = 1e6, B = 1e9)
  multiplier <- unname(known[toupper(as.character(code))])
  multiplier[is.na(multiplier)] <- 0
  multiplier
}

df2$PROPDMGEXP2 <- exp_to_multiplier(df2$PROPDMGEXP)
df2$CROPDMGEXP2 <- exp_to_multiplier(df2$CROPDMGEXP)
# Damage value times its multiplier gives the damage amount per event.
df2$prop <- df2$PROPDMG * df2$PROPDMGEXP2
df2$crop <- df2$CROPDMG * df2$CROPDMGEXP2
str(df2$prop)
## num [1:902297] 25000 2500 25000 2500 2500 2500 2500 2500 25000 25000 ...
str(df2$crop)
## num [1:902297] 0 0 0 0 0 0 0 0 0 0 ...
Find the top 10 disasters with the highest property damage and crop damage, respectively.
# Total property damage per event type. na.rm = TRUE keeps an event type in
# the ranking even if some of its rows have a missing damage amount; without
# it the total becomes NA and sort() silently discards that event type.
tprop <- tapply(df2$prop, df2$EVTYPE, sum, na.rm = TRUE)
# head() returns at most ten entries and never pads the result with NA.
topp <- head(sort(tprop, decreasing = TRUE), 10)
# Same ranking for crop damage.
tcrop <- tapply(df2$crop, df2$EVTYPE, sum, na.rm = TRUE)
topc <- head(sort(tcrop, decreasing = TRUE), 10)
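The top 10 totals for property damage are: $6.930584 \times 10^{10}$, $4.3323536 \times 10^{10}$, $1.1868319 \times 10^{10}$, $7.7038905 \times 10^{9}$, $5.1189455 \times 10^{9}$, $4.765114 \times 10^{9}$, $4.641188 \times 10^{9}$, $3.0018295 \times 10^{9}$, $2.5 \times 10^{9}$, $1.6 \times 10^{9}$. The top 10 totals for crop damage are: $5.6619684 \times 10^{9}$, $5.029459 \times 10^{9}$, $5.0221135 \times 10^{9}$, $2.74191 \times 10^{9}$, $2.6078728 \times 10^{9}$, $1.4213171 \times 10^{9}$, $1.292973 \times 10^{9}$, $1.094086 \times 10^{9}$, $7.333998 \times 10^{8}$, $6.78346 \times 10^{8}$.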
Plot the combined property and crop damage for the top 3 disasters.
# Rows for the event types that rank in the top ten for both property damage
# and crop damage.
df2pc <- df2[df2$EVTYPE %in% intersect(names(topp), names(topc)), ]
# One bar per event type; its height is the summed property + crop damage.
# rowSums(..., na.rm = TRUE) treats a missing amount as 0 so that one NA row
# cannot blank out a whole bar.
# ggplot() + geom_bar() replaces qplot(), which ggplot2 has deprecated.
ggplot(
  df2pc,
  aes(x = EVTYPE, weight = rowSums(cbind(prop, crop), na.rm = TRUE))
) +
  geom_bar() +
  ylab("total property and crop damage")
Natural disasters are highly detrimental to our lives and our property. According to the analysis, there is no overlap between the worst disasters threatening our lives and those damaging our crops and property. So area-specific susceptibility should be taken into consideration.