Before starting, load the data.
# The workspace is deliberately NOT wiped here: rm(list = ls()) deletes the
# caller's objects yet still does not give a clean session (attached packages
# and options survive), so it only creates surprises.
# Print the working directory so the relative data path used below can be
# checked against it.
getwd()
## [1] "E:/Dropbox/Git/Reproducible2"
# List the files next to the script; the compressed storm data must be here.
list.files()
## [1] "code.R" "markdown.html"
## [3] "markdown.Rmd" "repdata_data_StormData.csv.bz2"
## [5] "Reproducible2.Rproj"
# Read the bz2-compressed storm data CSV from the working directory.
data <- read.csv(file = "repdata_data_StormData.csv.bz2")
# Count the records for each numeric state code.
table(data[["STATE__"]])
##
## 1 2 4 5 6 8 9 10 11 12 13 15 16
## 22739 4390 6156 27102 10780 20473 3294 1913 450 22124 25259 2547 4767
## 17 18 19 20 21 22 23 24 25 26 27 28 29
## 28488 21506 31069 53441 22092 17323 4524 8173 5651 17911 23609 22192 35648
## 30 31 32 33 34 35 36 37 38 39 40 41 42
## 14695 30271 3139 3022 8074 7130 21058 25351 14630 24923 46802 4821 22226
## 44 45 46 47 48 49 50 51 53 54 55 56 60
## 839 17125 21728 21721 83728 4135 3871 21189 3312 9099 19781 7332 257
## 66 68 72 78 81 83 84 85 86 87 88 89 90
## 306 1 3016 338 274 1 28 5337 96 1879 3250 23 654
## 91 92 93 94 95
## 1347 262 9 70 1526
# List the available columns of the storm data.
names(data)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# Peek at the first few event-type labels and how often each occurs.
head(table(data[["EVTYPE"]]))
##
## HIGH SURF ADVISORY COASTAL FLOOD FLASH FLOOD
## 1 1 1
## LIGHTNING TSTM WIND TSTM WIND (G45)
## 1 4 1
The impact on population health is measured as the number of injuries plus fatalities per event.
# Casualties per storm record: injuries and fatalities added together.
# The data frame is built column by column so that the event name stays text
# and the casualty count stays numeric. Going through cbind() would coerce
# both columns to a character matrix (and would store factor codes instead of
# event names whenever read.csv() returns EVTYPE as a factor).
data1 <- data.frame(
  disaster = as.character(data$EVTYPE),
  effect = data$INJURIES + data$FATALITIES,
  stringsAsFactors = FALSE
)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Total casualties for every distinct event-type label.
disaster_group <- data1 %>% group_by(disaster)
summary1 <- summarise(disaster_group, sum = sum(effect))
# Distribution of the smallest totals (most labels have zero casualties).
head(table(summary1[["sum"]]))
##
## 0 1 2 3 4 5
## 765 55 30 10 11 9
# Keep only the event types that caused at least one casualty.
summary1 <- summary1[which(summary1$sum != 0), , drop = FALSE]
The event-type names contain many typos and spelling variants, so similar names have to be corrected and merged into groups.
Finding groups of similar strings
library(stringdist)
## Warning: package 'stringdist' was built under R version 4.0.5
# Pairwise string distances (stringdist method "jw") between the lower-cased
# event names, so that spelling and case variants end up close together.
lowered_names <- tolower(summary1$disaster)
dist.matrix <- stringdistmatrix(lowered_names, lowered_names, method = "jw")
# Label both rows and columns with the original names. names<-() must not be
# used for this: on a matrix it does not set column names but attaches a
# names vector of length n^2 that is padded with NA.
dimnames(dist.matrix) <- list(summary1$disaster, summary1$disaster)
dist.matrix <- as.dist(dist.matrix)
# Hierarchical clustering of the names; the dendrogram is used to pick the
# cut height.
clusts <- hclust(dist.matrix, method = "ward.D2")
plot(clusts)
A cluster cut height of 0.4 seems reasonable.
# Cut the dendrogram at height 0.4 and store each name's cluster id.
summary1[["groups"]] <- cutree(tree = clusts, h = 0.4)
# Return the most frequent value of x, used to pick one name per group.
# Ties are resolved in favour of the value that appears first in x.
Mode <- function(x) {
  distinct_values <- unique(x)
  # Position of every element of x within the distinct values, then counted.
  occurrences <- tabulate(match(x, distinct_values))
  distinct_values[which.max(occurrences)]
}
# Give every cluster one representative name. Each row of summary1 is already
# a distinct event name, so a most-frequent-value vote cannot discriminate
# (every name occurs exactly once) and would simply return the first row of
# the cluster, typos included. Use the member with the largest total instead.
# Inside mutate(), the bare `sum` is the column; sum(...) is still the function.
summary1 <- summary1 %>%
  group_by(groups) %>%
  mutate(disaster_type = disaster[which.max(sum)]) %>%
  ungroup()
# Collapse to one row per representative name. The intermediate column names
# (summary1.disaster_type, summary1.sum) come from data.frame() and are kept
# because the statements that follow refer to them.
summary1 <- data.frame(summary1$disaster_type, summary1$sum)
summary1 <- summary1 %>%
  group_by(summary1.disaster_type) %>%
  summarise(sum = sum(summary1.sum))
# Every representative name should now appear exactly once.
table(summary1$summary1.disaster_type)
##
## AVALANCE BLACK ICE
## 1 1
## BLIZZARD blowing snow
## 1 1
## COASTAL FLOOD Coastal Storm
## 1 1
## Cold Cold Temperature
## 1 1
## COLD/WIND CHILL DENSE FOG
## 1 1
## DROUGHT DROWNING
## 1 1
## DRY MICROBURST Dust Devil
## 1 1
## EXCESSIVE HEAT Extended Cold
## 1 1
## FLASH FLOOD FLOOD
## 1 1
## FOG FREEZE
## 1 1
## FUNNEL CLOUD GLAZE
## 1 1
## GUSTY WIND HAIL
## 1 1
## HAZARDOUS SURF HEAT
## 1 1
## HEAVY RAIN HEAVY SNOW
## 1 1
## HIGH HIGH WIND
## 1 1
## HURRICANE HYPERTHERMIA/EXPOSURE
## 1 1
## ICE ICE ON ROAD
## 1 1
## ICE STORM LANDSLIDE
## 1 1
## LIGHTNING Marine Accident
## 1 1
## MINOR FLOODING MIXED PRECIP
## 1 1
## NON-SEVERE WIND DAMAGE NON TSTM WIND
## 1 1
## RAIN/SNOW RAPIDLY RISING WATER
## 1 1
## RECORD COLD RIP CURRENT
## 1 1
## RIVER FLOOD ROGUE WAVE
## 1 1
## SLEET SMALL HAIL
## 1 1
## Snow SNOW SQUALL
## 1 1
## STORM SURGE THUNDERSNOW
## 1 1
## TORNADO TROPICAL STORM
## 1 1
## TSTM WIND TSUNAMI
## 1 1
## TYPHOON UNSEASONABLY COLD
## 1 1
## URBAN AND SMALL STREAM FLOODIN WATERSPOUT
## 1 1
## WILD FIRES WIND
## 1 1
## WINTER STORM
## 1
# Switch to short column names, then plot total casualties per event type.
summary1 <- setNames(summary1, c("disaster", "sum"))
barplot(summary1$sum ~ summary1$disaster)
It seems that one specific event type caused the most casualties:
# Show the event type(s) with the highest casualty total.
summary1[which(summary1$sum == max(summary1$sum)), ]
## # A tibble: 1 x 2
## disaster sum
## <chr> <dbl>
## 1 TORNADO 97026
Tornadoes caused the most casualties in the US.
Q2
Regarding economic consequences, a new subset of the data can be extracted. Economic damage was estimated using the sum of property and crop damage.
Other processes were analogous to the previous analysis.
# List the columns again to locate the damage-related fields.
names(data)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# Damage is the summation of property damage and crop damage.
# PROPDMG / CROPDMG hold only the leading figure; the magnitude is stored
# separately in PROPDMGEXP / CROPDMGEXP. Adding the raw figures would mix
# thousands, millions and billions on one scale, so each figure is scaled
# first.

# Translate a vector of magnitude codes into numeric multipliers.
# K, M and B mean thousands, millions and billions (case-insensitive).
# NOTE(review): "H" is treated as hundreds by convention, and any other code
# (blank, digits, "+", "-", "?") leaves the figure unscaled - confirm against
# the Storm Data documentation before relying on those rows.
damage_multiplier <- function(exp_code) {
  code <- toupper(trimws(as.character(exp_code)))
  multiplier <- unname(c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)[code])
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

# One row per storm record: event name and total damage.
# Built column by column so the name stays text and the damage stays numeric.
data2 <- data.frame(
  disaster = as.character(data$EVTYPE),
  damage = data$PROPDMG * damage_multiplier(data$PROPDMGEXP) +
    data$CROPDMG * damage_multiplier(data$CROPDMGEXP),
  stringsAsFactors = FALSE
)
disaster_group <- group_by(data2, disaster)
# summary1: total damage per event label (merged into clusters further down).
summary1 <- disaster_group %>% summarise(sum = sum(damage))
# summary2: total and per-record average damage per raw event label.
summary2 <- disaster_group %>%
  summarise(sum = sum(damage), average = mean(damage))
# Eliminate event types with zero total damage
summary1 <- subset(summary1, summary1$sum != 0)
# Finding groups of similar strings: pairwise string distances (stringdist
# method "jw") between the lower-cased event names.
lowered_names <- tolower(summary1$disaster)
dist.matrix <- stringdistmatrix(lowered_names, lowered_names, method = "jw")
# Label both rows and columns with the original names. names<-() must not be
# used for this: on a matrix it does not set column names but attaches a
# names vector of length n^2 that is padded with NA.
dimnames(dist.matrix) <- list(summary1$disaster, summary1$disaster)
dist.matrix <- as.dist(dist.matrix)
# Hierarchical clustering of the names; the dendrogram is used to pick the
# cut height.
clusts <- hclust(dist.matrix, method = "ward.D2")
plot(clusts)
Again, a cluster cut height of 0.4 seems reasonable, therefore
# Cut the dendrogram at height 0.4 and store each name's cluster id.
summary1[["groups"]] <- cutree(tree = clusts, h = 0.4)
# Return the most frequent value of x, used to pick one name per group.
# Ties are resolved in favour of the value that appears first in x.
Mode <- function(x) {
  distinct_values <- unique(x)
  # Position of every element of x within the distinct values, then counted.
  occurrences <- tabulate(match(x, distinct_values))
  distinct_values[which.max(occurrences)]
}
# Give every cluster one representative name. Each row of summary1 is already
# a distinct event name, so a most-frequent-value vote cannot discriminate
# (every name occurs exactly once) and would simply return the first row of
# the cluster, typos included. Use the member with the largest total instead.
# Inside mutate(), the bare `sum` is the column; sum(...) is still the function.
summary1 <- summary1 %>%
  group_by(groups) %>%
  mutate(disaster_type = disaster[which.max(sum)]) %>%
  ungroup()
# Collapse to one row per representative name. The intermediate column names
# (summary1.disaster_type, summary1.sum) come from data.frame() and are kept
# because the statements that follow refer to them.
summary1 <- data.frame(summary1$disaster_type, summary1$sum)
summary1 <- summary1 %>%
  group_by(summary1.disaster_type) %>%
  summarise(sum = sum(summary1.sum))
# Every representative name should now appear exactly once.
table(summary1$summary1.disaster_type)
##
## HIGH SURF ADVISORY FLASH FLOOD TSTM WIND
## 1 1 1
## ? AGRICULTURAL FREEZE APACHE COUNTY
## 1 1 1
## ASTRONOMICAL HIGH TIDE AVALANCHE Beach Erosion
## 1 1 1
## BLIZZARD BLOWING DUST BREAKUP FLOODING
## 1 1 1
## BRUSH FIRE COASTAL FLOODING/EROSION COASTAL EROSION
## 1 1 1
## Cold DAM BREAK DENSE FOG
## 1 1 1
## DOWNBURST DROUGHT DRY MICROBURST
## 1 1 1
## Dust Devil DUST STORM Erosion/Cstl Flood
## 1 1 1
## EXCESSIVE HEAT Extended Cold FLASH FLOOD/FLOOD
## 1 1 1
## FLOOD FLOOD/RAIN/WINDS Freeze
## 1 1 1
## Freezing drizzle FROST FUNNEL CLOUD
## 1 1 1
## Glaze GROUND BLIZZARD GUSTNADO
## 1 1 1
## GUSTY WIND HAIL HAIL DAMAGE
## 1 1 1
## HAIL/WIND HAILSTORM HEAT
## 1 1 1
## HEAVY LAKE SNOW HEAVY MIX HEAVY PRECIPITATION
## 1 1 1
## HEAVY RAIN AND FLOOD Heavy Rain/High Surf HEAVY SNOW-SQUALLS
## 1 1 1
## HEAVY SNOW/FREEZING RAIN Heavy Surf HIGH WINDS
## 1 1 1
## HIGH SEAS HURRICANE ICE
## 1 1 1
## ICE FLOES ICE/STRONG WINDS LAKE-EFFECT SNOW
## 1 1 1
## LAKE FLOOD LANDSLIDE LATE SEASON SNOW
## 1 1 1
## LIGHT FREEZING RAIN Light snow MAJOR FLOOD
## 1 1 1
## Marine Accident Microburst MUD SLIDE
## 1 1 1
## NON-SEVERE WIND DAMAGE Other RECORD COLD
## 1 1 1
## RIP CURRENT RIVER FLOOD RURAL FLOOD
## 1 1 1
## SEVERE THUNDERSTORM SMALL HAIL Snow
## 1 1 1
## SNOW AND HEAVY SNOW SNOW SQUALL SNOW/ BITTER COLD
## 1 1 1
## STORM FORCE WINDS THUDERSTORM WINDS TORNADO
## 1 1 1
## TROPICAL DEPRESSION Tstm Wind TSTM WIND/HAIL
## 1 1 1
## Unseasonable Cold URBAN AND SMALL WATERSPOUT
## 1 1 1
## WILD FIRES WINTER STORM WINTER WEATHER
## 1 1 1
# Switch to short column names, then plot total damage per event type.
summary1 <- setNames(summary1, c("disaster", "sum"))
barplot(summary1$sum ~ summary1$disaster)
# Show the event type(s) with the highest damage total.
summary1[which(summary1$sum == max(summary1$sum)), ]
## # A tibble: 1 x 2
## disaster sum
## <chr> <dbl>
## 1 TORNADO 3314563.
Therefore, tornadoes also caused the most economic damage in the US.
For the average damage per incident, a bar plot was drawn.
# Plot the per-record average damage for every raw event label.
barplot(summary2$average ~ summary2$disaster)
# Show the event label(s) with the highest average damage.
summary2[which(summary2$average == max(summary2$average)), ]
## # A tibble: 1 x 3
## disaster sum average
## <chr> <dbl> <dbl>
## 1 TROPICAL STORM GORDON 1000 1000
Therefore, on average per incident, the event type TROPICAL STORM GORDON exhibited the highest damage.