First, we download the data (if it is not already present locally) and read it in.
# Remote location of the compressed storm data and the local path it is cached under.
file.url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
file.name <- "repdata%2Fdata%2FStormData.csv.bz2"
# Download only when no local copy exists, so re-running the analysis
# does not fetch the archive again.
if (!file.exists(file.name)) {
  # mode = "wb" requests a binary transfer so the bzip2 archive is written
  # byte-for-byte regardless of platform.
  download.file(file.url, file.name, mode = "wb")
}
# The compressed file is passed straight to read.csv(); only the literal
# string "NA" is treated as a missing value.
dataStorm <- read.csv(file.name, na.strings = "NA")
Load the required libraries:
library(dplyr)
library(ggplot2)
library(gridExtra)
library(grid)
Our questions concern only event types, population health, and economic consequences, so we keep just the relevant columns: EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, and CROPDMGEXP:
# Columns used by the analysis: event type, the two health counts, and the
# damage amounts together with their exponent codes.
columnsKeep <- c("EVTYPE", "FATALITIES", "INJURIES",
                 "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
# all_of() tells select() that columnsKeep is an external character vector of
# column names (not a column of dataStorm) and errors if any name is missing.
dataSub <- select(dataStorm, all_of(columnsKeep))
Next, we convert the damage exponent codes into numeric multipliers: H stands for hundreds, K for thousands, M for millions, and B for billions. Any other exponent code is given a multiplier of 0.
# Convert a column of damage-exponent codes into numeric multipliers.
#
# expCodes: vector (character or factor) of exponent codes such as
#           dataSub$PROPDMGEXP or dataSub$CROPDMGEXP.
# Returns a numeric vector with exactly one entry per element of expCodes:
# 1e2 / 1e3 / 1e6 / 1e9 for codes containing H / K / M / B (either case),
# and 0 for everything else, including missing values.
expToMultiplier <- function(expCodes) {
  expCodes <- as.character(expCodes)
  # Pre-size the result to the input length so it always lines up row-for-row
  # with the data frame, even when the trailing rows carry no recognised code.
  multiplier <- numeric(length(expCodes))
  # Patterns are applied in this order, so a later match overrides an earlier one.
  scaleByPattern <- c("H|h" = 1e2, "K|k" = 1e3, "M|m" = 1e6, "B|b" = 1e9)
  for (pattern in names(scaleByPattern)) {
    multiplier[grepl(pattern, expCodes)] <- scaleByPattern[[pattern]]
  }
  multiplier
}

# Multipliers for property damage and crop damage, one per row of dataSub.
indexProperty <- expToMultiplier(dataSub$PROPDMGEXP)
indexCorp <- expToMultiplier(dataSub$CROPDMGEXP)
This gives us indexProperty, the multiplier for property damage, and indexCorp, the multiplier for crop damage. We then compute the actual property damage and crop damage, and obtain the total population health impact by summing fatalities and injuries.
# Add the derived per-record columns in a single mutate() call:
# combined casualties, the two damage multipliers, the damage amounts
# scaled by those multipliers, and their sum.
dataRecal <- mutate(
  dataSub,
  totalInjury = FATALITIES + INJURIES,
  indexOfProperty = indexProperty,
  indexOfCrop = indexCorp,
  totalProperty = PROPDMG * indexOfProperty,
  totalCrop = CROPDMG * indexOfCrop,
  totalDamage = totalProperty + totalCrop
)
# Show the first rows as a quick check of the new columns.
head(dataRecal)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 0 15 25.0 K 0
## 2 TORNADO 0 0 2.5 K 0
## 3 TORNADO 0 2 25.0 K 0
## 4 TORNADO 0 2 2.5 K 0
## 5 TORNADO 0 2 2.5 K 0
## 6 TORNADO 0 6 2.5 K 0
## totalInjury indexOfProperty indexOfCrop totalProperty totalCrop
## 1 15 1000 0 25000 0
## 2 0 1000 0 2500 0
## 3 2 1000 0 25000 0
## 4 2 1000 0 2500 0
## 5 2 1000 0 2500 0
## 6 6 1000 0 2500 0
## totalDamage
## 1 25000
## 2 2500
## 3 25000
## 4 2500
## 5 2500
## 6 2500
For question 1, we sum the fatalities and injuries for every event type and show the results in descending order of their combined total:
# Per-event-type totals of fatalities, injuries, and their combined count.
healthByEvent <- summarise(
  group_by(dataRecal, EVTYPE),
  numFatalities = sum(FATALITIES),
  numInjuries = sum(INJURIES),
  numTotal = sum(totalInjury)
)
# Rank event types from most to least total casualties and store the
# result as a plain data frame.
dataForHealth <- as.data.frame(arrange(healthByEvent, desc(numTotal)))
head(dataForHealth)
## EVTYPE numFatalities numInjuries numTotal
## 1 TORNADO 5633 91346 96979
## 2 EXCESSIVE HEAT 1903 6525 8428
## 3 TSTM WIND 504 6957 7461
## 4 FLOOD 470 6789 7259
## 5 LIGHTNING 816 5230 6046
## 6 HEAT 937 2100 3037
We then use bar charts to plot the top 10 event types that are most harmful with respect to population health:
# The ten event types with the highest combined fatalities and injuries.
topHealth <- head(dataForHealth, 10)

# Upper panel: fatalities, bars ordered by fatality count.
g1 <- ggplot(
  topHealth,
  aes(x = reorder(EVTYPE, numFatalities), y = numFatalities)
)
q1 <- g1 +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    title = "Population health impact - Top 10",
    x = "Type of events",
    y = "Number of fatalities"
  )

# Lower panel: injuries, bars ordered by injury count.
g2 <- ggplot(
  topHealth,
  aes(x = reorder(EVTYPE, numInjuries), y = numInjuries)
)
q2 <- g2 +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Type of events", y = "Number of injuries")

# Stack the two panels vertically in one figure.
grid.arrange(q1, q2, nrow = 2)
As we can see above, tornadoes are by far the most harmful event type with respect to population health.
For question 2, we sum the property damage and crop damage for each event type:
# Per-event-type totals of property damage, crop damage, and their sum.
damageByEvent <- summarise(
  group_by(dataRecal, EVTYPE),
  numProperty = sum(totalProperty),
  numCrop = sum(totalCrop),
  numTotal = sum(totalDamage)
)
# Rank event types from most to least total damage and store the result
# as a plain data frame.
dataForEco <- as.data.frame(arrange(damageByEvent, desc(numTotal)))
head(dataForEco)
## EVTYPE numProperty numCrop numTotal
## 1 FLOOD 144657709800 5661968450 150319678250
## 2 HURRICANE/TYPHOON 69305840000 2607872800 71913712800
## 3 TORNADO 56937160480 414953110 57352113590
## 4 STORM SURGE 43323536000 5000 43323541000
## 5 HAIL 15732267220 3025954450 18758221670
## 6 FLASH FLOOD 16140811510 1421317100 17562128610
We then use bar charts to plot the top 10 event types that have the greatest economic consequences:
# The ten event types with the highest total (property + crop) damage.
topEco <- head(dataForEco, 10)

# Upper panel: property damage, bars ordered by property damage.
g1 <- ggplot(
  topEco,
  aes(x = reorder(EVTYPE, numProperty), y = numProperty)
)
q1 <- g1 +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    title = "Economic consequences - Top 10",
    x = "Type of events",
    y = "Property damage"
  )

# Lower panel: crop damage, bars ordered by crop damage.
g2 <- ggplot(
  topEco,
  aes(x = reorder(EVTYPE, numCrop), y = numCrop)
)
q2 <- g2 +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Type of events", y = "Crop damage")

# Stack the two panels vertically in one figure.
grid.arrange(q1, q2, nrow = 2)
As we can see above, floods and droughts have the greatest economic consequences.