This analysis uses the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database to identify which natural events have the greatest impact on both human health and the economy in the USA.
Data from the raw database are loaded in R.
# Load the raw storm records from the bzip2-compressed CSV file
# (comma-separated, first row holds the column names).
data <- read.csv(
  file = bzfile(description = 'repdata-data-StormData.csv.bz2'),
  header = TRUE,
  sep = ','
)
Raw data are manipulated in order to obtain a database suitable for the scope of the analysis.
The EVTYPE variable is converted to upper case so that the same event type written with different letter case is counted only once.
# Number of distinct event labels exactly as recorded (case-sensitive).
length(unique(data[['EVTYPE']]))
## [1] 985
# Number of distinct event labels once letter case is ignored.
length(unique(toupper(data[['EVTYPE']])))
## [1] 898
# Keep only the upper-case spelling so labels that differ in case merge.
data[['EVTYPE']] <- toupper(data[['EVTYPE']])
The PROPDMGEXP and CROPDMGEXP variables encode the magnitude of the damage figures as symbols. Each symbol is replaced with a numeric multiplier according to the following rule:
# Translate damage-exponent symbols (PROPDMGEXP / CROPDMGEXP) into the
# numeric multiplier applied to the matching damage figure.
#
# Both columns share this single rule, applied case-insensitively:
#   '-', '?', '0'  -> 0
#   '', '+'        -> 1
#   '1' to '8'     -> 10
#   'H' -> 100, 'K' -> 1000, 'M' -> 1000000, 'B' -> 1000000000
# Any other symbol yields NA.
# NOTE(review): '0' maps to a multiplier of 0 while '1'-'8' map to 10;
# confirm this is the intended reading of the exponent.
#
# symbol: vector of exponent symbols (character or factor).
# Returns a numeric vector with one multiplier per element of `symbol`.
dmgexp_to_multiplier <- function(symbol) {
  known_symbols <- c('-', '?', '0',
                     '', '+',
                     '1', '2', '3', '4', '5', '6', '7', '8',
                     'H', 'K', 'M', 'B')
  multipliers <- c(0, 0, 0,
                   1, 1,
                   rep(10, 8),
                   100, 1000, 1000000, 1000000000)
  # toupper() folds 'h', 'k', 'm' and 'b' onto their upper-case entries, so
  # a symbol accepted for one column is never turned into NA for the other.
  multipliers[match(toupper(as.character(symbol)), known_symbols)]
}

data$PROPDMGEXP <- dmgexp_to_multiplier(data$PROPDMGEXP)
data$CROPDMGEXP <- dmgexp_to_multiplier(data$CROPDMGEXP)
To obtain the answer, data are grouped by the EVTYPE variable and summarized in a new data set containing the total number of injuries (from the INJURIES variable) for each event type.
# Total injuries per event type.
data1 <- data %>%
  group_by(EVTYPE) %>%
  summarise(Tot.Injuries = sum(INJURIES))
# Keep only the event types whose injury total is not zero.
data1 <- data1[data1[['Tot.Injuries']] != 0, ]
The ten events with the greatest impact are shown below:
# Rank event types from most to fewest injuries and print the first ten.
result1 <- data1 %>% arrange(desc(Tot.Injuries))
head(result1, n = 10)
## # A tibble: 10 x 2
## EVTYPE Tot.Injuries
## <chr> <dbl>
## 1 TORNADO 91346
## 2 TSTM WIND 6957
## 3 FLOOD 6789
## 4 EXCESSIVE HEAT 6525
## 5 LIGHTNING 5230
## 6 HEAT 2100
## 7 ICE STORM 1975
## 8 FLASH FLOOD 1777
## 9 THUNDERSTORM WIND 1488
## 10 HAIL 1361
# Use the ranked order as the factor levels so the bars follow the ranking
# in result1 instead of alphabetical order.
result1$EVTYPE <- factor(result1$EVTYPE, levels = result1$EVTYPE)
# head() never returns more rows than exist, whereas `[1:10, ]` requests rows
# beyond the end when fewer than ten event types are available.
# Bar height is the injury total in thousands; fill colour follows the total.
p1 <- ggplot(data = head(result1, 10),
             aes(x = EVTYPE, y = Tot.Injuries / 1000)) +
  geom_bar(stat = 'identity', aes(fill = Tot.Injuries), show.legend = FALSE) +
  scale_fill_gradient(low = 'gold', high = 'firebrick4') +
  labs(title = 'Top-10 impact events', x = '',
       y = 'Number of injuries (thousands)') +
  theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
        axis.text.x = element_text(angle = 45, hjust = 1))
p1
To obtain the answer, data are grouped by the EVTYPE variable and summarized in a new data set containing the total economic damage for each event type. The total economic damage is the sum of property damage and crop damage (obtained as PROPDMG times PROPDMGEXP and CROPDMG times CROPDMGEXP, respectively).
# Per event type: total property damage, total crop damage (each damage
# figure times its multiplier) and their sum as the total economic damage.
data2 <- data %>%
  group_by(EVTYPE) %>%
  summarise(Tot.PropDmg = sum(PROPDMG * PROPDMGEXP),
            Tot.CropDmg = sum(CROPDMG * CROPDMGEXP)) %>%
  mutate(Tot.EconDmg = Tot.PropDmg + Tot.CropDmg)
# Keep only the event types whose total economic damage is not zero.
data2 <- data2[data2[['Tot.EconDmg']] != 0, ]
The ten events with the greatest impact are shown below:
# Rank event types from highest to lowest total economic damage and print
# the first ten.
result2 <- data2 %>% arrange(desc(Tot.EconDmg))
head(result2, n = 10)
## # A tibble: 10 x 4
## EVTYPE Tot.PropDmg Tot.CropDmg Tot.EconDmg
## <chr> <dbl> <dbl> <dbl>
## 1 FLOOD 144657709807 5661968450 150319678257
## 2 HURRICANE/TYPHOON 69305840000 2607872800 71913712800
## 3 TORNADO 56937161565 414953110 57352114675
## 4 STORM SURGE 43323536000 5000 43323541000
## 5 HAIL 15732267577 3025954453 18758222030
## 6 FLASH FLOOD 16140812603 1421317100 17562129703
## 7 DROUGHT 1046106000 13972566000 15018672000
## 8 HURRICANE 11868319010 2741910000 14610229010
## 9 RIVER FLOOD 5118945500 5029459000 10148404500
## 10 ICE STORM 3944927810 5022113500 8967041310
# Use the ranked order as the factor levels so the bars follow the ranking
# in result2 instead of alphabetical order.
result2$EVTYPE <- factor(result2$EVTYPE, levels = result2$EVTYPE)
# head() never returns more rows than exist, whereas `[1:10, ]` requests rows
# beyond the end when fewer than ten event types are available.
# Bar height is the damage total in billions; fill colour follows the total.
p2 <- ggplot(data = head(result2, 10),
             aes(x = EVTYPE, y = Tot.EconDmg / 1000000000)) +
  geom_bar(stat = 'identity', aes(fill = Tot.EconDmg), show.legend = FALSE) +
  scale_fill_gradient(low = 'gold', high = 'firebrick4') +
  labs(title = 'Top-10 impact events', x = '',
       y = 'Economic damages (billions)') +
  theme(plot.title = element_text(hjust = 0.5, face = 'bold'),
        axis.text.x = element_text(angle = 45, hjust = 1))
p2