if(!file.exists("./file.csv.bz2")){
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", "./file.csv.bz2")
} else {
require(data.table)
if(!exists("DT")){
DT <- as.data.table(read.csv("./file.csv.bz2"))
}
}
Because our interest it’s to know the event types or weather events has caused more harm and loses to humanity on teh U.S.A., we’ll split the question in two parts: human harm and material loses caused. ## Human Harm:
So, we are interested just in the weather events (EVTYPES) which has caused more human harm, and the datset is somewhat big. Let’s remove all the zeros human harm events. Initially we have a dataset of
dim(DT)
## [1] 902297 37
and now got a dataset of
DT_hh <- DT[FATALITIES != 0 & INJURIES != 0, ]
dim(DT_hh)
## [1] 2649 37
DT_hh <- subset(DT_hh, select = c("EVTYPE", "FATALITIES", "INJURIES"))
DT_hh <- DT_hh[, .(TOTFATALITIES=sum(FATALITIES), TOTINJURIES=sum(INJURIES)), by = EVTYPE]
Aiming to get a criteria for ranking the EVTYPES according to the total number of human harm produced, lets find out the correlation between FATALITIES and INJURIES. As you can see, it’s of 0.9974628. That means that the 99.7462781% of the times, if one kind of human harm caused increases, the other do the same. And if one decreases, the other too.
We can show it mre clearly on a plot
par(mfrow=c(1,2))
plot(DT_hh$TOTFATALITIES, DT_hh$TOTINJURIES, xlab = "Total Fatalities", ylab = "Total Injuries",
main = "Correlation Total Fatalities
vs Total Injuries")
plot(log(DT_hh$TOTFATALITIES), log(DT_hh$TOTINJURIES),
xlab = "Log(Total Fatalities)", ylab = "Log(Total Injuries)")
On the first plot, the correlation is not visible, because of the dots are excessively clustered. So, I get rid of that trouble by applying logarithmic function.
DT_hh <- DT_hh[, .(TOTALHH=sum(TOTFATALITIES, TOTINJURIES)), by=EVTYPE]
DT_ec <- DT[PROPDMG != 0 & CROPDMG != 0, ]
l <- length(unique(DT_ec$EVTYPE))
Now we have just 96 cases.
DT_ec <- subset(DT_ec, select = c("EVTYPE", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"))
3.1. PROPDMGEXP
Lets look how many cases of each one we have here
s_PROPDMGEXP <- summary(DT_ec$PROPDMGEXP)
The problematic (may be misspelled) PROPDMGEXP values are just these
s_PROPDMGEXP[1:13]
## - ? + 0 1 2 3 4 5 6 7 8
## 5 0 0 0 4 0 0 1 0 2 0 0 0
If you look at, you can find that the cases represent the 0.0738825 percent of the total of the sample after removed the zero-cases. So, I don’t guess it worth the shame to dedicate resources to figure out what it would want to mean such notation, more if you have to make assumptions beyond the ground of the data.
3.2. CROPDMGEXP
s_CROPDMGEXP <- summary(DT_ec$CROPDMGEXP)
Lets look how many cases of each one we have here
s_CROPDMGEXP
## ? 0 2 B k K m M
## 3 0 11 0 3 21 14911 1 1292
The problematic (may be misspelled) PROPDMGEXP values are these
s_CROPDMGEXP[1:4]
## ? 0 2
## 3 0 11 0
When you look closer, you can look that the cases just represent the 0.0861963% of the total of the sample after removed the zero cases. Here again I don’t guess it worth the shame to dedicate resources to figure out what it would want to mean such notation, more if you have to make assumptions beyond the ground of the data.
DT_ec <- DT_ec[PROPDMGEXP == "B" | PROPDMGEXP == "h" | PROPDMGEXP == "H" | PROPDMGEXP == "K" | PROPDMGEXP == "m" | PROPDMGEXP == "M", ]
DT_ec <- DT_ec[CROPDMGEXP == "B" | CROPDMGEXP == "h" | CROPDMGEXP == "H" | CROPDMGEXP == "K" | CROPDMGEXP == "m" | CROPDMGEXP == "M", ]
val_letters <- data.frame(letter = c("B","H", "h", "K", "m", "M"), values = c(1000000000, 100, 100, 1000, 1000000, 1000000))
setkey(DT_ec, PROPDMGEXP)
DT_ec <- DT_ec[val_letters]
setkey(DT_ec, CROPDMGEXP)
DT_ec <- DT_ec[val_letters]
Make up just a little the colnames
colnames(DT_ec)[6:7] <- c("PROPDMGEXP_VAL", "CROPDMGEXP_VAL")
head(DT_ec)
## EVTYPE PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP PROPDMGEXP_VAL
## 1: RIVER FLOOD 5.00 B 5.00 B 1e+09
## 2: HURRICANE/TYPHOON 5.88 B 1.51 B 1e+09
## 3: ICE STORM 500.00 K 5.00 B 1e+03
## 4: NA NA NA NA H NA
## 5: NA NA NA NA h NA
## 6: HURRICANE/TYPHOON 1.50 B 300.00 K 1e+09
## CROPDMGEXP_VAL
## 1: 1e+09
## 2: 1e+09
## 3: 1e+09
## 4: 1e+02
## 5: 1e+02
## 6: 1e+03
summary(DT_ec$PROPDMGEXP)
## B h H K m M NA's
## 15 0 0 14816 1 1363 2
summary(DT_ec$CROPDMGEXP)
## B h H K m M
## 3 1 1 14901 1 1290
DT_ec <- DT_ec[complete.cases(DT_ec),]
sum(is.na(DT_ec))
## [1] 0
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
DT_ec <- mutate(DT_ec, VAL_PROPDMG = PROPDMG * PROPDMGEXP_VAL, VAL_CROPDMG = CROPDMG * CROPDMGEXP_VAL)
DT_ec <- subset(DT_ec, select = c("EVTYPE", "VAL_PROPDMG", "VAL_CROPDMG"))
DT_ec <- DT_ec[, .(TOTPROPDMG_VAL=sum(VAL_PROPDMG), TOTCROPDMG_VAL=sum(VAL_CROPDMG)), by = EVTYPE]
With the aim of establish a ranking method for the weather events accordingly to its economic consequences, we analyze the correlation between material damage to properties and material damage to crops by EVTYPE. And the correlation is 0.5114979.
So, we’ll take the total amount of the values of the damages
DT_ec2 <- DT_ec[, .(TOTDMG = sum(TOTPROPDMG_VAL, TOTCROPDMG_VAL)), by = EVTYPE]
setkey(DT_ec2, EVTYPE)
setkey(DT_ec, EVTYPE)
DT_ec2 <- DT_ec2[DT_ec]
par(mfrow=c(1,2))
plot(DT_ec2$TOTDMG, DT_ec2$TOTPROPDMG_VAL, xlab = "Total Damage", ylab = "Tot Damage Property Value", main = "Correlation Tot Damage Property
vs Total Damage Property Value")
plot(log(DT_ec2$TOTDMG), log(DT_ec2$TOTPROPDMG_VAL), xlab = "Log(Total Damage)", ylab = "Log(Tot Damage Property Value)")
DT_ec <- DT_ec[, .(TOTDMG = sum(TOTPROPDMG_VAL, TOTCROPDMG_VAL)), by = EVTYPE]
as you can see, it is the damages to properties the one which more determine the total amount of economic consequences: the correlation between properties damages and the total amount is of 0.998191 while the correlation between crop damages and the total amount is of 0.5622346. we can use the total amount as classifier. # Results Now we can response: 1. Historically the 20 weather events which more harm have caused to the human health on the U.S. , taking in account the NOAA datasets, have been from major to lesser:
head(setorder(DT_hh, -TOTALHH), 20)
## EVTYPE TOTALHH
## 1: TORNADO 65414
## 2: EXCESSIVE HEAT 5193
## 3: FLOOD 2783
## 4: ICE STORM 1755
## 5: HEAT 1493
## 6: HURRICANE/TYPHOON 1251
## 7: LIGHTNING 932
## 8: TSTM WIND 845
## 9: FLASH FLOOD 812
## 10: BLIZZARD 766
## 11: WINTER STORM 684
## 12: HIGH WIND 410
## 13: FOG 346
## 14: WILDFIRE 316
## 15: HEAT WAVE 291
## 16: TROPICAL STORM 286
## 17: HEAVY SNOW 276
## 18: EXTREME COLD 224
## 19: WINTER WEATHER 201
## 20: THUNDERSTORM WIND 184
head(setorder(DT_ec, - TOTDMG), 20)
## EVTYPE TOTDMG
## 1: FLOOD 126044533500
## 2: HURRICANE/TYPHOON 29348117800
## 3: HURRICANE 10498188000
## 4: RIVER FLOOD 10108369000
## 5: ICE STORM 5108614000
## 6: FLASH FLOOD 4308671360
## 7: HAIL 3813339540
## 8: TORNADO 2335763950
## 9: HURRICANE OPAL 2187000000
## 10: HIGH WIND 1918571300
## 11: TORNADOES, TSTM WIND, HAIL 1602500000
## 12: WILDFIRE 1540694900
## 13: DROUGHT 1464487000
## 14: TROPICAL STORM 1315040400
## 15: THUNDERSTORM WIND 1207893050
## 16: TSTM WIND 1107671950
## 17: EXCESSIVE HEAT 492570000
## 18: THUNDERSTORM WINDS 466014200
## 19: FLASH FLOOD/FLOOD 271505000
## 20: HURRICANE ERIN 262010000
summary(DT_hh)
## EVTYPE TOTALHH
## AVALANCHE : 1 Min. : 2.00
## BLACK ICE : 1 1st Qu.: 5.25
## BLIZZARD : 1 Median : 24.00
## blowing snow : 1 Mean : 1006.69
## BLOWING SNOW : 1 3rd Qu.: 159.00
## COASTAL STORM: 1 Max. :65414.00
## (Other) :80
summary(DT_ec)
## EVTYPE TOTDMG
## BLIZZARD : 1 Min. :1.000e+02
## COASTAL FLOODING: 1 1st Qu.:4.950e+05
## COLD AIR TORNADO: 1 Median :1.000e+07
## DROUGHT : 1 Mean :2.187e+09
## DRY MICROBURST : 1 3rd Qu.:1.492e+08
## DUST STORM : 1 Max. :1.260e+11
## (Other) :89