The aim of this report is to analyze which weather events (such as tornadoes, freezing rain, winter storms, etc.) have the greatest impact on the economy and on population health, using the NOAA Storm Database. First, the data were downloaded and processed. A data frame, wData (short for "working data"), was created using only the information needed to answer the questions. It contains the number of fatalities and injuries and the value in USD of the property and crop damage. Finally, two plots were made.
The data were downloaded from the Coursera course page and automatically saved as repdata_data_StormData.csv.bz2. They were then loaded into the StormData variable. A brief summary of the data is presented below.
# Load the raw storm records from the compressed CSV in the working directory.
StormData <- read.csv(file = "repdata_data_StormData.csv.bz2")
# Print a compact overview of every column (name, type, first values).
str(object = StormData)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
Since the only variables that can answer our questions are EVTYPE, FATALITIES, INJURIES, PROPDMG, CROPDMG, PROPDMGEXP and CROPDMGEXP, these are going to be collected in a new data frame called wData. The numerical variables FATALITIES and INJURIES describe the harmful effects on the population; therefore, they are going to be added together in a column called P_Health.
The damage to the economy is described through the damage to properties and crops, in US Dollars, in the variables PROPDMG and CROPDMG. However, each of those amounts is followed by an alphabetical character signifying the magnitude of the number: “K” for thousands, “M” for millions, and “B” for billions. These characters are stored in the variables PROPDMGEXP and CROPDMGEXP. Therefore, a transformation is made and the complete amount in US Dollars is stored in the column Damage.
# Working data: one row per storm record with its event type and the
# combined number of fatalities and injuries.
wData <- data.frame(EVTYPE = StormData$EVTYPE, stringsAsFactors = FALSE)
wData$P_Health <- StormData$FATALITIES + StormData$INJURIES

# Translate damage exponent codes into numeric multipliers.
#
# exp_code: vector of exponent codes (e.g. "K", "m", "B", "").
# Returns a numeric vector of the same length: 1e3 for K/k, 1e6 for M/m,
# 1e9 for B/b and 0 for anything else (blank, NA or any other code), so
# only amounts with a K/M/B magnitude contribute to the totals.
exp_multiplier <- function(exp_code) {
  known <- c(K = 1e3, M = 1e6, B = 1e9)
  mult <- unname(known[match(toupper(as.character(exp_code)), names(known))])
  mult[is.na(mult)] <- 0
  mult
}

# Build the damage table column by column so the amounts stay numeric
# (cbind() on mixed numeric/character vectors would coerce them to text).
P_C <- data.frame(
  PROPDMG = StormData$PROPDMG,
  PROPDMEXP = StormData$PROPDMGEXP,
  CROPDMG = StormData$CROPDMG,
  CROPDMGEXP = StormData$CROPDMGEXP,
  stringsAsFactors = FALSE
)
P_C$PDMG_USD <- P_C$PROPDMG * exp_multiplier(P_C$PROPDMEXP)
P_C$CDMG_USD <- P_C$CROPDMG * exp_multiplier(P_C$CROPDMGEXP)
P_C$TOTDMG_USD <- P_C$PDMG_USD + P_C$CDMG_USD
## P_C is the dataframe that contains the total in USD regarding property damage "PDMG_USD" and crop damage "CDMG_USD" and the sum of both "TOTDMG_USD"
head(P_C)
## PROPDMG PROPDMEXP CROPDMG CROPDMGEXP PDMG_USD CDMG_USD TOTDMG_USD
## 1 25 K 0 25000 0 25000
## 2 2.5 K 0 2500 0 2500
## 3 25 K 0 25000 0 25000
## 4 2.5 K 0 2500 0 2500
## 5 2.5 K 0 2500 0 2500
## 6 2.5 K 0 2500 0 2500
# Attach the total damage in USD (property plus crops) to the working data
# and show the first records.
wData[["Damage"]] <- P_C[["TOTDMG_USD"]]
head(wData, n = 6L)
## EVTYPE P_Health Damage
## 1 TORNADO 15 25000
## 2 TORNADO 0 2500
## 3 TORNADO 2 25000
## 4 TORNADO 2 2500
## 5 TORNADO 2 2500
## 6 TORNADO 6 2500
library(plyr)
## Warning: package 'plyr' was built under R version 4.0.2
# For every event type, total the casualties (h) and the damage in USD (d).
wData_grouped <- ddply(
  wData,
  .(EVTYPE),
  function(rows) {
    cbind(h = sum(rows$P_Health), d = sum(rows$Damage))
  }
)
## wData_grouped is the grouped data.
# Show the event types with the highest casualty totals first.
head(wData_grouped[order(-wData_grouped[["h"]]), ], n = 6L)
## EVTYPE h d
## 834 TORNADO 96979 57352113590
## 130 EXCESSIVE HEAT 8428 500155700
## 856 TSTM WIND 7461 5038935790
## 170 FLOOD 7259 150319678250
## 464 LIGHTNING 6046 940751370
## 275 HEAT 3037 403258500
Although the data is grouped, there are some inconsistencies. For instance, if you look for the events that have “TORN” in their names you’ll find several of them.
# List the first grouped event types whose name contains "TORN".
head(wData_grouped[grep("TORN", wData_grouped$EVTYPE), ], n = 6L)
## EVTYPE h d
## 69 COLD AIR TORNADO 0 100
## 834 TORNADO 96979 57352113590
## 835 TORNADO DEBRIS 0 0
## 836 TORNADO F0 0 83400
## 837 TORNADO F1 0 2370000
## 838 TORNADO F2 16 1600000
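Therefore, wData$EVTYPE will be cleaned up: the event names are converted to upper case and every name containing the keyword of one of the main event types (TORNADO, HEAT, WIND, FLOOD, LIGHTNING, STORM, HAIL, HURRICANE/TYPHOON, SNOW) is replaced by that single category. The data are then grouped again and only the top 10 entries of the resulting table are presented.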
# Normalise the event names: upper-case them, then collapse every name that
# contains a keyword into one category.
wData$EVTYPE <- toupper(wData$EVTYPE)

# Regular expression -> category label. The rules are applied in this order
# and each one sees the result of the previous ones, so a name containing
# both "WIND" and "STORM" ends up as "WIND".
event_groups <- c(
  "TORN" = "TORNADO",
  "HEAT" = "HEAT",
  "WIND" = "WIND",
  "FLOOD" = "FLOOD",
  "LIGHTNING" = "LIGHTNING",
  "STORM" = "STORM",
  "HAIL" = "HAIL",
  "HURRICANE|TYPHOON" = "HURRICANE/TYPHOON",
  "SNOW" = "SNOW"
)
for (pattern in names(event_groups)) {
  matched <- grepl(pattern, wData$EVTYPE)
  # Assign through the column vector: unlike `wData[matched, ]$EVTYPE <- ...`
  # this is a harmless no-op when no row matches.
  wData$EVTYPE[matched] <- event_groups[[pattern]]
}

# Recompute the per-event totals: casualties (h) and damage in USD (d).
wData_grouped <- ddply(
  wData,
  .(EVTYPE),
  function(rows) {
    cbind(h = sum(rows$P_Health), d = sum(rows$Damage))
  }
)
# Ten event types with the most casualties, worst first.
wData_grouped[order(-wData_grouped$h), ][1:10, ]
## EVTYPE h d
## 313 TORNADO 97068 59010559590
## 376 WIND 12924 18299378920
## 111 HEAT 12362 924795030
## 80 FLOOD 10129 179790474420
## 165 LIGHTNING 6049 950814370
## 245 STORM 4657 73536870660
## 144 HURRICANE/TYPHOON 1466 90762527810
## 108 HAIL 1386 18779880170
## 243 SNOW 1287 1157812840
## 374 WILDFIRE 986 5060586800
According to the results, tornadoes, wind and heat are the types of events that are most harmful to population health across the United States.
library(ggplot2)
# Ten event types with the highest casualty totals, worst first.
top_health <- wData_grouped[order(-wData_grouped$h), ][seq_len(10), ]
# Fix the bar order to the ranking instead of the default alphabetical order.
Position <- factor(top_health$EVTYPE, levels = top_health$EVTYPE)
x <- ggplot(top_health, aes(x = Position, y = h))
# Bar chart of casualties per event type, with slanted axis labels.
x +
  geom_bar(stat = "identity", fill = "Skyblue") +
  labs(
    x = "Event Type",
    y = "Fatalities and Injuries",
    title = "Human Fatalities or injuries vs Event Type"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 20, hjust = 1))
According to the results, floods, hurricanes/typhoons and storms are the types of events that cause the most damage to properties and crops across the United States.
# Ten event types with the highest total damage in USD, worst first.
top_damage <- wData_grouped[order(-wData_grouped$d), ][seq_len(10), ]
# Fix the bar order to the ranking instead of the default alphabetical order.
Position <- factor(top_damage$EVTYPE, levels = top_damage$EVTYPE)
x <- ggplot(top_damage, aes(x = Position, y = d))
# Bar chart of damage per event type, with slanted axis labels.
x +
  geom_bar(stat = "identity", fill = "Skyblue") +
  labs(
    x = "Event Type",
    y = "Damages in USD",
    title = "Properties and Crops Damages in USD vs Event Type"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 20, hjust = 1))