In this document, the Data Set from U.S. National Oceanic and Atmospheric Administration’s (NOAA) database is analysed. The goal of this analysis is to explore effects of servere weather and climate abnormalities on the economy and general population of the USA.
In this analysis, we investigated which types of weather abnormalities most hamful for: a) Population Health (& fatalities); b) Poperty and Crop losses.
library(ggplot2)
library(data.table)
# if Data Set has not been downlaoded previously then download (and decompress) it
if(!file.exists('repdata_data_StormData.csv.bz2') ){
if (!file.exists('repdata_data_StormData.csv.bz2')){
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
data_file <- "repdata_data_StormData.csv.bz2"
download.file(url, data_file, mode = "wb")
}
}
# input"bz2" Data Set into a Data Frame as a data frame is the output of read.csv()
DF <- read.csv("repdata_data_StormData.csv.bz2")
dim(DF)
## [1] 902297 37
# Column titles in the entire Data Set:
names(DF)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
DT <- as.data.table(DF)
# a list of clumns to retain in the Data table we will be working with
columnsToRetain <- c("CROPDMG", "CROPDMGEXP","EVTYPE", "FATALITIES","INJURIES", "PROPDMG", "PROPDMGEXP")
sub_DT <- DT[, ..columnsToRetain]
#Ensuring there are no missing values in the sub-set we will be working with
message("Total number of missing values (NAs) in the sub-set we will be working with: ", sum (is.na (sub_DT)))
## Total number of missing values (NAs) in the sub-set we will be working with: 0
sub_DT <- sub_DT[((CROPDMG > 0 | FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0) & EVTYPE != "?"), ..columnsToRetain]
dim(sub_DT)
## [1] 254632 7
columnsToUpper <- c("PROPDMGEXP", "CROPDMGEXP")
sub_DT[, lapply(.SD, toupper), .SDcols = columnsToUpper]
## PROPDMGEXP CROPDMGEXP
## 1: K
## 2: K
## 3: K
## 4: K
## 5: K
## ---
## 254628: K K
## 254629: K K
## 254630: K K
## 254631: K K
## 254632: K K
# Initite vectors which contain to what numeric values the characters in out data Set should be transformed to
# For instance, 'M' which is a million should be converted to 10^6
cropDamageAlNum <- c("?" = 10^0,"0" = 10^0,"\"\"" = 10^0,"K" = 10^3,"M" = 10^6,"B" = 10^9)
propDamageAlNum <- c("-" = 10^0, "+" = 10^0, "\"\"" = 10^0, "0" = 10^0,
"1" = 10^1,
"2" = 10^2,
"3" = 10^3,
"4" = 10^4,
"5" = 10^5,
"6" = 10^6,
"7" = 10^7,
"8" = 10^8,
"9" = 10^9,
"H" = 10^2,
"K" = 10^3,
"M" = 10^6,
"B" = 10^9)
sub_DT[, CROPDMGEXP := cropDamageAlNum[as.character(sub_DT[,CROPDMGEXP])]]
sub_DT[, PROPDMGEXP := propDamageAlNum[as.character(sub_DT[,PROPDMGEXP])]]
sub_DT <- sub_DT[, .(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, propertyLoss = PROPDMG * PROPDMGEXP, CROPDMG, CROPDMGEXP, cropLoss = CROPDMG * CROPDMGEXP)]
losses <- sub_DT[, .(propertyLoss = sum(propertyLoss), cropLoss = sum(cropLoss), Total_Cost = sum(propertyLoss) + sum(cropLoss)), by = .(EVTYPE)]
fatalities_DF <- aggregate(FATALITIES ~ EVTYPE, data = DF, sum)
fatalities_DF <- fatalities_DF[fatalities_DF$FATALITIES > 0, ]
orderdered_fatal_DF <- fatalities_DF[order(fatalities_DF$FATALITIES, decreasing = TRUE), ]
# Checking the Fatalities sub-set before generating Results
head(orderdered_fatal_DF)
## EVTYPE FATALITIES
## 834 TORNADO 5633
## 130 EXCESSIVE HEAT 1903
## 153 FLASH FLOOD 978
## 275 HEAT 937
## 464 LIGHTNING 816
## 856 TSTM WIND 504
injuries_DF <- aggregate (INJURIES~EVTYPE, data = DF, sum)
orderdered_injury_DF <- injuries_DF[order(injuries_DF$INJURIES, decreasing=TRUE),]
# Checking the Injury sub-set before generating Results
head(orderdered_injury_DF)
## EVTYPE INJURIES
## 834 TORNADO 91346
## 856 TSTM WIND 6957
## 170 FLOOD 6789
## 130 EXCESSIVE HEAT 6525
## 464 LIGHTNING 5230
## 275 HEAT 2100
We utlise data available starting from 1950 until 2011 in our calculations.
Top five causes for Losses in Crops
totalLosses_DT <- losses[, list(`Total Cost (Millions)` = Total_Cost / (10^6)), by = .(`Event Type:` = EVTYPE)]
totalLosses_DT <- totalLosses_DT[order(-`Total Cost (Millions)`), ]
head(totalLosses_DT, 10)
## Event Type: Total Cost (Millions)
## 1: TORNADOES, TSTM WIND, HAIL 1602.500
## 2: TSUNAMI 144.082
## 3: HIGH WINDS/COLD 117.500
## 4: HURRICANE OPAL/HIGH WINDS 110.000
## 5: WINTER STORM HIGH WINDS 65.000
## 6: TROPICAL STORM JERRY 20.600
## 7: Heavy Rain/High Surf 15.000
## 8: LAKESHORE FLOOD 7.540
## 9: HIGH WINDS HEAVY RAINS 7.510
## 10: FOREST FIRES 5.500
totalCrops_DT <- losses[, list(`Crop Losses (Millions)` = cropLoss / (10^6)), by = .(`Event Type:` = EVTYPE)]
totalCrops_DT <- totalCrops_DT[order(-`Crop Losses (Millions)`), ]
#message("Top 5 Events which cause the highest Losses in Crops: ")
head(totalCrops_DT, 5)
## Event Type: Crop Losses (Millions)
## 1: FROST/FREEZE 1094.086
## 2: EXCESSIVE WETNESS 142.000
## 3: FLOOD/RAIN/WINDS 112.800
## 4: COLD AND WET CONDITIONS 66.000
## 5: Early Frost 42.000
totalProp_DT <- losses[, list(`Property Losses (Millions)` = propertyLoss / (10^6)), by = .(`Event Type:` = EVTYPE)]
totalProp_DT <- totalProp_DT[order(-`Property Losses (Millions)`), ]
#message("Top 5 Events which cause the highest Losses in Properties: ")
head(totalProp_DT, 5)
## Event Type: Property Losses (Millions)
## 1: STORM SURGE/TIDE 4641.188
## 2: HEAVY RAIN/SEVERE WEATHER 2500.000
## 3: TORNADOES, TSTM WIND, HAIL 1600.000
## 4: WILD FIRES 624.100
## 5: TYPHOON 600.230
plot_df <- as.data.frame(totalLosses_DT)
plot_df2 <- as.data.frame(totalCrops_DT)
barplot(height = plot_df[1:10, 2], names.arg = plot_df[1:10,1], las = 2,
cex.names = 0.5,
legend.text = plot_df[1:10,1],
col = rainbow (10, start=0.5, end=0.8),
ylab = "Overall Damage (M)", main = "Top 10 Events for Overall Damage")
barplot(height = plot_df2[1:10, 2],
names.arg = plot_df2[1:10,1], las = 2, cex.names = 0.5,
legend.text = plot_df2[1:10,1],
col = rainbow (10, start=0.5, end=0.8),
ylab = "Total Crop (M)", main = "Top 10 Events for Total Crop Losses")
barplot(height = orderdered_injury_DF$INJURIES[1:10], names.arg = orderdered_injury_DF$EVTYPE[1:10], las = 3, cex.names = 0.6, col = rainbow (10, start=0.5, end=0.9), ylab = "Number of Injuried People", main = "Top 10 Weather Events Cause for Injuries")
barplot(height = orderdered_fatal_DF$FATALITIES[1:10], names.arg = orderdered_fatal_DF$EVTYPE[1:10], las = 3, cex.names= 0.6, col = rainbow (10, start=0.5, end=0.9), ylab = "Number of Fatalities", main = "Top 10 Weather Events Cause for Fatalities")