Loading the dplyr package to subset the relevant columns.
We load the dplyr package because it lets us manipulate a dataset
efficiently in R.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Since we want to identify the top 10 weather events that are most
harmful to public health and to the economy, we keep only the columns
COUNTYNAME, STATE, EVTYPE, MAG, FATALITIES, INJURIES, PROPDMG,
PROPDMGEXP, CROPDMG and CROPDMGEXP out of the 37 columns.
# Keep only the ten columns used in the health and economic impact analysis.
Storm_DataSet <- select(
  StormDataset,
  COUNTYNAME, STATE, EVTYPE, MAG,
  FATALITIES, INJURIES,
  PROPDMG, PROPDMGEXP,
  CROPDMG, CROPDMGEXP
)
We only need the records that had an actual impact on health or on
property/crop damage, i.e. rows where at least one of FATALITIES,
INJURIES, PROPDMG or CROPDMG is greater than 0. We filter
Storm_DataSet on these four columns.
# Retain a row when at least one of the four impact measures is positive.
Storm_dataSet_use <- filter(
  Storm_DataSet,
  FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0
)
Some exponent codes are written in lowercase letters. A lowercase
letter and its uppercase form stand for the same numeric value: for
instance, both k and K mean one thousand (1,000). We convert the
exponent codes (and the event type names) to uppercase to remove this
ambiguity.
# Upper-case the event type and both exponent-code columns in a single step.
Storm_dataSet_use <- Storm_dataSet_use %>%
  mutate(
    EVTYPE = toupper(EVTYPE),
    PROPDMGEXP = toupper(PROPDMGEXP),
    CROPDMGEXP = toupper(CROPDMGEXP)
  )
We convert the alphanumeric exponent codes for property damage into
numeric multipliers so that the damage amounts can be calculated
accurately.
# Treat a blank exponent code as "0" (multiplier 10^0 = 1). The replacement is
# written as a string because PROPDMGEXP is a character column after toupper().
Storm_dataSet_use$PROPDMGEXP[which(Storm_dataSet_use$PROPDMGEXP == "")] <- "0"

# Numeric multiplier for each property-damage exponent code.
PROPDMG_in_number <- c(
  "-" = 10^0, "+" = 10^0, "0" = 10^0, "1" = 10^1, "2" = 10^2, "3" = 10^3,
  "4" = 10^4, "5" = 10^5, "6" = 10^6, "7" = 10^7,
  "B" = 10^9, "H" = 10^2, "K" = 10^3, "M" = 10^6
)

# Two-column lookup table (code, multiplier) built directly from the named
# vector. The key is kept as character so it has the same type as the
# PROPDMGEXP column it is joined against.
PROPDMG_in_number_df2 <- data.frame(
  PROPDMGEXP = names(PROPDMG_in_number),
  PROPDMGEXP_in_Number = unname(PROPDMG_in_number),
  stringsAsFactors = FALSE
)

# left_join() fills unmatched keys with NA, which would then propagate into
# every total computed from the multiplier, so make that visible.
if (!all(Storm_dataSet_use$PROPDMGEXP %in% names(PROPDMG_in_number))) {
  warning(
    "PROPDMGEXP contains codes with no multiplier; ",
    "PROPDMGEXP_in_Number will be NA for those rows.",
    call. = FALSE
  )
}

# Attach the multiplier to every record as PROPDMGEXP_in_Number.
SD_updated_PROPDMGEXP <- left_join(
  Storm_dataSet_use,
  PROPDMG_in_number_df2,
  by = "PROPDMGEXP"
)
We convert the alphanumeric exponent codes for crop damage into
numeric multipliers so that the damage amounts can be calculated
accurately.
# Treat a blank exponent code as "0" (multiplier 10^0 = 1). The replacement is
# written as a string because CROPDMGEXP is a character column after toupper().
SD_updated_PROPDMGEXP$CROPDMGEXP[which(SD_updated_PROPDMGEXP$CROPDMGEXP == "")] <- "0"

# Numeric multiplier for each crop-damage exponent code.
CROPDMG_in_number <- c(
  "?" = 10^0, "0" = 10^0, "1" = 10^1,
  "B" = 10^9, "K" = 10^3, "M" = 10^6
)

# Two-column lookup table (code, multiplier) built directly from the named
# vector. The key is kept as character so it has the same type as the
# CROPDMGEXP column it is joined against.
CROPDMG_in_number_df2 <- data.frame(
  CROPDMGEXP = names(CROPDMG_in_number),
  CROPDMGEXP_in_Number = unname(CROPDMG_in_number),
  stringsAsFactors = FALSE
)

# left_join() fills unmatched keys with NA, which would then propagate into
# every total computed from the multiplier, so make that visible.
if (!all(SD_updated_PROPDMGEXP$CROPDMGEXP %in% names(CROPDMG_in_number))) {
  warning(
    "CROPDMGEXP contains codes with no multiplier; ",
    "CROPDMGEXP_in_Number will be NA for those rows.",
    call. = FALSE
  )
}

# Attach the multiplier to every record as CROPDMGEXP_in_Number.
SD_updated_PROPDMGEXP_CROPDMGEXP <- left_join(
  SD_updated_PROPDMGEXP,
  CROPDMG_in_number_df2,
  by = "CROPDMGEXP"
)
We create two new columns: total_PROPDMG, the total property damage
as a numeric value (PROPDMG multiplied by PROPDMGEXP_in_Number), and
total_CROPDMG, the total crop damage as a numeric value (CROPDMG
multiplied by CROPDMGEXP_in_Number).
# Scale each raw damage figure by its multiplier to get the full amounts.
SD_updated_PROPDMGEXP_CROPDMGEXP <- SD_updated_PROPDMGEXP_CROPDMGEXP %>%
  mutate(
    total_PROPDMG = PROPDMG * PROPDMGEXP_in_Number,
    total_CROPDMG = CROPDMG * CROPDMGEXP_in_Number
  )
We add a column for the total economic damage, which is the sum of
the property damage and the crop damage. This shows us the total
economic impact of the weather events.
# Total economic damage per record: property damage plus crop damage.
SD_updated_PROPDMGEXP_CROPDMGEXP[["total_economic_DMG"]] <- with(
  SD_updated_PROPDMGEXP_CROPDMGEXP,
  total_PROPDMG + total_CROPDMG
)
We add the FATALITIES and INJURIES columns together to show the total
health impact of the weather events.
# Health impact per record: fatalities plus injuries.
SD_updated_PROPDMGEXP_CROPDMGEXP <- SD_updated_PROPDMGEXP_CROPDMGEXP %>%
  mutate(combined_fatalities_injuries = FATALITIES + INJURIES)
We clean the dataset (SD_updated_PROPDMGEXP_CROPDMGEXP) by removing
leading and trailing spaces from the EVTYPE column so that we get
accurate results. We use the “trimws” function for this.
# Strip leading and trailing whitespace from the event type names.
SD_updated_PROPDMGEXP_CROPDMGEXP <- SD_updated_PROPDMGEXP_CROPDMGEXP %>%
  mutate(EVTYPE = trimws(EVTYPE, which = "both"))