2.3. Loading the dplyr package to subset the relevant columns.
The dplyr package is loaded because it lets us manipulate a dataset efficiently in R.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Since we want to identify the 10 weather events that are most harmful to public health and to the economy, we keep only the columns COUNTYNAME, STATE, EVTYPE, MAG, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, CROPDMG, and CROPDMGEXP out of the 37 columns.
# Keep only the location, event-type, magnitude, casualty and damage columns.
Storm_DataSet <- select(
  StormDataset,
  COUNTYNAME, STATE, EVTYPE, MAG,
  FATALITIES, INJURIES,
  PROPDMG, PROPDMGEXP, CROPDMG, CROPDMGEXP
)
We only need the rows in which at least one of FATALITIES, INJURIES, PROPDMG, or CROPDMG is greater than 0, i.e. the events that actually had an impact on health or caused property/crop damage. We filter Storm_DataSet on these four columns.
# Retain a row when at least one of the four impact measures is positive.
Storm_dataSet_use <- filter(
  Storm_DataSet,
  FATALITIES > 0 | INJURIES > 0 | PROPDMG > 0 | CROPDMG > 0
)
Some exponent letters are in lowercase. Lowercase and uppercase letters denote the same numeric value; for instance, both k and K mean one thousand (1,000). We convert the exponent letters to uppercase to remove this ambiguity. We also convert EVTYPE to uppercase so that its spelling is consistent.
# Upper-case the event type and both damage-exponent columns in one pass.
upper_case_cols <- c("EVTYPE", "PROPDMGEXP", "CROPDMGEXP")
Storm_dataSet_use[upper_case_cols] <- lapply(
  Storm_dataSet_use[upper_case_cols],
  toupper
)
We convert the alphanumeric exponent codes for property damage into numeric multipliers so that we can calculate the actual damage amounts. An empty exponent is recoded to 0, i.e. a multiplier of 10^0.
# A blank exponent means "no scaling": recode it to the code "0" (10^0).
# The column is character, so assign the string rather than the number 0.
Storm_dataSet_use$PROPDMGEXP[Storm_dataSet_use$PROPDMGEXP %in% ""] <- "0"

# Exponent code -> numeric multiplier for property damage.
PROPDMG_in_number <- c(
  "-" = 10^0, "+" = 10^0, "0" = 10^0, "1" = 10^1, "2" = 10^2, "3" = 10^3,
  "4" = 10^4, "5" = 10^5, "6" = 10^6, "7" = 10^7,
  "B" = 10^9, "H" = 10^2, "K" = 10^3, "M" = 10^6
)
PROPDMG_in_number_df <- data.frame(PROPDMG_in_number)

# Two-column lookup table built straight from the named vector. The key is
# forced to character so the join also behaves on R < 4.0, where data.frame()
# would otherwise turn it into a factor.
PROPDMG_in_number_df2 <- data.frame(
  PROPDMGEXP = names(PROPDMG_in_number),
  PROPDMGEXP_in_Number = unname(PROPDMG_in_number),
  stringsAsFactors = FALSE
)

SD_updated_PROPDMGEXP <- left_join(
  Storm_dataSet_use,
  PROPDMG_in_number_df2,
  by = "PROPDMGEXP"
)

# left_join() leaves NA for every exponent code that is not in the lookup
# (including a missing code). An NA multiplier would turn the damage totals
# computed from it into NA, even for rows whose PROPDMG is 0. Report such
# codes and fall back to 10^0, the multiplier already used for the other
# non-scaling symbols.
prop_exp_unmapped <- is.na(SD_updated_PROPDMGEXP$PROPDMGEXP_in_Number)
if (any(prop_exp_unmapped)) {
  warning(
    "Unmapped PROPDMGEXP code(s) treated as 10^0: ",
    paste(
      unique(SD_updated_PROPDMGEXP$PROPDMGEXP[prop_exp_unmapped]),
      collapse = ", "
    ),
    call. = FALSE
  )
  SD_updated_PROPDMGEXP$PROPDMGEXP_in_Number[prop_exp_unmapped] <- 10^0
}
We convert the alphanumeric exponent codes for crop damage into numeric multipliers so that we can calculate the actual damage amounts. An empty exponent is recoded to 0, i.e. a multiplier of 10^0.
# A blank exponent means "no scaling": recode it to the code "0" (10^0).
# The column is character, so assign the string rather than the number 0.
SD_updated_PROPDMGEXP$CROPDMGEXP[SD_updated_PROPDMGEXP$CROPDMGEXP %in% ""] <- "0"

# Exponent code -> numeric multiplier for crop damage.
CROPDMG_in_number <- c(
  "?" = 10^0, "0" = 10^0, "1" = 10^1,
  "B" = 10^9, "K" = 10^3, "M" = 10^6
)
CROPDMG_in_number_df <- data.frame(CROPDMG_in_number)

# Two-column lookup table built straight from the named vector. The key is
# forced to character so the join also behaves on R < 4.0, where data.frame()
# would otherwise turn it into a factor.
CROPDMG_in_number_df2 <- data.frame(
  CROPDMGEXP = names(CROPDMG_in_number),
  CROPDMGEXP_in_Number = unname(CROPDMG_in_number),
  stringsAsFactors = FALSE
)

SD_updated_PROPDMGEXP_CROPDMGEXP <- left_join(
  SD_updated_PROPDMGEXP,
  CROPDMG_in_number_df2,
  by = "CROPDMGEXP"
)

# left_join() leaves NA for every exponent code that is not in the lookup
# (including a missing code). An NA multiplier would turn the damage totals
# computed from it into NA, even for rows whose CROPDMG is 0. Report such
# codes and fall back to 10^0, the multiplier already used for "?" and "0".
crop_exp_unmapped <- is.na(SD_updated_PROPDMGEXP_CROPDMGEXP$CROPDMGEXP_in_Number)
if (any(crop_exp_unmapped)) {
  warning(
    "Unmapped CROPDMGEXP code(s) treated as 10^0: ",
    paste(
      unique(SD_updated_PROPDMGEXP_CROPDMGEXP$CROPDMGEXP[crop_exp_unmapped]),
      collapse = ", "
    ),
    call. = FALSE
  )
  SD_updated_PROPDMGEXP_CROPDMGEXP$CROPDMGEXP_in_Number[crop_exp_unmapped] <- 10^0
}
We create two new columns: total_PROPDMG, the total property damage obtained by multiplying PROPDMG by PROPDMGEXP_in_Number, and total_CROPDMG, the total crop damage obtained by multiplying CROPDMG by CROPDMGEXP_in_Number.
# Scale each raw damage figure by its numeric exponent multiplier.
SD_updated_PROPDMGEXP_CROPDMGEXP <- mutate(
  SD_updated_PROPDMGEXP_CROPDMGEXP,
  total_PROPDMG = PROPDMG * PROPDMGEXP_in_Number,
  total_CROPDMG = CROPDMG * CROPDMGEXP_in_Number
)
We create a column, total_economic_DMG, for the total economic damage, which is the sum of the property damage and the crop damage. This shows the total economic impact of the weather events.
# Total economic damage = property damage + crop damage.
SD_updated_PROPDMGEXP_CROPDMGEXP$total_economic_DMG <- with(
  SD_updated_PROPDMGEXP_CROPDMGEXP,
  total_PROPDMG + total_CROPDMG
)
We add the FATALITIES and INJURIES columns together to obtain the total health impact of the weather events.
# Total health impact = fatalities + injuries.
SD_updated_PROPDMGEXP_CROPDMGEXP$combined_fatalities_injuries <- with(
  SD_updated_PROPDMGEXP_CROPDMGEXP,
  FATALITIES + INJURIES
)
We clean the dataset (SD_updated_PROPDMGEXP_CROPDMGEXP) by removing leading and trailing whitespace from the EVTYPE column so that the results are accurate. We use the “trimws” function.
# Strip leading and trailing whitespace from the event-type labels.
SD_updated_PROPDMGEXP_CROPDMGEXP <- mutate(
  SD_updated_PROPDMGEXP_CROPDMGEXP,
  EVTYPE = trimws(EVTYPE, which = "both")
)