Peer-graded Assignment: Course Project 2 for Reproducible Research course

Synopsis: This report tries to answer some basic questions about severe weather events in USA according to NOAA report.

Across the United States, which types of events (EVTYPE) are most harmful with respect to
population health?

Across the United States, which types of events have the greatest economic consequences?

The report consists of two parts, Data Processing and Results. The first part shows how the data is organized for analysis. The second part shows how the final results are obtained through the processed data.

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Data Processing

opts_chunk$set(echo=TRUE)

Loading the data into a dataframe.

raw <- read.csv("repdata_data_StormData.csv.bz2", sep=",", header = TRUE)

Let’s look at the event types in the dataframe.

summary(unique(raw$EVTYPE))

##    Length     Class      Mode 
##       985 character character

There are 985 unique types of events in the data frame. According to the National Weather Service Storm Data Documentation there should be 48 event types.

raw$EVTYPE <- toupper(raw$EVTYPE)
summary(unique(raw$EVTYPE))

##    Length     Class      Mode 
##       898 character character

A small reduction of the “EVTYPE” variable was made by making all records with capital letters. Still, more must be done. Let’s check the word count of each event type.

words <- paste(raw$EVTYPE, collapse = " ")
uniqueWords <- strsplit(words, " ")[[1]]
wordCount <- as.data.frame(table(uniqueWords)) %>% arrange(desc(Freq))
head(wordCount, 50)

Now, we can create a new column for event types as a result of processing the raw data… This way we will reduce the entity types to a feasible count:

raw[ifelse(grepl('TSTM|THUNDERSTORM|LIGHTNING', raw$EVTYPE, ignore.case = TRUE),
           yes = "thunderstorm", no = "NA")=="thunderstorm",
    "PROC_ETYPE"] <- "thunderstorm"
raw[ifelse(grepl('WIND|MICROBURST|*MICRO|*BURST', raw$EVTYPE, ignore.case = TRUE) &
    is.na(raw$PROC_ETYPE),
    yes = "wind", no = "NA") == "wind", "PROC_ETYPE"] <- "wind"
raw[ifelse(grepl("HAIL", raw$EVTYPE, ignore.case=TRUE) &
    is.na(raw$PROC_ETYPE),
    yes = "hail", no = "NA") == "hail", "PROC_ETYPE"] <- "hail"
raw[ifelse(grepl("FLOOD|FLD", raw$EVTYPE, ignore.case=TRUE) &
    is.na(raw$PROC_ETYPE),
    yes = "flood", no = "NA") == "flood", "PROC_ETYPE"] <- "flood"
raw[ifelse(grepl("TORNADO|FUNNEL|WATERSPOUT", raw$EVTYPE, ignore.case=TRUE) &
    is.na(raw$PROC_ETYPE),
    yes = "tornado", no = "NA") == "tornado", "PROC_ETYPE"] <- "tornado"
raw[ifelse(grepl("SLEET|SNOW", raw$EVTYPE, ignore.case=TRUE) &
    is.na(raw$PROC_ETYPE),
    yes = "sleet and snow", no = "NA") == "sleet and snow",
    "PROC_ETYPE"] <- "sleet and snow"
raw[ifelse(grepl("RAIN", raw$EVTYPE, ignore.case=TRUE) &
    is.na(raw$PROC_ETYPE),
    yes = "rain", no = "NA") == "rain", "PROC_ETYPE"] <- "rain"
raw[ifelse(grepl("SURF|TIDE|SURGE|RIP|CURRENT", raw$EVTYPE, ignore.case=TRUE)
& is.na(raw$PROC_ETYPE),
  yes = "surftide", no = "NA") == "surftide", "PROC_ETYPE"] <- "surftide"
raw[ifelse(grepl("ICE|FREEZ|FROST|FROZEN|COLD|CHILL", raw$EVTYPE, ignore.case=TRUE) &
    is.na(raw$PROC_ETYPE),
    yes = "cold", no = "NA") == "cold", "PROC_ETYPE"] <- "cold"
raw[ifelse(grepl("DUST", raw$EVTYPE, ignore.case=TRUE) & is.na(raw$PROC_ETYPE),
           yes = "dust", no = "NA") == "dust", "PROC_ETYPE"] <- "dust"
raw[ifelse(grepl("HEAT|WARM", raw$EVTYPE, ignore.case=TRUE) &
    is.na(raw$PROC_ETYPE),
    yes = "heat", no = "NA") == "heat", "PROC_ETYPE"] <- "heat"
raw[ifelse(grepl("FOG", raw$EVTYPE, ignore.case=TRUE) &
    is.na(raw$PROC_ETYPE),
    yes = "fog", no = "NA") == "fog", "PROC_ETYPE"] <- "fog"
raw[ifelse(grepl("LANDSLIDE", raw$EVTYPE, ignore.case=TRUE) &
    is.na(raw$PROC_ETYPE),
    yes = "landslide", no = "NA") == "landslide", "PROC_ETYPE"] <- "landslide"
raw[ifelse(grepl("AVALANCHE", raw$EVTYPE, ignore.case=TRUE) &
    is.na(raw$PROC_ETYPE),
    yes = "avalanche", no = "NA") == "avalanche", "PROC_ETYPE"] <- "avalanche"
raw[ifelse(grepl("WILDFIRE|*WILD|*FIRE|*FOREST FIRE", raw$EVTYPE, ignore.case=TRUE)
& is.na(raw$PROC_ETYPE),
yes = "fire", no = "NA") == "fire", "PROC_ETYPE"] <- "fire"
raw[ifelse(grepl("HURRICANE|TYPHOON|*TROPICAL *STORM|*depression", raw$EVTYPE,
    ignore.case=TRUE) & is.na(raw$PROC_ETYPE),
yes = "tropical storm", no = "NA") == "tropical storm", "PROC_ETYPE"] <- "tropical storm"
raw[ifelse(grepl("BLIZZARD|*ICE *STORM|*SNOW *STORM|*WINTER *STORM|*LAKE *EFFECT",
    raw$EVTYPE, ignore.case=TRUE) & is.na(raw$PROC_ETYPE),
yes = "blizzard", no = "NA") == "blizzard", "PROC_ETYPE"] <- "blizzard"
raw[ifelse(grepl("DROUGHT|*DRY|*WARM", raw$EVTYPE, ignore.case=TRUE) &
    is.na(raw$PROC_ETYPE),
    yes = "drought", no = "NA") == "drought", "PROC_ETYPE"] <- "drought"

We can check how many records are there for each category

table(raw$PROC_ETYPE)

## 
##      avalanche       blizzard           cold        drought           dust 
##            386          14160           4568           2583            586 
##           fire          flood            fog           hail           heat 
##           4239          86107           1834         289276           2974 
##      landslide           rain sleet and snow       surftide   thunderstorm 
##            609          12130          17769           2529         352572 
##        tornado tropical storm           wind 
##          71532           1055          28404

The percent of unclassified event types is almost 1

sum(is.na(raw$PROC_ETYPE))/nrow(raw)*100

## [1] 0.995681

Let’s make a copy of the dataframe for processing the already existing records in it. We will begin with the date fields.

storm <- raw
storm$BGN_DATE <- as.Date(storm$BGN_DATE, "%m/%d/%Y")
storm$END_DATE <- as.Date(storm$END_DATE, "%m/%d/%Y")

In order to answer the question “Across the United States, which types of events have the greatest economic consequences?”, we need to look at the values in columns CROPDMG, CROPDMGEXP, PROPDMG and PROPDMGEXP. CROPDMGEXP and PROPDMGEXP are exponent values - they have to be multiplied by the values of CROPDMG and PROPDMG, respectively, in order to calculate the correct damage in US dollars. Let’s see how many numeric values are there in CROPDMGEXP and PROPDMGEXP.

nrow(storm[grepl("[0-9]", storm$PROPDMGEXP) | grepl("[0-9]", 
                                                    storm$CROPDMGEXP), ])

## [1] 320

nrow(storm[grepl("[0-9]", storm$PROPDMGEXP) |
           grepl("[0-9]", storm$CROPDMGEXP), ])/nrow(storm)*100

## [1] 0.03546504

Only about 0.04% of the data is numeric. We can skip it. What other unique values are there in PROPDMGEXP and CROPDMGEXP?

unique(storm$PROPDMGEXP)

##  [1] "K" "M" ""  "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-" "1" "8"

unique(storm$CROPDMGEXP)

## [1] ""  "M" "K" "m" "B" "?" "0" "k" "2"

And what classes are PROPDMG and CROPDMG

class(storm$PROPDMG)

## [1] "numeric"

class(storm$CROPDMG)

## [1] "numeric"

OK, then. Let’s calculate “PROP_DAMAGE” and “CROP_DAMAGE” variables by multiplying the values in PROPDMG and CROPDMG with the appropriate exponential.

storm<-
mutate(storm,
       PROP_DAMAGE=ifelse(grepl("K", PROPDMGEXP,
                                       ignore.case = TRUE),
                                 PROPDMG*1000,
    ifelse(grepl("B", PROPDMGEXP,
                 ignore.case = TRUE),
           PROPDMG*1000000000,
    ifelse(grepl("m", PROPDMGEXP,
                 ignore.case = TRUE),
           PROPDMG*1000000,
    ifelse(grepl("0", PROPDMGEXP,
                 ignore.case = TRUE),
           PROPDMG,
    ifelse(grepl("", PROPDMGEXP,
                 ignore.case = TRUE),
           0,NA))))))
storm<-
    mutate(storm,
       CROP_DAMAGE=ifelse(grepl("K", CROPDMGEXP,
                                ignore.case = TRUE),
                          CROPDMG*1000,
    ifelse(grepl("B", CROPDMGEXP,
                 ignore.case = TRUE),
           CROPDMG*1000000000,
    ifelse(grepl("M", CROPDMGEXP,
                 ignore.case = TRUE),
           CROPDMG*1000000,
    ifelse(grepl("0", CROPDMGEXP,
                 ignore.case = TRUE),
           CROPDMG,
    ifelse(grepl("", CROPDMGEXP,
                 ignore.case = TRUE),
           0,NA))))))

Now, we can add a “TOTAL_DAMAGE” variable to the dataframe, which is the sum of “CROP_DAMAGE” and “PROP_DAMAGE”.

storm<-
    mutate(storm, TOTAL_DAMAGE=CROP_DAMAGE+PROP_DAMAGE)

Additional data processing

Upon further examination of the collected data, there are some biases about the recorded event types. Before the year 1996 there are only three types of events added - Tornado, Thunderstorm Wind and Hail (for a period of 41 years) - link: https://www.ncdc.noaa.gov/stormevents/details.jsp?type=eventtype . And before the beginning of their recording, there was a five year period with only Tornado as a single recorded event (same link). Let’s see what % of all data is the one before 1996.

nrow(storm[storm$BGN_DATE<"1996-01-01",])/nrow(storm)*100

## [1] 27.57041

For a more unbiased analysis all observations before 1996 should be dropped.

storm_clean <- storm[storm$BGN_DATE>="1996-01-01",]

Results

To answer the question: “Across the United States, which types of events are most harmful with respect to population health?”, we will make a summary of the storm_clean dataframe

sum_harmful <- storm_clean %>%
    group_by(PROC_ETYPE) %>%
    summarise(recs = n(),
              meanFat = mean(FATALITIES, na.rm = T),
              sumFat = sum(FATALITIES, na.rm = T),
              meanInj = mean(INJURIES, na.rm = T),
              sumInj = sum(INJURIES, na.rm = T))
sum_harmful<-arrange(sum_harmful, desc(sumFat, sumInj))
sum_harmful

We will add a “FATProcent” variable with the percent of fatalities from their total sum.

sum_fat_proc <-
    mutate(sum_harmful, FATProcent = sumFat/sum(sumFat)*100)
sum_fat_proc

As we can see the first 5 types of events cause 75.88% of the total fatalities, with “heat” being the at the top of the list.

But what about the “Injuries” variable.

sum_harmful<-arrange(sum_harmful, desc(sumInj,sumFat))
sum_harmful

We will add an “INJProcent” variable with the percent of injuries from their total sum.

sum_inj_proc <-
    mutate(sum_harmful, INJProcent = sumInj/sum(sumInj)*100)
sum_inj_proc

As we can see the first 4 types of events cause 79.8% of total injuries, with “tornado” being the clear leader with over 35% of the total amount of injuries.

Let’s make a couple of plots to better show the fatalities and injuries by the top 6 and top 4 respective fatality and injury event types per year.

Getting the top 6 event types for “Fatalities”.

event <- c("heat","tornado","flood",
           "thunderstorm","surftide", "wind")

Preparing the data for plotting.

plot_fat <- with(
    storm_clean[storm_clean$PROC_ETYPE %in% event,],
    aggregate(FATALITIES~year(BGN_DATE)+PROC_ETYPE,
              FUN=sum))
names(plot_fat) <- c("YEAR", "EVENT_TYPE", "TOTAL_FAT")

Creating the plot.

ggplot(data = plot_fat, aes(x=YEAR, y=TOTAL_FAT, color=EVENT_TYPE)) +
    geom_line() +
    labs(title = "Total number of fatalities by event type from 1996 to 2011",
         y = "Total number of fatalities", x = "year", color="Event Type")

Now let’s get the top 4 event types for “Injuries”.

event <- sum_inj_proc[sum_inj_proc$INJProcent>10,]$PROC_ETYPE

Preparing the data for plotting.

plot_inj <-
    with(storm_clean[storm_clean$PROC_ETYPE %in% event,],
        aggregate(INJURIES~year(BGN_DATE)+PROC_ETYPE,
                  FUN=sum))
names(plot_inj) <- c("YEAR", "EVENT_TYPE", "TOTAL_INJ")

And creating the plot.

ggplot(data = plot_inj, aes(x=YEAR, y=TOTAL_INJ, color=EVENT_TYPE)) +
    geom_line() +
    labs(title = "Total number of injuries by event type from 1996 to 2011",
         y = "Total number of injuries", x = "year", color="Event Type")

To answer the question: “Across the United States, which types of events have the greatest economic consequences?”, we will make a summary of the storm_clean dataframe

sum_dam <- storm_clean %>%
    group_by(PROC_ETYPE) %>%
    summarise(recs = n(),
              meanCropDam_mill = mean(CROP_DAMAGE,
              na.rm = T)/1000000,
              sumCropDam_mill = sum(CROP_DAMAGE,
              na.rm = T)/1000000,
              meanPropDam_mill = mean(PROP_DAMAGE,
              na.rm = T)/1000000,
              sumPropDam_mill = sum(PROP_DAMAGE,
              na.rm = T)/1000000,
              sumTotal_mill = sum(TOTAL_DAMAGE,
              na.rm = T)/1000000) %>%
    arrange(desc(sumTotal_mill))

The property damage caused by weather events (in Million dollars):

sum_dam[,c("PROC_ETYPE", "recs", "sumCropDam_mill",
           "sumPropDam_mill", "sumTotal_mill")]

Let’s get the percentage for the total damage done by events.

sum_dam_perc <- mutate(sum_dam,
                       PERCsumTotal_mill =
                           sumTotal_mill/sum(sumTotal_mill)*100)
sum_dam_perc[, c("PROC_ETYPE", "recs",
                 "sumTotal_mill", "PERCsumTotal_mill")]

As we can see, the first four event types make for 83.3% of the sum total damage.

Let’s make a couple of plots to visualize the damage done by different events.

g1 <- ggplot(data = head(sum_dam), aes(x = PROC_ETYPE, y = sumTotal_mill)) +
    geom_bar(stat = "identity", fill = "red") +
    labs(x="Event Type",
         y=expression("Sum Total Damage (in millions of dollars)"))+
    labs(title=expression("Total Damage by Event Type in USA (for the six top events) from 1996 to 2011"))
g1

Now lets look at how much damage these events do by year for the period of 1996-2011.

event <- head(sum_dam$PROC_ETYPE)
tot_dam <- with(storm_clean[storm_clean$PROC_ETYPE %in% event,],
                aggregate(TOTAL_DAMAGE/1000000~year(BGN_DATE)+PROC_ETYPE,
         FUN = sum))
names(tot_dam)<-c("YEAR", "EVENT_TYPE", "TOTAL_DAMAGE")

g2 <- ggplot(data = tot_dam,
             aes(x=YEAR, y=TOTAL_DAMAGE, color=EVENT_TYPE)) +
    geom_line() +
    labs(title = "Total damage by event type (for top six events) from 1996 to 2011",
         y = "Total Damage (in millions of dollars)", x = "year", color="Event Type")
g2

Let’s make a couple of plots concerning the events that deal the most damage to Crops and Property!

sum_dam <- arrange(sum_dam, desc(sumCropDam_mill))
crop_dam_perc <- mutate(sum_dam,
                       PERCcropDam_mill =
                           sumCropDam_mill/sum(sumCropDam_mill)*100)
crop_dam_perc[,c("PROC_ETYPE", "sumCropDam_mill", "PERCcropDam_mill")]

As we can see the first five event types account for 89.12% of the total crop damage with “drought” being the leader with 38.5%.

event <- crop_dam_perc[crop_dam_perc$PERCcropDam_mill>7,
                       ]$PROC_ETYPE
tot_crop_dam <- with(storm_clean[storm_clean$PROC_ETYPE %in% event,],
                     aggregate(CROP_DAMAGE/1000000~year(BGN_DATE)+PROC_ETYPE,
                               FUN = sum))
names(tot_crop_dam)<-c("YEAR", "EVENT_TYPE", "CROP_DAMAGE")
g3 <- ggplot(data = tot_crop_dam,
             aes(x=YEAR, y=CROP_DAMAGE, color=EVENT_TYPE)) +
    geom_line() +
    labs(title = "Total crops damage by event type (for top five events) from 1996 to 2011",
         y = "Total Damage (in millions of dollars)", x = "year", color="Event Type")
g3

Now let’s look at the Sum Total of Property damage per year:

sum_dam <- arrange(sum_dam, desc(sumPropDam_mill))
prop_dam_perc <- mutate(sum_dam,
                        PERCpropDam_mill = sumPropDam_mill/sum(sumPropDam_mill)*100)
prop_dam_perc[,c("PROC_ETYPE", "sumPropDam_mill", "PERCpropDam_mill")]

As we can see the first four event types account for 87.81% with “flood” being the undisputed leader with 43.6%.

event <- prop_dam_perc[prop_dam_perc$PERCpropDam_mill>6,
                       ]$PROC_ETYPE
tot_prop_dam <- with(storm_clean[storm_clean$PROC_ETYPE %in% event,],
                     aggregate(PROP_DAMAGE/1000000~year(BGN_DATE)+PROC_ETYPE,
                               FUN = sum))
names(tot_prop_dam)<-c("YEAR", "EVENT_TYPE", "PROP_DAMAGE")
g4 <- ggplot(data = tot_prop_dam,
             aes(x=YEAR, y=PROP_DAMAGE, color=EVENT_TYPE)) +
    geom_line() +
    labs(title = "Total property damage by event type (for top four events)
from 1996 to 2011",
         y = "Total Damage (in millions of dollars)", x = "year", color="Event Type")
g4

Peer-graded Assignment: Course Project 2 for Reproducible Research course

Veselin Iliev

5/11/2021

Data Processing

Additional data processing

Results