Synopsis:

I downloaded the data and removed the columns I didn’t need. I then removed the years before 1993, as there was little event data for those years. I converted the damage values from exponent codes into real values. Finally, I ordered the data and plotted the most impactful event types.

Load the required libraries

  library(R.utils)
  library(data.table)
  library(R.cache)
  library(dplyr)
  library(lubridate)
  library(ggplot2)

Download and load the data, keeping only the fields that will be used in the analysis

    # Fetch the compressed NOAA storm data only when it is not already on disk,
    # so that re-running the analysis does not repeat the download.
    if (!file.exists("noaa.csv.bz2")) {
      download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
                    "noaa.csv.bz2", method = "curl")
    }
    # Decompress to noaa.csv; remove = FALSE keeps the archive and skip = TRUE
    # leaves an already-extracted noaa.csv untouched.
    bunzip2("noaa.csv.bz2", remove = FALSE, skip = TRUE)
## [1] "noaa.csv"
## attr(,"temporary")
## [1] FALSE
  # Read only the eight columns the analysis uses. Passing them via select=
  # lets fread skip the other columns while parsing instead of loading the
  # whole table and discarding most of it afterwards.
  noaa <- fread("noaa.csv", sep = ",",
                select = c("BGN_DATE", "EVTYPE", "FATALITIES", "INJURIES",
                           "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"))

Then I tidy up the data: convert the damage values into real numbers, parse BGN_DATE into a date-time with lubridate, and create a total damage column.

  # Translate a damage-exponent code (PROPDMGEXP / CROPDMGEXP) into the numeric
  # factor that the matching PROPDMG / CROPDMG figure is multiplied by.
  # Codes that match none of the cases below yield NA.
  dmg_multiplier <- function(exp_code) {
    case_when(
      # Billions: 1e9. This was previously 100000000 (1e8), which understated
      # every billion-dollar event by a factor of ten.
      exp_code %in% c("B", "b") ~ 1e9,
      exp_code %in% c("M", "m") ~ 1e6,
      exp_code %in% c("K", "k") ~ 1e3,
      exp_code %in% c("H", "h") ~ 1e2,
      exp_code %in% c("+") ~ 1,
      exp_code %in% c("-", "?") ~ 0,
      exp_code %in% c("", " ") ~ 0,
      # Digit codes are compared as text because the exponent column is text.
      exp_code %in% as.character(0:8) ~ 10
    )
  }

  # Derive every analysis column in one pass. Later columns may refer to
  # earlier ones because mutate() evaluates its arguments in order.
  noaa <- noaa %>%
    mutate(
      date = mdy_hms(BGN_DATE),
      year = year(date),
      cropdmgv2 = CROPDMG * dmg_multiplier(CROPDMGEXP),
      propdmgv2 = PROPDMG * dmg_multiplier(PROPDMGEXP),
      totaldmg = propdmgv2 + cropdmgv2,
      # anydmg adds damage amounts to casualty counts, so its magnitude mixes
      # units; it is only useful as a "was there any impact at all" indicator.
      anydmg = propdmgv2 + cropdmgv2 + FATALITIES + INJURIES
    )

Then I determine which years have valuable data. In this case I find that it wasn’t until 1993 that there was a significant number of distinct event types (EVTYPE).

    # Count how many distinct event types were recorded in each of the years
    # 1991 through 1994, to see when event-type coverage broadens.
    noaa %>%
      filter(year > 1990, year < 1995) %>%
      group_by(year) %>%
      summarise(count = n_distinct(EVTYPE))
## # A tibble: 4 x 2
##    year count
##   <dbl> <int>
## 1  1991     3
## 2  1992     3
## 3  1993   160
## 4  1994   267

I removed all years prior to 1993, plus any records where there was no damage or population health impact

  # Keep only records from 1993 onwards that show some impact (anydmg > 0).
  # Plain `<-` is used: this is a top-level assignment, so the super-assignment
  # operator `<<-` is unnecessary and would bypass the current environment.
  noaav2 <- subset(noaa, anydmg > 0 & year > 1992)

By ordering the data by total fatalities and plotting it, I was able to determine which events are the most impactful to the population. The types of events that cause the greatest harm to human health are Tornadoes, Heat, and Floods

  # Total fatalities and injuries per event type.
  health_by_event <- noaav2 %>%
    group_by(EVTYPE) %>%
    summarise(
      Total_Fatality = sum(FATALITIES),
      Total_Injury = sum(INJURIES),
      Total = sum(FATALITIES + INJURIES)
    )

  # Rank by fatalities (highest first) and keep the first ten event types.
  popimpact <- as.data.table(health_by_event)[order(-Total_Fatality)][1:10, ]

  # Reshape to long form: one row per event type and measure, so the three
  # measures can be drawn as side-by-side bars.
  popimpact <- melt(popimpact, id.vars = "EVTYPE", variable.name = "category")

  g <- ggplot(popimpact, aes(x = reorder(EVTYPE, -value), y = value)) +
    geom_bar(aes(fill = category), position = "dodge", stat = "identity") +
    ylab("Count") +
    xlab("Event") +
    ggtitle("Top Population Impact") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(g)

Then, by ordering the data by total damage, I was able to determine which events have done the most damage. The types of events that cause the greatest economic consequences are Floods, Tornadoes, and Hail

  # Total property and crop damage per event type.
  damage_by_event <- noaav2 %>%
    group_by(EVTYPE) %>%
    summarise(
      Prop_Dmg = sum(propdmgv2),
      Crop_Dmg = sum(cropdmgv2),
      Total = sum(propdmgv2 + cropdmgv2)
    )

  # Rank by combined damage (highest first) and keep the first ten event types.
  dmg <- as.data.table(damage_by_event)[order(-Total)][1:10, ]

  # Reshape to long form: one row per event type and measure, so the three
  # measures can be drawn as side-by-side bars.
  dmg <- melt(dmg, id.vars = "EVTYPE", variable.name = "category")

  # The plot title previously read "Properity"; corrected to "Property".
  g <- ggplot(dmg, aes(x = reorder(EVTYPE, -value), y = value)) +
    geom_bar(aes(fill = category), position = "dodge", stat = "identity") +
    ylab("Damage") +
    xlab("Event") +
    ggtitle("Top Crop and Property Damage") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(g)