Data Processing

For this particular analysis the following R packages, dplyr and ggplot2, have been loaded.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

The following describes assumptions made and the steps for loading and processing the raw files into R for analysis.

Assumptions

The events in the database start in the year 1950 and end in November 2011. In the earlier years of the database there are generally fewer events recorded, most likely due to a lack of good records. More recent years should be considered more complete.

Events that span more than one month will be entered for each month they occur, otherwise each row of the Storm Data is for a storm event.

Step 1 Downloading data

Three files were downloaded from https://d396qusza40orc.cloudfront.net/:

1. National Weather Service Storm Data Documentation
2. National Climatic Data Center Storm Events FAQ
3. The Storm Data as a raw bzip2-compressed csv file [47Mb]

# Set the working directory, creating the project folder on first run so that
# setwd() cannot fail on a fresh machine
project.dir <- "~/ResearchProject2"
if (!dir.exists(project.dir)) {
  dir.create(project.dir, recursive = TRUE)
}
setwd(project.dir)

# Files to fetch, keyed by the local file name they are saved under:
#  - stormdata.pdf and stormfaq.pdf: background documentation for the data
#    (both are PDF files, so both are saved with a .pdf extension)
#  - stormdata.csv: the raw storm data. The download is bzip2-compressed even
#    though it is stored under a .csv name; readLines() and read.csv() open it
#    through a file connection, which detects the compression when reading.
storm.files <- c(
  stormdata.pdf = "https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2Fpd01016005curr.pdf",
  stormfaq.pdf = "https://d396qusza40orc.cloudfront.net/repdata%2Fpeer2_doc%2FNCDC%20Storm%20Events-FAQ%20Page.pdf",
  stormdata.csv = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
)

# Download each file only when it is not already present, so re-running the
# analysis does not fetch the 47Mb data file again. mode = "wb" writes the
# files as binary, which keeps the PDF and bzip2 content intact on Windows.
for (dest in names(storm.files)) {
  if (!file.exists(dest)) {
    download.file(storm.files[[dest]], destfile = dest, mode = "wb")
  }
}

Step 2 Reading data into R

Using the function readLines on the first 10 records indicated that the columns were separated by commas (,) and the headings were enclosed in double quotes ("). The csv file was then read into the data frame ‘stormdata’ using the read.csv function.

# Count the lines of the raw file. This is only an approximate row count: it
# includes the header line, and it can never be smaller than the number of
# records because every record occupies at least one line.
data <- readLines("stormdata.csv", n = -1)
# There are `r length(data)` rows / observations

# Read the first 10 records of the input file to discover the column classes
stormdata10 <- read.csv("stormdata.csv", sep = ",", quote = "\"",
                        stringsAsFactors = FALSE, nrows = 10)

# Take the class read.csv() guessed for each column. vapply() guarantees a
# named character vector (one class per column), which is what colClasses
# expects.
classes <- vapply(stormdata10, function(column) class(column)[1], character(1))

# Force the following columns (by position) to character class:
# BGN_TIME(3), BGN_AZI(10), column 11, END_DATE(12), END_TIME(13),
# COUNTYENDN(15), END_AZI(17), END_LOCATI(18), F(21), column 26,
# CROPDMGEXP(28), WFO(29), STATEOFFICE(30), ZONENAMES(31), REMARKS(36).
# NOTE(review): column 11 is presumably BGN_LOCATI and column 26 presumably
# PROPDMGEXP -- confirm against names(stormdata10).
# Presumably these are columns whose class cannot be guessed reliably from
# only 10 rows.
character.cols <- c(3, 10:13, 15, 17, 18, 21, 26, 28:31, 36)
classes[character.cols] <- "character"

# Read the full file with the adjusted column classes. nrows is only an upper
# bound used to size the read, so the line count computed above is sufficient.
stormdata <- read.csv("stormdata.csv", sep = ",", quote = "\"",
                      stringsAsFactors = FALSE, nrows = length(data),
                      colClasses = classes)

There are 37 variables in the loaded data.

Step 3 Data Transformations

The stormdata headings were converted to lower case.
The bgn.date and end.date variables were converted to Date class.
Underscores were removed from the variable names and replaced with “.” (a double underscore was replaced with “.ref”).
The ending longitude name was left as ‘longitude.’ to differentiate it from the starting-point ‘longitude’ and ‘latitude’ variable names. The ending latitude is labelled ‘latitude.e’.
In reading in the data, the list of variables described above was converted to character class.

# Preprocessing the csv data file

# Normalise the headings: lower case, "__" becomes ".ref" and every remaining
# "_" becomes "." (the "__" substitution has to run first)
names(stormdata) <- tolower(names(stormdata))
names(stormdata) <- gsub("__", ".ref", names(stormdata))
names(stormdata) <- gsub("_", ".", names(stormdata))

# Convert the begin and end dates from text to Date class. Each column must be
# parsed from its own text: deriving end.date from bgn.date would make the two
# columns identical and hide every event that ends on a later day.
stormdata$bgn.date <- as.Date(x = stormdata$bgn.date, format = "%m/%d/%Y")
stormdata$end.date <- as.Date(x = stormdata$end.date, format = "%m/%d/%Y")

# Histogram of the event start dates, showing how the number of recorded
# events changes over time
ggplot(stormdata, aes(x = bgn.date)) +
  geom_histogram() +
  ggtitle("Frequency of Storm Events recorded from 1950 to 2011") +
  xlab("Beginning Date")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Check for events lasting longer than one day: keep the rows whose end date
# differs from the start date and count them per start date
summarise(
  group_by(filter(stormdata, bgn.date != end.date), bgn.date),
  count = n()
)

## # A tibble: 0 × 2
## # ... with 2 variables: bgn.date <date>, count <int>

Step 4 Analysing the data to answer the questions

# Calculate the total number of distinct event types
tottype <- length(unique(stormdata$evtype))

# Total injuries per event type, as a one-column data frame 'storm.inj.sums'
# with the event types as row names. The sum runs over every event record:
# collapsing to distinct (evtype, injuries) pairs first would count each
# distinct injury value only once and badly understate the totals.
storm.inj.sums <- tapply(stormdata$injuries, stormdata$evtype, sum)
storm.inj.sums <- data.frame(storm.inj.sums)

# Total fatalities per event type, as a one-column data frame
# 'storm.fat.sums', built the same way
storm.fat.sums <- tapply(stormdata$fatalities, stormdata$evtype, sum)
storm.fat.sums <- data.frame(storm.fat.sums)

# Create a data frame combining the total injuries and total fatalities by
# evtype. Both tapply() results are ordered by the same evtype values, so the
# rows line up. The event types are moved from the row names into an 'evtype'
# column with tibble::rownames_to_column(), the replacement for the deprecated
# dplyr::add_rownames(); as_tibble() keeps the tibble class that
# add_rownames() used to return.
stormsums <- cbind(storm.inj.sums, storm.fat.sums)
stormsums <- tibble::as_tibble(tibble::rownames_to_column(stormsums, "evtype"))

## Warning: Deprecated, use tibble::rownames_to_column() instead.

# Mean, across event types, of the fatality totals and of the injury totals
fatmean <- mean(stormsums$storm.fat.sums)
injmean <- mean(stormsums$storm.inj.sums)

# The number of event types is too large to view on a plot, so flag the event
# types whose totals exceed the mean, keep only those above the mean for both
# fatalities and injuries, and sort them by total injuries, largest first
stormsums <- stormsums %>%
  mutate(
    above.mean.f = storm.fat.sums > fatmean,
    above.mean.i = storm.inj.sums > injmean
  ) %>%
  filter(above.mean.f, above.mean.i) %>%
  arrange(desc(storm.inj.sums))
  
# Scatter plot of total injuries against total fatalities for the first six
# rows of stormsums, i.e. the six event types with the most injuries among
# those above the mean (stormsums is sorted by total injuries, descending).
# Each event type gets its own point shape.
ggplot(
  stormsums[1:6, ],
  aes(x = storm.fat.sums, y = storm.inj.sums, shape = evtype)
) +
  geom_point() +
  ggtitle("Total Injuries Compared to Total Fatalities for Storm Events Across the US 1950 to 2011") +
  xlab("Total Fatalities") +
  ylab("Total Injuries")

# Viewing the values of 'propdmg' and 'cropdmg', and referring to the
# documentation, these are the property damage and crop damage in $ (i.e. the
# economic impact), but the values are not consistently denominated: the
# 'propdmgexp' / 'cropdmgexp' columns carry a magnitude code for each amount.
#
# Codes handled here: "" (amount taken as-is), "K" (thousands), "M" (millions)
# and "B" (billions), matched case-insensitively so that e.g. "k" and "m" are
# counted too. Leaving out "B" would drop the largest single losses from the
# totals. Records with any other code are excluded from the totals.

# Property damage in dollars per event type, one table per magnitude code
stormpropsums <- stormdata %>%
  filter(propdmgexp == "") %>%
  group_by(evtype) %>%
  summarise(sum.prop = sum(propdmg))
stormpropsumsK <- stormdata %>%
  filter(toupper(propdmgexp) == "K") %>%
  group_by(evtype) %>%
  summarise(sum.prop = sum(propdmg * 1e3))
stormpropsumsM <- stormdata %>%
  filter(toupper(propdmgexp) == "M") %>%
  group_by(evtype) %>%
  summarise(sum.prop = sum(propdmg * 1e6))
stormpropsumsB <- stormdata %>%
  filter(toupper(propdmgexp) == "B") %>%
  group_by(evtype) %>%
  summarise(sum.prop = sum(propdmg * 1e9))

# Crop damage in dollars per event type, one table per magnitude code
stormcropsums <- stormdata %>%
  filter(cropdmgexp == "") %>%
  group_by(evtype) %>%
  summarise(sum.crop = sum(cropdmg))
stormcropsumsK <- stormdata %>%
  filter(toupper(cropdmgexp) == "K") %>%
  group_by(evtype) %>%
  summarise(sum.crop = sum(cropdmg * 1e3))
stormcropsumsM <- stormdata %>%
  filter(toupper(cropdmgexp) == "M") %>%
  group_by(evtype) %>%
  summarise(sum.crop = sum(cropdmg * 1e6))
stormcropsumsB <- stormdata %>%
  filter(toupper(cropdmgexp) == "B") %>%
  group_by(evtype) %>%
  summarise(sum.crop = sum(cropdmg * 1e9))

# Combine the converted tables: stack the per-code tables, then add them up so
# that each event type has a single total, largest first
stormecosumsp <- bind_rows(
  stormpropsums, stormpropsumsK, stormpropsumsM, stormpropsumsB
) %>%
  group_by(evtype) %>%
  summarise(sum.prop = sum(sum.prop)) %>%
  arrange(desc(sum.prop))

stormecosumsc <- bind_rows(
  stormcropsums, stormcropsumsK, stormcropsumsM, stormcropsumsB
) %>%
  group_by(evtype) %>%
  summarise(sum.crop = sum(sum.crop)) %>%
  arrange(desc(sum.crop))

# One row per event type with both totals (NA where an event type has no
# records of that kind), sorted by property damage, largest first. The result
# is left ungrouped: there is one row per evtype, so grouping by evtype would
# only make later verbs operate row by row.
stormecosums <- full_join(stormecosumsc, stormecosumsp, by = "evtype") %>%
  arrange(desc(sum.prop))

# Scatter plot of total crop damage against total property damage for the
# first six rows of stormecosums, i.e. the six event types with the highest
# property damage (stormecosums is sorted by sum.prop, descending). Each event
# type gets its own point shape. The title describes the damage totals shown
# on the axes.
ggplot(
  stormecosums[1:6, ],
  aes(x = sum.prop, y = sum.crop, shape = evtype)
) +
  geom_point() +
  ggtitle("Total Crop Damage Compared to Total Property Damage for Storm Events Across the US 1950 to 2011") +
  xlab("Total Property Damage($)") +
  ylab("Total Crop Damage($)")

Explore the NOAA Storm Database and answer questions about severe weather events

kimnewzealand

22 May 2017

Synopsis