Synopsis

This report presents partial summary statistics for data from the NOAA database containing information about extreme weather events in the period 1950-2011. The R program first calculates the total cost attributed to each event. For both property damage and crop damage, the cost is based on two columns: one containing a dollar amount and a second containing a code for the unit of that amount, i.e. hundreds, thousands, millions, etc. (denoted by h, k, m respectively). The program then summarizes data on fatalities and injuries and combines these in a single variable “health”. The top 20 events with respect to fatalities and injuries, and with respect to economic consequences, are displayed in two bar charts. Tornadoes are the most dangerous weather event to health, whereas floods are the most costly to property and crops.

Data processing

Load packages and data from website and make dataframe

# dplyr supplies select(), which is used below to subset the columns. It is
# loaded explicitly (with start-up messages silenced) so that the code runs
# exactly as shown in the report.
suppressPackageStartupMessages(library(dplyr))

# Download the compressed storm data to a temporary file. Nothing is written
# to the working directory, so no setwd() call is needed and the script does
# not depend on a machine-specific path.
temp <- tempfile()
download.file(
  "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
  destfile = temp, mode = "wb", method = "curl"
)
datedownloaded <- date()  # time stamp of the download, for reproducibility

# read.csv() is used instead of read.table(sep = ","): its defaults treat only
# the double quote as a quote character and disable comment characters, which
# is the safe choice for a CSV file containing free-text fields.
data <- read.csv(temp, header = TRUE)
unlink(temp)  # remove the temporary download

The variables in this dataset

colnames(data)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

Data are subsetted to the columns needed for the analysis

# Keep only the event type plus the casualty and damage columns used below
needed_columns <- c("EVTYPE", "FATALITIES", "INJURIES",
                    "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
data <- data[, needed_columns]
head(data, n = 5)
##    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO          0       15    25.0          K       0           
## 2 TORNADO          0        0     2.5          K       0           
## 3 TORNADO          0        2    25.0          K       0           
## 4 TORNADO          0        2     2.5          K       0           
## 5 TORNADO          0        2     2.5          K       0
# Collect the distinct codes that occur in the two *EXP columns
uni1 <- unique(data$PROPDMGEXP)
uni2 <- unique(data$CROPDMGEXP)
# Distinct codes found in the PROPDMGEXP column:
uni1
##  [1] K M   B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels:  - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
## Distinct codes found in the CROPDMGEXP column (uni2 is computed above
## with unique(data$CROPDMGEXP)):
uni2
## [1]   M K m B ? 0 k 2
## Levels:  ? 0 2 B k K m M

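New columns are created for calculating the costs to property and crops by converting the codes in the PROPDMGEXP and CROPDMGEXP columns to numeric multipliers (H=100, K=1000, M=1000000 and B=1000000000). Codes that are not valid (blank, symbols or digits) are given a multiplier of 1.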

# Multipliers for the recognised exponent codes. Codes are matched
# case-insensitively, so "k"/"K", "m"/"M", "h"/"H" and "b"/"B" are treated
# alike in both the property and the crop column.
exp_multiplier <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)

# Translate a vector of exponent codes into numeric multipliers.
#
# codes: character or factor vector of exponent codes.
# Returns a numeric vector of the same length. Every code that is not one of
# H, K, M or B (blank, symbols, digits, NA, or anything unexpected) gets a
# multiplier of 1, so the damage amount is kept as recorded rather than being
# zeroed out.
code_to_multiplier <- function(codes) {
  multiplier <- unname(exp_multiplier[toupper(as.character(codes))])
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

# factor1 scales PROPDMG, factor2 scales CROPDMG
data$factor1 <- code_to_multiplier(data$PROPDMGEXP)
data$factor2 <- code_to_multiplier(data$CROPDMGEXP)

Two new columns summing up health damage and total costs are created

# Combined casualty count per record: deaths plus injuries
data$health <- with(data, FATALITIES + INJURIES)
# Combined cost per record: property and crop damage, each scaled by the
# multiplier derived from its exponent column
data$eco <- with(data, PROPDMG * factor1 + CROPDMG * factor2)

Results

The most dangerous kind of weather to human health is TORNADOES

# Total fatalities plus injuries per event type. The data frame keeps the
# event types as row names and the totals in a single column named "total".
health <- as.data.frame(tapply(data$health, data$EVTYPE, sum))
names(health) <- "total"

# Re-attach the event types to the totals before ranking: a data-frame column
# carries no names, so plotting sort(health$total) directly would leave the
# bars without event-type labels.
health_ranked <- sort(setNames(health$total, rownames(health)),
                      decreasing = TRUE)
# head() rather than [1:20], so no NA bars appear if there are fewer than
# 20 event types
health_top <- head(health_ranked, 20)

par(mar = c(10, 8, 2, 2))  # large bottom margin for the perpendicular labels
t <- barplot(health_top, names.arg = names(health_top), las = 2,
             col = "blue", ylab = "Number of fatalities and injuries",
             main = "Total cases by category")

The most costly weather disasters are FLOODS

# Total economic damage (property plus crops) per event type. The data frame
# keeps the event types as row names and the totals in a column named "total".
eco <- as.data.frame(tapply(data$eco, data$EVTYPE, sum))
names(eco) <- "total"

# Re-attach the event types to the totals before ranking: a data-frame column
# carries no names, so plotting sort(eco$total) directly would leave the bars
# without event-type labels.
eco_ranked <- sort(setNames(eco$total, rownames(eco)), decreasing = TRUE)
# head() rather than [1:20], so no NA bars appear if there are fewer than
# 20 event types
eco_top <- head(eco_ranked, 20)

par(mar = c(10, 8, 2, 2))  # large bottom margin for the perpendicular labels
t <- barplot(eco_top, names.arg = names(eco_top), col = "red", las = 2,
             ylab = "Total economic damage ($)",
             main = "Top 20 events with\n greatest economic consequences")

Acknowledgement: Thanks to “rpubs.com/wangw5/80740” for providing a solution to the conversion of characters (h,k,m,b) to integer problem!