Title

Health and Economic Impacts of US severe weather

Synopsis

This project is part of the course assignment for the data science course. In this document, we try to answer two major questions. First, across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health? Second, across the United States, which types of events have the greatest economic consequences? To answer these questions we use the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Download the storm data (only once), read it, and preview the first rows.

url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# NOTE: despite the ".csv" name the downloaded file is bzip2-compressed;
# read.csv() decompresses it transparently.
destfile <- "stormdata.csv"
# Skip the download when a local copy already exists. mode = "wb" keeps the
# compressed (binary) file intact on every platform, and the default download
# method avoids requiring an external curl executable.
if (!file.exists(destfile)) {
  download.file(url, destfile, mode = "wb")
}
sotrmdata <- read.csv(destfile)
head(sotrmdata)
##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE  EVTYPE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL TORNADO
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL TORNADO
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL TORNADO
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL TORNADO
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL TORNADO
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL TORNADO
##   BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1         0                                               0         NA
## 2         0                                               0         NA
## 3         0                                               0         NA
## 4         0                                               0         NA
## 5         0                                               0         NA
## 6         0                                               0         NA
##   END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1         0                      14.0   100 3   0          0       15    25.0
## 2         0                       2.0   150 2   0          0        0     2.5
## 3         0                       0.1   123 2   0          0        2    25.0
## 4         0                       0.0   100 2   0          0        2     2.5
## 5         0                       0.0   150 2   0          0        2     2.5
## 6         0                       1.5   177 2   0          0        6     2.5
##   PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1          K       0                                         3040      8812
## 2          K       0                                         3042      8755
## 3          K       0                                         3340      8742
## 4          K       0                                         3458      8626
## 5          K       0                                         3412      8642
## 6          K       0                                         3450      8748
##   LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1       3051       8806              1
## 2          0          0              2
## 3          0          0              3
## 4          0          0              4
## 5          0          0              5
## 6          0          0              6
# Inspect the structure (column names and types) of the raw data frame.
str(sotrmdata)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ BGN_TIME  : chr  "0130" "0145" "1600" "0900" ...
##  $ TIME_ZONE : chr  "CST" "CST" "CST" "CST" ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ STATE     : chr  "AL" "AL" "AL" "AL" ...
##  $ EVTYPE    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : chr  "" "" "" "" ...
##  $ BGN_LOCATI: chr  "" "" "" "" ...
##  $ END_DATE  : chr  "" "" "" "" ...
##  $ END_TIME  : chr  "" "" "" "" ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : chr  "" "" "" "" ...
##  $ END_LOCATI: chr  "" "" "" "" ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: chr  "K" "K" "K" "K" ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: chr  "" "" "" "" ...
##  $ WFO       : chr  "" "" "" "" ...
##  $ STATEOFFIC: chr  "" "" "" "" ...
##  $ ZONENAMES : chr  "" "" "" "" ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : chr  "" "" "" "" ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

From the output of the “str” function we can see that the data contain 37 variables and 902297 observations; the variables are numeric, character, integer, or logical. Based on this output, we conclude that 7 variables are of interest. In the following work, those 7 variables will be selected for further analysis.

#Transforming and subsetting data According to NOAA, data recording started in Jan. 1950. However, at that time only one event type was recorded; only from Jan. 1996 onward have all event types been recorded. Since we are interested in comparing the effects of different weather events, the data set should contain only observations that began no earlier than Jan. 1996. ## Transforming and subsetting

# Parse the event begin date as a plain Date. strptime() would yield a
# POSIXlt column (awkward inside a data frame) interpreted in the local time
# zone, where a non-existent midnight turns into NA and the row is then
# silently dropped by subset().
sotrmdata$BGN_DATE <- as.Date(sotrmdata$BGN_DATE, format = "%m/%d/%Y %H:%M:%S")
# Keep only events that began on or after 1 January 1996.
data <- subset(sotrmdata, BGN_DATE >= as.Date("1996-01-01"))

Selecting variables

The variables of interest are: * EVTYPE: Type of event * FATALITIES: Number of fatalities * INJURIES: Number of injuries * PROPDMG: Size of property damage * PROPDMGEXP: The exponent values for PROPDMG * CROPDMG: Size of crop damage * CROPDMGEXP: The exponent values for CROPDMG

# Columns needed for the health and economic analyses.
varint <- c(
  "EVTYPE",
  "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP",
  "CROPDMG", "CROPDMGEXP"
)
# Keep every row, but only the columns listed above.
maindata <- data[, varint, drop = FALSE]

Homogenizing and subsetting non-zero data

To homogenize the events in EVTYPE, we convert the values to upper case. Further, we keep only the rows with a non-zero value in at least one of our target variables (fatalities, injuries, property damage, crop damage).

# Normalise event labels to upper case so that spellings differing only in
# letter case are counted as the same event type.
maindata$EVTYPE <- toupper(maindata$EVTYPE)

# Drop the rows in which all four impact measures are zero.
maindata <- maindata[
  !with(
    maindata,
    FATALITIES == 0 & INJURIES == 0 & PROPDMG == 0 & CROPDMG == 0
  ),
]

Now our data are ready to be analyzed #RESULTS Q1: Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

# Total fatalities per event type; keep the ten largest totals
# (ties are broken alphabetically by event type).
fatalities <- arrange(
  aggregate(FATALITIES ~ EVTYPE, data = maindata, FUN = sum),
  desc(FATALITIES), EVTYPE
)[seq_len(10), ]
# Same ranking for injuries.
injuries <- arrange(
  aggregate(INJURIES ~ EVTYPE, data = maindata, FUN = sum),
  desc(INJURIES), EVTYPE
)[seq_len(10), ]
# Bar chart of the ten event types with the most fatalities. Bars are ordered
# by magnitude (largest first) instead of alphabetically, so the ranking can
# be read directly from the plot.
ggplot(fatalities, aes(x = reorder(EVTYPE, -FATALITIES), y = FATALITIES)) +
  geom_bar(stat = "identity", fill = "red") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Event Type") + ylab("Fatalities")

# Same chart for injuries, again ordered from largest to smallest.
ggplot(injuries, aes(x = reorder(EVTYPE, -INJURIES), y = INJURIES)) +
  geom_bar(stat = "identity", fill = "orange") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Event Type") + ylab("INJURIES")

Q2: Across the United States, which types of events have the greatest economic consequences?

# Convert damage-exponent codes into powers of ten.
#
# Every element of `code` goes through the same substitutions:
#   H/h -> 2, K/k -> 3, M/m -> 6, B/b -> 9, "+" -> 1, "?", "-", " " -> 0.
# Digits are kept as they are; anything that still cannot be parsed as a
# number afterwards (including the empty string) becomes 0.
#
# A single helper replaces two hand-copied pipelines (one per column), so the
# property and crop conversions cannot drift apart.
exponent_to_power <- function(code) {
  substitutions <- c(
    "[Hh]" = "2",
    "[Kk]" = "3",
    "[Mm]" = "6",
    "[Bb]" = "9",
    "\\+" = "1",
    "\\?|\\-|\\ " = "0"
  )
  for (pattern in names(substitutions)) {
    code <- gsub(pattern, substitutions[[pattern]], code)
  }
  power <- as.numeric(code)
  power[is.na(power)] <- 0
  power
}

maindata$PROPDMGEXP <- exponent_to_power(maindata$PROPDMGEXP)
maindata$CROPDMGEXP <- exponent_to_power(maindata$CROPDMGEXP)

# Damage in dollars = mantissa * 10^exponent.
maindata <- mutate(
  maindata,
  PROPDMGTOTAL = PROPDMG * (10^PROPDMGEXP),
  CROPDMGTOTAL = CROPDMG * (10^CROPDMGEXP)
)

Summing economic consequences

# Sum property and crop damage per event type.
Economic_data <- aggregate(
  cbind(PROPDMGTOTAL, CROPDMGTOTAL) ~ EVTYPE,
  data = maindata,
  FUN = sum
)
# Combined loss = property damage + crop damage.
Economic_data$ECONOMIC_LOSS <- with(Economic_data, PROPDMGTOTAL + CROPDMGTOTAL)
# Sort from the largest to the smallest combined loss.
Economic_data <- Economic_data[
  with(Economic_data, order(ECONOMIC_LOSS, decreasing = TRUE)),
]
# Event type and combined loss for the ten costliest event types.
worsteconomicevents <- Economic_data[seq_len(10), c("EVTYPE", "ECONOMIC_LOSS")]
worsteconomicevents
##                EVTYPE ECONOMIC_LOSS
## 48              FLOOD  147456390150
## 88  HURRICANE/TYPHOON   71913712800
## 141       STORM SURGE   43193541000
## 149           TORNADO   24887430720
## 66               HAIL   17056759620
## 46        FLASH FLOOD   16541801360
## 86          HURRICANE   14554229010
## 32            DROUGHT   14413057000
## 152    TROPICAL STORM    8320186550
## 83          HIGH WIND    5881288560

Loss per event type

# Bar chart of the ten costliest event types. Bars are ordered by total loss
# (largest first) instead of alphabetically, so the ranking is visible.
ggplot(
  worsteconomicevents,
  aes(x = reorder(EVTYPE, -ECONOMIC_LOSS), y = ECONOMIC_LOSS)
) +
  geom_bar(stat = "identity", fill = "green") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Event Type") + ylab("Total Prop & Crop Damages (USD)") +
  ggtitle("Total economic loss in the US in the period 1996 - 2011 by weather event")