Synopsis

The purpose of this research project is to asses the impact of weather events on public health and the economy. Using the NOAA Storm Database, which contains data from the years 1950-2011 about wheater events, Property and Crop cost damage as well as numbers of injuries and fatalities in the US. The research aim is to find out the most impact events in term of economic (cost), and public health (number of injuries and fatalities).

Data Processing

Load the Data Set

Download the files and unzip them into a data directory.

Load R libreries

if(!file.exists("data")) {  
        dir.create("data")  
}  
if(!file.exists("./data/StormData.csv.bz2")) {  
        fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"  
        download.file(fileUrl, destfile="./data/StormData.csv.bz2")  
        if(!file.exists("./data/StormData.csv"))   
                bunzip2("./data/StormData.csv.bz2", "./data/StormData.csv", remove = FALSE, skip = TRUE) 
}
StormsData<-read.csv("./data/StormData.csv")

Initial review

Look at the data’s dimensions.

dim(StormsData)
## [1] 902297     37

preliminary analyzes of the data using str function.

names(StormsData)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"

Processing

Create a subset data frame which will use the relevant variables only: EVTYPE - The event type is used as in the raw data. BGN_TIME - This is the exact begin time, it will be used to extract the year of the event. PROPDMG - Property damage cost. PROPDMGEXP - Property Damage Cost Factor (K-1000,M-1000000,B-1000000000). CROPDMG - Crop damage cost. CROPDMGEXP-Crop damage cost factor (K-1000,M-1000000,B-1000000000) FATALITIES - Number of deaths. INJURIES - Number of injuries preliminary analyzes of the data using str.

StormDF<-data.frame(Event.Type=StormsData$EVTYPE)

The data analyzed will be based on years - Create a new column, which will have the year of the event, which will be extracted from BGN_TIME.

StormDF<-cbind(StormDF,years = with(StormsData,format(as.Date(BGN_DATE, format="%m/%d/%Y"),"%Y")))

Add the fatalities and injuries columns from the raw data.

Fatalities.No<-StormsData$FATALITIES

Injuries.No<-StormsData$INJURIES

StormDF<-cbind(StormDF,data.frame(Fatalities.No))

StormDF<-cbind(StormDF,data.frame(Injuries.No))

Create the property damage cost column: Property.Damage.Cost - The calculation is done using the PROPDMG and the PROPDMGEXP raw data. The factor extracted from the PROPDMGEXP columns. The key to multiple is based on this dictionary : K - 1000, M - 1000000 B - 1000000000 Any other value is assumed to be error and thus using zero as a factor (which result as a zero property cost damage)

#Copy the two property damage columns to the new data frame.
StormDF$Property.Damage.Cost<-StormsData$PROPDMG
StormDF$Property.Damage.Factor<-StormsData$PROPDMGEXP
#Build a new column with (K - 1000, M - 1000000, B - 1000000000)
#Any other value is treated as an error,  and set to zero.
StormDF<-StormDF %>%  
      mutate(Property.Damage.Cost.Factor = as.numeric(case_when(
      Property.Damage.Factor =="K" ~ 1000,  Property.Damage.Factor =="M" ~ 1000000,
      Property.Damage.Factor =="B" ~ 1000000000,(Property.Damage.Factor !="M" |Property.Damage.Factor !="B" | Property.Damage.Factor !="K")~0)))
#Replace the Damage cost column with the Damage.Cost * Damage.Cost.Factor.
StormDF$Property.Damage.Cost<-StormDF$Property.Damage.Cost * StormDF$Property.Damage.Cost.Factor
#Present a summary using table.
table(StormDF$Property.Damage.Cost.Factor)
## 
##      0   1000  1e+06  1e+09 
## 466262 424665  11330     40
#remove the temporary columns.
StormDF<-select(StormDF,Event.Type,years,Fatalities.No,Injuries.No,Property.Damage.Cost)
head(StormDF)
##   Event.Type years Fatalities.No Injuries.No Property.Damage.Cost
## 1    TORNADO  1950             0          15                25000
## 2    TORNADO  1950             0           0                 2500
## 3    TORNADO  1951             0           2                25000
## 4    TORNADO  1951             0           2                 2500
## 5    TORNADO  1951             0           2                 2500
## 6    TORNADO  1951             0           6                 2500

Create the crop damage cost column: Crop.Damage.Cost - The calculation is done using the CROPDMG and the CROPDMGEXP raw data. The factor extracted from the CROPDMGEXP columns. The key to multiple is based on this dictionary : K - 1000, M - 1000000 B - 1000000000 Any other value is assumed to be error and thus using zero as a factor (which result as a zero crop cost damage).

#Copy the Two CROP Damage Columns to the new DataFrame
StormDF$Crop.Damage.Cost<-StormsData$CROPDMG
StormDF$Crop.Damage.Factor<-StormsData$CROPDMGEXP
#Build a new column with (K - 1000, M - 1000000, B - 1000000000)
#Any other value is treated as an error,  and set to zero.
StormDF<-StormDF %>%  
      mutate(Crop.Damage.Cost.Factor = as.numeric(case_when(
      Crop.Damage.Factor =="K" ~ 1000,  Crop.Damage.Factor =="M" ~ 1000000,
      Crop.Damage.Factor =="B" ~ 1000000000,(Crop.Damage.Factor !="M" |Crop.Damage.Factor !="B" | Crop.Damage.Factor !="K")~0)))
#Replace the crop cost column with the Crop.Cost * Damage.Crop.Factor.
StormDF$Crop.Damage.Cost<-StormDF$Crop.Damage.Cost * StormDF$Crop.Damage.Cost.Factor
#Present a summary using table.
table(StormDF$Crop.Damage.Cost.Factor)
## 
##      0   1000  1e+06  1e+09 
## 618462 281832   1994      9
#remove the temporary columns.
StormDF<-select(StormDF,Event.Type,years,Fatalities.No,Injuries.No,Property.Damage.Cost,Crop.Damage.Cost)
head(StormDF)
##   Event.Type years Fatalities.No Injuries.No Property.Damage.Cost
## 1    TORNADO  1950             0          15                25000
## 2    TORNADO  1950             0           0                 2500
## 3    TORNADO  1951             0           2                25000
## 4    TORNADO  1951             0           2                 2500
## 5    TORNADO  1951             0           2                 2500
## 6    TORNADO  1951             0           6                 2500
##   Crop.Damage.Cost
## 1                0
## 2                0
## 3                0
## 4                0
## 5                0
## 6                0

Sum the number of fatalities per event type, using the aggregate function (along the years).

#Aggregate based on Event Type - along 1950 - 2011
Fatalities<-aggregate(StormDF$Fatalities.No,by=list(Category=StormDF$Event.Type),sum)

Fatalities$Type<-"Fatlity"

colnames(Fatalities)<-c("Event.Type","Total","Type")

Fatalities<-Fatalities[order(Fatalities$Total, decreasing = T), ]

Sum the number of injuries per event type, using the aggregate function (along the years)

#Aggregate based on Event Type - along 1950 - 2011
Injuries<-aggregate(StormDF$Injuries.No,by=list(Category=StormDF$Event.Type),sum)

Injuries$Type<-"Injury"

colnames(Injuries)<-c("Event.Type","Total","Type")

Injuries<-Injuries[order(Injuries$Total, decreasing = T), ]

Create a combined data frame for injuries and fatalities to create the overall population health impact, for all years (1950-2011), then select the top 10 events. Using the join function, as it keeps the order of the two joined data frame.

Health<-join(Injuries,Fatalities,by="Event.Type",type="inner")
#subset - top 10 events.
Health<-head(Health,10)

head(Health)
##       Event.Type Total   Type Total    Type
## 1        TORNADO 91346 Injury  5633 Fatlity
## 2      TSTM WIND  6957 Injury   504 Fatlity
## 3          FLOOD  6789 Injury   470 Fatlity
## 4 EXCESSIVE HEAT  6525 Injury  1903 Fatlity
## 5      LIGHTNING  5230 Injury   816 Fatlity
## 6           HEAT  2100 Injury   937 Fatlity
#Aggregate based on Event Type 
Cost.Property.Damage<-aggregate(StormDF$Property.Damage.Cost,by=list(Category=StormDF$Event.Type),sum)

Cost.Property.Damage$Type<-"Property"

colnames(Cost.Property.Damage)<-c("Event.Type","Cost","Type")

Cost.Property.Damage<-Cost.Property.Damage[order(Cost.Property.Damage$Cost, decreasing = T), ]

Sum the total property dmage cost per event type along the years (1950 - 2011).

#Aggregate based on Event Type 
Cost.Crop.Damage<-aggregate(StormDF$Crop.Damage.Cost,by=list(Category=StormDF$Event.Type),sum)

Cost.Crop.Damage$Type<-"Crop"

colnames(Cost.Crop.Damage)<-c("Event.Type","Cost","Type")

Cost.Crop.Damage<-Cost.Crop.Damage[order(Cost.Crop.Damage$Cost, decreasing = T), ]

Create a combined data frame for property damage cost and crop damage cost to create the overall economic impact, for all years (1950-2011), then select the top 10 events. Using the join function, as it keeps the order of the two joined data frame.

Cost<-join(Cost.Property.Damage,Cost.Crop.Damage,by="Event.Type",type="inner")
#Subset by top 10
Cost<-head(Cost,10)

head(Cost)
##          Event.Type         Cost     Type       Cost Type
## 1             FLOOD 144657709800 Property 5661968450 Crop
## 2 HURRICANE/TYPHOON  69305840000 Property 2607872800 Crop
## 3           TORNADO  56925660480 Property  414953110 Crop
## 4       STORM SURGE  43323536000 Property       5000 Crop
## 5       FLASH FLOOD  16140811510 Property 1421317100 Crop
## 6              HAIL  15727366720 Property 3025537450 Crop

Results

This section contains the answers to the two questions:

1.Across the United States, which types of events (as indicated in the EVENT TYPE variable) are most harmful concerning population health?

2.Across the United States, which types of events have the most significant economic consequences?

Harmful events - Population Health

Plot the top 10 combined injuries and fatalities events for 1950-2011 Using the melt function (reshape2 library) to organize the data to allow plotting at the same panel.

colnames(Health)<-c("Event.Type","Total.Injuries","Type.Injuries","Total.Fatalities","Type.Fatalities")

Health_long <- melt(Health, id.vars = c("Event.Type","Type.Injuries","Type.Fatalities"))

theme_update(plot.title = element_text(hjust = 0.5))

ggplot(data=Health_long,aes(x=Event.Type, y=value, fill=variable)) +
            geom_bar(stat="identity",position ="stack") +theme(axis.text.x = element_text(size = 10,angle = -45))+labs(y="Total Injuries/Fatalities",x = "Event Type", title = "Health impact by event-Years 1950-2011",size=12)+scale_fill_manual("Impact Type", values = c("Total.Injuries" = "gray", "Total.Fatalities" = "red",size=12))+theme(axis.text.y.left =   element_text(size = 10))+theme(axis.text.y.right =  element_text(size = 10))

Harmful events - Cost Impact

Plot the top 10 combined cost of the property and crop damages events for the years 1950-2011
Using the melt function (reshape2 library) organize the data to allow plotting at the same panel.

colnames(Cost)<-c("Event.Type","Total.Property","Type.Property","Total.Crop","Type.Crop")

Cost_long <- melt(Cost, id.vars = c("Event.Type","Type.Property","Type.Crop"))

theme_update(plot.title = element_text(hjust = 0.5))

ggplot(data=Cost_long,aes(x=Event.Type, y=value, fill=variable)) +
   geom_bar(stat="identity",position ="stack") +theme(axis.text.x = element_text(size = 10,angle = -45))+labs(y="Total Property/Crop Cost",x = "Event Type", title = "Cost impact by event-Years 1950-2011")+scale_fill_manual("Cost Type", values = c("Total.Property" = "gray", "Total.Crop" = "red",size=10))+theme(axis.text.y.left =   element_text(size = 10))+theme(axis.text.y.right =  element_text(size = 10))

Conclusion

When analyzing the two plots, it can be seen that for economic impact the most harmful events are floods, regarding public health, the most harmful event is Tornado. We can also see that regarding the economic impact, the property damages are more significant then crop damages costs.