This project aims to analyze the public health and economic problems brought about by storms and other weather events in communities and municipalities in the US. It involves the use of the storm database from the U.S. National Oceanic and Atmospheric Administration. This database tracks the characteristics of major storms and weather events, including the time and place of occurrence, as well as estimates of any fatalities, injuries, and property damage. The events included in the analysis are from the year 1950 to November 2011.
The project is divided into two parts:
1. PUBLIC HEALTH: The first part determines the most harmful effects of weather events to population health.
1. LOADING THE PACKAGES
library(dplyr)
library(ggplot2)
2. LOADING THE DATA
Starting from the raw data file ( repdata_data_StormData.csv.bz2 ), the data was loaded into R using read.csv and stored in a variable called data. The storm database contains the following variables:
data <- read.csv("repdata_data_StormData.csv.bz2", header = TRUE)
str(data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
3. SUBSETTING THE DATA
From these 37 variables, a subset of the storm data will be created, which involves only seven variables that are necessary for the analyses:
General:
EVTYPE: Describes the types of events stored in the database.
Public Health: Describes the fatalities and injuries
FATALITIES
INJURIES
Economic: Describes the property and crop damage
PROPDMG
PROPDMGEXP
CROPDMG
CROPDMGEXP
subdata <- data[,c("EVTYPE","FATALITIES","INJURIES", "PROPDMG","PROPDMGEXP","CROPDMG","CROPDMGEXP")]
head(subdata)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 0 15 25.0 K 0
## 2 TORNADO 0 0 2.5 K 0
## 3 TORNADO 0 2 25.0 K 0
## 4 TORNADO 0 2 2.5 K 0
## 5 TORNADO 0 2 2.5 K 0
## 6 TORNADO 0 6 2.5 K 0
4. CLEANING THE DATA
subdata$EVTYPE <- toupper(subdata$EVTYPE)
subdata$NEW_EVTYPE <- "OTHERS"
subdata$NEW_EVTYPE[grep("ASTRONOMICAL LOW TIDE", subdata$EVTYPE)] <- "ASTRONOMICAL LOW TIDE"
subdata$NEW_EVTYPE[grep("AVALANCHE", subdata$EVTYPE)] <- "AVALANCHE"
subdata$NEW_EVTYPE[grep("BLIZZARD", subdata$EVTYPE)] <- "BLIZZARD"
subdata$NEW_EVTYPE[grep("FLOOD", subdata$EVTYPE)] <- "FLOOD"
subdata$NEW_EVTYPE[grep("WIND CHILL", subdata$EVTYPE)] <- "WIND CHILL"
subdata$NEW_EVTYPE[grep("DEBRIS FLOW", subdata$EVTYPE)] <- "DEBRIS FLOW"
subdata$NEW_EVTYPE[grep("FOG", subdata$EVTYPE)] <- "FOG"
subdata$NEW_EVTYPE[grep("SMOKE", subdata$EVTYPE)] <- "SMOKE"
subdata$NEW_EVTYPE[grep("DROUGHT", subdata$EVTYPE)] <- "DROUGHT"
subdata$NEW_EVTYPE[grep("DUST", subdata$EVTYPE)] <- "DUST"
subdata$NEW_EVTYPE[grep("HEAT", subdata$EVTYPE)] <- "HEAT"
subdata$NEW_EVTYPE[grep("FREEZE", subdata$EVTYPE)] <- "FREEZE"
subdata$NEW_EVTYPE[grep("FUNNEL CLOUD", subdata$EVTYPE)] <- "FUNNEL CLOUD"
subdata$NEW_EVTYPE[grep("HAIL", subdata$EVTYPE)] <- "HAIL"
subdata$NEW_EVTYPE[grep("RAIN", subdata$EVTYPE)] <- "RAIN"
subdata$NEW_EVTYPE[grep("SNOW", subdata$EVTYPE)] <- "SNOW"
subdata$NEW_EVTYPE[grep("HIGH SURF", subdata$EVTYPE)] <- "HIGH SURF"
subdata$NEW_EVTYPE[grep("WIND", subdata$EVTYPE)] <- "WIND"
subdata$NEW_EVTYPE[grep("HURRICANE", subdata$EVTYPE)] <- "HURRICANE"
subdata$NEW_EVTYPE[grep("STORM", subdata$EVTYPE)] <- "STORM"
subdata$NEW_EVTYPE[grep("LIGHTNING", subdata$EVTYPE)] <- "LIGHTNING"
subdata$NEW_EVTYPE[grep("RIP CURRENT", subdata$EVTYPE)] <- "RIP CURRENT"
subdata$NEW_EVTYPE[grep("SEICHE", subdata$EVTYPE)] <- "SEICHE"
subdata$NEW_EVTYPE[grep("SLEET", subdata$EVTYPE)] <- "SLEET"
subdata$NEW_EVTYPE[grep("STORM SURGE", subdata$EVTYPE)] <- "STORM SURGE"
subdata$NEW_EVTYPE[grep("TORNADO", subdata$EVTYPE)] <- "TORNADO"
subdata$NEW_EVTYPE[grep("TROPICAL DEPRESSION", subdata$EVTYPE)] <- "TROPICAL DEPRESSION"
subdata$NEW_EVTYPE[grep("VOLCANIC ASH", subdata$EVTYPE)] <- "VOLCANIC ASH"
subdata$NEW_EVTYPE[grep("WATERSPOUT", subdata$EVTYPE)] <- "WATERSPOUT"
subdata$NEW_EVTYPE[grep("WILDFIRE", subdata$EVTYPE)] <- "WILDFIRE"
subdata$NEW_EVTYPE[grep("WINTER", subdata$EVTYPE)] <- "WINTER"
subdata$NEW_EVTYPE[grep("WINTRY", subdata$EVTYPE)] <- "WINTER"
unique(subdata[c("NEW_EVTYPE")])
## NEW_EVTYPE
## 1 TORNADO
## 54 WIND
## 55 HAIL
## 187560 RAIN
## 187561 SNOW
## 187562 STORM
## 187564 WINTER
## 187566 HURRICANE
## 187580 OTHERS
## 187586 LIGHTNING
## 187617 FOG
## 187621 RIP CURRENT
## 187637 FLOOD
## 187674 FUNNEL CLOUD
## 187837 HEAT
## 188447 WATERSPOUT
## 188774 BLIZZARD
## 188817 FREEZE
## 188986 AVALANCHE
## 189195 SLEET
## 189324 DUST
## 192272 HIGH SURF
## 194361 DROUGHT
## 194651 STORM SURGE
## 197132 WILDFIRE
## 280995 VOLCANIC ASH
## 324915 SEICHE
## 347748 TROPICAL DEPRESSION
## 417910 SMOKE
## 647990 ASTRONOMICAL LOW TIDE
5. PROCESSING FOR ANALYSIS 1: PUBLICH HEALTH
#A subset called subhealth was created, which involves only the NEW_EVTYPE, FATALITIES, and INJURIES.
subhealth <- subdata[,c("NEW_EVTYPE","FATALITIES","INJURIES")]
#The variable called fatalities was used to store the total number of fatalities per event type, obtained using the function aggregate. The column TYPE was added to denote "fatalities".
fatalities <- aggregate(subhealth$FATALITIES, by=list(EVENT=subhealth$NEW_EVTYPE), FUN=sum, na.rm=TRUE)
names(fatalities)[2] <- "TOTAL"
fatalities$TYPE <- "FATALITIES"
#The variable called injuries was used to store the total number of injuries per event type, obtained using the function aggregate. The column TYPE was added to denote "injuries".
injuries <- aggregate(subhealth$INJURIES, by=list(EVENT=subhealth$NEW_EVTYPE), FUN=sum, na.rm=TRUE)
names(injuries)[2] <- "TOTAL"
injuries$TYPE <- "INJURIES"
#Using rbind function, the fatalities and injuries were combined. A subset called subhealthcount was created to store only the rows with non-zero values. This subset will be used to plot the results.
healthcount <- rbind(fatalities,injuries)
subhealthcount <- healthcount[which(healthcount$TOTAL>0),]
6. PROCESSING FOR ANALYSIS 2: ECONOMIC
##PROPERTIES
##A subset called subprop was created which involves only the NEW_EVTYPE, PROPDMG, and PROPDMGEXP variables.
subprop <- subdata[,c("NEW_EVTYPE","PROPDMG","PROPDMGEXP")]
##The subset was used to transform the entries such that "K" denotes thousands (10^3), "M" for millions (10^6), and "B" for billions (10^9).
subprop$PROPDMGEXP[is.na(subprop$PROPDMGEXP)] <- 0
subprop$PROPDMGEXP[!grepl("K | M | B", subprop$PROPDMGEXP, ignore.case=TRUE)] <- 0
subprop$PROPDMGEXP[grep("K", subprop$PROPDMGEXP, ignore.case=TRUE)] <- "3"
subprop$PROPDMGEXP[grep("M", subprop$PROPDMGEXP, ignore.case=TRUE)] <- "6"
subprop$PROPDMGEXP[grep("B", subprop$PROPDMGEXP, ignore.case=TRUE)] <- "9"
subprop$PROPDMGEXP <- as.numeric(subprop$PROPDMGEXP)
#To compute for the total damage to properties, the column PROPDMG was multiplied with 10 raised to the value indicated in the column PROPDMGEXP.
subprop$TOTALPROP <- subprop$PROPDMG * 10^subprop$PROPDMGEXP
##CROPS
##A subset called subcrop was created which involves only the NEW_EVTYPE, CROPDMG, and CROPDMGEXP variables.
subcrop <- subdata[,c("NEW_EVTYPE","CROPDMG","CROPDMGEXP")]
##The subset was used to transform the entries such that "K" denotes thousands (10^3), "M" for millions (10^6), and "B" for billions (10^9).
subcrop$CROPDMGEXP[is.na(subcrop$CROPDMGEXP)] <- 0
subcrop$CROPDMGEXP[!grepl("K | M | B", subcrop$CROPDMGEXP, ignore.case=TRUE)] <- 0
subcrop$CROPDMGEXP[grep("K", subcrop$CROPDMGEXP, ignore.case=TRUE)] <- "3"
subcrop$CROPDMGEXP[grep("M", subcrop$CROPDMGEXP, ignore.case=TRUE)] <- "6"
subcrop$CROPDMGEXP[grep("B", subcrop$CROPDMGEXP, ignore.case=TRUE)] <- "9"
subcrop$CROPDMGEXP <- as.numeric(subcrop$CROPDMGEXP)
#To compute for the total damage to crops, the column CROPDMG was multiplied with 10 raised to the value indicated in the column CROPDMGEXP.
subcrop$TOTALCROP <- subcrop$CROPDMG * 10^subcrop$CROPDMGEXP
#The variable called properties was used to store the total property damages per event type, obtained using the function aggregate. The column TYPE was added to denote "PROPERTY".
properties <- aggregate(subprop$TOTALPROP, by=list(EVENT=subprop$NEW_EVTYPE), FUN=sum, na.rm=TRUE)
names(properties)[2] <- "TOTAL"
properties$TYPE <- "PROPERTY"
#The variable called crops was used to store the total crop damages per event type, obtained using the function aggregate. The column TYPE was added to denote "CROP".
crops <- aggregate(subcrop$TOTALCROP, by=list(EVENT=subcrop$NEW_EVTYPE), FUN=sum, na.rm=TRUE)
names(crops)[2] <- "TOTAL"
crops$TYPE <- "CROP"
#Using rbind function, the properties and crops were combined. A subset called subeconcount was created to store only the rows with non-zero values. This subset will be used to plot the results.
econcount <- rbind(properties,crops)
subeconcount <- econcount[which(econcount$TOTAL>0),]
1. EFFECTS OF WEATHER EVENTS TO PUBLIC HEALTH
Using the subhealthcount dataset, the top 5 weather events that are most harmful to population health based on the total number of fatalities and injuries are:
ggplot(subhealthcount, aes(x=reorder(EVENT,TOTAL), y=TOTAL, fill=TYPE)) +
geom_bar(stat="identity", width=0.8) +
scale_x_discrete(expand=c(0,0.5)) +
coord_flip(xlim=c(length(unique(subhealthcount$EVENT))-4,length(unique(subhealthcount$EVENT)))) +
ggtitle("Top 5 Weather Events with the Highest Fatalities and Injuries")+
theme(plot.title=element_text(hjust=0.5)) +
labs(x="Weather Event", y="Total Fatalities and Injuries")
2. ECONOMIC EFFECTS OF WEATHER EVENTS
Using the subeconcount dataset, the top 5 weather events with the greatest economic consequences based on the total property and crop damages are:
ggplot(subeconcount, aes(x=reorder(EVENT,TOTAL), y=TOTAL, fill=TYPE)) +
geom_bar(stat="identity", width=0.8) +
scale_x_discrete(expand=c(0,0.5)) +
coord_flip(xlim=c(length(unique(subeconcount$EVENT))-4,length(unique(subeconcount$EVENT)))) +
ggtitle("Top 5 Weather Events with the Highest Property \n and Crop Damages") +
theme(plot.title=element_text(hjust=0.5)) +
labs(x="Weather Event", y="Total Property and Crop Damages")