Initialisation

Download and load the data

# NOTE(review): this absolute Windows path is specific to one machine; the
# script only runs elsewhere if the same directory exists or the path is edited.
setwd("C:\\Users\\Andy's Home PC\\Documents\\Coursera Courses\\Data Science\\Reproducible Research")
# Print the working directory that is now in effect.
getwd()
## [1] "C:/Users/Andy's Home PC/Documents/Coursera Courses/Data Science/Reproducible Research"
# Create the local "data" folder if it does not exist yet.
if (!file.exists("./data")) {
  dir.create("./data")
}

# Fetch the compressed storm data only when no local copy is present, so that
# re-running the analysis does not download the archive again.
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
destFile <- "./data/stormData.csv.bz2"
if (!file.exists(destFile)) {
  downloadStatus <- download.file(fileUrl, destfile = destFile, method = "curl")
  # download.file() returns 0 on success; remove a partial file on failure so
  # the next run retries instead of reading a truncated archive.
  if (downloadStatus != 0) {
    unlink(destFile)
    stop("Download of the storm data failed (status ", downloadStatus, ")")
  }
}

# read.csv() decompresses through the bzfile() connection; empty fields are
# read as NA.
data <- read.csv(bzfile(destFile), sep = ",", header = TRUE, na.strings = c(""))

# Preview the first three rows of the raw data
head(data, n = 3)
##   STATE__          BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1       1 4/18/1950 0:00:00     0130       CST     97     MOBILE    AL
## 2       1 4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL
## 3       1 2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL
##    EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO         0    <NA>       <NA>     <NA>     <NA>          0
## 2 TORNADO         0    <NA>       <NA>     <NA>     <NA>          0
## 3 TORNADO         0    <NA>       <NA>     <NA>     <NA>          0
##   COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1         NA         0    <NA>       <NA>   14.0   100 3   0          0
## 2         NA         0    <NA>       <NA>    2.0   150 2   0          0
## 3         NA         0    <NA>       <NA>    0.1   123 2   0          0
##   INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP  WFO STATEOFFIC ZONENAMES
## 1       15    25.0          K       0       <NA> <NA>       <NA>      <NA>
## 2        0     2.5          K       0       <NA> <NA>       <NA>      <NA>
## 3        2    25.0          K       0       <NA> <NA>       <NA>      <NA>
##   LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1     3040      8812       3051       8806    <NA>      1
## 2     3042      8755          0          0    <NA>      2
## 3     3340      8742          0          0    <NA>      3

Data Processing

Copy over the columns we need to a smaller data frame

# Keep only the event type plus the casualty and damage columns used below
data2 <- data[c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)]

Explore the PROPDMGEXP and CROPDMGEXP columns

# List the distinct exponent codes present in the property-damage column
unique(data2[["PROPDMGEXP"]])
##  [1] K    M    <NA> B    m    +    0    5    6    ?    4    2    3    h   
## [15] 7    H    -    1    8   
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
# List the distinct exponent codes present in the crop-damage column
unique(data2[["CROPDMGEXP"]])
## [1] <NA> M    K    m    B    ?    0    k    2   
## Levels: ? 0 2 B k K m M

Assign numeric values to the special characters in PROPDMGEXP column (i.e. - ? + 0 1 2 3 4 5 6 7 8 B h H K m M)

# Work with plain strings rather than a factor, so the codes can be replaced
# by numbers in the next step
data2[["PROPDMGEXP"]] <- as.character(data2[["PROPDMGEXP"]])
# Check the column class after the conversion (expected: "character")
class(data2[["PROPDMGEXP"]])
## [1] "character"
# Translate every exponent code to its multiplier in ONE lookup pass.
# Replacing the codes one after another is unsafe for this column: once "0"
# has been rewritten to "1", the following "1" -> 10 step matches it again and
# inflates it tenfold.
propExpMultiplier <- c(
  # letter codes
  "H" = 1e2, "h" = 1e2,
  "K" = 1e3,
  "M" = 1e6, "m" = 1e6,
  "B" = 1e9,
  # a digit n stands for 10^n
  "0" = 1e0, "1" = 1e1, "2" = 1e2, "3" = 1e3, "4" = 1e4,
  "5" = 1e5, "6" = 1e6, "7" = 1e7, "8" = 1e8,
  # zero out invalid records ("-", "+", "?")
  "-" = 0, "+" = 0, "?" = 0
)

# Index by the code as a string (never by factor level). Missing codes, and any
# code not listed above, become NA. The column is numeric after this step.
data2$PROPDMGEXP <- unname(propExpMultiplier[as.character(data2$PROPDMGEXP)])

Assign numeric values to the special characters in CROPDMGEXP column (i.e. ? 0 2 B k K m M)

# Same preparation as for the property column: strings instead of a factor
data2[["CROPDMGEXP"]] <- as.character(data2[["CROPDMGEXP"]])
# Check the column class after the conversion (expected: "character")
class(data2[["CROPDMGEXP"]])
## [1] "character"
# Letter codes; %in% never yields NA, so rows with a missing code are skipped
data2$CROPDMGEXP[data2$CROPDMGEXP %in% c("K", "k")] <- 1000
data2$CROPDMGEXP[data2$CROPDMGEXP %in% c("M", "m")] <- 1000000
data2$CROPDMGEXP[data2$CROPDMGEXP %in% "B"] <- 1000000000

# Digit codes
data2$CROPDMGEXP[data2$CROPDMGEXP %in% "0"] <- 1
data2$CROPDMGEXP[data2$CROPDMGEXP %in% "2"] <- 100

# "?" is replaced by zero
data2$CROPDMGEXP[data2$CROPDMGEXP %in% "?"] <- 0

Calculate the crop damages and property damages, and store these values in 2 new columns in the dataset

# The multiplier columns must be numeric before they can be used in arithmetic
data2[["PROPDMGEXP"]] <- as.numeric(data2[["PROPDMGEXP"]])
data2[["CROPDMGEXP"]] <- as.numeric(data2[["CROPDMGEXP"]])

# Damage value = reported figure times its multiplier
data2[["PROPDMGAMT"]] <- with(data2, PROPDMG * PROPDMGEXP) # property
data2[["CROPDMGAMT"]] <- with(data2, CROPDMG * CROPDMGEXP) # crops

# Per-row total; an NA in either amount is ignored rather than propagated
data2[["TOTALDMG"]] <- rowSums(
  data2[c("PROPDMGAMT", "CROPDMGAMT")],
  na.rm = TRUE
)
head(data2, n = 3)
##    EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO          0       15    25.0       1000       0         NA
## 2 TORNADO          0        0     2.5       1000       0         NA
## 3 TORNADO          0        2    25.0       1000       0         NA
##   PROPDMGAMT CROPDMGAMT TOTALDMG
## 1      25000         NA    25000
## 2       2500         NA     2500
## 3      25000         NA    25000

Convert “EVTYPE” from Factor to Character class

# Store the event type as plain text and check the resulting class
data2[["EVTYPE"]] <- as.character(data2[["EVTYPE"]])
class(data2[["EVTYPE"]])
## [1] "character"

Data Analysis

Calculate the total fatalities for each type of weather event

# Sum FATALITIES within each EVTYPE. na.pass hands rows containing NA through
# to the summary function, where sum() drops them via na.rm.
fatalitiesTotal <- aggregate(
  FATALITIES ~ EVTYPE,
  data = data2,
  FUN = function(v) sum(v, na.rm = TRUE),
  na.action = na.pass
)

library(dplyr) #we need this to use the "arrange" function
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Order event types from most to fewest fatalities, then keep rows 1 to 10
fatalitiesTotal <- fatalitiesTotal %>% arrange(desc(FATALITIES))
fatalitiesTotal_top10 <- fatalitiesTotal[seq_len(10), ]
fatalitiesTotal_top10
##            EVTYPE FATALITIES
## 1         TORNADO       5633
## 2  EXCESSIVE HEAT       1903
## 3     FLASH FLOOD        978
## 4            HEAT        937
## 5       LIGHTNING        816
## 6       TSTM WIND        504
## 7           FLOOD        470
## 8     RIP CURRENT        368
## 9       HIGH WIND        248
## 10      AVALANCHE        224

Calculate the total injuries for each type of weather event

# Sum INJURIES within each EVTYPE. na.pass hands rows containing NA through to
# the summary function, where sum() drops them via na.rm.
injuriesTotal <- aggregate(
  INJURIES ~ EVTYPE,
  data = data2,
  FUN = function(v) sum(v, na.rm = TRUE),
  na.action = na.pass
)

library(dplyr) # provides arrange() and desc()

# Order event types from most to fewest injuries, then keep rows 1 to 10
injuriesTotal <- injuriesTotal %>% arrange(desc(INJURIES))
injuriesTotal_top10 <- injuriesTotal[seq_len(10), ]
injuriesTotal_top10
##               EVTYPE INJURIES
## 1            TORNADO    91346
## 2          TSTM WIND     6957
## 3              FLOOD     6789
## 4     EXCESSIVE HEAT     6525
## 5          LIGHTNING     5230
## 6               HEAT     2100
## 7          ICE STORM     1975
## 8        FLASH FLOOD     1777
## 9  THUNDERSTORM WIND     1488
## 10              HAIL     1361

Computational Results

1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

Creating 2 barplots with respect to storm types and their impact on fatalities and injuries

# Draw the two panels side by side; keep the previous settings to restore them
oldPar <- par(mfrow = c(1, 2))

# Both panels share one y-axis so the bars can be compared directly. The upper
# limit is derived from the data and rounded up to a tick position; a fixed
# limit of 90000 ended the axis below the tallest injuries bar.
casualtyMax <- max(pretty(c(
  0,
  fatalitiesTotal_top10$FATALITIES,
  injuriesTotal_top10$INJURIES
)))

barplot(
  fatalitiesTotal_top10$FATALITIES,
  names.arg = fatalitiesTotal_top10$EVTYPE,
  main = "Top 10 Storm Types and Fatalities in U.S.",
  ylab = "Number of Fatalities",
  col = "firebrick", las = 3,
  cex.names = 0.5, cex.main = 0.8, cex.axis = 0.75, cex.lab = 0.9,
  ylim = c(0, casualtyMax)
)
barplot(
  injuriesTotal_top10$INJURIES,
  names.arg = injuriesTotal_top10$EVTYPE,
  main = "Top 10 Storm Types and Injuries in U.S.",
  ylab = "Number of Injuries",
  col = "Blue4", las = 3,
  cex.names = 0.5, cex.main = 0.8, cex.axis = 0.75, cex.lab = 0.9,
  ylim = c(0, casualtyMax)
)

# Put the graphics layout back the way it was
par(oldPar)

Based on the above plots, tornado, excessive heat, and flash flood are the top 3 weather events causing the most fatalities in the U.S. In terms of injuries, however, the top 3 weather events are tornado, thunderstorm wind (TSTM WIND), and flood.

2. Across the United States, which types of events have the greatest economic consequences?

Creating 3 bar plots that show the weather events causing the greatest property damages, crop damages, and total economic damages (property and crop damages combined)

# Total property damage (data2$PROPDMGAMT) per weather event type (data2$EVTYPE)
ppty_dmg <- aggregate(
  PROPDMGAMT ~ EVTYPE,
  data = data2,
  FUN = function(v) sum(v, na.rm = TRUE),
  na.action = na.pass
)

library(dplyr) # provides arrange() and desc()

# Largest totals first; keep rows 1 to 10
ppty_dmg <- ppty_dmg %>% arrange(desc(PROPDMGAMT))
ppty_dmg_top10 <- ppty_dmg[seq_len(10), ]
ppty_dmg_top10
##               EVTYPE   PROPDMGAMT
## 1              FLOOD 144657709800
## 2  HURRICANE/TYPHOON  69305840000
## 3            TORNADO  56947381815
## 4        STORM SURGE  43323536000
## 5        FLASH FLOOD  16822676125
## 6               HAIL  15735269577
## 7          HURRICANE  11868319010
## 8     TROPICAL STORM   7703890550
## 9       WINTER STORM   6688497260
## 10         HIGH WIND   5270046260
# Total crop damage (data2$CROPDMGAMT) per weather event type (data2$EVTYPE)
crop_dmg <- aggregate(
  CROPDMGAMT ~ EVTYPE,
  data = data2,
  FUN = function(v) sum(v, na.rm = TRUE),
  na.action = na.pass
)

library(dplyr) # provides arrange() and desc()

# Largest totals first; keep rows 1 to 10
crop_dmg <- crop_dmg %>% arrange(desc(CROPDMGAMT))
crop_dmg_top10 <- crop_dmg[seq_len(10), ]
crop_dmg_top10
##               EVTYPE  CROPDMGAMT
## 1            DROUGHT 13972566000
## 2              FLOOD  5661968450
## 3        RIVER FLOOD  5029459000
## 4          ICE STORM  5022113500
## 5               HAIL  3025954470
## 6          HURRICANE  2741910000
## 7  HURRICANE/TYPHOON  2607872800
## 8        FLASH FLOOD  1421317100
## 9       EXTREME COLD  1292973000
## 10      FROST/FREEZE  1094086000
# Total economic damage (crop plus property, data2$TOTALDMG) per event type
total_dmg <- aggregate(
  TOTALDMG ~ EVTYPE,
  data = data2,
  FUN = function(v) sum(v, na.rm = TRUE),
  na.action = na.pass
)

library(dplyr) # provides arrange() and desc()

# Largest totals first; keep rows 1 to 10
total_dmg <- total_dmg %>% arrange(desc(TOTALDMG))
total_dmg_top10 <- total_dmg[seq_len(10), ]
total_dmg_top10
##               EVTYPE     TOTALDMG
## 1              FLOOD 150319678250
## 2  HURRICANE/TYPHOON  71913712800
## 3            TORNADO  57362335085
## 4        STORM SURGE  43323541000
## 5               HAIL  18761224047
## 6        FLASH FLOOD  18243993225
## 7            DROUGHT  15018672000
## 8          HURRICANE  14610229010
## 9        RIVER FLOOD  10148404500
## 10         ICE STORM   8967041810

The following are 3 plots that outline the top 10 weather events that cause the greatest property, crop, and total economic damages.

# Three panels in one row: property, crop and combined damages
par(mfrow = c(1, 3))

# Amounts are divided by 1e9 to match the "US$B" axis labels; all three panels
# use the same 0-180 range.
barplot(
  ppty_dmg_top10$PROPDMGAMT / 1e9,
  names.arg = ppty_dmg_top10$EVTYPE,
  main = "Top 10 Storm Types and Property Damages in U.S.",
  ylab = "Property Damages (US$B)",
  col = "khaki", las = 3,
  cex.names = 0.6, cex.main = 0.8, cex.axis = 0.75, cex.lab = 0.9,
  ylim = c(0, 180)
)
barplot(
  crop_dmg_top10$CROPDMGAMT / 1e9,
  names.arg = crop_dmg_top10$EVTYPE,
  main = "Top 10 Storm Types and Crop Damages in U.S.",
  ylab = "Crop Damages (US$B)",
  col = "green4", las = 3,
  cex.names = 0.6, cex.main = 0.8, cex.axis = 0.75, cex.lab = 0.9,
  ylim = c(0, 180)
)
barplot(
  total_dmg_top10$TOTALDMG / 1e9,
  names.arg = total_dmg_top10$EVTYPE,
  main = "Top 10 Storm Types and Total Economic Damages in U.S.",
  ylab = "Economic Damages (US$B)",
  col = "brown4", las = 3,
  cex.names = 0.6, cex.main = 0.8, cex.axis = 0.75, cex.lab = 0.9,
  ylim = c(0, 180)
)

From the graph “Top 10 Storm Types and Total Economic Damages in U.S.”, it is evident that the top 3 weather events with the greatest economic impact are flood, hurricane/typhoon, and tornado.

Conclusion

From these data, we conclude that flood, hurricane/typhoon, and tornado have the greatest economic impact in the U.S. In terms of harm to population health, tornado, excessive heat, and flash flood cause the most fatalities, while tornado, thunderstorm wind (TSTM WIND), and flood cause the most injuries.