Download and load the data
# Switch to the author's project folder when it exists so that the relative
# "./data" paths used later resolve there. The path is machine-specific: on
# any other computer the current working directory is kept instead of the
# script aborting because the folder cannot be found.
project_dir <- "C:\\Users\\Andy's Home PC\\Documents\\Coursera Courses\\Data Science\\Reproducible Research"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
getwd()
## [1] "C:/Users/Andy's Home PC/Documents/Coursera Courses/Data Science/Reproducible Research"
# Create the local "data" folder on first run.
data_dir <- "./data"
if (!dir.exists(data_dir)) {
  dir.create(data_dir)
}

# Location of the bzip2-compressed storm data CSV and its local copy.
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
storm_file <- file.path(data_dir, "stormData.csv.bz2")

# Download only when the archive is not on disk yet, so re-running the
# analysis does not fetch the large file again. The default download method
# is used instead of method = "curl" (which needs an external curl program),
# and mode = "wb" keeps the compressed file intact on Windows.
if (!file.exists(storm_file)) {
  tryCatch(
    download.file(fileUrl, destfile = storm_file, mode = "wb"),
    error = function(e) {
      # Do not leave a partial file behind: it would be mistaken for a
      # complete download on the next run.
      unlink(storm_file)
      stop(e)
    }
  )
}

# Read the CSV through a bzfile() connection; empty fields become NA.
data <- read.csv(bzfile(storm_file), sep = ",", header = TRUE, na.strings = c(""))

# View the top 3 records.
head(data, 3)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 <NA> <NA> <NA> <NA> 0
## 2 TORNADO 0 <NA> <NA> <NA> <NA> 0
## 3 TORNADO 0 <NA> <NA> <NA> <NA> 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 <NA> <NA> 14.0 100 3 0 0
## 2 NA 0 <NA> <NA> 2.0 150 2 0 0
## 3 NA 0 <NA> <NA> 0.1 123 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0 <NA> <NA> <NA> <NA>
## 2 0 2.5 K 0 <NA> <NA> <NA> <NA>
## 3 2 25.0 K 0 <NA> <NA> <NA> <NA>
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 <NA> 1
## 2 3042 8755 0 0 <NA> 2
## 3 3340 8742 0 0 <NA> 3
Copy over the columns we need to a smaller data frame
# Keep only the event type, the casualty counts, and the damage figures
# together with their exponent-code columns.
keep_cols <- c(
  "EVTYPE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)
data2 <- data[, keep_cols]
Explore the PROPDMGEXP and CROPDMGEXP columns
# List every distinct code that occurs in the property-damage exponent column.
unique(data2[["PROPDMGEXP"]])
## [1] K M <NA> B m + 0 5 6 ? 4 2 3 h
## [15] 7 H - 1 8
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
# List every distinct code that occurs in the crop-damage exponent column.
unique(data2[["CROPDMGEXP"]])
## [1] <NA> M K m B ? 0 k 2
## Levels: ? 0 2 B k K m M
Assign numeric values to the special characters in PROPDMGEXP column (i.e. - ? + 0 1 2 3 4 5 6 7 8 B h H K m M)
# Store the exponent codes as plain strings: writing numbers directly into a
# factor column would cause problems, so convert before any replacement.
data2[["PROPDMGEXP"]] <- as.character(data2[["PROPDMGEXP"]])
# Confirm the conversion; this should report "character".
class(data2[["PROPDMGEXP"]])
## [1] "character"
# Multiplier for every exponent code found in PROPDMGEXP.
#   H/h = hundreds, K = thousands, M/m = millions, B = billions,
#   digits 0-8 = powers of ten, and the invalid codes "-", "+", "?" = 0.
prop_multipliers <- c(
  "H" = 100, "h" = 100,
  "K" = 1000,
  "M" = 1000000, "m" = 1000000,
  "B" = 1000000000,
  "0" = 1, "1" = 10, "2" = 100, "3" = 1000, "4" = 10000,
  "5" = 100000, "6" = 1000000, "7" = 10000000, "8" = 100000000,
  "-" = 0, "+" = 0, "?" = 0
)

# Translate all codes in a single pass. Replacing them one after another in
# place lets an already-converted value match a later rule: code "0" became
# "1", which the following rule for code "1" then turned into 10. A lookup
# avoids that cascade. Missing or unlisted codes yield NA.
prop_codes <- as.character(data2$PROPDMGEXP)
data2$PROPDMGEXP <- unname(prop_multipliers[prop_codes])
Assign numeric values to the special characters in CROPDMGEXP column (i.e. ? 0 2 B k K m M)
# Store the crop exponent codes as plain strings before replacing them.
data2[["CROPDMGEXP"]] <- as.character(data2[["CROPDMGEXP"]])
# Confirm the conversion; this should report "character".
class(data2[["CROPDMGEXP"]])
## [1] "character"
# Exponent codes handled for crop damage and the multiplier each stands for
# (K/k = thousands, M/m = millions, B = billions, "0" = 1, "2" = 100,
# and the invalid code "?" = 0). The two vectors are aligned by position.
crop_symbols <- c("K", "k", "M", "m", "B", "0", "2", "?")
crop_values <- c(1000, 1000, 1000000, 1000000, 1000000000, 1, 100, 0)

# Find which rows carry a recognised code and overwrite only those; NA and
# any other value are left exactly as they are. The column stays character.
crop_hit <- match(data2[["CROPDMGEXP"]], crop_symbols)
crop_known <- !is.na(crop_hit)
data2[["CROPDMGEXP"]][crop_known] <- crop_values[crop_hit[crop_known]]
Calculate the crop damages and property damages, and store these values in 2 new columns in the dataset
# Make sure both multiplier columns are numeric before doing arithmetic.
exp_cols <- c("PROPDMGEXP", "CROPDMGEXP")
data2[exp_cols] <- lapply(data2[exp_cols], as.numeric)

# Economic value of property and crop damage: reported figure x multiplier.
data2[["PROPDMGAMT"]] <- data2[["PROPDMG"]] * data2[["PROPDMGEXP"]]
data2[["CROPDMGAMT"]] <- data2[["CROPDMG"]] * data2[["CROPDMGEXP"]]

# Total damage per record; an NA in either amount counts as zero in the sum.
data2[["TOTALDMG"]] <- rowSums(data2[, c("PROPDMGAMT", "CROPDMGAMT")], na.rm = TRUE)

head(data2, n = 3)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 0 15 25.0 1000 0 NA
## 2 TORNADO 0 0 2.5 1000 0 NA
## 3 TORNADO 0 2 25.0 1000 0 NA
## PROPDMGAMT CROPDMGAMT TOTALDMG
## 1 25000 NA 25000
## 2 2500 NA 2500
## 3 25000 NA 25000
Convert “EVTYPE” from Factor to Character class
# Treat event types as plain strings rather than factor levels.
data2[["EVTYPE"]] <- as.character(data2[["EVTYPE"]])
# Confirm the conversion; this should report "character".
class(data2[["EVTYPE"]])
## [1] "character"
Calculate total fatalities for each storm event type (EVTYPE)
# Sum the fatalities recorded for each event type. na.pass keeps rows with
# missing values in the grouping step; the summing function then ignores NAs.
sum_without_na <- function(counts) sum(counts, na.rm = TRUE)
fatalitiesTotal <- aggregate(
  FATALITIES ~ EVTYPE,
  data = data2,
  FUN = sum_without_na,
  na.action = na.pass
)
# dplyr supplies arrange() and desc(), which are used for sorting.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Rank event types from the most to the fewest fatalities.
fatalitiesTotal <- arrange(fatalitiesTotal, desc(FATALITIES))
# head() returns at most 10 rows; indexing with 1:10 would pad the table with
# all-NA rows whenever fewer than 10 event types are present.
fatalitiesTotal_top10 <- head(fatalitiesTotal, 10)
fatalitiesTotal_top10
## EVTYPE FATALITIES
## 1 TORNADO 5633
## 2 EXCESSIVE HEAT 1903
## 3 FLASH FLOOD 978
## 4 HEAT 937
## 5 LIGHTNING 816
## 6 TSTM WIND 504
## 7 FLOOD 470
## 8 RIP CURRENT 368
## 9 HIGH WIND 248
## 10 AVALANCHE 224
Calculate total injuries for each storm event type (EVTYPE)
# Sum the injuries recorded for each event type, ignoring missing values.
injuriesTotal <- aggregate(
  INJURIES ~ EVTYPE,
  data = data2,
  FUN = function(x) sum(x, na.rm = TRUE),
  na.action = na.pass
)
# Rank event types from the most to the fewest injuries. arrange() and desc()
# come from dplyr, which is already attached earlier in this script, so the
# repeated library() call is not needed here.
injuriesTotal <- arrange(injuriesTotal, desc(INJURIES))
# head() returns at most 10 rows; indexing with 1:10 would pad the table with
# all-NA rows whenever fewer than 10 event types are present.
injuriesTotal_top10 <- head(injuriesTotal, 10)
injuriesTotal_top10
## EVTYPE INJURIES
## 1 TORNADO 91346
## 2 TSTM WIND 6957
## 3 FLOOD 6789
## 4 EXCESSIVE HEAT 6525
## 5 LIGHTNING 5230
## 6 HEAT 2100
## 7 ICE STORM 1975
## 8 FLASH FLOOD 1777
## 9 THUNDERSTORM WIND 1488
## 10 HAIL 1361
Creating 2 barplots with respect to storm types and their impact on fatalities and injuries
# Draw the two charts side by side; remember the previous graphics settings
# so they can be restored once both panels are drawn.
old_par <- par(mfrow = c(1, 2))

# Both panels keep a common y-axis, as before, but its upper limit is now
# derived from the data with 10% headroom. The former fixed limit of 90000
# was lower than the largest injury count (91346 for TORNADO), so the axis
# did not cover the tallest bar.
casualty_ylim <- c(
  0,
  1.1 * max(fatalitiesTotal_top10$FATALITIES, injuriesTotal_top10$INJURIES, na.rm = TRUE)
)

barplot(
  fatalitiesTotal_top10$FATALITIES,
  names.arg = fatalitiesTotal_top10$EVTYPE,
  main = "Top 10 Storm Types and Fatalities in U.S.",
  ylab = "Number of Fatalities",
  col = "firebrick",
  las = 3, cex.names = 0.5, cex.main = 0.8, cex.axis = 0.75, cex.lab = 0.9,
  ylim = casualty_ylim
)
barplot(
  injuriesTotal_top10$INJURIES,
  names.arg = injuriesTotal_top10$EVTYPE,
  main = "Top 10 Storm Types and Injuries in U.S.",
  ylab = "Number of Injuries",
  col = "Blue4",
  las = 3, cex.names = 0.5, cex.main = 0.8, cex.axis = 0.75, cex.lab = 0.9,
  ylim = casualty_ylim
)

# Put the graphics layout back the way it was.
par(old_par)
Based on the charts above, tornadoes, excessive heat, and flash floods are the three weather events that cause the most fatalities in the U.S. In terms of injuries, however, the top three weather events are tornadoes, thunderstorm wind (TSTM WIND), and floods.
Creating 3 bar plots that show which weather events cause the most property damage, crop damage, and total economic damage (property and crop damage combined)
# Total property damage (data2$PROPDMGAMT) per weather event type
# (data2$EVTYPE), ignoring missing amounts.
ppty_dmg <- aggregate(
  PROPDMGAMT ~ EVTYPE,
  data = data2,
  FUN = function(x) sum(x, na.rm = TRUE),
  na.action = na.pass
)
# Rank event types from the largest to the smallest property damage.
# arrange() and desc() come from dplyr, which is already attached earlier in
# this script, so the repeated library() call is not needed here.
ppty_dmg <- arrange(ppty_dmg, desc(PROPDMGAMT))
# head() returns at most 10 rows; indexing with 1:10 would pad the table with
# all-NA rows whenever fewer than 10 event types are present.
ppty_dmg_top10 <- head(ppty_dmg, 10)
ppty_dmg_top10
## EVTYPE PROPDMGAMT
## 1 FLOOD 144657709800
## 2 HURRICANE/TYPHOON 69305840000
## 3 TORNADO 56947381815
## 4 STORM SURGE 43323536000
## 5 FLASH FLOOD 16822676125
## 6 HAIL 15735269577
## 7 HURRICANE 11868319010
## 8 TROPICAL STORM 7703890550
## 9 WINTER STORM 6688497260
## 10 HIGH WIND 5270046260
# Total crop damage (data2$CROPDMGAMT) per weather event type (data2$EVTYPE),
# ignoring missing amounts.
crop_dmg <- aggregate(
  CROPDMGAMT ~ EVTYPE,
  data = data2,
  FUN = function(x) sum(x, na.rm = TRUE),
  na.action = na.pass
)
# Rank event types from the largest to the smallest crop damage. arrange()
# and desc() come from dplyr, which is already attached earlier in this
# script, so the repeated library() call is not needed here.
crop_dmg <- arrange(crop_dmg, desc(CROPDMGAMT))
# head() returns at most 10 rows; indexing with 1:10 would pad the table with
# all-NA rows whenever fewer than 10 event types are present.
crop_dmg_top10 <- head(crop_dmg, 10)
crop_dmg_top10
## EVTYPE CROPDMGAMT
## 1 DROUGHT 13972566000
## 2 FLOOD 5661968450
## 3 RIVER FLOOD 5029459000
## 4 ICE STORM 5022113500
## 5 HAIL 3025954470
## 6 HURRICANE 2741910000
## 7 HURRICANE/TYPHOON 2607872800
## 8 FLASH FLOOD 1421317100
## 9 EXTREME COLD 1292973000
## 10 FROST/FREEZE 1094086000
# Total economic damage (crop and property damage combined, data2$TOTALDMG)
# per weather event type, ignoring missing amounts.
total_dmg <- aggregate(
  TOTALDMG ~ EVTYPE,
  data = data2,
  FUN = function(x) sum(x, na.rm = TRUE),
  na.action = na.pass
)
# Rank event types from the largest to the smallest total damage. arrange()
# and desc() come from dplyr, which is already attached earlier in this
# script, so the repeated library() call is not needed here.
total_dmg <- arrange(total_dmg, desc(TOTALDMG))
# head() returns at most 10 rows; indexing with 1:10 would pad the table with
# all-NA rows whenever fewer than 10 event types are present.
total_dmg_top10 <- head(total_dmg, 10)
total_dmg_top10
## EVTYPE TOTALDMG
## 1 FLOOD 150319678250
## 2 HURRICANE/TYPHOON 71913712800
## 3 TORNADO 57362335085
## 4 STORM SURGE 43323541000
## 5 HAIL 18761224047
## 6 FLASH FLOOD 18243993225
## 7 DROUGHT 15018672000
## 8 HURRICANE 14610229010
## 9 RIVER FLOOD 10148404500
## 10 ICE STORM 8967041810
The following are 3 plots that outline the top 10 weather events that cause the greatest property, crop, and total economic damages.
# Three panels in one row: property, crop, and total economic damage.
par(mfrow = c(1, 3))

# Amounts are plotted in billions of US dollars on a common 0-180 axis.
us_billion <- 1000000000

barplot(
  ppty_dmg_top10$PROPDMGAMT / us_billion,
  names.arg = ppty_dmg_top10$EVTYPE,
  main = "Top 10 Storm Types and Property Damages in U.S.",
  ylab = "Property Damages (US$B)",
  col = "khaki",
  las = 3,
  cex.names = 0.6,
  cex.main = 0.8,
  cex.axis = 0.75,
  cex.lab = 0.9,
  ylim = c(0, 180)
)

barplot(
  crop_dmg_top10$CROPDMGAMT / us_billion,
  names.arg = crop_dmg_top10$EVTYPE,
  main = "Top 10 Storm Types and Crop Damages in U.S.",
  ylab = "Crop Damages (US$B)",
  col = "green4",
  las = 3,
  cex.names = 0.6,
  cex.main = 0.8,
  cex.axis = 0.75,
  cex.lab = 0.9,
  ylim = c(0, 180)
)

barplot(
  total_dmg_top10$TOTALDMG / us_billion,
  names.arg = total_dmg_top10$EVTYPE,
  main = "Top 10 Storm Types and Total Economic Damages in U.S.",
  ylab = "Economic Damages (US$B)",
  col = "brown4",
  las = 3,
  cex.names = 0.6,
  cex.main = 0.8,
  cex.axis = 0.75,
  cex.lab = 0.9,
  ylim = c(0, 180)
)
From the graph “Top 10 Storm Types and Total Economic Damages in U.S.”, it is evident that the three weather events with the greatest economic impact are floods, hurricanes/typhoons, and tornadoes.
From these data, we conclude that floods, hurricanes/typhoons, and tornadoes have the greatest economic impact in the U.S. In terms of human and social impact, tornadoes, excessive heat, and flash floods cause the most fatalities, while tornadoes, thunderstorm wind (TSTM WIND), and floods cause the most injuries.