library(ggplot2)
library(data.table)
# Load the raw storm-events table from the CSV file in the working directory.
D <- fread(input = "StormData.csv")
##
Read 0.0% of 967216 rows
Read 31.0% of 967216 rows
Read 51.7% of 967216 rows
Read 70.3% of 967216 rows
Read 79.6% of 967216 rows
Read 92.0% of 967216 rows
Read 902297 rows and 37 (of 37) columns from 0.523 GB file in 00:00:09
# Work on the data under the name Dt (as a data.table) and drop the D binding
# so the full raw table is not kept around under two names.
Dt <- as.data.table(D)
rm(D)

# Keep only the event type plus the casualty and damage columns.
Dt <- Dt[, c("EVTYPE", "FATALITIES", "INJURIES",
             "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"),
         with = FALSE]

# Keep only events with at least one injury or fatality, or with a positive
# property or crop damage figure; rows where the condition is NA are dropped.
Dt <- Dt[INJURIES > 0 | FATALITIES > 0 | PROPDMG > 0 | CROPDMG > 0]
# Count and mantissa columns: any blank entries are replaced by 0.
Dt$FATALITIES[Dt$FATALITIES == ""] <- 0
Dt$INJURIES[Dt$INJURIES == ""] <- 0
Dt$PROPDMG[Dt$PROPDMG == ""] <- 0
Dt$CROPDMG[Dt$CROPDMG == ""] <- 0

# Property damage exponent: upper-case the code so "k"/"K" etc. are treated
# alike, then translate each code into the power of ten it stands for.
Dt$PROPDMGEXP <- toupper(as.character(Dt$PROPDMGEXP))
# Missing, blank and symbol codes all map to exponent 0.
Dt$PROPDMGEXP[is.na(Dt$PROPDMGEXP) |
                Dt$PROPDMGEXP %in% c("", "+", "-", "?")] <- "0"
Dt$PROPDMGEXP[Dt$PROPDMGEXP == "H"] <- "2"  # hundreds
Dt$PROPDMGEXP[Dt$PROPDMGEXP == "K"] <- "3"  # thousands
Dt$PROPDMGEXP[Dt$PROPDMGEXP == "M"] <- "6"  # millions
Dt$PROPDMGEXP[Dt$PROPDMGEXP == "B"] <- "9"  # billions

# Crop damage exponent: same translation as for property damage.
Dt$CROPDMGEXP <- toupper(as.character(Dt$CROPDMGEXP))
Dt$CROPDMGEXP[is.na(Dt$CROPDMGEXP) |
                Dt$CROPDMGEXP %in% c("", "+", "-", "?")] <- "0"
Dt$CROPDMGEXP[Dt$CROPDMGEXP == "H"] <- "2"
Dt$CROPDMGEXP[Dt$CROPDMGEXP == "K"] <- "3"
Dt$CROPDMGEXP[Dt$CROPDMGEXP == "M"] <- "6"
Dt$CROPDMGEXP[Dt$CROPDMGEXP == "B"] <- "9"
# Collect every row that still has an NA in any column and preview it;
# an empty preview means no row contains a missing value.
na_Dt <- Dt[!complete.cases(Dt), ]
head(na_Dt)
## [1] EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG
## [7] CROPDMGEXP
## <0 rows> (or 0-length row.names)
Since the check above returns zero rows, no rows contain NA values, so nothing needs to be omitted.
# Convert the cleaned exponent codes (stored as text) to integer powers of ten.
Dt$PROPDMGEXP <- as.integer(Dt$PROPDMGEXP)
Dt$CROPDMGEXP <- as.integer(Dt$CROPDMGEXP)

# Damage amount per category: mantissa * 10^exponent.
Dt$PROPDMGTOT <- Dt$PROPDMG * 10^Dt$PROPDMGEXP
Dt$CROPDMGTOT <- Dt$CROPDMG * 10^Dt$CROPDMGEXP

# Total damage is property damage plus crop damage. Both terms must appear
# here: adding PROPDMGTOT twice would double-count property damage and leave
# crop damage out of the total.
Dt$TOTDMG <- Dt$PROPDMGTOT + Dt$CROPDMGTOT
# Sum each measure per event type; one aggregate per measure so that each
# table has EVTYPE plus a single value column.
Fatalities <- aggregate(FATALITIES ~ EVTYPE, data = Dt, FUN = sum)
Injuries <- aggregate(INJURIES ~ EVTYPE, data = Dt, FUN = sum)
PropertyDamage <- aggregate(PROPDMGTOT ~ EVTYPE, data = Dt, FUN = sum)
CropDamage <- aggregate(CROPDMGTOT ~ EVTYPE, data = Dt, FUN = sum)
TotalDamage <- aggregate(TOTDMG ~ EVTYPE, data = Dt, FUN = sum)

# Full outer join of the five per-event tables on EVTYPE, folded left to right.
Summary <- Reduce(
  function(left, right) merge(left, right, by = "EVTYPE", all = TRUE),
  list(Fatalities, Injuries, PropertyDamage, CropDamage, TotalDamage)
)

# For each measure, keep the ten event types with the largest totals
# (each result carries all Summary columns) for the plots that follow.
Fatalities <- Summary[order(Summary$FATALITIES, decreasing = TRUE)[1:10], ]
Injuries <- Summary[order(Summary$INJURIES, decreasing = TRUE)[1:10], ]
PropertyDamage <- Summary[order(Summary$PROPDMGTOT, decreasing = TRUE)[1:10], ]
CropDamage <- Summary[order(Summary$CROPDMGTOT, decreasing = TRUE)[1:10], ]
TotalDamage <- Summary[order(Summary$TOTDMG, decreasing = TRUE)[1:10], ]
# Two stacked panels: top-ten event types by fatalities, then by injuries.
# cex is set after mfrow in the same call so the 0.7 scaling is what applies.
par(mfrow = c(2, 1), mar = c(5, 4, 4, 2), oma = c(4, 2, 2, 2), cex = 0.7)

# Upper panel: fatalities, with event names drawn vertically (las = 3).
barplot(
  height = Fatalities$FATALITIES,
  names.arg = Fatalities$EVTYPE,
  las = 3,
  cex.names = 0.7,
  xlab = "",
  ylab = "Total Fatalities",
  col = "blue",
  main = "Weather Events with Highest Fatalities"
)

# Lower panel: injuries. The title literal contains a line break, so it is
# drawn on two lines.
barplot(
  height = Injuries$INJURIES,
  names.arg = Injuries$EVTYPE,
  las = 3,
  cex.names = 0.7,
  xlab = "",
  ylab = "Total Injuries",
  col = "blue",
  main = "Weather Events
with Highest Injuries"
)
The bar charts clearly show that Tornadoes, Excessive Heat, Flash Floods, Heat, and Lightning are the weather events with the highest fatalities. The weather events with the highest injuries are Tornadoes, TSTM (thunderstorm) Wind, Floods, Excessive Heat, and Lightning.
# Two stacked panels: top-ten event types by property damage, then by crop
# damage. cex is set after mfrow in the same call so the 0.7 scaling applies.
par(mfrow = c(2, 1), mar = c(5, 4, 4, 2), oma = c(4, 2, 2, 2), cex = 0.7)

# Upper panel: property damage, rescaled to millions for the axis.
barplot(
  height = PropertyDamage$PROPDMGTOT / 10^6,
  names.arg = PropertyDamage$EVTYPE,
  las = 3,
  cex.names = 0.7,
  xlab = "",
  ylab = "Total Property Damage in USD (Millions)",
  col = "blue",
  main = "Weather Events with Highest Property Damage Cost"
)

# Lower panel: crop damage, rescaled to millions for the axis.
barplot(
  height = CropDamage$CROPDMGTOT / 10^6,
  names.arg = CropDamage$EVTYPE,
  las = 3,
  cex.names = 0.7,
  xlab = "",
  ylab = "Total Crop Damage in USD (Millions)",
  col = "blue",
  main = "Weather Events with Highest Crop Damage Cost"
)
The bar charts clearly show that Floods, Hurricanes, Tornadoes, Storm Surges, and Flash Floods are the weather events with the highest property damage. The weather events with the highest crop damage are Droughts, Floods, River Floods, Ice Storms, and Hail.