This report contains an analysis of the effects of weather events on population health and the economy. The data used here has been downloaded from the storm events database kept by the National Climatic Data Center in the United States. The data has been processed and analysed, and a number of exploratory plots have been produced.
Some of the event types in the data do not conform to the event types listed in the “National Weather Service” document. I have listed the most common non-conforming names and replaced each of them with the closest matching official type I could find (the matching is listed explicitly in the code below) to produce the baseline analytic data. Any non-official event type name that occurs fewer than 1000 times has been ignored.
# Load the raw storm events data.
USWeather_raw <- read.csv("repdata_data_StormData.csv")
# Finding the most common non-official names
# EVTYPES.txt supplies the official event type names, one per line; they are
# upper-cased before comparison. The connection is left open here because the
# script closes it later with close(off_evtype).
off_evtype <- file("EVTYPES.txt","r")
off_evtype_names <- toupper(readLines(off_evtype))
# TRUE for rows whose event type is one of the official names
off_logical <- USWeather_raw$EVTYPE %in% off_evtype_names
# 20 most common non-official names
# table() counts occurrences whether EVTYPE is a factor or a character vector.
# summary() only gives per-value counts for factors, and read.csv() returns
# character columns by default from R 4.0.0 onwards, so it is not used here.
head(sort(table(USWeather_raw$EVTYPE[!off_logical]), decreasing = TRUE), 20)
## TSTM WIND THUNDERSTORM WINDS MARINE TSTM WIND
## 219940 20843 6175
## URBAN/SML STREAM FLD HIGH WINDS WILD/FOREST FIRE
## 3392 1533 1457
## WINTER WEATHER/MIX TSTM WIND/HAIL FLASH FLOODING
## 1104 1028 682
## EXTREME COLD FLOOD/FLASH FLOOD LANDSLIDE
## 655 624 600
## SNOW FOG WIND
## 587 538 340
## RIP CURRENTS STORM SURGE FREEZING RAIN
## 304 261 250
## URBAN FLOOD HEAVY SURF/HIGH SURF
## 249 228
# Replacing most common EVTYPES with non official names with the official name
# Any item with greater than 1000 occurrences is replaced to create analytical data
#
# Each non-official name is matched against the WHOLE event type value and
# mapped to its official replacement in a single pass. The previous chain of
# substring gsub() calls was order-dependent: the "TSTM WIND" rule ran first
# and turned "TSTM WIND/HAIL" into "THUNDERSTORM WIND/HAIL", so the dedicated
# "TSTM WIND/HAIL" rule could never match. Exact matching also means values
# that merely contain one of these names are left untouched.
# NOTE: outputs recorded in this report were produced before this fix and may
# change slightly when the analysis is re-run.
USWeather_anal <- USWeather_raw
# names = non-official spelling found in the data, values = official type
evtype_recode <- c(
  "TSTM WIND"            = "THUNDERSTORM WIND",
  "THUNDERSTORM WINDS"   = "THUNDERSTORM WIND",
  "MARINE TSTM WIND"     = "MARINE THUNDERSTORM WIND",
  "URBAN/SML STREAM FLD" = "FLASH FLOOD",
  "HIGH WINDS"           = "HIGH WIND",
  "WILD/FOREST FIRE"     = "WILDFIRE",
  "WINTER WEATHER/MIX"   = "WINTER WEATHER",
  "TSTM WIND/HAIL"       = "THUNDERSTORM WIND"
)
# Work on a character copy so the result is character whether EVTYPE was read
# as a factor or as a character column.
evtype_chr <- as.character(USWeather_anal$EVTYPE)
# Position of each value in the lookup table, NA when no rule applies
recode_idx <- match(evtype_chr, names(evtype_recode))
needs_recode <- !is.na(recode_idx)
evtype_chr[needs_recode] <- unname(evtype_recode[recode_idx[needs_recode]])
USWeather_anal$EVTYPE <- evtype_chr
# Drop every row whose event type is still not one of the official names
off_logical <- USWeather_anal$EVTYPE %in% off_evtype_names
USWeather_anal <- USWeather_anal[off_logical, , drop = FALSE]
# Dimensions (rows, columns) before and after the filter
size_raw <- dim(USWeather_raw)
size_anal <- dim(USWeather_anal)
# Percentage of the raw rows that the filter removed
rows_removed <- size_raw[1] - size_anal[1]
percentage_lost <- 100 * rows_removed / size_raw[1]
percentage_lost
## [1] 1.393333
# Column subsets used by the two strands of the analysis
# Population health: casualty counts plus state and event dates
poph_cols <- c("EVTYPE", "FATALITIES", "INJURIES",
               "STATE", "BGN_DATE", "END_DATE")
USWeather_poph <- USWeather_anal[, poph_cols, drop = FALSE]
# Economy: property/crop damage figures and their exponent codes
eco_cols <- c("EVTYPE", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP",
              "STATE", "BGN_DATE", "END_DATE")
USWeather_eco <- USWeather_anal[, eco_cols, drop = FALSE]
# The connection to the official-names file is no longer needed
close(off_evtype)
I have found and listed the “top ten” weather types that cause the highest average number of fatalities and injuries per event, and also the “top ten” weather types that cause the highest total number of fatalities and injuries. There is also a bar chart to summarise this information. I have limited the y-axis range in the second plot because the number of injuries caused by tornadoes is so large.
# Fatalities per event type
# Mean per recorded event: how deadly a single event of each type is
mean_fatalities <- with(USWeather_poph, tapply(FATALITIES, EVTYPE, mean))
# Total across all recorded events: which types kill the most overall
sum_fatalities <- with(USWeather_poph, tapply(FATALITIES, EVTYPE, sum))
# Injuries per event type
# Mean per recorded event
mean_injuries <- with(USWeather_poph, tapply(INJURIES, EVTYPE, mean))
# Total across all recorded events
sum_injuries <- with(USWeather_poph, tapply(INJURIES, EVTYPE, sum))
# Fatalities: listings
# Top 10 event types by mean fatalities per event
head(sort(mean_fatalities, decreasing = TRUE), n = 10)
## TSUNAMI HEAT EXCESSIVE HEAT
## 1.6500000 1.2216428 1.1340882
## RIP CURRENT AVALANCHE MARINE STRONG WIND
## 0.7829787 0.5803109 0.2916667
## COLD/WIND CHILL HIGH SURF EXTREME COLD/WIND CHILL
## 0.1762523 0.1393103 0.1247505
## TORNADO
## 0.0928741
# Top 10 event types by total fatalities
head(sort(sum_fatalities, decreasing = TRUE), n = 10)
## TORNADO EXCESSIVE HEAT FLASH FLOOD HEAT
## 5633 1903 1006 937
## LIGHTNING THUNDERSTORM WIND FLOOD RIP CURRENT
## 816 701 470 368
## HIGH WIND AVALANCHE
## 283 224
# Injuries: listings
# Top 10 event types by mean injuries per event
head(sort(mean_injuries, decreasing = TRUE), n = 10)
## TSUNAMI EXCESSIVE HEAT HEAT
## 6.4500000 3.8885578 2.7379400
## TORNADO DUST STORM ICE STORM
## 1.5060674 1.0304450 0.9845464
## RIP CURRENT TROPICAL STORM MARINE STRONG WIND
## 0.4936170 0.4927536 0.4583333
## AVALANCHE
## 0.4404145
# Top 10 event types by total injuries
head(sort(sum_injuries, decreasing = TRUE), n = 10)
## TORNADO THUNDERSTORM WIND FLOOD EXCESSIVE HEAT
## 91346 9353 6789 6525
## LIGHTNING HEAT ICE STORM FLASH FLOOD
## 5230 2100 1975 1856
## WILDFIRE HIGH WIND
## 1456 1439
# Bar charts summarising the fatality and injury results
# Number of event types shown in each chart
num_bars <- 10
# Left chart data: event types ranked by mean fatalities per event, together
# with the mean injuries for those same event types
ranked_mean_fatalities <- sort(mean_fatalities, decreasing = TRUE)
barnames_mean <- head(names(ranked_mean_fatalities), num_bars)
mean_fatal <- mean_fatalities[barnames_mean]
mean_inj <- mean_injuries[barnames_mean]
# Row 1 holds fatalities, row 2 injuries; one column per event type
plot_matrix_mean <- matrix(c(mean_fatal, mean_inj),
                           nrow = 2, ncol = num_bars, byrow = TRUE)
# Right chart data: event types ranked by total fatalities, together with the
# total injuries for those same event types
ranked_sum_fatalities <- sort(sum_fatalities, decreasing = TRUE)
barnames_sum <- head(names(ranked_sum_fatalities), num_bars)
sum_fatal <- sum_fatalities[barnames_sum]
sum_inj <- sum_injuries[barnames_sum]
plot_matrix_sum <- matrix(c(sum_fatal, sum_inj),
                          nrow = 2, ncol = num_bars, byrow = TRUE)
# Large bottom margin leaves room for the vertical event type labels;
# the two charts are drawn side by side
par(mar = c(13.1, 4.1, 4.1, 2.1))
par(mfrow = c(1, 2))
colours <- c("blue", "red")
# Chart 1: stacked mean fatalities and injuries per event
barplot(height = plot_matrix_mean,
        names.arg = barnames_mean,
        col = colours,
        las = 2,
        ylab = "Average Values")
legend(x = "topright",
       legend = c("Fatalities", "Injuries"),
       fill = colours)
# Chart 2: stacked total fatalities and injuries; the y axis is capped at
# 12000 so that the remaining bars stay readable
barplot(height = plot_matrix_sum,
        names.arg = barnames_sum,
        col = colours,
        las = 2,
        ylim = c(0, 12000),
        ylab = "Total Values")
legend(x = "topleft",
       legend = c("Fatalities", "Injuries"),
       fill = colours)
### Analysing Economic Effects

I have found and listed the “top ten” weather types that cause the highest cost on average and also the “top ten” weather types that cause the highest total cost, first for property damage and then for crop damage. There is also a bar chart to summarise this information.
# Property damage analysis
# Keep only rows whose exponent code is blank or one of H, K, M, B (either
# case); rows with any other code are dropped.
KMB <- c("","h","H","k","K","m","M","b","B")
USWeather_eco_prop <- USWeather_eco[USWeather_eco$PROPDMGEXP %in% KMB,
                                    c("EVTYPE", "PROPDMG", "PROPDMGEXP"),
                                    drop = FALSE]
# Percentage of economic rows dropped by the exponent filter
size_eco <- dim(USWeather_eco)
size_eco_prop <- dim(USWeather_eco_prop)
rows_removed <- size_eco[1] - size_eco_prop[1]
percentage_lost <- 100 * rows_removed / size_eco[1]
percentage_lost
## [1] 0.03383068
# Multiplier for each exponent code, in the same order as KMB:
# blank = 1, h/H = 100, k/K = 1000, m/M = 1e6, b/B = 1e9
KMB_cost <- c(1,100,100,1000,1000,1000000,1000000,1000000000,1000000000)
# Look each row's code up in KMB and store the matching multiplier as text
code_position <- match(USWeather_eco_prop$PROPDMGEXP, KMB)
USWeather_eco_prop$PROPDMGUNITS <- as.character(KMB_cost)[code_position]
# Damage figure scaled by its multiplier
USWeather_eco_prop$PROPDMGCOST <-
  USWeather_eco_prop$PROPDMG * as.numeric(USWeather_eco_prop$PROPDMGUNITS)
# Mean property damage cost per event, by event type
prop_mean_cost <- with(USWeather_eco_prop, tapply(PROPDMGCOST, EVTYPE, mean))
# Total property damage cost, by event type
prop_sum_cost <- with(USWeather_eco_prop, tapply(PROPDMGCOST, EVTYPE, sum))
# Top 10 event types by mean property damage cost
head(sort(prop_mean_cost, decreasing = TRUE), n = 10)
## STORM SURGE/TIDE TROPICAL STORM TSUNAMI FLOOD
## 31359378.4 11165058.8 7203100.0 5712051.7
## ICE STORM WILDFIRE TORNADO WINTER STORM
## 1967545.0 1841380.6 939061.2 585068.0
## DROUGHT COASTAL FLOOD
## 420460.6 365639.3
# Top 10 event types by total property damage cost
head(sort(prop_sum_cost, decreasing = TRUE), n = 10)
## FLOOD TORNADO FLASH FLOOD HAIL
## 144657709807 56937160483 16199121367 15732267277
## THUNDERSTORM WIND WILDFIRE TROPICAL STORM WINTER STORM
## 9704063633 7766943500 7703890550 6688497250
## HIGH WIND STORM SURGE/TIDE
## 5878369913 4641188000
# Bar charts for property damage
# Number of event types shown in each chart
num_bars <- 10
# Event types with the highest mean property damage cost per event
ranked_prop_mean <- sort(prop_mean_cost, decreasing = TRUE)
barnames_mean <- head(names(ranked_prop_mean), num_bars)
mean_prop <- prop_mean_cost[barnames_mean]
# Event types with the highest total property damage cost
ranked_prop_sum <- sort(prop_sum_cost, decreasing = TRUE)
barnames_sum <- head(names(ranked_prop_sum), num_bars)
sum_prop <- prop_sum_cost[barnames_sum]
# Large bottom margin for the vertical labels; two charts side by side
par(mar = c(13.1, 4.1, 4.1, 2.1))
par(mfrow = c(1, 2))
# Chart 1: mean cost per event
barplot(height = mean_prop,
        names.arg = barnames_mean,
        las = 2,
        ylab = "Average Values")
# Chart 2: total cost
barplot(height = sum_prop,
        names.arg = barnames_sum,
        las = 2,
        ylab = "Total Values")
# Crop damage analysis
# Keep only rows whose exponent code is blank or one of H, K, M, B (either
# case); rows with any other code are dropped.
KMB <- c("","h","H","k","K","m","M","b","B")
USWeather_eco_crop <- USWeather_eco[USWeather_eco$CROPDMGEXP %in% KMB,
                                    c("EVTYPE", "CROPDMG", "CROPDMGEXP"),
                                    drop = FALSE]
# Percentage of economic rows dropped by the exponent filter
size_eco <- dim(USWeather_eco)
size_eco_crop <- dim(USWeather_eco_crop)
rows_removed <- size_eco[1] - size_eco_crop[1]
percentage_lost <- 100 * rows_removed / size_eco[1]
percentage_lost
## [1] 0.002585068
# Multiplier for each exponent code, in the same order as KMB:
# blank = 1, h/H = 100, k/K = 1000, m/M = 1e6, b/B = 1e9
KMB_cost <- c(1,100,100,1000,1000,1000000,1000000,1000000000,1000000000)
# Look each row's code up in KMB and store the matching multiplier as text
code_position <- match(USWeather_eco_crop$CROPDMGEXP, KMB)
USWeather_eco_crop$CROPDMGUNITS <- as.character(KMB_cost)[code_position]
# Damage figure scaled by its multiplier
USWeather_eco_crop$CROPDMGCOST <-
  USWeather_eco_crop$CROPDMG * as.numeric(USWeather_eco_crop$CROPDMGUNITS)
# Mean crop damage cost per event, by event type
crop_mean_cost <- with(USWeather_eco_crop, tapply(CROPDMGCOST, EVTYPE, mean))
# Total crop damage cost, by event type
crop_sum_cost <- with(USWeather_eco_crop, tapply(CROPDMGCOST, EVTYPE, sum))
# Top 10 event types by mean crop damage cost
head(sort(crop_mean_cost, decreasing = TRUE), n = 10)
## DROUGHT ICE STORM TROPICAL STORM FROST/FREEZE HEAT
## 5618241.25 2503546.11 983110.14 815265.28 523417.86
## EXCESSIVE HEAT FLOOD WILDFIRE HEAVY RAIN BLIZZARD
## 293445.77 223563.47 95369.76 62560.76 41213.68
# Top 10 event types by total crop damage cost
head(sort(crop_sum_cost, decreasing = TRUE), n = 10)
## DROUGHT FLOOD ICE STORM HAIL
## 13972566000 5661968450 5022113500 3025954453
## FLASH FLOOD THUNDERSTORM WIND FROST/FREEZE HEAVY RAIN
## 1429805200 1159505108 1094086000 733399800
## HIGH WIND TROPICAL STORM
## 679291900 678346000
# Bar charts for crop damage
# Number of event types shown in each chart
num_bars <- 10
# Event types with the highest mean crop damage cost per event
ranked_crop_mean <- sort(crop_mean_cost, decreasing = TRUE)
barnames_mean <- head(names(ranked_crop_mean), num_bars)
mean_crop <- crop_mean_cost[barnames_mean]
# Event types with the highest total crop damage cost
ranked_crop_sum <- sort(crop_sum_cost, decreasing = TRUE)
barnames_sum <- head(names(ranked_crop_sum), num_bars)
sum_crop <- crop_sum_cost[barnames_sum]
# Large bottom margin for the vertical labels; two charts side by side
par(mar = c(13.1, 4.1, 4.1, 2.1))
par(mfrow = c(1, 2))
# Chart 1: mean cost per event
barplot(height = mean_crop,
        names.arg = barnames_mean,
        las = 2,
        ylab = "Average Values")
# Chart 2: total cost
barplot(height = sum_crop,
        names.arg = barnames_sum,
        las = 2,
        ylab = "Total Values")