This report contains an analysis of the effects of weather events on population health and the economy. The data used here has been downloaded from the storm events database kept by the National Climatic Data Center in the United States. The data has been analysed and processed, and a number of exploratory plots have been produced.
I have produced 3 data frames of analytic data. One which can be used to analyse effects on health, one which can be used to analyse the economic effects of crop damage and one which can be used to analyse the economic effects of property damage.
To produce analytic data I have had to resolve two problems. Firstly, some of the event types in the data do not conform to the event types listed in the “National Weather Service” document. Secondly, some of the units for the costs of property and crop damage are not in a format I recognise (i.e. they are not blank or one of h, k, m or b for hundreds, thousands, millions or billions).
To resolve the first problem I have listed the most common names that don’t conform and replaced them with the closest matching official type I could find (the mapping is listed clearly in the code below). Any non-official event type with fewer than 1000 occurrences has been ignored. These account for a very small percentage of the data, which is reported below.
To resolve the second problem I have ignored cost units that I do not recognise (again, this is a very small percentage, which is reported below) and produced an additional column with all costs converted to dollars with no units, so that comparisons can be made.
Code to resolve the first problem and produce analytic data for health:
# Load the raw storm events data
USWeather_raw <- read.csv("repdata_data_StormData.csv.bz2")
# Finding the most common non-official names
# The official names are read from EVTYPES.txt and upper-cased before being
# compared with the EVTYPE column. The connection is left open here because
# it is closed further down, once the analytic data frames have been built.
off_evtype <- file("EVTYPES.txt", "r")
off_evtype_names <- toupper(readLines(off_evtype))
off_logical <- USWeather_raw$EVTYPE %in% off_evtype_names
# 20 most common non-official names
# table() counts occurrences whether EVTYPE was read as a factor or as a
# character vector; summary() only returns per-value counts for a factor and
# would print Length/Class/Mode for a character column.
non_off_counts <- table(as.character(USWeather_raw$EVTYPE[!off_logical]))
head(sort(non_off_counts, decreasing = TRUE), 20)
## TSTM WIND THUNDERSTORM WINDS MARINE TSTM WIND
## 219940 20843 6175
## URBAN/SML STREAM FLD HIGH WINDS WILD/FOREST FIRE
## 3392 1533 1457
## WINTER WEATHER/MIX TSTM WIND/HAIL FLASH FLOODING
## 1104 1028 682
## EXTREME COLD FLOOD/FLASH FLOOD LANDSLIDE
## 655 624 600
## SNOW FOG WIND
## 587 538 340
## RIP CURRENTS STORM SURGE FREEZING RAIN
## 304 261 250
## URBAN FLOOD HEAVY SURF/HIGH SURF
## 249 228
# Replacing the most common non-official EVTYPE names with the official name
# Any name with more than 1000 occurrences is replaced to create analytical data
# Lookup table: names are the non-official spellings, values the official ones.
evtype_fix <- c(
  "TSTM WIND"            = "THUNDERSTORM WIND",
  "THUNDERSTORM WINDS"   = "THUNDERSTORM WIND",
  "MARINE TSTM WIND"     = "MARINE THUNDERSTORM WIND",
  "URBAN/SML STREAM FLD" = "FLASH FLOOD",
  "HIGH WINDS"           = "HIGH WIND",
  "WILD/FOREST FIRE"     = "WILDFIRE",
  "WINTER WEATHER/MIX"   = "WINTER WEATHER",
  "TSTM WIND/HAIL"       = "THUNDERSTORM WIND"
)
USWeather_off <- USWeather_raw
# Work on a character copy so the column has the same type gsub() returned
USWeather_off$EVTYPE <- as.character(USWeather_off$EVTYPE)
# Each name is matched exactly rather than by substring substitution.
# A substring replacement of "TSTM WIND" also rewrites "TSTM WIND/HAIL" to
# "THUNDERSTORM WIND/HAIL", so the rule for "TSTM WIND/HAIL" never matches.
needs_fix <- USWeather_off$EVTYPE %in% names(evtype_fix)
fixed_names <- evtype_fix[USWeather_off$EVTYPE[needs_fix]]
USWeather_off$EVTYPE[needs_fix] <- unname(fixed_names)
# Keep only the rows whose event type is one of the official names
off_logical <- is.element(USWeather_off$EVTYPE, off_evtype_names)
USWeather_off <- USWeather_off[off_logical, ]
# Dimensions of the raw data and of the filtered data
size_raw <- dim(USWeather_raw)
size_off <- dim(USWeather_off)
# Percentage of the raw rows that the filter discarded
rows_dropped <- size_raw[1] - size_off[1]
percentage_lost <- 100 * rows_dropped / size_raw[1]
percentage_lost
## [1] 1.393333
# Splitting the cleaned data into one health and two economic data frames
# Column sets: health, property damage, crop damage
poph_cols <- c("EVTYPE", "FATALITIES", "INJURIES")
ecop_cols <- c("EVTYPE", "PROPDMG", "PROPDMGEXP")
ecoc_cols <- c("EVTYPE", "CROPDMG", "CROPDMGEXP")
# Health data
USWeather_poph_anal <- USWeather_off[, poph_cols, drop = FALSE]
# Economic data (crop damage, then property damage)
USWeather_ecoc <- USWeather_off[, ecoc_cols, drop = FALSE]
USWeather_ecop <- USWeather_off[, ecop_cols, drop = FALSE]
# The connection to the official event type list is no longer needed
close(off_evtype)
Code to resolve the second problem and produce the analytic data for the crop and property economic effects:
# Analysing property damage
# KMB holds the recognised unit codes and KMB_cost the dollar multiplier for
# the code in the same position (blank = 1, h/H = 100, k/K = 1000,
# m/M = 1000000, b/B = 1000000000).
KMB <- c("", "h", "H", "k", "K", "m", "M", "b", "B")
KMB_cost <- c(1, 100, 100, 1000, 1000, 1000000, 1000000, 1000000000, 1000000000)
# Removing entries which aren't blank or H,K,M,B from property damage.
size_ecop <- dim(USWeather_ecop)
USWeather_ecop <- USWeather_ecop[USWeather_ecop$PROPDMGEXP %in% KMB, ]
# Working out percentage data that has been lost
size_ecop_trim <- dim(USWeather_ecop)
percentage_lost <- 100 * (size_ecop[1] - size_ecop_trim[1]) / size_ecop[1]
percentage_lost
## [1] 0.03383068
# Converting units to numbers
# Every remaining code is in KMB, so an exact match() lookup finds its
# multiplier in one pass. This replaces a chain of regex substitutions that
# turned numbers into strings and back. PROPDMGUNITS is stored as a number.
USWeather_ecop$PROPDMGUNITS <- KMB_cost[match(USWeather_ecop$PROPDMGEXP, KMB)]
# Damage in plain dollars = reported amount * unit multiplier
USWeather_ecop$PROPDMGCOST <- USWeather_ecop$PROPDMG * USWeather_ecop$PROPDMGUNITS
USWeather_ecop_anal <- USWeather_ecop
# Analysing crop damage
# KMB holds the recognised unit codes and KMB_cost the dollar multiplier for
# the code in the same position (blank = 1, h/H = 100, k/K = 1000,
# m/M = 1000000, b/B = 1000000000).
KMB <- c("", "h", "H", "k", "K", "m", "M", "b", "B")
KMB_cost <- c(1, 100, 100, 1000, 1000, 1000000, 1000000, 1000000000, 1000000000)
# Removing entries which aren't blank or H,K,M,B from crop damage.
size_ecoc <- dim(USWeather_ecoc)
USWeather_ecoc <- USWeather_ecoc[USWeather_ecoc$CROPDMGEXP %in% KMB, ]
# Working out percentage data that has been lost
size_ecoc_trim <- dim(USWeather_ecoc)
percentage_lost <- 100 * (size_ecoc[1] - size_ecoc_trim[1]) / size_ecoc[1]
percentage_lost
## [1] 0.002585068
# Converting units to numbers
# Every remaining code is in KMB, so an exact match() lookup finds its
# multiplier in one pass. This replaces a chain of regex substitutions that
# turned numbers into strings and back. CROPDMGUNITS is stored as a number.
USWeather_ecoc$CROPDMGUNITS <- KMB_cost[match(USWeather_ecoc$CROPDMGEXP, KMB)]
# Damage in plain dollars = reported amount * unit multiplier
USWeather_ecoc$CROPDMGCOST <- USWeather_ecoc$CROPDMG * USWeather_ecoc$CROPDMGUNITS
USWeather_ecoc_anal <- USWeather_ecoc
I have found and listed the “top ten” weather types that cause the highest average number of fatalities and injuries and also the “top ten” weather types that cause the highest total number of fatalities and injuries. There is also a bar chart to summarise this information. I have limited the y-axis range in the second plot because the number of injuries caused by tornadoes is so large.
# Analysing fatalities, grouped by event type
# Mean per recorded event: the most dangerous weather (for fatalities)
mean_fatalities <- with(USWeather_poph_anal, tapply(FATALITIES, EVTYPE, mean))
# Total over all events: the highest killers
sum_fatalities <- with(USWeather_poph_anal, tapply(FATALITIES, EVTYPE, sum))
# Analysing injuries, grouped by event type
# Mean per recorded event: the most dangerous weather (for injuries)
mean_injuries <- with(USWeather_poph_anal, tapply(INJURIES, EVTYPE, mean))
# Total over all events: the highest number of injuries
sum_injuries <- with(USWeather_poph_anal, tapply(INJURIES, EVTYPE, sum))
# Helper: the n largest entries of a named vector, largest first
top_ten <- function(values, n = 10) {
  head(sort(values, decreasing = TRUE), n)
}
# Analysing fatalities
# The top 10 most dangerous weather types (highest mean fatalities)
top_ten(mean_fatalities)
## TSUNAMI HEAT EXCESSIVE HEAT
## 1.6500000 1.2216428 1.1340882
## RIP CURRENT AVALANCHE MARINE STRONG WIND
## 0.7829787 0.5803109 0.2916667
## COLD/WIND CHILL HIGH SURF EXTREME COLD/WIND CHILL
## 0.1762523 0.1393103 0.1247505
## TORNADO
## 0.0928741
# The top 10 highest killers (highest total fatalities)
top_ten(sum_fatalities)
## TORNADO EXCESSIVE HEAT FLASH FLOOD HEAT
## 5633 1903 1006 937
## LIGHTNING THUNDERSTORM WIND FLOOD RIP CURRENT
## 816 701 470 368
## HIGH WIND AVALANCHE
## 283 224
# Analysing injuries
# The top 10 most dangerous weather types (highest mean injuries)
top_ten(mean_injuries)
## TSUNAMI EXCESSIVE HEAT HEAT
## 6.4500000 3.8885578 2.7379400
## TORNADO DUST STORM ICE STORM
## 1.5060674 1.0304450 0.9845464
## RIP CURRENT TROPICAL STORM MARINE STRONG WIND
## 0.4936170 0.4927536 0.4583333
## AVALANCHE
## 0.4404145
# The top 10 weather types by total injuries
top_ten(sum_injuries)
## TORNADO THUNDERSTORM WIND FLOOD EXCESSIVE HEAT
## 91346 9353 6789 6525
## LIGHTNING HEAT ICE STORM FLASH FLOOD
## 5230 2100 1975 1856
## WILDFIRE HIGH WIND
## 1456 1439
# Producing a plot to illustrate the health results
# Extracting data for plots
num_bars <- 10
# Left panel data: the event types with the highest mean fatalities, plus the
# mean injuries for those same event types
barnames_mean <- names(head(sort(mean_fatalities, decreasing = TRUE), num_bars))
mean_fatal <- mean_fatalities[barnames_mean]
mean_inj <- mean_injuries[barnames_mean]
# Row 1 holds fatalities, row 2 injuries; one column per event type
plot_matrix_mean <- matrix(
  c(mean_fatal, mean_inj),
  nrow = 2,
  ncol = num_bars,
  byrow = TRUE
)
# Right panel data: the event types with the highest total fatalities, plus
# the total injuries for those same event types
barnames_sum <- names(head(sort(sum_fatalities, decreasing = TRUE), num_bars))
sum_fatal <- sum_fatalities[barnames_sum]
sum_inj <- sum_injuries[barnames_sum]
plot_matrix_sum <- matrix(
  c(sum_fatal, sum_inj),
  nrow = 2,
  ncol = num_bars,
  byrow = TRUE
)
# Plot options: enlarged bottom margin (presumably to fit the perpendicular
# axis labels drawn with las = 2) and two panels side by side
par(mar = c(13.1, 4.1, 4.1, 2.1))
par(mfrow = c(1, 2))
colours <- c("blue", "red")
# Bar plot of mean fatalities and injuries for the most dangerous weather types
barplot(
  plot_matrix_mean,
  names.arg = barnames_mean,
  col = colours,
  las = 2,
  ylab = "Average Values"
)
legend("topright", legend = c("Fatalities", "Injuries"), fill = colours)
# Bar plot of totals for the weather types causing the most fatalities;
# the y-axis range is fixed at 0 to 12000
barplot(
  plot_matrix_sum,
  names.arg = barnames_sum,
  col = colours,
  las = 2,
  ylim = c(0, 12000),
  ylab = "Total Values"
)
legend("topleft", legend = c("Fatalities", "Injuries"), fill = colours)
I have found and listed the “top ten” weather types that cause the highest cost on average and also the “top ten” weather types that cause the highest total cost. The results are given first for property damage and then for crop damage. There is also a bar chart to summarise this information.
# Property damage cost per event type
# Mean cost per recorded event: the most expensive events on average
prop_mean_cost <- with(USWeather_ecop_anal, tapply(PROPDMGCOST, EVTYPE, mean))
# Total cost over all events: the events that cost the most overall
prop_sum_cost <- with(USWeather_ecop_anal, tapply(PROPDMGCOST, EVTYPE, sum))
# The top 10 most expensive on average
prop_mean_ranked <- sort(prop_mean_cost, decreasing = TRUE)
head(prop_mean_ranked, 10)
## STORM SURGE/TIDE TROPICAL STORM TSUNAMI FLOOD
## 31359378.4 11165058.8 7203100.0 5712051.7
## ICE STORM WILDFIRE TORNADO WINTER STORM
## 1967545.0 1841380.6 939061.2 585068.0
## DROUGHT COASTAL FLOOD
## 420460.6 365639.3
# The top 10 by total money spent
prop_sum_ranked <- sort(prop_sum_cost, decreasing = TRUE)
head(prop_sum_ranked, 10)
## FLOOD TORNADO FLASH FLOOD HAIL
## 144657709807 56937160483 16199121367 15732267277
## THUNDERSTORM WIND WILDFIRE TROPICAL STORM WINTER STORM
## 9704063633 7766943500 7703890550 6688497250
## HIGH WIND STORM SURGE/TIDE
## 5878369913 4641188000
# Plotting property damage data
num_bars <- 10
# Left panel data: the event types with the highest mean property cost
barnames_mean <- names(head(sort(prop_mean_cost, decreasing = TRUE), num_bars))
mean_prop <- prop_mean_cost[barnames_mean]
# Right panel data: the event types with the highest total property cost
barnames_sum <- names(head(sort(prop_sum_cost, decreasing = TRUE), num_bars))
sum_prop <- prop_sum_cost[barnames_sum]
# Plot options: enlarged bottom margin and two panels side by side
par(mar = c(13.1, 4.1, 4.1, 2.1))
par(mfrow = c(1, 2))
# Bar plot of the most expensive weather types on average
barplot(
  mean_prop,
  names.arg = barnames_mean,
  las = 2,
  ylab = "Average Values"
)
# Bar plot of the weather types causing the most total expense
barplot(
  sum_prop,
  names.arg = barnames_sum,
  las = 2,
  ylab = "Total Values"
)
# Crop damage cost per event type
# Mean cost per recorded event: the most expensive events on average
crop_mean_cost <- with(USWeather_ecoc_anal, tapply(CROPDMGCOST, EVTYPE, mean))
# Total cost over all events: the events that cost the most overall
crop_sum_cost <- with(USWeather_ecoc_anal, tapply(CROPDMGCOST, EVTYPE, sum))
# The top 10 most expensive on average
crop_mean_ranked <- sort(crop_mean_cost, decreasing = TRUE)
head(crop_mean_ranked, 10)
## DROUGHT ICE STORM TROPICAL STORM FROST/FREEZE HEAT
## 5618241.25 2503546.11 983110.14 815265.28 523417.86
## EXCESSIVE HEAT FLOOD WILDFIRE HEAVY RAIN BLIZZARD
## 293445.77 223563.47 95369.76 62560.76 41213.68
# The top 10 by total money spent
crop_sum_ranked <- sort(crop_sum_cost, decreasing = TRUE)
head(crop_sum_ranked, 10)
## DROUGHT FLOOD ICE STORM HAIL
## 13972566000 5661968450 5022113500 3025954453
## FLASH FLOOD THUNDERSTORM WIND FROST/FREEZE HEAVY RAIN
## 1429805200 1159505108 1094086000 733399800
## HIGH WIND TROPICAL STORM
## 679291900 678346000
# Plotting crop damage data
num_bars <- 10
# Left panel data: the event types with the highest mean crop cost
barnames_mean <- names(head(sort(crop_mean_cost, decreasing = TRUE), num_bars))
mean_crop <- crop_mean_cost[barnames_mean]
# Right panel data: the event types with the highest total crop cost
barnames_sum <- names(head(sort(crop_sum_cost, decreasing = TRUE), num_bars))
sum_crop <- crop_sum_cost[barnames_sum]
# Plot options: enlarged bottom margin and two panels side by side
par(mar = c(13.1, 4.1, 4.1, 2.1))
par(mfrow = c(1, 2))
# Bar plot of the most expensive weather types on average
barplot(
  mean_crop,
  names.arg = barnames_mean,
  las = 2,
  ylab = "Average Values"
)
# Bar plot of the weather types causing the most total expense
barplot(
  sum_crop,
  names.arg = barnames_sum,
  las = 2,
  ylab = "Total Values"
)