US Weather Investigation

Synopsis

This report analyses the effects of weather events on population health and the economy. The data used here was downloaded from the storm events database kept by the National Climatic Data Center in the United States. The data has been processed and analysed, and a number of exploratory plots have been produced.

Data Processing

I have produced three data frames of analytic data: one for analysing the effects on health, one for analysing the economic effects of crop damage, and one for analysing the economic effects of property damage.

To produce the analytic data I had to resolve two problems. Firstly, some of the event types in the data do not conform to the event types listed in the “National Weather Service” document. Secondly, some of the units for the costs of property and crop damage are not in a format I recognise (e.g. not k for thousands or m for millions).

To resolve the first problem I have listed the most common names that don’t conform and replaced each of them with the closest matching official type I could find (the mapping is listed clearly below). Non-official event types that occur fewer than 1000 times have been ignored. These make up a very small percentage of the data, which is reported below.

To resolve the second problem I have ignored cost units that I do not recognise (again, this is a very small percentage, which is reported below) and produced an additional column with all costs converted to plain dollars, with no units, so that comparisons can be made.

Code to resolve the first problem and produce analytic data for health:

# Load the raw storm events data.
USWeather_raw <- read.csv("repdata_data_StormData.csv.bz2")

# Finding the most common non-official names
# readLines() on a path opens and closes the file itself, so no connection
# is left open if a later step fails.
off_evtype_names <- toupper(readLines("EVTYPES.txt"))
off_logical <- USWeather_raw$EVTYPE %in% off_evtype_names
# 20 most common non-official names
# table() gives counts whether EVTYPE was read as a factor or as character
# (summary() only reports counts for factors).
unofficial_counts <- table(as.character(USWeather_raw$EVTYPE[!off_logical]))
head(sort(unofficial_counts, decreasing = TRUE), 20)
##            TSTM WIND   THUNDERSTORM WINDS     MARINE TSTM WIND 
##               219940                20843                 6175 
## URBAN/SML STREAM FLD           HIGH WINDS     WILD/FOREST FIRE 
##                 3392                 1533                 1457 
##   WINTER WEATHER/MIX       TSTM WIND/HAIL       FLASH FLOODING 
##                 1104                 1028                  682 
##         EXTREME COLD    FLOOD/FLASH FLOOD            LANDSLIDE 
##                  655                  624                  600 
##                 SNOW                  FOG                 WIND 
##                  587                  538                  340 
##         RIP CURRENTS          STORM SURGE        FREEZING RAIN 
##                  304                  261                  250 
##          URBAN FLOOD HEAVY SURF/HIGH SURF 
##                  249                  228
# Replacing the most common non-official EVTYPES with the official name.
# Every name with more than 1000 occurrences is replaced to create the
# analytical data.
#
# The names are matched exactly (whole value), not as substrings. With
# chained substring replacement, "TSTM WIND" was rewritten first, which turned
# "TSTM WIND/HAIL" into "THUNDERSTORM WIND/HAIL"; the later "TSTM WIND/HAIL"
# rule could then never match and those rows were dropped as non-official.
evtype_fixes <- c(
  "TSTM WIND"            = "THUNDERSTORM WIND",
  "THUNDERSTORM WINDS"   = "THUNDERSTORM WIND",
  "MARINE TSTM WIND"     = "MARINE THUNDERSTORM WIND",
  "URBAN/SML STREAM FLD" = "FLASH FLOOD",
  "HIGH WINDS"           = "HIGH WIND",
  "WILD/FOREST FIRE"     = "WILDFIRE",
  "WINTER WEATHER/MIX"   = "WINTER WEATHER",
  "TSTM WIND/HAIL"       = "THUNDERSTORM WIND"
)

USWeather_off <- USWeather_raw
# Work on a character copy so the replacement behaves the same for factor
# and character input; EVTYPE ends up as a character column.
evtype_chr <- as.character(USWeather_off$EVTYPE)
needs_fix <- evtype_chr %in% names(evtype_fixes)
evtype_chr[needs_fix] <- unname(evtype_fixes[evtype_chr[needs_fix]])
USWeather_off$EVTYPE <- evtype_chr

# Removing remaining non-official data from analytical data set

off_logical <- USWeather_off$EVTYPE %in% off_evtype_names
USWeather_off <- USWeather_off[off_logical, ]
size_raw <- dim(USWeather_raw)
size_off <- dim(USWeather_off)

# Finding percentage lost

percentage_lost <- 100 * (size_raw[1] - size_off[1]) / size_raw[1]
percentage_lost
# NOTE: the value recorded below (and the THUNDERSTORM WIND figures recorded
# later in this report) were produced before "TSTM WIND/HAIL" was mapped
# correctly; re-run the report to refresh them. The percentage lost is
# expected to be slightly lower now that those rows are kept.
## [1] 1.393333
# Subsetting to an economy and population health data frame
# Health data
poph_cols <- c("EVTYPE", "FATALITIES", "INJURIES")
USWeather_poph_anal <- subset(USWeather_off, select = poph_cols)
# Economic data
ecop_cols <- c("EVTYPE", "PROPDMG", "PROPDMGEXP")
ecoc_cols <- c("EVTYPE", "CROPDMG", "CROPDMGEXP")
USWeather_ecoc <- subset(USWeather_off, select = ecoc_cols)
USWeather_ecop <- subset(USWeather_off, select = ecop_cols)

Code to resolve the second problem and produce the analytic data for the crop and property economic effects:

# Recognised damage-exponent codes and the dollar multiplier for each one:
# blank = 1, h/H = 100, k/K = 1000, m/M = 1e6, b/B = 1e9.
KMB <- c("", "h", "H", "k", "K", "m", "M", "b", "B")
KMB_cost <- c(1, 100, 100, 1000, 1000, 1000000, 1000000, 1000000000, 1000000000)

# Look up the dollar multiplier for a vector of exponent codes.
#
# exp_codes:   character (or factor) vector of exponent codes.
# codes:       the recognised codes.
# multipliers: numeric multipliers, parallel to `codes`.
# Returns a numeric vector the same length as `exp_codes`; a code that is
# not in `codes` gives NA.
#
# A single vectorised lookup replaces the earlier chain of regex
# substitutions, which converted the numbers to text (e.g. "1e+06") and back.
damage_multiplier <- function(exp_codes, codes = KMB, multipliers = KMB_cost) {
  multipliers[match(as.character(exp_codes), codes)]
}

# Analysing property damage
# Removing entries which aren't blank or H,K,M,B from property damage.
size_ecop <- dim(USWeather_ecop)
USWeather_ecop <- USWeather_ecop[USWeather_ecop$PROPDMGEXP %in% KMB, ]
# Working out percentage data that has been lost
size_ecop_trim <- dim(USWeather_ecop)
percentage_lost <- 100 * (size_ecop[1] - size_ecop_trim[1]) / size_ecop[1]
percentage_lost
## [1] 0.03383068
# Converting units to numbers (PROPDMGUNITS is now a numeric multiplier)
USWeather_ecop$PROPDMGUNITS <- damage_multiplier(USWeather_ecop$PROPDMGEXP)
# Cost in plain dollars
USWeather_ecop$PROPDMGCOST <- USWeather_ecop$PROPDMG * USWeather_ecop$PROPDMGUNITS
USWeather_ecop_anal <- USWeather_ecop

# Analysing crop damage
# Removing entries which aren't blank or H,K,M,B from crop damage.
size_ecoc <- dim(USWeather_ecoc)
USWeather_ecoc <- USWeather_ecoc[USWeather_ecoc$CROPDMGEXP %in% KMB, ]
# Working out percentage data that has been lost
size_ecoc_trim <- dim(USWeather_ecoc)
percentage_lost <- 100 * (size_ecoc[1] - size_ecoc_trim[1]) / size_ecoc[1]
percentage_lost
## [1] 0.002585068
# Converting units to numbers (CROPDMGUNITS is now a numeric multiplier)
USWeather_ecoc$CROPDMGUNITS <- damage_multiplier(USWeather_ecoc$CROPDMGEXP)
# Cost in plain dollars
USWeather_ecoc$CROPDMGCOST <- USWeather_ecoc$CROPDMG * USWeather_ecoc$CROPDMGUNITS
USWeather_ecoc_anal <- USWeather_ecoc

Results

Analysing Health Effects

I have found and listed the “top ten” weather types that cause the highest average number of fatalities and injuries per event, and also the “top ten” weather types that cause the highest total number of fatalities and injuries. There is also a bar chart to summarise this information. I have limited the y-axis in the second plot because the number of injuries caused by tornadoes is so large.

# Health impact per event type ----
# The mean is the harm of a typical single event; the sum is the overall
# toll across every recorded event of that type.
mean_fatalities <- with(USWeather_poph_anal, tapply(FATALITIES, EVTYPE, mean))
sum_fatalities <- with(USWeather_poph_anal, tapply(FATALITIES, EVTYPE, sum))
mean_injuries <- with(USWeather_poph_anal, tapply(INJURIES, EVTYPE, mean))
sum_injuries <- with(USWeather_poph_anal, tapply(INJURIES, EVTYPE, sum))

# Rank the fatality figures once; the rankings are reused for the plots.
fatal_by_mean <- sort(mean_fatalities, decreasing = TRUE)
fatal_by_sum <- sort(sum_fatalities, decreasing = TRUE)

# Fatalities: the ten most dangerous weather types (per event)
head(fatal_by_mean, 10)
##                 TSUNAMI                    HEAT          EXCESSIVE HEAT 
##               1.6500000               1.2216428               1.1340882 
##             RIP CURRENT               AVALANCHE      MARINE STRONG WIND 
##               0.7829787               0.5803109               0.2916667 
##         COLD/WIND CHILL               HIGH SURF EXTREME COLD/WIND CHILL 
##               0.1762523               0.1393103               0.1247505 
##                 TORNADO 
##               0.0928741
# Fatalities: the ten weather types with the most deaths in total
head(fatal_by_sum, 10)
##           TORNADO    EXCESSIVE HEAT       FLASH FLOOD              HEAT 
##              5633              1903              1006               937 
##         LIGHTNING THUNDERSTORM WIND             FLOOD       RIP CURRENT 
##               816               701               470               368 
##         HIGH WIND         AVALANCHE 
##               283               224
# Injuries: the ten most dangerous weather types (per event)
head(sort(mean_injuries, decreasing = TRUE), 10)
##            TSUNAMI     EXCESSIVE HEAT               HEAT 
##          6.4500000          3.8885578          2.7379400 
##            TORNADO         DUST STORM          ICE STORM 
##          1.5060674          1.0304450          0.9845464 
##        RIP CURRENT     TROPICAL STORM MARINE STRONG WIND 
##          0.4936170          0.4927536          0.4583333 
##          AVALANCHE 
##          0.4404145
# Injuries: the ten weather types with the most injuries in total
head(sort(sum_injuries, decreasing = TRUE), 10)
##           TORNADO THUNDERSTORM WIND             FLOOD    EXCESSIVE HEAT 
##             91346              9353              6789              6525 
##         LIGHTNING              HEAT         ICE STORM       FLASH FLOOD 
##              5230              2100              1975              1856 
##          WILDFIRE         HIGH WIND 
##              1456              1439

# Plot data ----
# Both charts pick their event types by fatalities and then show the
# injuries for those same event types alongside.
num_bars <- 10

# Per-event averages: row 1 = fatalities, row 2 = injuries
barnames_mean <- names(head(fatal_by_mean, num_bars))
mean_fatal <- mean_fatalities[barnames_mean]
mean_inj <- mean_injuries[barnames_mean]
plot_matrix_mean <- matrix(
  c(mean_fatal, mean_inj),
  nrow = 2,
  ncol = num_bars,
  byrow = TRUE
)

# Totals: row 1 = fatalities, row 2 = injuries
barnames_sum <- names(head(fatal_by_sum, num_bars))
sum_fatal <- sum_fatalities[barnames_sum]
sum_inj <- sum_injuries[barnames_sum]
plot_matrix_sum <- matrix(
  c(sum_fatal, sum_inj),
  nrow = 2,
  ncol = num_bars,
  byrow = TRUE
)

# Plots ----
# Large bottom margin leaves room for the vertical event-type labels;
# the two charts sit side by side.
par(mar = c(13.1, 4.1, 4.1, 2.1), mfrow = c(1, 2))
colours <- c("blue", "red")

# Left: most dangerous weather types on average
barplot(
  plot_matrix_mean,
  names.arg = barnames_mean,
  col = colours,
  las = 2,
  ylab = "Average Values"
)
legend("topright", legend = c("Fatalities", "Injuries"), fill = colours)

# Right: weather types causing the most fatalities in total
# (y-axis capped at 12000)
barplot(
  plot_matrix_sum,
  names.arg = barnames_sum,
  col = colours,
  las = 2,
  ylim = c(0, 12000),
  ylab = "Total Values"
)
legend("topleft", legend = c("Fatalities", "Injuries"), fill = colours)

Analysing Economic Effects

I have found and listed the “top ten” weather types that cause the highest cost on average and also the “top ten” weather types that cause the highest total cost, firstly for property damage and then for crop damage. There are also bar charts to summarise this information.

Property Damage

# Property damage cost per event type ----
# Mean = cost of a typical single event; sum = total cost over all events.
prop_mean_cost <- with(USWeather_ecop_anal, tapply(PROPDMGCOST, EVTYPE, mean))
prop_sum_cost <- with(USWeather_ecop_anal, tapply(PROPDMGCOST, EVTYPE, sum))

# Rank once; the rankings are reused for the plots below.
prop_by_mean <- sort(prop_mean_cost, decreasing = TRUE)
prop_by_sum <- sort(prop_sum_cost, decreasing = TRUE)

# The ten most expensive event types on average
head(prop_by_mean, 10)
## STORM SURGE/TIDE   TROPICAL STORM          TSUNAMI            FLOOD 
##       31359378.4       11165058.8        7203100.0        5712051.7 
##        ICE STORM         WILDFIRE          TORNADO     WINTER STORM 
##        1967545.0        1841380.6         939061.2         585068.0 
##          DROUGHT    COASTAL FLOOD 
##         420460.6         365639.3
# The ten event types with the highest total cost
head(prop_by_sum, 10)
##             FLOOD           TORNADO       FLASH FLOOD              HAIL 
##      144657709807       56937160483       16199121367       15732267277 
## THUNDERSTORM WIND          WILDFIRE    TROPICAL STORM      WINTER STORM 
##        9704063633        7766943500        7703890550        6688497250 
##         HIGH WIND  STORM SURGE/TIDE 
##        5878369913        4641188000

# Plot data ----
num_bars <- 10
# Highest average cost
barnames_mean <- names(head(prop_by_mean, num_bars))
mean_prop <- prop_mean_cost[barnames_mean]
# Highest total cost
barnames_sum <- names(head(prop_by_sum, num_bars))
sum_prop <- prop_sum_cost[barnames_sum]

# Plots ----
# Large bottom margin for the vertical labels; two charts side by side.
par(mar = c(13.1, 4.1, 4.1, 2.1), mfrow = c(1, 2))

# Left: most expensive weather types on average
barplot(
  mean_prop,
  names.arg = barnames_mean,
  las = 2,
  ylab = "Average Values"
)
# Right: weather types causing the most expense in total
barplot(
  sum_prop,
  names.arg = barnames_sum,
  las = 2,
  ylab = "Total Values"
)

Crop Damage

# Crop damage cost per event type ----
# Mean = cost of a typical single event; sum = total cost over all events.
crop_mean_cost <- with(USWeather_ecoc_anal, tapply(CROPDMGCOST, EVTYPE, mean))
crop_sum_cost <- with(USWeather_ecoc_anal, tapply(CROPDMGCOST, EVTYPE, sum))

# Rank once; the rankings are reused for the plots below.
crop_by_mean <- sort(crop_mean_cost, decreasing = TRUE)
crop_by_sum <- sort(crop_sum_cost, decreasing = TRUE)

# The ten most expensive event types on average
head(crop_by_mean, 10)
##        DROUGHT      ICE STORM TROPICAL STORM   FROST/FREEZE           HEAT 
##     5618241.25     2503546.11      983110.14      815265.28      523417.86 
## EXCESSIVE HEAT          FLOOD       WILDFIRE     HEAVY RAIN       BLIZZARD 
##      293445.77      223563.47       95369.76       62560.76       41213.68
# The ten event types with the highest total cost
head(crop_by_sum, 10)
##           DROUGHT             FLOOD         ICE STORM              HAIL 
##       13972566000        5661968450        5022113500        3025954453 
##       FLASH FLOOD THUNDERSTORM WIND      FROST/FREEZE        HEAVY RAIN 
##        1429805200        1159505108        1094086000         733399800 
##         HIGH WIND    TROPICAL STORM 
##         679291900         678346000

# Plot data ----
num_bars <- 10
# Highest average cost
barnames_mean <- names(head(crop_by_mean, num_bars))
mean_crop <- crop_mean_cost[barnames_mean]
# Highest total cost
barnames_sum <- names(head(crop_by_sum, num_bars))
sum_crop <- crop_sum_cost[barnames_sum]

# Plots ----
# Large bottom margin for the vertical labels; two charts side by side.
par(mar = c(13.1, 4.1, 4.1, 2.1), mfrow = c(1, 2))

# Left: most expensive weather types on average
barplot(
  mean_crop,
  names.arg = barnames_mean,
  las = 2,
  ylab = "Average Values"
)
# Right: weather types causing the most expense in total
barplot(
  sum_crop,
  names.arg = barnames_sum,
  las = 2,
  ylab = "Total Values"
)