Weather and population health on the USA: A data science approach

Synopsis

This study involves analyzing data from the NOAA Storm Database with the purpose of seeing how storms and other weather events affect people.

In order to perform this project we have used R for both analyzing data and plotting results.

Data Processing

We start unzipping dataset and loading CSV file into R.

library(R.utils)  # install with install.packages('R.utils')
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.6.1 (2014-01-04) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.18.0 (2014-02-22) successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## 
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## 
## The following objects are masked from 'package:base':
## 
##     attach, detach, gc, load, save
## 
## R.utils v1.32.4 (2014-05-14) successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## 
## The following object is masked from 'package:utils':
## 
##     timestamp
## 
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, parse, warnings
bzipFile <- "./data/repdata-data-StormData.csv.bz2"
csvFile <- "./data/repdata-data-StormData.csv"
bunzip2(bzipFile, csvFile, remove = FALSE)
## Error: File already exists: ./data/repdata-data-StormData.csv

data <- read.csv(csvFile, as.is = TRUE, comment.char = "")

Once we have the data on our workspace we can see that data is quite large in size.

dim(data)
## [1] 902297     37

so our next step is to remove all records that don't contain useful information. We focus on those events that caused any kind of damage.

care.about <- (data$FATALITIES > 0) | (data$INJURIES > 0) | (data$PROPDMG > 
    0) | (data$CROPDMG > 0)
d <- data[care.about, ]
dim(d)
## [1] 254633     37

Before goint to deep analysis we have to continue cleaning and preparing data. We define a new event called BeginYear.

splits <- strsplit(d$BGN_DATE, " ")
table(unlist(lapply(splits, "[", 2)))
## 
## 0:00:00 
##  254633

d$BeginDate <- as.Date(unlist(lapply(splits, "[", 1)), "%m/%d/%Y")
d$BeginYear <- substr(d$BeginDate, 1, 4)
table(d$BeginYear)
## 
##  1950  1951  1952  1953  1954  1955  1956  1957  1958  1959  1960  1961 
##   201   241   233   421   491   441   428   824   543   505   556   627 
##  1962  1963  1964  1965  1966  1967  1968  1969  1970  1971  1972  1973 
##   411   380   594   724   423   683   524   458   517   714   579  1026 
##  1974  1975  1976  1977  1978  1979  1980  1981  1982  1983  1984  1985 
##   884   748   707   693   620   655   728   578  1128  1019   920   575 
##  1986  1987  1988  1989  1990  1991  1992  1993  1994  1995  1996  1997 
##   690   563   678   791   986   879   990  5838  9644 10457 10040 10322 
##  1998  1999  2000  2001  2002  2003  2004  2005  2006  2007  2008  2009 
## 14013 10609 11508 10298 10432 11015 10484 10014 11974 11953 17633 14434 
##  2010  2011 
## 16019 20570

After that we want to ensure that all event types are corrected written (all uppercase) and there are no multiple spaces.

d$EVTYPE <- toupper(d$EVTYPE)
d$EVTYPE <- gsub(" +", " ", d$EVTYPE)

Now let's look deeper to Tornado event, and check that there are no non-sense values, according to Wikipedia (visit http://en.wikipedia.org/wiki/Tornado_records) it has never been recorded a Tornado with length greater than 219 mph.

d$LENGTH[d$LENGTH > 219] <- NA

More Tornando cleaning

tornado.data <- (d$LENGTH > 0) | (d$WIDTH > 0) | (!is.na(d$F)) | grepl("TORNADO", 
    d$EVTYPE)

Now we have cleaned our dataset, we can show the Fujita tornado intensity scale, from 0 to 5.

table(d$F, useNA = "ifany")
## 
##      0      1      2      3      4      5   <NA> 
##   9974  15930   8933   2998    994    129 215675

We can also show all events related with Tornados:

table(d$EVTYPE[tornado.data])
## 
##           COLD AIR TORNADO                 DUST DEVIL 
##                          1                          1 
##                FLASH FLOOD                   GUSTNADO 
##                          1                          5 
##                       HAIL        SEVERE THUNDERSTORM 
##                          2                          1 
##       SEVERE THUNDERSTORMS         THUNDERSTORM WINDS 
##                          1                         11 
##                    TORNADO                 TORNADO F0 
##                      39944                          5 
##                 TORNADO F1                 TORNADO F2 
##                          4                          2 
##                 TORNADO F3                  TORNADOES 
##                          2                          2 
## TORNADOES, TSTM WIND, HAIL                    TORNDAO 
##                          1                          1 
##                  TSTM WIND                 WATERSPOUT 
##                          3                          8 
##         WATERSPOUT TORNADO                WATERSPOUT- 
##                          1                          4 
##         WATERSPOUT-TORNADO        WATERSPOUT/ TORNADO 
##                          1                          1 
##         WATERSPOUT/TORNADO               WINTER STORM 
##                          6                          1

Up to now we have dealt with Tornado entries, it's time to clean and prepare data for the rest of events. A quick overview of those events show that there are many events that can be group into a larger set of events, for example we have FLOOD, FLD and RECORD RAINFALL tags that can be group into a new FLOODING tag.

# Combine variations of fire
d$EVTYPE[grep("FIRE", d$EVTYPE)] <- "WILDFIRE"

# HEAT or EXCESSIVE HEAT
d$EVTYPE[grep("HEAT", d$EVTYPE)] <- "HEAT"

d$EVTYPE[grep("DENSE FOG", d$EVTYPE)] <- "FOG"

# Only notable winds are high wind
d$EVTYPE[d$EVTYPE == "WINDS" | d$EVTYPE == "WIND"] <- "HIGH WIND"

# Combine variations of freeze -- likely agriculture related
d$EVTYPE[grep("FROST|FREEZE", d$EVTYPE)] <- "FROST/FREEZE"

d$EVTYPE[grep("SLIDE", d$EVTYPE)] <- "LAND/MUD/ROCK SLIDES"

d$EVTYPE[grep("HURRICANE|TYPHOON|TROPICAL", d$EVTYPE)] <- "HURRICANE/TYPHOON/TROPICAL STORM"

d$EVTYPE[grep("SNOW|ICE|BLIZZARD|WINTER STORM|FREEZING|WINTRY MIX|SLEET", d$EVTYPE)] <- "WINTER STORM"

d$EVTYPE[grep("WINTER WEATHER|COLD|ICY|GLAZE|LOW TEMPERATURE|HYPOTHERMIA|HYPERTHERMIA|COLD TEMPERATURE|WINDCHILL", 
    d$EVTYPE)] <- "WINTER WEATHER"

# ~15 variations of 'High Wind' and 3 variations of 'strong wind' but after
# 'Snow' variations removed above
d$EVTYPE[grep("HIGH WIND|STRONG WIND|Gusty Winds|GUSTY WIND", d$EVTYPE)] <- "HIGH WIND"

# Change these 'false hits' for 'thunderstorm wind' to wind
d$EVTYPE <- sub("NON-THUNDERSTORM WIND|NON THUNDERSTORM WIND", "WIND", d$EVTYPE)

# Other non thunderstorm winds

# Standardize variations/spelling errors in 'Thunderstorm'
d$EVTYPE <- gsub("THUNDEERSTORM|THUNDERESTORM|THUNDERSTROM|THUDERSTORM|THUNERSTORM|THUNDERTORM|TUNDERSTORM", 
    "THUNDERSTORM", d$EVTYPE)

d$EVTYPE <- sub("TSTM WIND|STORM FORCE WINDS", "THUNDERSTORM WIND", d$EVTYPE)
# Manually reviewed ~44 variations that are combined here
d$EVTYPE[grep("THUNDERSTORMW|THUNDERSTORMWINDS|THUNDERSTORM WINS|TSTMW", d$EVTYPE)] <- "THUNDERSTORM WIND"
d$EVTYPE[grep("THUNDERSTORM WIND|THUNDERSTORMS WIND", d$EVTYPE)] <- "THUNDERSTORM WIND"

# Combine variations of fire
d$EVTYPE[grep("HEAVY SURF|HIGH SURF", d$EVTYPE)] <- "HEAVY/HIGH SURF"

d$EVTYPE[grep("RIP", d$EVTYPE)] <- "RIP CURRENT"

# Combine ~59 variations of FLOODING
d$EVTYPE[grep("FLOOD|FLD|RECORD RAINFALL", d$EVTYPE)] <- "FLOODING"

# Combine ~20 variations of HAIL
d$EVTYPE[grep("HAIL", d$EVTYPE)] <- "HAIL"

# Combine variations of LIGHTNING
d$EVTYPE[grep("LIGHTNING|LIGHTING|LIGNTNING", d$EVTYPE)] <- "LIGHTNING"

Finally in order to reduce the number of events we collapse those with low frequency under the tag “miscellaneous”.

event.types <- sort(table(d$EVTYPE), decreasing = TRUE)
event.types.under.50 <- names(event.types[event.types < 50])

d$EVTYPE[d$EVTYPE %in% event.types.under.50] <- "miscellaneous"

So we have now:

sort(table(d$EVTYPE), decreasing = TRUE)
## 
##                THUNDERSTORM WIND                          TORNADO 
##                           119790                            39944 
##                         FLOODING                             HAIL 
##                            33165                            26166 
##                        LIGHTNING                        HIGH WIND 
##                            13302                             9831 
##                     WINTER STORM                         WILDFIRE 
##                             4502                             1259 
##                       HEAVY RAIN                   WINTER WEATHER 
##                             1105                             1090 
##                             HEAT HURRICANE/TYPHOON/TROPICAL STORM 
##                              980                              689 
##                      RIP CURRENT                    miscellaneous 
##                              641                              410 
##                        AVALANCHE                          DROUGHT 
##                              268                              266 
##                  HEAVY/HIGH SURF             LAND/MUD/ROCK SLIDES 
##                              224                              212 
##                              FOG                      STORM SURGE 
##                              181                              177 
##                     FROST/FREEZE                       DUST STORM 
##                              155                              103 
##                       DUST DEVIL                   DRY MICROBURST 
##                               95                               78

Other minor changes into dataset are performed:

# scientific notation
ValidMultiplierLetter <- c("H", "K", "M", "B")

Multiplier <- c(100, 1000, 1e+06, 1e+09)
names(Multiplier) <- ValidMultiplierLetter

### Property Damage
d$PROPDMGEXP <- toupper(d$PROPDMGEXP)

Valid <- d$PROPDMGEXP %in% ValidMultiplierLetter
d$PROPDMGEXP[!Valid] <- ""

d$PROPDMG[!Valid] <- NA

d$PropCash <- NA
d$PropCash[Valid] <- d$PROPDMG[Valid] * Multiplier[d$PROPDMGEXP[Valid]]

### Crop Damage
d$CROPDMGEXP <- toupper(d$CROPDMGEXP)

Valid <- d$CROPDMGEXP %in% ValidMultiplierLetter
d$CROPDMGEXP[!Valid] <- ""

d$CROPDMG[!Valid] <- NA

d$CropCash <- NA
d$CropCash[Valid] <- d$CROPDMG[Valid] * Multiplier[d$CROPDMGEXP[Valid]]

Results

Experience has shown that weather and natural disasters produce a huge impact in people's life, we can use the NOAA dataset to quantify this impact using different metrics.

First we will show how different events have affected in terms of injuries and fatalities during the last years.

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
EVTYPE Events Fatalities Injuries
1 TORNADO 39944 5633 91346
2 HEAT 980 3138 9224
3 FLOODING 33165 1553 8681
4 LIGHTNING 13302 817 5232
5 THUNDERSTORM WIND 119790 754 9531
6 WINTER STORM 4502 597 5593
7 RIP CURRENT 641 572 529
8 WINTER WEATHER 1090 542 1110
9 HIGH WIND 9831 445 1883
10 AVALANCHE 268 224 170
11 HURRICANE/TYPHOON/TROPICAL STORM 689 201 1716
12 HEAVY/HIGH SURF 224 162 244
13 miscellaneous 410 138 352
14 HEAVY RAIN 1105 98 251
15 WILDFIRE 1259 90 1608
16 FOG 181 80 1076
17 LAND/MUD/ROCK SLIDES 212 44 55
18 DUST STORM 103 22 440
19 HAIL 26166 15 1371
20 STORM SURGE 177 13 38
21 DRY MICROBURST 78 3 28
22 DUST DEVIL 95 2 43
23 FROST/FREEZE 155 2 3
24 DROUGHT 266 0 4

The breakdown by year shows the method for collecting and reporting data has likely changed a lot over the years, with 20-time more data collected in 2011 than 20 years before.

## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, desc, failwith, id, mutate, summarise, summarize
BeginYear Events Fatalities Tornado Heat Flooding
1 1950 201 70 70 0 0
2 1951 241 34 34 0 0
3 1952 233 230 230 0 0
4 1953 421 519 519 0 0
5 1954 491 36 36 0 0
6 1955 441 129 129 0 0
7 1956 428 83 83 0 0
8 1957 824 193 193 0 0
9 1958 543 67 67 0 0
10 1959 505 58 58 0 0
11 1960 556 46 46 0 0
12 1961 627 52 52 0 0
13 1962 411 30 30 0 0
14 1963 380 31 31 0 0
15 1964 594 73 73 0 0
16 1965 724 301 301 0 0
17 1966 423 98 98 0 0
18 1967 683 114 114 0 0
19 1968 524 131 131 0 0
20 1969 458 66 66 0 0
21 1970 517 73 73 0 0
22 1971 714 159 159 0 0
23 1972 579 27 27 0 0
24 1973 1026 89 89 0 0
25 1974 884 366 366 0 0
26 1975 748 60 60 0 0
27 1976 707 44 44 0 0
28 1977 693 43 43 0 0
29 1978 620 53 53 0 0
30 1979 655 84 84 0 0
31 1980 728 28 28 0 0
32 1981 578 24 24 0 0
33 1982 1128 64 64 0 0
34 1983 1019 37 34 0 0
35 1984 920 160 122 0 0
36 1985 575 112 94 0 0
37 1986 690 51 15 0 0
38 1987 563 89 59 0 0
39 1988 678 55 32 0 0
40 1989 791 79 50 0 0
41 1990 986 95 53 0 0
42 1991 879 73 39 0 0
43 1992 990 54 39 0 0
44 1993 5838 298 28 7 53
45 1994 9644 344 48 44 86
46 1995 10457 1491 34 1051 77
47 1996 10040 542 26 36 133
48 1997 10322 601 68 81 118
49 1998 14013 687 130 173 136
50 1999 10609 908 94 502 68
51 2000 11508 477 41 158 38
52 2001 10298 469 40 166 50
53 2002 10432 498 55 167 46
54 2003 11015 443 54 36 88
55 2004 10484 370 35 6 82
56 2005 10014 469 38 158 43
57 2006 11974 599 67 252 78
58 2007 11953 421 81 44 87
59 2008 17633 488 129 42 79
60 2009 14434 333 21 33 56
61 2010 16019 425 45 83 108
62 2011 20570 1002 587 99 127

Let's plot the data from the table above:

library(RColorBrewer)  # brewer.pal

Colors <- brewer.pal(3, "Set1")

par(mfrow = c(4, 1), mar = c(4, 4, 2, 2))

plot(FatalitiesByYear$BeginYear, FatalitiesByYear$Fatalities, pch = 15, col = "black", 
    main = "Reported Weather-Related Fatalities by Year", xlab = "Year", ylab = "Fatalities", 
    ylim = c(0, 1500), las = 1)
grid(nx = NULL, ny = NA, col = "gray", lty = "solid")
mtext("All Weather Events", line = -2)

plot(FatalitiesByYear$BeginYear, FatalitiesByYear$Tornado, pch = 16, col = Colors[1], 
    main = "", xlab = "Year", ylab = "Fatalities", ylim = c(0, 1500), las = 1)
grid(nx = NULL, ny = NA, col = "gray", lty = "solid")
mtext("#1 Tornado Fatalities", line = -2)

plot(FatalitiesByYear$BeginYear, FatalitiesByYear$Heat, pch = 16, col = Colors[2], 
    main = "", xlab = "Year", ylab = "Fatalities", las = 1)
grid(nx = NULL, ny = NA, col = "gray", lty = "solid")
mtext("#2 Heat Fatalities", line = -2)

plot(FatalitiesByYear$BeginYear, FatalitiesByYear$Flooding, pch = 16, col = Colors[3], 
    main = "", xlab = "Year", ylab = "Fatalities", las = 1)
grid(nx = NULL, ny = NA, col = "gray", lty = "solid")
mtext("#3 Flooding Fatalities", line = -2)

mtext(expression("Source:  NOAA Storm Events Database (2011)"), BOTTOM <- 1, 
    adj = 0.025, line = -1, cex = 0.8, col = "blue", outer = TRUE)
mtext(format(Sys.time(), "%Y-%m-%d %H:%M"), BOTTOM, adj = 0.975, line = -1, 
    cex = 0.75, col = "blue", outer = TRUE)

plot of chunk unnamed-chunk-14