Weather and population health on the USA: A data science approach

Synopsis

This study involves analyzing data from the NOAA Storm Database with the purpose of seeing how storms and other weather events affect people.

In order to perform this project we have used R for both analyzing data and plotting results.

Data Processing

We start unzipping dataset and loading CSV file into R.

library(R.utils)  # install with install.packages('R.utils')

## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.6.1 (2014-01-04) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.18.0 (2014-02-22) successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## 
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## 
## The following objects are masked from 'package:base':
## 
##     attach, detach, gc, load, save
## 
## R.utils v1.32.4 (2014-05-14) successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## 
## The following object is masked from 'package:utils':
## 
##     timestamp
## 
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, parse, warnings

bzipFile <- "./data/repdata-data-StormData.csv.bz2"
csvFile <- "./data/repdata-data-StormData.csv"
bunzip2(bzipFile, csvFile, remove = FALSE)

## Error: File already exists: ./data/repdata-data-StormData.csv


data <- read.csv(csvFile, as.is = TRUE, comment.char = "")

Once we have the data on our workspace we can see that data is quite large in size.

dim(data)

## [1] 902297     37

so our next step is to remove all records that don't contain useful information. We focus on those events that caused any kind of damage.

care.about <- (data$FATALITIES > 0) | (data$INJURIES > 0) | (data$PROPDMG > 
    0) | (data$CROPDMG > 0)
d <- data[care.about, ]
dim(d)

## [1] 254633     37

Before goint to deep analysis we have to continue cleaning and preparing data. We define a new event called BeginYear.

splits <- strsplit(d$BGN_DATE, " ")
table(unlist(lapply(splits, "[", 2)))

## 
## 0:00:00 
##  254633


d$BeginDate <- as.Date(unlist(lapply(splits, "[", 1)), "%m/%d/%Y")
d$BeginYear <- substr(d$BeginDate, 1, 4)
table(d$BeginYear)

## 
##  1950  1951  1952  1953  1954  1955  1956  1957  1958  1959  1960  1961 
##   201   241   233   421   491   441   428   824   543   505   556   627 
##  1962  1963  1964  1965  1966  1967  1968  1969  1970  1971  1972  1973 
##   411   380   594   724   423   683   524   458   517   714   579  1026 
##  1974  1975  1976  1977  1978  1979  1980  1981  1982  1983  1984  1985 
##   884   748   707   693   620   655   728   578  1128  1019   920   575 
##  1986  1987  1988  1989  1990  1991  1992  1993  1994  1995  1996  1997 
##   690   563   678   791   986   879   990  5838  9644 10457 10040 10322 
##  1998  1999  2000  2001  2002  2003  2004  2005  2006  2007  2008  2009 
## 14013 10609 11508 10298 10432 11015 10484 10014 11974 11953 17633 14434 
##  2010  2011 
## 16019 20570

After that we want to ensure that all event types are corrected written (all uppercase) and there are no multiple spaces.

d$EVTYPE <- toupper(d$EVTYPE)
d$EVTYPE <- gsub(" +", " ", d$EVTYPE)

Now let's look deeper to Tornado event, and check that there are no non-sense values, according to Wikipedia (visit http://en.wikipedia.org/wiki/Tornado_records) it has never been recorded a Tornado with length greater than 219 mph.

d$LENGTH[d$LENGTH > 219] <- NA

More Tornando cleaning

tornado.data <- (d$LENGTH > 0) | (d$WIDTH > 0) | (!is.na(d$F)) | grepl("TORNADO", 
    d$EVTYPE)

Now we have cleaned our dataset, we can show the Fujita tornado intensity scale, from 0 to 5.

table(d$F, useNA = "ifany")

## 
##      0      1      2      3      4      5   <NA> 
##   9974  15930   8933   2998    994    129 215675

We can also show all events related with Tornados:

table(d$EVTYPE[tornado.data])

## 
##           COLD AIR TORNADO                 DUST DEVIL 
##                          1                          1 
##                FLASH FLOOD                   GUSTNADO 
##                          1                          5 
##                       HAIL        SEVERE THUNDERSTORM 
##                          2                          1 
##       SEVERE THUNDERSTORMS         THUNDERSTORM WINDS 
##                          1                         11 
##                    TORNADO                 TORNADO F0 
##                      39944                          5 
##                 TORNADO F1                 TORNADO F2 
##                          4                          2 
##                 TORNADO F3                  TORNADOES 
##                          2                          2 
## TORNADOES, TSTM WIND, HAIL                    TORNDAO 
##                          1                          1 
##                  TSTM WIND                 WATERSPOUT 
##                          3                          8 
##         WATERSPOUT TORNADO                WATERSPOUT- 
##                          1                          4 
##         WATERSPOUT-TORNADO        WATERSPOUT/ TORNADO 
##                          1                          1 
##         WATERSPOUT/TORNADO               WINTER STORM 
##                          6                          1

Up to now we have dealt with Tornado entries, it's time to clean and prepare data for the rest of events. A quick overview of those events show that there are many events that can be group into a larger set of events, for example we have FLOOD, FLD and RECORD RAINFALL tags that can be group into a new FLOODING tag.

# Combine variations of fire
d$EVTYPE[grep("FIRE", d$EVTYPE)] <- "WILDFIRE"

# HEAT or EXCESSIVE HEAT
d$EVTYPE[grep("HEAT", d$EVTYPE)] <- "HEAT"

d$EVTYPE[grep("DENSE FOG", d$EVTYPE)] <- "FOG"

# Only notable winds are high wind
d$EVTYPE[d$EVTYPE == "WINDS" | d$EVTYPE == "WIND"] <- "HIGH WIND"

# Combine variations of freeze -- likely agriculture related
d$EVTYPE[grep("FROST|FREEZE", d$EVTYPE)] <- "FROST/FREEZE"

d$EVTYPE[grep("SLIDE", d$EVTYPE)] <- "LAND/MUD/ROCK SLIDES"

d$EVTYPE[grep("HURRICANE|TYPHOON|TROPICAL", d$EVTYPE)] <- "HURRICANE/TYPHOON/TROPICAL STORM"

d$EVTYPE[grep("SNOW|ICE|BLIZZARD|WINTER STORM|FREEZING|WINTRY MIX|SLEET", d$EVTYPE)] <- "WINTER STORM"

d$EVTYPE[grep("WINTER WEATHER|COLD|ICY|GLAZE|LOW TEMPERATURE|HYPOTHERMIA|HYPERTHERMIA|COLD TEMPERATURE|WINDCHILL", 
    d$EVTYPE)] <- "WINTER WEATHER"

# ~15 variations of 'High Wind' and 3 variations of 'strong wind' but after
# 'Snow' variations removed above
d$EVTYPE[grep("HIGH WIND|STRONG WIND|Gusty Winds|GUSTY WIND", d$EVTYPE)] <- "HIGH WIND"

# Change these 'false hits' for 'thunderstorm wind' to wind
d$EVTYPE <- sub("NON-THUNDERSTORM WIND|NON THUNDERSTORM WIND", "WIND", d$EVTYPE)

# Other non thunderstorm winds

# Standardize variations/spelling errors in 'Thunderstorm'
d$EVTYPE <- gsub("THUNDEERSTORM|THUNDERESTORM|THUNDERSTROM|THUDERSTORM|THUNERSTORM|THUNDERTORM|TUNDERSTORM", 
    "THUNDERSTORM", d$EVTYPE)

d$EVTYPE <- sub("TSTM WIND|STORM FORCE WINDS", "THUNDERSTORM WIND", d$EVTYPE)
# Manually reviewed ~44 variations that are combined here
d$EVTYPE[grep("THUNDERSTORMW|THUNDERSTORMWINDS|THUNDERSTORM WINS|TSTMW", d$EVTYPE)] <- "THUNDERSTORM WIND"
d$EVTYPE[grep("THUNDERSTORM WIND|THUNDERSTORMS WIND", d$EVTYPE)] <- "THUNDERSTORM WIND"

# Combine variations of fire
d$EVTYPE[grep("HEAVY SURF|HIGH SURF", d$EVTYPE)] <- "HEAVY/HIGH SURF"

d$EVTYPE[grep("RIP", d$EVTYPE)] <- "RIP CURRENT"

# Combine ~59 variations of FLOODING
d$EVTYPE[grep("FLOOD|FLD|RECORD RAINFALL", d$EVTYPE)] <- "FLOODING"

# Combine ~20 variations of HAIL
d$EVTYPE[grep("HAIL", d$EVTYPE)] <- "HAIL"

# Combine variations of LIGHTNING
d$EVTYPE[grep("LIGHTNING|LIGHTING|LIGNTNING", d$EVTYPE)] <- "LIGHTNING"

Finally in order to reduce the number of events we collapse those with low frequency under the tag “miscellaneous”.

event.types <- sort(table(d$EVTYPE), decreasing = TRUE)
event.types.under.50 <- names(event.types[event.types < 50])

d$EVTYPE[d$EVTYPE %in% event.types.under.50] <- "miscellaneous"

So we have now:

sort(table(d$EVTYPE), decreasing = TRUE)

## 
##                THUNDERSTORM WIND                          TORNADO 
##                           119790                            39944 
##                         FLOODING                             HAIL 
##                            33165                            26166 
##                        LIGHTNING                        HIGH WIND 
##                            13302                             9831 
##                     WINTER STORM                         WILDFIRE 
##                             4502                             1259 
##                       HEAVY RAIN                   WINTER WEATHER 
##                             1105                             1090 
##                             HEAT HURRICANE/TYPHOON/TROPICAL STORM 
##                              980                              689 
##                      RIP CURRENT                    miscellaneous 
##                              641                              410 
##                        AVALANCHE                          DROUGHT 
##                              268                              266 
##                  HEAVY/HIGH SURF             LAND/MUD/ROCK SLIDES 
##                              224                              212 
##                              FOG                      STORM SURGE 
##                              181                              177 
##                     FROST/FREEZE                       DUST STORM 
##                              155                              103 
##                       DUST DEVIL                   DRY MICROBURST 
##                               95                               78

Other minor changes into dataset are performed:

# scientific notation
ValidMultiplierLetter <- c("H", "K", "M", "B")

Multiplier <- c(100, 1000, 1e+06, 1e+09)
names(Multiplier) <- ValidMultiplierLetter

### Property Damage
d$PROPDMGEXP <- toupper(d$PROPDMGEXP)

Valid <- d$PROPDMGEXP %in% ValidMultiplierLetter
d$PROPDMGEXP[!Valid] <- ""

d$PROPDMG[!Valid] <- NA

d$PropCash <- NA
d$PropCash[Valid] <- d$PROPDMG[Valid] * Multiplier[d$PROPDMGEXP[Valid]]

### Crop Damage
d$CROPDMGEXP <- toupper(d$CROPDMGEXP)

Valid <- d$CROPDMGEXP %in% ValidMultiplierLetter
d$CROPDMGEXP[!Valid] <- ""

d$CROPDMG[!Valid] <- NA

d$CropCash <- NA
d$CropCash[Valid] <- d$CROPDMG[Valid] * Multiplier[d$CROPDMGEXP[Valid]]

Results

Experience has shown that weather and natural disasters produce a huge impact in people's life, we can use the NOAA dataset to quantify this impact using different metrics.

First we will show how different events have affected in terms of injuries and fatalities during the last years.

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

	EVTYPE	Events	Fatalities	Injuries
1	TORNADO	39944	5633	91346
2	HEAT	980	3138	9224
3	FLOODING	33165	1553	8681
4	LIGHTNING	13302	817	5232
5	THUNDERSTORM WIND	119790	754	9531
6	WINTER STORM	4502	597	5593
7	RIP CURRENT	641	572	529
8	WINTER WEATHER	1090	542	1110
9	HIGH WIND	9831	445	1883
10	AVALANCHE	268	224	170
11	HURRICANE/TYPHOON/TROPICAL STORM	689	201	1716
12	HEAVY/HIGH SURF	224	162	244
13	miscellaneous	410	138	352
14	HEAVY RAIN	1105	98	251
15	WILDFIRE	1259	90	1608
16	FOG	181	80	1076
17	LAND/MUD/ROCK SLIDES	212	44	55
18	DUST STORM	103	22	440
19	HAIL	26166	15	1371
20	STORM SURGE	177	13	38
21	DRY MICROBURST	78	3	28
22	DUST DEVIL	95	2	43
23	FROST/FREEZE	155	2	3
24	DROUGHT	266	0	4

The breakdown by year shows the method for collecting and reporting data has likely changed a lot over the years, with 20-time more data collected in 2011 than 20 years before.

## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, desc, failwith, id, mutate, summarise, summarize

	BeginYear	Events	Fatalities	Tornado	Heat	Flooding
1	1950	201	70	70	0	0
2	1951	241	34	34	0	0
3	1952	233	230	230	0	0
4	1953	421	519	519	0	0
5	1954	491	36	36	0	0
6	1955	441	129	129	0	0
7	1956	428	83	83	0	0
8	1957	824	193	193	0	0
9	1958	543	67	67	0	0
10	1959	505	58	58	0	0
11	1960	556	46	46	0	0
12	1961	627	52	52	0	0
13	1962	411	30	30	0	0
14	1963	380	31	31	0	0
15	1964	594	73	73	0	0
16	1965	724	301	301	0	0
17	1966	423	98	98	0	0
18	1967	683	114	114	0	0
19	1968	524	131	131	0	0
20	1969	458	66	66	0	0
21	1970	517	73	73	0	0
22	1971	714	159	159	0	0
23	1972	579	27	27	0	0
24	1973	1026	89	89	0	0
25	1974	884	366	366	0	0
26	1975	748	60	60	0	0
27	1976	707	44	44	0	0
28	1977	693	43	43	0	0
29	1978	620	53	53	0	0
30	1979	655	84	84	0	0
31	1980	728	28	28	0	0
32	1981	578	24	24	0	0
33	1982	1128	64	64	0	0
34	1983	1019	37	34	0	0
35	1984	920	160	122	0	0
36	1985	575	112	94	0	0
37	1986	690	51	15	0	0
38	1987	563	89	59	0	0
39	1988	678	55	32	0	0
40	1989	791	79	50	0	0
41	1990	986	95	53	0	0
42	1991	879	73	39	0	0
43	1992	990	54	39	0	0
44	1993	5838	298	28	7	53
45	1994	9644	344	48	44	86
46	1995	10457	1491	34	1051	77
47	1996	10040	542	26	36	133
48	1997	10322	601	68	81	118
49	1998	14013	687	130	173	136
50	1999	10609	908	94	502	68
51	2000	11508	477	41	158	38
52	2001	10298	469	40	166	50
53	2002	10432	498	55	167	46
54	2003	11015	443	54	36	88
55	2004	10484	370	35	6	82
56	2005	10014	469	38	158	43
57	2006	11974	599	67	252	78
58	2007	11953	421	81	44	87
59	2008	17633	488	129	42	79
60	2009	14434	333	21	33	56
61	2010	16019	425	45	83	108
62	2011	20570	1002	587	99	127

Let's plot the data from the table above:

library(RColorBrewer)  # brewer.pal

Colors <- brewer.pal(3, "Set1")

par(mfrow = c(4, 1), mar = c(4, 4, 2, 2))

plot(FatalitiesByYear$BeginYear, FatalitiesByYear$Fatalities, pch = 15, col = "black", 
    main = "Reported Weather-Related Fatalities by Year", xlab = "Year", ylab = "Fatalities", 
    ylim = c(0, 1500), las = 1)
grid(nx = NULL, ny = NA, col = "gray", lty = "solid")
mtext("All Weather Events", line = -2)

plot(FatalitiesByYear$BeginYear, FatalitiesByYear$Tornado, pch = 16, col = Colors[1], 
    main = "", xlab = "Year", ylab = "Fatalities", ylim = c(0, 1500), las = 1)
grid(nx = NULL, ny = NA, col = "gray", lty = "solid")
mtext("#1 Tornado Fatalities", line = -2)

plot(FatalitiesByYear$BeginYear, FatalitiesByYear$Heat, pch = 16, col = Colors[2], 
    main = "", xlab = "Year", ylab = "Fatalities", las = 1)
grid(nx = NULL, ny = NA, col = "gray", lty = "solid")
mtext("#2 Heat Fatalities", line = -2)

plot(FatalitiesByYear$BeginYear, FatalitiesByYear$Flooding, pch = 16, col = Colors[3], 
    main = "", xlab = "Year", ylab = "Fatalities", las = 1)
grid(nx = NULL, ny = NA, col = "gray", lty = "solid")
mtext("#3 Flooding Fatalities", line = -2)

mtext(expression("Source:  NOAA Storm Events Database (2011)"), BOTTOM <- 1, 
    adj = 0.025, line = -1, cex = 0.8, col = "blue", outer = TRUE)
mtext(format(Sys.time(), "%Y-%m-%d %H:%M"), BOTTOM, adj = 0.975, line = -1, 
    cex = 0.75, col = "blue", outer = TRUE)

plot of chunk unnamed-chunk-14