This study involves analyzing data from the NOAA Storm Database with the purpose of seeing how storms and other weather events affect people.
In order to perform this project we have used R for both analyzing data and plotting results.
We start unzipping dataset and loading CSV file into R.
library(R.utils) # install with install.packages('R.utils')
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.6.1 (2014-01-04) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.18.0 (2014-02-22) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
##
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
##
## The following objects are masked from 'package:base':
##
## attach, detach, gc, load, save
##
## R.utils v1.32.4 (2014-05-14) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
##
## The following object is masked from 'package:utils':
##
## timestamp
##
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, inherits, isOpen, parse, warnings
bzipFile <- "./data/repdata-data-StormData.csv.bz2"
csvFile <- "./data/repdata-data-StormData.csv"
bunzip2(bzipFile, csvFile, remove = FALSE)
## Error: File already exists: ./data/repdata-data-StormData.csv
data <- read.csv(csvFile, as.is = TRUE, comment.char = "")
Once we have the data on our workspace we can see that data is quite large in size.
dim(data)
## [1] 902297 37
so our next step is to remove all records that don't contain useful information. We focus on those events that caused any kind of damage.
care.about <- (data$FATALITIES > 0) | (data$INJURIES > 0) | (data$PROPDMG >
0) | (data$CROPDMG > 0)
d <- data[care.about, ]
dim(d)
## [1] 254633 37
Before goint to deep analysis we have to continue cleaning and preparing data. We define a new event called BeginYear.
splits <- strsplit(d$BGN_DATE, " ")
table(unlist(lapply(splits, "[", 2)))
##
## 0:00:00
## 254633
d$BeginDate <- as.Date(unlist(lapply(splits, "[", 1)), "%m/%d/%Y")
d$BeginYear <- substr(d$BeginDate, 1, 4)
table(d$BeginYear)
##
## 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961
## 201 241 233 421 491 441 428 824 543 505 556 627
## 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973
## 411 380 594 724 423 683 524 458 517 714 579 1026
## 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985
## 884 748 707 693 620 655 728 578 1128 1019 920 575
## 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997
## 690 563 678 791 986 879 990 5838 9644 10457 10040 10322
## 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
## 14013 10609 11508 10298 10432 11015 10484 10014 11974 11953 17633 14434
## 2010 2011
## 16019 20570
After that we want to ensure that all event types are corrected written (all uppercase) and there are no multiple spaces.
d$EVTYPE <- toupper(d$EVTYPE)
d$EVTYPE <- gsub(" +", " ", d$EVTYPE)
Now let's look deeper to Tornado event, and check that there are no non-sense values, according to Wikipedia (visit http://en.wikipedia.org/wiki/Tornado_records) it has never been recorded a Tornado with length greater than 219 mph.
d$LENGTH[d$LENGTH > 219] <- NA
More Tornando cleaning
tornado.data <- (d$LENGTH > 0) | (d$WIDTH > 0) | (!is.na(d$F)) | grepl("TORNADO",
d$EVTYPE)
Now we have cleaned our dataset, we can show the Fujita tornado intensity scale, from 0 to 5.
table(d$F, useNA = "ifany")
##
## 0 1 2 3 4 5 <NA>
## 9974 15930 8933 2998 994 129 215675
We can also show all events related with Tornados:
table(d$EVTYPE[tornado.data])
##
## COLD AIR TORNADO DUST DEVIL
## 1 1
## FLASH FLOOD GUSTNADO
## 1 5
## HAIL SEVERE THUNDERSTORM
## 2 1
## SEVERE THUNDERSTORMS THUNDERSTORM WINDS
## 1 11
## TORNADO TORNADO F0
## 39944 5
## TORNADO F1 TORNADO F2
## 4 2
## TORNADO F3 TORNADOES
## 2 2
## TORNADOES, TSTM WIND, HAIL TORNDAO
## 1 1
## TSTM WIND WATERSPOUT
## 3 8
## WATERSPOUT TORNADO WATERSPOUT-
## 1 4
## WATERSPOUT-TORNADO WATERSPOUT/ TORNADO
## 1 1
## WATERSPOUT/TORNADO WINTER STORM
## 6 1
Up to now we have dealt with Tornado entries, it's time to clean and prepare data for the rest of events. A quick overview of those events show that there are many events that can be group into a larger set of events, for example we have FLOOD, FLD and RECORD RAINFALL tags that can be group into a new FLOODING tag.
# Combine variations of fire
d$EVTYPE[grep("FIRE", d$EVTYPE)] <- "WILDFIRE"
# HEAT or EXCESSIVE HEAT
d$EVTYPE[grep("HEAT", d$EVTYPE)] <- "HEAT"
d$EVTYPE[grep("DENSE FOG", d$EVTYPE)] <- "FOG"
# Only notable winds are high wind
d$EVTYPE[d$EVTYPE == "WINDS" | d$EVTYPE == "WIND"] <- "HIGH WIND"
# Combine variations of freeze -- likely agriculture related
d$EVTYPE[grep("FROST|FREEZE", d$EVTYPE)] <- "FROST/FREEZE"
d$EVTYPE[grep("SLIDE", d$EVTYPE)] <- "LAND/MUD/ROCK SLIDES"
d$EVTYPE[grep("HURRICANE|TYPHOON|TROPICAL", d$EVTYPE)] <- "HURRICANE/TYPHOON/TROPICAL STORM"
d$EVTYPE[grep("SNOW|ICE|BLIZZARD|WINTER STORM|FREEZING|WINTRY MIX|SLEET", d$EVTYPE)] <- "WINTER STORM"
d$EVTYPE[grep("WINTER WEATHER|COLD|ICY|GLAZE|LOW TEMPERATURE|HYPOTHERMIA|HYPERTHERMIA|COLD TEMPERATURE|WINDCHILL",
d$EVTYPE)] <- "WINTER WEATHER"
# ~15 variations of 'High Wind' and 3 variations of 'strong wind' but after
# 'Snow' variations removed above
d$EVTYPE[grep("HIGH WIND|STRONG WIND|Gusty Winds|GUSTY WIND", d$EVTYPE)] <- "HIGH WIND"
# Change these 'false hits' for 'thunderstorm wind' to wind
d$EVTYPE <- sub("NON-THUNDERSTORM WIND|NON THUNDERSTORM WIND", "WIND", d$EVTYPE)
# Other non thunderstorm winds
# Standardize variations/spelling errors in 'Thunderstorm'
d$EVTYPE <- gsub("THUNDEERSTORM|THUNDERESTORM|THUNDERSTROM|THUDERSTORM|THUNERSTORM|THUNDERTORM|TUNDERSTORM",
"THUNDERSTORM", d$EVTYPE)
d$EVTYPE <- sub("TSTM WIND|STORM FORCE WINDS", "THUNDERSTORM WIND", d$EVTYPE)
# Manually reviewed ~44 variations that are combined here
d$EVTYPE[grep("THUNDERSTORMW|THUNDERSTORMWINDS|THUNDERSTORM WINS|TSTMW", d$EVTYPE)] <- "THUNDERSTORM WIND"
d$EVTYPE[grep("THUNDERSTORM WIND|THUNDERSTORMS WIND", d$EVTYPE)] <- "THUNDERSTORM WIND"
# Combine variations of fire
d$EVTYPE[grep("HEAVY SURF|HIGH SURF", d$EVTYPE)] <- "HEAVY/HIGH SURF"
d$EVTYPE[grep("RIP", d$EVTYPE)] <- "RIP CURRENT"
# Combine ~59 variations of FLOODING
d$EVTYPE[grep("FLOOD|FLD|RECORD RAINFALL", d$EVTYPE)] <- "FLOODING"
# Combine ~20 variations of HAIL
d$EVTYPE[grep("HAIL", d$EVTYPE)] <- "HAIL"
# Combine variations of LIGHTNING
d$EVTYPE[grep("LIGHTNING|LIGHTING|LIGNTNING", d$EVTYPE)] <- "LIGHTNING"
Finally in order to reduce the number of events we collapse those with low frequency under the tag “miscellaneous”.
event.types <- sort(table(d$EVTYPE), decreasing = TRUE)
event.types.under.50 <- names(event.types[event.types < 50])
d$EVTYPE[d$EVTYPE %in% event.types.under.50] <- "miscellaneous"
So we have now:
sort(table(d$EVTYPE), decreasing = TRUE)
##
## THUNDERSTORM WIND TORNADO
## 119790 39944
## FLOODING HAIL
## 33165 26166
## LIGHTNING HIGH WIND
## 13302 9831
## WINTER STORM WILDFIRE
## 4502 1259
## HEAVY RAIN WINTER WEATHER
## 1105 1090
## HEAT HURRICANE/TYPHOON/TROPICAL STORM
## 980 689
## RIP CURRENT miscellaneous
## 641 410
## AVALANCHE DROUGHT
## 268 266
## HEAVY/HIGH SURF LAND/MUD/ROCK SLIDES
## 224 212
## FOG STORM SURGE
## 181 177
## FROST/FREEZE DUST STORM
## 155 103
## DUST DEVIL DRY MICROBURST
## 95 78
Other minor changes into dataset are performed:
# scientific notation
ValidMultiplierLetter <- c("H", "K", "M", "B")
Multiplier <- c(100, 1000, 1e+06, 1e+09)
names(Multiplier) <- ValidMultiplierLetter
### Property Damage
d$PROPDMGEXP <- toupper(d$PROPDMGEXP)
Valid <- d$PROPDMGEXP %in% ValidMultiplierLetter
d$PROPDMGEXP[!Valid] <- ""
d$PROPDMG[!Valid] <- NA
d$PropCash <- NA
d$PropCash[Valid] <- d$PROPDMG[Valid] * Multiplier[d$PROPDMGEXP[Valid]]
### Crop Damage
d$CROPDMGEXP <- toupper(d$CROPDMGEXP)
Valid <- d$CROPDMGEXP %in% ValidMultiplierLetter
d$CROPDMGEXP[!Valid] <- ""
d$CROPDMG[!Valid] <- NA
d$CropCash <- NA
d$CropCash[Valid] <- d$CROPDMG[Valid] * Multiplier[d$CROPDMGEXP[Valid]]
Experience has shown that weather and natural disasters produce a huge impact in people's life, we can use the NOAA dataset to quantify this impact using different metrics.
First we will show how different events have affected in terms of injuries and fatalities during the last years.
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
| EVTYPE | Events | Fatalities | Injuries | |
|---|---|---|---|---|
| 1 | TORNADO | 39944 | 5633 | 91346 |
| 2 | HEAT | 980 | 3138 | 9224 |
| 3 | FLOODING | 33165 | 1553 | 8681 |
| 4 | LIGHTNING | 13302 | 817 | 5232 |
| 5 | THUNDERSTORM WIND | 119790 | 754 | 9531 |
| 6 | WINTER STORM | 4502 | 597 | 5593 |
| 7 | RIP CURRENT | 641 | 572 | 529 |
| 8 | WINTER WEATHER | 1090 | 542 | 1110 |
| 9 | HIGH WIND | 9831 | 445 | 1883 |
| 10 | AVALANCHE | 268 | 224 | 170 |
| 11 | HURRICANE/TYPHOON/TROPICAL STORM | 689 | 201 | 1716 |
| 12 | HEAVY/HIGH SURF | 224 | 162 | 244 |
| 13 | miscellaneous | 410 | 138 | 352 |
| 14 | HEAVY RAIN | 1105 | 98 | 251 |
| 15 | WILDFIRE | 1259 | 90 | 1608 |
| 16 | FOG | 181 | 80 | 1076 |
| 17 | LAND/MUD/ROCK SLIDES | 212 | 44 | 55 |
| 18 | DUST STORM | 103 | 22 | 440 |
| 19 | HAIL | 26166 | 15 | 1371 |
| 20 | STORM SURGE | 177 | 13 | 38 |
| 21 | DRY MICROBURST | 78 | 3 | 28 |
| 22 | DUST DEVIL | 95 | 2 | 43 |
| 23 | FROST/FREEZE | 155 | 2 | 3 |
| 24 | DROUGHT | 266 | 0 | 4 |
The breakdown by year shows the method for collecting and reporting data has likely changed a lot over the years, with 20-time more data collected in 2011 than 20 years before.
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:dplyr':
##
## arrange, desc, failwith, id, mutate, summarise, summarize
| BeginYear | Events | Fatalities | Tornado | Heat | Flooding | |
|---|---|---|---|---|---|---|
| 1 | 1950 | 201 | 70 | 70 | 0 | 0 |
| 2 | 1951 | 241 | 34 | 34 | 0 | 0 |
| 3 | 1952 | 233 | 230 | 230 | 0 | 0 |
| 4 | 1953 | 421 | 519 | 519 | 0 | 0 |
| 5 | 1954 | 491 | 36 | 36 | 0 | 0 |
| 6 | 1955 | 441 | 129 | 129 | 0 | 0 |
| 7 | 1956 | 428 | 83 | 83 | 0 | 0 |
| 8 | 1957 | 824 | 193 | 193 | 0 | 0 |
| 9 | 1958 | 543 | 67 | 67 | 0 | 0 |
| 10 | 1959 | 505 | 58 | 58 | 0 | 0 |
| 11 | 1960 | 556 | 46 | 46 | 0 | 0 |
| 12 | 1961 | 627 | 52 | 52 | 0 | 0 |
| 13 | 1962 | 411 | 30 | 30 | 0 | 0 |
| 14 | 1963 | 380 | 31 | 31 | 0 | 0 |
| 15 | 1964 | 594 | 73 | 73 | 0 | 0 |
| 16 | 1965 | 724 | 301 | 301 | 0 | 0 |
| 17 | 1966 | 423 | 98 | 98 | 0 | 0 |
| 18 | 1967 | 683 | 114 | 114 | 0 | 0 |
| 19 | 1968 | 524 | 131 | 131 | 0 | 0 |
| 20 | 1969 | 458 | 66 | 66 | 0 | 0 |
| 21 | 1970 | 517 | 73 | 73 | 0 | 0 |
| 22 | 1971 | 714 | 159 | 159 | 0 | 0 |
| 23 | 1972 | 579 | 27 | 27 | 0 | 0 |
| 24 | 1973 | 1026 | 89 | 89 | 0 | 0 |
| 25 | 1974 | 884 | 366 | 366 | 0 | 0 |
| 26 | 1975 | 748 | 60 | 60 | 0 | 0 |
| 27 | 1976 | 707 | 44 | 44 | 0 | 0 |
| 28 | 1977 | 693 | 43 | 43 | 0 | 0 |
| 29 | 1978 | 620 | 53 | 53 | 0 | 0 |
| 30 | 1979 | 655 | 84 | 84 | 0 | 0 |
| 31 | 1980 | 728 | 28 | 28 | 0 | 0 |
| 32 | 1981 | 578 | 24 | 24 | 0 | 0 |
| 33 | 1982 | 1128 | 64 | 64 | 0 | 0 |
| 34 | 1983 | 1019 | 37 | 34 | 0 | 0 |
| 35 | 1984 | 920 | 160 | 122 | 0 | 0 |
| 36 | 1985 | 575 | 112 | 94 | 0 | 0 |
| 37 | 1986 | 690 | 51 | 15 | 0 | 0 |
| 38 | 1987 | 563 | 89 | 59 | 0 | 0 |
| 39 | 1988 | 678 | 55 | 32 | 0 | 0 |
| 40 | 1989 | 791 | 79 | 50 | 0 | 0 |
| 41 | 1990 | 986 | 95 | 53 | 0 | 0 |
| 42 | 1991 | 879 | 73 | 39 | 0 | 0 |
| 43 | 1992 | 990 | 54 | 39 | 0 | 0 |
| 44 | 1993 | 5838 | 298 | 28 | 7 | 53 |
| 45 | 1994 | 9644 | 344 | 48 | 44 | 86 |
| 46 | 1995 | 10457 | 1491 | 34 | 1051 | 77 |
| 47 | 1996 | 10040 | 542 | 26 | 36 | 133 |
| 48 | 1997 | 10322 | 601 | 68 | 81 | 118 |
| 49 | 1998 | 14013 | 687 | 130 | 173 | 136 |
| 50 | 1999 | 10609 | 908 | 94 | 502 | 68 |
| 51 | 2000 | 11508 | 477 | 41 | 158 | 38 |
| 52 | 2001 | 10298 | 469 | 40 | 166 | 50 |
| 53 | 2002 | 10432 | 498 | 55 | 167 | 46 |
| 54 | 2003 | 11015 | 443 | 54 | 36 | 88 |
| 55 | 2004 | 10484 | 370 | 35 | 6 | 82 |
| 56 | 2005 | 10014 | 469 | 38 | 158 | 43 |
| 57 | 2006 | 11974 | 599 | 67 | 252 | 78 |
| 58 | 2007 | 11953 | 421 | 81 | 44 | 87 |
| 59 | 2008 | 17633 | 488 | 129 | 42 | 79 |
| 60 | 2009 | 14434 | 333 | 21 | 33 | 56 |
| 61 | 2010 | 16019 | 425 | 45 | 83 | 108 |
| 62 | 2011 | 20570 | 1002 | 587 | 99 | 127 |
Let's plot the data from the table above:
library(RColorBrewer) # brewer.pal
Colors <- brewer.pal(3, "Set1")
par(mfrow = c(4, 1), mar = c(4, 4, 2, 2))
plot(FatalitiesByYear$BeginYear, FatalitiesByYear$Fatalities, pch = 15, col = "black",
main = "Reported Weather-Related Fatalities by Year", xlab = "Year", ylab = "Fatalities",
ylim = c(0, 1500), las = 1)
grid(nx = NULL, ny = NA, col = "gray", lty = "solid")
mtext("All Weather Events", line = -2)
plot(FatalitiesByYear$BeginYear, FatalitiesByYear$Tornado, pch = 16, col = Colors[1],
main = "", xlab = "Year", ylab = "Fatalities", ylim = c(0, 1500), las = 1)
grid(nx = NULL, ny = NA, col = "gray", lty = "solid")
mtext("#1 Tornado Fatalities", line = -2)
plot(FatalitiesByYear$BeginYear, FatalitiesByYear$Heat, pch = 16, col = Colors[2],
main = "", xlab = "Year", ylab = "Fatalities", las = 1)
grid(nx = NULL, ny = NA, col = "gray", lty = "solid")
mtext("#2 Heat Fatalities", line = -2)
plot(FatalitiesByYear$BeginYear, FatalitiesByYear$Flooding, pch = 16, col = Colors[3],
main = "", xlab = "Year", ylab = "Fatalities", las = 1)
grid(nx = NULL, ny = NA, col = "gray", lty = "solid")
mtext("#3 Flooding Fatalities", line = -2)
mtext(expression("Source: NOAA Storm Events Database (2011)"), BOTTOM <- 1,
adj = 0.025, line = -1, cex = 0.8, col = "blue", outer = TRUE)
mtext(format(Sys.time(), "%Y-%m-%d %H:%M"), BOTTOM, adj = 0.975, line = -1,
cex = 0.75, col = "blue", outer = TRUE)