# Load the NOAA storm data and name it noaa.
# The data file is resolved relative to the current working directory rather
# than through setwd() with a user-specific absolute path, which only works on
# one machine. Run the script from the folder that holds the data file.
library(readr)
storm_file <- "repdata_data_StormData.csv.bz2"
if (!file.exists(storm_file)) {
  # Fail early with an actionable message instead of a bare read error.
  stop(
    "Cannot find '", storm_file, "' in ", getwd(),
    "; run this script from the directory that contains the data file.",
    call. = FALSE
  )
}
# The compressed .bz2 file is passed directly to read.csv(), as before.
noaa <- read.csv(storm_file)
# Explore the structure of the dataset
str(noaa)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
# Print a per-column summary of noaa (missing counts, uniques, quantiles)
library(skimr)
skim(data = noaa)
## Warning in grepl("^\\s+$", x): input string 192565 is invalid in this locale
## Warning in grepl("^\\s+$", x): input string 194345 is invalid in this locale
## Warning in grepl("^\\s+$", x): input string 199735 is invalid in this locale
## Warning in grepl("^\\s+$", x): input string 199745 is invalid in this locale
## Warning in grepl("^\\s+$", x): input string 200467 is invalid in this locale
| Name | noaa |
| Number of rows | 902297 |
| Number of columns | 37 |
| _______________________ | |
| Column type frequency: | |
| character | 18 |
| logical | 1 |
| numeric | 18 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| BGN_DATE | 0 | 1 | 16 | 18 | 0 | 16335 | 0 |
| BGN_TIME | 0 | 1 | 3 | 11 | 0 | 3608 | 0 |
| TIME_ZONE | 0 | 1 | 3 | 3 | 0 | 22 | 0 |
| COUNTYNAME | 0 | 1 | 0 | 200 | 1589 | 29601 | 0 |
| STATE | 0 | 1 | 2 | 2 | 0 | 72 | 0 |
| EVTYPE | 0 | 1 | 1 | 30 | 0 | 985 | 0 |
| BGN_AZI | 0 | 1 | 0 | 3 | 547332 | 35 | 0 |
| BGN_LOCATI | 0 | 1 | 0 | 21 | 287743 | 54429 | 0 |
| END_DATE | 0 | 1 | 0 | 18 | 243411 | 6663 | 0 |
| END_TIME | 0 | 1 | 0 | 12 | 238978 | 3647 | 0 |
| END_AZI | 0 | 1 | 0 | 3 | 724837 | 24 | 0 |
| END_LOCATI | 0 | 1 | 0 | 21 | 499225 | 34506 | 0 |
| PROPDMGEXP | 0 | 1 | 0 | 1 | 465934 | 19 | 0 |
| CROPDMGEXP | 0 | 1 | 0 | 1 | 618413 | 9 | 0 |
| WFO | 0 | 1 | 0 | 3 | 142069 | 542 | 0 |
| STATEOFFIC | 0 | 1 | 0 | 45 | 248769 | 250 | 0 |
| ZONENAMES | 0 | 1 | 0 | 7226 | 594029 | 25112 | 205988 |
| REMARKS | 0 | 1 | 0 | 41278 | 287433 | 436781 | 24658 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| COUNTYENDN | 902297 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| STATE__ | 0 | 1.00 | 31.20 | 16.57 | 1 | 19 | 30 | 45.0 | 95 | ▆▇▇▁▁ |
| COUNTY | 0 | 1.00 | 100.64 | 107.28 | 0 | 31 | 75 | 131.0 | 873 | ▇▁▁▁▁ |
| BGN_RANGE | 0 | 1.00 | 1.48 | 5.48 | 0 | 0 | 0 | 1.0 | 3749 | ▇▁▁▁▁ |
| COUNTY_END | 0 | 1.00 | 0.00 | 0.00 | 0 | 0 | 0 | 0.0 | 0 | ▁▁▇▁▁ |
| END_RANGE | 0 | 1.00 | 0.99 | 3.37 | 0 | 0 | 0 | 0.0 | 925 | ▇▁▁▁▁ |
| LENGTH | 0 | 1.00 | 0.23 | 4.62 | 0 | 0 | 0 | 0.0 | 2315 | ▇▁▁▁▁ |
| WIDTH | 0 | 1.00 | 7.50 | 61.57 | 0 | 0 | 0 | 0.0 | 4400 | ▇▁▁▁▁ |
| F | 843563 | 0.07 | 0.91 | 1.00 | 0 | 0 | 1 | 1.0 | 5 | ▇▂▁▁▁ |
| MAG | 0 | 1.00 | 46.90 | 61.91 | 0 | 0 | 50 | 75.0 | 22000 | ▇▁▁▁▁ |
| FATALITIES | 0 | 1.00 | 0.02 | 0.77 | 0 | 0 | 0 | 0.0 | 583 | ▇▁▁▁▁ |
| INJURIES | 0 | 1.00 | 0.16 | 5.43 | 0 | 0 | 0 | 0.0 | 1700 | ▇▁▁▁▁ |
| PROPDMG | 0 | 1.00 | 12.06 | 59.48 | 0 | 0 | 0 | 0.5 | 5000 | ▇▁▁▁▁ |
| CROPDMG | 0 | 1.00 | 1.53 | 22.17 | 0 | 0 | 0 | 0.0 | 990 | ▇▁▁▁▁ |
| LATITUDE | 47 | 1.00 | 2874.94 | 1657.65 | 0 | 2802 | 3540 | 4019.0 | 9706 | ▅▇▆▁▁ |
| LONGITUDE | 0 | 1.00 | 6939.54 | 3958.06 | -14451 | 7247 | 8707 | 9605.0 | 17124 | ▁▁▂▇▁ |
| LATITUDE_E | 40 | 1.00 | 1451.61 | 1858.73 | 0 | 0 | 0 | 3549.0 | 9706 | ▇▃▂▁▁ |
| LONGITUDE_ | 0 | 1.00 | 3509.14 | 4475.68 | -14455 | 0 | 0 | 8735.0 | 106220 | ▇▁▁▁▁ |
| REFNUM | 0 | 1.00 | 451149.00 | 260470.85 | 1 | 225575 | 451149 | 676723.0 | 902297 | ▇▇▇▇▇ |
# Creating subsets for easier analysis:
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ dplyr 1.0.2
## ✓ tibble 3.0.4 ✓ stringr 1.4.0
## ✓ tidyr 1.1.2 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
# Casualty dataset: per event type, total fatalities, total injuries and
# their combined count, sorted so the most harmful event types come first.
health <- noaa %>%
  transmute(
    EVTYPE,
    FATALITIES,
    INJURIES,
    both = FATALITIES + INJURIES
  ) %>%
  group_by(EVTYPE) %>%
  summarise(
    totalfatalities = sum(FATALITIES),
    totalinjuries = sum(INJURIES),
    totalboth = sum(both)
  ) %>%
  arrange(desc(totalboth))
## `summarise()` ungrouping output (override with `.groups` argument)
# Keep the ten most harmful event types and reshape to long form:
# one row per (event type, casualty kind) so the two kinds can share a plot.
health_tidy <- health %>%
  slice_head(n = 10) %>%
  select(EVTYPE, totalfatalities, totalinjuries) %>%
  pivot_longer(
    cols = c(totalfatalities, totalinjuries),
    names_to = "type",
    values_to = "total"
  )
# PROPDMG and CROPDMG alone do not give the full damage figure; the exponent
# codes in PROPDMGEXP and CROPDMGEXP must be applied as well, so keep all four
# columns alongside the event type.
economic <- select(noaa, EVTYPE, PROPDMG, CROPDMG, PROPDMGEXP, CROPDMGEXP)
# List the distinct exponent codes that occur in each column
unique(economic[["PROPDMGEXP"]])
## [1] "K" "M" "" "B" "m" "+" "0" "5" "6" "?" "4" "2" "3" "h" "7" "H" "-" "1" "8"
unique(economic[["CROPDMGEXP"]])
## [1] "" "M" "K" "m" "B" "?" "0" "k" "2"
# Convert the damage-exponent codes into powers of ten and compute damage
# totals.
#
# Letter codes (case-insensitive): H = 10^2, K = 10^3, M = 10^6, B = 10^9.
# Single-digit codes are used directly as the power of ten.
# Blank or unrecognised codes ("", "+", "?", "-") mean "no scaling" and map to
# power 0, i.e. a multiplier of 1. Mapping them to 1 would multiply those
# damage figures by 10.
exponent_codes <- c(H = 2, K = 3, M = 6, B = 9)

# Translate a vector of exponent codes into numeric powers of ten.
#   codes: character (or numeric) vector of exponent codes.
#   returns: numeric vector of the same length, never NA.
exponent_power <- function(codes) {
  codes <- toupper(as.character(codes))
  # Named lookup; "" and unknown symbols yield NA here and are zeroed below.
  power <- unname(exponent_codes[codes])
  single_digit <- grepl("^[0-9]$", codes)
  power[single_digit] <- as.numeric(codes[single_digit])
  power[is.na(power)] <- 0
  power
}

# The EXP columns are overwritten with the numeric power, and both columns now
# go through the same mapping.
economic$PROPDMGEXP <- exponent_power(economic$PROPDMGEXP)
economic$CROPDMGEXP <- exponent_power(economic$CROPDMGEXP)
economic$property <- economic$PROPDMG * 10^economic$PROPDMGEXP
economic$crop <- economic$CROPDMG * 10^economic$CROPDMGEXP
economic$damage <- economic$property + economic$crop
# Horizontal dodged bar chart of fatalities and injuries for the ten event
# types kept in health_tidy, with bars ordered by total.
chart1 <- ggplot(
  health_tidy,
  aes(x = reorder(EVTYPE, total), y = total, fill = type)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Total injuries and fatalities from the top 10 weather\nrelated events in the United States",
    x = "Event Type",
    y = "Total Harmful Numbers",
    fill = "Severity"
  ) +
  scale_fill_discrete(labels = c("Fatalities", "Injuries")) +
  theme(axis.text.x = element_text(angle = 0)) +
  coord_flip()
chart1
# Tornadoes cause the most injuries and fatalities in the US by a large margin, followed by excessive heat.
# Plot total property damage for the ten event types with the highest
# property damage. Bars are ordered by total and drawn horizontally.
eco_prop <- economic %>%
  dplyr::group_by(EVTYPE) %>%
  summarise(totalprop = sum(property)) %>%
  arrange(desc(totalprop)) %>%
  slice(1:10) %>%
  ggplot(aes(x = reorder(EVTYPE, totalprop), y = totalprop)) +
  geom_col() +
  labs(
    x = "Event Type", y = "Total Damage (USD)",
    # Title typo fixed: "eventsin" -> "events in".
    title = "Top 10 property damage causing weather related\nevents in the United States"
  ) +
  theme(axis.text.x = element_text(angle = 0)) +
  coord_flip()
## `summarise()` ungrouping output (override with `.groups` argument)
eco_prop
# Floods cause the most property damage in the US, according to the NOAA data collection.
# Plot total crop damage for the ten event types with the highest crop
# damage. Bars are ordered by total and drawn horizontally.
eco_crop <- economic %>%
  group_by(EVTYPE) %>%
  summarise(totalcrop = sum(crop)) %>%
  arrange(desc(totalcrop)) %>%
  slice_head(n = 10) %>%
  ggplot(aes(x = reorder(EVTYPE, totalcrop), y = totalcrop)) +
  geom_col() +
  labs(
    title = "Top 10 crop damage causing weather\nrelated events in the United States",
    x = "Event Type",
    y = "Total Damage (USD)"
  ) +
  theme(axis.text.x = element_text(angle = 0)) +
  coord_flip()
## `summarise()` ungrouping output (override with `.groups` argument)
eco_crop
# Droughts are the leading cause of crop damage in the US, according to NOAA data collection.