##Data Processing
# Read the NOAA storm database straight from the bz2-compressed CSV.
#
# readr guesses column types from the leading rows only. With pure guessing,
# the sparsely populated text columns below were guessed as logical, which
# produced millions of parsing failures and read every non-logical value
# (for example the K/M/B damage exponents in CROPDMGEXP) as NA. Those columns
# are pinned to character here; all other columns keep their guessed type.
#
# The call is namespaced with `readr::` because it runs before the library()
# calls further down the script.
stormData <- readr::read_csv(
  "repdata_data_StormData.csv.bz2",
  col_types = readr::cols(
    .default = readr::col_guess(),
    BGN_AZI = readr::col_character(),
    BGN_LOCATI = readr::col_character(),
    END_DATE = readr::col_character(),
    END_TIME = readr::col_character(),
    COUNTYENDN = readr::col_character(),
    END_AZI = readr::col_character(),
    END_LOCATI = readr::col_character(),
    PROPDMGEXP = readr::col_character(),
    CROPDMGEXP = readr::col_character(),
    WFO = readr::col_character(),
    STATEOFFIC = readr::col_character(),
    ZONENAMES = readr::col_character(),
    REMARKS = readr::col_character()
  )
)
## Parsed with column specification:
## cols(
## .default = col_double(),
## BGN_DATE = col_character(),
## BGN_TIME = col_character(),
## TIME_ZONE = col_character(),
## COUNTYNAME = col_character(),
## STATE = col_character(),
## EVTYPE = col_character(),
## BGN_AZI = col_logical(),
## BGN_LOCATI = col_logical(),
## END_DATE = col_logical(),
## END_TIME = col_logical(),
## COUNTYENDN = col_logical(),
## END_AZI = col_logical(),
## END_LOCATI = col_logical(),
## PROPDMGEXP = col_character(),
## CROPDMGEXP = col_logical(),
## WFO = col_logical(),
## STATEOFFIC = col_logical(),
## ZONENAMES = col_logical(),
## REMARKS = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 5255570 parsing failures.
## row col expected actual file
## 1671 WFO 1/0/T/F/TRUE/FALSE NG 'repdata_data_StormData.csv.bz2'
## 1673 WFO 1/0/T/F/TRUE/FALSE NG 'repdata_data_StormData.csv.bz2'
## 1674 WFO 1/0/T/F/TRUE/FALSE NG 'repdata_data_StormData.csv.bz2'
## 1675 WFO 1/0/T/F/TRUE/FALSE NG 'repdata_data_StormData.csv.bz2'
## 1678 WFO 1/0/T/F/TRUE/FALSE NG 'repdata_data_StormData.csv.bz2'
## .... ... .................. ...... ................................
## See problems(...) for more details.
##Initial Exploration
# Initial exploration: per-column summary of the raw import (quartiles for
# numeric columns, length/class for character columns, NA counts).
summary(stormData)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE
## Min. : 1.0 Length:902297 Length:902297 Length:902297
## 1st Qu.:19.0 Class :character Class :character Class :character
## Median :30.0 Mode :character Mode :character Mode :character
## Mean :31.2
## 3rd Qu.:45.0
## Max. :95.0
##
## COUNTY COUNTYNAME STATE EVTYPE
## Min. : 0.0 Length:902297 Length:902297 Length:902297
## 1st Qu.: 31.0 Class :character Class :character Class :character
## Median : 75.0 Mode :character Mode :character Mode :character
## Mean :100.6
## 3rd Qu.:131.0
## Max. :873.0
##
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME
## Min. : 0.000 Mode:logical Mode:logical Mode:logical Mode:logical
## 1st Qu.: 0.000 NA's:902297 TRUE:1 NA's:902297 NA's:902297
## Median : 0.000 NA's:902296
## Mean : 1.484
## 3rd Qu.: 1.000
## Max. :3749.000
##
## COUNTY_END COUNTYENDN END_RANGE END_AZI END_LOCATI
## Min. :0 Mode:logical Min. : 0.0000 Mode:logical Mode:logical
## 1st Qu.:0 NA's:902297 1st Qu.: 0.0000 NA's:902297 NA's:902297
## Median :0 Median : 0.0000
## Mean :0 Mean : 0.9862
## 3rd Qu.:0 3rd Qu.: 0.0000
## Max. :0 Max. :925.0000
##
## LENGTH WIDTH F MAG
## Min. : 0.0000 Min. : 0.000 Min. :0.0 Min. : 0.0
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.:0.0 1st Qu.: 0.0
## Median : 0.0000 Median : 0.000 Median :1.0 Median : 50.0
## Mean : 0.2301 Mean : 7.503 Mean :0.9 Mean : 46.9
## 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.:1.0 3rd Qu.: 75.0
## Max. :2315.0000 Max. :4400.000 Max. :5.0 Max. :22000.0
## NA's :843563
## FATALITIES INJURIES PROPDMG PROPDMGEXP
## Min. : 0.0000 Min. : 0.0000 Min. : 0.00 Length:902297
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00 Class :character
## Median : 0.0000 Median : 0.0000 Median : 0.00 Mode :character
## Mean : 0.0168 Mean : 0.1557 Mean : 12.06
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.50
## Max. :583.0000 Max. :1700.0000 Max. :5000.00
##
## CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## Min. : 0.000 Mode :logical Mode:logical Mode:logical Mode:logical
## 1st Qu.: 0.000 FALSE:19 TRUE:7166 NA's:902297 NA's:902297
## Median : 0.000 NA's :902278 NA's:895131
## Mean : 1.527
## 3rd Qu.: 0.000
## Max. :990.000
##
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS
## Min. : 0 Min. :-14451 Min. : 0 Min. :-14455 Mode:logical
## 1st Qu.:2802 1st Qu.: 7247 1st Qu.: 0 1st Qu.: 0 NA's:902297
## Median :3540 Median : 8707 Median : 0 Median : 0
## Mean :2875 Mean : 6940 Mean :1452 Mean : 3509
## 3rd Qu.:4019 3rd Qu.: 9605 3rd Qu.:3549 3rd Qu.: 8735
## Max. :9706 Max. : 17124 Max. :9706 Max. :106220
## NA's :47 NA's :40
## REFNUM
## Min. : 1
## 1st Qu.:225575
## Median :451149
## Mean :451149
## 3rd Qu.:676723
## Max. :902297
##
##Load Relevant Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.1 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ stringr 1.4.0
## ✓ tidyr 1.1.0 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
## Question 1 Processing Steps
## - Select just the event types, fatalities and injuries.
## - Group the data by event type.
## - Summarise the sum of fatalities and injuries by event type.
## - Transform the total number of health impacts by event type by taking log base 10.
## - The totals were transformed to log base 10 because of the vast difference between the highest count and the lowest.
# Question 1: health impact per event type, in long format for plotting.
#
# Sums fatalities and injuries within each event type, puts both totals on a
# log10 scale, keeps the 20 event types with the highest fatality totals
# (ties broken by injuries) and reshapes to one row per event type and kind
# of health impact.
#
# The cut-off is 20 so the data agree with the "Top 20" figure title and with
# the Question 2 plot; it previously kept 30 event types.
# Note: a total of zero becomes -Inf on the log10 scale.
healthImpactData <- stormData %>%
  select(`EVENT TYPE` = EVTYPE, FATALITIES, INJURIES) %>%
  group_by(`EVENT TYPE`) %>%
  summarise(
    Fatalities = log10(sum(FATALITIES)),
    Injuries = log10(sum(INJURIES))
  ) %>%
  arrange(desc(Fatalities), desc(Injuries)) %>%
  head(20) %>%
  pivot_longer(
    !`EVENT TYPE`,
    names_to = "Health Impact",
    values_to = "Total"
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## Question 2 Processing
## - Select just the event types, PROPDMG, PROPDMGEXP, CROPDMG and CROPDMGEXP.
## - Convert costs to straight dollar amounts.
## - Create the total economic cost by combining the cost of property and crop damage.
## - Create a summary of the sums of the total economic impact by event type.
## - The log of the total dollars was taken because of the large variation between the highest dollar amount and the lowest.
# Keep only the columns needed for the economic analysis: the event type plus
# the property and crop damage amounts with their exponent codes.
EconomcImpactData <- select(
  stormData,
  EVTYPE,
  PROPDMG,
  PROPDMGEXP,
  CROPDMG,
  CROPDMGEXP
)
# Convert the recorded damage figures to dollar amounts, one row per record.
#
# Rows are kept when the crop OR the property exponent code is K, M or B
# (case-insensitive). The property test previously looked at the numeric
# PROPDMG column instead of PROPDMGEXP, so it could never match and records
# with property damage only were dropped. `%in%` is used so the filter
# applies the same exact-match rule as the conversion below and treats a
# missing exponent as "no match" rather than NA.
#
# An amount whose exponent is not K/M/B has no case_when() branch and is
# left as NA; the total then falls back to whichever side is available.
ConvertedImpactData <- EconomcImpactData %>%
  filter(
    toupper(CROPDMGEXP) %in% c("K", "M", "B") |
      toupper(PROPDMGEXP) %in% c("K", "M", "B")
  ) %>%
  mutate(
    `Crop Economic Impact` = case_when(
      toupper(CROPDMGEXP) == "K" ~ CROPDMG * 1e3,
      toupper(CROPDMGEXP) == "M" ~ CROPDMG * 1e6,
      toupper(CROPDMGEXP) == "B" ~ CROPDMG * 1e9
    ),
    `Property Economic Impact` = case_when(
      toupper(PROPDMGEXP) == "K" ~ PROPDMG * 1e3,
      toupper(PROPDMGEXP) == "M" ~ PROPDMG * 1e6,
      toupper(PROPDMGEXP) == "B" ~ PROPDMG * 1e9
    )
  ) %>%
  mutate(
    `Total Economic Impact` = case_when(
      !is.na(`Crop Economic Impact`) & !is.na(`Property Economic Impact`) ~
        `Crop Economic Impact` + `Property Economic Impact`,
      !is.na(`Crop Economic Impact`) ~ `Crop Economic Impact`,
      !is.na(`Property Economic Impact`) ~ `Property Economic Impact`
    )
  ) %>%
  rename(`Event Type` = EVTYPE)
# Total economic impact per event type on a log10 scale, largest first.
# Built inside local() so the intermediate tables do not leak into the
# global environment.
EconomcImpactSummary <- local({
  by_event <- group_by(ConvertedImpactData, `Event Type`)
  log_totals <- summarise(
    by_event,
    `Economic Impact By Event Type` = log10(sum(`Total Economic Impact`))
  )
  arrange(log_totals, desc(`Economic Impact By Event Type`))
})
## `summarise()` ungrouping output (override with `.groups` argument)
##Results Question 1 ##See figure 1
# Figure 1 (Question 1): side-by-side bars of the log10 fatality and injury
# totals per event type, ordered by decreasing total, with the legend placed
# inside the top-right corner of the panel.
ggplot(
  healthImpactData,
  aes(
    x = reorder(`EVENT TYPE`, -Total),
    y = Total,
    fill = `Health Impact`
  )
) +
  geom_col(position = "dodge") +
  labs(
    x = "Event Type",
    y = "log10 Total Number",
    title = "Top 20 Storm Event Types' Impact on Health in USA "
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = c(.9, .9)
  )
##Results Question 2 ##See Figure 2
# Figure 2 (Question 2): bars of the log10 total economic impact for the
# first 20 rows of the summary (already sorted largest first). Bars are
# coloured by event type; the legend is hidden because the x axis carries
# the same labels.
ggplot(
  head(EconomcImpactSummary, 20),
  aes(
    x = reorder(`Event Type`, -`Economic Impact By Event Type`),
    y = `Economic Impact By Event Type`,
    fill = `Event Type`
  )
) +
  geom_col() +
  labs(
    x = "Event Type",
    y = "Log10 of Dollar Amount of Impact",
    title = "Top 20 Storm Event Types' Economic Impact in USA "
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
##R Session Information sessionInfo()