1. Data Processing
# Loading the needed packages
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# Downloading and reading the file
# The archive is only fetched when no local copy exists, so re-running the
# analysis does not download it again. mode = "wb" requests a binary transfer
# so the bzip2 archive is written byte-for-byte.
stormDataFile <- "stormData.csv.bz2"
if (!file.exists(stormDataFile)) {
  download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2",
                destfile = stormDataFile, mode = "wb")
}
# Reading the compressed CSV directly and keeping it as stormData
stormData <- read_csv(stormDataFile)
## Parsed with column specification:
## cols(
## .default = col_character(),
## STATE__ = col_double(),
## COUNTY = col_double(),
## BGN_RANGE = col_double(),
## COUNTY_END = col_double(),
## END_RANGE = col_double(),
## LENGTH = col_double(),
## WIDTH = col_double(),
## F = col_integer(),
## MAG = col_double(),
## FATALITIES = col_double(),
## INJURIES = col_double(),
## PROPDMG = col_double(),
## CROPDMG = col_double(),
## LATITUDE = col_double(),
## LONGITUDE = col_double(),
## LATITUDE_E = col_double(),
## LONGITUDE_ = col_double(),
## REFNUM = col_double()
## )
## See spec(...) for full column specifications.
# Per-column summary of the raw data to get a first overview
stormData %>%
  summary()
## STATE__ BGN_DATE BGN_TIME TIME_ZONE
## Min. : 1.0 Length:902297 Length:902297 Length:902297
## 1st Qu.:19.0 Class :character Class :character Class :character
## Median :30.0 Mode :character Mode :character Mode :character
## Mean :31.2
## 3rd Qu.:45.0
## Max. :95.0
##
## COUNTY COUNTYNAME STATE EVTYPE
## Min. : 0.0 Length:902297 Length:902297 Length:902297
## 1st Qu.: 31.0 Class :character Class :character Class :character
## Median : 75.0 Mode :character Mode :character Mode :character
## Mean :100.6
## 3rd Qu.:131.0
## Max. :873.0
##
## BGN_RANGE BGN_AZI BGN_LOCATI
## Min. : 0.000 Length:902297 Length:902297
## 1st Qu.: 0.000 Class :character Class :character
## Median : 0.000 Mode :character Mode :character
## Mean : 1.484
## 3rd Qu.: 1.000
## Max. :3749.000
##
## END_DATE END_TIME COUNTY_END COUNTYENDN
## Length:902297 Length:902297 Min. :0 Length:902297
## Class :character Class :character 1st Qu.:0 Class :character
## Mode :character Mode :character Median :0 Mode :character
## Mean :0
## 3rd Qu.:0
## Max. :0
##
## END_RANGE END_AZI END_LOCATI
## Min. : 0.0000 Length:902297 Length:902297
## 1st Qu.: 0.0000 Class :character Class :character
## Median : 0.0000 Mode :character Mode :character
## Mean : 0.9862
## 3rd Qu.: 0.0000
## Max. :925.0000
##
## LENGTH WIDTH F MAG
## Min. : 0.0000 Min. : 0.000 Min. :0.0 Min. : 0.0
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.:0.0 1st Qu.: 0.0
## Median : 0.0000 Median : 0.000 Median :1.0 Median : 50.0
## Mean : 0.2301 Mean : 7.503 Mean :0.9 Mean : 46.9
## 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.:1.0 3rd Qu.: 75.0
## Max. :2315.0000 Max. :4400.000 Max. :5.0 Max. :22000.0
## NA's :843563
## FATALITIES INJURIES PROPDMG
## Min. : 0.0000 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00
## Median : 0.0000 Median : 0.0000 Median : 0.00
## Mean : 0.0168 Mean : 0.1557 Mean : 12.06
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.50
## Max. :583.0000 Max. :1700.0000 Max. :5000.00
##
## PROPDMGEXP CROPDMG CROPDMGEXP
## Length:902297 Min. : 0.000 Length:902297
## Class :character 1st Qu.: 0.000 Class :character
## Mode :character Median : 0.000 Mode :character
## Mean : 1.527
## 3rd Qu.: 0.000
## Max. :990.000
##
## WFO STATEOFFIC ZONENAMES LATITUDE
## Length:902297 Length:902297 Length:902297 Min. : 0
## Class :character Class :character Class :character 1st Qu.:2802
## Mode :character Mode :character Mode :character Median :3540
## Mean :2875
## 3rd Qu.:4019
## Max. :9706
## NA's :47
## LONGITUDE LATITUDE_E LONGITUDE_ REMARKS
## Min. :-14451 Min. : 0 Min. :-14455 Length:902297
## 1st Qu.: 7247 1st Qu.: 0 1st Qu.: 0 Class :character
## Median : 8707 Median : 0 Median : 0 Mode :character
## Mean : 6940 Mean :1452 Mean : 3509
## 3rd Qu.: 9605 3rd Qu.:3549 3rd Qu.: 8735
## Max. : 17124 Max. :9706 Max. :106220
## NA's :40
## REFNUM
## Min. : 1
## 1st Qu.:225575
## Median :451149
## Mean :451149
## 3rd Qu.:676723
## Max. :902297
##
# Build the analysis table: event type, state, casualties and damage in dollars.
# BGN_DATE / END_DATE are not parsed here because they are not kept by the
# select() below.

# Translate a damage exponent code into its dollar multiplier.
# Codes are matched regardless of case: H = 100, K = 1,000, M = 1,000,000,
# B = 1,000,000,000. Any other code (including NA) yields NA, so the
# corresponding damage is NA and is skipped by sums that use na.rm = TRUE.
damageMultiplier <- function(expCode) {
  case_when(expCode %in% c("H", "h") ~ 1e2,
            expCode %in% c("K", "k") ~ 1e3,
            expCode %in% c("M", "m") ~ 1e6,
            expCode %in% c("B", "b") ~ 1e9)
}

stormDataProcess <- stormData %>%
  mutate(STATE = as.factor(STATE),
         EVTYPE = as.factor(EVTYPE),
         # Turning all damage into numbers (dollars)
         propDamage = PROPDMG * damageMultiplier(PROPDMGEXP),
         cropDamage = CROPDMG * damageMultiplier(CROPDMGEXP)) %>%
  select(EVTYPE, STATE, FATALITIES, INJURIES, propDamage, cropDamage)
# Five event types with the highest total fatalities
# (total injuries are computed alongside for the same event types)
stormDataProcess_evtype <- stormDataProcess %>%
  group_by(EVTYPE) %>%
  summarise(totalFatalities = sum(FATALITIES),
            totalInjuries = sum(INJURIES)) %>%
  arrange(desc(totalFatalities)) %>%
  head(n = 5)
# Show the resulting table
stormDataProcess_evtype
## # A tibble: 5 x 3
## EVTYPE totalFatalities totalInjuries
## <fct> <dbl> <dbl>
## 1 TORNADO 5633 91346
## 2 EXCESSIVE HEAT 1903 6525
## 3 FLASH FLOOD 978 1777
## 4 HEAT 937 2100
## 5 LIGHTNING 816 5230
# Ten event type / state combinations with the highest total fatalities
# (the result stays grouped by EVTYPE after summarise)
stormDataProcess_evtype_state <- stormDataProcess %>%
  group_by(EVTYPE, STATE) %>%
  summarise(totalFatalities = sum(FATALITIES),
            totalInjuries = sum(INJURIES)) %>%
  arrange(desc(totalFatalities)) %>%
  head(n = 10)
# Show the resulting table
stormDataProcess_evtype_state
## # A tibble: 10 x 4
## # Groups: EVTYPE [3]
## EVTYPE STATE totalFatalities totalInjuries
## <fct> <fct> <dbl> <dbl>
## 1 HEAT IL 653 241
## 2 TORNADO AL 617 7929
## 3 TORNADO TX 538 8207
## 4 TORNADO MS 450 6244
## 5 TORNADO MO 388 4330
## 6 TORNADO AR 379 5116
## 7 TORNADO TN 368 4748
## 8 EXCESSIVE HEAT PA 359 320
## 9 EXCESSIVE HEAT IL 330 352
## 10 TORNADO OK 296 4829
# The same event type / state combinations as above, re-sorted by total injuries
stormDataProcess_evtype_state_inj <- arrange(stormDataProcess_evtype_state,
                                             desc(totalInjuries))
# The same event types as above, re-sorted by total injuries
stormDataProcess_evtype_inj <- arrange(stormDataProcess_evtype,
                                       desc(totalInjuries))
# Five event types with the largest combined property and crop damage;
# rows whose damage is NA are left out of the sums
stormDataProcess_dmg <- stormDataProcess %>%
  group_by(EVTYPE) %>%
  summarise(propTotal = sum(propDamage, na.rm = TRUE),
            cropTotal = sum(cropDamage, na.rm = TRUE)) %>%
  transmute(EVTYPE, totalDamage = propTotal + cropTotal) %>%
  arrange(desc(totalDamage)) %>%
  head(n = 5)
# Bar chart of total fatalities per event type, bars ordered from the most
# to the fewest fatalities
stormDataProcess_evtype %>%
  ggplot(aes(x = reorder(EVTYPE, -totalFatalities),
             y = totalFatalities,
             fill = EVTYPE)) +
  geom_col() +
  ggtitle("Total fatalities by Event type in USA") +
  xlab("Event Type") +
  ylab("Total Fatalities") +
  # Legend is redundant with the x axis; tilt the labels so they do not overlap
  theme(axis.text.x = element_text(angle = 60, hjust = 1),
        legend.position = "none")
# Bar chart of total fatalities per state, one panel per event type,
# for the event type / state combinations kept in stormDataProcess_evtype_state
stormDataProcess_evtype_state %>%
  ggplot(aes(x = reorder(STATE, -totalFatalities),
             y = totalFatalities,
             fill = STATE)) +
  geom_col() +
  facet_grid(. ~ EVTYPE) +
  ggtitle("Total fatalities by top 10 Events type in USA") +
  xlab("State") +
  ylab("Total Fatalities") +
  # Legend is redundant with the x axis; tilt the labels so they do not overlap
  theme(axis.text.x = element_text(angle = 60, hjust = 1),
        legend.position = "none")
# Bar chart of total injuries per state, one panel per event type,
# for the event type / state combinations kept in stormDataProcess_evtype_state_inj
stormDataProcess_evtype_state_inj %>%
  ggplot(aes(x = reorder(STATE, -totalInjuries),
             y = totalInjuries,
             fill = STATE)) +
  geom_col() +
  facet_grid(. ~ EVTYPE) +
  ggtitle("Total Injuries by top 10 Events type in USA") +
  xlab("State") +
  ylab("Total Injuries") +
  # Legend is redundant with the x axis; tilt the labels so they do not overlap
  theme(axis.text.x = element_text(angle = 60, hjust = 1),
        legend.position = "none")
# Bar chart of total injuries per event type, bars ordered from the most
# to the fewest injuries
stormDataProcess_evtype_inj %>%
  ggplot(aes(x = reorder(EVTYPE, -totalInjuries),
             y = totalInjuries,
             fill = EVTYPE)) +
  geom_col() +
  ggtitle("Total Injuries by top 5 Events type in USA") +
  xlab("Events") +
  ylab("Total Injuries") +
  # Legend is redundant with the x axis; tilt the labels so they do not overlap
  theme(axis.text.x = element_text(angle = 60, hjust = 1),
        legend.position = "none")
Looking at these plots, we can say that the most harmful event type in the USA is the TORNADO, with a total of 5,633 fatalities and 91,346 injuries over the years. Looking at each state separately, however, we can see that in Illinois the most harmful event type is HEAT.
4.2 Event with the greatest economic consequences
# Bar chart of total damage per event type, bars ordered from the largest
# to the smallest total
stormDataProcess_dmg %>%
  ggplot(aes(x = reorder(EVTYPE, -totalDamage),
             y = totalDamage,
             fill = EVTYPE)) +
  geom_col() +
  ggtitle("Total Damages by Weather Events in USA") +
  xlab("Event Type") +
  ylab("Total Damages in $") +
  # Legend is redundant with the x axis; tilt the labels so they do not overlap
  theme(axis.text.x = element_text(angle = 60, hjust = 1),
        legend.position = "none")
This plot shows that the event type with the greatest economic consequences is the FLOOD, with a total of $150.3196783 billion in combined property and crop damage over the years.