knitr::opts_chunk$set(echo = TRUE)
library(tidyr)
library(dplyr)
##
## Attachement du package : 'dplyr'
## Les objets suivants sont masqués depuis 'package:stats':
##
## filter, lag
## Les objets suivants sont masqués depuis 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attachement du package : 'lubridate'
## Les objets suivants sont masqués depuis 'package:base':
##
## date, intersect, setdiff, union
library(stringr)
library(ggplot2)
library(patchwork)
URL<-"https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# download.file(URL,destfile = "./my_file.bz2",method = "curl")
data_set <- read.csv("my_file.bz2", header = TRUE, row.names = NULL,stringsAsFactors=FALSE)
# Conversion of the varibale BGN_DATE into Date
data_set$BGN_DATE <- mdy_hms(data_set$BGN_DATE)
names(data_set)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
Therefore, a subset of object data_set is made :
my_cols <- c('BGN_DATE', 'EVTYPE', 'FATALITIES', 'INJURIES', 'PROPDMG', 'PROPDMGEXP',
'CROPDMG', 'CROPDMGEXP')
data_set_mini <- data_set[,my_cols]
# Conversion of the character-exponents into numeric format
calcul_exp <- function(u) {
u <- switch(u,
'+' = {0}, '-' = {0}, '?' = {0},
'h' = {2}, 'H' = {2},
'k' = {3}, 'K' = {3},
'm' = {6}, 'M' = {6},
'b' = {9}, 'B' = {9},
u)
if(u == '' | u == ' ') 0
else as.integer(u)
}
# Conversion of empty/space value to zero and any other strings to numeric value
convert_num <- function(x) {
if(x == '' | x == ' ') 0
else as.numeric(x)
}
# calculate the actual property damage by multiplying PROPDMG with 10^PROPDMGEXP
data_set_mini$PROPDMG <- apply(data.frame(data_set_mini$PROPDMG, data_set_mini$PROPDMGEXP),
1, function(x) { convert_num(x[1])*10^calcul_exp(x[2]) })
# calculate the actual property damage by multiplying CROPDMG with 10^CROPDMGEXP
data_set_mini$CROPDMG <- apply(data.frame(data_set_mini$CROPDMG, data_set_mini$CROPDMGEXP),
1, function(x) { convert_num(x[1])*10^calcul_exp(x[2]) })
# Capitalization for all events and supression of extra spaces.
data_set_mini$EVTYPE <- toupper(data_set_mini$EVTYPE)
data_set_mini$EVTYPE <- trimws(x = data_set_mini$EVTYPE, which = "both", whitespace = " ")
data_set_mini$EVTYPE[grepl(pattern = "ICESTORM", x = data_set_mini$EVTYPE)] <- "ICESTORM"
data_set_mini$EVTYPE[grepl(pattern = "BLIZZARD", x = data_set_mini$EVTYPE)] <- "BLIZZARD"
data_set_mini$EVTYPE[grepl(pattern = "COASTAL", x = data_set_mini$EVTYPE) &
grepl(pattern = "FLOOD", x = data_set_mini$EVTYPE)] <- "COASTAL FLOOD"
data_set_mini$EVTYPE[grepl(pattern = "FLAS", x = data_set_mini$EVTYPE) &
grepl(pattern = "FLOOD", x = data_set_mini$EVTYPE)] <-
"FLASH FLOOD"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*FLOOD)(?!.*COASTAL)(?!.*FLASH)",
x = data_set_mini$EVTYPE,perl = TRUE)] <- "FLOOD"
data_set_mini$EVTYPE[grepl(pattern = "FOG", x = data_set_mini$EVTYPE)] <- "DENSE FOG"
data_set_mini$EVTYPE[grepl(pattern = "SMOKE", x = data_set_mini$EVTYPE)] <- "DENSE SMOKE"
data_set_mini$EVTYPE[grepl(pattern = "DROUGHT", x = data_set_mini$EVTYPE)] <- "DROUGHT"
data_set_mini$EVTYPE[grepl(pattern = "FROST|FREEZE", x = data_set_mini$EVTYPE)] <-
"FROST/FREEZE"
data_set_mini$EVTYPE[grepl(pattern = "FUNNEL", x = data_set_mini$EVTYPE)] <-
"FUNNEL CLOUD"
data_set_mini$EVTYPE[grepl("^(?=.*HAIL)(?!.*MARINE)", x = data_set_mini$EVTYPE,
perl = TRUE)] <- "HAIL"
data_set_mini$EVTYPE[grepl(pattern = "EXCESSIVE HEAT|RECORD HEAT|HEATBURST",
x = data_set_mini$EVTYPE)] <- "EXCESSIVE HEAT"
data_set_mini$EVTYPE[grepl(pattern = "RAIN", x = data_set_mini$EVTYPE)] <-
"HEAVY RAIN"
data_set_mini$EVTYPE[grepl(pattern = "HURRICANE", x = data_set_mini$EVTYPE)] <-
"HURRICANE (TYPHONE)"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*SNOW)(?=.*LAKE)", x = data_set_mini$EVTYPE,
perl = TRUE)] <- "LAKE-EFFECT SNOW"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*SNOW)(?!.*LAKE)", x = data_set_mini$EVTYPE,
perl = TRUE)] <- "HEAVY SNOW"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*STRONG)(?=.*WIND)(?!.*MARINE)",
x = data_set_mini$EVTYPE, perl = TRUE)] <- "STRONG WIND"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*TSTM)(?=.*WIND)(?!.*MARINE)|
^(?=.*THUNDERSTORM)(?=.*WIND)(?!.*MARINE)",
x = data_set_mini$EVTYPE, perl = TRUE)] <- "THUNDERSTORM WIND"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*TSTM)(?=.*WIND)(?=.*MARINE)",
x = data_set_mini$EVTYPE,perl = TRUE)] <-
"MARINE THUNDERSTORM WIND"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*THUNDERSTORM)(?=.*WIND)(?!.*MARINE)",
x = data_set_mini$EVTYPE,perl = TRUE)] <- "THUNDERSTORM WIND"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*EXTREME)(?=.*WIND)(?=.*CHILL)",
x = data_set_mini$EVTYPE,
perl = TRUE)] <- "EXTREME COLD/WIND CHILL"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*WIND)(?=.*CHILL)(?!.*EXTREME)",
x = data_set_mini$EVTYPE, perl = TRUE)] <- "COLD/WIND CHILL"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*WIND)(?!.*MARINE)
(?!.*CHILL)(?!.*STRONG)(?!.*THUNDERSTORM)",
x = data_set_mini$EVTYPE, perl = TRUE)] <- "HIGH WIND"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*TSTM)(?=.*WIND)(?=.*MARINE)",
x = data_set_mini$EVTYPE,perl = TRUE)] <-
"MARINE THUNDERSTORM WIND"
data_set_mini$EVTYPE[grepl(pattern = "CURRENT",
x = data_set_mini$EVTYPE)] <- "RIP CURRENT"
data_set_mini$EVTYPE[grepl(pattern = "SLEET", x = data_set_mini$EVTYPE)] <- "SLEET"
data_set_mini$EVTYPE[grepl(pattern = "STORM SURGE", x = data_set_mini$EVTYPE)] <-
"STORM SURGE/TIDE"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*THUNDERSTORM)(?!.*MARINE)",
x = data_set_mini$EVTYPE, perl = TRUE)] <- "THUNDERSTORM WIND"
data_set_mini$EVTYPE[grepl(pattern = "TORNADO", x = data_set_mini$EVTYPE)] <- "TORNADO"
data_set_mini$EVTYPE[grepl(pattern = "VOLCANIC", x = data_set_mini$EVTYPE)] <-
"VOLCANIC ASH"
data_set_mini$EVTYPE[grepl(pattern = "WATERSPOUT", x = data_set_mini$EVTYPE)] <-
"WATERSPOUT"
data_set_mini$EVTYPE[grepl(pattern = "^(?=.*WINTER)(?!.*STORM)",
x = data_set_mini$EVTYPE, perl = TRUE)] <- "WINTER WEATHER"
harms <- aggregate(FATALITIES + INJURIES ~ EVTYPE, data = data_set_mini, sum)
names(harms) <- c('EVTYPE', 'TOTAL.AMOUNT')
# Arrange the result
harms <- arrange(harms, desc(TOTAL.AMOUNT), EVTYPE)
head(harms, 10)
## EVTYPE TOTAL.AMOUNT
## 1 TORNADO 97043
## 2 THUNDERSTORM WIND 10119
## 3 EXCESSIVE HEAT 8497
## 4 FLOOD 7279
## 5 LIGHTNING 6046
## 6 HEAT 3037
## 7 FLASH FLOOD 2837
## 8 ICE STORM 2064
## 9 WINTER STORM 1527
## 10 HAIL 1512
# Subset from 1996
data_set_mini.sub <- data_set_mini %>% filter(year(data_set_mini$BGN_DATE) > 1996)
harms_sub <- aggregate(FATALITIES + INJURIES ~ EVTYPE, data = data_set_mini.sub, sum)
names(harms_sub) <- c('EVTYPE', 'TOTAL.AMOUNT')
# Arrange the result
harms_sub <- arrange(harms_sub, desc(TOTAL.AMOUNT), EVTYPE)
head(harms_sub, 10)
## EVTYPE TOTAL.AMOUNT
## 1 TORNADO 21447
## 2 EXCESSIVE HEAT 8095
## 3 FLOOD 7123
## 4 THUNDERSTORM WIND 5063
## 5 LIGHTNING 4424
## 6 FLASH FLOOD 2425
## 7 HEAT 1459
## 8 HURRICANE (TYPHONE) 1393
## 9 WINTER STORM 1209
## 10 HIGH WIND 1181
my_plot_fun <- function(my_data) {
my_data %>% ggplot(aes(x=EVTYPE, y=TOTAL.AMOUNT, fill=EVTYPE)) +
geom_bar(stat="identity",color="black",alpha=0.6) +
theme_linedraw() +
theme(
plot.title = element_text(color="red", size=18, face="bold.italic",hjust=0.5),
axis.title.x = element_text(color="#993333", size=18, face="bold"),
axis.text.x = element_text(color="#993333", size=14, angle = 90,vjust = 0.5, hjust=1),
axis.title.y = element_text(color="darkgreen", size=18, face="bold"),
axis.text.y = element_text(face="bold", color="darkgreen", size=14),
legend.text = element_text(size=12)
)
}
harms$EVTYPE <- factor(harms$EVTYPE, levels = harms$EVTYPE)
harms_plot <- my_plot_fun(harms[1:10,]) +
xlab("Event Type") + ylab("Total amount of injuries and fatalities") +
ggtitle("The 10 worst weather event fatalities, all years")
harms_sub$EVTYPE <- factor(harms_sub$EVTYPE, levels = harms_sub$EVTYPE)
harms_sub_plot <- my_plot_fun(harms_sub[1:10,]) +
xlab("Event Type") + ylab("Total amount of injuries and fatalities") +
labs(
title = "The 10 worst weather event fatalities, since 1996",
caption = "This plot contain the most harmful meteorologic events type when regexp is
applied and since 1996. All are arranged in this barplot.") +
theme(plot.caption = element_text(color = "darkblue", face = "italic", size = 12))
harms_sub_plot plot to respect
consignprint(harms_sub_plot)
data_set_mini_2 <- data_set[,my_cols]
harms_2 <- aggregate(FATALITIES + INJURIES ~ EVTYPE, data = data_set_mini_2, sum)
names(harms_2) <- c('EVTYPE', 'TOTAL.AMOUNT')
# Arrange the result
harms_2 <- arrange(harms_2, desc(TOTAL.AMOUNT), EVTYPE)
head(harms_2)
## EVTYPE TOTAL.AMOUNT
## 1 TORNADO 96979
## 2 EXCESSIVE HEAT 8428
## 3 TSTM WIND 7461
## 4 FLOOD 7259
## 5 LIGHTNING 6046
## 6 HEAT 3037
data_set_mini_2.sub <- data_set_mini_2 %>% filter(year(data_set_mini_2$BGN_DATE) > 1996)
harms_2_sub <- aggregate(FATALITIES + INJURIES ~ EVTYPE, data = data_set_mini_2.sub, sum)
names(harms_2_sub) <- c('EVTYPE', 'TOTAL.AMOUNT')
# Arrange the result
harms_2_sub <- arrange(harms_2_sub, desc(TOTAL.AMOUNT), EVTYPE)
head(harms_2_sub, 10)
## EVTYPE TOTAL.AMOUNT
## 1 TORNADO 21447
## 2 EXCESSIVE HEAT 8093
## 3 FLOOD 7121
## 4 LIGHTNING 4424
## 5 TSTM WIND 3524
## 6 FLASH FLOOD 2425
## 7 THUNDERSTORM WIND 1530
## 8 HEAT 1459
## 9 HURRICANE/TYPHOON 1339
## 10 WINTER STORM 1209
harms_2$EVTYPE <- factor(harms_2$EVTYPE, levels = harms_2$EVTYPE)
harms_2_plot <- my_plot_fun(harms_2[1:10,]) +
xlab("Event Type") + ylab("Total amount of injuries and fatalities") +
ggtitle("The 10 worst weather event fatalities, all years, without regexp")
harms_2_sub$EVTYPE <- factor(harms_2_sub$EVTYPE, levels = harms_2_sub$EVTYPE)
harms_2_sub_plot <- my_plot_fun(harms_2_sub[1:10,]) +
xlab("Event Type") + ylab("Total amount of injuries and fatalities") +
labs(
title = "The 10 worst weather events type fatalities, since 1996, without regexp",
caption = "This plot contain the most harmful meteorologic events type when
no (any) regexp is applied and since 1996. All are arranged in this barplot.") +
theme(plot.caption = element_text(color = "darkblue", face = "italic", size = 12))
harms_2_sub_plot to respect consign
:print(harms_2_sub_plot)
my_df_1<-merge (harms[1:10,],harms_sub[1:10,], by="EVTYPE")
my_df_2<-merge (harms_2[1:10,],harms_2_sub[1:10,], by="EVTYPE")
my_merge <- merge(my_df_1,my_df_2, by="EVTYPE")
names(my_merge) <- c("EVTYPE","TOTAL.regexp.ALL.YEARS","TOTAL.regexp.SINCE.1996",
"TOTAL.NOregexp.ALL.YEARS","TOTAL.NOregexp.SINCE.1996")
my_merge <- arrange(my_merge, desc(TOTAL.regexp.ALL.YEARS))
head(my_merge)
## EVTYPE TOTAL.regexp.ALL.YEARS TOTAL.regexp.SINCE.1996
## 1 TORNADO 97043 21447
## 2 THUNDERSTORM WIND 10119 5063
## 3 EXCESSIVE HEAT 8497 8095
## 4 FLOOD 7279 7123
## 5 LIGHTNING 6046 4424
## 6 HEAT 3037 1459
## TOTAL.NOregexp.ALL.YEARS TOTAL.NOregexp.SINCE.1996
## 1 96979 21447
## 2 1621 1530
## 3 8428 8093
## 4 7259 7121
## 5 6046 4424
## 6 3037 1459
In the section I. 4., the first function is construct to convert
the character-exponents into numeric format and keep numeric exponent
as-is. Then the second function converts empty/space value to zero and
any other strings to numeric value.
damages <- aggregate(CROPDMG + PROPDMG ~ EVTYPE, data = data_set_mini, sum)
names(damages) <- c('EVTYPE', 'ECONOMIC.DAMAGES')
damages <- arrange(damages, desc(ECONOMIC.DAMAGES), EVTYPE)
damages_sub <- aggregate(CROPDMG + PROPDMG ~ EVTYPE, data = data_set_mini.sub, sum)
names(damages_sub) <- c('EVTYPE', 'ECONOMIC.DAMAGES')
damages_sub <- arrange(damages_sub, desc(ECONOMIC.DAMAGES), EVTYPE)
damages_2 <- aggregate(CROPDMG + PROPDMG ~ EVTYPE, data = data_set_mini_2, sum)
names(damages_2) <- c('EVTYPE', 'ECONOMIC.DAMAGES')
damages_2 <- arrange(damages_2, desc(ECONOMIC.DAMAGES), EVTYPE)
damages_2_sub <- aggregate(CROPDMG + PROPDMG ~ EVTYPE, data = data_set_mini_2.sub, sum)
names(damages_2_sub) <- c('EVTYPE', 'ECONOMIC.DAMAGES')
damages_2_sub <- arrange(damages_2_sub, desc(ECONOMIC.DAMAGES), EVTYPE)
my_plot_fun_2 <- function(my_data) {
my_data %>% ggplot(aes(x=EVTYPE, y=ECONOMIC.DAMAGES, fill=EVTYPE)) +
geom_bar(stat="identity",color="black",alpha=0.6) +
theme_linedraw() +
theme(
plot.title = element_text(color="red", size=18, face="bold.italic",hjust=0.5),
axis.title.x = element_text(color="#993333", size=18, face="bold"),
axis.text.x = element_text(color="#993333", size=14, angle = 90,vjust = 0.5, hjust=1),
axis.title.y = element_text(color="darkgreen", size=18, face="bold"),
axis.text.y = element_text(face="bold", color="darkgreen", size=14),
legend.text = element_text(size=14)
)
}
damages$EVTYPE <- factor(damages$EVTYPE, levels = damages$EVTYPE)
damages_plot <- my_plot_fun_2(damages[1:10,]) +
xlab("Event Type") + ylab("Total crop and proprety damages") +
ggtitle("The 10 most destructive events, all yeras and with regexp") +
scale_y_continuous(
breaks = c(0, 50000000000, 100000000000, 150000000000, 200000000000,
250000000000, 300000000000, 350000000000),
labels = c("0","50 000 000 000", "100 000 000 000", "150 000 000 000",
"200 000 000 000", "250 000 000 000", "300 000 000 000",
"350 000 000 000"))
damages_sub$EVTYPE <- factor(damages_sub$EVTYPE, levels = damages_sub$EVTYPE)
damages_sub_plot <- my_plot_fun_2(damages_sub[1:10,]) +
xlab("Event Type") + ylab("Total crop and proprety damages") +
scale_y_continuous(
breaks = c(0, 50000000000, 100000000000, 150000000000, 200000000000,
250000000000, 300000000000, 350000000000),
labels = c("0","50 000 000 000", "100 000 000 000", "150 000 000 000",
"200 000 000 000", "250 000 000 000", "300 000 000 000",
"350 000 000 000")) +
labs(
title = "The 10 most destructive events, since 1996 and with regexp",
caption = "This plot contain the most destructer meteorologic events type
when regexp is applied and since 1996. All are arranged in this barplot.") +
theme(plot.caption = element_text(color = "darkblue", face = "italic", size = 12))
damages_2$EVTYPE <- factor(damages_2$EVTYPE, levels = damages_2$EVTYPE)
damages_2_plot <- my_plot_fun_2(damages_2[1:10,]) +
xlab("Event Type") + ylab("Total crop and proprety damages") +
ggtitle("The 10 most destructive events, all years and without regexp") +
scale_y_continuous(
breaks = c(0, 500000, 1000000, 1500000, 2000000, 2500000, 3000000, 3500000),
labels = c("0","500 000", "1 000 000", "1 500 000", "2 000 000", "2 500 000",
"3 000 000", "3 500 000"))
damages_2_sub$EVTYPE <- factor(damages_2_sub$EVTYPE, levels = damages_2_sub$EVTYPE)
damages_2_sub_plot <- my_plot_fun_2(damages_2_sub[1:10,]) +
xlab("Event Type") + ylab("Total crop and proprety damages") +
scale_y_continuous(
breaks = c(0, 250000, 500000, 750000, 1000000, 1250000),
labels = c("0", "250 000", "500 000", "750 000", "1 000 000", "1 250 000"))
ggtitle("The 10 most destructive events, since 1996 and without regexp")
## $title
## [1] "The 10 most destructive events, since 1996 and without regexp"
##
## attr(,"class")
## [1] "labels"
damages_sub_plot to respect
consignprint(damages_sub_plot)