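The basic goal of this reproducible research is to explore the NOAA Storm Database and find which types of weather events are most harmful to population health (fatalities and injuries) and which cause the greatest property damage.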
This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. The data can be downloaded from the Storm Data link (the URL is given in the download code below).
# Download the NOAA storm data archive once and cache it locally ----
# Source URL of the bzip2-compressed CSV.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
# Destination file name, relative to the current working directory.
filename <- "NOAA_strom_data.csv.bz2"
# Skip the download when a local copy already exists.
if (!file.exists(filename)) {
  # mode = "wb" keeps the compressed archive byte-exact on Windows. The
  # default download method is used instead of method = "curl" so that no
  # external curl binary has to be installed.
  download.file(url, destfile = filename, mode = "wb")
}
# Library used in this project
library(ggplot2)
library(dplyr)
library(cowplot)
library(kableExtra)
Below are short descriptions of the data variables.
STATE_NUM - State number; BGN_DATE - Beginning date; BGN_TIME - Beginning time; TIME_ZONE - Time zone of beginning date & time; COUNTY - County number; COUNTYNAME - County name; STATE - State code; EVTYPE - Event type; BGN_RANGE - Beginning range; BGN_AZI - Beginning direction (azimuth); BGN_LOCATI - Beginning location; END_DATE - End date; END_TIME - End time; COUNTY_END - End county; COUNTYENDN - (no description available); END_RANGE - End range; END_AZI - End direction (azimuth); END_LOCATI - End location; LENGTH - Length; WIDTH - Width; F - Fujita Scale; MAG - Magnitude; FATALITIES - Fatalities; INJURIES - Injuries; PROPDMG - Property damage value; PROPDMGEXP - Property damage value code: “K” for thousands, “M” for millions, and “B” for billions; CROPDMG - Crop damage value; CROPDMGEXP - Crop damage value code: “K” for thousands, “M” for millions, and “B” for billions; WFO - Weather prediction center; STATEOFFIC - State of office; ZONENAMES - Zone names; LATITUDE - Latitude; LONGITUDE - Longitude; LATITUDE_E - Latitude E; LONGI_E - Longitude E; REMARKS - Remarks; REFNUM - Reference number;
# Read the storm data ----
# read.csv() reads the .bz2 archive directly. The path is taken from
# `filename` (set in the download step) so the download and read steps
# cannot drift apart.
strom_data <- read.csv(filename)

# Give the first and the 35th column the names used in the rest of the
# report ("STATE_NUM" and "LONGI_E").
colnames(strom_data)[c(1, 35)] <- c("STATE_NUM", "LONGI_E")

# Column names of the data
names(strom_data)
## [1] "STATE_NUM" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGI_E"
## [36] "REMARKS" "REFNUM"
# Total fatalities and injuries per event type, computed in a single pass
# over the raw data.
harm_data <- strom_data %>%
  group_by(EVTYPE) %>%
  summarise(total_fatal = sum(FATALITIES), total_injury = sum(INJURIES))

# Event types ordered by fatalities (descending), with their injury totals.
# head() returns at most 15 rows and, unlike `[1:15, ]`, never pads the
# result with NA rows when fewer than 15 event types exist.
harm_data_sort <- harm_data[order(-harm_data$total_fatal), ]
harm_data_sort_filter <- head(harm_data_sort, 15)

# Injuries only: reuse the totals from harm_data instead of aggregating the
# raw data a second time.
injury_data <- select(harm_data, EVTYPE, total_injury)
injury_data_sort <- injury_data[order(-injury_data$total_injury), ]
injury_data_sort_filter <- head(injury_data_sort, 15)

# Fatalities only: likewise derived from harm_data.
fatal_data <- select(harm_data, EVTYPE, total_fatal)
fatal_data_sort <- fatal_data[order(-fatal_data$total_fatal), ]
fatal_data_sort_filter <- head(fatal_data_sort, 15)
The tables below show the total number of fatalities and the total number of injuries for each event type (top 15 of each).
# Render the top-15 fatality table and the top-15 injury table together.
# The caption names the tables in the same order as they appear in the list
# (fatalities first), and FALSE is spelled out because `F` is an ordinary
# variable that can be reassigned.
kable(
  list(fatal_data_sort_filter, injury_data_sort_filter),
  caption = "Top 15 Fatal & Top 15 Injuries"
) %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
|
|
Fatalities are more severe than injuries, so the table below lists the 15 event types with the most fatalities together with the number of injuries they caused.
# Render the top-15 fatality table together with the matching injury totals.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
kable(harm_data_sort_filter, caption = "Top 15 Fatal & its Injuris") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
| EVTYPE | total_fatal | total_injury |
|---|---|---|
| TORNADO | 5633 | 91346 |
| EXCESSIVE HEAT | 1903 | 6525 |
| FLASH FLOOD | 978 | 1777 |
| HEAT | 937 | 2100 |
| LIGHTNING | 816 | 5230 |
| TSTM WIND | 504 | 6957 |
| FLOOD | 470 | 6789 |
| RIP CURRENT | 368 | 232 |
| HIGH WIND | 248 | 1137 |
| AVALANCHE | 224 | 170 |
| WINTER STORM | 206 | 1321 |
| RIP CURRENTS | 204 | 297 |
| HEAT WAVE | 172 | 309 |
| EXTREME COLD | 160 | 231 |
| THUNDERSTORM WIND | 133 | 1488 |
# Max, min and rounded mean of the per-event-type fatality totals, as one
# sentence.
with(
  fatal_data_sort,
  paste(
    "Maximum, minimum & mean of Fatalities after the fatalities are grouped by event type are",
    max(total_fatal),
    ",",
    min(total_fatal),
    ",",
    round(mean(total_fatal), digits = 2),
    "respectively."
  )
)
## [1] "Maximum, minimum & mean of Fatalities after the fatalities are grouped by event type are 5633 , 0 , 15.38 respectively."
# Max, min and rounded mean of the per-event-type injury totals, as one
# sentence.
with(
  injury_data_sort,
  paste(
    "Maximum, minimum & mean of Injuries after the injuries are grouped by event type are",
    max(total_injury),
    ",",
    min(total_injury),
    ",",
    round(mean(total_injury), digits = 2),
    "respectively."
  )
)
## [1] "Maximum, minimum & mean of Injuries after the injuries are grouped by event type are 91346 , 0 , 142.67 respectively."
The bar charts below show the 15 event types with the most fatalities and the 15 event types with the most injuries. The value shown above each bar is that event type's percentage share of the grand total of fatalities or injuries over all event types (not of the total of the 15 shown).
# Grand totals over ALL event types (not only the top 15); these are the
# denominators of the percentage labels.
grand_total_fatal <- sum(fatal_data_sort$total_fatal)
grand_total_injury <- sum(injury_data_sort$total_injury)

# Percentage share of the grand total for each top-15 event type, rounded to
# two decimals. Computed on the whole column at once: no `1:nrow()` loop
# (which misbehaves on zero rows) and no element-wise assignment into a
# column that does not exist yet.
fatal_data_sort_filter$Percent <- round(
  100 * fatal_data_sort_filter$total_fatal / grand_total_fatal,
  digits = 2
)
injury_data_sort_filter$Percent <- round(
  100 * injury_data_sort_filter$total_injury / grand_total_injury,
  digits = 2
)

# Turn EVTYPE into a factor whose level order is the current (sorted) row
# order, so anything that orders by factor level follows the ranking instead
# of the alphabet.
fatal_data_sort_filter$EVTYPE <- factor(
  fatal_data_sort_filter$EVTYPE,
  levels = fatal_data_sort_filter$EVTYPE
)
injury_data_sort_filter$EVTYPE <- factor(
  injury_data_sort_filter$EVTYPE,
  levels = injury_data_sort_filter$EVTYPE
)
# Shared text sizes for the bar charts in this report.
plot_theme <- theme(
  axis.text = element_text(size = 28),
  axis.title = element_text(size = 24, face = "bold"),
  plot.title = element_text(size = 30, face = "bold"),
  legend.text = element_text(size = 16)
)

# Bar chart of the top-15 fatality totals; each bar is labelled with its
# Percent value. The label aesthetic refers to the Percent column of the
# plot data rather than `fatal_data_sort_filter$Percent`, because ggplot2
# discourages `$` inside aes().
fatal_plot <- ggplot(fatal_data_sort_filter, aes(x = EVTYPE, y = total_fatal)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = Percent), hjust = -0.25, angle = 90, size = 5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    x = "Event Type",
    y = "Total Fatalities",
    title = "Top 15 Fatalities & it's percentage"
  ) +
  plot_theme

# Same chart for the top-15 injury totals.
injury_plot <- ggplot(injury_data_sort_filter, aes(x = EVTYPE, y = total_injury)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = Percent), hjust = -0.25, angle = 90, size = 5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    x = "Event Type",
    y = "Total Injuries",
    title = "Top 15 Injuries & it's percentage"
  ) +
  plot_theme

# Place the two charts next to each other in a two-column grid.
plot_grid(fatal_plot, injury_plot, ncol = 2)
The property damage values are given in column “PROPDMG” and the magnitude of the damage is given in column “PROPDMGEXP”. The “PROPDMGEXP” column has the following levels: “- ? + 0 1 2 3 4 5 6 7 8 B h H K m M”.
For the estimate we considered only the levels “H”/“h” (hundreds), “K” (thousands), “M”/“m” (millions) and “B” (billions), and ignored all other levels because their meaning is unknown.
# Property damage per event type, in millions of dollars ----
# Multiplier for each recognised PROPDMGEXP code. Codes are matched
# case-insensitively; rows with any other code are dropped.
prop_exp_multiplier <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)

prop_data <- select(strom_data, EVTYPE, PROPDMG, PROPDMGEXP)

# One pass over the data replaces the four separate filter/multiply steps:
# look up the multiplier, drop rows without one (unrecognised codes index to
# NA), then convert PROPDMG to dollars and scale to millions.
prop_data_total <- prop_data %>%
  mutate(
    multiplier = unname(prop_exp_multiplier[toupper(as.character(PROPDMGEXP))])
  ) %>%
  filter(!is.na(multiplier)) %>%
  mutate(Prop_Dmg_total = PROPDMG * multiplier / 1000000) %>%
  select(-multiplier)

# Total damage per event type, sorted from largest to smallest.
prop_dmg_by_type <- prop_data_total %>%
  group_by(EVTYPE) %>%
  summarise(prop_total_damage = sum(Prop_Dmg_total))
prop_dmg_by_type_sort <- prop_dmg_by_type[order(-prop_dmg_by_type$prop_total_damage), ]
# Max, min and rounded mean of the per-event-type property damage totals
# (Mn$), as one sentence.
with(
  prop_dmg_by_type_sort,
  paste(
    "Maximum, minimum & mean of property damage after the damage value is grouped by event type in Mn$ are",
    max(prop_total_damage),
    ",",
    min(prop_total_damage),
    ",",
    round(mean(prop_total_damage), digits = 2),
    "respectively."
  )
)
## [1] "Maximum, minimum & mean of property damage after the damage value is grouped by event type in Mn$ are 144657.7098 , 0 , 1055.11 respectively."
The table below shows the 15 event types with the highest property damage. The values are the total property damage per event type in Mn$.
# Keep the 15 event types with the highest property damage. head() returns
# at most 15 rows and, unlike `[1:15, ]`, never pads with NA rows when fewer
# exist.
prop_dmg_by_type_sort_filter <- head(prop_dmg_by_type_sort, 15)

# Render the table. FALSE is spelled out because `F` is an ordinary variable
# that can be reassigned.
kable(prop_dmg_by_type_sort_filter, caption = "Top 15 Property damages in Mn$") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
| EVTYPE | prop_total_damage |
|---|---|
| FLOOD | 144657.710 |
| HURRICANE/TYPHOON | 69305.840 |
| TORNADO | 56937.160 |
| STORM SURGE | 43323.536 |
| FLASH FLOOD | 16140.812 |
| HAIL | 15732.267 |
| HURRICANE | 11868.319 |
| TROPICAL STORM | 7703.891 |
| WINTER STORM | 6688.497 |
| HIGH WIND | 5270.046 |
| RIVER FLOOD | 5118.945 |
| WILDFIRE | 4765.114 |
| STORM SURGE/TIDE | 4641.188 |
| TSTM WIND | 4484.928 |
| ICE STORM | 3944.928 |
The bar chart below shows the 15 event types with the highest property damage. The value shown above each bar is that event type's percentage share of the grand total of property damage over all event types (not of the total of the 15 shown).
# Grand total over ALL event types (not only the top 15); this is the
# denominator of the percentage labels.
grand_total_prop_damage <- sum(prop_dmg_by_type_sort$prop_total_damage)

# Percentage share of the grand total for each top-15 event type, rounded to
# two decimals. Computed on the whole column at once: no `1:nrow()` loop
# (which misbehaves on zero rows) and no element-wise assignment into a
# column that does not exist yet.
prop_dmg_by_type_sort_filter$Percent <- round(
  100 * prop_dmg_by_type_sort_filter$prop_total_damage / grand_total_prop_damage,
  digits = 2
)

# Turn EVTYPE into a factor whose level order is the current (sorted) row
# order, so anything that orders by factor level follows the ranking instead
# of the alphabet.
prop_dmg_by_type_sort_filter$EVTYPE <- factor(
  prop_dmg_by_type_sort_filter$EVTYPE,
  levels = prop_dmg_by_type_sort_filter$EVTYPE
)
# Bar chart of the top-15 property damage totals; each bar is labelled with
# its Percent value. The label aesthetic refers to the Percent column of the
# plot data rather than `prop_dmg_by_type_sort_filter$Percent`, because
# ggplot2 discourages `$` inside aes().
prop_plot <- ggplot(
  prop_dmg_by_type_sort_filter,
  aes(x = EVTYPE, y = prop_total_damage)
) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = Percent), hjust = -0.25, angle = 90, size = 5) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    x = "Event Type",
    y = "Total Damages Mn$",
    title = "Top 15 Property Damages in Mn$"
  ) +
  plot_theme
print(prop_plot)