The basic goal of this assignment is to explore the NOAA Storm Database and answer some basic questions about severe weather events.
This report is divided into two main sections: data processing and results. In the data processing section, the data set is downloaded from its URL and read into R. The data set is then processed and a new variable holding the total damages (TOTALDMG) is added. In the results section, plots are produced to answer two basic questions. The first set of plots shows that the most harmful event type with respect to population health (fatalities and injuries) across the US is Tornado. The second set shows that the event types with the greatest economic consequences across the US for crop and property damage are Hail and Tornado, respectively.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.2
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.2.2
# Fetch the NOAA storm data set and load it into a data frame.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

# Work in whatever directory the script is run from instead of switching to a
# machine-specific folder, which fails on any other computer. `path` keeps the
# value it had before: the directory that was current when the script started.
path <- getwd()

# The URL serves a bzip2 archive, so it is written in binary mode. The download
# is skipped when a local copy already exists, so re-running the analysis does
# not fetch the file again.
if (!file.exists("stormdata.csv")) {
  download.file(url, "stormdata.csv", mode = "wb")
}

# read.csv() opens the file through a connection that detects the compression,
# so the archive is read directly despite its ".csv" name.
storm_data <- read.csv("stormdata.csv", header = TRUE, sep = ",")
# Size of the raw table as (rows, columns).
c(nrow(storm_data), ncol(storm_data))
## [1] 902297 37
# Names of every column available in the raw table.
names(storm_data)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
# Keep only the columns used in the analysis, parse the two date columns
# (month/day/year text) into POSIXct, and turn the event type and state into
# factors. The column order matches the order listed in select().
# NOTE(review): PROPDMGEXP and CROPDMGEXP are not retained. If they carry the
# magnitude of PROPDMG / CROPDMG, every damage figure computed below is
# unscaled -- confirm before relying on the economic results.
storm_data <- storm_data %>%
  select(
    EVTYPE, FATALITIES, INJURIES, PROPDMG, CROPDMG,
    STATE, BGN_DATE, END_DATE
  ) %>%
  mutate(
    BGN_DATE = as.POSIXct(BGN_DATE, format = "%m/%d/%Y"),
    END_DATE = as.POSIXct(END_DATE, format = "%m/%d/%Y"),
    EVTYPE = as.factor(EVTYPE),
    STATE = as.factor(STATE)
  )
# Per-column overview of the reduced data set.
summary(storm_data)
## EVTYPE FATALITIES INJURIES
## HAIL :288661 Min. : 0.0000 Min. : 0.0000
## TSTM WIND :219940 1st Qu.: 0.0000 1st Qu.: 0.0000
## THUNDERSTORM WIND: 82563 Median : 0.0000 Median : 0.0000
## TORNADO : 60652 Mean : 0.0168 Mean : 0.1557
## FLASH FLOOD : 54277 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## FLOOD : 25326 Max. :583.0000 Max. :1700.0000
## (Other) :170878
## PROPDMG CROPDMG STATE
## Min. : 0.00 Min. : 0.000 TX : 83728
## 1st Qu.: 0.00 1st Qu.: 0.000 KS : 53440
## Median : 0.00 Median : 0.000 OK : 46802
## Mean : 12.06 Mean : 1.527 MO : 35648
## 3rd Qu.: 0.50 3rd Qu.: 0.000 IA : 31069
## Max. :5000.00 Max. :990.000 NE : 30271
## (Other):621339
## BGN_DATE END_DATE
## Min. :1950-01-03 00:00:00.000 Min. :1986-04-10 00:00:00.00
## 1st Qu.:1995-04-20 00:00:00.000 1st Qu.:2000-09-01 00:00:00.00
## Median :2002-03-18 00:00:00.000 Median :2005-04-30 00:00:00.00
## Mean :1998-12-27 23:37:48.996 Mean :2004-09-26 04:11:27.33
## 3rd Qu.:2007-07-28 00:00:00.000 3rd Qu.:2008-08-10 00:00:00.00
## Max. :2011-11-30 00:00:00.000 Max. :2011-11-30 00:00:00.00
## NA's :243411
# Add TOTALDMG: the row-wise sum of the property and crop damage columns.
storm_data[["TOTALDMG"]] <- with(storm_data, PROPDMG + CROPDMG)
# Show the structure of the data set with the new column included.
str(storm_data)
## 'data.frame': 902297 obs. of 9 variables:
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ BGN_DATE : POSIXct, format: "1950-04-18" "1950-04-18" ...
## $ END_DATE : POSIXct, format: NA NA ...
## $ TOTALDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
# Population-health impact: total fatalities and total injuries per event type,
# keeping the ten largest of each, shown as two bar charts side by side.

# Ten event types with the most recorded fatalities, largest first.
fatalities <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(totalFatalities = sum(FATALITIES)) %>%
  arrange(desc(totalFatalities)) %>%
  head(n = 10)
# print() rather than View(): View() opens an interactive viewer, which adds
# nothing to a rendered report and may fail when no display is available.
print(fatalities)

# Bars are ordered by descending total via reorder() on the negated value.
plot1 <- ggplot(
  fatalities,
  aes(x = reorder(EVTYPE, -totalFatalities), y = totalFatalities)
) +
  geom_bar(stat = "identity", fill = "red") +
  ggtitle("Graph: Fatalities by type of catastrophe") +
  xlab("") +
  ylab("Fatalities") +
  ylim(0, 100000) + # same y range as the injuries panel
  theme(
    text = element_text(size = 10),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Ten event types with the most recorded injuries, largest first.
injuries <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(totalInjuries = sum(INJURIES)) %>%
  arrange(desc(totalInjuries)) %>%
  head(n = 10)
print(injuries)

plot2 <- ggplot(
  injuries,
  aes(x = reorder(EVTYPE, -totalInjuries), y = totalInjuries)
) +
  geom_bar(stat = "identity", fill = "green") +
  ggtitle("Graph: Injuries by type of catastrophe") +
  xlab("") +
  ylab("Injuries") +
  ylim(0, 100000) + # same y range as the fatalities panel
  theme(
    text = element_text(size = 10),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Draw both charts next to each other.
grid.arrange(plot1, plot2, ncol = 2)
# Economic impact: per event type, sum the property, crop and combined damage
# columns, divide each sum by one million, and keep the ten event types with
# the largest combined figure.
# NOTE(review): the plots label these values "Cost in millions"; that holds
# only if PROPDMG / CROPDMG are plain currency amounts -- confirm their units.
economic <- storm_data %>%
  group_by(EVTYPE) %>%
  summarise(
    property_damage = sum(PROPDMG) / 1e6,
    crop_damage = sum(CROPDMG) / 1e6,
    total_damage = sum(TOTALDMG) / 1e6
  ) %>%
  arrange(desc(total_damage)) %>%
  head(n = 10)

# Property damage per event type, bars sorted from largest to smallest.
plot3 <- ggplot(
  economic,
  aes(x = reorder(EVTYPE, -property_damage), y = property_damage)
) +
  geom_col(fill = "green") +
  labs(
    title = "Cost of Property Damage",
    x = "Event type",
    y = "Cost in millions"
  ) +
  scale_y_continuous(limits = c(0, 4)) +
  theme(
    text = element_text(size = 10),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Crop damage per event type, bars sorted from largest to smallest.
plot4 <- ggplot(
  economic,
  aes(x = reorder(EVTYPE, -crop_damage), y = crop_damage)
) +
  geom_col(fill = "red") +
  labs(
    title = "Cost of Crop Damage",
    x = "Event type",
    y = "Cost in millions"
  ) +
  scale_y_continuous(limits = c(0, 4)) +
  theme(
    text = element_text(size = 10),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Combined damage per event type, bars sorted from largest to smallest.
plot5 <- ggplot(
  economic,
  aes(x = reorder(EVTYPE, -total_damage), y = total_damage)
) +
  geom_col(fill = "blue") +
  labs(
    title = "Economic Impact of Events",
    x = "Event type",
    y = "Cost in millions"
  ) +
  scale_y_continuous(limits = c(0, 4)) +
  theme(
    text = element_text(size = 10),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# Show the three charts in one row; all share the same 0-4 y range.
grid.arrange(plot3, plot4, plot5, ncol = 3)