Author: Cristian MHT
This document presents a brief analysis of data on natural disasters. It is an assignment for the “Reproducible Research” class, so the goal is for the whole analysis to be reproducible.
Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. Many severe events can result in fatalities, injuries, and property damage, and preventing such outcomes to the extent possible is a key concern.
This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
We need to download the data; the link is “Storm Data” and the file is a comma-separated-value file compressed with bzip2 (csv.bz2).
We need the following libraries:
1 ggplot2
2 dplyr
3 lubridate
4 gridExtra
The code needed to load these libraries is:
library(ggplot2)
library(dplyr)
library(lubridate)
library(gridExtra)
Now we create the directory and download the data:
# Create the project directory (if needed) and fetch the compressed Storm Data
# file once. The archive is first written to a ".part" path and only renamed
# after a successful transfer, so an interrupted download is never mistaken
# for a complete file on the next run.
if (!dir.exists("Project_2_RR")) {
  dir.create("Project_2_RR")
}
URL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if (!file.exists("Project_2_RR/data.csv.bz2")) {
  partial_path <- "Project_2_RR/data.csv.bz2.part"
  # mode = "wb": the archive is binary, so request a binary transfer explicitly.
  status <- download.file(URL, destfile = partial_path, mode = "wb")
  # download.file() returns 0 on success; anything else is treated as failure.
  if (status != 0 || !file.rename(partial_path, "Project_2_RR/data.csv.bz2")) {
    unlink(partial_path)
    stop("Download of the storm data failed", call. = FALSE)
  }
}
We read the data and look at the first rows:
# Load the downloaded file into a data frame (fields separated by commas).
data <- read.csv(file = "Project_2_RR/data.csv.bz2", sep = ",")
# Preview the first six rows.
head(data, n = 6L)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE EVTYPE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL TORNADO
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL TORNADO
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL TORNADO
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL TORNADO
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL TORNADO
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL TORNADO
## BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1 0 0 NA
## 2 0 0 NA
## 3 0 0 NA
## 4 0 0 NA
## 5 0 0 NA
## 6 0 0 NA
## END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1 0 14.0 100 3 0 0 15 25.0
## 2 0 2.0 150 2 0 0 0 2.5
## 3 0 0.1 123 2 0 0 2 25.0
## 4 0 0.0 100 2 0 0 2 2.5
## 5 0 0.0 150 2 0 0 2 2.5
## 6 0 1.5 177 2 0 0 6 2.5
## PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1 K 0 3040 8812
## 2 K 0 3042 8755
## 3 K 0 3340 8742
## 4 K 0 3458 8626
## 5 K 0 3412 8642
## 6 K 0 3450 8748
## LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3051 8806 1
## 2 0 0 2
## 3 0 0 3
## 4 0 0 4
## 5 0 0 5
## 6 0 0 6
Let’s transform the column labelled BGN_DATE, which holds the date on which each natural disaster began, into a Date.
# Replace the BGN_DATE column with Date values parsed from the
# month/day/year hour:minute:second text.
data$BGN_DATE <- as.Date(data$BGN_DATE, format = "%m/%d/%Y %H:%M:%S")
# Show the structure of the converted column.
str(data[["BGN_DATE"]])
## Date[1:389165], format: "1950-04-18" "1950-04-18" "1951-02-20" "1951-06-08" "1951-11-15" ...
# First six converted dates.
head(data[["BGN_DATE"]], n = 6L)
## [1] "1950-04-18" "1950-04-18" "1951-02-20" "1951-06-08" "1951-11-15"
## [6] "1951-11-15"
We also convert the property damage, which is stored as a number plus a character exponent code, into a single numeric value.
# Exponent codes for which the damage figure is used as-is (multiplier of 1).
# NOTE(review): "h"/"H" may denote hundreds and the digits powers of ten --
# confirm against the NOAA documentation; they are left unscaled here, exactly
# as before.
character_prop <- c("", "+", "0", "5", "6", "?", "4", "2", "3", "h", "7", "H", "-", "1", "8", " ")
# Add `financial`: PROPDMG scaled by the magnitude letter in PROPDMGEXP
# (K = thousand, M = million, B = billion, matched case-insensitively).
# Columns are referenced by bare name so the expression is evaluated inside
# the data mask instead of reaching back into the global `data` object.
# Rows whose code matches none of the branches get NA.
data <- data %>%
  mutate(
    financial = case_when(
      toupper(PROPDMGEXP) == "K" ~ PROPDMG * 1e3,
      toupper(PROPDMGEXP) == "M" ~ PROPDMG * 1e6,
      toupper(PROPDMGEXP) == "B" ~ PROPDMG * 1e9,
      PROPDMGEXP %in% character_prop ~ PROPDMG
    )
  )
Now we will look at summaries of the most important columns.
# Number of observations recorded under each STATE code.
table(data[["STATE"]])
##
## AK AL AM AR AS AZ CA CO CT DC DE FL GA
## 1602 10500 1 13978 49 2684 3621 8890 1401 181 708 12768 11954
## GU HI IA ID IL IN KS KY LA MA MD ME MH
## 102 733 12753 2387 11709 9438 22987 7194 9973 2087 2702 1323 1
## MI MN MO MS MT NC ND NE NH NJ NM NV NY
## 8034 9715 13819 9568 4457 9993 5669 13514 1068 2739 2976 1123 8779
## OH OK OR PA PR RI SC SD ST TN TX UT VA
## 11849 27508 1377 10086 818 381 6816 7722 1 8211 45028 1343 7192
## VI VT WA WI WV WY
## 131 1441 837 8849 3541 2854
The table lists 58 state and territory codes; the one with the most observations is TX (45028).
We will see initial date and end date.
# Range and quartiles of the event begin dates.
summary(data[["BGN_DATE"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## "1950-01-03" "1983-07-01" "1993-05-29" "1988-11-24" "1997-05-27" "2000-12-31"
The first date is 1950-01-03 and the last date is 2000-12-31.
There are 20 distinct time-zone codes; some of them are spelling variants of the same zone (for example CSt and CST).
# Number of observations recorded under each TIME_ZONE code.
table(data[["TIME_ZONE"]])
##
## ADT AST CDT CSC CSt CST EDT ESt EST ESY GMT
## 3 2548 692 1 4 285238 569 2 68703 1 1
## HST MDT MST PDT PST SCT SST UNK UTC
## 721 99 17845 154 12445 2 124 9 4
CST alone accounts for 285238 observations.
Summary of Fatalities:
# Distribution of fatalities per recorded event.
summary(data[["FATALITIES"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0239 0.0000 583.0000
Min obs. is 0 and max obs is 583.
Summary of Injuries:
# Distribution of injuries per recorded event.
summary(data[["INJURIES"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.274 0.000 1700.000
Min obs. is 0 and max obs is 1700.
The first graphic shows the number of fatalities per year, for years with at least one fatality or injury:
# Yearly totals of fatalities and injuries, keeping only years in which at
# least one of the two totals is positive.
# `year` is the first day of each event's year (a Date). Building it with
# floor_date() keeps the plotted dates identical from run to run; parsing a
# bare "%Y" string with as.Date() would fill in the month and day of the day
# the report happens to be rendered.
data_FI <- data %>%
  mutate(year = floor_date(BGN_DATE, unit = "year")) %>%
  group_by(year) %>%
  summarise(
    total_f = sum(FATALITIES, na.rm = TRUE),
    total_i = sum(INJURIES, na.rm = TRUE)
  ) %>%
  filter(total_f > 0 | total_i > 0)

# Fatalities per year as points with a smoothed trend. The red vertical line
# marks the 43rd year present in the summary (1992 if every year from 1950 on
# is present -- confirm).
x <- ggplot(data = data_FI, aes(x = year, y = total_f))
x +
  geom_point(colour = "turquoise1") +
  geom_vline(xintercept = data_FI$year[43], colour = "red", size = 1.1) +
  geom_smooth() +
  labs(x = "Years", y = "Numbers of Fatalities", title = " Fatalities for year")
We can see that there is a difference between the period before the red line and the period after it. The red line is drawn at the year data_FI$year[43]; after it, the number of fatalities increases.
The second graphic shows the number of injuries per year, for the same years:
# Injuries per year as points with a smoothed trend; the horizontal black
# line marks the median of the yearly totals.
x <- ggplot(data = data_FI, mapping = aes(x = year, y = total_i))
x +
  geom_point(colour = "firebrick") +
  geom_smooth() +
  geom_hline(
    yintercept = median(data_FI[["total_i"]]),
    colour = "black",
    size = 1.1
  ) +
  labs(x = "Years", y = "Numbers of Injuries", title = " Injuries for Year")
# Numeric summary of the yearly injury totals.
summary(data_FI[["total_i"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 524.0 922.5 1355.0 2091.1 2464.0 11177.0
Here we can see that the yearly injuries stay close to the black line, which is the median of the injuries (1355); however, the summary shows that there are also extreme values.
# Numeric summary of the yearly injury totals (shows the extreme maximum).
summary(data_FI[["total_i"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 524.0 922.5 1355.0 2091.1 2464.0 11177.0
# Box plot of the yearly injury totals.
boxplot(
  data_FI[["total_i"]],
  main = "Boxplot of Injuries",
  xlab = "Injuries",
  ylab = "Number"
)
Now we will look at fatalities and injuries by time zone:
# Total injuries and fatalities per time zone, keeping only zones where at
# least one of the two totals is positive.
# The zone code is upper-cased before grouping so that spelling variants that
# differ only by case (e.g. "CSt" and "CST") are counted as one zone; the
# `data` object itself is not modified.
# na.rm = TRUE matches the other aggregations in this report.
data_Zone <- data %>%
  mutate(TIME_ZONE = toupper(TIME_ZONE)) %>%
  group_by(TIME_ZONE) %>%
  summarise(
    total_i = sum(INJURIES, na.rm = TRUE),
    total_f = sum(FATALITIES, na.rm = TRUE)
  ) %>%
  filter(total_i > 0 | total_f > 0)

# One point per zone for each measure; the y axis is zoomed to 0-1000, so
# larger totals fall outside the visible area.
x <- ggplot(data = data_Zone, aes(TIME_ZONE))
x +
  geom_point(aes(y = total_f, colour = "Fatality")) +
  geom_point(aes(y = total_i, colour = "injuries")) +
  coord_cartesian(ylim = c(0, 1000)) +
  labs(
    x = "Time Zone",
    y = "Total People",
    title = "Fatalities and Injuries For Time Zone"
  )
The fatality totals lie between 0 and about 250; the injury totals are mostly close to 0 as well, but some of them are extreme values.
# Draw the two box plots side by side. par() returns the previous settings,
# which are restored afterwards so later base-graphics plots are not affected
# by the 1x2 layout and the custom margins.
old_par <- par(mfrow = c(1, 2), mar = c(4, 4, 2, 1))
boxplot(
  data_Zone$total_f,
  ylab = "Number",
  xlab = "Fatalities",
  main = "Boxplot of Fatalites for Time Zone"
)
boxplot(
  data_Zone$total_i,
  ylab = "Number",
  xlab = "Injuries",
  main = "Boxplot of Injuries for Time Zone"
)
par(old_par)
Now we take the two types of natural disaster with the most observations: HAIL and TSTM WIND.
# Hail events with at least one fatality, reduced to the event type, the
# fatality count and the begin date. BGN_DATE is passed through as.Date()
# again; a column that is already a Date is left unchanged by this step.
data_evtype_f <- data %>%
  filter(EVTYPE == "HAIL", FATALITIES > 0) %>%
  select(EVTYPE, FATALITIES, BGN_DATE) %>%
  mutate(BGN_DATE = as.Date(BGN_DATE, "%m/%d/%Y"))

# Fatalities per hail event over time: red points joined by a green line.
x <- ggplot(data = data_evtype_f, mapping = aes(x = BGN_DATE, y = FATALITIES))
x +
  geom_point(colour = "red") +
  geom_line(colour = "green") +
  labs(
    x = "Date Of Hail",
    y = "Numbers of Fatalities",
    title = "Date Of The Fatalities For Hail"
  )
We can see that the number of fatalities caused by hail decreases over the years.
For TSTM WIND
# TSTM WIND events with at least one fatality, reduced to the event type, the
# fatality count and the begin date. BGN_DATE is passed through as.Date()
# again; a column that is already a Date is left unchanged by this step.
data_evtype_i <- data %>%
  filter(EVTYPE == "TSTM WIND", FATALITIES > 0) %>%
  select(EVTYPE, FATALITIES, BGN_DATE) %>%
  mutate(BGN_DATE = as.Date(BGN_DATE, "%m/%d/%Y"))

# Fatalities per TSTM WIND event over time: black points joined by a yellow
# line.
x <- ggplot(data = data_evtype_i, mapping = aes(x = BGN_DATE, y = FATALITIES))
x +
  geom_point(colour = "black") +
  geom_line(colour = "yellow") +
  labs(
    x = "Date Of TSTM Wind",
    y = "Numbers of Fatalities",
    title = "Date Of The Fatalities For TSTM Wind"
  )
The fatalities caused by TSTM wind also decrease, but they are higher than those caused by hail.
Finally, we graph the fatalities, the injuries and the cost of property damage by event type.
# Per event type: total fatalities (data_f), total injuries (data_i) and the
# summed `financial` column, ignoring missing values in each sum.
data_prop <- summarise(
  group_by(data, EVTYPE),
  data_f = sum(FATALITIES, na.rm = TRUE),
  data_i = sum(INJURIES, na.rm = TRUE),
  financial = sum(financial, na.rm = TRUE)
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Five event types with the most fatalities, drawn as a bar chart.
data_ef <- data_prop %>%
  arrange(desc(data_f)) %>%
  head(5)
p1 <- ggplot(data = data_ef, mapping = aes(y = data_f, x = EVTYPE)) +
  geom_col(aes(colour = EVTYPE, fill = EVTYPE)) +
  labs(
    x = "Event Type",
    y = " Fatalities",
    title = "Property Damage For Fatalities"
  )

# Five event types with the most injuries.
data_ei <- data_prop %>%
  arrange(desc(data_i)) %>%
  head(5)
p2 <- ggplot(data = data_ei, mapping = aes(y = data_i, x = EVTYPE)) +
  geom_col(aes(colour = EVTYPE, fill = EVTYPE)) +
  labs(
    x = "Event Type",
    y = " Injuries",
    title = "Property Damage For Injuries"
  )

# Five event types with the largest summed `financial` value.
data_ee <- data_prop %>%
  arrange(desc(financial)) %>%
  head(5)
p3 <- ggplot(data = data_ee, mapping = aes(y = financial, x = EVTYPE)) +
  geom_col(aes(colour = EVTYPE, fill = EVTYPE)) +
  labs(
    x = "Event Type",
    y = " Financial",
    title = "Property Damage Financial"
  )

# Put the three charts on one page.
grid.arrange(p1, p2, p3)
This was a brief review of the data analysis.