In this report we present our analysis of the NOAA storm database, which tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage.
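We have mainly focused on two aspects: which types of events are most harmful with respect to population health, and which types of events have the greatest economic consequences.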
To do this, we have worked with a data file that contains information on weather events spanning more than 60 years, from 1950 to 2011.
After first reading and processing the file, we perform the data analysis needed to answer the questions raised, providing all the explanations, tables, figures and conclusions.
First, we load the necessary libraries and prepare the environment for knitting the report.
library(plyr)
library(ggplot2)
library(knitr)
opts_chunk$set(echo = TRUE, results = 'hold')
library(gridExtra)
## Loading required package: grid
Now we download and read the file.
# Download the compressed NOAA storm data (only if it is not already on disk)
# and load it into the `storm` data frame.
data_dir <- "./noaa"
data_file <- file.path(data_dir, "stormdata.csv.bz2")

if (!file.exists(data_dir)) {
  dir.create(data_dir)
}

# Guard on the data file itself rather than on the directory: if the directory
# exists but an earlier download failed or was removed, the file is fetched
# again instead of read.csv() failing on a missing file.
if (!file.exists(data_file)) {
  url <- "http://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
  # mode = "wb" keeps the bz2 archive byte-exact on every platform; the default
  # download method is used so no external `curl` binary is required.
  download.file(url, destfile = data_file, mode = "wb")
}

# bzfile() decompresses on the fly, so the archive never has to be unpacked.
storm <- read.csv(bzfile(data_file))
str(storm)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ BGN_TIME : Factor w/ 3608 levels "000","0000","0001",..: 152 167 2645 1563 2524 3126 122 1563 3126 3126 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 826 826 826 826 826 826 826 826 826 826 ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_LOCATI: Factor w/ 54429 levels ""," Christiansburg",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_DATE : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_TIME : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_LOCATI: Factor w/ 34506 levels ""," CANTON"," TULIA",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","+","-","0",..: 16 16 16 16 16 16 16 16 16 16 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","0","2","?",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WFO : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ZONENAMES : Factor w/ 25112 levels ""," "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : Factor w/ 436781 levels "","\t","\t\t",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
As you can see, the file has 902297 observations and 37 variables (columns).
# Preview the first six rows of the raw data.
head(storm, n = 6L)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 0
## 2 TORNADO 0 0
## 3 TORNADO 0 0
## 4 TORNADO 0 0
## 5 TORNADO 0 0
## 6 TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## 4 NA 0 0.0 100 2 0 0
## 5 NA 0 0.0 150 2 0 0
## 6 NA 0 1.5 177 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0
## 2 0 2.5 K 0
## 3 2 25.0 K 0
## 4 2 2.5 K 0
## 5 2 2.5 K 0
## 6 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
## 4 3458 8626 0 0 4
## 5 3412 8642 0 0 5
## 6 3450 8748 0 0 6
We can see that there are hardly any measurements in the early years. Perhaps this is easier to see if we plot the data.
Format the Date column
# Convert the begin-date column from "month/day/year hour:min:sec" text to
# Date values; the time-of-day part is parsed and then dropped by as.Date().
storm[["BGN_DATE"]] <- as.Date(
  storm[["BGN_DATE"]],
  format = "%m/%d/%Y %H:%M:%S"
)
# Distribution of recorded events over time.
hist(storm$BGN_DATE, breaks = 20)
# Population-health impact per event type.
# Keep only the events that caused at least one fatality or injury, and only
# the three columns needed for the totals.
has_casualties <- storm$FATALITIES != 0 | storm$INJURIES != 0
storm2 <- storm[has_casualties, c("EVTYPE", "FATALITIES", "INJURIES")]

# Total fatalities per event type, largest first.
aggrf <- aggregate(FATALITIES ~ EVTYPE, data = storm2, FUN = sum)
indexf <- order(aggrf$FATALITIES, decreasing = TRUE)
ord_aggrf <- aggrf[indexf, ]
names(ord_aggrf) <- c("Events", "Fatalities")

# Total injuries per event type, largest first.
aggri <- aggregate(INJURIES ~ EVTYPE, data = storm2, FUN = sum)
indexi <- order(aggri$INJURIES, decreasing = TRUE)
ord_aggri <- aggri[indexi, ]
names(ord_aggri) <- c("Events", "Injuries")
# Economic impact per event type.
# Keep only the events that reported some property or crop damage, and only
# the three columns needed for the totals.
# NOTE(review): only the PROPDMG / CROPDMG amounts are summed; the PROPDMGEXP /
# CROPDMGEXP columns are never used. Presumably they hold magnitude codes
# (head(storm) shows "K") -- TODO confirm whether the amounts must be scaled
# by them before summing.
has_damage <- storm$PROPDMG != 0 | storm$CROPDMG != 0
storm_ECO <- storm[has_damage, c("EVTYPE", "PROPDMG", "CROPDMG")]

# Total property damage per event type, largest first.
aggrpr <- aggregate(PROPDMG ~ EVTYPE, data = storm_ECO, FUN = sum)
index_pr <- order(aggrpr$PROPDMG, decreasing = TRUE)
ord_aggrpr <- aggrpr[index_pr, ]
names(ord_aggrpr) <- c("Events", "PropertiesDM")

# Total crop damage per event type, largest first.
aggrcr <- aggregate(CROPDMG ~ EVTYPE, data = storm_ECO, FUN = sum)
index_cr <- order(aggrcr$CROPDMG, decreasing = TRUE)
ord_aggrcr <- aggrcr[index_cr, ]
names(ord_aggrcr) <- c("Events", "CropsDM")
These are the Fatalities by Weather Event
# Six event types with the highest fatality totals.
head(ord_aggrf, n = 6L)
## Events Fatalities
## 183 TORNADO 5633
## 31 EXCESSIVE HEAT 1903
## 41 FLASH FLOOD 978
## 68 HEAT 937
## 122 LIGHTNING 816
## 189 TSTM WIND 504
These are the Injuries by Weather Event
# Six event types with the highest injury totals.
head(ord_aggri, n = 6L)
## Events Injuries
## 183 TORNADO 91346
## 189 TSTM WIND 6957
## 46 FLOOD 6789
## 31 EXCESSIVE HEAT 6525
## 122 LIGHTNING 5230
## 68 HEAT 2100
We would like the plots to respect the order of the rows in the data frame. For that to happen, we need to set the order of the factor levels explicitly.
# Take the first 15 rows of each health table (the "20" suffix in the variable
# names is historical) and re-level Events so the factor levels follow
# ascending totals rather than alphabetical order.
ord_aggrf20 <- ord_aggrf[seq_len(15), ]
fatalities_asc <- order(ord_aggrf20$Fatalities)
ord_aggrf20$Events <- factor(
  ord_aggrf20$Events,
  levels = ord_aggrf20$Events[fatalities_asc]
)

ord_aggri20 <- ord_aggri[seq_len(15), ]
injuries_asc <- order(ord_aggri20$Injuries)
ord_aggri20$Events <- factor(
  ord_aggri20$Events,
  levels = ord_aggri20$Events[injuries_asc]
)
These are the resulting plots, which show which types of events are most harmful with respect to population health.
These are the Property Damage totals by Weather Event
# Six event types with the highest property-damage totals.
head(ord_aggrpr, n = 6L)
## Events PropertiesDM
## 351 TORNADO 3212258.2
## 58 FLASH FLOOD 1420124.6
## 365 TSTM WIND 1335965.6
## 71 FLOOD 899938.5
## 311 THUNDERSTORM WIND 876844.2
## 114 HAIL 688693.4
These are the Crop Damage totals by Weather Event
# Six event types with the highest crop-damage totals. This must read the crop
# table (ord_aggrcr); printing ord_aggrpr here only repeats the property-damage
# table. Re-knit the report to refresh the printed output.
head(ord_aggrcr)
## Events PropertiesDM
## 351 TORNADO 3212258.2
## 58 FLASH FLOOD 1420124.6
## 365 TSTM WIND 1335965.6
## 71 FLOOD 899938.5
## 311 THUNDERSTORM WIND 876844.2
## 114 HAIL 688693.4
We would like the plots to respect the order of the rows in the data frame. For that to happen, we need to set the order of the factor levels explicitly.
# Take the first 15 rows of each damage table (the "20" suffix in the variable
# names is historical) and re-level Events so the factor levels follow
# ascending totals rather than alphabetical order.
ord_aggrpr20 <- ord_aggrpr[seq_len(15), ]
property_asc <- order(ord_aggrpr20$PropertiesDM)
ord_aggrpr20$Events <- factor(
  ord_aggrpr20$Events,
  levels = ord_aggrpr20$Events[property_asc]
)

ord_aggrcr20 <- ord_aggrcr[seq_len(15), ]
crops_asc <- order(ord_aggrcr20$CropsDM)
ord_aggrcr20$Events <- factor(
  ord_aggrcr20$Events,
  levels = ord_aggrcr20$Events[crops_asc]
)
These are the resulting plots, which show which types of events have the greatest economic consequences.