library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, last
This is an analysis of “Storm Data”" database. Firstly, I’ve recovered the raw information from the web and I’ve named the data frame as “STORM”. Secondly, I’ve cleaned the variable EVTYPE and I’ve created a smaller dataframe “Stormsummary” grouping by EVTYPE. The columns are: Event type(EVTYPE), Injuries (INJURIES) ,Fatalities (FATALITIES), Crop Damage (CROPDMG), Property damage (PROPDMG). Later, I’ve created some table reports and graphics for the exploratory analysis. The main conclusions are that Tornado is the main category for fatalities and injuries. Additionally, Tornados represent the main the highest cost for property damage and Hail represent the highest cost for crop damage.
download.file("https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2", dest="tmp.bz2")
data <- read.csv(bzfile("tmp.bz2"),
header=TRUE,
sep=",",
stringsAsFactors=FALSE)
I’ve converted the data frame to a data table which have a faster process. Later, I did a general recognition of the information.
storm<-data.table(data)
str(storm)
## Classes 'data.table' and 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : chr "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
## $ BGN_TIME : chr "0130" "0145" "1600" "0900" ...
## $ TIME_ZONE : chr "CST" "CST" "CST" "CST" ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: chr "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
## $ STATE : chr "AL" "AL" "AL" "AL" ...
## $ EVTYPE : chr "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : chr "" "" "" "" ...
## $ BGN_LOCATI: chr "" "" "" "" ...
## $ END_DATE : chr "" "" "" "" ...
## $ END_TIME : chr "" "" "" "" ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : chr "" "" "" "" ...
## $ END_LOCATI: chr "" "" "" "" ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: chr "K" "K" "K" "K" ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: chr "" "" "" "" ...
## $ WFO : chr "" "" "" "" ...
## $ STATEOFFIC: chr "" "" "" "" ...
## $ ZONENAMES : chr "" "" "" "" ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : chr "" "" "" "" ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, ".internal.selfref")=<externalptr>
summary(storm)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE
## Min. : 1.0 Length:902297 Length:902297 Length:902297
## 1st Qu.:19.0 Class :character Class :character Class :character
## Median :30.0 Mode :character Mode :character Mode :character
## Mean :31.2
## 3rd Qu.:45.0
## Max. :95.0
##
## COUNTY COUNTYNAME STATE EVTYPE
## Min. : 0.0 Length:902297 Length:902297 Length:902297
## 1st Qu.: 31.0 Class :character Class :character Class :character
## Median : 75.0 Mode :character Mode :character Mode :character
## Mean :100.6
## 3rd Qu.:131.0
## Max. :873.0
##
## BGN_RANGE BGN_AZI BGN_LOCATI
## Min. : 0.000 Length:902297 Length:902297
## 1st Qu.: 0.000 Class :character Class :character
## Median : 0.000 Mode :character Mode :character
## Mean : 1.484
## 3rd Qu.: 1.000
## Max. :3749.000
##
## END_DATE END_TIME COUNTY_END COUNTYENDN
## Length:902297 Length:902297 Min. :0 Mode:logical
## Class :character Class :character 1st Qu.:0 NA's:902297
## Mode :character Mode :character Median :0
## Mean :0
## 3rd Qu.:0
## Max. :0
##
## END_RANGE END_AZI END_LOCATI
## Min. : 0.0000 Length:902297 Length:902297
## 1st Qu.: 0.0000 Class :character Class :character
## Median : 0.0000 Mode :character Mode :character
## Mean : 0.9862
## 3rd Qu.: 0.0000
## Max. :925.0000
##
## LENGTH WIDTH F MAG
## Min. : 0.0000 Min. : 0.000 Min. :0.0 Min. : 0.0
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.:0.0 1st Qu.: 0.0
## Median : 0.0000 Median : 0.000 Median :1.0 Median : 50.0
## Mean : 0.2301 Mean : 7.503 Mean :0.9 Mean : 46.9
## 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.:1.0 3rd Qu.: 75.0
## Max. :2315.0000 Max. :4400.000 Max. :5.0 Max. :22000.0
## NA's :843563
## FATALITIES INJURIES PROPDMG
## Min. : 0.0000 Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00
## Median : 0.0000 Median : 0.0000 Median : 0.00
## Mean : 0.0168 Mean : 0.1557 Mean : 12.06
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.50
## Max. :583.0000 Max. :1700.0000 Max. :5000.00
##
## PROPDMGEXP CROPDMG CROPDMGEXP
## Length:902297 Min. : 0.000 Length:902297
## Class :character 1st Qu.: 0.000 Class :character
## Mode :character Median : 0.000 Mode :character
## Mean : 1.527
## 3rd Qu.: 0.000
## Max. :990.000
##
## WFO STATEOFFIC ZONENAMES LATITUDE
## Length:902297 Length:902297 Length:902297 Min. : 0
## Class :character Class :character Class :character 1st Qu.:2802
## Mode :character Mode :character Mode :character Median :3540
## Mean :2875
## 3rd Qu.:4019
## Max. :9706
## NA's :47
## LONGITUDE LATITUDE_E LONGITUDE_ REMARKS
## Min. :-14451 Min. : 0 Min. :-14455 Length:902297
## 1st Qu.: 7247 1st Qu.: 0 1st Qu.: 0 Class :character
## Median : 8707 Median : 0 Median : 0 Mode :character
## Mean : 6940 Mean :1452 Mean : 3509
## 3rd Qu.: 9605 3rd Qu.:3549 3rd Qu.: 8735
## Max. : 17124 Max. :9706 Max. :106220
## NA's :40
## REFNUM
## Min. : 1
## 1st Qu.:225575
## Median :451149
## Mean :451149
## 3rd Qu.:676723
## Max. :902297
##
After checking the dataframe, we had the objective to analyze which events were the most costly and dangerous.
I made some corrections in the column EVTYPE. Mainly, I tried to reduce the categories or unique elements in this variable. Initially it had 985 elements
evtypecheck1<-as.data.frame(unique(storm$EVTYPE))
length(unique(evtypecheck1$`unique(storm$EVTYPE)`))
## [1] 985
After cleaning the data, the categories reduced to 309 elements
storm$EVTYPE<-toupper(storm$EVTYPE) #Converting all variables upper case
storm$EVTYPE <- gsub('.*STORM.*', 'STORM', storm$EVTYPE)
storm$EVTYPE <- gsub('.*FLOOD.*', 'FLOOD', storm$EVTYPE)
storm$EVTYPE <- gsub('.*WIND.*', 'WIND', storm$EVTYPE)
storm$EVTYPE <- gsub('.*TORN.*', 'TORNADO', storm$EVTYPE)
storm$EVTYPE <- gsub('.*HAIL.*', 'HAIL', storm$EVTYPE)
storm$EVTYPE <- gsub('.*HURRICANE.*', 'HURRICANE', storm$EVTYPE)
storm$EVTYPE <- gsub('.*RAIN.*', 'RAIN', storm$EVTYPE)
storm$EVTYPE <- gsub('.*SNOW.*', 'SNOW', storm$EVTYPE)
storm$EVTYPE <- gsub('.*COLD.*', 'COLD', storm$EVTYPE)
storm$EVTYPE <- gsub('.*LOW.*TEMPER.*', 'COLD', storm$EVTYPE)
storm$EVTYPE <- gsub('.*FROST.*', 'COLD', storm$EVTYPE)
storm$EVTYPE <- gsub('.*HIGH.*TEMPER.*', 'HEAT', storm$EVTYPE)
storm$EVTYPE <- gsub('.*HEAT.*', 'HEAT', storm$EVTYPE)
storm$EVTYPE <- gsub('.*FIRE.*', 'FIRE', storm$EVTYPE)
storm$EVTYPE <- gsub('AVALANCE', 'AVALANCHE', storm$EVTYPE)
storm$EVTYPE <- gsub('BEACH EROSIN', 'BEACH EROSION', storm$EVTYPE)
storm$EVTYPE <- gsub('FUNNEL CLOUD.*', 'FUNNEL CLOUD', storm$EVTYPE)
storm$EVTYPE <- gsub('HEAVY PRECIP.*', 'HEAVY PRECIPITATION', storm$EVTYPE)
evtypecheck2<-as.data.frame(unique(storm$EVTYPE))
length(unique(evtypecheck2$`unique(storm$EVTYPE)`))
## [1] 312
zz<-group_by(storm, EVTYPE)
stormsummary<-summarise(zz, allfatalities=sum(FATALITIES), allinjuries=sum(INJURIES),
allpropdmg=sum(PROPDMG), allcropdmg=sum(CROPDMG))
To simplify the analysis, I categorized the 10 main categories in the variable EVTYPE.
In that sense, I’ve created four additional dataframes
#Fatalities and injuries
mainfatalities<-arrange(stormsummary, desc(allfatalities))[1:10 ,]
maininjuries<-arrange(stormsummary, desc(allinjuries))[1:10,]
#Property and crop damage
mainpropdamage<-arrange(stormsummary, desc(allpropdmg))[1:10,]
maincropdamage<-arrange(stormsummary, desc(allcropdmg))[1:10,]
I’ve prepared an analysis comparing injuries and damages in terms of occurrence and the economic cost of crop and property damages.
par(mfrow=c(1,2), mar=c(7,3,1,1))
First, let’s see the injuries and damages graph and their respective tables
barplot(mainfatalities$allfatalities, names.arg = mainfatalities$EVTYPE, las=2, cex.names = 0.7, cex.axis=0.7, main="fatalities")
select(mainfatalities, EVTYPE, allfatalities)
## Source: local data table [10 x 2]
##
## EVTYPE allfatalities
## (chr) (dbl)
## 1 TORNADO 5636
## 2 HEAT 3138
## 3 FLOOD 1525
## 4 WIND 1240
## 5 LIGHTNING 816
## 6 STORM 633
## 7 RIP CURRENT 368
## 8 AVALANCHE 225
## 9 COLD 223
## 10 RIP CURRENTS 204
barplot(maininjuries$allinjuries, names.arg = maininjuries$EVTYPE, las=2, cex.names = 0.7, cex.axis=0.7, main="injuries")
select(maininjuries, EVTYPE, allinjuries)
## Source: local data table [10 x 2]
##
## EVTYPE allinjuries
## (chr) (dbl)
## 1 TORNADO 91407
## 2 HEAT 9224
## 3 WIND 9044
## 4 FLOOD 8602
## 5 STORM 6692
## 6 LIGHTNING 5230
## 7 FIRE 1608
## 8 HAIL 1371
## 9 HURRICANE 1328
## 10 SNOW 1121
Now, let’s see the crop and property cost
barplot(mainpropdamage$allpropdmg, names.arg =mainpropdamage$EVTYPE, las=2, cex.names =0.7, cex.axis = 0.5, main = "Property")
select(mainpropdamage, EVTYPE, allpropdmg)
## Source: local data table [10 x 2]
##
## EVTYPE allpropdmg
## (chr) (dbl)
## 1 TORNADO 3215748.41
## 2 FLOOD 2435606.51
## 3 WIND 1806049.95
## 4 STORM 1613789.63
## 5 HAIL 689069.78
## 6 LIGHTNING 603351.78
## 7 SNOW 150127.16
## 8 FIRE 125223.29
## 9 RAIN 57871.71
## 10 URBAN/SML STREAM FLD 26051.94
barplot(maincropdamage$allcropdmg, names.arg = maincropdamage$EVTYPE, las=2, cex.names = 0.7, cex.axis = 0.5, main="Crop")
select(maincropdamage, EVTYPE, allcropdmg)
## Source: local data table [10 x 2]
##
## EVTYPE allcropdmg
## (chr) (dbl)
## 1 HAIL 581418.36
## 2 FLOOD 364476.73
## 3 WIND 137539.11
## 4 TORNADO 100026.77
## 5 STORM 99430.69
## 6 DROUGHT 33898.62
## 7 COLD 14520.88
## 8 RAIN 12447.30
## 9 HURRICANE 10802.79
## 10 FIRE 9565.74
The analysis shows that “Tornado”" is the event that causes the most injuries and fatalities across United States.
In terms of property damage, “Tornado” impplies the maximum cost. On the other hand, “Hail” represents the highest cost for crops.