We first load the data, then filter the record as per our need, will do data cleaning and processing then finally print the result and related graph
We have already downloaded data into our root folder and now its time to load record into data set
sdata <- read.csv2("StormData.csv", header = T, sep = ',')
dim(sdata)
## [1] 902297 37
# head(sdata)
# str(sdata)
colnames(sdata)
## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS" "REFNUM"
After analyzing the data and its column we found that we only need column 8 and from 23 to 28 so reducing the data set as per our requirement
sfdata <- sdata[, c(8, 23:28)]
head(sfdata)
## EVTYPE FATALITIES INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP
## 1 TORNADO 0.00 15.00 25.00 K 0.00
## 2 TORNADO 0.00 0.00 2.50 K 0.00
## 3 TORNADO 0.00 2.00 25.00 K 0.00
## 4 TORNADO 0.00 2.00 2.50 K 0.00
## 5 TORNADO 0.00 2.00 2.50 K 0.00
## 6 TORNADO 0.00 6.00 2.50 K 0.00
str(sfdata)
## 'data.frame': 902297 obs. of 7 variables:
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ FATALITIES: Factor w/ 52 levels "0.00","1.00",..: 1 1 1 1 1 1 1 1 2 1 ...
## $ INJURIES : Factor w/ 200 levels "0.00","1.00",..: 42 1 69 69 69 154 2 1 36 1 ...
## $ PROPDMG : Factor w/ 1390 levels "0.00","0.01",..: 559 446 559 446 446 446 446 446 559 559 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : Factor w/ 432 levels "0.00","0.01",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
Once we have filter data, we check no of unique event type so that we can plan our next action
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
noOfEvents <- length(unique(sfdata$EVTYPE))
majorEvents <- sfdata %>%
group_by(EVTYPE) %>%
summarize(n=n()) %>%
arrange(desc(n))
As per the result, it seems a lot of event type which make our analysis tough,Even this data having a lot of duplicate or wrong spelling but here we are only concentrating on major contribute, We need to aggregate data so that we can find total impact on public health
sfdata$FATALITIES = as.numeric(as.character(sfdata$FATALITIES))
sfdata$INJURIES = as.numeric(as.character(sfdata$INJURIES))
publicHealthImpact = sfdata %>%
group_by(EVTYPE) %>%
mutate(Total = FATALITIES + INJURIES) %>%
summarise(FATALITIES = sum(FATALITIES),
INJURIES = sum(INJURIES),
Total = sum(Total)) %>%
arrange(desc(Total))
head(publicHealthImpact)
## # A tibble: 6 × 4
## EVTYPE FATALITIES INJURIES Total
## <fctr> <dbl> <dbl> <dbl>
## 1 TORNADO 5633 91346 96979
## 2 EXCESSIVE HEAT 1903 6525 8428
## 3 TSTM WIND 504 6957 7461
## 4 FLOOD 470 6789 7259
## 5 LIGHTNING 816 5230 6046
## 6 HEAT 937 2100 3037
After the above data, here the result
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.2
publicHealthImpactNew<-gather(publicHealthImpact[1:5, 1:3], Type, Total, FATALITIES:INJURIES)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
ggplot(publicHealthImpactNew) +
aes(x=reorder(EVTYPE, +Total), y=Total, fill=Type) + geom_histogram(stat="identity", alpha=1)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
As per the result, Tornado is most dangerous for population health
First we need to filter expense data which is going to use this analysis
sedata <- sfdata[,c(1, 4:7)]
unique(sedata$PROPDMGEXP)
## [1] K M B m + 0 5 6 ? 4 2 3 h 7 H - 1 8
## Levels: - ? + 0 1 2 3 4 5 6 7 8 B h H K m M
As per above data and data documentation, it has non numeric values so we first need to convert the same
sedata$PROPDMGEXP <- as.numeric(recode(as.character(sedata$PROPDMGEXP),
'0'=1,'1'=10,'2'=10^2,'3'=10^3,'4'=10^4,'5'=10^5,'6'=10^6,'7'=10^7,'8'=10^8,'B'=10^9,'h'=10^2,'H'=10^2,'K'=10^3,'M'=10^6,'m'=10^6,'-'=1,'?'=1,'+'=1, .missing = 0))
## Warning: Unreplaced values treated as NA as .x is not compatible. Please
## specify replacements exhaustively or supply .default
sedata$CROPDMGEXP <- as.numeric(recode(as.character(sedata$CROPDMGEXP),
'0'=1,'1'=10,'2'=10^2,'3'=10^3,'4'=10^4,'5'=10^5,'6'=10^6,'7'=10^7,'8'=10^8,'B'=10^9,'h'=10^2,'H'=10^2,'K'=10^3,'M'=10^6,'m'=10^6,'-'=1,'?'=1,'+'=1, .missing = 0))
## Warning: Unreplaced values treated as NA as .x is not compatible. Please
## specify replacements exhaustively or supply .default
Now we aggregate the data to calculate complete economical damage
sedata$PROPDMG = as.numeric(as.character(sedata$PROPDMG))
sedata$CROPDMG = as.numeric(as.character(sedata$CROPDMG))
economicalDamage <- sedata %>%
group_by(EVTYPE) %>%
summarise(Property = sum(PROPDMG*PROPDMGEXP, na.rm = T),
Crop = sum(CROPDMG*CROPDMGEXP, na.rm = T),
Total = Property + Crop) %>%
arrange(desc(Total))
after the above data, here the result
library(tidyr)
economicalDamageNew<-gather(economicalDamage[1:5, 1:3], Type, Total, Property:Crop)
library(ggplot2)
ggplot(economicalDamageNew) +
aes(x=reorder(EVTYPE, +Total), y=Total, fill=Type) + geom_histogram(stat="identity", alpha=1)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
As per the result, flood is major contribute for economical damage