library(ggplot2)
In this analysis we use the Storm data to determine which types of events are most harmful to the population of the US (measured by injuries and fatalities) and which types of events cause the most property damage.
# Fetch the compressed Storm data (only when it is not already on disk) and
# load it into a data frame.
# The download target and the file that is read are the same path, so the
# read no longer depends on a differently named file already existing.
data_url <- 'https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2'
data_file <- 'project_data.csv.bz2'
if (!file.exists(data_file)) {
  download.file(data_url, data_file, method = 'curl')
}
# The archive is bzip2-compressed, so it is read through a bzfile() connection.
project_data <- read.csv(bzfile(data_file))
# Preview the first rows.
head(project_data)
## STATE__ BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1 1 4/18/1950 0:00:00 0130 CST 97 MOBILE AL
## 2 1 4/18/1950 0:00:00 0145 CST 3 BALDWIN AL
## 3 1 2/20/1951 0:00:00 1600 CST 57 FAYETTE AL
## 4 1 6/8/1951 0:00:00 0900 CST 89 MADISON AL
## 5 1 11/15/1951 0:00:00 1500 CST 43 CULLMAN AL
## 6 1 11/15/1951 0:00:00 2000 CST 77 LAUDERDALE AL
## EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO 0 0
## 2 TORNADO 0 0
## 3 TORNADO 0 0
## 4 TORNADO 0 0
## 5 TORNADO 0 0
## 6 TORNADO 0 0
## COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1 NA 0 14.0 100 3 0 0
## 2 NA 0 2.0 150 2 0 0
## 3 NA 0 0.1 123 2 0 0
## 4 NA 0 0.0 100 2 0 0
## 5 NA 0 0.0 150 2 0 0
## 6 NA 0 1.5 177 2 0 0
## INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1 15 25.0 K 0
## 2 0 2.5 K 0
## 3 2 25.0 K 0
## 4 2 2.5 K 0
## 5 2 2.5 K 0
## 6 6 2.5 K 0
## LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1 3040 8812 3051 8806 1
## 2 3042 8755 0 0 2
## 3 3340 8742 0 0 3
## 4 3458 8626 0 0 4
## 5 3412 8642 0 0 5
## 6 3450 8748 0 0 6
# Shape of the table: number of rows followed by number of columns.
c(nrow(project_data), ncol(project_data))
## [1] 902297 37
# Compact per-column summary (type and leading values).
str(object = project_data)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ BGN_TIME : Factor w/ 3608 levels "000","0000","0001",..: 152 167 2645 1563 2524 3126 122 1563 3126 3126 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 826 826 826 826 826 826 826 826 826 826 ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_LOCATI: Factor w/ 54429 levels ""," Christiansburg",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_DATE : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_TIME : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_LOCATI: Factor w/ 34506 levels ""," CANTON"," TULIA",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","+","-","0",..: 16 16 16 16 16 16 16 16 16 16 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","0","2","?",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WFO : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ZONENAMES : Factor w/ 25112 levels ""," "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : Factor w/ 436781 levels "","\t","\t\t",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
# Total injuries per event type, reduced to the ten event types with the
# highest totals.
inj_data <- aggregate(INJURIES ~ EVTYPE, project_data, sum)
# Sort by an explicit column reference rather than attach(): attach() puts a
# copy of the columns on the search path that goes stale as soon as the data
# frame is reassigned, and it masks same-named columns of other data frames.
inj_data <- inj_data[order(-inj_data$INJURIES), ]
# head() returns only rows that exist, so having fewer than ten event types
# does not produce NA-filled padding rows the way `[1:10, ]` does.
inj_data <- head(inj_data, 10)
# Total fatalities per event type, reduced to the ten event types with the
# highest totals.
fat_data <- aggregate(FATALITIES ~ EVTYPE, project_data, sum)
# Sort by an explicit column reference rather than attach(), which masks
# same-named columns (EVTYPE) of previously attached data frames and leaves
# a stale copy on the search path.
fat_data <- fat_data[order(-fat_data$FATALITIES), ]
# head() returns only rows that exist, so having fewer than ten event types
# does not produce NA-filled padding rows the way `[1:10, ]` does.
fat_data <- head(fat_data, 10)
# Total property damage per event type, reduced to the ten event types with
# the highest totals.
#
# PROPDMG is scaled by the magnitude code in PROPDMGEXP before summing, so
# that rows recorded in different units are comparable.
# NOTE(review): H/K/M/B (case-insensitive) are assumed to mean hundreds,
# thousands, millions and billions - confirm against the dataset
# documentation. Every other code (blank, digits, '+', '-', '?') leaves the
# recorded value unscaled.
dmg_scale <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
dmg_factor <- unname(dmg_scale[toupper(as.character(project_data$PROPDMGEXP))])
dmg_factor[is.na(dmg_factor)] <- 1
dmg_data <- aggregate(
  PROPDMG ~ EVTYPE,
  data.frame(
    EVTYPE = project_data$EVTYPE,
    PROPDMG = project_data$PROPDMG * dmg_factor
  ),
  sum
)
# Sort by an explicit column reference rather than attach(), which masks
# same-named columns (EVTYPE) of previously attached data frames and leaves
# a stale copy on the search path.
dmg_data <- dmg_data[order(-dmg_data$PROPDMG), ]
# head() returns only rows that exist, so having fewer than ten event types
# does not produce NA-filled padding rows the way `[1:10, ]` does.
dmg_data <- head(dmg_data, 10)
# Bar chart of the event types in inj_data with their injury totals.
# Bars (and the matching legend entries) are ordered from the highest to the
# lowest total instead of alphabetically, so the ranking is readable.
# The y axis is hidden on purpose: each bar carries its own value label.
ggplot(inj_data, aes(x = reorder(EVTYPE, -INJURIES), y = INJURIES)) +
  geom_col(aes(fill = reorder(EVTYPE, -INJURIES))) +
  geom_text(aes(label = INJURIES), vjust = -0.5) +
  ggtitle('Top 10 Event with most injuries') +
  xlab('Event') +
  labs(fill = 'Event') +
  theme(
    axis.text.x = element_text(angle = 75, hjust = 1),
    plot.title = element_text(hjust = 0.5),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank()
  )
# Bar chart of the event types in fat_data with their fatality totals.
# Bars (and the matching legend entries) are ordered from the highest to the
# lowest total instead of alphabetically, so the ranking is readable.
# The y axis (text, ticks and title) is hidden on purpose: each bar carries
# its own value label. No ylab() is set because the theme blanks the title.
ggplot(fat_data, aes(x = reorder(EVTYPE, -FATALITIES), y = FATALITIES)) +
  geom_col(aes(fill = reorder(EVTYPE, -FATALITIES))) +
  geom_text(aes(label = FATALITIES), vjust = -0.5) +
  ggtitle('Top 10 Event with most Fatalities') +
  xlab('Event') +
  labs(fill = 'Event') +
  theme(
    axis.text.x = element_text(angle = 25, hjust = 1),
    plot.title = element_text(hjust = 0.5),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank()
  )
# Bar chart of the event types in dmg_data with their property damage totals.
# Bars (and the matching legend entries) are ordered from the highest to the
# lowest total instead of alphabetically, so the ranking is readable.
# The y axis (text, ticks and title) is hidden on purpose: each bar carries
# its own value label. No ylab() is set because the theme blanks the title.
ggplot(dmg_data, aes(x = reorder(EVTYPE, -PROPDMG), y = PROPDMG)) +
  geom_col(aes(fill = reorder(EVTYPE, -PROPDMG))) +
  geom_text(aes(label = PROPDMG), vjust = -0.5) +
  ggtitle('Top 10 Event with most Damage') +
  xlab('Event') +
  labs(fill = 'Event') +
  theme(
    axis.text.x = element_text(angle = 25, hjust = 1),
    plot.title = element_text(hjust = 0.5),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank()
  )