# Load the storm data set; the bz2-compressed CSV is passed to read.csv() as is.
storm <- read.csv(file = "repdata_data_StormData.csv.bz2", header = TRUE)
# Print the structure (column names, types, leading values) of the data frame.
str(object = storm)
## 'data.frame': 902297 obs. of 37 variables:
## $ STATE__ : num 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_DATE : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
## $ BGN_TIME : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
## $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ COUNTY : num 97 3 57 89 43 77 9 123 125 57 ...
## $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
## $ STATE : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ EVTYPE : Factor w/ 985 levels " HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
## $ BGN_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BGN_AZI : Factor w/ 35 levels ""," N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ BGN_LOCATI: Factor w/ 54429 levels "","- 1 N Albion",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_DATE : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_TIME : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY_END: num 0 0 0 0 0 0 0 0 0 0 ...
## $ COUNTYENDN: logi NA NA NA NA NA NA ...
## $ END_RANGE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ END_AZI : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ END_LOCATI: Factor w/ 34506 levels "","- .5 NNW",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LENGTH : num 14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
## $ WIDTH : num 100 150 123 100 150 177 33 33 100 100 ...
## $ F : int 3 2 2 2 2 2 2 1 3 3 ...
## $ MAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FATALITIES: num 0 0 0 0 0 0 0 0 1 0 ...
## $ INJURIES : num 15 0 2 2 2 6 1 0 14 0 ...
## $ PROPDMG : num 25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
## $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
## $ CROPDMG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WFO : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ZONENAMES : Factor w/ 25112 levels ""," "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LATITUDE : num 3040 3042 3340 3458 3412 ...
## $ LONGITUDE : num 8812 8755 8742 8626 8642 ...
## $ LATITUDE_E: num 3051 0 0 0 0 ...
## $ LONGITUDE_: num 8806 0 0 0 0 ...
## $ REMARKS : Factor w/ 436781 levels "","-2 at Deer Park\n",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ REFNUM : num 1 2 3 4 5 6 7 8 9 10 ...
# Health impact: keep only the casualty counts and the event type.
health <- storm[, c("FATALITIES", "INJURIES", "EVTYPE")]
# To get nationwide information, group the health data frame by EVTYPE
# and sum FATALITIES and INJURIES within each type.
health_type <- aggregate(. ~ EVTYPE, health, FUN = sum)
# Rank event types by total fatalities, largest first.
fatalities <- health_type[order(health_type$FATALITIES, decreasing = TRUE),
                          c("EVTYPE", "FATALITIES")]
# Keep the top 5 types. head() returns fewer rows instead of padding with
# all-NA rows when fewer than five event types are present.
fatalities5 <- head(fatalities, 5)
# Plot the top 5 types
library(ggplot2)
# Level order = ranking order, so the bars are drawn from most to least deadly
# rather than alphabetically.
x <- as.character(fatalities5$EVTYPE)
g1d <- ggplot(fatalities5, aes(x = factor(EVTYPE, levels = x), y = FATALITIES,
                               fill = factor(EVTYPE, levels = x))) +
  geom_bar(stat = "identity", width = 0.4) +
  xlab("severe weather type") +
  ylab("nationwide fatalities") +
  scale_fill_discrete(name = "weather type") +
  # Abbreviate the axis labels; the full names remain readable in the legend.
  scale_x_discrete(labels = abbreviate)
g1d
# Rank event types by total injuries, largest first.
injuries <- health_type[order(health_type$INJURIES, decreasing = TRUE),
                        c("EVTYPE", "INJURIES")]
# Get the top 5 types that cause the most injuries. head() returns fewer rows
# instead of padding with all-NA rows when fewer than five types are present.
injuries5 <- head(injuries, 5)
# Plot the top 5 types
# Level order = ranking order, so the bars are drawn from most to least
# injurious rather than alphabetically.
y <- as.character(injuries5$EVTYPE)
g2d <- ggplot(injuries5, aes(x = factor(EVTYPE, levels = y), y = INJURIES,
                             fill = factor(EVTYPE, levels = y))) +
  geom_bar(stat = "identity", width = 0.4) +
  xlab("severe weather type") +
  ylab("nationwide injuries") +
  scale_fill_discrete(name = "weather type") +
  # Abbreviate the axis labels; the full names remain readable in the legend.
  scale_x_discrete(labels = abbreviate)
g2d
# Economic impact: property plus crop damage per event type.
# PROPDMG / CROPDMG hold only the leading figure of each estimate; the order of
# magnitude is stored separately in PROPDMGEXP / CROPDMGEXP. The figures must
# therefore be scaled before they can be added or compared.
economy <- storm[, c("PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP",
                     "EVTYPE")]

# Translate a vector of magnitude codes into numeric multipliers.
# Letter codes are matched case-insensitively: H = hundreds, K = thousands,
# M = millions, B = billions. Every other code (blank, "?", "+", "-", digits)
# maps to 1, i.e. the figure is taken at face value.
# NOTE(review): the non-letter codes are assumed to be data-entry noise;
# confirm this treatment against the data documentation.
damage_multiplier <- function(code) {
  magnitude <- c(H = 1e2, K = 1e3, M = 1e6, B = 1e9)
  multiplier <- unname(magnitude[toupper(as.character(code))])
  multiplier[is.na(multiplier)] <- 1
  multiplier
}

# Consider property and crop damage together: scale each and add them to form
# a single per-event damage figure.
economy$damage <- economy$PROPDMG * damage_multiplier(economy$PROPDMGEXP) +
  economy$CROPDMG * damage_multiplier(economy$CROPDMGEXP)
# To get nationwide information, group by EVTYPE and total the damage for
# each type; the result has exactly the columns EVTYPE and damage.
economy_type <- aggregate(damage ~ EVTYPE, economy, FUN = sum)
# Order weather types by damage, largest first.
economy_type <- economy_type[order(economy_type$damage, decreasing = TRUE), ]
# Get the top 5 weather types by economic damage. head() returns fewer rows
# instead of padding with all-NA rows when fewer than five types are present.
economy5 <- head(economy_type, 5)
# Plot the top 5 types
# Level order = ranking order, so the bars are drawn from most to least costly
# rather than alphabetically.
z <- as.character(economy5$EVTYPE)
g3d <- ggplot(economy5, aes(x = factor(EVTYPE, levels = z), y = damage,
                            fill = factor(EVTYPE, levels = z))) +
  geom_bar(stat = "identity", width = 0.4) +
  xlab("severe weather type") +
  ylab("nationwide damage (US dollars)") +
  scale_fill_discrete(name = "weather type") +
  # Abbreviate the axis labels; the full names remain readable in the legend.
  scale_x_discrete(labels = abbreviate)
g3d