Data Processing

Loading libraries

library(dplyr)
library(car)
library(sqldf)

Loading and preprocessing the data

## Read the compressed storm data from disk
## NOTE(review): read.table() also treats ' as a quote character by default;
## confirm free-text fields with apostrophes are parsed as intended.
rawdata <- read.table(bzfile("repdata_data_StormData.csv.bz2"),
                      comment.char = "", sep = ",", fill = TRUE,
                      header = TRUE)

## Wrap the raw data in a dplyr table
tidydata <- tbl_df(rawdata)

## Keep only the variables needed for the analysis.
## The result must be called `tidysub`: the filtering below reads that name.
relevant_vars <- c("REFNUM", "EVTYPE", "FATALITIES", "INJURIES",
                   "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
tidysub <- tidydata[, relevant_vars]

## Flag events with no health impact and no damage cost
no_impact <- with(tidysub, FATALITIES == 0 & INJURIES == 0 &
                      PROPDMG == 0 & CROPDMG == 0)

## Events without any impact (rows where the flag is NA are not counted here)
tidysub0 <- tidysub[which(no_impact), ]

## Everything else is kept for the analysis; a row whose flag is NA is kept,
## which is what the anti-join on REFNUM did for rows it could not match.
tidysub1 <- as.data.frame(tidysub[!(no_impact %in% TRUE), ])
## Loading required package: tcltk

Regrouping the events variable

## Distinct event types that remain after filtering
events <- unique(tidysub1$EVTYPE)

## Lookup table: one row per distinct event type, numbered in order of
## appearance. seq_along() yields an empty sequence when there are no events,
## whereas seq(1:length(events)) would yield c(1, 2).
eventsdf <- data.frame("recno" = seq_along(events), "events" = events)

## Placeholder group column, one NA per event type
eventsdf$EVGRPS <- as.factor(rep(NA, nrow(eventsdf)))

## Assign each distinct event type to a broader event group.
## The specification below maps record numbers (eventsdf$recno) to a group
## label. Record numbers are positions in unique(tidysub1$EVTYPE), so the
## mapping is only valid for the exact input data and row order it was
## written against.
## Presumably record numbers that are not listed are left unchanged by
## recode() -- TODO confirm against the car documentation.
## NOTE(review): record number 83 is listed under both
## 'THUNDERSTORM WITH WIND' and 'HIGH WIND/WIND BURST'. car::recode is
## documented to apply the first matching specification, so the
## 'HIGH WIND/WIND BURST' entry looks unreachable -- confirm which group was
## intended.
eventsdf$EVGRPS <- recode(eventsdf$recno, 
"c(1, 23, 24, 193, 200, 207, 263, 315, 355, 357, 361, 442, 533, 750)='TORNADO'; 

c(27, 90, 99, 180, 182, 307, 409, 410, 486, 490, 774, 779, 873, 922)='EXTREME HEAT'; 

c(2, 10, 16, 19, 25, 26, 35, 63, 83, 100, 126, 127, 177, 212, 217, 218, 223, 274, 283, 288, 292, 303, 304, 308, 312, 313, 333, 371, 373, 380, 382, 422, 435, 436, 437, 438, 439, 440, 448, 449, 450, 451, 453, 460, 462, 471, 476, 479, 481, 500, 501, 502, 508, 534, 542, 545, 547, 548, 554, 555, 565, 601, 608, 637, 747, 769, 770, 788, 796, 797, 798, 799, 802, 829, 851, 852, 854, 918, 924, 957, 978, 982, 983)= 'THUNDERSTORM WITH WIND'; 

c(83)='HIGH WIND/WIND BURST'; 

c(4, 11, 37, 43, 54, 189, 222, 237, 244, 252, 253, 306, 372, 383, 417, 420, 497, 505, 511, 513, 514, 518, 599, 616, 630, 639, 644, 645, 665, 755, 777, 786, 804, 813, 814, 960, 967, 977)='EXTREME COLD';
                          
c(5, 7, 8, 47, 51, 53, 65, 78, 89, 141, 162, 166, 187, 188, 198, 231, 232, 234, 238, 241, 243, 266, 267, 269, 270, 284, 291, 301, 329, 331, 332, 336, 337, 338, 349, 364, 365, 367, 389, 390, 404, 411, 419, 424, 425, 426, 427, 428, 429, 431, 432, 463, 487, 522, 524, 525, 528, 529, 531, 532, 583, 615, 617, 634, 641, 642, 649, 657, 658, 664, 666, 667, 670, 678, 682, 683, 684, 687, 688, 767, 768, 771, 790, 801, 803, 815, 816, 820, 831, 896, 900, 905, 909, 913, 931, 934, 948, 961, 968)='WINTER WEATHER/HEAVY SNOW'; 

c(6, 20, 21, 34, 36, 39, 50, 52, 97, 105, 107, 110, 129, 130, 134, 143, 173, 181, 203, 214, 235, 248, 268, 271, 275, 295, 302, 320, 325, 327, 359, 360, 369, 395, 407, 421, 452, 454, 478, 496, 499, 527, 535, 563, 566, 592, 595, 597, 598, 600, 605, 606, 618, 623, 628, 629, 633, 654, 656, 751, 753, 754, 763, 764, 787, 848, 871, 907, 926, 981) ='FLOOD/HEAVY RAIN'; 

c(9, 12, 13, 208, 209, 210, 211, 226, 309, 318, 444, 445, 446, 652, 837, 881, 973)='HURRICANE/TROPICAL STORM';
                          
c(3, 378, 379, 377, 381, 384, 286, 458, 169, 375, 536, 537, 541, 413, 681, 224, 765, 971)='HAIL' 
" )


## Events that were not assigned a group still carry their record number in
## EVGRPS. Detect them by checking whether the label parses as a number; the
## coercion warning for real group names is expected and silenced here only.
grp_label <- as.character(eventsdf$EVGRPS)
ungrouped <- !is.na(suppressWarnings(as.numeric(grp_label)))

## Ungrouped events keep their original event name as their group
grp_label[ungrouped] <- as.character(eventsdf$events[ungrouped])
eventsdf$EVGRPS <- grp_label

## Attach the group to every observation by looking up its event type, then
## drop the raw event type by name rather than by column position.
tidysub2 <- as.data.frame(tidysub1)
tidysub2$EVGRPS <- eventsdf$EVGRPS[match(as.character(tidysub2$EVTYPE),
                                         as.character(eventsdf$events))]
tidysub2$EVTYPE <- NULL

Q 1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health?

Calculating the human cost of weather events
## Human cost of an event = fatalities + injuries
tidysub2$HUMCOST <- with(tidysub2, FATALITIES + INJURIES)

## Total human cost per event group, largest total first
costbyevnt <- group_by(tidysub2, EVGRPS)
sumhcost <- costbyevnt %>%
    summarise(tot_hcost = sum(HUMCOST)) %>%
    arrange(desc(tot_hcost))

Graphics

## The five event groups with the highest human cost
humhigh5 <- head(sumhcost, n = 5)

## Vertical bar chart, one bar per event group
barplot(height = humhigh5$tot_hcost,
        names.arg = humhigh5$EVGRPS,
        horiz = FALSE,
        angle = 45,
        main = "Five Most Harmful Weather Events in The US",
        ylab = "Fatalities and Injuries")

plot of chunk graph1

Q 2. Across the United States, which types of events have the greatest economic consequences?

## Convert damage amounts to billions using their exponent code.
##   amount   : numeric vector of damage figures
##   exp_code : vector of unit codes, matched case-insensitively:
##              K (thousands), M (millions), B (billions)
## Returns a numeric vector as long as `amount`; rows with any other code
## (including blank or NA) contribute 0.
## The conversion is done per row. An `if` on the whole column would use only
## the first row's code for every row (and is an error since R 4.2).
to_billions <- function(amount, exp_code) {
    divisor <- c(K = 1000 * 1000, M = 1000, B = 1)
    row_divisor <- unname(divisor[toupper(as.character(exp_code))])
    known <- !is.na(row_divisor)
    scaled <- numeric(length(amount))
    scaled[known] <- amount[known] / row_divisor[known]
    scaled
}

## Combine losses from property damages and crops to estimate the economic
## consequences (all figures in billions)
tidysub2$Prop <- to_billions(tidysub2$PROPDMG, tidysub2$PROPDMGEXP)
tidysub2$Crop <- to_billions(tidysub2$CROPDMG, tidysub2$CROPDMGEXP)
tidysub2$PROPCOST <- tidysub2$Prop + tidysub2$Crop

## Group events by type, total the amount lost per group and sort the totals
## in descending order
costbyevnt <- group_by(tidysub2, EVGRPS)
sumpcost <- summarise(costbyevnt, tot_pcost = sum(PROPCOST))
sumpcost <- sumpcost[order(sumpcost$tot_pcost, decreasing = TRUE), ]

Graphics

## The five event groups with the highest combined property and crop losses
prophigh5 <- head(sumpcost, 5)

## Vertical bar chart, one bar per event group.
## The y-axis label reads "Cost in Billions" (was the typo "Cost is Billions").
barplot(prophigh5$tot_pcost,
        main = "Five Most Costly Weather Events in The US",
        ylab = "Cost in Billions", names.arg = prophigh5$EVGRPS,
        horiz = FALSE, angle = 45)

plot of chunk graph2

Results

## Build the answer to the first question from the top row of the human-cost
## ranking, keep it in Answer1 and print it
worst_event <- sumhcost[1, 1]
worst_total <- sumhcost[1, 2]
Answer1 <- paste(
    "The most harmful types of events (as indicated in the EVTYPE variable), with respect to population health is",
    worst_event,
    "with",
    worst_total,
    "fatalities and injuries combined."
)
Answer1
## [1] "The most harmful types of events (as indicated in the EVTYPE variable), with respect to population health is TORNADO with 97095 fatalities and injuries combined."
## Compose the answer to the second question, store it and display it.
## The unit is billions: the damage figures are scaled as K / 1e6, M / 1e3 and
## B / 1 before they are summed. paste() already inserts the separating
## spaces, so the literals carry no trailing blank.
costliest_event <- sumpcost[[1]][1]
costliest_total <- sumpcost[[2]][1]
Answer2 <- paste("The types of events (as indicated in the EVTYPE variable), with the greatest economic consequences is", costliest_event, "with loss of", costliest_total, "Billions in property and crop damages combined.")

Answer2
## [1] "The types of events (as indicated in the EVTYPE variable), with the greatest economic consequences is TORNADO with loss of  3.21744425 Millions in property and crop damages combined."