Summary

Installing packages

if(!("futile.logger" %in% rownames(installed.packages()))){
        install.packages("futile.logger")  
}
if(!("R.utils" %in% rownames(installed.packages()))){
        install.packages("R.utils")  
}
if(!("dplyr" %in% rownames(installed.packages()))){
        install.packages("dplyr")  
}
if(!("sqldf" %in% rownames(installed.packages()))){
        install.packages("sqldf")
}

#install.packages("devtools")
library(devtools)
## WARNING: Rtools is required to build R packages, but is not currently installed.
## 
## Please download and install Rtools 3.1 from http://cran.r-project.org/bin/windows/Rtools/ and then run find_rtools().
install_github("easyGgplot2", "kassambara")
## Warning: Username parameter is deprecated. Please use kassambara/
## easyGgplot2
## Downloading github repo kassambara/easyGgplot2@master
## Installing easyGgplot2
## "C:/PROGRA~1/R/R-31~1.3/bin/x64/R" --vanilla CMD INSTALL  \
##   "C:/Users/JoaoBoscoJares/AppData/Local/Temp/Rtmp65Lq5z/devtools1c0c42263bf/kassambara-easyGgplot2-5cd4a37"  \
##   --library="C:/Users/JoaoBoscoJares/Documents/R/win-library/3.1"  \
##   --install-tests

Main data.frame cache and other variables definition

if(!exists("mainFullDF")){
        mainFullDF <<- NULL
}

mainFileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2";
compressedFiledPATH <- paste0(getwd(),"/StormData.csv.bz2")
unCompressedFiledPATH <- paste0(getwd(),"/","StormData.csv")

Get the main file

getFileByURL <- function(url){
        flog.info(paste0("into getFileByURL where url is: ",url))
                if(!file.exists(compressedFiledPATH) & !file.exists(unCompressedFiledPATH)){
                        flog.info(paste0("file named ",compressedFiledPATH," doesnt exists."))
                        
                        download.file(url, compressedFiledPATH)
                        
                        flog.info(paste0("file: ",compressedFiledPATH,"downloaded with succsseful."))
                }
        flog.info(paste0("file named ",compressedFiledPATH," exists."))
        
}

getFileByURL(mainFileURL)

Extract the main file

unConpressFile <- function(){
        flog.info(paste0("into unzipFile, named: ",compressedFiledPATH))
                flog.info(paste0("file named: ",compressedFiledPATH," exists.. will be decompressed."))
                
                if (!file.exists(unCompressedFiledPATH)) {
                        library(R.utils)
                        bunzip2(compressedFiledPATH,unCompressedFiledPATH, remove = T)
                }

                flog.info(paste0("file named: ",compressedFiledPATH," decompressed, successful!"))
        
        flog.info(paste0("The file named: ",compressedFiledPATH, "is already decompressed."))
}

unConpressFile()

Read the main file

readMainFullDF <- function(){
        flog.info(paste0("into readMainFullDF"))
        
        if (is.null(mainFullDF)) {
                mainFullDF <<- read.csv2(unCompressedFiledPATH, header = TRUE, sep = ",")
        }
        flog.info(paste0("mainFullDF generated!"))

}

readMainFullDF()

Read the main file

createNewCollumnWithYearsAsInt <- function(){
        library(dplyr)
        flog.info(paste0("into createNewCollumnWithYearsAsInt"))
        if (dim(mainFullDF)[2] == 38) {
                flog.info(paste0("will return NULL"))
                return(NULL)
        }
        mainFullDF <<- mainFullDF %>%
                mutate(FYEARS = (as.numeric(format(as.Date(BGN_DATE, format = "%m/%d/%Y %H:%M:%S"), "%Y"))))
        
        flog.info(paste0("out createNewCollumnWithYearsAsInt"))
}
createNewCollumnWithYearsAsInt()

On the graphic above is evident that the most number of accurate records begins from 1990 to 2010. So, I decided to use a subset the whole dataset between those two dates.

library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
        mainFullDF <<- sqldf("select * from mainFullDF where FYEARS >= '1990' and FYEARS < '2011'")
## Loading required package: tcltk

Results

The number of Fatalities

library(sqldf)
        sumOfFatalities <- sqldf("select EVTYPE, sum(FATALITIES) as sumF from mainFullDF where FYEARS >= '1990' and FYEARS < '2011' group by EVTYPE")
print(paste0("The number of Fatalities between 1990 and 2010 was ",sum(sumOfFatalities[2])))
## [1] "The number of Fatalities between 1990 and 2010 was 10085"
        sumOfFatalitiesTop10 <- sqldf("select EVTYPE, sum(FATALITIES) as sumF from mainFullDF where FYEARS >= '1990' and FYEARS < '2011' group by EVTYPE order by sumF desc limit 10")

print(sumOfFatalitiesTop10)
##            EVTYPE sumF
## 1  EXCESSIVE HEAT 1867
## 2         TORNADO 1165
## 3     FLASH FLOOD  910
## 4            HEAT  874
## 5       LIGHTNING  790
## 6           FLOOD  412
## 7     RIP CURRENT  339
## 8       TSTM WIND  327
## 9       HIGH WIND  244
## 10      AVALANCHE  215
library(easyGgplot2)
ggplot2.barplot(data=sumOfFatalitiesTop10, xName="EVTYPE", yName="sumF",
                groupName="EVTYPE")

The number of Injuries

library(sqldf)
        sumOfInjuries <- sqldf("select EVTYPE, sum(INJURIES) as SumI from mainFullDF where FYEARS >= '1990' and FYEARS < '2011' group by EVTYPE")
print(paste0("The number of Injuries between 1990 and 2010 was ",sum(sumOfInjuries[2])))
## [1] "The number of Injuries between 1990 and 2010 was 65907"
        sumOfInjuriesTop10 <- sqldf("select EVTYPE, sum(INJURIES) as SumI from mainFullDF where FYEARS >= '1990' and FYEARS < '2011' group by EVTYPE order by SumI desc limit 10")

print(sumOfInjuriesTop10)
##               EVTYPE  SumI
## 1            TORNADO 20511
## 2              FLOOD  6779
## 3     EXCESSIVE HEAT  6387
## 4          LIGHTNING  5036
## 5          TSTM WIND  5022
## 6          ICE STORM  1975
## 7        FLASH FLOOD  1747
## 8               HEAT  1489
## 9       WINTER STORM  1321
## 10 HURRICANE/TYPHOON  1275
library(easyGgplot2)
ggplot2.barplot(data=sumOfInjuriesTop10, xName="EVTYPE", yName="SumI",
                groupName="EVTYPE")

Subset Prop/Crop Damage Estimates values

mutatePropDMG <- function(){
        library(dplyr)
        mainFullDF <<- mainFullDF %>%
                mutate(FPROPDMG = ifelse(is.null(PROPDMGEXP) | is.na(PROPDMGEXP) | PROPDMGEXP=="",0,
                                           ifelse(PROPDMGEXP=='H',2,
                                                  ifelse(PROPDMGEXP=='K',3,
                                                        ifelse(PROPDMGEXP=='M',6,
                                                               ifelse(PROPDMGEXP=='B',9,PROPDMGEXP))))))
}

mutateCropDMG <- function(){
        library(dplyr)
        mainFullDF <<- mainFullDF %>%
                mutate(FCROPDMG = ifelse(is.null(CROPDMGEXP) | is.na(CROPDMGEXP) | CROPDMGEXP=="",0,
                                           ifelse(CROPDMGEXP=='H',2,
                                                  ifelse(CROPDMGEXP=='K',3,
                                                         ifelse(CROPDMGEXP=='M',6,
                                                                ifelse(CROPDMGEXP=='B',9,CROPDMGEXP))))))
        
}

mutatePropDMG()
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
mutateCropDMG()
names(mainFullDF)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"     "FYEARS"     "FPROPDMG"   "FCROPDMG"
        library(sqldf)
        sumOfMostdemagepropsTop10 <- sqldf("select EVTYPE, sum(FPROPDMG) as sumProp from mainFullDF group by EVTYPE order by sumProp desc limit 10")
        
        print(sumOfMostdemagepropsTop10)
##                EVTYPE sumProp
## 1                HAIL  225321
## 2           TSTM WIND  190148
## 3   THUNDERSTORM WIND  181430
## 4         FLASH FLOOD   92426
## 5             TORNADO   62025
## 6               FLOOD   45224
## 7  THUNDERSTORM WINDS   36902
## 8           HIGH WIND   36867
## 9           LIGHTNING   31136
## 10       WINTER STORM   19427
library(easyGgplot2)
ggplot2.barplot(data=sumOfMostdemagepropsTop10, xName="EVTYPE", yName="sumProp",
                groupName="EVTYPE")

        library(sqldf)
        sumOfMostdemageCropsTop10 <- sqldf("select EVTYPE, sum(FCROPDMG) as sumCrop from mainFullDF group by EVTYPE order by sumCrop desc limit 10")
        
        print(sumOfMostdemageCropsTop10)
##               EVTYPE sumCrop
## 1               HAIL  195315
## 2  THUNDERSTORM WIND  179382
## 3        FLASH FLOOD   54879
## 4              FLOOD   30951
## 5          HIGH WIND   28236
## 6            TORNADO   22449
## 7          TSTM WIND   20226
## 8       WINTER STORM   16644
## 9     WINTER WEATHER   16230
## 10        HEAVY SNOW   15282
library(easyGgplot2)
ggplot2.barplot(data=sumOfMostdemageCropsTop10, xName="EVTYPE", yName="sumCrop",
                groupName="EVTYPE")

Conclusion

  1. Across the United States (considering the years between 1990 to 2010), the type of event that’s most harmful with respect to population health is Tornado, followed by excessive heat, as the graphics and tables above are showing.
  2. Across the United States (considering the years between 1990 to 2010), the types of events that have the greatest economic consequences was Hail, followed by Thunderstorm wind, as the graphics and tables above are showing.