Synopsis

Storms and other severe weather events can cause both public health and economic problems for communities and municipalities. This project involves exploring the U.S. National Oceanic and Atmospheric Administration’s (NOAA) storm database. This database tracks characteristics of major storms and weather events in the United States, including when and where they occur, as well as estimates of any fatalities, injuries, and property damage. The data analysis must address the following questions : 1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health ? 2. Across the United States, which types of events have the greatest economic consequences ? (Source : Course material)

Data prrocessing

df <- read.csv("RepDataStorm.csv")
dd <- df
names(df)
##  [1] "STATE__"    "BGN_DATE"   "BGN_TIME"   "TIME_ZONE"  "COUNTY"    
##  [6] "COUNTYNAME" "STATE"      "EVTYPE"     "BGN_RANGE"  "BGN_AZI"   
## [11] "BGN_LOCATI" "END_DATE"   "END_TIME"   "COUNTY_END" "COUNTYENDN"
## [16] "END_RANGE"  "END_AZI"    "END_LOCATI" "LENGTH"     "WIDTH"     
## [21] "F"          "MAG"        "FATALITIES" "INJURIES"   "PROPDMG"   
## [26] "PROPDMGEXP" "CROPDMG"    "CROPDMGEXP" "WFO"        "STATEOFFIC"
## [31] "ZONENAMES"  "LATITUDE"   "LONGITUDE"  "LATITUDE_E" "LONGITUDE_"
## [36] "REMARKS"    "REFNUM"
str(df)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
##  $ BGN_TIME  : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
##  $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
##  $ STATE     : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ EVTYPE    : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : Factor w/ 35 levels "","  N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_LOCATI: Factor w/ 54429 levels "","- 1 N Albion",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_DATE  : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_TIME  : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_LOCATI: Factor w/ 34506 levels "","- .5 NNW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ WFO       : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ZONENAMES : Factor w/ 25112 levels "","                                                                                                                               "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : Factor w/ 436774 levels "","-2 at Deer Park\n",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

Counting, reporting and eliminating NAs

Automatic elimination of NAs-only columns. This is an arbitrary choice that we can make as a minority of lines contains NAs.
if(exists("SummaryTable_NA")==TRUE){
remove(SummaryTable_NA)
}

for (i in 1:ncol(df)){
  Total_NA_Days <- sum(is.na(df[i]))
  if(Total_NA_Days > 0){
    if(exists("SummaryTable_NA")==TRUE){
      SummaryTable_NA <-  rbind(SummaryTable_NA, c(Total_NA_Days, colnames(df[i])))
      print(paste("Number of NAs: ", Total_NA_Days, " in variable ", colnames(df[i]), sep=""))
    }
    if(exists("SummaryTable_NA")==FALSE){
      SummaryTable_NA <-  c(Total_NA_Days, colnames(df[i]))
      print(paste("Number of NAs: ", Total_NA_Days, " in variable ", colnames(df[i]), sep=""))
      
    }
    }
}
## [1] "Number of NAs: 902297 in variable COUNTYENDN"
## [1] "Number of NAs: 843563 in variable F"
## [1] "Number of NAs: 47 in variable LATITUDE"
## [1] "Number of NAs: 40 in variable LATITUDE_E"
for (i in ncol(df):1){
  Total_NA_Days <- sum(is.na(df[i]))
  if(Total_NA_Days > 0.5 * nrow(df)){
    print(paste("The following variable has been automatically removed: ", colnames(df[i]), " as the entire column has NAs", sep=""))
    df[i] <- NULL
  }
  
}
## [1] "The following variable has been automatically removed: F as the entire column has NAs"
## [1] "The following variable has been automatically removed: COUNTYENDN as the entire column has NAs"
### Removing NA by deleting rows with NA
df <- na.omit(df)
print(paste(abs(nrow(df)-nrow(dd))," rows containing NAs have been deleted", sep=""))
## [1] "47 rows containing NAs have been deleted"

Automatic eliminationm of NAs-only columns

Columns which cannot be used and might create some system issues should be removed.
for (i in ncol(df):1){
  if(is.null(nrow(df[i]))== TRUE){
    df[i] <- NULL
  }
  
}

Isolating Data which will be used

In order to manipulate a lighter database we will keep only columns that are the most relevant to us.
SelectedVariable <- c("EVTYPE", "FATALITIES", "INJURIES", "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP")
Newdf <- df[SelectedVariable]
str(Newdf)
## 'data.frame':    902250 obs. of  7 variables:
##  $ EVTYPE    : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
Newdf_Safe_Copy <- Newdf

Converting Property & Crop Damage Data

Datas are stored using string, we will convert them in their actual dollar equivalent amount.
Newdf$PROPDMGEXP <- mapvalues(Newdf$PROPDMGEXP, from = c("K", "M","", "B", "m", "+", "0", "5", "6", "?", "4", "2", "3", "h", "7", "H", "-", "1", "8"), to = c(10^3, 10^6, 1, 10^9, 10^6, 0,1,10^5, 10^6, 0, 10^4, 10^2, 10^3, 10^2, 10^7, 10^2, 0, 10, 10^8))
Newdf$PROPDMGEXP <- as.numeric(as.character(Newdf$PROPDMGEXP))
Newdf$PROPDMGTOTAL <- (Newdf$PROPDMG * Newdf$PROPDMGEXP)/1000000000

Newdf$CROPDMGEXP <- mapvalues(Newdf$CROPDMGEXP, from = c("","M", "K", "m", "B", "?", "0", "k","2"), to = c(1,10^6, 10^3, 10^6, 10^9, 0, 1, 10^3, 10^2))
Newdf$CROPDMGEXP <- as.numeric(as.character(Newdf$CROPDMGEXP))
Newdf$CROPDMGTOTAL <- (Newdf$CROPDMG * Newdf$CROPDMGEXP)/1000000000

Results

Managing Total Fatalities

TotalHarmfulEvents <- aggregate(FATALITIES ~ EVTYPE, data = Newdf,  FUN="sum")
TenMostHarmful <- TotalHarmfulEvents[order(-TotalHarmfulEvents$FATALITIES), ][1:10, ]
print(TenMostHarmful)
##             EVTYPE FATALITIES
## 834        TORNADO       5633
## 130 EXCESSIVE HEAT       1903
## 153    FLASH FLOOD        978
## 275           HEAT        937
## 464      LIGHTNING        816
## 856      TSTM WIND        504
## 170          FLOOD        470
## 585    RIP CURRENT        368
## 359      HIGH WIND        248
## 19       AVALANCHE        224
MaxValue <- max(TotalHarmfulEvents$FATALITIES)
k <- which(TotalHarmfulEvents[,2] == MaxValue)
Most_Harmful_Event_Fatalities <- TotalHarmfulEvents$EVTYPE[k]
Most_Harmful_Event_Fatalities
## [1] TORNADO
## 985 Levels:    HIGH SURF ADVISORY  COASTAL FLOOD ... WND
par(mfrow = c(1,1), mar = c(12, 4, 3, 2), mgp = c(3, 1, 0), cex = 0.8)
barplot(TenMostHarmful$FATALITIES, names.arg = TenMostHarmful$EVTYPE, las = 3, main = "10 Most harmful events - Fatalities", ylab = "Total Fatalities - Sum")

dev.copy(png, "Ten_Most_Harmful_event_Fatalities.png", width = 480, height = 480)
## png 
##   3
dev.off()
## png 
##   2

Managing Injuries

TotalInjuries <- aggregate(INJURIES ~ EVTYPE, data = Newdf,  FUN="sum")
TeMostHarmful_iNjuries <- TotalInjuries[order(-TotalInjuries$INJURIES), ][1:10, ]

MaxValue <- max(TeMostHarmful_iNjuries$INJURIES)
k <- which(TeMostHarmful_iNjuries[,2] == MaxValue)
Most_Harmful_Event_Injuries <- TeMostHarmful_iNjuries$EVTYPE[k]
Most_Harmful_Event_Injuries
## [1] TORNADO
## 985 Levels:    HIGH SURF ADVISORY  COASTAL FLOOD ... WND
par(mfrow = c(1,1), mar = c(12, 6, 3, 2), mgp = c(4, 1, 0), cex = 0.8)
barplot(TeMostHarmful_iNjuries$INJURIES, names.arg = TeMostHarmful_iNjuries$EVTYPE, las = 3, main = "10 Most hamrful events - Injuries", ylab = "Total Injuries")

dev.copy(png, "Ten_Most_Harmful_event_Injuries.png", width = 480, height = 480)
## png 
##   3
dev.off()
## png 
##   2

Assessing Property Damage

TotalPropertyDamageEvent <- aggregate(PROPDMGTOTAL ~ EVTYPE, data = Newdf,  FUN="sum")

TotalPropertyDamage <- TotalPropertyDamageEvent[order(-TotalPropertyDamageEvent$PROPDMGTOTAL), ][1:10, ]
TotalPropertyDamage
##                EVTYPE PROPDMGTOTAL
## 170             FLOOD   144.657710
## 411 HURRICANE/TYPHOON    69.305840
## 834           TORNADO    56.947381
## 670       STORM SURGE    43.323536
## 153       FLASH FLOOD    16.822424
## 244              HAIL    15.735268
## 402         HURRICANE    11.868319
## 848    TROPICAL STORM     7.703891
## 972      WINTER STORM     6.688497
## 359         HIGH WIND     5.270046
MaxValue <- max(TotalPropertyDamageEvent$PROPDMGTOTAL)
k <- which(TotalPropertyDamageEvent[,2] == MaxValue)
Most_Harmful_Event_Property_Damage <- TotalPropertyDamageEvent$EVTYPE[k]
Most_Harmful_Event_Property_Damage
## [1] FLOOD
## 985 Levels:    HIGH SURF ADVISORY  COASTAL FLOOD ... WND
par(mfrow = c(1,1), mar = c(12, 6, 3, 2), mgp = c(3, 1, 0), cex = 0.8)
barplot(TotalPropertyDamage$PROPDMGTOTAL, names.arg = TotalPropertyDamage$EVTYPE, las = 3, main = "10 hamrfule events - Properties Damage", ylab = "Value of Damages (Bln)")

dev.copy(png, "TotalPropertyDamagePerEvent.png", width = 480, height = 480)
## png 
##   3

Crop Damage

Total_Crop_DamageEvent <- aggregate(CROPDMGTOTAL ~ EVTYPE, data = Newdf,  FUN="sum")
Total_Crop_Damage <- Total_Crop_DamageEvent[order(-Total_Crop_DamageEvent$CROPDMGTOTAL), ][1:10, ]
Total_Crop_Damage
##                EVTYPE CROPDMGTOTAL
## 95            DROUGHT    13.972566
## 170             FLOOD     5.661968
## 590       RIVER FLOOD     5.029459
## 427         ICE STORM     5.022113
## 244              HAIL     3.025954
## 402         HURRICANE     2.741910
## 411 HURRICANE/TYPHOON     2.607873
## 153       FLASH FLOOD     1.421067
## 140      EXTREME COLD     1.292973
## 212      FROST/FREEZE     1.094086
MaxValue <- max(Total_Crop_DamageEvent$CROPDMGTOTAL)
k <- which(Total_Crop_DamageEvent[,2] == MaxValue)
Most_Harmful_Event_Crop_Damage <- Total_Crop_DamageEvent$EVTYPE[k]
Most_Harmful_Event_Crop_Damage
## [1] DROUGHT
## 985 Levels:    HIGH SURF ADVISORY  COASTAL FLOOD ... WND
par(mfrow = c(1,1), mar = c(10, 6, 3, 2), mgp = c(3, 1, 0), cex = 0.6)
barplot(Total_Crop_Damage$CROPDMGTOTAL, names.arg = Total_Crop_Damage$EVTYPE, las = 2, main = "10 hamrfule events - Crop Damage", ylab = "Value of Damages (Bln)")

dev.copy(png, "Total_Crop_DamagePerEvent.png", width = 480, height = 480)
## png 
##   4

Conclusion

print(paste(Most_Harmful_Event_Fatalities," is makes the most fatalities out of all storms in the US, based on our dataset.", sep=""))
## [1] "TORNADO is makes the most fatalities out of all storms in the US, based on our dataset."
print(paste(Most_Harmful_Event_Injuries,"is makes the most injuries out of all storms", sep=""))
## [1] "TORNADOis makes the most injuries out of all storms"
print(paste(Most_Harmful_Event_Property_Damage,"is makes the most property damages out of all storms", sep=""))
## [1] "FLOODis makes the most property damages out of all storms"
print(paste(Most_Harmful_Event_Crop_Damage,"is makes the most crop damages out of all storms", sep=""))
## [1] "DROUGHTis makes the most crop damages out of all storms"