# importing all needed library
suppressPackageStartupMessages({
  library(data.table)
  library(dplyr)
  library(ggplot2)
  library(DataExplorer)
  library(tidyr)
  library(ggrepel)
  library(ggthemes)
})

Synopsis

There are various kinds of weather events happend in U.S. Some of them are just an ordinary event that has no harm to our life, while some others are very destructive both in health and economic side. The problem is which event does have greatest impact to our community for both economic and health perspective. This analysis made to answer that question. This analysis started from the raw data, processing it by using various tools including some basic statistical measurement. And finally get a conclusion as the answer to the main questions defined.

Data Processing

In this analysis, we use storm data that can be imported from Storm Data. The documentation about the data itself can be found here. If you have questions about the data, you might want to quick check the FAQ section here.

To import the data, I used fread() function from data.table package.

# import data set to our working environment
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"

df_weather <- fread(fileUrl, na.strings = c("", "NA"))

First of all, let’s summarize the data

# inspecting the data set
head(df_weather)
##    STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1:       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL
## 2:       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL
## 3:       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL
## 4:       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL
## 5:       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL
## 6:       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL
##     EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END COUNTYENDN
## 1: TORNADO         0    <NA>       <NA>     <NA>     <NA>          0         NA
## 2: TORNADO         0    <NA>       <NA>     <NA>     <NA>          0         NA
## 3: TORNADO         0    <NA>       <NA>     <NA>     <NA>          0         NA
## 4: TORNADO         0    <NA>       <NA>     <NA>     <NA>          0         NA
## 5: TORNADO         0    <NA>       <NA>     <NA>     <NA>          0         NA
## 6: TORNADO         0    <NA>       <NA>     <NA>     <NA>          0         NA
##    END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES INJURIES PROPDMG
## 1:         0    <NA>       <NA>   14.0   100 3   0          0       15    25.0
## 2:         0    <NA>       <NA>    2.0   150 2   0          0        0     2.5
## 3:         0    <NA>       <NA>    0.1   123 2   0          0        2    25.0
## 4:         0    <NA>       <NA>    0.0   100 2   0          0        2     2.5
## 5:         0    <NA>       <NA>    0.0   150 2   0          0        2     2.5
## 6:         0    <NA>       <NA>    1.5   177 2   0          0        6     2.5
##    PROPDMGEXP CROPDMG CROPDMGEXP  WFO STATEOFFIC ZONENAMES LATITUDE LONGITUDE
## 1:          K       0       <NA> <NA>       <NA>      <NA>     3040      8812
## 2:          K       0       <NA> <NA>       <NA>      <NA>     3042      8755
## 3:          K       0       <NA> <NA>       <NA>      <NA>     3340      8742
## 4:          K       0       <NA> <NA>       <NA>      <NA>     3458      8626
## 5:          K       0       <NA> <NA>       <NA>      <NA>     3412      8642
## 6:          K       0       <NA> <NA>       <NA>      <NA>     3450      8748
##    LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1:       3051       8806    <NA>      1
## 2:          0          0    <NA>      2
## 3:          0          0    <NA>      3
## 4:          0          0    <NA>      4
## 5:          0          0    <NA>      5
## 6:          0          0    <NA>      6
# summarize the data
summary(df_weather)
##     STATE__       BGN_DATE           BGN_TIME          TIME_ZONE        
##  Min.   : 1.0   Length:902297      Length:902297      Length:902297     
##  1st Qu.:19.0   Class :character   Class :character   Class :character  
##  Median :30.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :31.2                                                           
##  3rd Qu.:45.0                                                           
##  Max.   :95.0                                                           
##                                                                         
##      COUNTY       COUNTYNAME           STATE              EVTYPE         
##  Min.   :  0.0   Length:902297      Length:902297      Length:902297     
##  1st Qu.: 31.0   Class :character   Class :character   Class :character  
##  Median : 75.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :100.6                                                           
##  3rd Qu.:131.0                                                           
##  Max.   :873.0                                                           
##                                                                          
##    BGN_RANGE          BGN_AZI           BGN_LOCATI          END_DATE        
##  Min.   :   0.000   Length:902297      Length:902297      Length:902297     
##  1st Qu.:   0.000   Class :character   Class :character   Class :character  
##  Median :   0.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :   1.484                                                           
##  3rd Qu.:   1.000                                                           
##  Max.   :3749.000                                                           
##                                                                             
##    END_TIME           COUNTY_END COUNTYENDN       END_RANGE       
##  Length:902297      Min.   :0    Mode:logical   Min.   :  0.0000  
##  Class :character   1st Qu.:0    NA's:902297    1st Qu.:  0.0000  
##  Mode  :character   Median :0                   Median :  0.0000  
##                     Mean   :0                   Mean   :  0.9862  
##                     3rd Qu.:0                   3rd Qu.:  0.0000  
##                     Max.   :0                   Max.   :925.0000  
##                                                                   
##    END_AZI           END_LOCATI            LENGTH              WIDTH         
##  Length:902297      Length:902297      Min.   :   0.0000   Min.   :   0.000  
##  Class :character   Class :character   1st Qu.:   0.0000   1st Qu.:   0.000  
##  Mode  :character   Mode  :character   Median :   0.0000   Median :   0.000  
##                                        Mean   :   0.2301   Mean   :   7.503  
##                                        3rd Qu.:   0.0000   3rd Qu.:   0.000  
##                                        Max.   :2315.0000   Max.   :4400.000  
##                                                                              
##        F               MAG            FATALITIES          INJURIES        
##  Min.   :0.0      Min.   :    0.0   Min.   :  0.0000   Min.   :   0.0000  
##  1st Qu.:0.0      1st Qu.:    0.0   1st Qu.:  0.0000   1st Qu.:   0.0000  
##  Median :1.0      Median :   50.0   Median :  0.0000   Median :   0.0000  
##  Mean   :0.9      Mean   :   46.9   Mean   :  0.0168   Mean   :   0.1557  
##  3rd Qu.:1.0      3rd Qu.:   75.0   3rd Qu.:  0.0000   3rd Qu.:   0.0000  
##  Max.   :5.0      Max.   :22000.0   Max.   :583.0000   Max.   :1700.0000  
##  NA's   :843563                                                           
##     PROPDMG         PROPDMGEXP           CROPDMG         CROPDMGEXP       
##  Min.   :   0.00   Length:902297      Min.   :  0.000   Length:902297     
##  1st Qu.:   0.00   Class :character   1st Qu.:  0.000   Class :character  
##  Median :   0.00   Mode  :character   Median :  0.000   Mode  :character  
##  Mean   :  12.06                      Mean   :  1.527                     
##  3rd Qu.:   0.50                      3rd Qu.:  0.000                     
##  Max.   :5000.00                      Max.   :990.000                     
##                                                                           
##      WFO             STATEOFFIC         ZONENAMES            LATITUDE   
##  Length:902297      Length:902297      Length:902297      Min.   :   0  
##  Class :character   Class :character   Class :character   1st Qu.:2802  
##  Mode  :character   Mode  :character   Mode  :character   Median :3540  
##                                                           Mean   :2875  
##                                                           3rd Qu.:4019  
##                                                           Max.   :9706  
##                                                           NA's   :47    
##    LONGITUDE        LATITUDE_E     LONGITUDE_       REMARKS         
##  Min.   :-14451   Min.   :   0   Min.   :-14455   Length:902297     
##  1st Qu.:  7247   1st Qu.:   0   1st Qu.:     0   Class :character  
##  Median :  8707   Median :   0   Median :     0   Mode  :character  
##  Mean   :  6940   Mean   :1452   Mean   :  3509                     
##  3rd Qu.:  9605   3rd Qu.:3549   3rd Qu.:  8735                     
##  Max.   : 17124   Max.   :9706   Max.   :106220                     
##                   NA's   :40                                        
##      REFNUM      
##  Min.   :     1  
##  1st Qu.:225575  
##  Median :451149  
##  Mean   :451149  
##  3rd Qu.:676723  
##  Max.   :902297  
## 

Before going further with the data, we need first to rename the variables into lower case. Just to make the analysis become easier.

# rename the variables name
names(df_weather) <- tolower(names(df_weather))

From the summary, first we can see that in our data, there is gap between the measurement data, the fatalities, injuries, propdmg, and cropdmg and second our data has some NA’s value here. For the first problem, we will apply scale() function to them to standardize the scale of the data.

df_weather <- df_weather %>% 
  mutate(fatalities = scale(fatalities),
         injuries = scale(injuries),
         propdmg = scale(propdmg),
         cropdmg = scale(cropdmg))

To get know more about the missing value, let’s plot them using plot_missing() function from DataExplorer package.

plot_missing(df_weather,
             title = "Missing Data From All Initial Data Set Variables/Features",
             theme_config = list(plot.title = element_text(hjust = .5)))
Plotting the Missing Data in Overall Data Set

Plotting the Missing Data in Overall Data Set

The plot clearly shows that the countyendn has 100% missing value, f has 93.49% missing values, and end_azi has 80.33% missing values. Considering their very high missing values, I am sure that they has no significant meaning to the analysis at least for now, so we can remove them as the plot suggesting to remove them. Since we want to measure the impact of the weather event to the prop/crop damage and health, then we can say roughly, we don’t need these variables that has missing values as majority (more than 10%), because they have no significant meaning to the main analysis.

# Removing unused variables with high number of missing values
df_weather <- df_weather %>% 
  select(-c(countyendn, f, end_azi, cropdmgexp, zonenames, bgn_azi, end_locati,
            propdmgexp, bgn_locati, remarks, stateoffic, end_date, end_time, wfo))

Now, let’s check the class of the variables.

str(df_weather)
## Classes 'data.table' and 'data.frame':   902297 obs. of  23 variables:
##  $ state__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ bgn_date  : chr  "4/18/1950 0:00:00" "4/18/1950 0:00:00" "2/20/1951 0:00:00" "6/8/1951 0:00:00" ...
##  $ bgn_time  : chr  "0130" "0145" "1600" "0900" ...
##  $ time_zone : chr  "CST" "CST" "CST" "CST" ...
##  $ county    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ countyname: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ state     : chr  "AL" "AL" "AL" "AL" ...
##  $ evtype    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ bgn_range : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ county_end: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ end_range : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ length    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ width     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ mag       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fatalities: num [1:902297, 1] -0.0219 -0.0219 -0.0219 -0.0219 -0.0219 ...
##   ..- attr(*, "scaled:center")= num 0.0168
##   ..- attr(*, "scaled:scale")= num 0.765
##  $ injuries  : num [1:902297, 1] 2.7328 -0.0287 0.3395 0.3395 0.3395 ...
##   ..- attr(*, "scaled:center")= num 0.156
##   ..- attr(*, "scaled:scale")= num 5.43
##  $ propdmg   : num [1:902297, 1] 0.218 -0.161 0.218 -0.161 -0.161 ...
##   ..- attr(*, "scaled:center")= num 12.1
##   ..- attr(*, "scaled:scale")= num 59.5
##  $ cropdmg   : num [1:902297, 1] -0.0689 -0.0689 -0.0689 -0.0689 -0.0689 ...
##   ..- attr(*, "scaled:center")= num 1.53
##   ..- attr(*, "scaled:scale")= num 22.2
##  $ latitude  : num  3040 3042 3340 3458 3412 ...
##  $ longitude : num  8812 8755 8742 8626 8642 ...
##  $ latitude_e: num  3051 0 0 0 0 ...
##  $ longitude_: num  8806 0 0 0 0 ...
##  $ refnum    : num  1 2 3 4 5 6 7 8 9 10 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Here, we can see that we have date that is still stored as character. It will be easier for us to make analysis in date time format. Remember that in R, we can save date time format either in POSIXct or POSIXlt class. We also want to change the state__, county, state, and event type (evtype) to be a factor just in case if we need them in plot.

# chnage the type of some variables
df_weather <- df_weather %>% 
  mutate(bgn_date = as.POSIXct(bgn_date, format = "%m/%d/%Y %H:%M:%S"),
         county = as.factor(county),
         state = as.factor(state),
         state__ = as.factor(state__))

str(df_weather)
## Classes 'data.table' and 'data.frame':   902297 obs. of  23 variables:
##  $ state__   : Factor w/ 70 levels "1","2","4","5",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ bgn_date  : POSIXct, format: "1950-04-18" "1950-04-18" ...
##  $ bgn_time  : chr  "0130" "0145" "1600" "0900" ...
##  $ time_zone : chr  "CST" "CST" "CST" "CST" ...
##  $ county    : Factor w/ 557 levels "0","1","2","3",..: 98 4 58 90 44 78 10 124 126 58 ...
##  $ countyname: chr  "MOBILE" "BALDWIN" "FAYETTE" "MADISON" ...
##  $ state     : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ evtype    : chr  "TORNADO" "TORNADO" "TORNADO" "TORNADO" ...
##  $ bgn_range : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ county_end: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ end_range : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ length    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ width     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ mag       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fatalities: num [1:902297, 1] -0.0219 -0.0219 -0.0219 -0.0219 -0.0219 ...
##   ..- attr(*, "scaled:center")= num 0.0168
##   ..- attr(*, "scaled:scale")= num 0.765
##  $ injuries  : num [1:902297, 1] 2.7328 -0.0287 0.3395 0.3395 0.3395 ...
##   ..- attr(*, "scaled:center")= num 0.156
##   ..- attr(*, "scaled:scale")= num 5.43
##  $ propdmg   : num [1:902297, 1] 0.218 -0.161 0.218 -0.161 -0.161 ...
##   ..- attr(*, "scaled:center")= num 12.1
##   ..- attr(*, "scaled:scale")= num 59.5
##  $ cropdmg   : num [1:902297, 1] -0.0689 -0.0689 -0.0689 -0.0689 -0.0689 ...
##   ..- attr(*, "scaled:center")= num 1.53
##   ..- attr(*, "scaled:scale")= num 22.2
##  $ latitude  : num  3040 3042 3340 3458 3412 ...
##  $ longitude : num  8812 8755 8742 8626 8642 ...
##  $ latitude_e: num  3051 0 0 0 0 ...
##  $ longitude_: num  8806 0 0 0 0 ...
##  $ refnum    : num  1 2 3 4 5 6 7 8 9 10 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Now, we can focus on our weather event “evtype” and look further inside it.

length(unique(df_weather$evtype))
## [1] 985

We have 985 kinds of weather events here. It is not possible to examine them one by one, but we can take the 10 or 20 examples of them just to check the validity name of the event to ensure they are the data we need.

unique(df_weather$evtype)[1:10]
##  [1] "TORNADO"                   "TSTM WIND"                
##  [3] "HAIL"                      "FREEZING RAIN"            
##  [5] "SNOW"                      "ICE STORM/FLASH FLOOD"    
##  [7] "SNOW/ICE"                  "WINTER STORM"             
##  [9] "HURRICANE OPAL/HIGH WINDS" "THUNDERSTORM WINDS"

In order to get the subset of the data, we will get data that only have higher mean values of the impact for each type of event. In simple, we will start examining the weather event that has value greater than its overall mean. Before applying this calculation, we will also get the mean for each destruction impact for each aspect.

# This code will get subset of the data by doing the following
# First it will summarize the mean of the impact for each type of destruction
# by each event type
# Then it will get the subset of the data where the destruction has the higher
# value of the mean of the destruction from all event type
# If the value of the mean of the destruction type does lower than its mean,
# it will be set to NA, original value taken instead
df_weather_mean <- df_weather %>% 
  group_by(evtype) %>% 
  summarize(fatalities = mean(fatalities),
            injuries = mean(injuries),
            propdmg = mean(propdmg),
            cropdmg = mean(cropdmg)) %>%
  ungroup() %>% 
  mutate(fatalities = ifelse(fatalities > mean(fatalities), fatalities, NA),
         injuries = ifelse(injuries > mean(injuries), injuries, NA),
         propdmg = ifelse(propdmg > mean(propdmg), propdmg, NA),
         cropdmg = ifelse(cropdmg > mean(cropdmg), cropdmg, NA)) %>% 
  filter(!is.na(fatalities) | !is.na(injuries) | !is.na(propdmg) |
         !is.na(cropdmg))
## `summarise()` ungrouping output (override with `.groups` argument)

Result

We have the value from each destructive event, both from health and economic perspective. In order to get a conclusion about the high impact health perspective, we of course need to weighting the injuries and fatalities, while the economic can be weighted by its propdmg and cropdmg. Why? This is because the health effect must be measured by not only the fatalities, but also the injuries. It means, even though the event has very strong fatalities, but it has less injuries, then we can prefer to focus on the event with average fatalities with higher injuries values. This is also applied to economic side from the prop and crop damage.

# health effect
df_weather_mean %>% 
  ggplot(aes(x = fatalities, y = injuries)) +
  geom_point(aes(col = as.factor(evtype))) +
  geom_label_repel(aes(label = tolower(evtype))) +
  guides(col = FALSE) +
  labs(title = "Plot Between Fatalities and Injuries\n(Health Perspective)",
       x = "Fatalities",
       y = "Injuries")+
  theme(plot.title = element_text(hjust = .5))
## Warning: Removed 245 rows containing missing values (geom_point).
## Warning: Removed 245 rows containing missing values (geom_label_repel).
Data Plotting for Health Perspective

Data Plotting for Health Perspective

df_weather_mean %>% 
  ggplot(aes(x = propdmg, y = cropdmg)) +
  geom_point(aes(col = as.factor(evtype))) +
  geom_label_repel(aes(label = tolower(evtype))) +
  guides(col = FALSE) +
  labs(title = "Plot Between Properties and Crop Damage\n(Economic Perspective)",
       x = "Properties Damage",
       y = "Crop Damage") +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 249 rows containing missing values (geom_point).
## Warning: Removed 249 rows containing missing values (geom_label_repel).
Data Plotting for Economic Perspective

Data Plotting for Economic Perspective

From the plot, it is clearly that we can answer both questions easily.

  1. Across the United States, which types of events (as indicated in the EVTYPE variable) are most harmful with respect to population health? The answer is TROPICAL STORM GORDON.

  2. Across the United States, which types of events have the greatest economic consequences? The answer is also TROPICAL STORM GORDON.

So the conclusion of the analysis is that TROPICAL STORM GORDON is the most harmful with respect to the population health and have the greatest economic consequences.