Synopsis

# Global chunk options: cache chunk results and echo the source code.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
knitr::opts_chunk$set(cache = TRUE, echo = TRUE)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

Set the system locale to English

Because my RStudio uses Mandarin Chinese as the system locale, loading the data into RStudio
causes problems unless the system locale is first changed to English.

# Switch every locale category to English before reading the data; the author
# reports read.csv failing with `EOF within quoted string` otherwise.
# NOTE(review): "English" looks like a Windows locale name -- confirm on other platforms.
Sys.setlocale(category = "LC_ALL", locale = "English")
## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"

Load data from Internet

# Download the compressed storm data once, then read it into `storm`.
# The existence check uses the same file name that download.file() writes, so
# the archive is not fetched again on every run.
storm_file <- "repdata_data_StormData.csv.bz2"
if (!file.exists(storm_file)) {
  url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
  # mode = "wb" keeps the bzip2 archive byte-exact (text mode can corrupt it
  # on Windows).
  download.file(url, destfile = storm_file, mode = "wb")
}
# Later steps call levels() on the text columns, so they must be factors;
# this is stated explicitly because it is no longer the default since R 4.0.0.
storm <- read.csv(bzfile(storm_file), stringsAsFactors = TRUE)

Inspect the data

# Row and column counts of the raw data
c(nrow(storm), ncol(storm))
## [1] 902297     37
# First six records
head(storm, n = 6L)
##   STATE__           BGN_DATE BGN_TIME TIME_ZONE COUNTY COUNTYNAME STATE
## 1       1  4/18/1950 0:00:00     0130       CST     97     MOBILE    AL
## 2       1  4/18/1950 0:00:00     0145       CST      3    BALDWIN    AL
## 3       1  2/20/1951 0:00:00     1600       CST     57    FAYETTE    AL
## 4       1   6/8/1951 0:00:00     0900       CST     89    MADISON    AL
## 5       1 11/15/1951 0:00:00     1500       CST     43    CULLMAN    AL
## 6       1 11/15/1951 0:00:00     2000       CST     77 LAUDERDALE    AL
##    EVTYPE BGN_RANGE BGN_AZI BGN_LOCATI END_DATE END_TIME COUNTY_END
## 1 TORNADO         0                                               0
## 2 TORNADO         0                                               0
## 3 TORNADO         0                                               0
## 4 TORNADO         0                                               0
## 5 TORNADO         0                                               0
## 6 TORNADO         0                                               0
##   COUNTYENDN END_RANGE END_AZI END_LOCATI LENGTH WIDTH F MAG FATALITIES
## 1         NA         0                      14.0   100 3   0          0
## 2         NA         0                       2.0   150 2   0          0
## 3         NA         0                       0.1   123 2   0          0
## 4         NA         0                       0.0   100 2   0          0
## 5         NA         0                       0.0   150 2   0          0
## 6         NA         0                       1.5   177 2   0          0
##   INJURIES PROPDMG PROPDMGEXP CROPDMG CROPDMGEXP WFO STATEOFFIC ZONENAMES
## 1       15    25.0          K       0                                    
## 2        0     2.5          K       0                                    
## 3        2    25.0          K       0                                    
## 4        2     2.5          K       0                                    
## 5        2     2.5          K       0                                    
## 6        6     2.5          K       0                                    
##   LATITUDE LONGITUDE LATITUDE_E LONGITUDE_ REMARKS REFNUM
## 1     3040      8812       3051       8806              1
## 2     3042      8755          0          0              2
## 3     3340      8742          0          0              3
## 4     3458      8626          0          0              4
## 5     3412      8642          0          0              5
## 6     3450      8748          0          0              6
# Compact overview of every column and its type
str(object = storm)
## 'data.frame':    902297 obs. of  37 variables:
##  $ STATE__   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_DATE  : Factor w/ 16335 levels "1/1/1966 0:00:00",..: 6523 6523 4242 11116 2224 2224 2260 383 3980 3980 ...
##  $ BGN_TIME  : Factor w/ 3608 levels "00:00:00 AM",..: 272 287 2705 1683 2584 3186 242 1683 3186 3186 ...
##  $ TIME_ZONE : Factor w/ 22 levels "ADT","AKS","AST",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ COUNTY    : num  97 3 57 89 43 77 9 123 125 57 ...
##  $ COUNTYNAME: Factor w/ 29601 levels "","5NM E OF MACKINAC BRIDGE TO PRESQUE ISLE LT MI",..: 13513 1873 4598 10592 4372 10094 1973 23873 24418 4598 ...
##  $ STATE     : Factor w/ 72 levels "AK","AL","AM",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ EVTYPE    : Factor w/ 985 levels "   HIGH SURF ADVISORY",..: 834 834 834 834 834 834 834 834 834 834 ...
##  $ BGN_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ BGN_AZI   : Factor w/ 35 levels "","  N"," NW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ BGN_LOCATI: Factor w/ 54429 levels "","- 1 N Albion",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_DATE  : Factor w/ 6663 levels "","1/1/1993 0:00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_TIME  : Factor w/ 3647 levels ""," 0900CST",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ COUNTY_END: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ COUNTYENDN: logi  NA NA NA NA NA NA ...
##  $ END_RANGE : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ END_AZI   : Factor w/ 24 levels "","E","ENE","ESE",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ END_LOCATI: Factor w/ 34506 levels "","- .5 NNW",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LENGTH    : num  14 2 0.1 0 0 1.5 1.5 0 3.3 2.3 ...
##  $ WIDTH     : num  100 150 123 100 150 177 33 33 100 100 ...
##  $ F         : int  3 2 2 2 2 2 2 1 3 3 ...
##  $ MAG       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ FATALITIES: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ INJURIES  : num  15 0 2 2 2 6 1 0 14 0 ...
##  $ PROPDMG   : num  25 2.5 25 2.5 2.5 2.5 2.5 2.5 25 25 ...
##  $ PROPDMGEXP: Factor w/ 19 levels "","-","?","+",..: 17 17 17 17 17 17 17 17 17 17 ...
##  $ CROPDMG   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CROPDMGEXP: Factor w/ 9 levels "","?","0","2",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ WFO       : Factor w/ 542 levels ""," CI","$AC",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ STATEOFFIC: Factor w/ 250 levels "","ALABAMA, Central",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ ZONENAMES : Factor w/ 25112 levels "","                                                                                                                               "| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ LATITUDE  : num  3040 3042 3340 3458 3412 ...
##  $ LONGITUDE : num  8812 8755 8742 8626 8642 ...
##  $ LATITUDE_E: num  3051 0 0 0 0 ...
##  $ LONGITUDE_: num  8806 0 0 0 0 ...
##  $ REMARKS   : Factor w/ 436781 levels "","-2 at Deer Park\n",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ REFNUM    : num  1 2 3 4 5 6 7 8 9 10 ...

Convert the column names from upper case to lower case

# Lower-case every column name, then list the result
storm <- setNames(storm, tolower(names(storm)))
names(storm)
##  [1] "state__"    "bgn_date"   "bgn_time"   "time_zone"  "county"    
##  [6] "countyname" "state"      "evtype"     "bgn_range"  "bgn_azi"   
## [11] "bgn_locati" "end_date"   "end_time"   "county_end" "countyendn"
## [16] "end_range"  "end_azi"    "end_locati" "length"     "width"     
## [21] "f"          "mag"        "fatalities" "injuries"   "propdmg"   
## [26] "propdmgexp" "cropdmg"    "cropdmgexp" "wfo"        "stateoffic"
## [31] "zonenames"  "latitude"   "longitude"  "latitude_e" "longitude_"
## [36] "remarks"    "refnum"

Fatalities/Injuries

  • calculate the total number of fatalities and injuries for each event type
# Casualties (fatalities + injuries) summed per event type, largest first
health <- storm %>%
  group_by(evtype) %>%
  summarise(total = sum(fatalities + injuries)) %>%
  arrange(desc(total))

The health data is a tibble (class tbl_df) rather than a plain data.frame, so I want to convert it to a plain data.frame

# Show the class vector carried by the summarised result
class(x = health)
## [1] "tbl_df"     "tbl"        "data.frame"
  • convert data to data.frame
# Drop the tibble classes, leaving a plain data.frame
health <- as.data.frame(x = health)
  • keep only the first 20 rows
# Keep the 20 event types with the most casualties.
# head() returns at most 20 rows, whereas indexing with 1:20 pads the result
# with all-NA rows when fewer than 20 event types are present.
health <- head(health, n = 20L)

Damage

  • the levels of cropdmgexp and propdmgexp
# Distinct exponent codes recorded for crop and property damage
levels(x = storm$cropdmgexp)
## [1] ""  "?" "0" "2" "B" "k" "K" "m" "M"
levels(x = storm$propdmgexp)
##  [1] ""  "-" "?" "+" "0" "1" "2" "3" "4" "5" "6" "7" "8" "B" "h" "H" "K"
## [18] "m" "M"
  • convert symbols to numbers

We can see that the levels of cropdmgexp and propdmgexp consist of several symbols.

B = billion, M = million, K = thousand, H = hundred; the other symbols mean unknown or not available.
I want to convert the symbols to numbers so that they are easier to work with.

# Translate damage-exponent codes into numeric multipliers.
# Codes are matched by label (case-insensitively) instead of by their position
# in levels(): the order of factor levels follows the locale's sort order, so a
# positional vector silently assigns the wrong multipliers when that order
# changes.
# H = hundred, K = thousand, M = million, B = billion; every other code
# (blank, digits, "?", "+", "-") is treated as unknown and mapped to 0.
exponent_multiplier <- function(codes) {
  known <- c(h = 1e2, k = 1e3, m = 1e6, b = 1e9)
  multiplier <- unname(known[tolower(codes)])
  multiplier[is.na(multiplier)] <- 0
  multiplier
}
# as.factor() leaves an existing factor untouched and converts a character
# column, so the recoding below works either way.
storm$cropdmgexp <- as.factor(storm$cropdmgexp)
storm$propdmgexp <- as.factor(storm$propdmgexp)
# Assigning duplicated level labels merges the corresponding levels.
levels(storm$cropdmgexp) <- exponent_multiplier(levels(storm$cropdmgexp))
levels(storm$propdmgexp) <- exponent_multiplier(levels(storm$propdmgexp))
# Confirm the recoded multiplier levels
levels(x = storm$cropdmgexp)
## [1] "0"     "1e+09" "1000"  "1e+06"
levels(x = storm$propdmgexp)
## [1] "0"     "1e+09" "100"   "1000"  "1e+06"
# Turn the multiplier factors into plain numbers: parse each level label once,
# then index the parsed values by the factor's codes
storm$cropdmgexp <- as.numeric(levels(storm$cropdmgexp))[storm$cropdmgexp]
storm$propdmgexp <- as.numeric(levels(storm$propdmgexp))[storm$propdmgexp]
  • calculate the total damage for each event type
# Crop plus property damage (amount times multiplier) summed per event type,
# largest first
damage <- storm %>%
  mutate(loss = (cropdmg * cropdmgexp) + (propdmg * propdmgexp)) %>%
  group_by(evtype) %>%
  summarise(total = sum(loss)) %>%
  arrange(desc(total))
  • convert data to data.frame
# Drop the tibble classes, leaving a plain data.frame
damage <- as.data.frame(x = damage)
  • keep only the first 20 rows
# Keep the 20 event types with the largest total damage.
# head() returns at most 20 rows, whereas indexing with 1:20 pads the result
# with all-NA rows when fewer than 20 event types are present.
damage <- head(damage, n = 20L)

Result

# Horizontal bar chart of casualties per event type, bars ordered by total
ggplot(
  data = health,
  mapping = aes(x = reorder(evtype, total), y = total, fill = evtype)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    title = "The total number of fatalities and injuries",
    x = "event type",
    y = "total fatalities/injuries"
  ) +
  theme(legend.position = "none")

# Print the table behind the casualty chart
print(health)
##                evtype total
## 1             TORNADO 96979
## 2      EXCESSIVE HEAT  8428
## 3           TSTM WIND  7461
## 4               FLOOD  7259
## 5           LIGHTNING  6046
## 6                HEAT  3037
## 7         FLASH FLOOD  2755
## 8           ICE STORM  2064
## 9   THUNDERSTORM WIND  1621
## 10       WINTER STORM  1527
## 11          HIGH WIND  1385
## 12               HAIL  1376
## 13  HURRICANE/TYPHOON  1339
## 14         HEAVY SNOW  1148
## 15           WILDFIRE   986
## 16 THUNDERSTORM WINDS   972
## 17           BLIZZARD   906
## 18                FOG   796
## 19        RIP CURRENT   600
## 20   WILD/FOREST FIRE   557

The event type most harmful to population health is TORNADO,
which caused 96979 fatalities and injuries.

The second is EXCESSIVE HEAT, which caused 8428 fatalities and injuries.

# Horizontal bar chart of total damage per event type, bars ordered by total
ggplot(
  data = damage,
  mapping = aes(x = reorder(evtype, total), y = total, fill = evtype)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    title = "The total number of damage",
    x = "event type",
    y = "total damage"
  ) +
  theme(legend.position = "none")

# Print the table behind the damage chart
print(damage)
##                       evtype        total
## 1                      FLOOD 150319678250
## 2          HURRICANE/TYPHOON  71913712800
## 3                    TORNADO  57352113590
## 4                STORM SURGE  43323541000
## 5                       HAIL  18758221670
## 6                FLASH FLOOD  17562128610
## 7                    DROUGHT  15018672000
## 8                  HURRICANE  14610229010
## 9                RIVER FLOOD  10148404500
## 10                 ICE STORM   8967041310
## 11            TROPICAL STORM   8382236550
## 12              WINTER STORM   6715441250
## 13                 HIGH WIND   5908617560
## 14                  WILDFIRE   5060586800
## 15                 TSTM WIND   5038935790
## 16          STORM SURGE/TIDE   4642038000
## 17         THUNDERSTORM WIND   3897964190
## 18            HURRICANE OPAL   3191846000
## 19          WILD/FOREST FIRE   3108626330
## 20 HEAVY RAIN/SEVERE WEATHER   2500000000

The event type with the greatest economic consequences is FLOOD, which caused 150319678250 dollars of damage.

The second is HURRICANE/TYPHOON, which caused 71913712800 dollars of damage.