Data Processing
- Data is read from the file “repdata_data_StormData.csv.bz2”
- It is read into the data frame named dat
- The data frame dat contains 902297 observations of 37 variables.
- As data is processed to dat1, dat2, dat3, dat4, and dat5, we keep track of how health and economic costs are changed in these transformations
- The event types have to be cleaned, because duplicates arise when the same event is written with different capitalization
- There are also duplicates caused by spelling typos
- dat (as read from file; only the year is retained for the date)
- dat -> dat1 (only info for 50 states and DC retained, only needed columns kept)
- dat1 -> dat2 (property damage calculated based on multiplier)
- dat2 -> dat3 (crop damage calculated based on multiplier)
- dat3 -> dat4 (events with more than 50 occurrences retained, outlier in CA corrected)
- dat4 -> dat5 (events aggregated over year)
3. Summary of Health Costs
# Baseline health impact over the full data set `dat`. These totals are the
# denominators for the "ratio retained" figures reported after each later
# filtering step.
tot.fatalaties <- sum(dat[["FATALITIES"]])
tot.injuries <- sum(dat[["INJURIES"]])

cat("The total number of fatalaties is", tot.fatalaties, "\n")
## The total number of fatalaties is 15145
cat("The total number of injuries is", tot.injuries, "\n")
## The total number of injuries is 140528
4. Select the subset of 50 states and DC, into dat1
# Build dat1: rows for the 50 US states plus DC, restricted to the columns
# needed for the health and damage analysis.
#
# State codes are selected by name against R's built-in `state.abb` (plus
# "DC") instead of by their position in unique(dat$STATE), so the selection
# does not depend on the order in which codes first appear in the file.
# Subsetting `states.all` keeps the codes in order of first appearance.
states.all <- unique(dat$STATE)
states <- states.all[states.all %in% c(state.abb, "DC")]
cat("states:\n")
## states:
print(states)
## [1] "AL" "AZ" "AR" "CA" "CO" "CT" "DE" "DC" "FL" "GA" "HI" "ID" "IL" "IN"
## [15] "IA" "KS" "KY" "LA" "ME" "MD" "MA" "MI" "MN" "MS" "MO" "MT" "NE" "NV"
## [29] "NH" "NJ" "NM" "NY" "NC" "ND" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN"
## [43] "TX" "UT" "VT" "VA" "WA" "WV" "WI" "WY" "AK"

# Columns carried forward; the *EXP columns hold the damage multiplier codes.
col.keep <- c(
  "YEAR", "EVTYPE", "STATE", "FATALITIES", "INJURIES",
  "PROPDMG", "PROPDMGEXP", "CROPDMG", "CROPDMGEXP"
)
dat1 <- subset(x = dat, subset = STATE %in% states, select = col.keep)

# Report how much of the health impact and how many rows survive this step,
# relative to the totals over the full data set.
dat1.fatalaties <- sum(dat1$FATALITIES)
cat("The total number of fatalaties in dat1 is", dat1.fatalaties, "\n")
## The total number of fatalaties in dat1 is 14867
cat("The ratio of fatalaties retained in dat1 is", dat1.fatalaties / tot.fatalaties, "\n")
## The ratio of fatalaties retained in dat1 is 0.9816441
dat1.injuries <- sum(dat1$INJURIES)
cat("The total number of injuries in dat1 is", dat1.injuries, "\n")
## The total number of injuries in dat1 is 139835
cat("The ratio of injuries retained in dat1 is", dat1.injuries / tot.injuries, "\n")
## The ratio of injuries retained in dat1 is 0.9950686
cat("The number of rows in dat1 is", nrow(dat1), "\n")
## The number of rows in dat1 is 883623
cat("The ratio of rows retained in dat1 is", nrow(dat1) / nrow(dat), "\n")
## The ratio of rows retained in dat1 is 0.9793039
5. Processing to find property damage costs in dat2
# Build dat2: convert PROPDMG to absolute amounts using the PROPDMGEXP code.
# A missing code is treated as a multiplier of 1; rows whose code is not one
# of "K", "M", "B" or "1" are dropped.
dat1$PROPDMGEXP <- replace(dat1$PROPDMGEXP, is.na(dat1$PROPDMGEXP), 1)
dat2 <- filter(dat1, PROPDMGEXP %in% c("K", "M", "B", "1"))

# Translate each code into its numeric multiplier by table lookup:
# K -> 1000, M -> 1000000, B -> 1000000000, "1" -> 1.
dat2$PROPDMGEXP <- c(1000, 1000000, 1000000000, 1)[
  match(dat2$PROPDMGEXP, c("K", "M", "B", "1"))
]
dat2$PROPDMG <- dat2$PROPDMGEXP * dat2$PROPDMG

# Health impact and row counts retained after this step.
dat2.fatalaties <- sum(dat2$FATALITIES)
cat("The total number of fatalaties in dat2 is", dat2.fatalaties, "\n")
## The total number of fatalaties in dat2 is 14860
cat("The ratio of fatalaties retained in dat2 is", dat2.fatalaties / tot.fatalaties, "\n")
## The ratio of fatalaties retained in dat2 is 0.9811819
dat2.injuries <- sum(dat2$INJURIES)
cat("The total number of injuries in dat2 is", dat2.injuries, "\n")
## The total number of injuries in dat2 is 139763
cat("The ratio of injuries retained in dat2 is", dat2.injuries / tot.injuries, "\n")
## The ratio of injuries retained in dat2 is 0.9945562
cat("The number of rows in dat2 is", nrow(dat2), "\n")
## The number of rows in dat2 is 883320
cat("The ratio of rows retained in dat2 is", nrow(dat2) / nrow(dat), "\n")
## The ratio of rows retained in dat2 is 0.9789681

# Total property damage; used as the reference for later retention ratios.
property.dat2 <- sum(dat2$PROPDMG)
cat("The total Property Damage in dat2 is", property.dat2, "\n")
## The total Property Damage in dat2 is 423825059797
6. Processing to find crop damage costs in dat3
# Build dat3: convert CROPDMG to absolute amounts using the CROPDMGEXP code.
#
# The property multiplier column has already been applied to PROPDMG, so it
# is removed first. Columns are dropped by name rather than by position so
# the step stays correct if the set or order of kept columns changes.
dat2$PROPDMGEXP <- NULL

# A missing code is treated as a multiplier of 1; rows whose code is not one
# of "K", "M", "B" or "1" are dropped.
dat2$CROPDMGEXP[is.na(dat2$CROPDMGEXP)] <- 1
dat3 <- filter(dat2, CROPDMGEXP %in% c("K", "M", "B", "1"))

# Translate each code into its numeric multiplier by table lookup:
# K -> 1000, M -> 1000000, B -> 1000000000, "1" -> 1.
dat3$CROPDMGEXP <- c(1000, 1000000, 1000000000, 1)[
  match(dat3$CROPDMGEXP, c("K", "M", "B", "1"))
]
dat3$CROPDMG <- dat3$CROPDMG * dat3$CROPDMGEXP

# Health impact and row counts retained after this step.
dat3.fatalaties <- sum(dat3$FATALITIES)
cat("The total number of fatalaties in dat3 is", dat3.fatalaties, "\n")
## The total number of fatalaties in dat3 is 14857
cat("The ratio of fatalaties retained in dat3 is", dat3.fatalaties / tot.fatalaties, "\n")
## The ratio of fatalaties retained in dat3 is 0.9809838
dat3.injuries <- sum(dat3$INJURIES)
cat("The total number of injuries in dat3 is", dat3.injuries, "\n")
## The total number of injuries in dat3 is 139743
cat("The ratio of injuries retained in dat3 is", dat3.injuries / tot.injuries, "\n")
## The ratio of injuries retained in dat3 is 0.9944139
cat("The number of rows in dat3 is", nrow(dat3), "\n")
## The number of rows in dat3 is 883272
cat("The ratio of rows retained in dat3 is", nrow(dat3) / nrow(dat), "\n")
## The ratio of rows retained in dat3 is 0.9789149

# Property damage lost by dropping rows with unrecognised crop codes.
property.dat3 <- sum(dat3$PROPDMG)
cat("The total Property Damage in dat3 is", property.dat3, "\n")
## The total Property Damage in dat3 is 423823458297
cat("The ratio of Property Damage retained in dat3 is", property.dat3 / property.dat2, "\n")
## The ratio of Property Damage retained in dat3 is 0.9999962

# Total crop damage; used as the reference for later retention ratios.
crop.dat3 <- sum(dat3$CROPDMG)
cat("The total Crop Damage in dat3 is", crop.dat3, "\n")
## The total Crop Damage in dat3 is 48291524921

# The crop multiplier column is no longer needed once CROPDMG is absolute.
dat3$CROPDMGEXP <- NULL
7. Cleaning up EVTYPE in dat4
# Build dat4: keep only event types that occur more than 50 times.
cat("The number of unique events in EVTYPE is", length(unique(dat3$EVTYPE)), "\n")
## The number of unique events in EVTYPE is 949

# Count occurrences per event type and turn the table into a plain named
# vector sorted from most to least frequent.
# NOTE(review): the counts are taken from the unfiltered `dat`, not from
# dat3 -- confirm this is intended.
evts <- table(dat$EVTYPE)
evts.name <- names(evts)
evts.vector <- setNames(as.vector(evts), evts.name)
evts.sort <- sort(evts.vector, decreasing = TRUE)
evts.sort.50 <- evts.sort[evts.sort > 50]
cat("There are", length(evts.sort.50), "events with more than 50 occurences.\n")
## There are 87 events with more than 50 occurences.

dat4 <- filter(dat3, EVTYPE %in% names(evts.sort.50))

# Health impact and row counts retained after this step.
dat4.fatalaties <- sum(dat4$FATALITIES)
cat("The total number of fatalaties in dat4 is", dat4.fatalaties, "\n")
## The total number of fatalaties in dat4 is 14456
cat("The ratio of fatalaties retained in dat4 is", dat4.fatalaties / tot.fatalaties, "\n")
## The ratio of fatalaties retained in dat4 is 0.9545064
dat4.injuries <- sum(dat4$INJURIES)
cat("The total number of injuries in dat4 is", dat4.injuries, "\n")
## The total number of injuries in dat4 is 138538
cat("The ratio of injuries retained in dat4 is", dat4.injuries / tot.injuries, "\n")
## The ratio of injuries retained in dat4 is 0.9858391
cat("The number of rows in dat4 is", nrow(dat4), "\n")
## The number of rows in dat4 is 880049
cat("The ratio of rows retained in dat4 is", nrow(dat4) / nrow(dat), "\n")
## The ratio of rows retained in dat4 is 0.9753429

# Economic damage retained, relative to the dat2 / dat3 reference totals.
property.dat4 <- sum(dat4$PROPDMG)
cat("The total Property Damage in dat4 is", property.dat4, "\n")
## The total Property Damage in dat4 is 412916487081
cat("The ratio of Property Damage retained in dat4 is", property.dat4 / property.dat2, "\n")
## The ratio of Property Damage retained in dat4 is 0.9742616
crop.dat4 <- sum(dat4$CROPDMG)
cat("The total Crop Damage in dat4 is", crop.dat4, "\n")
## The total Crop Damage in dat4 is 47154361491
cat("The ratio of Crop Damage retained in dat4 is", crop.dat4 / crop.dat3, "\n")
## The ratio of Crop Damage retained in dat4 is 0.9764521
8. Correcting for outlier in CA (mentioned in class forum, multiplier wrong)
# Locate the California record whose property damage exceeds 1e11 and scale
# it down by a factor of 1000.
#
# Rows are identified by position with which() instead of by converting the
# row names of the extracted subset to integers. The row-name approach is
# only correct while row names happen to equal row positions; which() is
# correct regardless and also ignores NA comparisons.
num <- which(dat4$STATE == "CA" & dat4$PROPDMG > 100000000000)
temp <- dat4[num, ]
cat("There is an outlier here:\n")
## There is an outlier here:
print(temp)
## YEAR EVTYPE STATE FATALITIES INJURIES PROPDMG CROPDMG
## 593640 2006 FLOOD CA 0 0 1.15e+11 32500000

# Vector-level assignment is a no-op when no row matches.
dat4$PROPDMG[num] <- dat4$PROPDMG[num] / 1000
9. Aggregating over years in dat5
# Build dat5: sum fatalities, injuries and both damage amounts within each
# YEAR / STATE / EVTYPE combination of dat4.
dat5 <- aggregate(
  cbind(FATALITIES, INJURIES, PROPDMG, CROPDMG) ~ YEAR + STATE + EVTYPE,
  data = dat4,
  FUN = sum
)

# Health impact and row counts after aggregation.
dat5.fatalaties <- sum(dat5$FATALITIES)
cat("The total number of fatalaties in dat5 is", dat5.fatalaties, "\n")
## The total number of fatalaties in dat5 is 14456
cat("The ratio of fatalaties retained in dat5 is", dat5.fatalaties / tot.fatalaties, "\n")
## The ratio of fatalaties retained in dat5 is 0.9545064
dat5.injuries <- sum(dat5$INJURIES)
cat("The total number of injuries in dat5 is", dat5.injuries, "\n")
## The total number of injuries in dat5 is 138538
cat("The ratio of injuries retained in dat5 is", dat5.injuries / tot.injuries, "\n")
## The ratio of injuries retained in dat5 is 0.9858391
cat("The number of rows in dat5 is", nrow(dat5), "\n")
## The number of rows in dat5 is 21693
cat("The ratio of rows retained in dat5 is", nrow(dat5) / nrow(dat), "\n")
## The ratio of rows retained in dat5 is 0.02404197

# Economic damage after aggregation, relative to the dat2 / dat3 totals.
property.dat5 <- sum(dat5$PROPDMG)
cat("Note the reduction from last value of Property Damage, for dat4, is only due to outlier correction\n")
## Note the reduction from last value of Property Damage, for dat4, is only due to outlier correction
cat("The total Property Damage in dat5 is", property.dat5, "\n")
## The total Property Damage in dat5 is 298031487081
cat("The ratio of Property Damage retained in dat5 is", property.dat5 / property.dat2, "\n")
## The ratio of Property Damage retained in dat5 is 0.7031946
crop.dat5 <- sum(dat5$CROPDMG)
cat("The total Crop Damage in dat5 is", crop.dat5, "\n")
## The total Crop Damage in dat5 is 47154361491
cat("The ratio of Crop Damage retained in dat5 is", crop.dat5 / crop.dat3, "\n")
## The ratio of Crop Damage retained in dat5 is 0.9764521