#https://github.com/UrbanInstitute/urbnmapr #https://medium.com/@urban_institute/how-to-create-state-and-county-maps-easily-in-r-577d29300bb2
#https://www.tandfonline.com/doi/full/10.1080/2330443X.2017.1408439
Does an increase in powerplant sites in each county lead to an increase in cancer rate?
# remove US territories:
superfund <- superfund[!(superfund$state == "AS" | superfund$state == "MP" | superfund$state=="GU" | superfund$state == "PR" | superfund$state == "VI"),]
superfund_inter <- merge(x = superfund, y = fipcodes, by = c("state","county"), all.x = TRUE)
superfund <- merge(x = superfund_inter, y = fipcodesmanual, by = c("state","county"), all.x = TRUE)
table(is.na(superfund$FIPS))
## < table of extent 0 >
superfund$FIPS <- superfund$FIPS.x
superfund$FIPS[!is.na(superfund$FIPS.y)] = superfund$FIPS.y[!is.na(superfund$FIPS.y)]
superfund$FIPS[which(superfund$state == "DC")]= 11001
superfund <- superfund[!is.na(superfund$FIPS),]
##
## FALSE
## 2088
##
## FALSE
## 2918
##
## FALSE
## 8573
# average site score for each county:
avg_sitescore <- data.frame(aggregate(Site.Score ~ FIPS, data = superfund, FUN = mean))
# number of superfund sites:
sites_count <- superfund %>% group_by(FIPS) %>% tally()
names(sites_count)[2] <- "superfundsites_count"
# number of federal facilitator indicator:
ffi_yes <- superfund %>%
group_by(FIPS) %>% summarise(count = sum(Federal.Facility.Indicator =="Yes"))
names(ffi_yes)[2] <- "fed_yes"
ffi_no <- superfund %>%
group_by(FIPS) %>% summarise(count = sum(Federal.Facility.Indicator =="No"))
names(ffi_no)[2] <- "fed_no"
agg_sites <- data.frame(Reduce(function(x, y) merge(x, y, all=TRUE), list(avg_sitescore,sites_count,ffi_yes,ffi_no)))
write.csv(agg_sites,"agg_sites.csv")
West coast and areas around New York city show a recent falling trend. Many counties that show rising trend are mostly in the south and middle of the country.
## Primary Fuel Type Correlation
## 1 Biomass 0.01046
## 2 Coal 0.03817
## 3 Cogeneration -0.0246
## 4 Gas -0.01955
## 5 Geothermal -0.09358
## 6 Hydro -0.02512
## 7 Nuclear 0.06352
## 8 Oil 0.04254
## 9 Other -0.00416
## 10 Petcoke 0.01331
## 11 Solar -0.03424
## 12 Storage -0.01469
## 13 Waste 0.07453
## 14 Wind -0.0676
## [1] 0.0548556496 0.0574804728 -0.0034205405 0.0570932218 -0.0936945869
## [6] -0.0009187172 0.0601585969 0.0268915227 -0.0064996373 0.0231038227
## [11] -0.0053740829 0.0205004066 0.1029651196 -0.0610151015
## Primary.Fuel.Type Correlation Cor_rising Cor_falling Cor_stable
## 1 Biomass 0.01046 0.08552 -0.12134 0.05486
## 2 Coal 0.03817 -0.1883 -0.10739 0.05748
## 3 Cogeneration -0.0246 -0.05466 -0.06597 -0.00342
## 4 Gas -0.01955 -0.02781 -0.09717 0.05709
## 5 Geothermal -0.09358 <NA> -0.0941 -0.09369
## 6 Hydro -0.02512 -0.01999 -0.0596 -0.00092
## 7 Nuclear 0.06352 -0.02855 0.10381 0.06016
## 8 Oil 0.04254 0.21631 0.12448 0.02689
## 9 Other -0.00416 <NA> 0.01741 -0.0065
## 10 Petcoke 0.01331 -0.02855 -0.04907 0.0231
## 11 Solar -0.03424 -0.23797 -0.06011 -0.00537
## 12 Storage -0.01469 <NA> -0.06577 0.0205
## 13 Waste 0.07453 0.06892 0.06773 0.10297
## 14 Wind -0.0676 -0.3668 -0.09169 -0.06102
library(corrplot)
library(PerformanceAnalytics)
all_cor <- subset(all_three, select = c("FIPS", "Age.AdjustedCANCERcases_per100000",
"Site.Score",
"fed_yes","fed_no","Biomass","Coal","Cogeneration","Gas","Geothermal","Hydro","Nuclear","Oil","Other","Petcoke","Solar","Storage","Waste","Wind","plant_count","avg_cap"))
names(all_cor)[2] <- "AdjustedCancerRCases"
all_complete <- na.omit(all_cor)
res <- cor(all_complete[,2:ncol(all_complete)])
corrplot(res, type = "upper", order = "hclust",
tl.col = "black", tl.srt = 45)
# Average capacity per county # Something # Something # Something # Transform data further for the drop down menus:
##
## Call:
## lm(formula = Age.AdjustedCANCERcases_per100000 ~ Biomass + Coal +
## Cogeneration + Gas + Geothermal + Hydro + Nuclear + Oil +
## Other + Petcoke + Solar + Storage + Waste + Wind, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -209.363 -29.750 5.748 34.677 230.658
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 444.8385 1.8538 239.958 < 2e-16 ***
## Biomass 3.1170 4.7828 0.652 0.514699
## Coal 4.8621 2.7845 1.746 0.081023 .
## Cogeneration -10.7086 9.1120 -1.175 0.240121
## Gas -0.6216 0.9347 -0.665 0.506175
## Geothermal -7.9753 2.4432 -3.264 0.001125 **
## Hydro -0.6393 0.7107 -0.899 0.368584
## Nuclear 13.9560 7.7711 1.796 0.072739 .
## Oil 1.7037 1.3758 1.238 0.215801
## Other -0.3822 14.1878 -0.027 0.978512
## Petcoke 11.7479 14.9129 0.788 0.430974
## Solar -0.1693 0.3875 -0.437 0.662154
## Storage -4.1388 7.0243 -0.589 0.555821
## Waste 8.0923 2.3638 3.423 0.000637 ***
## Wind -2.8749 1.0163 -2.829 0.004743 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52.56 on 1336 degrees of freedom
## Multiple R-squared: 0.03188, Adjusted R-squared: 0.02174
## F-statistic: 3.143 on 14 and 1336 DF, p-value: 7.163e-05
## [1] 49.10689
library(rpart)
library(rattle)
library(tree)
set.seed(123)
# index_final <- sample(1:nrow(all_complete),nrow(all_complete)*0.75)
# train_final = all_complete[index_final,]
# test_final = all_complete[-index_final,]
# Decisoin Tree
tree_simple <- rpart(Age.AdjustedCANCERcases_per100000 ~ Biomass+
Coal+
Cogeneration+
Gas+
Geothermal+
Hydro+
Nuclear+
Oil+
Other+
Petcoke+
Solar+
Storage+
Waste+
Wind, data= train)
pred_test <- predict(tree_simple, newdata = test)
tree_rmse <- sqrt(sum((pred_test - test$Age.AdjustedCANCERcases_per100000)^2/nrow(test),na.rm=TRUE))
print(round(tree_rmse,5))
## [1] 48.3234
fancyRpartPlot(tree_simple, caption = 'Full tree')
savePlotToFile('simple_tree.png')
## [1] TRUE
cancer_superfund <- subset(all_three, select = c("FIPS","Age.AdjustedCANCERcases_per100000","Site.Score","superfundsites_count","fed_yes","fed_no"))
cancer_superfund <- na.omit(cancer_superfund)
cor(cancer_superfund$Age.AdjustedCANCERcases_per100000, cancer_superfund$Site.Score)
## [1] 0.0154947
index_2 <- sample(1:nrow(cancer_superfund),nrow(cancer_superfund)*0.75)
train_2 = cancer_superfund[index_2,]
test_2 = cancer_superfund[-index_2,]
tree_simple2 <- rpart(Age.AdjustedCANCERcases_per100000 ~ Site.Score+
superfundsites_count+fed_yes+fed_no, data=cancer_superfund)
pred_test2 <- predict(tree_simple2, newdata = test_2)
tree_rmse2 <- sqrt(mean((pred_test2-test_2$Age.AdjustedCANCERcases_per100000)^2))
print(tree_rmse2)
## [1] 43.30044
fancyRpartPlot(tree_simple2, caption = 'Full tree')
Distance between superfundsites and distance to the hard-hitting county. (People could be living in different counties but have interactions with counties with high superfund sites.)
Type of cancer vs superfund sites Type of cancer vs powerplants