#https://github.com/UrbanInstitute/urbnmapr #https://medium.com/@urban_institute/how-to-create-state-and-county-maps-easily-in-r-577d29300bb2

read the following article:

#https://www.tandfonline.com/doi/full/10.1080/2330443X.2017.1408439

Does an increase in powerplant sites in each county lead to an increase in cancer rate?

Loading available data in:

Cleaning the datasets:

{###: Subsection, # working within subsection}

FIP Codes:

superfund sites Data:

# remove US territories: 

superfund <- superfund[!(superfund$state == "AS" | superfund$state == "MP" | superfund$state=="GU" | superfund$state == "PR" | superfund$state == "VI"),]

superfund_inter <- merge(x = superfund, y = fipcodes, by = c("state","county"), all.x = TRUE)

superfund <- merge(x = superfund_inter, y = fipcodesmanual, by = c("state","county"), all.x = TRUE)

table(is.na(superfund$FIPS))

## < table of extent 0 >

superfund$FIPS <- superfund$FIPS.x
superfund$FIPS[!is.na(superfund$FIPS.y)] =   superfund$FIPS.y[!is.na(superfund$FIPS.y)]

superfund$FIPS[which(superfund$state == "DC")]= 11001

superfund <- superfund[!is.na(superfund$FIPS),]

Cancer Rate Data:

Power Plant Data

Final Removal of US’s territories. Keep US states and DC ONLY:

Further cleaning:

The THREE Final Datasets:

## 
## FALSE 
##  2088

## 
## FALSE 
##  2918

## 
## FALSE 
##  8573

Superfund sites aggregate tables:

# average site score for each county:
avg_sitescore <- data.frame(aggregate(Site.Score ~ FIPS, data = superfund, FUN = mean))

# number of superfund sites: 
sites_count <- superfund %>% group_by(FIPS) %>% tally()
names(sites_count)[2] <- "superfundsites_count"

# number of federal facilitator indicator: 
ffi_yes <- superfund %>% 
  group_by(FIPS) %>% summarise(count = sum(Federal.Facility.Indicator =="Yes"))

names(ffi_yes)[2] <- "fed_yes"

ffi_no <- superfund %>% 
  group_by(FIPS) %>% summarise(count = sum(Federal.Facility.Indicator =="No"))

names(ffi_no)[2] <- "fed_no"

agg_sites <- data.frame(Reduce(function(x, y) merge(x, y, all=TRUE), list(avg_sitescore,sites_count,ffi_yes,ffi_no)))

write.csv(agg_sites,"agg_sites.csv")

Further Cleaning of powerplants_final before merging with cancer data:

Time to merge the two set before do a final cleaning:

Exploratory Data Analysis:

Cancer Rate Map

Location of powerplants by types:

Recent Trend of Cancer Rate

County map of recent trend.

West coast and areas around New York city show a recent falling trend. Many counties that show rising trend are mostly in the south and middle of the country.

map of superfund sites by county:

List of Top 50 and Bottom 50 Counties based on Cancer Rate:

Powerplant:

MERGE ALL THREE:

Correlation between cancer rate and number of powerplants:

##    Primary Fuel Type Correlation
## 1            Biomass     0.01046
## 2               Coal     0.03817
## 3       Cogeneration     -0.0246
## 4                Gas    -0.01955
## 5         Geothermal    -0.09358
## 6              Hydro    -0.02512
## 7            Nuclear     0.06352
## 8                Oil     0.04254
## 9              Other    -0.00416
## 10           Petcoke     0.01331
## 11             Solar    -0.03424
## 12           Storage    -0.01469
## 13             Waste     0.07453
## 14              Wind     -0.0676

Correlation:Investigate recent trend with the type of fuel used.

##  [1]  0.0548556496  0.0574804728 -0.0034205405  0.0570932218 -0.0936945869
##  [6] -0.0009187172  0.0601585969  0.0268915227 -0.0064996373  0.0231038227
## [11] -0.0053740829  0.0205004066  0.1029651196 -0.0610151015

##    Primary.Fuel.Type Correlation Cor_rising Cor_falling Cor_stable
## 1            Biomass     0.01046    0.08552    -0.12134    0.05486
## 2               Coal     0.03817    -0.1883    -0.10739    0.05748
## 3       Cogeneration     -0.0246   -0.05466    -0.06597   -0.00342
## 4                Gas    -0.01955   -0.02781    -0.09717    0.05709
## 5         Geothermal    -0.09358       <NA>     -0.0941   -0.09369
## 6              Hydro    -0.02512   -0.01999     -0.0596   -0.00092
## 7            Nuclear     0.06352   -0.02855     0.10381    0.06016
## 8                Oil     0.04254    0.21631     0.12448    0.02689
## 9              Other    -0.00416       <NA>     0.01741    -0.0065
## 10           Petcoke     0.01331   -0.02855    -0.04907     0.0231
## 11             Solar    -0.03424   -0.23797    -0.06011   -0.00537
## 12           Storage    -0.01469       <NA>    -0.06577     0.0205
## 13             Waste     0.07453    0.06892     0.06773    0.10297
## 14              Wind     -0.0676    -0.3668    -0.09169   -0.06102

Correlation of all possible variables for modeling:

library(corrplot)
library(PerformanceAnalytics)

all_cor <- subset(all_three, select = c("FIPS",                                      "Age.AdjustedCANCERcases_per100000",
                                        "Site.Score",
"fed_yes","fed_no","Biomass","Coal","Cogeneration","Gas","Geothermal","Hydro","Nuclear","Oil","Other","Petcoke","Solar","Storage","Waste","Wind","plant_count","avg_cap"))

names(all_cor)[2] <- "AdjustedCancerRCases"

all_complete <- na.omit(all_cor)

res <- cor(all_complete[,2:ncol(all_complete)])

corrplot(res, type = "upper", order = "hclust",
         tl.col = "black", tl.srt = 45)

# Average capacity per county # Something # Something # Something # Transform data further for the drop down menus:

cool plotly

# Something:

Modeling:

Split the dataset for test and train:

Linear Regression: Primay fuel and cancer rate

## 
## Call:
## lm(formula = Age.AdjustedCANCERcases_per100000 ~ Biomass + Coal + 
##     Cogeneration + Gas + Geothermal + Hydro + Nuclear + Oil + 
##     Other + Petcoke + Solar + Storage + Waste + Wind, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -209.363  -29.750    5.748   34.677  230.658 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  444.8385     1.8538 239.958  < 2e-16 ***
## Biomass        3.1170     4.7828   0.652 0.514699    
## Coal           4.8621     2.7845   1.746 0.081023 .  
## Cogeneration -10.7086     9.1120  -1.175 0.240121    
## Gas           -0.6216     0.9347  -0.665 0.506175    
## Geothermal    -7.9753     2.4432  -3.264 0.001125 ** 
## Hydro         -0.6393     0.7107  -0.899 0.368584    
## Nuclear       13.9560     7.7711   1.796 0.072739 .  
## Oil            1.7037     1.3758   1.238 0.215801    
## Other         -0.3822    14.1878  -0.027 0.978512    
## Petcoke       11.7479    14.9129   0.788 0.430974    
## Solar         -0.1693     0.3875  -0.437 0.662154    
## Storage       -4.1388     7.0243  -0.589 0.555821    
## Waste          8.0923     2.3638   3.423 0.000637 ***
## Wind          -2.8749     1.0163  -2.829 0.004743 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52.56 on 1336 degrees of freedom
## Multiple R-squared:  0.03188,    Adjusted R-squared:  0.02174 
## F-statistic: 3.143 on 14 and 1336 DF,  p-value: 7.163e-05

## [1] 49.10689

Decision Tree:

library(rpart)
library(rattle)
library(tree)
set.seed(123)

# index_final <- sample(1:nrow(all_complete),nrow(all_complete)*0.75)
# train_final = all_complete[index_final,]
# test_final = all_complete[-index_final,]

# Decisoin Tree
tree_simple <- rpart(Age.AdjustedCANCERcases_per100000 ~ Biomass+
              Coal+
              Cogeneration+
              Gas+
              Geothermal+
              Hydro+
              Nuclear+
              Oil+
              Other+
              Petcoke+
              Solar+
              Storage+
              Waste+
              Wind, data= train)
pred_test <- predict(tree_simple, newdata = test)

tree_rmse <- sqrt(sum((pred_test - test$Age.AdjustedCANCERcases_per100000)^2/nrow(test),na.rm=TRUE))

print(round(tree_rmse,5))

## [1] 48.3234

fancyRpartPlot(tree_simple, caption = 'Full tree')

savePlotToFile('simple_tree.png')

## [1] TRUE

Decision Tree for superfund sites per county:

cancer_superfund <- subset(all_three, select = c("FIPS","Age.AdjustedCANCERcases_per100000","Site.Score","superfundsites_count","fed_yes","fed_no"))

cancer_superfund <- na.omit(cancer_superfund)

cor(cancer_superfund$Age.AdjustedCANCERcases_per100000, cancer_superfund$Site.Score)

## [1] 0.0154947

index_2 <- sample(1:nrow(cancer_superfund),nrow(cancer_superfund)*0.75)
train_2 = cancer_superfund[index_2,]
test_2 = cancer_superfund[-index_2,]

tree_simple2 <- rpart(Age.AdjustedCANCERcases_per100000 ~ Site.Score+
                        superfundsites_count+fed_yes+fed_no, data=cancer_superfund)

pred_test2 <- predict(tree_simple2, newdata = test_2)

tree_rmse2 <- sqrt(mean((pred_test2-test_2$Age.AdjustedCANCERcases_per100000)^2))
print(tree_rmse2)

## [1] 43.30044

fancyRpartPlot(tree_simple2, caption = 'Full tree')

Possible avenue to look into:

Distance between superfundsites and distance to the hard-hitting county. (People could be living in different counties but have interactions with counties with high superfund sites.)

Future Work:

Type of cancer vs superfund sites Type of cancer vs powerplants

Vid Chan

5/5/2020