Exploratory Data Analysis
Libraries
print("Loading R Libraries...")
## [1] "Loading R Libraries..."
#############################################################################
knitr::opts_chunk$set(echo = TRUE)
library(Amelia)
library(car)
library(corrplot)
library(ggcorrplot)
library(ggplot2)
library(kableExtra)
library(knitr)
library(leaflet)
library(leaflet.extras)
## Warning: package 'leaflet.extras' was built under R version 4.0.3
library(maptools)
library(MASS)
library(psych)
library(QuantPsyc)
library(raster)
library(reticulate)
library(sf)
library(shiny)
library(sp)
library(spatialEco)
library(spatialreg)
library(spData)
library(spdep)
library(tmap)
library(tmaptools)
library(totalcensus)
library(rmapshaper)
#############################################################################
Load County Data
print("Loading data for county analysis...")
## [1] "Loading data for county analysis..."
#############################################################################
h=read.csv("d:/heart/updated thf4.csv")
counties0<- shapefile("d:/heart/cb_2018_us_county_500k.shp")
counties0=ms_simplify(counties0, method="vis", keep_shapes=TRUE )
## Warning in sp::proj4string(sp): CRS object has comment, which is lost in output
geodata=read.csv("d:/heart/geoinputnew.csv")
geodata$GEOID=as.character(geodata$GEOID)
for (i in 1:nrow(geodata)){
if (nchar(geodata$GEOID[i])<5){
geodata$GEOID[i]=paste0("0", geodata$GEOID[i])}
}
Missing
#Merge and delete counties without data
counties=merge(counties0, geodata, by="GEOID", type="left", all.x=TRUE)
counties=sp.na.omit((counties))
## Deleting rows: 13272829303436395661637576788081838687889092949596102103109110125127130147148153160166170172179189204206209210212214216231237243255267269270279280283284286291292297299300303304305306307308309321322323324325326327328331332334335337340343348360362370372383387393394396402404414416421427431435451464467468474482483484490494502503504505506507508509510511512516517562563566569570571574575578583584586592595596597602605606607608609610611621627629631634639640641643650657659666667675676679681683685689690692694695697700702704705723733740741744749750751752755761762765766772784786805808809811814816817818819821824827829830832844846851856867868874875877882883884899903904916924927928932935940941948951957961962966968969985990994998100210031011101410151016101910231027103410361039104010451046106210651068106910701073107610811087109310961097111111161118111911231129113511381145114911501151115511561161117411751193119611991200120112021203120612091210121812201223123112351238124212431248125012531255125612571265127712781283128512871288129112931294130013021303130413051317133913521354135513561363137213781384139413951396139714021405141214151430143114321439144114421443144414481452145814591460146114631465146714681470147114751476147714801482148514871488149114961497150115061507151015141515152015221526152715281530153415371538154115421544155015521553155415621569157215741580158415871589159215951601160316041605161316141615161616171618161916251628164416481649165416641665166816741678168116831686168916911697170017151720172617321733173517411749175017511763176417651766177217771783178418051808181318181826182918321841184318441846184718511857186018641867187018711872187318741880188118821885188618911896189919091910191319161918191919231924192519271932193319511954195619581959196419771980198219861989199720082020202220242038204120502052205420582061206220632064206820812082209821122117212121222123213921452150215621582164216921752181218221852186218721932195219622032204220622172218222222282232224122432247224922502256225722582259226122642267227522792290229222952296230223122313231923232328233323352343234523572361236223702371237423752379238423912400241724192425242624272429243024532457246024622466247024722477247824792485248925042523252525292532253325352536253925412542254625622563256825732574257625772578257925882589259225982605260626072609261626172623262526302632263526422649265326542656265926602666266726692676267826792680268326842689270127092711271627182719272327252737274027412743274827622763276627672780278627872788279228022803280828132816281828232824283028312837283828442849285128522860286428682869287128762883288528902891289228932895289728992901290629072911291529182920292129252931293229342943294429482952295329572964296629722980298229852986298729902998300330063009301330163020302130413053305430563061306730693071307530783079308130883091310131023103311331253138315031513153316131653167316931713181318431923193319531983205320732113212321332223229
counties@data$SumAdmits=NULL
#############################################################################
missmap(counties@data)

Environment Set Up
print("Loading Python Environment...")
## [1] "Loading Python Environment..."
#############################################################################
Sys.setenv('RETICULATE_PYTHON'='C://ProgramData//Anaconda3//')
use_python('C://ProgramData//Anaconda3//python.exe')
use_condaenv("base")
py$RANDOM_SEED = 1234
matplotlib=import('matplotlib.pyplot')
options(scipen=9); options(digits=3)
#############################################################################
Prepare Functions
print("Load Functions...")
## [1] "Load Functions..."
################################PRINT########################################
myprint=function(x,nm) {
x%>%kbl(col.names = nm)%>%kable_classic(full_width = F, html_font = "Cambria")}
#############################################################################
##############################COREELATION####################################
corfunction=function(d){
mycorr=cor(d[, 1:ncol(d)]); p.mat=cor_pmat(d[,1:ncol(d)])
myplot=ggcorrplot(mycorr, hc.order=TRUE,type="lower",colors=c("red", "white","green"),tl.cex = 8, tl.col = "black", lab=TRUE, lab_size=2, p.mat=p.mat, insig="pch", pch=4)
print(myplot)}
#############################################################################
#################################SCALE#######################################
myscale=function(x) {(x-min(x))/(max(x)-min(x))}
#############################################################################
Correlations
corfunction(counties@data[, 12:29])

Principal Components
#############################################################################
a=prcomp(counties@data[, 18:24], center=TRUE, scale=TRUE, retx=TRUE)
checkvar=a$sdev^2/sum(a$sdev^2)
names(checkvar)=c("PC1","PC2", "PC3", "PC4", "PC5", "PC6", "PC7")
counties@data$PC1=a$x[,1]
counties@data[, 18:24]=NULL
#############################################################################
checkvar%>%kbl(col.names="% Variance")%>%kable_classic(full_width = F, html_font = "Cambria")
|
|
% Variance
|
|
PC1
|
0.974
|
|
PC2
|
0.012
|
|
PC3
|
0.005
|
|
PC4
|
0.004
|
|
PC5
|
0.003
|
|
PC6
|
0.001
|
|
PC7
|
0.000
|
corfunction(counties@data[, 12:ncol(counties@data)])

Min Max Scaling
print("Min-Max Scaling...")
## [1] "Min-Max Scaling..."
#############################################################################
temp=counties@data$AdmitRate
for (i in 12:ncol(counties@data)){
counties@data[,i]=myscale(counties@data[,i])}
counties@data$AdmitRate2=temp
#############################################################################
Models
Linear
#############################################################################
fit1=lm(AdmitRate~., data=counties@data[13:23])
counties@data$m0=residuals(fit1)
#############################################################################
summary(fit1)
##
## Call:
## lm(formula = AdmitRate ~ ., data = counties@data[13:23])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.1523 -0.0281 -0.0102 0.0147 0.9092
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.11521 0.04364 -2.64 0.0083 **
## MeanIncome 0.13202 0.05053 2.61 0.0090 **
## MeanProfitMgn 0.12637 0.05549 2.28 0.0229 *
## MeanCash -0.09749 0.05063 -1.93 0.0543 .
## MeanEquity 0.14395 0.04555 3.16 0.0016 **
## MeanMedicare 0.04461 0.00824 5.41 6.8e-08 ***
## MeanMedicaid 0.00738 0.01232 0.60 0.5491
## MeanNonProfit 0.01173 0.00260 4.52 6.6e-06 ***
## MeanAffil 0.05355 0.00686 7.80 8.9e-15 ***
## MeanSTAC 0.04117 0.00300 13.72 < 2e-16 ***
## PC1 0.05083 0.02938 1.73 0.0837 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0575 on 2420 degrees of freedom
## Multiple R-squared: 0.15, Adjusted R-squared: 0.146
## F-statistic: 42.7 on 10 and 2420 DF, p-value: <2e-16
print("####################################################################")
## [1] "####################################################################"
anova(fit1)
## Analysis of Variance Table
##
## Response: AdmitRate
## Df Sum Sq Mean Sq F value Pr(>F)
## MeanIncome 1 0.14 0.143 43.17 0.000000000061 ***
## MeanProfitMgn 1 0.08 0.076 22.90 0.000001811236 ***
## MeanCash 1 0.01 0.009 2.86 0.0912 .
## MeanEquity 1 0.13 0.130 39.25 0.000000000439 ***
## MeanMedicare 1 0.03 0.034 10.28 0.0014 **
## MeanMedicaid 1 0.00 0.000 0.06 0.8136
## MeanNonProfit 1 0.07 0.073 21.98 0.000002909443 ***
## MeanAffil 1 0.31 0.305 92.23 < 2e-16 ***
## MeanSTAC 1 0.63 0.633 191.30 < 2e-16 ***
## PC1 1 0.01 0.010 2.99 0.0837 .
## Residuals 2420 8.00 0.003
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print("####################################################################")
## [1] "####################################################################"
vif(fit1)
## MeanIncome MeanProfitMgn MeanCash MeanEquity MeanMedicare
## 1.10 1.03 1.20 1.26 1.58
## MeanMedicaid MeanNonProfit MeanAffil MeanSTAC PC1
## 1.17 1.04 1.17 1.40 1.23
Spatial Models
#############################################################################
#We coerce the sf object into a new sp object
ncovr_sp <- as(counties, "Spatial")
list.queen<-poly2nb(counties, queen=TRUE)
list.rook<-poly2nb(counties, queen=FALSE)
W=rwm=nb2listw(list.queen, style="W", zero.policy=TRUE)
W2=rwm2=nb2listw(list.rook, style="W", zero.policy=TRUE)
m1=lm.morantest(fit1, rwm, alternative="two.sided", zero.policy=TRUE)
m2=lm.LMtests(fit1, rwm, test = c("LMerr","LMlag","RLMerr","RLMlag","SARMA"), zero.policy=TRUE)
counties@data$STATEFP=paste0("S",counties@data$STATEFP)
write.csv(counties@data[, c(13:23, 2)], "d:/heart/countiespython.csv", row.names = FALSE)
#############################################################################
m1
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = AdmitRate ~ ., data = counties@data[13:23])
## weights: rwm
##
## Moran I statistic standard deviate = 2, p-value = 0.1
## alternative hypothesis: two.sided
## sample estimates:
## Observed Moran I Expectation Variance
## 0.021527 -0.001022 0.000187
print("####################################################################")
## [1] "####################################################################"
m2
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = AdmitRate ~ ., data = counties@data[13:23])
## weights: rwm
##
## LMerr = 3, df = 1, p-value = 0.1
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = AdmitRate ~ ., data = counties@data[13:23])
## weights: rwm
##
## LMlag = 2, df = 1, p-value = 0.2
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = AdmitRate ~ ., data = counties@data[13:23])
## weights: rwm
##
## RLMerr = 61, df = 1, p-value = 6e-15
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = AdmitRate ~ ., data = counties@data[13:23])
## weights: rwm
##
## RLMlag = 60, df = 1, p-value = 1e-14
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = AdmitRate ~ ., data = counties@data[13:23])
## weights: rwm
##
## SARMA = 62, df = 2, p-value = 3e-14
print("####################################################################")
## [1] "####################################################################"
Queen FOC
#############################################################################
#Queen
mylag1=stsls(AdmitRate ~ ., zero.policy=TRUE, data = counties@data[13:23], W)
## Warning: Function stsls moved to the spatialreg package
counties@data$queen=mylag1$res
moran1=moran.mc(counties@data$queen, W, 50, zero.policy=TRUE) #LISA
#mylisa1=lisa(counties, d1=0, d2=2000, statistic="I", zcol="AdmitRate")
part1=as.data.frame(summary(mylag1)$Coef)
myr1=1-sum(mylag1$residuals^2 / (var(counties@data$AdmitRate)*(nrow(counties@data)-1)))
#############################################################################
summary(mylag1)
##
## Call:spatialreg::stsls(formula = formula, data = data, listw = listw,
## zero.policy = zero.policy, na.action = na.action, robust = robust,
## HC = HC, legacy = legacy, W2X = W2X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.1439575 -0.0304633 -0.0095213 0.0166718 0.8816833
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Rho -0.4968078 0.0790875 -6.2817 3.348e-10
## (Intercept) -0.1005310 0.0445953 -2.2543 0.024177
## MeanIncome 0.1356061 0.0515700 2.6296 0.008550
## MeanProfitMgn 0.1400301 0.0566759 2.4707 0.013484
## MeanCash -0.0948724 0.0516759 -1.8359 0.066371
## MeanEquity 0.1426501 0.0464818 3.0689 0.002148
## MeanMedicare 0.0465374 0.0084175 5.5287 3.227e-08
## MeanMedicaid -0.0045307 0.0127110 -0.3564 0.721510
## MeanNonProfit 0.0133506 0.0026635 5.0124 5.375e-07
## MeanAffil 0.0540647 0.0070041 7.7190 1.177e-14
## MeanSTAC 0.0437893 0.0030917 14.1635 < 2.2e-16
## PC1 0.0410687 0.0300187 1.3681 0.171279
##
## Residual variance (sigma squared): 0.00344, (sigma: 0.0587)
moran1
##
## Monte-Carlo simulation of Moran I
##
## data: counties@data$queen
## weights: W
## number of simulations + 1: 51
##
## statistic = 0.2, observed rank = 51, p-value = 0.02
## alternative hypothesis: greater
Rook FOC
#############################################################################
mylag2=stsls(AdmitRate~.,zero.policy=TRUE, data=counties@data[13:23], W2)
## Warning: Function stsls moved to the spatialreg package
counties@data$rook=mylag2$res
moran2=moran.mc(counties@data$rook, W2, 50, zero.policy=TRUE) #LISA
#mylisa2=lisa(counties, d1=0, d2=2000, statistic="I", zcol="AdmitRate")
part2=as.data.frame(summary(mylag2)$Coef)
#############################################################################
summary(mylag2)
##
## Call:spatialreg::stsls(formula = formula, data = data, listw = listw,
## zero.policy = zero.policy, na.action = na.action, robust = robust,
## HC = HC, legacy = legacy, W2X = W2X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.1448378 -0.0306670 -0.0093664 0.0168486 0.8811411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Rho -0.5087675 0.0781682 -6.5086 7.584e-11
## (Intercept) -0.0992554 0.0446623 -2.2224 0.026259
## MeanIncome 0.1339721 0.0516383 2.5944 0.009475
## MeanProfitMgn 0.1400778 0.0567507 2.4683 0.013576
## MeanCash -0.0939002 0.0517478 -1.8146 0.069590
## MeanEquity 0.1434405 0.0465450 3.0818 0.002058
## MeanMedicare 0.0467089 0.0084296 5.5411 3.006e-08
## MeanMedicaid -0.0046416 0.0127208 -0.3649 0.715198
## MeanNonProfit 0.0132299 0.0026646 4.9651 6.868e-07
## MeanAffil 0.0539419 0.0070135 7.6912 1.465e-14
## MeanSTAC 0.0437442 0.0030930 14.1428 < 2.2e-16
## PC1 0.0413981 0.0300544 1.3774 0.168377
##
## Residual variance (sigma squared): 0.00345, (sigma: 0.0588)
moran2
##
## Monte-Carlo simulation of Moran I
##
## data: counties@data$rook
## weights: W2
## number of simulations + 1: 51
##
## statistic = 0.2, observed rank = 51, p-value = 0.02
## alternative hypothesis: greater
Interactive Map
#############################################################################
counties@data$ABB=convert_fips_to_names(counties@data$STATEFP)
qpal<-colorBin("Reds",c(-4, -3,-2,-1,0,1,2,3,4),bins=9,pretty=TRUE, alpha=.1)
qpal2<-colorBin("Blues",c(0,150, 300, 450,600,750,900,1050,1200),bins=9,pretty=TRUE, alpha=.1)
leaf=leaflet() %>%
#Overlays
addTiles(group = "OSM (default)") %>%
#Panes
addMapPane("borders", zIndex = 410) %>%
#Base Diagrams
addPolylines(data = counties,
color = "black",
opacity = 1,
weight = 1, group="Borders", options = pathOptions(pane = "borders"))%>%
fitBounds(-124.8, -66.9, 24.4,49.4) %>%
setView(-98.6, 39.83, zoom = 4)%>%
addPolygons(data=counties,stroke = FALSE,
fillOpacity = 1, smoothFactor = 0.2,
color=~qpal2(1000*counties@data$AdmitRate2/3),
popup = paste("County: ", counties@data$NAME, "<br>",
"Admit Rate/1K: ", 1000*counties@data$AdmitRate2/3),
group="Admission Rate") %>%
addPolygons(data=counties,stroke = FALSE,
fillOpacity = 1, smoothFactor = 0.2,
color=~qpal(m0),
popup = paste("County: ", counties@data$NAME, "<br>", "OLS Residuals: ", counties@data$m0), group="OLS Residuals") %>%
addPolygons(data=counties,stroke = FALSE,
fillOpacity = 1, smoothFactor = 0.2,
color=~qpal(queen),
popup = paste("County: ", counties@data$NAME, "<br>",
"Queen Residuals: ", counties@data$queen), group="Queen Residuals") %>%
addPolygons(data=counties,stroke = FALSE,
fillOpacity = 1, smoothFactor = 0.2,
color=~qpal(rook),
popup = paste("County: ", counties@data$NAME, "<br>",
"Rook Residuals: ", counties@data$rook), group="Rook Residuals") %>%
addPolygons(data=counties,stroke = FALSE,
fillOpacity = 1, smoothFactor = 0.2,
color=~qpal(m1),
popup = paste("County: ", counties@data$NAME, "<br>",
"Mean Income Res: ", counties@data$MeanIncome), group="Income") %>%
addPolygons(data=counties,stroke = FALSE,
fillOpacity = 1, smoothFactor = 0.2,
color=~qpal(m2),
popup = paste("County: ", counties@data$NAME, "<br>",
"Mean Profit Mgn Res: ", counties@data$MeanProfitMgn), group="Profit Margin") %>%
addPolygons(data=counties,stroke = FALSE,
fillOpacity = 1, smoothFactor = 0.2,
color=~qpal(m3),
popup = paste("County: ", counties@data$NAME, "<br>",
"Mean Cash on Hand Res: ", counties@data$MeanCash), group="Cash on Hand") %>%
addPolygons(data=counties,stroke = FALSE,
fillOpacity = 1, smoothFactor = 0.2,
color=~qpal(m4),
popup = paste("County: ", counties@data$NAME, "<br>",
"Mean Equity Res: ", counties@data$MeanEquity), group="Equity") %>%
addPolygons(data=counties,stroke = FALSE,
fillOpacity = 1, smoothFactor = 0.2,
color=~qpal(m5),popup = paste("County: ", counties@data$NAME, "<br>",
"% Medicare Res: ", round(counties@data$MeanMedicare),3), group="% Medicare") %>%
addPolygons(data=counties,stroke = FALSE,
fillOpacity = 1, smoothFactor = 0.2,
color=~qpal(m6),
popup = paste("County: ", counties@data$NAME, "<br>",
"% Medicare Res: ", counties@data$MeanMedicaid), group="% Medicaid") %>%
addPolygons(data=counties,stroke = FALSE,
fillOpacity = 1, smoothFactor = 0.2,
color=~qpal(m7),
popup = paste("County: ", counties@data$NAME, "<br>",
"% Non-profit: ", counties@data$MeanProfitMgn), group="% Non-Profit") %>%
addPolygons(data=counties,stroke = FALSE,
fillOpacity = 1, smoothFactor = 0.2,
color=~qpal(m8),
popup = paste("County: ", counties@data$NAME, "<br>",
"% Major Med Sch Res: ", counties@data$MeanAffil), group="Med School Aff.") %>%
addPolygons(data=counties,stroke = FALSE,
fillOpacity = 1, smoothFactor = 0.2,
color=~qpal(m9),
popup = paste("County: ", counties@data$NAME, "<br>",
"STAC: ", counties@data$MeanSTAC)
, group="% STAC") %>%
addPolygons(data=counties,stroke = FALSE,
fillOpacity = 1, smoothFactor = 0.2,
color=~qpal(m10),
popup = paste("County: ", counties@data$NAME, "<br>",
"Workload Res: ", counties@data$PC1), group="Workload") %>%
addLegend(data=counties,
"bottomleft", opacity=1, pal = qpal,
values = ~m10,
title = "Residuals")%>%
addLegend(data=counties, group="Admission Rate",
"topleft", opacity=1, pal = qpal2,
values = ~AdmitRate2*1000/3,
title = "Admit Rate")%>%
addLayersControl(
baseGroups = c("Admission Rate","OLS Residuals", "Queen Residuals", "Rook Residuals", "Income", "Profit Margin", "Cash on Hand","Equity", "% Medicare", "% Medicaid", "%STAC", "Workload"),
overlayGroups = c("Borders"),
options = layersControlOptions(collapsed = TRUE)
)
leaf