R Markdown

##Read in and join data from Arc We also dropped 8 Census Tracts that are not included in the CDC data and have very limited ACS data, to avoid skewing the results.

CDC_ACS_Dat <- read.csv("PhillyDataCombined.csv")

MEP_Dat <- read.csv("PhillyTractsMEP.csv")

#creating the link to join the two datasets together
MEP_Dat$Geo_FIPS <- MEP_Dat$GEOID10

#joining all data
dat <- full_join(CDC_ACS_Dat, MEP_Dat, by = "Geo_FIPS")

#this is the joined, base dataset without the empty rows
dat2 <- dat[complete.cases(dat[90]),]

Vars of Interest

Total Pop: “A00001” Pop by Gender: “A02001_001” Pop by Age: A01001 Pop by Race: A03001 Average household size: A10003 Median income: A14006 Nativity by Citizenship status: A06001 Children living with single parents: A10065

#Total Population (continuous)
dat2$TotalPop <- dat2$SE_A00001_001
dat2$Log_TotalPop <- log(dat2$TotalPop)

#Percent female (maybe binary)
dat2$PctFem <- (dat2$SE_A02001_003/dat2$SE_A00002_001)*100
dat2$Log_PctFem <- log(dat2$PctFem)

#Percent male
dat2$PctMale <- (dat2$SE_A02001_002/dat2$SE_A00002_001)*100
dat2$Log_PctMale <- log(dat2$PctMale)

#Pct Pop by Age - up to 17
dat2$PopUnd17 <- ((dat2$SE_A01001_002+dat2$SE_A01001_003+dat2$SE_A01001_004+dat2$SE_A01001_005)/dat2$SE_A01001_001)*100
dat2$Log_PopUnd17 <- log(dat2$PopUnd17)

#Pct Pop Between 18-54
dat2$Pop18to54 <- ((dat2$SE_A01001_006+dat2$SE_A01001_007+dat2$SE_A01001_008+dat2$SE_A01001_009)/dat2$SE_A01001_001)*100
dat2$Log_Pop18to54 <- log(dat2$Pop18to54)

#Pct Pop 55+
dat2$PopOver55 <- ((dat2$SE_A01001_010+dat2$SE_A01001_011+dat2$SE_A01001_012+dat2$SE_A01001_013)/dat2$SE_A01001_001)*100
dat2$Log_PopOver55 <- log(dat2$PopOver55)

#Pct pop that is white
dat2$PctWhite <- (dat2$SE_A03001_002/dat2$SE_A03001_001)*100
dat2$Log_PctWhite <- log(dat2$PctWhite)

#Pct pop that is black
dat2$PctBlack <- (dat2$SE_A03001_003/dat2$SE_A03001_001)*100
dat2$Log_PctBlack <- log(dat2$PctBlack)

#Pct pop American Indian, Asian, Hawaiian or PI, other race
dat2$PctAAPI <- ((dat2$SE_A03001_004+dat2$SE_A03001_005+dat2$SE_A03001_006+dat2$SE_A03001_007+dat2$SE_A03001_008)/dat2$SE_A03001_001)*100
dat2$Log_PctAAPI <- log(dat2$PctAAPI)

#Majority white (binary)
dat2$MajorityWhite[dat2$PctWhite > "0.50"] <- 1
dat2$MajorityWhite[dat2$PctWhite <= "0.50"] <- 0

#Avg Household size
dat2$AvgHHSize <- dat2$SE_A10003_001
dat2$Log_AvgHHSize <- log(dat2$AvgHHSize)

#Unemployment rate
dat2$UnemploymentRate <- (dat2$SE_A17002_006/dat2$SE_A17002_002)*100
dat2$Log_UnemploymentRate <- log(dat2$UnemploymentRate)

#Median income
dat2$MedHHInc <- dat2$SE_A14006_001
dat2$Log_MedHHInc <- log(dat2$MedHHInc)

#Pct non-citizens
dat2$PctNonCitizen <- (dat2$SE_A06001_005/dat2$SE_A06001_001)*100
dat2$Log_PctNonCitizen <- log(dat2$PctNonCitizen)

#Pct children of single parents of total children
dat2$PctSingleParent <- (dat2$SE_A10065_001/dat2$SE_A10065_002)*100
dat2$Log_PctSingleParent <- log(dat2$PctSingleParent)

#Pct Unwell for 14 Days
dat2$PctUnwellAdults <- dat2$Physical_health_not_good_for___14_days_among_adults_aged___18_ye
dat2$Log_PctUnwellAdults <- log(dat2$PctUnwellAdults)

#Adult asthma rate
dat2$AsthmaRate <- dat2$Current_asthma_among_adults_aged___18_years
dat2$Log_AsthmaRate <- log(dat2$AsthmaRate)

#Routine doctor visit rate
dat2$PctRoutineDrVisit <- dat2$Visits_to_doctor_for_routine_checkup_within_the_past_year_among
dat2$Log_PctRoutineDrVisit <- log(dat2$PctRoutineDrVisit)

#Cancer rate
dat2$CancerRate <- dat2$Cancer__excluding_skin_cancer__among_adults_aged___18_years
dat2$Log_CancerRate <- log(dat2$CancerRate)

#Uninsured Rate
dat2$PctUninsured <- dat2$Current_lack_of_health_insurance_among_adults_aged_18_64_years
dat2$Log_PctUninsured <- log(dat2$PctUninsured)

#Density visit rate
dat2$PctDentistVisit <- dat2$Visits_to_dentist_or_dental_clinic_among_adults_aged___18_years
dat2$Log_PctDentistVisit <- log(dat2$PctDentistVisit)

#Mental health not good for 14 days rate
dat2$PctMentallyUnwell <- dat2$Mental_health_not_good_for___14_days_among_adults_aged___18_year
dat2$Log_PctMentallyUnwell <- log(dat2$PctMentallyUnwell)

#MEP
dat2$AvgMEP <- dat2$MEAN_mep
dat2$Log_AvgMEP <- log(dat2$AvgMEP)

is.na(dat2) <- sapply(dat2, is.infinite)

#Categorized variables for t-tests and chi-squared

#Majority female
dat2$MajFemale <- 0
dat2$MajFemale[dat2$PctFem >= "50"] <- 1

#Avg Household size
summary(dat2$AvgHHSize)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.350   2.225   2.540   2.539   2.875   3.950       1
#categorized by above the median =1, others =0
dat2$BigHH <- 0
dat2$BigHH[dat2$AvgHHSize > "2.540"] <- 1


#Median income
summary(dat2$MedHHInc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   10793   29766   44184   49394   62033  143646       2
#categorized by above the median=1, others=0
dat2$HighIncome <- 0
dat2$HighIncome[dat2$MedHHInc > "44184"] <- 1

##histograms of key variables

hist(dat2$AvgHHSize,
     main="Histogram of Average Household Size", 
     xlab="Avg. Household Size")

hist(dat2$PctBlack,
     main="Histogram of Black Percentage of Population", 
     xlab="% of Census Population that is Black")

hist(dat2$Pop18to54,
     main="Histogram of Population Percentage Between 18 and 54", 
     xlab="% of Census Population that is 18-54 Years Old")

##table showing data stats for variables in the final model

varsInterest <- c("AvgMEP", "AvgHHSize", "Pop18to54", "CancerRate", "PctUnwellAdults", "PctBlack")
datviz<-dat2[ , which(names(dat2) %in% varsInterest)]
head(datviz)
##   Pop18to54  PctBlack AvgHHSize PctUnwellAdults CancerRate   AvgMEP
## 1  76.05634  6.653715      1.64             5.9        3.9 470.3547
## 2  70.34965  4.300699      2.46            11.6        3.6 493.3073
## 3  60.88626  7.090103      1.70             7.3        5.6 448.2941
## 4  67.17011 11.367250      1.52             8.4        5.7 439.0412
## 5  51.98926  5.223334      1.40             8.7       10.6 478.3530
## 6  88.51750 32.913881      1.47            10.5        2.7 498.1745
stargazer(datviz, type = "text")
## 
## ======================================================================
## Statistic        N   Mean   St. Dev.   Min   Pctl(25) Pctl(75)   Max  
## ----------------------------------------------------------------------
## Pop18to54       376 53.589   11.150  18.231   46.512   58.552  98.078 
## PctBlack        376 42.995   34.350   0.000   10.394   80.430  98.516 
## AvgHHSize       375  2.539   0.470    1.350   2.225    2.875    3.950 
## PctUnwellAdults 376 14.815   4.708    4.500   11.575   18.025  27.200 
## CancerRate      376  5.902   1.810    1.000   4.700    6.700   18.300 
## AvgMEP          376 297.769  84.261  145.438 232.310  357.976  499.603
## ----------------------------------------------------------------------

T-Tests

t.test (AvgMEP ~ MajFemale, data = dat2, paired = FALSE)#AvgMEP & Majority Female
## 
##  Welch Two Sample t-test
## 
## data:  AvgMEP by MajFemale
## t = 3.0567, df = 164.41, p-value = 0.002612
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  10.79203 50.17157
## sample estimates:
## mean in group 0 mean in group 1 
##        320.3062        289.8244
t.test (AvgMEP ~ BigHH, data = dat2, paired = FALSE)#AvgMEP & HH Size
## 
##  Welch Two Sample t-test
## 
## data:  AvgMEP by BigHH
## t = 4.9632, df = 330.97, p-value = 0.000001112
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  25.15968 58.19892
## sample estimates:
## mean in group 0 mean in group 1 
##        318.3871        276.7078
t.test (AvgMEP ~ HighIncome, data = dat2, paired = FALSE)#AvgMEP & HHI
## 
##  Welch Two Sample t-test
## 
## data:  AvgMEP by HighIncome
## t = 1.6802, df = 275.27, p-value = 0.09405
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.632101 33.300514
## sample estimates:
## mean in group 0 mean in group 1 
##        304.5390        289.2048

Cor Tests

cor.test(dat2$AvgMEP, dat2$MajFemale)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$MajFemale
## t = -3.1149, df = 374, p-value = 0.001982
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2560348 -0.0588262
## sample estimates:
##        cor 
## -0.1590163
cor.test(dat2$AvgMEP, dat2$BigHH)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$BigHH
## t = -4.9431, df = 374, p-value = 0.000001163
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3402535 -0.1502664
## sample estimates:
##        cor 
## -0.2476391
cor.test(dat2$AvgMEP, dat2$HighIncome)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$HighIncome
## t = -1.7572, df = 374, p-value = 0.07971
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.18988627  0.01074658
## sample estimates:
##        cor 
## -0.0904879
CrossTable(dat2$AvgMEP, dat2$MajFemale, fisher = FALSE, chisq = TRUE, 
            expected = TRUE, sresid = FALSE, format="SPSS")#needs to be separated by quartiles. Do we split AvgMEP or other variable?

Correlation tests

Total Pop: small p-value, negatively correlated Pct Fem: 0.06 p-value, negatively correlated Pct Male: 0.062 p-value, positively correlated Pop Under 17: small p-value, negatively correlated Pop 18-54: small p-value, positively correlated Pop 55+: small p-value, negatively correlated Pct White: 0.049 p-value, positively correlated Pct Black: 0.006 p-value, negatively correlated Pct AAPI: 0.012 p-value, positively correlated Majority WHite: 0.5821, minor correlation - ACCEPT NULL Avg HH Size: small p-value, negatively correlated Unemployment Rate: 0.045 p-value, negatively correlated Median HH Income: 0.084 p-value, positively correlated Pct Non Citizen: 0.79 p-value, minor correlation - ACCEPT NULL Pct Single Parent: data issue Pct Unwell Adults: small p-value, negatively correlated Asthma Rate: 0.044 p-value, negatively correlated Pct Routine Dr Visit: small p-value, negatively correlated Cancer Rate: small p-value, negatively correlated Pct Uninsured: 0.88 p-value, minor correlation - ACCEPT NULL Pct Dentist Visits: 0.42 p-value, minor correlation - ACCEPT NULL Pct Mentally Unwell: 0.88 p-value, minor correlation - ACCEPT NULL

cor.test(dat2$AvgMEP, dat2$TotalPop)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$TotalPop
## t = -2.9414, df = 374, p-value = 0.003471
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.24773626 -0.04999172
## sample estimates:
##        cor 
## -0.1503676
cor.test(dat2$AvgMEP, dat2$PctFem)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctFem
## t = -1.8703, df = 374, p-value = 0.06223
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.195493539  0.004923445
## sample estimates:
##         cor 
## -0.09626062
cor.test(dat2$AvgMEP, dat2$PctMale)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctMale
## t = 1.8703, df = 374, p-value = 0.06223
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.004923445  0.195493539
## sample estimates:
##        cor 
## 0.09626062
cor.test(dat2$AvgMEP, dat2$PopUnd17)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PopUnd17
## t = -6.1361, df = 374, p-value = 0.000000002156
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3915923 -0.2076490
## sample estimates:
##        cor 
## -0.3024338
cor.test(dat2$AvgMEP, dat2$Pop18to54)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$Pop18to54
## t = 11.519, df = 374, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4330079 0.5827118
## sample estimates:
##       cor 
## 0.5117338
cor.test(dat2$AvgMEP, dat2$PopOver55)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PopOver55
## t = -6.8634, df = 374, p-value = 0.00000000002805
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4213428 -0.2414915
## sample estimates:
##       cor 
## -0.334459
cor.test(dat2$AvgMEP, dat2$PctWhite)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctWhite
## t = 1.9785, df = 374, p-value = 0.0486
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.000646634 0.200844897
## sample estimates:
##       cor 
## 0.1017761
cor.test(dat2$AvgMEP, dat2$PctBlack)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctBlack
## t = -2.7435, df = 374, p-value = 0.006372
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.23820873 -0.03988687
## sample estimates:
##        cor 
## -0.1404564
cor.test(dat2$AvgMEP, dat2$PctAAPI)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctAAPI
## t = 2.5167, df = 374, p-value = 0.01226
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02828103 0.22721866
## sample estimates:
##       cor 
## 0.1290481
cor.test(dat2$AvgMEP, dat2$MajorityWhite)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$MajorityWhite
## t = 0.55078, df = 374, p-value = 0.5821
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07287755  0.12923246
## sample estimates:
##        cor 
## 0.02846841
cor.test(dat2$AvgMEP, dat2$AvgHHSize)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$AvgHHSize
## t = -7.2594, df = 373, p-value = 0.000000000002278
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4375267 -0.2598327
## sample estimates:
##        cor 
## -0.3518455
cor.test(dat2$AvgMEP, dat2$UnemploymentRate)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$UnemploymentRate
## t = -2.0137, df = 374, p-value = 0.04476
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.202578925 -0.002454204
## sample estimates:
##        cor 
## -0.1035646
cor.test(dat2$AvgMEP, dat2$MedHHInc)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$MedHHInc
## t = 1.7307, df = 372, p-value = 0.08433
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01214194  0.18906755
## sample estimates:
##       cor 
## 0.0893746
cor.test(dat2$AvgMEP, dat2$PctNonCitizen)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctNonCitizen
## t = 0.26569, df = 374, p-value = 0.7906
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08752041  0.11471407
## sample estimates:
##        cor 
## 0.01373732
cor.test(dat2$AvgMEP, dat2$PctSingleParent)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctSingleParent
## t = -0.31531, df = 364, p-value = 0.7527
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.11883329  0.08613148
## sample estimates:
##        cor 
## -0.0165245
cor.test(dat2$AvgMEP, dat2$PctUnwellAdults)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctUnwellAdults
## t = -3.6245, df = 374, p-value = 0.0003296
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.28012904 -0.08465283
## sample estimates:
##        cor 
## -0.1842118
cor.test(dat2$AvgMEP, dat2$AsthmaRate)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$AsthmaRate
## t = -2.0246, df = 374, p-value = 0.04362
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.203119198 -0.003017657
## sample estimates:
##       cor 
## -0.104122
cor.test(dat2$AvgMEP, dat2$PctRoutineDrVisit)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctRoutineDrVisit
## t = -6.3347, df = 374, p-value = 0.0000000006832
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3998347 -0.2169812
## sample estimates:
##        cor 
## -0.3112863
cor.test(dat2$AvgMEP, dat2$CancerRate)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$CancerRate
## t = -8.8248, df = 374, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4954731 -0.3277650
## sample estimates:
##        cor 
## -0.4151397
cor.test(dat2$AvgMEP, dat2$PctUninsured)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctUninsured
## t = -0.15328, df = 374, p-value = 0.8783
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.10897466  0.09328503
## sample estimates:
##          cor 
## -0.007925876
cor.test(dat2$AvgMEP, dat2$PctDentistVisit)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctDentistVisit
## t = 0.80416, df = 374, p-value = 0.4218
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.05984116  0.14208553
## sample estimates:
##        cor 
## 0.04154642
cor.test(dat2$AvgMEP, dat2$PctMentallyUnwell)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctMentallyUnwell
## t = -0.13853, df = 374, p-value = 0.8899
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.10822078  0.09404121
## sample estimates:
##         cor 
## -0.00716305

Plots!

boxplot((AvgMEP) ~ MajFemale, data= dat2)

boxplot((AvgMEP) ~ AvgHHSize, data= dat2)

boxplot((AvgMEP) ~ MedHHInc, data= dat2)

boxplot((Log_AvgMEP) ~ Log_MedHHInc, data= dat2)

More plots!!

Total Pop: small p-value, negatively correlated Pct Fem: 0.06 p-value, negatively correlated Pct Male: 0.062 p-value, positively correlated Pop Under 17: small p-value, negatively correlated Pop 18-54: small p-value, positively correlated Pop 55+: small p-value, negatively correlated Pct White: 0.049 p-value, positively correlated Pct Black: 0.006 p-value, negatively correlated Pct AAPI: 0.012 p-value, positively correlated Majority WHite: 0.5821, minor correlation - ACCEPT NULL Avg HH Size: small p-value, negatively correlated Unemployment Rate: 0.045 p-value, negatively correlated Median HH Income: 0.084 p-value, positively correlated Pct Non Citizen: 0.79 p-value, minor correlation - ACCEPT NULL Pct Single Parent: data issue Pct Unwell Adults: small p-value, negatively correlated Asthma Rate: 0.044 p-value, negatively correlated Pct Routine Dr Visit: small p-value, negatively correlated Cancer Rate: small p-value, negatively correlated Pct Uninsured: 0.88 p-value, minor correlation - ACCEPT NULL Pct Dentist Visits: 0.42 p-value, minor correlation - ACCEPT NULL Pct Mentally Unwell: 0.88 p-value, minor correlation - ACCEPT NULL

ggplot(data=dat2, aes(AvgMEP)) + 
  geom_histogram() +
  labs(title="Average MEP Distribution", x="Value of MEP", y="Number of Tracts")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=dat2, aes(Log_AvgMEP)) + 
  geom_histogram() +
  labs(title="Log Average MEP Distribution", x="Log Value of MEP", y="Number of Tracts")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=dat2, aes(Log_MedHHInc, Log_AvgMEP)) + geom_point()+ 
   ggtitle("Log Median Income Against Log Avg MEP") 
## Warning: Removed 2 rows containing missing values (geom_point).

ggplot(data=dat2, aes(MedHHInc, AvgMEP)) + geom_point()+ 
   ggtitle("Median Income Against Avg MEP")
## Warning: Removed 2 rows containing missing values (geom_point).

ggplot(data=dat2, aes(AvgHHSize, AvgMEP)) + geom_point()+ 
   ggtitle("Avg Household Size Against Avg MEP")
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(data=dat2, aes(Log_AvgHHSize, Log_AvgMEP)) + geom_point()+ 
   ggtitle("Log Avg Household Size Against Log Avg MEP")
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(data=dat2, aes(Log_TotalPop, Log_AvgMEP)) + geom_point()+ 
   ggtitle("Log Total Pop Against Log Avg MEP")

ggplot(data=dat2, aes(Log_Pop18to54, Log_AvgMEP)) + geom_point()+ 
   ggtitle("Log Population 18-54 Against Log Avg MEP")

ggplot(data=dat2, aes(Log_PctUnwellAdults, Log_AvgMEP)) + geom_point()+ 
   ggtitle("Log Unwell Adults Against Log Avg MEP")

ggplot(data=dat2, aes(Log_CancerRate, Log_AvgMEP)) + geom_point()+ 
   ggtitle("Log Cancer Rate Against Log Avg MEP")

ggplot(data=dat2, aes(PctBlack, AvgMEP)) + geom_point()+ 
   ggtitle("Log Cancer Rate Against Log Avg MEP")

ggplot(data=dat2, aes(Log_PctBlack, Log_AvgMEP)) + geom_point()+ 
   ggtitle("Log Cancer Rate Against Log Avg MEP")
## Warning: Removed 1 rows containing missing values (geom_point).

Total Pop: small p-value, negatively correlated Pop Under 17: small p-value, negatively correlated Pop 18-54: small p-value, positively correlated Pop 55+: small p-value, negatively correlated Pct White: 0.049 p-value, positively correlated Pct Black: 0.006 p-value, negatively correlated Pct AAPI: 0.012 p-value, positively correlated Avg HH Size: small p-value, negatively correlated Unemployment Rate: 0.045 p-value, negatively correlated Pct Unwell Adults: small p-value, negatively correlated Asthma Rate: 0.044 p-value, negatively correlated Pct Routine Dr Visit: small p-value, negatively correlated Cancer Rate: small p-value, negatively correlated

cor.test(dat2$AvgMEP, dat2$TotalPop)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$TotalPop
## t = -2.9414, df = 374, p-value = 0.003471
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.24773626 -0.04999172
## sample estimates:
##        cor 
## -0.1503676
cor.test(dat2$AvgMEP, dat2$PopUnd17)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PopUnd17
## t = -6.1361, df = 374, p-value = 0.000000002156
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3915923 -0.2076490
## sample estimates:
##        cor 
## -0.3024338
cor.test(dat2$AvgMEP, dat2$Pop18to54)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$Pop18to54
## t = 11.519, df = 374, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4330079 0.5827118
## sample estimates:
##       cor 
## 0.5117338
cor.test(dat2$AvgMEP, dat2$PopOver55)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PopOver55
## t = -6.8634, df = 374, p-value = 0.00000000002805
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4213428 -0.2414915
## sample estimates:
##       cor 
## -0.334459
cor.test(dat2$AvgMEP, dat2$PctWhite)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctWhite
## t = 1.9785, df = 374, p-value = 0.0486
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.000646634 0.200844897
## sample estimates:
##       cor 
## 0.1017761
cor.test(dat2$AvgMEP, dat2$PctBlack)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctBlack
## t = -2.7435, df = 374, p-value = 0.006372
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.23820873 -0.03988687
## sample estimates:
##        cor 
## -0.1404564
cor.test(dat2$AvgMEP, dat2$PctAAPI)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctAAPI
## t = 2.5167, df = 374, p-value = 0.01226
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02828103 0.22721866
## sample estimates:
##       cor 
## 0.1290481
cor.test(dat2$AvgMEP, dat2$AvgHHSize)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$AvgHHSize
## t = -7.2594, df = 373, p-value = 0.000000000002278
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4375267 -0.2598327
## sample estimates:
##        cor 
## -0.3518455
cor.test(dat2$AvgMEP, dat2$UnemploymentRate)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$UnemploymentRate
## t = -2.0137, df = 374, p-value = 0.04476
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.202578925 -0.002454204
## sample estimates:
##        cor 
## -0.1035646
cor.test(dat2$AvgMEP, dat2$PctUnwellAdults)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctUnwellAdults
## t = -3.6245, df = 374, p-value = 0.0003296
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.28012904 -0.08465283
## sample estimates:
##        cor 
## -0.1842118
cor.test(dat2$AvgMEP, dat2$AsthmaRate)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$AsthmaRate
## t = -2.0246, df = 374, p-value = 0.04362
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.203119198 -0.003017657
## sample estimates:
##       cor 
## -0.104122
cor.test(dat2$AvgMEP, dat2$PctRoutineDrVisit)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$PctRoutineDrVisit
## t = -6.3347, df = 374, p-value = 0.0000000006832
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3998347 -0.2169812
## sample estimates:
##        cor 
## -0.3112863
cor.test(dat2$AvgMEP, dat2$CancerRate)
## 
##  Pearson's product-moment correlation
## 
## data:  dat2$AvgMEP and dat2$CancerRate
## t = -8.8248, df = 374, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4954731 -0.3277650
## sample estimates:
##        cor 
## -0.4151397
mod <- lm(Log_AvgMEP ~ Log_AvgHHSize, data = dat2)
summary (mod)
## 
## Call:
## lm(formula = Log_AvgMEP ~ Log_AvgHHSize, data = dat2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65162 -0.18810  0.01651  0.22363  0.53666 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)    6.08632    0.06575  92.566 < 0.0000000000000002 ***
## Log_AvgHHSize -0.46863    0.07038  -6.659      0.0000000000991 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2651 on 373 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1062, Adjusted R-squared:  0.1038 
## F-statistic: 44.34 on 1 and 373 DF,  p-value: 0.00000000009909
par(mfrow=c(1,1))           
plot (dat2$Log_AvgHHSize, dat2$Log_AvgMEP )
abline (mod, col="red")

mod1 <- lm(Log_AvgMEP ~ Log_AvgHHSize + Log_PctBlack, data = dat2)
summary (mod1)
## 
## Call:
## lm(formula = Log_AvgMEP ~ Log_AvgHHSize + Log_PctBlack, data = dat2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65706 -0.18373  0.02349  0.22347  0.53368 
## 
## Coefficients:
##                Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)    6.085266   0.069546  87.500 < 0.0000000000000002 ***
## Log_AvgHHSize -0.461901   0.071563  -6.454        0.00000000034 ***
## Log_PctBlack  -0.001388   0.010093  -0.137                0.891    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2655 on 371 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1039, Adjusted R-squared:  0.09903 
## F-statistic:  21.5 on 2 and 371 DF,  p-value: 0.000000001464
mod2 <- lm(Log_AvgMEP ~ Log_UnemploymentRate, data = dat2)
summary (mod2)
## 
## Call:
## lm(formula = Log_AvgMEP ~ Log_UnemploymentRate, data = dat2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.68590 -0.21165 -0.01259  0.24020  0.55371 
## 
## Coefficients:
##                      Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)           5.76218    0.03999 144.102 < 0.0000000000000002 ***
## Log_UnemploymentRate -0.05167    0.01849  -2.794              0.00548 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2769 on 371 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.0206, Adjusted R-squared:  0.01796 
## F-statistic: 7.805 on 1 and 371 DF,  p-value: 0.005482
par(mfrow=c(1,1))           
plot (dat2$Log_UnemploymentRate, dat2$Log_AvgMEP )
abline (mod2, col="red")

mod3 <- lm(Log_AvgMEP ~ Log_AvgHHSize + Log_Pop18to54, data = dat2)
summary (mod3)
## 
## Call:
## lm(formula = Log_AvgMEP ~ Log_AvgHHSize + Log_Pop18to54, data = dat2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62106 -0.16890 -0.00087  0.18742  0.66977 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)    3.33009    0.27794  11.981 < 0.0000000000000002 ***
## Log_AvgHHSize -0.27654    0.06519  -4.242             0.000028 ***
## Log_Pop18to54  0.65174    0.06426  10.142 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.235 on 372 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2998, Adjusted R-squared:  0.2961 
## F-statistic: 79.65 on 2 and 372 DF,  p-value: < 0.00000000000000022
mod4 <- lm(Log_AvgMEP ~ Log_AvgHHSize + Log_Pop18to54 + Log_PctRoutineDrVisit, data = dat2)
summary (mod4)
## 
## Call:
## lm(formula = Log_AvgMEP ~ Log_AvgHHSize + Log_Pop18to54 + Log_PctRoutineDrVisit, 
##     data = dat2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6193 -0.1676 -0.0026  0.1879  0.6721 
## 
## Coefficients:
##                       Estimate Std. Error t value           Pr(>|t|)    
## (Intercept)            3.15748    1.61010   1.961             0.0506 .  
## Log_AvgHHSize         -0.27507    0.06666  -4.127 0.0000454684977027 ***
## Log_Pop18to54          0.65761    0.08397   7.831 0.0000000000000512 ***
## Log_PctRoutineDrVisit  0.03388    0.31128   0.109             0.9134    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2353 on 371 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2999, Adjusted R-squared:  0.2942 
## F-statistic: 52.97 on 3 and 371 DF,  p-value: < 0.00000000000000022
mod5 <- lm(Log_AvgMEP ~ Log_PctRoutineDrVisit, data = dat2)
summary (mod5)
## 
## Call:
## lm(formula = Log_AvgMEP ~ Log_PctRoutineDrVisit, data = dat2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.74360 -0.17974 -0.01809  0.21574  0.56659 
## 
## Coefficients:
##                       Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)            12.3896     1.1887  10.423 < 0.0000000000000002 ***
## Log_PctRoutineDrVisit  -1.5412     0.2721  -5.664         0.0000000295 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2694 on 374 degrees of freedom
## Multiple R-squared:  0.07901,    Adjusted R-squared:  0.07655 
## F-statistic: 32.08 on 1 and 374 DF,  p-value: 0.00000002948
mod6 <- lm(Log_AvgMEP ~ Log_AvgHHSize + Log_Pop18to54 + Log_CancerRate, data = dat2)
summary (mod6)
## 
## Call:
## lm(formula = Log_AvgMEP ~ Log_AvgHHSize + Log_Pop18to54 + Log_CancerRate, 
##     data = dat2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.60560 -0.16157 -0.02097  0.18568  0.60398 
## 
## Coefficients:
##                Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)     5.43758    0.50214  10.829 < 0.0000000000000002 ***
## Log_AvgHHSize  -0.41499    0.06906  -6.009        0.00000000446 ***
## Log_Pop18to54   0.27475    0.09811   2.800              0.00537 ** 
## Log_CancerRate -0.28242    0.05678  -4.974        0.00000100566 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2278 on 371 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3436, Adjusted R-squared:  0.3383 
## F-statistic: 64.74 on 3 and 371 DF,  p-value: < 0.00000000000000022
mod7 <- lm(Log_AvgMEP ~ Log_AvgHHSize + Log_Pop18to54 + Log_CancerRate + Log_PctUnwellAdults, data = dat2)
summary (mod7)
## 
## Call:
## lm(formula = Log_AvgMEP ~ Log_AvgHHSize + Log_Pop18to54 + Log_CancerRate + 
##     Log_PctUnwellAdults, data = dat2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.60568 -0.15872 -0.02106  0.15081  0.74977 
## 
## Coefficients:
##                     Estimate Std. Error t value          Pr(>|t|)    
## (Intercept)          4.03273    0.58652   6.876 0.000000000026353 ***
## Log_AvgHHSize       -0.54635    0.07385  -7.398 0.000000000000934 ***
## Log_Pop18to54        0.49969    0.10879   4.593 0.000005989620398 ***
## Log_CancerRate      -0.22527    0.05697  -3.954 0.000092125428930 ***
## Log_PctUnwellAdults  0.20293    0.04648   4.366 0.000016477385806 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2225 on 370 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3758, Adjusted R-squared:  0.369 
## F-statistic: 55.68 on 4 and 370 DF,  p-value: < 0.00000000000000022

The Variance Inflation Factor (VIF) detects multicollinearity in regression analysis. The VIF for these variables is relatively small (with anything less than 10 considered appropriate), meaning there are no problems of collinearity.

mod8 <- lm(Log_AvgMEP ~ Log_AvgHHSize + Log_Pop18to54 + Log_CancerRate + Log_PctUnwellAdults + Log_PctBlack, data = dat2)
summary (mod8)
## 
## Call:
## lm(formula = Log_AvgMEP ~ Log_AvgHHSize + Log_Pop18to54 + Log_CancerRate + 
##     Log_PctUnwellAdults + Log_PctBlack, data = dat2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.70344 -0.14362 -0.00223  0.15156  0.71918 
## 
## Coefficients:
##                     Estimate Std. Error t value           Pr(>|t|)    
## (Intercept)          4.14022    0.58164   7.118 0.0000000000057677 ***
## Log_AvgHHSize       -0.59840    0.07564  -7.912 0.0000000000000301 ***
## Log_Pop18to54        0.47692    0.10794   4.419 0.0000130857529105 ***
## Log_CancerRate      -0.26638    0.05807  -4.587 0.0000061710619077 ***
## Log_PctUnwellAdults  0.27889    0.05242   5.320 0.0000001804957684 ***
## Log_PctBlack        -0.03109    0.01006  -3.090            0.00215 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2202 on 368 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3885, Adjusted R-squared:  0.3802 
## F-statistic: 46.76 on 5 and 368 DF,  p-value: < 0.00000000000000022
vif(mod8)
##       Log_AvgHHSize       Log_Pop18to54      Log_CancerRate Log_PctUnwellAdults 
##            1.664222            3.502726            2.777506            2.582808 
##        Log_PctBlack 
##            1.480532
stargazer(mod6, mod7, mod8, type = "text")
## 
## ===========================================================================================
##                                               Dependent variable:                          
##                     -----------------------------------------------------------------------
##                                                   Log_AvgMEP                               
##                               (1)                     (2)                     (3)          
## -------------------------------------------------------------------------------------------
## Log_AvgHHSize              -0.415***               -0.546***               -0.598***       
##                             (0.069)                 (0.074)                 (0.076)        
##                                                                                            
## Log_Pop18to54              0.275***                0.500***                0.477***        
##                             (0.098)                 (0.109)                 (0.108)        
##                                                                                            
## Log_CancerRate             -0.282***               -0.225***               -0.266***       
##                             (0.057)                 (0.057)                 (0.058)        
##                                                                                            
## Log_PctUnwellAdults                                0.203***                0.279***        
##                                                     (0.046)                 (0.052)        
##                                                                                            
## Log_PctBlack                                                               -0.031***       
##                                                                             (0.010)        
##                                                                                            
## Constant                   5.438***                4.033***                4.140***        
##                             (0.502)                 (0.587)                 (0.582)        
##                                                                                            
## -------------------------------------------------------------------------------------------
## Observations                  375                     375                     374          
## R2                           0.344                   0.376                   0.389         
## Adjusted R2                  0.338                   0.369                   0.380         
## Residual Std. Error    0.228 (df = 371)        0.222 (df = 370)        0.220 (df = 368)    
## F Statistic         64.738*** (df = 3; 371) 55.681*** (df = 4; 370) 46.763*** (df = 5; 368)
## ===========================================================================================
## Note:                                                           *p<0.1; **p<0.05; ***p<0.01

##Correlation between final variables and MEP

par(mfrow=c(2, 3))           
plot(dat2$Log_AvgHHSize, dat2$Log_AvgMEP, main=paste("Corr=", cor(dat2$Log_AvgHHSize, dat2$Log_AvgMEP, use="complete.obs", method="pearson")))
plot(dat2$Log_Pop18to54, dat2$Log_AvgMEP, main=paste("Corr=", cor(dat2$Log_Pop18to54, dat2$Log_AvgMEP, use="complete.obs", method="pearson")))
plot(dat2$Log_CancerRate, dat2$Log_AvgMEP, main=paste("Corr=", cor(dat2$Log_CancerRate, dat2$Log_AvgMEP, use="complete.obs", method="pearson")))
plot(dat2$Log_PctUnwellAdults, dat2$Log_AvgMEP, main=paste("Corr=", cor(dat2$Log_PctUnwellAdults, dat2$Log_AvgMEP, use="complete.obs", method="pearson")))
plot(dat2$Log_PctBlack, dat2$Log_AvgMEP, main=paste("Corr=", cor(dat2$Log_PctBlack, dat2$Log_AvgMEP, use="complete.obs", method="pearson")))

mod9 <- lm(Log_AvgMEP ~ Log_MedHHInc, data=dat2)
summary (mod9)
## 
## Call:
## lm(formula = Log_AvgMEP ~ Log_MedHHInc, data = dat2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.67988 -0.20508 -0.04325  0.19902  0.58192 
## 
## Coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   6.09748    0.30435  20.034 <0.0000000000000002 ***
## Log_MedHHInc -0.04104    0.02846  -1.442                0.15    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2793 on 372 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.005558,   Adjusted R-squared:  0.002885 
## F-statistic: 2.079 on 1 and 372 DF,  p-value: 0.1502
par(mfrow=c(1,1))           
plot (dat2$Log_MedHHInc, dat2$Log_AvgMEP )
abline (mod9, col="red")