#knitr::opts_chunk$set(echo=TRUE, warning = FALSE, message = FALSE, results = 'hide', include=TRUE, fig.keep = 'all')
## SET UP ##
#Setup for our analysis, including necessary packages.
setwd("~/Documents/UPenn/2022 Spring/CPLN 505 - Planning by Numbers/Assignment 2")
library(tidyverse)
library(tidycensus)
library(ggplot2)
library(dplyr)
library(data.table)
require("ggrepel")
library(hrbrthemes)
library(viridis)
library(ggridges)
library(lubridate)
library(tigris)
library(gganimate)
library(riem)
library(gridExtra)
library(knitr)
library(kableExtra)
library(mapview)
library(ggcorrplot)
library(RColorBrewer)
library(stargazer)
library(correlationfunnel)
library(tidyquant)
library(plotly)
library(shiny)
library(bslib)
library(shinydashboard)
library(DT)
options(scipen=999)
dat <- read.csv("cwns2012.csv", header = TRUE)
TO: Mayor Kenney
FROM: Riddhi Batra & Emily Goldstein
DATE: 3 March 2022
RE: Water Infrastructure Needs in Philadelphia
Dear Mayor Kenney,
Since former Mayor Michael Nutter’s introduction of the Greenworks Philadelphia plan in 2009, Philadelphia has been on track to perform major improvements to its waste and stormwater infrastructure with the goal of reducing the overall burden on Philly’s aging water infrastructure and saving the City critical dollars in future facility repairs and improvements. In an effort to understand the potential scale and impact of these improvements, we have performed several comparative statistical analyses across geographic regions, states, and counties, studying their relationship to Philadelphia County, and identifying critical metrics impacting Philly’s potential infrastructure need costs.1 We have outlined our key findings below and provided a more detailed and technical evaluation in our attached analysis.
Philadelphia’s water infrastructure needs are closely related to the county’s population and population density.
We identified positive relationships between Philadelphia’s population and population density and its share of infrastructure needs. This finding is intuitive and unsurprising. Philadelphia County alone accounts for almost 20% of Pennsylvania’s official needs and Philadelphia County contains the highest population density of all counties in PA (and growing– between 2000 – 2010 by 0.56%). Additionally, over 24% of the residential population receiving treatment in Pennsylvania belong to Philadelphia County. We also evaluated the relationship between the percentage of white identifying populations and median income, finding these metrics to have little to no association with infrastructure needs.
The City of Philadelphia’s total need accounts for just over 3% of the four states within the Delaware River Watershed, and its mean population density falls within the third quartile.
Within the Delaware River Watershed (DRW), a critical source for Philadelphia’s drinking water, the City of Philadelphia accounted for over 3% of the four states, including New York, New Jersey, and Delaware, also within the DRW. Like Philadelphia’s high need percentage in PA, it also has a comparatively high need relative to other related regions. While Philadelphia is dominant in its percentage share of need in PA, the state of PA itself falls near the middle range of need at number 28 when compared to mean need to all 49 states. However, in terms of median need, PA falls at number 18 within the third quartile of all states—a discrepancy which can be explained by the significant differences in min and max need values within PA that contribute towards skewing the state’s mean need.
Our analysis showed that every unit increase in residences receiving treatment is accompanied by an almost $700 increase in total need for Philadelphia.
Within the population of Philadelphia only about 62% of the residential and non-residential population is receiving water treatment. We anticipate this percentage share of the population using city water collection and treatment to continue to grow as it has over the past 10 years of our study period. As the number of Philly’s water infrastructure users increases, so will the cost burden experienced by the City.
We also recommend that particular attention be paid to upgrading the quality and capacity of CSS (Combined Sewer System) infrastructure in Philadelphia through adequate budgetary allocations and consistent quality evaluations.
The presence of a CSS facility also impacts Philadelphia’s water infrastructure requirements. Our enclosed regression analysis found that the increase in official need per unit increase in population density is 132% higher for facilities with a CSS than those without. This could be attributed to the high maintenance costs associated with Combined Sewer Systems.
A strategy taking into consideration the steady increase in the key metrics of population and population density, along with astute attention to Philadelphia’s aging CSS system, should support the City in its efforts to improve its water infrastructure and lower its operational costs.
Sincerely,
Riddhi Batra & Emily Goldstein
Future city planners aspiring to be semi-competent in R.
We started by creating a new data frame to include change and percent change for all metrics.
dat1 <- select(dat, 1:18, 21:26, 30:31, 35, 37:38, 40)%>%
mutate(CHANGE_RES_REC_COLLCTN = PROJ_RES_REC_COLLCTN - PRES_RES_REC_COLLCTN)%>%
mutate(PCT_CHANGE_RES_REC_COLLCTN = (CHANGE_RES_REC_COLLCTN/PRES_RES_REC_COLLCTN)*100)%>%
mutate(CHANGE_RES_REC_TRMT = PROJ_RES_REC_TRMT - PRES_RES_REC_TRMT)%>%
mutate(PCT_CHANGE_RES_REC_TRMT = (CHANGE_RES_REC_TRMT/PRES_RES_REC_TRMT)*100)%>%
mutate(CHANGE_N_RES_REC_TRMT = PROJ_N_RES_REC_TRMT - PRES_N_RES_REC_TRTM)%>%
mutate(PCT_CHANGE_N_RES_REC_TRMT = (CHANGE_N_RES_REC_TRMT/PRES_N_RES_REC_TRTM)*100)%>%
mutate(CHANGE_N_RES_REC_COLLCTN = PROJ_N_RES_REC_COLLCTN - PRES_N_RES_REC_COLLCTN)%>%
mutate(PCT_CHANGE_N_RES_REC_COLLCTN = (CHANGE_N_RES_REC_COLLCTN/PRES_N_RES_REC_COLLCTN)*100)%>%
mutate(ALAND_SQMILE = ALAND/2.59e+6)%>%
mutate(POP_DENSITY_10 = POP10/ALAND_SQMILE)%>%
mutate(POP_DENSITY_00 = POP00/ALAND_SQMILE)%>%
mutate(CHANGE_POP_DENSITY = POP_DENSITY_10 - POP_DENSITY_00)%>%
mutate(PCT_CHANGE_POP_DENSITY = (CHANGE_POP_DENSITY/POP_DENSITY_00)*100)%>%
mutate(MEDINC99_INFL = MEDINC99*1.29)%>%
mutate(CHANGE_MEDINC = MEDINC09 - MEDINC99_INFL)%>%
mutate(PCT_CHANGE_MEDINC = (CHANGE_MEDINC/MEDINC99_INFL)*100)
Total Official Need
Total Reported Need
The facility with the highest reported need is CWNS NUMBER 36002001005, in Queens County, NY, EPA Region 2.
summary(dat1$TOTAL_OFFICIAL_NEED)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 704599 2409822 21623215 7660571 5160108526
Top Ten Facilities with the Most Need
dat_top10 <- dat1%>%
slice_max(TOTAL_OFFICIAL_NEED, n = 10)
DT::datatable(
dat_top10,
height = 150,
options = list(scrollX = TRUE)
)
Population Density
Change in Population Density
summary(dat1$PCT_CHANGE_POP_DENSITY)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -46.605 -0.640 4.269 7.341 11.730 110.355
Density Plot: Facilities with Population Density Below Mean Value
mean_pop.dens <- mean(dat1$POP_DENSITY_10)
dat1%>%
filter(POP_DENSITY_10 <= 532.99)%>%
ggplot(aes(x=POP_DENSITY_10)) +
ggtitle("Population Density Distribution") +
xlab("Population Density (2010) less than mean values") + ylab("Density") +
geom_density(fill="orchid", color="orchid3", alpha=0.8)
Median Income
Change in Median Income
summary(dat1$CHANGE_MEDINC)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -15488.5 -3924.0 -1614.8 -1530.3 706.9 25537.0
Percent Change in Median Income
summary(dat1$PCT_CHANGE_MEDINC)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -36.025 -6.512 -2.663 -2.447 1.244 49.526
Median Income: Plotted
ggplot(dat1, aes(x=MEDINC09)) +
geom_density(fill="orchid", color="orchid3", alpha=0.8)
Residential Collection
Change in Residential Population Receiving Collection
summary(dat1$CHANGE_RES_REC_COLLCTN)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -88018.0 19.0 275.5 4636.7 1800.0 1300000.0
Percent Change in Residential Population Receiving Collection
summary(dat1$PCT_CHANGE_RES_REC_COLLCTN)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -100.000 1.745 12.500 Inf 35.780 Inf 23
Non-Residential Collection
Change in Non-Residential Population Receiving Collection
summary(dat1$CHANGE_N_RES_REC_COLLCTN)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -254000.0 0.0 0.0 376.8 0.0 354330.0
Percent Change in Non-Residential Population Receiving Collection
summary(dat1$PCT_CHANGE_N_RES_REC_COLLCTN)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -100.00 0.00 23.46 Inf 100.00 Inf 7217
Residential Treatment
Change in Residential Population Receiving Treatment
summary(dat1$CHANGE_RES_REC_TRMT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -187386 31 326 5477 2096 1300028
Percent Change in Residential Population Receiving Treatment
summary(dat1$PCT_CHANGE_RES_REC_TRMT)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -100.000 2.302 13.636 Inf 37.883 Inf 18
Non-Residential Treatment
Change in Non-Residential Population Receiving Treatment
summary(dat1$CHANGE_N_RES_REC_TRMT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -254000.0 0.0 0.0 437.9 0.0 354330.0
Percent Change in Non-Residential Population Receiving Treatment
summary(dat1$PCT_CHANGE_N_RES_REC_TRMT)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -100.00 0.00 24.62 Inf 100.00 Inf 7102
EPA
Summary Stats by Need of EPA Regions
dat_epa <- group_by(dat1, EPA_REGION)%>%
summarise(count = n(),
mean_need = mean(TOTAL_OFFICIAL_NEED),
median_need = median(TOTAL_OFFICIAL_NEED),
mean_popdensity00 = mean(POP_DENSITY_00),
mean_popdensity10 = mean(POP_DENSITY_10),
median_popdensity00 = median(POP_DENSITY_00),
median_popdensity10 = median(POP_DENSITY_10))
summary(dat_epa$median_need)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 946351 1896198 3022067 4276328 3977127 17740293
Summary Stats by Density of EPA Regions
summary(dat_epa$mean_popdensity00)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 105.6 179.5 411.7 568.6 596.5 2208.0
Top Ten by EPA Region
dat_top10epa <- dat1%>%
group_by(EPA_REGION)%>%
slice_max(TOTAL_OFFICIAL_NEED, n=10)
DT::datatable(
dat_top10epa,
height = 100,
options = list(scrollX = TRUE)
)
#CHUNK FOR BREAKING UP TABLES
Bottom Ten by EPA Region
dat_bottom10epa <- dat1%>%
group_by(EPA_REGION)%>%
slice_min(TOTAL_OFFICIAL_NEED, n=10)
DT::datatable(
dat_bottom10epa,
height = 100,
options = list(scrollX = TRUE)
)
We created scatterplots to visualize relationships between population density and official need for different groupings of the CWNS data set.
Of the 10 EPAs represented:
– EPA 9 has a mid-range population density with the highest official need.
– EPA 2 has the highest population density with a high official need.
– EPA 8 has the lowest population density and the lowest reported need.
– EPA 3, which contains Philadelphia, has a relatively lower population density and official need.
These findings are consistent with the presence of large urban centers in EPA 9 and EPA 2 – situated on the West and East coasts respectively, both regions contain cities and counties with high populations and water requirements – such as Los Angeles, San Franciso, New York, Queens, and Hudson.
Plot: Population Density vs Total Need, by EPA
ggplot(dat_epa, aes(x = mean_popdensity10,
y = mean_need)) +
geom_point(size=5, shape=20, color="darkorchid4") +
geom_text_repel(aes(label = rownames(dat_epa), size = 4)) +
theme(legend.position = "bottom") +
ggtitle("Population Density vs. Total Need, by EPA") +
xlab("Mean Population Density (2010)") + ylab("Mean Official Need")
STATE
Top Ten by State
dat_state <- group_by(dat1, STATEFP, STATE)%>%
summarise(count = n(),
mean_need = mean(TOTAL_OFFICIAL_NEED),
median_need = median(TOTAL_OFFICIAL_NEED),
mean_popdensity00 = mean(POP_DENSITY_00),
mean_popdensity10 = mean(POP_DENSITY_10),
median_popdensity00 = median(POP_DENSITY_00),
median_popdensity10 = median(POP_DENSITY_10))%>%
filter(count > 2)
dat_top10state <- dat1%>%
group_by(STATEFP)%>%
slice_max(TOTAL_OFFICIAL_NEED, n = 10)
DT::datatable(
dat_top10state,
height = 100,
options = list(scrollX = TRUE)
)
#CHUNK FOR BREAKING UP TABLES
Bottom Ten by State
dat_bottom10state <- dat1%>%
group_by(STATEFP)%>%
slice_min(TOTAL_OFFICIAL_NEED, n = 10)
DT::datatable(
dat_bottom10state,
height = 100,
options = list(scrollX = TRUE)
)
When grouped by “State”, we can observe population versus official need in a manner consistent with our EPA grouping, in addition to an unexpected cameo:
– California, falling under EPA 9, has mid-range population density with a high need.
– New Jersey has the highest population density with a mid-range need, and New York has a high population density with a high need; both belonging to EPA 2.
– Pennsylvania has a mid-range population density with a low official need.
Surprisingly, the highest need of all states is shouldered by Hawaii, despite a low population density. This could be attributed to the fact that Hawaii’s current water infrastructure is outdated1 and requires extra attention due to its geographic susceptibility to climate change. 2
Population Density vs. Total Need by State
ggplot(dat_state, aes(x = mean_popdensity10,
y = mean_need)) +
geom_point(size=4, shape=20, color="springgreen4") +
geom_text_repel(aes(label = STATE, size = 2)) +
theme(legend.position = "bottom") +
ggtitle("Population Density vs. Total Need, by State") +
xlab("Mean Population Density (2010)") + ylab("Mean Official Need")
In order to begin contextualizing Philadelphia’s water needs, we decided to focus our statistical analysis on the four states falling under the Delaware River Watershed (12,800 sq miles). While Philadelphia’s water supply is split between two sources – the Schyulkill and Delaware rivers – we’ve chosen to compare infrastructure and resources to its northeastern neighbors, in particular the states of New York, New Jersey, and Delaware. As they are geographically related and rely on the Delaware River Watershed for much of their potable water needs, these states are a logical point of reference. Within the EPA’s regional, large aquatic ecosystems (LAE) classification system, these states fall within regions 2 (NY, NJ) and 3 (PA, DE).
dat_PA <- filter(dat1, STATEFP == 42)
dat_NJ <- filter(dat1, STATEFP == 34)
dat_NY <- filter(dat1, STATEFP == 9)
dat_DE <- filter(dat1, STATEFP == 10)
dat_focus_states <- filter(dat1, STATE %in% c("PA", "NJ", "NY", "DE"))
Summary Statistics
Of the four states sharing the Delaware River Watershed:
– The highest need is for the state of New York, which also has a relatively high population density.
– Delaware has the lowest need and population statistics.
– Seen in conjunction with New York and New Jersey, Pennsylvania’s population density and total need is much lower.
Total Official Need
summary(dat_focus_states$TOTAL_OFFICIAL_NEED)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1067266 3371533 37872196 8494217 5160108526
Median Income
summary(dat_focus_states$CHANGE_MEDINC)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -7174.6 -1722.5 -547.4 -494.3 1052.2 14726.6
Population Density
summary(dat_focus_states$CHANGE_POP_DENSITY)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -79.88 -1.52 11.16 34.49 45.69 2148.63
Residential Collection
summary(dat_focus_states$CHANGE_RES_REC_COLLCTN)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -88018.0 0.0 100.0 841.1 748.5 35408.0
Residential Treatment
summary(dat_focus_states$CHANGE_RES_REC_TRMT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -111031.0 0.0 180.5 1184.9 1047.8 54959.0
Box Plots: Official Need by Focus States
We constructed box plots to understand the distribution of total need within each of these four states. Facilities with outlier values in New York, New Jersey, and Pennsylvania skewed the representation of our box plots.
New York, especially, was literally off the charts!
ggplot(dat_focus_states,
aes(x=STATE, y=TOTAL_OFFICIAL_NEED)) +
geom_boxplot(fill="seagreen3", alpha=0.2) +
geom_jitter(color="gray50", size=0.4, alpha=0.9) +
ggtitle("Water Infrastructure Need: Delaware River Watershed") +
xlab("STATE") + ylab("Official Need")
##NY need is skewing comparative visualization - literally off the charts!
We decided to log transform both variables to fix our box plots. The dots behind each box plot represent need values for each state’s facility, which gave us a way to visualize the spread of data.
– As surmised above, the largest spread in values can be seen in New Jersey, New York, and Pennsylvania.
– Median values of total need are about the same for all states, with a slight upward lean in the range of data for Delaware.
ggplot(dat_focus_states,
aes(x=STATE, y=TOTAL_OFFICIAL_NEED)) +
scale_y_log10() +
geom_boxplot(fill="seagreen3", alpha=0.2) +
geom_jitter(color="gray50", size=0.4, alpha=0.9) +
ggtitle("Water Infrastructure Need: Delaware River Watershed") +
xlab("STATE") + ylab("Official Need (Log Transformed)")
Summary of Percent Change Residential Collection for Focus States
summary(dat_focus_states$PCT_CHANGE_RES_REC_COLLCTN)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -79.095 0.000 4.349 Inf 14.333 Inf 2
Change in Residential Population Receiving Collection
ggplot(dat_focus_states,
aes(x=STATE, y=PCT_CHANGE_RES_REC_COLLCTN)) +
geom_boxplot(fill="seagreen4", alpha=0.2) +
xlab("STATE") + ylab("Change") +
ylim(-75,75)
dat_focus_states_summary <- filter(dat1, STATE %in% c("PA", "NJ", "NY", "DE"))%>%
group_by(STATE)%>%
summarise(count = n(),
mean_need = mean(TOTAL_OFFICIAL_NEED),
median_need = median(TOTAL_OFFICIAL_NEED),
mean_popdensity00 = mean(POP_DENSITY_00),
mean_popdensity10 = mean(POP_DENSITY_10),
median_popdensity00 = median(POP_DENSITY_00),
median_popdensity10 = median(POP_DENSITY_10))
Population Density vs. Total Need by Focus State
ggplot(dat_focus_states_summary, aes(x = mean_popdensity10,
y = mean_need)) +
geom_point(size=5, shape=20, color="olivedrab4") +
geom_text_repel(aes(label = STATE, size=3)) +
theme(legend.position = "bottom") +
ggtitle("Water Infrastructure Need: Delaware River Watershed") +
xlab("Mean Population Density (2010)") + ylab("Mean Official Need")
dat_PA_counties <- group_by(dat_PA, COUNTYNAME)%>%
summarise(count = n(),
mean_need = mean(TOTAL_OFFICIAL_NEED),
median_need = median(TOTAL_OFFICIAL_NEED),
mean_popdensity00 = mean(POP_DENSITY_00),
mean_popdensity10 = mean(POP_DENSITY_10),
median_popdensity00 = median(POP_DENSITY_00),
median_popdensity10 = median(POP_DENSITY_10))
Population Density vs. Official Need
Within Pennsylvania, Philadelphia County unsurprisingly reported the highest total need with the highest population density within the state of Pennsylvania. While all other counties were clustered near the graph origin with low population density and low need, Delaware and Lackawanna Counties exhibited some deviation from the origin yet remain far from Philadelphia Counties statistics.
ggplot(dat_PA_counties, aes(x = mean_popdensity10,
y = mean_need)) +
geom_point(size=5, shape=20, color="darkgoldenrod1") +
geom_text_repel(aes(label = COUNTYNAME, size=3)) +
theme(legend.position = "bottom") +
ggtitle("Water Infrastructure Need: Pennsylvania Counties") +
xlab("Mean Population Density (2010)") + ylab("Mean Official Need")
Certain counties are standing out a lot…
datPA <- filter(dat1, STATE == "PA")%>%
top_n(10, TOTAL_OFFICIAL_NEED)
Official Need for Top 10 Counties
Even when compared to the top 10 counties exhibiting high need within Pennsylvania, Philadelphia’s needs stack up much higher. This isn’t very surprising, given the high population density of Philadelphia County (11373 in 2010) as compared to other Pennsylvania counties.
ggplot(datPA, aes(x=as.factor(COUNTYNAME), fill=as.factor(TOTAL_OFFICIAL_NEED))) +
geom_bar(color="darkgoldenrod3", fill="darkgoldenrod1", alpha=0.8) +
xlab ("County") + ylab ("Total Official Need")
The non-normal distribution of the following density plot helps visualize this stark difference.
#DensityPlots
ggplot(dat_PA, aes(x=POP_DENSITY_10)) +
geom_density(fill="darkgoldenrod1", color="darkgoldenrod2", alpha=0.8) +
ggtitle("Variation in Population Density: Pennsylvania Counties") +
xlab("Population Density (2010)")
The following chart representing absolute values of need shows the comparatively high need of Philadelphia County with respect to Delaware and Lackawanna.
dat1%>%
filter(COUNTYNAME %in% c("Lackawanna", "Philadelphia", "Delaware"))%>%
ggplot(aes(x=COUNTYNAME, y=TOTAL_OFFICIAL_NEED)) +
geom_boxplot(fill="darkgoldenrod1", alpha=0.2) +
geom_jitter(color="gray50", size=0.4, alpha=0.9) +
ggtitle("Water Infrastructure Need: Pennsylvania Counties") +
xlab("STATE") + ylab("Official Need")
Log Transform
On log-transforming the chart variables, the extent of Philadelphia County’s need, relative to other PA counties can be clearly noticed in the differences in median values and the spread of data. This clearly demonstrates the extent to which Philadelphia County alone contributes to Pennsylvania’s water infrastructure needs.
#Log Transform
dat1%>%
filter(COUNTYNAME %in% c("Lackawanna", "Philadelphia", "Delaware"))%>%
ggplot(aes(x=COUNTYNAME, y=TOTAL_OFFICIAL_NEED)) +
geom_boxplot(fill="darkgoldenrod1", alpha=0.2) +
scale_y_log10() +
geom_jitter(color="gray50", size=0.4, alpha=0.9) +
ggtitle("Water Infrastructure Need: Pennsylvania Counties") +
xlab("STATE") + ylab("Official Need (Log Transformed)")
Philadelphia Water Department has three water treatment plants, the Baxter Plant, Queen Lane Plant and Belmont Plant. Located in Torresdale, the Baxter Plant cleans water from the Delaware. Both the Queen Lane (East Falls) and Belmont (Wynnefield) plants source water from the Schuylkill.
Summary
##LOOK AT PHILLY
dat_PHL <- filter(dat1, COUNTYFP == 101, STATEFP == 42)
#colnames(dat_PHL)
summary(dat_PHL$TOTAL_OFFICIAL_NEED)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 249190563 383117348 517044133 429336765 519409866 521775599
Total Need
#Density Plot: Total Need
ggplot(dat_PHL, aes(x=TOTAL_OFFICIAL_NEED)) +
geom_density(fill="coral", color="coral2", alpha=0.8) +
ggtitle("Variation in Need: Philadelphia Facilities") +
xlab("Official Need")
Of the three facilities, the highest reported need is for Philadelphia Water Dept (SW).
#Compare Changes in Collection and Treatment
#Barplots
ggplot(dat_PHL, aes(y=CHANGE_RES_REC_TRMT, x=CWNS_NUMBER)) +
geom_bar(position="dodge", stat="identity", color="coral1", fill="coral1", alpha=0.6) +
ggtitle("Change in Residential Population Receiving Treatment") +
xlab("CWNS Facility") + ylab("Change in Treatment Received")
Magnitude of overall budgeted need for CSS is significantly more than budgeted need for MS4.
Means are significantly different as the test statistic is greater than the critical value.
Test statistic is within the interval of 95% confidence.
dat_YesCSS <- filter(dat, CSS == "1")
dat_YesMS4 <- filter(dat, MS4 == "1")
#sample means
meanNeed_YesCSS <-mean(dat_YesCSS$TOTAL_OFFICIAL_NEED)
meanNeed_YesMS4 <-mean(dat_YesMS4$TOTAL_OFFICIAL_NEED)
##Mean need for CSS is larger than MS4
#sample standardized deviations
stdNeed_YesCSS <-sd(dat_YesCSS$TOTAL_OFFICIAL_NEED)
stdNeed_YesMS4 <-sd(dat_YesMS4$TOTAL_OFFICIAL_NEED)
#number of obs
numobs_YesCSS <- dim(dat_YesCSS)[1]
numobs_YesMS4 <- dim(dat_YesMS4)[1]
#test stat calc: difference in the means & standardized deviations
numerator_teststat <- meanNeed_YesCSS - meanNeed_YesMS4
denom_teststat <-
((stdNeed_YesCSS*stdNeed_YesCSS)/numobs_YesCSS +
(stdNeed_YesMS4*stdNeed_YesMS4)/numobs_YesMS4)^.5
test_stat<-numerator_teststat/denom_teststat
abs(test_stat)>1.96
## [1] TRUE
Paired T-test of CSS vs. MS4s
Reject null hypothesis of equal means.
t.test(dat_YesCSS$TOTAL_OFFICIAL_NEED, dat_YesMS4$TOTAL_OFFICIAL_NEED)
##
## Welch Two Sample t-test
##
## data: dat_YesCSS$TOTAL_OFFICIAL_NEED and dat_YesMS4$TOTAL_OFFICIAL_NEED
## t = 6.712, df = 699.19, p-value = 0.00000000003972
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 70514032 128823442
## sample estimates:
## mean of x mean of y
## 113053301 13384564
##Reject null hypothesis of equal means.
Summary of Official Need for CSS Facilities
summary(dat_YesCSS$TOTAL_OFFICIAL_NEED)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 2832885 8813826 113053301 36254645 5160108526
Summary of Official Need for MS4 Facilities
summary(dat_YesMS4$TOTAL_OFFICIAL_NEED)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 628112 2161876 13384564 6708263 3289580276
Mean need of TMDL impacted sites is greater than non-TMDL sites.
Test statistic is within the interval of 95% confidence.
dat_YesTMDL <- filter(dat, TMDL_INDICATOR == "Y")
dat_NoTMDL <- filter(dat, TMDL_INDICATOR == "N")
#sample means
meanNeed_YesTMDL <-mean(dat_YesTMDL$TOTAL_OFFICIAL_NEED)
meanNeed_NoTMDL <-mean(dat_NoTMDL$TOTAL_OFFICIAL_NEED)
##Mean need of TMDL impacted sites is greater than non-TMDL sites.
#sample standardized deviations
stdNeed_YesTMDL <-sd(dat_YesTMDL$TOTAL_OFFICIAL_NEED)
stdNeed_NoTMDL <-sd(dat_NoTMDL$TOTAL_OFFICIAL_NEED)
#number of obs
numobs_YesTMDL <- dim(dat_YesTMDL)[1]
numobs_NoTMDL <- dim(dat_NoTMDL)[1]
#test stat calc: difference in the means & standardized deviations
numerator_teststatTMDL <- meanNeed_NoTMDL - meanNeed_YesTMDL
denom_teststatTMDL <-
((stdNeed_NoTMDL*stdNeed_NoTMDL)/numobs_NoTMDL +
(stdNeed_YesTMDL*stdNeed_YesTMDL)/numobs_YesTMDL)^.5
test_statTMDL<-numerator_teststatTMDL/denom_teststatTMDL
abs(test_statTMDL)>1.96
## [1] TRUE
Paired T-test of TMDL and Official Need
Summary of Official Need for Non-TMDL Facilities
#paired t-test
summary(dat_NoTMDL$TOTAL_OFFICIAL_NEED)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 690815 2371608 20228097 7536767 5160108526
Summary of Official Need for TMDL Facilities
summary(dat_YesTMDL$TOTAL_OFFICIAL_NEED)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1249456 3862453 58996731 14372950 3945228972
Paired T-test of Need by EPA Regions with TMDL Facilities
Region 1
Fail to reject null hypothesis; cannot establish significance.
#compare TMDL by need by EPA regions
t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
data = filter(dat, EPA_REGION == 1), paired = FALSE)
##
## Welch Two Sample t-test
##
## data: TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = -0.85922, df = 3.0193, p-value = 0.453
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
## -542766502 311336752
## sample estimates:
## mean in group N mean in group Y
## 32685334 148400209
##High p-value/low t-value
Region 2
Fail to reject null hypothesis; cannot establish significance.
t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
data = filter(dat, EPA_REGION == 2), paired = FALSE)
##
## Welch Two Sample t-test
##
## data: TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = -3.5245, df = 5.0045, p-value = 0.01681
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
## -3033874754 -475206220
## sample estimates:
## mean in group N mean in group Y
## 39662349 1794202835
##High p-value/low t-value
Region 3
Fail to reject null hypothesis; cannot establish significance.
t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
data = filter(dat, EPA_REGION == 3), paired = FALSE)
##
## Welch Two Sample t-test
##
## data: TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = 2.6523, df = 31.104, p-value = 0.01247
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
## 2196333 16806577
## sample estimates:
## mean in group N mean in group Y
## 18403975 8902520
##High p-value/low t-value
Region 4
Fail to reject null hypothesis; cannot establish significance.
t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
data = filter(dat, EPA_REGION == 4), paired = FALSE)
##
## Welch Two Sample t-test
##
## data: TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = 0.12885, df = 18.325, p-value = 0.8989
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
## -17470739 19756906
## sample estimates:
## mean in group N mean in group Y
## 22352317 21209234
##High p-value/low t-value -->
Region 5
Reject null hypothesis, significance between need and TMDL.
t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
data = filter(dat, EPA_REGION == 5), paired = FALSE)
##
## Welch Two Sample t-test
##
## data: TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = 3.9798, df = 1436.2, p-value = 0.00007241
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
## 5244922 15440674
## sample estimates:
## mean in group N mean in group Y
## 15088120 4745322
##Low p-value/high t-value --> reject null hypothesis
Region 6
Reject null hypothesis, significance between need and TMDL.
t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
data = filter(dat, EPA_REGION == 6), paired = FALSE)
##
## Welch Two Sample t-test
##
## data: TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = -3.074, df = 63.917, p-value = 0.003104
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
## -80860108 -17158802
## sample estimates:
## mean in group N mean in group Y
## 9455932 58465387
##Low p-value/high t-value --> reject null hypothesis
Regions 7 & 8
Failed t-test; not enough values.
#t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
# data = filter(dat, EPA_REGION == 7), paired = FALSE)
##failed t-test, not enough values
#t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
# data = filter(dat, EPA_REGION == 8), paired = FALSE)
##failed t-test, not enough values
Region 9
Fail to reject null hypothesis; cannot establish significance.
t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
data = filter(dat, EPA_REGION == 9), paired = FALSE)
##
## Welch Two Sample t-test
##
## data: TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = -0.87332, df = 1.0125, p-value = 0.5415
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
## -1749036595 1517883950
## sample estimates:
## mean in group N mean in group Y
## 65458334 181034656
##Very high p-value/low t-value --> cannot establish significance; accept null hypothesis?
Region 10
Fail to reject null hypothesis; cannot establish significance.
t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
data = filter(dat, EPA_REGION == 10), paired = FALSE)
##
## Welch Two Sample t-test
##
## data: TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = 1.2645, df = 252, p-value = 0.2072
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
## -4907121 22511440
## sample estimates:
## mean in group N mean in group Y
## 26134688 17332528
##High p-value/low t-value --> fail to reject null hypothesis; cannot establish significance
Cannot establish significant association between facility needs and TMDL impacted-sites.
Some significance was established in regions 5 & 6; otherwise there were too few values in regions 7 & 8; could not establish significance in regions 1 - 4, 9, 10.
Cross Tables of Official Need and Region
Summary of Official Need
library(ggplot2)
library(vcd)
library (gmodels)
library (MASS)
library(fabricatr)
library(reactable)
summary(dat$TOTAL_OFFICIAL_NEED)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 704599 2409822 21623215 7660571 5160108526
## cross table of need and region --> too large need to group
# (dat$TOTAL_OFFICIAL_NEED, dat$EPA_REGION, fisher = FALSE, chisq = TRUE,
# expected = TRUE, sresid = FALSE, format="SPSS")
##CHI2: 74282.34; P: 0.00000000000000057
##Minimum expected frequency: 0.03361045; Cells with Expected Frequency < 5: 79130 of 79140 (99.98736%)
Official Need Grouped by Five Equal Sections
dat1_intervalfive <- dat1 %>% mutate(intervalfive_Need = ntile(dat1$TOTAL_OFFICIAL_NEED, 5))%>%
mutate(intervalfive_pchmedinc = ntile(dat1$PCT_CHANGE_MEDINC, 5))%>%
mutate(intervalfive_pchpopden = ntile(dat1$PCT_CHANGE_POP_DENSITY, 5))%>%
mutate(intervalfive_pwhite10 = ntile(dat1$PCTWHITE10, 5))%>%
mutate(intervalfive_medinc09 = ntile(dat1$MEDINC09, 5))%>%
mutate(intervalfive_pop10 = ntile(dat1$POP_DENSITY_10, 5))
CrossTable(dat1_intervalfive$EPA_REGION, dat1_intervalfive$intervalfive_Need, fisher = FALSE, chisq = TRUE,
expected = TRUE, sresid = FALSE, format="SPSS")
##
## Cell Contents
## |-------------------------|
## | Count |
## | Expected Values |
## | Chi-square contribution |
## | Row Percent |
## | Column Percent |
## | Total Percent |
## |-------------------------|
##
## Total Observations in Table: 8420
##
## | dat1_intervalfive$intervalfive_Need
## dat1_intervalfive$EPA_REGION | 1 | 2 | 3 | 4 | 5 | Row Total |
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 42 | 51 | 63 | 83 | 126 | 365 |
## | 73.000 | 73.000 | 73.000 | 73.000 | 73.000 | |
## | 13.164 | 6.630 | 1.370 | 1.370 | 38.479 | |
## | 11.507% | 13.973% | 17.260% | 22.740% | 34.521% | 4.335% |
## | 2.494% | 3.029% | 3.741% | 4.929% | 7.482% | |
## | 0.499% | 0.606% | 0.748% | 0.986% | 1.496% | |
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 2 | 105 | 109 | 130 | 150 | 120 | 614 |
## | 122.800 | 122.800 | 122.800 | 122.800 | 122.800 | |
## | 2.580 | 1.551 | 0.422 | 6.025 | 0.064 | |
## | 17.101% | 17.752% | 21.173% | 24.430% | 19.544% | 7.292% |
## | 6.235% | 6.473% | 7.720% | 8.907% | 7.126% | |
## | 1.247% | 1.295% | 1.544% | 1.781% | 1.425% | |
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 3 | 125 | 165 | 239 | 258 | 244 | 1031 |
## | 206.200 | 206.200 | 206.200 | 206.200 | 206.200 | |
## | 31.976 | 8.232 | 5.217 | 13.013 | 6.929 | |
## | 12.124% | 16.004% | 23.181% | 25.024% | 23.666% | 12.245% |
## | 7.423% | 9.798% | 14.192% | 15.321% | 14.489% | |
## | 1.485% | 1.960% | 2.838% | 3.064% | 2.898% | |
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 4 | 226 | 266 | 231 | 320 | 335 | 1378 |
## | 275.600 | 275.600 | 275.600 | 275.600 | 275.600 | |
## | 8.927 | 0.334 | 7.218 | 7.153 | 12.802 | |
## | 16.401% | 19.303% | 16.763% | 23.222% | 24.311% | 16.366% |
## | 13.420% | 15.796% | 13.717% | 19.002% | 19.893% | |
## | 2.684% | 3.159% | 2.743% | 3.800% | 3.979% | |
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 5 | 311 | 415 | 424 | 347 | 255 | 1752 |
## | 350.400 | 350.400 | 350.400 | 350.400 | 350.400 | |
## | 4.430 | 11.910 | 15.459 | 0.033 | 25.974 | |
## | 17.751% | 23.687% | 24.201% | 19.806% | 14.555% | 20.808% |
## | 18.468% | 24.644% | 25.178% | 20.606% | 15.143% | |
## | 3.694% | 4.929% | 5.036% | 4.121% | 3.029% | |
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 6 | 253 | 234 | 207 | 185 | 184 | 1063 |
## | 212.600 | 212.600 | 212.600 | 212.600 | 212.600 | |
## | 7.677 | 2.154 | 0.148 | 3.583 | 3.847 | |
## | 23.801% | 22.013% | 19.473% | 17.404% | 17.310% | 12.625% |
## | 15.024% | 13.895% | 12.292% | 10.986% | 10.926% | |
## | 3.005% | 2.779% | 2.458% | 2.197% | 2.185% | |
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 7 | 427 | 251 | 205 | 126 | 92 | 1101 |
## | 220.200 | 220.200 | 220.200 | 220.200 | 220.200 | |
## | 194.215 | 4.308 | 1.049 | 40.298 | 74.638 | |
## | 38.783% | 22.797% | 18.619% | 11.444% | 8.356% | 13.076% |
## | 25.356% | 14.905% | 12.173% | 7.482% | 5.463% | |
## | 5.071% | 2.981% | 2.435% | 1.496% | 1.093% | |
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 8 | 131 | 138 | 116 | 88 | 42 | 515 |
## | 103.000 | 103.000 | 103.000 | 103.000 | 103.000 | |
## | 7.612 | 11.893 | 1.641 | 2.184 | 36.126 | |
## | 25.437% | 26.796% | 22.524% | 17.087% | 8.155% | 6.116% |
## | 7.779% | 8.195% | 6.888% | 5.226% | 2.494% | |
## | 1.556% | 1.639% | 1.378% | 1.045% | 0.499% | |
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 9 | 22 | 14 | 21 | 64 | 197 | 318 |
## | 63.600 | 63.600 | 63.600 | 63.600 | 63.600 | |
## | 27.210 | 38.682 | 28.534 | 0.003 | 279.804 | |
## | 6.918% | 4.403% | 6.604% | 20.126% | 61.950% | 3.777% |
## | 1.306% | 0.831% | 1.247% | 3.800% | 11.698% | |
## | 0.261% | 0.166% | 0.249% | 0.760% | 2.340% | |
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 10 | 42 | 41 | 48 | 63 | 89 | 283 |
## | 56.600 | 56.600 | 56.600 | 56.600 | 56.600 | |
## | 3.766 | 4.300 | 1.307 | 0.724 | 18.547 | |
## | 14.841% | 14.488% | 16.961% | 22.261% | 31.449% | 3.361% |
## | 2.494% | 2.435% | 2.850% | 3.741% | 5.285% | |
## | 0.499% | 0.487% | 0.570% | 0.748% | 1.057% | |
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 1684 | 1684 | 1684 | 1684 | 1684 | 8420 |
## | 20.000% | 20.000% | 20.000% | 20.000% | 20.000% | |
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 1025.513 d.f. = 36 p = 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000699429
##
##
##
## Minimum expected frequency: 56.6
Percent Change in Median Income
# Potential Association: Percent change in median income
CrossTable(dat1_intervalfive$intervalfive_pchmedinc, dat1_intervalfive$intervalfive_Need, fisher = FALSE, chisq = TRUE,
expected = TRUE, sresid = FALSE, format="SPSS")
##
## Cell Contents
## |-------------------------|
## | Count |
## | Expected Values |
## | Chi-square contribution |
## | Row Percent |
## | Column Percent |
## | Total Percent |
## |-------------------------|
##
## Total Observations in Table: 8420
##
## | dat1_intervalfive$intervalfive_Need
## dat1_intervalfive$intervalfive_pchmedinc | 1 | 2 | 3 | 4 | 5 | Row Total |
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 335 | 364 | 343 | 333 | 309 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 0.010 | 2.197 | 0.114 | 0.043 | 2.295 | |
## | 19.893% | 21.615% | 20.368% | 19.774% | 18.349% | 20.000% |
## | 19.893% | 21.615% | 20.368% | 19.774% | 18.349% | |
## | 3.979% | 4.323% | 4.074% | 3.955% | 3.670% | |
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 2 | 334 | 327 | 331 | 350 | 342 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 0.023 | 0.285 | 0.100 | 0.517 | 0.080 | |
## | 19.834% | 19.418% | 19.656% | 20.784% | 20.309% | 20.000% |
## | 19.834% | 19.418% | 19.656% | 20.784% | 20.309% | |
## | 3.967% | 3.884% | 3.931% | 4.157% | 4.062% | |
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 3 | 342 | 325 | 329 | 350 | 338 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 0.080 | 0.413 | 0.181 | 0.517 | 0.004 | |
## | 20.309% | 19.299% | 19.537% | 20.784% | 20.071% | 20.000% |
## | 20.309% | 19.299% | 19.537% | 20.784% | 20.071% | |
## | 4.062% | 3.860% | 3.907% | 4.157% | 4.014% | |
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 4 | 299 | 340 | 349 | 332 | 364 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 4.242 | 0.030 | 0.442 | 0.068 | 2.197 | |
## | 17.755% | 20.190% | 20.724% | 19.715% | 21.615% | 20.000% |
## | 17.755% | 20.190% | 20.724% | 19.715% | 21.615% | |
## | 3.551% | 4.038% | 4.145% | 3.943% | 4.323% | |
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 5 | 374 | 328 | 332 | 319 | 331 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 4.109 | 0.230 | 0.068 | 0.941 | 0.100 | |
## | 22.209% | 19.477% | 19.715% | 18.943% | 19.656% | 20.000% |
## | 22.209% | 19.477% | 19.715% | 18.943% | 19.656% | |
## | 4.442% | 3.895% | 3.943% | 3.789% | 3.931% | |
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 1684 | 1684 | 1684 | 1684 | 1684 | 8420 |
## | 20.000% | 20.000% | 20.000% | 20.000% | 20.000% | |
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 19.28741 d.f. = 16 p = 0.2540411
##
##
##
## Minimum expected frequency: 336.8
Percent Change in Population Density
# Potential Association: Percent change in population density
CrossTable(dat1_intervalfive$intervalfive_pchpopden, dat1_intervalfive$intervalfive_Need, fisher = FALSE, chisq = TRUE,
expected = TRUE, sresid = FALSE, format="SPSS")
##
## Cell Contents
## |-------------------------|
## | Count |
## | Expected Values |
## | Chi-square contribution |
## | Row Percent |
## | Column Percent |
## | Total Percent |
## |-------------------------|
##
## Total Observations in Table: 8420
##
## | dat1_intervalfive$intervalfive_Need
## dat1_intervalfive$intervalfive_pchpopden | 1 | 2 | 3 | 4 | 5 | Row Total |
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 453 | 393 | 371 | 292 | 175 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 40.090 | 9.378 | 3.473 | 5.959 | 77.729 | |
## | 26.900% | 23.337% | 22.031% | 17.340% | 10.392% | 20.000% |
## | 26.900% | 23.337% | 22.031% | 17.340% | 10.392% | |
## | 5.380% | 4.667% | 4.406% | 3.468% | 2.078% | |
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 2 | 345 | 371 | 374 | 327 | 267 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 0.200 | 3.473 | 4.109 | 0.285 | 14.466 | |
## | 20.487% | 22.031% | 22.209% | 19.418% | 15.855% | 20.000% |
## | 20.487% | 22.031% | 22.209% | 19.418% | 15.855% | |
## | 4.097% | 4.406% | 4.442% | 3.884% | 3.171% | |
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 3 | 323 | 351 | 326 | 347 | 337 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 0.565 | 0.599 | 0.346 | 0.309 | 0.000 | |
## | 19.181% | 20.843% | 19.359% | 20.606% | 20.012% | 20.000% |
## | 19.181% | 20.843% | 19.359% | 20.606% | 20.012% | |
## | 3.836% | 4.169% | 3.872% | 4.121% | 4.002% | |
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 4 | 300 | 308 | 325 | 354 | 397 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 4.021 | 2.463 | 0.413 | 0.878 | 10.760 | |
## | 17.815% | 18.290% | 19.299% | 21.021% | 23.575% | 20.000% |
## | 17.815% | 18.290% | 19.299% | 21.021% | 23.575% | |
## | 3.563% | 3.658% | 3.860% | 4.204% | 4.715% | |
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 5 | 263 | 261 | 288 | 364 | 508 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 16.171 | 17.060 | 7.071 | 2.197 | 87.023 | |
## | 15.618% | 15.499% | 17.102% | 21.615% | 30.166% | 20.000% |
## | 15.618% | 15.499% | 17.102% | 21.615% | 30.166% | |
## | 3.124% | 3.100% | 3.420% | 4.323% | 6.033% | |
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 1684 | 1684 | 1684 | 1684 | 1684 | 8420 |
## | 20.000% | 20.000% | 20.000% | 20.000% | 20.000% | |
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 309.038 d.f. = 16 p = 0.00000000000000000000000000000000000000000000000000000003417345
##
##
##
## Minimum expected frequency: 336.8
Percentage of Population, white in 2010
# Potential Association: Percentage of Population White 2010
CrossTable(dat1_intervalfive$intervalfive_pwhite10, dat1_intervalfive$intervalfive_Need, fisher = FALSE, chisq = TRUE,
expected = TRUE, sresid = FALSE, format="SPSS")
##
## Cell Contents
## |-------------------------|
## | Count |
## | Expected Values |
## | Chi-square contribution |
## | Row Percent |
## | Column Percent |
## | Total Percent |
## |-------------------------|
##
## Total Observations in Table: 8420
##
## | dat1_intervalfive$intervalfive_Need
## dat1_intervalfive$intervalfive_pwhite10 | 1 | 2 | 3 | 4 | 5 | Row Total |
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 315 | 292 | 267 | 329 | 481 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 1.411 | 5.959 | 14.466 | 0.181 | 61.739 | |
## | 18.705% | 17.340% | 15.855% | 19.537% | 28.563% | 20.000% |
## | 18.705% | 17.340% | 15.855% | 19.537% | 28.563% | |
## | 3.741% | 3.468% | 3.171% | 3.907% | 5.713% | |
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 2 | 284 | 276 | 307 | 349 | 468 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 8.277 | 10.976 | 2.637 | 0.442 | 51.109 | |
## | 16.865% | 16.390% | 18.230% | 20.724% | 27.791% | 20.000% |
## | 16.865% | 16.390% | 18.230% | 20.724% | 27.791% | |
## | 3.373% | 3.278% | 3.646% | 4.145% | 5.558% | |
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 3 | 298 | 318 | 333 | 382 | 353 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 4.470 | 1.049 | 0.043 | 6.066 | 0.779 | |
## | 17.696% | 18.884% | 19.774% | 22.684% | 20.962% | 20.000% |
## | 17.696% | 18.884% | 19.774% | 22.684% | 20.962% | |
## | 3.539% | 3.777% | 3.955% | 4.537% | 4.192% | |
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 4 | 334 | 400 | 360 | 339 | 251 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 0.023 | 11.859 | 1.598 | 0.014 | 21.858 | |
## | 19.834% | 23.753% | 21.378% | 20.131% | 14.905% | 20.000% |
## | 19.834% | 23.753% | 21.378% | 20.131% | 14.905% | |
## | 3.967% | 4.751% | 4.276% | 4.026% | 2.981% | |
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 5 | 453 | 398 | 417 | 285 | 131 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 40.090 | 11.121 | 19.098 | 7.967 | 125.753 | |
## | 26.900% | 23.634% | 24.762% | 16.924% | 7.779% | 20.000% |
## | 26.900% | 23.634% | 24.762% | 16.924% | 7.779% | |
## | 5.380% | 4.727% | 4.952% | 3.385% | 1.556% | |
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 1684 | 1684 | 1684 | 1684 | 1684 | 8420 |
## | 20.000% | 20.000% | 20.000% | 20.000% | 20.000% | |
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 408.9846 d.f. = 16 p = 0.00000000000000000000000000000000000000000000000000000000000000000000000000004758703
##
##
##
## Minimum expected frequency: 336.8
Median Income in 2009
CrossTable(dat1_intervalfive$intervalfive_medinc09, dat1_intervalfive$intervalfive_Need, fisher = FALSE, chisq = TRUE,
expected = TRUE, sresid = FALSE, format="SPSS")
##
## Cell Contents
## |-------------------------|
## | Count |
## | Expected Values |
## | Chi-square contribution |
## | Row Percent |
## | Column Percent |
## | Total Percent |
## |-------------------------|
##
## Total Observations in Table: 8420
##
## | dat1_intervalfive$intervalfive_Need
## dat1_intervalfive$intervalfive_medinc09 | 1 | 2 | 3 | 4 | 5 | Row Total |
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 432 | 401 | 342 | 322 | 187 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 26.909 | 12.238 | 0.080 | 0.650 | 66.627 | |
## | 25.653% | 23.812% | 20.309% | 19.121% | 11.105% | 20.000% |
## | 25.653% | 23.812% | 20.309% | 19.121% | 11.105% | |
## | 5.131% | 4.762% | 4.062% | 3.824% | 2.221% | |
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 2 | 351 | 362 | 365 | 346 | 260 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 0.599 | 1.886 | 2.361 | 0.251 | 17.513 | |
## | 20.843% | 21.496% | 21.675% | 20.546% | 15.439% | 20.000% |
## | 20.843% | 21.496% | 21.675% | 20.546% | 15.439% | |
## | 4.169% | 4.299% | 4.335% | 4.109% | 3.088% | |
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 3 | 361 | 325 | 344 | 318 | 336 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 1.739 | 0.413 | 0.154 | 1.049 | 0.002 | |
## | 21.437% | 19.299% | 20.428% | 18.884% | 19.952% | 20.000% |
## | 21.437% | 19.299% | 20.428% | 18.884% | 19.952% | |
## | 4.287% | 3.860% | 4.086% | 3.777% | 3.990% | |
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 4 | 300 | 340 | 331 | 327 | 386 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 4.021 | 0.030 | 0.100 | 0.285 | 7.187 | |
## | 17.815% | 20.190% | 19.656% | 19.418% | 22.922% | 20.000% |
## | 17.815% | 20.190% | 19.656% | 19.418% | 22.922% | |
## | 3.563% | 4.038% | 3.931% | 3.884% | 4.584% | |
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 5 | 240 | 256 | 302 | 371 | 515 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 27.821 | 19.384 | 3.596 | 3.473 | 94.285 | |
## | 14.252% | 15.202% | 17.933% | 22.031% | 30.582% | 20.000% |
## | 14.252% | 15.202% | 17.933% | 22.031% | 30.582% | |
## | 2.850% | 3.040% | 3.587% | 4.406% | 6.116% | |
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 1684 | 1684 | 1684 | 1684 | 1684 | 8420 |
## | 20.000% | 20.000% | 20.000% | 20.000% | 20.000% | |
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 292.6544 d.f. = 16 p = 0.00000000000000000000000000000000000000000000000000008450559
##
##
##
## Minimum expected frequency: 336.8
Population Density in 2010
CrossTable(dat1_intervalfive$intervalfive_pop10, dat1_intervalfive$intervalfive_Need, fisher = FALSE, chisq = TRUE,
expected = TRUE, sresid = FALSE, format="SPSS")
##
## Cell Contents
## |-------------------------|
## | Count |
## | Expected Values |
## | Chi-square contribution |
## | Row Percent |
## | Column Percent |
## | Total Percent |
## |-------------------------|
##
## Total Observations in Table: 8420
##
## | dat1_intervalfive$intervalfive_Need
## dat1_intervalfive$intervalfive_pop10 | 1 | 2 | 3 | 4 | 5 | Row Total |
## -------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 566 | 412 | 369 | 245 | 92 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 155.976 | 16.790 | 3.079 | 25.021 | 177.931 | |
## | 33.610% | 24.466% | 21.912% | 14.549% | 5.463% | 20.000% |
## | 33.610% | 24.466% | 21.912% | 14.549% | 5.463% | |
## | 6.722% | 4.893% | 4.382% | 2.910% | 1.093% | |
## -------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 2 | 414 | 421 | 354 | 319 | 176 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 17.695 | 21.050 | 0.878 | 0.941 | 76.771 | |
## | 24.584% | 25.000% | 21.021% | 18.943% | 10.451% | 20.000% |
## | 24.584% | 25.000% | 21.021% | 18.943% | 10.451% | |
## | 4.917% | 5.000% | 4.204% | 3.789% | 2.090% | |
## -------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 3 | 281 | 369 | 366 | 387 | 281 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 9.245 | 3.079 | 2.532 | 7.482 | 9.245 | |
## | 16.686% | 21.912% | 21.734% | 22.981% | 16.686% | 20.000% |
## | 16.686% | 21.912% | 21.734% | 22.981% | 16.686% | |
## | 3.337% | 4.382% | 4.347% | 4.596% | 3.337% | |
## -------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 4 | 221 | 274 | 314 | 377 | 498 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 39.815 | 11.710 | 1.543 | 4.798 | 77.154 | |
## | 13.124% | 16.271% | 18.646% | 22.387% | 29.572% | 20.000% |
## | 13.124% | 16.271% | 18.646% | 22.387% | 29.572% | |
## | 2.625% | 3.254% | 3.729% | 4.477% | 5.914% | |
## -------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 5 | 202 | 208 | 281 | 356 | 637 | 1684 |
## | 336.800 | 336.800 | 336.800 | 336.800 | 336.800 | |
## | 53.952 | 49.256 | 9.245 | 1.095 | 267.577 | |
## | 11.995% | 12.352% | 16.686% | 21.140% | 37.827% | 20.000% |
## | 11.995% | 12.352% | 16.686% | 21.140% | 37.827% | |
## | 2.399% | 2.470% | 3.337% | 4.228% | 7.565% | |
## -------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 1684 | 1684 | 1684 | 1684 | 1684 | 8420 |
## | 20.000% | 20.000% | 20.000% | 20.000% | 20.000% | |
## -------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 1043.86 d.f. = 16 p = 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000004522926
##
##
##
## Minimum expected frequency: 336.8
2010 Population and Official Need
Low degree of correlation @ 0.18
##Correlation between 2010population and official need
cor.test(dat1$POP10, dat1$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat1$POP10 and dat1$TOTAL_OFFICIAL_NEED
## t = 16.355, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1547099 0.1961150
## sample estimates:
## cor
## 0.1754901
## small correlation @ 0.18
##scatter plot##
ggplot(dat1, aes(x = POP10,
y = TOTAL_OFFICIAL_NEED)) +
geom_point(size=2, shape=20, color="orchid") +
theme(legend.position = "bottom") +
ggtitle("2010 Population x Total Official Need") +
geom_smooth(method=lm) +
xlab("Total Population (2010)") + ylab("Total Official Need")
##Degree of correlation:
###Perfect: If value is near ? 1, then it said to be a perfect correlation: as one variable increases, the other variable tends to also increase (if positive) or decrease (if negative).
###High degree: value between ? 0.50 and ? 1
###Moderate degree: value between ? 0.30 and ? 0.49
###Low degree: value below + .29
###No correlation: value is 0.
2010 Population Density and Official Need
Moderate degree of correlation @ 0.43
cor.test(dat1$POP_DENSITY_10, dat1$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat1$POP_DENSITY_10 and dat1$TOTAL_OFFICIAL_NEED
## t = 43.501, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4108153 0.4456975
## sample estimates:
## cor
## 0.428416
ggplot(dat1, aes(x = POP_DENSITY_10,
y = TOTAL_OFFICIAL_NEED)) +
geom_point(size=2, shape=20, color="orchid") +
theme(legend.position = "bottom") +
ggtitle("2010 Population Density x Total Official Need") +
geom_smooth(method=lm) +
xlab("Population Density (2010)") + ylab("Total Official Need")
##
Median Income and Official Need
Little to no correlation @ 0.05
cor.test(dat1$MEDINC09, dat1$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat1$MEDINC09 and dat1$TOTAL_OFFICIAL_NEED
## t = 4.8097, df = 8418, p-value = 0.000001538
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03102467 0.07362785
## sample estimates:
## cor
## 0.05235008
ggplot(dat1, aes(x = MEDINC09,
y = TOTAL_OFFICIAL_NEED)) +
geom_point(size=2, shape=20, color="orchid") +
theme(legend.position = "bottom") +
ggtitle("Median Income x Total Official Need") +
geom_smooth(method=lm) +
xlab("Median Income (2009)") + ylab("Total Official Need")
Change in Residential Collection and Official Need
Low degree of correlation @ 0.22
cor.test(dat1$CHANGE_RES_REC_COLLCTN, dat1$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat1$CHANGE_RES_REC_COLLCTN and dat1$TOTAL_OFFICIAL_NEED
## t = 20.412, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1967184 0.2374248
## sample estimates:
## cor
## 0.217166
ggplot(dat1, aes(x = CHANGE_RES_REC_COLLCTN,
y = TOTAL_OFFICIAL_NEED)) +
geom_point(size=2, shape=20, color="orchid") +
theme(legend.position = "bottom") +
ggtitle("Change in Residential Collection x Total Official Need") +
geom_smooth(method=lm) +
xlab("Change in Res Collection") + ylab("Total Official Need")
Change in Residential Collection and Median Income
Little to no correlation @ 0.064
cor.test(dat1$MEDINC09, dat1$CHANGE_RES_REC_COLLCTN, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat1$MEDINC09 and dat1$CHANGE_RES_REC_COLLCTN
## t = 5.8425, df = 8418, p-value = 0.000000005333
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.04224740 0.08479516
## sample estimates:
## cor
## 0.06355016
ggplot(dat1, aes(x = MEDINC09,
y = CHANGE_RES_REC_COLLCTN)) +
geom_point(size=2, shape=20, color="orchid") +
theme(legend.position = "bottom") +
ggtitle("Change in Residential Collection x Median Income") +
geom_smooth(method=lm) +
xlab("Median Income (2009)") + ylab("Change in Res Collection")
library(tidyverse)
library(tidycensus)
library(ggplot2)
library(dplyr)
library(data.table)
require("ggrepel")
library(hrbrthemes)
library(viridis)
library(ggridges)
library(Hmisc)
library(stargazer)
options(scipen=999)
dat <- read.csv("cwns2012.csv", header = TRUE)
dat2 <- dat%>%
mutate(LOG_NEED = log(TOTAL_OFFICIAL_NEED))%>%
mutate(RES_BURDEN = TOTAL_OFFICIAL_NEED/POP10)%>%
mutate(LOG_RES_BURDEN = log(RES_BURDEN))%>%
mutate(LOG_MEDINC09 = log(MEDINC09))%>%
mutate(MEDINC99_INFL = MEDINC99*1.29)%>%
mutate(LOG_MEDINC99_INFL = log(MEDINC99_INFL))%>%
mutate(LOG_POP10 = log(POP10))%>%
mutate(LOG_POP00 = log(POP00))%>%
mutate(ALAND_SQMILE = ALAND/2.59e+6)%>%
mutate(POP_DENSITY_10 = POP10/ALAND_SQMILE)%>%
mutate(LOG_POP_DENSITY_10 = log(POP_DENSITY_10))%>%
mutate(LOG_RES_REC_COLLCTN = log(PRES_RES_REC_COLLCTN))%>%
mutate(LOG_RES_REC_TRMT = log(PRES_RES_REC_TRMT))
Box Plots
dat2_focus_states <- filter(dat2, STATE %in% c("PA", "NJ", "NY", "DE"))
#Boxplots
ggplot(dat2_focus_states,
aes(x=STATE, y=TOTAL_OFFICIAL_NEED)) +
geom_boxplot(fill="seagreen3", alpha=0.2) +
geom_jitter(color="gray50", size=0.4, alpha=0.9) +
ggtitle("Water Infrastructure Need: Delaware River Watershed") +
xlab("STATE") + ylab("Official Need")
Box Plot with Log Transformed Variables
ggplot(dat2_focus_states,
aes(x=STATE, y=LOG_NEED)) +
geom_boxplot(fill="seagreen3", alpha=0.2) +
geom_jitter(color="gray50", size=0.4, alpha=0.9) +
ggtitle("Water Infrastructure Need: Delaware River Watershed") +
xlab("STATE") + ylab("Official Need (Log Adjusted)")
Density Plots
ggplot(dat2_focus_states,
aes(x=RES_BURDEN)) +
geom_density(fill="seagreen3", color="seagreen1", alpha=0.7) +
ggtitle("Residential Burden: Delaware River Watershed") +
xlab("Residential Burden") + ylab("Density")
Density plots with Log Transformed Variables
#Density plots with Log Transformed Variables
ggplot(dat2_focus_states,
aes(x=LOG_RES_BURDEN)) +
geom_density(fill="seagreen3", color="seagreen1", alpha=0.7) +
ggtitle("Residential Burden: Delaware River Watershed") +
xlab("Residential Burden (Log Adjusted)") + ylab("Density")
Density Plots
dat2_PA <- filter(dat2, STATEFP == 42)
#Density plots
ggplot(dat2_PA,
aes(x=MEDINC09)) +
geom_density(fill="darkgoldenrod1", color="darkgoldenrod2", alpha=0.7) +
ggtitle("Median Income Distribution: Pennsylvania Counties") +
xlab("Median Income (2009)") + ylab("Density")
Density Plots with Log Transformed Variables
#Density plots with Log Transformed Variables
ggplot(dat2_PA,
aes(x=LOG_MEDINC09)) +
geom_density(fill="darkgoldenrod1", color="darkgoldenrod2", alpha=0.7) +
ggtitle("Median Income Distribution: Pennsylvania Counties") +
xlab("Median Income (2009, Log Adjusted)") + ylab("Density")
Bar Plots
Total Need
#Barplots: Total Need
dat2%>%
filter(COUNTYFP == 101, STATEFP == 42)%>%
ggplot(aes(y=TOTAL_OFFICIAL_NEED, x=CWNS_NUMBER)) +
geom_bar(position="dodge", stat="identity", color="coral1", fill="coral1", alpha=0.6) +
ggtitle("Total Official Need: Philadelphia Facilities") +
xlab("CWNS Facility") + ylab("Total Need")
Residential Burden
Log adjustment not required as variables are similar in value.
#Barplots: Res Burden
dat2%>%
filter(COUNTYFP == 101, STATEFP == 42)%>%
ggplot(aes(y=RES_BURDEN, x=CWNS_NUMBER)) +
geom_bar(position="dodge", stat="identity", color="coral1", fill="coral1", alpha=0.6) +
ggtitle("Residential Burden: Philadelphia Facilities") +
xlab("CWNS Facility") + ylab("Residential Burden")
##log adjustment not required as variables are similar in value
Total Population vs. Official Need
cor = 0.18, small correlation
cor.test(dat2$POP10, dat2$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat2$POP10 and dat2$TOTAL_OFFICIAL_NEED
## t = 16.355, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1547099 0.1961150
## sample estimates:
## cor
## 0.1754901
#cor = 0.18, small correlation
Population Density vs. Official Need
cor = 0.43, moderate correlation
cor.test(dat2$POP_DENSITY_10, dat2$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat2$POP_DENSITY_10 and dat2$TOTAL_OFFICIAL_NEED
## t = 43.501, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4108153 0.4456975
## sample estimates:
## cor
## 0.428416
#cor=0.43, variables exhibit a medium correlation
Median Income vs. Official Need
cor = 0.052, little to no correlation
cor.test(dat2$MEDINC09, dat2$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat2$MEDINC09 and dat2$TOTAL_OFFICIAL_NEED
## t = 4.8097, df = 8418, p-value = 0.000001538
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03102467 0.07362785
## sample estimates:
## cor
## 0.05235008
#cor 0.052, little to no correlation
Residential Burden vs. Official Need
cor = 0.33, moderate correlation
cor.test(dat2$RES_BURDEN, dat2$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat2$RES_BURDEN and dat2$TOTAL_OFFICIAL_NEED
## t = 32.449, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3143058 0.3522786
## sample estimates:
## cor
## 0.3334274
#cor 0.33, medium correlation
Present Residential Population Receiving Collection vs. Official Need
cor = 0.52, high correlation
cor.test(dat$PRES_RES_REC_COLLCTN, dat$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat$PRES_RES_REC_COLLCTN and dat$TOTAL_OFFICIAL_NEED
## t = 57.264, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5139253 0.5446732
## sample estimates:
## cor
## 0.5294731
#cor 0.52, high correlation
Present Residential Pop Receiving Treatment vs. Official Need
cor = 0.53, high correlation
#Need vs. Present Residential Pop Receiving Treatment
cor.test(dat$PRES_RES_REC_TRMT, dat$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat$PRES_RES_REC_TRMT and dat$TOTAL_OFFICIAL_NEED
## t = 57.507, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5155744 0.5462493
## sample estimates:
## cor
## 0.5310858
#cor 0.53, high correlation
Present Non-Residential Pop Receiving Collection vs. Official Need
cor = 0.44, moderate correlation
#Need vs. Present Non-Residential Pop Receiving Collection
cor.test(dat$PRES_N_RES_REC_COLLCTN, dat$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat$PRES_N_RES_REC_COLLCTN and dat$TOTAL_OFFICIAL_NEED
## t = 45.619, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4279247 0.4621801
## sample estimates:
## cor
## 0.4452153
#cor 0.44, medium correlation
Present Non-Residential Pop Receiving Treatment vs. Official Need
cor = 0.43, moderate correlation
#Need vs. Present Non-Residential Pop Receiving Treatment
cor.test(dat$PRES_N_RES_REC_TRTM, dat$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat$PRES_N_RES_REC_TRTM and dat$TOTAL_OFFICIAL_NEED
## t = 43.771, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4130163 0.4478193
## sample estimates:
## cor
## 0.4305779
#cor 0.43, medium correlation
Land Area vs. Official Need
cor = 0.003, almost no correlation
#Need vs. Land Area
cor.test(dat1$ALAND_SQMILE, dat1$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat1$ALAND_SQMILE and dat1$TOTAL_OFFICIAL_NEED
## t = 0.36351, df = 8418, p-value = 0.7162
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01739962 0.02531992
## sample estimates:
## cor
## 0.003961961
#cor 0.003, almost no correlation
Water Area vs. Official Need
cor = 0.02, almost no correlation
#Need vs. Water Area
cor.test(dat$AWATER, dat$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat$AWATER and dat$TOTAL_OFFICIAL_NEED
## t = 2.2582, df = 8418, p-value = 0.02396
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.003247044 0.045941400
## sample estimates:
## cor
## 0.02460544
#cor = 0.02, almost no correlation
Variables with Significant Correlation
– Population Density
– Residential Burden
– Present Residential Pop Receiving Collection/Treatment
– Present Non-Residential Pop Receiving Collection/Treatment
Based on the correlation tests, the highest correlation is exhibited between Official Need and Present Residential Pop Receiving Treatment. We chose to use the median value because the large range of observations within the data set can disproportionately skew the mean value.
###Construct dummy groups to classify facilities below median values
median_trmt <- median(dat1$PRES_RES_REC_TRMT)
##Growing or Shrinking Cities
###Based on pop density change mean value
mean_popdensity <- mean(dat1$CHANGE_POP_DENSITY)
##CSS
###Whether or not it's part of a CSS facility
##already a binary!
dat1$DUMMY_TRMT <- ifelse(dat1$PRES_RES_REC_TRMT >= 2759.5, 1, 0)
dat1$DUMMY_POPDENSITY <- ifelse(dat1$CHANGE_POP_DENSITY >= 24.43, 1, 0)
DT::datatable(
dat1,
height = 200,
options = list(scrollX = TRUE)
)
From our correlation tests, we selected the following variables which exhibited a significant correlation with total need:
Population Density
Residential Burden
Present Residential Pop Receiving Treatment
Regression Plot
mod.pop.density <- lm(TOTAL_OFFICIAL_NEED ~ POP_DENSITY_10, data = dat1)
summary(mod.pop.density)
##
## Call:
## lm(formula = TOTAL_OFFICIAL_NEED ~ POP_DENSITY_10, data = dat1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1722672791 -10797745 -6312025 -3289469 4552683118
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5993102.0 1371913.1 4.368 0.0000127 ***
## POP_DENSITY_10 29325.2 674.1 43.501 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 121500000 on 8418 degrees of freedom
## Multiple R-squared: 0.1835, Adjusted R-squared: 0.1834
## F-statistic: 1892 on 1 and 8418 DF, p-value: < 0.00000000000000022
#R2 = 0.184; Adjusted R2 = 0.183
##p values??
#Regression Plot
ggplot(dat1, aes(POP_DENSITY_10, TOTAL_OFFICIAL_NEED)) +
geom_point(size=3, shape=20, color="orchid") +
geom_smooth(method='lm', se=FALSE, color="grey40") +
ggtitle("Regression: Need and Population Density") +
xlab("Population Density") + ylab("Total Need")
Residual Plot
Funnel shaped plot indicates heteroscedasticity, which implies a non-constant variation in error terms, i.e. error terms increase as fitted value increases.
##Residual Plots
summary(mod.pop.density$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1722672791 -10797745 -6312025 0 -3289469 4552683118
plot(fitted(mod.pop.density), mod.pop.density$residuals,
main = "Fitted Model Against Residuals: Population Density",
xlab = "Fitted Model", ylab = "Residuals",
col = "orchid", pch = 16)
abline(0,0, col = "grey40")
Histogram of Regressed Population Density and Official Need
Residuals are mostly clustered around mean with a skew towards the left, evident from large negative values of min, 1st quartile, and median. Using log adjusted values can help cluster data closer to mean.
hist(mod.pop.density$residuals, breaks = 10,
main = "Regression Residual Spread: Population Density",
xlab = "Residuals",
col = "orchid", border = "orchid3")
#mod_pop_density_predict <- predict(mod.pop.density)
#DT::datatable(
# mod_pop_density_predict,
# height = 100,
# options = list(scrollX = TRUE)
#)
Log Official Need vs. Log Population Density
Removing infinite values from “log adjusted need” column.
Low R squared, as seen from large spread of data about the mode means that the model does not sufficiently account for variance.
##remove infinite values from "log adjusted need" column
dat2_reg <- dat2%>%
filter(LOG_NEED > -Inf)
mod.log.pop.density <- lm(LOG_NEED ~ LOG_POP_DENSITY_10, data = dat2_reg)
summary(mod.log.pop.density)
##
## Call:
## lm(formula = LOG_NEED ~ LOG_POP_DENSITY_10, data = dat2_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.8064 -1.1451 -0.0249 1.0973 6.1504
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.84607 0.05523 232.58 <0.0000000000000002 ***
## LOG_POP_DENSITY_10 0.40626 0.01085 37.45 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.69 on 8261 degrees of freedom
## Multiple R-squared: 0.1451, Adjusted R-squared: 0.145
## F-statistic: 1402 on 1 and 8261 DF, p-value: < 0.00000000000000022
ggplot(dat2_reg, aes(LOG_POP_DENSITY_10, LOG_NEED)) +
geom_point(size=3, shape=20, color="orchid") +
geom_smooth(method='lm', se=FALSE, color="grey40") +
ggtitle("Regression: Need and Population Density") +
xlab("Population Density (Log Adjusted)") + ylab("Total Need (Log Adjusted)")
#R2 = 0.1451; Adjusted R2 = 0.145
##low R squared, as seen from large spread of data about the model
##means that the model does not sufficiently account for variance
Residual Plot of Log Population Density
Much better spread, residuals clustered closer to the mean in a more homoscedastic plot, closer to normal distribution. However, variance still increases as model values increase.
summary(mod.log.pop.density$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -7.80641 -1.14511 -0.02493 0.00000 1.09732 6.15044
#better, less negative skew from mean
plot(fitted(mod.log.pop.density), mod.log.pop.density$residuals,
main = "Fitted Model Against Residuals: Log Adj. Population Density",
xlab = "Fitted Model", ylab = "Residuals",
col = "orchid", pch = 16)
abline(0,0, col = "grey40")
hist(mod.log.pop.density$residuals, breaks = 10,
main = "Regression Residual Spread: Population Density",
xlab = "Residuals",
col = "orchid", border = "orchid3")
##much better spread, residuals clustered closer to the mean in a more homoscedastic plot, closer to normal distribution
##however, variance still increases as model values increase
Large range across mean values seen in summary of residual plot.
mod.res.burden <- lm(TOTAL_OFFICIAL_NEED ~ RES_BURDEN, data = dat2)
summary(mod.res.burden)
##
## Call:
## lm(formula = TOTAL_OFFICIAL_NEED ~ RES_BURDEN, data = dat2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1966254161 -12024541 -8159307 -6205059 4926111573
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7884945 1444881 5.457 0.0000000498 ***
## RES_BURDEN 97748 3012 32.449 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 126800000 on 8418 degrees of freedom
## Multiple R-squared: 0.1112, Adjusted R-squared: 0.1111
## F-statistic: 1053 on 1 and 8418 DF, p-value: < 0.00000000000000022
#R2 = 0.11; Adjusted R2 = 0.11
##low R squared
##p values??
ggplot(dat2, aes(RES_BURDEN, TOTAL_OFFICIAL_NEED)) +
geom_point(size=3, shape=20, color="orchid") +
geom_smooth(method='lm', se=FALSE, color="grey40") +
ggtitle("Regression: Need and Residential Burden") +
xlab("Residential Burden") + ylab("Total Need")
#Residual Plots
summary(mod.res.burden$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1966254161 -12024541 -8159307 0 -6205059 4926111573
##large range across mean values
Histogram of Residuals
Large range but maximum values clustered closer to mean, skewed toward the left of mean value.
hist(mod.res.burden$residuals, breaks = 10,
main = "Regression Residual Spread: Residential Burden",
xlab = "Residuals",
col = "orchid", border = "orchid3")
##large range but maximum values clustered closer to mean, skewed toward the left of mean value
Heteroscedastic plot, exhibits high non-covariance as fitted values increase.
plot(fitted(mod.res.burden), mod.res.burden$residuals,
main = "Fitted Model Against Residuals: Residential Burden",
xlab = "Fitted Model", ylab = "Residuals",
col = "orchid", pch = 16)
abline(0,0, col = "grey40")
##heteroscedastic plot, exhibits high non-covariance as fitted values increase
Range far from mean on both sides.
mod.res.trmt <- lm(TOTAL_OFFICIAL_NEED ~ PRES_RES_REC_TRMT, data = dat)
summary(mod.res.trmt)
##
## Call:
## lm(formula = TOTAL_OFFICIAL_NEED ~ PRES_RES_REC_TRMT, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1981362007 -7599612 -5839460 -3348041 4666003598
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6109417.49 1270553.40 4.808 0.00000155 ***
## PRES_RES_REC_TRMT 670.21 11.65 57.507 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 113900000 on 8418 degrees of freedom
## Multiple R-squared: 0.2821, Adjusted R-squared: 0.282
## F-statistic: 3307 on 1 and 8418 DF, p-value: < 0.00000000000000022
#R2 = 0.282; Adjusted R2 = 0.282
ggplot(dat, aes(PRES_RES_REC_TRMT, TOTAL_OFFICIAL_NEED)) +
geom_point(size=3, shape=20, color="orchid") +
geom_smooth(method='lm', se=FALSE, color="grey40") +
ggtitle("Regression: Need and Residential Treatment") +
xlab("Residential Treatment") + ylab("Total Need")
#Residual Plots
summary(mod.res.trmt$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1981362007 -7599612 -5839460 0 -3348041 4666003598
#range far from mean on both sides
Large range but maximum values clustered closer to mean.
hist(mod.res.trmt$residuals, breaks = 10,
main = "Regression Residual Spread: Residential Treatment",
xlab = "Residuals",
col = "orchid", border = "orchid3")
##large range but maximum values clustered closer to mean
Heteroscedastic plot, high non-variance.
plot(fitted(mod.res.trmt), mod.res.trmt$residuals,
main = "Fitted Model Against Residuals: Residential Collection",
xlab = "Fitted Model", ylab = "Residuals",
col = "orchid", pch = 16)
abline(0,0, col = "grey40")
##heteroscedastic plot, high non-variance
Log Need vs. Log Residential Treatment
Removed infinite values. Model still does not explain variance very well, but is much better than non-log-adjusted variables.
#LOG Need vs. LOG Residential Treatment
##remove infinite values
dat2_reg <- dat2%>%
filter(LOG_NEED > -Inf)%>%
filter(LOG_RES_REC_TRMT > -Inf)
mod.log.res.trmt <- lm(LOG_NEED ~ LOG_RES_REC_TRMT, data = dat2_reg)
summary(mod.log.res.trmt)
##
## Call:
## lm(formula = LOG_NEED ~ LOG_RES_REC_TRMT, data = dat2_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.4591 -0.8723 0.0485 0.9241 9.5216
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.610620 0.068825 139.64 <0.0000000000000002 ***
## LOG_RES_REC_TRMT 0.640192 0.008284 77.28 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.391 on 8240 degrees of freedom
## Multiple R-squared: 0.4202, Adjusted R-squared: 0.4201
## F-statistic: 5972 on 1 and 8240 DF, p-value: < 0.00000000000000022
#R2 = 0.42; Adjusted R2 = 0.42
##model still does not explain variance very well, but is much better than non-log-adjusted variables
##p-value?
Residual Plots
Less spread of range across mean.
#Residual Plots
summary(mod.log.res.trmt$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -8.45911 -0.87230 0.04848 0.00000 0.92411 9.52156
##less spread of range across mean
Tighter clustering around mean, almost exhibiting a normal distribution.
hist(mod.log.res.trmt$residuals, breaks = 10,
main = "Regression Residual Spread: Log Adj. Residential Treatment",
xlab = "Residuals",
col = "orchid", border = "orchid3")
##tighter clustering around mean, almost exhibiting a normal distribution
Error terms definitely exhibit more homoscedasticity, with more variance towards the left of mean. However, terms are still spread along a wide range from the fitted model.
plot(fitted(mod.log.res.trmt), mod.log.res.trmt$residuals,
main = "Fitted Model Against Residuals: Log Adj. Residential Treatment",
xlab = "Fitted Model", ylab = "Residuals",
col = "orchid", pch = 16)
abline(0,0, col = "grey40")
##error terms definitely exhibit more homoscedasticity, with more variance towards the left of mean
##however, terms are still spread along a wide range from the fitted model
#predict(mod.log.res.trmt)
#predict(mod.log.res.trmt, int = "c")
#predict(mod.log.res.trmt, int = "p")
We used stargazer to compare our regression models and classify statistical significance. The interpretation is as follows:
– Unit increase in pop density would result in an increase of $29,325.19 in total need.
– Unit increase in residential burden would result in an increase of $97,748.53 in total need.
– Unit increase in residential population receiving collection would result in an increase of $670.21 in total need.
– R-squared and Adjusted R-squared values are low for all, implying that the model does not explain variance very well. However, considering these regression models use real-world data sets capturing figures across a national scale which naturally contain large variances, values of R2 around 0.3 are usually considered good
– P-values of all coefficients are less than 0.001, implying that the model reports contains high statistical significance (99.99% confidence)
##compare regression results through a comparative stargazer table
stargazer(mod.pop.density, mod.res.burden, mod.res.trmt, type = "text", star.cutoffs = c(0.05, 0.01, 0.001))
##
## ==================================================================================
## Dependent variable:
## --------------------------------------------------
## TOTAL_OFFICIAL_NEED
## (1) (2) (3)
## ----------------------------------------------------------------------------------
## POP_DENSITY_10 29,325.190***
## (674.121)
##
## RES_BURDEN 97,748.530***
## (3,012.402)
##
## PRES_RES_REC_TRMT 670.210***
## (11.654)
##
## Constant 5,993,102.000*** 7,884,945.000*** 6,109,417.000***
## (1,371,913.000) (1,444,881.000) (1,270,553.000)
##
## ----------------------------------------------------------------------------------
## Observations 8,420 8,420 8,420
## R2 0.184 0.111 0.282
## Adjusted R2 0.183 0.111 0.282
## Residual Std. Error (df = 8418) 121,493,463.000 126,763,417.000 113,928,392.000
## F Statistic (df = 1; 8418) 1,892.368*** 1,052.918*** 3,307.086***
## ==================================================================================
## Note: *p<0.05; **p<0.01; ***p<0.001
##unit increase in pop density would be accompanied by an increase of $29,325.19 in total need
##unit increase in residential burden --> increase of $97,748.53 in total need
##unit increase in residential population receiving collection would result in an increase of $670.21 in total need
We also compared regression models which used log-transformed data:
– R-squared values are lower for “log of population density”, but much higher for “log of population receiving treatment”, and residual plots (above) exhibit a better fit about the mean. This showed us that while log adjustment proves useful for clustering data with high min/max values, it doesn’t necessarily affect the predictive power of the regression model.
– P-values of all coefficients are less than 0.001, implying that the model contains a high statistical significance (99.99% confidence).
stargazer(mod.log.pop.density, mod.log.res.trmt, type = "text", star.cutoffs = c(0.05, 0.01, 0.001))
##
## ===========================================================================
## Dependent variable:
## -------------------------------------------------------
## LOG_NEED
## (1) (2)
## ---------------------------------------------------------------------------
## LOG_POP_DENSITY_10 0.406***
## (0.011)
##
## LOG_RES_REC_TRMT 0.640***
## (0.008)
##
## Constant 12.846*** 9.611***
## (0.055) (0.069)
##
## ---------------------------------------------------------------------------
## Observations 8,263 8,242
## R2 0.145 0.420
## Adjusted R2 0.145 0.420
## Residual Std. Error 1.690 (df = 8261) 1.391 (df = 8240)
## F Statistic 1,402.444*** (df = 1; 8261) 5,972.063*** (df = 1; 8240)
## ===========================================================================
## Note: *p<0.05; **p<0.01; ***p<0.001
##R-squared values are not necessarily better, but residual plots (above) do exhibit a better fit about the mean
##as such, log adjustment proves useful for clustering data with high min/max values, it doesn't necessarily affect the predictive power of the regression model
Need vs. Treatment; CSS Presence Dummies
#Need vs. Treatment; CSS Presence Dummies
##THIS IS IMP!!!
dat_dummy_css0 <- filter(dat1, CSS == 0)
dat_dummy_css1 <- filter(dat1, CSS == 1)
cor = 0.60, high correlation
cor.test(dat_dummy_css0$PRES_RES_REC_TRMT, dat_dummy_css0$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat_dummy_css0$PRES_RES_REC_TRMT and dat_dummy_css0$TOTAL_OFFICIAL_NEED
## t = 66.159, df = 7722, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5870440 0.6155166
## sample estimates:
## cor
## 0.6014713
#cor = 0.60, high correlation
mod.poptrmt.css0 <- lm(TOTAL_OFFICIAL_NEED ~ PRES_RES_REC_TRMT, data = dat_dummy_css0)
summary(mod.poptrmt.css0)
##
## Call:
## lm(formula = TOTAL_OFFICIAL_NEED ~ PRES_RES_REC_TRMT, data = dat_dummy_css0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1325081109 -5209912 -4174949 -1529259 2893546542
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4729349.436 663193.452 7.131 0.00000000000109 ***
## PRES_RES_REC_TRMT 465.561 7.037 66.159 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57140000 on 7722 degrees of freedom
## Multiple R-squared: 0.3618, Adjusted R-squared: 0.3617
## F-statistic: 4377 on 1 and 7722 DF, p-value: < 0.00000000000000022
#R2 = 0.36; Adjusted R2 = 0.36
cor = 0.54, high correlation
cor.test(dat_dummy_css1$PRES_RES_REC_TRMT, dat_dummy_css1$TOTAL_OFFICIAL_NEED, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat_dummy_css1$PRES_RES_REC_TRMT and dat_dummy_css1$TOTAL_OFFICIAL_NEED
## t = 17.347, df = 694, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4959223 0.5997712
## sample estimates:
## cor
## 0.549969
#cor = 0.54, high correlation
mod.poptrmt.css1 <- lm(TOTAL_OFFICIAL_NEED ~ PRES_RES_REC_TRMT, data = dat_dummy_css1)
summary(mod.poptrmt.css1)
##
## Call:
## lm(formula = TOTAL_OFFICIAL_NEED ~ PRES_RES_REC_TRMT, data = dat_dummy_css1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1642329689 -45039862 -34361916 -26278668 4341723558
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33599689.94 13211573.89 2.543 0.0112 *
## PRES_RES_REC_TRMT 1077.82 62.13 17.347 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 326900000 on 694 degrees of freedom
## Multiple R-squared: 0.3025, Adjusted R-squared: 0.3015
## F-statistic: 300.9 on 1 and 694 DF, p-value: < 0.00000000000000022
#R2 = 0.30; Adjusted R2 = 0.30
For facilities without CSS:
– unit increase in residential population receiving treatment results in a $465.56 increase in total need
Comparatively, for facilities with CSS:
– unit increase in residential population receiving treatment results in a $1077.82 increase in total need
stargazer(mod.poptrmt.css0, mod.poptrmt.css1, type = "text", star.cutoffs = c(0.05, 0.01, 0.001))
##
## ==========================================================================
## Dependent variable:
## ------------------------------------------------------
## TOTAL_OFFICIAL_NEED
## (1) (2)
## --------------------------------------------------------------------------
## PRES_RES_REC_TRMT 465.561*** 1,077.820***
## (7.037) (62.131)
##
## Constant 4,729,349.000*** 33,599,690.000*
## (663,193.500) (13,211,574.000)
##
## --------------------------------------------------------------------------
## Observations 7,724 696
## R2 0.362 0.302
## Adjusted R2 0.362 0.301
## Residual Std. Error 57,140,327.000 (df = 7722) 326,930,190.000 (df = 694)
## F Statistic 4,377.043*** (df = 1; 7722) 300.933*** (df = 1; 694)
## ==========================================================================
## Note: *p<0.05; **p<0.01; ***p<0.001
##for facilities without CSS:
###unit increase in population density results in a $2,380 increase in total need
###unit increase in residential population receiving treatment results in a $461 increase in total need
##comparatively, for facilities with CSS:
#####unit increase in population density results in a $20,477 increase in total need
###unit increase in residential population receiving treatment results in a $833 increase in total need
This implies a 132% difference in need, per unit of increase in residential population receiving treatment, for variables with CSS facilities!
((20477 - 2380)/(2380))*100
## [1] 760.3782
##implies a +760 percent difference per unit of population density increase, for variables with CSS facilities
##!!!!!!!!!!!!!!!!
###observations are statistically significant with reasonably high R2 values
Need vs. Treatment: Mean Treatment Threshold Dummies
Too low, no point in looking much further.
dat_dummy_trmt0 <- filter(dat1, DUMMY_TRMT == 0)
dat_dummy_trmt1 <- filter(dat1, DUMMY_TRMT == 1)%>%
mutate(RES_BURDEN = TOTAL_OFFICIAL_NEED/POP10)
mod.income.dummytrmt0 <- lm(TOTAL_OFFICIAL_NEED ~ MEDINC09 + PRES_RES_REC_TRMT, data = dat_dummy_trmt0)
summary(mod.income.dummytrmt0)
##
## Call:
## lm(formula = TOTAL_OFFICIAL_NEED ~ MEDINC09 + PRES_RES_REC_TRMT,
## data = dat_dummy_trmt0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7358465 -2095107 -1148444 85561 268672717
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2056471.918 559390.560 -3.676 0.00024 ***
## MEDINC09 65.608 9.521 6.891 0.00000000000636 ***
## PRES_RES_REC_TRMT 1162.541 170.618 6.814 0.00000000001086 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8117000 on 4207 degrees of freedom
## Multiple R-squared: 0.02308, Adjusted R-squared: 0.02261
## F-statistic: 49.69 on 2 and 4207 DF, p-value: < 0.00000000000000022
#R2 = 0.023; Adjusted R2 = 0.023
##too low, no point looking much further
Create a multiple linear regression with variables exhibiting high correlations. Controlled for dummy variable of “residential population receiving treatment > median of res pop receiving treatment”
The values are reasonable, plot and look at residuals.
##create a multiple linear regression with variables exhibiting high correlations
##controlled for dummy variable of "residential population receiving treatment > of res pop receiving treatment"
mod.dummytrmt1 <- lm(TOTAL_OFFICIAL_NEED ~ POP_DENSITY_10 + RES_BURDEN + PRES_RES_REC_TRMT, data = dat_dummy_trmt1)
summary(mod.dummytrmt1)
##
## Call:
## lm(formula = TOTAL_OFFICIAL_NEED ~ POP_DENSITY_10 + RES_BURDEN +
## PRES_RES_REC_TRMT, data = dat_dummy_trmt1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1830743422 -15644256 4978917 12951099 4141397389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -19571664.88 2408965.46 -8.125 0.000000000000000585 ***
## POP_DENSITY_10 21590.08 844.40 25.569 < 0.0000000000000002 ***
## RES_BURDEN 92444.79 3606.06 25.636 < 0.0000000000000002 ***
## PRES_RES_REC_TRMT 524.15 15.28 34.307 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 140300000 on 4206 degrees of freedom
## Multiple R-squared: 0.4439, Adjusted R-squared: 0.4435
## F-statistic: 1119 on 3 and 4206 DF, p-value: < 0.00000000000000022
#R2 = 0.44; Adjusted R2 = 0.44
##reasonable values, we can plot and look at residuals
Higher spread from the mean.
ggplot(dat_dummy_trmt1, aes(PRES_RES_REC_TRMT + RES_BURDEN + POP_DENSITY_10, TOTAL_OFFICIAL_NEED)) +
geom_point(size=3, shape=20, color="orchid") +
geom_smooth(method='lm', se=FALSE, color="grey40") +
ggtitle("Multiple Regression with Dummies") +
xlab("Res Trmt + Pop Density + Res Burden") + ylab("Total Need")
summary(mod.dummytrmt1$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1830743422 -15644256 4978917 0 12951099 4141397389
##high spread from mean
Still a heteroscedastic plot. Log transforming would help.
plot(fitted(mod.dummytrmt1), mod.dummytrmt1$residuals,
main = "Fitted Model Against Residuals: Residential Trmt with Dummies",
xlab = "Fitted Model", ylab = "Residuals",
col = "orchid", pch = 16)
abline(0,0, col = "grey40")
##still a heteroscedastic plot
##log transforming would help, but we're going to sleep instead
##see you at part 2.3!
COMING SOON 🥵