#knitr::opts_chunk$set(echo=TRUE, warning = FALSE, message = FALSE, results = 'hide', include=TRUE, fig.keep = 'all')

## SET UP ##

#Setup for our analysis, including necessary packages.

setwd("~/Documents/UPenn/2022 Spring/CPLN 505 - Planning by Numbers/Assignment 2")

library(tidyverse)
library(tidycensus)
library(ggplot2)
library(dplyr)
library(data.table)
require("ggrepel")
library(hrbrthemes)
library(viridis)
library(ggridges)
library(lubridate)
library(tigris)
library(gganimate)
library(riem)
library(gridExtra)
library(knitr)
library(kableExtra)
library(mapview)
library(ggcorrplot)
library(RColorBrewer)
library(stargazer)
library(correlationfunnel)
library(tidyquant)
library(plotly)
library(shiny)
library(bslib)
library(shinydashboard)
library(DT)

options(scipen=999)

dat <- read.csv("cwns2012.csv", header = TRUE)

Memorandum

TO: Mayor Kenney

FROM: Riddhi Batra & Emily Goldstein

DATE: 3 March 2022

RE: Water Infrastructure Needs in Philadelphia

Dear Mayor Kenney,

Since former Mayor Michael Nutter’s introduction of the Greenworks Philadelphia plan in 2009, Philadelphia has been on track to perform major improvements to its waste and stormwater infrastructure with the goal of reducing the overall burden on Philly’s aging water infrastructure and saving the City critical dollars in future facility repairs and improvements. In an effort to understand the potential scale and impact of these improvements, we have performed several comparative statistical analyses across geographic regions, states, and counties, studying their relationship to Philadelphia County, and identifying critical metrics impacting Philly’s potential infrastructure need costs.1 We have outlined our key findings below and provided a more detailed and technical evaluation in our attached analysis.

Philadelphia’s water infrastructure needs are closely related to the county’s population and population density.

We identified positive relationships between Philadelphia’s population and population density and its share of infrastructure needs. This finding is intuitive and unsurprising. Philadelphia County alone accounts for almost 20% of Pennsylvania’s official needs and Philadelphia County contains the highest population density of all counties in PA (and growing– between 2000 – 2010 by 0.56%). Additionally, over 24% of the residential population receiving treatment in Pennsylvania belong to Philadelphia County. We also evaluated the relationship between the percentage of white identifying populations and median income, finding these metrics to have little to no association with infrastructure needs.

The City of Philadelphia’s total need accounts for just over 3% of the four states within the Delaware River Watershed, and its mean population density falls within the third quartile.

Within the Delaware River Watershed (DRW), a critical source for Philadelphia’s drinking water, the City of Philadelphia accounted for over 3% of the four states, including New York, New Jersey, and Delaware, also within the DRW. Like Philadelphia’s high need percentage in PA, it also has a comparatively high need relative to other related regions. While Philadelphia is dominant in its percentage share of need in PA, the state of PA itself falls near the middle range of need at number 28 when compared to mean need to all 49 states. However, in terms of median need, PA falls at number 18 within the third quartile of all states—a discrepancy which can be explained by the significant differences in min and max need values within PA that contribute towards skewing the state’s mean need.

Our analysis showed that every unit increase in residences receiving treatment is accompanied by an almost $700 increase in total need for Philadelphia.

Within the population of Philadelphia only about 62% of the residential and non-residential population is receiving water treatment. We anticipate this percentage share of the population using city water collection and treatment to continue to grow as it has over the past 10 years of our study period. As the number of Philly’s water infrastructure users increases, so will the cost burden experienced by the City.

We also recommend that particular attention be paid to upgrading the quality and capacity of CSS (Combined Sewer System) infrastructure in Philadelphia through adequate budgetary allocations and consistent quality evaluations.

The presence of a CSS facility also impacts Philadelphia’s water infrastructure requirements. Our enclosed regression analysis found that the increase in official need per unit increase in population density is 132% higher for facilities with a CSS than those without. This could be attributed to the high maintenance costs associated with Combined Sewer Systems.

A strategy taking into consideration the steady increase in the key metrics of population and population density, along with astute attention to Philadelphia’s aging CSS system, should support the City in its efforts to improve its water infrastructure and lower its operational costs.

Sincerely,

Riddhi Batra & Emily Goldstein

Future city planners aspiring to be semi-competent in R.

Part 1: Comparing Groups and Testing for Association & Correlation

Part 1.1: Contextualizing Reported Need

We started by creating a new data frame to include change and percent change for all metrics.

dat1 <- select(dat, 1:18, 21:26, 30:31, 35, 37:38, 40)%>%
  mutate(CHANGE_RES_REC_COLLCTN = PROJ_RES_REC_COLLCTN - PRES_RES_REC_COLLCTN)%>%
    mutate(PCT_CHANGE_RES_REC_COLLCTN = (CHANGE_RES_REC_COLLCTN/PRES_RES_REC_COLLCTN)*100)%>%
  mutate(CHANGE_RES_REC_TRMT = PROJ_RES_REC_TRMT - PRES_RES_REC_TRMT)%>%
    mutate(PCT_CHANGE_RES_REC_TRMT = (CHANGE_RES_REC_TRMT/PRES_RES_REC_TRMT)*100)%>%
  mutate(CHANGE_N_RES_REC_TRMT = PROJ_N_RES_REC_TRMT - PRES_N_RES_REC_TRTM)%>%
    mutate(PCT_CHANGE_N_RES_REC_TRMT = (CHANGE_N_RES_REC_TRMT/PRES_N_RES_REC_TRTM)*100)%>%
  mutate(CHANGE_N_RES_REC_COLLCTN = PROJ_N_RES_REC_COLLCTN - PRES_N_RES_REC_COLLCTN)%>%
    mutate(PCT_CHANGE_N_RES_REC_COLLCTN = (CHANGE_N_RES_REC_COLLCTN/PRES_N_RES_REC_COLLCTN)*100)%>%
  mutate(ALAND_SQMILE = ALAND/2.59e+6)%>%
  mutate(POP_DENSITY_10 = POP10/ALAND_SQMILE)%>%
  mutate(POP_DENSITY_00 = POP00/ALAND_SQMILE)%>%
  mutate(CHANGE_POP_DENSITY = POP_DENSITY_10 - POP_DENSITY_00)%>%
    mutate(PCT_CHANGE_POP_DENSITY = (CHANGE_POP_DENSITY/POP_DENSITY_00)*100)%>%
  mutate(MEDINC99_INFL = MEDINC99*1.29)%>%
  mutate(CHANGE_MEDINC = MEDINC09 - MEDINC99_INFL)%>%
    mutate(PCT_CHANGE_MEDINC = (CHANGE_MEDINC/MEDINC99_INFL)*100)

Part 1.2: Summary Statistics

National Statistics for USA

Total Official Need

Total Reported Need

The facility with the highest reported need is CWNS NUMBER 36002001005, in Queens County, NY, EPA Region 2.

summary(dat1$TOTAL_OFFICIAL_NEED)

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
##          0     704599    2409822   21623215    7660571 5160108526

Top Ten Facilities with the Most Need

dat_top10 <- dat1%>%
  slice_max(TOTAL_OFFICIAL_NEED, n = 10)


DT::datatable(
  dat_top10,
  height = 150,
  options = list(scrollX = TRUE)
)

Population Density

Change in Population Density

summary(dat1$PCT_CHANGE_POP_DENSITY)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -46.605  -0.640   4.269   7.341  11.730 110.355

Density Plot: Facilities with Population Density Below Mean Value

mean_pop.dens <- mean(dat1$POP_DENSITY_10)

dat1%>%
  filter(POP_DENSITY_10 <= 532.99)%>%
  ggplot(aes(x=POP_DENSITY_10)) +
  ggtitle("Population Density Distribution") +
  xlab("Population Density (2010) less than mean values") + ylab("Density") +
  geom_density(fill="orchid", color="orchid3", alpha=0.8)

Median Income

Change in Median Income

summary(dat1$CHANGE_MEDINC)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -15488.5  -3924.0  -1614.8  -1530.3    706.9  25537.0

Percent Change in Median Income

summary(dat1$PCT_CHANGE_MEDINC)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -36.025  -6.512  -2.663  -2.447   1.244  49.526

Median Income: Plotted

ggplot(dat1, aes(x=MEDINC09)) +
geom_density(fill="orchid", color="orchid3", alpha=0.8)

Residential Collection

Change in Residential Population Receiving Collection

summary(dat1$CHANGE_RES_REC_COLLCTN)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##  -88018.0      19.0     275.5    4636.7    1800.0 1300000.0

Percent Change in Residential Population Receiving Collection

summary(dat1$PCT_CHANGE_RES_REC_COLLCTN)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -100.000    1.745   12.500      Inf   35.780      Inf       23

Non-Residential Collection

Change in Non-Residential Population Receiving Collection

summary(dat1$CHANGE_N_RES_REC_COLLCTN)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -254000.0       0.0       0.0     376.8       0.0  354330.0

Percent Change in Non-Residential Population Receiving Collection

summary(dat1$PCT_CHANGE_N_RES_REC_COLLCTN)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -100.00    0.00   23.46     Inf  100.00     Inf    7217

Residential Treatment

Change in Residential Population Receiving Treatment

summary(dat1$CHANGE_RES_REC_TRMT)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -187386      31     326    5477    2096 1300028

Percent Change in Residential Population Receiving Treatment

summary(dat1$PCT_CHANGE_RES_REC_TRMT)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -100.000    2.302   13.636      Inf   37.883      Inf       18

Non-Residential Treatment

Change in Non-Residential Population Receiving Treatment

summary(dat1$CHANGE_N_RES_REC_TRMT)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -254000.0       0.0       0.0     437.9       0.0  354330.0

Percent Change in Non-Residential Population Receiving Treatment

summary(dat1$PCT_CHANGE_N_RES_REC_TRMT)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -100.00    0.00   24.62     Inf  100.00     Inf    7102

Group by EPA Regions

EPA

Summary Stats by Need of EPA Regions

dat_epa <- group_by(dat1, EPA_REGION)%>%
  summarise(count = n(),
            mean_need = mean(TOTAL_OFFICIAL_NEED),
            median_need = median(TOTAL_OFFICIAL_NEED),
            mean_popdensity00 = mean(POP_DENSITY_00),
            mean_popdensity10 = mean(POP_DENSITY_10),
            median_popdensity00 = median(POP_DENSITY_00),
            median_popdensity10 = median(POP_DENSITY_10))

summary(dat_epa$median_need)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   946351  1896198  3022067  4276328  3977127 17740293

Summary Stats by Density of EPA Regions

summary(dat_epa$mean_popdensity00)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   105.6   179.5   411.7   568.6   596.5  2208.0

Top Ten by EPA Region

dat_top10epa <- dat1%>%
  group_by(EPA_REGION)%>%
  slice_max(TOTAL_OFFICIAL_NEED, n=10)

DT::datatable(
  dat_top10epa,
  height = 100,
  options = list(scrollX = TRUE)
)

#CHUNK FOR BREAKING UP TABLES

Bottom Ten by EPA Region

dat_bottom10epa <- dat1%>%
  group_by(EPA_REGION)%>%
  slice_min(TOTAL_OFFICIAL_NEED, n=10)

DT::datatable(
  dat_bottom10epa,
  height = 100,
  options = list(scrollX = TRUE)
)

We created scatterplots to visualize relationships between population density and official need for different groupings of the CWNS data set.

Of the 10 EPAs represented:

– EPA 9 has a mid-range population density with the highest official need.

– EPA 2 has the highest population density with a high official need.

– EPA 8 has the lowest population density and the lowest reported need.

– EPA 3, which contains Philadelphia, has a relatively lower population density and official need.

These findings are consistent with the presence of large urban centers in EPA 9 and EPA 2 – situated on the West and East coasts respectively, both regions contain cities and counties with high populations and water requirements – such as Los Angeles, San Franciso, New York, Queens, and Hudson.

Plot: Population Density vs Total Need, by EPA

ggplot(dat_epa, aes(x = mean_popdensity10,
                    y = mean_need)) +
  geom_point(size=5, shape=20, color="darkorchid4") +
  geom_text_repel(aes(label = rownames(dat_epa), size = 4)) +
                     theme(legend.position = "bottom") +
  ggtitle("Population Density vs. Total Need, by EPA") +
  xlab("Mean Population Density (2010)") + ylab("Mean Official Need")

Group by State

STATE

Top Ten by State

dat_state <- group_by(dat1, STATEFP, STATE)%>%
  summarise(count = n(),
            mean_need = mean(TOTAL_OFFICIAL_NEED),
            median_need = median(TOTAL_OFFICIAL_NEED),
            mean_popdensity00 = mean(POP_DENSITY_00),
            mean_popdensity10 = mean(POP_DENSITY_10),
            median_popdensity00 = median(POP_DENSITY_00),
            median_popdensity10 = median(POP_DENSITY_10))%>%
  filter(count > 2)

dat_top10state <- dat1%>%
  group_by(STATEFP)%>%
  slice_max(TOTAL_OFFICIAL_NEED, n = 10)

DT::datatable(
  dat_top10state,
  height = 100,
  options = list(scrollX = TRUE)
)

#CHUNK FOR BREAKING UP TABLES

Bottom Ten by State

dat_bottom10state <- dat1%>%
  group_by(STATEFP)%>%
  slice_min(TOTAL_OFFICIAL_NEED, n = 10)

DT::datatable(
  dat_bottom10state,
  height = 100,
  options = list(scrollX = TRUE)
)

When grouped by “State”, we can observe population versus official need in a manner consistent with our EPA grouping, in addition to an unexpected cameo:

– California, falling under EPA 9, has mid-range population density with a high need.

– New Jersey has the highest population density with a mid-range need, and New York has a high population density with a high need; both belonging to EPA 2.

– Pennsylvania has a mid-range population density with a low official need.

Surprisingly, the highest need of all states is shouldered by Hawaii, despite a low population density. This could be attributed to the fact that Hawaii’s current water infrastructure is outdated¹ and requires extra attention due to its geographic susceptibility to climate change. ²

Population Density vs. Total Need by State

ggplot(dat_state, aes(x = mean_popdensity10,
                    y = mean_need)) +
  geom_point(size=4, shape=20, color="springgreen4") +
  geom_text_repel(aes(label = STATE, size = 2)) +
  theme(legend.position = "bottom") +
  ggtitle("Population Density vs. Total Need, by State") +
  xlab("Mean Population Density (2010)") + ylab("Mean Official Need")

Focus States: NY, NJ, PA, DE

In order to begin contextualizing Philadelphia’s water needs, we decided to focus our statistical analysis on the four states falling under the Delaware River Watershed (12,800 sq miles). While Philadelphia’s water supply is split between two sources – the Schyulkill and Delaware rivers – we’ve chosen to compare infrastructure and resources to its northeastern neighbors, in particular the states of New York, New Jersey, and Delaware. As they are geographically related and rely on the Delaware River Watershed for much of their potable water needs, these states are a logical point of reference. Within the EPA’s regional, large aquatic ecosystems (LAE) classification system, these states fall within regions 2 (NY, NJ) and 3 (PA, DE).

dat_PA <- filter(dat1, STATEFP == 42)
dat_NJ <- filter(dat1, STATEFP == 34)
dat_NY <- filter(dat1, STATEFP == 9)
dat_DE <- filter(dat1, STATEFP == 10)

dat_focus_states <- filter(dat1, STATE %in% c("PA", "NJ", "NY", "DE"))

Summary Statistics

Of the four states sharing the Delaware River Watershed:

– The highest need is for the state of New York, which also has a relatively high population density.

– Delaware has the lowest need and population statistics.

– Seen in conjunction with New York and New Jersey, Pennsylvania’s population density and total need is much lower.

Total Official Need

summary(dat_focus_states$TOTAL_OFFICIAL_NEED)

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
##          0    1067266    3371533   37872196    8494217 5160108526

Median Income

summary(dat_focus_states$CHANGE_MEDINC)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -7174.6 -1722.5  -547.4  -494.3  1052.2 14726.6

Population Density

summary(dat_focus_states$CHANGE_POP_DENSITY)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -79.88   -1.52   11.16   34.49   45.69 2148.63

Residential Collection

summary(dat_focus_states$CHANGE_RES_REC_COLLCTN)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -88018.0      0.0    100.0    841.1    748.5  35408.0

Residential Treatment

summary(dat_focus_states$CHANGE_RES_REC_TRMT)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -111031.0       0.0     180.5    1184.9    1047.8   54959.0

Box Plots: Official Need by Focus States

We constructed box plots to understand the distribution of total need within each of these four states. Facilities with outlier values in New York, New Jersey, and Pennsylvania skewed the representation of our box plots.

New York, especially, was literally off the charts!

ggplot(dat_focus_states, 
       aes(x=STATE, y=TOTAL_OFFICIAL_NEED)) + 
  geom_boxplot(fill="seagreen3", alpha=0.2) + 
  geom_jitter(color="gray50", size=0.4, alpha=0.9) +
  ggtitle("Water Infrastructure Need: Delaware River Watershed") +
  xlab("STATE") + ylab("Official Need")

##NY need is skewing comparative visualization - literally off the charts!

We decided to log transform both variables to fix our box plots. The dots behind each box plot represent need values for each state’s facility, which gave us a way to visualize the spread of data.

– As surmised above, the largest spread in values can be seen in New Jersey, New York, and Pennsylvania.

– Median values of total need are about the same for all states, with a slight upward lean in the range of data for Delaware.

ggplot(dat_focus_states, 
       aes(x=STATE, y=TOTAL_OFFICIAL_NEED)) + 
  scale_y_log10() +
  geom_boxplot(fill="seagreen3", alpha=0.2) + 
  geom_jitter(color="gray50", size=0.4, alpha=0.9) +
  ggtitle("Water Infrastructure Need: Delaware River Watershed") +
  xlab("STATE") + ylab("Official Need (Log Transformed)")

Summary of Percent Change Residential Collection for Focus States

summary(dat_focus_states$PCT_CHANGE_RES_REC_COLLCTN)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -79.095   0.000   4.349     Inf  14.333     Inf       2

Change in Residential Population Receiving Collection

ggplot(dat_focus_states, 
       aes(x=STATE, y=PCT_CHANGE_RES_REC_COLLCTN)) + 
  geom_boxplot(fill="seagreen4", alpha=0.2) +
  xlab("STATE") + ylab("Change") +
  ylim(-75,75)

dat_focus_states_summary <- filter(dat1, STATE %in% c("PA", "NJ", "NY", "DE"))%>%
  group_by(STATE)%>%
  summarise(count = n(),
            mean_need = mean(TOTAL_OFFICIAL_NEED),
            median_need = median(TOTAL_OFFICIAL_NEED),
            mean_popdensity00 = mean(POP_DENSITY_00),
            mean_popdensity10 = mean(POP_DENSITY_10),
            median_popdensity00 = median(POP_DENSITY_00),
            median_popdensity10 = median(POP_DENSITY_10))

Population Density vs. Total Need by Focus State

ggplot(dat_focus_states_summary, aes(x = mean_popdensity10,
                    y = mean_need)) +
  geom_point(size=5, shape=20, color="olivedrab4") +
  geom_text_repel(aes(label = STATE, size=3)) +
  theme(legend.position = "bottom") +
  ggtitle("Water Infrastructure Need: Delaware River Watershed") +
  xlab("Mean Population Density (2010)") + ylab("Mean Official Need")

Pennsylvania Counties

dat_PA_counties <- group_by(dat_PA, COUNTYNAME)%>%
  summarise(count = n(),
            mean_need = mean(TOTAL_OFFICIAL_NEED),
            median_need = median(TOTAL_OFFICIAL_NEED),
            mean_popdensity00 = mean(POP_DENSITY_00),
            mean_popdensity10 = mean(POP_DENSITY_10),
            median_popdensity00 = median(POP_DENSITY_00),
            median_popdensity10 = median(POP_DENSITY_10))

Population Density vs. Official Need

Within Pennsylvania, Philadelphia County unsurprisingly reported the highest total need with the highest population density within the state of Pennsylvania. While all other counties were clustered near the graph origin with low population density and low need, Delaware and Lackawanna Counties exhibited some deviation from the origin yet remain far from Philadelphia Counties statistics.

ggplot(dat_PA_counties, aes(x = mean_popdensity10,
                          y = mean_need)) +
  geom_point(size=5, shape=20, color="darkgoldenrod1") +
  geom_text_repel(aes(label = COUNTYNAME, size=3)) +
  theme(legend.position = "bottom") +
  ggtitle("Water Infrastructure Need: Pennsylvania Counties") +
  xlab("Mean Population Density (2010)") + ylab("Mean Official Need")

Certain counties are standing out a lot…

datPA <- filter(dat1, STATE == "PA")%>%
  top_n(10, TOTAL_OFFICIAL_NEED)

Official Need for Top 10 Counties

Even when compared to the top 10 counties exhibiting high need within Pennsylvania, Philadelphia’s needs stack up much higher. This isn’t very surprising, given the high population density of Philadelphia County (11373 in 2010) as compared to other Pennsylvania counties.

ggplot(datPA, aes(x=as.factor(COUNTYNAME), fill=as.factor(TOTAL_OFFICIAL_NEED))) +  
  geom_bar(color="darkgoldenrod3", fill="darkgoldenrod1", alpha=0.8) +
  xlab ("County") + ylab ("Total Official Need")

The non-normal distribution of the following density plot helps visualize this stark difference.

#DensityPlots
ggplot(dat_PA, aes(x=POP_DENSITY_10)) +
  geom_density(fill="darkgoldenrod1", color="darkgoldenrod2", alpha=0.8) +
  ggtitle("Variation in Population Density: Pennsylvania Counties") +
  xlab("Population Density (2010)")

The following chart representing absolute values of need shows the comparatively high need of Philadelphia County with respect to Delaware and Lackawanna.

dat1%>%
  filter(COUNTYNAME %in% c("Lackawanna", "Philadelphia", "Delaware"))%>%
  ggplot(aes(x=COUNTYNAME, y=TOTAL_OFFICIAL_NEED)) + 
  geom_boxplot(fill="darkgoldenrod1", alpha=0.2) + 
  geom_jitter(color="gray50", size=0.4, alpha=0.9) +
  ggtitle("Water Infrastructure Need: Pennsylvania Counties") +
  xlab("STATE") + ylab("Official Need")

Log Transform

On log-transforming the chart variables, the extent of Philadelphia County’s need, relative to other PA counties can be clearly noticed in the differences in median values and the spread of data. This clearly demonstrates the extent to which Philadelphia County alone contributes to Pennsylvania’s water infrastructure needs.

#Log Transform
dat1%>%
  filter(COUNTYNAME %in% c("Lackawanna", "Philadelphia", "Delaware"))%>%
  ggplot(aes(x=COUNTYNAME, y=TOTAL_OFFICIAL_NEED)) + 
  geom_boxplot(fill="darkgoldenrod1", alpha=0.2) + 
  scale_y_log10() +
  geom_jitter(color="gray50", size=0.4, alpha=0.9) +
  ggtitle("Water Infrastructure Need: Pennsylvania Counties") +
  xlab("STATE") + ylab("Official Need (Log Transformed)")

Philadelphia

Philadelphia Water Department has three water treatment plants, the Baxter Plant, Queen Lane Plant and Belmont Plant. Located in Torresdale, the Baxter Plant cleans water from the Delaware. Both the Queen Lane (East Falls) and Belmont (Wynnefield) plants source water from the Schuylkill.

Summary

##LOOK AT PHILLY
dat_PHL <- filter(dat1, COUNTYFP == 101, STATEFP == 42)

#colnames(dat_PHL)
summary(dat_PHL$TOTAL_OFFICIAL_NEED)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 249190563 383117348 517044133 429336765 519409866 521775599

Total Need

#Density Plot: Total Need
ggplot(dat_PHL, aes(x=TOTAL_OFFICIAL_NEED)) +
  geom_density(fill="coral", color="coral2", alpha=0.8) +
  ggtitle("Variation in Need: Philadelphia Facilities") +
  xlab("Official Need")

Of the three facilities, the highest reported need is for Philadelphia Water Dept (SW).

#Compare Changes in Collection and Treatment

#Barplots

ggplot(dat_PHL, aes(y=CHANGE_RES_REC_TRMT, x=CWNS_NUMBER)) + 
  geom_bar(position="dodge", stat="identity", color="coral1", fill="coral1", alpha=0.6) +
  ggtitle("Change in Residential Population Receiving Treatment") + 
  xlab("CWNS Facility") + ylab("Change in Treatment Received")

Part 1.3: Test of Association

Overall Budgeted Need for Facilites with CSS vs. MS4s

Magnitude of overall budgeted need for CSS is significantly more than budgeted need for MS4.

Means are significantly different as the test statistic is greater than the critical value.

Test statistic is within the interval of 95% confidence.

dat_YesCSS <- filter(dat, CSS == "1") 
dat_YesMS4 <- filter(dat, MS4 == "1")

#sample means
meanNeed_YesCSS <-mean(dat_YesCSS$TOTAL_OFFICIAL_NEED)
meanNeed_YesMS4 <-mean(dat_YesMS4$TOTAL_OFFICIAL_NEED)
##Mean need for CSS is larger than MS4

#sample standardized deviations 
stdNeed_YesCSS <-sd(dat_YesCSS$TOTAL_OFFICIAL_NEED)
stdNeed_YesMS4 <-sd(dat_YesMS4$TOTAL_OFFICIAL_NEED)

#number of obs
numobs_YesCSS <- dim(dat_YesCSS)[1]
numobs_YesMS4 <- dim(dat_YesMS4)[1]

#test stat calc: difference in the means & standardized deviations
numerator_teststat <- meanNeed_YesCSS - meanNeed_YesMS4
denom_teststat <-
  ((stdNeed_YesCSS*stdNeed_YesCSS)/numobs_YesCSS +
     (stdNeed_YesMS4*stdNeed_YesMS4)/numobs_YesMS4)^.5

test_stat<-numerator_teststat/denom_teststat

abs(test_stat)>1.96

## [1] TRUE

Paired T-test of CSS vs. MS4s

Reject null hypothesis of equal means.

t.test(dat_YesCSS$TOTAL_OFFICIAL_NEED, dat_YesMS4$TOTAL_OFFICIAL_NEED)

## 
##  Welch Two Sample t-test
## 
## data:  dat_YesCSS$TOTAL_OFFICIAL_NEED and dat_YesMS4$TOTAL_OFFICIAL_NEED
## t = 6.712, df = 699.19, p-value = 0.00000000003972
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   70514032 128823442
## sample estimates:
## mean of x mean of y 
## 113053301  13384564

##Reject null hypothesis of equal means.

Summary of Official Need for CSS Facilities

summary(dat_YesCSS$TOTAL_OFFICIAL_NEED)

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
##          0    2832885    8813826  113053301   36254645 5160108526

Summary of Official Need for MS4 Facilities

summary(dat_YesMS4$TOTAL_OFFICIAL_NEED)

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
##          0     628112    2161876   13384564    6708263 3289580276

Difference in Facility Need Based on TMDL

Mean need of TMDL impacted sites is greater than non-TMDL sites.

Test statistic is within the interval of 95% confidence.

dat_YesTMDL <- filter(dat, TMDL_INDICATOR == "Y") 
dat_NoTMDL <- filter(dat, TMDL_INDICATOR == "N")

#sample means
meanNeed_YesTMDL <-mean(dat_YesTMDL$TOTAL_OFFICIAL_NEED)
meanNeed_NoTMDL <-mean(dat_NoTMDL$TOTAL_OFFICIAL_NEED)
##Mean need of TMDL impacted sites is greater than non-TMDL sites. 

#sample standardized deviations 
stdNeed_YesTMDL <-sd(dat_YesTMDL$TOTAL_OFFICIAL_NEED)
stdNeed_NoTMDL <-sd(dat_NoTMDL$TOTAL_OFFICIAL_NEED)

#number of obs
numobs_YesTMDL <- dim(dat_YesTMDL)[1]
numobs_NoTMDL <- dim(dat_NoTMDL)[1]

#test stat calc: difference in the means & standardized deviations
numerator_teststatTMDL <- meanNeed_NoTMDL - meanNeed_YesTMDL
denom_teststatTMDL <-
  ((stdNeed_NoTMDL*stdNeed_NoTMDL)/numobs_NoTMDL +
     (stdNeed_YesTMDL*stdNeed_YesTMDL)/numobs_YesTMDL)^.5

test_statTMDL<-numerator_teststatTMDL/denom_teststatTMDL

abs(test_statTMDL)>1.96

## [1] TRUE

Paired T-test of TMDL and Official Need

Summary of Official Need for Non-TMDL Facilities

#paired t-test
summary(dat_NoTMDL$TOTAL_OFFICIAL_NEED)

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
##          0     690815    2371608   20228097    7536767 5160108526

Summary of Official Need for TMDL Facilities

summary(dat_YesTMDL$TOTAL_OFFICIAL_NEED)

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
##          0    1249456    3862453   58996731   14372950 3945228972

Paired T-test of Need by EPA Regions with TMDL Facilities

Region 1

Fail to reject null hypothesis; cannot establish significance.

#compare TMDL by need by EPA regions
t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
       data = filter(dat, EPA_REGION == 1), paired = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = -0.85922, df = 3.0193, p-value = 0.453
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
##  -542766502  311336752
## sample estimates:
## mean in group N mean in group Y 
##        32685334       148400209

##High p-value/low t-value

Region 2

Fail to reject null hypothesis; cannot establish significance.

t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
       data = filter(dat, EPA_REGION == 2), paired = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = -3.5245, df = 5.0045, p-value = 0.01681
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
##  -3033874754  -475206220
## sample estimates:
## mean in group N mean in group Y 
##        39662349      1794202835

##High p-value/low t-value

Region 3

Fail to reject null hypothesis; cannot establish significance.

t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
       data = filter(dat, EPA_REGION == 3), paired = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = 2.6523, df = 31.104, p-value = 0.01247
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
##   2196333 16806577
## sample estimates:
## mean in group N mean in group Y 
##        18403975         8902520

##High p-value/low t-value

Region 4

Fail to reject null hypothesis; cannot establish significance.

t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
       data = filter(dat, EPA_REGION == 4), paired = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = 0.12885, df = 18.325, p-value = 0.8989
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
##  -17470739  19756906
## sample estimates:
## mean in group N mean in group Y 
##        22352317        21209234

##High p-value/low t-value -->

Region 5

Reject null hypothesis, significance between need and TMDL.

t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
       data = filter(dat, EPA_REGION == 5), paired = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = 3.9798, df = 1436.2, p-value = 0.00007241
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
##   5244922 15440674
## sample estimates:
## mean in group N mean in group Y 
##        15088120         4745322

##Low p-value/high t-value --> reject null hypothesis

Region 6

Reject null hypothesis, significance between need and TMDL.

t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
       data = filter(dat, EPA_REGION == 6), paired = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = -3.074, df = 63.917, p-value = 0.003104
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
##  -80860108 -17158802
## sample estimates:
## mean in group N mean in group Y 
##         9455932        58465387

##Low p-value/high t-value --> reject null hypothesis

Regions 7 & 8

Failed t-test; not enough values.

#t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
#       data = filter(dat, EPA_REGION == 7), paired = FALSE)
##failed t-test, not enough values 

#t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
#       data = filter(dat, EPA_REGION == 8), paired = FALSE)
##failed t-test, not enough values

Region 9

Fail to reject null hypothesis; cannot establish significance.

t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
       data = filter(dat, EPA_REGION == 9), paired = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = -0.87332, df = 1.0125, p-value = 0.5415
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
##  -1749036595  1517883950
## sample estimates:
## mean in group N mean in group Y 
##        65458334       181034656

##Very high p-value/low t-value --> cannot establish significance; accept null hypothesis?

Region 10

Fail to reject null hypothesis; cannot establish significance.

t.test(TOTAL_OFFICIAL_NEED ~ TMDL_INDICATOR,
       data = filter(dat, EPA_REGION == 10), paired = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  TOTAL_OFFICIAL_NEED by TMDL_INDICATOR
## t = 1.2645, df = 252, p-value = 0.2072
## alternative hypothesis: true difference in means between group N and group Y is not equal to 0
## 95 percent confidence interval:
##  -4907121 22511440
## sample estimates:
## mean in group N mean in group Y 
##        26134688        17332528

##High p-value/low t-value --> fail to reject null hypothesis; cannot establish significance

Cannot establish significant association between facility needs and TMDL impacted-sites.

Some significance was established in regions 5 & 6; otherwise there were too few values in regions 7 & 8; could not establish significance in regions 1 - 4, 9, 10.

Significance in Need by Region

Cross Tables of Official Need and Region

Summary of Official Need

library(ggplot2)
library(vcd)
library (gmodels)               
library (MASS)
library(fabricatr)
library(reactable)

summary(dat$TOTAL_OFFICIAL_NEED)

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
##          0     704599    2409822   21623215    7660571 5160108526

## cross table of need and region --> too large need to group
# (dat$TOTAL_OFFICIAL_NEED, dat$EPA_REGION, fisher = FALSE, chisq = TRUE, 
#            expected = TRUE, sresid = FALSE, format="SPSS")

##CHI2: 74282.34; P: 0.00000000000000057
##Minimum expected frequency: 0.03361045; Cells with Expected Frequency < 5: 79130 of 79140 (99.98736%)

Official Need Grouped by Five Equal Sections

dat1_intervalfive <- dat1 %>% mutate(intervalfive_Need = ntile(dat1$TOTAL_OFFICIAL_NEED, 5))%>%
  mutate(intervalfive_pchmedinc = ntile(dat1$PCT_CHANGE_MEDINC, 5))%>% 
  mutate(intervalfive_pchpopden = ntile(dat1$PCT_CHANGE_POP_DENSITY, 5))%>% 
  mutate(intervalfive_pwhite10 = ntile(dat1$PCTWHITE10, 5))%>% 
  mutate(intervalfive_medinc09 = ntile(dat1$MEDINC09, 5))%>% 
  mutate(intervalfive_pop10 = ntile(dat1$POP_DENSITY_10, 5))

CrossTable(dat1_intervalfive$EPA_REGION, dat1_intervalfive$intervalfive_Need, fisher = FALSE, chisq = TRUE, 
           expected = TRUE, sresid = FALSE, format="SPSS")

## 
##    Cell Contents
## |-------------------------|
## |                   Count |
## |         Expected Values |
## | Chi-square contribution |
## |             Row Percent |
## |          Column Percent |
## |           Total Percent |
## |-------------------------|
## 
## Total Observations in Table:  8420 
## 
##                              | dat1_intervalfive$intervalfive_Need 
## dat1_intervalfive$EPA_REGION |        1  |        2  |        3  |        4  |        5  | Row Total | 
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                            1 |       42  |       51  |       63  |       83  |      126  |      365  | 
##                              |   73.000  |   73.000  |   73.000  |   73.000  |   73.000  |           | 
##                              |   13.164  |    6.630  |    1.370  |    1.370  |   38.479  |           | 
##                              |   11.507% |   13.973% |   17.260% |   22.740% |   34.521% |    4.335% | 
##                              |    2.494% |    3.029% |    3.741% |    4.929% |    7.482% |           | 
##                              |    0.499% |    0.606% |    0.748% |    0.986% |    1.496% |           | 
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                            2 |      105  |      109  |      130  |      150  |      120  |      614  | 
##                              |  122.800  |  122.800  |  122.800  |  122.800  |  122.800  |           | 
##                              |    2.580  |    1.551  |    0.422  |    6.025  |    0.064  |           | 
##                              |   17.101% |   17.752% |   21.173% |   24.430% |   19.544% |    7.292% | 
##                              |    6.235% |    6.473% |    7.720% |    8.907% |    7.126% |           | 
##                              |    1.247% |    1.295% |    1.544% |    1.781% |    1.425% |           | 
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                            3 |      125  |      165  |      239  |      258  |      244  |     1031  | 
##                              |  206.200  |  206.200  |  206.200  |  206.200  |  206.200  |           | 
##                              |   31.976  |    8.232  |    5.217  |   13.013  |    6.929  |           | 
##                              |   12.124% |   16.004% |   23.181% |   25.024% |   23.666% |   12.245% | 
##                              |    7.423% |    9.798% |   14.192% |   15.321% |   14.489% |           | 
##                              |    1.485% |    1.960% |    2.838% |    3.064% |    2.898% |           | 
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                            4 |      226  |      266  |      231  |      320  |      335  |     1378  | 
##                              |  275.600  |  275.600  |  275.600  |  275.600  |  275.600  |           | 
##                              |    8.927  |    0.334  |    7.218  |    7.153  |   12.802  |           | 
##                              |   16.401% |   19.303% |   16.763% |   23.222% |   24.311% |   16.366% | 
##                              |   13.420% |   15.796% |   13.717% |   19.002% |   19.893% |           | 
##                              |    2.684% |    3.159% |    2.743% |    3.800% |    3.979% |           | 
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                            5 |      311  |      415  |      424  |      347  |      255  |     1752  | 
##                              |  350.400  |  350.400  |  350.400  |  350.400  |  350.400  |           | 
##                              |    4.430  |   11.910  |   15.459  |    0.033  |   25.974  |           | 
##                              |   17.751% |   23.687% |   24.201% |   19.806% |   14.555% |   20.808% | 
##                              |   18.468% |   24.644% |   25.178% |   20.606% |   15.143% |           | 
##                              |    3.694% |    4.929% |    5.036% |    4.121% |    3.029% |           | 
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                            6 |      253  |      234  |      207  |      185  |      184  |     1063  | 
##                              |  212.600  |  212.600  |  212.600  |  212.600  |  212.600  |           | 
##                              |    7.677  |    2.154  |    0.148  |    3.583  |    3.847  |           | 
##                              |   23.801% |   22.013% |   19.473% |   17.404% |   17.310% |   12.625% | 
##                              |   15.024% |   13.895% |   12.292% |   10.986% |   10.926% |           | 
##                              |    3.005% |    2.779% |    2.458% |    2.197% |    2.185% |           | 
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                            7 |      427  |      251  |      205  |      126  |       92  |     1101  | 
##                              |  220.200  |  220.200  |  220.200  |  220.200  |  220.200  |           | 
##                              |  194.215  |    4.308  |    1.049  |   40.298  |   74.638  |           | 
##                              |   38.783% |   22.797% |   18.619% |   11.444% |    8.356% |   13.076% | 
##                              |   25.356% |   14.905% |   12.173% |    7.482% |    5.463% |           | 
##                              |    5.071% |    2.981% |    2.435% |    1.496% |    1.093% |           | 
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                            8 |      131  |      138  |      116  |       88  |       42  |      515  | 
##                              |  103.000  |  103.000  |  103.000  |  103.000  |  103.000  |           | 
##                              |    7.612  |   11.893  |    1.641  |    2.184  |   36.126  |           | 
##                              |   25.437% |   26.796% |   22.524% |   17.087% |    8.155% |    6.116% | 
##                              |    7.779% |    8.195% |    6.888% |    5.226% |    2.494% |           | 
##                              |    1.556% |    1.639% |    1.378% |    1.045% |    0.499% |           | 
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                            9 |       22  |       14  |       21  |       64  |      197  |      318  | 
##                              |   63.600  |   63.600  |   63.600  |   63.600  |   63.600  |           | 
##                              |   27.210  |   38.682  |   28.534  |    0.003  |  279.804  |           | 
##                              |    6.918% |    4.403% |    6.604% |   20.126% |   61.950% |    3.777% | 
##                              |    1.306% |    0.831% |    1.247% |    3.800% |   11.698% |           | 
##                              |    0.261% |    0.166% |    0.249% |    0.760% |    2.340% |           | 
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                           10 |       42  |       41  |       48  |       63  |       89  |      283  | 
##                              |   56.600  |   56.600  |   56.600  |   56.600  |   56.600  |           | 
##                              |    3.766  |    4.300  |    1.307  |    0.724  |   18.547  |           | 
##                              |   14.841% |   14.488% |   16.961% |   22.261% |   31.449% |    3.361% | 
##                              |    2.494% |    2.435% |    2.850% |    3.741% |    5.285% |           | 
##                              |    0.499% |    0.487% |    0.570% |    0.748% |    1.057% |           | 
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                 Column Total |     1684  |     1684  |     1684  |     1684  |     1684  |     8420  | 
##                              |   20.000% |   20.000% |   20.000% |   20.000% |   20.000% |           | 
## -----------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  1025.513     d.f. =  36     p =  0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000699429 
## 
## 
##  
##        Minimum expected frequency: 56.6

Variables Hypothesized to be Associated with Infrastructure Need

Percent Change in Median Income

# Potential Association: Percent change in median income
CrossTable(dat1_intervalfive$intervalfive_pchmedinc, dat1_intervalfive$intervalfive_Need, fisher = FALSE, chisq = TRUE, 
           expected = TRUE, sresid = FALSE, format="SPSS")

## 
##    Cell Contents
## |-------------------------|
## |                   Count |
## |         Expected Values |
## | Chi-square contribution |
## |             Row Percent |
## |          Column Percent |
## |           Total Percent |
## |-------------------------|
## 
## Total Observations in Table:  8420 
## 
##                                          | dat1_intervalfive$intervalfive_Need 
## dat1_intervalfive$intervalfive_pchmedinc |        1  |        2  |        3  |        4  |        5  | Row Total | 
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                        1 |      335  |      364  |      343  |      333  |      309  |     1684  | 
##                                          |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                          |    0.010  |    2.197  |    0.114  |    0.043  |    2.295  |           | 
##                                          |   19.893% |   21.615% |   20.368% |   19.774% |   18.349% |   20.000% | 
##                                          |   19.893% |   21.615% |   20.368% |   19.774% |   18.349% |           | 
##                                          |    3.979% |    4.323% |    4.074% |    3.955% |    3.670% |           | 
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                        2 |      334  |      327  |      331  |      350  |      342  |     1684  | 
##                                          |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                          |    0.023  |    0.285  |    0.100  |    0.517  |    0.080  |           | 
##                                          |   19.834% |   19.418% |   19.656% |   20.784% |   20.309% |   20.000% | 
##                                          |   19.834% |   19.418% |   19.656% |   20.784% |   20.309% |           | 
##                                          |    3.967% |    3.884% |    3.931% |    4.157% |    4.062% |           | 
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                        3 |      342  |      325  |      329  |      350  |      338  |     1684  | 
##                                          |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                          |    0.080  |    0.413  |    0.181  |    0.517  |    0.004  |           | 
##                                          |   20.309% |   19.299% |   19.537% |   20.784% |   20.071% |   20.000% | 
##                                          |   20.309% |   19.299% |   19.537% |   20.784% |   20.071% |           | 
##                                          |    4.062% |    3.860% |    3.907% |    4.157% |    4.014% |           | 
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                        4 |      299  |      340  |      349  |      332  |      364  |     1684  | 
##                                          |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                          |    4.242  |    0.030  |    0.442  |    0.068  |    2.197  |           | 
##                                          |   17.755% |   20.190% |   20.724% |   19.715% |   21.615% |   20.000% | 
##                                          |   17.755% |   20.190% |   20.724% |   19.715% |   21.615% |           | 
##                                          |    3.551% |    4.038% |    4.145% |    3.943% |    4.323% |           | 
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                        5 |      374  |      328  |      332  |      319  |      331  |     1684  | 
##                                          |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                          |    4.109  |    0.230  |    0.068  |    0.941  |    0.100  |           | 
##                                          |   22.209% |   19.477% |   19.715% |   18.943% |   19.656% |   20.000% | 
##                                          |   22.209% |   19.477% |   19.715% |   18.943% |   19.656% |           | 
##                                          |    4.442% |    3.895% |    3.943% |    3.789% |    3.931% |           | 
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                             Column Total |     1684  |     1684  |     1684  |     1684  |     1684  |     8420  | 
##                                          |   20.000% |   20.000% |   20.000% |   20.000% |   20.000% |           | 
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  19.28741     d.f. =  16     p =  0.2540411 
## 
## 
##  
##        Minimum expected frequency: 336.8

Percent Change in Population Density

# Potential Association: Percent change in population density
CrossTable(dat1_intervalfive$intervalfive_pchpopden, dat1_intervalfive$intervalfive_Need, fisher = FALSE, chisq = TRUE, 
           expected = TRUE, sresid = FALSE, format="SPSS")

## 
##    Cell Contents
## |-------------------------|
## |                   Count |
## |         Expected Values |
## | Chi-square contribution |
## |             Row Percent |
## |          Column Percent |
## |           Total Percent |
## |-------------------------|
## 
## Total Observations in Table:  8420 
## 
##                                          | dat1_intervalfive$intervalfive_Need 
## dat1_intervalfive$intervalfive_pchpopden |        1  |        2  |        3  |        4  |        5  | Row Total | 
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                        1 |      453  |      393  |      371  |      292  |      175  |     1684  | 
##                                          |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                          |   40.090  |    9.378  |    3.473  |    5.959  |   77.729  |           | 
##                                          |   26.900% |   23.337% |   22.031% |   17.340% |   10.392% |   20.000% | 
##                                          |   26.900% |   23.337% |   22.031% |   17.340% |   10.392% |           | 
##                                          |    5.380% |    4.667% |    4.406% |    3.468% |    2.078% |           | 
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                        2 |      345  |      371  |      374  |      327  |      267  |     1684  | 
##                                          |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                          |    0.200  |    3.473  |    4.109  |    0.285  |   14.466  |           | 
##                                          |   20.487% |   22.031% |   22.209% |   19.418% |   15.855% |   20.000% | 
##                                          |   20.487% |   22.031% |   22.209% |   19.418% |   15.855% |           | 
##                                          |    4.097% |    4.406% |    4.442% |    3.884% |    3.171% |           | 
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                        3 |      323  |      351  |      326  |      347  |      337  |     1684  | 
##                                          |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                          |    0.565  |    0.599  |    0.346  |    0.309  |    0.000  |           | 
##                                          |   19.181% |   20.843% |   19.359% |   20.606% |   20.012% |   20.000% | 
##                                          |   19.181% |   20.843% |   19.359% |   20.606% |   20.012% |           | 
##                                          |    3.836% |    4.169% |    3.872% |    4.121% |    4.002% |           | 
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                        4 |      300  |      308  |      325  |      354  |      397  |     1684  | 
##                                          |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                          |    4.021  |    2.463  |    0.413  |    0.878  |   10.760  |           | 
##                                          |   17.815% |   18.290% |   19.299% |   21.021% |   23.575% |   20.000% | 
##                                          |   17.815% |   18.290% |   19.299% |   21.021% |   23.575% |           | 
##                                          |    3.563% |    3.658% |    3.860% |    4.204% |    4.715% |           | 
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                        5 |      263  |      261  |      288  |      364  |      508  |     1684  | 
##                                          |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                          |   16.171  |   17.060  |    7.071  |    2.197  |   87.023  |           | 
##                                          |   15.618% |   15.499% |   17.102% |   21.615% |   30.166% |   20.000% | 
##                                          |   15.618% |   15.499% |   17.102% |   21.615% |   30.166% |           | 
##                                          |    3.124% |    3.100% |    3.420% |    4.323% |    6.033% |           | 
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                             Column Total |     1684  |     1684  |     1684  |     1684  |     1684  |     8420  | 
##                                          |   20.000% |   20.000% |   20.000% |   20.000% |   20.000% |           | 
## -----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  309.038     d.f. =  16     p =  0.00000000000000000000000000000000000000000000000000000003417345 
## 
## 
##  
##        Minimum expected frequency: 336.8

Percentage of Population, white in 2010

# Potential Association: Percentage of Population White 2010
CrossTable(dat1_intervalfive$intervalfive_pwhite10, dat1_intervalfive$intervalfive_Need, fisher = FALSE, chisq = TRUE, 
           expected = TRUE, sresid = FALSE, format="SPSS")

## 
##    Cell Contents
## |-------------------------|
## |                   Count |
## |         Expected Values |
## | Chi-square contribution |
## |             Row Percent |
## |          Column Percent |
## |           Total Percent |
## |-------------------------|
## 
## Total Observations in Table:  8420 
## 
##                                         | dat1_intervalfive$intervalfive_Need 
## dat1_intervalfive$intervalfive_pwhite10 |        1  |        2  |        3  |        4  |        5  | Row Total | 
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                       1 |      315  |      292  |      267  |      329  |      481  |     1684  | 
##                                         |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                         |    1.411  |    5.959  |   14.466  |    0.181  |   61.739  |           | 
##                                         |   18.705% |   17.340% |   15.855% |   19.537% |   28.563% |   20.000% | 
##                                         |   18.705% |   17.340% |   15.855% |   19.537% |   28.563% |           | 
##                                         |    3.741% |    3.468% |    3.171% |    3.907% |    5.713% |           | 
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                       2 |      284  |      276  |      307  |      349  |      468  |     1684  | 
##                                         |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                         |    8.277  |   10.976  |    2.637  |    0.442  |   51.109  |           | 
##                                         |   16.865% |   16.390% |   18.230% |   20.724% |   27.791% |   20.000% | 
##                                         |   16.865% |   16.390% |   18.230% |   20.724% |   27.791% |           | 
##                                         |    3.373% |    3.278% |    3.646% |    4.145% |    5.558% |           | 
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                       3 |      298  |      318  |      333  |      382  |      353  |     1684  | 
##                                         |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                         |    4.470  |    1.049  |    0.043  |    6.066  |    0.779  |           | 
##                                         |   17.696% |   18.884% |   19.774% |   22.684% |   20.962% |   20.000% | 
##                                         |   17.696% |   18.884% |   19.774% |   22.684% |   20.962% |           | 
##                                         |    3.539% |    3.777% |    3.955% |    4.537% |    4.192% |           | 
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                       4 |      334  |      400  |      360  |      339  |      251  |     1684  | 
##                                         |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                         |    0.023  |   11.859  |    1.598  |    0.014  |   21.858  |           | 
##                                         |   19.834% |   23.753% |   21.378% |   20.131% |   14.905% |   20.000% | 
##                                         |   19.834% |   23.753% |   21.378% |   20.131% |   14.905% |           | 
##                                         |    3.967% |    4.751% |    4.276% |    4.026% |    2.981% |           | 
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                       5 |      453  |      398  |      417  |      285  |      131  |     1684  | 
##                                         |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                         |   40.090  |   11.121  |   19.098  |    7.967  |  125.753  |           | 
##                                         |   26.900% |   23.634% |   24.762% |   16.924% |    7.779% |   20.000% | 
##                                         |   26.900% |   23.634% |   24.762% |   16.924% |    7.779% |           | 
##                                         |    5.380% |    4.727% |    4.952% |    3.385% |    1.556% |           | 
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                            Column Total |     1684  |     1684  |     1684  |     1684  |     1684  |     8420  | 
##                                         |   20.000% |   20.000% |   20.000% |   20.000% |   20.000% |           | 
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  408.9846     d.f. =  16     p =  0.00000000000000000000000000000000000000000000000000000000000000000000000000004758703 
## 
## 
##  
##        Minimum expected frequency: 336.8

Median Income in 2009

CrossTable(dat1_intervalfive$intervalfive_medinc09, dat1_intervalfive$intervalfive_Need, fisher = FALSE, chisq = TRUE, 
           expected = TRUE, sresid = FALSE, format="SPSS")

## 
##    Cell Contents
## |-------------------------|
## |                   Count |
## |         Expected Values |
## | Chi-square contribution |
## |             Row Percent |
## |          Column Percent |
## |           Total Percent |
## |-------------------------|
## 
## Total Observations in Table:  8420 
## 
##                                         | dat1_intervalfive$intervalfive_Need 
## dat1_intervalfive$intervalfive_medinc09 |        1  |        2  |        3  |        4  |        5  | Row Total | 
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                       1 |      432  |      401  |      342  |      322  |      187  |     1684  | 
##                                         |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                         |   26.909  |   12.238  |    0.080  |    0.650  |   66.627  |           | 
##                                         |   25.653% |   23.812% |   20.309% |   19.121% |   11.105% |   20.000% | 
##                                         |   25.653% |   23.812% |   20.309% |   19.121% |   11.105% |           | 
##                                         |    5.131% |    4.762% |    4.062% |    3.824% |    2.221% |           | 
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                       2 |      351  |      362  |      365  |      346  |      260  |     1684  | 
##                                         |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                         |    0.599  |    1.886  |    2.361  |    0.251  |   17.513  |           | 
##                                         |   20.843% |   21.496% |   21.675% |   20.546% |   15.439% |   20.000% | 
##                                         |   20.843% |   21.496% |   21.675% |   20.546% |   15.439% |           | 
##                                         |    4.169% |    4.299% |    4.335% |    4.109% |    3.088% |           | 
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                       3 |      361  |      325  |      344  |      318  |      336  |     1684  | 
##                                         |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                         |    1.739  |    0.413  |    0.154  |    1.049  |    0.002  |           | 
##                                         |   21.437% |   19.299% |   20.428% |   18.884% |   19.952% |   20.000% | 
##                                         |   21.437% |   19.299% |   20.428% |   18.884% |   19.952% |           | 
##                                         |    4.287% |    3.860% |    4.086% |    3.777% |    3.990% |           | 
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                       4 |      300  |      340  |      331  |      327  |      386  |     1684  | 
##                                         |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                         |    4.021  |    0.030  |    0.100  |    0.285  |    7.187  |           | 
##                                         |   17.815% |   20.190% |   19.656% |   19.418% |   22.922% |   20.000% | 
##                                         |   17.815% |   20.190% |   19.656% |   19.418% |   22.922% |           | 
##                                         |    3.563% |    4.038% |    3.931% |    3.884% |    4.584% |           | 
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                       5 |      240  |      256  |      302  |      371  |      515  |     1684  | 
##                                         |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                         |   27.821  |   19.384  |    3.596  |    3.473  |   94.285  |           | 
##                                         |   14.252% |   15.202% |   17.933% |   22.031% |   30.582% |   20.000% | 
##                                         |   14.252% |   15.202% |   17.933% |   22.031% |   30.582% |           | 
##                                         |    2.850% |    3.040% |    3.587% |    4.406% |    6.116% |           | 
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                            Column Total |     1684  |     1684  |     1684  |     1684  |     1684  |     8420  | 
##                                         |   20.000% |   20.000% |   20.000% |   20.000% |   20.000% |           | 
## ----------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  292.6544     d.f. =  16     p =  0.00000000000000000000000000000000000000000000000000008450559 
## 
## 
##  
##        Minimum expected frequency: 336.8

Population Density in 2010

CrossTable(dat1_intervalfive$intervalfive_pop10, dat1_intervalfive$intervalfive_Need, fisher = FALSE, chisq = TRUE, 
           expected = TRUE, sresid = FALSE, format="SPSS")

## 
##    Cell Contents
## |-------------------------|
## |                   Count |
## |         Expected Values |
## | Chi-square contribution |
## |             Row Percent |
## |          Column Percent |
## |           Total Percent |
## |-------------------------|
## 
## Total Observations in Table:  8420 
## 
##                                      | dat1_intervalfive$intervalfive_Need 
## dat1_intervalfive$intervalfive_pop10 |        1  |        2  |        3  |        4  |        5  | Row Total | 
## -------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                    1 |      566  |      412  |      369  |      245  |       92  |     1684  | 
##                                      |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                      |  155.976  |   16.790  |    3.079  |   25.021  |  177.931  |           | 
##                                      |   33.610% |   24.466% |   21.912% |   14.549% |    5.463% |   20.000% | 
##                                      |   33.610% |   24.466% |   21.912% |   14.549% |    5.463% |           | 
##                                      |    6.722% |    4.893% |    4.382% |    2.910% |    1.093% |           | 
## -------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                    2 |      414  |      421  |      354  |      319  |      176  |     1684  | 
##                                      |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                      |   17.695  |   21.050  |    0.878  |    0.941  |   76.771  |           | 
##                                      |   24.584% |   25.000% |   21.021% |   18.943% |   10.451% |   20.000% | 
##                                      |   24.584% |   25.000% |   21.021% |   18.943% |   10.451% |           | 
##                                      |    4.917% |    5.000% |    4.204% |    3.789% |    2.090% |           | 
## -------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                    3 |      281  |      369  |      366  |      387  |      281  |     1684  | 
##                                      |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                      |    9.245  |    3.079  |    2.532  |    7.482  |    9.245  |           | 
##                                      |   16.686% |   21.912% |   21.734% |   22.981% |   16.686% |   20.000% | 
##                                      |   16.686% |   21.912% |   21.734% |   22.981% |   16.686% |           | 
##                                      |    3.337% |    4.382% |    4.347% |    4.596% |    3.337% |           | 
## -------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                    4 |      221  |      274  |      314  |      377  |      498  |     1684  | 
##                                      |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                      |   39.815  |   11.710  |    1.543  |    4.798  |   77.154  |           | 
##                                      |   13.124% |   16.271% |   18.646% |   22.387% |   29.572% |   20.000% | 
##                                      |   13.124% |   16.271% |   18.646% |   22.387% |   29.572% |           | 
##                                      |    2.625% |    3.254% |    3.729% |    4.477% |    5.914% |           | 
## -------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                                    5 |      202  |      208  |      281  |      356  |      637  |     1684  | 
##                                      |  336.800  |  336.800  |  336.800  |  336.800  |  336.800  |           | 
##                                      |   53.952  |   49.256  |    9.245  |    1.095  |  267.577  |           | 
##                                      |   11.995% |   12.352% |   16.686% |   21.140% |   37.827% |   20.000% | 
##                                      |   11.995% |   12.352% |   16.686% |   21.140% |   37.827% |           | 
##                                      |    2.399% |    2.470% |    3.337% |    4.228% |    7.565% |           | 
## -------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
##                         Column Total |     1684  |     1684  |     1684  |     1684  |     1684  |     8420  | 
##                                      |   20.000% |   20.000% |   20.000% |   20.000% |   20.000% |           | 
## -------------------------------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  1043.86     d.f. =  16     p =  0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000004522926 
## 
## 
##  
##        Minimum expected frequency: 336.8

Part 1.4: Pearson Correlation Tests

2010 Population and Official Need

Low degree of correlation @ 0.18

##Correlation between 2010population and official need
cor.test(dat1$POP10, dat1$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat1$POP10 and dat1$TOTAL_OFFICIAL_NEED
## t = 16.355, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1547099 0.1961150
## sample estimates:
##       cor 
## 0.1754901

## small correlation @ 0.18

##scatter plot##

ggplot(dat1, aes(x = POP10,
                            y = TOTAL_OFFICIAL_NEED)) +
  geom_point(size=2, shape=20, color="orchid") +
  theme(legend.position = "bottom") +
  ggtitle("2010 Population x Total Official Need") +
  geom_smooth(method=lm) + 
  xlab("Total Population (2010)") + ylab("Total Official Need")

##Degree of correlation:
###Perfect: If value is near ? 1, then it said to be a perfect correlation: as one variable increases, the other variable tends to also increase (if positive) or decrease (if negative).
###High degree: value between ? 0.50 and ? 1
###Moderate degree: value between ? 0.30 and ? 0.49
###Low degree: value below + .29
###No correlation: value is 0.

2010 Population Density and Official Need

Moderate degree of correlation @ 0.43

cor.test(dat1$POP_DENSITY_10, dat1$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat1$POP_DENSITY_10 and dat1$TOTAL_OFFICIAL_NEED
## t = 43.501, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4108153 0.4456975
## sample estimates:
##      cor 
## 0.428416

ggplot(dat1, aes(x = POP_DENSITY_10,
                            y = TOTAL_OFFICIAL_NEED)) +
  geom_point(size=2, shape=20, color="orchid") +
  theme(legend.position = "bottom") +
  ggtitle("2010 Population Density x Total Official Need") +
  geom_smooth(method=lm) + 
  xlab("Population Density (2010)") + ylab("Total Official Need")

##

Median Income and Official Need

Little to no correlation @ 0.05

cor.test(dat1$MEDINC09, dat1$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat1$MEDINC09 and dat1$TOTAL_OFFICIAL_NEED
## t = 4.8097, df = 8418, p-value = 0.000001538
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03102467 0.07362785
## sample estimates:
##        cor 
## 0.05235008

ggplot(dat1, aes(x = MEDINC09,
                            y = TOTAL_OFFICIAL_NEED)) +
  geom_point(size=2, shape=20, color="orchid") +
  theme(legend.position = "bottom") +
  ggtitle("Median Income x Total Official Need") +
  geom_smooth(method=lm) + 
  xlab("Median Income (2009)") + ylab("Total Official Need")

Change in Residential Collection and Official Need

Low degree of correlation @ 0.22

cor.test(dat1$CHANGE_RES_REC_COLLCTN, dat1$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat1$CHANGE_RES_REC_COLLCTN and dat1$TOTAL_OFFICIAL_NEED
## t = 20.412, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1967184 0.2374248
## sample estimates:
##      cor 
## 0.217166

ggplot(dat1, aes(x = CHANGE_RES_REC_COLLCTN,
                            y = TOTAL_OFFICIAL_NEED)) +
  geom_point(size=2, shape=20, color="orchid") +
  theme(legend.position = "bottom") +
  ggtitle("Change in Residential Collection x Total Official Need") +
  geom_smooth(method=lm) + 
  xlab("Change in Res Collection") + ylab("Total Official Need")

Change in Residential Collection and Median Income

Little to no correlation @ 0.064

cor.test(dat1$MEDINC09, dat1$CHANGE_RES_REC_COLLCTN, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat1$MEDINC09 and dat1$CHANGE_RES_REC_COLLCTN
## t = 5.8425, df = 8418, p-value = 0.000000005333
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.04224740 0.08479516
## sample estimates:
##        cor 
## 0.06355016

ggplot(dat1, aes(x = MEDINC09,
                            y = CHANGE_RES_REC_COLLCTN)) +
  geom_point(size=2, shape=20, color="orchid") +
  theme(legend.position = "bottom") +
  ggtitle("Change in Residential Collection x Median Income") +
  geom_smooth(method=lm) + 
  xlab("Median Income (2009)") + ylab("Change in Res Collection")

Part 2: Regression

Part 2.1: Preliminiary Tests

library(tidyverse)
library(tidycensus)
library(ggplot2)
library(dplyr)
library(data.table)
require("ggrepel")
library(hrbrthemes)
library(viridis)
library(ggridges)
library(Hmisc)
library(stargazer)

options(scipen=999)

dat <- read.csv("cwns2012.csv", header = TRUE)


dat2 <- dat%>%
  mutate(LOG_NEED = log(TOTAL_OFFICIAL_NEED))%>%
    mutate(RES_BURDEN = TOTAL_OFFICIAL_NEED/POP10)%>%
  mutate(LOG_RES_BURDEN = log(RES_BURDEN))%>%
    mutate(LOG_MEDINC09 = log(MEDINC09))%>%
  mutate(MEDINC99_INFL = MEDINC99*1.29)%>%
    mutate(LOG_MEDINC99_INFL = log(MEDINC99_INFL))%>%
  mutate(LOG_POP10 = log(POP10))%>%
    mutate(LOG_POP00 = log(POP00))%>%
  mutate(ALAND_SQMILE = ALAND/2.59e+6)%>%
  mutate(POP_DENSITY_10 = POP10/ALAND_SQMILE)%>%
 mutate(LOG_POP_DENSITY_10 = log(POP_DENSITY_10))%>%
  mutate(LOG_RES_REC_COLLCTN = log(PRES_RES_REC_COLLCTN))%>%
  mutate(LOG_RES_REC_TRMT = log(PRES_RES_REC_TRMT))

Focus on Delaware Watershed

Box Plots

dat2_focus_states <- filter(dat2, STATE %in% c("PA", "NJ", "NY", "DE"))

#Boxplots
ggplot(dat2_focus_states, 
      aes(x=STATE, y=TOTAL_OFFICIAL_NEED)) + 
  geom_boxplot(fill="seagreen3", alpha=0.2) + 
  geom_jitter(color="gray50", size=0.4, alpha=0.9) +
  ggtitle("Water Infrastructure Need: Delaware River Watershed") +
  xlab("STATE") + ylab("Official Need")

Box Plot with Log Transformed Variables

ggplot(dat2_focus_states, 
       aes(x=STATE, y=LOG_NEED)) + 
  geom_boxplot(fill="seagreen3", alpha=0.2) + 
  geom_jitter(color="gray50", size=0.4, alpha=0.9) +
  ggtitle("Water Infrastructure Need: Delaware River Watershed") +
  xlab("STATE") + ylab("Official Need (Log Adjusted)")

Density Plots

ggplot(dat2_focus_states, 
       aes(x=RES_BURDEN)) +
  geom_density(fill="seagreen3", color="seagreen1", alpha=0.7) +
  ggtitle("Residential Burden: Delaware River Watershed") +
  xlab("Residential Burden") + ylab("Density")

Density plots with Log Transformed Variables

#Density plots with Log Transformed Variables
ggplot(dat2_focus_states, 
       aes(x=LOG_RES_BURDEN)) +
  geom_density(fill="seagreen3", color="seagreen1", alpha=0.7) +
  ggtitle("Residential Burden: Delaware River Watershed") +
  xlab("Residential Burden (Log Adjusted)") + ylab("Density")

Focus on Philadelphia

Density Plots

dat2_PA <- filter(dat2, STATEFP == 42)


#Density plots
ggplot(dat2_PA, 
       aes(x=MEDINC09)) +
  geom_density(fill="darkgoldenrod1", color="darkgoldenrod2", alpha=0.7) +
  ggtitle("Median Income Distribution: Pennsylvania Counties") +
  xlab("Median Income (2009)") + ylab("Density")

Density Plots with Log Transformed Variables

#Density plots with Log Transformed Variables
ggplot(dat2_PA, 
       aes(x=LOG_MEDINC09)) +
  geom_density(fill="darkgoldenrod1", color="darkgoldenrod2", alpha=0.7) +
  ggtitle("Median Income Distribution: Pennsylvania Counties") +
  xlab("Median Income (2009, Log Adjusted)") + ylab("Density")

Bar Plots

Total Need

#Barplots: Total Need

dat2%>%
  filter(COUNTYFP == 101, STATEFP == 42)%>%
ggplot(aes(y=TOTAL_OFFICIAL_NEED, x=CWNS_NUMBER)) + 
  geom_bar(position="dodge", stat="identity", color="coral1", fill="coral1", alpha=0.6) +
  ggtitle("Total Official Need: Philadelphia Facilities") + 
  xlab("CWNS Facility") + ylab("Total Need")

Residential Burden

Log adjustment not required as variables are similar in value.

#Barplots: Res Burden

dat2%>%
  filter(COUNTYFP == 101, STATEFP == 42)%>%
ggplot(aes(y=RES_BURDEN, x=CWNS_NUMBER)) + 
  geom_bar(position="dodge", stat="identity", color="coral1", fill="coral1", alpha=0.6) +
  ggtitle("Residential Burden: Philadelphia Facilities") + 
  xlab("CWNS Facility") + ylab("Residential Burden")

##log adjustment not required as variables are similar in value

Pearson Correlation Tests

Total Population vs. Official Need

cor = 0.18, small correlation

cor.test(dat2$POP10, dat2$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat2$POP10 and dat2$TOTAL_OFFICIAL_NEED
## t = 16.355, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1547099 0.1961150
## sample estimates:
##       cor 
## 0.1754901

#cor = 0.18, small correlation

Population Density vs. Official Need

cor = 0.43, moderate correlation

cor.test(dat2$POP_DENSITY_10, dat2$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat2$POP_DENSITY_10 and dat2$TOTAL_OFFICIAL_NEED
## t = 43.501, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4108153 0.4456975
## sample estimates:
##      cor 
## 0.428416

#cor=0.43, variables exhibit a medium correlation

Median Income vs. Official Need

cor = 0.052, little to no correlation

cor.test(dat2$MEDINC09, dat2$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat2$MEDINC09 and dat2$TOTAL_OFFICIAL_NEED
## t = 4.8097, df = 8418, p-value = 0.000001538
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03102467 0.07362785
## sample estimates:
##        cor 
## 0.05235008

#cor 0.052, little to no correlation

Residential Burden vs. Official Need

cor = 0.33, moderate correlation

cor.test(dat2$RES_BURDEN, dat2$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat2$RES_BURDEN and dat2$TOTAL_OFFICIAL_NEED
## t = 32.449, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3143058 0.3522786
## sample estimates:
##       cor 
## 0.3334274

#cor 0.33, medium correlation

Present Residential Population Receiving Collection vs. Official Need

cor = 0.52, high correlation

cor.test(dat$PRES_RES_REC_COLLCTN, dat$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat$PRES_RES_REC_COLLCTN and dat$TOTAL_OFFICIAL_NEED
## t = 57.264, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5139253 0.5446732
## sample estimates:
##       cor 
## 0.5294731

#cor 0.52, high correlation

Present Residential Pop Receiving Treatment vs. Official Need

cor = 0.53, high correlation

#Need vs. Present Residential Pop Receiving Treatment
cor.test(dat$PRES_RES_REC_TRMT, dat$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat$PRES_RES_REC_TRMT and dat$TOTAL_OFFICIAL_NEED
## t = 57.507, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5155744 0.5462493
## sample estimates:
##       cor 
## 0.5310858

#cor 0.53, high correlation

Present Non-Residential Pop Receiving Collection vs. Official Need

cor = 0.44, moderate correlation

#Need vs. Present Non-Residential Pop Receiving Collection
cor.test(dat$PRES_N_RES_REC_COLLCTN, dat$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat$PRES_N_RES_REC_COLLCTN and dat$TOTAL_OFFICIAL_NEED
## t = 45.619, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4279247 0.4621801
## sample estimates:
##       cor 
## 0.4452153

#cor 0.44, medium correlation

Present Non-Residential Pop Receiving Treatment vs. Official Need

cor = 0.43, moderate correlation

#Need vs. Present Non-Residential Pop Receiving Treatment
cor.test(dat$PRES_N_RES_REC_TRTM, dat$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat$PRES_N_RES_REC_TRTM and dat$TOTAL_OFFICIAL_NEED
## t = 43.771, df = 8418, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4130163 0.4478193
## sample estimates:
##       cor 
## 0.4305779

#cor 0.43, medium correlation

Land Area vs. Official Need

cor = 0.003, almost no correlation

#Need vs. Land Area
cor.test(dat1$ALAND_SQMILE, dat1$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat1$ALAND_SQMILE and dat1$TOTAL_OFFICIAL_NEED
## t = 0.36351, df = 8418, p-value = 0.7162
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01739962  0.02531992
## sample estimates:
##         cor 
## 0.003961961

#cor 0.003, almost no correlation

Water Area vs. Official Need

cor = 0.02, almost no correlation

#Need vs. Water Area
cor.test(dat$AWATER, dat$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat$AWATER and dat$TOTAL_OFFICIAL_NEED
## t = 2.2582, df = 8418, p-value = 0.02396
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.003247044 0.045941400
## sample estimates:
##        cor 
## 0.02460544

#cor = 0.02, almost no correlation

Variables with Significant Correlation

– Population Density

– Residential Burden

– Present Residential Pop Receiving Collection/Treatment

– Present Non-Residential Pop Receiving Collection/Treatment

Dummy Variables

Based on the correlation tests, the highest correlation is exhibited between Official Need and Present Residential Pop Receiving Treatment. We chose to use the median value because the large range of observations within the data set can disproportionately skew the mean value.

###Construct dummy groups to classify facilities below median values

median_trmt <- median(dat1$PRES_RES_REC_TRMT)

##Growing or Shrinking Cities
###Based on pop density change mean value
mean_popdensity <- mean(dat1$CHANGE_POP_DENSITY)


##CSS
###Whether or not it's part of a CSS facility
##already a binary!

dat1$DUMMY_TRMT <- ifelse(dat1$PRES_RES_REC_TRMT >= 2759.5, 1, 0)
dat1$DUMMY_POPDENSITY <- ifelse(dat1$CHANGE_POP_DENSITY >= 24.43, 1, 0) 

DT::datatable(
  dat1,
  height = 200,
  options = list(scrollX = TRUE)
)

Part 2.2: Bivariate Regressions

From our correlation tests, we selected the following variables which exhibited a significant correlation with total need:

Population Density
Residential Burden
Present Residential Pop Receiving Treatment

Need vs. Population Density

Regression Plot

mod.pop.density <- lm(TOTAL_OFFICIAL_NEED ~ POP_DENSITY_10, data = dat1)
summary(mod.pop.density)

## 
## Call:
## lm(formula = TOTAL_OFFICIAL_NEED ~ POP_DENSITY_10, data = dat1)
## 
## Residuals:
##         Min          1Q      Median          3Q         Max 
## -1722672791   -10797745    -6312025    -3289469  4552683118 
## 
## Coefficients:
##                 Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)    5993102.0  1371913.1   4.368            0.0000127 ***
## POP_DENSITY_10   29325.2      674.1  43.501 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 121500000 on 8418 degrees of freedom
## Multiple R-squared:  0.1835, Adjusted R-squared:  0.1834 
## F-statistic:  1892 on 1 and 8418 DF,  p-value: < 0.00000000000000022

#R2 = 0.184; Adjusted R2 = 0.183 
##p values??

#Regression Plot

ggplot(dat1, aes(POP_DENSITY_10, TOTAL_OFFICIAL_NEED)) +
  geom_point(size=3, shape=20, color="orchid") +
  geom_smooth(method='lm', se=FALSE, color="grey40") +
  ggtitle("Regression: Need and Population Density") +
  xlab("Population Density") + ylab("Total Need")

Residual Plot

Funnel shaped plot indicates heteroscedasticity, which implies a non-constant variation in error terms, i.e. error terms increase as fitted value increases.

##Residual Plots
summary(mod.pop.density$residuals)

##        Min.     1st Qu.      Median        Mean     3rd Qu.        Max. 
## -1722672791   -10797745    -6312025           0    -3289469  4552683118

plot(fitted(mod.pop.density), mod.pop.density$residuals,
     main = "Fitted Model Against Residuals: Population Density",
     xlab = "Fitted Model", ylab = "Residuals",
     col = "orchid", pch = 16)
abline(0,0, col = "grey40")

Histogram of Regressed Population Density and Official Need

Residuals are mostly clustered around mean with a skew towards the left, evident from large negative values of min, 1st quartile, and median. Using log adjusted values can help cluster data closer to mean.

hist(mod.pop.density$residuals, breaks = 10,
     main = "Regression Residual Spread: Population Density",
     xlab = "Residuals",
     col = "orchid", border = "orchid3")

#mod_pop_density_predict <- predict(mod.pop.density)

#DT::datatable(
#  mod_pop_density_predict,
#  height = 100,
#  options = list(scrollX = TRUE)
#)

Log Official Need vs. Log Population Density

Removing infinite values from “log adjusted need” column.

Low R squared, as seen from large spread of data about the mode means that the model does not sufficiently account for variance.

##remove infinite values from "log adjusted need" column
dat2_reg <- dat2%>%
  filter(LOG_NEED > -Inf)

mod.log.pop.density <- lm(LOG_NEED ~ LOG_POP_DENSITY_10, data = dat2_reg)

summary(mod.log.pop.density)

## 
## Call:
## lm(formula = LOG_NEED ~ LOG_POP_DENSITY_10, data = dat2_reg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.8064 -1.1451 -0.0249  1.0973  6.1504 
## 
## Coefficients:
##                    Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)        12.84607    0.05523  232.58 <0.0000000000000002 ***
## LOG_POP_DENSITY_10  0.40626    0.01085   37.45 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.69 on 8261 degrees of freedom
## Multiple R-squared:  0.1451, Adjusted R-squared:  0.145 
## F-statistic:  1402 on 1 and 8261 DF,  p-value: < 0.00000000000000022

ggplot(dat2_reg, aes(LOG_POP_DENSITY_10, LOG_NEED)) +
  geom_point(size=3, shape=20, color="orchid") +
  geom_smooth(method='lm', se=FALSE, color="grey40") +
  ggtitle("Regression: Need and Population Density") +
  xlab("Population Density (Log Adjusted)") + ylab("Total Need (Log Adjusted)")

#R2 = 0.1451; Adjusted R2 = 0.145
##low R squared, as seen from large spread of data about the model
##means that the model does not sufficiently account for variance

Residual Plot of Log Population Density

Much better spread, residuals clustered closer to the mean in a more homoscedastic plot, closer to normal distribution. However, variance still increases as model values increase.

summary(mod.log.pop.density$residuals)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -7.80641 -1.14511 -0.02493  0.00000  1.09732  6.15044

#better, less negative skew from mean

plot(fitted(mod.log.pop.density), mod.log.pop.density$residuals,
     main = "Fitted Model Against Residuals: Log Adj. Population Density",
     xlab = "Fitted Model", ylab = "Residuals",
     col = "orchid", pch = 16)
abline(0,0, col = "grey40")

hist(mod.log.pop.density$residuals, breaks = 10,
     main = "Regression Residual Spread: Population Density",
     xlab = "Residuals",
     col = "orchid", border = "orchid3")

##much better spread, residuals clustered closer to the mean in a more homoscedastic plot, closer to normal distribution
##however, variance still increases as model values increase

Need vs. Residential Burden

Large range across mean values seen in summary of residual plot.

mod.res.burden <- lm(TOTAL_OFFICIAL_NEED ~ RES_BURDEN, data = dat2)
summary(mod.res.burden)

## 
## Call:
## lm(formula = TOTAL_OFFICIAL_NEED ~ RES_BURDEN, data = dat2)
## 
## Residuals:
##         Min          1Q      Median          3Q         Max 
## -1966254161   -12024541    -8159307    -6205059  4926111573 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  7884945    1444881   5.457         0.0000000498 ***
## RES_BURDEN     97748       3012  32.449 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 126800000 on 8418 degrees of freedom
## Multiple R-squared:  0.1112, Adjusted R-squared:  0.1111 
## F-statistic:  1053 on 1 and 8418 DF,  p-value: < 0.00000000000000022

#R2 = 0.11; Adjusted R2 = 0.11
##low R squared
##p values??

ggplot(dat2, aes(RES_BURDEN, TOTAL_OFFICIAL_NEED)) +
  geom_point(size=3, shape=20, color="orchid") +
  geom_smooth(method='lm', se=FALSE, color="grey40") +
    ggtitle("Regression: Need and Residential Burden") +
  xlab("Residential Burden") + ylab("Total Need")

#Residual Plots
summary(mod.res.burden$residuals)

##        Min.     1st Qu.      Median        Mean     3rd Qu.        Max. 
## -1966254161   -12024541    -8159307           0    -6205059  4926111573

##large range across mean values

Histogram of Residuals

Large range but maximum values clustered closer to mean, skewed toward the left of mean value.

hist(mod.res.burden$residuals, breaks = 10,
     main = "Regression Residual Spread: Residential Burden",
     xlab = "Residuals",
     col = "orchid", border = "orchid3")

##large range but maximum values clustered closer to mean, skewed toward the left of mean value

Heteroscedastic plot, exhibits high non-covariance as fitted values increase.

plot(fitted(mod.res.burden), mod.res.burden$residuals,
     main = "Fitted Model Against Residuals: Residential Burden",
     xlab = "Fitted Model", ylab = "Residuals",
     col = "orchid", pch = 16)
abline(0,0, col = "grey40")

##heteroscedastic plot, exhibits high non-covariance as fitted values increase

Need vs. Present Residential Treatment

Range far from mean on both sides.

mod.res.trmt <- lm(TOTAL_OFFICIAL_NEED ~ PRES_RES_REC_TRMT, data = dat)
summary(mod.res.trmt)

## 
## Call:
## lm(formula = TOTAL_OFFICIAL_NEED ~ PRES_RES_REC_TRMT, data = dat)
## 
## Residuals:
##         Min          1Q      Median          3Q         Max 
## -1981362007    -7599612    -5839460    -3348041  4666003598 
## 
## Coefficients:
##                     Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)       6109417.49 1270553.40   4.808           0.00000155 ***
## PRES_RES_REC_TRMT     670.21      11.65  57.507 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 113900000 on 8418 degrees of freedom
## Multiple R-squared:  0.2821, Adjusted R-squared:  0.282 
## F-statistic:  3307 on 1 and 8418 DF,  p-value: < 0.00000000000000022

#R2 = 0.282; Adjusted R2 = 0.282

ggplot(dat, aes(PRES_RES_REC_TRMT, TOTAL_OFFICIAL_NEED)) +
  geom_point(size=3, shape=20, color="orchid") +
  geom_smooth(method='lm', se=FALSE, color="grey40") +
  ggtitle("Regression: Need and Residential Treatment") +
  xlab("Residential Treatment") + ylab("Total Need")

#Residual Plots
summary(mod.res.trmt$residuals)

##        Min.     1st Qu.      Median        Mean     3rd Qu.        Max. 
## -1981362007    -7599612    -5839460           0    -3348041  4666003598

#range far from mean on both sides

Large range but maximum values clustered closer to mean.

hist(mod.res.trmt$residuals, breaks = 10,
     main = "Regression Residual Spread: Residential Treatment",
     xlab = "Residuals",
     col = "orchid", border = "orchid3")

##large range but maximum values clustered closer to mean

Heteroscedastic plot, high non-variance.

plot(fitted(mod.res.trmt), mod.res.trmt$residuals,
     main = "Fitted Model Against Residuals: Residential Collection",
     xlab = "Fitted Model", ylab = "Residuals",
     col = "orchid", pch = 16)
abline(0,0, col = "grey40")

##heteroscedastic plot, high non-variance

Log Need vs. Log Residential Treatment

Removed infinite values. Model still does not explain variance very well, but is much better than non-log-adjusted variables.

#LOG Need vs. LOG Residential Treatment

##remove infinite values
dat2_reg <- dat2%>%
  filter(LOG_NEED > -Inf)%>%
  filter(LOG_RES_REC_TRMT > -Inf)

mod.log.res.trmt <- lm(LOG_NEED ~ LOG_RES_REC_TRMT, data = dat2_reg)
summary(mod.log.res.trmt)

## 
## Call:
## lm(formula = LOG_NEED ~ LOG_RES_REC_TRMT, data = dat2_reg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.4591 -0.8723  0.0485  0.9241  9.5216 
## 
## Coefficients:
##                  Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)      9.610620   0.068825  139.64 <0.0000000000000002 ***
## LOG_RES_REC_TRMT 0.640192   0.008284   77.28 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.391 on 8240 degrees of freedom
## Multiple R-squared:  0.4202, Adjusted R-squared:  0.4201 
## F-statistic:  5972 on 1 and 8240 DF,  p-value: < 0.00000000000000022

#R2 = 0.42; Adjusted R2 = 0.42
##model still does not explain variance very well, but is much better than non-log-adjusted variables
##p-value?

Residual Plots

Less spread of range across mean.

#Residual Plots
summary(mod.log.res.trmt$residuals)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -8.45911 -0.87230  0.04848  0.00000  0.92411  9.52156

##less spread of range across mean

Tighter clustering around mean, almost exhibiting a normal distribution.

hist(mod.log.res.trmt$residuals, breaks = 10,
     main = "Regression Residual Spread: Log Adj. Residential Treatment",
     xlab = "Residuals",
     col = "orchid", border = "orchid3")

##tighter clustering around mean, almost exhibiting a normal distribution

Error terms definitely exhibit more homoscedasticity, with more variance towards the left of mean. However, terms are still spread along a wide range from the fitted model.

plot(fitted(mod.log.res.trmt), mod.log.res.trmt$residuals,
     main = "Fitted Model Against Residuals: Log Adj. Residential Treatment",
     xlab = "Fitted Model", ylab = "Residuals",
     col = "orchid", pch = 16)
abline(0,0, col = "grey40")

##error terms definitely exhibit more homoscedasticity, with more variance towards the left of mean
##however, terms are still spread along a wide range from the fitted model

#predict(mod.log.res.trmt)
#predict(mod.log.res.trmt, int = "c")
#predict(mod.log.res.trmt, int = "p")

Stargazer Outputs

We used stargazer to compare our regression models and classify statistical significance. The interpretation is as follows:

– Unit increase in pop density would result in an increase of $29,325.19 in total need.

– Unit increase in residential burden would result in an increase of $97,748.53 in total need.

– Unit increase in residential population receiving collection would result in an increase of $670.21 in total need.

– R-squared and Adjusted R-squared values are low for all, implying that the model does not explain variance very well. However, considering these regression models use real-world data sets capturing figures across a national scale which naturally contain large variances, values of R2 around 0.3 are usually considered good

– P-values of all coefficients are less than 0.001, implying that the model reports contains high statistical significance (99.99% confidence)

##compare regression results through a comparative stargazer table
stargazer(mod.pop.density, mod.res.burden, mod.res.trmt, type = "text", star.cutoffs = c(0.05, 0.01, 0.001))

## 
## ==================================================================================
##                                                Dependent variable:                
##                                 --------------------------------------------------
##                                                TOTAL_OFFICIAL_NEED                
##                                       (1)              (2)              (3)       
## ----------------------------------------------------------------------------------
## POP_DENSITY_10                   29,325.190***                                    
##                                    (674.121)                                      
##                                                                                   
## RES_BURDEN                                        97,748.530***                   
##                                                    (3,012.402)                    
##                                                                                   
## PRES_RES_REC_TRMT                                                    670.210***   
##                                                                       (11.654)    
##                                                                                   
## Constant                        5,993,102.000*** 7,884,945.000*** 6,109,417.000***
##                                 (1,371,913.000)  (1,444,881.000)  (1,270,553.000) 
##                                                                                   
## ----------------------------------------------------------------------------------
## Observations                         8,420            8,420            8,420      
## R2                                   0.184            0.111            0.282      
## Adjusted R2                          0.183            0.111            0.282      
## Residual Std. Error (df = 8418) 121,493,463.000  126,763,417.000  113,928,392.000 
## F Statistic (df = 1; 8418)        1,892.368***     1,052.918***     3,307.086***  
## ==================================================================================
## Note:                                                *p<0.05; **p<0.01; ***p<0.001

##unit increase in pop density would be accompanied by an increase of $29,325.19 in total need
##unit increase in residential burden --> increase of $97,748.53 in total need
##unit increase in residential population receiving collection would result in an increase of $670.21 in total need

We also compared regression models which used log-transformed data:

– R-squared values are lower for “log of population density”, but much higher for “log of population receiving treatment”, and residual plots (above) exhibit a better fit about the mean. This showed us that while log adjustment proves useful for clustering data with high min/max values, it doesn’t necessarily affect the predictive power of the regression model.

– P-values of all coefficients are less than 0.001, implying that the model contains a high statistical significance (99.99% confidence).

stargazer(mod.log.pop.density, mod.log.res.trmt, type = "text", star.cutoffs = c(0.05, 0.01, 0.001))

## 
## ===========================================================================
##                                       Dependent variable:                  
##                     -------------------------------------------------------
##                                            LOG_NEED                        
##                                 (1)                         (2)            
## ---------------------------------------------------------------------------
## LOG_POP_DENSITY_10           0.406***                                      
##                               (0.011)                                      
##                                                                            
## LOG_RES_REC_TRMT                                         0.640***          
##                                                           (0.008)          
##                                                                            
## Constant                     12.846***                   9.611***          
##                               (0.055)                     (0.069)          
##                                                                            
## ---------------------------------------------------------------------------
## Observations                   8,263                       8,242           
## R2                             0.145                       0.420           
## Adjusted R2                    0.145                       0.420           
## Residual Std. Error      1.690 (df = 8261)           1.391 (df = 8240)     
## F Statistic         1,402.444*** (df = 1; 8261) 5,972.063*** (df = 1; 8240)
## ===========================================================================
## Note:                                         *p<0.05; **p<0.01; ***p<0.001

##R-squared values are not necessarily better, but residual plots (above) do exhibit a better fit about the mean
##as such, log adjustment proves useful for clustering data with high min/max values, it doesn't necessarily affect the predictive power of the regression model

Dummy Regressions

Need vs. Treatment; CSS Presence Dummies

#Need vs. Treatment; CSS Presence Dummies
##THIS IS IMP!!!

dat_dummy_css0 <- filter(dat1, CSS == 0)
dat_dummy_css1 <- filter(dat1, CSS == 1)

cor = 0.60, high correlation

cor.test(dat_dummy_css0$PRES_RES_REC_TRMT, dat_dummy_css0$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat_dummy_css0$PRES_RES_REC_TRMT and dat_dummy_css0$TOTAL_OFFICIAL_NEED
## t = 66.159, df = 7722, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5870440 0.6155166
## sample estimates:
##       cor 
## 0.6014713

#cor = 0.60, high correlation

mod.poptrmt.css0 <- lm(TOTAL_OFFICIAL_NEED ~ PRES_RES_REC_TRMT, data = dat_dummy_css0)
summary(mod.poptrmt.css0)

## 
## Call:
## lm(formula = TOTAL_OFFICIAL_NEED ~ PRES_RES_REC_TRMT, data = dat_dummy_css0)
## 
## Residuals:
##         Min          1Q      Median          3Q         Max 
## -1325081109    -5209912    -4174949    -1529259  2893546542 
## 
## Coefficients:
##                      Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)       4729349.436  663193.452   7.131     0.00000000000109 ***
## PRES_RES_REC_TRMT     465.561       7.037  66.159 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57140000 on 7722 degrees of freedom
## Multiple R-squared:  0.3618, Adjusted R-squared:  0.3617 
## F-statistic:  4377 on 1 and 7722 DF,  p-value: < 0.00000000000000022

#R2 = 0.36; Adjusted R2 = 0.36

cor = 0.54, high correlation

cor.test(dat_dummy_css1$PRES_RES_REC_TRMT, dat_dummy_css1$TOTAL_OFFICIAL_NEED, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  dat_dummy_css1$PRES_RES_REC_TRMT and dat_dummy_css1$TOTAL_OFFICIAL_NEED
## t = 17.347, df = 694, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4959223 0.5997712
## sample estimates:
##      cor 
## 0.549969

#cor = 0.54, high correlation

mod.poptrmt.css1 <- lm(TOTAL_OFFICIAL_NEED ~ PRES_RES_REC_TRMT, data = dat_dummy_css1)
summary(mod.poptrmt.css1)

## 
## Call:
## lm(formula = TOTAL_OFFICIAL_NEED ~ PRES_RES_REC_TRMT, data = dat_dummy_css1)
## 
## Residuals:
##         Min          1Q      Median          3Q         Max 
## -1642329689   -45039862   -34361916   -26278668  4341723558 
## 
## Coefficients:
##                      Estimate  Std. Error t value            Pr(>|t|)    
## (Intercept)       33599689.94 13211573.89   2.543              0.0112 *  
## PRES_RES_REC_TRMT     1077.82       62.13  17.347 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 326900000 on 694 degrees of freedom
## Multiple R-squared:  0.3025, Adjusted R-squared:  0.3015 
## F-statistic: 300.9 on 1 and 694 DF,  p-value: < 0.00000000000000022

#R2 = 0.30; Adjusted R2 = 0.30

For facilities without CSS:

– unit increase in residential population receiving treatment results in a $465.56 increase in total need

Comparatively, for facilities with CSS:

– unit increase in residential population receiving treatment results in a $1077.82 increase in total need

stargazer(mod.poptrmt.css0, mod.poptrmt.css1, type = "text", star.cutoffs = c(0.05, 0.01, 0.001))

## 
## ==========================================================================
##                                      Dependent variable:                  
##                     ------------------------------------------------------
##                                      TOTAL_OFFICIAL_NEED                  
##                                 (1)                        (2)            
## --------------------------------------------------------------------------
## PRES_RES_REC_TRMT           465.561***                 1,077.820***       
##                               (7.037)                    (62.131)         
##                                                                           
## Constant                 4,729,349.000***            33,599,690.000*      
##                            (663,193.500)             (13,211,574.000)     
##                                                                           
## --------------------------------------------------------------------------
## Observations                   7,724                       696            
## R2                             0.362                      0.302           
## Adjusted R2                    0.362                      0.301           
## Residual Std. Error 57,140,327.000 (df = 7722)  326,930,190.000 (df = 694)
## F Statistic         4,377.043*** (df = 1; 7722)  300.933*** (df = 1; 694) 
## ==========================================================================
## Note:                                        *p<0.05; **p<0.01; ***p<0.001

##for facilities without CSS:
###unit increase in population density results in a $2,380 increase in total need
###unit increase in residential population receiving treatment results in a $461 increase in total need
##comparatively, for facilities with CSS:
#####unit increase in population density results in a $20,477 increase in total need
###unit increase in residential population receiving treatment results in a $833 increase in total need

This implies a 132% difference in need, per unit of increase in residential population receiving treatment, for variables with CSS facilities!

((20477 - 2380)/(2380))*100

## [1] 760.3782

##implies a +760 percent difference per unit of population density increase, for variables with CSS facilities
##!!!!!!!!!!!!!!!!

###observations are statistically significant with reasonably high R2 values

Need vs. Treatment: Mean Treatment Threshold Dummies

Too low, no point in looking much further.

dat_dummy_trmt0 <- filter(dat1, DUMMY_TRMT == 0)

dat_dummy_trmt1 <- filter(dat1, DUMMY_TRMT == 1)%>%
  mutate(RES_BURDEN = TOTAL_OFFICIAL_NEED/POP10)

mod.income.dummytrmt0 <- lm(TOTAL_OFFICIAL_NEED ~ MEDINC09 + PRES_RES_REC_TRMT, data = dat_dummy_trmt0)
summary(mod.income.dummytrmt0)

## 
## Call:
## lm(formula = TOTAL_OFFICIAL_NEED ~ MEDINC09 + PRES_RES_REC_TRMT, 
##     data = dat_dummy_trmt0)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
##  -7358465  -2095107  -1148444     85561 268672717 
## 
## Coefficients:
##                       Estimate   Std. Error t value         Pr(>|t|)    
## (Intercept)       -2056471.918   559390.560  -3.676          0.00024 ***
## MEDINC09                65.608        9.521   6.891 0.00000000000636 ***
## PRES_RES_REC_TRMT     1162.541      170.618   6.814 0.00000000001086 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8117000 on 4207 degrees of freedom
## Multiple R-squared:  0.02308,    Adjusted R-squared:  0.02261 
## F-statistic: 49.69 on 2 and 4207 DF,  p-value: < 0.00000000000000022

#R2 = 0.023; Adjusted R2 = 0.023
##too low, no point looking much further

Create a multiple linear regression with variables exhibiting high correlations. Controlled for dummy variable of “residential population receiving treatment > median of res pop receiving treatment”

The values are reasonable, plot and look at residuals.

##create a multiple linear regression with variables exhibiting high correlations
##controlled for dummy variable of "residential population receiving treatment > of res pop receiving treatment"
mod.dummytrmt1 <- lm(TOTAL_OFFICIAL_NEED ~ POP_DENSITY_10 + RES_BURDEN + PRES_RES_REC_TRMT, data = dat_dummy_trmt1)
summary(mod.dummytrmt1)

## 
## Call:
## lm(formula = TOTAL_OFFICIAL_NEED ~ POP_DENSITY_10 + RES_BURDEN + 
##     PRES_RES_REC_TRMT, data = dat_dummy_trmt1)
## 
## Residuals:
##         Min          1Q      Median          3Q         Max 
## -1830743422   -15644256     4978917    12951099  4141397389 
## 
## Coefficients:
##                       Estimate   Std. Error t value             Pr(>|t|)    
## (Intercept)       -19571664.88   2408965.46  -8.125 0.000000000000000585 ***
## POP_DENSITY_10        21590.08       844.40  25.569 < 0.0000000000000002 ***
## RES_BURDEN            92444.79      3606.06  25.636 < 0.0000000000000002 ***
## PRES_RES_REC_TRMT       524.15        15.28  34.307 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 140300000 on 4206 degrees of freedom
## Multiple R-squared:  0.4439, Adjusted R-squared:  0.4435 
## F-statistic:  1119 on 3 and 4206 DF,  p-value: < 0.00000000000000022

#R2 = 0.44; Adjusted R2 = 0.44
##reasonable values, we can plot and look at residuals

Higher spread from the mean.

ggplot(dat_dummy_trmt1, aes(PRES_RES_REC_TRMT + RES_BURDEN + POP_DENSITY_10, TOTAL_OFFICIAL_NEED)) +
  geom_point(size=3, shape=20, color="orchid") +
  geom_smooth(method='lm', se=FALSE, color="grey40") +
  ggtitle("Multiple Regression with Dummies") +
  xlab("Res Trmt + Pop Density + Res Burden") + ylab("Total Need")

summary(mod.dummytrmt1$residuals)

##        Min.     1st Qu.      Median        Mean     3rd Qu.        Max. 
## -1830743422   -15644256     4978917           0    12951099  4141397389

##high spread from mean

Still a heteroscedastic plot. Log transforming would help.

plot(fitted(mod.dummytrmt1), mod.dummytrmt1$residuals,
     main = "Fitted Model Against Residuals: Residential Trmt with Dummies",
     xlab = "Fitted Model", ylab = "Residuals",
     col = "orchid", pch = 16)
abline(0,0, col = "grey40")

##still a heteroscedastic plot
##log transforming would help, but we're going to sleep instead
##see you at part 2.3!

Part 2.3 - Multivariate Regressions

COMING SOON 🥵

CPLN 505: Assignment 2

Riddhi Batra & Emily Goldstein

3 March 2022

Memorandum

Part 1: Comparing Groups and Testing for Association & Correlation

Part 1.1: Contextualizing Reported Need

Part 1.2: Summary Statistics

National Statistics for USA

Group by EPA Regions

Group by State

Focus States: NY, NJ, PA, DE

Pennsylvania Counties

Philadelphia

Part 1.3: Test of Association

Overall Budgeted Need for Facilites with CSS vs. MS4s

Difference in Facility Need Based on TMDL

Significance in Need by Region

Variables Hypothesized to be Associated with Infrastructure Need

Part 1.4: Pearson Correlation Tests

Part 2: Regression

Part 2.1: Preliminiary Tests

Focus on Delaware Watershed

Focus on Philadelphia

Pearson Correlation Tests

Dummy Variables

Part 2.2: Bivariate Regressions

Need vs. Population Density

Need vs. Residential Burden

Need vs. Present Residential Treatment

Stargazer Outputs

Dummy Regressions

Part 2.3 - Multivariate Regressions