This dataset includes sewer outflow condition from each county in New York State with different demographic data such as educational status and estimated obesity counts. In this assignment, I firstly created two pivot tables to understand the overall picture of the dataset by chooding the variables I will be using in the plots.
I created a table with counties as the main columns and with permitted outflow and demographic dataset to see the basic differences between counties in the NY State.
df %>%
tab_cells('Number of Permitted Outfalls' = df$`Number.of.Permitted.Outfalls`,
'High School Grad'= df$`TotalHighSchGrad`,
'Total College/Associated'= df$`TotalSomeCollegeOrAssociate`,
'Bachelor or Higher'= df$`Total_BachelorOrHigher`,
'Est Count Obese'= df$`Est_Count_Obese`) %>%
tab_cols(County,total()) %>%
tab_stat_median() %>% tab_pivot() %>%
set_caption("Statisctics of Sewer Outflows per County") %>%
htmlTable(.,
css.cell = c("width: 250",
rep("width:70", ncol(.) - 1)))
| Statisctics of Sewer Outflows per County | |||||||||||||||||||||||||||||||||||||||
|  County |  #Total | ||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|  Albany |  Bronx |  Broome |  Cayuga |  Chautauqua |  Chemung |  Clinton |  Cols |  Columbia |  Dutchess |  Erie |  Essex |  Franklin |  Greene |  Herkimer |  Jefferson |  Kings |  Madison |  Monroe |  Montgomery |  Niagara |  Oneida |  Onondaga |  Orange |  Orleans |  Oswego |  Q:X |  Queens |  Rensselaer |  Richmond |  Saratoga |  Ulster |  Warren |  Washington |  Westchester |  Y:AC |  | |||
|  Number of Permitted Outfalls | |||||||||||||||||||||||||||||||||||||||
|    Median | 12 | 33 | 9 | 16 | 1 | 11 | 11 | 8 | 6 | 52 | 2 | 2 | 6 | 3 | 16 | 79 | 1 | 6 | 4 | 9 | 47 | 62 | 13 | 13 | 6 | 44 | 49 | 36 | 4 | 13 | 1 | 11 | 4 |  | 33 | ||||
|  High School Grad | |||||||||||||||||||||||||||||||||||||||
|    Median | 10996 | 42371 | 6977 | 2188 | 3733 | 3087 | 3295 | 1493 | 9600 | 24840 | 1116 | 1947 | 1870 | 1766 | 7210 | 62380 | 2886 | 19677 | 1403 | 5821 | 5951 | 11059 | 12725 | 1333 | 3724 | 48475 | 5244 | 10689 | 4521 | 4882 | 1480 | 2054 | 21973 |  | 11059 | ||||
|  Total College/Associated | |||||||||||||||||||||||||||||||||||||||
|    Median | 22975 | 70140 | 14368 | 2993 | 7040 | 3110 | 5716 | 1635 | 16395 | 42971 | 1056 | 1834 | 1288 | 2752 | 5773 | 98458 | 5680 | 40056 | 1609 | 8850 | 11008 | 26851 | 18665 | 1343 | 8111 | 96136 | 7965 | 19827 | 9473 | 9050 | 2423 | 1787 | 37837 |  | 26851 | ||||
|  Total Bachelor or Higher | |||||||||||||||||||||||||||||||||||||||
|    Median | 8686 | 12449 | 3789 | 425 | 923 | 401 | 1113 | 584 | 3578 | 13954 | 277 | 283 | 205 | 312 | 858 | 42466 | 609 | 10717 | 312 | 2200 | 2432 | 6724 | 4183 | 164 | 845 | 31892 | 2365 | 6882 | 2616 | 1799 | 434 | 329 | 16302 |  | 6882 | ||||
|  Est Count Obese | |||||||||||||||||||||||||||||||||||||||
|    Median | 135384 | 659701 | 77727 | 39274 | 62623 | 40132 | 43489 | 29671 | 134689 | 436528 | 20317 | 25783 | 23323 | 30622 | 56361 | 1030497 | 35194 | 363611 | 24712 | 101676 | 113786 | 217439 | 174116 | 20485 | 57840 | 932363 | 76996 | 198353 | 107124 | 90721 | 30605 | 33260 | 383940 |  | 217439 | ||||
The second table I created is based on three different type of seweage outflow facility with the counts of permitted and actual events and other demographic data.
df %>%
tab_cells('Number of Permitted Outfalls' = df$`Number.of.Permitted.Outfalls`,
'Event Count' = df$`Event.Count`,
'High School Grad'= df$`TotalHighSchGrad`,
'Total College/Associated'= df$`TotalSomeCollegeOrAssociate`,
'Bachelor or Higher'= df$`Total_BachelorOrHigher`,
'Est Count Obese'= df$`Est_Count_Obese`) %>%
tab_cols('Discharge Activation Type' = df$`Discharge.Activation.Type` ,total()) %>%
tab_stat_median() %>% tab_pivot() %>%
set_caption("Statisctics of Sewer Outflows per Discharge Activation Type") %>%
htmlTable(.,
css.cell = c("width: 250",
rep("width:70", ncol(.) - 1)))
| Statisctics of Sewer Outflows per Discharge Activation Type | ||||||
|  Discharge Activation Typ |  #Total | |||||
|---|---|---|---|---|---|---|
|  Model |  Monitoring |  Observation |  | |||
|  Number of Permitted Outfalls | ||||||
|    Median | 44 | 16 | 12 |  | 33 | |
|  Event Count | ||||||
|    Median | 46706 | 24287 | 13283 |  | 20476 | |
|  High School Grad | ||||||
|    Median | 24840 | 12725 | 5821 |  | 11059 | |
|  Total College/Associated | ||||||
|    Median | 42971 | 19827 | 8850 |  | 26851 | |
|  Total Bachelor or Higher | ||||||
|    Median | 12449 | 6882 | 2200 |  | 6882 | |
|  Est Count Obese | ||||||
|    Median | 436528 | 198353 | 101676 |  | 217439 | |
As we can see in the graph below, with the largest base, no prise that model’s event count is way more than the other two types of the facilities.
library(ggplot2)
g<-ggplot(df, aes(Discharge.Activation.Type,Event.Count))+
geom_bar(stat = "identity", color = "pink")+
ggtitle("Overflow Event Count per Type")+ xlab("Discharge Activation Type") + ylab("Event Count") +
theme_grey()
g
## Warning: Removed 4 rows containing missing values (position_stack).
## Relationship between number of permitted outfalls and outfall event counts The second chart shows us the number of permitted outfalls and the actual outfall events. Although there’re some outliers with low permitted number but high outfall numbers, from the lm line we can tell that they’re basically showing a positive relationship.
g2<-ggplot(df, aes(Number.of.Permitted.Outfalls,Event.Count))+
geom_point(alpha = "0.5", color = "red")+
geom_smooth(method ="lm")+
coord_cartesian() + scale_color_gradient() +
ggtitle("Relationship between number of permitted outfalls and outfall event counts")+ xlab("Number of Permitted Outfalls") + ylab("Event Count") +
theme_grey()
g2
## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
With demographic information included, I was curious to see if those variables are big influencers to the outfalls number.
#obesity
g3<-ggplot(df, aes(Est_Count_Obese,Event.Count))+
geom_point(alpha = "0.5", color = "green")+
geom_smooth(method ="lm")+
coord_cartesian() + scale_color_gradient() +
ggtitle("Relationship between outfall event counts and estimated obesity population")+ xlab("Estimiated Obesity Population") + ylab("Event Count") +
theme_grey()
g3 + annotate("text", x = 250000, y = 100000, label = "positive relationship")
## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
#education: highschool
df$percentage_HS <- df$TotalHighSchGrad/df$PopAged18to24
g4<-ggplot(df, aes(percentage_HS,Event.Count))+
geom_point(alpha = "0.5", color = "green")+
geom_smooth(method ="lm")+
coord_cartesian() + scale_color_gradient() +
ggtitle("Hign Shcool Graduate")+ xlab("Bachelor or Higher") + ylab("Event Count") +
theme_grey()
g4
## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
#education: bachelor or higher
df$percentage_BH <- df$Total_BachelorOrHigher/df$PopAged18to24
g5<-ggplot(df, aes(percentage_BH,Event.Count))+
geom_point(alpha = "0.5", color = "green")+
geom_smooth(method ="lm")+
coord_cartesian() + scale_color_gradient() +
ggtitle("Bachelor or Higher")+ xlab("Bachelor or Higher") + ylab("Event Count") +
theme_grey()
g5
## Don't know how to automatically pick scale for object of type labelled/integer. Defaulting to continuous.
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).