Check out my rpubs site for other scripts: https://rpubs.com/deyvis305/

Welcome! This script is an exploratory data analysis demo that analyzes a dataset from NYC Open Data on recycling rates. The data was presented as part of a project at COOP. The presentation can be found here.

Load the data.

## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Convert the data types of the grouping columns for later use.

# Treat Zone and Fiscal.Year as categorical variables: both columns are
# converted to factors in place so later plots can group and colour by them.
recycling.dcr[c("Zone", "Fiscal.Year")] <- lapply(
  recycling.dcr[c("Zone", "Fiscal.Year")],
  as.factor
)

Diversion Rates

Plot the Diversion rates for all years for each borough.

library(ggplot2)
# Diversion rate 2016-2019, all boroughs: one horizontal boxplot per Zone with
# the individual observations jittered on top.
# Original author's note: combine Queens West, Queens East; combine Brooklyn
# South and North (whether that merge happens upstream is not visible here).
ggplot(
  data = recycling.dcr,
  aes(
    x = Zone,
    y = Diversion.Rate.Total..Total.Recycling...Total.Waste.
  )
) +
  # Constant black outline. This replaces mapping colour to Zone and then
  # flattening it with a five-entry all-black manual scale, which errors as
  # soon as Zone has more than five levels.
  geom_boxplot(color = "black", lwd = 1) +
  # Raw observations, spread horizontally within each Zone band.
  geom_jitter(alpha = 0.1, width = 0.2, color = "tan") +
  labs(
    x = "Zone", y = "Diversion Rate",
    title = "Total recycling rate for each borough from 2016 to 2019"
  ) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  # Zones on the vertical axis so the boxes run left to right.
  coord_flip()

Break down the plots to show all years.

# Boxplots of the diversion rate per Zone, with one box per fiscal year
# (outline colour distinguishes the years).
ggplot(
  recycling.dcr,
  mapping = aes(
    x = Zone,
    y = Diversion.Rate.Total..Total.Recycling...Total.Waste.
  )
) +
  geom_boxplot(mapping = aes(colour = Fiscal.Year)) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Zone") +
  ylab("Diversion Rate") +
  ggtitle("Total recycling rates for each borough from 2016 to 2019")

# Pull the columns needed for the year-by-year view into short-named vectors
# (they remain in the workspace) and bundle them into a small data frame.
FiscalYear <- recycling.dcr[["Fiscal.Year"]]
DiversionRateTotal <-
  recycling.dcr[["Diversion.Rate.Total..Total.Recycling...Total.Waste."]]
Zone <- recycling.dcr[["Zone"]]
r.data <- data.frame(
  FiscalYear = FiscalYear,
  DiversionRateTotal = DiversionRateTotal,
  Zone = Zone
)

# Jittered scatter of diversion rate against fiscal year, coloured by Zone.
ggplot(r.data, mapping = aes(x = FiscalYear, y = DiversionRateTotal)) +
  geom_jitter(mapping = aes(colour = Zone), alpha = 0.5) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "Total recycling for each borough from 2016 to 2019",
    x = "Fiscal Year",
    y = "Diversion Rate"
  )

Separate by Zone and add a regression line.

# as.double() on a factor returns its level codes, so FiscalYear becomes
# 1, 2, 3, ... in level order. The x-axis label below spells out the
# code-to-year mapping. A numeric x is what lets geom_smooth() fit a line.
r.data[["FiscalYear"]] <- as.double(r.data[["FiscalYear"]])

# One panel per Zone: faint jittered observations plus a least-squares trend
# line without a confidence band.
ggplot(r.data, mapping = aes(x = FiscalYear, y = DiversionRateTotal)) +
  geom_jitter(alpha = 0.1) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ Zone, nrow = 1) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  ) +
  labs(
    title = "Total recycling for each borough from 2016 to 2019",
    x = "1=2016   2=2017   3=2018   4=2019",
    y = "Diversion Rate"
  )
## `geom_smooth()` using formula = 'y ~ x'

Which districts are the worst at diversion?

# ALL DISTRICTS
# Diversion rate per district, one box per fiscal year. District labels are
# rotated so they stay legible along the x axis.
abc1 <- ggplot(
  recycling.dcr,
  mapping = aes(
    x = District,
    y = Diversion.Rate.Total..Total.Recycling...Total.Waste.
  )
) +
  geom_boxplot(mapping = aes(colour = Fiscal.Year)) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 70, vjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  ) +
  labs(
    title = "Total recycling rates for each NYC district from 2016 to 2019",
    y = "Diversion Rate"
  )

# Interactive version: click on the color squares to exclude/include a year.
ggplotly(abc1)

Capture Rates

# Capture rate 2016-2019, all boroughs: one horizontal boxplot per Zone with
# the individual observations jittered on top.
# Original author's note: combine Queens West, Queens East; combine Brooklyn
# South and North (whether that merge happens upstream is not visible here).
ggplot(
  data = recycling.dcr,
  aes(
    x = Zone,
    y = Capture.Rate.Total...Total.Recycling...Leaves..Recycling......Max.Paper...Max.MGP..x100
  )
) +
  # Constant black outline. This replaces mapping colour to Zone and then
  # flattening it with a five-entry all-black manual scale, which errors as
  # soon as Zone has more than five levels.
  geom_boxplot(color = "black", lwd = 1) +
  # Raw observations, spread horizontally within each Zone band.
  geom_jitter(alpha = 0.1, width = 0.2, color = "paleturquoise4") +
  labs(
    x = "Zone", y = "Capture Rate Total",
    title = "Total capture rate for each borough from 2016 to 2019"
  ) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  # Zones on the vertical axis so the boxes run left to right.
  coord_flip()

# Boxplots of the capture rate per Zone, with one box per fiscal year
# (outline colour distinguishes the years).
ggplot(
  recycling.dcr,
  mapping = aes(
    x = Zone,
    y = Capture.Rate.Total...Total.Recycling...Leaves..Recycling......Max.Paper...Max.MGP..x100
  )
) +
  geom_boxplot(mapping = aes(colour = Fiscal.Year)) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Zone") +
  ylab("Capture Rate") +
  ggtitle("Total capture rates for each borough from 2016 to 2019")

# Capture-rate counterpart of r.data: the capture-rate column alongside the
# FiscalYear and Zone vectors extracted earlier in the script.
CaptureRateTotal <- recycling.dcr$Capture.Rate.Total...Total.Recycling...Leaves..Recycling......Max.Paper...Max.MGP..x100
r.data2 <- data.frame(FiscalYear, CaptureRateTotal, Zone)

# Jittered scatter of capture rate against fiscal year, coloured by Zone.
# The plot must read from r.data2. r.data has no CaptureRateTotal column (y
# would silently fall back to the workspace vector), and its FiscalYear has
# already been turned into numeric level codes, so the "Fiscal Year" axis
# would show 1-4 instead of the years.
ggplot(
  data = r.data2,
  aes(
    x = FiscalYear,
    y = CaptureRateTotal
  )
) +
  geom_jitter(aes(color = Zone), alpha = 0.5) +
  labs(
    x = "Fiscal Year", y = "Capture Rate",
    title = "Total capture rates for each borough from 2016 to 2019"
  ) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

Separate by Zone and add a regression line.

# as.double() on a factor returns its level codes, so FiscalYear becomes
# 1, 2, 3, ... in level order. The x-axis label below spells out the
# code-to-year mapping. A numeric x is what lets geom_smooth() fit a line.
r.data2[["FiscalYear"]] <- as.double(r.data2[["FiscalYear"]])

# One panel per Zone: faint jittered observations plus a least-squares trend
# line without a confidence band.
ggplot(r.data2, mapping = aes(x = FiscalYear, y = CaptureRateTotal)) +
  geom_jitter(alpha = 0.1) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ Zone, nrow = 1) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  ) +
  labs(
    title = "Total capture rate for each borough from 2016 to 2019",
    x = "1=2016   2=2017   3=2018   4=2019",
    y = "Capture Rate"
  )
## `geom_smooth()` using formula = 'y ~ x'

Which districts are the worst at capture?

# ALL DISTRICTS
# Capture rate per district, one box per fiscal year. District labels are
# rotated so they stay legible along the x axis.
abc2 <- ggplot(
  recycling.dcr,
  mapping = aes(
    x = District,
    y = Capture.Rate.Total...Total.Recycling...Leaves..Recycling......Max.Paper...Max.MGP..x100
  )
) +
  geom_boxplot(mapping = aes(colour = Fiscal.Year)) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 70, vjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  ) +
  labs(
    title = "Total capture rates for each NYC district from 2016 to 2019",
    y = "Capture Rate"
  )

# Interactive version: click on the color squares to exclude/include a year.
ggplotly(abc2)

Relationship between diversion and capture rates

### What is the relationship between capture rate (cr) and diversion rate (dr)?
# Scatter plot with a linear-model fit per fiscal year, one panel per Zone.
ggplot(
  recycling.dcr,
  mapping = aes(
    x = Diversion.Rate.Total..Total.Recycling...Total.Waste.,
    y = Capture.Rate.Total...Total.Recycling...Leaves..Recycling......Max.Paper...Max.MGP..x100
  )
) +
  # Faint points; colour encodes the year and shape encodes the Zone.
  geom_point(mapping = aes(colour = Fiscal.Year, shape = Zone), alpha = 0.1) +
  # Separate regression line (with its default confidence band) for each year.
  geom_smooth(mapping = aes(colour = Fiscal.Year), method = "lm") +
  facet_wrap(~ Zone, nrow = 1) +
  theme_classic() +
  labs(
    title = "Relationship between diversion and capture rates for all zones grouped by fiscal year",
    x = "diversion rate",
    y = "capture rate"
  )
## `geom_smooth()` using formula = 'y ~ x'

Something interesting about the data is that there seem to be two different clusters for Manhattan! The association between diversion and capture rates is positive, except for Staten Island during fiscal years 2018 and 2019, where it was negative!

THE END