Municipal waste management is crucial for sustainable urban development. Efficient waste management can lead to economic savings, reduced environmental impact, and improved public health. With its diverse regions and varied urbanization levels, the target country provides an ideal case study to explore the factors influencing the efficiency of waste management.
Column descriptions:

| Variable | Label |
|---|---|
| region | Region |
| province | Province |
| name | Name of municipality |
| tc | Cost per capita (EUR) |
| cres | Residual cost per capita |
| csor | Sorted cost per capita |
| istat | National code |
| area | Area (km2) |
| pop | Population |
| alt | Altitude (m.s.l.) |
| isle | Dummy: municipality on an island |
| sea | Dummy: coastal municipality |
| pden | Population density (people per km2) |
| wden | Waste per km2 |
| urb | Urbanization index (1 low, 3 high) |
| fee | Fee scheme |
| d_fee | Dummy: PAYT |
| sample | Reg with PAYT |
| organic | Organic % |
| paper | Paper % |
| glass | Glass % |
| wood | Wood % |
| metal | Metal % |
| plastic | Plastic % |
| raee | RAEE % |
| texile | Textile % |
| other | Other % |
| msw_so | MSW sorted (kg) |
| msw_un | MSW unsorted (kg) |
| msw | Municipal solid waste (kg) |
| sor | Share of sorted waste |
| geo | 1 South, 2 Center, 3 North |
| roads | Km of roads within the municipality |
| s_wteregio | Share of sw sent to W2E plants (regional figure) |
| s_landfill | Share of waste sent to landfill |
| gdp | Municipal revenues EUR (p) - log |
| proads | People per km of roads (log) |
| wage | Taxable income EUR (p) - log |
| finance | Municipal revenues EUR (p) - log |
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the municipal waste-fee data set.
# The CSV is read through an explicit path instead of calling setwd(), so the
# session's working directory is left untouched.
data_dir <- "/Users/minervasingh/Documents/Waste"
df <- read.csv(file.path(data_dir, "public_data_waste_fee.csv"))
# First rows, as a quick sanity check of the import
head(df)
# Understand the structure of the data
str(df)
## 'data.frame': 4341 obs. of 39 variables:
## $ region : chr "ER" "ER" "ER" "ER" ...
## $ province : chr "FRRA" "FRRA" "FRRA" "FRRA" ...
## $ name : chr "Comacchio" "Lagosanto" "Goro" "Mesola" ...
## $ tc : num 502 228 268 199 234 ...
## $ cres : num 129.3 49.5 50.6 41.1 58.3 ...
## $ csor : num 66.4 44.1 44.6 40.4 26 ...
## $ istat : int 38006 38011 38025 38014 110005 38010 38030 58120 27005 8050 ...
## $ area : num 283.8 34.4 26.6 84.3 35.7 ...
## $ pop : int 22648 4952 3895 7140 12193 3003 7364 67626 11793 2861 ...
## $ alt : int 1 1 1 1 1 1 1 1 1 2 ...
## $ isle : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sea : int 1 1 1 1 1 0 0 1 1 1 ...
## $ pden : num 79.8 143.8 146.3 84.7 341.5 ...
## $ wden : num 119671 70031 81117 43320 201565 ...
## $ urb : int 2 3 3 3 2 3 3 2 2 2 ...
## $ fee : chr "PAYT" "PAYT" "PAYT" "PAYT" ...
## $ d_fee : int 1 1 1 1 0 1 1 0 0 0 ...
## $ sample : int 1 1 1 1 0 1 1 0 1 0 ...
## $ organic : num NA 35.041 37.377 45.31 0.428 ...
## $ paper : num 4.36 9.89 11.99 9.76 6.6 ...
## $ glass : num 3.59 9.52 6.65 7.55 4.33 ...
## $ wood : num 2.27 4.00 1.32e-05 1.49e-01 2.30 ...
## $ metal : num 0.462 1.861 0.745 0.747 0.103 ...
## $ plASTIc : num 1.13 4.64 5.22 5.2 5.12 ...
## $ raee : num 0.346 1.609 NA NA 0.275 ...
## $ texile : num 0.112 0.351 0.449 0.4 0.287 ...
## $ other : num 3.2 9.02 16.04 9.77 4.05 ...
## $ msw_so : num 20396261 1831407 1694922 2881055 3026700 ...
## $ msw_un : int 13560520 580460 464400 770860 4169180 349620 556540 7895520 5659520 296480 ...
## $ msw : int 33956781 2411867 2159322 3651915 7195880 1682628 3336429 33435410 15175582 1553789 ...
## $ sor : num 60.1 75.9 78.5 78.9 42.1 ...
## $ geo : num 3 3 3 3 1 3 NA 2 3 3 ...
## $ roads : num 285 11 49 165 60 65 NA 329 77 17 ...
## $ s_wteregio: num 33.11 33.11 33.11 33.11 4.05 ...
## $ s_landfill: num 15.2 15.2 15.2 15.2 45.4 ...
## $ gdp : num 7.27 7.11 7.27 7.09 7.25 ...
## $ proads : num 4.35 6.08 4.34 3.71 5.27 ...
## $ wage : num 9.44 9.51 8.89 9.43 9.13 ...
## $ finance : num 7.49 7.32 7.49 7.3 7.46 ...
# Per-column summary statistics; numeric columns also report their NA counts
summary(df)
## region province name tc
## Length:4341 Length:4341 Length:4341 Min. : 25.69
## Class :character Class :character Class :character 1st Qu.:108.04
## Mode :character Mode :character Mode :character Median :136.62
## Mean :154.24
## 3rd Qu.:179.16
## Max. :977.42
##
## cres csor istat area
## Min. : 4.27 Min. : 3.39 Min. : 1272 Min. : 0.12
## 1st Qu.: 27.34 1st Qu.: 31.25 1st Qu.: 18135 1st Qu.: 10.85
## Median : 41.69 Median : 48.88 Median : 42015 Median : 22.73
## Mean : 54.18 Mean : 52.68 Mean : 47470 Mean : 40.99
## 3rd Qu.: 66.49 3rd Qu.: 66.44 3rd Qu.: 70049 3rd Qu.: 47.49
## Max. :670.32 Max. :582.16 Max. :111107 Max. :1287.39
## NA's :52 NA's :67 NA's :6
## pop alt isle sea
## Min. : 34 Min. : 1.0 Min. :0.000000 Min. :0.0000
## 1st Qu.: 1579 1st Qu.: 80.0 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 3535 Median : 240.0 Median :0.000000 Median :0.0000
## Mean : 10204 Mean : 310.1 Mean :0.005075 Mean :0.1684
## 3rd Qu.: 8199 3rd Qu.: 459.0 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :2617175 Max. :1816.0 Max. :1.000000 Max. :1.0000
## NA's :6 NA's :6 NA's :6
## pden wden urb fee
## Min. : 2.48 Min. : 892 Min. :1.00 Length:4341
## 1st Qu.: 62.59 1st Qu.: 23571 1st Qu.:2.00 Class :character
## Median : 151.32 Median : 68343 Median :3.00 Mode :character
## Mean : 405.05 Mean : 192058 Mean :2.49
## 3rd Qu.: 399.37 3rd Qu.: 194625 3rd Qu.:3.00
## Max. :12122.83 Max. :4978556 Max. :3.00
## NA's :6 NA's :6 NA's :6
## d_fee sample organic paper
## Min. :0.0000 Min. :0.0000 Min. : 0.0137 Min. : 0.00001
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:11.1273 1st Qu.: 8.65575
## Median :0.0000 Median :1.0000 Median :24.9686 Median :10.87905
## Mean :0.1283 Mean :0.5469 Mean :22.2751 Mean :10.96127
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:31.8224 3rd Qu.:13.06169
## Max. :1.0000 Max. :1.0000 Max. :61.6391 Max. :45.28813
## NA's :512 NA's :25
## glass wood metal plASTIc
## Min. : 0.000 Min. : 0.000 Min. : 0.00001 Min. : 0.00001
## 1st Qu.: 7.147 1st Qu.: 2.086 1st Qu.: 0.87686 1st Qu.: 4.12610
## Median : 9.102 Median : 4.024 Median : 1.54092 Median : 5.79227
## Mean : 9.407 Mean : 4.114 Mean : 1.76442 Mean : 6.11216
## 3rd Qu.:11.278 3rd Qu.: 5.708 3rd Qu.: 2.35093 3rd Qu.: 7.54472
## Max. :39.836 Max. :25.117 Max. :20.67146 Max. :31.60474
## NA's :33 NA's :1095 NA's :246 NA's :39
## raee texile other msw_so
## Min. : 0.0000 Min. : 0.0000 Min. : 0.02946 Min. : 0
## 1st Qu.: 0.7769 1st Qu.: 0.3464 1st Qu.: 3.96485 1st Qu.: 373965
## Median : 1.1754 Median : 0.6292 Median : 7.13491 Median : 1040737
## Mean : 1.2331 Mean : 0.7570 Mean : 7.94159 Mean : 3248581
## 3rd Qu.: 1.5686 3rd Qu.: 0.9908 3rd Qu.:11.12996 3rd Qu.: 2725645
## Max. :17.9536 Max. :10.5845 Max. :37.15592 Max. :765130099
## NA's :314 NA's :1013 NA's :136
## msw_un msw sor geo
## Min. : 6185 Min. :1.997e+04 Min. : 0.25 Min. :1.000
## 1st Qu.: 175180 1st Qu.:6.117e+05 1st Qu.:57.83 1st Qu.:1.000
## Median : 409060 Median :1.524e+06 Median :70.84 Median :3.000
## Mean : 2042522 Mean :5.311e+06 Mean :66.24 Mean :2.291
## 3rd Qu.: 1056920 3rd Qu.:3.954e+06 3rd Qu.:79.09 3rd Qu.:3.000
## Max. :926757220 Max. :1.692e+09 Max. :97.48 Max. :3.000
## NA's :285
## roads s_wteregio s_landfill gdp
## Min. : 1.0 Min. : 0.000 Min. : 3.603 Min. : 6.035
## 1st Qu.: 25.0 1st Qu.: 8.905 1st Qu.: 4.551 1st Qu.: 6.811
## Median : 51.0 Median :24.468 Median :11.297 Median : 7.128
## Mean : 101.9 Mean :21.867 Mean :20.001 Mean : 7.231
## 3rd Qu.: 105.0 3rd Qu.:38.501 3rd Qu.:31.493 3rd Qu.: 7.571
## Max. :14970.0 Max. :65.122 Max. :92.532 Max. :10.539
## NA's :443 NA's :285 NA's :285 NA's :386
## proads wage finance
## Min. :-3.367 Min. : 7.866 Min. : 6.216
## 1st Qu.: 3.498 1st Qu.: 9.226 1st Qu.: 7.016
## Median : 4.356 Median : 9.523 Median : 7.342
## Mean : 4.254 Mean : 9.447 Mean : 7.448
## 3rd Qu.: 5.075 3rd Qu.: 9.665 3rd Qu.: 7.798
## Max. : 8.980 Max. :10.485 Max. :10.855
## NA's :443 NA's :285 NA's :386
# Identify the columns that contain at least one missing value.
# vapply() pins the result to exactly one logical per column, so which()
# always receives a logical vector (sapply() can fall back to a list).
names(which(vapply(df, anyNA, logical(1))))
## [1] "cres" "csor" "area" "alt" "isle"
## [6] "sea" "pden" "wden" "urb" "organic"
## [11] "paper" "glass" "wood" "metal" "plASTIc"
## [16] "raee" "texile" "other" "geo" "roads"
## [21] "s_wteregio" "s_landfill" "gdp" "proads" "wage"
## [26] "finance"
# Keep complete cases only: na.omit() drops every row that has an NA in any
# column, not just in the columns that are modelled.
df <- na.omit(df)
# Summary table of the cleaned data (stored, not printed)
s <- summary(df)
Correlation between variables
# Correlation between the numeric variables.
# The numeric columns are selected once and that subset is reused for cor(),
# instead of selecting the same columns a second time.
data_num2 <- select(df, where(is.numeric))
# Pairwise correlation matrix of all numeric columns.
# The name `c` is kept because the heatmap call that follows reads it; calls
# such as c("a", "b") still resolve to the base function.
c <- cor(data_num2)
library(corrplot)
## corrplot 0.92 loaded
# Diverging blue-white-red palette with 20 colour steps
# NOTE(review): heatmap() draws the correlation matrix as a heat map; nothing
# in this step marks or crosses out insignificant correlations.
col <- colorRampPalette(colors = c("blue", "white", "red"))(n = 20)
heatmap(c, col = col, symm = TRUE)
Aims:
Understand the relationship between waste composition, urbanization, population density, and waste management costs in municipalities.
# Treat the urbanization index as a categorical variable
df[["urb"]] <- as.factor(df[["urb"]])
# NOTE(review): the grouped result is only printed, not stored, so this
# grouping does not carry over to any other statement.
group_by(df, urb)
For urbanisation level 1
# Rows whose urbanization index is 1
urb1 <- subset(df, urb == "1")
# Mean composition shares, cost, area and gdp for each
# region / province / name group
# (the output column `platicm` keeps its original spelling)
urb1_x <- urb1 %>%
  group_by(region, province, name) %>%
  summarise(
    glassm = mean(glass),
    woodm = mean(wood),
    metalm = mean(metal),
    platicm = mean(plASTIc),
    cost = mean(tc),
    areas = mean(area),
    gdpm = mean(gdp)
  )
## `summarise()` has grouped output by 'region', 'province'. You can override
## using the `.groups` argument.
# Print the per-group summary table
urb1_x
# Scatter plot of mean glass share against mean gdp, coloured by region.
# Only x, y and colour are real aesthetics; the legend position is a theme
# setting, so it is set through theme() rather than inside aes().
# The x axis shows `glassm`, the mean of the `glass` column (glass % of
# waste), so the labels name it as a share rather than as a cost.
ggplot(urb1_x, aes(x = glassm, y = gdpm, color = region)) +
  geom_point() +
  ggtitle("GDP vs Glass Share of Waste Across Regions") +
  xlab("Mean glass share of waste (%)") +
  ylab("GDP") +
  theme(legend.position = "bottom")
Regression Model for the Variation in Cost per Capita
# Linear model: per-capita cost (tc) regressed on population, area, gdp and
# selected waste-composition shares
model <- lm(
  formula = tc ~ pop + area + gdp + paper + texile + glass + wood + metal,
  data = df
)
# Coefficient table and fit statistics
summary(model)
##
## Call:
## lm(formula = tc ~ pop + area + gdp + paper + texile + glass +
## wood + metal, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -230.51 -27.44 -6.74 19.27 600.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.337e+02 2.089e+01 -11.187 < 2e-16 ***
## pop 3.945e-04 1.261e-04 3.128 0.00178 **
## area 2.651e-02 2.852e-02 0.929 0.35282
## gdp 6.328e+01 2.789e+00 22.688 < 2e-16 ***
## paper -1.982e+00 3.429e-01 -5.778 8.72e-09 ***
## texile -9.911e+00 1.891e+00 -5.241 1.76e-07 ***
## glass -2.989e+00 4.609e-01 -6.485 1.11e-10 ***
## wood 4.114e-01 5.184e-01 0.794 0.42751
## metal -8.410e+00 1.127e+00 -7.462 1.27e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56.13 on 2008 degrees of freedom
## Multiple R-squared: 0.3072, Adjusted R-squared: 0.3045
## F-statistic: 111.3 on 8 and 2008 DF, p-value: < 2.2e-16
Random forest regression
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Fix the RNG state first so the split and the forest are reproducible
set.seed(123)
# 75/25 train/test partition of the urbanization-level-1 rows, based on tc
splitIndex <- createDataPartition(y = urb1$tc, p = 0.75, list = FALSE)
train_data <- urb1[splitIndex, ]
test_data <- urb1[-splitIndex, ]
# Regression forest with 100 trees and 3 candidate variables per split.
# NOTE(review): `tc ~ .` uses every other column as a predictor, including
# identifiers (name, istat) and the cost components cres/csor - confirm this
# is intended.
model1 <- randomForest(tc ~ ., data = train_data, ntree = 100, mtry = 3)
print(model1)
##
## Call:
## randomForest(formula = tc ~ ., data = train_data, ntree = 100, mtry = 3)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 1429.676
## % Var explained: 51.97
Variable Importance
# Build the variable-importance chart
res2 <- data.frame(varImp(model1)) # fetch the variable importance scores
# Express each variable's importance as a percentage of the total
res2$impProcent <- round(res2$Overall / sum(res2$Overall) * 100)
res2$var <- rownames(res2) # copy the row names into a column
# Horizontal bars, ordered by importance share
res2 %>%
  ggplot(aes(x = impProcent, y = reorder(var, impProcent))) +
  geom_col()