Municipal waste management is crucial for sustainable urban development. Efficient waste management can lead to economic savings, reduced environmental impact, and improved public health. With its diverse regions and varied urbanization levels, Italy provides an ideal case study to explore the factors influencing the efficiency of municipal waste management.

Description of the columns in the data set:

| Variable | Label |
|---|---|
| region | Region |
| province | Province |
| name | Name of municipality |
| tc | Cost per capita (EUR) |
| cres | Residual cost per capita |
| csor | Sorted cost per capita |
| istat | National code |
| area | km2 |
| pop | Population |
| alt | Altitude m.s.l. |
| isle | Dummy: municipality on an isle |
| sea | Dummy: coastal municipality |
| pden | Population density (people per km2) |
| wden | Waste per km2 |
| urb | Urbanization index (1 low, 3 high) |
| fee | Fee scheme |
| d_fee | Dummy: PAYT |
| sample | Reg. with PAYT |
| organic | Organic % |
| paper | Paper % |
| glass | Glass % |
| wood | Wood % |
| metal | Metal % |
| plastic | Plastic % (the column is spelled `plASTIc` in the CSV) |
| raee | RAEE % |
| texile | Textile % |
| other | Other % |
| msw_so | MSW sorted (kg) |
| msw_un | MSW unsorted (kg) |
| msw | Municipal solid waste (kg) |
| sor | Share of sorted waste |
| geo | 1 South, 2 Center, 3 North |
| roads | Km of roads within the municipality |
| s_wteregio | Share of sw sent to W2E plants - regional figure |
| s_landfill | Share of waste sent to landfill |
| gdp | Municipal revenues EUR (p) - log |
| proads | People per km of roads (log) |
| wage | Taxable income EUR (p) - log |
| finance | Municipal revenues EUR (p) - log |

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Directory holding the input CSV. Set the WASTE_DATA_DIR environment variable
# to point elsewhere; the fallback is the location originally used here.
data_dir <- Sys.getenv(
  "WASTE_DATA_DIR",
  unset = "/Users/minervasingh/Documents/Waste"
)
data_path <- file.path(data_dir, "public_data_waste_fee.csv")

# Fail early with a readable message rather than a bare connection error.
if (!file.exists(data_path)) {
  stop("Input file not found: ", data_path, call. = FALSE)
}

# Read by full path instead of calling setwd(), so the working directory of
# the session is left untouched.
df <- read.csv(data_path)

head(df)
# Inspect the structure of the data

str(df)
## 'data.frame':    4341 obs. of  39 variables:
##  $ region    : chr  "ER" "ER" "ER" "ER" ...
##  $ province  : chr  "FRRA" "FRRA" "FRRA" "FRRA" ...
##  $ name      : chr  "Comacchio" "Lagosanto" "Goro" "Mesola" ...
##  $ tc        : num  502 228 268 199 234 ...
##  $ cres      : num  129.3 49.5 50.6 41.1 58.3 ...
##  $ csor      : num  66.4 44.1 44.6 40.4 26 ...
##  $ istat     : int  38006 38011 38025 38014 110005 38010 38030 58120 27005 8050 ...
##  $ area      : num  283.8 34.4 26.6 84.3 35.7 ...
##  $ pop       : int  22648 4952 3895 7140 12193 3003 7364 67626 11793 2861 ...
##  $ alt       : int  1 1 1 1 1 1 1 1 1 2 ...
##  $ isle      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ sea       : int  1 1 1 1 1 0 0 1 1 1 ...
##  $ pden      : num  79.8 143.8 146.3 84.7 341.5 ...
##  $ wden      : num  119671 70031 81117 43320 201565 ...
##  $ urb       : int  2 3 3 3 2 3 3 2 2 2 ...
##  $ fee       : chr  "PAYT" "PAYT" "PAYT" "PAYT" ...
##  $ d_fee     : int  1 1 1 1 0 1 1 0 0 0 ...
##  $ sample    : int  1 1 1 1 0 1 1 0 1 0 ...
##  $ organic   : num  NA 35.041 37.377 45.31 0.428 ...
##  $ paper     : num  4.36 9.89 11.99 9.76 6.6 ...
##  $ glass     : num  3.59 9.52 6.65 7.55 4.33 ...
##  $ wood      : num  2.27 4.00 1.32e-05 1.49e-01 2.30 ...
##  $ metal     : num  0.462 1.861 0.745 0.747 0.103 ...
##  $ plASTIc   : num  1.13 4.64 5.22 5.2 5.12 ...
##  $ raee      : num  0.346 1.609 NA NA 0.275 ...
##  $ texile    : num  0.112 0.351 0.449 0.4 0.287 ...
##  $ other     : num  3.2 9.02 16.04 9.77 4.05 ...
##  $ msw_so    : num  20396261 1831407 1694922 2881055 3026700 ...
##  $ msw_un    : int  13560520 580460 464400 770860 4169180 349620 556540 7895520 5659520 296480 ...
##  $ msw       : int  33956781 2411867 2159322 3651915 7195880 1682628 3336429 33435410 15175582 1553789 ...
##  $ sor       : num  60.1 75.9 78.5 78.9 42.1 ...
##  $ geo       : num  3 3 3 3 1 3 NA 2 3 3 ...
##  $ roads     : num  285 11 49 165 60 65 NA 329 77 17 ...
##  $ s_wteregio: num  33.11 33.11 33.11 33.11 4.05 ...
##  $ s_landfill: num  15.2 15.2 15.2 15.2 45.4 ...
##  $ gdp       : num  7.27 7.11 7.27 7.09 7.25 ...
##  $ proads    : num  4.35 6.08 4.34 3.71 5.27 ...
##  $ wage      : num  9.44 9.51 8.89 9.43 9.13 ...
##  $ finance   : num  7.49 7.32 7.49 7.3 7.46 ...
# Per-column summary statistics (quartiles, mean and NA counts)
summary(object = df)
##     region            province             name                 tc        
##  Length:4341        Length:4341        Length:4341        Min.   : 25.69  
##  Class :character   Class :character   Class :character   1st Qu.:108.04  
##  Mode  :character   Mode  :character   Mode  :character   Median :136.62  
##                                                           Mean   :154.24  
##                                                           3rd Qu.:179.16  
##                                                           Max.   :977.42  
##                                                                           
##       cres             csor            istat             area        
##  Min.   :  4.27   Min.   :  3.39   Min.   :  1272   Min.   :   0.12  
##  1st Qu.: 27.34   1st Qu.: 31.25   1st Qu.: 18135   1st Qu.:  10.85  
##  Median : 41.69   Median : 48.88   Median : 42015   Median :  22.73  
##  Mean   : 54.18   Mean   : 52.68   Mean   : 47470   Mean   :  40.99  
##  3rd Qu.: 66.49   3rd Qu.: 66.44   3rd Qu.: 70049   3rd Qu.:  47.49  
##  Max.   :670.32   Max.   :582.16   Max.   :111107   Max.   :1287.39  
##  NA's   :52       NA's   :67                        NA's   :6        
##       pop               alt              isle               sea        
##  Min.   :     34   Min.   :   1.0   Min.   :0.000000   Min.   :0.0000  
##  1st Qu.:   1579   1st Qu.:  80.0   1st Qu.:0.000000   1st Qu.:0.0000  
##  Median :   3535   Median : 240.0   Median :0.000000   Median :0.0000  
##  Mean   :  10204   Mean   : 310.1   Mean   :0.005075   Mean   :0.1684  
##  3rd Qu.:   8199   3rd Qu.: 459.0   3rd Qu.:0.000000   3rd Qu.:0.0000  
##  Max.   :2617175   Max.   :1816.0   Max.   :1.000000   Max.   :1.0000  
##                    NA's   :6        NA's   :6          NA's   :6       
##       pden               wden              urb           fee           
##  Min.   :    2.48   Min.   :    892   Min.   :1.00   Length:4341       
##  1st Qu.:   62.59   1st Qu.:  23571   1st Qu.:2.00   Class :character  
##  Median :  151.32   Median :  68343   Median :3.00   Mode  :character  
##  Mean   :  405.05   Mean   : 192058   Mean   :2.49                     
##  3rd Qu.:  399.37   3rd Qu.: 194625   3rd Qu.:3.00                     
##  Max.   :12122.83   Max.   :4978556   Max.   :3.00                     
##  NA's   :6          NA's   :6         NA's   :6                        
##      d_fee            sample          organic            paper         
##  Min.   :0.0000   Min.   :0.0000   Min.   : 0.0137   Min.   : 0.00001  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:11.1273   1st Qu.: 8.65575  
##  Median :0.0000   Median :1.0000   Median :24.9686   Median :10.87905  
##  Mean   :0.1283   Mean   :0.5469   Mean   :22.2751   Mean   :10.96127  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:31.8224   3rd Qu.:13.06169  
##  Max.   :1.0000   Max.   :1.0000   Max.   :61.6391   Max.   :45.28813  
##                                    NA's   :512       NA's   :25        
##      glass             wood            metal             plASTIc        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.00001   Min.   : 0.00001  
##  1st Qu.: 7.147   1st Qu.: 2.086   1st Qu.: 0.87686   1st Qu.: 4.12610  
##  Median : 9.102   Median : 4.024   Median : 1.54092   Median : 5.79227  
##  Mean   : 9.407   Mean   : 4.114   Mean   : 1.76442   Mean   : 6.11216  
##  3rd Qu.:11.278   3rd Qu.: 5.708   3rd Qu.: 2.35093   3rd Qu.: 7.54472  
##  Max.   :39.836   Max.   :25.117   Max.   :20.67146   Max.   :31.60474  
##  NA's   :33       NA's   :1095     NA's   :246        NA's   :39        
##       raee             texile            other              msw_so         
##  Min.   : 0.0000   Min.   : 0.0000   Min.   : 0.02946   Min.   :        0  
##  1st Qu.: 0.7769   1st Qu.: 0.3464   1st Qu.: 3.96485   1st Qu.:   373965  
##  Median : 1.1754   Median : 0.6292   Median : 7.13491   Median :  1040737  
##  Mean   : 1.2331   Mean   : 0.7570   Mean   : 7.94159   Mean   :  3248581  
##  3rd Qu.: 1.5686   3rd Qu.: 0.9908   3rd Qu.:11.12996   3rd Qu.:  2725645  
##  Max.   :17.9536   Max.   :10.5845   Max.   :37.15592   Max.   :765130099  
##  NA's   :314       NA's   :1013      NA's   :136                           
##      msw_un               msw                 sor             geo       
##  Min.   :     6185   Min.   :1.997e+04   Min.   : 0.25   Min.   :1.000  
##  1st Qu.:   175180   1st Qu.:6.117e+05   1st Qu.:57.83   1st Qu.:1.000  
##  Median :   409060   Median :1.524e+06   Median :70.84   Median :3.000  
##  Mean   :  2042522   Mean   :5.311e+06   Mean   :66.24   Mean   :2.291  
##  3rd Qu.:  1056920   3rd Qu.:3.954e+06   3rd Qu.:79.09   3rd Qu.:3.000  
##  Max.   :926757220   Max.   :1.692e+09   Max.   :97.48   Max.   :3.000  
##                                                          NA's   :285    
##      roads           s_wteregio       s_landfill          gdp        
##  Min.   :    1.0   Min.   : 0.000   Min.   : 3.603   Min.   : 6.035  
##  1st Qu.:   25.0   1st Qu.: 8.905   1st Qu.: 4.551   1st Qu.: 6.811  
##  Median :   51.0   Median :24.468   Median :11.297   Median : 7.128  
##  Mean   :  101.9   Mean   :21.867   Mean   :20.001   Mean   : 7.231  
##  3rd Qu.:  105.0   3rd Qu.:38.501   3rd Qu.:31.493   3rd Qu.: 7.571  
##  Max.   :14970.0   Max.   :65.122   Max.   :92.532   Max.   :10.539  
##  NA's   :443       NA's   :285      NA's   :285      NA's   :386     
##      proads            wage           finance      
##  Min.   :-3.367   Min.   : 7.866   Min.   : 6.216  
##  1st Qu.: 3.498   1st Qu.: 9.226   1st Qu.: 7.016  
##  Median : 4.356   Median : 9.523   Median : 7.342  
##  Mean   : 4.254   Mean   : 9.447   Mean   : 7.448  
##  3rd Qu.: 5.075   3rd Qu.: 9.665   3rd Qu.: 7.798  
##  Max.   : 8.980   Max.   :10.485   Max.   :10.855  
##  NA's   :443      NA's   :285      NA's   :386
# Identify the columns that contain at least one missing value.
# vapply() always returns a logical vector (sapply() can return a list),
# and anyNA() avoids materialising the full is.na() mask per column.
names(df)[vapply(df, anyNA, logical(1))]
##  [1] "cres"       "csor"       "area"       "alt"        "isle"      
##  [6] "sea"        "pden"       "wden"       "urb"        "organic"   
## [11] "paper"      "glass"      "wood"       "metal"      "plASTIc"   
## [16] "raee"       "texile"     "other"      "geo"        "roads"     
## [21] "s_wteregio" "s_landfill" "gdp"        "proads"     "wage"      
## [26] "finance"

# Keep complete cases only: a row is dropped if ANY column is NA, not just the
# columns used in a given model. The lm() fit further down reports 2008
# residual degrees of freedom with 9 coefficients, i.e. 2017 of the 4341 rows
# survive this step.
df <- na.omit(df)

# Summary of the complete-case data, stored rather than printed
s <- summary(df)

Correlation between variables

# Correlation ----
# Numeric columns only (where() replaces the superseded select_if())
data_num2 <- select(df, where(is.numeric))

# Pairwise correlations between the numeric variables (cor() defaults to
# Pearson). The subset above is reused instead of being rebuilt a second time.
# df has already been through na.omit(), so no `use =` argument is needed.
c <- cor(data_num2)


library(corrplot)
## corrplot 0.92 loaded
# Blue (negative) - white - red (positive) palette with 20 steps.
# heatmap() reorders rows/columns by hierarchical clustering; it does not
# test or mark the significance of the correlations.
col <- colorRampPalette(c("blue", "white", "red"))(20)
heatmap(x = c, col = col, symm = TRUE)

Aims:

  1. To understand the relationship between waste composition, urbanization, population density, and waste management costs in Italian municipalities.
  2. To identify best practices in waste management across different regions and propose recommendations for less efficient municipalities.
  3. To develop a predictive model that estimates waste management costs based on socio-economic and geographical factors.

Understand the relationship between waste composition, urbanization, population density, and waste management costs in municipalities.

# Treat the urbanisation index as a categorical variable
df[["urb"]] <- as.factor(df[["urb"]])

# Printed for inspection only: the grouped result is not assigned,
# so df itself stays ungrouped.
group_by(df, urb)

For urbanisation level 1

# Municipalities at urbanisation level 1
urb1 <- subset(df, urb == "1")

# Mean waste-fraction shares, cost, area and gdp per municipality.
# `.groups = "drop"` returns an ungrouped tibble, so no residual grouping
# leaks into later steps and summarise() no longer emits its grouping message.
# (The column name `platicm` is kept as-is so existing references still work.)
urb1_x <- urb1 %>%
  group_by(region, province, name) %>%
  summarise(
    glassm = mean(glass),
    woodm = mean(wood),
    metalm = mean(metal),
    platicm = mean(plASTIc),
    cost = mean(tc),
    areas = mean(area),
    gdpm = mean(gdp),
    .groups = "drop"
  )
urb1_x

# `glassm` is the glass percentage, not a cost, so the labels say so.
# xName / yName / legendPosition are not ggplot2 aesthetics and were ignored;
# the legend position is set through theme() instead.
ggplot(urb1_x, aes(x = glassm, y = gdpm, color = region)) +
  geom_point() +
  ggtitle("GDP vs Glass Share Across Regions") +
  xlab("Glass (%)") +
  ylab("GDP (log)") +
  theme(legend.position = "bottom")

Regression Model for the Variation in Waste Management Cost per Capita (tc)

# Linear model of cost per capita (tc) on population, area, gdp and five
# waste-fraction percentages, fitted on the data frame df.
model <- lm(
  formula = tc ~ pop + area + gdp + paper + texile + glass + wood + metal,
  data = df
)
summary(model)
## 
## Call:
## lm(formula = tc ~ pop + area + gdp + paper + texile + glass + 
##     wood + metal, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -230.51  -27.44   -6.74   19.27  600.26 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.337e+02  2.089e+01 -11.187  < 2e-16 ***
## pop          3.945e-04  1.261e-04   3.128  0.00178 ** 
## area         2.651e-02  2.852e-02   0.929  0.35282    
## gdp          6.328e+01  2.789e+00  22.688  < 2e-16 ***
## paper       -1.982e+00  3.429e-01  -5.778 8.72e-09 ***
## texile      -9.911e+00  1.891e+00  -5.241 1.76e-07 ***
## glass       -2.989e+00  4.609e-01  -6.485 1.11e-10 ***
## wood         4.114e-01  5.184e-01   0.794  0.42751    
## metal       -8.410e+00  1.127e+00  -7.462 1.27e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56.13 on 2008 degrees of freedom
## Multiple R-squared:  0.3072, Adjusted R-squared:  0.3045 
## F-statistic: 111.3 on 8 and 2008 DF,  p-value: < 2.2e-16

Random forest regression

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(123)  # fixed seed so the partition and the forest are reproducible

# Columns that must not act as predictors of tc:
#   cres, csor  - per-capita cost sub-items (residual / sorted) according to
#                 the column description; using them to predict total cost
#                 per capita is target leakage
#   name, istat - municipality identifiers
#   province    - high-cardinality label (randomForest rejects factors with
#                 more than 53 categories)
#   urb         - constant inside the urbanisation-level-1 subset
excluded_cols <- c("cres", "csor", "name", "istat", "province", "urb")

# Remaining character columns become factors BEFORE the split so that the
# training and test sets share the same factor levels.
rf_data <- urb1 %>%
  select(-all_of(excluded_cols)) %>%
  mutate(across(where(is.character), as.factor))

# 75 / 25 split, stratified on the response by createDataPartition()
splitIndex <- createDataPartition(rf_data$tc, p = 0.75, list = FALSE)
train_data <- rf_data[splitIndex, ]
test_data <- rf_data[-splitIndex, ]

model1 <- randomForest(tc ~ ., data = train_data, ntree = 100, mtry = 3)
print(model1)
# (The console output recorded for the earlier fit was removed: it described
# a forest that still had cres/csor and the identifier columns as predictors.)

# Score the held-out rows, which were previously created but never used.
# postResample() reports RMSE, R-squared and MAE.
test_pred <- predict(model1, newdata = test_data)
postResample(pred = test_pred, obs = test_data$tc)

Variable Importance

# Build the variable-importance chart
res2 <- data.frame(varImp(model1))  # fetch the variable importance scores
# Re-express each variable's importance as a percentage of the total
res2[["impProcent"]] <- round(res2[["Overall"]] / sum(res2[["Overall"]]) * 100)
res2[["var"]] <- rownames(res2)  # copy the row names into a column

# Horizontal bars, ordered by importance share
ggplot(res2, aes(x = impProcent, y = reorder(var, impProcent))) +
  geom_col()