Our purpose is to analyse the results of the 2019 European Parliament election in Italy. The analysis is performed at the provincial level.
library(dplyr)
library(ggplot2)
# Work from the course directory so that the relative dataset path resolves.
setwd("~/Coursera/Data_Science")
fileloc <- "./datasets/Italia/europee2019_scrutini_area_italia.csv"

# The file is semicolon-separated with a header row. Text columns are kept as
# character vectors, and strip.white trims blanks around unquoted fields.
itdata <- read.csv(
  file = fileloc,
  header = TRUE,
  sep = ";",
  strip.white = TRUE,
  stringsAsFactors = FALSE
)

# Rename columns 3 to 6; the first two columns keep their original names.
names(itdata)[3:6] <- c("Provincia", "Comune", "Lista", "Voti")

# Quick structural check of the imported table.
str(itdata)
## 'data.frame': 124515 obs. of 6 variables:
## $ CIRCOSCRIZIONE: chr "I : ITALIA NORD-OCCIDENTALE" "I : ITALIA NORD-OCCIDENTALE" "I : ITALIA NORD-OCCIDENTALE" "I : ITALIA NORD-OCCIDENTALE" ...
## $ REGIONE : chr "LIGURIA" "LIGURIA" "LIGURIA" "LIGURIA" ...
## $ Provincia : chr "GENOVA" "GENOVA" "GENOVA" "GENOVA" ...
## $ Comune : chr "ARENZANO" "ARENZANO" "ARENZANO" "ARENZANO" ...
## $ Lista : chr "PARTITO DEMOCRATICO" "LEGA SALVINI PREMIER" "MOVIMENTO 5 STELLE" "FORZA ITALIA" ...
## $ Voti : int 1904 1809 969 415 350 236 150 115 60 33 ...
# Aggregate the municipality-level rows to one row per province and list.
# CIRCOSCRIZIONE and REGIONE are part of the grouping key so that they are
# carried into the result as columns.
provdata <- itdata %>%
  group_by(CIRCOSCRIZIONE, REGIONE, Provincia, Lista) %>%
  summarise(Voti = sum(Voti)) %>%
  # summarise() removes only the last grouping level; drop the remaining
  # groups so that later joins, mutates and reshapes work on a plain table.
  ungroup()
# Total number of votes per province, summed over all lists.
prov_total <- summarise(group_by(provdata, Provincia), TotVoti = sum(Voti))

# Show the provinces from the largest to the smallest vote total.
prov_total %>%
  arrange(desc(TotVoti)) %>%
  print()
## Source: local data frame [107 x 2]
##
## Provincia TotVoti
## (chr) (int)
## 1 ROMA 1723583
## 2 MILANO 1452603
## 3 TORINO 1117522
## 4 NAPOLI 1027989
## 5 BRESCIA 643829
## 6 BERGAMO 579830
## 7 BARI 527280
## 8 FIRENZE 521666
## 9 BOLOGNA 517039
## 10 PADOVA 487405
## .. ... ...
The following table shows the percentage of votes won by the leading party, LEGA, in each of the 18 provinces of the 5 selected Italian regions (Abruzzo, Lazio, Marche, Molise, Umbria):
## Vote percentage of each party (i.e. Lista) within its province:
## attach the province total to every per-list row, then divide the list's
## votes by that total and round the percentage to 2 decimals.
provdata <- provdata %>%
  inner_join(prov_total, by = "Provincia") %>%
  mutate(pctVoti = round(100 * Voti / TotVoti, digits = 2))
## Vote percentages for Lega (The League) in the selected regions
nameLista <- "LEGA SALVINI PREMIER"
nameRegioni <- c("ABRUZZO", "LAZIO", "MARCHE", "MOLISE", "UMBRIA")

## Row filter and column selection are spelled out separately for readability.
keep_rows <- provdata$REGIONE %in% nameRegioni & provdata$Lista == nameLista
keep_cols <- c("REGIONE", "Provincia", "Voti", "pctVoti")
mydata <- provdata[keep_rows, keep_cols]

## Highest percentage first.
mydata %>%
  arrange(desc(pctVoti)) %>%
  print()
## Source: local data frame [18 x 4]
##
## REGIONE Provincia Voti pctVoti
## (chr) (chr) (int) (dbl)
## 1 MARCHE FERMO 35194 43.37
## 2 LAZIO VITERBO 64810 41.42
## 3 LAZIO RIETI 31573 41.04
## 4 MARCHE MACERATA 64063 40.99
## 5 LAZIO FROSINONE 96671 40.35
## 6 LAZIO LATINA 91230 38.90
## 7 UMBRIA TERNI 42031 38.56
## 8 ABRUZZO TERAMO 51515 38.13
## 9 UMBRIA PERUGIA 129427 38.06
## 10 MARCHE ASCOLI PICENO 41155 38.00
## 11 MARCHE PESARO E URBINO 72898 37.82
## 12 ABRUZZO L'AQUILA 43471 35.67
## 13 ABRUZZO CHIETI 58727 34.42
## 14 MARCHE ANCONA 77751 34.13
## 15 ABRUZZO PESCARA 51657 33.54
## 16 LAZIO ROMA 509605 29.57
## 17 MOLISE CAMPOBASSO 27349 24.57
## 18 MOLISE ISERNIA 9195 23.38
# Three lists compared in the plots below.
parties <- c("LEGA SALVINI PREMIER", "MOVIMENTO 5 STELLE", "PARTITO DEMOCRATICO")

# Name every colour after its list so that the manual fill scale matches
# colours to parties by name rather than by the sort order of the Lista values.
mycol <- c(
  "LEGA SALVINI PREMIER" = "blue",
  "MOVIMENTO 5 STELLE" = "yellow",
  "PARTITO DEMOCRATICO" = "tomato"
)

# Rows of the chosen lists in the selected regions (all columns kept).
mydata <- provdata[
  provdata$Lista %in% parties & provdata$REGIONE %in% nameRegioni,
]

# Boxplot: distribution of the provincial percentages, one box per list.
gg <- ggplot(data = mydata, aes(x = Lista, y = pctVoti, fill = Lista))
gg + geom_boxplot() + scale_fill_manual(values = mycol)

# Barplot: one dodged bar per list for each province; coord_flip() puts the
# provinces on the vertical axis.
gg <- ggplot(data = mydata, aes(x = Provincia, y = pctVoti, fill = Lista))
gg +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = mycol) +
  coord_flip()
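Now we want to compare the vote percentages of two parties, LEGA and M5S, across the 18 provinces. The data for the five main parties are first reshaped to wide format (one column per party); the same table is used for the principal component analysis that follows: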
# Five lists used for the wide table.
parties <- c(
  "LEGA SALVINI PREMIER", "FRATELLI D'ITALIA", "FORZA ITALIA",
  "MOVIMENTO 5 STELLE", "PARTITO DEMOCRATICO"
)
mydata <- provdata[
  provdata$Lista %in% parties & provdata$REGIONE %in% nameRegioni,
]
# first we need to spread rows into columns
library(tidyr)
# Drop the vote counts by name rather than by position, so the reshape does
# not depend on the column order produced by the earlier join and mutate.
# Voti differs between the lists of a province, so leaving it in would keep
# spread() from collapsing them into a single row per province.
wide_input <- mydata[, setdiff(names(mydata), c("Voti", "TotVoti"))]
wdata <- spread(data = wide_input, key = Lista, value = pctVoti)
# Print without the two geographic identifier columns.
print(wdata[, setdiff(names(wdata), c("CIRCOSCRIZIONE", "REGIONE"))])
## Source: local data frame [18 x 6]
##
## Provincia FORZA ITALIA FRATELLI D'ITALIA LEGA SALVINI PREMIER
## (chr) (dbl) (dbl) (dbl)
## 1 FROSINONE 8.13 8.90 40.35
## 2 LATINA 11.98 10.58 38.90
## 3 RIETI 6.68 8.17 41.04
## 4 ROMA 5.92 8.88 29.57
## 5 VITERBO 6.35 8.55 41.42
## 6 ANCONA 4.88 5.40 34.13
## 7 ASCOLI PICENO 6.55 5.92 38.00
## 8 FERMO 5.42 5.64 43.37
## 9 MACERATA 5.87 7.91 40.99
## 10 PESARO E URBINO 5.50 4.66 37.82
## 11 PERUGIA 6.52 6.70 38.06
## 12 TERNI 6.10 6.22 38.56
## 13 CHIETI 9.36 5.63 34.42
## 14 L'AQUILA 10.42 10.54 35.67
## 15 PESCARA 9.90 6.72 33.54
## 16 TERAMO 7.92 5.87 38.13
## 17 CAMPOBASSO 13.00 5.47 24.57
## 18 ISERNIA 21.84 8.75 23.38
## Variables not shown: MOVIMENTO 5 STELLE (dbl), PARTITO DEMOCRATICO (dbl)
# wdata comes out of the dplyr/tidyr pipeline as a tibble, and assigning row
# names to a tibble is deprecated (it warns). Convert to a plain data.frame
# first so the province names can be used as row labels later on.
wdata <- as.data.frame(wdata)
rownames(wdata) <- wdata$Provincia

# Replace blanks in the column names with dots so that the party columns can
# be referenced as bare names inside aes().
names(wdata) <- gsub(pattern = " ", replacement = ".", x = names(wdata))

# Scatter plot of the Lega percentage against the M5S percentage, one point
# per province.
gs <- ggplot(data = wdata, aes(x = LEGA.SALVINI.PREMIER, y = MOVIMENTO.5.STELLE))
# NOTE(review): `text` is not an aesthetic that geom_point() draws, so ggplot2
# reports it as unknown and ignores it. It is presumably meant for an
# interactive tooltip (e.g. plotly::ggplotly) - confirm or drop.
gs + geom_point(aes(text = paste("Regione:", REGIONE)), size = 2, color = "forestgreen")
# Principal component analysis of the party percentages. The first three
# columns (CIRCOSCRIZIONE, REGIONE, Provincia) are identifiers and are left
# out; cor = FALSE bases the analysis on the covariance matrix.
pcx <- princomp(x = wdata[, -seq_len(3)], cor = FALSE)

# Standard deviation and share of variance of each component.
summary(pcx)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 7.0403320 4.1627923 3.1840604 1.19978052 0.33169730
## Proportion of Variance 0.6307518 0.2205168 0.1290134 0.01831791 0.00140009
## Cumulative Proportion 0.6307518 0.8512686 0.9802820 0.99859991 1.00000000
# Scree plot: variance of each principal component.
screeplot(pcx)

# Loadings: weight of each party's percentage in every component.
pcx$loadings
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## FORZA.ITALIA 0.489 -0.304 0.440 -0.479 0.495
## FRATELLI.D'ITALIA -0.181 0.389 0.836 0.341
## LEGA.SALVINI.PREMIER -0.697 -0.516 -0.226 -0.166 0.412
## MOVIMENTO.5.STELLE 0.437 -0.762 0.177 0.442
## PARTITO.DEMOCRATICO -0.291 0.779 0.151 -0.111 0.523
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings 1.0 1.0 1.0 1.0 1.0
## Proportion Var 0.2 0.2 0.2 0.2 0.2
## Cumulative Var 0.2 0.4 0.6 0.8 1.0
# Biplot of the first two components: observations are labelled with the row
# names of wdata, and the party variables are drawn in the second colour.
biplot(
  pcx,
  xlabs = rownames(wdata),
  col = c(4, 2),
  cex = c(0.6, 0.7)
)
library(maps)
# Polygon outlines of the "italy" map as a data frame; the upper-cased region
# name is added as Provincia so that it can serve as the join key.
mymap <- mutate(map_data(map = "italy"), Provincia = toupper(region))

# The inner join keeps only provinces whose name matches a map region, and
# repeats every matching provdata row once per map row of that province.
area <- provdata %>%
  inner_join(mymap, by = "Provincia")
## Keep the map rows of one party (Lista) in the selected regions
is_lega <- area$Lista == "LEGA SALVINI PREMIER"
in_regions <- area$REGIONE %in% nameRegioni
votiLega <- area[is_lega & in_regions, ]

## NOTE(review): pctVoti is repeated on every map row of a province, so the
## quartiles and mean below are weighted by the number of polygon vertices,
## and provinces without a matching map region are absent - confirm this is
## acceptable for choosing the intervals.
summary(votiLega$pctVoti)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.38 34.13 38.06 36.09 38.90 41.42
## Bin the Lega percentages into 5 intervals of equal width for the legend
votiLega$pctVoti_LEGA <- cut_interval(votiLega$pctVoti, n = 5, dig.lab = 3)

## Choropleth: one polygon per map group, filled by interval, blue outlines
gg <- ggplot() +
  geom_polygon(
    data = votiLega,
    mapping = aes(x = long, y = lat, group = group, fill = pctVoti_LEGA),
    color = "blue"
  )

## Sequential palette, fixed aspect ratio and a light theme
gg +
  scale_fill_brewer(palette = "PuBu") +
  coord_fixed(ratio = 1.3) +
  theme_bw()