Our purpose is to analyse the results of the European Parliament election in Italy for the year 2019. The analysis will be performed at regional level.
First, we will display the total votes cast in each region, and then the percentages of votes that the 5 main parties obtained.
library(dplyr)
library(ggplot2)
# Read the raw election results and give the last four columns short names.
# NOTE(review): setwd() ties the script to one machine's directory layout;
# consider a project-relative path instead.
setwd("~/Coursera/Data_Science")
fileloc <- "./datasets/Italia/europee2019_scrutini_area_italia.csv"
# Semicolon-separated file with a header row; surrounding blanks are stripped
# and text columns are kept as character rather than factor.
itdata <- read.csv(
  file = fileloc,
  header = TRUE,
  sep = ";",
  strip.white = TRUE,
  stringsAsFactors = FALSE
)
# Columns 3 to 6 become Provincia, Comune, Lista and Voti.
names(itdata)[3:6] <- c("Provincia", "Comune", "Lista", "Voti")
str(itdata)
## 'data.frame': 124515 obs. of 6 variables:
## $ CIRCOSCRIZIONE: chr "I : ITALIA NORD-OCCIDENTALE" "I : ITALIA NORD-OCCIDENTALE" "I : ITALIA NORD-OCCIDENTALE" "I : ITALIA NORD-OCCIDENTALE" ...
## $ REGIONE : chr "LIGURIA" "LIGURIA" "LIGURIA" "LIGURIA" ...
## $ Provincia : chr "GENOVA" "GENOVA" "GENOVA" "GENOVA" ...
## $ Comune : chr "ARENZANO" "ARENZANO" "ARENZANO" "ARENZANO" ...
## $ Lista : chr "PARTITO DEMOCRATICO" "LEGA SALVINI PREMIER" "MOVIMENTO 5 STELLE" "FORZA ITALIA" ...
## $ Voti : int 1904 1809 969 415 350 236 150 115 60 33 ...
# Sum the votes to one row per (CIRCOSCRIZIONE, REGIONE, Lista) combination.
regdata <- itdata %>%
  group_by(CIRCOSCRIZIONE, REGIONE, Lista) %>%
  summarise(Voti = sum(Voti)) %>%
  # summarise() only removes the last grouping level; drop the remaining ones
  # so that later joins, subsets and sorts do not run on a grouped table.
  ungroup()

# Total votes per region, used later as the denominator of the percentages.
reg_total <- regdata %>%
  group_by(REGIONE) %>%
  summarise(TotVoti = sum(Voti))

# Regions ordered from the largest to the smallest number of votes.
print(arrange(reg_total, desc(TotVoti)))
## Source: local data frame [20 x 2]
##
## REGIONE TotVoti
## (chr) (int)
## 1 LOMBARDIA 4857141
## 2 VENETO 2475148
## 3 LAZIO 2431086
## 4 EMILIA-ROMAGNA 2250389
## 5 PIEMONTE 2188837
## 6 CAMPANIA 2184604
## 7 TOSCANA 1870391
## 8 PUGLIA 1595373
## 9 SICILIA 1537935
## 10 MARCHE 766303
## 11 LIGURIA 742915
## 12 CALABRIA 729337
## 13 ABRUZZO 581643
## 14 FRIULI-VENEZIA GIULIA 577192
## 15 TRENTINO-ALTO ADIGE 495810
## 16 SARDEGNA 491454
## 17 UMBRIA 449074
## 18 BASILICATA 237840
## 19 MOLISE 150646
## 20 VALLE D'AOSTA 49844
The following table shows the vote percentages obtained by the leading party, Lega, in each of the 20 Italian regions:
## Attach each region's total to its party rows, then express every party's
## votes as a percentage of that total, rounded to two decimals.
regdata <- regdata %>%
  inner_join(y = reg_total, by = "REGIONE") %>%
  mutate(pctVoti = round(100 * Voti / TotVoti, digits = 2))

## Keep only the rows of Lega (The League) and show them from the highest to
## the lowest percentage.
nameLista <- "LEGA SALVINI PREMIER"
mydata <- regdata[regdata$Lista == nameLista, c("REGIONE", "Voti", "pctVoti")]
mydata %>%
  arrange(desc(pctVoti)) %>%
  print()
## Source: local data frame [20 x 3]
##
## REGIONE Voti pctVoti
## (chr) (int) (dbl)
## 1 VENETO 1234610 49.88
## 2 LOMBARDIA 2107080 43.38
## 3 FRIULI-VENEZIA GIULIA 245636 42.56
## 4 UMBRIA 171458 38.18
## 5 MARCHE 291061 37.98
## 6 VALLE D'AOSTA 18525 37.17
## 7 PIEMONTE 813005 37.14
## 8 ABRUZZO 205370 35.31
## 9 LIGURIA 251696 33.88
## 10 EMILIA-ROMAGNA 759948 33.77
## 11 LAZIO 793889 32.66
## 12 TOSCANA 588727 31.48
## 13 TRENTINO-ALTO ADIGE 137739 27.78
## 14 SARDEGNA 135496 27.57
## 15 PUGLIA 403424 25.29
## 16 MOLISE 36544 24.26
## 17 BASILICATA 55453 23.32
## 18 CALABRIA 164915 22.61
## 19 SICILIA 319439 20.77
## 20 CAMPANIA 419623 19.21
# Dodged horizontal bar chart of the vote percentages of three parties,
# one group of bars per region.
mycol <- c("blue", "yellow", "tomato")
parties <- c("LEGA SALVINI PREMIER", "MOVIMENTO 5 STELLE", "PARTITO DEMOCRATICO")
mydata <- regdata[regdata$Lista %in% parties, ]
gg <- ggplot(data = mydata, aes(x = REGIONE, y = pctVoti, fill = Lista))
gg +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  # Bind every colour to its party by name, so the mapping no longer depends
  # on the alphabetical order in which the Lista values happen to sort.
  scale_fill_manual(values = setNames(mycol, parties))
Now we want to compare the vote percentages of 2 parties, LEGA and M5S, across the 20 regions:
# Restrict the regional table to the five parties under study.
parties <- c(
  "LEGA SALVINI PREMIER",
  "FRATELLI D'ITALIA",
  "FORZA ITALIA",
  "MOVIMENTO 5 STELLE",
  "PARTITO DEMOCRATICO"
)
mydata <- regdata[regdata$Lista %in% parties, ]

# Reshape from long to wide: each party's percentage gets its own column.
# Columns 4 and 5 are removed first so that only pctVoti is spread.
library(tidyr)
wdata <- spread(data = mydata[, -(4:5)], key = Lista, value = pctVoti)
# NOTE(review): if wdata is a tibble, setting row names is deprecated there
# and they may not be retained - confirm before relying on them.
rownames(wdata) <- wdata$REGIONE
# Replace blanks in the column names with dots so the names can be used
# directly inside aes().
names(wdata) <- gsub(" ", ".", names(wdata), fixed = TRUE)

# Scatter plot of the Lega percentage against the M5S percentage.
gs <- ggplot(data = wdata, aes(x = LEGA.SALVINI.PREMIER, y = MOVIMENTO.5.STELLE))
gs +
  geom_point(
    aes(text = paste("Regione:", REGIONE)),
    size = 2,
    color = "forestgreen"
  )
# Principal component analysis of the party percentage columns, computed on
# the covariance matrix (cor = FALSE). The first two columns of wdata are
# left out of the analysis.
pcx <- princomp(wdata[, -(1:2)], cor = FALSE)
print(summary(pcx))
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 11.6115463 4.6507076 3.73759655 1.84038033
## Proportion of Variance 0.7675988 0.1231380 0.07953147 0.01928277
## Cumulative Proportion 0.7675988 0.8907368 0.97026829 0.98955106
## Comp.5
## Standard deviation 1.35474985
## Proportion of Variance 0.01044894
## Cumulative Proportion 1.00000000
# Show how each party's percentage contributes to every component.
print(loadings(pcx))
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## FORZA.ITALIA -0.249 -0.161 -0.230 0.924
## FRATELLI.D'ITALIA -0.209 0.973
## LEGA.SALVINI.PREMIER 0.644 -0.388 -0.645 -0.120
## MOVIMENTO.5.STELLE -0.696 -0.593 -0.358 -0.182
## PARTITO.DEMOCRATICO 0.187 0.903 -0.368 0.116
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings 1.0 1.0 1.0 1.0 1.0
## Proportion Var 0.2 0.2 0.2 0.2 0.2
## Cumulative Var 0.2 0.4 0.6 0.8 1.0
# Scree plot: variance of each principal component.
plot(pcx)
# Biplot of the observations and the party loadings. The point labels are
# read from the REGIONE column rather than from rownames(wdata), because row
# names are not a reliable carrier of labels when wdata is a tibble.
biplot(pcx, col = c(4, 2), cex = c(0.6, 0.7), xlabs = wdata$REGIONE)