Our purpose is to analyse the results of the 2019 European Parliament election in Italy. The analysis is performed at the regional level.
First we will display both the total votes and the percentages of votes that the 5 main parties obtained.

library(dplyr)
library(ggplot2)
# Load the election results (one row per Provincia / Comune / Lista with its
# vote count). The full path is built here instead of calling setwd(), so the
# script does not change the session's working directory as a side effect.
fileloc <- file.path(
  "~/Coursera/Data_Science", "datasets", "Italia",
  "europee2019_scrutini_area_italia.csv"
)
# Semicolon-separated file with a header row; strings are kept as character.
itdata <- read.csv(fileloc, header = TRUE, sep = ";", strip.white = TRUE, stringsAsFactors = FALSE)
# Rename columns 3 to 6 to the names used by the rest of the analysis.
names(itdata)[3:6] <- c("Provincia", "Comune", "Lista", "Voti")
str(itdata)
## 'data.frame':    124515 obs. of  6 variables:
##  $ CIRCOSCRIZIONE: chr  "I : ITALIA NORD-OCCIDENTALE" "I : ITALIA NORD-OCCIDENTALE" "I : ITALIA NORD-OCCIDENTALE" "I : ITALIA NORD-OCCIDENTALE" ...
##  $ REGIONE       : chr  "LIGURIA" "LIGURIA" "LIGURIA" "LIGURIA" ...
##  $ Provincia     : chr  "GENOVA" "GENOVA" "GENOVA" "GENOVA" ...
##  $ Comune        : chr  "ARENZANO" "ARENZANO" "ARENZANO" "ARENZANO" ...
##  $ Lista         : chr  "PARTITO DEMOCRATICO" "LEGA SALVINI PREMIER" "MOVIMENTO 5 STELLE" "FORZA ITALIA" ...
##  $ Voti          : int  1904 1809 969 415 350 236 150 115 60 33 ...

1 - Analysis at regional level

# Votes per list within each region (the constituency is carried along as an
# identifier column). summarise() only peels off the last grouping level, so
# ungroup() is called explicitly: otherwise regdata would stay grouped by
# CIRCOSCRIZIONE and REGIONE and that grouping would leak into later steps.
regdata <- itdata %>%
  group_by(CIRCOSCRIZIONE, REGIONE, Lista) %>%
  summarise(Voti = sum(Voti)) %>%
  ungroup()

# Total votes cast in each region, summed over all lists.
reg_total <- regdata %>%
  group_by(REGIONE) %>%
  summarise(TotVoti = sum(Voti))

# Regions ranked by total votes, largest first.
print(arrange(reg_total, desc(TotVoti)))
## Source: local data frame [20 x 2]
## 
##                  REGIONE TotVoti
##                    (chr)   (int)
## 1              LOMBARDIA 4857141
## 2                 VENETO 2475148
## 3                  LAZIO 2431086
## 4         EMILIA-ROMAGNA 2250389
## 5               PIEMONTE 2188837
## 6               CAMPANIA 2184604
## 7                TOSCANA 1870391
## 8                 PUGLIA 1595373
## 9                SICILIA 1537935
## 10                MARCHE  766303
## 11               LIGURIA  742915
## 12              CALABRIA  729337
## 13               ABRUZZO  581643
## 14 FRIULI-VENEZIA GIULIA  577192
## 15   TRENTINO-ALTO ADIGE  495810
## 16              SARDEGNA  491454
## 17                UMBRIA  449074
## 18            BASILICATA  237840
## 19                MOLISE  150646
## 20         VALLE D'AOSTA   49844

The following table shows the number of votes and the vote percentage obtained by the leading party, Lega, in each of the 20 Italian regions:

## attach each region's total to its rows, then express every list's votes
## as a percentage of that regional total (rounded to 2 decimals)
regdata <- regdata %>%
  inner_join(y = reg_total, by = "REGIONE") %>%
  mutate(pctVoti = round(x = 100 * Voti / TotVoti, digits = 2))

## keep only the rows of Lega (The League) and the columns shown in the table
nameLista <- "LEGA SALVINI PREMIER"
is_selected <- regdata$Lista == nameLista
mydata <- regdata[is_selected, c("REGIONE", "Voti", "pctVoti")]

## regions sorted by Lega's vote share, highest first
mydata %>%
  arrange(desc(pctVoti)) %>%
  print()
## Source: local data frame [20 x 3]
## 
##                  REGIONE    Voti pctVoti
##                    (chr)   (int)   (dbl)
## 1                 VENETO 1234610   49.88
## 2              LOMBARDIA 2107080   43.38
## 3  FRIULI-VENEZIA GIULIA  245636   42.56
## 4                 UMBRIA  171458   38.18
## 5                 MARCHE  291061   37.98
## 6          VALLE D'AOSTA   18525   37.17
## 7               PIEMONTE  813005   37.14
## 8                ABRUZZO  205370   35.31
## 9                LIGURIA  251696   33.88
## 10        EMILIA-ROMAGNA  759948   33.77
## 11                 LAZIO  793889   32.66
## 12               TOSCANA  588727   31.48
## 13   TRENTINO-ALTO ADIGE  137739   27.78
## 14              SARDEGNA  135496   27.57
## 15                PUGLIA  403424   25.29
## 16                MOLISE   36544   24.26
## 17            BASILICATA   55453   23.32
## 18              CALABRIA  164915   22.61
## 19               SICILIA  319439   20.77
## 20              CAMPANIA  419623   19.21

2 - Barplot of the vote percentages for the 3 main parties

# The three lists shown in the barplot.
parties <- c("LEGA SALVINI PREMIER", "MOVIMENTO 5 STELLE", "PARTITO DEMOCRATICO")
# Fill colours keyed by list name: each list keeps its colour even if the
# ordering of the Lista levels (or of `parties`) changes.
mycol <- c(
  "LEGA SALVINI PREMIER" = "blue",
  "MOVIMENTO 5 STELLE" = "yellow",
  "PARTITO DEMOCRATICO" = "tomato"
)

mydata <- regdata[regdata$Lista %in% parties, ]

# Horizontal, side-by-side bars: one bar per list within each region.
gg <- ggplot(data = mydata, aes(x = REGIONE, y = pctVoti, fill = Lista))
gg + geom_bar(stat = "identity", position = "dodge") + coord_flip() +
  scale_fill_manual(values = mycol)

3 - Scatterplot of the vote percentages for 2 parties

Now we want to compare the vote percentages of 2 parties, LEGA and M5S, across the 20 regions:

# The five lists kept in the wide table built below.
parties <- c("LEGA SALVINI PREMIER", "FRATELLI D'ITALIA", "FORZA ITALIA",
             "MOVIMENTO 5 STELLE", "PARTITO DEMOCRATICO")

mydata <- regdata[regdata$Lista %in% parties, ]

# first we need to spread rows into columns: one row per region and one
# percentage column per list. The input columns are selected by name rather
# than by position, so the reshape does not depend on the column order.
library(tidyr)
wdata <- spread(
  data = mydata[, c("CIRCOSCRIZIONE", "REGIONE", "Lista", "pctVoti")],
  key = Lista, value = pctVoti
)
# Convert to a base data.frame before setting row names: row names on a
# tibble are deprecated and can be dropped by later operations.
wdata <- as.data.frame(wdata)
rownames(wdata) <- wdata$REGIONE
# Replace spaces in the column names with dots so they can be used in aes().
names(wdata) <- gsub(pattern = " ", replacement = ".", x = names(wdata))

gs <- ggplot(data = wdata, aes(x = LEGA.SALVINI.PREMIER, y = MOVIMENTO.5.STELLE))
# NOTE(review): `text` is not an aesthetic used by geom_point; presumably it
# is meant as a tooltip for an interactive conversion (e.g. plotly) - confirm.
gs + geom_point(aes(text = paste("Regione:", REGIONE)), size = 2, color = "forestgreen")

4 - Principal Components Analysis (PCA)

# PCA on the percentage columns of wdata: the first two columns are excluded
# from the input, and cor = FALSE requests a covariance-based decomposition.
pcx <- princomp(
  x = wdata[, -(1:2)],
  cor = FALSE
)
# Standard deviation and share of variance of each component.
summary(pcx)
## Importance of components:
##                            Comp.1    Comp.2     Comp.3     Comp.4
## Standard deviation     11.6115463 4.6507076 3.73759655 1.84038033
## Proportion of Variance  0.7675988 0.1231380 0.07953147 0.01928277
## Cumulative Proportion   0.7675988 0.8907368 0.97026829 0.98955106
##                            Comp.5
## Standard deviation     1.35474985
## Proportion of Variance 0.01044894
## Cumulative Proportion  1.00000000
# Loadings of each list on the principal components.
pcx$loadings
## 
## Loadings:
##                      Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## FORZA.ITALIA         -0.249 -0.161 -0.230  0.924       
## FRATELLI.D'ITALIA                  -0.209         0.973
## LEGA.SALVINI.PREMIER  0.644 -0.388 -0.645        -0.120
## MOVIMENTO.5.STELLE   -0.696        -0.593 -0.358 -0.182
## PARTITO.DEMOCRATICO   0.187  0.903 -0.368  0.116       
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings       1.0    1.0    1.0    1.0    1.0
## Proportion Var    0.2    0.2    0.2    0.2    0.2
## Cumulative Var    0.2    0.4    0.6    0.8    1.0
plot(pcx)   # screeplot: Var vs PC

# Observations are labelled with the region names taken from the REGIONE
# column, so the labels do not depend on row names being present on wdata.
biplot(pcx, col = c(4, 2), cex = c(0.6, 0.7), xlabs = wdata$REGIONE)