Our aim is to analyse the results of the 2019 European Parliament election in Italy. The analysis is carried out at the provincial level.

library(dplyr)
library(ggplot2)
# Point the session at the project folder and read the raw election results.
# NOTE(review): setwd() is kept so the relative path below resolves as before.
setwd("~/Coursera/Data_Science")
fileloc <- "./datasets/Italia/europee2019_scrutini_area_italia.csv"

# Semicolon-delimited file with a header row; surrounding blanks are trimmed
# and text columns stay character instead of being converted to factors.
itdata <- read.csv(
  file = fileloc,
  header = TRUE,
  sep = ";",
  strip.white = TRUE,
  stringsAsFactors = FALSE
)

# Give columns 3 to 6 the short working names used by the rest of the script.
names(itdata)[3:6] <- c("Provincia", "Comune", "Lista", "Voti")
str(itdata)
## 'data.frame':    124515 obs. of  6 variables:
##  $ CIRCOSCRIZIONE: chr  "I : ITALIA NORD-OCCIDENTALE" "I : ITALIA NORD-OCCIDENTALE" "I : ITALIA NORD-OCCIDENTALE" "I : ITALIA NORD-OCCIDENTALE" ...
##  $ REGIONE       : chr  "LIGURIA" "LIGURIA" "LIGURIA" "LIGURIA" ...
##  $ Provincia     : chr  "GENOVA" "GENOVA" "GENOVA" "GENOVA" ...
##  $ Comune        : chr  "ARENZANO" "ARENZANO" "ARENZANO" "ARENZANO" ...
##  $ Lista         : chr  "PARTITO DEMOCRATICO" "LEGA SALVINI PREMIER" "MOVIMENTO 5 STELLE" "FORZA ITALIA" ...
##  $ Voti          : int  1904 1809 969 415 350 236 150 115 60 33 ...

1 - Analysis at provincial level

# Sum the municipal counts up to one row per province and party.
# ungroup() clears the grouping that summarise() would otherwise leave on the
# result, so later joins, subsetting and sorting operate on a plain table.
provdata <- itdata %>%
  group_by(CIRCOSCRIZIONE, REGIONE, Provincia, Lista) %>%
  summarise(Voti = sum(Voti)) %>%
  ungroup()

# Total votes counted in each province, across all parties.
prov_total <- provdata %>%
  group_by(Provincia) %>%
  summarise(TotVoti = sum(Voti)) %>%
  ungroup()

# Provinces ranked from the largest to the smallest vote total.
print(arrange(prov_total, desc(TotVoti)))
## Source: local data frame [107 x 2]
## 
##    Provincia TotVoti
##        (chr)   (int)
## 1       ROMA 1723583
## 2     MILANO 1452603
## 3     TORINO 1117522
## 4     NAPOLI 1027989
## 5    BRESCIA  643829
## 6    BERGAMO  579830
## 7       BARI  527280
## 8    FIRENZE  521666
## 9    BOLOGNA  517039
## 10    PADOVA  487405
## ..       ...     ...

The following table shows the percentage of votes won by the leading party, LEGA, in each of the 18 provinces of the five selected Italian regions (Abruzzo, Lazio, Marche, Molise, Umbria):

# Attach each province's total to its party rows, then express every party's
# votes as a percentage of that total, rounded to two decimals.
provdata <- provdata %>%
  inner_join(y = prov_total, by = "Provincia") %>%
  mutate(pctVoti = round(x = 100 * Voti / TotVoti, digits = 2))

# Party and regions reported in this section.
nameLista <- "LEGA SALVINI PREMIER"
nameRegioni <- c("ABRUZZO", "LAZIO", "MARCHE", "MOLISE", "UMBRIA")

# Keep that party's rows in the selected regions, with the reporting columns.
lista_match <- provdata$Lista == nameLista
regione_match <- provdata$REGIONE %in% nameRegioni
report_cols <- c("REGIONE", "Provincia", "Voti", "pctVoti")
mydata <- provdata[lista_match & regione_match, report_cols]

# Highest percentage first.
print(arrange(mydata, desc(pctVoti)))
## Source: local data frame [18 x 4]
## 
##    REGIONE       Provincia   Voti pctVoti
##      (chr)           (chr)  (int)   (dbl)
## 1   MARCHE           FERMO  35194   43.37
## 2    LAZIO         VITERBO  64810   41.42
## 3    LAZIO           RIETI  31573   41.04
## 4   MARCHE        MACERATA  64063   40.99
## 5    LAZIO       FROSINONE  96671   40.35
## 6    LAZIO          LATINA  91230   38.90
## 7   UMBRIA           TERNI  42031   38.56
## 8  ABRUZZO          TERAMO  51515   38.13
## 9   UMBRIA         PERUGIA 129427   38.06
## 10  MARCHE   ASCOLI PICENO  41155   38.00
## 11  MARCHE PESARO E URBINO  72898   37.82
## 12 ABRUZZO        L'AQUILA  43471   35.67
## 13 ABRUZZO          CHIETI  58727   34.42
## 14  MARCHE          ANCONA  77751   34.13
## 15 ABRUZZO         PESCARA  51657   33.54
## 16   LAZIO            ROMA 509605   29.57
## 17  MOLISE      CAMPOBASSO  27349   24.57
## 18  MOLISE         ISERNIA   9195   23.38

2 - Plot of the vote percentages for the 3 main parties

# Fill colours and the three parties compared in this section.
mycol <- c("blue", "yellow", "tomato")
parties <- c(
  "LEGA SALVINI PREMIER",
  "MOVIMENTO 5 STELLE",
  "PARTITO DEMOCRATICO"
)

# Rows for those parties within the selected regions.
keep_party <- provdata$Lista %in% parties
keep_region <- provdata$REGIONE %in% nameRegioni
mydata <- provdata[keep_party & keep_region, ]

# Boxplot: one box per party over its provincial percentages.
gg <- ggplot(data = mydata, aes(x = Lista, y = pctVoti, fill = Lista))
gg +
  geom_boxplot() +
  scale_fill_manual(values = mycol)

# Barplot: side-by-side bars per province, drawn horizontally.
gg <- ggplot(data = mydata, aes(x = Provincia, y = pctVoti, fill = Lista))
gg +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = mycol) +
  coord_flip()

3 - Scatterplot of the vote percentages for 2 parties

Next we compare the vote percentages of two parties, LEGA and M5S, across the 18 provinces. The wide table built below keeps five parties, because the PCA in the next section uses all of them:

# Parties kept as columns of the wide table.
parties <- c(
  "LEGA SALVINI PREMIER", "FRATELLI D'ITALIA", "FORZA ITALIA",
  "MOVIMENTO 5 STELLE", "PARTITO DEMOCRATICO"
)

mydata <- provdata[provdata$Lista %in% parties & provdata$REGIONE %in% nameRegioni, ]

# first we need to spread rows into columns
library(tidyr)

# Drop the raw counts by name rather than by position: spread() needs rows
# that differ only by party, and name-based selection does not depend on the
# column order produced by the earlier join.
count_cols <- c("Voti", "TotVoti")
long_pct <- mydata[, setdiff(names(mydata), count_cols)]
wdata <- spread(data = long_pct, key = Lista, value = pctVoti)

# Show the province and party columns only (constituency and region omitted).
print(wdata[, setdiff(names(wdata), c("CIRCOSCRIZIONE", "REGIONE"))])
## Source: local data frame [18 x 6]
## 
##          Provincia FORZA ITALIA FRATELLI D'ITALIA LEGA SALVINI PREMIER
##              (chr)        (dbl)             (dbl)                (dbl)
## 1        FROSINONE         8.13              8.90                40.35
## 2           LATINA        11.98             10.58                38.90
## 3            RIETI         6.68              8.17                41.04
## 4             ROMA         5.92              8.88                29.57
## 5          VITERBO         6.35              8.55                41.42
## 6           ANCONA         4.88              5.40                34.13
## 7    ASCOLI PICENO         6.55              5.92                38.00
## 8            FERMO         5.42              5.64                43.37
## 9         MACERATA         5.87              7.91                40.99
## 10 PESARO E URBINO         5.50              4.66                37.82
## 11         PERUGIA         6.52              6.70                38.06
## 12           TERNI         6.10              6.22                38.56
## 13          CHIETI         9.36              5.63                34.42
## 14        L'AQUILA        10.42             10.54                35.67
## 15         PESCARA         9.90              6.72                33.54
## 16          TERAMO         7.92              5.87                38.13
## 17      CAMPOBASSO        13.00              5.47                24.57
## 18         ISERNIA        21.84              8.75                23.38
## Variables not shown: MOVIMENTO 5 STELLE (dbl), PARTITO DEMOCRATICO (dbl)
# Setting row names on a tibble is deprecated, so switch to a base data frame
# before labelling the rows by province; the labels are reused by the biplot.
wdata <- as.data.frame(wdata)
rownames(wdata) <- wdata$Provincia

# Replace blanks in the column names with dots so the party columns can be
# referenced directly inside aes().
names(wdata) <- gsub(pattern = " ", replacement = ".", x = names(wdata))

# Scatterplot: League percentage against Five Star Movement percentage.
# NOTE(review): `text` is not a standard geom_point aesthetic; presumably it is
# meant for an interactive tooltip layer - confirm.
gs <- ggplot(data = wdata, aes(x = LEGA.SALVINI.PREMIER, y = MOVIMENTO.5.STELLE))
gs + geom_point(aes(text = paste("Regione:", REGIONE)), size = 2, color = "forestgreen")

4 - Principal Components Analysis (PCA)

# PCA on the covariance matrix (cor = FALSE) of the party percentage columns.
# The identifier columns are excluded by name, so the selection does not
# depend on their position in wdata.
id_cols <- c("CIRCOSCRIZIONE", "REGIONE", "Provincia")
pcx <- princomp(x = wdata[, setdiff(names(wdata), id_cols)], cor = FALSE)
summary(pcx)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3     Comp.4     Comp.5
## Standard deviation     7.0403320 4.1627923 3.1840604 1.19978052 0.33169730
## Proportion of Variance 0.6307518 0.2205168 0.1290134 0.01831791 0.00140009
## Cumulative Proportion  0.6307518 0.8512686 0.9802820 0.99859991 1.00000000
# Scree plot: variance carried by each principal component.
screeplot(x = pcx)

# Loadings of every party on every component.
loadings(x = pcx)
## 
## Loadings:
##                      Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## FORZA.ITALIA          0.489 -0.304  0.440 -0.479  0.495
## FRATELLI.D'ITALIA           -0.181  0.389  0.836  0.341
## LEGA.SALVINI.PREMIER -0.697 -0.516 -0.226 -0.166  0.412
## MOVIMENTO.5.STELLE    0.437        -0.762  0.177  0.442
## PARTITO.DEMOCRATICO  -0.291  0.779  0.151 -0.111  0.523
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings       1.0    1.0    1.0    1.0    1.0
## Proportion Var    0.2    0.2    0.2    0.2    0.2
## Cumulative Var    0.2    0.4    0.6    0.8    1.0
# Biplot of the PCA fit; observations are labelled with the row names of wdata.
biplot(
  pcx,
  col = c(4, 2),
  cex = c(0.6, 0.7),
  xlabs = rownames(wdata)
)

5 - Map of the results

# Outline data for the "italy" map, keyed by upper-cased region name so it can
# be matched against the province names in the election table.
library(maps)
mymap <- map_data(map = "italy")
mymap[["Provincia"]] <- toupper(mymap[["region"]])

# Attach the outline points to every province/party row with a matching name.
area <- provdata %>%
  inner_join(y = mymap, by = "Provincia")

## choose a party (Lista)
lega_rows <- area$Lista == "LEGA SALVINI PREMIER"
region_rows <- area$REGIONE %in% nameRegioni
votiLega <- area[lega_rows & region_rows, ]
summary(votiLega[["pctVoti"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   23.38   34.13   38.06   36.09   38.90   41.42
## define 5 intervals of vote percentages
votiLega$pctVoti_LEGA <- cut_interval(x = votiLega$pctVoti, n = 5, dig.lab = 3)

# One polygon layer, filled by percentage interval and outlined in blue.
province_layer <- geom_polygon(
  data = votiLega,
  mapping = aes(x = long, y = lat, group = group, fill = pctVoti_LEGA),
  color = "blue"
)
gg <- ggplot() + province_layer

# Sequential purple-blue palette on a fixed aspect ratio.
gg +
  scale_fill_brewer(palette = "PuBu") +
  coord_fixed(ratio = 1.3) +
  theme_bw()