For these visualizations, I started with the code from D. Toshkov’s webpage (http://www.dimiter.eu/Visualizations.html), which is also available directly on GitHub. However, bits and pieces of the code have since been changed and adjusted, following multiple online resources.
library(foreign)
library(ggplot2)
library(ggridges)
library(forcats)
#read the data
data=read.dta("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/Scaled summaries/validation_summaries/05022021_validation_scaled1.dta")
data$var1=data$lr_probabilities_wobp
#png("lr_probabilities.png",width=600, height=400, res=96)
par(family='Montserrat')
#setting up colors ## a lot of messy code!
col1<-rgb(225, 25, 27, alpha=255, max=255)
col2<-rgb(30, 93, 160, alpha=255, max=255)
col3<-rgb(255, 220, 69, alpha=255, max=255)
col1a<-rgb(225, 25, 27, alpha=195, max=255)
col2a<-rgb(30, 93, 160, alpha=195, max=255)
col3a<-rgb(255, 220, 69, alpha=195, max=255)
col3b<-rgb(248, 202, 8, alpha=255, max=255)
bg.col<-rgb(255, 251, 245, alpha=255,max=255)
dark.color=rgb(35, 35, 58, max=255) # dark color
#extra colors
main.color=rgb(238, 80, 121, alpha=175, max=255) #main color
main.color2<-rgb(64,193,230, alpha=175, max=255)
main.color3<-rgb(50,100,10, alpha=175, max=255)
main.color4<-rgb(80,10,100, alpha=175, max=255)
main.color5<-rgb(155,138,4, alpha=175, max=255)
main.colora=rgb(238, 80, 121, alpha=255, max=255) #main color
main.color2a<-rgb(64,193,230, alpha=255, max=255)
main.color3a<-rgb(50,100,10, alpha=255, max=255)
main.color4a<-rgb(80,10,100, alpha=255, max=255)
main.color5a<-rgb(155,138,4, alpha=255, max=255)
colors<-c(main.color, main.color3, main.color4, main.color2, main.color5 )
colors2<-c(main.colora, main.color3a, main.color4a, main.color2a, main.color5a)
background.color=rgb(255, 251, 245, max=255) #color for the background
#plot
d=ggplot(data, aes(x = var1, y = validated, fill = validated, color = validated)) +
geom_density_ridges(scale = 4) +
scale_fill_cyclical(values = c(col1a, col2a)) +
scale_color_cyclical(values = c(col1,col2)) +
scale_x_continuous(limits=c(min(data$var1,na.rm=T),max(data$var1,na.rm=T)), expand = c(0.01, 0)) +
scale_y_discrete(expand = c(0.01, 1)) + labs(title = 'Density plot of the LR probabilities: Validated and full sample', x='LR without bullet points')+
theme_ridges(font_size = 12, grid = TRUE) + theme(plot.margin = unit(c(1,1,1,1), "lines"),
plot.title = element_text(color=dark.color, size=14, face="italic", hjust=0),
plot.subtitle = element_text(color=dark.color, size=12, face="italic"),
# plot.background = element_rect(fill=background.color, colour=dark.color),
# axis.text.y=element_blank(), axis.ticks.y=element_blank( ),
axis.title.y = element_blank(), axis.title.x = element_text(color=dark.color, hjust=0.5))+
coord_cartesian(ylim = c(1, 10), clip = 'off')
print(d)
## Picking joint bandwidth of 0.0417
## Warning: Removed 112 rows containing non-finite values (stat_density_ridges).
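The “Picking joint bandwidth” message comes from stat_density_ridges estimating one common bandwidth for all groups. A minimal sketch, assuming we want to pin the smoothing down for reproducibility (0.0417 is simply the value reported above):
#optional: set the joint bandwidth explicitly instead of letting ggridges pick it
ggplot(data, aes(x = var1, y = validated, fill = validated)) +
geom_density_ridges(scale = 4, bandwidth = 0.0417) #0.0417 = the value reported in the run above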
The figure below shows the distribution of probabilities generated by the LR model. For this presentation, I will be using the predictions of the model that excludes bullet points as tokens. The graph below distinguishes between reviewed/validated cases and non-validated cases.
p=ggplot(data, aes(x = lr_probabilities_wobp, fill = validated)) + # Draw overlaying histogram with frequencies
geom_histogram(position = "identity", alpha = 0.3, bins = 50) +
scale_color_cyclical(values = c(col1,col2)) +
labs(title = 'Frequencies for LR probabilities: Validated and full sample', x='LR without bullet points')+
theme_ridges(font_size = 12, grid = TRUE) +
theme(legend.title = element_text(color=dark.color, size=12),
plot.title = element_text(color=dark.color, size=12, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=12, face="italic"),
axis.title.y = element_blank(), axis.title.x = element_text(color=dark.color, hjust=0.5))+
coord_cartesian(ylim = c(1, 50), clip = 'off')
p <- p + guides(fill=guide_legend(title="Validated?"))
print(p)
## Warning: Removed 112 rows containing non-finite values (stat_bin).
It appears that, overall, the validation efforts allowed us to review a representative sample of the procedures. However, the SIF model’s probability distribution looks quite odd (though I think there is a transformation that can be used to deal with it).
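A minimal sketch of one such transformation (an assumption on my part, not a settled choice): clipping the probabilities away from 0 and 1 and applying the logit often makes an oddly shaped probability distribution easier to inspect.
#hedged sketch: logit-transform the SIF probabilities for inspection
eps <- 1e-6
sif_logit <- qlogis(pmin(pmax(data$sif_lr_probs, eps), 1 - eps)) #clip to (0,1), then logit
hist(sif_logit, main = "SIF_LR probabilities on the logit scale")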
p=ggplot(data, aes(x = sif_lr_probs, fill = validated)) + # Draw overlaying histogram with frequencies
geom_histogram(position = "identity", alpha = 0.3, bins = 50) +
scale_color_cyclical(values = c(col1,col2)) +
labs(title = 'SIF_LR probabilities: Validated and full sample', x='SIF_LR predictions')+
theme_ridges(font_size = 12, grid = TRUE) +
theme( legend.title = element_text(color=dark.color, size=12),
plot.title = element_text(color=dark.color, size=12, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=12, face="italic"),
axis.title.y = element_blank(), axis.title.x = element_text(color=dark.color, hjust=0.5))+
coord_cartesian(ylim = c(1, 100), clip = 'off')
p <- p + guides(fill=guide_legend(title="Validated?"))
print(p)
## Warning: Removed 112 rows containing non-finite values (stat_bin).
Let’s take a look at the predicted probabilities across the Committees (as proxies for policy areas). Note an interesting pro-increase skewness of ECON (in contrast to Hagemann et al.).
#Plot1
p=ggplot(data, aes(x = lr_probabilities_wobp, y = comm, fill = comm)) +
geom_density_ridges( alpha = 0.8) +
labs(title ="Predictions of the Logit model", x='LR without bullet points', y='Committee') +
theme_ridges(font_size = 12, grid = TRUE) + theme( legend.title = element_text(color=dark.color, size=12),
plot.title = element_text( size=12, face="italic", hjust=0.5),
plot.subtitle = element_text( size=12, face="italic"),
axis.title.y = element_blank(), axis.title.x = element_text(color=dark.color,hjust=0.5))+ #the '+' here matters: without it, coord_cartesian() is printed instead of added to the plot
coord_cartesian(ylim = c(1, 50), clip = 'off')
p <- p + guides(fill=guide_legend(title="Committees", reverse = TRUE))
print(p)
## Picking joint bandwidth of 0.068
## Warning: Removed 112 rows containing non-finite values (stat_density_ridges).
In the next step, I removed the procedures which were categorised into different groups by the logit model and the coder(s).
p=ggplot(data, aes(x = lr_probabilities_wobp, fill = ea_lrwobp)) + # Draw overlaying histogram with frequencies
geom_histogram(position = "identity", alpha = 0.3, bins = 50) +
# scale_color_cyclical(values = c(col1, col3, col3b)) +
scale_fill_cyclical(values = c(col1, col2a)) +
labs(title = 'Overview of the LR and EA agreement', x='LR without bullet points')+
theme_ridges(font_size = 12, grid = TRUE) +
theme(legend.title = element_text(color=dark.color, size=10),
plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
axis.text.y=element_text(color=dark.color, hjust=0.5),
axis.title.x = element_text(color=dark.color, hjust=0.5))+
coord_cartesian(ylim = c(0, 15), clip = 'off')
p <- p + guides(fill=guide_legend(title="Correcly categorised?"))
print(p)
## Warning: Removed 112 rows containing non-finite values (stat_bin).
## More plots
Below, I summarize the clustering of the LR probabilities. The bars capture both agreement and disagreement between the coder’s classification and the model’s categorization. The bars in the background show the mean probability for each group of cases.
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data1=subset(data, ea_lrwobp=="FALSE" | ea_lrwobp=="TRUE") ## subset only cases which were actually coded
gd <- data1 %>% ## group them by the agreement between LR and EA coding
group_by(ea_lrwobp) %>%
summarise(lr_probabilities_wobp = mean(lr_probabilities_wobp))
gd
## # A tibble: 2 x 2
## ea_lrwobp lr_probabilities_wobp
## * <fct> <dbl>
## 1 FALSE 0.441
## 2 TRUE 0.420
ggplot(data1, aes(x =ea_lrwobp , y =lr_probabilities_wobp)) +
geom_point()+
geom_bar(data=gd, stat = "identity", alpha = .2)+
ggrepel::geom_text_repel(aes(label = comm), color = "black", size = 2.5, segment.color = "grey") +
guides(color = "none", fill = "none") +
theme_ridges(font_size = 12, grid = TRUE) +
theme(legend.title = element_text(color=dark.color, size=12),
plot.title = element_text(color=dark.color, size=12, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=12, face="italic"),
axis.title.y = element_blank(), axis.title.x = element_text(color=dark.color, hjust=0.5))+
labs(
title = "Probability by agreement with EA",
x = "LR and EA agree on classification",
y = " LR Probaility"
)
In the plot below, it is evident that the most frequent disagreements between the summary-level machine scaling and the coder occur in areas such as ENVI (16 procedures), ECON (7), TRAN (3), PECH (3), and INTA (2).
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
tabyl(data1, comm, ea_lrwobp )
## comm FALSE TRUE
## AGRI 1 2
## BUDG 0 1
## CULT 1 0
## ECON 7 8
## EMPL 0 1
## ENVI 16 18
## IMCO 1 7
## INTA 2 14
## ITRE 1 6
## JURI 1 9
## LIBE 1 7
## PECH 3 10
## REGI 0 2
## TRAN 3 19
tabyl(data1, comm, ea_lrwobp )%>%
adorn_percentages("col") %>%
adorn_pct_formatting(digits = 1)
## comm FALSE TRUE
## AGRI 2.7% 1.9%
## BUDG 0.0% 1.0%
## CULT 2.7% 0.0%
## ECON 18.9% 7.7%
## EMPL 0.0% 1.0%
## ENVI 43.2% 17.3%
## IMCO 2.7% 6.7%
## INTA 5.4% 13.5%
## ITRE 2.7% 5.8%
## JURI 2.7% 8.7%
## LIBE 2.7% 6.7%
## PECH 8.1% 9.6%
## REGI 0.0% 1.9%
## TRAN 8.1% 18.3%
library(viridis)
## Loading required package: viridisLite
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
p= ggplot(data1, aes(x =ea_lrwobp , y =comm)) +
geom_count(aes(color=comm), alpha = 0.3)+
scale_colour_hue(guide = "none")+
scale_fill_brewer(palette=1)+
scale_size(range = c(0, 25))+
# scale_fill_viridis(discrete=TRUE, guide=FALSE, option="A") +
theme_minimal()+ #apply the base theme first so the custom theme() tweaks below are not overridden
theme(legend.title = element_text(color=dark.color, size=12),
plot.title = element_text(color=dark.color, size=12, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=12, face="italic"),
axis.title.y = element_blank(), axis.title.x = element_text(color=dark.color, hjust=0))+
labs(
title = "Occurances of misclassification by Committee",
x = "Coder-Model agreement",
y = "Committee"
)
p + theme(legend.title = element_text(color = "black", size = 10),
legend.key.size = unit(0.5, "cm"),
legend.key.width = unit(0.5,"cm")
)
## Some cases were misclassified by all models.
Here are the cases where the classifications by the LR and SIF-LR models agree with each other, but both coders captured mis-categorization by the models. The plot below locates these cases using the similarity measures. SIF_negative captures the degree to which the summary is similar to the candidate summary used to train the model; a similar interpretation applies to SIF_positive. The middle of the map (at the intersection of SIF_negative = 0.5 and SIF_positive = 0.5) represents borderline cases, which exhibit similarity both to summaries expanding the competencies and to those seeking to maintain the SQ.
#trying to plot the cases where LR and SIF went wrong and 2 coders agree
#subset the relevant cases
failure=subset(data1, completely_wrong_lrwobp=="Both models wrong" & coder_agree=="TRUE")
#head(failure)
##create a color scheme ==> note: this needs comm to be a factor; levels() on a character vector returns NULL, which is likely why it did not fully work before
library(RColorBrewer)
failure$comm <- factor(failure$comm)
n_comm <- length(levels(failure$comm))
myColors <- brewer.pal(max(n_comm, 3), "Set1")[seq_len(n_comm)] #Set1 supports 3-9 colors
names(myColors) <- levels(failure$comm)
colScale <- scale_colour_manual(name = "comm", values = myColors)
f=ggplot(failure, aes(x = sif_pos_similarity, y = sif_neg_similarity, color = comm)) +
geom_point(aes(color=comm), size=3, alpha=0.6)+
#scale_fill_brewer(palette=1)
theme_minimal()+ #base theme first, then the custom tweaks
theme(legend.title = element_text(color=dark.color, size=12),
plot.title = element_text(color=dark.color, size=12, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=12, face="italic"),
axis.title.y = element_blank(), axis.title.x = element_text(color=dark.color, hjust=0))+
labs(
title = "Cases classified inccorectly by COMM over SIF similarity",
x = "SIF_Positive similarity",
y = "SIF-Negative similarity"
)
f + expand_limits(x =c(0, 1), y=c(0,1))
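Since the text above singles out the (0.5, 0.5) intersection as the borderline region, it can help to mark it on the map; a small optional addition of dashed reference lines:
#mark the borderline region described above (purely cosmetic)
f + expand_limits(x = c(0, 1), y = c(0, 1)) +
geom_vline(xintercept = 0.5, linetype = "dashed", color = "grey50") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "grey50")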
It would also be nice to get an overview of the variance in proposals seeking to change the EU competence level over the years and across policy areas.
#quick rename of the messy variables
names(data)[names(data) == "interinstitutionalprocedurecodey"] <- "inter_year" # I will use the Year indicated in the procedure code
names(data1)[names(data1) == "interinstitutionalprocedurecodey"] <- "inter_year"
data <- data %>% mutate(inter_year = replace(inter_year, startsWith(cod, "2018"), 2018)) %>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2017"), 2017))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2016"), 2016))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2015"), 2015)) %>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2014"), 2014))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2013"), 2013))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2012"), 2012)) %>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2019"), 2019))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2020"), 2020))
#change double-years in the procedure year into Year indicated in the procedure name
data <- data %>% mutate(inter_year = ifelse(celexnumber == "32012D0243", 2010, inter_year))
data <- data %>% mutate(inter_year = ifelse(celexnumber == "32011R1232", 2008, inter_year))
#summarize(yearly_lrprobability)
data_ts=subset(data, !is.na(lr_probabilities_wobp))
yearly_lrprobability <- data_ts %>%
group_by(inter_year, comm) %>%
summarize(avg_prob = mean(lr_probabilities_wobp))
## `summarise()` has grouped output by 'inter_year'. You can override using the `.groups` argument.
competence= ggplot(data =subset(yearly_lrprobability, !is.na(avg_prob)),
aes(y=avg_prob, x=inter_year, group=comm, color=comm)) +
geom_point()+
geom_line() + #+ facet_grid(. ~ comm, scales = "free", space = "free")
theme_minimal()+
theme(legend.title = element_text(color=dark.color, size=10.5),
plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=10, face="italic"),
axis.title.y = element_text(size = 8, face='italic'),
axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'),
legend.text=element_text(size=8), legend.key.size = unit(0.4, 'cm'))+
labs(
title = " Average probaility of proposal advocating for more EU by Committee",
x = "Year the proposal tabled",
y = "Probality of Expanding the EU Competence"
)
competence+guides(colour = guide_legend(override.aes = list(size=1)))
data_nona=subset(data, !is.na(lr_probabilities_wobp))
yearly_aggr2_lrprobability <- data_nona %>%
group_by(inter_year) %>%
summarise(avg_prob2= mean(lr_probabilities_wobp),
sd_av=sd(lr_probabilities_wobp),
max_ci = avg_prob2 + 1*sd_av, # upper bound of a mean +/- 1 SD band (not a true CI, despite the name)
min_ci = avg_prob2 - 1*sd_av) %>%
ungroup()
#dt=subset(yearly_aggr2_lrprobability, !is.na(min_ci)) # no way of calculating the CI for very early proposals// but they can be dropped
ggplot(data=yearly_aggr2_lrprobability,
aes(x=inter_year, color = inter_year, group=1)) +
geom_line(aes(y= avg_prob2))+
geom_line( aes (y =max_ci), alpha = 0.3)+
geom_line(aes(y=min_ci), alpha=0.3, )+
theme_minimal()+
theme(legend.title = element_blank(),
plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=10, face="italic"),
axis.title.y = element_text(size = 8, face='italic'),
axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'),
legend.text=element_blank (), legend.key = element_blank(), legend.position = "none")+
labs(
title = " Mean Pr of extending EU authority 1SD variance",
x = "Year the proposal was tabled",
y = "Probability of Expanding the EU Competence"
)
## Warning: Removed 2 row(s) containing missing values (geom_path).
## Warning: Removed 2 row(s) containing missing values (geom_path).
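A presentational variant (my suggestion, not the original figure): the two faint lines can be drawn as one shaded band with geom_ribbon, which makes the dispersion easier to read.
#hedged sketch: shade the mean +/- 1 SD band instead of drawing two faint lines
ggplot(yearly_aggr2_lrprobability, aes(x = inter_year, group = 1)) +
geom_ribbon(aes(ymin = min_ci, ymax = max_ci), fill = "grey80", alpha = 0.5) +
geom_line(aes(y = avg_prob2)) +
theme_minimal()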
Here is the subsample of procedures on which the EA and the LRw/obp disagreed. For this subsample, I calculated the absolute difference between the probabilities provided by the two LR models.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.6 v purrr 0.3.4
## v readr 1.4.0 v stringr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
data_diff=subset(data_nona, select=c(lr_probabilities, lr_probabilities_wobp, lr_predictions_wobp, validated, cod, comm, inter_year, ea_categ,
ak_categ, lrbp_lrwobp_agree, ea_lrwobp ))
data_diff$abs_dist_lr=abs(data_diff$lr_probabilities-data_diff$lr_probabilities_wobp) # calculate absolute distance between LR and LRw/o bullet point
summary(data_diff)
## lr_probabilities lr_probabilities_wobp lr_predictions_wobp validated
## Min. :0.1000 Min. :0.1100 Min. :0.0000 NO :637
## 1st Qu.:0.3300 1st Qu.:0.3300 1st Qu.:0.0000 YES:141
## Median :0.4400 Median :0.4400 Median :0.0000
## Mean :0.4627 Mean :0.4636 Mean :0.3856
## 3rd Qu.:0.5800 3rd Qu.:0.5700 3rd Qu.:1.0000
## Max. :0.9700 Max. :0.9700 Max. :1.0000
## cod comm inter_year ea_categ
## Length:778 Length:778 Length:778 Not_expanding: 91
## Class :character Class :character Class :character Expanding : 50
## Mode :character Mode :character Mode :character NA's :637
##
##
##
##           ak_categ   lrbp_lrwobp_agree ea_lrwobp
##  Not_expanding:  9   FALSE: 24         FALSE: 37
##  Expanding    : 10   TRUE :754         TRUE :104
##  NA's         :759                     NA's :637
## abs_dist_lr
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.01000
## Mean :0.01301
## 3rd Qu.:0.01000
## Max. :0.12000
mismatch=data_diff%>%
filter(validated=="YES" & ea_lrwobp=="FALSE" ) %>%
select(cod, comm, abs_dist_lr) %>%
group_by(comm) %>%
mutate(counts=n()) %>%
count(comm) %>%
ungroup()
head(mismatch) #showing the total number of disagreements between the EA and the LRwobp by committee
## # A tibble: 6 x 2
## comm n
## <chr> <int>
## 1 AGRI 1
## 2 CULT 1
## 3 ECON 7
## 4 ENVI 16
## 5 IMCO 1
## 6 INTA 2
mismatch=data_diff%>%
filter(validated=="YES" & ea_lrwobp=="FALSE" ) %>%
select(cod, comm, abs_dist_lr)
head(mismatch) # the absolute difference in the probabilities of expanding competencies between LR and LRw/obp, by procedure and with the committee responsible. NOTA BENE: these are the cases on which the LRw/obp and the EA disagree.
## cod comm abs_dist_lr
## 752 2015/0148(COD) ENVI 0.00000000
## 753 2008/0196(COD) IMCO 0.02000001
## 755 2012/0305(COD) ENVI 0.00000000
## 757 2012/0364(COD) ECON 0.08000001
## 758 2009/0116(COD) PECH 0.00000000
## 759 2017/0293(COD) ENVI 0.00000000
The following plot shows the number of proposals that expand the competence of the EU within each committee over the entire timeframe.
# calculate the N of classifications as 'expanding' by committee
# 1. subset the relevant variables so as not to drag everything around
expandEU=data_nona %>%
filter(lr_predictions_wobp==1) %>%
select(cod, comm, lr_predictions_wobp, validated, lr_probabilities_wobp, inter_year )
head(expandEU) # alright, looking good!
## cod comm lr_predictions_wobp validated lr_probabilities_wobp
## 2 2019/0040(COD) TRAN 1 NO 0.55
## 5 2011/0038(COD) JURI 1 NO 0.56
## 6 2013/0015(COD) TRAN 1 NO 0.67
## 8 2010/0160(COD) ECON 1 NO 0.83
## 15 2016/0377(COD) ITRE 1 NO 0.73
## 17 2009/0099(COD) ECON 1 NO 0.68
## inter_year
## 2 2019
## 5 2011
## 6 2013
## 8 2010
## 15 2016
## 17 2009
#2. for peace of mind, let's take a look at the clusters as well
expandEUugr= expandEU %>%
group_by(comm, inter_year ) %>%
mutate(counts=n()) %>%
count(comm) %>%
ungroup()
head(expandEUugr)
## # A tibble: 6 x 3
## comm inter_year n
## <chr> <chr> <int>
## 1 AFCO 2010 1
## 2 AFCO 2018 1
## 3 AFET 2011 5
## 4 AFET 2016 1
## 5 AGRI 2010 2
## 6 AGRI 2014 1
#3. use the calculated frequencies and make a plot
expand=ggplot(expandEUugr, aes(x=inter_year, y=comm, group=comm, size=n)) +
geom_point(aes(size=n, color=comm), alpha=0.5 )+
theme_minimal()+
theme(legend.title = element_text(color=dark.color, size=10.5),
plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=10, face="italic"),
axis.title.y = element_text(size = 8, face='italic'),
axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'),
legend.text=element_text(size=8), legend.key.size = unit(0.4, 'cm'))+
labs(
title = " N of proposals expanding the EU competence by COM over time",
x = "Year the proposal tabled",
y = " Committee")
expand+guides(fill=guide_legend(title="Committees", reverse = TRUE))
expand
The plot below maps the patterns of SQ preservation by Committee over the years. The larger the bubble for each COM-YEAR, the more proposals maintaining the SQ were put forward by the Commission that year.
# calculate the N of the summaries that keep SQ
SQ_eu =data_nona %>%
filter(lr_predictions_wobp==0) %>%
select(cod, comm, lr_predictions_wobp, validated, lr_probabilities_wobp, inter_year )
head(SQ_eu) # alright, looking good!
## cod comm lr_predictions_wobp validated lr_probabilities_wobp
## 3 2016/0368(COD) TRAN 0 NO 0.26
## 4 2013/0443(COD) ENVI 0 NO 0.40
## 9 2011/0354(COD) IMCO 0 NO 0.16
## 10 2011/0281(COD) AGRI 0 NO 0.24
## 11 2011/0051(COD) LIBE 0 NO 0.47
## 12 2014/0174(COD) JURI 0 NO 0.23
## inter_year
## 3 2016
## 4 2013
## 9 2011
## 10 2011
## 11 2011
## 12 2014
#2. for peace of mind, let's take a look at the clusters as well
SQ_EUgr= SQ_eu %>%
group_by(comm, inter_year ) %>%
mutate(counts=n()) %>%
count(comm) %>%
ungroup()
head(SQ_EUgr)
## # A tibble: 6 x 3
## comm inter_year n
## <chr> <chr> <int>
## 1 AFCO 2012 1
## 2 AFCO 2017 1
## 3 AFCO 2020 1
## 4 AFET 2009 1
## 5 AFET 2011 1
## 6 AFET 2020 1
#3. use the calculated frequencies and make a plot
sq=ggplot(SQ_EUgr, aes(x=inter_year, y=comm, group=comm, size=n)) +
geom_point(aes(size=n, color=comm), alpha=0.5 )+
theme_minimal()+
theme(legend.title = element_text(color=dark.color, size=10),
plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=10, face="italic"),
axis.title.y = element_text(size = 8, face='italic'),
axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'),
legend.text=element_text(size=8), legend.key.size = unit(0.4, 'cm'))+
labs(
title = " N of proposals keeping SQ in the EU by COM over time",
x = "Year the proposal tabled",
y = " Committee"
)
# guides(colour = guide_legend(override.aes = list(size=1)))+
# guides(fill=guide_legend(title="Committees", reverse = FALSE))
sq
Thereafter, I check the summaries for those procedures (using the LO of the EP).
hist( data_diff$lr_probabilities_wobp)
highest_prob= data_diff %>%
filter( lr_probabilities_wobp>=0.8) %>%
select(cod, comm, lr_probabilities_wobp, abs_dist_lr)
highest_prob
## cod comm lr_probabilities_wobp abs_dist_lr
## 8 2010/0160(COD) ECON 0.83 0.00999999
## 35 2013/0164(COD) ITRE 0.88 0.00000000
## 43 2008/0246(COD) TRAN 0.89 0.00999999
## 66 2011/0394(COD) ITRE 0.84 0.05000001
## 135 2010/0039(COD) LIBE 0.88 0.00999999
## 143 2011/0129(COD) LIBE 0.97 0.00000000
## 156 2011/0421(COD) ENVI 0.80 0.00999999
## 158 2011/0454(COD) CONT 0.83 0.05000001
## 159 2010/0280(COD) ECON 0.80 0.00000000
## 258 2011/0412(COD) AFET 0.88 0.00999999
## 261 2011/0368(COD) LIBE 0.88 0.00999999
## 264 2013/0081(COD) LIBE 0.84 0.01999998
## 280 2013/0091(COD) LIBE 0.84 0.00999999
## 282 2011/0369(COD) JURI 0.80 0.02000004
## 318 2012/0036(COD) LIBE 0.87 0.00999999
## 329 2016/0275(COD) BUDG 0.90 0.00000000
## 346 2012/0245(COD) DEVE 0.93 0.00000000
## 361 2015/0287(COD) IMCO 0.84 0.00000000
## 364 2011/0406(COD) DEVE 0.95 0.00999999
## 371 2010/0242(COD) EMPL 0.81 0.00999999
## 385 2011/0404(COD) AFET 0.87 0.00999999
## 393 2011/0365(COD) LIBE 0.90 0.00999999
## 429 2011/0405(COD) AFET 0.90 0.00999999
## 499 2010/0383(COD) JURI 0.84 0.00999999
## 500 2011/0413(COD) AFET 0.90 0.00000000
## 508 2010/0207(COD) ECON 0.82 0.00000000
## 526 2011/0339(COD) ENVI 0.94 0.00999999
## 637 2011/0415(COD) AFET 0.82 0.00999999
## 671 2010/0275(COD) ITRE 0.87 0.00999999
## 708 2015/0277(COD) TRAN 0.84 0.00999999
## 748 2011/0427(COD) LIBE 0.90 0.00000000
## 776 2019/0108(COD) TRAN 0.80 0.00000000
## 779 2019/0107(COD) TRAN 0.81 0.00999999
#let's see the groupings by comm
highest_prob_grouped= highest_prob %>%
group_by(comm) %>%
mutate(counts=n()) %>%
count(comm) %>%
ungroup()
highest_prob_grouped # a clear pattern emerges: the highest probabilities are most frequently found in LIBE (8), AFET (5), TRAN (4), and ITRE and ECON (3 each)
## # A tibble: 12 x 2
## comm n
## <chr> <int>
## 1 AFET 5
## 2 BUDG 1
## 3 CONT 1
## 4 DEVE 2
## 5 ECON 3
## 6 EMPL 1
## 7 ENVI 2
## 8 IMCO 1
## 9 ITRE 3
## 10 JURI 2
## 11 LIBE 8
## 12 TRAN 4
#let's check if any of the highest-probability cases were validated (note: validated is coded YES/NO, so comparing against "TRUE" would never match anything)
highest_prob_valid= data_diff %>%
filter( lr_probabilities_wobp>=0.8 & validated=="YES") %>%
select(cod, comm, lr_probabilities_wobp)
highest_prob_valid # does not seem so, hence a bit more manual checking is in order ("YEY" #sarcasm)
## [1] cod comm lr_probabilities_wobp
## <0 rows> (or 0-length row.names)
#let's again list cods for LIBE as it is the largest cluster
LIBE=highest_prob %>%
filter(comm=="LIBE") %>%
select(cod, comm, lr_probabilities_wobp, abs_dist_lr )
LIBE
## cod comm lr_probabilities_wobp abs_dist_lr
## 135 2010/0039(COD) LIBE 0.88 0.00999999
## 143 2011/0129(COD) LIBE 0.97 0.00000000
## 261 2011/0368(COD) LIBE 0.88 0.00999999
## 264 2013/0081(COD) LIBE 0.84 0.01999998
## 280 2013/0091(COD) LIBE 0.84 0.00999999
## 318 2012/0036(COD) LIBE 0.87 0.00999999
## 393 2011/0365(COD) LIBE 0.90 0.00999999
## 748 2011/0427(COD) LIBE 0.90 0.00000000
## After Matching with CAP
Cleaning up and merging the files in order to keep all the data in one place. The resulting file below contains the output from the CAP matching (see Final_CAP1 for the final results; all variables "cap*_count" and "CAP*" contain the count and the corresponding policy area; the sequential numbering of the variables indicates whether the policy is the best or the second-best match; ties can be identified by examining the corresponding cap_count variables). The variable capcode codes the policy area using CAP numeric identifiers.
#setwd("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/EURVOC/Matching PolAreas/trial2")
#cap=read.csv( "C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/EURVOC/Matching #PolAreas/trial2/CAP_EurLex_Matched_19022021.csv")
#scaled_data=read.dta("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/Scaled #summaries/validation_summaries/05022021_validation_scaled1.dta")
#full_file=merge(cap, scaled_data, by="cod")
#colnames(full_file)
##drop irrelevant variables
#full_file$correctly_both=NULL
#full_file$correctly_class_sif_lr=NULL
#full_file[, c("both_wrong" , "ea" ,"borderline", "ak", "coder_agree", "lrbp_sif_lrwobp" , "ea_categ", #"example_type", "ak_categ" , "full_agreem", "ea_lrbp" , "ea_lrwobp", "ea_sif" ,"lrwobp_sif_agree" )] #=list(NULL)
#full_file[,c("correctly_class_lr", "lrpb_sif_agree", "lrbp_lrwobp_agree")]= list(NULL)
#colnames(full_file)
#Creating a code for CAP
#full_file$capcode[full_file$Final_CAP1=="macroeconomics"]=1
#full_file$capcode[full_file$Final_CAP1=="civil liberties"]=2
#full_file$capcode[full_file$Final_CAP1=="health"]=3
#full_file$capcode[full_file$Final_CAP1=="agriculture"]=4
#full_file$capcode[full_file$Final_CAP1=="labour and employment"]=5
#full_file$capcode[full_file$Final_CAP1=="education"]=6
#full_file$capcode[full_file$Final_CAP1=="environment"]=7
#full_file$capcode[full_file$Final_CAP1=="energy"]=8
#full_file$capcode[full_file$Final_CAP1=="immigration"]=9
#full_file$capcode[full_file$Final_CAP1=="transportation"]=10
#full_file$capcode[full_file$Final_CAP1=="law_and_crime"]=12
#full_file$capcode[full_file$Final_CAP1=="social policy"]=13
#full_file$capcode[full_file$Final_CAP1=="regional policy"]=14
#full_file$capcode[full_file$Final_CAP1=="banking_market"]=15
#full_file$capcode[full_file$Final_CAP1=="defense"]=16
#full_file$capcode[full_file$Final_CAP1=="technology"]=17
#full_file$capcode[full_file$Final_CAP1=="foreign trade"]=18
#full_file$capcode[full_file$Final_CAP1=="international_affairs"]=19
#full_file$capcode[full_file$Final_CAP1=="EU_governance"]=20
#full_file$capcode[full_file$Final_CAP1=="PLWMTL"]=21
#full_file$capcode[full_file$Final_CAP1=="culture"]=23
#full_file$capcode[full_file$Final_CAP1=="fisheries"]=24
#save(full_file, file = "EurLEx_CAP_scaling_merged_19022021.RData")
#getwd()
#write.csv(full_file, file = "EurLEx_CAP_scaling_merged_19022021.csv")
#full_file$Final_CAP1
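The block of bracket assignments above could also be written as a single lookup; a minimal sketch (kept commented out like the rest of this run-once section, and assuming full_file$Final_CAP1 as in the commented code):
#hedged alternative: recode Final_CAP1 to capcode with a named lookup vector
#cap_lookup <- c(macroeconomics=1, `civil liberties`=2, health=3, agriculture=4,
#  `labour and employment`=5, education=6, environment=7, energy=8, immigration=9,
#  transportation=10, law_and_crime=12, `social policy`=13, `regional policy`=14,
#  banking_market=15, defense=16, technology=17, `foreign trade`=18,
#  international_affairs=19, EU_governance=20, PLWMTL=21, culture=23, fisheries=24)
#full_file$capcode <- unname(cap_lookup[full_file$Final_CAP1])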
## Let’s check the distributions of the predictions across CAP
#reload the packages
setwd("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/Scaled summaries/validation_summaries")
library(foreign)
library(ggplot2)
library(ggridges)
library(forcats)
EurLEx_CAP_scaling_merged_19022021 <- read_csv("EurLEx_CAP_scaling_merged_19022021.csv")
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## cod = col_character(),
## EUROVOCdescriptor = col_character(),
## Subjectmatter = col_character(),
## maxvar1 = col_character(),
## maxvar2 = col_character(),
## maxvar3 = col_character(),
## maxvar4 = col_character(),
## maxvar5 = col_character(),
## CAP1 = col_character(),
## CAP2 = col_character(),
## CAP3 = col_character(),
## CAP4 = col_character(),
## Final_CAP1 = col_character(),
## celexnumber = col_character(),
## form = col_character(),
## comm = col_character(),
## procedure = col_character(),
## interinstitutionalprocedurecodey = col_character(),
## interinstitutionalprocedurecoden = col_character(),
## dateofpublication = col_character()
## # ... with 14 more columns
## )
## i Use `spec()` for the full column specifications.
data= EurLEx_CAP_scaling_merged_19022021
p=ggplot(data, aes(x = lr_probabilities_wobp, y = Final_CAP1, fill = Final_CAP1)) +
geom_density_ridges( alpha = 0.6) +
labs(title ="Predictions of the Logit model", x='LR without bullet points', y='CAP major category') +
theme_ridges(font_size = 12, grid = TRUE) + theme(legend.title = element_text(color=dark.color, size=10),
plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=10, face="italic"),
axis.title.y = element_text(size = 8, face='italic', hjust = 0.5),
axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'),
legend.text=element_text(size=8), legend.key.size = unit(0.4, 'cm'))
# coord_cartesian(ylim = c(1, 50), clip = 'off')
p <- p + guides(fill=guide_legend(title="CAP", reverse = TRUE))
print(p)
## Picking joint bandwidth of 0.0678
## Warning: Removed 111 rows containing non-finite values (stat_density_ridges).
#need to re-do the years
library(tidyverse)
library(dplyr)
#quick rename of the messy variables
names(data)[names(data) == "interinstitutionalprocedurecodey"] <- "inter_year" # I will use the Year indicated in the procedure code
#names(data1)[names(data1) == "interinstitutionalprocedurecodey"] <- "inter_year"
data <- data %>% mutate(inter_year = replace(inter_year, startsWith(cod, "2018"), 2018)) %>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2017"), 2017))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2016"), 2016))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2015"), 2015)) %>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2014"), 2014))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2013"), 2013))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2012"), 2012)) %>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2019"), 2019))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2020"), 2020))
#change double-years in the procedure year into Year indicated in the procedure name
data <- data %>% mutate(inter_year = ifelse(celexnumber == "32012D0243", 2010, inter_year))
data <- data %>% mutate(inter_year = ifelse(celexnumber == "32011R1232", 2008, inter_year))
head(data$inter_year)
## [1] "2005" "2006" "2006" "2007" "2007" "2007"
## get to plotting by year ==> get yearly summaries for the average probabilities across the CAPs
#summarize(yearly_lrprobability)
data_ts_cap=subset(data, !is.na(lr_probabilities_wobp))
data_ts_cap$groups=ifelse(data_ts_cap$capcode %in% 1:10, 1, 2) # %in% tests membership element-wise; == against c(1:10) silently recycles
yearly_lrprobability_cap <- data_ts_cap %>%
group_by(inter_year, Final_CAP1, groups) %>%
summarize(avg_prob_cap = mean(lr_probabilities_wobp))
## `summarise()` has grouped output by 'inter_year', 'Final_CAP1'. You can override using the `.groups` argument.
ts= ggplot(data =subset(yearly_lrprobability_cap, !is.na(avg_prob_cap)),
aes(y=avg_prob_cap, x=inter_year, group=Final_CAP1, color=Final_CAP1)) +
geom_point() +
geom_line() + #facet_wrap(. ~ Final_CAP1+groups(), scales = "free", space = "free")+
theme_minimal()+
theme( plot.title = element_text(color=dark.color, size=9, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=9, face="italic"),
axis.title.y = element_text(size = 6, face='italic'),
axis.title.x = element_text(color=dark.color, hjust=0.5, size=6, face='italic'))+
labs(
title = " Average probability of proposal advocating for more EU by CAP category",
x = "Year the proposal tabled",
y = "Probality of Expanding the EU Competence"
)
ts= ts+facet_wrap(~Final_CAP1, ncol= 7, scales="fixed")+theme(strip.text.x = element_text(size =6, colour = "black"))
ts=ts+ theme(legend.position="none" ) # drop the positions
ticks=element_text(face ="italic", color = "black", size = 5)
ts + scale_x_discrete(breaks=c("2010","2015","2020"),
labels=c("2010", "2015", "2020"))+theme(axis.text.x = ticks) # adjust the N and the size of the ticks