Visualising scaling and validation

For these visualizations, I started with the code from D. Toshkov’s webpage (http://www.dimiter.eu/Visualizations.html), which is also available directly on GitHub. However, bits and pieces of the code have since been changed and adjusted, following multiple online resources.

Let’s look into the densities of the validated and non-validated samples.

library(foreign)
library(ggplot2)
library(ggridges)
library(forcats)

#read the data
data=read.dta("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/Scaled summaries/validation_summaries/05022021_validation_scaled1.dta")

data$var1=data$lr_probabilities_wobp


#png("lr_probabilities.png",width=600, height=400, res=96)
par(family='Montserrat')

#setting up colors  ## a lot of messy code!

col1<-rgb(225, 25, 27, alpha=255, max=255)
col2<-rgb(30, 93, 160, alpha=255, max=255)
col3<-rgb(255, 220, 69, alpha=255, max=255)
col1a<-rgb(225, 25, 27, alpha=195, max=255)
col2a<-rgb(30, 93, 160, alpha=195, max=255)
col3a<-rgb(255, 220, 69, alpha=195, max=255)
col3b<-rgb(248, 202, 8, alpha=255, max=255)
bg.col<-rgb(255, 251, 245, alpha=255,max=255)
dark.color=rgb(35, 35, 58, max=255) # dark color


#extra colors

main.color=rgb(238, 80, 121, alpha=175, max=255) #main color
main.color2<-rgb(64,193,230, alpha=175, max=255)
main.color3<-rgb(50,100,10, alpha=175, max=255)
main.color4<-rgb(80,10,100, alpha=175, max=255)
main.color5<-rgb(155,138,4, alpha=175, max=255)

main.colora=rgb(238, 80, 121, alpha=255, max=255) #main color
main.color2a<-rgb(64,193,230, alpha=255, max=255)
main.color3a<-rgb(50,100,10, alpha=255, max=255)
main.color4a<-rgb(80,10,100, alpha=255, max=255)
main.color5a<-rgb(155,138,4, alpha=255, max=255)

colors<-c(main.color, main.color3, main.color4, main.color2, main.color5 )
colors2<-c(main.colora, main.color3a, main.color4a, main.color2a, main.color5a)

background.color=rgb(255, 251, 245, max=255) #color for the background
dark.color=rgb(35, 35, 58, max=255) # dark color



#plot

d=ggplot(data, aes(x=data$var1, y=validated,  fill=data$validated,  color=data$validated)) +
  geom_density_ridges(scale = 4) + 
  scale_fill_cyclical(values = c(col1a, col2a)) +
  scale_color_cyclical(values = c(col1,col2)) +
  scale_x_continuous(limits=c(min(data$var1,na.rm=T),max(data$var1,na.rm=T)), expand = c(0.01, 0)) +
  scale_y_discrete(expand = c(0.01, 1)) + labs(title = 'Density plot of the LR probabilities: Validated and full sample', x='LR without bullet points')+
  theme_ridges(font_size = 12, grid = TRUE) + theme(plot.margin = unit(c(1,1,1,1), "lines"), 
                                                    
                                                    plot.title = element_text(color=dark.color, size=14, face="italic", hjust=0),
                                                    
                                                    plot.subtitle = element_text(color=dark.color, size=12, face="italic"),
                                                    
                                                   # plot.background = element_rect(fill=background.color, colour=dark.color),
                                                    
                                                  #  axis.text.y=element_blank(), axis.ticks.y=element_blank( ),
                                                    
                                                    axis.title.y = element_blank(), axis.title.x = element_text(color=dark.color, hjust=0.5))+
  coord_cartesian(ylim = c(1, 10), clip = 'off') 

print(d)
## Picking joint bandwidth of 0.0417
## Warning: Removed 112 rows containing non-finite values (stat_density_ridges).
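
The joint bandwidth above is picked automatically by ggridges. It can also be pinned down explicitly, so the smoothing stays identical across re-runs; a minimal sketch using the value reported above:

#sketch: fix the ridge bandwidth instead of letting stat_density_ridges pick it
d_fixed <- ggplot(data, aes(x = var1, y = validated, fill = validated, color = validated)) +
  geom_density_ridges(scale = 4, bandwidth = 0.0417) +
  scale_fill_cyclical(values = c(col1a, col2a)) +
  scale_color_cyclical(values = c(col1, col2))
print(d_fixed)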

Frequencies

The figure below shows the distribution of probabilities generated by the LR model. For this presentation, I will be using the predictions of the models excluding the bullet points as tokens. The graph below distinguishes between reviewed/validated cases and non-validated cases.

p=ggplot(data, aes(x = data$lr_probabilities_wobp, fill = data$validated)) +                       # Draw overlaying histogram with frequencies
  geom_histogram(position = "identity", alpha = 0.3, bins = 50) + 
  scale_color_cyclical(values = c(col1,col2)) +
  labs(title = 'Frequencies for LR probabilities: Validated and full sample', x='LR without bullet points')+
  theme_ridges(font_size = 12, grid = TRUE) +
  theme(legend.title = element_text(color=dark.color, size=12), 
         plot.title = element_text(color=dark.color, size=12, face="italic", hjust=0.5),
         plot.subtitle = element_text(color=dark.color, size=12, face="italic"), 
         axis.title.y = element_blank(), axis.title.x = element_text(color=dark.color, hjust=0.5))+
  coord_cartesian(ylim = c(1, 50), clip = 'off') 

p <- p + guides(fill=guide_legend(title="Validated?"))
print(p)
## Warning: Removed 112 rows containing non-finite values (stat_bin).

Let’s take a look at the SIF_LR model

It appears that, overall, the validation efforts allowed us to review a representative sample of the procedures. However, the SIF model’s probability distribution looks quite odd (though I think there is a transformation that can be used to deal with it).

p=ggplot(data, aes(x = data$sif_lr_probs, fill = data$validated)) +                       # Draw overlaying histogram with frequencies
  geom_histogram(position = "identity", alpha = 0.3, bins = 50) + 
  scale_color_cyclical(values = c(col1,col2)) +
  labs(title = 'SIF_LR probabilities: Validated and full sample', x='SIF_LR predictions')+
  theme_ridges(font_size = 12, grid = TRUE) +
  theme( legend.title = element_text(color=dark.color, size=12), 
         plot.title = element_text(color=dark.color, size=12, face="italic", hjust=0.5),
         plot.subtitle = element_text(color=dark.color, size=12, face="italic"), 
         axis.title.y = element_blank(), axis.title.x = element_text(color=dark.color, hjust=0.5))+
  coord_cartesian(ylim = c(1, 100), clip = 'off') 

p <- p + guides(fill=guide_legend(title="Validated?"))
print(p)
## Warning: Removed 112 rows containing non-finite values (stat_bin).
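
One candidate for the transformation mentioned above is an empirical logit, which stretches out probabilities that pile up near 0 and 1. A minimal sketch (the small eps offset is my assumption, added to avoid infinite values at exactly 0 or 1):

#sketch: SIF_LR probabilities on the logit scale
eps <- 0.001
data$sif_lr_logit <- qlogis(pmin(pmax(data$sif_lr_probs, eps), 1 - eps))
ggplot(data, aes(x = sif_lr_logit, fill = validated)) +
  geom_histogram(position = "identity", alpha = 0.3, bins = 50) +
  labs(title = 'SIF_LR probabilities on the logit scale', x = 'logit(SIF_LR)')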

Probabilities of increasing the competence across the Committees.

Let’s take a look at the Pr across the Committees (as proxies for policy areas). Note the interesting pro-increase skew of ECON (in contrast to Hagemann et al.).

#Plot1
p=ggplot(data, aes(x =data$lr_probabilities_wobp,  y = data$comm, fill=data$comm)) +
  geom_density_ridges( alpha = 0.8) + 
  
  labs(title ="Predictions of the Logit model", x='LR without bullet points', y='Committee') +
  
   theme_ridges(font_size = 12, grid = TRUE) +    theme( legend.title = element_text(color=dark.color, size=12),
                                                       
                                                       plot.title = element_text( size=12, face="italic", hjust=0.5),
                                                       
                                                       plot.subtitle = element_text( size=12, face="italic"), 
                                                       
                                                       axis.title.y = element_blank(), axis.title.x = element_text(color=dark.color,hjust=0.5))+
  coord_cartesian(ylim = c(1, 50), clip = 'off') 
p <- p + guides(fill=guide_legend(title="Committees", reverse = TRUE))
print(p)
## Picking joint bandwidth of 0.068
## Warning: Removed 112 rows containing non-finite values (stat_density_ridges).
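
A quick numeric companion to the ridge plot (a sketch): mean and median LR probability by committee, to check the visual impression for ECON against the numbers.

#sketch: per-committee summaries of the LR probabilities
library(dplyr)
data %>%
  filter(!is.na(lr_probabilities_wobp)) %>%
  group_by(comm) %>%
  summarise(mean_p = mean(lr_probabilities_wobp),
            median_p = median(lr_probabilities_wobp),
            n = n()) %>%
  arrange(desc(mean_p))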

How do things change if we remove misclassified cases from the distribution?

In the next step, I removed the procedures which were categorised into different groups by the logit model and the coder(s).

p=ggplot(data, aes(x = data$lr_probabilities_wobp,  fill = data$ea_lrwobp)) +                       # Draw overlaying histogram with frequencies
  geom_histogram(position = "identity", alpha = 0.3, bins = 50) + 
 # scale_color_cyclical(values = c(col1, col3, col3b)) +
               scale_fill_cyclical(values = c(col1, col2a)) +
  
                labs(title = 'Overview of the LR and EA agreement', x='LR without bullet points')+
                
                theme_ridges(font_size = 12, grid = TRUE) +
  
                theme(legend.title = element_text(color=dark.color, size=10), 
                      
                      plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
                     
                       plot.subtitle = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
                      
                      axis.text.y=element_text(color=dark.color, hjust=0.5),
                      
                      axis.title.x = element_text(color=dark.color, hjust=0.5))+
  coord_cartesian(ylim = c(0, 15), clip = 'off') 

p <- p + guides(fill=guide_legend(title="Correcly categorised?"))
print(p)
## Warning: Removed 112 rows containing non-finite values (stat_bin).

More plots

Below, I summarize the clustering of the LR probabilities. The bars capture both agreement and disagreement between the coder’s classification and the model’s categorization. The bars in the background show the mean probability for each group of cases.

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data1=subset(data, ea_lrwobp=="FALSE" | ea_lrwobp=="TRUE")  ## subset only cases which were actually coded
gd <- data1 %>%   ## group them by the agreement between LR and EA coding
        group_by(ea_lrwobp) %>% 
        summarise(lr_probabilities_wobp = mean(lr_probabilities_wobp))
gd
## # A tibble: 2 x 2
##   ea_lrwobp lr_probabilities_wobp
## * <fct>                     <dbl>
## 1 FALSE                     0.441
## 2 TRUE                      0.420
ggplot(data1, aes(x =ea_lrwobp , y =lr_probabilities_wobp)) +
  geom_point()+
  geom_bar(data=gd, stat = "identity",  alpha = .2)+
  ggrepel::geom_text_repel(aes(label = comm), color = "black", size = 2.5, segment.color = "grey") +
  geom_point() +
  guides(color = "none", fill = "none") +
   theme_ridges(font_size = 12, grid = TRUE) +
  theme(legend.title = element_text(color=dark.color, size=12), 
         plot.title = element_text(color=dark.color, size=12, face="italic", hjust=0.5),
         plot.subtitle = element_text(color=dark.color, size=12, face="italic"), 
         axis.title.y = element_blank(), axis.title.x = element_text(color=dark.color, hjust=0.5))+
  labs(
    title = "Probability by agreement with EA",
    x = "LR and EA agree on classification",
    y = " LR Probaility"
  )
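
The spread around each group mean can also be shown; a small sketch that recomputes the group means together with their standard deviations and adds them as error bars (mean +/- 1 SD):

#sketch: group means with +/- 1 SD error bars
gd_sd <- data1 %>%
  group_by(ea_lrwobp) %>%
  summarise(mean_p = mean(lr_probabilities_wobp),
            sd_p = sd(lr_probabilities_wobp))

ggplot(data1, aes(x = ea_lrwobp, y = lr_probabilities_wobp)) +
  geom_point() +
  geom_errorbar(data = gd_sd,
                aes(y = mean_p, ymin = mean_p - sd_p, ymax = mean_p + sd_p),
                width = 0.2, colour = "grey40")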

What are the most frequent disagreements with EA by policy area?

In the plot below it is evident that the most frequent disagreements between the summary-level machine scaling and the coder occur in areas such as ENVI (16 procedures), ECON (7), TRAN (3), PECH (3), and INTA (2).

library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
 tabyl(data1, comm, ea_lrwobp )
##  comm FALSE TRUE
##  AGRI     1    2
##  BUDG     0    1
##  CULT     1    0
##  ECON     7    8
##  EMPL     0    1
##  ENVI    16   18
##  IMCO     1    7
##  INTA     2   14
##  ITRE     1    6
##  JURI     1    9
##  LIBE     1    7
##  PECH     3   10
##  REGI     0    2
##  TRAN     3   19
 tabyl(data1, comm, ea_lrwobp )%>%
     adorn_percentages("col") %>%
   adorn_pct_formatting(digits = 1)
##  comm FALSE  TRUE
##  AGRI  2.7%  1.9%
##  BUDG  0.0%  1.0%
##  CULT  2.7%  0.0%
##  ECON 18.9%  7.7%
##  EMPL  0.0%  1.0%
##  ENVI 43.2% 17.3%
##  IMCO  2.7%  6.7%
##  INTA  5.4% 13.5%
##  ITRE  2.7%  5.8%
##  JURI  2.7%  8.7%
##  LIBE  2.7%  6.7%
##  PECH  8.1%  9.6%
##  REGI  0.0%  1.9%
##  TRAN  8.1% 18.3%
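
As an indicative companion to the table (a sketch only: with cell counts this small the chi-squared approximation is shaky and will likely throw a warning), janitor’s tabyl method for chisq.test() can check whether the disagreement rate differs across committees.

#sketch: a rough test of independence between committee and coder-model agreement
tabyl(data1, comm, ea_lrwobp) %>%
  chisq.test()
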
library(viridis) 
## Loading required package: viridisLite
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
p= ggplot(data1, aes(x =ea_lrwobp , y =comm)) +
 geom_count(aes(color=comm), alpha = 0.3)+
   scale_colour_hue(guide = "none")+
   scale_fill_brewer(palette=1)+
    scale_size(range = c(0, 25))+
   # scale_fill_viridis(discrete=TRUE, guide=FALSE, option="A") +
    theme(legend.title = element_text(color=dark.color, size=12), 
         plot.title = element_text(color=dark.color, size=12, face="italic", hjust=0.5),
         plot.subtitle = element_text(color=dark.color, size=12, face="italic"), 
         axis.title.y = element_blank(), axis.title.x = element_text(color=dark.color, hjust=0))+
  theme_minimal()+
        labs(
    title = "Occurances of misclassification by Committee",
    x = "Coder-Model agreement",
    y = "Committee"
  )
 
 p + theme(legend.title = element_text(color = "black", size = 10),
           legend.key.size = unit(0.5, "cm"),
  legend.key.width = unit(0.5,"cm") 
  )

Some cases were misclassified by all models.

Here are the cases where the classifications by the LR and SIF-LR models agree with each other, but both coders captured mis-categorization by the models. The plot below locates these cases using the similarity measures. SIF_negative captures the degree to which the summary is similar to the candidate summary used to train the model; a similar interpretation applies to SIF_positive. The middle of the map (at the intersection of SIF_negative = 0.5 and SIF_positive = 0.5) represents borderline cases, which exhibit similarity both to summaries expanding the competencies and to those seeking to maintain the SQ.

#trying to plot the cases where LR and SIF went wrong and 2 coders agree 
#subset the relevant cases

failure=subset(data1, data1$completely_wrong_lrwobp  =="Both models wrong" &  data1$coder_agree=="TRUE"  )
#head(failure)

##create a color scheme ==>  still not sure why it does not fully work here... NOTA BENE: TAKE A LOOK WHEN POSSIBLE!!!
library(RColorBrewer)

myColors <- brewer.pal(8,"Set1")
names(myColors) <- levels(failure$comm)
colScale <- scale_colour_manual(name = "comm",values = myColors)


f=ggplot(failure, aes(x=sif_pos_similarity, y=sif_neg_similarity, color=comm)) +
  geom_point(aes(color=comm), size=3, alpha=0.6)+
   #scale_fill_brewer(palette=1)
 theme(legend.title = element_text(color=dark.color, size=12), 
         plot.title = element_text(color=dark.color, size=12, face="italic", hjust=0.5),
         plot.subtitle = element_text(color=dark.color, size=12, face="italic"), 
         axis.title.y = element_blank(), axis.title.x = element_text(color=dark.color, hjust=0))+
  theme_minimal()+
        labs(
    title = "Cases classified inccorectly by COMM over SIF similarity",
    x = "SIF_Positive similarity",
    y = "SIF-Negative similarity"
  )
f + expand_limits(x =c(0, 1), y=c(0,1))
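
To make the borderline region described above easier to spot, reference lines at 0.5 can be added on both similarity scales (a small sketch building on f):

f +
  geom_vline(xintercept = 0.5, linetype = "dashed", colour = "grey50") +
  geom_hline(yintercept = 0.5, linetype = "dashed", colour = "grey50") +
  expand_limits(x = c(0, 1), y = c(0, 1))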

It would also be nice to get an overview of the variance in proposals seeking to change the EU’s competence level over the years and across policy areas.

#quick rename of the messy variables
names(data)[names(data) == "interinstitutionalprocedurecodey"] <- "inter_year"   # I will use the Year indicated in the procedure code
names(data1)[names(data1) == "interinstitutionalprocedurecodey"] <- "inter_year"

data <- data %>%  mutate(inter_year = replace(inter_year, startsWith(cod, "2018"), 2018)) %>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2017"), 2017))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2016"), 2016))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2015"), 2015)) %>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2014"), 2014))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2013"), 2013))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2012"), 2012)) %>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2019"), 2019))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2020"), 2020))

#change double-years in the procedure year into Year indicated in the procedure name
data <- data %>% mutate(inter_year = ifelse(celexnumber == "32012D0243", 2010,  inter_year))
data <- data %>% mutate(inter_year = ifelse(celexnumber == "32011R1232", 2008,  inter_year)) 

#summarize(yearly_lrprobability)
data_ts=subset(data, !is.na(lr_probabilities_wobp))
yearly_lrprobability <- data_ts %>%
    group_by(inter_year,  comm) %>%
    summarize(avg_prob = mean(lr_probabilities_wobp))
## `summarise()` has grouped output by 'inter_year'. You can override using the `.groups` argument.
competence= ggplot(data =subset(yearly_lrprobability, !is.na(avg_prob)),
                   aes(y=avg_prob, x=inter_year, group=comm, color=comm)) +
                    geom_point()+
                    geom_line() +      #+ facet_grid(. ~ comm, scales = "free", space = "free")
                    theme_minimal()+
                    theme(legend.title = element_text(color=dark.color, size=10.5), 
                         plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
                         plot.subtitle = element_text(color=dark.color, size=10, face="italic"), 
                          axis.title.y = element_text(size = 8, face='italic'),
                          axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'), 
                          legend.text=element_text(size=8), legend.key.size = unit(0.4, 'cm'))+
                     labs(
                    title = " Average probaility of proposal advocating for more EU by Committee",
                    x = "Year the proposal tabled",
                    y = "Probality of Expanding the EU Competence"
                  )
competence+guides(colour = guide_legend(override.aes = list(size=1)))

Here is an overview of the average probability of expanding EU authority over the years.

data_nona=subset(data, !is.na(lr_probabilities_wobp))



yearly_aggr_lrprobability <- data_nona %>%
    group_by(inter_year)%>%
    summarise(avg_prob1= mean(lr_probabilities_wobp))
              

           
ggplot(yearly_aggr_lrprobability, aes( x=inter_year, y=avg_prob1, group=1, color=inter_year)) +
                  geom_point( )   +
                  geom_line()+
                  theme_minimal()+
                    theme(legend.title = element_blank(), 
                         plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
                         plot.subtitle = element_text(color=dark.color, size=10, face="italic"), 
                          axis.title.y = element_text(size = 8, face='italic'),
                          axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'), 
                          legend.text=element_blank (), legend.key = element_blank(), legend.position = "none")+
                     labs(
                    title = " Average probability of proposal expanding EU competencies",
                    x = "Year the proposal tabled",
                    y = "Probality of Expanding the EU Competence"
                  )

The plot below shows the variance in the probabilities of the scaled proposals implying an increase in EU competencies. The thinner lines show the mean +/- 1 standard deviation.

data_nona=subset(data, !is.na(lr_probabilities_wobp))



yearly_aggr2_lrprobability <- data_nona %>%
    group_by(inter_year) %>%
      summarise(avg_prob2= mean(lr_probabilities_wobp), 
              sd_av=sd(lr_probabilities_wobp), 
              max_ci = avg_prob2+ 1*sd_av,
              min_ci = avg_prob2 - 1*sd_av)  %>%
  ungroup()

#dt=subset(yearly_aggr2_lrprobability, !is.na(min_ci)) # no way of calculating the CI for very early proposals// but they can be dropped
                  
          
              ggplot(data=yearly_aggr2_lrprobability,  
                          aes(x=inter_year,   color = inter_year, group=1)) +
                        geom_line(aes(y= avg_prob2))+
                        geom_line( aes (y =max_ci), alpha = 0.3)+
                          geom_line(aes(y=min_ci), alpha=0.3, )+
                theme_minimal()+
                    theme(legend.title = element_blank(), 
                         plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
                         plot.subtitle = element_text(color=dark.color, size=10, face="italic"), 
                          axis.title.y = element_text(size = 8, face='italic'),
                          axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'), 
                          legend.text=element_blank (), legend.key = element_blank(), legend.position = "none")+
                     labs(
                    title = " Mean Pr of extending EU authority 1SD variance",
                    x = "Year the proposal was tabled",
                    y = "Probability of Expanding the EU Competence"
                  )
## Warning: Removed 2 row(s) containing missing values (geom_path).

## Warning: Removed 2 row(s) containing missing values (geom_path).
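
An alternative presentation (a sketch): draw the +/- 1 SD band as a shaded ribbon rather than two thin lines.

ggplot(yearly_aggr2_lrprobability, aes(x = inter_year, group = 1)) +
  geom_ribbon(aes(ymin = min_ci, ymax = max_ci), alpha = 0.2, fill = "grey60") +
  geom_line(aes(y = avg_prob2)) +
  theme_minimal() +
  labs(title = "Mean Pr of extending EU authority, +/- 1 SD",
       x = "Year the proposal was tabled",
       y = "Probability of Expanding the EU Competence")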

A few strange cases

Here is the subsample of procedures on which the EA and the LRw/obp disagreed. For this subsample, I calculated the absolute difference between the probabilities produced by the two LR models.

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.6     v purrr   0.3.4
## v readr   1.4.0     v stringr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)

 data_diff=subset(data_nona, select=c(lr_probabilities, lr_probabilities_wobp, lr_predictions_wobp, validated, cod, comm, inter_year, ea_categ, lr_predictions_wobp,
                                      ak_categ, lrbp_lrwobp_agree, ea_lrwobp  ))

data_diff$abs_dist_lr=abs(data_diff$lr_probabilities-data_diff$lr_probabilities_wobp) # calculate absolute distance between LR and LRw/o bullet point

summary(data_diff)  
##  lr_probabilities lr_probabilities_wobp lr_predictions_wobp validated
##  Min.   :0.1000   Min.   :0.1100        Min.   :0.0000      NO :637  
##  1st Qu.:0.3300   1st Qu.:0.3300        1st Qu.:0.0000      YES:141  
##  Median :0.4400   Median :0.4400        Median :0.0000               
##  Mean   :0.4627   Mean   :0.4636        Mean   :0.3856               
##  3rd Qu.:0.5800   3rd Qu.:0.5700        3rd Qu.:1.0000               
##  Max.   :0.9700   Max.   :0.9700        Max.   :1.0000               
##      cod                comm            inter_year                 ea_categ  
##  Length:778         Length:778         Length:778         Not_expanding: 91  
##  Class :character   Class :character   Class :character   Expanding    : 50  
##  Mode  :character   Mode  :character   Mode  :character   NA's         :637  
##                                                                              
##                                                                              
##                                                                              
##  lr_predictions_wobp.1          ak_categ   lrbp_lrwobp_agree ea_lrwobp  
##  Min.   :0.0000        Not_expanding:  9   FALSE: 24         FALSE: 37  
##  1st Qu.:0.0000        Expanding    : 10   TRUE :754         TRUE :104  
##  Median :0.0000        NA's         :759                     NA's :637  
##  Mean   :0.3856                                                         
##  3rd Qu.:1.0000                                                         
##  Max.   :1.0000                                                         
##   abs_dist_lr     
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.01000  
##  Mean   :0.01301  
##  3rd Qu.:0.01000  
##  Max.   :0.12000
mismatch=data_diff%>% 
  filter(validated=="YES" & ea_lrwobp=="FALSE" ) %>%
   select(cod, comm, abs_dist_lr) %>%
  group_by(comm) %>%
  mutate(counts=n()) %>%
  count(comm) %>%
  ungroup()
head(mismatch)  #showing a total number of disagreements between EA and the LRwobp by committee
## # A tibble: 6 x 2
##   comm      n
##   <chr> <int>
## 1 AGRI      1
## 2 CULT      1
## 3 ECON      7
## 4 ENVI     16
## 5 IMCO      1
## 6 INTA      2
mismatch=data_diff%>% 
  filter(validated=="YES" & ea_lrwobp=="FALSE" ) %>%
   select(cod, comm, abs_dist_lr)
head(mismatch) # to show the absolute difference in probabilities of expanding competencies between LR and LRw/obp  by procedure and including committee responsible.  NOTA BENE: these are the cases on which the LRw/obp and EA disagree.
##                cod comm abs_dist_lr
## 752 2015/0148(COD) ENVI  0.00000000
## 753 2008/0196(COD) IMCO  0.02000001
## 755 2012/0305(COD) ENVI  0.00000000
## 757 2012/0364(COD) ECON  0.08000001
## 758 2009/0116(COD) PECH  0.00000000
## 759 2017/0293(COD) ENVI  0.00000000

Plotting the frequency of the competency expansion by COMM by year

The following plot shows the number of proposals that expand the competence of the EU within each committee over the entire timeframe.

# calculate the N of classification as 'expanding' by committee
# 1. subset relevant variables not to drag everything around
 
expandEU=data_nona %>%
   filter(lr_predictions_wobp==1) %>%
  select(cod, comm, lr_predictions_wobp, validated, lr_probabilities_wobp,  inter_year )
head(expandEU) # alright, looking good!
##               cod comm lr_predictions_wobp validated lr_probabilities_wobp
## 2  2019/0040(COD) TRAN                   1        NO                  0.55
## 5  2011/0038(COD) JURI                   1        NO                  0.56
## 6  2013/0015(COD) TRAN                   1        NO                  0.67
## 8  2010/0160(COD) ECON                   1        NO                  0.83
## 15 2016/0377(COD) ITRE                   1        NO                  0.73
## 17 2009/0099(COD) ECON                   1        NO                  0.68
##    inter_year
## 2        2019
## 5        2011
## 6        2013
## 8        2010
## 15       2016
## 17       2009
#2. for peace of mind, let's take a look at the clusters as well
expandEUugr= expandEU %>%
    group_by(comm, inter_year ) %>%
  mutate(counts=n()) %>%
  count(comm) %>%
  ungroup()
head(expandEUugr) 
## # A tibble: 6 x 3
##   comm  inter_year     n
##   <chr> <chr>      <int>
## 1 AFCO  2010           1
## 2 AFCO  2018           1
## 3 AFET  2011           5
## 4 AFET  2016           1
## 5 AGRI  2010           2
## 6 AGRI  2014           1
#3. use the calculated frequencies and make a plot


expand=ggplot(expandEUugr, aes(x=inter_year, y=comm,  group=comm, size=n)) +
          geom_point(aes(size=n, color=comm), alpha=0.5 )+
          theme_minimal()+
                    theme(legend.title = element_text(color=dark.color, size=10.5), 
                         plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
                         plot.subtitle = element_text(color=dark.color, size=10, face="italic"), 
                          axis.title.y = element_text(size = 8, face='italic'),
                          axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'), 
                          legend.text=element_text(size=8), legend.key.size = unit(0.4, 'cm'))+
                     labs(
                    title = " N of proposals expanding the EU competence by COM over time",
                    x = "Year the proposal tabled",
                    y = " Committee")
                  
expand+guides(colour = guide_legend(title="Committees", reverse = TRUE))

expand

What about SQ-preserving proposals?

The plot below maps the patterns of SQ preservation by Committee over the years. The larger the bubble for each COM-YEAR, the more proposals maintaining the SQ were put forward by the Commission that year.

# calculate the N of the summaries that keep SQ
SQ_eu =data_nona %>%
   filter(lr_predictions_wobp==0) %>%
  select(cod, comm, lr_predictions_wobp, validated, lr_probabilities_wobp,  inter_year )
head(SQ_eu) # alright, looking good!
##               cod comm lr_predictions_wobp validated lr_probabilities_wobp
## 3  2016/0368(COD) TRAN                   0        NO                  0.26
## 4  2013/0443(COD) ENVI                   0        NO                  0.40
## 9  2011/0354(COD) IMCO                   0        NO                  0.16
## 10 2011/0281(COD) AGRI                   0        NO                  0.24
## 11 2011/0051(COD) LIBE                   0        NO                  0.47
## 12 2014/0174(COD) JURI                   0        NO                  0.23
##    inter_year
## 3        2016
## 4        2013
## 9        2011
## 10       2011
## 11       2011
## 12       2014
#2. for peace of mind, let's take a look at the clusters as well
SQ_EUgr= SQ_eu %>%
    group_by(comm, inter_year ) %>%
  mutate(counts=n()) %>%
  count(comm) %>%
  ungroup()
head(SQ_EUgr)
## # A tibble: 6 x 3
##   comm  inter_year     n
##   <chr> <chr>      <int>
## 1 AFCO  2012           1
## 2 AFCO  2017           1
## 3 AFCO  2020           1
## 4 AFET  2009           1
## 5 AFET  2011           1
## 6 AFET  2020           1
#3. use the calculated frequencies and make a plot


sq=ggplot(SQ_EUgr, aes(x=inter_year, y=comm,  group=comm, size=n)) +
          geom_point(aes(size=n, color=comm), alpha=0.5 )+
          theme_minimal()+
                    theme(legend.title = element_text(color=dark.color, size=10), 
                         plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
                         plot.subtitle = element_text(color=dark.color, size=10, face="italic"), 
                          axis.title.y = element_text(size = 8, face='italic'),
                          axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'), 
                          legend.text=element_text(size=8), legend.key.size = unit(0.4, 'cm'))+
                     labs(
                    title = " N of proposals keeping SQ in the EU by COM over time",
                    x = "Year the proposal tabled",
                    y = " Committee"
                  ) 
                 # guides(colour = guide_legend(override.aes = list(size=1)))+
                  #   guides(fill=guide_legend(title="Committees", reverse = FALSE))

sq

Here I would like to identify the observations which received the highest scores from the LRwobp within the entire sample.

Thereafter, I check the summaries for those procedures (using the LO of the EP).

hist( data_diff$lr_probabilities_wobp)

highest_prob=  data_diff %>%
  filter( lr_probabilities_wobp>=0.8) %>%
  select(cod, comm,  lr_probabilities_wobp, abs_dist_lr)

highest_prob
##                cod comm lr_probabilities_wobp abs_dist_lr
## 8   2010/0160(COD) ECON                  0.83  0.00999999
## 35  2013/0164(COD) ITRE                  0.88  0.00000000
## 43  2008/0246(COD) TRAN                  0.89  0.00999999
## 66  2011/0394(COD) ITRE                  0.84  0.05000001
## 135 2010/0039(COD) LIBE                  0.88  0.00999999
## 143 2011/0129(COD) LIBE                  0.97  0.00000000
## 156 2011/0421(COD) ENVI                  0.80  0.00999999
## 158 2011/0454(COD) CONT                  0.83  0.05000001
## 159 2010/0280(COD) ECON                  0.80  0.00000000
## 258 2011/0412(COD) AFET                  0.88  0.00999999
## 261 2011/0368(COD) LIBE                  0.88  0.00999999
## 264 2013/0081(COD) LIBE                  0.84  0.01999998
## 280 2013/0091(COD) LIBE                  0.84  0.00999999
## 282 2011/0369(COD) JURI                  0.80  0.02000004
## 318 2012/0036(COD) LIBE                  0.87  0.00999999
## 329 2016/0275(COD) BUDG                  0.90  0.00000000
## 346 2012/0245(COD) DEVE                  0.93  0.00000000
## 361 2015/0287(COD) IMCO                  0.84  0.00000000
## 364 2011/0406(COD) DEVE                  0.95  0.00999999
## 371 2010/0242(COD) EMPL                  0.81  0.00999999
## 385 2011/0404(COD) AFET                  0.87  0.00999999
## 393 2011/0365(COD) LIBE                  0.90  0.00999999
## 429 2011/0405(COD) AFET                  0.90  0.00999999
## 499 2010/0383(COD) JURI                  0.84  0.00999999
## 500 2011/0413(COD) AFET                  0.90  0.00000000
## 508 2010/0207(COD) ECON                  0.82  0.00000000
## 526 2011/0339(COD) ENVI                  0.94  0.00999999
## 637 2011/0415(COD) AFET                  0.82  0.00999999
## 671 2010/0275(COD) ITRE                  0.87  0.00999999
## 708 2015/0277(COD) TRAN                  0.84  0.00999999
## 748 2011/0427(COD) LIBE                  0.90  0.00000000
## 776 2019/0108(COD) TRAN                  0.80  0.00000000
## 779 2019/0107(COD) TRAN                  0.81  0.00999999
#let's see the groupings by comm
highest_prob_grouped= highest_prob %>%
    group_by(comm) %>%
  mutate(counts=n()) %>%
  count(comm) %>%
  ungroup()

highest_prob_grouped # a clear pattern emerges: the highest probabilities are most frequent in LIBE (8), AFET (5), TRAN (4), and ITRE and ECON (3 each)
## # A tibble: 12 x 2
##    comm      n
##    <chr> <int>
##  1 AFET      5
##  2 BUDG      1
##  3 CONT      1
##  4 DEVE      2
##  5 ECON      3
##  6 EMPL      1
##  7 ENVI      2
##  8 IMCO      1
##  9 ITRE      3
## 10 JURI      2
## 11 LIBE      8
## 12 TRAN      4
#let's check whether any of the highest-probability cases were validated
highest_prob_valid= data_diff %>%
  filter( lr_probabilities_wobp>=0.8 & validated=="YES") %>%   # 'validated' is coded YES/NO, so match on "YES"
  select(cod, comm,  lr_probabilities_wobp)

highest_prob_valid  # does not seem so, hence a bit more manual checking is in order  ("YEY" #sarcasm)
## [1] cod                   comm                  lr_probabilities_wobp
## <0 rows> (or 0-length row.names)
#let's again list cods for LIBE as it is the largest cluster

LIBE=highest_prob %>%
  filter(comm=="LIBE") %>%
  select(cod, comm,  lr_probabilities_wobp, abs_dist_lr )

LIBE
##                cod comm lr_probabilities_wobp abs_dist_lr
## 135 2010/0039(COD) LIBE                  0.88  0.00999999
## 143 2011/0129(COD) LIBE                  0.97  0.00000000
## 261 2011/0368(COD) LIBE                  0.88  0.00999999
## 264 2013/0081(COD) LIBE                  0.84  0.01999998
## 280 2013/0091(COD) LIBE                  0.84  0.00999999
## 318 2012/0036(COD) LIBE                  0.87  0.00999999
## 393 2011/0365(COD) LIBE                  0.90  0.00999999
## 748 2011/0427(COD) LIBE                  0.90  0.00000000

After Matching with CAP

Cleaning up and merging the files in order to keep all the data in one place. The resulting doc below contains the output from a) CAP matching: see Final_CAP1 for the final result; all variables "cap*_count" and "CAP*" contain the count and the corresponding policy area, and the sequential numbering of the variables indicates whether the policy is the best or the second-best match (ties can be identified by examining the corresponding cap_count variables). The variable capcode codes the policy area using CAP numeric identifiers.

#setwd("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/EURVOC/Matching PolAreas/trial2")

#cap=read.csv( "C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/EURVOC/Matching #PolAreas/trial2/CAP_EurLex_Matched_19022021.csv")


#scaled_data=read.dta("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/Scaled #summaries/validation_summaries/05022021_validation_scaled1.dta")


#full_file=merge(cap, scaled_data, by="cod")
#colnames(full_file)


##drop irrelevant variables
#full_file$correctly_both=NULL
#full_file$correctly_class_sif_lr=NULL

#full_file[, c("both_wrong" , "ea" ,"borderline", "ak", "coder_agree", "lrbp_sif_lrwobp" , "ea_categ", #"example_type", "ak_categ" , "full_agreem", "ea_lrbp"  ,  "ea_lrwobp", "ea_sif"  ,"lrwobp_sif_agree"    )] #=list(NULL)
  
#full_file[,c("correctly_class_lr",  "lrpb_sif_agree",  "lrbp_lrwobp_agree")]= list(NULL)
     
#colnames(full_file)

#Creating a code for CAP
#full_file$capcode[Final_CAP1="macroeconomics"]=1
#full_file$capcode[Final_CAP1="civil liberties"]=2
#full_file$capcode[Final_CAP1="health" ]=3
#full_file$capcode[Final_CAP1= "agriculture"]=4
#full_file$capcode[Final_CAP1= "labour and employment"]=5
#full_file$capcode[Final_CAP1= "education"]=6
#full_file$capcode[Final_CAP1="environment"]=7
#full_file$capcode[Final_CAP1="energy" ]=8
#full_file$capcode[Final_CAP1="immigration" ]=9
#full_file$capcode[Final_CAP1="transportation" ]=10

#full_file$capcode[Final_CAP1="law_and_crime"]=12
#full_file$capcode[Final_CAP1="social policy"]=13
#full_file$capcode[Final_CAP1="regional policy"]=14
#full_file$capcode[Final_CAP1="banking_market"]=15
#full_file$capcode[Final_CAP1="defense" ]=16
#full_file$capcode[Final_CAP1="technology"]=17
#full_file$capcode[Final_CAP1="foreign trade"]=18
#full_file$capcode[Final_CAP1="international_affairs"]=19
#full_file$capcode[Final_CAP1="EU_governance"]=20
#full_file$capcode[Final_CAP1="PLWMTL"]=21

#full_file$capcode[Final_CAP1="culture"]=23
#full_file$capcode[Final_CAP1="fisheries"]=24





#save(full_file, file = "EurLEx_CAP_scaling_merged_19022021.RData")
#getwd()

#write.csv(full_file, file = "EurLEx_CAP_scaling_merged_19022021.csv")

#full_file$Final_CAP1
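
If this merge is re-run, a more compact way to build capcode would be a named lookup vector; a sketch below, with the labels taken from the commented-out assignments above (the final assignment stays commented because full_file is not loaded in this session).

cap_lookup <- c(
  "macroeconomics" = 1, "civil liberties" = 2, "health" = 3, "agriculture" = 4,
  "labour and employment" = 5, "education" = 6, "environment" = 7, "energy" = 8,
  "immigration" = 9, "transportation" = 10, "law_and_crime" = 12,
  "social policy" = 13, "regional policy" = 14, "banking_market" = 15,
  "defense" = 16, "technology" = 17, "foreign trade" = 18,
  "international_affairs" = 19, "EU_governance" = 20, "PLWMTL" = 21,
  "culture" = 23, "fisheries" = 24
)
#full_file$capcode <- unname(cap_lookup[full_file$Final_CAP1])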

Let’s check the distributions of the predictions across CAP

#reload the packages
setwd("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/Scaled summaries/validation_summaries")

library(foreign)
library(ggplot2)
library(ggridges)
library(forcats)
EurLEx_CAP_scaling_merged_19022021 <- read_csv("EurLEx_CAP_scaling_merged_19022021.csv")
## Warning: Missing column names filled in: 'X1' [1]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   cod = col_character(),
##   EUROVOCdescriptor = col_character(),
##   Subjectmatter = col_character(),
##   maxvar1 = col_character(),
##   maxvar2 = col_character(),
##   maxvar3 = col_character(),
##   maxvar4 = col_character(),
##   maxvar5 = col_character(),
##   CAP1 = col_character(),
##   CAP2 = col_character(),
##   CAP3 = col_character(),
##   CAP4 = col_character(),
##   Final_CAP1 = col_character(),
##   celexnumber = col_character(),
##   form = col_character(),
##   comm = col_character(),
##   procedure = col_character(),
##   interinstitutionalprocedurecodey = col_character(),
##   interinstitutionalprocedurecoden = col_character(),
##   dateofpublication = col_character()
##   # ... with 14 more columns
## )
## i Use `spec()` for the full column specifications.
data= EurLEx_CAP_scaling_merged_19022021


p=ggplot(data, aes(x =data$lr_probabilities_wobp,  y = data$Final_CAP1, fill=data$Final_CAP1)) +
  geom_density_ridges( alpha = 0.6) + 
  
  labs(title ="Predictions of the Logit model", x='LR without bullet points', y='CAP major category') +
  
   theme_ridges(font_size = 12, grid = TRUE) +   theme(legend.title = element_text(color=dark.color, size=10), 
                         plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
                         plot.subtitle = element_text(color=dark.color, size=10, face="italic"), 
                          axis.title.y = element_text(size = 8, face='italic', hjust = 0.5),
                          axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'), 
                          legend.text=element_text(size=8), legend.key.size = unit(0.4, 'cm'))

 # coord_cartesian(ylim = c(1, 50), clip = 'off') 

p <- p + guides(fill=guide_legend(title="CAP", reverse = TRUE))
print(p)
## Picking joint bandwidth of 0.0678
## Warning: Removed 111 rows containing non-finite values (stat_density_ridges).

#need to re-do the years
library(tidyverse)
library(dplyr)

#quick rename of the messy variables
names(data)[names(data) == "interinstitutionalprocedurecodey"] <- "inter_year"   # I will use the Year indicated in the procedure code
#names(data1)[names(data1) == "interinstitutionalprocedurecodey"] <- "inter_year"

data <- data %>%  mutate(inter_year = replace(inter_year, startsWith(cod, "2018"), 2018)) %>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2017"), 2017))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2016"), 2016))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2015"), 2015)) %>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2014"), 2014))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2013"), 2013))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2012"), 2012)) %>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2019"), 2019))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2020"), 2020))

#change double-years in the procedure year into Year indicated in the procedure name
data <- data %>% mutate(inter_year = ifelse(celexnumber == "32012D0243", 2010,  inter_year))
data <- data %>% mutate(inter_year = ifelse(celexnumber == "32011R1232", 2008,  inter_year)) 

head(data$inter_year)
## [1] "2005" "2006" "2006" "2007" "2007" "2007"
## get to plotting by year ==> get yearly summaries for the average probabilities across the CAPs
#summarize(yearly_lrprobability)
data_ts_cap=subset(data, !is.na(lr_probabilities_wobp))

data_ts_cap$groups=ifelse(data_ts_cap$capcode %in% 1:10, 1, 2)  # %in% checks membership; == would recycle c(1:10) element-wise
yearly_lrprobability_cap <- data_ts_cap %>%
    group_by(inter_year,  Final_CAP1, groups) %>%
    summarize(avg_prob_cap = mean(lr_probabilities_wobp))
## `summarise()` has grouped output by 'inter_year', 'Final_CAP1'. You can override using the `.groups` argument.
ts= ggplot(data =subset(yearly_lrprobability_cap, !is.na(avg_prob_cap)),
                   aes(y=avg_prob_cap, x=inter_year, group=Final_CAP1, color=Final_CAP1)) +
                    geom_point()    +
                    geom_line() +       #facet_wrap(. ~ Final_CAP1+groups(), scales = "free", space = "free")+
                    theme_minimal()+
                    theme( plot.title = element_text(color=dark.color, size=9, face="italic", hjust=0.5),
                         plot.subtitle = element_text(color=dark.color, size=9, face="italic"), 
                          axis.title.y = element_text(size = 6, face='italic'),
                          axis.title.x = element_text(color=dark.color, hjust=0.5, size=6, face='italic'))+
                     labs(
                    title = " Average probability of proposal advocating for more EU by CAP category",
                    x = "Year the proposal tabled",
                    y = "Probality of Expanding the EU Competence"
                  ) 

ts= ts+facet_wrap(~Final_CAP1, ncol= 7, scales="fixed")+theme(strip.text.x = element_text(size =6, colour = "black"))
ts=ts+  theme(legend.position="none" ) # drop the positions 
ticks=element_text(face ="italic", color = "black", size = 5)
ts + scale_x_discrete(breaks=c("2010","2015","2020"),
        labels=c("2010", "2015", "2020"))+theme(axis.text.x = ticks)   # adjust the N and the size of the ticks