Second round of scaling

The MA team forwarded the second round of scaling at the summary level. Below I compare the results to the first wave of scaled predictions and try to identify the key differences, if any appear.

library(foreign)
library(tidyr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v dplyr   1.0.4
## v tibble  3.0.6     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)


##loading the main file  minus some extra details which we do not need now
library(foreign)
library(dplyr)
full_file=read.csv("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/EurLEx_CAP_scaling_merged_19022021 - Copy.csv")


full_file$capcode=ifelse(full_file$Final_CAP1=="macroeconomics", 1, 
                         ifelse(full_file$Final_CAP1=="civil liberties", 2, 
                          ifelse(full_file$Final_CAP1=="health", 3, 
                           ifelse(full_file$Final_CAP1== "agriculture", 4, 
                            ifelse(full_file$Final_CAP1=="labour and employment",5, 
                             ifelse(full_file$Final_CAP1=="education", 6, 
                              ifelse(full_file$Final_CAP1=="environment", 7 , 
                               ifelse(full_file$Final_CAP1=="energy", 8,
                                 ifelse(full_file$Final_CAP1=="immigration", 9,
                                  ifelse(full_file$Final_CAP1=="transportation", 10,
                                    ifelse(full_file$Final_CAP1=="law_and_crime", 12,
                                     ifelse(full_file$Final_CAP1=="social policy", 13, 
                                      ifelse(full_file$Final_CAP1=="regional policy", 14, 
                                        ifelse(full_file$Final_CAP1=="banking_market", 15 , 
                                          ifelse(full_file$Final_CAP1=="defense", 16, 
                                           ifelse(full_file$Final_CAP1=="technology", 17, 
                                            ifelse(full_file$Final_CAP1=="foreign trade", 18, 
                                             ifelse (full_file$Final_CAP1=="international_affairs", 19,
                                               ifelse(full_file$Final_CAP1=="EU_governance", 20, 
                                                  ifelse(full_file$Final_CAP1=="PLWMTL", 21, 
                                                   ifelse(full_file$Final_CAP1=="culture", 23, 
                                                    ifelse(full_file$Final_CAP1=="fisheries", 24, 
                                                           999))))))))))))))))))))))
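For readability, the same mapping can be written once as a named lookup vector rather than 22 nested ifelse() calls. A minimal sketch, assuming Final_CAP1 has no missing values (the nested ifelse() propagates NA, while the fallback below recodes anything unmatched to 999):

# Sketch: same CAP-code mapping via a named lookup vector
cap_lookup <- c("macroeconomics" = 1, "civil liberties" = 2, "health" = 3,
                "agriculture" = 4, "labour and employment" = 5, "education" = 6,
                "environment" = 7, "energy" = 8, "immigration" = 9,
                "transportation" = 10, "law_and_crime" = 12, "social policy" = 13,
                "regional policy" = 14, "banking_market" = 15, "defense" = 16,
                "technology" = 17, "foreign trade" = 18, "international_affairs" = 19,
                "EU_governance" = 20, "PLWMTL" = 21, "culture" = 23, "fisheries" = 24)
capcode_check <- unname(cap_lookup[as.character(full_file$Final_CAP1)])
capcode_check[is.na(capcode_check)] <- 999   # same fallback value as the nested ifelse
# table(capcode_check == full_file$capcode)  # sanity check: should be all TRUE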




#load the results of the second round of scaling with only extra positive candidates included
getwd()
## [1] "C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/Second_scaling_summaries"
#setwd("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/Second_scaling_summaries")
data=read.csv("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/Second_scaling_summaries/210302_legis_sum_cod_scaled.csv")

colnames(data)
##  [1] "COD"                    "Unnamed..0_x"           "LR_Predictions_wo.bp"  
##  [4] "LR_Probabilities_wo.bp" "Unnamed..0_y"           "SIF_LR_Predictions"    
##  [7] "SIF_LR_Probs"           "SIF_Pos_Similarity"     "SIF_Neg_Similarity"    
## [10] "LR_SIF_Agree"
#rename columns for merging the datasets

names(data)[1] <- "cod"
data_m=left_join(full_file, data, "cod")
 
colnames(data_m) 
##   [1] "X"                                "cod"                             
##   [3] "requests"                         "Trilogue_progress"               
##   [5] "EUROVOCdescriptor"                "Subjectmatter"                   
##   [7] "Macroeconomics"                   "Rights"                          
##   [9] "Health"                           "Agriculture"                     
##  [11] "Labour"                           "Education"                       
##  [13] "Environment"                      "Energy"                          
##  [15] "Immigration"                      "Transportation"                  
##  [17] "LawCrime"                         "SocialPolicy"                    
##  [19] "RegionalPolicy"                   "Market"                          
##  [21] "Defence"                          "Technology"                      
##  [23] "ForeignTrade"                     "InternationalRelations"          
##  [25] "EUgovernance"                     "PLWMTI"                          
##  [27] "CultureMedia"                     "Fisheries"                       
##  [29] "maxval"                           "maxvar1"                         
##  [31] "maxval2"                          "maxvar2"                         
##  [33] "maxval3"                          "maxvar3"                         
##  [35] "maxval4"                          "maxvar4"                         
##  [37] "maxval5"                          "maxvar5"                         
##  [39] "CAP1"                             "CAP2"                            
##  [41] "CAP3"                             "CAP4"                            
##  [43] "cap1_count"                       "cap2_count"                      
##  [45] "cap3_count"                       "cap4_count"                      
##  [47] "Final_CAP1"                       "a"                               
##  [49] "celexnumber"                      "form"                            
##  [51] "comm"                             "nofterms_in_eurovoc"             
##  [53] "procedure"                        "interinstitutionalprocedurecodey"
##  [55] "interinstitutionalprocedurecoden" "dateofdocument"                  
##  [57] "dateofpublication"                "documentyear"                    
##  [59] "dateofeffect"                     "dateofendofvalidity"             
##  [61] "deadline"                         "inforceindicator"                
##  [63] "no_recitals"                      "no_articles"                     
##  [65] "wordcount"                        "delegated"                       
##  [67] "implementing"                     "proposal_date"                   
##  [69] "oppcomms"                         "amendments"                      
##  [71] "rcv_yes"                          "rcv_no"                          
##  [73] "rcv_abst"                         "rcv_yes2"                        
##  [75] "rcv_no2"                          "rcv_abst2"                       
##  [77] "codification"                     "recast"                          
##  [79] "repeal"                           "agreement"                       
##  [81] "compromise"                       "com_1_rap_1_name"                
##  [83] "com_1_rap_1_euparty"              "simplified_procedure"            
##  [85] "summary_dummy"                    "summary_classif"                 
##  [87] "reference"                        "ep_amendments_tabled"            
##  [89] "ep_amendments_adopted"            "cn_yes"                          
##  [91] "cn_no"                            "cn_abst"                         
##  [93] "proc_length_m"                    "proc_length_y"                   
##  [95] "cn_pref_type"                     "package_deals"                   
##  [97] "package_name"                     "package_code"                    
##  [99] "agreementwithcom"                 "lr_predictions"                  
## [101] "lr_probabilities"                 "sif_lr_predictions"              
## [103] "sif_lr_probs"                     "sif_pos_similarity"              
## [105] "sif_neg_similarity"               "lr_predictions_wobp"             
## [107] "lr_probabilities_wobp"            "example"                         
## [109] "validated"                        "completely_wrong_lrbp"           
## [111] "completely_wrong_lrwobp"          "training"                        
## [113] "capcode"                          "Unnamed..0_x"                    
## [115] "LR_Predictions_wo.bp"             "LR_Probabilities_wo.bp"          
## [117] "Unnamed..0_y"                     "SIF_LR_Predictions"              
## [119] "SIF_LR_Probs"                     "SIF_Pos_Similarity"              
## [121] "SIF_Neg_Similarity"               "LR_SIF_Agree"
#save file
write.csv( data_m, "C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/Second_scaling_summaries/03032020Full_file_second_wave_sum.csv", na=" ") 

Cross-tabulate the predictions

Cross-tabulating the first and second waves of predictions shows that 99 non-missing cases changed 'camp'. We might have to validate these manually (see the sketch after the cross tab below).

table ( data_m$LR_Predictions_wo.bp, data_m$lr_predictions_wobp) 
##    
##       0   1
##   0 387  10
##   1  89 288
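To prepare that manual validation, a minimal sketch (assuming data_m as merged above) that lists the procedures whose dichotomous prediction flipped between wave 1 and wave 2:

# Sketch: pull out the procedures that switched 'camp' between the two waves
switchers <- data_m %>%
  filter(!is.na(lr_predictions_wobp), !is.na(LR_Predictions_wo.bp),
         lr_predictions_wobp != LR_Predictions_wo.bp) %>%
  select(cod, Final_CAP1, lr_predictions_wobp, LR_Predictions_wo.bp)
nrow(switchers)   # expected to match the 99 off-diagonal cases above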

##Merge with the predictions which were generated by the extended sample (with extra “-” and extra “+” candidates identified after first wave validation).

#load the csv with all the additional candidates

data_f=read.csv("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/Second_scaling_summaries/03032021_summary_prediction_extra_candidates bfEP9_included.csv") 

names(data_f)[1] <- "cod"
data_m=left_join(data_m, data_f, "cod")
 
colnames(data_m) 
##   [1] "X"                                "cod"                             
##   [3] "requests"                         "Trilogue_progress"               
##   [5] "EUROVOCdescriptor"                "Subjectmatter"                   
##   [7] "Macroeconomics"                   "Rights"                          
##   [9] "Health"                           "Agriculture"                     
##  [11] "Labour"                           "Education"                       
##  [13] "Environment"                      "Energy"                          
##  [15] "Immigration"                      "Transportation"                  
##  [17] "LawCrime"                         "SocialPolicy"                    
##  [19] "RegionalPolicy"                   "Market"                          
##  [21] "Defence"                          "Technology"                      
##  [23] "ForeignTrade"                     "InternationalRelations"          
##  [25] "EUgovernance"                     "PLWMTI"                          
##  [27] "CultureMedia"                     "Fisheries"                       
##  [29] "maxval"                           "maxvar1"                         
##  [31] "maxval2"                          "maxvar2"                         
##  [33] "maxval3"                          "maxvar3"                         
##  [35] "maxval4"                          "maxvar4"                         
##  [37] "maxval5"                          "maxvar5"                         
##  [39] "CAP1"                             "CAP2"                            
##  [41] "CAP3"                             "CAP4"                            
##  [43] "cap1_count"                       "cap2_count"                      
##  [45] "cap3_count"                       "cap4_count"                      
##  [47] "Final_CAP1"                       "a"                               
##  [49] "celexnumber"                      "form"                            
##  [51] "comm"                             "nofterms_in_eurovoc"             
##  [53] "procedure"                        "interinstitutionalprocedurecodey"
##  [55] "interinstitutionalprocedurecoden" "dateofdocument"                  
##  [57] "dateofpublication"                "documentyear"                    
##  [59] "dateofeffect"                     "dateofendofvalidity"             
##  [61] "deadline"                         "inforceindicator"                
##  [63] "no_recitals"                      "no_articles"                     
##  [65] "wordcount"                        "delegated"                       
##  [67] "implementing"                     "proposal_date"                   
##  [69] "oppcomms"                         "amendments"                      
##  [71] "rcv_yes"                          "rcv_no"                          
##  [73] "rcv_abst"                         "rcv_yes2"                        
##  [75] "rcv_no2"                          "rcv_abst2"                       
##  [77] "codification"                     "recast"                          
##  [79] "repeal"                           "agreement"                       
##  [81] "compromise"                       "com_1_rap_1_name"                
##  [83] "com_1_rap_1_euparty"              "simplified_procedure"            
##  [85] "summary_dummy"                    "summary_classif"                 
##  [87] "reference"                        "ep_amendments_tabled"            
##  [89] "ep_amendments_adopted"            "cn_yes"                          
##  [91] "cn_no"                            "cn_abst"                         
##  [93] "proc_length_m"                    "proc_length_y"                   
##  [95] "cn_pref_type"                     "package_deals"                   
##  [97] "package_name"                     "package_code"                    
##  [99] "agreementwithcom"                 "lr_predictions"                  
## [101] "lr_probabilities"                 "sif_lr_predictions"              
## [103] "sif_lr_probs"                     "sif_pos_similarity"              
## [105] "sif_neg_similarity"               "lr_predictions_wobp"             
## [107] "lr_probabilities_wobp"            "example"                         
## [109] "validated"                        "completely_wrong_lrbp"           
## [111] "completely_wrong_lrwobp"          "training"                        
## [113] "capcode"                          "Unnamed..0_x"                    
## [115] "LR_Predictions_wo.bp.x"           "LR_Probabilities_wo.bp.x"        
## [117] "Unnamed..0_y"                     "SIF_LR_Predictions.x"            
## [119] "SIF_LR_Probs.x"                   "SIF_Pos_Similarity.x"            
## [121] "SIF_Neg_Similarity.x"             "LR_SIF_Agree"                    
## [123] "LR_Predictions_wo.bp.y"           "LR_Probabilities_wo.bp.y"        
## [125] "SIF_LR_Predictions.y"             "SIF_LR_Probs.y"                  
## [127] "SIF_Pos_Similarity.y"             "SIF_Neg_Similarity.y"
table ( data_m$LR_Predictions_wo.bp.x, data_m$LR_Predictions_wo.bp.y) 
##    
##       0   1
##   0 389   5
##   1 163 213
#save file
write.csv( data_m, "C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/Second_scaling_summaries/03032020Full_file_second_wave_sum_expanded.csv", na=" ")
col1<-rgb(225, 25, 27, alpha=255, max=255)
col2<-rgb(30, 93, 160, alpha=255, max=255)
col3<-rgb(255, 220, 69, alpha=255, max=255)
col1a<-rgb(225, 25, 27, alpha=195, max=255)
col2a<-rgb(30, 93, 160, alpha=195, max=255)
col3a<-rgb(255, 220, 69, alpha=195, max=255)
col3b<-rgb(248, 202, 8, alpha=255, max=255)
bg.col<-rgb(255, 251, 245, alpha=255,max=255)
dark.color=rgb(35, 35, 58, max=255) # dark color


#extra colors

main.color=rgb(238, 80, 121, alpha=175, max=255) #main color
main.color2<-rgb(64,193,230, alpha=175, max=255)
main.color3<-rgb(50,100,10, alpha=175, max=255)
main.color4<-rgb(80,10,100, alpha=175, max=255)
main.color5<-rgb(155,138,4, alpha=175, max=255)

main.colora=rgb(238, 80, 121, alpha=255, max=255) #main color
main.color2a<-rgb(64,193,230, alpha=255, max=255)
main.color3a<-rgb(50,100,10, alpha=255, max=255)
main.color4a<-rgb(80,10,100, alpha=255, max=255)
main.color5a<-rgb(155,138,4, alpha=255, max=255)

colors<-c(main.color, main.color3, main.color4, main.color2, main.color5 )
colors2<-c(main.colora, main.color3a, main.color4a, main.color2a, main.color5a)

background.color=rgb(255, 251, 245, max=255) #color for the background
dark.color=rgb(35, 35, 58, max=255) # dark color
library(ggplot2)
library(ggridges)
library(forcats)
#using only extra positive cases


p=ggplot(data_m, aes(x =data_m$LR_Probabilities_wo.bp.x,  y = data_m$Final_CAP1, fill=data_m$Final_CAP1)) +
 geom_density_ridges( alpha = 0.6) + 
  
  labs(title ="Figure 1. Predictions of the Logit model with extra positive cases", x='LR without bullet points', y='CAP major category') +
  
   theme_ridges(font_size = 12, grid = TRUE) +   theme(legend.title = element_blank(), 
                         plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
                         plot.subtitle = element_text(color=dark.color, size=10, face="italic"), 
                          axis.title.y = element_text(size = 8, face='italic', hjust = 0.5),
                          axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'), 
                          legend.text=element_blank())

 # coord_cartesian(ylim = c(1, 50), clip = 'off') 

p <- p + theme(legend.position = "none")
print(p)
## Warning: Use of `data_m$LR_Probabilities_wo.bp.x` is discouraged. Use
## `LR_Probabilities_wo.bp.x` instead.
## Warning: Use of `data_m$Final_CAP1` is discouraged. Use `Final_CAP1` instead.

## Warning: Use of `data_m$Final_CAP1` is discouraged. Use `Final_CAP1` instead.
## Picking joint bandwidth of 0.0691
## Warning: Removed 102 rows containing non-finite values (stat_density_ridges).
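As the warnings note, columns should be referenced directly inside aes() rather than with data_m$; a sketch of the same call in that form:

# Sketch: same ridge plot, referencing the columns directly as ggplot2 recommends
ggplot(data_m, aes(x = LR_Probabilities_wo.bp.x, y = Final_CAP1, fill = Final_CAP1)) +
  geom_density_ridges(alpha = 0.6) +
  labs(title = "Figure 1. Predictions of the Logit model with extra positive cases",
       x = "LR without bullet points", y = "CAP major category") +
  theme_ridges(font_size = 12, grid = TRUE) +
  theme(legend.position = "none")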

##let's take a look at what happens when the expanded sample of both extra + and extra - candidates is used

pn=ggplot(data_m, aes(x =data_m$LR_Probabilities_wo.bp.y,  y = data_m$Final_CAP1, fill=data_m$Final_CAP1)) +
 geom_density_ridges( alpha = 0.6) + 
  
  labs(title ="Figure 2. Predictions of the Logit model extra + and - cases", x='LR without bullet points', y='CAP major category') +
  
   theme_ridges(font_size = 12, grid = TRUE) +   theme(legend.title = element_text(color=dark.color, size=10), 
                         plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
                         plot.subtitle = element_text(color=dark.color, size=10, face="italic"), 
                          axis.title.y = element_text(size = 8, face='italic', hjust = 0.5),
                          axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'), 
                          legend.text=element_text(size=8), legend.key.size = unit(0.4, 'cm'))

 # coord_cartesian(ylim = c(1, 50), clip = 'off') 

pn <- pn + theme(legend.position = "none")
print(pn)
## Warning: Use of `data_m$LR_Probabilities_wo.bp.y` is discouraged. Use
## `LR_Probabilities_wo.bp.y` instead.
## Warning: Use of `data_m$Final_CAP1` is discouraged. Use `Final_CAP1` instead.

## Warning: Use of `data_m$Final_CAP1` is discouraged. Use `Final_CAP1` instead.
## Picking joint bandwidth of 0.0777
## Warning: Removed 119 rows containing non-finite values (stat_density_ridges).

##results are unclear ==> a time series might be more useful ==> first, only extra positives

#get the year indicator


#quick rename of the messy variables
names(data_m)[names(data_m) == "interinstitutionalprocedurecodey"] <- "inter_year"   # I will use the Year indicated in the procedure code
#names(data1)[names(data1) == "interinstitutionalprocedurecodey"] <- "inter_year"

data_m <- data_m %>%  mutate(inter_year = replace(inter_year, startsWith(cod, "2018"), 2018)) %>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2017"), 2017))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2016"), 2016))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2015"), 2015)) %>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2014"), 2014))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2013"), 2013))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2012"), 2012)) %>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2019"), 2019))%>%
  mutate(inter_year = replace(inter_year, startsWith(cod, "2020"), 2020))
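The same overwrites can be done in one step by taking the four-digit prefix of the procedure code. A sketch, assuming cod always starts with the year for procedures tabled in 2012-2020:

# Sketch: overwrite inter_year from the COD prefix in a single mutate()
data_m <- data_m %>%
  mutate(cod_year = substr(cod, 1, 4),
         inter_year = ifelse(cod_year %in% as.character(2012:2020),
                             cod_year, inter_year)) %>%
  select(-cod_year)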

#for double-year entries in the procedure year, use the year indicated in the procedure name
data_m <- data_m %>% mutate(inter_year = ifelse(celexnumber == "32012D0243", 2010,  inter_year))
data_m <- data_m %>% mutate(inter_year = ifelse(celexnumber == "32011R1232", 2008,  inter_year)) 

head(data_m$inter_year)
## [1] "2005" "2006" "2006" "2007" "2007" "2007"

##with extra positives and negatives==> see below

data_ts_cap=subset(data_m, !is.na(LR_Probabilities_wo.bp.x ))

yearly_lrprobability_cap <- data_ts_cap %>%
    group_by(inter_year,  Final_CAP1) %>%
    summarize(avg_prob_cap = mean(LR_Probabilities_wo.bp.x ))
## `summarise()` has grouped output by 'inter_year'. You can override using the `.groups` argument.
ts_pos= ggplot(data =subset(yearly_lrprobability_cap, !is.na(avg_prob_cap)),
                   aes(y=avg_prob_cap, x=inter_year, group=Final_CAP1, color=Final_CAP1)) +
                    geom_point()    +
                    geom_line() +
                    theme_minimal()+
                    theme( plot.title = element_text(color=dark.color, size=9, face="italic", hjust=0.5),
                         plot.subtitle = element_text(color=dark.color, size=9, face="italic"), 
                          axis.title.y = element_text(size = 6, face='italic'),
                          axis.title.x = element_text(color=dark.color, hjust=0.5, size=6, face='italic'))+
                     labs(
                    title = "Figure 3. TS by CAP when only EXTRA Positive candidates used",
                    x = "Year the proposal tabled",
                    y = "Probality of Expanding the EU Competence"
                  ) 

ts_pos= ts_pos+facet_wrap(~Final_CAP1, ncol= 7, scales="fixed")+theme(strip.text.x = element_text(size =6, colour = "black"))
ts_pos=ts_pos+  theme(legend.position="none" ) # drop the positions 
ticks=element_text(face ="italic", color = "black", size = 5)
ts_pos + scale_x_discrete(breaks=c("2010","2015","2020"),
        labels=c("2010", "2015", "2020"))+theme(axis.text.x = ticks)   # adjust the N and the size of the ticks

yearly_lrprobability_cap_y <- data_ts_cap %>%
    group_by(inter_year,  Final_CAP1) %>%
    summarize(avg_prob_cap_y = mean(LR_Probabilities_wo.bp.y ))
## `summarise()` has grouped output by 'inter_year'. You can override using the `.groups` argument.
ts_posneg= ggplot(data =subset(yearly_lrprobability_cap_y, !is.na(avg_prob_cap_y)),
                   aes(y=avg_prob_cap_y, x=inter_year, group=Final_CAP1, color=Final_CAP1)) +
                    geom_point()    +
                    geom_line() +
                    theme_minimal()+
                    theme( plot.title = element_text(color=dark.color, size=9, face="italic", hjust=0.5),
                         plot.subtitle = element_text(color=dark.color, size=9, face="italic"), 
                          axis.title.y = element_text(size = 6, face='italic'),
                          axis.title.x = element_text(color=dark.color, hjust=0.5, size=6, face='italic'))+
                     labs(
                    title = "Figure 4. TS by CAP when extra NEGATIVE & POSITIVE candidates used",
                    x = "Year the proposal tabled",
                    y = "Probality of Expanding the EU Competence"
                  ) 

ts_posneg= ts_posneg+facet_wrap(~Final_CAP1, ncol= 7, scales="fixed")+theme(strip.text.x = element_text(size =6, colour = "black"))
ts_posneg=ts_posneg+  theme(legend.position="none" ) # drop the positions 
ticks=element_text(face ="italic", color = "black", size = 5)
ts_posneg+ scale_x_discrete(breaks=c("2010","2015","2020"),
        labels=c("2010", "2015", "2020"))+theme(axis.text.x = ticks)   # adjust the N and the size of the ticks

##Overview

Overall, it seems that there are only minor changes and the patterns remain more or less the same.

*The use of extra negative cases results in more clustering around 0.5 values (see the change of scale on the Y-axis and the location of the maxima within the scaled summaries). One should also keep in mind the substantial loss of prediction precision here (to be summarized in the main overview file);

*The use of the scaling based on the updated sample of only positive cases results in higher precision and recall of the model, suggesting fewer false positive and false negative predictions. The TS and the distribution of the average probability indicate only minor alterations to the key patterns identified in the original scaling;

*However, there are still a couple of things to note: when the sample of candidates is enriched with only positive cases, the updated scaling smooths the changes in areas such as immigration and international affairs (positive spikes in the first round of scaling) and labour and employment (a negative drop around 2010). Details to follow.

Next steps: include the coding for the extreme cases that were used in the training sets.

#subset the needed info and get rid of the unused DFs in the environment
#get maximum values for the first wave of scaling in
#table( data_m$summary_classif, data_m$cod)

#orig_posit=data_m[data_m$summary_classif==1, "cod" ]

data_m=data_m %>%
  #Step 1: replace with '1' the predictions and probabilities for the candidates from the original training sample: positive summaries
     mutate(lr_predictions_wobp=replace(lr_predictions_wobp, summary_classif==1, 1)) %>%
     mutate(lr_probabilities_wobp=replace(lr_probabilities_wobp, summary_classif==1, 1)) %>%
  #Step1.2. now the negative summaries 
      mutate(lr_predictions_wobp=replace(lr_predictions_wobp, summary_classif==-1, 0)) %>%
     mutate(lr_probabilities_wobp=replace(lr_probabilities_wobp, summary_classif==-1, 0)) %>%
  #Step 2. now do the same replacing for the scaling2 (including only new + candidates). Start with positive
    mutate(LR_Predictions_wo.bp.x=replace(LR_Predictions_wo.bp.x, summary_classif==1, 1)) %>%
     mutate(LR_Probabilities_wo.bp.x=replace(LR_Probabilities_wo.bp.x, summary_classif==1, 1)) %>%
  #Step 2.2 now scaling 2==> replacement for the original negative candidates
         mutate(LR_Predictions_wo.bp.x=replace(LR_Predictions_wo.bp.x, summary_classif==-1, 0)) %>%
         mutate(LR_Probabilities_wo.bp.x=replace(LR_Probabilities_wo.bp.x, summary_classif==-1, 0)) %>%
  #Step 3. Replacing the initial + candidates in the probabilities and predictions for scaling 3 (including neg and pos candidates)
            mutate(LR_Predictions_wo.bp.y=replace(LR_Predictions_wo.bp.y, summary_classif==1, 1)) %>%
            mutate(LR_Probabilities_wo.bp.y=replace(LR_Probabilities_wo.bp.y, summary_classif==1, 1)) %>%
  #Step3.2. now scaling 3==> replacement for the original negative candidates
                 mutate(LR_Predictions_wo.bp.y=replace(LR_Predictions_wo.bp.y, summary_classif==-1, 0)) %>%
                   mutate(LR_Probabilities_wo.bp.y=replace(LR_Probabilities_wo.bp.y, summary_classif==-1, 0))
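The six pairs of replace() calls above can also be written once with across(). A sketch under the same assumption that summary_classif is 1 for the original positive and -1 for the original negative training summaries (dplyr >= 1.0, as loaded above):

# Sketch: hard-code the original training summaries across all prediction and
# probability columns in one pass
train_cols <- c("lr_predictions_wobp", "lr_probabilities_wobp",
                "LR_Predictions_wo.bp.x", "LR_Probabilities_wo.bp.x",
                "LR_Predictions_wo.bp.y", "LR_Probabilities_wo.bp.y")
data_m <- data_m %>%
  mutate(across(all_of(train_cols),
                ~ replace(replace(.x, summary_classif == 1, 1),
                          summary_classif == -1, 0)))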

colnames(data_m)
##   [1] "X"                                "cod"                             
##   [3] "requests"                         "Trilogue_progress"               
##   [5] "EUROVOCdescriptor"                "Subjectmatter"                   
##   [7] "Macroeconomics"                   "Rights"                          
##   [9] "Health"                           "Agriculture"                     
##  [11] "Labour"                           "Education"                       
##  [13] "Environment"                      "Energy"                          
##  [15] "Immigration"                      "Transportation"                  
##  [17] "LawCrime"                         "SocialPolicy"                    
##  [19] "RegionalPolicy"                   "Market"                          
##  [21] "Defence"                          "Technology"                      
##  [23] "ForeignTrade"                     "InternationalRelations"          
##  [25] "EUgovernance"                     "PLWMTI"                          
##  [27] "CultureMedia"                     "Fisheries"                       
##  [29] "maxval"                           "maxvar1"                         
##  [31] "maxval2"                          "maxvar2"                         
##  [33] "maxval3"                          "maxvar3"                         
##  [35] "maxval4"                          "maxvar4"                         
##  [37] "maxval5"                          "maxvar5"                         
##  [39] "CAP1"                             "CAP2"                            
##  [41] "CAP3"                             "CAP4"                            
##  [43] "cap1_count"                       "cap2_count"                      
##  [45] "cap3_count"                       "cap4_count"                      
##  [47] "Final_CAP1"                       "a"                               
##  [49] "celexnumber"                      "form"                            
##  [51] "comm"                             "nofterms_in_eurovoc"             
##  [53] "procedure"                        "inter_year"                      
##  [55] "interinstitutionalprocedurecoden" "dateofdocument"                  
##  [57] "dateofpublication"                "documentyear"                    
##  [59] "dateofeffect"                     "dateofendofvalidity"             
##  [61] "deadline"                         "inforceindicator"                
##  [63] "no_recitals"                      "no_articles"                     
##  [65] "wordcount"                        "delegated"                       
##  [67] "implementing"                     "proposal_date"                   
##  [69] "oppcomms"                         "amendments"                      
##  [71] "rcv_yes"                          "rcv_no"                          
##  [73] "rcv_abst"                         "rcv_yes2"                        
##  [75] "rcv_no2"                          "rcv_abst2"                       
##  [77] "codification"                     "recast"                          
##  [79] "repeal"                           "agreement"                       
##  [81] "compromise"                       "com_1_rap_1_name"                
##  [83] "com_1_rap_1_euparty"              "simplified_procedure"            
##  [85] "summary_dummy"                    "summary_classif"                 
##  [87] "reference"                        "ep_amendments_tabled"            
##  [89] "ep_amendments_adopted"            "cn_yes"                          
##  [91] "cn_no"                            "cn_abst"                         
##  [93] "proc_length_m"                    "proc_length_y"                   
##  [95] "cn_pref_type"                     "package_deals"                   
##  [97] "package_name"                     "package_code"                    
##  [99] "agreementwithcom"                 "lr_predictions"                  
## [101] "lr_probabilities"                 "sif_lr_predictions"              
## [103] "sif_lr_probs"                     "sif_pos_similarity"              
## [105] "sif_neg_similarity"               "lr_predictions_wobp"             
## [107] "lr_probabilities_wobp"            "example"                         
## [109] "validated"                        "completely_wrong_lrbp"           
## [111] "completely_wrong_lrwobp"          "training"                        
## [113] "capcode"                          "Unnamed..0_x"                    
## [115] "LR_Predictions_wo.bp.x"           "LR_Probabilities_wo.bp.x"        
## [117] "Unnamed..0_y"                     "SIF_LR_Predictions.x"            
## [119] "SIF_LR_Probs.x"                   "SIF_Pos_Similarity.x"            
## [121] "SIF_Neg_Similarity.x"             "LR_SIF_Agree"                    
## [123] "LR_Predictions_wo.bp.y"           "LR_Probabilities_wo.bp.y"        
## [125] "SIF_LR_Predictions.y"             "SIF_LR_Probs.y"                  
## [127] "SIF_Pos_Similarity.y"             "SIF_Neg_Similarity.y"
data=data_m[c(2, 47, 51, 54, 113, 100:108, 115:128) ]

rm(data_f)
rm(full_file, joined_file, negolinks)
## Warning in rm(full_file, joined_file, negolinks): object 'joined_file' not found
## Warning in rm(full_file, joined_file, negolinks): object 'negolinks' not found
rm(data_ts_cap)

#rename variables in a straightforward way
#only extra positive candidates: ==> all marked with x
data=data %>% 
  rename(LR_Predictions_posit= LR_Predictions_wo.bp.x, 
       LR_Probabilities_posit=LR_Probabilities_wo.bp.x,
        SIF_LR_Predictions_posit=SIF_LR_Predictions.x,
       SIF_LR_Probs_posit= SIF_LR_Probs.x,
       SIF_Pos_Similarity_posit=SIF_Pos_Similarity.x)

##same thing for the mix of the extra negative and extra positive candidates
data= data%>%
  rename(LR_Predictions_NP= LR_Predictions_wo.bp.y, 
       LR_Probabilities_NP=LR_Probabilities_wo.bp.y,
        SIF_LR_Predictions_NP=SIF_LR_Predictions.y,
       SIF_LR_Probs_NP= SIF_LR_Probs.y,
       SIF_Pos_Similarity_NP=SIF_Pos_Similarity.y)




# information on training summaries ==> 17 negative and 3 positive

data$LR_Predictions_NP[data$cod=="2019/0010(COD)"] =0

data$LR_Predictions_NP[data$cod=="2019/0009(COD)"]=0

data$LR_Predictions_NP[data$cod=="2016/0075(COD)"]=0

data$LR_Predictions_NP[data$cod=="2016/0034(COD)"]=0

data$LR_Predictions_NP[data$cod=="2016/0033(COD)"]=0

data$LR_Predictions_NP[data$cod=="2016/0029(COD)"]=0

data$LR_Predictions_NP[data$cod=="2015/0218(COD)"]=0

data$LR_Predictions_NP[data$cod=="2014/0279(COD)"]=0

data$LR_Predictions_NP[data$cod=="2014/0213(COD)"]=0

data$LR_Predictions_NP[data$cod=="2014/0164(COD)"]=0

data$LR_Predictions_NP[data$cod=="2013/0327(COD)"]=0

data$LR_Predictions_NP[data$cod=="2013/0303(COD)"]=0

data$LR_Predictions_NP[data$cod=="2013/0013(COD)"]=0

data$LR_Predictions_NP[data$cod=="2011/0002(COD)"]=0

data$LR_Predictions_NP[data$cod=="2009/0173(COD)"]=0

data$LR_Predictions_NP[data$cod=="2008/0009(COD)"]=0

data$LR_Predictions_NP[data$cod=="2015/0295(COD)"]=0

#positive candidates
data$LR_Predictions_NP[data$cod=="2015/0289(COD)"]=1

data$LR_Predictions_NP[data$cod=="2009/0077(COD)"]=1

data$LR_Predictions_NP[data$cod=="2009/0076(COD)"]=1




#same thing for the probabilities in the LR with neg&posit

data$LR_Probabilities_NP[data$cod=="2019/0010(COD)"] =0

data$LR_Probabilities_NP[data$cod=="2019/0009(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2016/0075(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2016/0034(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2016/0033(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2016/0029(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2015/0218(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2014/0279(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2014/0213(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2014/0164(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2013/0327(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2013/0303(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2013/0013(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2011/0002(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2009/0173(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2008/0009(COD)"]=0

data$LR_Probabilities_NP[data$cod=="2015/0295(COD)"]=0

#positive candidates
data$LR_Probabilities_NP[data$cod=="2015/0289(COD)"]=1

data$LR_Probabilities_NP[data$cod=="2009/0077(COD)"]=1

data$LR_Probabilities_NP[data$cod=="2009/0076(COD)"]=1



##make replacements for only positives
data$LR_Predictions_posit[data$cod=="2015/0289(COD)"]=1

data$LR_Predictions_posit[data$cod=="2009/0077(COD)"]=1

data$LR_Predictions_posit[data$cod=="2009/0076(COD)"]=1


#positive sample probabilities  LR_Probabilities_posit

data$LR_Probabilities_posit[data$cod=="2015/0289(COD)"]=1

data$LR_Probabilities_posit[data$cod=="2009/0077(COD)"]=1

data$LR_Probabilities_posit[data$cod=="2009/0076(COD)"]=1
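For future rounds, the same training-summary overwrites can be expressed more compactly with %in%. A sketch, assuming the COD lists below are exactly the 17 negative and 3 positive training summaries listed above:

# Sketch: one pass over the training-summary CODs instead of per-case assignments
neg_cods <- c("2019/0010(COD)", "2019/0009(COD)", "2016/0075(COD)", "2016/0034(COD)",
              "2016/0033(COD)", "2016/0029(COD)", "2015/0218(COD)", "2014/0279(COD)",
              "2014/0213(COD)", "2014/0164(COD)", "2013/0327(COD)", "2013/0303(COD)",
              "2013/0013(COD)", "2011/0002(COD)", "2009/0173(COD)", "2008/0009(COD)",
              "2015/0295(COD)")
pos_cods <- c("2015/0289(COD)", "2009/0077(COD)", "2009/0076(COD)")

data <- data %>%
  mutate(LR_Predictions_NP      = replace(LR_Predictions_NP,      cod %in% neg_cods, 0),
         LR_Probabilities_NP    = replace(LR_Probabilities_NP,    cod %in% neg_cods, 0),
         LR_Predictions_NP      = replace(LR_Predictions_NP,      cod %in% pos_cods, 1),
         LR_Probabilities_NP    = replace(LR_Probabilities_NP,    cod %in% pos_cods, 1),
         LR_Predictions_posit   = replace(LR_Predictions_posit,   cod %in% pos_cods, 1),
         LR_Probabilities_posit = replace(LR_Probabilities_posit, cod %in% pos_cods, 1))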

##plot by category side by side==> a bit of a pain, but doable and might be the only way to see any differences

# get the time-series subsamples and indicators calculated
ts_set2<-  data %>%       ## only extra positive candidates
    group_by(Final_CAP1, inter_year) %>%
    summarize(avg_prob_posit = mean(LR_Probabilities_posit)) %>%
  ungroup()
## `summarise()` has grouped output by 'Final_CAP1'. You can override using the `.groups` argument.
ts_set3=data %>% ## extra negative and positive
   group_by(Final_CAP1, inter_year)  %>%
   summarize(avg_prob_NP= mean(LR_Probabilities_NP)) 
## `summarise()` has grouped output by 'Final_CAP1'. You can override using the `.groups` argument.
ts_set1= data%>% # original scaling
     group_by(Final_CAP1, inter_year)  %>%
   summarize(avg_prob_orig=mean(lr_probabilities_wobp))
## `summarise()` has grouped output by 'Final_CAP1'. You can override using the `.groups` argument.
rm(ts_set)
## Warning in rm(ts_set): object 'ts_set' not found
p=ggplot()+
  geom_line(data=subset(ts_set1, !is.na(avg_prob_orig)), aes(x=inter_year, y=avg_prob_orig , group=Final_CAP1, color="red"))+
  
  geom_line(data=subset(ts_set2, !is.na(avg_prob_posit)), aes(x=inter_year, y=avg_prob_posit , group=Final_CAP1, color="darkgreen"))+
  geom_line(data=subset(ts_set3, !is.na(avg_prob_NP)), aes(x=inter_year, y=avg_prob_NP , group=Final_CAP1 , color="blue"))+
   theme_minimal()+
  theme( plot.title = element_text(color=dark.color, size=9, face="italic", hjust=0.5),
                         plot.subtitle = element_text(color=dark.color, size=9, face="italic"), 
                          axis.title.y = element_text(size = 6, face='italic'),
                          axis.title.x = element_text(color=dark.color, hjust=0.5, size=6, face='italic'))+
                     labs(
                    title = " Figure 5. TS by CAP  using al three scaling attempts",
                    x = "Year the proposal tabled",
                    y = "Probality of Expanding the EU Competence"
                  ) 

p=p + scale_colour_manual(breaks = c("red", "darkgreen", "blue"),
                          values = c("red" = "red", "darkgreen" = "darkgreen", "blue" = "blue"),
                          labels = c("red" = "Original", "darkgreen" = "Extra +", "blue" = "Extra + & -"))  # name the values and labels so the legend matches the series

p=  p+  theme(legend.position="left" ) # place the legend on the left

ticks=element_text(face ="italic", color = "black", size = 5)
p=p+ scale_x_discrete(breaks=c("2010","2015","2020"),
        labels=c("2010", "2015", "2020"))+theme(axis.text.x = ticks)   # adjust the N and the size of the ticks
p+facet_wrap(~Final_CAP1, ncol= 7, scales="fixed")+theme(strip.text.x = element_text(size =6, colour = "black"))

## There are some cases which changed their 'camp': in a nutshell, some procedures were classified as, for instance, '+' in the first round of scaling but turned out to be in the '-' group after the second or third round of scaling. Here, I will take a look at what those cases are and whether any specific pattern emerges across them.

#coding the changes from W1 to W2. NOTA BENE: in W1 15 procedures were omitted==> hence the NA values
data=data%>%
  mutate(W1W2_change=ifelse(lr_predictions_wobp==LR_Predictions_posit, 0, 1)) 

table(data$W1W2_change)
## 
##   0   1 
## 771 101
head(data$W1W2_change)
## [1] 1 0 0 0 0 0
##track down changes from W2 to W3
data=data%>% 
  mutate(W2W3_change=ifelse(LR_Predictions_posit==LR_Predictions_NP, 0, 1)) 

table(data$W2W3_change)
## 
##   0   1 
## 712 173
#track the change from W1 to W3
data=data%>% 
  mutate(W1W3_change=ifelse(lr_predictions_wobp==LR_Predictions_NP, 0, 1)) 

table(data$W1W3_change)
## 
##   0   1 
## 754 118
##labels make our lives easier

data$W1W2_change=ordered(data$W1W2_change, levels=c(0, 1), labels=c("Same class", "Different class"))
data$W2W3_change=ordered(data$W2W3_change, levels=c(0, 1), labels=c("Same class", "Different class"))
data$W1W3_change=ordered(data$W1W3_change, levels=c(0, 1), labels=c("Same class", "Different class"))
table(data$W1W3_change)
## 
##      Same class Different class 
##             754             118
 library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
 tabyl(data, Final_CAP1, W1W2_change ) 
##             Final_CAP1 Same class Different class NA_
##            agriculture         29               8   1
##         banking_market        196              21   0
##        civil liberties         11               0   1
##                culture          7               1   0
##                defense          4               0   0
##              education          1               1   0
##                 energy         17               1   1
##            environment         66               5   1
##          EU_governance         29               2   4
##              fisheries         37               6   0
##          foreign trade         63               1   1
##                 health         16               2   0
##            immigration         41              12   1
##  international_affairs         33               2   1
##  labour and employment         28               4   0
##          law_and_crime         28               2   4
##         macroeconomics         28               4   0
##        regional policy          9               3   1
##          social policy          6               3   0
##             technology         71               9   1
##         transportation         51              14   0
 # 4 procedures are omitted in W2 and W3 scaling

 #all good and nice, but if you need the shares ==> look at the % (see the sketch below)
  #tabyl(data, Final_CAP1, W1W2_change )%>%
   #  adorn_percentages("col") %>%
   #adorn_pct_formatting(digits = 1)
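A sketch of that percentage view, also keeping the raw counts in parentheses (janitor's adorn_* helpers are loaded above):

tabyl(data, Final_CAP1, W1W2_change) %>%
  adorn_percentages("col") %>%
  adorn_pct_formatting(digits = 1) %>%
  adorn_ns()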

 ##### W2 vs W3
 tabyl(data, Final_CAP1, W2W3_change ) 
##             Final_CAP1 Same class Different class NA_
##            agriculture         31               7   0
##         banking_market        170              47   0
##        civil liberties         11               1   0
##                culture          6               2   0
##                defense          4               0   0
##              education          1               1   0
##                 energy         17               1   1
##            environment         58              14   0
##          EU_governance         31               4   0
##              fisheries         33              10   0
##          foreign trade         61               4   0
##                 health         15               3   0
##            immigration         42              12   0
##  international_affairs         31               5   0
##  labour and employment         28               4   0
##          law_and_crime         29               3   2
##         macroeconomics         22              10   0
##        regional policy         11               2   0
##          social policy          5               4   0
##             technology         65              15   1
##         transportation         41              24   0
##W1 vs W3
tabyl(data, Final_CAP1, W1W3_change) 
##             Final_CAP1 Same class Different class NA_
##            agriculture         34               3   1
##         banking_market        179              38   0
##        civil liberties         10               1   1
##                culture          7               1   0
##                defense          4               0   0
##              education          2               0   0
##                 energy         18               0   1
##            environment         58              13   1
##          EU_governance         27               4   4
##              fisheries         39               4   0
##          foreign trade         61               3   1
##                 health         17               1   0
##            immigration         43              10   1
##  international_affairs         32               3   1
##  labour and employment         28               4   0
##          law_and_crime         29               1   4
##         macroeconomics         26               6   0
##        regional policy         11               1   1
##          social policy          8               1   0
##             technology         70              10   1
##         transportation         51              14   0

##The question is how much difference there is substantively in the level of prediction if we look beyond the dummy classification ==> let's calculate the differences by procedure between the waves of predictions using the continuous probability indicator

data=data %>%
  mutate(
    pr_diff_w1w2 = lr_probabilities_wobp - LR_Probabilities_posit, 
    pr_diff_w3w2 = LR_Probabilities_NP - LR_Probabilities_posit,   # probabilities (not predictions) in both terms
    pr_diff_w1w3 = lr_probabilities_wobp - LR_Probabilities_NP)


#subset only cases when the change occurred
w1w2=subset(data, W1W2_change=="Different class" ) 
w2w3=subset(data, W2W3_change=="Different class"   )
w1w3=subset(data, W1W3_change=="Different class" )

##brewup

pinkish=rgb(1, 0,   0.6, 0.5)
greenish=rgb(0.2, 0.8, 0.6, 0.5)
kindaorange=rgb(0.8, 0.6, 0.6, 0.8)

#selecting extreme cases for labeling on the plot
w1w2$extreme=ifelse(w1w2$pr_diff_w1w2< -0.5, 1, 0)

table(w1w2$extreme)
## 
##  0  1 
## 99  2
# W1 vs W2==> group by cap category 

w1=ggplot(w1w2, aes(x=pr_diff_w1w2, y=Final_CAP1,  group=Final_CAP1)) +
          geom_point(alpha=0.5, aes(color = cut(pr_diff_w1w2, c(-Inf, -0.5, -0.25, 0.25, Inf))) )+xlim(-1, 1)+
          geom_text(data=w1w2[w1w2$extreme==1,] , aes(label=cod), size=2.5, hjust=-.2)+
             scale_colour_manual(name="Difference", 
                                values=c("(-Inf,-0.5]"=pinkish,
                                         "(-0.5,-0.25]"=kindaorange,
                                         "(-0.25,0.25]"=greenish,
                                          "(0.25, Inf]"=kindaorange),      
                                labels = c("Large difference", "Medium diffrende", "Similar", "Medium Difference"), na.translate=F)+
   theme_minimal()+
   theme( plot.title = element_text(color=dark.color, size=9, face="italic", hjust=0.5),
                         plot.subtitle = element_text(color=dark.color, size=9), 
                          axis.title.y = element_text(size = 9),
                          axis.title.x = element_text(color=dark.color, hjust=0.5, size=9))+
                     labs(
                    title = " Figure 6. Differeneces in scaling b/n W1 and W2 by CAP",
                    x = "Predicted probability",
                    y = " CAP category"
                  ) 
  
w1  

w1w3$extreme=ifelse(w1w3$pr_diff_w1w3> 0.5 | w1w3$pr_diff_w1w3< -0.5 , 1, 0)

table(w1w3$extreme)
## 
##   0   1 
## 115   3
# W1 vs W3==> group by cap category 

w3=ggplot(w1w3, aes(x=pr_diff_w1w3, y=Final_CAP1,  group=Final_CAP1)) +
          geom_point(aes(color = pr_diff_w1w3))+
          xlim(-1, 1)+
           geom_text(data=w1w3[w1w3$extreme==1,] , aes(label=cod), size=2.5, vjust=-0.2, hjust=+0.3)+
      theme_minimal()+
  
   theme( plot.title = element_text(color=dark.color, size=9, face="italic", hjust=0.5),
                         plot.subtitle = element_text(color=dark.color, size=9), 
                          axis.title.y = element_text(size = 9),
                          axis.title.x = element_text(color=dark.color, hjust=0.5, size=9),
                           legend.text=element_text(size=8), legend.key.size = unit(0.4, 'cm'))+
                     labs(col="Difference",
                    title = " Figure 7. Differeneces in scaling b/n W1 and W3 by CAP",
                    x = "Predicted probability",
                    y = " CAP category"
                  ) 
  

w3

##What do the figures above tell us: