The MA team forwarded the second round of scaling on the summary level. Below I will compare the results to the first wave of scaled predictions, and will seek to identify the key differences if any appear.
library(foreign)
library(tidyr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v dplyr 1.0.4
## v tibble 3.0.6 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
##loading the main file minus some extra details which we do not need now
library(foreign)
library(dplyr)
full_file=read.csv("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/EurLEx_CAP_scaling_merged_19022021 - Copy.csv")
full_file$capcode=ifelse(full_file$Final_CAP1=="macroeconomics", 1,
ifelse(full_file$Final_CAP1=="civil liberties", 2,
ifelse(full_file$Final_CAP1=="health", 3,
ifelse(full_file$Final_CAP1== "agriculture", 4,
ifelse(full_file$Final_CAP1=="labour and employment",5,
ifelse(full_file$Final_CAP1=="education", 6,
ifelse(full_file$Final_CAP1=="environment", 7 ,
ifelse(full_file$Final_CAP1=="energy", 8,
ifelse(full_file$Final_CAP1=="immigration", 9,
ifelse(full_file$Final_CAP1=="transportation", 10,
ifelse(full_file$Final_CAP1=="law_and_crime", 12,
ifelse(full_file$Final_CAP1=="social policy", 13,
ifelse(full_file$Final_CAP1=="regional policy", 14,
ifelse(full_file$Final_CAP1=="banking_market", 15 ,
ifelse(full_file$Final_CAP1=="defense", 16,
ifelse(full_file$Final_CAP1=="technology", 17,
ifelse(full_file$Final_CAP1=="foreign trade", 18,
ifelse (full_file$Final_CAP1=="international_affairs", 19,
ifelse(full_file$Final_CAP1=="EU_governance", 20,
ifelse(full_file$Final_CAP1=="PLWMTL", 21,
ifelse(full_file$Final_CAP1=="culture", 23,
ifelse(full_file$Final_CAP1=="fisheries", 24,
999 ))) ) ) ) )))) ))))))) )) )))
#load the results of the second round of scaling with only positive extar candidates included
getwd()
## [1] "C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/Second_scaling_summaries"
#setwd("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/Second_scaling_summaries")
data=read.csv("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/Second_scaling_summaries/210302_legis_sum_cod_scaled.csv")
colnames(data)
## [1] "COD" "Unnamed..0_x" "LR_Predictions_wo.bp"
## [4] "LR_Probabilities_wo.bp" "Unnamed..0_y" "SIF_LR_Predictions"
## [7] "SIF_LR_Probs" "SIF_Pos_Similarity" "SIF_Neg_Similarity"
## [10] "LR_SIF_Agree"
#rename columns for merging the datasets
names(data)[1] <- "cod"
data_m=left_join(full_file, data, "cod")
colnames(data_m)
## [1] "X" "cod"
## [3] "requests" "Trilogue_progress"
## [5] "EUROVOCdescriptor" "Subjectmatter"
## [7] "Macroeconomics" "Rights"
## [9] "Health" "Agriculture"
## [11] "Labour" "Education"
## [13] "Environment" "Energy"
## [15] "Immigration" "Transportation"
## [17] "LawCrime" "SocialPolicy"
## [19] "RegionalPolicy" "Market"
## [21] "Defence" "Technology"
## [23] "ForeignTrade" "InternationalRelations"
## [25] "EUgovernance" "PLWMTI"
## [27] "CultureMedia" "Fisheries"
## [29] "maxval" "maxvar1"
## [31] "maxval2" "maxvar2"
## [33] "maxval3" "maxvar3"
## [35] "maxval4" "maxvar4"
## [37] "maxval5" "maxvar5"
## [39] "CAP1" "CAP2"
## [41] "CAP3" "CAP4"
## [43] "cap1_count" "cap2_count"
## [45] "cap3_count" "cap4_count"
## [47] "Final_CAP1" "a"
## [49] "celexnumber" "form"
## [51] "comm" "nofterms_in_eurovoc"
## [53] "procedure" "interinstitutionalprocedurecodey"
## [55] "interinstitutionalprocedurecoden" "dateofdocument"
## [57] "dateofpublication" "documentyear"
## [59] "dateofeffect" "dateofendofvalidity"
## [61] "deadline" "inforceindicator"
## [63] "no_recitals" "no_articles"
## [65] "wordcount" "delegated"
## [67] "implementing" "proposal_date"
## [69] "oppcomms" "amendments"
## [71] "rcv_yes" "rcv_no"
## [73] "rcv_abst" "rcv_yes2"
## [75] "rcv_no2" "rcv_abst2"
## [77] "codification" "recast"
## [79] "repeal" "agreement"
## [81] "compromise" "com_1_rap_1_name"
## [83] "com_1_rap_1_euparty" "simplified_procedure"
## [85] "summary_dummy" "summary_classif"
## [87] "reference" "ep_amendments_tabled"
## [89] "ep_amendments_adopted" "cn_yes"
## [91] "cn_no" "cn_abst"
## [93] "proc_length_m" "proc_length_y"
## [95] "cn_pref_type" "package_deals"
## [97] "package_name" "package_code"
## [99] "agreementwithcom" "lr_predictions"
## [101] "lr_probabilities" "sif_lr_predictions"
## [103] "sif_lr_probs" "sif_pos_similarity"
## [105] "sif_neg_similarity" "lr_predictions_wobp"
## [107] "lr_probabilities_wobp" "example"
## [109] "validated" "completely_wrong_lrbp"
## [111] "completely_wrong_lrwobp" "training"
## [113] "capcode" "Unnamed..0_x"
## [115] "LR_Predictions_wo.bp" "LR_Probabilities_wo.bp"
## [117] "Unnamed..0_y" "SIF_LR_Predictions"
## [119] "SIF_LR_Probs" "SIF_Pos_Similarity"
## [121] "SIF_Neg_Similarity" "LR_SIF_Agree"
#save file
write.csv( data_m, "C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/Second_scaling_summaries/03032020Full_file_second_wave_sum.csv", na=" ")
cross tabling the first and the second wave of predictions==> there are 99 non-missing cases which changed the ‘camp’. We might have to do manual validation on these.
table ( data_m$LR_Predictions_wo.bp, data_m$lr_predictions_wobp)
##
## 0 1
## 0 387 10
## 1 89 288
##Merge with the predictions which were generated by the extended sample (with extra “-” and extra “+” candidates identified after first wave validation).
#load the csv with the all additional candidates
data_f=read.csv("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/Second_scaling_summaries/03032021_summary_prediction_extra_candidates bfEP9_included.csv")
names(data_f)[1] <- "cod"
data_m=left_join(data_m, data_f, "cod")
colnames(data_m)
## [1] "X" "cod"
## [3] "requests" "Trilogue_progress"
## [5] "EUROVOCdescriptor" "Subjectmatter"
## [7] "Macroeconomics" "Rights"
## [9] "Health" "Agriculture"
## [11] "Labour" "Education"
## [13] "Environment" "Energy"
## [15] "Immigration" "Transportation"
## [17] "LawCrime" "SocialPolicy"
## [19] "RegionalPolicy" "Market"
## [21] "Defence" "Technology"
## [23] "ForeignTrade" "InternationalRelations"
## [25] "EUgovernance" "PLWMTI"
## [27] "CultureMedia" "Fisheries"
## [29] "maxval" "maxvar1"
## [31] "maxval2" "maxvar2"
## [33] "maxval3" "maxvar3"
## [35] "maxval4" "maxvar4"
## [37] "maxval5" "maxvar5"
## [39] "CAP1" "CAP2"
## [41] "CAP3" "CAP4"
## [43] "cap1_count" "cap2_count"
## [45] "cap3_count" "cap4_count"
## [47] "Final_CAP1" "a"
## [49] "celexnumber" "form"
## [51] "comm" "nofterms_in_eurovoc"
## [53] "procedure" "interinstitutionalprocedurecodey"
## [55] "interinstitutionalprocedurecoden" "dateofdocument"
## [57] "dateofpublication" "documentyear"
## [59] "dateofeffect" "dateofendofvalidity"
## [61] "deadline" "inforceindicator"
## [63] "no_recitals" "no_articles"
## [65] "wordcount" "delegated"
## [67] "implementing" "proposal_date"
## [69] "oppcomms" "amendments"
## [71] "rcv_yes" "rcv_no"
## [73] "rcv_abst" "rcv_yes2"
## [75] "rcv_no2" "rcv_abst2"
## [77] "codification" "recast"
## [79] "repeal" "agreement"
## [81] "compromise" "com_1_rap_1_name"
## [83] "com_1_rap_1_euparty" "simplified_procedure"
## [85] "summary_dummy" "summary_classif"
## [87] "reference" "ep_amendments_tabled"
## [89] "ep_amendments_adopted" "cn_yes"
## [91] "cn_no" "cn_abst"
## [93] "proc_length_m" "proc_length_y"
## [95] "cn_pref_type" "package_deals"
## [97] "package_name" "package_code"
## [99] "agreementwithcom" "lr_predictions"
## [101] "lr_probabilities" "sif_lr_predictions"
## [103] "sif_lr_probs" "sif_pos_similarity"
## [105] "sif_neg_similarity" "lr_predictions_wobp"
## [107] "lr_probabilities_wobp" "example"
## [109] "validated" "completely_wrong_lrbp"
## [111] "completely_wrong_lrwobp" "training"
## [113] "capcode" "Unnamed..0_x"
## [115] "LR_Predictions_wo.bp.x" "LR_Probabilities_wo.bp.x"
## [117] "Unnamed..0_y" "SIF_LR_Predictions.x"
## [119] "SIF_LR_Probs.x" "SIF_Pos_Similarity.x"
## [121] "SIF_Neg_Similarity.x" "LR_SIF_Agree"
## [123] "LR_Predictions_wo.bp.y" "LR_Probabilities_wo.bp.y"
## [125] "SIF_LR_Predictions.y" "SIF_LR_Probs.y"
## [127] "SIF_Pos_Similarity.y" "SIF_Neg_Similarity.y"
table ( data_m$LR_Predictions_wo.bp.x, data_m$LR_Predictions_wo.bp.y)
##
## 0 1
## 0 389 5
## 1 163 213
#save file
write.csv( data_m, "C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/EurLex/Second_scaling_summaries/03032020Full_file_second_wave_sum_expanded.csv", na=" ")
col1<-rgb(225, 25, 27, alpha=255, max=255)
col2<-rgb(30, 93, 160, alpha=255, max=255)
col3<-rgb(255, 220, 69, alpha=255, max=255)
col1a<-rgb(225, 25, 27, alpha=195, max=255)
col2a<-rgb(30, 93, 160, alpha=195, max=255)
col3a<-rgb(255, 220, 69, alpha=195, max=255)
col3b<-rgb(248, 202, 8, alpha=255, max=255)
bg.col<-rgb(255, 251, 245, alpha=255,max=255)
dark.color=rgb(35, 35, 58, max=255) # dark color
#extra colors
main.color=rgb(238, 80, 121, alpha=175, max=255) #main color
main.color2<-rgb(64,193,230, alpha=175, max=255)
main.color3<-rgb(50,100,10, alpha=175, max=255)
main.color4<-rgb(80,10,100, alpha=175, max=255)
main.color5<-rgb(155,138,4, alpha=175, max=255)
main.colora=rgb(238, 80, 121, alpha=255, max=255) #main color
main.color2a<-rgb(64,193,230, alpha=255, max=255)
main.color3a<-rgb(50,100,10, alpha=255, max=255)
main.color4a<-rgb(80,10,100, alpha=255, max=255)
main.color5a<-rgb(155,138,4, alpha=255, max=255)
colors<-c(main.color, main.color3, main.color4, main.color2, main.color5 )
colors2<-c(main.colora, main.color3a, main.color4a, main.color2a, main.color5a)
background.color=rgb(255, 251, 245, max=255) #color for the background
dark.color=rgb(35, 35, 58, max=255) # dark color
library(ggplot2)
library(ggridges)
library(forcats)
#using only exra positive cases
p=ggplot(data_m, aes(x =data_m$LR_Probabilities_wo.bp.x, y = data_m$Final_CAP1, fill=data_m$Final_CAP1)) +
geom_density_ridges( alpha = 0.6) +
labs(title ="Figure 1. Predictions of the Logit model with extra positive cases", x='LR without bullet points', y='CAP major category') +
theme_ridges(font_size = 12, grid = TRUE) + theme(legend.title = element_blank(),
plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=10, face="italic"),
axis.title.y = element_text(size = 8, face='italic', hjust = 0.5),
axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'),
legend.text=element_blank())
# coord_cartesian(ylim = c(1, 50), clip = 'off')
p <- p + theme(legend.position = "none")
print(p)
## Warning: Use of `data_m$LR_Probabilities_wo.bp.x` is discouraged. Use
## `LR_Probabilities_wo.bp.x` instead.
## Warning: Use of `data_m$Final_CAP1` is discouraged. Use `Final_CAP1` instead.
## Warning: Use of `data_m$Final_CAP1` is discouraged. Use `Final_CAP1` instead.
## Picking joint bandwidth of 0.0691
## Warning: Removed 102 rows containing non-finite values (stat_density_ridges).
##let’s take a look what happens when the expanded sample of both extra + and extra - candidates is used
pn=ggplot(data_m, aes(x =data_m$LR_Probabilities_wo.bp.y, y = data_m$Final_CAP1, fill=data_m$Final_CAP1)) +
geom_density_ridges( alpha = 0.6) +
labs(title ="Figure 2. Predictions of the Logit model extra + and - cases", x='LR without bullet points', y='CAP major category') +
theme_ridges(font_size = 12, grid = TRUE) + theme(legend.title = element_text(color=dark.color, size=10),
plot.title = element_text(color=dark.color, size=10, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=10, face="italic"),
axis.title.y = element_text(size = 8, face='italic', hjust = 0.5),
axis.title.x = element_text(color=dark.color, hjust=0.5, size=8, face='italic'),
legend.text=element_text(size=8), legend.key.size = unit(0.4, 'cm'))
# coord_cartesian(ylim = c(1, 50), clip = 'off')
pn <- pn + theme(legend.position = "none")
print(pn)
## Warning: Use of `data_m$LR_Probabilities_wo.bp.y` is discouraged. Use
## `LR_Probabilities_wo.bp.y` instead.
## Warning: Use of `data_m$Final_CAP1` is discouraged. Use `Final_CAP1` instead.
## Warning: Use of `data_m$Final_CAP1` is discouraged. Use `Final_CAP1` instead.
## Picking joint bandwidth of 0.0777
## Warning: Removed 119 rows containing non-finite values (stat_density_ridges).
##results are unclear==> timeseries might be more useful ==> First only extra positives
#get the year indicator
#quick rename of the messy variables
names(data_m)[names(data_m) == "interinstitutionalprocedurecodey"] <- "inter_year" # I will use the Year indicated in the procedure code
#names(data1)[names(data1) == "interinstitutionalprocedurecodey"] <- "inter_year"
data_m <- data_m %>% mutate(inter_year = replace(inter_year, startsWith(cod, "2018"), 2018)) %>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2017"), 2017))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2016"), 2016))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2015"), 2015)) %>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2014"), 2014))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2013"), 2013))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2012"), 2012)) %>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2019"), 2019))%>%
mutate(inter_year = replace(inter_year, startsWith(cod, "2020"), 2020))
#change double-years in the procedure year into Year indicated in the procedure name
data_m <- data_m %>% mutate(inter_year = ifelse(celexnumber == "32012D0243", 2010, inter_year))
data_m <- data_m %>% mutate(inter_year = ifelse(celexnumber == "32011R1232", 2008, inter_year))
head(data_m$inter_year)
## [1] "2005" "2006" "2006" "2007" "2007" "2007"
##with extra positives and negatives==> see below
data_ts_cap=subset(data_m, !is.na(LR_Probabilities_wo.bp.x ))
yearly_lrprobability_cap <- data_ts_cap %>%
group_by(inter_year, Final_CAP1) %>%
summarize(avg_prob_cap = mean(LR_Probabilities_wo.bp.x ))
## `summarise()` has grouped output by 'inter_year'. You can override using the `.groups` argument.
ts_pos= ggplot(data =subset(yearly_lrprobability_cap, !is.na(avg_prob_cap)),
aes(y=avg_prob_cap, x=inter_year, group=Final_CAP1, color=Final_CAP1)) +
geom_point() +
geom_line() +
theme_minimal()+
theme( plot.title = element_text(color=dark.color, size=9, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=9, face="italic"),
axis.title.y = element_text(size = 6, face='italic'),
axis.title.x = element_text(color=dark.color, hjust=0.5, size=6, face='italic'))+
labs(
title = "Figure 3. TS by CAP when only EXTRA Positive candidates used",
x = "Year the proposal tabled",
y = "Probality of Expanding the EU Competence"
)
ts_pos= ts_pos+facet_wrap(~Final_CAP1, ncol= 7, scales="fixed")+theme(strip.text.x = element_text(size =6, colour = "black"))
ts_pos=ts_pos+ theme(legend.position="none" ) # drop the positions
ticks=element_text(face ="italic", color = "black", size = 5)
ts_pos + scale_x_discrete(breaks=c("2010","2015","2020"),
labels=c("2010", "2015", "2020"))+theme(axis.text.x = ticks) # adjust the N and the size of the ticks
yearly_lrprobability_cap_y <- data_ts_cap %>%
group_by(inter_year, Final_CAP1) %>%
summarize(avg_prob_cap_y = mean(LR_Probabilities_wo.bp.y ))
## `summarise()` has grouped output by 'inter_year'. You can override using the `.groups` argument.
ts_posneg= ggplot(data =subset(yearly_lrprobability_cap_y, !is.na(avg_prob_cap_y)),
aes(y=avg_prob_cap_y, x=inter_year, group=Final_CAP1, color=Final_CAP1)) +
geom_point() +
geom_line() +
theme_minimal()+
theme( plot.title = element_text(color=dark.color, size=9, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=9, face="italic"),
axis.title.y = element_text(size = 6, face='italic'),
axis.title.x = element_text(color=dark.color, hjust=0.5, size=6, face='italic'))+
labs(
title = "Figure 4. TS by CAP when extra NEGATIVE & POSITIVE candidates used",
x = "Year the proposal tabled",
y = "Probality of Expanding the EU Competence"
)
ts_posneg= ts_posneg+facet_wrap(~Final_CAP1, ncol= 7, scales="fixed")+theme(strip.text.x = element_text(size =6, colour = "black"))
ts_posneg=ts_posneg+ theme(legend.position="none" ) # drop the positions
ticks=element_text(face ="italic", color = "black", size = 5)
ts_posneg+ scale_x_discrete(breaks=c("2010","2015","2020"),
labels=c("2010", "2015", "2020"))+theme(axis.text.x = ticks) # adjust the N and the size of the ticks
##Overview
Overall, it seems that there are minor changes but the patterns remain more or less the same. *The use of extra negative cases results in more clustering around 0.5 values (see change of the scale on Y-axis and the location of the maximums within scaled summaries). Here one should also keep in mind a substantive loss of precision of the prediction (will be summarized in the main overview file)
*The use of the scaling based on the updated sample of only positive case results in more precision and higher recall of the model, suggesting fewer false positive and false negative predictions. The TS and the distribution of the average probability indicate only minor alterations to the key patterns identified in the original scaling;
*However, there are still a couple of thing to note here: When the sample of the candidates enriched with only positive cases, the updated scaling soothes the changes in such areas as immigration, International affairs (positive spikes in the first round of scaling); labor and employment (a negative drop around 2010). Details to follow.
#subset the needed infor and get rid of the useless DFs in environment
#get maximum values for the first wave of scaling in
#table( data_m$summary_classif, data_m$cod)
#orig_posit=data_m[data_m$summary_classif==1, "cod" ]
data_m=data_m %>%
#Step1: replace with '1' predictions adn probailities for the cadidates from the original training sample: positive summaries
mutate(lr_predictions_wobp=replace(lr_predictions_wobp, summary_classif==1, 1)) %>%
mutate(lr_probabilities_wobp=replace(lr_probabilities_wobp, summary_classif==1, 1)) %>%
#Step1.2. now the negative summaries
mutate(lr_predictions_wobp=replace(lr_predictions_wobp, summary_classif==-1, 0)) %>%
mutate(lr_probabilities_wobp=replace(lr_probabilities_wobp, summary_classif==-1, 0)) %>%
#Step 2. now do the same replacing for the scaling2 (including only new + candidates). Start with positive
mutate(LR_Predictions_wo.bp.x=replace(LR_Predictions_wo.bp.x, summary_classif==1, 1)) %>%
mutate(LR_Probabilities_wo.bp.x=replace(LR_Probabilities_wo.bp.x, summary_classif==1, 1)) %>%
#Step 2.2 now scaling 2==> replacement for the original negative candidates
mutate(LR_Predictions_wo.bp.x=replace(LR_Predictions_wo.bp.x, summary_classif==-1, 0)) %>%
mutate(LR_Probabilities_wo.bp.x=replace(LR_Probabilities_wo.bp.x, summary_classif==-1, 0)) %>%
#Step3. Replacing initial + candidates in the prob. and predictions for the scaling 3(including neg and pos candidates)
mutate(LR_Predictions_wo.bp.y=replace(LR_Predictions_wo.bp.y, summary_classif==1, 1)) %>%
mutate(LR_Probabilities_wo.bp.y=replace(LR_Probabilities_wo.bp.y, summary_classif==1, 1)) %>%
#Step3.2. now scaling 3==> replacement for the original negative candidates
mutate(LR_Predictions_wo.bp.y=replace(LR_Predictions_wo.bp.y, summary_classif==-1, 0)) %>%
mutate(LR_Probabilities_wo.bp.y=replace(LR_Probabilities_wo.bp.y, summary_classif==-1, 0))
colnames(data_m)
## [1] "X" "cod"
## [3] "requests" "Trilogue_progress"
## [5] "EUROVOCdescriptor" "Subjectmatter"
## [7] "Macroeconomics" "Rights"
## [9] "Health" "Agriculture"
## [11] "Labour" "Education"
## [13] "Environment" "Energy"
## [15] "Immigration" "Transportation"
## [17] "LawCrime" "SocialPolicy"
## [19] "RegionalPolicy" "Market"
## [21] "Defence" "Technology"
## [23] "ForeignTrade" "InternationalRelations"
## [25] "EUgovernance" "PLWMTI"
## [27] "CultureMedia" "Fisheries"
## [29] "maxval" "maxvar1"
## [31] "maxval2" "maxvar2"
## [33] "maxval3" "maxvar3"
## [35] "maxval4" "maxvar4"
## [37] "maxval5" "maxvar5"
## [39] "CAP1" "CAP2"
## [41] "CAP3" "CAP4"
## [43] "cap1_count" "cap2_count"
## [45] "cap3_count" "cap4_count"
## [47] "Final_CAP1" "a"
## [49] "celexnumber" "form"
## [51] "comm" "nofterms_in_eurovoc"
## [53] "procedure" "inter_year"
## [55] "interinstitutionalprocedurecoden" "dateofdocument"
## [57] "dateofpublication" "documentyear"
## [59] "dateofeffect" "dateofendofvalidity"
## [61] "deadline" "inforceindicator"
## [63] "no_recitals" "no_articles"
## [65] "wordcount" "delegated"
## [67] "implementing" "proposal_date"
## [69] "oppcomms" "amendments"
## [71] "rcv_yes" "rcv_no"
## [73] "rcv_abst" "rcv_yes2"
## [75] "rcv_no2" "rcv_abst2"
## [77] "codification" "recast"
## [79] "repeal" "agreement"
## [81] "compromise" "com_1_rap_1_name"
## [83] "com_1_rap_1_euparty" "simplified_procedure"
## [85] "summary_dummy" "summary_classif"
## [87] "reference" "ep_amendments_tabled"
## [89] "ep_amendments_adopted" "cn_yes"
## [91] "cn_no" "cn_abst"
## [93] "proc_length_m" "proc_length_y"
## [95] "cn_pref_type" "package_deals"
## [97] "package_name" "package_code"
## [99] "agreementwithcom" "lr_predictions"
## [101] "lr_probabilities" "sif_lr_predictions"
## [103] "sif_lr_probs" "sif_pos_similarity"
## [105] "sif_neg_similarity" "lr_predictions_wobp"
## [107] "lr_probabilities_wobp" "example"
## [109] "validated" "completely_wrong_lrbp"
## [111] "completely_wrong_lrwobp" "training"
## [113] "capcode" "Unnamed..0_x"
## [115] "LR_Predictions_wo.bp.x" "LR_Probabilities_wo.bp.x"
## [117] "Unnamed..0_y" "SIF_LR_Predictions.x"
## [119] "SIF_LR_Probs.x" "SIF_Pos_Similarity.x"
## [121] "SIF_Neg_Similarity.x" "LR_SIF_Agree"
## [123] "LR_Predictions_wo.bp.y" "LR_Probabilities_wo.bp.y"
## [125] "SIF_LR_Predictions.y" "SIF_LR_Probs.y"
## [127] "SIF_Pos_Similarity.y" "SIF_Neg_Similarity.y"
data=data_m[c(2, 47, 51, 54, 113, 100:108, 115:128) ]
rm(data_f)
rm(full_file, joined_file, negolinks)
## Warning in rm(full_file, joined_file, negolinks): object 'joined_file' not found
## Warning in rm(full_file, joined_file, negolinks): object 'negolinks' not found
rm(data_ts_cap)
#rename variables in a straightforward way
#only extra positive candidates: ==> all marked with x
data=data %>%
rename(LR_Predictions_posit= LR_Predictions_wo.bp.x,
LR_Probabilities_posit=LR_Probabilities_wo.bp.x,
SIF_LR_Predictions_posit=SIF_LR_Predictions.x,
SIF_LR_Probs_posit= SIF_LR_Probs.x,
SIF_Pos_Similarity_posit=SIF_Pos_Similarity.x)
##same thing for the mix of the extra negative and extra positive candidates
data= data%>%
rename(LR_Predictions_NP= LR_Predictions_wo.bp.y,
LR_Probabilities_NP=LR_Probabilities_wo.bp.y,
SIF_LR_Predictions_NP=SIF_LR_Predictions.y,
SIF_LR_Probs_NP= SIF_LR_Probs.y,
SIF_Pos_Similarity_NP=SIF_Pos_Similarity.y)
# information on training summaries ==> 17 negative and 3 positive
data$LR_Predictions_NP[data$cod=="2019/0010(COD)"] =0
data$LR_Predictions_NP[data$cod=="2019/0009(COD)"]=0
data$LR_Predictions_NP[data$cod=="2016/0075(COD)"]=0
data$LR_Predictions_NP[data$cod=="2016/0034(COD)"]=0
data$LR_Predictions_NP[data$cod=="2016/0033(COD)"]=0
data$LR_Predictions_NP[data$cod=="2016/0029(COD)"]=0
data$LR_Predictions_NP[data$cod=="2015/0218(COD)"]=0
data$LR_Predictions_NP[data$cod=="2014/0279(COD)"]=0
data$LR_Predictions_NP[data$cod=="2014/0213(COD)"]=0
data$LR_Predictions_NP[data$cod=="2014/0164(COD)"]=0
data$LR_Predictions_NP[data$cod=="2013/0327(COD)"]=0
data$LR_Predictions_NP[data$cod=="2013/0303(COD)"]=0
data$LR_Predictions_NP[data$cod=="2013/0013(COD)"]=0
data$LR_Predictions_NP[data$cod=="2011/0002(COD)"]=0
data$LR_Predictions_NP[data$cod=="2009/0173(COD)"]=0
data$LR_Predictions_NP[data$cod=="2008/0009(COD)"]=0
data$LR_Predictions_NP[data$cod=="2015/0295(COD)"]=0
#positives candidates
data$LR_Predictions_NP[data$cod=="2015/0289(COD)"]=1
data$LR_Predictions_NP[data$cod=="2009/0077(COD)"]=1
data$LR_Predictions_NP[data$cod=="2009/0076(COD)"]=1
#same thing for the probabilities in the LR with neg&posit
data$LR_Probabilities_NP[data$cod=="2019/0010(COD)"] =0
data$LR_Probabilities_NP[data$cod=="2019/0009(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2016/0075(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2016/0034(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2016/0033(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2016/0029(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2015/0218(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2014/0279(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2014/0213(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2014/0164(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2013/0327(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2013/0303(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2013/0013(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2011/0002(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2009/0173(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2008/0009(COD)"]=0
data$LR_Probabilities_NP[data$cod=="2015/0295(COD)"]=0
#positives candidates
data$LR_Probabilities_NP[data$cod=="2015/0289(COD)"]=1
data$LR_Probabilities_NP[data$cod=="2009/0077(COD)"]=1
data$LR_Probabilities_NP[data$cod=="2009/0076(COD)"]=1
##make replacements for only positives
data$LR_Predictions_posit[data$cod=="2015/0289(COD)"]=1
data$LR_Predictions_posit[data$cod=="2009/0077(COD)"]=1
data$LR_Predictions_posit[data$cod=="2009/0076(COD)"]=1
#positive sample probabilities LR_Probabilities_posit
data$LR_Probabilities_posit[data$cod=="2015/0289(COD)"]=1
data$LR_Probabilities_posit[data$cod=="2009/0077(COD)"]=1
data$LR_Probabilities_posit[data$cod=="2009/0076(COD)"]=1
##plot by category side by side==> a bit of a pain, but doable and might be the only way to see any differences
# get the time-series subsamples and ondicators calculated
ts_set2<- data %>% ## only extra positive candidates
group_by(Final_CAP1, inter_year) %>%
summarize(avg_prob_posit = mean(LR_Probabilities_posit)) %>%
ungroup()
## `summarise()` has grouped output by 'Final_CAP1'. You can override using the `.groups` argument.
ts_set3=data %>% ## extra negative and positive
group_by(Final_CAP1, inter_year) %>%
summarize(avg_prob_NP= mean(LR_Probabilities_NP))
## `summarise()` has grouped output by 'Final_CAP1'. You can override using the `.groups` argument.
ts_set1= data%>% # original scaling
group_by(Final_CAP1, inter_year) %>%
summarize(avg_prob_orig=mean(lr_probabilities_wobp))
## `summarise()` has grouped output by 'Final_CAP1'. You can override using the `.groups` argument.
rm(ts_set)
## Warning in rm(ts_set): object 'ts_set' not found
p=ggplot()+
geom_line(data=subset(ts_set1, !is.na(avg_prob_orig)), aes(x=inter_year, y=avg_prob_orig , group=Final_CAP1, color="red"))+
geom_line(data=subset(ts_set2, !is.na(avg_prob_posit)), aes(x=inter_year, y=avg_prob_posit , group=Final_CAP1, color="darkgreen"))+
geom_line(data=subset(ts_set3, !is.na(avg_prob_NP)), aes(x=inter_year, y=avg_prob_NP , group=Final_CAP1 , color="blue"))+
theme_minimal()+
theme( plot.title = element_text(color=dark.color, size=9, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=9, face="italic"),
axis.title.y = element_text(size = 6, face='italic'),
axis.title.x = element_text(color=dark.color, hjust=0.5, size=6, face='italic'))+
labs(
title = " Figure 5. TS by CAP using al three scaling attempts",
x = "Year the proposal tabled",
y = "Probality of Expanding the EU Competence"
)
p=p + scale_colour_manual(values = c("red", "darkgreen", "blue"), labels=c("Original", "Extra +", "Extra + & -"))
p= p+ theme(legend.position="left" ) # drop the positions
ticks=element_text(face ="italic", color = "black", size = 5)
p=p+ scale_x_discrete(breaks=c("2010","2015","2020"),
labels=c("2010", "2015", "2020"))+theme(axis.text.x = ticks) # adjust the N and the size of the ticks
p+facet_wrap(~Final_CAP1, ncol= 7, scales="fixed")+theme(strip.text.x = element_text(size =6, colour = "black"))
## There are some cases which changed their ‘camp’: in a nut-shell some procedures were classified as, for instance, + in the first round of scaling, but turned out to be in a “-” groups after the second or third round of scaling. Here, I will take a look into what those cases and whether there is any specific pattern that emerges across them.
#coding the changes from W1 to W2. NOTA BENE: in W1 15 procedures were omitted==> hence the NA values
data=data%>%
mutate(W1W2_change=ifelse(lr_predictions_wobp==LR_Predictions_posit, 0, 1))
table(data$W1W2_change)
##
## 0 1
## 771 101
head(data$W1W2_change)
## [1] 1 0 0 0 0 0
##track down changes from W2 to W3
data=data%>%
mutate(W2W3_change=ifelse(LR_Predictions_posit==LR_Predictions_NP, 0, 1))
table(data$W2W3_change)
##
## 0 1
## 712 173
#track the change from W1 to W3
data=data%>%
mutate(W1W3_change=ifelse(lr_predictions_wobp==LR_Predictions_NP, 0, 1))
table(data$W1W3_change)
##
## 0 1
## 754 118
##labels makes our lives easier
data$W1W2_change=ordered(data$W1W2_change, levels=c(0, 1), labels=c("Same class", "Different class"))
data$W2W3_change=ordered(data$W2W3_change, levels=c(0, 1), labels=c("Same class", "Different class"))
data$W1W3_change=ordered(data$W1W3_change, levels=c(0, 1), labels=c("Same class", "Different class"))
table(data$W1W3_change)
##
## Same class Different class
## 754 118
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
tabyl(data, Final_CAP1, W1W2_change )
## Final_CAP1 Same class Different class NA_
## agriculture 29 8 1
## banking_market 196 21 0
## civil liberties 11 0 1
## culture 7 1 0
## defense 4 0 0
## education 1 1 0
## energy 17 1 1
## environment 66 5 1
## EU_governance 29 2 4
## fisheries 37 6 0
## foreign trade 63 1 1
## health 16 2 0
## immigration 41 12 1
## international_affairs 33 2 1
## labour and employment 28 4 0
## law_and_crime 28 2 4
## macroeconomics 28 4 0
## regional policy 9 3 1
## social policy 6 3 0
## technology 71 9 1
## transportation 51 14 0
# 4 procedures are omitted in W2 and W3 scaling
#all good and nice, but if you need==> look at %
#tabyl(data, Final_CAP1, W1W2_change )%>%
# adorn_percentages("col") %>%
#adorn_pct_formatting(digits = 1)
##### W2 vs W3
tabyl(data, Final_CAP1, W2W3_change )
## Final_CAP1 Same class Different class NA_
## agriculture 31 7 0
## banking_market 170 47 0
## civil liberties 11 1 0
## culture 6 2 0
## defense 4 0 0
## education 1 1 0
## energy 17 1 1
## environment 58 14 0
## EU_governance 31 4 0
## fisheries 33 10 0
## foreign trade 61 4 0
## health 15 3 0
## immigration 42 12 0
## international_affairs 31 5 0
## labour and employment 28 4 0
## law_and_crime 29 3 2
## macroeconomics 22 10 0
## regional policy 11 2 0
## social policy 5 4 0
## technology 65 15 1
## transportation 41 24 0
##W1 vs W3
tabyl(data, Final_CAP1, W1W3_change)
## Final_CAP1 Same class Different class NA_
## agriculture 34 3 1
## banking_market 179 38 0
## civil liberties 10 1 1
## culture 7 1 0
## defense 4 0 0
## education 2 0 0
## energy 18 0 1
## environment 58 13 1
## EU_governance 27 4 4
## fisheries 39 4 0
## foreign trade 61 3 1
## health 17 1 0
## immigration 43 10 1
## international_affairs 32 3 1
## labour and employment 28 4 0
## law_and_crime 29 1 4
## macroeconomics 26 6 0
## regional policy 11 1 1
## social policy 8 1 0
## technology 70 10 1
## transportation 51 14 0
##The question is how much difference there are substantively in the level of prediction if we look beyond dummy classification==> lets calculate differences by procedures between the waves of predictions using the continuous probability indicator
data=data %>%
mutate(
pr_diff_w1w2= data$lr_probabilities_wobp - data$LR_Probabilities_posit,
pr_diff_w3w2= data$LR_Probabilities_NP - data$LR_Predictions_posit,
pr_diff_w1w3= data$lr_probabilities_wobp - data$LR_Probabilities_NP)
#subset only cases when the change occurred
w1w2=subset(data, W1W2_change=="Different class" )
w2w3=subset(data, W2W3_change=="Different class" )
w1w3=subset(data, W1W3_change=="Different class" )
##brewup
pinkish=rgb(1, 0, 0.6, 0.5)
greenish=rgb(0.2, 0.8, 0.6, 0.5)
kindaorange=rgb(0.8, 0.6, 0.6, 0.8)
#selecting extreme cases for labeling on the plot
w1w2$extreme=ifelse(w1w2$pr_diff_w1w2< -0.5, 1, 0)
table(w1w2$extreme)
##
## 0 1
## 99 2
# W1 vs W2==> group by cap category
w1=ggplot(w1w2, aes(x=pr_diff_w1w2, y=Final_CAP1, group=Final_CAP1)) +
geom_point(alpha=0.5, aes(color = cut(pr_diff_w1w2, c(-Inf, -0.5, -0.25, 0.25, Inf))) )+xlim(-1, 1)+
geom_text(data=w1w2[w1w2$extreme==1,] , aes(label=cod), size=2.5, hjust=-.2)+
scale_colour_manual(name="Difference",
values=c("(-Inf,-0.5]"=pinkish,
"(-0.5,-0.25]"=kindaorange,
"(-0.25,0.25]"=greenish,
"(0.25, Inf]"=kindaorange),
labels = c("Large difference", "Medium diffrende", "Similar", "Medium Difference"), na.translate=F)+
theme_minimal()+
theme( plot.title = element_text(color=dark.color, size=9, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=9),
axis.title.y = element_text(size = 9),
axis.title.x = element_text(color=dark.color, hjust=0.5, size=9))+
labs(
title = " Figure 6. Differeneces in scaling b/n W1 and W2 by CAP",
x = "Predicted probability",
y = " CAP category"
)
w1
w1w3$extreme=ifelse(w1w3$pr_diff_w1w3> 0.5 | w1w3$pr_diff_w1w3< -0.5 , 1, 0)
table(w1w3$extreme)
##
## 0 1
## 115 3
# W1 vs W2==> group by cap category
w3=ggplot(w1w3, aes(x=pr_diff_w1w3, y=Final_CAP1, group=Final_CAP1)) +
geom_point(aes(color = pr_diff_w1w3))+
xlim(-1, 1)+
geom_text(data=w1w3[w1w3$extreme==1,] , aes(label=cod), size=2.5, vjust=-0.2, hjust=+0.3)+
theme_minimal()+
theme( plot.title = element_text(color=dark.color, size=9, face="italic", hjust=0.5),
plot.subtitle = element_text(color=dark.color, size=9),
axis.title.y = element_text(size = 9),
axis.title.x = element_text(color=dark.color, hjust=0.5, size=9),
legend.text=element_text(size=8), legend.key.size = unit(0.4, 'cm'))+
labs(col="Difference",
title = " Figure 7. Differeneces in scaling b/n W1 and W3 by CAP",
x = "Predicted probability",
y = " CAP category"
)
w3
##What do the figues above tell us:
Even among those procedures that ended up being classified in a different group based on the dichotomous measurement, only a few observations have somewhat pronounced differences.
There are only a few cases which show substantial difference in the scaling value across difference scaling-models. When scrutinizing the differences between W1 vis-a-vis W2, and W1 vis-a-vis W3, two procedures from Banking and market, and AGRI consistently stand out. It appears that they were estimated to have much less impetus for extension of the EU authority in comparison to the W2 and W3.
Unpacking this further: essentially the two outlying procedures (aka 2009/0076(COD) and 2009/0077(COD)) were picked as ideal candidates(+) and were included into the training set for W2 and W3.
The + outlying observation in Figure 7, is a an example of the new (-) candidate summary which was identified during validation and added into the training set for W3.