knitr::opts_chunk$set(warning = FALSE)
knitr::opts_knit$set(root.dir = "C:/Users/nasta/Dropbox/____Nordface_POst_doc/FULL_Texts_Scaling/annotations/Annotations_TRILOGUES/Output_file/Round2")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(foreign)
library(ggplot2)
library(stringr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble  3.1.7     ✔ purrr   0.3.4
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(readr)
library(purrr)

library(tidycomm)
## Warning: package 'tidycomm' was built under R version 4.2.1
library(readxl)
require(lpSolve)
## Loading required package: lpSolve
stas_main=read_excel("C:/Users/nasta/Dropbox/____Nordface_POst_doc/FULL_Texts_Scaling/annotations/Annotations_TRILOGUES/Output_file/Round2/Round2_Stas_completed_10082022.xlsx")

table(stas_main$stas_eu_com)
## 
##   0   1 888 999 
##  66 125 351 215
table(stas_main$stas_eu_cn)
## 
##   0   1 888 999 
##  98 138 268 253
table(stas_main$stas_com_cn_change)
## 
##   1   2   3   4 888 999 
##   8  34 138   8 355 214
#load second coded Stas' sample
stas_checkd=read_excel("C:/Users/nasta/Dropbox/____Nordface_POst_doc/FULL_Texts_Scaling/annotations/Annotations_TRILOGUES/Output_file/Round2/R2_Stas_secondcoded_annotations_10082022.xlsx")
table(stas_checkd$sasha_eu_com)
## 
##   0   1 888 999 
##   3  36  51  60
table(stas_checkd$sasha_eu_cn)
## 
##   0   1 888 999 
##   7  43  30  70
table(stas_checkd$sasha_com_cn_change)
## 
##   1   2   3   4 888 999 
##   2   4  30   2  52  60
#load Sasha's main file

sasha_main=read_excel("C:/Users/nasta/Dropbox/____Nordface_POst_doc/FULL_Texts_Scaling/annotations/Annotations_TRILOGUES/Output_file/Round2/Round2_Sasha_completed_10082022.xlsx")

table(sasha_main$sasha_eu_com) 
## 
##   0   1 888 999 
##  49 105 311 272
table(sasha_main$sasha_eu_cn)
## 
##   0   1 888 999 
##  68 113 216 340
table(sasha_main$sasha_com_cn_change)
## 
##   1   2   3   4   5 888 999 
##   8  15 116   2   8 313 275
#load second coded Sasha's file
sasha_checkd= read_excel("C:/Users/nasta/Dropbox/____Nordface_POst_doc/FULL_Texts_Scaling/annotations/Annotations_TRILOGUES/Output_file/Round2/R2_secondcode_Sasha_05082022.xlsx")
## New names:
## • `` -> `...19`
table(sasha_checkd$stas_eu_com)
## 
##   0   1 888 999 
##  10  26 112  40
table(sasha_checkd$stas_eu_cn)
## 
##   0   1 888 999 
##  13  31  96  48
table(sasha_checkd$stas_com_cn_change)
## 
##   1   2   3   4 888 999 
##   1   6  28   1 112  40
sasha_checkd$`...19`=NULL

merge the files

# merge all stats related files

stas_all=left_join(stas_main, stas_checkd)
## Joining, by = c("index", "cod", "proposal_year", "index2", "identifier",
## "CAP_str", "article", "paragraph", "article_names", "COM", "EP", "Council",
## "Comments")
rm(stas_checkd, stas_main)
 stas_all=stas_all %>% mutate(sasha_eu_com=ifelse(identifier=='2011/0299(COD)_93', 999, sasha_eu_com), sasha_eu_cn=ifelse(identifier=='2011/0299(COD)_93', 999, sasha_eu_cn), sasha_com_cn_change=ifelse(identifier=='2011/0299(COD)_93', 999, sasha_com_cn_change))
#merge all sasha's files

sasha_all=left_join(sasha_main, sasha_checkd)
## Joining, by = c("index", "cod", "proposal_year", "index2", "identifier",
## "CAP_str", "COM", "EP", "Council", "Comments", "article", "paragraph",
## "article_names", "STRTHR", "full_cell_deleted")
rm(sasha_checkd, sasha_main)

sasha_all$...19=NULL
sasha_all$STRTHR=NULL
sasha_all$full_cell_deleted=NULL

sasha_all$article=as.character(sasha_all$article)
#cross merge
round_2_doubecoded=bind_rows(stas_all, sasha_all)


#select the double coded

double_coded=round_2_doubecoded %>% filter(!is.na(sasha_eu_com)) %>% filter(!is.na(stas_eu_com)) %>%
   
     mutate(stas_eu_com=ifelse(stas_eu_com=="XXX", 888, stas_eu_com ),
           
           sasha_eu_com= ifelse(sasha_eu_com=="xxx", 888, sasha_eu_com ),  sasha_eu_com= ifelse(sasha_eu_com=="XXX", 888, sasha_eu_com ),
           
            stas_eu_cn= ifelse(stas_eu_cn=="XXX", 888, stas_eu_cn ), 
           
           stas_com_cn_change= ifelse(stas_com_cn_change=="XXX", 888, stas_com_cn_change),
           
           sasha_eu_cn= ifelse( sasha_eu_cn=="xxx", 888,  sasha_eu_cn ),  
            sasha_eu_cn= ifelse( sasha_eu_cn=="x", 888,  sasha_eu_cn ),  sasha_eu_cn= ifelse( sasha_eu_cn=="XXX", 888,  sasha_eu_cn ),
           
           sasha_com_cn_change= ifelse(sasha_com_cn_change=="xxx", 888, sasha_com_cn_change ),   sasha_com_cn_change= ifelse(sasha_com_cn_change=="XXX", 888, sasha_com_cn_change ))  


double_coded=double_coded%>%
    mutate_at(c( 'stas_com_cn_change',  'sasha_com_cn_change', 'stas_eu_com', 'sasha_eu_com', 'stas_eu_cn', 'sasha_eu_cn'), as.numeric)

write.csv(double_coded, file="C:/Users/nasta/Dropbox/____Nordface_POst_doc/FULL_Texts_Scaling/annotations/Annotations_TRILOGUES/Output_file/Round2/Round2_disagreements.csv", na="", row.names = FALSE)




double_coded=double_coded %>% mutate(com_disagr=ifelse(stas_eu_com==sasha_eu_com, 0, 1), 
                                     cn_disagr=ifelse(stas_eu_cn==sasha_eu_cn, 0, 1), 
                                     change_disagree=ifelse(stas_com_cn_change==sasha_com_cn_change, 0, 1 ))

#transform into long format as needed

#com
com_long=double_coded %>%select(-c(stas_eu_cn, stas_com_cn_change, sasha_eu_cn, sasha_com_cn_change)) %>% 
                    pivot_longer(cols= ends_with("_eu_com"), names_to = 'coder_id', values_to = 'eu_com' )

#cn
council_long=double_coded %>%select(-c(stas_com_cn_change,  sasha_com_cn_change, stas_eu_com, sasha_eu_com)) %>%
                 pivot_longer(cols= ends_with("_eu_cn"), names_to = 'coder_id', values_to = 'eu_council' )

#change

change_long=double_coded %>%select(-c(stas_eu_cn,  sasha_eu_cn, stas_eu_com, sasha_eu_com)) %>%
                pivot_longer(cols= ends_with("_change"), names_to = 'coder_id', values_to = 'change' )

Run reliability check

#Commission

com_long %>%
        test_icr(identifier, coder_id, eu_com, )
## # A tibble: 1 × 8
##   Variable n_Units n_Coders n_Categories Level   Agreement Holstis_CR
##   <chr>      <int>    <int>        <int> <chr>       <dbl>      <dbl>
## 1 eu_com       288        2            4 nominal     0.844      0.844
## # … with 1 more variable: Krippendorffs_Alpha <dbl>

#Council

council_long %>%
        test_icr(identifier, coder_id, eu_council )
## # A tibble: 1 × 8
##   Variable   n_Units n_Coders n_Categories Level   Agreement Holstis_CR
##   <chr>        <int>    <int>        <int> <chr>       <dbl>      <dbl>
## 1 eu_council     288        2            4 nominal     0.792      0.792
## # … with 1 more variable: Krippendorffs_Alpha <dbl>

#change

change_long%>%test_icr(identifier, coder_id, change , levels = c(change = 'interval')) 
## # A tibble: 1 × 8
##   Variable n_Units n_Coders n_Categories Level    Agreement Holstis_CR
##   <chr>      <int>    <int>        <int> <chr>        <dbl>      <dbl>
## 1 change       288        2            6 interval     0.816      0.816
## # … with 1 more variable: Krippendorffs_Alpha <dbl>

The reconciled version of the double coding and reliabiltiy checks

#load the reconciled version

reconciled=read_excel("Round2_disagreements_reconciled_11082022.xlsx")
## New names:
## • `` -> `...23`
## • `` -> `...25`
## • `` -> `...27`
## • `` -> `...28`
## • `` -> `...29`
## • `` -> `...30`
## • `` -> `...31`
## • `` -> `...32`
recon=c(reconciled$identifier)

double_c=c(double_coded$identifier)

setdiff(recon, double_c)
## character(0)
rm(double_c)
# remove useless columns
colnames(reconciled)
##  [1] "index"               "cod"                 "proposal_year"      
##  [4] "index2"              "identifier"          "CAP_str"            
##  [7] "article"             "paragraph"           "article_names"      
## [10] "COM"                 "EP"                  "Council"            
## [13] "Comments"            "stas_eu_com"         "stas_eu_cn"         
## [16] "stas_com_cn_change"  "sasha_eu_com"        "sasha_eu_cn"        
## [19] "sasha_com_cn_change" "com_disagr"          "cn_disagr"          
## [22] "change_disagree"     "...23"               "Relevant"           
## [25] "...25"               "new code_ea"         "...27"              
## [28] "...28"               "...29"               "...30"              
## [31] "...31"               "...32"
reconciled=reconciled %>% select(-c(20:32)) %>% 
                            mutate_at('article', as.character)

#exclude the observations that are double coded from the main list

round_2_doubecoded=round_2_doubecoded %>% 
                    filter(!identifier  %in% recon )


#replace xxx into 888 in the main file
round_2_doubecoded=round_2_doubecoded  %>%
     mutate(stas_eu_com=ifelse(stas_eu_com=="XXX", 888, stas_eu_com ),
           
           sasha_eu_com= ifelse(sasha_eu_com=="xxx", 888, sasha_eu_com ),  sasha_eu_com= ifelse(sasha_eu_com=="XXX", 888, sasha_eu_com ),
           
            stas_eu_cn= ifelse(stas_eu_cn=="XXX", 888, stas_eu_cn ), 
           
           stas_com_cn_change= ifelse(stas_com_cn_change=="XXX", 888, stas_com_cn_change),
           
           sasha_eu_cn= ifelse( sasha_eu_cn=="xxx", 888,  sasha_eu_cn ),  
            sasha_eu_cn= ifelse( sasha_eu_cn=="x", 888,  sasha_eu_cn ),  sasha_eu_cn= ifelse( sasha_eu_cn=="XXX", 888,  sasha_eu_cn ),
           
           sasha_com_cn_change= ifelse(sasha_com_cn_change=="xxx", 888, sasha_com_cn_change ),   sasha_com_cn_change= ifelse(sasha_com_cn_change=="XXX", 888, sasha_com_cn_change ))  %>%
    mutate_at(c( 'stas_com_cn_change',  'sasha_com_cn_change', 'stas_eu_com', 'sasha_eu_com', 'stas_eu_cn', 'sasha_eu_cn'), as.numeric)

                        
#add the recolnciled cases in

round_2_final=bind_rows(round_2_doubecoded, reconciled)

#rename the variables 
round_2_final=round_2_final %>% 
                            rename("commission1"="stas_eu_com", 
                                   "council1"= "stas_eu_cn"     ,
                                  " change1"="stas_com_cn_change" , 
                                 "commission2"="sasha_eu_com"  , 
                                "council2"= "sasha_eu_cn"   , 
                                "change2"="sasha_com_cn_change" )


#file to send to Fabian
write.csv(round_2_final, file='C:/Users/nasta/Dropbox/____Nordface_POst_doc/FULL_Texts_Scaling/annotations/Annotations_TRILOGUES/Output_file/Round2/Round2_annotations_complete_11082022.csv', na='', row.names = FALSE)

rm(change_long, com_long, council_long, recon, double_coded, sasha_all, stas_all)

rm(round_2_doubecoded)
#com
com_long=reconciled %>%select(-c(stas_eu_cn, stas_com_cn_change, sasha_eu_cn, sasha_com_cn_change)) %>% 
                    pivot_longer(cols= ends_with("_eu_com"), names_to = 'coder_id', values_to = 'eu_com' )

#cn
council_long=reconciled %>%select(-c(stas_com_cn_change,  sasha_com_cn_change, stas_eu_com, sasha_eu_com)) %>%
                 pivot_longer(cols= ends_with("_eu_cn"), names_to = 'coder_id', values_to = 'eu_council' )

#change

change_long=reconciled %>%select(-c(stas_eu_cn,  sasha_eu_cn, stas_eu_com, sasha_eu_com)) %>%
                pivot_longer(cols= ends_with("_change"), names_to = 'coder_id', values_to = 'change' )
com_long %>%
        test_icr(identifier, coder_id, eu_com, )
## # A tibble: 1 × 8
##   Variable n_Units n_Coders n_Categories Level   Agreement Holstis_CR
##   <chr>      <int>    <int>        <int> <chr>       <dbl>      <dbl>
## 1 eu_com       288        2            4 nominal     0.906      0.906
## # … with 1 more variable: Krippendorffs_Alpha <dbl>
council_long %>%
        test_icr(identifier, coder_id, eu_council )
## # A tibble: 1 × 8
##   Variable   n_Units n_Coders n_Categories Level   Agreement Holstis_CR
##   <chr>        <int>    <int>        <int> <chr>       <dbl>      <dbl>
## 1 eu_council     288        2            4 nominal     0.878      0.878
## # … with 1 more variable: Krippendorffs_Alpha <dbl>
change_long%>%test_icr(identifier, coder_id, change , levels = c(change = 'interval'))
## # A tibble: 1 × 8
##   Variable n_Units n_Coders n_Categories Level    Agreement Holstis_CR
##   <chr>      <int>    <int>        <int> <chr>        <dbl>      <dbl>
## 1 change       288        2            6 interval     0.896      0.896
## # … with 1 more variable: Krippendorffs_Alpha <dbl>