CA

This is a correspondence analysis (CA) using 16 terms. The terms were chosen based on their theoretical relevance to the EU dimension as well as on how frequently they appear in the titles of the legislative acts. In the next steps, the number of terms will be reduced step by step.

#install.packages(c("FactoMineR", "factoextra", "dplyr","ggplot2")) 
#install.packages('knitr')

#install.packages(c('ggpubr', 'tidyr', 'gplots', 'foreign', 'MASS', 'ade4', 'ca', 'ExPosition'))
library(FactoMineR)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggpubr)
library(tidyr)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(foreign)
library(knitr)
# Machine-specific absolute path. The relative file names used further down
# ("CA_16.csv", "CA16.pdf", ...) are presumably meant to resolve against this
# folder, so the analysis only runs where this directory exists.
setwd("C:/Users/nasta/Dropbox/____Nordface_POst_doc/CA")

Import csv, and adjust var names

# Read the term data for the 16-term analysis; text columns are kept as
# character vectors instead of being converted to factors.
legis <- read.csv("CA_16.csv", stringsAsFactors = FALSE)
head(legis)
##   procedure_ref1     key value
## 1 2007/0112(COD) f_amend     1
## 2 2008/0028(COD) f_amend     1
## 3 2008/0147(COD) f_amend     1
## 4 2008/0157(COD) f_amend     1
## 5 2008/0183(COD) f_amend     1
## 6 2008/0196(COD) f_amend     1
# Give the procedure reference column the shorter name "id".
colnames(legis)[colnames(legis) == "procedure_ref1"] <- "id"

Setting up the table for the CA analysis

# Cross-tabulate procedures (rows) against terms (columns) for the CA.
legis_table <- table(legis$id, legis$key)

summary(legis_table)
## Number of cases in table: 2007 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 9479, df = 13408, p-value = 1
##  Chi-squared approximation may be incorrect
# Keep only the terms (columns) that occur at least once. The earlier
# condition, colSums(...) >= 0, is TRUE for every column of a count table and
# therefore never removed anything. drop = FALSE keeps the result
# two-dimensional even if a single column survives the filter.
legis_table <- legis_table[, colSums(legis_table) > 0, drop = FALSE]

Correspondence analysis with all 16 terms.

# Fit the correspondence analysis on the procedure-by-term table, using the
# defaults of CA().
CA_legis <- CA(legis_table)

# Print the eigenvalue, row and column summaries of the fit.
summary(CA_legis)
## 
## Call:
## CA(X = legis_table) 
## 
## The chi square of independence between the two variables is equal to 9479.07 (p-value =  1 ).
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6   Dim.7
## Variance               0.409   0.385   0.358   0.352   0.338   0.330   0.322
## % of var.              8.663   8.152   7.586   7.443   7.150   6.995   6.821
## Cumulative % of var.   8.663  16.815  24.401  31.844  38.994  45.989  52.810
##                        Dim.8   Dim.9  Dim.10  Dim.11  Dim.12  Dim.13  Dim.14
## Variance               0.300   0.285   0.281   0.269   0.255   0.245   0.240
## % of var.              6.360   6.035   5.947   5.702   5.395   5.184   5.075
## Cumulative % of var.  59.170  65.205  71.152  76.854  82.249  87.433  92.508
##                       Dim.15  Dim.16
## Variance               0.209   0.145
## % of var.              4.418   3.074
## Cumulative % of var.  96.926 100.000
## 
## Rows (the 10 first)
##                       Iner*1000    Dim.1    ctr   cos2    Dim.2    ctr   cos2  
## 2005/0214(COD)      |     0.692 |  0.089  0.001  0.006 |  0.058  0.000  0.002 |
## 2006/0084(COD)      |     2.111 |  0.252  0.015  0.030 |  0.376  0.037  0.067 |
## 2006/0167(COD)      |     2.111 |  0.252  0.015  0.030 |  0.376  0.037  0.067 |
## 2007/0112(COD)      |    66.378 | -0.457  0.076  0.005 | -0.129  0.006  0.000 |
## 2007/0152(COD)      |    99.599 | -0.563  0.077  0.003 | -0.084  0.002  0.000 |
## 2007/0229(COD)      |    12.419 | -0.203  0.010  0.003 | -0.407  0.043  0.013 |
## 2007/0286(COD)      |     0.692 |  0.089  0.001  0.006 |  0.058  0.000  0.002 |
## 2008/0009(COD)      |     0.692 |  0.089  0.001  0.006 |  0.058  0.000  0.002 |
## 2008/0028(COD)      |     5.982 |  0.001  0.000  0.000 | -0.187  0.018  0.012 |
## 2008/0062(COD)      |    11.227 | -0.319  0.025  0.009 |  0.606  0.095  0.033 |
##                      Dim.3    ctr   cos2  
## 2005/0214(COD)       0.050  0.000  0.002 |
## 2006/0084(COD)      -0.682  0.129  0.219 |
## 2006/0167(COD)      -0.682  0.129  0.219 |
## 2007/0112(COD)       1.756  1.286  0.069 |
## 2007/0152(COD)       2.315  1.491  0.054 |
## 2007/0229(COD)      -0.319  0.028  0.008 |
## 2007/0286(COD)       0.050  0.000  0.002 |
## 2008/0009(COD)       0.050  0.000  0.002 |
## 2008/0028(COD)       0.361  0.072  0.043 |
## 2008/0062(COD)       0.285  0.023  0.007 |
## 
## Columns (the 10 first)
##                       Iner*1000     Dim.1     ctr    cos2     Dim.2     ctr
## f_agency            |   273.634 |  -0.293   0.293   0.004 |   0.309   0.346
## f_amend             |   175.934 |  -0.156   1.226   0.029 |  -0.137   0.994
## f_author            |   254.598 |  -0.438   0.678   0.011 |   0.252   0.238
## f_border            |   315.397 |  -0.464   1.128   0.015 |   0.716   2.854
## f_common            |   283.773 |  -0.316   0.475   0.007 |  -0.542   1.482
## f_commun            |   292.395 |  -0.224   0.361   0.005 |  -0.539   2.221
## f_establish         |   245.070 |  -0.389   2.597   0.043 |   0.000   0.000
## f_extend            |   330.842 |  -0.777   0.368   0.005 |  -0.140   0.013
## f_framework         |   290.183 |  -0.504   1.208   0.017 |  -0.544   1.494
## f_fund              |   288.674 |  -0.307   0.599   0.008 |  -0.545   2.002
##                        cos2     Dim.3     ctr    cos2  
## f_agency              0.005 |  -1.524   9.042   0.118 |
## f_amend               0.022 |   0.381   8.337   0.170 |
## f_author              0.004 |   0.316   0.403   0.006 |
## f_border              0.035 |   0.312   0.582   0.007 |
## f_common              0.020 |  -0.411   0.918   0.012 |
## f_commun              0.029 |   0.313   0.806   0.010 |
## f_establish           0.000 |  -0.926  16.812   0.246 |
## f_extend              0.000 |   2.742   5.227   0.057 |
## f_framework           0.020 |  -0.699   2.648   0.033 |
## f_fund                0.027 |   0.931   6.265   0.078 |

To assess the results of the analysis, we should consider the size of the eigenvalues, which indicate how much variance is explained by the constructed dimensions. The table below shows the eigenvalue of each dimension as well as the percentage of variance it explains (column 2).

get_eigenvalue(CA_legis) # these values allow to determine the number of axes to be considered
##        eigenvalue variance.percent cumulative.variance.percent
## Dim.1   0.4091316         8.662527                    8.662527
## Dim.2   0.3850301         8.152228                   16.814755
## Dim.3   0.3582809         7.585868                   24.400623
## Dim.4   0.3515437         7.443221                   31.843844
## Dim.5   0.3376856         7.149805                   38.993649
## Dim.6   0.3303959         6.995459                   45.989108
## Dim.7   0.3221592         6.821064                   52.810172
## Dim.8   0.3003927         6.360204                   59.170376
## Dim.9   0.2850266         6.034857                   65.205233
## Dim.10  0.2808809         5.947082                   71.152314
## Dim.11  0.2693077         5.702041                   76.854355
## Dim.12  0.2548095         5.395071                   82.249426
## Dim.13  0.2448325         5.183829                   87.433256
## Dim.14  0.2396795         5.074725                   92.507981
## Dim.15  0.2086571         4.417889                   96.925870
## Dim.16  0.1451913         3.074130                  100.000000

To visualize how much variance each dimension explains, I construct the plot below. It shows that overall the % of the variance explained even by the first two dimensions is rather low.

fviz_eig(CA_legis, addlabels = TRUE, ylim = c(0, 35))

Next, to assess the result of the CA, I check the degree to which a) each term is represented by the constructed dimensions (see cos2 for cols), and b) the row points/procedures are represented by the dimensions (see cos2 for rows).

# Extract the column (term) results of the 16-term CA and list what they hold.
col <- get_ca_col(CA_legis)
col
## Correspondence Analysis - Results for columns
##  ===================================================
##   Name       Description                   
## 1 "$coord"   "Coordinates for the columns" 
## 2 "$cos2"    "Cos2 for the columns"        
## 3 "$contrib" "contributions of the columns"
## 4 "$inertia" "Inertia of the columns"
# Column map: terms coloured by their cos2, labels repelled.
fviz_ca_col(
  CA_legis,
  col.col = "cos2",
  repel = TRUE,
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

# cos2 of the terms over axes 1-2: shows which terms are well represented on
# the factor map.
fviz_cos2(CA_legis, axes = 1:2, choice = "col")

# Row map: procedures coloured by their cos2, label repelling switched off.
fviz_ca_row(
  CA_legis,
  col.row = "cos2",
  repel = FALSE,
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

I will use the extent of the terms’ contributions to the dimensions to narrow down the number of the terms used for the analysis

CA_legis$col$contrib  #it is evident that some of the terms have almost no contribution to the dimensions(e.g. intergovernmental).
##                   Dim 1        Dim 2       Dim 3      Dim 4        Dim 5
## f_agency     0.29315509 3.457412e-01  9.04227717  1.5389855 1.094401e-02
## f_amend      1.22600533 9.942714e-01  8.33719147  2.2411150 9.727815e-02
## f_author     0.67772145 2.376314e-01  0.40327648  3.9039429 2.072042e+00
## f_border     1.12830659 2.853784e+00  0.58205654  9.6563014 4.422922e+01
## f_common     0.47486522 1.481788e+00  0.91801102  6.8146323 4.126247e+00
## f_commun     0.36078244 2.220547e+00  0.80632646  2.4099280 1.664341e+01
## f_establish  2.59728153 2.411197e-09 16.81183408  1.7375666 3.504594e-02
## f_extend     0.36794982 1.264677e-02  5.22675597 31.2923346 9.393129e+00
## f_framework  1.20841544 1.494437e+00  2.64760514  4.8842639 1.206444e+00
## f_fund       0.59860625 2.002128e+00  6.26489180  8.6973727 8.037292e+00
## f_harmon    88.38152488 1.252654e+00  0.02956324  1.2291981 1.579688e-01
## f_intergov   0.07161469 1.230078e-01  8.44965685  0.1739962 4.800452e-04
## f_lay_down   0.14637911 3.053389e+00  2.50362464  4.5782034 7.492973e+00
## f_noterm     0.32890883 1.421797e-01  0.10257556  0.1498277 1.601493e-01
## f_provi      0.13826581 3.519798e+00 10.07820598 12.2978461 1.848993e+00
## f_recogn     0.29047062 7.549207e+01  7.99342988  3.9825654 4.488131e+00
## f_repeal     1.70974692 4.773928e+00 19.80271773  4.4119204 2.565510e-04
fviz_contrib(CA_legis, choice = "col", axes = 1:2)

let’s see how specific rows(aka procedures) contribute to the dimensions

head(CA_legis$row$contrib, 20)  # this can be plotted as well, but with the large N of observation the plot becomes quite useless
##                       Dim 1        Dim 2        Dim 3        Dim 4        Dim 5
## 2005/0214(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2006/0084(COD) 1.546150e-02 0.0366062180 0.1293139035 0.0261299749 0.0002377769
## 2006/0167(COD) 1.546150e-02 0.0366062180 0.1293139035 0.0261299749 0.0002377769
## 2007/0112(COD) 7.630698e-02 0.0064638272 1.2860816119 6.3547295743 1.7763112965
## 2007/0152(COD) 7.729255e-02 0.0018050465 1.4905481913 8.9967021038 2.7258577293
## 2007/0229(COD) 1.002197e-02 0.0429731907 0.0282879208 0.2328972379 0.1436385361
## 2007/0286(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2008/0009(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2008/0028(COD) 9.148977e-07 0.0181807926 0.0723367788 0.2530990327 0.0410250212
## 2008/0062(COD) 2.471151e-02 0.0951088956 0.0226493495 0.3376554791 1.5647462727
## 2008/0098(COD) 3.151594e+00 0.0750629299 0.0007443627 0.0293371318 0.0675445211
## 2008/0142(COD) 2.471151e-02 0.0951088956 0.0226493495 0.3376554791 1.5647462727
## 2008/0147(COD) 1.476827e-03 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2008/0157(COD) 1.476827e-03 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2008/0183(COD) 1.476827e-03 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2008/0192(COD) 1.546150e-02 0.0366062180 0.1293139035 0.0261299749 0.0002377769
## 2008/0196(COD) 2.734045e-03 0.0122124322 0.0244619563 0.0036200442 0.0007760116
## 2008/0198(COD) 7.095760e-03 0.0777127196 0.0833212084 0.1298243770 0.2249664154
## 2008/0211(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2008/0222(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
tail(CA_legis$row$contrib, 20 )
##                       Dim 1        Dim 2        Dim 3        Dim 4        Dim 5
## 2019/0107(COD) 0.0807079270 0.0003489906 0.4241680831 0.0879182141 0.5728011067
## 2019/0108(COD) 0.0756150971 0.0036567348 0.3712795067 0.0725290211 0.6938013577
## 2019/0179(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2019/0180(COD) 0.0164410039 0.0467321077 0.2329680581 0.0982287089 0.1760547061
## 2019/0192(COD) 0.0009570469 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2020/0043(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0054(COD) 0.0164410039 0.0467321077 0.2329680581 0.0982287089 0.1760547061
## 2020/0058(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0059(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0060(COD) 0.0068163497 0.0898851250 0.3780226215 0.1900332606 0.0552289767
## 2020/0065(COD) 0.0009570469 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2020/0066(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0067(COD) 0.0286839587 0.0025626424 0.0684419857 0.1954503890 0.0559736946
## 2020/0068(COD) 0.0047896300 0.0205604063 0.1221009772 0.0039793075 0.4262028159
## 2020/0069(COD) 0.0304703119 0.1173007972 0.0095166374 0.0117973120 0.5066053742
## 2020/0071(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0075(COD) 0.0164410039 0.0467321077 0.2329680581 0.0982287089 0.1760547061
## 2020/0099(COD) 0.0070957596 0.0777127196 0.0833212084 0.1298243770 0.2249664154
## 2020/0113(COD) 0.0068163497 0.0898851250 0.3780226215 0.1900332606 0.0552289767
## 2020/0128(COD) 0.0009570469 0.0004396061 0.0003408325 0.0005073802 0.0005645901
# Describe dimensions 1 and 2 of the 16-term CA.
description16 <- dimdesc(CA_legis, axes = c(1, 2))

# Dimension 1: which variables and which rows correlate with it the most.
# description16[[1]]

# Dimension 2: same as above. Both printouts are left commented out because
# they produce a very long table, only useful when the row-by-dimension
# correlations really need to be inspected in detail.
# description16[[2]]

Here is the asymmetric plot, which allows us to see the association between the blue and red points (rows and columns, respectively, under the factoextra default colours).

# Asymmetric biplot (map = "rowprincipal") with arrows requested for both rows
# and columns; the row selection is restricted via contrib = 15.
fviz_ca_biplot(
  CA_legis,
  map = "rowprincipal",
  select.row = list(contrib = 15),
  arrow = c(TRUE, TRUE),
  repel = TRUE
)

##Save the scree and bi-plot for a broader analysis with 16 terms

# Keep the scree plot and the row/column biplot as objects so that both can be
# written to a single file.
scree.plot <- fviz_eig(CA_legis)
biplot.ca <- fviz_ca_biplot(CA_legis)

# Export both plots of the 16-term analysis.
ggexport(
  filename = "CA16.pdf",
  plotlist = list(scree.plot, biplot.ca)
)
## file saved to CA16.pdf

Having seen the extent to which different terms contribute to the underlying dimensions, several of them can be excluded. Relying on the output for the first two dimensions, I have narrowed the terms for the next-step CA down to 11. The chunks below run the CA with those 11 terms (TERMS INCLUDED: HARMONIZATION, AMENDING, BORDER, ESTABLISH, REPEAL, FRAMEWORK, PROVISIONS, RECOGNITION, FUND, LAYING DOWN & a category where no term from the list was found). NOTA BENE: in each analysis, the chi-square p-value (= 1) suggests a very cautious approach to the results!

# Point the session at the project folder and read the 11-term data; text
# columns stay character vectors.
setwd("C:/Users/nasta/Dropbox/____Nordface_POst_doc/CA")
cat11 <- read.csv("CA_11.csv", stringsAsFactors = FALSE)

head(cat11)
##   procedure_ref1     key value
## 1 2007/0112(COD) f_amend     1
## 2 2008/0028(COD) f_amend     1
## 3 2008/0147(COD) f_amend     1
## 4 2008/0157(COD) f_amend     1
## 5 2008/0183(COD) f_amend     1
## 6 2008/0196(COD) f_amend     1
# Give the procedure reference column the shorter name "id".
colnames(cat11)[colnames(cat11) == "procedure_ref1"] <- "id"
## setting the tables

# Cross-tabulate procedures (rows) against the 11 terms (columns).
cat11_tb <- table(cat11$id, cat11$key)

summary(cat11_tb)
## Number of cases in table: 1845 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 5719, df = 8380, p-value = 1
##  Chi-squared approximation may be incorrect
# Keep only the terms (columns) that occur at least once. The earlier
# condition, colSums(...) >= 0, is TRUE for every column of a count table and
# therefore never removed anything. drop = FALSE keeps the result
# two-dimensional even if a single column survives the filter.
cat11_tb <- cat11_tb[, colSums(cat11_tb) > 0, drop = FALSE]
##running the CA

# Fit the correspondence analysis on the 11-term table, using the defaults of
# CA().
CA11 <- CA(cat11_tb)

# Print the eigenvalue, row and column summaries of the fit.
summary(CA11)
## 
## Call:
## CA(X = cat11_tb) 
## 
## The chi square of independence between the two variables is equal to 5718.815 (p-value =  1 ).
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6   Dim.7
## Variance               0.411   0.391   0.356   0.350   0.321   0.308   0.299
## % of var.             13.257  12.609  11.490  11.291  10.351   9.928   9.637
## Cumulative % of var.  13.257  25.866  37.356  48.646  58.997  68.926  78.563
##                        Dim.8   Dim.9  Dim.10
## Variance               0.270   0.238   0.156
## % of var.              8.708   7.690   5.039
## Cumulative % of var.  87.271  94.961 100.000
## 
## Rows (the 10 first)
##                       Iner*1000    Dim.1    ctr   cos2    Dim.2    ctr   cos2  
## 2005/0214(COD)      |     0.648 |  0.056  0.000  0.003 |  0.052  0.000  0.002 |
## 2006/0084(COD)      |     2.024 |  0.256  0.017  0.035 |  0.315  0.027  0.053 |
## 2006/0167(COD)      |     2.024 |  0.256  0.017  0.035 |  0.315  0.027  0.053 |
## 2007/0112(COD)      |     0.725 | -0.117  0.004  0.020 | -0.070  0.001  0.007 |
## 2007/0152(COD)      |     0.648 |  0.056  0.000  0.003 |  0.052  0.000  0.002 |
## 2007/0229(COD)      |     0.648 |  0.056  0.000  0.003 |  0.052  0.000  0.002 |
## 2007/0286(COD)      |     0.648 |  0.056  0.000  0.003 |  0.052  0.000  0.002 |
## 2008/0009(COD)      |     0.648 |  0.056  0.000  0.003 |  0.052  0.000  0.002 |
## 2008/0028(COD)      |     5.807 | -0.059  0.002  0.001 | -0.216  0.026  0.017 |
## 2008/0062(COD)      |    11.139 | -0.457  0.055  0.020 |  0.456  0.058  0.020 |
##                      Dim.3    ctr   cos2  
## 2005/0214(COD)      -0.034  0.000  0.001 |
## 2006/0084(COD)      -0.532  0.086  0.152 |
## 2006/0167(COD)      -0.532  0.086  0.152 |
## 2007/0112(COD)       0.175  0.009  0.046 |
## 2007/0152(COD)      -0.034  0.000  0.001 |
## 2007/0229(COD)      -0.034  0.000  0.001 |
## 2007/0286(COD)      -0.034  0.000  0.001 |
## 2008/0009(COD)      -0.034  0.000  0.001 |
## 2008/0028(COD)       0.599  0.218  0.134 |
## 2008/0062(COD)      -0.314  0.030  0.010 |
## 
## Columns (the 10 first)
##                       Iner*1000     Dim.1     ctr    cos2     Dim.2     ctr
## f_amend             |   183.108 |  -0.186   1.873   0.042 |  -0.120   0.816
## f_border            |   322.818 |  -0.622   2.194   0.028 |   0.538   1.729
## f_establish         |   266.080 |  -0.394   2.881   0.044 |  -0.078   0.118
## f_framework         |   300.046 |  -0.575   1.703   0.023 |  -0.518   1.450
## f_fund              |   294.114 |  -0.458   1.438   0.020 |  -0.643   2.981
## f_harmon            |   394.993 |   5.722  86.374   0.899 |  -0.822   1.875
## f_lay_down          |   304.260 |   0.200   0.244   0.003 |  -0.714   3.250
## f_noterm            |    95.977 |   0.036   0.143   0.006 |   0.032   0.121
## f_provi             |   283.504 |  -0.295   0.495   0.007 |  -0.815   3.956
## f_recogn            |   377.913 |   0.557   0.409   0.004 |   7.600  80.101
##                        cos2     Dim.3     ctr    cos2  
## f_amend               0.017 |   0.230   3.309   0.064 |
## f_border              0.021 |  -0.354   0.819   0.009 |
## f_establish           0.002 |  -0.995  21.246   0.284 |
## f_framework           0.019 |  -0.897   4.772   0.057 |
## f_fund                0.040 |   1.465  16.974   0.206 |
## f_harmon              0.019 |   0.022   0.001   0.000 |
## f_lay_down            0.042 |   1.198  10.041   0.118 |
## f_noterm              0.005 |  -0.020   0.054   0.002 |
## f_provi               0.055 |   1.835  22.026   0.277 |
## f_recogn              0.828 |   2.475   9.324   0.088 |
fviz_ca_biplot(CA11) # quick biplot

Assessing the results by checking the eigen value and visualizing the results

get_eigenvalue(CA11)  ## the % of variance explained is still very low, but slightly improving
##        eigenvalue variance.percent cumulative.variance.percent
## Dim.1   0.4109142        13.256884                    13.25688
## Dim.2   0.3908360        12.609121                    25.86601
## Dim.3   0.3561322        11.489510                    37.35552
## Dim.4   0.3499675        11.290626                    48.64614
## Dim.5   0.3208491        10.351212                    58.99735
## Dim.6   0.3077451         9.928451                    68.92580
## Dim.7   0.2987092         9.636934                    78.56274
## Dim.8   0.2699243         8.708280                    87.27102
## Dim.9   0.2383475         7.689550                    94.96057
## Dim.10  0.1562037         5.039432                   100.00000
fviz_eig(CA11, addlabels = TRUE, ylim = c(0, 35))

##another option: 

fviz_ca_biplot(CA11, 
             map ="rowprincipal", arrow = c(FALSE, TRUE),
              repel = FALSE)

Next, to assess the result of the CA, I check the degree to which a) each term is represented by the constructed dimensions (see cos2 for cols), and b) which row points/procedures are well represented by the dimensions.

col <- get_ca_col(CA11)
col
## Correspondence Analysis - Results for columns
##  ===================================================
##   Name       Description                   
## 1 "$coord"   "Coordinates for the columns" 
## 2 "$cos2"    "Cos2 for the columns"        
## 3 "$contrib" "contributions of the columns"
## 4 "$inertia" "Inertia of the columns"
fviz_ca_col(CA11, col.col = "cos2",
            gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
            repel = TRUE)

fviz_cos2(CA11, choice = "col", axes = 1:2) # shows which terms are well represented on the factor map.

I will use the extent of the terms’ contributions to the dimensions to narrow down the number of the terms used for the analysis

CA11$col$contrib  # it is evident that some of the terms have almost no contribution to the first two dimensions (e.g. f_noterm)
##                  Dim 1      Dim 2        Dim 3      Dim 4      Dim 5
## f_amend      1.8731039  0.8161941  3.308994385  5.9339082  0.3058423
## f_border     2.1935793  1.7286291  0.819499185 53.9382935  0.0323442
## f_establish  2.8808684  0.1184409 21.246005190  0.5776545  3.9382532
## f_framework  1.7028591  1.4500988  4.772204733  6.0306690 51.9905482
## f_fund       1.4384451  2.9806662 16.974431142  1.0356322  5.3846572
## f_harmon    86.3735417  1.8748396  0.001460197  3.0297824  3.1789169
## f_lay_down   0.2437805  3.2499131 10.040644541  8.6571414  2.0877030
## f_noterm     0.1434581  0.1211063  0.053681823  0.3218049  0.5483614
## f_provi      0.4946088  3.9560617 22.026127692  0.8995863  0.6146170
## f_recogn     0.4090857 80.1008766  9.323607941  1.0729918  6.3520629
## f_repeal     2.2466693  3.6031735 11.433343171 18.5025359 25.5666938
fviz_contrib(CA11, choice = "col", axes = 1:2)

#description of the dimensions

cal <- dimdesc(CA11, axes = c(1,2))
head(cal[[1]]$col, 15)
##                   coord
## f_border    -0.62189375
## f_framework -0.57534827
## f_fund      -0.45795070
## f_establish -0.39357355
## f_provi     -0.29530476
## f_amend     -0.18565476
## f_noterm     0.03598294
## f_lay_down   0.20044459
## f_repeal     0.29256124
## f_recogn     0.55690472
## f_harmon     5.72201668

Next, to assess the result of the CA for 11 terms, I check the contributions and quality of representation for rows/procedures.

##Rows that contribute the most to Dim.1 and Dim.2 are the most important in explaining the variability in the data set.
#Rows that do not contribute much to any dimension or that contribute to the last dimensions are less important.

head(CA11$row$contrib)
##                       Dim 1        Dim 2        Dim 3       Dim 4        Dim 5
## 2005/0214(COD) 0.0004156182 0.0003688867 0.0001794472 0.001094676 2.034636e-03
## 2006/0084(COD) 0.0173244285 0.0274820766 0.0861340829 0.116331097 2.297728e-01
## 2006/0167(COD) 0.0173244285 0.0274820766 0.0861340829 0.116331097 2.297728e-01
## 2007/0112(COD) 0.0035954350 0.0013514177 0.0093540949 0.027836503 4.480764e-06
## 2007/0152(COD) 0.0004156182 0.0003688867 0.0001794472 0.001094676 2.034636e-03
## 2007/0229(COD) 0.0004156182 0.0003688867 0.0001794472 0.001094676 2.034636e-03
tail(CA11$row$contrib)
##                       Dim 1        Dim 2        Dim 3        Dim 4        Dim 5
## 2020/0069(COD) 0.0035954350 0.0013514177 0.0093540949 2.783650e-02 4.480764e-06
## 2020/0071(COD) 0.0035954350 0.0013514177 0.0093540949 2.783650e-02 4.480764e-06
## 2020/0075(COD) 0.0395046146 0.0630612602 0.3990701383 2.258574e-06 1.064495e-01
## 2020/0099(COD) 0.0089715490 0.0824020574 0.2960544834 2.451650e-01 5.477868e-02
## 2020/0113(COD) 0.0211862658 0.0961823020 0.5950382014 2.434247e-05 1.527379e-02
## 2020/0128(COD) 0.0004156182 0.0003688867 0.0001794472 1.094676e-03 2.034636e-03
row1 <- get_ca_row(CA11)
row1
## Correspondence Analysis - Results for rows
##  ===================================================
##   Name       Description                
## 1 "$coord"   "Coordinates for the rows" 
## 2 "$cos2"    "Cos2 for the rows"        
## 3 "$contrib" "contributions of the rows"
## 4 "$inertia" "Inertia of the rows"
fviz_ca_row(CA11, col.row = "cos2",
            gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
            repel =FALSE)

#fviz_cos2(CA11, choice = "row", axes = 1:2) # shows which terms are well represented on the factor map. very messy for rows


fviz_ca_row(CA11, col.row = "contrib",
            gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
            repel = FALSE)

I will check the extent to which specific rows(aka procedures) contribute to the dimensions

# Row (procedure) contributions for the 11-term model. This chunk used to print
# CA_legis$row$contrib, i.e. the 16-term model, which does not belong in the
# 11-term section; it now reads from CA11.
# NOTE(review): the `##` printouts kept below were captured from CA_legis and
# are stale until the document is re-knit.
head(CA11$row$contrib, 20)  # this can be plotted as well, but with the large N of observation the plot becomes quite useless
##                       Dim 1        Dim 2        Dim 3        Dim 4        Dim 5
## 2005/0214(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2006/0084(COD) 1.546150e-02 0.0366062180 0.1293139035 0.0261299749 0.0002377769
## 2006/0167(COD) 1.546150e-02 0.0366062180 0.1293139035 0.0261299749 0.0002377769
## 2007/0112(COD) 7.630698e-02 0.0064638272 1.2860816119 6.3547295743 1.7763112965
## 2007/0152(COD) 7.729255e-02 0.0018050465 1.4905481913 8.9967021038 2.7258577293
## 2007/0229(COD) 1.002197e-02 0.0429731907 0.0282879208 0.2328972379 0.1436385361
## 2007/0286(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2008/0009(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2008/0028(COD) 9.148977e-07 0.0181807926 0.0723367788 0.2530990327 0.0410250212
## 2008/0062(COD) 2.471151e-02 0.0951088956 0.0226493495 0.3376554791 1.5647462727
## 2008/0098(COD) 3.151594e+00 0.0750629299 0.0007443627 0.0293371318 0.0675445211
## 2008/0142(COD) 2.471151e-02 0.0951088956 0.0226493495 0.3376554791 1.5647462727
## 2008/0147(COD) 1.476827e-03 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2008/0157(COD) 1.476827e-03 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2008/0183(COD) 1.476827e-03 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2008/0192(COD) 1.546150e-02 0.0366062180 0.1293139035 0.0261299749 0.0002377769
## 2008/0196(COD) 2.734045e-03 0.0122124322 0.0244619563 0.0036200442 0.0007760116
## 2008/0198(COD) 7.095760e-03 0.0777127196 0.0833212084 0.1298243770 0.2249664154
## 2008/0211(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2008/0222(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
tail(CA11$row$contrib, 20)
##                       Dim 1        Dim 2        Dim 3        Dim 4        Dim 5
## 2019/0107(COD) 0.0807079270 0.0003489906 0.4241680831 0.0879182141 0.5728011067
## 2019/0108(COD) 0.0756150971 0.0036567348 0.3712795067 0.0725290211 0.6938013577
## 2019/0179(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2019/0180(COD) 0.0164410039 0.0467321077 0.2329680581 0.0982287089 0.1760547061
## 2019/0192(COD) 0.0009570469 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2020/0043(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0054(COD) 0.0164410039 0.0467321077 0.2329680581 0.0982287089 0.1760547061
## 2020/0058(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0059(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0060(COD) 0.0068163497 0.0898851250 0.3780226215 0.1900332606 0.0552289767
## 2020/0065(COD) 0.0009570469 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2020/0066(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0067(COD) 0.0286839587 0.0025626424 0.0684419857 0.1954503890 0.0559736946
## 2020/0068(COD) 0.0047896300 0.0205604063 0.1221009772 0.0039793075 0.4262028159
## 2020/0069(COD) 0.0304703119 0.1173007972 0.0095166374 0.0117973120 0.5066053742
## 2020/0071(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0075(COD) 0.0164410039 0.0467321077 0.2329680581 0.0982287089 0.1760547061
## 2020/0099(COD) 0.0070957596 0.0777127196 0.0833212084 0.1298243770 0.2249664154
## 2020/0113(COD) 0.0068163497 0.0898851250 0.3780226215 0.1900332606 0.0552289767
## 2020/0128(COD) 0.0009570469 0.0004396061 0.0003408325 0.0005073802 0.0005645901

Let’s take a look at the variables and observations that correlate most with the dimensions of interest; these will be used to get the procedure numbers for the candidate summaries.

 # Describe dimensions 1 and 2 of the 11-term CA.
 description_11 <- dimdesc(CA11, axes = c(1, 2))

 # Dimension 1: which variables and which rows correlate with it the most.
 # Left commented out: running it prints two very long tables.
 # description_11[[1]]

 # Dimension 2: same as above.
 # description_11[[2]]

Save pdf of the bi plot and scree plot for the CA with 11 terms

###
# Keep the scree plot and the row/column biplot as objects so that both can be
# written to a single file.
scree.plot <- fviz_eig(CA11)
biplot.ca <- fviz_ca_biplot(CA11)

# Export both plots of the 11-term analysis.
ggexport(
  filename = "CA11.pdf",
  plotlist = list(scree.plot, biplot.ca)
)
## file saved to CA11.pdf

What is clear is that the quality of the analysis is not improving with the reduction of the terms used. However, we do seem to capture more variance with a more focused approach, in contrast to the approach of including all possible terms in the analysis. As the next step, I narrowed the analysis down even further. For the third iteration of the CA, I used the following terms: ‘HARMONIZE, BORDER, AMEND, ESTABLISH, REPEAL, RECOGNIZE, NOTERM’. NOTERM includes cases where none of the terms were identified in the title of the procedure.

setwd("C:/Users/nasta/Dropbox/____Nordface_POst_doc/CA")
cat7=read.csv("CA_7.csv" , stringsAsFactors = FALSE)

head(cat7)
##   procedure_ref1     key value
## 1 2007/0112(COD) f_amend     1
## 2 2008/0028(COD) f_amend     1
## 3 2008/0147(COD) f_amend     1
## 4 2008/0157(COD) f_amend     1
## 5 2008/0183(COD) f_amend     1
## 6 2008/0196(COD) f_amend     1
# Rename the procedure reference column to the shorter "id".
colnames(cat7)[colnames(cat7) == "procedure_ref1"] <- "id"

# Overview of the columns before setting up the table.
summary(object = cat7)
##       id                key                value      
##  Length:1665        Length:1665        Min.   :1.000  
##  Class :character   Class :character   1st Qu.:1.000  
##  Mode  :character   Mode  :character   Median :2.000  
##                                        Mean   :1.505  
##                                        3rd Qu.:2.000  
##                                        Max.   :2.000
nrow(cat7)  # number of records, i.e. the length of the id column
## [1] 1665
# Cross-tabulate procedure ids (rows) against term keys (columns).
cat7_tb <- table(cat7[["id"]], cat7[["key"]])

summary(object = cat7_tb)
## Number of cases in table: 1665 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 3269, df = 5028, p-value = 1
##  Chi-squared approximation may be incorrect
# Keep only terms (columns) that occur at least once. The previous test
# `>= 0` is always TRUE for a table of counts, so it filtered nothing.
# drop = FALSE keeps the two-dimensional shape even if one column remains.
cat7_tb <- cat7_tb[, colSums(cat7_tb) > 0, drop = FALSE]

# Run the correspondence analysis on the procedure-by-term table.
CA7 <- CA(cat7_tb)

summary(CA7)  # overall results are still quite questionable
## 
## Call:
## CA(X = cat7_tb) 
## 
## The chi square of independence between the two variables is equal to 3268.551 (p-value =  1 ).
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6
## Variance               0.419   0.386   0.362   0.330   0.296   0.169
## % of var.             21.357  19.687  18.459  16.829  15.074   8.595
## Cumulative % of var.  21.357  41.044  59.503  76.332  91.405 100.000
## 
## Rows (the 10 first)
##                       Iner*1000    Dim.1    ctr   cos2    Dim.2    ctr   cos2  
## 2005/0214(COD)      |     0.590 |  0.016  0.000  0.000 | -0.031  0.000  0.001 |
## 2006/0084(COD)      |     1.907 |  0.234  0.016  0.034 |  0.376  0.044  0.089 |
## 2006/0167(COD)      |     1.907 |  0.234  0.016  0.034 |  0.376  0.044  0.089 |
## 2007/0112(COD)      |     0.608 | -0.164  0.008  0.053 | -0.216  0.015  0.092 |
## 2007/0152(COD)      |     0.590 |  0.016  0.000  0.000 | -0.031  0.000  0.001 |
## 2007/0229(COD)      |     0.590 |  0.016  0.000  0.000 | -0.031  0.000  0.001 |
## 2007/0286(COD)      |     0.590 |  0.016  0.000  0.000 | -0.031  0.000  0.001 |
## 2008/0009(COD)      |     0.590 |  0.016  0.000  0.000 | -0.031  0.000  0.001 |
## 2008/0028(COD)      |     1.079 |  0.042  0.001  0.003 |  0.117  0.006  0.023 |
## 2008/0062(COD)      |    11.022 | -0.580  0.096  0.037 | -0.120  0.004  0.002 |
##                      Dim.3    ctr   cos2  
## 2005/0214(COD)       0.047  0.000  0.002 |
## 2006/0084(COD)      -0.960  0.305  0.580 |
## 2006/0167(COD)      -0.960  0.305  0.580 |
## 2007/0112(COD)       0.360  0.043  0.256 |
## 2007/0152(COD)       0.047  0.000  0.002 |
## 2007/0229(COD)       0.047  0.000  0.002 |
## 2007/0286(COD)       0.047  0.000  0.002 |
## 2008/0009(COD)       0.047  0.000  0.002 |
## 2008/0028(COD)      -0.416  0.086  0.289 |
## 2008/0062(COD)       1.502  0.748  0.246 |
## 
## Columns
##                       Iner*1000     Dim.1     ctr    cos2     Dim.2     ctr
## f_amend             |   190.538 |  -0.222   2.914   0.064 |  -0.249   3.982
## f_border            |   330.376 |  -0.761   3.566   0.045 |  -0.130   0.112
## f_establish         |   281.982 |  -0.376   2.862   0.043 |  -0.248   1.345
## f_harmon            |   404.655 |   5.548  88.199   0.914 |  -0.592   1.088
## f_noterm            |    89.444 |   0.010   0.012   0.001 |  -0.019   0.050
## f_recogn            |   377.327 |  -0.038   0.002   0.000 |   7.442  86.072
## f_repeal            |   288.772 |   0.293   2.444   0.035 |   0.488   7.352
##                        cos2     Dim.3     ctr    cos2  
## f_amend               0.081 |   0.405  11.173   0.212 |
## f_border              0.001 |   1.780  22.591   0.248 |
## f_establish           0.018 |  -0.551   7.093   0.091 |
## f_harmon              0.010 |   1.222   4.951   0.044 |
## f_noterm              0.002 |   0.028   0.112   0.005 |
## f_recogn              0.882 |   2.177   7.856   0.075 |
## f_repeal              0.098 |  -1.184  46.224   0.580 |
fviz_ca_biplot(X = CA7)  # quick biplot of rows and columns

Assessing the results by checking the eigen value and visualizing the results

get_eigenvalue(X = CA7)  # the % of variance explained is nevertheless improved
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1  0.4192591        21.357063                    21.35706
## Dim.2  0.3864659        19.686577                    41.04364
## Dim.3  0.3623666        18.458959                    59.50260
## Dim.4  0.3303703        16.829068                    76.33167
## Dim.5  0.2959134        15.073830                    91.40550
## Dim.6  0.1687181         8.594503                   100.00000
# Scree plot with value labels on the bars; y axis limited to 0-35.
fviz_eig(X = CA7, ylim = c(0, 35), addlabels = TRUE)

# Another option (disabled): row-principal biplot with arrows.
# fviz_ca_biplot(CA7,
#                map = "rowprincipal", arrow = c(TRUE, TRUE),
#                repel = FALSE)

Next, I check a) the degree to which each term is represented by the constructed dimensions (see cos2 for cols), and b) which row points/procedures are well represented by the dimensions (see below).

# Column (term) results of the 7-term CA; printed to list the available parts.
col <- get_ca_col(res.ca = CA7)
col
## Correspondence Analysis - Results for columns
##  ===================================================
##   Name       Description                   
## 1 "$coord"   "Coordinates for the columns" 
## 2 "$cos2"    "Cos2 for the columns"        
## 3 "$contrib" "contributions of the columns"
## 4 "$inertia" "Inertia of the columns"
# Column map with each term coloured by its cos2.
fviz_ca_col(
  X = CA7,
  col.col = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
)

# Shows which terms are well represented on the factor map (dims 1-2).
fviz_cos2(X = CA7, choice = "col", axes = c(1L, 2L))

I will use the extent of the terms’ contributions to the dimensions to narrow down the number of the terms used for the analysis

CA7[["col"]][["contrib"]]  # next iteration: remove the NOTERM category and possibly 'border'
##                    Dim 1      Dim 2      Dim 3      Dim 4      Dim 5
## f_amend      2.914351202  3.9815918 11.1733731 15.7681485  0.3700016
## f_border     3.565923552  0.1121130 22.5912396 66.5874515  4.2839422
## f_establish  2.862029691  1.3446149  7.0926213  7.0266840 72.6125545
## f_harmon    88.198989749  1.0879369  4.9505010  0.9576765  2.7398172
## f_noterm     0.012271079  0.0496369  0.1120782  2.1385788  1.5974700
## f_recogn     0.002043895 86.0717688  7.8561384  0.8272372  4.4400094
## f_repeal     2.444390833  7.3523377 46.2240483  6.6942234 13.9562051
# Contributions of the terms to dimensions 1 and 2.
fviz_contrib(X = CA7, choice = "col", axes = c(1L, 2L))

# Description of the dimensions: print the column part for dimension 1.
cal <- dimdesc(CA7, axes = c(1, 2))
head(cal[[1]][["col"]], n = 10)
##                   coord
## f_border    -0.76085196
## f_establish -0.37642277
## f_amend     -0.22221364
## f_recogn    -0.03777266
## f_noterm     0.01009834
## f_repeal     0.29282433
## f_harmon     5.54836703

Next to assess the result of the CA for 7 terms, I check contributions and quality of representation for rows/procedures.

# Rows that contribute the most to Dim.1 and Dim.2 are the most important in
# explaining the variability in the data set; rows that contribute little to
# any dimension, or only to the last dimensions, are less important.

head(CA7[["row"]][["contrib"]])
##                       Dim 1        Dim 2        Dim 3       Dim 4       Dim 5
## 2005/0214(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2006/0084(COD) 1.567669e-02 0.0440535174 0.3053273696 0.026752574 0.160741080
## 2006/0167(COD) 1.567669e-02 0.0440535174 0.3053273696 0.026752574 0.160741080
## 2007/0112(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2007/0152(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2007/0229(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
tail(CA7[["row"]][["contrib"]])
##                       Dim 1        Dim 2        Dim 3       Dim 4       Dim 5
## 2020/0069(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0071(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0075(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0099(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2020/0113(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0128(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
# Row (procedure) results of the 7-term CA; printed to list the available parts.
row1 <- get_ca_row(res.ca = CA7)
row1
## Correspondence Analysis - Results for rows
##  ===================================================
##   Name       Description                
## 1 "$coord"   "Coordinates for the rows" 
## 2 "$cos2"    "Cos2 for the rows"        
## 3 "$contrib" "contributions of the rows"
## 4 "$inertia" "Inertia of the rows"
# Row map of the 7-term CA, coloured by cos2.
fviz_ca_row(CA7, col.row = "cos2",
            gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
            repel = FALSE, title = "COS2 plot")

# fviz_cos2(CA7, choice = "row", axes = 1:2) # very messy for rows

# Row map coloured by contribution. This previously plotted CA11, a copy-paste
# slip from the 11-term section; this section assesses the 7-term solution.
fviz_ca_row(CA7, col.row = "contrib",
            gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
            repel = FALSE, title = "contribution of the rows")

I will check the extent to which specific rows(aka procedures) contribute to the dimensions

head(CA7[["row"]][["contrib"]], n = 20)  # could be plotted too, but with this many observations the plot is of little use
##                       Dim 1        Dim 2        Dim 3       Dim 4       Dim 5
## 2005/0214(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2006/0084(COD) 1.567669e-02 0.0440535174 0.3053273696 0.026752574 0.160741080
## 2006/0167(COD) 1.567669e-02 0.0440535174 0.3053273696 0.026752574 0.160741080
## 2007/0112(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2007/0152(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2007/0229(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2007/0286(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2008/0009(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2008/0028(COD) 7.418973e-04 0.0064120725 0.0859784609 0.003963991 0.128996055
## 2008/0062(COD) 9.629098e-02 0.0044653371 0.7482148179 2.157446834 0.218066442
## 2008/0098(COD) 3.899444e+00 0.0020459497 0.0006762595 0.124856754 0.004286880
## 2008/0142(COD) 9.629098e-02 0.0044653371 0.7482148179 2.157446834 0.218066442
## 2008/0147(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2008/0157(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2008/0183(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2008/0192(COD) 1.567669e-02 0.0440535174 0.3053273696 0.026752574 0.160741080
## 2008/0196(COD) 7.418973e-04 0.0064120725 0.0859784609 0.003963991 0.128996055
## 2008/0198(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2008/0211(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2008/0222(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
tail(CA7[["row"]][["contrib"]], n = 20)
##                       Dim 1        Dim 2        Dim 3       Dim 4       Dim 5
## 2019/0107(COD) 1.078192e-01 0.0212749566 0.7468603173 1.005572579 0.170643572
## 2019/0108(COD) 9.629098e-02 0.0044653371 0.7482148179 2.157446834 0.218066442
## 2019/0179(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2019/0180(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2019/0192(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2020/0043(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0054(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0058(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0059(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0060(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0065(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2020/0066(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0067(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0068(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2020/0069(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0071(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0075(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0099(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2020/0113(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0128(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712

Let’s take a look at the variables and observations that correlate most with the dimensions of interest; these will be used to get the procedure numbers for the candidate summaries.

 description7 <- dimdesc(CA7, axes = c(1, 2))
 
 # Dimension 1: shows which variables and which rows correlate most with it.
 # Left commented out because printing yields two very long correlation tables.
 # description7[[1]]
 
 # Dimension 2: same as above.
 # description7[[2]]

Save pdf of the bi plot and scree plot for the CA with 7 terms

# Scree plot and row/column biplot for the 7-term CA, exported together
# to a PDF file.
scree.plot <- fviz_eig(X = CA7)
biplot.ca <- fviz_ca_biplot(X = CA7)

ggexport(
  filename = "CA7.pdf",
  plotlist = list(scree.plot, biplot.ca)
)
## file saved to CA7.pdf

Based on the contributions of the terms to the dimensions and the extent to which they are represented, I have narrowed the analysis to five key terms. In the new iteration, the following terms are included: HARMONIZE, AMEND, REPEAL, ESTABLISH, RECOGNIZE.

Load the dataset for 5 key terms, prepare for the CA and run the CA.

# Input for the 5-term analysis.
cat5 <- read.csv(file = "CA_5.csv", stringsAsFactors = FALSE)

head(x = cat5)
##   procedure_ref1     key value
## 1 2007/0112(COD) f_amend     1
## 2 2008/0028(COD) f_amend     1
## 3 2008/0147(COD) f_amend     1
## 4 2008/0157(COD) f_amend     1
## 5 2008/0183(COD) f_amend     1
## 6 2008/0196(COD) f_amend     1
# Rename the procedure reference column to the shorter "id".
colnames(cat5)[colnames(cat5) == "procedure_ref1"] <- "id"

# Cross-tabulate procedure ids (rows) against term keys (columns).
cat5_tb <- table(cat5[["id"]], cat5[["key"]])

summary(object = cat5_tb)
## Number of cases in table: 782 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 2079.1, df = 2416, p-value = 1
##  Chi-squared approximation may be incorrect
# Keep only terms (columns) that occur at least once. The previous test
# `>= 0` is always TRUE for a table of counts, so it filtered nothing.
# drop = FALSE keeps the two-dimensional shape even if one column remains.
cat5_tb <- cat5_tb[, colSums(cat5_tb) > 0, drop = FALSE]

# Run the correspondence analysis on the procedure-by-term table.
CA5 <- CA(cat5_tb)

summary(CA5)
## 
## Call:
## CA(X = cat5_tb) 
## 
## The chi square of independence between the two variables is equal to 2079.131 (p-value =  0.9999998 ).
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4
## Variance               0.759   0.701   0.662   0.537
## % of var.             28.549  26.353  24.883  20.215
## Cumulative % of var.  28.549  54.902  79.785 100.000
## 
## Rows (the 10 first)
##                       Iner*1000    Dim.1    ctr   cos2    Dim.2    ctr   cos2  
## 2006/0084(COD)      |     3.746 |  0.302  0.015  0.031 |  1.022  0.191  0.357 |
## 2006/0167(COD)      |     3.746 |  0.302  0.015  0.031 |  1.022  0.191  0.357 |
## 2007/0112(COD)      |     1.148 | -0.334  0.019  0.124 | -0.580  0.061  0.375 |
## 2008/0028(COD)      |     1.169 | -0.016  0.000  0.001 |  0.221  0.018  0.107 |
## 2008/0098(COD)      |    24.955 |  3.119  3.278  0.997 | -0.126  0.006  0.002 |
## 2008/0147(COD)      |     1.148 | -0.334  0.019  0.124 | -0.580  0.061  0.375 |
## 2008/0157(COD)      |     1.148 | -0.334  0.019  0.124 | -0.580  0.061  0.375 |
## 2008/0183(COD)      |     1.148 | -0.334  0.019  0.124 | -0.580  0.061  0.375 |
## 2008/0192(COD)      |     3.746 |  0.302  0.015  0.031 |  1.022  0.191  0.357 |
## 2008/0196(COD)      |     1.169 | -0.016  0.000  0.001 |  0.221  0.018  0.107 |
##                      Dim.3    ctr   cos2  
## 2006/0084(COD)      -1.102  0.235  0.414 |
## 2006/0167(COD)      -1.102  0.235  0.414 |
## 2007/0112(COD)       0.547  0.058  0.333 |
## 2008/0028(COD)      -0.277  0.030  0.169 |
## 2008/0098(COD)      -0.098  0.004  0.001 |
## 2008/0147(COD)       0.547  0.058  0.333 |
## 2008/0157(COD)       0.547  0.058  0.333 |
## 2008/0183(COD)       0.547  0.058  0.333 |
## 2008/0192(COD)      -1.102  0.235  0.414 |
## 2008/0196(COD)      -0.277  0.030  0.169 |
## 
## Columns
##                       Iner*1000     Dim.1     ctr    cos2     Dim.2     ctr
## f_amend             |   315.783 |  -0.291   5.869   0.141 |  -0.486  17.735
## f_establish         |   452.081 |  -0.265   1.664   0.028 |  -0.014   0.005
## f_harmon            |   732.758 |   5.171  90.113   0.933 |  -1.066   4.148
## f_recogn            |   670.546 |   0.126   0.027   0.000 |   5.313  51.522
## f_repeal            |   487.568 |   0.263   2.327   0.036 |   0.856  26.589
##                        cos2     Dim.3     ctr    cos2  
## f_amend               0.394 |   0.445  15.742   0.330 |
## f_establish           0.000 |  -0.481   6.309   0.092 |
## f_harmon              0.040 |   0.737   2.099   0.019 |
## f_recogn              0.538 |   4.823  44.969   0.444 |
## f_repeal              0.382 |  -0.896  30.881   0.419 |
fviz_ca_biplot(X = CA5, axes = c(1, 2))  # quick biplot of dimensions 1 and 2

Assessing the results by checking the eigen value and visualizing the results

get_eigenvalue(X = CA5)  # the % of variance explained is higher again, but the p-value/chi2 call for caution
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1  0.7590348         28.54871                    28.54871
## Dim.2  0.7006696         26.35349                    54.90220
## Dim.3  0.6615669         24.88277                    79.78497
## Dim.4  0.5374642         20.21503                   100.00000
# Scree plot with value labels on the bars; y axis limited to 0-35.
fviz_eig(X = CA5, ylim = c(0, 35), addlabels = TRUE)

# Another option: row-principal biplot, arrows drawn for columns only.
fviz_ca_biplot(
  X = CA5,
  map = "rowprincipal",
  arrow = c(FALSE, TRUE),
  repel = FALSE
)

Next, to assess the result of the CA, I check a) the degree to which each term is represented by the constructed dimensions (see cos2 for cols), and b) which row points/procedures are well represented by the dimensions.

# Column (term) results of the 5-term CA; printed to list the available parts.
col <- get_ca_col(res.ca = CA5)
col
## Correspondence Analysis - Results for columns
##  ===================================================
##   Name       Description                   
## 1 "$coord"   "Coordinates for the columns" 
## 2 "$cos2"    "Cos2 for the columns"        
## 3 "$contrib" "contributions of the columns"
## 4 "$inertia" "Inertia of the columns"
# Column map with each term coloured by its cos2.
fviz_ca_col(
  X = CA5,
  col.col = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
)

# Shows which terms are well represented on dimensions 1-3.
fviz_cos2(X = CA5, choice = "col", axes = c(1L, 2L, 3L))

I will use the extent of the terms’ contributions to the dimensions to narrow down the number of the terms used for the analysis

CA5[["col"]][["contrib"]]  # the degree to which the terms contribute to the dimensions
##                   Dim 1        Dim 2     Dim 3     Dim 4
## f_amend      5.86879158 17.735372181 15.741645  7.968769
## f_establish  1.66392833  0.005128878  6.309245 73.991007
## f_harmon    90.11343601  4.148057838  2.098956  1.082006
## f_recogn     0.02670653 51.522208701 44.969345  2.202968
## f_repeal     2.32713755 26.589232402 30.880810 14.755250
# Contributions of the terms to dimensions 1 to 3.
fviz_contrib(X = CA5, choice = "col", axes = c(1L, 2L, 3L))

# Description of the dimensions: print the column part for dimension 1.
cal <- dimdesc(CA5, axes = c(1, 2))
head(cal[[1]][["col"]], n = 15)
##                  coord
## f_amend     -0.2907769
## f_establish -0.2646622
## f_recogn     0.1259050
## f_repeal     0.2634624
## f_harmon     5.1714698

Next to assess the result of the CA for 5 terms, I check contributions and quality of representation for rows/procedures.

# Rows that contribute the most to Dim.1 and Dim.2 are the most important in
# explaining the variability in the data set.
head(CA5[["row"]][["contrib"]])
##                       Dim 1       Dim 2      Dim 3      Dim 4
## 2006/0084(COD) 1.540662e-02 0.190695073 0.23456426 0.13795706
## 2006/0167(COD) 1.540662e-02 0.190695073 0.23456426 0.13795706
## 2007/0112(COD) 1.876678e-02 0.061436977 0.05775361 0.03598690
## 2008/0028(COD) 8.279923e-05 0.017826805 0.02976764 0.15743224
## 2008/0098(COD) 3.278145e+00 0.005765073 0.00370041 0.00146658
## 2008/0147(COD) 1.876678e-02 0.061436977 0.05775361 0.03598690
tail(CA5[["row"]][["contrib"]])
##                     Dim 1      Dim 2      Dim 3     Dim 4
## 2020/0066(COD) 0.01876678 0.06143698 0.05775361 0.0359869
## 2020/0067(COD) 0.01876678 0.06143698 0.05775361 0.0359869
## 2020/0069(COD) 0.01876678 0.06143698 0.05775361 0.0359869
## 2020/0071(COD) 0.01876678 0.06143698 0.05775361 0.0359869
## 2020/0075(COD) 0.01876678 0.06143698 0.05775361 0.0359869
## 2020/0113(COD) 0.01876678 0.06143698 0.05775361 0.0359869
# Row (procedure) results of the 5-term CA; printed to list the available parts.
row1 <- get_ca_row(res.ca = CA5)
row1
## Correspondence Analysis - Results for rows
##  ===================================================
##   Name       Description                
## 1 "$coord"   "Coordinates for the rows" 
## 2 "$cos2"    "Cos2 for the rows"        
## 3 "$contrib" "contributions of the rows"
## 4 "$inertia" "Inertia of the rows"
# Row map of the 5-term CA, coloured by cos2.
fviz_ca_row(
  X = CA5,
  col.row = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = FALSE
)

# Same map, coloured by each row's contribution.
fviz_ca_row(
  X = CA5,
  col.row = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = FALSE
)

I will check the extent to which specific rows(aka procedures) contribute to the dimensions

head(CA5[["row"]][["contrib"]], n = 20)  # could be plotted too, but with this many observations the plot is of little use
##                       Dim 1        Dim 2        Dim 3      Dim 4
## 2006/0084(COD) 1.540662e-02 1.906951e-01 0.2345642571 0.13795706
## 2006/0167(COD) 1.540662e-02 1.906951e-01 0.2345642571 0.13795706
## 2007/0112(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0028(COD) 8.279923e-05 1.782680e-02 0.0297676432 0.15743224
## 2008/0098(COD) 3.278145e+00 5.765073e-03 0.0037004101 0.00146658
## 2008/0147(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0157(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0183(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0192(COD) 1.540662e-02 1.906951e-01 0.2345642571 0.13795706
## 2008/0196(COD) 8.279923e-05 1.782680e-02 0.0297676432 0.15743224
## 2008/0227(COD) 1.540662e-02 1.906951e-01 0.2345642571 0.13795706
## 2008/0237(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0242(COD) 3.423836e-02 3.253036e-02 0.0001950587 0.31872726
## 2008/0243(COD) 1.554726e-02 5.191466e-05 0.0676370273 0.97636064
## 2008/0246(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0249(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0257(COD) 3.423836e-02 3.253036e-02 0.0001950587 0.31872726
## 2008/0260(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0261(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2009/0005(COD) 1.540662e-02 1.906951e-01 0.2345642571 0.13795706
tail(CA5[["row"]][["contrib"]], n = 20)
##                     Dim 1        Dim 2      Dim 3     Dim 4
## 2018/0385(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2018/0390(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2018/0900(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2019/0009(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2019/0010(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2019/0019(COD) 0.01554726 5.191466e-05 0.06763703 0.9763606
## 2019/0107(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2019/0179(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2019/0180(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0043(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0054(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0058(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0059(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0060(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0066(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0067(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0069(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0071(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0075(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0113(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869

Let’s take a look at the variables and observations that correlate most with the dimensions of interest; these will be used to get the procedure numbers for the candidate summaries.

 description_5 <- dimdesc(CA5, axes = c(1, 2))
 
 # Dimension 1: shows which variables and which rows correlate most with it.
 # Left commented out because printing yields two very long tables.
 # description_5[[1]]
 
 # Dimension 2: same as above.
 # description_5[[2]]

Save pdf of the bi plot and scree plot for the CA with 5 terms

# Scree plot and row/column biplot for the 5-term CA, exported together
# to a PDF file.
scree.plot <- fviz_eig(X = CA5)
biplot.ca <- fviz_ca_biplot(X = CA5)

ggexport(
  filename = "CA5.pdf",
  plotlist = list(scree.plot, biplot.ca)
)
## file saved to CA5.pdf
# CA16/CA11/CA7/CA5: save the relevant row-level results, to be sorted and
# then printed as min/max.
# Relevant indicator: cos2 (plus the correlation results from the analysis
# above).
# Fix: the 16-term object previously stored `$row$contrib` under a `_cos`
# name, unlike the other three solutions and the indicator named here.
ca16_cos <- CA_legis$row$cos2

ca11_cos <- CA11$row$cos2
ca7_cos <- CA7$row$cos2
ca5_cos <- CA5$row$cos2

# Row contributions to dimension 1 of the 5-term CA (top = 20, ascending
# order); these are the 20 candidate summaries for the +1 end.
fviz_contrib(CA5, choice = "row", axes = 1, top = 20, sort.val = "asc")

# -1 candidates are from the ordered double list