This is a correspondence analysis (CA) using 16 terms. The terms have been chosen based on their theoretical relevance to the EU dimension as well as the frequency with which they appear in the titles of the legislative acts. In the next steps, the number of terms will be reduced step by step.
#install.packages(c("FactoMineR", "factoextra", "dplyr","ggplot2"))
#install.packages('knitr')
#install.packages(c('ggpubr', 'tidyr', 'gplots', 'foreign', 'MASS', 'ade4', 'ca', 'ExPosition'))
library(FactoMineR)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggpubr)
library(tidyr)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(foreign)
library(knitr)
# Machine-specific working directory; the CSV files below are read relative to it.
setwd("C:/Users/nasta/Dropbox/____Nordface_POst_doc/CA")
Import csv, and adjust var names
# Load the 16-term data set, keeping text columns as character vectors.
legis <- read.csv(file = "CA_16.csv", stringsAsFactors = FALSE)
# Show the first rows to confirm the import.
head(legis)
## procedure_ref1 key value
## 1 2007/0112(COD) f_amend 1
## 2 2008/0028(COD) f_amend 1
## 3 2008/0147(COD) f_amend 1
## 4 2008/0157(COD) f_amend 1
## 5 2008/0183(COD) f_amend 1
## 6 2008/0196(COD) f_amend 1
# Rename the procedure reference column to the shorter "id".
colnames(legis)[colnames(legis) == "procedure_ref1"] <- "id"
Setting up the table for the CA analysis
# Cross-tabulate procedures (rows) against terms (columns) as input for the CA.
# NOTE(review): table() counts rows and does not use the `value` column --
# confirm that `value` is not meant to act as a weight.
legis_table <- table(legis[["id"]], legis[["key"]])
summary(legis_table)
## Number of cases in table: 2007
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 9479, df = 13408, p-value = 1
## Chi-squared approximation may be incorrect
# Keep only term columns that occur at least once. The previous test (`>= 0`)
# is true for every column of a count table, so it never removed anything.
# `drop = FALSE` keeps the two-way layout even if a single column remains.
legis_table <- legis_table[, colSums(legis_table) > 0, drop = FALSE]
Correspondence analysis with all 16 terms.
# Fit the correspondence analysis on the procedure-by-term table, then print
# the eigenvalues and the row/column summaries.
CA_legis <- CA(X = legis_table)
summary(CA_legis)
##
## Call:
## CA(X = legis_table)
##
## The chi square of independence between the two variables is equal to 9479.07 (p-value = 1 ).
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 0.409 0.385 0.358 0.352 0.338 0.330 0.322
## % of var. 8.663 8.152 7.586 7.443 7.150 6.995 6.821
## Cumulative % of var. 8.663 16.815 24.401 31.844 38.994 45.989 52.810
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 0.300 0.285 0.281 0.269 0.255 0.245 0.240
## % of var. 6.360 6.035 5.947 5.702 5.395 5.184 5.075
## Cumulative % of var. 59.170 65.205 71.152 76.854 82.249 87.433 92.508
## Dim.15 Dim.16
## Variance 0.209 0.145
## % of var. 4.418 3.074
## Cumulative % of var. 96.926 100.000
##
## Rows (the 10 first)
## Iner*1000 Dim.1 ctr cos2 Dim.2 ctr cos2
## 2005/0214(COD) | 0.692 | 0.089 0.001 0.006 | 0.058 0.000 0.002 |
## 2006/0084(COD) | 2.111 | 0.252 0.015 0.030 | 0.376 0.037 0.067 |
## 2006/0167(COD) | 2.111 | 0.252 0.015 0.030 | 0.376 0.037 0.067 |
## 2007/0112(COD) | 66.378 | -0.457 0.076 0.005 | -0.129 0.006 0.000 |
## 2007/0152(COD) | 99.599 | -0.563 0.077 0.003 | -0.084 0.002 0.000 |
## 2007/0229(COD) | 12.419 | -0.203 0.010 0.003 | -0.407 0.043 0.013 |
## 2007/0286(COD) | 0.692 | 0.089 0.001 0.006 | 0.058 0.000 0.002 |
## 2008/0009(COD) | 0.692 | 0.089 0.001 0.006 | 0.058 0.000 0.002 |
## 2008/0028(COD) | 5.982 | 0.001 0.000 0.000 | -0.187 0.018 0.012 |
## 2008/0062(COD) | 11.227 | -0.319 0.025 0.009 | 0.606 0.095 0.033 |
## Dim.3 ctr cos2
## 2005/0214(COD) 0.050 0.000 0.002 |
## 2006/0084(COD) -0.682 0.129 0.219 |
## 2006/0167(COD) -0.682 0.129 0.219 |
## 2007/0112(COD) 1.756 1.286 0.069 |
## 2007/0152(COD) 2.315 1.491 0.054 |
## 2007/0229(COD) -0.319 0.028 0.008 |
## 2007/0286(COD) 0.050 0.000 0.002 |
## 2008/0009(COD) 0.050 0.000 0.002 |
## 2008/0028(COD) 0.361 0.072 0.043 |
## 2008/0062(COD) 0.285 0.023 0.007 |
##
## Columns (the 10 first)
## Iner*1000 Dim.1 ctr cos2 Dim.2 ctr
## f_agency | 273.634 | -0.293 0.293 0.004 | 0.309 0.346
## f_amend | 175.934 | -0.156 1.226 0.029 | -0.137 0.994
## f_author | 254.598 | -0.438 0.678 0.011 | 0.252 0.238
## f_border | 315.397 | -0.464 1.128 0.015 | 0.716 2.854
## f_common | 283.773 | -0.316 0.475 0.007 | -0.542 1.482
## f_commun | 292.395 | -0.224 0.361 0.005 | -0.539 2.221
## f_establish | 245.070 | -0.389 2.597 0.043 | 0.000 0.000
## f_extend | 330.842 | -0.777 0.368 0.005 | -0.140 0.013
## f_framework | 290.183 | -0.504 1.208 0.017 | -0.544 1.494
## f_fund | 288.674 | -0.307 0.599 0.008 | -0.545 2.002
## cos2 Dim.3 ctr cos2
## f_agency 0.005 | -1.524 9.042 0.118 |
## f_amend 0.022 | 0.381 8.337 0.170 |
## f_author 0.004 | 0.316 0.403 0.006 |
## f_border 0.035 | 0.312 0.582 0.007 |
## f_common 0.020 | -0.411 0.918 0.012 |
## f_commun 0.029 | 0.313 0.806 0.010 |
## f_establish 0.000 | -0.926 16.812 0.246 |
## f_extend 0.000 | 2.742 5.227 0.057 |
## f_framework 0.020 | -0.699 2.648 0.033 |
## f_fund 0.027 | 0.931 6.265 0.078 |
To assess the results of the analysis, we should consider the size of the eigenvalues, which indicate how much variance is explained by the constructed dimensions. The table below shows the eigenvalues by dimension as well as the % of variance explained by each (col 2).
# Eigenvalue, % of variance and cumulative % of variance for each dimension.
get_eigenvalue(CA_legis) # these values help to determine the number of axes to be considered
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 0.4091316 8.662527 8.662527
## Dim.2 0.3850301 8.152228 16.814755
## Dim.3 0.3582809 7.585868 24.400623
## Dim.4 0.3515437 7.443221 31.843844
## Dim.5 0.3376856 7.149805 38.993649
## Dim.6 0.3303959 6.995459 45.989108
## Dim.7 0.3221592 6.821064 52.810172
## Dim.8 0.3003927 6.360204 59.170376
## Dim.9 0.2850266 6.034857 65.205233
## Dim.10 0.2808809 5.947082 71.152314
## Dim.11 0.2693077 5.702041 76.854355
## Dim.12 0.2548095 5.395071 82.249426
## Dim.13 0.2448325 5.183829 87.433256
## Dim.14 0.2396795 5.074725 92.507981
## Dim.15 0.2086571 4.417889 96.925870
## Dim.16 0.1451913 3.074130 100.000000
To visualize how much variance each dimension explains, I construct the plot below. It shows that overall the % of the variance explained even by the first two dimensions is rather low.
# Scree plot of the 16-term CA, with the y axis limited to 0-35.
fviz_eig(
  CA_legis,
  addlabels = TRUE,
  ylim = c(0, 35)
)
Next, to assess the result of the CA, I check the degree to which a) each term is represented by the constructed dimensions (see cos2 for cols), and b) the row points/procedures are well represented by the dimensions.
# Pull out the column (term) results and list the available components.
col <- get_ca_col(CA_legis)
print(col)
## Correspondence Analysis - Results for columns
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the columns"
## 2 "$cos2" "Cos2 for the columns"
## 3 "$contrib" "contributions of the columns"
## 4 "$inertia" "Inertia of the columns"
# Term (column) map, coloured by cos2.
fviz_ca_col(
  CA_legis,
  col.col = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
)
# cos2 of the terms on dimensions 1-2: which terms are well represented on
# the factor map.
fviz_cos2(CA_legis, choice = "col", axes = 1:2)
# Procedure (row) map, coloured by cos2.
fviz_ca_row(
  CA_legis,
  col.row = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = FALSE
)
I will use the extent of the terms’ contributions to the dimensions to narrow down the number of the terms used for the analysis
# Contribution of every term to each dimension.
CA_legis$col$contrib # some terms contribute almost nothing to the first two dimensions (e.g. f_intergov, intergovernmental).
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## f_agency 0.29315509 3.457412e-01 9.04227717 1.5389855 1.094401e-02
## f_amend 1.22600533 9.942714e-01 8.33719147 2.2411150 9.727815e-02
## f_author 0.67772145 2.376314e-01 0.40327648 3.9039429 2.072042e+00
## f_border 1.12830659 2.853784e+00 0.58205654 9.6563014 4.422922e+01
## f_common 0.47486522 1.481788e+00 0.91801102 6.8146323 4.126247e+00
## f_commun 0.36078244 2.220547e+00 0.80632646 2.4099280 1.664341e+01
## f_establish 2.59728153 2.411197e-09 16.81183408 1.7375666 3.504594e-02
## f_extend 0.36794982 1.264677e-02 5.22675597 31.2923346 9.393129e+00
## f_framework 1.20841544 1.494437e+00 2.64760514 4.8842639 1.206444e+00
## f_fund 0.59860625 2.002128e+00 6.26489180 8.6973727 8.037292e+00
## f_harmon 88.38152488 1.252654e+00 0.02956324 1.2291981 1.579688e-01
## f_intergov 0.07161469 1.230078e-01 8.44965685 0.1739962 4.800452e-04
## f_lay_down 0.14637911 3.053389e+00 2.50362464 4.5782034 7.492973e+00
## f_noterm 0.32890883 1.421797e-01 0.10257556 0.1498277 1.601493e-01
## f_provi 0.13826581 3.519798e+00 10.07820598 12.2978461 1.848993e+00
## f_recogn 0.29047062 7.549207e+01 7.99342988 3.9825654 4.488131e+00
## f_repeal 1.70974692 4.773928e+00 19.80271773 4.4119204 2.565510e-04
# Contributions of the terms (columns) to dimensions 1-2.
fviz_contrib(
  CA_legis,
  choice = "col",
  axes = 1:2
)
Let’s see how specific rows (aka procedures) contribute to the dimensions.
# First 20 row (procedure) contributions. They could be plotted too, but with
# this many observations the plot is of little use.
head(CA_legis[["row"]][["contrib"]], n = 20)
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## 2005/0214(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2006/0084(COD) 1.546150e-02 0.0366062180 0.1293139035 0.0261299749 0.0002377769
## 2006/0167(COD) 1.546150e-02 0.0366062180 0.1293139035 0.0261299749 0.0002377769
## 2007/0112(COD) 7.630698e-02 0.0064638272 1.2860816119 6.3547295743 1.7763112965
## 2007/0152(COD) 7.729255e-02 0.0018050465 1.4905481913 8.9967021038 2.7258577293
## 2007/0229(COD) 1.002197e-02 0.0429731907 0.0282879208 0.2328972379 0.1436385361
## 2007/0286(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2008/0009(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2008/0028(COD) 9.148977e-07 0.0181807926 0.0723367788 0.2530990327 0.0410250212
## 2008/0062(COD) 2.471151e-02 0.0951088956 0.0226493495 0.3376554791 1.5647462727
## 2008/0098(COD) 3.151594e+00 0.0750629299 0.0007443627 0.0293371318 0.0675445211
## 2008/0142(COD) 2.471151e-02 0.0951088956 0.0226493495 0.3376554791 1.5647462727
## 2008/0147(COD) 1.476827e-03 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2008/0157(COD) 1.476827e-03 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2008/0183(COD) 1.476827e-03 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2008/0192(COD) 1.546150e-02 0.0366062180 0.1293139035 0.0261299749 0.0002377769
## 2008/0196(COD) 2.734045e-03 0.0122124322 0.0244619563 0.0036200442 0.0007760116
## 2008/0198(COD) 7.095760e-03 0.0777127196 0.0833212084 0.1298243770 0.2249664154
## 2008/0211(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2008/0222(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
# Last 20 row (procedure) contributions of the 16-term CA.
tail(CA_legis[["row"]][["contrib"]], n = 20)
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## 2019/0107(COD) 0.0807079270 0.0003489906 0.4241680831 0.0879182141 0.5728011067
## 2019/0108(COD) 0.0756150971 0.0036567348 0.3712795067 0.0725290211 0.6938013577
## 2019/0179(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2019/0180(COD) 0.0164410039 0.0467321077 0.2329680581 0.0982287089 0.1760547061
## 2019/0192(COD) 0.0009570469 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2020/0043(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0054(COD) 0.0164410039 0.0467321077 0.2329680581 0.0982287089 0.1760547061
## 2020/0058(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0059(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0060(COD) 0.0068163497 0.0898851250 0.3780226215 0.1900332606 0.0552289767
## 2020/0065(COD) 0.0009570469 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2020/0066(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0067(COD) 0.0286839587 0.0025626424 0.0684419857 0.1954503890 0.0559736946
## 2020/0068(COD) 0.0047896300 0.0205604063 0.1221009772 0.0039793075 0.4262028159
## 2020/0069(COD) 0.0304703119 0.1173007972 0.0095166374 0.0117973120 0.5066053742
## 2020/0071(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0075(COD) 0.0164410039 0.0467321077 0.2329680581 0.0982287089 0.1760547061
## 2020/0099(COD) 0.0070957596 0.0777127196 0.0833212084 0.1298243770 0.2249664154
## 2020/0113(COD) 0.0068163497 0.0898851250 0.3780226215 0.1900332606 0.0552289767
## 2020/0128(COD) 0.0009570469 0.0004396061 0.0003408325 0.0005073802 0.0005645901
# Describe dimensions 1 and 2 of the 16-term CA.
description16 <- dimdesc(CA_legis, axes = c(1, 2))
# Dimension 1: which variables and which rows correlate with it the most.
# description16[[1]]
# Dimension 2: same as above. Both print a very long table, so they are only
# useful when the row/dimension correlations need a detailed look.
# description16[[2]]
Here is the asymmetric plot, which allows us to see the association between the row and column points (blue and red, respectively).
# Asymmetric biplot (row-principal map), with the rows selected via contrib = 15.
fviz_ca_biplot(
  CA_legis,
  select.row = list(contrib = 15),
  map = "rowprincipal",
  arrow = c(TRUE, TRUE),
  repel = TRUE
)
# Save the scree plot and the biplot of the 16-term CA to one PDF.
# Scree plot
scree.plot <- fviz_eig(CA_legis)
# Biplot of rows and columns
biplot.ca <- fviz_ca_biplot(CA_legis)
ggexport(
  plotlist = list(scree.plot, biplot.ca),
  filename = "CA16.pdf"
)
## file saved to CA16.pdf
Having seen the extent to which different terms contribute to the underlying dimensions, several of them can be excluded. Relying on the output for the first two dimensions, I have narrowed the terms for the next step CA to 11. The chunks below run the CA with those 11 terms (TERMS INCLUDED: HARMONIZATION, AMENDING, BORDER, ESTABLISH, REPEAL, FRAMEWORK, PROVISIONS, RECOGNITION, FUND, LAYING DOWN & a category where no term from the list was found). NOTA BENE: in each analysis, the p-value of the chi-square test of independence (p = 1) calls for a very cautious approach to the results!
# Work from the project folder and load the 11-term data set.
setwd("C:/Users/nasta/Dropbox/____Nordface_POst_doc/CA")
cat11 <- read.csv(file = "CA_11.csv", stringsAsFactors = FALSE)
head(cat11)
## procedure_ref1 key value
## 1 2007/0112(COD) f_amend 1
## 2 2008/0028(COD) f_amend 1
## 3 2008/0147(COD) f_amend 1
## 4 2008/0157(COD) f_amend 1
## 5 2008/0183(COD) f_amend 1
## 6 2008/0196(COD) f_amend 1
# Rename the procedure reference column to "id".
colnames(cat11)[colnames(cat11) == "procedure_ref1"] <- "id"
# Cross-tabulate procedures (rows) against the 11 terms (columns).
cat11_tb <- table(cat11[["id"]], cat11[["key"]])
summary(cat11_tb)
## Number of cases in table: 1845
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 5719, df = 8380, p-value = 1
## Chi-squared approximation may be incorrect
# Keep only term columns that occur at least once. The previous test (`>= 0`)
# is true for every column of a count table, so it never removed anything.
# `drop = FALSE` keeps the two-way layout even if a single column remains.
cat11_tb <- cat11_tb[, colSums(cat11_tb) > 0, drop = FALSE]
# Run the CA on the 11-term table and print its summary.
CA11 <- CA(cat11_tb)
summary(CA11)
##
## Call:
## CA(X = cat11_tb)
##
## The chi square of independence between the two variables is equal to 5718.815 (p-value = 1 ).
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 0.411 0.391 0.356 0.350 0.321 0.308 0.299
## % of var. 13.257 12.609 11.490 11.291 10.351 9.928 9.637
## Cumulative % of var. 13.257 25.866 37.356 48.646 58.997 68.926 78.563
## Dim.8 Dim.9 Dim.10
## Variance 0.270 0.238 0.156
## % of var. 8.708 7.690 5.039
## Cumulative % of var. 87.271 94.961 100.000
##
## Rows (the 10 first)
## Iner*1000 Dim.1 ctr cos2 Dim.2 ctr cos2
## 2005/0214(COD) | 0.648 | 0.056 0.000 0.003 | 0.052 0.000 0.002 |
## 2006/0084(COD) | 2.024 | 0.256 0.017 0.035 | 0.315 0.027 0.053 |
## 2006/0167(COD) | 2.024 | 0.256 0.017 0.035 | 0.315 0.027 0.053 |
## 2007/0112(COD) | 0.725 | -0.117 0.004 0.020 | -0.070 0.001 0.007 |
## 2007/0152(COD) | 0.648 | 0.056 0.000 0.003 | 0.052 0.000 0.002 |
## 2007/0229(COD) | 0.648 | 0.056 0.000 0.003 | 0.052 0.000 0.002 |
## 2007/0286(COD) | 0.648 | 0.056 0.000 0.003 | 0.052 0.000 0.002 |
## 2008/0009(COD) | 0.648 | 0.056 0.000 0.003 | 0.052 0.000 0.002 |
## 2008/0028(COD) | 5.807 | -0.059 0.002 0.001 | -0.216 0.026 0.017 |
## 2008/0062(COD) | 11.139 | -0.457 0.055 0.020 | 0.456 0.058 0.020 |
## Dim.3 ctr cos2
## 2005/0214(COD) -0.034 0.000 0.001 |
## 2006/0084(COD) -0.532 0.086 0.152 |
## 2006/0167(COD) -0.532 0.086 0.152 |
## 2007/0112(COD) 0.175 0.009 0.046 |
## 2007/0152(COD) -0.034 0.000 0.001 |
## 2007/0229(COD) -0.034 0.000 0.001 |
## 2007/0286(COD) -0.034 0.000 0.001 |
## 2008/0009(COD) -0.034 0.000 0.001 |
## 2008/0028(COD) 0.599 0.218 0.134 |
## 2008/0062(COD) -0.314 0.030 0.010 |
##
## Columns (the 10 first)
## Iner*1000 Dim.1 ctr cos2 Dim.2 ctr
## f_amend | 183.108 | -0.186 1.873 0.042 | -0.120 0.816
## f_border | 322.818 | -0.622 2.194 0.028 | 0.538 1.729
## f_establish | 266.080 | -0.394 2.881 0.044 | -0.078 0.118
## f_framework | 300.046 | -0.575 1.703 0.023 | -0.518 1.450
## f_fund | 294.114 | -0.458 1.438 0.020 | -0.643 2.981
## f_harmon | 394.993 | 5.722 86.374 0.899 | -0.822 1.875
## f_lay_down | 304.260 | 0.200 0.244 0.003 | -0.714 3.250
## f_noterm | 95.977 | 0.036 0.143 0.006 | 0.032 0.121
## f_provi | 283.504 | -0.295 0.495 0.007 | -0.815 3.956
## f_recogn | 377.913 | 0.557 0.409 0.004 | 7.600 80.101
## cos2 Dim.3 ctr cos2
## f_amend 0.017 | 0.230 3.309 0.064 |
## f_border 0.021 | -0.354 0.819 0.009 |
## f_establish 0.002 | -0.995 21.246 0.284 |
## f_framework 0.019 | -0.897 4.772 0.057 |
## f_fund 0.040 | 1.465 16.974 0.206 |
## f_harmon 0.019 | 0.022 0.001 0.000 |
## f_lay_down 0.042 | 1.198 10.041 0.118 |
## f_noterm 0.005 | -0.020 0.054 0.002 |
## f_provi 0.055 | 1.835 22.026 0.277 |
## f_recogn 0.828 | 2.475 9.324 0.088 |
# Quick biplot of the 11-term CA with default settings.
fviz_ca_biplot(X = CA11)
Assessing the results by checking the eigenvalues and visualizing the results
get_eigenvalue(CA11) ## the % of variance explained is still very low but slightly improving
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 0.4109142 13.256884 13.25688
## Dim.2 0.3908360 12.609121 25.86601
## Dim.3 0.3561322 11.489510 37.35552
## Dim.4 0.3499675 11.290626 48.64614
## Dim.5 0.3208491 10.351212 58.99735
## Dim.6 0.3077451 9.928451 68.92580
## Dim.7 0.2987092 9.636934 78.56274
## Dim.8 0.2699243 8.708280 87.27102
## Dim.9 0.2383475 7.689550 94.96057
## Dim.10 0.1562037 5.039432 100.00000
# Scree plot of the 11-term CA, with the y axis limited to 0-35.
fviz_eig(
  CA11,
  addlabels = TRUE,
  ylim = c(0, 35)
)
# Another option: biplot on the row-principal map.
fviz_ca_biplot(
  CA11,
  map = "rowprincipal",
  arrow = c(FALSE, TRUE),
  repel = FALSE
)
Next, to assess the result of the CA, I check the degree to which a) each term is represented by the constructed dimensions (see cos2 for cols), and b) the row points/procedures are well represented by the dimensions.
# Pull out the column (term) results of the 11-term CA and list its components.
col <- get_ca_col(CA11)
print(col)
## Correspondence Analysis - Results for columns
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the columns"
## 2 "$cos2" "Cos2 for the columns"
## 3 "$contrib" "contributions of the columns"
## 4 "$inertia" "Inertia of the columns"
# Term (column) map, coloured by cos2.
fviz_ca_col(
  CA11,
  col.col = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
)
# cos2 of the terms on dimensions 1-2: which terms are well represented on
# the factor map.
fviz_cos2(CA11, choice = "col", axes = 1:2)
I will use the extent of the terms’ contributions to the dimensions to narrow down the number of the terms used for the analysis
# Contribution of every term to each dimension of the 11-term CA.
CA11$col$contrib # some terms still contribute almost nothing to the dimensions (e.g. f_noterm).
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## f_amend 1.8731039 0.8161941 3.308994385 5.9339082 0.3058423
## f_border 2.1935793 1.7286291 0.819499185 53.9382935 0.0323442
## f_establish 2.8808684 0.1184409 21.246005190 0.5776545 3.9382532
## f_framework 1.7028591 1.4500988 4.772204733 6.0306690 51.9905482
## f_fund 1.4384451 2.9806662 16.974431142 1.0356322 5.3846572
## f_harmon 86.3735417 1.8748396 0.001460197 3.0297824 3.1789169
## f_lay_down 0.2437805 3.2499131 10.040644541 8.6571414 2.0877030
## f_noterm 0.1434581 0.1211063 0.053681823 0.3218049 0.5483614
## f_provi 0.4946088 3.9560617 22.026127692 0.8995863 0.6146170
## f_recogn 0.4090857 80.1008766 9.323607941 1.0729918 6.3520629
## f_repeal 2.2466693 3.6031735 11.433343171 18.5025359 25.5666938
# Contributions of the terms (columns) to dimensions 1-2.
fviz_contrib(
  CA11,
  choice = "col",
  axes = 1:2
)
# Describe the dimensions and show the column entries for dimension 1.
cal <- dimdesc(CA11, axes = c(1, 2))
head(cal[[1]][["col"]], n = 15)
## coord
## f_border -0.62189375
## f_framework -0.57534827
## f_fund -0.45795070
## f_establish -0.39357355
## f_provi -0.29530476
## f_amend -0.18565476
## f_noterm 0.03598294
## f_lay_down 0.20044459
## f_repeal 0.29256124
## f_recogn 0.55690472
## f_harmon 5.72201668
Next, to assess the result of the CA for 11 terms, I check the contributions and quality of representation for rows/procedures.
# Rows contributing most to Dim.1 and Dim.2 matter most for explaining the
# variability in the data set. Rows that contribute little to any dimension,
# or only to the last dimensions, are less important.
head(CA11[["row"]][["contrib"]])
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## 2005/0214(COD) 0.0004156182 0.0003688867 0.0001794472 0.001094676 2.034636e-03
## 2006/0084(COD) 0.0173244285 0.0274820766 0.0861340829 0.116331097 2.297728e-01
## 2006/0167(COD) 0.0173244285 0.0274820766 0.0861340829 0.116331097 2.297728e-01
## 2007/0112(COD) 0.0035954350 0.0013514177 0.0093540949 0.027836503 4.480764e-06
## 2007/0152(COD) 0.0004156182 0.0003688867 0.0001794472 0.001094676 2.034636e-03
## 2007/0229(COD) 0.0004156182 0.0003688867 0.0001794472 0.001094676 2.034636e-03
# Last rows of the row (procedure) contributions for the 11-term CA.
tail(CA11[["row"]][["contrib"]])
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## 2020/0069(COD) 0.0035954350 0.0013514177 0.0093540949 2.783650e-02 4.480764e-06
## 2020/0071(COD) 0.0035954350 0.0013514177 0.0093540949 2.783650e-02 4.480764e-06
## 2020/0075(COD) 0.0395046146 0.0630612602 0.3990701383 2.258574e-06 1.064495e-01
## 2020/0099(COD) 0.0089715490 0.0824020574 0.2960544834 2.451650e-01 5.477868e-02
## 2020/0113(COD) 0.0211862658 0.0961823020 0.5950382014 2.434247e-05 1.527379e-02
## 2020/0128(COD) 0.0004156182 0.0003688867 0.0001794472 1.094676e-03 2.034636e-03
# Pull out the row (procedure) results of the 11-term CA and list its components.
row1 <- get_ca_row(CA11)
print(row1)
## Correspondence Analysis - Results for rows
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the rows"
## 2 "$cos2" "Cos2 for the rows"
## 3 "$contrib" "contributions of the rows"
## 4 "$inertia" "Inertia of the rows"
# Procedure (row) map, coloured by cos2.
fviz_ca_row(
  CA11,
  col.row = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = FALSE
)
# A cos2 bar chart for rows is disabled because it is very messy:
# fviz_cos2(CA11, choice = "row", axes = 1:2)
# Procedure (row) map, coloured by contribution.
fviz_ca_row(
  CA11,
  col.row = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = FALSE
)
I will check the extent to which specific rows(aka procedures) contribute to the dimensions
# First 20 row (procedure) contributions for the 11-term CA. This call used to
# inspect CA_legis (the 16-term model), a copy-paste slip in the 11-term section.
# NOTE: the printed output that follows was produced from CA_legis and has to
# be regenerated by re-knitting.
head(CA11$row$contrib, 20) # this can be plotted as well, but with the large N of observations the plot becomes quite useless
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## 2005/0214(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2006/0084(COD) 1.546150e-02 0.0366062180 0.1293139035 0.0261299749 0.0002377769
## 2006/0167(COD) 1.546150e-02 0.0366062180 0.1293139035 0.0261299749 0.0002377769
## 2007/0112(COD) 7.630698e-02 0.0064638272 1.2860816119 6.3547295743 1.7763112965
## 2007/0152(COD) 7.729255e-02 0.0018050465 1.4905481913 8.9967021038 2.7258577293
## 2007/0229(COD) 1.002197e-02 0.0429731907 0.0282879208 0.2328972379 0.1436385361
## 2007/0286(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2008/0009(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2008/0028(COD) 9.148977e-07 0.0181807926 0.0723367788 0.2530990327 0.0410250212
## 2008/0062(COD) 2.471151e-02 0.0951088956 0.0226493495 0.3376554791 1.5647462727
## 2008/0098(COD) 3.151594e+00 0.0750629299 0.0007443627 0.0293371318 0.0675445211
## 2008/0142(COD) 2.471151e-02 0.0951088956 0.0226493495 0.3376554791 1.5647462727
## 2008/0147(COD) 1.476827e-03 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2008/0157(COD) 1.476827e-03 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2008/0183(COD) 1.476827e-03 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2008/0192(COD) 1.546150e-02 0.0366062180 0.1293139035 0.0261299749 0.0002377769
## 2008/0196(COD) 2.734045e-03 0.0122124322 0.0244619563 0.0036200442 0.0007760116
## 2008/0198(COD) 7.095760e-03 0.0777127196 0.0833212084 0.1298243770 0.2249664154
## 2008/0211(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2008/0222(COD) 9.570469e-04 0.0004396061 0.0003408325 0.0005073802 0.0005645901
# Last 20 row (procedure) contributions for the 11-term CA. This call used to
# inspect CA_legis (the 16-term model), a copy-paste slip in the 11-term section.
# NOTE: the printed output that follows was produced from CA_legis and has to
# be regenerated by re-knitting.
tail(CA11$row$contrib, 20)
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## 2019/0107(COD) 0.0807079270 0.0003489906 0.4241680831 0.0879182141 0.5728011067
## 2019/0108(COD) 0.0756150971 0.0036567348 0.3712795067 0.0725290211 0.6938013577
## 2019/0179(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2019/0180(COD) 0.0164410039 0.0467321077 0.2329680581 0.0982287089 0.1760547061
## 2019/0192(COD) 0.0009570469 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2020/0043(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0054(COD) 0.0164410039 0.0467321077 0.2329680581 0.0982287089 0.1760547061
## 2020/0058(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0059(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0060(COD) 0.0068163497 0.0898851250 0.3780226215 0.1900332606 0.0552289767
## 2020/0065(COD) 0.0009570469 0.0004396061 0.0003408325 0.0005073802 0.0005645901
## 2020/0066(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0067(COD) 0.0286839587 0.0025626424 0.0684419857 0.1954503890 0.0559736946
## 2020/0068(COD) 0.0047896300 0.0205604063 0.1221009772 0.0039793075 0.4262028159
## 2020/0069(COD) 0.0304703119 0.1173007972 0.0095166374 0.0117973120 0.5066053742
## 2020/0071(COD) 0.0014768271 0.0016937635 0.0327982178 0.0107923749 0.0012602017
## 2020/0075(COD) 0.0164410039 0.0467321077 0.2329680581 0.0982287089 0.1760547061
## 2020/0099(COD) 0.0070957596 0.0777127196 0.0833212084 0.1298243770 0.2249664154
## 2020/0113(COD) 0.0068163497 0.0898851250 0.3780226215 0.1900332606 0.0552289767
## 2020/0128(COD) 0.0009570469 0.0004396061 0.0003408325 0.0005073802 0.0005645901
Let’s take a look at the variables and observations that correlate most with the dimensions of interest; these will be used to obtain the procedure numbers for the candidate summaries.
# Describe dimensions 1 and 2 of the 11-term CA.
description_11 <- dimdesc(CA11, axes = c(1, 2))
# Dimension 1: which variables and which rows correlate with it the most.
# Running either line below prints a very long table.
# description_11[[1]]
# Dimension 2: same as above.
# description_11[[2]]
Save pdf of the bi plot and scree plot for the CA with 11 terms
# Save the scree plot and the biplot of the 11-term CA to one PDF.
# Scree plot
scree.plot <- fviz_eig(CA11)
# Biplot of rows and columns
biplot.ca <- fviz_ca_biplot(CA11)
ggexport(
  plotlist = list(scree.plot, biplot.ca),
  filename = "CA11.pdf"
)
## file saved to CA11.pdf
What is clear is that the quality of the analysis is not improving with the reduction of the terms used. However, we do seem to capture more variance with a more focused approach, in contrast to the approach of including all possible terms for the analysis. As the next step, I narrowed the analysis down even further. For the third iteration of the CA, I used the following terms: ‘HARMONIZE, BORDER, AMEND, ESTABLISH, REPEAL, RECOGNIZE, NOTERM’. NOTERM includes cases where none of the terms was identified in the title of the procedure.
# Work from the project folder and load the 7-term data set.
setwd("C:/Users/nasta/Dropbox/____Nordface_POst_doc/CA")
cat7 <- read.csv(file = "CA_7.csv", stringsAsFactors = FALSE)
head(cat7)
## procedure_ref1 key value
## 1 2007/0112(COD) f_amend 1
## 2 2008/0028(COD) f_amend 1
## 3 2008/0147(COD) f_amend 1
## 4 2008/0157(COD) f_amend 1
## 5 2008/0183(COD) f_amend 1
## 6 2008/0196(COD) f_amend 1
# Rename the procedure reference column to "id".
colnames(cat7)[colnames(cat7) == "procedure_ref1"] <- "id"
# Column-wise overview of the data before tabulating.
summary(cat7)
## id key value
## Length:1665 Length:1665 Min. :1.000
## Class :character Class :character 1st Qu.:1.000
## Mode :character Mode :character Median :2.000
## Mean :1.505
## 3rd Qu.:2.000
## Max. :2.000
# Number of records in the 7-term data set.
length(cat7[["id"]])
## [1] 1665
# Cross-tabulate procedures (rows) against the 7 terms (columns).
cat7_tb <- table(cat7[["id"]], cat7[["key"]])
summary(cat7_tb)
## Number of cases in table: 1665
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 3269, df = 5028, p-value = 1
## Chi-squared approximation may be incorrect
# Keep only term columns that occur at least once. The previous test (`>= 0`)
# is true for every column of a count table, so it never removed anything.
# `drop = FALSE` keeps the two-way layout even if a single column remains.
cat7_tb <- cat7_tb[, colSums(cat7_tb) > 0, drop = FALSE]
# Run the CA on the 7-term table and print its summary.
CA7 <- CA(cat7_tb)
summary(CA7) ## overall results are still quite questionable
##
## Call:
## CA(X = cat7_tb)
##
## The chi square of independence between the two variables is equal to 3268.551 (p-value = 1 ).
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6
## Variance 0.419 0.386 0.362 0.330 0.296 0.169
## % of var. 21.357 19.687 18.459 16.829 15.074 8.595
## Cumulative % of var. 21.357 41.044 59.503 76.332 91.405 100.000
##
## Rows (the 10 first)
## Iner*1000 Dim.1 ctr cos2 Dim.2 ctr cos2
## 2005/0214(COD) | 0.590 | 0.016 0.000 0.000 | -0.031 0.000 0.001 |
## 2006/0084(COD) | 1.907 | 0.234 0.016 0.034 | 0.376 0.044 0.089 |
## 2006/0167(COD) | 1.907 | 0.234 0.016 0.034 | 0.376 0.044 0.089 |
## 2007/0112(COD) | 0.608 | -0.164 0.008 0.053 | -0.216 0.015 0.092 |
## 2007/0152(COD) | 0.590 | 0.016 0.000 0.000 | -0.031 0.000 0.001 |
## 2007/0229(COD) | 0.590 | 0.016 0.000 0.000 | -0.031 0.000 0.001 |
## 2007/0286(COD) | 0.590 | 0.016 0.000 0.000 | -0.031 0.000 0.001 |
## 2008/0009(COD) | 0.590 | 0.016 0.000 0.000 | -0.031 0.000 0.001 |
## 2008/0028(COD) | 1.079 | 0.042 0.001 0.003 | 0.117 0.006 0.023 |
## 2008/0062(COD) | 11.022 | -0.580 0.096 0.037 | -0.120 0.004 0.002 |
## Dim.3 ctr cos2
## 2005/0214(COD) 0.047 0.000 0.002 |
## 2006/0084(COD) -0.960 0.305 0.580 |
## 2006/0167(COD) -0.960 0.305 0.580 |
## 2007/0112(COD) 0.360 0.043 0.256 |
## 2007/0152(COD) 0.047 0.000 0.002 |
## 2007/0229(COD) 0.047 0.000 0.002 |
## 2007/0286(COD) 0.047 0.000 0.002 |
## 2008/0009(COD) 0.047 0.000 0.002 |
## 2008/0028(COD) -0.416 0.086 0.289 |
## 2008/0062(COD) 1.502 0.748 0.246 |
##
## Columns
## Iner*1000 Dim.1 ctr cos2 Dim.2 ctr
## f_amend | 190.538 | -0.222 2.914 0.064 | -0.249 3.982
## f_border | 330.376 | -0.761 3.566 0.045 | -0.130 0.112
## f_establish | 281.982 | -0.376 2.862 0.043 | -0.248 1.345
## f_harmon | 404.655 | 5.548 88.199 0.914 | -0.592 1.088
## f_noterm | 89.444 | 0.010 0.012 0.001 | -0.019 0.050
## f_recogn | 377.327 | -0.038 0.002 0.000 | 7.442 86.072
## f_repeal | 288.772 | 0.293 2.444 0.035 | 0.488 7.352
## cos2 Dim.3 ctr cos2
## f_amend 0.081 | 0.405 11.173 0.212 |
## f_border 0.001 | 1.780 22.591 0.248 |
## f_establish 0.018 | -0.551 7.093 0.091 |
## f_harmon 0.010 | 1.222 4.951 0.044 |
## f_noterm 0.002 | 0.028 0.112 0.005 |
## f_recogn 0.882 | 2.177 7.856 0.075 |
## f_repeal 0.098 | -1.184 46.224 0.580 |
# Quick biplot of the 7-term CA with default settings.
fviz_ca_biplot(X = CA7)
Assessing the results by checking the eigenvalues and visualizing the results
get_eigenvalue(CA7) ## the % of variance explained per dimension has improved
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 0.4192591 21.357063 21.35706
## Dim.2 0.3864659 19.686577 41.04364
## Dim.3 0.3623666 18.458959 59.50260
## Dim.4 0.3303703 16.829068 76.33167
## Dim.5 0.2959134 15.073830 91.40550
## Dim.6 0.1687181 8.594503 100.00000
# Scree plot of the 7-term CA, with the y axis limited to 0-35.
fviz_eig(
  CA7,
  addlabels = TRUE,
  ylim = c(0, 35)
)
# Another option (disabled): biplot on the row-principal map.
# NOTE(review): the disabled call still names CA11 -- presumably CA7 is meant
# in this section; confirm before re-enabling.
#fviz_ca_biplot(CA11,
#               map ="rowprincipal", arrow = c(TRUE, TRUE),
#               repel = FALSE)
Next, I check the degree to which a) each term is represented by the constructed dimensions (see cos2 for cols), and b) the row points/procedures are well represented by the dimensions (see below).
# Pull out the column (term) results of the 7-term CA and list its components.
col <- get_ca_col(CA7)
print(col)
## Correspondence Analysis - Results for columns
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the columns"
## 2 "$cos2" "Cos2 for the columns"
## 3 "$contrib" "contributions of the columns"
## 4 "$inertia" "Inertia of the columns"
# Term (column) map, coloured by cos2.
fviz_ca_col(
  CA7,
  col.col = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE
)
# cos2 of the terms on dimensions 1-2: which terms are well represented on
# the factor map.
fviz_cos2(CA7, choice = "col", axes = 1:2)
I will use the extent of the terms’ contributions to the dimensions to narrow down the number of the terms used for the analysis
# Contribution of every term to each dimension of the 7-term CA.
CA7$col$contrib # in the next iteration I will remove the NOTERM category and possibly 'border'.
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## f_amend 2.914351202 3.9815918 11.1733731 15.7681485 0.3700016
## f_border 3.565923552 0.1121130 22.5912396 66.5874515 4.2839422
## f_establish 2.862029691 1.3446149 7.0926213 7.0266840 72.6125545
## f_harmon 88.198989749 1.0879369 4.9505010 0.9576765 2.7398172
## f_noterm 0.012271079 0.0496369 0.1120782 2.1385788 1.5974700
## f_recogn 0.002043895 86.0717688 7.8561384 0.8272372 4.4400094
## f_repeal 2.444390833 7.3523377 46.2240483 6.6942234 13.9562051
fviz_contrib(CA7, choice = "col", axes = 1:2)
#description of the dimensions
cal <- dimdesc(CA7, axes = c(1,2))
head(cal[[1]]$col, 10)
## coord
## f_border -0.76085196
## f_establish -0.37642277
## f_amend -0.22221364
## f_recogn -0.03777266
## f_noterm 0.01009834
## f_repeal 0.29282433
## f_harmon 5.54836703
Next to assess the result of the CA for 7 terms, I check contributions and quality of representation for rows/procedures.
##Rows that contribute the most to Dim.1 and Dim.2 are the most important in explaining the variability in the data set.
#Rows that do not contribute much to any dimension or that contribute to the last dimensions are less important.
head(CA7$row$contrib)
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## 2005/0214(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2006/0084(COD) 1.567669e-02 0.0440535174 0.3053273696 0.026752574 0.160741080
## 2006/0167(COD) 1.567669e-02 0.0440535174 0.3053273696 0.026752574 0.160741080
## 2007/0112(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2007/0152(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2007/0229(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
tail(CA7$row$contrib)
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## 2020/0069(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0071(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0075(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0099(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2020/0113(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0128(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
row1 <- get_ca_row(CA7)
row1
## Correspondence Analysis - Results for rows
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the rows"
## 2 "$cos2" "Cos2 for the rows"
## 3 "$contrib" "contributions of the rows"
## 4 "$inertia" "Inertia of the rows"
# Row (procedure) map for the 7-term CA, coloured by cos2, i.e. the quality of
# representation of each row on the plotted dimensions.
fviz_ca_row(CA7, col.row = "cos2",
            gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
            repel = FALSE, title = "COS2 plot")
#fviz_cos2(CA7, choice = "row", axes = 1:2) # shows which rows are well represented on the factor map. very messy for rows
# Same row map coloured by contribution. This section assesses CA7, so the
# plot has to be drawn from CA7 (it previously pointed at the 11-term model).
fviz_ca_row(CA7, col.row = "contrib",
            gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
            repel = FALSE, title = "contribution of the rows")
I will check the extent to which specific rows(aka procedures) contribute to the dimensions
head(CA7$row$contrib, 20) # this can be plotted as well, but with the large N of observation the plot becomes quite useless
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## 2005/0214(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2006/0084(COD) 1.567669e-02 0.0440535174 0.3053273696 0.026752574 0.160741080
## 2006/0167(COD) 1.567669e-02 0.0440535174 0.3053273696 0.026752574 0.160741080
## 2007/0112(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2007/0152(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2007/0229(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2007/0286(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2008/0009(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2008/0028(COD) 7.418973e-04 0.0064120725 0.0859784609 0.003963991 0.128996055
## 2008/0062(COD) 9.629098e-02 0.0044653371 0.7482148179 2.157446834 0.218066442
## 2008/0098(COD) 3.899444e+00 0.0020459497 0.0006762595 0.124856754 0.004286880
## 2008/0142(COD) 9.629098e-02 0.0044653371 0.7482148179 2.157446834 0.218066442
## 2008/0147(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2008/0157(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2008/0183(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2008/0192(COD) 1.567669e-02 0.0440535174 0.3053273696 0.026752574 0.160741080
## 2008/0196(COD) 7.418973e-04 0.0064120725 0.0859784609 0.003963991 0.128996055
## 2008/0198(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2008/0211(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2008/0222(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
tail(CA7$row$contrib, 20 )
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## 2019/0107(COD) 1.078192e-01 0.0212749566 0.7468603173 1.005572579 0.170643572
## 2019/0108(COD) 9.629098e-02 0.0044653371 0.7482148179 2.157446834 0.218066442
## 2019/0179(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2019/0180(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2019/0192(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2020/0043(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0054(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0058(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0059(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0060(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0065(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2020/0066(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0067(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0068(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2020/0069(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0071(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0075(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0099(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
## 2020/0113(COD) 7.686608e-03 0.0145349492 0.0428540245 0.091655162 0.009147165
## 2020/0128(COD) 3.484343e-05 0.0001529024 0.0003682083 0.007706282 0.006426712
Let’s take a look at the variables and observations that correlate most with the dimension of interest => these will be used to get the procedure numbers for the candidate summaries
# Dimension description for the 7-term CA, restricted to dimensions 1 and 2.
description7 <- dimdesc(CA7, axes = c(1, 2))
# Dimension 1: which variables and which rows correlate with it the most.
# Left commented out because printing it produces two very long tables.
# description7[[1]]
# Same as above, but for dimension 2.
# description7[[2]]
Save pdf of the biplot and scree plot for the CA with 7 terms
### Export the 7-term CA plots to a single PDF
# Scree plot of the eigenvalues
scree.plot <- fviz_eig(CA7)
# Biplot of row and column points
biplot.ca <- fviz_ca_biplot(CA7)
# Write both plots to CA7.pdf in the working directory
ggexport(
  plotlist = list(scree.plot, biplot.ca),
  filename = "CA7.pdf"
)
## file saved to CA7.pdf
Based on the contributions of the terms to the dimensions and the extent to which the terms are represented by them, I have narrowed the analysis to five key terms. In the new iteration, the following terms are included: HARMONIZE, AMEND, REPEAL, ESTABLISH, RECOGNIZE
Load the dataset for 5 key terms, prepare for the CA and run the CA.
# Load the 5-term dataset (columns: procedure_ref1, key, value); strings stay
# as character rather than being converted to factors.
cat5 <- read.csv("CA_5.csv", stringsAsFactors = FALSE)
head(cat5)
## procedure_ref1 key value
## 1 2007/0112(COD) f_amend 1
## 2 2008/0028(COD) f_amend 1
## 3 2008/0147(COD) f_amend 1
## 4 2008/0157(COD) f_amend 1
## 5 2008/0183(COD) f_amend 1
## 6 2008/0196(COD) f_amend 1
# Rename the procedure reference column to "id"
names(cat5)[names(cat5) == "procedure_ref1"] <- "id"
# Contingency table for the CA: procedure ids in rows, terms (key) in columns
cat5_tb <- table(cat5[["id"]], cat5[["key"]])
summary(cat5_tb)
## Number of cases in table: 782
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 2079.1, df = 2416, p-value = 1
## Chi-squared approximation may be incorrect
# Column filter on the term totals.
# NOTE(review): counts are never negative, so `>= 0` keeps every column; if the
# intent is to drop terms with no occurrences this should be `> 0` - confirm.
cat5_tb <- cat5_tb[, colSums(cat5_tb) >= 0]
# Run the correspondence analysis on the 5-term table and print its summary
CA5 <- CA(cat5_tb)
summary(CA5)
##
## Call:
## CA(X = cat5_tb)
##
## The chi square of independence between the two variables is equal to 2079.131 (p-value = 0.9999998 ).
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4
## Variance 0.759 0.701 0.662 0.537
## % of var. 28.549 26.353 24.883 20.215
## Cumulative % of var. 28.549 54.902 79.785 100.000
##
## Rows (the 10 first)
## Iner*1000 Dim.1 ctr cos2 Dim.2 ctr cos2
## 2006/0084(COD) | 3.746 | 0.302 0.015 0.031 | 1.022 0.191 0.357 |
## 2006/0167(COD) | 3.746 | 0.302 0.015 0.031 | 1.022 0.191 0.357 |
## 2007/0112(COD) | 1.148 | -0.334 0.019 0.124 | -0.580 0.061 0.375 |
## 2008/0028(COD) | 1.169 | -0.016 0.000 0.001 | 0.221 0.018 0.107 |
## 2008/0098(COD) | 24.955 | 3.119 3.278 0.997 | -0.126 0.006 0.002 |
## 2008/0147(COD) | 1.148 | -0.334 0.019 0.124 | -0.580 0.061 0.375 |
## 2008/0157(COD) | 1.148 | -0.334 0.019 0.124 | -0.580 0.061 0.375 |
## 2008/0183(COD) | 1.148 | -0.334 0.019 0.124 | -0.580 0.061 0.375 |
## 2008/0192(COD) | 3.746 | 0.302 0.015 0.031 | 1.022 0.191 0.357 |
## 2008/0196(COD) | 1.169 | -0.016 0.000 0.001 | 0.221 0.018 0.107 |
## Dim.3 ctr cos2
## 2006/0084(COD) -1.102 0.235 0.414 |
## 2006/0167(COD) -1.102 0.235 0.414 |
## 2007/0112(COD) 0.547 0.058 0.333 |
## 2008/0028(COD) -0.277 0.030 0.169 |
## 2008/0098(COD) -0.098 0.004 0.001 |
## 2008/0147(COD) 0.547 0.058 0.333 |
## 2008/0157(COD) 0.547 0.058 0.333 |
## 2008/0183(COD) 0.547 0.058 0.333 |
## 2008/0192(COD) -1.102 0.235 0.414 |
## 2008/0196(COD) -0.277 0.030 0.169 |
##
## Columns
## Iner*1000 Dim.1 ctr cos2 Dim.2 ctr
## f_amend | 315.783 | -0.291 5.869 0.141 | -0.486 17.735
## f_establish | 452.081 | -0.265 1.664 0.028 | -0.014 0.005
## f_harmon | 732.758 | 5.171 90.113 0.933 | -1.066 4.148
## f_recogn | 670.546 | 0.126 0.027 0.000 | 5.313 51.522
## f_repeal | 487.568 | 0.263 2.327 0.036 | 0.856 26.589
## cos2 Dim.3 ctr cos2
## f_amend 0.394 | 0.445 15.742 0.330 |
## f_establish 0.000 | -0.481 6.309 0.092 |
## f_harmon 0.040 | 0.737 2.099 0.019 |
## f_recogn 0.538 | 4.823 44.969 0.444 |
## f_repeal 0.382 | -0.896 30.881 0.419 |
fviz_ca_biplot(CA5, axes = c(1, 2)) # quick biplot
Assessing the results by checking the eigen value and visualizing the results
get_eigenvalue(CA5) ## still % of variance explained is higher, but again, the p/chi2 ask for caution.
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 0.7590348 28.54871 28.54871
## Dim.2 0.7006696 26.35349 54.90220
## Dim.3 0.6615669 24.88277 79.78497
## Dim.4 0.5374642 20.21503 100.00000
fviz_eig(CA5, addlabels = TRUE, ylim = c(0, 35))
##another option:
fviz_ca_biplot(CA5,
map ="rowprincipal", arrow = c(FALSE, TRUE),
repel = FALSE)
Next, to assess the result of the CA analysis, I check a) the degree to which each term is represented by the constructed dimensions (see cos2 for cols), and b) which row points/procedures are well represented by the dimensions
col <- get_ca_col(CA5)
col
## Correspondence Analysis - Results for columns
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the columns"
## 2 "$cos2" "Cos2 for the columns"
## 3 "$contrib" "contributions of the columns"
## 4 "$inertia" "Inertia of the columns"
fviz_ca_col(CA5, col.col = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)
fviz_cos2(CA5, choice = "col", axes = 1:3) # shows which terms are well represented on the factor map.
I will use the extent of the terms’ contributions to the dimensions to narrow down the number of the terms used for the analysis
CA5$col$contrib # the degree to which the terms contribute to the dimensions.
## Dim 1 Dim 2 Dim 3 Dim 4
## f_amend 5.86879158 17.735372181 15.741645 7.968769
## f_establish 1.66392833 0.005128878 6.309245 73.991007
## f_harmon 90.11343601 4.148057838 2.098956 1.082006
## f_recogn 0.02670653 51.522208701 44.969345 2.202968
## f_repeal 2.32713755 26.589232402 30.880810 14.755250
fviz_contrib(CA5, choice = "col", axes = 1:3)
#description of the dimensions
cal <- dimdesc(CA5, axes = c(1,2))
head(cal[[1]]$col, 15)
## coord
## f_amend -0.2907769
## f_establish -0.2646622
## f_recogn 0.1259050
## f_repeal 0.2634624
## f_harmon 5.1714698
Next to assess the result of the CA for 5 terms, I check contributions and quality of representation for rows/procedures.
##Rows that contribute the most to Dim.1 and Dim.2 are the most important in explaining the variability in the data set.
head(CA5$row$contrib)
## Dim 1 Dim 2 Dim 3 Dim 4
## 2006/0084(COD) 1.540662e-02 0.190695073 0.23456426 0.13795706
## 2006/0167(COD) 1.540662e-02 0.190695073 0.23456426 0.13795706
## 2007/0112(COD) 1.876678e-02 0.061436977 0.05775361 0.03598690
## 2008/0028(COD) 8.279923e-05 0.017826805 0.02976764 0.15743224
## 2008/0098(COD) 3.278145e+00 0.005765073 0.00370041 0.00146658
## 2008/0147(COD) 1.876678e-02 0.061436977 0.05775361 0.03598690
tail(CA5$row$contrib)
## Dim 1 Dim 2 Dim 3 Dim 4
## 2020/0066(COD) 0.01876678 0.06143698 0.05775361 0.0359869
## 2020/0067(COD) 0.01876678 0.06143698 0.05775361 0.0359869
## 2020/0069(COD) 0.01876678 0.06143698 0.05775361 0.0359869
## 2020/0071(COD) 0.01876678 0.06143698 0.05775361 0.0359869
## 2020/0075(COD) 0.01876678 0.06143698 0.05775361 0.0359869
## 2020/0113(COD) 0.01876678 0.06143698 0.05775361 0.0359869
row1 <- get_ca_row(CA5)
row1
## Correspondence Analysis - Results for rows
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the rows"
## 2 "$cos2" "Cos2 for the rows"
## 3 "$contrib" "contributions of the rows"
## 4 "$inertia" "Inertia of the rows"
fviz_ca_row(CA5, col.row = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel =FALSE)
fviz_ca_row(CA5, col.row = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = FALSE)
I will check the extent to which specific rows(aka procedures) contribute to the dimensions
head(CA5$row$contrib, 20) # this can be plotted as well, but with the large N of observation the plot becomes quite useless
## Dim 1 Dim 2 Dim 3 Dim 4
## 2006/0084(COD) 1.540662e-02 1.906951e-01 0.2345642571 0.13795706
## 2006/0167(COD) 1.540662e-02 1.906951e-01 0.2345642571 0.13795706
## 2007/0112(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0028(COD) 8.279923e-05 1.782680e-02 0.0297676432 0.15743224
## 2008/0098(COD) 3.278145e+00 5.765073e-03 0.0037004101 0.00146658
## 2008/0147(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0157(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0183(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0192(COD) 1.540662e-02 1.906951e-01 0.2345642571 0.13795706
## 2008/0196(COD) 8.279923e-05 1.782680e-02 0.0297676432 0.15743224
## 2008/0227(COD) 1.540662e-02 1.906951e-01 0.2345642571 0.13795706
## 2008/0237(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0242(COD) 3.423836e-02 3.253036e-02 0.0001950587 0.31872726
## 2008/0243(COD) 1.554726e-02 5.191466e-05 0.0676370273 0.97636064
## 2008/0246(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0249(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0257(COD) 3.423836e-02 3.253036e-02 0.0001950587 0.31872726
## 2008/0260(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2008/0261(COD) 1.876678e-02 6.143698e-02 0.0577536087 0.03598690
## 2009/0005(COD) 1.540662e-02 1.906951e-01 0.2345642571 0.13795706
tail(CA5$row$contrib, 20 )
## Dim 1 Dim 2 Dim 3 Dim 4
## 2018/0385(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2018/0390(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2018/0900(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2019/0009(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2019/0010(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2019/0019(COD) 0.01554726 5.191466e-05 0.06763703 0.9763606
## 2019/0107(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2019/0179(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2019/0180(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0043(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0054(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0058(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0059(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0060(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0066(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0067(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0069(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0071(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0075(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
## 2020/0113(COD) 0.01876678 6.143698e-02 0.05775361 0.0359869
Let’s take a look at the variables and observations that correlate most with the dimension of interest => these will be used to get the procedure numbers for the candidate summaries
# Dimension description for the 5-term CA, restricted to dimensions 1 and 2.
description_5 <- dimdesc(CA5, axes = c(1, 2))
# Dimension 1: which variables and which rows correlate with it the most.
# Left commented out because printing it produces two very long tables.
#description_5[[1]]
# Same as above, but for dimension 2.
#description_5[[2]]
Save pdf of the biplot and scree plot for the CA with 5 terms
### Export the 5-term CA plots to a single PDF
# Scree plot of the eigenvalues
scree.plot <- fviz_eig(CA5)
# Biplot of row and column points
biplot.ca <- fviz_ca_biplot(CA5)
# Write both plots to CA5.pdf in the working directory
ggexport(
  plotlist = list(scree.plot, biplot.ca),
  filename = "CA5.pdf"
)
## file saved to CA5.pdf
# CA16 / CA11 / CA7 / CA5: save the relevant row indicators, sort them, then
# print min/max.
# Relevant indicator: cos2 (plus the correlation results from the analysis above).
# All four *_cos objects hold the row cos2 matrix of their model; the 16-term
# one previously stored row contributions ($contrib), which did not match its
# name or the three objects below.
ca16_cos <- CA_legis$row$cos2
ca11_cos <- CA11$row$cos2
ca7_cos <- CA7$row$cos2
ca5_cos <- CA5$row$cos2
fviz_contrib(CA5, choice = "row", axes = 1, top = 20, sort.val = 'asc')## these are 20 candidate summaries for the +1
# -1 candidates are from the ordered double list