Plotting the patterns of parsed files

From here on, I plot descriptive statistics for the collected data.

# plot patterns of parsing across CAP policy areas

# Load the workspace image; it presumably provides `trilogue_core_dt`, which
# the rest of this script reads (verify against the .RData contents).
load("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/Trilogues_dataset/Main_df_Trilogues_17012022.RData")

# Attach to every procedure the size (`n`) of its cap / capcode / parsing_type
# group, then drop the grouping again.
parsed_cap <- trilogue_core_dt %>%
  select(cod, cap, capcode, parsing_type) %>%
  group_by(cap, capcode, parsing_type) %>%
  add_count() %>%
  ungroup()
## Adding missing grouping variables: `celex`
table(parsed_cap$parsing_type)
## 
##   1   2   3 999 
## 178 227   2  45

# Store the parsing type as text so it acts as a discrete fill in the plot,
# discard procedures without a parsing type, and keep one row per
# cap / parsing_type / n combination.
parsed_cap <- parsed_cap %>%
  mutate(parsing_type = as.character(parsing_type)) %>%
  filter(!is.na(parsing_type)) %>%
  select(cap, parsing_type, n) %>%
  distinct()


# Horizontal stacked bar chart: number of procedures per CAP policy area,
# split by parsing method.
distribution <- ggplot(parsed_cap, aes(x = cap, y = n, fill = parsing_type)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  # Labels are keyed by parsing_type code instead of being given positionally,
  # so a code that happens to be absent from the data cannot shift the
  # remaining labels onto the wrong category.
  scale_fill_discrete(
    "Parsing methods",
    labels = c(
      "1" = "Manually",
      "2" = "Automatically",
      "3" = "Combined methods",
      "999" = "Not Parsed"
    )
  ) +
  coord_flip()
distribution

# Pass the plot explicitly rather than relying on last_plot(), so the saved
# file is this chart regardless of what was displayed most recently.
ggsave("Pattern_table.png", plot = distribution)
## Saving 7 x 5 in image

Plot the patterns of the files across the whole dataset

# Reload the workspace image so this section can be run on its own.
load("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/Trilogues_dataset/Main_df_Trilogues_17012022.RData")

# Use all procedures this time: rows with a missing parsing_type are kept and
# recoded to 4, so that they show up as their own category in the plot.

all_files <- trilogue_core_dt %>%
  select(cod, celex, cap, capcode, committee, parsing_type)

table(all_files$parsing_type)
## 
##   1   2   3 999 
## 178 227   2  45

# Recode NA -> 4, count procedures per cap / capcode / parsing_type, and keep
# one row per cap / parsing_type / n combination.
all_files <- all_files %>%
  mutate(parsing_type = ifelse(is.na(parsing_type), 4, parsing_type)) %>%
  group_by(cap, capcode, parsing_type) %>%
  add_count() %>%
  ungroup() %>%
  select(cap, parsing_type, n) %>%
  distinct()

table(all_files$parsing_type)
## 
##   1   2   3   4 999 
##  20  21   2  20  12

# Text codes make the fill aesthetic discrete.
all_files$parsing_type <- as.character(all_files$parsing_type)

# Horizontal stacked bar chart over all procedures: counts per CAP policy
# area, split by parsing method, with code 4 marking procedures that had no
# parsing_type recorded.
all_procedures <- ggplot(all_files, aes(x = cap, y = n, fill = parsing_type)) +
  geom_bar(stat = "identity", alpha = 0.7) +
  theme_minimal() +
  # Labels are keyed by parsing_type code so that a missing category cannot
  # shift the remaining labels.
  scale_fill_discrete(
    "Parsing methods",
    labels = c(
      "1" = "Manually",
      "2" = "Automatically",
      "3" = "Combined methods",
      "4" = "Tables are not collected",
      "999" = "Not Parsed"
    )
  ) +
  # A single coord_flip(): adding it twice only triggers ggplot2's
  # "Coordinate system already present" message.
  coord_flip()
all_procedures

Plots by committee

# Same summary as for the policy areas, but counted per committee.
committees <- trilogue_core_dt %>%
  select(cod, celex, cap, capcode, committee, parsing_type)

table(committees$parsing_type)
## 
##   1   2   3 999 
## 178 227   2  45

# Recode a missing parsing_type to 4, count procedures per committee and
# parsing_type, and keep one row per committee / parsing_type / n combination.
committees <- committees %>%
  mutate(parsing_type = ifelse(is.na(parsing_type), 4, parsing_type)) %>%
  group_by(committee, parsing_type) %>%
  add_count() %>%
  ungroup() %>%
  select(committee, parsing_type, n) %>%
  distinct()

table(committees$parsing_type)
## 
##   1   2   3   4 999 
##  14  20   2  20  11

# Text codes make the fill aesthetic discrete.
committees$parsing_type <- as.character(committees$parsing_type)

# Horizontal stacked bar chart: number of procedures per committee, split by
# parsing method, with code 4 marking procedures that had no parsing_type
# recorded.
proc_by_com <- ggplot(committees, aes(x = committee, y = n, fill = parsing_type)) +
  geom_bar(stat = "identity", alpha = 0.7) +
  theme_minimal() +
  # Labels are keyed by parsing_type code so that a missing category cannot
  # shift the remaining labels.
  scale_fill_discrete(
    "Parsing methods",
    labels = c(
      "1" = "Manually",
      "2" = "Automatically",
      "3" = "Combined methods",
      "4" = "Tables are not collected",
      "999" = "Not Parsed"
    )
  ) +
  # A single coord_flip(): adding it twice only triggers ggplot2's
  # "Coordinate system already present" message.
  coord_flip()
proc_by_com

Plot by CAP policy area and year: parsed procedures by year and policy area

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(viridis)
## Loading required package: viridisLite
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
## Interactive version
# Bubble chart: one bubble per CAP policy area and document year, sized by the
# number of parsed procedures (parsing_type 1-3) in that cell.
p <- trilogue_core_dt %>%
  select(cod, celex, cap, capcode, committee, parsing_type, documentyear) %>%
  # Keep parsed procedures only; the comparison also drops NA parsing types.
  filter(parsing_type <= 3) %>%
  group_by(cap, documentyear) %>%
  add_count() %>%
  ungroup() %>%
  # parsing_type is not mapped to any aesthetic. Carrying it through distinct()
  # would leave several identical rows per cap / year and draw overlapping,
  # darker bubbles (and duplicate tooltips), so it is dropped here.
  select(cap, n, documentyear) %>%
  distinct() %>%
  # Draw the largest bubbles first so that smaller ones end up on top of them.
  arrange(desc(n)) %>%
  mutate(documentyear = factor(documentyear)) %>%
  # Text shown in the plotly tooltip.
  mutate(text = paste0("N of procedures: ", n)) %>%
  ggplot(aes(x = documentyear, y = cap, size = n, color = cap, text = text)) +
  geom_point(alpha = 0.7) +
  # Bubble size encodes the count n, not the parsing type.
  scale_size(range = c(1.4, 19), name = "Number of procedures") +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_color_viridis(discrete = TRUE, guide = "none") +
  theme_ipsum() +
  theme(legend.position = "none") +
  labs(title = "Parsed tables")

# Turn the ggplot into an interactive plotly widget.
pp <- ggplotly(p, tooltip = "text", width = 1500, height = 800)
pp

Parsing by committee: patterns of the parsed files only.

# Bubble chart: one bubble per committee and document year, sized by the
# number of parsed procedures (parsing_type 1-3) in that cell.
p <- trilogue_core_dt %>%
  select(cod, celex, cap, capcode, committee, parsing_type, documentyear) %>%
  # Keep parsed procedures only; the comparison also drops NA parsing types.
  filter(parsing_type <= 3) %>%
  group_by(committee, documentyear) %>%
  add_count() %>%
  ungroup() %>%
  # parsing_type is not mapped to any aesthetic. Carrying it through distinct()
  # would leave several identical rows per committee / year and draw
  # overlapping, darker bubbles (and duplicate tooltips), so it is dropped.
  select(committee, n, documentyear) %>%
  distinct() %>%
  # Draw the largest bubbles first so that smaller ones end up on top of them.
  arrange(desc(n)) %>%
  mutate(documentyear = factor(documentyear)) %>%
  # Text shown in the plotly tooltip.
  mutate(text = paste0("N of procedures: ", n)) %>%
  ggplot(aes(
    x = documentyear, y = committee, size = n, color = committee, text = text
  )) +
  geom_point(alpha = 0.7) +
  # Bubble size encodes the count n, not the parsing type.
  scale_size(range = c(1.4, 19), name = "Number of procedures") +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_color_viridis(discrete = TRUE, guide = "none") +
  theme_ipsum() +
  theme(plot.margin = margin(.7, .7, .7, .7, "cm")) +
  theme(legend.position = "none") +
  labs(title = "Parsed tables")

# Turn the ggplot into an interactive plotly widget.
pp <- ggplotly(p, tooltip = "text", width = 1500, height = 800)
pp