From here on, I plot descriptive statistics for the collected data.
# Plot patterns of parsing across CAP policy areas ----
# Load the main trilogues dataset; the code below expects it to define
# `trilogue_core_dt`.
load("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/Trilogues_dataset/Main_df_Trilogues_17012022.RData")

# Attach to every row the number of rows that share its CAP area, CAP code and
# parsing type. The existing grouping is dropped before counting so that the
# count is taken over exactly these three variables.
parsed_cap <- trilogue_core_dt %>%
  select(cod, cap, capcode, parsing_type) %>%
  ungroup() %>%
  add_count(cap, capcode, parsing_type)
## Adding missing grouping variables: `celex`

# Frequency of each parsing-type code.
table(parsed_cap$parsing_type)
##
## 1 2 3 999
## 178 227 2 45
# Stacked bar chart: number of procedures per CAP area, split by parsing type.
# Rows without a parsing type are dropped, and only one row per
# (cap, parsing_type, n) combination is kept for plotting.
parsed_cap <- parsed_cap %>%
  mutate(parsing_type = as.character(parsing_type)) %>%
  filter(!is.na(parsing_type)) %>%
  select(cap, parsing_type, n) %>%
  distinct()

distribution <- ggplot(parsed_cap, aes(x = cap, y = n, fill = parsing_type)) +
  geom_col() +
  theme_minimal() +
  # Labels are keyed by parsing-type code so that each code keeps its own label
  # even if one of the codes is absent from the data.
  scale_fill_discrete(
    "Parsing methods",
    labels = c(
      "1" = "Manually",
      "2" = "Automatically",
      "3" = "Combined methods",
      "999" = "Not Parsed"
    )
  ) +
  coord_flip()
distribution

# Save this specific plot rather than whatever plot happened to be shown last.
ggsave("Pattern_table.png", plot = distribution)
## Saving 7 x 5 in image
# Reload the main trilogues dataset (expected to define `trilogue_core_dt`).
load("C:/Users/nasta/Dropbox/____Nordface_POst_doc/data/Trilogues_dataset/Main_df_Trilogues_17012022.RData")

# Use all procedures and build a similar plot that also marks the cases where
# the tables were not collected.
all_files <- select(
  trilogue_core_dt,
  cod, celex, cap, capcode, committee, parsing_type
)

# Frequency of each parsing-type code (missing values are not shown).
table(all_files$parsing_type)
##
## 1 2 3 999
## 178 227 2 45
# Recode a missing parsing type as 4 (labelled "Tables are not collected" in
# the plot legend), count rows per CAP area / CAP code / parsing type, and keep
# one row per distinct (cap, parsing_type, n) combination.
all_files <- all_files %>%
  mutate(parsing_type = ifelse(is.na(parsing_type), 4, parsing_type)) %>%
  ungroup() %>%
  add_count(cap, capcode, parsing_type) %>%
  distinct(cap, parsing_type, n)

# Number of plotted rows per parsing-type code.
table(all_files$parsing_type)
##
## 1 2 3 4 999
## 20 21 2 20 12
# Stacked bar chart of all procedures per CAP area, split by parsing type
# (including the category for tables that were not collected).
# The fill variable is converted to character so it is treated as discrete.
all_files$parsing_type <- as.character(all_files$parsing_type)

all_procedures <- ggplot(all_files, aes(x = cap, y = n, fill = parsing_type)) +
  geom_col(alpha = 0.7) +
  theme_minimal() +
  # Labels are keyed by parsing-type code so that each code keeps its own label
  # even if one of the codes is absent from the data.
  scale_fill_discrete(
    "Parsing methods",
    labels = c(
      "1" = "Manually",
      "2" = "Automatically",
      "3" = "Combined methods",
      "4" = "Tables are not collected",
      "999" = "Not Parsed"
    )
  ) +
  # Flip once; adding coord_flip() a second time only replaces the first one
  # and triggers a "Coordinate system already present" message.
  coord_flip()
all_procedures
# Same columns as before, this time to summarise parsing types by committee.
committees <- select(
  trilogue_core_dt,
  cod, celex, cap, capcode, committee, parsing_type
)

# Frequency of each parsing-type code (missing values are not shown).
table(committees$parsing_type)
##
## 1 2 3 999
## 178 227 2 45
# Recode a missing parsing type as 4 (labelled "Tables are not collected" in
# the plot legend), count rows per committee / parsing type, and keep one row
# per distinct (committee, parsing_type, n) combination.
committees <- committees %>%
  mutate(parsing_type = ifelse(is.na(parsing_type), 4, parsing_type)) %>%
  ungroup() %>%
  add_count(committee, parsing_type) %>%
  distinct(committee, parsing_type, n)

# Number of plotted rows per parsing-type code.
table(committees$parsing_type)
##
## 1 2 3 4 999
## 14 20 2 20 11
# Stacked bar chart of all procedures per committee, split by parsing type.
# The fill variable is converted to character so it is treated as discrete.
committees$parsing_type <- as.character(committees$parsing_type)

proc_by_com <- ggplot(
  committees,
  aes(x = committee, y = n, fill = parsing_type)
) +
  geom_col(alpha = 0.7) +
  theme_minimal() +
  # Labels are keyed by parsing-type code so that each code keeps its own label
  # even if one of the codes is absent from the data.
  scale_fill_discrete(
    "Parsing methods",
    labels = c(
      "1" = "Manually",
      "2" = "Automatically",
      "3" = "Combined methods",
      "4" = "Tables are not collected",
      "999" = "Not Parsed"
    )
  ) +
  # Flip once; adding coord_flip() a second time only replaces the first one
  # and triggers a "Coordinate system already present" message.
  coord_flip()
proc_by_com
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(viridis)
## Loading required package: viridisLite
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
## Interactive version
# Interactive bubble chart: number of parsed procedures per CAP area and
# document year. Bubble size encodes the count; the tooltip shows it as text.
p <- trilogue_core_dt %>%
  # Drop any existing grouping so the count below is taken over exactly
  # (cap, documentyear).
  ungroup() %>%
  # Keep parsed procedures only (codes 1-3); rows with a missing parsing type
  # are dropped by this comparison as well.
  filter(parsing_type <= 3) %>%
  # One row per (cap, documentyear). Carrying parsing_type through would leave
  # several identical, overlapping bubbles in the same cell.
  count(cap, documentyear) %>%
  # Draw the largest bubbles first so smaller ones are not hidden behind them.
  arrange(desc(n)) %>%
  mutate(
    documentyear = factor(documentyear),
    # Text shown in the plotly tooltip.
    text = paste0("N of procedures: ", n)
  ) %>%
  ggplot(aes(x = documentyear, y = cap, size = n, color = cap, text = text)) +
  geom_point(alpha = 0.7) +
  scale_size(range = c(1.4, 19), name = "N of procedures") +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_color_viridis(discrete = TRUE, guide = "none") +
  theme_ipsum() +
  theme(legend.position = "none") +
  labs(title = "Parsed tables")

# Turn the ggplot into an interactive plotly widget.
pp <- ggplotly(p, tooltip = "text", width = 1500, height = 800)
pp
# Interactive bubble chart: number of parsed procedures per committee and
# document year. Bubble size encodes the count; the tooltip shows it as text.
p <- trilogue_core_dt %>%
  # Drop any existing grouping so the count below is taken over exactly
  # (committee, documentyear).
  ungroup() %>%
  # Keep parsed procedures only (codes 1-3); rows with a missing parsing type
  # are dropped by this comparison as well.
  filter(parsing_type <= 3) %>%
  # One row per (committee, documentyear). Carrying parsing_type through would
  # leave several identical, overlapping bubbles in the same cell.
  count(committee, documentyear) %>%
  # Draw the largest bubbles first so smaller ones are not hidden behind them.
  arrange(desc(n)) %>%
  mutate(
    documentyear = factor(documentyear),
    # Text shown in the plotly tooltip.
    text = paste0("N of procedures: ", n)
  ) %>%
  ggplot(
    aes(x = documentyear, y = committee, size = n, color = committee, text = text)
  ) +
  geom_point(alpha = 0.7) +
  scale_size(range = c(1.4, 19), name = "N of procedures") +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_color_viridis(discrete = TRUE, guide = "none") +
  theme_ipsum() +
  theme(
    plot.margin = margin(.7, .7, .7, .7, "cm"),
    legend.position = "none"
  ) +
  labs(title = "Parsed tables")

# Turn the ggplot into an interactive plotly widget.
pp <- ggplotly(p, tooltip = "text", width = 1500, height = 800)
pp