This script was used for the paper “The cost of late payments in public procurement”, presented in the Open Data Research Symposium 2018. The datasets used for this analysis can be downloaded from this link: Datasets. The full paper can be found here: Paper. This paper was possible thanks to the support by Hivos.

1. Load packages

library(dplyr)
library(lubridate)
library(ggplot2)
library(readxl)
library(gmodels)
library(Hmisc)
library(ggthemes)
library(readr)
library(survminer)
library(survival)
library(kableExtra)
library(tidyr)
library(stringr)
library(flexsurv)
library(xts)
library(scales)
options(scipen = 999)

2. Graphs style

theme_Publication <- function(base_size = 14, base_family = "Helvetica") {
    library(grid)
    library(ggthemes)
    (theme_foundation(base_size = base_size, base_family = base_family) + theme(plot.title = element_text(face = "bold", 
        size = rel(1.2), hjust = 0.5), text = element_text(), panel.background = element_rect(colour = NA), 
        plot.background = element_rect(colour = NA), panel.border = element_rect(colour = NA), 
        axis.title = element_text(face = "bold", size = rel(1)), axis.title.y = element_text(angle = 90, 
            vjust = 2), axis.title.x = element_text(vjust = -0.2), axis.text = element_text(), 
        axis.line = element_line(colour = "black"), axis.ticks = element_line(), 
        panel.grid.major = element_line(colour = "#f0f0f0"), panel.grid.minor = element_blank(), 
        legend.key = element_rect(colour = NA), legend.position = "bottom", 
        legend.direction = "horizontal", legend.title = element_text(face = "italic"), 
        legend.key.size = unit(0.5, "cm"), legend.margin = unit(0, "cm"), plot.margin = unit(c(10, 
            5, 5, 5), "mm"), strip.background = element_rect(colour = "#f0f0f0", 
            fill = "#f0f0f0"), strip.text = element_text(face = "bold")))
    
}

scale_fill_Publication <- function(...) {
    library(scales)
    discrete_scale("fill", "Publication", manual_pal(values = c("#386cb0", "#fdb462", 
        "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", "#ffff33")), 
        ...)
    
}

scale_colour_Publication <- function(...) {
    library(scales)
    discrete_scale("colour", "Publication", manual_pal(values = c("#386cb0", 
        "#fdb462", "#7fc97f", "#ef3b2c", "#662506", "#a6cee3", "#fb9a99", "#984ea3", 
        "#ffff33")), ...)
    
}

3. Data Cleaning

# Import files
base2 <- read.csv("finaldataset.csv", stringsAsFactors = FALSE, encoding = "UTF-8")

# Transform dates
dates_vector <- c("obl_fecha_elaboracion", "obl_fecha_aprobacion", "obl_fecha_factura", 
    "str_fecha_ingreso", "str_fecha_aprobacion", "str_fecha_recepcion_tesoro", 
    "ot_fecha_generacion", "ot_fecha_deposito", "contr_fecha_firma_contrato")

for (i in dates_vector) {
    base2[, i] <- parse_date_time(base2[, i], orders = c("dmy", "mdy", "ymd"))
}

# Leave contracts in Guaranies (delete 10 obs), Delete observations without
# STR and OT
base2 <- base2 %>% filter(obl_moneda_descripcion == "GUARANIES", !is.na(str_numero_str), 
    !is.na(ot_numero_ot)) %>% # Delete obs where contract date>invoice date
mutate(prueba2 = ifelse(obl_fecha_factura < contr_fecha_firma_contrato, 1, 0)) %>% 
    filter(prueba2 == 0 | is.na(prueba2)) %>% # Create new variables
mutate(total = difftime(ot_fecha_deposito, obl_fecha_factura, units = c("days")), 
    total2 = difftime(ot_fecha_deposito, obl_fecha_aprobacion, units = c("days")), 
    r_factura = difftime(obl_fecha_aprobacion, obl_fecha_factura, units = c("days")), 
    r_str = difftime(str_fecha_recepcion_tesoro, obl_fecha_aprobacion, units = c("days")), 
    r_ot = difftime(ot_fecha_deposito, str_fecha_recepcion_tesoro, units = c("days")), 
    r_carga = difftime(obl_fecha_elaboracion, obl_fecha_factura, units = c("days")), 
    r_apro = difftime(obl_fecha_aprobacion, obl_fecha_elaboracion, units = c("days")), 
    r_carga_str = difftime(str_fecha_ingreso, obl_fecha_aprobacion, units = c("days")), 
    r_apro_str = difftime(str_fecha_aprobacion, str_fecha_ingreso, units = c("days")), 
    r_tesoro_str = difftime(str_fecha_recepcion_tesoro, str_fecha_aprobacion, 
        units = c("days")), r_carga_ot = difftime(ot_fecha_generacion, str_fecha_recepcion_tesoro, 
        units = c("days")), r_pago_ot = difftime(ot_fecha_deposito, ot_fecha_generacion, 
        units = c("days")), r_entidad = difftime(str_fecha_recepcion_tesoro, 
        obl_fecha_factura, units = c("days")), r_hacienda = difftime(ot_fecha_deposito, 
        str_fecha_recepcion_tesoro, units = c("days")), total_cat = ifelse(total < 
        30, "< 30", ifelse(total < 45, "30-<45", ifelse(total < 60, "45-<60", 
        ifelse(total < 75, "60-<75", "+ 75")))))
base2[, 29:42] <- lapply(base2[, 29:42], as.numeric)

base2$obl_entidad_descripcion <- gsub("PBLICA", "PUBLICA", base2$obl_entidad_descripcion, 
    fixed = T)
base2$obl_entidad_descripcion <- gsub("TRNSITO", "TRANSITO", base2$obl_entidad_descripcion, 
    fixed = T)
base2$obl_entidad_descripcion <- gsub("ESPRITU", "ESPIRITU", base2$obl_entidad_descripcion, 
    fixed = T)


# Identify extreme values
base_limpia <- base2 %>% filter(total > 10, total2 > 0, r_factura >= 0, r_str >= 
    0, r_ot >= 0, r_carga >= 0, r_apro >= 0, r_carga_str >= 0, r_apro_str >= 
    0, r_tesoro_str >= 0, r_carga_ot >= 0, r_pago_ot >= 0, r_entidad > 0, r_hacienda >= 
    0, obl_monto_obligado > 50000) %>% mutate(dataset = ifelse((r_factura > 
    110 | r_str > 37 | r_ot > 47), "B", "A"))

4. Exploratory Data Analysis

## Exploratory Data Analysis
indicadores <- read_excel("indicadores.xlsx")
datos <- base_limpia %>% left_join(indicadores, by = c(obl_anho_obligacion = "anio")) %>% 
    mutate(monto_real = obl_monto_obligado/deflator * 100, monto_dolar = (monto_real/5618.933)/1000000)
CrossTable(datos$total_cat, datos$dataset, prop.chisq = F, prop.t = F)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  377520 
## 
##  
##                 | datos$dataset 
## datos$total_cat |         A |         B | Row Total | 
## ----------------|-----------|-----------|-----------|
##            + 75 |     78056 |     59199 |    137255 | 
##                 |     0.569 |     0.431 |     0.364 | 
##                 |     0.247 |     0.962 |           | 
## ----------------|-----------|-----------|-----------|
##            < 30 |     70402 |         0 |     70402 | 
##                 |     1.000 |     0.000 |     0.186 | 
##                 |     0.223 |     0.000 |           | 
## ----------------|-----------|-----------|-----------|
##          30-<45 |     67553 |         1 |     67554 | 
##                 |     1.000 |     0.000 |     0.179 | 
##                 |     0.214 |     0.000 |           | 
## ----------------|-----------|-----------|-----------|
##          45-<60 |     57418 |       423 |     57841 | 
##                 |     0.993 |     0.007 |     0.153 | 
##                 |     0.182 |     0.007 |           | 
## ----------------|-----------|-----------|-----------|
##          60-<75 |     42554 |      1914 |     44468 | 
##                 |     0.957 |     0.043 |     0.118 | 
##                 |     0.135 |     0.031 |           | 
## ----------------|-----------|-----------|-----------|
##    Column Total |    315983 |     61537 |    377520 | 
##                 |     0.837 |     0.163 |           | 
## ----------------|-----------|-----------|-----------|
## 
##

summary(datos$total)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   11.00   35.00   57.00   76.22   97.00 1533.00

suma <- datos %>% group_by(total_cat) %>% summarise(sum(monto_real))


# Density plots
datos %>% filter(dataset == "A") %>% ggplot(aes(x = total)) + theme_Publication() + 
    labs(title = "Payment duration", subtitle = "Dataset A", x = "Total duration (days)", 
        y = "Number of invoices", caption = "Based on data from datos.hacienda.gov.py") + 
    geom_histogram(alpha = 0.8, fill = "#b2b2b2") + geom_vline(aes(xintercept = 30)) + 
    geom_vline(aes(xintercept = 60)) + annotate("text", x = 28, y = 8000, label = "optimal deadline", 
    angle = 90, size = 4) + annotate("text", x = 58, y = 8000, label = "local regulation deadline", 
    angle = 90, size = 4) + scale_x_continuous(breaks = seq(0, 185, 10))

datos %>% filter(dataset == "B") %>% ggplot(aes(x = total)) + theme_Publication() + 
    labs(title = "Payment duration", subtitle = "Dataset B", x = "Total duration (days)", 
        y = "Number of invoices", caption = "Based on data from datos.hacienda.gov.py") + 
    geom_histogram(alpha = 0.8, fill = "#b2b2b2") + geom_vline(aes(xintercept = 60)) + 
    annotate("text", x = 50, y = 8000, label = "local regulation deadline", 
        angle = 90, size = 4) + scale_x_continuous(breaks = seq(0, 1550, 50))

# By year
anio <- datos %>% filter((dataset == "A")) %>% group_by(obl_anho_obligacion) %>% 
    summarise(total2 = n(), duracion = median(total), monto = sum(obl_monto_obligado)) %>% 
    left_join(indicadores, by = c(obl_anho_obligacion = "anio")) %>% mutate(monto2 = monto/deflator * 
    100)
anio

## # A tibble: 7 x 7
##   obl_anho_obligacion total2 duracion        monto interes deflator  monto2
##                 <dbl>  <int>    <dbl>        <dbl>   <dbl>    <dbl>   <dbl>
## 1                2011  33894       34      1.05e12   0.174     81.0 1.29e12
## 2                2012  50309       37      1.96e12   0.172     84.8 2.31e12
## 3                2013  31970       58      1.90e12   0.193     85.5 2.22e12
## 4                2014  44632       57      2.67e12   0.212     89.9 2.97e12
## 5                2015  50868       56      3.61e12   0.197     90.0 4.01e12
## 6                2016  49941       56      3.75e12   0.181     94.7 3.95e12
## 7                2017  54369       52      4.22e12   0.181    100   4.22e12

## Stages of the process
etapas <- datos %>% filter((dataset == "A")) %>% select(obl_anho_obligacion, 
    obl_fuente_financiamiento_descripcion, r_factura, r_str, r_ot) %>% rename(`Invoice delay` = r_factura, 
    `TR delay` = r_str, `TO delay` = r_ot) %>% gather(etapa, duracion, -obl_anho_obligacion, 
    -obl_fuente_financiamiento_descripcion)
etapas$etapa <- factor(etapas$etapa, levels = c("Invoice delay", "TR delay", 
    "TO delay"))


# Stages and year
ggplot(etapas, aes(x = as.factor(obl_anho_obligacion), y = duracion, fill = etapa)) + 
    labs(title = "Payment duration by stage and year", subtitle = "Dataset A", 
        y = "Total duration (days)", x = "Year", caption = "Based on data from datos.hacienda.gov.py") + 
    theme_Publication() + geom_boxplot() + theme(text = element_text(size = 10)) + 
    scale_y_continuous(breaks = seq(0, 200, 10)) + facet_wrap(~etapa, ncol = 1) + 
    scale_fill_manual(" ", values = c(`Invoice delay` = "#559171", `TR delay` = "#b5c62d", 
        `TO delay` = "#FCA32B"))

etapas_detalle <- datos %>% filter((dataset == "A")) %>% select(obl_anho_obligacion, 
    r_carga, r_carga_str, r_apro_str, r_tesoro_str, r_carga_ot, r_pago_ot) %>% 
    group_by(obl_anho_obligacion) %>% summarise(`Invoice approval` = mean(r_carga), 
    `TR creation` = mean(r_carga_str), `TR approval` = mean(r_apro_str), `Transfer to Treasury` = median(r_tesoro_str), 
    `TO creation` = mean(r_carga_ot), Payment = mean(r_pago_ot)) %>% gather(etapa, 
    duracion, -obl_anho_obligacion)
etapas_detalle$etapa <- factor(etapas_detalle$etapa, levels = c("Payment", "TO creation", 
    "Transfer to Treasury", "TR approval", "TR creation", "Invoice approval"))

ggplot(etapas_detalle, aes(fill = etapa, y = duracion, x = obl_anho_obligacion)) + 
    geom_bar(stat = "identity") + theme_Publication() + scale_fill_manual(" ", 
    values = c(`Invoice approval` = "#559171", `TR creation` = "#b5c62d", `TR approval` = "#ffef96", 
        `Transfer to Treasury` = "#FCA32B", `TO creation` = "#5b5b5f", Payment = "#b2b2b2")) + 
    labs(title = "Average stage duration", subtitle = "Dataset A", y = "Total duration (days)", 
        x = "Year", caption = "Based on data from datos.hacienda.gov.py") + 
    scale_x_continuous(breaks = seq(2011, 2017, 1))

# Stages and funds
etapas$obl_fuente_financiamiento_descripcion <- gsub("10-RECURSOS DEL TESORO", 
    "Treasury funds", etapas$obl_fuente_financiamiento_descripcion, fixed = T)
etapas$obl_fuente_financiamiento_descripcion <- gsub("20-RECURSOS DEL CREDITO PUBLICO", 
    "Public credit funds", etapas$obl_fuente_financiamiento_descripcion, fixed = T)
etapas$obl_fuente_financiamiento_descripcion <- gsub("30-RECURSOS INSTITUCIONALES", 
    "Institutional funds", etapas$obl_fuente_financiamiento_descripcion, fixed = T)
etapas$obl_fuente_financiamiento_descripcion <- factor(etapas$obl_fuente_financiamiento_descripcion, 
    levels = c("Treasury funds", "Public credit funds", "Institutional funds"))
ggplot(etapas, aes(x = etapa, y = duracion, fill = etapa)) + labs(title = "Payment duration by type of funds", 
    subtitle = "Dataset A", y = "Total duration (days)", x = "", caption = "Based on data from datos.hacienda.gov.py") + 
    theme_Publication() + geom_boxplot() + theme(text = element_text(size = 10)) + 
    scale_y_continuous(breaks = seq(0, 200, 10)) + coord_flip() + facet_wrap(~obl_fuente_financiamiento_descripcion, 
    ncol = 1) + scale_fill_manual(" ", values = c(`Invoice delay` = "#559171", 
    `TR delay` = "#b5c62d", `TO delay` = "#FCA32B"))

# Top 10 entities with higher spending
etapas3 <- datos %>% select(obl_entidad_descripcion, r_carga, r_carga_str, r_apro_str, 
    r_tesoro_str, r_carga_ot, r_pago_ot) %>% group_by(obl_entidad_descripcion) %>% 
    summarise(`Invoice approval` = mean(r_carga), `TR creation` = mean(r_carga_str), 
        `TR approval` = mean(r_apro_str), `Transfer to Treasury` = median(r_tesoro_str), 
        `TO creation` = mean(r_carga_ot), Payment = mean(r_pago_ot)) %>% gather(stage, 
    duration, -obl_entidad_descripcion) %>% filter(obl_entidad_descripcion == 
    "013-MINISTERIO DE OBRAS PUBLICAS Y COMUNICACIONES" | obl_entidad_descripcion == 
    "008-MINISTERIO DE SALUD PUBLICA Y BIENESTAR SOCIAL" | obl_entidad_descripcion == 
    "007-MINISTERIO DE EDUCACION Y CIENCIAS" | obl_entidad_descripcion == "005-MINISTERIO DE DEFENSA NACIONAL" | 
    obl_entidad_descripcion == "003-MINISTERIO DEL INTERIOR" | obl_entidad_descripcion == 
    "001-PRESIDENCIA DE LA REPUBLICA" | obl_entidad_descripcion == "001-UNIVERSIDAD NACIONAL DE ASUNCIN" | 
    obl_entidad_descripcion == "006-MINISTERIO DE HACIENDA" | obl_entidad_descripcion == 
    "023-SECRETARIA NACIONAL DE LA VIVIENDA Y EL HABITAT" | obl_entidad_descripcion == 
    "002-JUSTICIA ELECTORAL")


etapas3$obl_entidad_descripcion <- gsub("013-MINISTERIO DE OBRAS PUBLICAS Y COMUNICACIONES", 
    "Ministry of Public Works", etapas3$obl_entidad_descripcion, fixed = T)
etapas3$obl_entidad_descripcion <- gsub("008-MINISTERIO DE SALUD PUBLICA Y BIENESTAR SOCIAL", 
    "Ministry of Health", etapas3$obl_entidad_descripcion, fixed = T)
etapas3$obl_entidad_descripcion <- gsub("007-MINISTERIO DE EDUCACION Y CIENCIAS", 
    "Ministry of Education", etapas3$obl_entidad_descripcion, fixed = T)
etapas3$obl_entidad_descripcion <- gsub("005-MINISTERIO DE DEFENSA NACIONAL", 
    "Ministry of Defense", etapas3$obl_entidad_descripcion, fixed = T)
etapas3$obl_entidad_descripcion <- gsub("003-MINISTERIO DEL INTERIOR", "Ministry of Interior", 
    etapas3$obl_entidad_descripcion, fixed = T)
etapas3$obl_entidad_descripcion <- gsub("001-PRESIDENCIA DE LA REPUBLICA", "Presidency", 
    etapas3$obl_entidad_descripcion, fixed = T)
etapas3$obl_entidad_descripcion <- gsub("001-UNIVERSIDAD NACIONAL DE ASUNCIN", 
    "National University of Asuncion", etapas3$obl_entidad_descripcion, fixed = T)
etapas3$obl_entidad_descripcion <- gsub("006-MINISTERIO DE HACIENDA", "Treasury", 
    etapas3$obl_entidad_descripcion, fixed = T)
etapas3$obl_entidad_descripcion <- gsub("023-SECRETARIA NACIONAL DE LA VIVIENDA Y EL HABITAT", 
    "Secretary of Housing", etapas3$obl_entidad_descripcion, fixed = T)
etapas3$obl_entidad_descripcion <- gsub("002-JUSTICIA ELECTORAL", "Electoral Justice", 
    etapas3$obl_entidad_descripcion, fixed = T)

etapas3$stage <- factor(etapas3$stage, levels = c("Payment", "TO creation", 
    "Transfer to Treasury", "TR approval", "TR creation", "Invoice approval"))

ggplot(etapas3, aes(fill = stage, y = duration, x = reorder(obl_entidad_descripcion, 
    duration))) + geom_bar(stat = "identity") + theme_Publication() + coord_flip() + 
    scale_fill_manual(values = c(`Invoice approval` = "#559171", `TR creation` = "#b5c62d", 
        `TR approval` = "#ffef96", `Transfer to Treasury` = "#FCA32B", `TO creation` = "#5b5b5f", 
        Payment = "#b2b2b2")) + labs(title = "Average payment duration by stage", 
    subtitle = "Top 10 entities with higher spending (full dataset)", y = "Total duration (days)", 
    x = "", caption = "Based on data from datos.hacienda.gov.py")

# By entity
instituciones <- datos %>% filter(obl_entidad_descripcion != "002-MECANISMO NACIONAL DE PREVENCIN DE LA TORTURA", 
    obl_entidad_descripcion != "002-COM. NAC. DE PREV. CONTRA LA TORTURA Y OTROS TRATOS O PENAS.", 
    obl_entidad_descripcion != "003-CRDITO AGRCOLA DE HABILITACIN", obl_entidad_descripcion != 
        "004-DIRECCION DE BENEFICENCIA Y AYUDA SOCIAL", obl_entidad_descripcion != 
        "007-SINDICATURA GENERAL DE QUIEBRAS", obl_entidad_descripcion != "013-ENTE REGULADOR DE SERVICIOS SANITARIOS", 
    obl_entidad_descripcion != "014-INSTITUTO NACIONAL DE COOPERATIVISMO", obl_entidad_descripcion != 
        "014-MINISTERIO DE LA MUJER", obl_entidad_descripcion != "016-SERVICIO NACIONAL DE CALIDAD Y SALUD ANIMAL", 
    obl_entidad_descripcion != "020-INSTITUTO FORESTAL NACIONAL", obl_entidad_descripcion != 
        "021-SECRETARA DEL AMBIENTE", obl_entidad_descripcion != "026-SECRETARIA DE DEFENSA DEL CONSUMIDOR Y EL USUARIO", 
    obl_entidad_descripcion != "027-COMISIN NACIONAL DE LA COMPETENCIA", obl_entidad_descripcion != 
        "028-AGENCIA NACIONAL DE TRANSITO Y SEGURIDAD VIAL", obl_entidad_descripcion != 
        "029-CONSEJO NACIONAL DE EDUCACION SUPERIOR", obl_entidad_descripcion != 
        "030-AGENCIA NACIONAL DE EVAL. Y ACRED. DE LA EDUCACION SUPERIOR", obl_entidad_descripcion != 
        "030-AGENCIA NACIONAL DE EVAL. Y ACRED. DE LA EDUCACION SUPERIOR", obl_entidad_descripcion != 
        "031-AUTORIDAD REGULADORA RADIOLGICA Y NUCLEAR") %>% group_by(obl_entidad_descripcion, 
    dataset) %>% summarise(facturas = n(), monto = sum(obl_monto_obligado, na.rm = TRUE), 
    montodolar = sum(monto_dolar), duracion_media = median(total), duracion_prom = mean(total), 
    duracion_max = max(total), invoice = median(r_factura), str = median(r_str), 
    ot = median(r_ot))
instituciones

## # A tibble: 84 x 11
## # Groups:   obl_entidad_descripcion [?]
##    obl_entidad_des… dataset facturas   monto montodolar duracion_media
##    <chr>            <chr>      <int>   <dbl>      <dbl>          <dbl>
##  1 001-CONGRESO NA… A           5223 1.13e11    22.6              60  
##  2 001-CONGRESO NA… B            580 2.00e10     3.97             97  
##  3 001-CONTRALORA … A           1376 3.78e10     7.27             35  
##  4 001-CONTRALORA … B            101 7.95e 9     1.56            130  
##  5 001-CORTE SUPRE… A           6448 2.16e11    40.8              50  
##  6 001-CORTE SUPRE… B            952 3.18e10     6.16            156  
##  7 001-DEFENSORA D… A            282 5.91e 9     1.17             42  
##  8 001-DEFENSORA D… B             17 3.11e 8     0.0628          165  
##  9 001-INSTITUTO N… A            123 6.14e 9     1.22             27  
## 10 001-INSTITUTO N… B              8 1.26e 9     0.239            98.5
## # … with 74 more rows, and 5 more variables: duracion_prom <dbl>,
## #   duracion_max <dbl>, invoice <dbl>, str <dbl>, ot <dbl>

salud_obras2 <- datos %>% filter((dataset == "A")) %>% select(obl_entidad_descripcion, 
    obl_fuente_financiamiento_descripcion, r_carga, r_carga_str, r_apro_str, 
    r_tesoro_str, r_carga_ot, r_pago_ot) %>% group_by(obl_entidad_descripcion, 
    obl_fuente_financiamiento_descripcion) %>% summarise(`Invoice approval` = mean(r_carga), 
    `TR creation` = mean(r_carga_str), `TR approval` = mean(r_apro_str), `Transfer to Treasury` = median(r_tesoro_str), 
    `TO creation` = mean(r_carga_ot), Payment = mean(r_pago_ot)) %>% gather(stage, 
    duration, -obl_entidad_descripcion, -obl_fuente_financiamiento_descripcion) %>% 
    filter(obl_entidad_descripcion == "013-MINISTERIO DE OBRAS PUBLICAS Y COMUNICACIONES" | 
        obl_entidad_descripcion == "008-MINISTERIO DE SALUD PUBLICA Y BIENESTAR SOCIAL")


salud_obras2$obl_entidad_descripcion <- gsub("013-MINISTERIO DE OBRAS PUBLICAS Y COMUNICACIONES", 
    "Ministry of Public Works", salud_obras2$obl_entidad_descripcion, fixed = T)
salud_obras2$obl_entidad_descripcion <- gsub("008-MINISTERIO DE SALUD PUBLICA Y BIENESTAR SOCIAL", 
    "Ministry of Health", salud_obras2$obl_entidad_descripcion, fixed = T)
salud_obras2$obl_fuente_financiamiento_descripcion <- gsub("10-RECURSOS DEL TESORO", 
    "Treasury funds", salud_obras2$obl_fuente_financiamiento_descripcion, fixed = T)
salud_obras2$obl_fuente_financiamiento_descripcion <- gsub("20-RECURSOS DEL CREDITO PUBLICO", 
    "Public credit funds", salud_obras2$obl_fuente_financiamiento_descripcion, 
    fixed = T)
salud_obras2$obl_fuente_financiamiento_descripcion <- gsub("30-RECURSOS INSTITUCIONALES", 
    "Institutional funds", salud_obras2$obl_fuente_financiamiento_descripcion, 
    fixed = T)
salud_obras2$stage <- factor(salud_obras2$stage, levels = c("Payment", "TO creation", 
    "Transfer to Treasury", "TR approval", "TR creation", "Invoice approval"))

ggplot(salud_obras2, aes(fill = stage, y = duration, x = obl_fuente_financiamiento_descripcion)) + 
    geom_bar(stat = "identity") + theme_Publication() + scale_fill_manual(values = c(`Invoice approval` = "#559171", 
    `TR creation` = "#b5c62d", `TR approval` = "#ffef96", `Transfer to Treasury` = "#FCA32B", 
    `TO creation` = "#5b5b5f", Payment = "#b2b2b2")) + labs(title = "Average payment duration by stage and funds", 
    subtitle = "Dataset A", y = "Total duration (days)", x = "", caption = "Based on data from datos.hacienda.gov.py") + 
    facet_wrap(~obl_entidad_descripcion, ncol = 2) + coord_flip()

salud_obras <- datos %>% filter((dataset == "A")) %>% filter(obl_entidad_descripcion == 
    "013-MINISTERIO DE OBRAS PUBLICAS Y COMUNICACIONES" | obl_entidad_descripcion == 
    "008-MINISTERIO DE SALUD PUBLICA Y BIENESTAR SOCIAL") %>% select(obl_entidad_descripcion, 
    obl_fuente_financiamiento_descripcion, r_factura, r_str, r_ot) %>% rename(`Invoice delay` = r_factura, 
    `TR delay` = r_str, `TO delay` = r_ot) %>% gather(etapa, duracion, -obl_entidad_descripcion, 
    -obl_fuente_financiamiento_descripcion)
salud_obras$etapa <- factor(salud_obras$etapa, levels = c("Invoice delay", "TR delay", 
    "TO delay"))

salud_obras$obl_fuente_financiamiento_descripcion <- gsub("10-RECURSOS DEL TESORO", 
    "Treasury funds", salud_obras$obl_fuente_financiamiento_descripcion, fixed = T)
salud_obras$obl_fuente_financiamiento_descripcion <- gsub("20-RECURSOS DEL CREDITO PUBLICO", 
    "Public credit funds", salud_obras$obl_fuente_financiamiento_descripcion, 
    fixed = T)
salud_obras$obl_fuente_financiamiento_descripcion <- gsub("30-RECURSOS INSTITUCIONALES", 
    "Institutional funds", salud_obras$obl_fuente_financiamiento_descripcion, 
    fixed = T)
salud_obras$obl_fuente_financiamiento_descripcion <- factor(salud_obras$obl_fuente_financiamiento_descripcion, 
    levels = c("Treasury funds", "Public credit funds", "Institutional funds"))
salud_obras$obl_entidad_descripcion <- gsub("013-MINISTERIO DE OBRAS PUBLICAS Y COMUNICACIONES", 
    "Ministry of Public Works", salud_obras$obl_entidad_descripcion, fixed = T)
salud_obras$obl_entidad_descripcion <- gsub("008-MINISTERIO DE SALUD PUBLICA Y BIENESTAR SOCIAL", 
    "Ministry of Health", salud_obras$obl_entidad_descripcion, fixed = T)


ggplot(salud_obras, aes(x = etapa, y = duracion, fill = obl_entidad_descripcion)) + 
    labs(title = "Payment duration by type of funds", subtitle = "Dataset A", 
        y = "Total duration (days)", x = "", caption = "Based on data from datos.hacienda.gov.py") + 
    theme_Publication() + geom_boxplot() + theme(text = element_text(size = 10)) + 
    scale_y_continuous(breaks = seq(0, 200, 10)) + facet_wrap(~obl_fuente_financiamiento_descripcion, 
    ncol = 3) + scale_fill_manual(" ", values = c(`Ministry of Public Works` = "#559171", 
    `Ministry of Health` = "#FCA32B"))

# Providers
proveedores2 <- base_limpia %>% mutate(retraso30 = ifelse(total >= 30, 1, 0)) %>% 
    group_by(prov_ruc) %>% summarise(monto = sum(obl_monto_obligado), contratos = n_distinct(obl_codigo_contratacion), 
    instituciones = n_distinct(obl_entidad_descripcion), duracion = median(total), 
    anios = n_distinct(obl_anho_obligacion), facturas = n(), retrasos = sum(retraso30)) %>% 
    mutate(dolar = monto/5618.933, dolar_cat = ifelse(dolar > 1000000, 1, 0), 
        porc_retraso = (round(retrasos/facturas * 100, 1)), porc_cat = ifelse(porc_retraso < 
            25, "<25", ifelse(porc_retraso < 50, "<50", ifelse(porc_retraso < 
            75, "<75", "75-100"))))
CrossTable(proveedores2$porc_cat)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4742 
## 
##  
##           |       <25 |       <50 |       <75 |    75-100 | 
##           |-----------|-----------|-----------|-----------|
##           |       548 |       390 |       935 |      2869 | 
##           |     0.116 |     0.082 |     0.197 |     0.605 | 
##           |-----------|-----------|-----------|-----------|
## 
## 
## 
##

# Type of purchase
gasto <- datos %>% filter(dataset == "A") %>% group_by(obl_grupo_descripcion) %>% 
    summarise(n = n(), duracion_media = median(total), duracion_prom = mean(total), 
        duracion_max = max(total))

# Seasonal trends

datos_st <- datos %>% filter(dataset == "A") %>% mutate(mesf = month(obl_fecha_factura), 
    mesp = month(ot_fecha_deposito), mes_aniof = format(as.Date(obl_fecha_factura), 
        "%Y-%m"), mes_aniop = format(as.Date(ot_fecha_deposito), "%Y-%m"))
datos_st$mes_aniof <- as.Date(as.yearmon(datos_st$mes_aniof))
datos_st$mes_aniop <- as.Date(as.yearmon(datos_st$mes_aniop))

facturas <- datos_st %>% filter(dataset == "A") %>% group_by(mes_aniof) %>% 
    summarise(Invoices = n())
pagos <- datos_st %>% filter(dataset == "A") %>% group_by(mes_aniop) %>% summarise(Payments = n())

tiempo <- left_join(facturas, pagos, by = c(mes_aniof = "mes_aniop"))
tiempo <- gather(tiempo, Series, cantidad, -mes_aniof)

ggplot(tiempo, aes(x = mes_aniof, y = cantidad, group = Series)) + geom_line(aes(color = Series), 
    na.rm = TRUE) + theme_Publication() + theme(legend.title = element_blank()) + 
    scale_color_manual(values = c("#559171", "#FCA32B")) + scale_y_continuous(labels = comma) + 
    scale_x_date(breaks = date_breaks("1 year"), labels = date_format("%b %y")) + 
    labs(title = "Invoices billed and paid per month 2011-2017", subtitle = "Dataset A", 
        y = "Number of invoices", x = "Month-Year", caption = "Based on data from datos.hacienda.gov.py")

# Facturas todas las instituciones
facturas_elab1 <- datos %>% filter(dataset == "A") %>% group_by(obl_fecha_factura) %>% 
    summarise(facturas_creadas = n()) %>% rename(fecha = obl_fecha_factura)

facturas_pago <- datos %>% filter(dataset == "A") %>% group_by(ot_fecha_deposito) %>% 
    summarise(facturas_pago = n()) %>% rename(fecha = ot_fecha_deposito)

facturas_acumuladas2 <- full_join(facturas_elab1, facturas_pago, by = "fecha")
facturas_acumuladas2[is.na(facturas_acumuladas2)] <- 0
facturas_acumuladas2 <- facturas_acumuladas2 %>% arrange(fecha) %>% mutate(acum_elab = cumsum(facturas_creadas), 
    acum_pago = cumsum(facturas_pago), pendientes_total = acum_elab - acum_pago)

ggplot(facturas_acumuladas2, aes(x = fecha, y = pendientes_total)) + geom_line() + 
    theme(legend.title = element_blank()) + theme_Publication() + scale_y_continuous(labels = comma) + 
    scale_x_datetime(breaks = date_breaks("1 year"), labels = date_format("%y")) + 
    labs(title = "Unpaid bills", y = "Number of invoices", subtitle = "Dataset A", 
        x = "Year", caption = "Based on data from datos.hacienda.gov.py")

# By quarter
trimestre <- datos %>% filter(dataset == "A") %>% mutate(quarter = quarter(obl_fecha_factura)) %>% 
    group_by(quarter) %>% summarise(duracionprom = mean(total), dur_media = median(total))

5. Survival analysis

nomina <- read_csv("nomina.csv")
nomina <- nomina %>% mutate(size = ifelse(funcionarios < 1000, "small", ifelse(funcionarios < 
    10000, "median", "large")))
nomina$size <- factor(nomina$size, levels = c("small", "median", "large"))


surv <- datos %>% filter(dataset == "A") %>% select(obl_id, obl_monto_obligado, 
    obl_fuente_financiamiento_descripcion, obl_nivel_descripcion, obl_entidad_descripcion, 
    total, r_factura, r_str, r_ot, prov_ruc, contr_monto_adjudicado, prov_ruc, 
    obl_grupo_descripcion, obl_subgrupo_descripcion) %>% left_join(nomina, by = c(obl_entidad_descripcion = "descripcionEntidad")) %>% 
    filter(!is.na(size)) %>% rename(type = obl_nivel_descripcion, funds = obl_fuente_financiamiento_descripcion, 
    official = funcionarios, purchase = obl_grupo_descripcion)


surv$type <- gsub("14-CONTRALORA GENERAL DE LA REPBLICA", "Legislative Branch", 
    surv$type, fixed = T)
surv$type <- gsub("11-PODER LEGISLATIVO", "Legislative Branch", surv$type, fixed = T)
surv$type <- gsub("15-DEFENSORA DEL PUEBLO", "Legislative Branch", surv$type, 
    fixed = T)
surv$type <- gsub("15-OTROS ORGANISMOS DEL ESTADO", "Other", surv$type, fixed = T)
surv$type <- gsub("23-ENTES AUTNOMOS Y AUTRQUICOS", "Other", surv$type, fixed = T)
surv$type <- gsub("27-ENTIDADES FINANCIERAS OFICIALES", "Other", surv$type, 
    fixed = T)
surv$type <- gsub("28-UNIVERSIDADES NACIONALES", "Universities", surv$type, 
    fixed = T)
surv$type <- gsub("12-PODER EJECUTIVO", "Executive Branch", surv$type, fixed = T)
surv$type <- gsub("13-PODER JUDICIAL", "Judiciary Branch", surv$type, fixed = T)
surv$funds <- gsub("10-RECURSOS DEL TESORO", "Treasury funds", surv$funds, fixed = T)
surv$funds <- gsub("20-RECURSOS DEL CREDITO PUBLICO", "Public credit funds", 
    surv$funds, fixed = T)
surv$funds <- gsub("30-RECURSOS INSTITUCIONALES", "Institutional funds", surv$funds, 
    fixed = T)
surv$funds <- factor(surv$funds, levels = c("Institutional funds", "Treasury funds", 
    "Public credit funds"))
surv$type <- factor(surv$type, levels = c("Executive Branch", "Judiciary Branch", 
    "Legislative Branch", "Universities", "Other"))
surv$purchase <- gsub("200-SERVICIOS NO PERSONALES", "services", surv$purchase, 
    fixed = T)
surv$purchase <- gsub("300-BIENES DE CONSUMO E INSUMOS", "goods and materials", 
    surv$purchase, fixed = T)
surv$purchase <- gsub("500-INVERSION   FSICA", "investment", surv$purchase, 
    fixed = T)
surv$purchase <- gsub("800-TRANSFERENCIAS", "other", surv$purchase, fixed = T)
surv$purchase <- gsub("400-BIENES  DE  CAMBIO", "other", surv$purchase, fixed = T)
surv$purchase <- factor(surv$purchase, levels = c("services", "goods and materials", 
    "investment", "other"))

surv <- surv %>% mutate(cens = 1, log_amount = log(obl_monto_obligado), log_contrato = log(contr_monto_adjudicado), 
    log_total = log(total), funds2 = ifelse(funds == "Treasury funds", 1, 0))

## Survival object
surv_object <- Surv(time = surv$total, event = surv$cens)

# Kaplan-Meyer curves
fit <- survfit(surv_object ~ 1, data = surv)
ggsurvplot(fit, data = surv, fun = "pct", palette = c("#E7B800"), ggtheme = theme_minimal(), 
    title = "Invoices payment duration", legend = "none")

fit1 <- survfit(surv_object ~ funds, data = surv)
plot1 <- ggsurvplot(fit1, data = surv, pval = F, fun = "pct", palette = c("#559171", 
    "#b5c62d", "#FCA32B"), legend.title = "Type of funds", ggtheme = theme_minimal())
plot1$plot + geom_vline(aes(xintercept = median(datos$total[datos$dataset == 
    "A"]))) + annotate("text", x = 46, y = 80, label = "Median duration", angle = 90, 
    size = 3)

fit2 <- survfit(surv_object ~ type, data = surv)
ggsurvplot(fit2, data = surv, pval = F, fun = "pct", legend.title = "Type of institution", 
    ggtheme = theme_minimal(), legend = "right", palette = c("#E7B800", "#2E9FDF", 
        "#ff7373", "#7fc97f", "#a6cee3"), legend.labs = c("Executive Branch", 
        "Judiciary Branch", "Legislative Branch", "Universities", "Other"))

fit3 <- survfit(surv_object ~ size, data = surv)
plot2 <- ggsurvplot(fit3, data = surv, pval = F, fun = "pct", legend.title = "Size of institution", 
    palette = c("#559171", "#b5c62d", "#FCA32B"), ggtheme = theme_minimal(), 
    legend.labs = c("small", "median", "large"))
plot2$plot + geom_vline(aes(xintercept = median(datos$total[datos$dataset == 
    "A"]))) + annotate("text", x = 46, y = 80, label = "Median duration", angle = 90, 
    size = 3)

fit4 <- survfit(surv_object ~ purchase, data = surv)
ggsurvplot(fit4, data = surv, pval = TRUE, fun = "pct", legend.title = "Type of purchase", 
    palette = c("#E7B800", "#2E9FDF", "#ff7373", "#7fc97f"))

## Parametric function tests
fit_exp <- flexsurvreg(surv_object ~ 1, dist = "exponential")
fit_weibull <- flexsurvreg(surv_object ~ 1, dist = "weibull")
fit_lognormal <- flexsurvreg(surv_object ~ 1, dist = "lnorm")
fit_loglogis <- flexsurvreg(surv_object ~ 1, dist = "llogis")

plot(fit_weibull)

plot(fit_exp)

plot(fit_lognormal)

plot(fit_loglogis)

# Log-test
fit_weibull$loglik

## [1] -1499671

fit_exp$loglik

## [1] -1584678

fit_loglogis$loglik

## [1] -1506679

fit_lognormal$loglik

## [1] -1499352

fit_weibull$AIC

## [1] 2999346

fit_exp$AIC

## [1] 3169358

fit_loglogis$AIC

## [1] 3013362

fit_lognormal$AIC

## [1] 2998708

## lognormal isn't statistically different from Weibull, we choose Weibull
qchisq(0.95, 2)

## [1] 5.991465

2 * (fit_weibull$loglik - fit_lognormal$loglik)  #2 df

## [1] -638.2443

# Models
weib <- flexsurvreg(Surv(total, cens) ~ log_amount + funds2 + size + ejecucion + 
    log_contrato + purchase, data = surv, dist = "weibull")
weib

## Call:
## flexsurvreg(formula = Surv(total, cens) ~ log_amount + funds2 + 
##     size + ejecucion + log_contrato + purchase, data = surv, 
##     dist = "weibull")
## 
## Estimates: 
##                              data mean   est         L95%      
## shape                                NA    2.065248    2.059635
## scale                                NA  115.517403  112.221653
## log_amount                    15.422829   -0.029945   -0.030926
## funds2                         0.707471    0.274678    0.270700
## sizemedian                     0.487889    0.198292    0.190504
## sizelarge                      0.447956    0.357829    0.349664
## ejecucion                      0.813933   -1.532200   -1.557684
## log_contrato                  19.849689    0.030648    0.029783
## purchasegoods and materials    0.421230    0.045439    0.041354
## purchaseinvestment             0.096565    0.036023    0.029218
## purchaseother                  0.002234    0.151268    0.113943
##                              U95%        se          exp(est)  
## shape                          2.070876    0.002868          NA
## scale                        118.909944    1.705990          NA
## log_amount                    -0.028965    0.000500    0.970499
## funds2                         0.278656    0.002030    1.316107
## sizemedian                     0.206080    0.003974    1.219319
## sizelarge                      0.365993    0.004166    1.430220
## ejecucion                     -1.506716    0.013002    0.216060
## log_contrato                   0.031513    0.000441    1.031122
## purchasegoods and materials    0.049524    0.002084    1.046487
## purchaseinvestment             0.042829    0.003472    1.036680
## purchaseother                  0.188593    0.019044    1.163308
##                              L95%        U95%      
## shape                                NA          NA
## scale                                NA          NA
## log_amount                     0.969547    0.971451
## funds2                         1.310881    1.321353
## sizemedian                     1.209859    1.228852
## sizelarge                      1.418591    1.441946
## ejecucion                      0.210623    0.221637
## log_contrato                   1.030231    1.032014
## purchasegoods and materials    1.042221    1.050771
## purchaseinvestment             1.029649    1.043760
## purchaseother                  1.120688    1.207550
## 
## N = 296299,  Events: 296299,  Censored: 0
## Total time at risk: 16546174
## Log-likelihood = -1385706, df = 11
## AIC = 2771433

weib1 <- survreg(Surv(total, cens) ~ log_amount + funds2 + size + ejecucion + 
    log_contrato + purchase, data = surv, dist = "weibull")
weib1

## Call:
## survreg(formula = Surv(total, cens) ~ log_amount + funds2 + size + 
##     ejecucion + log_contrato + purchase, data = surv, dist = "weibull")
## 
## Coefficients:
##                 (Intercept)                  log_amount 
##                  4.74942120                 -0.02994534 
##                      funds2                  sizemedian 
##                  0.27467804                  0.19829231 
##                   sizelarge                   ejecucion 
##                  0.35782862                 -1.53219995 
##                log_contrato purchasegoods and materials 
##                  0.03064767                  0.04543896 
##          purchaseinvestment               purchaseother 
##                  0.03602345                  0.15126805 
## 
## Scale= 0.4842034 
## 
## Loglik(model)= -1385706   Loglik(intercept only)= -1408384
##  Chisq= 45357.25 on 9 degrees of freedom, p= <0.0000000000000002 
## n=296299 (19455 observations deleted due to missingness)

summary(weib1)

## 
## Call:
## survreg(formula = Surv(total, cens) ~ log_amount + funds2 + size + 
##     ejecucion + log_contrato + purchase, data = surv, dist = "weibull")
##                                 Value Std. Error       z
## (Intercept)                  4.749421   0.014770  321.56
## log_amount                  -0.029945   0.000500  -59.84
## funds2                       0.274678   0.002030  135.32
## sizemedian                   0.198292   0.003974   49.90
## sizelarge                    0.357829   0.004166   85.90
## ejecucion                   -1.532200   0.013002 -117.84
## log_contrato                 0.030648   0.000441   69.42
## purchasegoods and materials  0.045439   0.002084   21.80
## purchaseinvestment           0.036023   0.003472   10.37
## purchaseother                0.151268   0.019044    7.94
## Log(scale)                  -0.725250   0.001389 -522.26
##                                                p
## (Intercept)                 < 0.0000000000000002
## log_amount                  < 0.0000000000000002
## funds2                      < 0.0000000000000002
## sizemedian                  < 0.0000000000000002
## sizelarge                   < 0.0000000000000002
## ejecucion                   < 0.0000000000000002
## log_contrato                < 0.0000000000000002
## purchasegoods and materials < 0.0000000000000002
## purchaseinvestment          < 0.0000000000000002
## purchaseother                  0.000000000000002
## Log(scale)                  < 0.0000000000000002
## 
## Scale= 0.484 
## 
## Weibull distribution
## Loglik(model)= -1385706   Loglik(intercept only)= -1408384
##  Chisq= 45357.25 on 9 degrees of freedom, p= 0 
## Number of Newton-Raphson Iterations: 5 
## n=296299 (19455 observations deleted due to missingness)

6. Cost estimation

costos <- datos %>% mutate(quarter = quarter(obl_fecha_factura)) %>% select(dataset, 
    obl_monto_obligado, obl_anho_obligacion, obl_entidad_descripcion, quarter, 
    total, total2, r_factura, r_str, r_ot, r_carga_str, r_apro_str, r_tesoro_str, 
    r_carga_ot, r_pago_ot, interes, deflator) %>% mutate(totalf = total/365, 
    retraso30 = total - 30, retraso30 = ifelse(retraso30 < 0, 0, retraso30), 
    retraso30f = retraso30/365, retraso75 = total - 75, retraso75 = ifelse(retraso75 < 
        0, 0, retraso75), retraso75f = retraso75/365, retrasofac = r_factura - 
        15, retrasofac = ifelse(retrasofac < 0, 0, retrasofac), retrasofacf = retrasofac/365, 
    retraso45 = total - 45, retraso45 = ifelse(retraso45 < 0, 0, retraso45), 
    retraso45f = retraso45/365, retraso60 = total - 60, retraso60 = ifelse(retraso60 < 
        0, 0, retraso60), retraso60f = retraso60/365, rfactura = r_factura/365, 
    rstr = r_str/365, rot = r_ot/365, rcargastr = r_carga_str/365, raprobstr = r_apro_str/365, 
    rtesorostr = r_tesoro_str/365, rcargaot = r_carga_ot/365, rpago = r_pago_ot/365, 
    costo30 = ((obl_monto_obligado * retraso30f * interes)/deflator) * 100, 
    costo_total = ((obl_monto_obligado * totalf * interes)/deflator) * 100, 
    costo_75 = ((obl_monto_obligado * retraso75f * interes)/deflator) * 100, 
    costo_fac = ((obl_monto_obligado * retrasofacf * interes)/deflator) * 100, 
    costo_45 = ((obl_monto_obligado * retraso45f * interes)/deflator) * 100, 
    costo_60 = ((obl_monto_obligado * retraso60f * interes)/deflator) * 100, 
    costo_total_nominal = (obl_monto_obligado * totalf * interes), costo_fac = ((obl_monto_obligado * 
        retrasofacf * interes)/deflator) * 100, costo_factura = ((obl_monto_obligado * 
        rfactura * interes)/deflator) * 100, costo_str = ((obl_monto_obligado * 
        rstr * interes)/deflator) * 100, costo_ot = ((obl_monto_obligado * rot * 
        interes)/deflator) * 100, costo_cargastr = ((obl_monto_obligado * rcargastr * 
        interes)/deflator) * 100, costo_aprovstr = ((obl_monto_obligado * raprobstr * 
        interes)/deflator) * 100, costo_tesorostr = ((obl_monto_obligado * rtesorostr * 
        interes)/deflator) * 100, costo_cargaot = ((obl_monto_obligado * rcargaot * 
        interes)/deflator) * 100, costo_pago = ((obl_monto_obligado * rpago * 
        interes)/deflator) * 100)

costo_total <- costos %>% group_by(dataset) %>% select(starts_with("costo")) %>% 
    summarise_all(sum) %>% gather(costo, monto, -dataset) %>% mutate(costo_dolar = (monto/5618.933)/1000000) %>% 
    ungroup() %>% group_by(costo) %>% summarise(costo_dolar = sum(costo_dolar))


costo_gdp17 <- costos %>% select(starts_with("costo")) %>% summarise_all(sum) %>% 
    gather(costo, monto) %>% mutate(costo_dolar = (monto/5618.933)/1000000, 
    GDP_2017 = 165393694205500, perc_gdp = round((monto/GDP_2017 * 100), 2))

# PIB Percentage
costo_gdp2 <- costos %>% select(obl_anho_obligacion, costo30, costo_total) %>% 
    rename(`Total payment duration` = costo_total, `Late payments` = costo30) %>% 
    group_by(obl_anho_obligacion) %>% summarise_all(sum) %>% gather(costo, monto, 
    -obl_anho_obligacion) %>% mutate(costo_dolar = (monto/5618.933)/1000000, 
    GDP_2017 = 165393694205500, perc_gdp = round((monto/GDP_2017 * 100), 3))

ggplot(costo_gdp2, aes(x = obl_anho_obligacion, y = perc_gdp, colour = costo)) + 
    geom_line() + theme(legend.title = element_blank()) + theme_Publication() + 
    scale_color_manual(" ", values = c("#559171", "#FCA32B")) + labs(title = "Cost as a percentage of GDP", 
    subtitle = "Full dataset", y = "% of GDP", x = "Year", caption = "Based on data from datos.hacienda.gov.py") + 
    scale_x_continuous(breaks = seq(2011, 2017, 1))

# Year
year_cost <- costos %>% group_by(obl_anho_obligacion, dataset) %>% rename(Year = obl_anho_obligacion) %>% 
    summarise(`Total cost` = (sum(costo30)/1000000)/5618.933) %>% gather(Type, 
    cost, -Year, -dataset)
year_cost$dataset <- factor(year_cost$dataset, levels = c("B", "A"))

ggplot(year_cost, aes(x = Year, y = cost, fill = dataset)) + geom_bar(stat = "identity") + 
    scale_fill_manual(" ", values = c(A = "#559171", B = "#FCA32B")) + theme(legend.title = element_blank()) + 
    theme_Publication() + labs(title = "Cost of delayed payments by year", subtitle = "30-day deadline", 
    y = "Millions of dollars", x = "Year", caption = "Based on data from datos.hacienda.gov.py") + 
    scale_y_continuous(labels = comma) + scale_x_continuous(breaks = seq(2011, 
    2017, 1))

# Cost by stages
costo_etapa <- costo_total %>% group_by(costo) %>% summarise(costo_dolar = sum(costo_dolar)) %>% 
    filter(costo == "costo_factura" | costo == "costo_cargastr" | costo == "costo_aprovstr" | 
        costo == "costo_tesorostr" | costo == "costo_cargaot" | costo == "costo_pago") %>% 
    mutate(stage = ifelse(costo == "costo_factura", "Invoice", ifelse(costo == 
        "costo_cargastr", "TR stage", ifelse(costo == "costo_aprovstr", "TR stage", 
        ifelse(costo == "costo_tesorostr", "TR stage", "TO stage")))))
costo_etapa$costo <- gsub("costo_factura", "Invoice approval", costo_etapa$costo, 
    fixed = T)
costo_etapa$costo <- gsub("costo_cargastr", "TR creation", costo_etapa$costo, 
    fixed = T)
costo_etapa$costo <- gsub("costo_aprovstr", "TR approval", costo_etapa$costo, 
    fixed = T)
costo_etapa$costo <- gsub("costo_tesorostr", "Transfer to Treasury", costo_etapa$costo, 
    fixed = T)
costo_etapa$costo <- gsub("costo_cargaot", "TO creation", costo_etapa$costo, 
    fixed = T)
costo_etapa$costo <- gsub("costo_pago", "Payment", costo_etapa$costo, fixed = T)


costo_etapa$stage <- factor(costo_etapa$stage, levels = c("Invoice", "TR stage", 
    "TO stage"))
costo_etapa$costo <- factor(costo_etapa$costo, levels = c("Payment", "TO creation", 
    "Transfer to Treasury", "TR approval", "TR creation", "Invoice approval"))
ggplot(costo_etapa, aes(x = stage, y = costo_dolar, fill = costo)) + geom_bar(stat = "identity") + 
    theme_Publication() + coord_flip() + labs(title = "Cost of payment duration by stages", 
    subtitle = "Full dataset", y = "Millions of dollars", x = "", caption = "Based on data from datos.hacienda.gov.py") + 
    scale_y_continuous(breaks = seq(0, 150, 10)) + scale_fill_manual(" ", values = c(`Invoice approval` = "#559171", 
    `TR creation` = "#b5c62d", `TR approval` = "#ffef96", `Transfer to Treasury` = "#FCA32B", 
    `TO creation` = "#5b5b5f", Payment = "#b2b2b2"))

# Cost by institution
costo_inst <- costos %>% filter(obl_entidad_descripcion != "002-MECANISMO NACIONAL DE PREVENCIN DE LA TORTURA", 
    obl_entidad_descripcion != "002-COM. NAC. DE PREV. CONTRA LA TORTURA Y OTROS TRATOS O PENAS.", 
    obl_entidad_descripcion != "003-CRDITO AGRCOLA DE HABILITACIN", obl_entidad_descripcion != 
        "004-DIRECCION DE BENEFICENCIA Y AYUDA SOCIAL", obl_entidad_descripcion != 
        "007-SINDICATURA GENERAL DE QUIEBRAS", obl_entidad_descripcion != "013-ENTE REGULADOR DE SERVICIOS SANITARIOS", 
    obl_entidad_descripcion != "014-INSTITUTO NACIONAL DE COOPERATIVISMO", obl_entidad_descripcion != 
        "014-MINISTERIO DE LA MUJER", obl_entidad_descripcion != "016-SERVICIO NACIONAL DE CALIDAD Y SALUD ANIMAL", 
    obl_entidad_descripcion != "020-INSTITUTO FORESTAL NACIONAL", obl_entidad_descripcion != 
        "021-SECRETARA DEL AMBIENTE", obl_entidad_descripcion != "026-SECRETARIA DE DEFENSA DEL CONSUMIDOR Y EL USUARIO", 
    obl_entidad_descripcion != "027-COMISIN NACIONAL DE LA COMPETENCIA", obl_entidad_descripcion != 
        "028-AGENCIA NACIONAL DE TRANSITO Y SEGURIDAD VIAL", obl_entidad_descripcion != 
        "029-CONSEJO NACIONAL DE EDUCACION SUPERIOR", obl_entidad_descripcion != 
        "030-AGENCIA NACIONAL DE EVAL. Y ACRED. DE LA EDUCACION SUPERIOR", obl_entidad_descripcion != 
        "030-AGENCIA NACIONAL DE EVAL. Y ACRED. DE LA EDUCACION SUPERIOR", obl_entidad_descripcion != 
        "031-AUTORIDAD REGULADORA RADIOLGICA Y NUCLEAR") %>% group_by(obl_entidad_descripcion, 
    dataset) %>% select(starts_with("costo")) %>% summarise_all(sum) %>% arrange(desc(costo_total)) %>% 
    mutate(acumulado = cumsum(costo_total)/sum(costo_total) * 100)

costo_inst2 <- costo_inst %>% filter(obl_entidad_descripcion == "013-MINISTERIO DE OBRAS PUBLICAS Y COMUNICACIONES" | 
    obl_entidad_descripcion == "008-MINISTERIO DE SALUD PUBLICA Y BIENESTAR SOCIAL" | 
    obl_entidad_descripcion == "007-MINISTERIO DE EDUCACION Y CIENCIAS" | obl_entidad_descripcion == 
    "005-MINISTERIO DE DEFENSA NACIONAL" | obl_entidad_descripcion == "003-MINISTERIO DEL INTERIOR") %>% 
    mutate(costo_total_dolar = (costo_total/5618.933)/1000000)

costo_inst2$obl_entidad_descripcion <- gsub("013-MINISTERIO DE OBRAS PUBLICAS Y COMUNICACIONES", 
    "Ministry of Public Works", costo_inst2$obl_entidad_descripcion, fixed = T)
costo_inst2$obl_entidad_descripcion <- gsub("008-MINISTERIO DE SALUD PUBLICA Y BIENESTAR SOCIAL", 
    "Ministry of Health", costo_inst2$obl_entidad_descripcion, fixed = T)
costo_inst2$obl_entidad_descripcion <- gsub("007-MINISTERIO DE EDUCACION Y CIENCIAS", 
    "Ministry of Education", costo_inst2$obl_entidad_descripcion, fixed = T)
costo_inst2$obl_entidad_descripcion <- gsub("005-MINISTERIO DE DEFENSA NACIONAL", 
    "Ministry of Defense", costo_inst2$obl_entidad_descripcion, fixed = T)
costo_inst2$obl_entidad_descripcion <- gsub("003-MINISTERIO DEL INTERIOR", "Ministry of Interior", 
    costo_inst2$obl_entidad_descripcion, fixed = T)

costo_inst2$obl_entidad_descripcion <- factor(costo_inst2$obl_entidad_descripcion, 
    levels = c("Ministry of Interior", "Ministry of Defense", "Ministry of Education", 
        "Ministry of Health", "Ministry of Public Works"))
costo_inst2$dataset <- factor(costo_inst2$dataset, levels = c("B", "A"))

ggplot(costo_inst2, aes(x = obl_entidad_descripcion, y = costo_total_dolar, 
    fill = dataset)) + geom_bar(stat = "identity") + scale_fill_manual("Dataset", 
    values = c(A = "#559171", B = "#FCA32B")) + theme(legend.title = element_blank()) + 
    theme_Publication() + coord_flip() + labs(title = "Total cost by institution", 
    y = "Cost in millions of dollars", x = "Institution", caption = "Based on data from datos.hacienda.gov.py") + 
    scale_y_continuous(breaks = seq(0, 90, 5))