This exploratory data analysis was made based on data provided by the Brazilian government about the expenses allowed to its parliamentarians or C.E.A.P. (Cota para o Exercício da Atividade Parlamentar / Quota for the Exercise of Parliamentary Activity). More information about it (in Portuguese) can be found in its official site
data <- read_csv(here::here("data/dadosCEAP.csv"),
progress = F,
col_types = cols(
nomeParlamentar = col_character(),
idCadastro = col_integer(),
sgUF = col_character(),
sgPartido = col_character(),
tipoDespesa = col_character(),
especDespesa = col_character(),
fornecedor = col_character(),
CNPJCPF = col_character(),
tipoDocumento = col_integer(),
dataEmissao = col_character(),
valorDocumento = col_double(),
valorGlosa = col_integer(),
valorLíquido = col_double()))
data %>%
mutate(dataEmissao = parse_date_time(dataEmissao,"%Y-%m-%d %H:%M:%S"),
year_month = paste(lubridate::year(dataEmissao), # extract year
lubridate::month(dataEmissao), # extract month
sep = "-"),
tipoDespesa = toupper(tipoDespesa)) -> data
state_info <- read_csv(here::here("/data/limiteMensalCEAP.csv"),
progress = F,
col_types = cols(
UF = col_character(),
limite_mensal = col_double()))
data %>%
full_join(state_info,
by = c("sgUF" = "UF")) -> data
data %>%
glimpse()
## Observations: 963,452
## Variables: 15
## $ nomeParlamentar <chr> "ABEL MESQUITA JR.", "ABEL MESQUITA JR.", "ABE...
## $ idCadastro <int> 178957, 178957, 178957, 178957, 178957, 178957...
## $ sgUF <chr> "RR", "RR", "RR", "RR", "RR", "RR", "RR", "RR"...
## $ sgPartido <chr> "DEM", "DEM", "DEM", "DEM", "DEM", "DEM", "DEM...
## $ tipoDespesa <chr> "MANUTENÇÃO DE ESCRITÓRIO DE APOIO À ATIVIDADE...
## $ especDespesa <chr> NA, NA, NA, NA, NA, "Veículos Automotores", "V...
## $ fornecedor <chr> "COMPANHIA DE AGUAS E ESGOTOS DE RORAIMA", "CO...
## $ CNPJCPF <chr> "05939467000115", "05939467000115", "023414700...
## $ tipoDocumento <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ dataEmissao <dttm> 2015-11-14, 2015-12-10, 2015-11-27, 2015-12-3...
## $ valorDocumento <dbl> 165.65, 59.48, 130.95, 196.53, 310.25, 32.00, ...
## $ valorGlosa <int> 0, 0, 0, 347, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ valorLíquido <dbl> 165.65, 59.48, 130.95, 193.06, 310.25, 32.00, ...
## $ year_month <chr> "2015-11", "2015-12", "2015-11", "2015-12", "2...
## $ limite_mensal <dbl> 45612.53, 45612.53, 45612.53, 45612.53, 45612....
read_tsv(here::here("data/trsl_expenses.tsv"),
progress = F,
col_types = cols(
original_term = col_character(),
translation = col_character())) -> trsltd_expenses
trsltd_expenses
data %>%
full_join(trsltd_expenses,
by = c("tipoDespesa" = "original_term")) %>%
mutate(tipoDespesa = translation) %>% # translate col tipoDespesa
select(-translation)-> data # drop unused column
data %>%
sample_n(5) %>%
select(nomeParlamentar,
tipoDespesa,
dataEmissao)
data %>%
mutate(week_day = wday(dataEmissao), # extract week day
is_weekend = (week_day == 1) | # deduce whether it's a weekend
(week_day == 7)) -> data
data %>%
select(dataEmissao,
week_day,
is_weekend) %>%
sample_n(5)
data %>%
na.omit(year_month) %>%
group_by(year_month) %>% # times used
summarize(total = n()) %>% # by year/month
ggplot(aes(year_month, total)) +
geom_point() +
theme(axis.text.x = element_text(angle = 60,
hjust = 1)) +
labs(x="Year and month", y="Absolute Frequency")
data %>%
group_by(idCadastro,
nomeParlamentar) %>%
slice(1) %>% # keep one entry for parliamentarian
ungroup() %>%
group_by(sgPartido) %>% # calc parliamentarian
summarize(num_parliament = n()) %>% # by party
na.omit() %>%
ggplot(aes(reorder(sgPartido,
num_parliament),
num_parliament)) +
geom_bar(stat="identity") +
labs(x="Political parties",
y="Number of parliamentarians") +
theme(axis.text.x = element_text(angle = 60,
hjust = 1))
data %>%
mutate(month = lubridate::month(dataEmissao)) %>% # extract month
na.omit(week_day,month) %>%
group_by(week_day,month) %>% # times used
summarize(total = sum(valorLíquido)) %>% # by day of week / month
tidyr::spread(key= week_day, value=total,
fill = 0) %>%
select(-month) %>% data.matrix() -> matrix_yearly
colnames(matrix_yearly) <- c("Sun","Mon","Tue","Wed","Thu","Fri","Sat")
x.scale <- list(at=seq(from=1,to=12,by=1))
levelplot(matrix_yearly,
col.regions=heat.colors,
scales=list(x=x.scale),
main="Total expenditure across days of the week",
xlab="month of the year", ylab= "week day")
data %>%
group_by(is_weekend,sgPartido) %>%
summarize(total_cost = sum(valorLíquido)) %>%
na.omit(is_weekend) -> party_expense
party_expense %>%
ungroup() %>%
sample_n(5)
ggplot(data= party_expense,
aes(reorder(sgPartido,
total_cost),
total_cost,
fill=is_weekend)) +
geom_bar(data=subset(party_expense,
!is_weekend),
stat = "identity") +
geom_bar(data=subset(party_expense,
is_weekend),
aes(y=-total_cost),
stat = "identity") +
coord_flip() +
labs(x="Political parties",
y="Total expenditure (R$)")
data %>%
filter(is_weekend) %>%
group_by(is_weekend,sgPartido) %>%
summarize(total_cost = sum(valorLíquido)) %>%
na.omit(is_weekend) %>%
ggplot(aes(reorder(sgPartido,
total_cost),
total_cost)) +
geom_bar(stat="identity",
fill="#00BFC4") +
labs(x="Political parties",
y="Total expenditure (R$)",
title="Expenditures during the weekend") +
theme(axis.text.x = element_text(angle = 60,
hjust = 1))
PMDB, PT and PSDB spent the most during the weekend.
data %>%
na.omit(is_weekend,
sgPartido) %>%
group_by(sgPartido) %>%
mutate(total = sum(valorLíquido)) %>% # calc total cost by party
ungroup() %>%
group_by(total,
sgPartido,
is_weekend) %>% # calc cost by weekdays
summarize(total_cost = sum(valorLíquido)) %>% # and by weekends
mutate(fract = total_cost/total) %>% # calc fraction
filter(is_weekend) %>%
ggplot(aes(reorder(sgPartido,fract),
fract)) +
geom_bar(stat="identity",
fill="#00BFC4") +
labs(x="Political parties",
y="Percentage (%)",
title="Expenditures during the weekend in R$") +
scale_y_continuous(labels = scales::percent) +
theme(axis.text.x = element_text(angle = 60,
hjust = 1))
PRTB was the party to have most of its expenditures made during the weekend.
data %>%
mutate(fornecedor = toupper(fornecedor)) %>%
mutate(fornecedor = ifelse( # shorten the name
fornecedor == "TODDE ADVOGADOS E CONSULTORES ASSOCIADOS",
"TODDE ADVOGADOS",
fornecedor)) %>%
group_by(fornecedor,
sgPartido,
tipoDespesa) %>% # calc expense of each
summarize(total_cost = sum(valorLíquido)) %>% # supplier for each party
ungroup() %>%
group_by(fornecedor) %>% # calc median expense of
mutate(median_expense = median(total_cost)) %>% # each supplier (all parties)
ungroup() -> supplier_data
supplier_data %>%
select(fornecedor,
median_expense,
sgPartido,
total_cost) %>%
sample_n(5)
supplier_data %>%
select(median_expense) %>%
unique() %>%
arrange(desc(median_expense)) %>%
slice(10) -> tenth_supplier # 10th biggest median cost
supplier_data %>%
group_by(fornecedor,
sgPartido,
tipoDespesa) %>%
filter(median_expense >= tenth_supplier) %>% # filter top 10 suppliers
ggplot(aes(reorder(fornecedor,
median_expense),
total_cost,
color=tipoDespesa)) +
geom_boxplot() +
labs(x="Supplier",
y="Cost (R$)",
color=NULL) +
coord_flip()+
theme(legend.position="bottom",
legend.direction = "vertical") +
theme(legend.text=element_text(size=8),
axis.text.y = element_text(size=8))
The costliest suppliers of all were TAM and GOL with Air Ticket Issuance