This exploratory data analysis was made based on data provided by the Brazilian government about the expenses allowed to its parliamentarians or C.E.A.P. (Cota para o Exercício da Atividade Parlamentar / Quota for the Exercise of Parliamentary Activity). More information about it (in Portuguese) can be found in its official site
data <- read_csv(here::here("data/dadosCEAP.csv"),
progress = F,
col_types = cols(
nomeParlamentar = col_character(),
idCadastro = col_integer(),
sgUF = col_character(),
sgPartido = col_character(),
tipoDespesa = col_character(),
especDespesa = col_character(),
fornecedor = col_character(),
CNPJCPF = col_character(),
tipoDocumento = col_integer(),
dataEmissao = col_character(),
valorDocumento = col_double(),
valorGlosa = col_integer(),
valorLíquido = col_double()))
data %>%
mutate(dataEmissao = parse_date_time(dataEmissao,"%Y-%m-%d %H:%M:%S"),
year_month = paste(lubridate::year(dataEmissao), # extract year
lubridate::month(dataEmissao), # extract month
sep = "-"),
tipoDespesa = toupper(tipoDespesa)) -> data
state_info <- read_csv(here::here("/data/limiteMensalCEAP.csv"),
progress = F,
col_types = cols(
UF = col_character(),
limite_mensal = col_double()))
data %>%
full_join(state_info,
by = c("sgUF" = "UF")) -> data
data %>%
glimpse()
## Observations: 963,452
## Variables: 15
## $ nomeParlamentar <chr> "ABEL MESQUITA JR.", "ABEL MESQUITA JR.", "ABE...
## $ idCadastro <int> 178957, 178957, 178957, 178957, 178957, 178957...
## $ sgUF <chr> "RR", "RR", "RR", "RR", "RR", "RR", "RR", "RR"...
## $ sgPartido <chr> "DEM", "DEM", "DEM", "DEM", "DEM", "DEM", "DEM...
## $ tipoDespesa <chr> "MANUTENÇÃO DE ESCRITÓRIO DE APOIO À ATIVIDADE...
## $ especDespesa <chr> NA, NA, NA, NA, NA, "Veículos Automotores", "V...
## $ fornecedor <chr> "COMPANHIA DE AGUAS E ESGOTOS DE RORAIMA", "CO...
## $ CNPJCPF <chr> "05939467000115", "05939467000115", "023414700...
## $ tipoDocumento <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ dataEmissao <dttm> 2015-11-14, 2015-12-10, 2015-11-27, 2015-12-3...
## $ valorDocumento <dbl> 165.65, 59.48, 130.95, 196.53, 310.25, 32.00, ...
## $ valorGlosa <int> 0, 0, 0, 347, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ valorLíquido <dbl> 165.65, 59.48, 130.95, 193.06, 310.25, 32.00, ...
## $ year_month <chr> "2015-11", "2015-12", "2015-11", "2015-12", "2...
## $ limite_mensal <dbl> 45612.53, 45612.53, 45612.53, 45612.53, 45612....
read_tsv(here::here("data/trsl_expenses.tsv"),
progress = F,
col_types = cols(
original_term = col_character(),
translation = col_character())) -> trsltd_expenses
trsltd_expenses
data %>%
full_join(trsltd_expenses,
by = c("tipoDespesa" = "original_term")) %>%
mutate(tipoDespesa = translation) %>% # translate col tipoDespesa
select(-translation)-> data # drop unused column
data %>%
sample_n(5) %>%
select(nomeParlamentar,
tipoDespesa,
dataEmissao)
data %>%
na.omit(year_month) %>%
group_by(year_month) %>% # times used
summarize(total = n()) %>% # by year/month
ggplot(aes(year_month, total)) +
geom_point() +
theme(axis.text.x = element_text(angle = 60,
hjust = 1)) +
labs(x="Year and month", y="Absolute Frequency")
data %>%
group_by(idCadastro,
nomeParlamentar) %>%
slice(1) %>% # keep one entry for parliamentarian
ungroup() %>%
group_by(sgPartido) %>% # calc parliamentarian
summarize(num_parliament = n()) %>% # by party
na.omit() %>%
ggplot(aes(reorder(sgPartido,
num_parliament),
num_parliament)) +
geom_bar(stat="identity") +
labs(x="Political parties",
y="Number of parliamentarians") +
theme(axis.text.x = element_text(angle = 60,
hjust = 1))
data %>%
group_by(sgUF,idCadastro,nomeParlamentar) %>% # calculate expense for
summarize(total_expense = sum(valorLíquido)) %>% # each parliamentarian
ungroup() %>%
top_n(n=10, wt=total_expense) %>% # get the top 10 expenders
ggplot(aes(total_expense,
reorder(nomeParlamentar,
total_expense))) +
geom_segment(aes(x = min(total_expense),
y =reorder(nomeParlamentar,
total_expense),
xend = total_expense,
yend = nomeParlamentar),
color = "grey50", size=0.35) +
geom_point(aes(color=sgUF), size=3.5) +
scale_x_continuous(breaks = seq(from = 135E3,
to=1E7,
by = 1E4)) +
theme(axis.text.x = element_text(angle = 30,
hjust = 1)) +
guides(color=guide_legend("State")) +
labs(x="Total Expense (R$)",
y="Parliamentarian name")
state_info %>%
ggplot(aes(reorder(UF,limite_mensal),
limite_mensal)) +
geom_point() +
scale_y_continuous(breaks = seq(5E2,5E5,1.5E3)) +
labs(y="Montly allowance", x="State")
The state of Roraima (RR) has the biggest monthly allowance in the whole country which is approximately \(R\$\:45,\!500.\,00\). It makes sense that their parliamentarians are among the biggest spenders in the whole country.
The federal district (DF), where’s the capital, has the smallest allowance which is a little more than \(R\$\:30,\!500.\,00\). A parliamentarian who lives there is already close to the capital so doesn’t have to travel by plane and probably already has a house there, so the smaller allowance is to be expected.
data %>%
group_by(sgUF,idCadastro,nomeParlamentar) %>% # calculate expense for
summarize(total_expense = sum(valorLíquido)) %>% # each parliamentarian
ungroup() %>%
filter(total_expense >= 0) %>% # Remove those without actual net amount
arrange(total_expense) %>% # Order by total expense
slice(1:10) %>% # Get the top 10 frugal
ggplot(aes(total_expense,
reorder(nomeParlamentar,
total_expense))) +
geom_segment(aes(x = 0,
y = reorder(nomeParlamentar,
-total_expense),
xend = total_expense,
yend = nomeParlamentar),
color = "grey50", size=0.35) +
geom_point(aes(color=sgUF), size=3.5) +
scale_x_continuous(breaks = seq(from = 0,
to=1E5,
by = 20)) +
guides(color=guide_legend("State")) +
labs(x="Total Expense (R$)",
y="Parliamentarian name")
Going against what one would expect none of the more frugal parliamentarians are from the Federal District (DF) which has the smallest monthly allowance.
Let’s look at the exact values for the top 3 most frugal parliamentarians
data %>%
group_by(sgUF,idCadastro,nomeParlamentar) %>%
summarize(total_expense = sum(valorLíquido)) %>%
ungroup() %>%
filter(total_expense >= 0) %>%
arrange(total_expense) %>%
slice(1:3)
data %>%
select(sgUF,fornecedor,
nomeParlamentar,
sgPartido,tipoDespesa) %>%
filter(nomeParlamentar %in%
c("CAMILO COLA",
"MARCIO MONTEIRO",
"MARCELO ALMEIDA"))
Camilo Cola seems the type of man that would pay for pretty much all of his expenses as parliamentarian.
It’s likely that our second place, ‘the richest parliamentarian in Brazil’, would pay for most of his expenses.
The third place simply didn’t have enough time to spend any money.
data %>%
filter(tipoDocumento == 2) %>% # Keep only expenses abroad
group_by(sgUF) %>% # group by State
summarize(international_expense = sum(valorLíquido)) %>% # calc expense by state
ggplot(aes(reorder(sgUF,international_expense),
international_expense)) +
geom_bar(stat = "identity") +
labs(x="States", y="Expenses abroad (R$)")
data %>%
filter(tipoDocumento == 2) %>% # keep only expenses abroad
filter(sgUF == "MA") %>% # keep only expenses from MA
select(fornecedor,
tipoDespesa,
valorLíquido,
nomeParlamentar,
sgUF,
sgPartido)
data %>%
group_by(sgUF,idCadastro,year_month, # group by parliamentarian
nomeParlamentar,limite_mensal) %>% # and vars of interest
summarise(monthly_expense = sum(valorLíquido)) %>% # calc expense by month/year
filter(year_month != "NA-NA") %>% # remove expense without a month/year
filter(monthly_expense > limite_mensal) %>% # keep only violations
ungroup() -> expense_month_year
#
expense_month_year %>%
select(nomeParlamentar,
year_month,
limite_mensal,
monthly_expense) %>%
sample_n(5)
expense_month_year %>%
group_by(idCadastro,nomeParlamentar) %>% # group only by parliamentarian
arrange(desc(monthly_expense)) %>% # order by month/year expense
slice(1) %>% # keep biggest expense of each parliamentarian
ungroup() %>%
top_n(n=10, wt=monthly_expense) %>% # get overall top 10 month/year expenses
mutate(overbudget = monthly_expense - limite_mensal) %>% # calc actual overbudget
ggplot(aes(overbudget,
reorder(nomeParlamentar,
overbudget))) +
geom_segment(aes(x = 1E5,
y =reorder(nomeParlamentar,
overbudget),
xend = overbudget,
yend = nomeParlamentar),
color = "grey50",
size=0.35) +
geom_point(aes(color=sgUF),size=3) +
labs(x="Overbudget (R$)",
y="parliamentarians")
We will consider both the cost of air tickets themselves and the cost of their issuance.
data %>%
filter(tipoDespesa %in%
c("AIRLINE TICKETS",
"AIR TICKET ISSUANCE")) %>% # keep only fligths
group_by(sgUF) %>%
summarize(international_expense = sum(valorLíquido)) %>%
filter(sgUF != "NA") %>%
ggplot(aes(reorder(sgUF,international_expense),
international_expense)) +
geom_bar(stat = "identity") +
labs(x="States", y="Flight expense")
data %>%
filter(sgUF == "PB") %>% # keep only PB
na.omit(sgUF,
sgPartido) %>%
group_by(sgPartido) %>%
summarize(frequency =n()) %>% # calc num of times
ggplot(aes(reorder(sgPartido,frequency),
frequency)) +
geom_bar(stat = "identity") +
labs(x="Parties", y="Absolute frequency")
data %>%
filter(sgUF == "PB") %>% # keep only PB
na.omit(sgUF,
sgPartido,
valorLíquido) %>%
group_by(sgPartido) %>%
summarize(total_expense = sum(valorLíquido)) %>% # calc total expense
ggplot(aes(reorder(sgPartido,total_expense),
total_expense)) +
geom_bar(stat = "identity") +
labs(x="Parties", y="Total expense")
data %>%
filter(sgPartido %in% c("PSDB")) %>% # keep only PSDB
group_by(tipoDespesa) %>%
summarize(frequency = n()) %>% # calc total times
ggplot(aes(reorder(tipoDespesa,frequency),
frequency)) +
geom_bar(stat="identity",
fill="#1E347F") +
labs(y= "Absolute Frequency",
x="Type of expense") +
coord_flip() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1))
data %>%
filter(sgPartido %in% c("PMDB")) %>% # keep only PMDB
group_by(tipoDespesa) %>%
summarize(frequency = n()) %>% # calc total times
ggplot(aes(reorder(tipoDespesa,frequency),
frequency)) +
geom_bar(stat="identity",
fill="#7F6607") +
labs(y= "Absolute Frequency",
x="Type of expense") +
coord_flip() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1))
data %>%
filter(sgPartido %in% c("PT")) %>% # keep only PT
group_by(tipoDespesa) %>%
summarize(frequency = n()) %>% # calc total times
ggplot(aes(reorder(tipoDespesa,frequency),
frequency)) +
geom_bar(stat="identity",
fill="#7F1707") +
labs(y= "Absolute Frequency",
x="Type of expense") +
coord_flip() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1))
Across all three parties the expense most frequently paid for with CEAP is Air Tick Issuance, followed by Fuel and Lubricants. The third place is Telephony for PSDB and PMDB and Taxi Service, Toll and Parking for PT.
data %>%
filter(sgPartido %in% c("PSDB")) %>% # keep only PSDB
group_by(tipoDespesa) %>%
summarize(expense = sum(valorLíquido)) %>% # calc total expense
ggplot(aes(reorder(tipoDespesa,expense),
expense)) +
geom_bar(stat="identity",
fill="#1E347F") +
labs(y= "Total expense",
x="Type of expense") +
coord_flip() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1))
data %>%
filter(sgPartido %in% c("PMDB")) %>% # keep only PMDB
group_by(tipoDespesa) %>%
summarize(expense = sum(valorLíquido)) %>% # calc total expense
ggplot(aes(reorder(tipoDespesa,expense),
expense)) +
geom_bar(stat="identity",
fill="#7F6607") +
labs(y= "Total expense",
x="Type of expense") +
coord_flip() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1))
data %>%
filter(sgPartido %in% c("PT")) %>% # keep only PT
group_by(tipoDespesa) %>%
summarize(expense = sum(valorLíquido)) %>% # calc total expense
ggplot(aes(reorder(tipoDespesa,expense),
expense)) +
geom_bar(stat="identity",
fill="#7F1707") +
labs(y= "Total expense",
x="Type of expense") +
coord_flip() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1))
PT and PSDB have the same three costlier expenses Air Ticket Issuance, Divulgation and Office Maintenance. Unlike the other for PMDB Divulgation is costlier than Air Ticket Issuance and Vehicle Renting/Freighting takes the spot from Office Maintenance.