library(tidyverse)
library(readxl)
library(formattable)
library(ggplot2)
library(dplyr)
library(lubridate)
library(scales)
library(readr)
library(gt)
library(extrafont)
library(ggstats)
library(hrbrthemes)
# Load the candidate data set: comma-separated fields, "." as decimal mark
# (exactly the defaults of read.csv()).
dados <- read.csv("data.csv")

# Flag low-cardinality columns (fewer than 8 distinct values) as categorical.
# vapply() always returns a logical vector, even for a data frame without
# columns, so the logical indexing below cannot break on a list result.
col_names <- vapply(
  dados,
  function(col) length(unique(col)) < 8,
  logical(1)
)

# Convert the flagged columns to factors; all_of() makes it explicit that the
# selection comes from an external character vector.
dados <- dados %>%
  mutate(across(all_of(names(col_names)[col_names]), as.factor))
R Programming Example - Exploratory Data Analysis
Loading the packages and importing the dataset
In addition to loading the dataset, we converted some variables to factors to make them easier to work with. A brief view of the data looks like this:
| ID | salary | gender | race | married | age | years_of_schooling |
|---|---|---|---|---|---|---|
| 1 | 3439.524 | Female | Black | Yes | 30 | 19 |
| 2 | 3769.823 | Female | Black | Yes | 42 | 8 |
| 3 | 5558.708 | Female | White | No | 25 | 10 |
| 4 | 4070.508 | Male | Black | No | 31 | 19 |
| 5 | 4129.288 | Female | White | Yes | 38 | 11 |
| 6 | 5715.065 | Female | indigenous | No | 46 | 8 |
This example is to show programming skills for the studypool website.
Now let’s answer our questions in some brief ways, either using simple and objective commands to get to the answer or through beautiful graphics for a better visualization of the result.
Question 1: What is the average salary of the individuals in the database? (round the value to one decimal place)
# Question 1: mean salary over all candidates, rounded to one decimal place,
# rendered as a gt table.
resultado <- summarise(dados, salary_mean = round(mean(salary), 1))
gt(resultado)
| salary_mean |
|---|
| 4090.4 |
Using a simple command from the dplyr data manipulation package, we compute the average salary and round it to one decimal place.
Question 2: How many female candidates are in the database?
# Question 2: number of candidates per gender, rendered as a gt table.
num <- dados %>%
  group_by(gender) %>%
  summarise(Number_of_candidates = n())
gt(num)
| gender | Number_of_candidates |
|---|---|
| Female | 49 |
| Male | 51 |
Again we use data manipulation and the gt package to display the result: there are 49 female candidates in the database.
Question 3: What is the average age of married candidates? (round to zero decimal places)
# Question 3: mean age by marital status, rounded to a whole number of years,
# rendered as a gt table.
married <- dados %>%
  group_by(married) %>%
  summarise(Age_Mean = round(mean(age)))
gt(married)
| married | Age_Mean |
|---|---|
| No | 34 |
| Yes | 38 |
It is worth mentioning that married candidates are, on average, somewhat older than unmarried ones (38 versus 34 years).
Question 4: What is the most common color among individuals?
# Question 4: horizontal bar chart of the number of candidates per race.

# Fill colours passed to scale_fill_manual() below.
color <- c("#D3D3D3", "#F5F5DC", "#A0AAB2", "#ECECEC", "#483C32")

# Candidates per race, most frequent first.
corcomum <- dados %>%
  group_by(race) %>%
  count() %>%
  arrange(desc(n))

ggplot(corcomum, aes(x = reorder(race, n), y = n, fill = race)) +
  geom_bar(stat = "identity", width = 0.5) + # narrower bars
  geom_text(
    aes(label = n),
    hjust = -0.2, # adjusts the position of the count label
    vjust = 0.4,
    color = "black", # darker lettering
    size = 7, # larger text
    family = "Arial"
  ) +
  coord_flip() +
  scale_fill_manual(values = color) +
  labs(
    title = "Number of candidates by race",
    y = "Number of Candidates",
    x = "candidate's race"
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.position = "none",
    # Darker, bold title and axis lettering.
    plot.title = element_text(
      hjust = 0.5, size = 20, color = "black", family = "Tahoma", face = "bold"
    ),
    axis.text = element_text(
      size = 14, color = "black", family = "Tahoma", face = "bold"
    ),
    axis.title = element_text(
      size = 16, color = "black", family = "Tahoma", face = "bold"
    ),
    axis.line = element_line(color = "black"), # darker axis line
    panel.background = element_rect(fill = "white"), # white background
    plot.margin = margin(0.2, 0.2, 0.2, 0.2, "cm") # 0.2 cm margin on each side
  )
Through the graph, we see that the majority of candidates are indigenous, followed by black and white. we use the ggplot2 package for great data visualization, in addition to using other packages to improve the graph.
Question 5: What is the average, minimum, maximum and median level of education (years of schooling) of the candidates?
# Question 5: summary statistics for years of schooling. The mean is formatted
# as text with two decimals; min, max and median stay numeric.
estatisticasne <- dados %>%
  summarise(
    mean = sprintf("%.2f", mean(years_of_schooling)),
    min = min(years_of_schooling),
    max = max(years_of_schooling),
    median = median(years_of_schooling)
  )
estatisticasne %>%
  gt() %>%
  tab_header(title = "
Statistics level of education (in years) of candidates")
| Statistics level of education (in years) of candidates | |||
| mean | min | max | median |
|---|---|---|---|
| 13.98 | 8 | 20 | 14 |
Again, through data manipulation we obtained all the statistics from our database.
Question 6: What is the average salary difference between men and women in the database?
# Question 6: average salary by gender and the gap between them.
# Row labels, in the same order as the values placed in `Média` below.
Sexo <- c("Male", "Female", "salary gap")

# SM = mean salary of male candidates, SF = mean salary of female candidates.
# Each mean must be filtered on the gender its table label refers to; with the
# filters swapped the table reports each gender's average under the other's
# label and the gap gets the wrong sign.
SM <- dados %>% filter(gender == "Male") %>% pull(salary) %>% mean()
SF <- dados %>% filter(gender == "Female") %>% pull(salary) %>% mean()

# Male mean, female mean, and the gap defined as male minus female.
Média <- c(SM, SF, SM - SF)
DataSalario <- data.frame(Sexo, Média)
DataSalario %>%
  gt() %>%
  tab_header(title = md("
Salaries by Sex and **Salary Gap**"))
| Salaries by Sex and Salary Gap | |
| Sexo | Média |
|---|---|
| Male | 4234.4516 |
| Female | 3952.0090 |
| salary gap | 282.4426 |
Now, in addition to using data manipulation from the dplyr package, we also use subtraction and then create a data.frame with the necessary information.
Question 7: What is the proportion of married candidates among candidates of different colors?
# Question 7: stacked bars scaled to 100% showing, for each race, the share of
# married and unmarried candidates; stat = "prop" labels each segment with its
# within-race proportion (the `by` aesthetic).
ggplot(dados, aes(x = race, fill = married, by = race)) +
  geom_bar(position = "fill") +
  geom_text(stat = "prop", position = position_fill(.5)) +
  labs(
    title = "Proportion of married candidates among candidates of different colors",
    y = "Percentage",
    x = "Candidate Color"
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    # Darker, bold title and axis lettering.
    plot.title = element_text(
      hjust = 0.5, size = 20, color = "black", family = "Tahoma", face = "bold"
    ),
    axis.text = element_text(
      size = 14, color = "black", family = "Tahoma", face = "bold"
    ),
    axis.title = element_text(
      size = 16, color = "black", family = "Tahoma", face = "bold"
    ),
    axis.line = element_line(color = "black") # darker axis line
  )
Now it is worth mentioning that we created a graph to better visualize the requested results, where each bar represents a color and the percentage of matches among each color.
Question 8: What is the relationship between years of education and salary?
# Question 8: scatter plot of salary against years of schooling with a fitted
# linear trend line (no confidence band).
ggplot(dados, aes(x = years_of_schooling, y = salary)) +
  geom_point(color = "black") +
  geom_smooth(method = lm, color = "red", se = FALSE) +
  labs(
    title = "
Relationship between years of schooling and salary",
    y = "Salary",
    x = "Years of Schooling"
  ) +
  theme_ipsum() +
  theme(
    plot.title = element_text(
      hjust = 0.5, size = 20, color = "black", family = "Tahoma", face = "bold"
    ),
    # Centre both axis titles.
    axis.title.y = element_text(
      size = 16, color = "black", hjust = 0.5, family = "Tahoma", face = "bold"
    ),
    axis.title.x = element_text(
      size = 16, color = "black", hjust = 0.5, family = "Tahoma", face = "bold"
    )
  )
It is worth highlighting now that our graph of the relationship between years of schooling and salary was a dispersion graph, where it is better to show if there is any trend and we can also draw a line to better visualize it, in this case we see that there is no correlation between the variables, indicating that even with many years of schooling, the salary does not increase.
Question 9: What is the average age of candidates who have an above-average salary? (round the value to zero decimal places) First let’s find the average salary
# Question 9: average age overall and for candidates above / at-or-below the
# mean salary, rounded to whole years.
salariom <- mean(dados$salary)

# Split the candidates at the mean salary.
dadosSAM <- filter(dados, salary > salariom)
dadosSSAM <- filter(dados, salary <= salariom)

Candidatos <- c(
  "All candidates",
  "Salary above average",
  "Salary below average"
)
Idade_Média <- round(
  c(mean(dados$age), mean(dadosSAM$age), mean(dadosSSAM$age))
)

dataa <- data.frame(Candidatos, Idade_Média)
dataa %>%
  gt() %>%
  tab_header(title = md("
Average age of Candidates by **salary range**"))
| Average age of Candidates by salary range | |
| Candidatos | Idade_Média |
|---|---|
| All candidates | 36 |
| Salary above average | 37 |
| Salary below average | 35 |
For this question, we created variables that split our data into salaries above and below the average. We then built a data.frame and a table with all the information we need, which shows that the average age is higher for candidates with above-average salaries.
Question 10: What is the average age of individuals of each color?
# Question 10: mean age per race, rounded to two decimals, rendered as a
# titled gt table.
dados %>%
  group_by(race) %>%
  summarise(Middle_Ages = round(mean(age), 2)) %>%
  gt() %>%
  tab_header(title = md("Average age of individuals of each **Color**"))
| Average age of individuals of each Color | |
| race | Middle_Ages |
|---|---|
| Black | 33.39 |
| Brown | 39.71 |
| indigenous | 35.58 |
| White | 37.70 |
| Yellow | 32.86 |
For this question, a simple data manipulation was also able to identify the average ages, verifying that the Brown are the oldest and the Yellow are the youngest
Question 11: Create a boxplot to visualize the distribution of individual salaries in relation to marital status (married or not). Interpret the results.
# Question 11: salary box plots, one facet per marital status.
ggplot(dados, aes(x = NULL, y = salary)) +
  geom_boxplot(fill = "#4e79a7", colour = "gold", alpha = 0.8) +
  facet_grid(. ~ married) +
  # NOTE: scale limits (unlike coord_cartesian) discard salaries outside
  # 1000-7000 before the box statistics are computed.
  scale_y_continuous(
    name = "Salary Distribution",
    breaks = seq(0, 7000, 1000),
    limits = c(1000, 7000)
  ) +
  scale_x_discrete(name = "Candidate's Marital Status") +
  ggtitle("Boxplot visualization of Salary distribution in relation\n to marital status (Whether the candidate is married or not)") +
  theme_bw() +
  theme(
    plot.title = element_text(
      hjust = 0.5, size = 14, family = "Tahoma", face = "bold"
    ),
    text = element_text(size = 12, family = "Tahoma"),
    axis.title = element_text(face = "bold"),
    axis.text.x = element_text(size = 11)
  )
For this last graph, it is worth highlighting that for a better visualization, we separated it into two images that share the X axis, for a statistical analysis, we verified that the median of married candidates is slightly higher and also has a greater variation, indicating that the married candidates have incomes closer to each other, but even unmarried candidates having a higher median, indicates a greater dispersion due to the size of the Boxplot, indicating that salaries range from low values to very high values(something that is more common in relation to married candidates).