PD 1 Gabriel Lopes Guidi

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(summarytools)

Anexando pacote: 'summarytools'

O seguinte objeto é mascarado por 'package:tibble':

    view
library(knitr)
library(ggplot2)
library(rlang)

Anexando pacote: 'rlang'

Os seguintes objetos são mascarados por 'package:purrr':

    flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
    flatten_raw, invoke, splice
library(patchwork)
library(corrplot)
corrplot 0.95 loaded
library(gtsummary)

adult_csv <- read_csv("adult.csv")
Rows: 32561 Columns: 15
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): workclass, education, marital.status, occupation, relationship, rac...
dbl (6): age, fnlwgt, education.num, capital.gain, capital.loss, hours.per.week

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(adult_csv)
Rows: 32,561
Columns: 15
$ age            <dbl> 90, 82, 66, 54, 41, 34, 38, 74, 68, 41, 45, 38, 52, 32,…
$ workclass      <chr> "?", "Private", "?", "Private", "Private", "Private", "…
$ fnlwgt         <dbl> 77053, 132870, 186061, 140359, 264663, 216864, 150601, …
$ education      <chr> "HS-grad", "HS-grad", "Some-college", "7th-8th", "Some-…
$ education.num  <dbl> 9, 9, 10, 4, 10, 9, 6, 16, 9, 10, 16, 15, 13, 14, 16, 1…
$ marital.status <chr> "Widowed", "Widowed", "Widowed", "Divorced", "Separated…
$ occupation     <chr> "?", "Exec-managerial", "?", "Machine-op-inspct", "Prof…
$ relationship   <chr> "Not-in-family", "Not-in-family", "Unmarried", "Unmarri…
$ race           <chr> "White", "White", "Black", "White", "White", "White", "…
$ sex            <chr> "Female", "Female", "Female", "Female", "Female", "Fema…
$ capital.gain   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ capital.loss   <dbl> 4356, 4356, 4356, 3900, 3900, 3770, 3770, 3683, 3683, 3…
$ hours.per.week <dbl> 40, 18, 40, 40, 40, 45, 40, 20, 40, 60, 35, 45, 20, 55,…
$ native.country <chr> "United-States", "United-States", "United-States", "Uni…
$ income         <chr> "<=50K", "<=50K", "<=50K", "<=50K", "<=50K", "<=50K", "…
# Seleciona todas as variáveis categóricas
categoricas <- adult_csv |>
  select(where(~ is.character(.))) %>%
  names()

# Filtra apenas pessoas com renda >50K
adult_high <- adult_csv |>
  filter(income == ">50K")
resultado_high <- adult_high |>
  count(across(all_of(categoricas)), sort = TRUE)|>
  slice(1:10)

kable(resultado_high)
workclass education marital.status occupation relationship race sex native.country income n
Private Bachelors Married-civ-spouse Exec-managerial Husband White Male United-States >50K 369
Private HS-grad Married-civ-spouse Craft-repair Husband White Male United-States >50K 255
Private Bachelors Married-civ-spouse Prof-specialty Husband White Male United-States >50K 223
Private Bachelors Married-civ-spouse Sales Husband White Male United-States >50K 198
Private Masters Married-civ-spouse Exec-managerial Husband White Male United-States >50K 159
Private Some-college Married-civ-spouse Craft-repair Husband White Male United-States >50K 151
Private Some-college Married-civ-spouse Exec-managerial Husband White Male United-States >50K 126
Private Some-college Married-civ-spouse Sales Husband White Male United-States >50K 119
Private Masters Married-civ-spouse Prof-specialty Husband White Male United-States >50K 115
Private HS-grad Married-civ-spouse Transport-moving Husband White Male United-States >50K 111

Item 8: Observação: a variável education.num são só valores atribuídos abritariamente para representar a escolaridade de indivíduos, não tem sentido medidas de tendência central e dispersão.

Numéricas

resumo_age <- adult_csv |> summarise(variavel = "age",min = min(age), média = mean(age),
            mediana =median(age),
            sd = sd(age) , quartil_25 = quantile(age,0.25),
            quartil_75 = quantile(age,0.75),max = max(age))

kable(resumo_age)
variavel min média mediana sd quartil_25 quartil_75 max
age 17 38.58165 37 13.64043 28 48 90
resumo_fnlwgt <- adult_csv |> summarise(variavel = "fnlwgt",min = min(fnlwgt), 
            média = mean(fnlwgt),
            mediana =median(fnlwgt),
            sd = sd(fnlwgt) , quartil_25 = quantile(fnlwgt,0.25),
            quartil_75 = quantile(fnlwgt,0.75) , max = max(fnlwgt))

kable(resumo_fnlwgt)
variavel min média mediana sd quartil_25 quartil_75 max
fnlwgt 12285 189778.4 178356 105550 117827 237051 1484705
resumo_capital.gain <- adult_csv |> summarise(variavel = "capital.gain",
            min = min(capital.gain),
            média = mean(capital.gain),
            mediana =median(capital.gain),
            sd = sd(capital.gain) , quartil_25 = quantile(capital.gain,0.25),
            quartil_75 = quantile(capital.gain,0.75), max = max(capital.gain))

kable(resumo_capital.gain)
variavel min média mediana sd quartil_25 quartil_75 max
capital.gain 0 1077.649 0 7385.292 0 0 99999
resumo_capital.loss <- adult_csv |> summarise(variavel = "capital.loss",
            min = min(capital.loss),
            média = mean(capital.loss),
            mediana = median(capital.loss),
            sd = sd(capital.loss) , quartil_25 = quantile(capital.loss,0.25),
            quartil_75 = quantile(capital.loss,0.75), max = max(capital.loss))

kable(resumo_capital.loss)
variavel min média mediana sd quartil_25 quartil_75 max
capital.loss 0 87.30383 0 402.9602 0 0 4356
resumo_hours.per.week <- adult_csv |> summarise(variavel = "hours.per.week", 
            min = min(hours.per.week),                                   
            média = mean(hours.per.week),
            mediana = median(hours.per.week),
            sd = sd(hours.per.week) , quartil_25 = quantile(hours.per.week,0.25),
            quartil_75 = quantile(hours.per.week,0.75), max = max(hours.per.week))

kable(resumo_hours.per.week)
variavel min média mediana sd quartil_25 quartil_75 max
hours.per.week 1 40.43746 40 12.34743 40 45 99

Categóricas

chr_var <- adult_csv |> 
  select(where(is.character))



tbl_summary(chr_var)
Characteristic N = 32,5611
workclass
    ? 1,836 (5.6%)
    Federal-gov 960 (2.9%)
    Local-gov 2,093 (6.4%)
    Never-worked 7 (<0.1%)
    Private 22,696 (70%)
    Self-emp-inc 1,116 (3.4%)
    Self-emp-not-inc 2,541 (7.8%)
    State-gov 1,298 (4.0%)
    Without-pay 14 (<0.1%)
education
    10th 933 (2.9%)
    11th 1,175 (3.6%)
    12th 433 (1.3%)
    1st-4th 168 (0.5%)
    5th-6th 333 (1.0%)
    7th-8th 646 (2.0%)
    9th 514 (1.6%)
    Assoc-acdm 1,067 (3.3%)
    Assoc-voc 1,382 (4.2%)
    Bachelors 5,355 (16%)
    Doctorate 413 (1.3%)
    HS-grad 10,501 (32%)
    Masters 1,723 (5.3%)
    Preschool 51 (0.2%)
    Prof-school 576 (1.8%)
    Some-college 7,291 (22%)
marital.status
    Divorced 4,443 (14%)
    Married-AF-spouse 23 (<0.1%)
    Married-civ-spouse 14,976 (46%)
    Married-spouse-absent 418 (1.3%)
    Never-married 10,683 (33%)
    Separated 1,025 (3.1%)
    Widowed 993 (3.0%)
occupation
    ? 1,843 (5.7%)
    Adm-clerical 3,770 (12%)
    Armed-Forces 9 (<0.1%)
    Craft-repair 4,099 (13%)
    Exec-managerial 4,066 (12%)
    Farming-fishing 994 (3.1%)
    Handlers-cleaners 1,370 (4.2%)
    Machine-op-inspct 2,002 (6.1%)
    Other-service 3,295 (10%)
    Priv-house-serv 149 (0.5%)
    Prof-specialty 4,140 (13%)
    Protective-serv 649 (2.0%)
    Sales 3,650 (11%)
    Tech-support 928 (2.9%)
    Transport-moving 1,597 (4.9%)
relationship
    Husband 13,193 (41%)
    Not-in-family 8,305 (26%)
    Other-relative 981 (3.0%)
    Own-child 5,068 (16%)
    Unmarried 3,446 (11%)
    Wife 1,568 (4.8%)
race
    Amer-Indian-Eskimo 311 (1.0%)
    Asian-Pac-Islander 1,039 (3.2%)
    Black 3,124 (9.6%)
    Other 271 (0.8%)
    White 27,816 (85%)
sex
    Female 10,771 (33%)
    Male 21,790 (67%)
native.country
    ? 583 (1.8%)
    Cambodia 19 (<0.1%)
    Canada 121 (0.4%)
    China 75 (0.2%)
    Columbia 59 (0.2%)
    Cuba 95 (0.3%)
    Dominican-Republic 70 (0.2%)
    Ecuador 28 (<0.1%)
    El-Salvador 106 (0.3%)
    England 90 (0.3%)
    France 29 (<0.1%)
    Germany 137 (0.4%)
    Greece 29 (<0.1%)
    Guatemala 64 (0.2%)
    Haiti 44 (0.1%)
    Holand-Netherlands 1 (<0.1%)
    Honduras 13 (<0.1%)
    Hong 20 (<0.1%)
    Hungary 13 (<0.1%)
    India 100 (0.3%)
    Iran 43 (0.1%)
    Ireland 24 (<0.1%)
    Italy 73 (0.2%)
    Jamaica 81 (0.2%)
    Japan 62 (0.2%)
    Laos 18 (<0.1%)
    Mexico 643 (2.0%)
    Nicaragua 34 (0.1%)
    Outlying-US(Guam-USVI-etc) 14 (<0.1%)
    Peru 31 (<0.1%)
    Philippines 198 (0.6%)
    Poland 60 (0.2%)
    Portugal 37 (0.1%)
    Puerto-Rico 114 (0.4%)
    Scotland 12 (<0.1%)
    South 80 (0.2%)
    Taiwan 51 (0.2%)
    Thailand 18 (<0.1%)
    Trinadad&Tobago 19 (<0.1%)
    United-States 29,170 (90%)
    Vietnam 67 (0.2%)
    Yugoslavia 16 (<0.1%)
income
    <=50K 24,720 (76%)
    >50K 7,841 (24%)
1 n (%)

Item 9 : O resumo descritivo relevante das variáveis categoricas foi obtido no item anterior ( quantidade e percentual em cada categoria). Novamente education.num não tem sentido analítico, logo foi desconsiderado.

# Estatísticas descritivas das variáveis numéricas


adult_sem_edu <- adult_csv |> 
  select(-education.num)

resumo_total <- descr(adult_sem_edu)

# Converte para data.frame para usar com kable
resumo_total_df <- as.data.frame(resumo_total)

# Exibe a tabela com kable
kable(resumo_total_df, caption = "Resumo descritivo das variáveis numéricas")
Resumo descritivo das variáveis numéricas
age capital.gain capital.loss fnlwgt hours.per.week
Mean 38.5816468 1077.648844 87.303830 1.897784e+05 4.043746e+01
Std.Dev 13.6404326 7385.292085 402.960219 1.055500e+05 1.234743e+01
Min 17.0000000 0.000000 0.000000 1.228500e+04 1.000000e+00
Q1 28.0000000 0.000000 0.000000 1.178270e+05 4.000000e+01
Median 37.0000000 0.000000 0.000000 1.783560e+05 4.000000e+01
Q3 48.0000000 0.000000 0.000000 2.370510e+05 4.500000e+01
Max 90.0000000 99999.000000 4356.000000 1.484705e+06 9.900000e+01
MAD 14.8260000 0.000000 0.000000 8.879884e+04 4.447800e+00
IQR 20.0000000 0.000000 0.000000 1.192240e+05 5.000000e+00
CV 0.3535472 6.853153 4.615608 5.561750e-01 3.053463e-01
Skewness 0.5586919 11.952746 4.594206 1.446847e+00 2.276216e-01
SE.Skewness 0.0135740 0.013574 0.013574 1.357400e-02 1.357400e-02
Kurtosis -0.1664603 154.765793 20.372053 6.217106e+00 2.915691e+00
N.Valid 32561.0000000 32561.000000 32561.000000 3.256100e+04 3.256100e+04
N 32561.0000000 32561.000000 32561.000000 3.256100e+04 3.256100e+04
Pct.Valid 100.0000000 100.000000 100.000000 1.000000e+02 1.000000e+02

Item 10 : A variável education.num não tem sentido pois é um valor numérico atribuído arbitrariamente ao nível educacional.

num_var <- adult_csv |> 
  select(where(is.numeric),-education.num)
 r1_age <- num_var |>
    ggplot(aes(x = age)) +
    geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    fill = "lightblue",
    color = "white",
    alpha = 0.6
    ) +
    geom_density(color = "darkblue", linewidth = 1) +
    geom_vline(
    aes(xintercept = mean(age), color = "Média"),
    linewidth = 1.2
    ) +
    geom_vline(
    aes(xintercept = median(age), color = "Mediana"),
    linewidth = 1.2
    ) +
    scale_color_manual(
    values = c("Média" = "red", "Mediana" = "darkgreen")
    ) +
    theme_minimal() +
    labs(
    title = "Distribuição da Idade",
    x = "Idade",
    y = "Densidade"
    )



 
 
 

qqplot_age <- num_var |> 
  ggplot(aes(sample = age)) +
  stat_qq(size = 2, alpha = 0.6) +
  stat_qq_line(color = "red", linewidth = 1) +
  theme_minimal() +
  labs(
    title = "QQ plot - Idade",
    x = "Quantis Teóricos (Normais)",
    y = "Quantis Amostrais"
  )

r1_age + qqplot_age

 r1_fnlwgt <- num_var |>
    ggplot(aes(x = fnlwgt)) +
    geom_histogram(
    aes(y = after_stat(density)),
    bins = 25,
    fill = "lightblue",
    color = "white",
    alpha = 0.6
    ) +
    geom_density(color = "darkblue", linewidth = 1) +
    geom_vline(
    aes(xintercept = mean(fnlwgt), color = "Média"),
    linewidth = 1.2
    ) +
    geom_vline(
    aes(xintercept = median(fnlwgt), color = "Mediana"),
    linewidth = 1.2
    ) +
    scale_color_manual(
    values = c("Média" = "red", "Mediana" = "darkgreen")
    ) +
    theme_minimal() +
    labs(
    title = "Distribuição de fnlwgt",
    x = "fnlwgt",
    y = "Densidade"
    )

 
 
 
 
 
 
 
 
qqplot_fnlwgt <- num_var |> 
  ggplot(aes(sample = fnlwgt)) +
  stat_qq(size = 2, alpha = 0.6) +
  stat_qq_line(color = "red", linewidth = 1) +
  theme_minimal() +
  labs(
    title = "QQ plot - fnlwgt",
    x = "Quantis Teóricos (Normais)",
    y = "Quantis Amostrais"
  )


r1_fnlwgt + qqplot_fnlwgt

r1_capital.gain <- num_var |>
    ggplot(aes(x = capital.gain)) +
    geom_histogram(
    aes(y = after_stat(density)),
    bins = 15,
    fill = "lightblue",
    color = "white",
    alpha = 0.6) +
    geom_density(color = "darkblue", linewidth = 1) +
    geom_vline(
    aes(xintercept = mean(capital.gain), color = "Média"),
    linewidth = 1.2)+
    geom_vline(
    aes(xintercept = median(capital.gain), color = "Mediana"),
    linewidth = 1.2) +
    scale_color_manual(
    values = c("Média" = "red", "Mediana" = "darkgreen")) +
    theme_minimal() +
    labs(
    title = "Distribuição de capital.gain",
    x = "capital.gain",
    y = "Densidade"
    )



qqplot_capital.gain <- num_var |> 
  ggplot(aes(sample = capital.gain)) +
  stat_qq(size = 2, alpha = 0.6) +
  stat_qq_line(color = "red", linewidth = 1) +
  theme_minimal() +
  labs(
    title = "QQ plot - capital.gain",
    x = "Quantis Teóricos (Normais)",
    y = "Quantis Amostrais"
  )


r1_capital.gain

qqplot_capital.gain

r1_capital.loss <- num_var |>
    ggplot(aes(x = capital.loss)) +
    geom_histogram(
    aes(y = after_stat(density)),
    bins = 15,
    fill = "lightblue",
    color = "white",
    alpha = 0.6) +
    geom_density(color = "darkblue", linewidth = 1) +
    geom_vline(
    aes(xintercept = mean(capital.loss), color = "Média"),
    linewidth = 1.2) +
    geom_vline(
    aes(xintercept = median(capital.loss), color = "Mediana"),
    linewidth = 1.2) +
    scale_color_manual(
    values = c("Média" = "red", "Mediana" = "darkgreen")) +
    theme_minimal() +
    labs(
    title = "Distribuição de capital.loss",
    x = "capital.loss",
    y = "Densidade"
    )







qqplot_capital.loss <- num_var |> 
  ggplot(aes(sample = capital.loss)) +
  stat_qq(size = 2, alpha = 0.6) +
  stat_qq_line(color = "red", linewidth = 1) +
  theme_minimal() +
  labs(
    title = "QQ plot - capital.loss",
    x = "Quantis Teóricos (Normais)",
    y = "Quantis Amostrais"
  )





r1_capital.loss  

qqplot_capital.loss

r1_hours.per.week <- num_var |>
    ggplot(aes(x = hours.per.week)) +
    geom_histogram(
    aes(y = after_stat(density)),
    bins = 20,
    fill = "lightblue",
    color = "white",
    alpha = 0.6) +
    geom_density(color = "darkblue", linewidth = 1) +
    geom_vline(
    aes(xintercept = mean(hours.per.week), color = "Média"),
    linewidth = 1.2) +
    geom_vline(
    aes(xintercept = median(hours.per.week), color = "Mediana"),
    linewidth = 1.2) +
    scale_color_manual(
    values = c("Média" = "red", "Mediana" = "darkgreen")) +
    theme_minimal() +
    labs(
    title = "Distribuição de hours.per.week",
    x = "hours.per.week",
    y = "Densidade"
    )




qqplot_hours.per.week <- num_var |> 
  ggplot(aes(sample = hours.per.week)) +
  stat_qq(size = 2, alpha = 0.6) +
  stat_qq_line(color = "red", linewidth = 1) +
  theme_minimal() +
  labs(
    title = "QQ plot - hours.per.week",
    x = "Quantis Teóricos (Normais)",
    y = "Quantis Amostrais"
  )


 

r1_hours.per.week+ qqplot_hours.per.week

# Selecionar apenas variáveis numéricas válidas}
num_vars <- adult_csv |>
  select(where(is.numeric)) |>
  select(-'education.num')  # remove variável 

# Calcular matriz de correlação
cor_matriz <- cor(num_vars, use = "complete.obs")
corrplot(cor_matriz, 
          method = "color", 
          type = "upper", 
          addCoef.col = "black",
          tl.col = "black", 
          tl.srt = 45 
)

Item 12 :O scatterplot entre hours.per.week e capital.gain mostra grande concentração de observações em valores próximos de zero no eixo de capital.gain, além de poucos valores mais altos. Os pontos aparecem bastante dispersos e não formam um padrão linear claro. O coeficiente de correlação entre as variáveis é aproximadamente 0.078, indicando uma relação linear muito fraca , o que é consistente com o padrão observado no gráfico.

cor(adult_csv$hours.per.week, adult_csv$capital.gain, use = "complete.obs")
[1] 0.07840862
scatter_hw_cg <- adult_csv |>
  ggplot(aes(x = hours.per.week, y = capital.gain)) +
  geom_point(alpha = 0.5, color = "darkblue") +
  theme_minimal() +
  labs(
    title = "Relação entre Hours per Week e Capital Gain",
    x = "Hours per Week",
    y = "Capital Gain"
  )

scatter_hw_cg

Item 13: Gráfico de linhas de hours.per.week por capital.gain :

grafico_linha <- adult_csv |>
  ggplot(aes(x = hours.per.week, y = capital.gain, group = 1)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "Gráfico de linhas de  hours per week por capital.gain",
    x = "hours.per.week",
    y = "capital.gain",
    color = "Legenda"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

grafico_linha
Ignoring unknown labels:
• colour : "Legenda"