Distribuições de Probabilidade

Discretas

Bernoulli

\(X \sim Bern(p)\)

# Draw 1000 Bernoulli(p = 0.5) outcomes as a logical column.
# purrr::rbernoulli() is deprecated since purrr 1.0.0; the comparison below
# applies the same rule it used (a uniform draw above 1 - p counts as TRUE),
# so no purrr helper is needed.
dados <- tibble(
    var_bernoulli = runif(n = 1000) > (1 - 0.5)
    )

# Bar chart of how many FALSE / TRUE outcomes were drawn.
ggplot(data = dados, aes(x = var_bernoulli)) +
    geom_bar()

Binomial

\(X \sim B(n,p)\)

# Draw 10000 Binomial(size = 10, prob = 0.1) counts.
# The probability argument of rbinom() is `prob`; it is spelled out here
# instead of relying on partial matching of `p`.
dados <- tibble(
    var_binomial = rbinom(n = 10000, size = 10, prob = 0.1)
    )

# Bar chart of the frequency of each number of successes.
ggplot(data = dados, aes(x = var_binomial)) +
    geom_bar()

Poisson

\(X \sim P(\lambda)\)

# Simulate 1000 Poisson counts with lambda = 100 and keep them in a tibble.
dados <- tibble(var_poisson = rpois(1000, lambda = 100))

# Frequency of each simulated count.
dados %>%
  ggplot(aes(x = var_poisson)) +
  geom_bar()

#dpois(20, lambda = 100)
# Cumulative probability P(X <= 125) for a Poisson with lambda = 100.
ppois(q = 125, lambda = 100)
## [1] 0.9932024

Contínuas

Exponencial

\(X \sim Exp(\lambda)\)

# Simulate 1000 exponential waiting times with rate = 100.
dados <- tibble(var_exp = rexp(1000, rate = 100))

# Histogram of the simulated values (default binning).
dados %>%
  ggplot(aes(x = var_exp)) +
  geom_histogram()

# Cumulative probability P(X <= 0.1) for an exponential with rate = 100.
pexp(q = 0.1, rate = 100)
## [1] 0.9999546

Uniforme

\(X \sim U(a,b)\)

# Simulate 10000 draws from a uniform distribution on [0, 10].
dados <- tibble(var_uniforme = runif(10000, min = 0, max = 10))

# Histogram of the simulated values (default binning).
dados %>%
  ggplot(aes(x = var_uniforme)) +
  geom_histogram()

Normal

\(X \sim N(\mu,\sigma^2)\)

# Simulate 1000 draws from a normal distribution with mean 1.65 and sd 0.1.
dados <- tibble(var_normal = rnorm(1000, mean = 1.65, sd = 0.1))

# Histogram of the simulated values (default binning).
dados %>%
  ggplot(aes(x = var_normal)) +
  geom_histogram()

Cases

C.1.1 ~ Uma cientista de dados de sucesso (age)

Construção da base de dados

Considerando as variáveis aleatórias discutidas até aqui:

# Sample size shared by every simulated column (and reused by later chunks).
n <- 1000

# One column per distribution discussed above.
# - The Bernoulli column uses a base-R uniform comparison instead of the
#   deprecated purrr::rbernoulli(); it applies the same rule (a uniform draw
#   above 1 - p counts as TRUE) and still yields a logical column.
# - rbinom() receives `prob` by its full name rather than the partial `p`.
dados <- tibble(
    var_bernoulli = runif(n) > (1 - 0.5),
    var_binomial = rbinom(n, size = 5, prob = 0.25),
    var_poisson = rpois(n, lambda = 3),
    var_exp = rexp(n, rate = 3),
    var_uniforme = runif(n, min = 0, max = 10),
    var_normal = rnorm(n, mean = 40, sd = 10)
)

# Print the first rows of the simulated data set.
dados
## # A tibble: 1,000 x 6
##    var_bernoulli var_binomial var_poisson var_exp var_uniforme var_normal
##    <lgl>                <int>       <int>   <dbl>        <dbl>      <dbl>
##  1 FALSE                    1           0  0.218         4.20        50.7
##  2 TRUE                     0           6  0.191         3.62        52.0
##  3 TRUE                     2           4  0.121         5.03        37.8
##  4 FALSE                    2           3  0.0982        8.88        30.4
##  5 FALSE                    0           3  1.78          0.551       51.2
##  6 TRUE                     2           4  0.555         4.85        26.1
##  7 FALSE                    0           4  0.427         4.29        42.1
##  8 TRUE                     1           4  0.789         0.656       36.0
##  9 FALSE                    2           3  0.945         8.85        20.8
## 10 FALSE                    2           2  0.441         1.16        27.7
## # ... with 990 more rows

E se as variáveis fossem…

# Give the simulated columns "business" names (selected by position) and
# express the time to the first project in days instead of fractions of a year.
dadosEx <- dados %>%
  rename(
    sex_fem          = 1,
    process_n        = 2,
    firstYear_nProj  = 3,
    firstProj_timing = 4,
    kmFromRes        = 5,
    age              = 6
  ) %>%
  mutate(firstProj_timing = round(firstProj_timing * 365))

# Print the renamed data set.
dadosEx
## # A tibble: 1,000 x 6
##    sex_fem process_n firstYear_nProj firstProj_timing kmFromRes   age
##    <lgl>       <int>           <int>            <dbl>     <dbl> <dbl>
##  1 FALSE           1               0               80     4.20   50.7
##  2 TRUE            0               6               70     3.62   52.0
##  3 TRUE            2               4               44     5.03   37.8
##  4 FALSE           2               3               36     8.88   30.4
##  5 FALSE           0               3              649     0.551  51.2
##  6 TRUE            2               4              202     4.85   26.1
##  7 FALSE           0               4              156     4.29   42.1
##  8 TRUE            1               4              288     0.656  36.0
##  9 FALSE           2               3              345     8.85   20.8
## 10 FALSE           2               2              161     1.16   27.7
## # ... with 990 more rows

Entender a distribuição dos dados nos permitiria …

Boxplot univariado

# Density of age; the constant fill/colour mapping creates a legend, which is
# hidden together with the y-axis text and ticks.
p1 <- ggplot(dadosEx, aes(x = age, fill = 1, colour = 1)) +
  geom_density(alpha = 0.5) +
  ylab("") +
  theme(
    legend.position = "none",
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

# Single boxplot of age; x is a dummy constant, so its axis text is hidden.
p2 <- ggplot(dadosEx, aes(x = 1, y = age)) +
  geom_boxplot(colour = "darkgray") +
  xlab("") +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )


# Show both plots side by side.
p1 + p2

Boxplot bivariado

# Density of age split by sex (label derived from the logical `sex_fem`).
p1 <- dadosEx %>%
  mutate(sex = ifelse(sex_fem, "fem", "masc")) %>%
  ggplot(aes(x = age, fill = sex, colour = sex)) +
  geom_density(alpha = 0.5) +
  ylab("") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

# Boxplots of age by sex; the legend is hidden here because p1 already shows it.
p2 <- dadosEx %>%
  mutate(sex = ifelse(sex_fem, "fem", "masc")) %>%
  ggplot(aes(x = 1, y = age, fill = sex)) +
  geom_boxplot() +
  xlab("") +
  theme(
    legend.position = "none",
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Show both plots side by side.
p1 + p2

E se…

# "What if" scenario: 30% of the sample is female and age depends on sex
# (mean 35 / sd 5 for TRUE, mean 50 / sd 10 for FALSE).
# Changes from the previous version: leftward assignment instead of `->`,
# TRUE instead of `T`, and a base-R uniform comparison instead of the
# deprecated purrr::rbernoulli() (same rule: uniform draw above 1 - p is TRUE).
dadosEx2 <- dadosEx %>%
    mutate(sex_fem = runif(n) > (1 - 0.3)) %>%
    mutate(age = ifelse(sex_fem,
                        rnorm(n, mean = 35, sd = 5),
                        rnorm(n, mean = 50, sd = 10)))

# Density of age by sex for the new scenario.
p1 <- dadosEx2 %>%
    mutate(sex = ifelse(sex_fem, "fem", "masc")) %>%
    ggplot(aes(age, fill = sex, color = sex)) +
    geom_density(alpha = 0.5) +
    labs(y = "") +
    theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())

# Boxplots of age by sex; the legend is hidden because p1 already shows it.
p2 <- dadosEx2 %>%
    mutate(sex = ifelse(sex_fem, "fem", "masc")) %>%
    ggplot(aes(1, age, fill = sex)) +
    geom_boxplot() +
    labs(x = "") +
    theme(legend.position = "none") +
    theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

# Show both plots side by side.
p1 + p2

C.1.2 ~ Uma cientista de dados de sucesso (seniority)

Entender a distribuição dos dados nos permitiria …

Boxplot univariado

# Add a simulated salary column to the example data; the seed makes the draw
# reproducible.
set.seed(2021)
dadosEx <- dadosEx %>%
  mutate(salary = rnorm(n, mean = 9, sd = 2))

# Density of salary with a vertical line at the sample mean. Mapping the
# constant "" to colour/fill creates a single group for the viridis scales.
p1 <- dadosEx %>%
  ggplot(aes(salary, color = "", fill = "")) +
    geom_density(alpha = 0.5) +
    scale_colour_viridis_d("", begin = 0.45, end = 0.8, option = "C") +
    scale_fill_viridis_d("", begin = 0.45, end = 0.8, option = "C") +
    geom_vline(aes(xintercept = mean(salary), colour = "")) +
    scale_x_continuous(breaks = seq(0, ceiling(max(dadosEx$salary)), 2),
                        guide = guide_axis(n.dodge = 2),
                        labels = scales::dollar_format(suffix = "K")) +
    theme(axis.text.y = element_blank(), legend.position = 'none') +
    xlab("")


# Boxplot of salary with the mean drawn as an extra marker (shape 2).
# `fun` replaces the `fun.y` argument, deprecated since ggplot2 3.3.0.
p2 <- dadosEx %>%
  ggplot(aes("", salary, color = "", fill = "")) +
  geom_boxplot(alpha = 0.5) +
  stat_summary(fun = "mean", shape = 2) +
  scale_colour_viridis_d("", begin = 0.45, end = 0.8, option = "C") +
  scale_fill_viridis_d("", begin = 0.45, end = 0.8, option = "C") +
  scale_y_continuous(breaks = seq(floor(min(dadosEx$salary)),
                                  ceiling(max(dadosEx$salary)), 2),
                      labels = scales::dollar_format(suffix = "K")) +
  theme(legend.position = 'none') +
  ylab("") +
  xlab("")


# Arrange both plots in one row under a common title.
ggpubr::ggarrange(p1, p2, ncol = 2, nrow = 1) +
  plot_annotation(title = 'Fake Data: Salary')

Boxplot bivariado

# Density of salary by seniority level. `level` is derived from `sex_fem`
# (TRUE -> "entry", FALSE -> "senior"); one vertical line per level marks its
# mean salary.
p1 <- dadosEx %>%
  mutate(level = ifelse(sex_fem == TRUE, "entry", "senior")) %>%
  ggplot(aes(salary, fill = level, color = level)) +
    geom_density(alpha = 0.5) +
    scale_colour_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
    scale_fill_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
    geom_vline(data = . %>% filter(level == "entry"),
               aes(xintercept = mean(salary), colour = level)) +
    geom_vline(data = . %>% filter(level == "senior"),
               aes(xintercept = mean(salary), colour = level)) +
    scale_x_continuous(breaks = seq(0, ceiling(max(dadosEx$salary)), 2),
                        guide = guide_axis(n.dodge = 2),
                        labels = scales::dollar_format(suffix = "K")) +
    theme(axis.text.y = element_blank(), legend.position = 'none') +
    xlab("")


# Boxplots of salary, one facet per level, with the mean as an extra marker.
# `fun` replaces the `fun.y` argument, deprecated since ggplot2 3.3.0.
p2 <- dadosEx %>%
  mutate(level = ifelse(sex_fem == TRUE, "entry", "senior")) %>%
  ggplot(aes("", salary, fill = level, color = level)) +
  geom_boxplot(alpha = 0.5) +
  facet_grid(. ~ level) +
  stat_summary(fun = "mean", shape = 2) +
  scale_colour_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
  scale_fill_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
  scale_y_continuous(breaks = seq(floor(min(dadosEx$salary)),
                                  ceiling(max(dadosEx$salary)), 2),
                      labels = scales::dollar_format(suffix = "K")) +
  ylab("") +
  xlab("")


# Arrange both plots in one row with a shared legend at the bottom.
ggpubr::ggarrange(p1, p2, ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom") +
  plot_annotation(title = 'Fake Data: Salary vs. Seniority')

E se…

# "What if" scenario: 30% of the sample is TRUE in `sex_fem` (later labelled
# "entry") and salary depends on that flag (mean 6 / sd 2.5 when TRUE,
# mean 12 / sd 3 when FALSE).
# The uniform comparison replaces the deprecated purrr::rbernoulli() using
# the same rule (a uniform draw above 1 - p counts as TRUE).
set.seed(2021)
dadosEx_1 <- dadosEx %>%
    mutate(sex_fem = runif(n) > (1 - 0.3)) %>%
    mutate(salary = ifelse(sex_fem,
                           rnorm(n, mean = 6, sd = 2.5),
                           rnorm(n, mean = 12, sd = 3)))

# The previous `x <- ... -> y` statement stored the same result under both
# names; `dadosEx2` is kept because the scaling section below reads it.
dadosEx2 <- dadosEx_1

# Density of salary by level with one mean line per level. Axis breaks are
# now computed from the data being plotted (`dadosEx_1`), not from `dadosEx`.
p1 <- dadosEx_1 %>%
  mutate(level = ifelse(sex_fem == TRUE, "entry", "senior")) %>%
  ggplot(aes(salary, fill = level, color = level)) +
    geom_density(alpha = 0.5) +
    scale_colour_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
    scale_fill_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
    geom_vline(data = . %>% filter(level == "entry"),
               aes(xintercept = mean(salary), colour = level)) +
    geom_vline(data = . %>% filter(level == "senior"),
               aes(xintercept = mean(salary), colour = level)) +
    scale_x_continuous(breaks = seq(0, ceiling(max(dadosEx_1$salary)), 2),
                        guide = guide_axis(n.dodge = 2),
                        labels = scales::dollar_format(suffix = "K")) +
    theme(axis.text.y = element_blank(), legend.position = 'none') +
    xlab("")


# Boxplots of salary, one facet per level, with the mean as an extra marker.
# `fun` replaces the `fun.y` argument, deprecated since ggplot2 3.3.0.
p2 <- dadosEx_1 %>%
  mutate(level = ifelse(sex_fem == TRUE, "entry", "senior")) %>%
  ggplot(aes("", salary, fill = level, color = level)) +
  geom_boxplot(alpha = 0.5) +
  facet_grid(. ~ level) +
  stat_summary(fun = "mean", shape = 2) +
  scale_colour_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
  scale_fill_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
  scale_y_continuous(breaks = seq(floor(min(dadosEx_1$salary)),
                                  ceiling(max(dadosEx_1$salary)), 2),
                      labels = scales::dollar_format(suffix = "K")) +
  ylab("") +
  xlab("")


# Arrange both plots in one row with a shared legend at the bottom.
ggpubr::ggarrange(p1, p2, ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom") +
  plot_annotation(title = 'Fake Data: Salary vs. Seniority')

escala

# Replace negative simulated salaries with the overall mean salary.
dadosEx2 <- mutate(
  dadosEx2,
  salary = ifelse(salary < 0, mean(salary), salary)
)

# Boxplots of the three variables on their raw scales, with salary multiplied
# by 1000 so that it dominates the shared axis.
dadosEx2 %>%
  mutate(salary = salary * 1000) %>%
  select(age, salary, years_since_graduation = firstYear_nProj) %>%
  pivot_longer(everything()) %>%
  ggplot(aes(x = name, y = value, colour = name, fill = name)) +
  geom_boxplot(alpha = 0.3) +
  scale_colour_viridis_d("", option = "E") +
  scale_fill_viridis_d("", option = "E") +
  labs(x = "", y = "", title = "Fake Data") +
  theme(legend.position = 'none')

# Same boxplots with salary left in its original unit.
dadosEx2 %>%
  select(age, salary, years_since_graduation = firstYear_nProj) %>%
  pivot_longer(everything()) %>%
  ggplot(aes(x = name, y = value, colour = name, fill = name)) +
  geom_boxplot(alpha = 0.3) +
  scale_colour_viridis_d("", option = "E") +
  scale_fill_viridis_d("", option = "E") +
  labs(x = "", y = "", title = "Fake Data") +
  theme(legend.position = 'none')

# Boxplots after z-score standardisation (base scale() on all three columns).
p1 <- dadosEx2 %>%
  select(age, salary, years_since_graduation = firstYear_nProj) %>%
  scale() %>%
  as_tibble() %>%
  pivot_longer(everything()) %>%
  ggplot(aes(x = name, y = value, colour = name, fill = name)) +
  geom_boxplot(alpha = 0.3) +
  scale_colour_viridis_d("", end = 0.95, option = "E") +
  scale_fill_viridis_d("", end = 0.95, option = "E") +
  labs(x = "", y = "", title = "Z-Score Standardization") +
  theme(legend.position = 'none')


# Boxplots after min-max scaling of every column to the [0, 1] interval.
p2 <- dadosEx2 %>%
  select(age, salary, years_since_graduation = firstYear_nProj) %>%
  mutate(across(everything(), function(col) scales::rescale(col, to = c(0, 1)))) %>%
  pivot_longer(everything()) %>%
  ggplot(aes(x = name, y = value, colour = name, fill = name)) +
  geom_boxplot(alpha = 0.3) +
  scale_colour_viridis_d("", end = 0.95, option = "E") +
  scale_fill_viridis_d("", end = 0.95, option = "E") +
  labs(x = "", y = "", title = "Min-Max Scaling") +
  theme(legend.position = 'none')


# Min-max plot on the left, z-score plot on the right, under a common title.
p2 + p1 + plot_annotation(title = 'Fake Data')

# Summary statistics for the three variables compared in the scaling plots.
dadosEx2 %>%
  select(age, salary, years_since_graduation = firstYear_nProj) %>%
  skimr::skim()
Data summary
Name Piped data
Number of rows 1000
Number of columns 3
_______________________
Column type frequency:
numeric 3
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 40.20 9.68 11.10 33.35 40.27 46.95 69.86 ▁▅▇▅▁
salary 0 1 10.39 4.05 0.24 7.59 10.58 13.36 23.02 ▂▆▇▃▁
years_since_graduation 0 1 3.00 1.70 0.00 2.00 3.00 4.00 10.00 ▇▇▃▁▁

Correlação

Correlação não é causalidade, MAS AJUDA!

# Pairwise plot matrix of every column in the example data.
dadosEx %>% GGally::ggpairs()

# Same matrix, coloured by the `sex_fem` flag.
dadosEx %>% GGally::ggpairs(mapping = aes(color = sex_fem))

# "What if" variant: redraw `sex_fem` (30% TRUE) and make age depend on it
# before plotting. The uniform comparison replaces the deprecated
# purrr::rbernoulli() with the same rule (uniform draw above 1 - p is TRUE),
# and the condition no longer compares against the reassignable `T`.
dadosEx %>%
    mutate(sex_fem = runif(n) > (1 - 0.3)) %>%
    mutate(age = ifelse(sex_fem,
                        rnorm(n, mean = 38, sd = 5),
                        rnorm(n, mean = 45, sd = 10))) %>%
    GGally::ggpairs(mapping = aes(color = sex_fem))

C.2 ~ American Math Society Survey Data

Counts of new PhDs in the mathematical sciences for 2008-09 and 2011-12 categorized by type of institution, gender, and US citizenship status.

library(carData)

# Load the AMSsurvey data set and print a compact column overview.
data("AMSsurvey")
glimpse(AMSsurvey)
#?AMSsurvey

C.3 ~ School Science Survey Data

Attitudes to science, from a survey with results from 20 classes in private schools and 46 classes in public schools. The variable `like` is a summary score based on two of the questions, on a scale from 1 (dislike) to 12 (like).

library(DAAG)

# Load the science data set and print a compact column overview.
data("science")
glimpse(science)
## Rows: 1,385
## Columns: 7
## $ State   <fct> ACT, ACT, ACT, ACT, ACT, ACT, ACT, ACT, ACT, ACT, ACT, ACT, AC~
## $ PrivPub <fct> public, public, public, public, public, public, public, public~
## $ school  <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ class   <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,~
## $ sex     <fct> f, f, f, f, f, f, f, f, m, m, m, m, m, m, f, f, f, f, f, f, f,~
## $ like    <dbl> 8, 6, 5, 2, 5, 6, 3, 7, 6, 3, 4, 8, 5, 8, 2, 4, 4, 4, 3, 5, 6,~
## $ Class   <fct> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.~
#?science

C.4 ~ Fertility and Socioeconomic Indicators

# Load the swiss data set.
data("swiss")

# Pairwise plot matrix of all indicators.
GGally::ggpairs(swiss)

# Same matrix coloured by a fertility group: below 75 is "baixa", else "alta".
swiss %>%
  mutate(Fertility_gr = ifelse(Fertility < 75, "baixa", "alta")) %>%
  GGally::ggpairs(mapping = aes(colour = Fertility_gr))

C.5 ~ temperatura

library(ggridges)
# Load the lincoln_weather data set.
data("lincoln_weather")

#lincoln_weather %>% glimpse
# Normalise the column names, print a column overview and keep the cleaned
# data under the same name.
lincoln_weather <- glimpse(janitor::clean_names(lincoln_weather))
## Rows: 366
## Columns: 24
## $ cst                        <chr> "2016-1-1", "2016-1-2", "2016-1-3", "2016-1~
## $ max_temperature_f          <int> 37, 41, 37, 30, 38, 34, 33, 28, 22, 31, 37,~
## $ mean_temperature_f         <int> 24, 23, 23, 17, 29, 33, 30, 25, 9, 11, 28, ~
## $ min_temperature_f          <int> 11, 5, 8, 4, 19, 32, 27, 22, -4, -9, 18, 14~
## $ max_dew_point_f            <int> 19, 22, 23, 24, 29, 33, 32, 25, 17, 20, 27,~
## $ mean_dew_point_f           <int> 13, 14, 15, 13, 25, 32, 30, 22, 4, 5, 24, 1~
## $ min_dewpoint_f             <int> 8, 4, 8, 2, 19, 29, 25, 18, -8, -13, 16, 10~
## $ max_humidity               <int> 88, 100, 92, 92, 96, 100, 100, 92, 87, 87, ~
## $ mean_humidity              <int> 68, 72, 73, 82, 83, 91, 96, 85, 77, 75, 74,~
## $ min_humidity               <int> 47, 44, 54, 72, 70, 82, 92, 78, 67, 63, 64,~
## $ max_sea_level_pressure_in  <dbl> 30.50, 30.35, 30.50, 30.50, 30.20, 29.98, 2~
## $ mean_sea_level_pressure_in <dbl> 30.39, 30.30, 30.42, 30.39, 30.06, 29.90, 2~
## $ min_sea_level_pressure_in  <dbl> 30.30, 30.22, 30.32, 30.18, 29.99, 29.81, 2~
## $ max_visibility_miles       <int> 10, 10, 10, 10, 10, 10, 9, 10, 10, 10, 10, ~
## $ mean_visibility_miles      <int> 10, 10, 10, 9, 8, 4, 3, 6, 9, 10, 10, 10, 1~
## $ min_visibility_miles       <int> 10, 10, 10, 6, 5, 0, 0, 2, 5, 10, 10, 5, 10~
## $ max_wind_speed_mph         <int> 20, 15, 13, 17, 22, 16, 16, 25, 25, 10, 25,~
## $ mean_wind_speed_mph        <int> 9, 6, 5, 7, 13, 7, 7, 16, 14, 5, 14, 6, 5, ~
## $ max_gust_speed_mph         <int> 23, 18, 14, 23, 28, 21, 21, 32, 28, 12, 34,~
## $ precipitation_in           <chr> "0", "0", "0", "0", "0", "T", "0", "0", "T"~
## $ cloud_cover                <int> 0, 0, 0, 1, 4, 8, 8, 8, 5, 0, 7, 5, 0, 2, 5~
## $ events                     <chr> NA, NA, NA, NA, NA, "Fog", "Fog", "Snow", "~
## $ wind_dir_degrees           <int> 280, 312, 330, 155, 178, 167, 7, 338, 340, ~
## $ month                      <fct> January, January, January, January, January~
# Ridgeline densities of the daily mean temperature, one ridge per month,
# filled by the temperature value along the x axis.
# after_stat(x) replaces stat(x), which is deprecated in current ggplot2.
ggplot(lincoln_weather, aes(x = mean_temperature_f, y = month, fill = after_stat(x))) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "Temp. [F]", option = "C") +
  labs(title = 'Temperatures in Lincoln NE in 2016') +
  theme_minimal()

# Boxplots of the daily mean temperature per month, flipped so that months
# run along the vertical axis.
# NOTE(review): `fill` is mapped to the continuous temperature itself, which
# cannot be constant within a box — confirm whether a fill was intended here.
ggplot(lincoln_weather,
    aes(y = mean_temperature_f, x = month, fill = mean_temperature_f)) +
    coord_flip() +
    geom_boxplot()

C.6 ~ Avengers

library(tidyverse)

# Download the FiveThirtyEight Avengers CSV, normalise the column names and
# print a column overview; the cleaned table is kept as `avengers_raw`.
avengers_raw <- dplyr::glimpse(
  janitor::clean_names(
    readr::read_csv(
      base::url("https://raw.githubusercontent.com/fivethirtyeight/data/master/avengers/avengers.csv")
    )
  )
)
## Rows: 173
## Columns: 21
## $ url                         <chr> "http://marvel.wikia.com/Henry_Pym_(Earth-~
## $ name_alias                  <chr> "Henry Jonathan \"Hank\" Pym", "Janet van ~
## $ appearances                 <dbl> 1269, 1165, 3068, 2089, 2402, 612, 3458, 1~
## $ current                     <chr> "YES", "YES", "YES", "YES", "YES", "YES", ~
## $ gender                      <chr> "MALE", "FEMALE", "MALE", "MALE", "MALE", ~
## $ probationary_introl         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ full_reserve_avengers_intro <chr> "Sep-63", "Sep-63", "Sep-63", "Sep-63", "S~
## $ year                        <dbl> 1963, 1963, 1963, 1963, 1963, 1963, 1964, ~
## $ years_since_joining         <dbl> 52, 52, 52, 52, 52, 52, 51, 50, 50, 50, 50~
## $ honorary                    <chr> "Full", "Full", "Full", "Full", "Full", "H~
## $ death1                      <chr> "YES", "YES", "YES", "YES", "YES", "NO", "~
## $ return1                     <chr> "NO", "YES", "YES", "YES", "YES", NA, "YES~
## $ death2                      <chr> NA, NA, NA, NA, "YES", NA, NA, "YES", NA, ~
## $ return2                     <chr> NA, NA, NA, NA, "NO", NA, NA, "YES", NA, N~
## $ death3                      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ return3                     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ death4                      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ return4                     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ death5                      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ return5                     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ notes                       <chr> "Merged with Ultron in Rage of Ultron Vol.~
# Recode the first death/return flags into labelled factors:
# - death1:  YES -> dead,   NO -> not_dead
# - return1: YES -> return, NO -> not_return, missing -> not_dead
# A column overview is printed and the result overwrites `avengers_raw`.
avengers_raw <- avengers_raw %>%
  mutate(
    death1 = forcats::fct_recode(death1, dead = "YES", not_dead = "NO"),
    return1 = forcats::fct_recode(return1, return = "YES", not_return = "NO"),
    return1 = forcats::fct_explicit_na(return1, "not_dead")
  ) %>%
  glimpse()
## Rows: 173
## Columns: 21
## $ url                         <chr> "http://marvel.wikia.com/Henry_Pym_(Earth-~
## $ name_alias                  <chr> "Henry Jonathan \"Hank\" Pym", "Janet van ~
## $ appearances                 <dbl> 1269, 1165, 3068, 2089, 2402, 612, 3458, 1~
## $ current                     <chr> "YES", "YES", "YES", "YES", "YES", "YES", ~
## $ gender                      <chr> "MALE", "FEMALE", "MALE", "MALE", "MALE", ~
## $ probationary_introl         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ full_reserve_avengers_intro <chr> "Sep-63", "Sep-63", "Sep-63", "Sep-63", "S~
## $ year                        <dbl> 1963, 1963, 1963, 1963, 1963, 1963, 1964, ~
## $ years_since_joining         <dbl> 52, 52, 52, 52, 52, 52, 51, 50, 50, 50, 50~
## $ honorary                    <chr> "Full", "Full", "Full", "Full", "Full", "H~
## $ death1                      <fct> dead, dead, dead, dead, dead, not_dead, de~
## $ return1                     <fct> not_return, return, return, return, return~
## $ death2                      <chr> NA, NA, NA, NA, "YES", NA, NA, "YES", NA, ~
## $ return2                     <chr> NA, NA, NA, NA, "NO", NA, NA, "YES", NA, N~
## $ death3                      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ return3                     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ death4                      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ return4                     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ death5                      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ return5                     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ notes                       <chr> "Merged with Ultron in Rage of Ultron Vol.~
# Default theme for the following plots: minimal, without legend.
theme_set(theme_minimal() + theme(legend.position = 'none'))


# Density of years since joining with a vertical line at the mean. Mapping
# the constant "" to colour/fill creates a single group for the viridis scales.
p1 <- avengers_raw %>%
  ggplot(aes(years_since_joining, color = "", fill = "")) +
    geom_density(alpha = 0.5) +
    scale_colour_viridis_d("", begin = 0.45, end = 0.8) +
    scale_fill_viridis_d("", begin = 0.45, end = 0.8) +
    geom_vline(aes(xintercept = mean(years_since_joining), colour = "")) +
    scale_x_continuous(breaks = seq(min(avengers_raw$years_since_joining),
                                    max(avengers_raw$years_since_joining), 10)) +
    theme(axis.text.y = element_blank()) +
    xlab("")



# Boxplot of years since joining with the mean as an extra marker (shape 2).
# `fun` replaces the `fun.y` argument, deprecated since ggplot2 3.3.0.
p2 <- avengers_raw %>%
  ggplot(aes("", years_since_joining, color = "", fill = "")) +
  geom_boxplot() +
  stat_summary(fun = "mean", shape = 2) +
  scale_colour_viridis_d("", begin = 0.45, end = 0.5) +
  scale_fill_viridis_d("", begin = 0.45, end = 0.5, alpha = 0.5) +
  scale_y_continuous(breaks = seq(min(avengers_raw$years_since_joining),
                                  max(avengers_raw$years_since_joining), 10)) +
  xlab("") +
  ylab("")

# Arrange both plots in one row under a common title.
ggpubr::ggarrange(p1, p2, ncol = 2, nrow = 1) +
  plot_annotation(title = 'Avengers: years_since_joining')

# Back to the plain minimal theme so legends are shown again.
theme_set(theme_minimal())


# Density of years since joining by first-death status, with one vertical
# line per status at that group's mean.
p1 <- ggplot(
  avengers_raw,
  aes(x = years_since_joining, fill = death1, colour = death1)
) +
  geom_density(alpha = 0.5) +
  geom_vline(
    data = function(d) filter(d, death1 == "dead"),
    aes(xintercept = mean(years_since_joining), colour = death1)
  ) +
  geom_vline(
    data = function(d) filter(d, death1 == "not_dead"),
    aes(xintercept = mean(years_since_joining), colour = death1)
  ) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8) +
  scale_x_continuous(
    breaks = seq(
      min(avengers_raw$years_since_joining),
      max(avengers_raw$years_since_joining),
      10
    )
  ) +
  xlab("") +
  theme(axis.text.y = element_blank())


# Boxplots of years since joining, one facet per first-death status.
p2 <- ggplot(
  avengers_raw,
  aes(x = "", y = years_since_joining, fill = death1, colour = death1)
) +
  geom_boxplot() +
  facet_grid(. ~ death1) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8, alpha = 0.5) +
  scale_y_continuous(
    breaks = seq(
      min(avengers_raw$years_since_joining),
      max(avengers_raw$years_since_joining),
      10
    )
  ) +
  labs(x = "", y = "") +
  theme(legend.position = 'bottom')

# Arrange both plots in one row with a shared legend at the bottom.
ggpubr::ggarrange(p1, p2, ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom") +
  plot_annotation(title = 'Avengers: years_since_joining vs. Death (at least once)')

theme_set(theme_minimal())

# `death1` and `return1` were already recoded into labelled factors when
# `avengers_raw` was prepared, so the recode/explicit-NA steps are not
# repeated here (re-running them only raised "Unknown levels" warnings).
# Only the ordering of `return1` by frequency is applied before plotting.

# Density of years since joining by first-return status, with one vertical
# line per status at that group's mean.
p1 <- avengers_raw %>%
  mutate(return1 = forcats::fct_infreq(return1)) %>%
  ggplot(aes(years_since_joining, fill = return1, colour = return1)) +
    geom_density(alpha = 0.5) +
    scale_colour_viridis_d("", end = 0.8) +
    scale_fill_viridis_d("", end = 0.8) +
    geom_vline(data = . %>% filter(return1 == "not_return"),
               aes(xintercept = mean(years_since_joining), colour = return1)) +
    geom_vline(data = . %>% filter(return1 == "return"),
               aes(xintercept = mean(years_since_joining), colour = return1)) +
    geom_vline(data = . %>% filter(return1 == "not_dead"),
               aes(xintercept = mean(years_since_joining), colour = return1)) +
    theme(axis.text.y = element_blank()) +
    scale_x_continuous(breaks = seq(min(avengers_raw$years_since_joining),
                                    max(avengers_raw$years_since_joining), 10)) +
    xlab("")

# Boxplots of years since joining, one facet per first-return status.
p2 <- avengers_raw %>%
  mutate(return1 = forcats::fct_infreq(return1)) %>%
  ggplot(aes("", years_since_joining, fill = return1, colour = return1)) +
  facet_grid(. ~ return1) +
  geom_boxplot() +
  scale_y_continuous(breaks = seq(min(avengers_raw$years_since_joining),
                                  max(avengers_raw$years_since_joining), 10)) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8, alpha = 0.5) +
  xlab("") +
  ylab("")

# Arrange both plots in one row with a shared legend at the bottom.
ggpubr::ggarrange(p1, p2, ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom") +
  plot_annotation(title = 'Avengers: years_since_joining vs. Return (at least once)')

# Frequency table of gender with the share formatted as a percentage,
# rendered as a styled table that does not span the full page width.
kableExtra::kable_styling(
  avengers_raw %>%
    janitor::tabyl(gender) %>%
    mutate(percent = scales::percent(percent)) %>%
    kableExtra::kbl(),
  full_width = FALSE
)
gender n percent
FEMALE 58 34%
MALE 115 66%
theme_set(theme_minimal())


# Density of years since joining by gender, with one vertical line per gender
# at that group's mean.
p1 <- ggplot(
  avengers_raw,
  aes(x = years_since_joining, fill = gender, colour = gender)
) +
  geom_density(alpha = 0.5) +
  geom_vline(
    data = function(d) filter(d, gender == "FEMALE"),
    aes(xintercept = mean(years_since_joining), colour = gender)
  ) +
  geom_vline(
    data = function(d) filter(d, gender == "MALE"),
    aes(xintercept = mean(years_since_joining), colour = gender)
  ) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8) +
  scale_x_continuous(
    breaks = seq(
      min(avengers_raw$years_since_joining),
      max(avengers_raw$years_since_joining),
      10
    )
  ) +
  xlab("") +
  theme(axis.text.y = element_blank())


# Boxplots of years since joining, one facet per gender.
p2 <- ggplot(
  avengers_raw,
  aes(x = "", y = years_since_joining, fill = gender, colour = gender)
) +
  geom_boxplot() +
  facet_grid(. ~ gender) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8, alpha = 0.5) +
  scale_y_continuous(
    breaks = seq(
      min(avengers_raw$years_since_joining),
      max(avengers_raw$years_since_joining),
      10
    )
  ) +
  labs(x = "", y = "") +
  theme(legend.position = 'bottom')

# Arrange both plots in one row with a shared legend at the bottom.
ggpubr::ggarrange(p1, p2, ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom") +
  plot_annotation(title = 'Avengers: years_since_joining vs. Gender')

# Density of the number of appearances by gender, with one vertical line per
# gender at that group's mean.
p1 <- ggplot(
  avengers_raw,
  aes(x = appearances, fill = gender, colour = gender)
) +
  geom_density(alpha = 0.5) +
  geom_vline(
    data = function(d) filter(d, gender == "FEMALE"),
    aes(xintercept = mean(appearances), colour = gender)
  ) +
  geom_vline(
    data = function(d) filter(d, gender == "MALE"),
    aes(xintercept = mean(appearances), colour = gender)
  ) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8) +
  scale_x_continuous(breaks = seq(0, max(avengers_raw$appearances), 500)) +
  xlab("") +
  theme(axis.text.y = element_blank())


# Boxplots of appearances per gender. Points that boxplot.stats() flags as
# outliers within each gender are labelled with the character's name.
p2 <- ggplot(
  avengers_raw,
  aes(x = "", y = appearances, fill = gender, colour = gender)
) +
  geom_boxplot() +
  ggrepel::geom_text_repel(
    data = function(d) {
      d %>%
        group_by(gender) %>%
        filter(appearances %in% boxplot.stats(appearances)$out)
    },
    aes(label = name_alias, y = appearances),
    size = 3
  ) +
  facet_grid(. ~ gender) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8, alpha = 0.5) +
  scale_y_continuous(breaks = seq(0, max(avengers_raw$appearances), 500)) +
  labs(x = "", y = "") +
  theme(legend.position = 'bottom')

# Arrange both plots in one row with a shared legend at the bottom.
ggpubr::ggarrange(p1, p2, ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom") +
  plot_annotation(title = 'Avengers: Appearances vs. Gender')

# Standalone boxplots of appearances per gender, without legend. Points that
# boxplot.stats() flags as outliers within each gender are labelled with the
# character's name.
ggplot(
  avengers_raw,
  aes(x = "", y = appearances, fill = gender, colour = gender)
) +
  geom_boxplot() +
  ggrepel::geom_text_repel(
    data = function(d) {
      d %>%
        group_by(gender) %>%
        filter(appearances %in% boxplot.stats(appearances)$out)
    },
    aes(label = name_alias, y = appearances),
    size = 3
  ) +
  facet_grid(. ~ gender) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8, alpha = 0.5) +
  scale_y_continuous(breaks = seq(0, max(avengers_raw$appearances), 500)) +
  labs(x = "", title = "Avengers: Appearances vs. Gender") +
  theme(legend.position = 'none')

Conclusões

Ao avaliar uma distribuição, observe:

  • forma: simétrica, assimétrica (à esquerda/direita)
  • outliers: observações que se diferenciam drasticamente dos demais
  • dispersão: variabilidade das observações
  • posição: centro das observações

E o que tudo isso tem a ver com Machine Learning?

Tudo! O que discutimos até aqui é o que os algoritmos de ML fazem de forma escalonada. E cabe a você, cientista de dados, garantir os ajustes necessários para que os modelos possam utilizar os dados da melhor forma, seja para inferência ou predição.

E lembre-se: você é a porta-voz dos dados, porém não se trata de você!

Toda essa teoria deve ser um meio e não um fim, então não perca o foco na (o) cliente final.