\(X \sim Bern(p)\)
# Draw 1000 Bernoulli(p = 0.5) outcomes as a logical vector and plot how often
# each outcome occurs.
# purrr::rbernoulli() is deprecated, so the draw is done with base R:
# a uniform number above (1 - p) counts as a success (TRUE).
dados <- tibble(
  var_bernoulli = runif(n = 1000) > (1 - 0.5)
)
ggplot(data = dados, aes(x = var_bernoulli)) +
  geom_bar()
\(X \sim B(n,p)\)
# Draw 10000 Binomial(size = 10, prob = 0.1) counts and plot their frequencies.
# The probability argument is spelled out in full (`prob`) instead of relying
# on partial argument matching of `p`.
dados <- tibble(
  var_binomial = rbinom(n = 10000, size = 10, prob = 0.1)
)
ggplot(data = dados, aes(x = var_binomial)) +
  geom_bar()
\(X \sim P(\lambda)\)
# Simulate 1000 Poisson counts with lambda = 100 and plot their frequencies.
dados <- tibble(var_poisson = rpois(n = 1000, lambda = 100))
ggplot(dados, aes(var_poisson)) +
  geom_bar()
# dpois(20, lambda = 100)
# Cumulative probability P(X <= 125) for X ~ Poisson(100).
ppois(q = 125, lambda = 100)
## [1] 0.9932024
\(X \sim Exp(\lambda)\)
# Simulate 1000 exponential draws with rate = 100 and show them as a histogram.
dados <- tibble(var_exp = rexp(n = 1000, rate = 100))
ggplot(dados, aes(var_exp)) +
  geom_histogram()
# Cumulative probability P(X <= 0.1) for X ~ Exp(rate = 100).
pexp(q = 0.1, rate = 100)
## [1] 0.9999546
\(X \sim U(a,b)\)
# Simulate 10000 draws from Uniform(0, 10) and show them as a histogram.
dados <- tibble(var_uniforme = runif(n = 10000, min = 0, max = 10))
ggplot(dados, aes(var_uniforme)) +
  geom_histogram()
\(X \sim N(\mu,\sigma^2)\)
# Simulate 1000 draws from Normal(mean = 1.65, sd = 0.1) and show them as a
# histogram.
dados <- tibble(var_normal = rnorm(n = 1000, mean = 1.65, sd = 0.1))
ggplot(dados, aes(var_normal)) +
  geom_histogram()
Considerando as variáveis aleatórias discutidas até aqui:
# Sample size shared by every simulated column.
n <- 1000
# One tibble with a column per distribution discussed so far.
dados <- tibble(
  # Bernoulli(0.5) as a logical vector; base-R replacement for the deprecated
  # purrr::rbernoulli() (a uniform draw above 1 - p is a success).
  var_bernoulli = runif(n) > (1 - 0.5),
  # `prob` spelled out instead of the partially matched `p`.
  var_binomial = rbinom(n, size = 5, prob = 0.25),
  var_poisson = rpois(n, lambda = 3),
  var_exp = rexp(n, rate = 3),
  var_uniforme = runif(n, min = 0, max = 10),
  var_normal = rnorm(n, mean = 40, sd = 10)
)
dados
## # A tibble: 1,000 x 6
## var_bernoulli var_binomial var_poisson var_exp var_uniforme var_normal
## <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 FALSE 0 2 0.201 6.67 42.3
## 2 FALSE 1 1 0.312 8.01 41.4
## 3 TRUE 0 3 0.854 3.96 31.8
## 4 FALSE 0 4 0.253 0.0883 39.0
## 5 FALSE 1 3 0.398 2.46 33.8
## 6 TRUE 2 3 0.133 1.97 29.0
## 7 FALSE 1 4 0.000469 8.00 52.9
## 8 FALSE 0 3 0.105 5.31 49.1
## 9 FALSE 0 7 0.871 3.50 43.2
## 10 TRUE 1 5 0.294 4.09 33.9
## # ... with 990 more rows
E se as variáveis fossem…
# Give the simulated columns domain-flavoured names (selected by position).
dadosEx <- rename(
  dados,
  sex_fem = 1,
  process_n = 2,
  firstYear_nProj = 3,
  firstProj_timing = 4,
  kmFromRes = 5,
  age = 6
)
# Multiply the timing column by 365 and round it to whole numbers.
dadosEx <- mutate(
  dadosEx,
  firstProj_timing = round(365 * firstProj_timing, digits = 0)
)
dadosEx
## # A tibble: 1,000 x 6
## sex_fem process_n firstYear_nProj firstProj_timing kmFromRes age
## <lgl> <int> <int> <dbl> <dbl> <dbl>
## 1 FALSE 0 2 73 6.67 42.3
## 2 FALSE 1 1 114 8.01 41.4
## 3 TRUE 0 3 312 3.96 31.8
## 4 FALSE 0 4 92 0.0883 39.0
## 5 FALSE 1 3 145 2.46 33.8
## 6 TRUE 2 3 48 1.97 29.0
## 7 FALSE 1 4 0 8.00 52.9
## 8 FALSE 0 3 38 5.31 49.1
## 9 FALSE 0 7 318 3.50 43.2
## 10 TRUE 1 5 107 4.09 33.9
## # ... with 990 more rows
Entender a distribuição dos dados nos permitiria …
# Density of age; fill/color are mapped to a constant and the legend that this
# produces is hidden. The y axis text and ticks are removed.
p1 <- ggplot(dadosEx, aes(age, fill = 1, color = 1)) +
  geom_density(alpha = 0.5) +
  labs(y = "") +
  theme(
    legend.position = "none",
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
# Boxplot of age drawn at a single dummy x position.
p2 <- ggplot(dadosEx, aes(1, age)) +
  geom_boxplot(color = "darkgray") +
  labs(x = "") +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
# Show both plots together.
p1 + p2
# Density of age split by a "fem"/"masc" label derived from sex_fem.
p1 <- dadosEx %>%
  mutate(sex = ifelse(sex_fem, "fem", "masc")) %>%
  ggplot(aes(age, fill = sex, color = sex)) +
  geom_density(alpha = 0.5) +
  labs(y = "") +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
# Boxplots of age per label; the legend is hidden in this panel.
p2 <- dadosEx %>%
  mutate(sex = ifelse(sex_fem, "fem", "masc")) %>%
  ggplot(aes(1, age, fill = sex)) +
  geom_boxplot() +
  labs(x = "") +
  theme(
    legend.position = "none",
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
# Show both plots together.
p1 + p2
# Variant of dadosEx where age depends on sex_fem: TRUE rows get an age drawn
# from N(35, 5), FALSE rows from N(50, 10).
# - the deprecated purrr::rbernoulli() is replaced by its base-R equivalent
#   (a uniform draw above 1 - p is a success);
# - the condition uses the logical column directly instead of `== T`;
# - the result is assigned with `<-` rather than a trailing `->`.
dadosEx2 <- dadosEx %>%
  mutate(sex_fem = runif(n) > (1 - 0.3)) %>%
  mutate(age = ifelse(sex_fem,
    rnorm(n, mean = 35, sd = 5),
    rnorm(n, mean = 50, sd = 10)
  ))
# Density of age per "fem"/"masc" label.
p1 <- dadosEx2 %>%
  mutate(sex = ifelse(sex_fem == TRUE, "fem", "masc")) %>%
  ggplot(aes(age, fill = sex, color = sex)) +
  geom_density(alpha = 0.5) +
  labs(y = "") +
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())
# Boxplots of age per label; the legend is hidden in this panel.
p2 <- dadosEx2 %>%
  mutate(sex = ifelse(sex_fem == TRUE, "fem", "masc")) %>%
  ggplot(aes(1, age, fill = sex)) +
  geom_boxplot() +
  labs(x = "") +
  theme(legend.position = "none") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p1 + p2
Entender a distribuição dos dados nos permitiria …
# Add a simulated salary column drawn from N(9, 2) with a fixed seed, then plot
# it: density with a vertical line at the mean (p1) and a boxplot with the mean
# marked (p2). Axis labels are formatted as dollar amounts with a "K" suffix.
set.seed(2021)
dadosEx <- dadosEx %>%
  mutate(salary = rnorm(n, mean = 9, sd = 2))
p1 <- dadosEx %>%
  ggplot(aes(salary, color = "", fill = "")) +
  geom_density(alpha = 0.5) +
  scale_colour_viridis_d("", begin = 0.45, end = 0.8, option = "C") +
  scale_fill_viridis_d("", begin = 0.45, end = 0.8, option = "C") +
  geom_vline(aes(xintercept = mean(salary), colour = "")) +
  scale_x_continuous(
    breaks = seq(0, ceiling(max(dadosEx$salary)), 2),
    guide = guide_axis(n.dodge = 2),
    labels = scales::dollar_format(suffix = "K")
  ) +
  theme(axis.text.y = element_blank(), legend.position = "none") +
  xlab("")
p2 <- dadosEx %>%
  ggplot(aes("", salary, color = "", fill = "")) +
  geom_boxplot(alpha = 0.5) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  stat_summary(fun = "mean", shape = 2) +
  scale_colour_viridis_d("", begin = 0.45, end = 0.8, option = "C") +
  scale_fill_viridis_d("", begin = 0.45, end = 0.8, option = "C") +
  scale_y_continuous(
    breaks = seq(
      floor(min(dadosEx$salary)),
      ceiling(max(dadosEx$salary)), 2
    ),
    labels = scales::dollar_format(suffix = "K")
  ) +
  theme(legend.position = "none") +
  ylab("") +
  xlab("")
ggpubr::ggarrange(p1, p2, ncol = 2, nrow = 1) +
  plot_annotation(title = "Fake Data: Salary")
# Salary split by a seniority label derived from sex_fem (TRUE -> "entry",
# FALSE -> "senior"): densities with one mean line per level (p1) and faceted
# boxplots with the mean marked (p2).
p1 <- dadosEx %>%
  mutate(level = ifelse(sex_fem == TRUE, "entry", "senior")) %>%
  ggplot(aes(salary, fill = level, color = level)) +
  geom_density(alpha = 0.5) +
  scale_colour_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
  scale_fill_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
  # Each layer filters to one level so mean(salary) is that level's mean.
  geom_vline(
    data = . %>% filter(level == "entry"),
    aes(xintercept = mean(salary), colour = level)
  ) +
  geom_vline(
    data = . %>% filter(level == "senior"),
    aes(xintercept = mean(salary), colour = level)
  ) +
  scale_x_continuous(
    breaks = seq(0, ceiling(max(dadosEx$salary)), 2),
    guide = guide_axis(n.dodge = 2),
    labels = scales::dollar_format(suffix = "K")
  ) +
  theme(axis.text.y = element_blank(), legend.position = "none") +
  xlab("")
p2 <- dadosEx %>%
  mutate(level = ifelse(sex_fem == TRUE, "entry", "senior")) %>%
  ggplot(aes("", salary, fill = level, color = level)) +
  geom_boxplot(alpha = 0.5) +
  facet_grid(. ~ level) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  stat_summary(fun = "mean", shape = 2) +
  scale_colour_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
  scale_fill_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
  scale_y_continuous(
    breaks = seq(
      floor(min(dadosEx$salary)),
      ceiling(max(dadosEx$salary)), 2
    ),
    labels = scales::dollar_format(suffix = "K")
  ) +
  ylab("") +
  xlab("")
ggpubr::ggarrange(p1, p2,
  ncol = 2, nrow = 1,
  common.legend = TRUE, legend = "bottom"
) +
  plot_annotation(title = "Fake Data: Salary vs. Seniority")
# Re-simulate sex_fem and make salary depend on it: TRUE rows are drawn from
# N(6, 2.5), FALSE rows from N(12, 3). The seed is fixed first.
# Changes relative to the previous version of this chunk:
# - the deprecated purrr::rbernoulli() is replaced by its base-R equivalent
#   (a uniform draw above 1 - p is a success);
# - the confusing `a <- ... -> b` double assignment is split in two statements;
# - axis breaks are computed from dadosEx_1 (the data actually plotted) rather
#   than from dadosEx, whose salary column has a different range;
# - stat_summary() uses `fun` instead of the deprecated `fun.y`.
set.seed(2021)
dadosEx_1 <- dadosEx %>%
  mutate(sex_fem = runif(n) > (1 - 0.3)) %>%
  mutate(salary = ifelse(sex_fem,
    rnorm(n, mean = 6, sd = 2.5),
    rnorm(n, mean = 12, sd = 3)
  ))
# The same data is also stored as dadosEx2, which the following chunks use.
dadosEx2 <- dadosEx_1
p1 <- dadosEx_1 %>%
  mutate(level = ifelse(sex_fem == TRUE, "entry", "senior")) %>%
  ggplot(aes(salary, fill = level, color = level)) +
  geom_density(alpha = 0.5) +
  scale_colour_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
  scale_fill_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
  # Each layer filters to one level so mean(salary) is that level's mean.
  geom_vline(
    data = . %>% filter(level == "entry"),
    aes(xintercept = mean(salary), colour = level)
  ) +
  geom_vline(
    data = . %>% filter(level == "senior"),
    aes(xintercept = mean(salary), colour = level)
  ) +
  scale_x_continuous(
    breaks = seq(0, ceiling(max(dadosEx_1$salary)), 2),
    guide = guide_axis(n.dodge = 2),
    labels = scales::dollar_format(suffix = "K")
  ) +
  theme(axis.text.y = element_blank(), legend.position = "none") +
  xlab("")
p2 <- dadosEx_1 %>%
  mutate(level = ifelse(sex_fem == TRUE, "entry", "senior")) %>%
  ggplot(aes("", salary, fill = level, color = level)) +
  geom_boxplot(alpha = 0.5) +
  facet_grid(. ~ level) +
  stat_summary(fun = "mean", shape = 2) +
  scale_colour_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
  scale_fill_viridis_d("", begin = 0.6, end = 0.8, option = "B") +
  scale_y_continuous(
    breaks = seq(
      floor(min(dadosEx_1$salary)),
      ceiling(max(dadosEx_1$salary)), 2
    ),
    labels = scales::dollar_format(suffix = "K")
  ) +
  ylab("") +
  xlab("")
ggpubr::ggarrange(p1, p2,
  ncol = 2, nrow = 1,
  common.legend = TRUE, legend = "bottom"
) +
  plot_annotation(title = "Fake Data: Salary vs. Seniority")
# Replace negative simulated salaries with the mean salary of the column.
dadosEx2 <- mutate(
  dadosEx2,
  salary = ifelse(salary < 0, mean(salary), salary)
)
# Boxplots of age, salary (multiplied by 1000) and years_since_graduation on
# one shared axis.
dadosEx2 %>%
  mutate(salary = 1000 * salary) %>%
  select(age, salary, years_since_graduation = firstYear_nProj) %>%
  pivot_longer(cols = everything()) %>%
  ggplot(aes(name, value, color = name, fill = name)) +
  geom_boxplot(alpha = 0.3) +
  scale_colour_viridis_d("", option = "E") +
  scale_fill_viridis_d("", option = "E") +
  theme(legend.position = "none") +
  labs(x = "", y = "", title = "Fake Data")
# Same plot with salary left in its original units.
dadosEx2 %>%
  select(age, salary, years_since_graduation = firstYear_nProj) %>%
  pivot_longer(cols = everything()) %>%
  ggplot(aes(name, value, color = name, fill = name)) +
  geom_boxplot(alpha = 0.3) +
  scale_colour_viridis_d("", option = "E") +
  scale_fill_viridis_d("", option = "E") +
  theme(legend.position = "none") +
  labs(x = "", y = "", title = "Fake Data")
# Boxplots of the three variables after scale() (z-score standardization).
p1 <- dadosEx2 %>%
  select(age, salary, years_since_graduation = firstYear_nProj) %>%
  scale() %>%
  as_tibble() %>%
  pivot_longer(cols = everything()) %>%
  ggplot(aes(name, value, color = name, fill = name)) +
  geom_boxplot(alpha = 0.3) +
  scale_colour_viridis_d("", option = "E") +
  scale_fill_viridis_d("", option = "E") +
  theme(legend.position = "none") +
  labs(x = "", y = "", title = "Standardization Z-Score")
# Boxplots of the same variables after rescaling each column to [0, 1].
p2 <- dadosEx2 %>%
  select(age, salary, years_since_graduation = firstYear_nProj) %>%
  mutate(across(
    everything(),
    function(col) scales::rescale(col, to = c(0, 1))
  )) %>%
  pivot_longer(cols = everything()) %>%
  ggplot(aes(name, value, color = name, fill = name)) +
  geom_boxplot(alpha = 0.3) +
  scale_colour_viridis_d("", option = "E") +
  scale_fill_viridis_d("", option = "E") +
  theme(legend.position = "none") +
  labs(x = "", y = "", title = "Normalization Min-Max")
# Min-max panel first, z-score panel second, under a common title.
p2 + p1 + plot_annotation(title = "Fake Data")
# Summary statistics for the three numeric variables.
dadosEx2 %>%
  select(age, salary, years_since_graduation = firstYear_nProj) %>%
  skimr::skim()
| Name | Piped data |
| Number of rows | 1000 |
| Number of columns | 3 |
| _______________________ | |
| Column type frequency: | |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 40.10 | 10.21 | 4.65 | 33.47 | 39.92 | 47.19 | 69.85 | ▁▃▇▆▁ |
| salary | 0 | 1 | 10.39 | 4.05 | 0.24 | 7.59 | 10.58 | 13.36 | 23.02 | ▂▆▇▃▁ |
| years_since_graduation | 0 | 1 | 3.00 | 1.82 | 0.00 | 2.00 | 3.00 | 4.00 | 11.00 | ▇▆▃▁▁ |
Correlação não é causalidade, MAS AJUDA!
# Pairwise plot matrix of every column, first plain and then coloured by
# sex_fem.
dadosEx %>% GGally::ggpairs()
dadosEx %>% GGally::ggpairs(mapping = aes(color = sex_fem))
# Same matrix after re-simulating sex_fem and making age depend on it
# (TRUE rows from N(38, 5), FALSE rows from N(45, 10)).
# The deprecated purrr::rbernoulli() is replaced by its base-R equivalent and
# the logical column is used directly instead of comparing with `T`.
dadosEx %>%
  mutate(sex_fem = runif(n) > (1 - 0.3)) %>%
  mutate(age = ifelse(sex_fem,
    rnorm(n, mean = 38, sd = 5),
    rnorm(n, mean = 45, sd = 10)
  )) %>%
  GGally::ggpairs(mapping = aes(color = sex_fem))
Counts of new PhDs in the mathematical sciences for 2008-09 and 2011-12 categorized by type of institution, gender, and US citizenship status.
# Load the AMSsurvey data set from carData and print its structure.
library(carData)
data("AMSsurvey")
glimpse(AMSsurvey)
# ?AMSsurvey
attitudes to science, from a survey where there were results from 20 classes in private schools and 46 classes in public schools. - like: a summary score based on two of the questions, on a scale from 1 (dislike) to 12 (like)
# Load the science data set from DAAG and print its structure.
library(DAAG)
data("science")
glimpse(science)
## Rows: 1,385
## Columns: 7
## $ State <fct> ACT, ACT, ACT, ACT, ACT, ACT, ACT, ACT, ACT, ACT, ACT, ACT, AC~
## $ PrivPub <fct> public, public, public, public, public, public, public, public~
## $ school <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ class <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,~
## $ sex <fct> f, f, f, f, f, f, f, f, m, m, m, m, m, m, f, f, f, f, f, f, f,~
## $ like <dbl> 8, 6, 5, 2, 5, 6, 3, 7, 6, 3, 4, 8, 5, 8, 2, 4, 4, 4, 3, 5, 6,~
## $ Class <fct> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.~
#?science
# Pairwise plot matrix of the swiss data set.
data("swiss")
GGally::ggpairs(swiss)
# Same matrix coloured by a two-level fertility group
# (Fertility < 75 -> "baixa", otherwise "alta").
swiss %>%
  mutate(Fertility_gr = ifelse(Fertility < 75, "baixa", "alta")) %>%
  GGally::ggpairs(mapping = aes(color = Fertility_gr))
# Load lincoln_weather (ggridges), clean its column names with janitor and
# print the structure of the result.
library(ggridges)
data("lincoln_weather")
# lincoln_weather %>% glimpse
lincoln_weather <- glimpse(janitor::clean_names(lincoln_weather))
## Rows: 366
## Columns: 24
## $ cst <chr> "2016-1-1", "2016-1-2", "2016-1-3", "2016-1~
## $ max_temperature_f <int> 37, 41, 37, 30, 38, 34, 33, 28, 22, 31, 37,~
## $ mean_temperature_f <int> 24, 23, 23, 17, 29, 33, 30, 25, 9, 11, 28, ~
## $ min_temperature_f <int> 11, 5, 8, 4, 19, 32, 27, 22, -4, -9, 18, 14~
## $ max_dew_point_f <int> 19, 22, 23, 24, 29, 33, 32, 25, 17, 20, 27,~
## $ mean_dew_point_f <int> 13, 14, 15, 13, 25, 32, 30, 22, 4, 5, 24, 1~
## $ min_dewpoint_f <int> 8, 4, 8, 2, 19, 29, 25, 18, -8, -13, 16, 10~
## $ max_humidity <int> 88, 100, 92, 92, 96, 100, 100, 92, 87, 87, ~
## $ mean_humidity <int> 68, 72, 73, 82, 83, 91, 96, 85, 77, 75, 74,~
## $ min_humidity <int> 47, 44, 54, 72, 70, 82, 92, 78, 67, 63, 64,~
## $ max_sea_level_pressure_in <dbl> 30.50, 30.35, 30.50, 30.50, 30.20, 29.98, 2~
## $ mean_sea_level_pressure_in <dbl> 30.39, 30.30, 30.42, 30.39, 30.06, 29.90, 2~
## $ min_sea_level_pressure_in <dbl> 30.30, 30.22, 30.32, 30.18, 29.99, 29.81, 2~
## $ max_visibility_miles <int> 10, 10, 10, 10, 10, 10, 9, 10, 10, 10, 10, ~
## $ mean_visibility_miles <int> 10, 10, 10, 9, 8, 4, 3, 6, 9, 10, 10, 10, 1~
## $ min_visibility_miles <int> 10, 10, 10, 6, 5, 0, 0, 2, 5, 10, 10, 5, 10~
## $ max_wind_speed_mph <int> 20, 15, 13, 17, 22, 16, 16, 25, 25, 10, 25,~
## $ mean_wind_speed_mph <int> 9, 6, 5, 7, 13, 7, 7, 16, 14, 5, 14, 6, 5, ~
## $ max_gust_speed_mph <int> 23, 18, 14, 23, 28, 21, 21, 32, 28, 12, 34,~
## $ precipitation_in <chr> "0", "0", "0", "0", "0", "T", "0", "0", "T"~
## $ cloud_cover <int> 0, 0, 0, 1, 4, 8, 8, 8, 5, 0, 7, 5, 0, 2, 5~
## $ events <chr> NA, NA, NA, NA, NA, "Fog", "Fog", "Snow", "~
## $ wind_dir_degrees <int> 280, 312, 330, 155, 178, 167, 7, 338, 340, ~
## $ month <fct> January, January, January, January, January~
# Distribution of mean temperature per month as gradient-filled ridgelines.
# `after_stat(x)` replaces the deprecated `stat(x)` helper for mapping fill to
# the computed x value.
ggplot(
  lincoln_weather,
  aes(x = mean_temperature_f, y = month, fill = after_stat(x))
) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "Temp. [F]", option = "C") +
  labs(title = "Temperatures in Lincoln NE in 2016") +
  theme_minimal()
# Same data as horizontal boxplots per month.
# NOTE(review): fill is mapped to a continuous column here; confirm it has the
# intended visual effect on the boxes.
ggplot(
  lincoln_weather,
  aes(y = mean_temperature_f, x = month, fill = mean_temperature_f)
) +
  coord_flip() +
  geom_boxplot()
library(tidyverse)
# Read the FiveThirtyEight avengers CSV through a URL connection, clean the
# column names and print the structure of the result.
avengers_raw <- readr::read_csv(
  base::url("https://raw.githubusercontent.com/fivethirtyeight/data/master/avengers/avengers.csv")
) %>%
  janitor::clean_names() %>%
  dplyr::glimpse()
## Rows: 173
## Columns: 21
## $ url <chr> "http://marvel.wikia.com/Henry_Pym_(Earth-~
## $ name_alias <chr> "Henry Jonathan \"Hank\" Pym", "Janet van ~
## $ appearances <dbl> 1269, 1165, 3068, 2089, 2402, 612, 3458, 1~
## $ current <chr> "YES", "YES", "YES", "YES", "YES", "YES", ~
## $ gender <chr> "MALE", "FEMALE", "MALE", "MALE", "MALE", ~
## $ probationary_introl <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ full_reserve_avengers_intro <chr> "Sep-63", "Sep-63", "Sep-63", "Sep-63", "S~
## $ year <dbl> 1963, 1963, 1963, 1963, 1963, 1963, 1964, ~
## $ years_since_joining <dbl> 52, 52, 52, 52, 52, 52, 51, 50, 50, 50, 50~
## $ honorary <chr> "Full", "Full", "Full", "Full", "Full", "H~
## $ death1 <chr> "YES", "YES", "YES", "YES", "YES", "NO", "~
## $ return1 <chr> "NO", "YES", "YES", "YES", "YES", NA, "YES~
## $ death2 <chr> NA, NA, NA, NA, "YES", NA, NA, "YES", NA, ~
## $ return2 <chr> NA, NA, NA, NA, "NO", NA, NA, "YES", NA, N~
## $ death3 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ return3 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ death4 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ return4 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ death5 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ return5 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ notes <chr> "Merged with Ultron in Rage of Ultron Vol.~
# Turn death1/return1 into factors with readable levels. Missing return1
# values become the explicit level "not_dead".
avengers_raw <- avengers_raw %>%
  mutate(
    death1 = forcats::fct_recode(death1, dead = "YES", not_dead = "NO"),
    return1 = forcats::fct_recode(return1, return = "YES", not_return = "NO"),
    return1 = forcats::fct_explicit_na(return1, "not_dead")
  ) %>%
  glimpse()
## Rows: 173
## Columns: 21
## $ url <chr> "http://marvel.wikia.com/Henry_Pym_(Earth-~
## $ name_alias <chr> "Henry Jonathan \"Hank\" Pym", "Janet van ~
## $ appearances <dbl> 1269, 1165, 3068, 2089, 2402, 612, 3458, 1~
## $ current <chr> "YES", "YES", "YES", "YES", "YES", "YES", ~
## $ gender <chr> "MALE", "FEMALE", "MALE", "MALE", "MALE", ~
## $ probationary_introl <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ full_reserve_avengers_intro <chr> "Sep-63", "Sep-63", "Sep-63", "Sep-63", "S~
## $ year <dbl> 1963, 1963, 1963, 1963, 1963, 1963, 1964, ~
## $ years_since_joining <dbl> 52, 52, 52, 52, 52, 52, 51, 50, 50, 50, 50~
## $ honorary <chr> "Full", "Full", "Full", "Full", "Full", "H~
## $ death1 <fct> dead, dead, dead, dead, dead, not_dead, de~
## $ return1 <fct> not_return, return, return, return, return~
## $ death2 <chr> NA, NA, NA, NA, "YES", NA, NA, "YES", NA, ~
## $ return2 <chr> NA, NA, NA, NA, "NO", NA, NA, "YES", NA, N~
## $ death3 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ return3 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ death4 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ return4 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ death5 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ return5 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
## $ notes <chr> "Merged with Ultron in Rage of Ultron Vol.~
# Default theme for the following plots: minimal, without legends.
theme_set(theme_minimal() + theme(legend.position = "none"))
# Density of years_since_joining with a vertical line at its mean.
p1 <- avengers_raw %>%
  ggplot(aes(years_since_joining, color = "", fill = "")) +
  geom_density(alpha = 0.5) +
  scale_colour_viridis_d("", begin = 0.45, end = 0.8) +
  scale_fill_viridis_d("", begin = 0.45, end = 0.8) +
  geom_vline(aes(xintercept = mean(years_since_joining), colour = "")) +
  scale_x_continuous(breaks = seq(
    min(avengers_raw$years_since_joining),
    max(avengers_raw$years_since_joining), 10
  )) +
  theme(axis.text.y = element_blank()) +
  xlab("")
# Boxplot of the same variable with the mean marked.
p2 <- avengers_raw %>%
  ggplot(aes("", years_since_joining, color = "", fill = "")) +
  geom_boxplot() +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  stat_summary(fun = "mean", shape = 2) +
  scale_colour_viridis_d("", begin = 0.45, end = 0.5) +
  scale_fill_viridis_d("", begin = 0.45, end = 0.5, alpha = 0.5) +
  scale_y_continuous(breaks = seq(
    min(avengers_raw$years_since_joining),
    max(avengers_raw$years_since_joining), 10
  )) +
  xlab("") +
  ylab("")
ggpubr::ggarrange(p1, p2, ncol = 2, nrow = 1) +
  plot_annotation(title = "Avengers: years_since_joining")
theme_set(theme_minimal())
# Density of years_since_joining per death1 level, with one mean line per
# level (each layer filters the data to that level before taking the mean).
p1 <- ggplot(
  avengers_raw,
  aes(years_since_joining, fill = death1, colour = death1)
) +
  geom_density(alpha = 0.5) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8) +
  geom_vline(
    data = function(d) filter(d, death1 == "dead"),
    aes(xintercept = mean(years_since_joining), colour = death1)
  ) +
  geom_vline(
    data = function(d) filter(d, death1 == "not_dead"),
    aes(xintercept = mean(years_since_joining), colour = death1)
  ) +
  theme(axis.text.y = element_blank()) +
  scale_x_continuous(
    breaks = seq(
      from = min(avengers_raw$years_since_joining),
      to = max(avengers_raw$years_since_joining),
      by = 10
    )
  ) +
  labs(x = "")
# Boxplots of the same variable, one facet per death1 level.
p2 <- ggplot(
  avengers_raw,
  aes("", years_since_joining, fill = death1, colour = death1)
) +
  geom_boxplot() +
  facet_grid(. ~ death1) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8, alpha = 0.5) +
  labs(x = "", y = "") +
  scale_y_continuous(
    breaks = seq(
      from = min(avengers_raw$years_since_joining),
      to = max(avengers_raw$years_since_joining),
      by = 10
    )
  ) +
  theme(legend.position = "bottom")
ggpubr::ggarrange(
  p1, p2,
  ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom"
) +
  plot_annotation(title = "Avengers: years_since_joining vs. Death (at least once)")
theme_set(theme_minimal())
# years_since_joining by return1, with levels ordered by frequency.
# avengers_raw$death1 and $return1 were already recoded into factors when
# avengers_raw was prepared, so the recoding is not repeated here: re-applying
# fct_recode() with the old "YES"/"NO" labels only raised unknown-level
# warnings and changed nothing.
p1 <- avengers_raw %>%
  mutate(return1 = forcats::fct_infreq(return1)) %>%
  ggplot(aes(years_since_joining, fill = return1, colour = return1)) +
  geom_density(alpha = 0.5) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8) +
  # One mean line per level: each layer filters to that level first.
  geom_vline(
    data = . %>% filter(return1 == "not_return"),
    aes(xintercept = mean(years_since_joining), colour = return1)
  ) +
  geom_vline(
    data = . %>% filter(return1 == "return"),
    aes(xintercept = mean(years_since_joining), colour = return1)
  ) +
  geom_vline(
    data = . %>% filter(return1 == "not_dead"),
    aes(xintercept = mean(years_since_joining), colour = return1)
  ) +
  theme(axis.text.y = element_blank()) +
  scale_x_continuous(breaks = seq(
    min(avengers_raw$years_since_joining),
    max(avengers_raw$years_since_joining), 10
  )) +
  xlab("")
p2 <- avengers_raw %>%
  mutate(return1 = forcats::fct_infreq(return1)) %>%
  ggplot(aes("", years_since_joining, fill = return1, colour = return1)) +
  facet_grid(. ~ return1) +
  geom_boxplot() +
  scale_y_continuous(breaks = seq(
    min(avengers_raw$years_since_joining),
    max(avengers_raw$years_since_joining), 10
  )) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8, alpha = 0.5) +
  xlab("") +
  ylab("")
ggpubr::ggarrange(p1, p2,
  ncol = 2, nrow = 1,
  common.legend = TRUE, legend = "bottom"
) +
  plot_annotation(title = "Avengers: years_since_joining vs. Return (at least once)")
# Frequency table of gender with the share formatted as a percentage,
# rendered as a styled table.
# `FALSE` is written out because the shorthand `F` is an ordinary variable
# that can be reassigned.
avengers_raw %>%
  janitor::tabyl(gender) %>%
  mutate(percent = scales::percent(percent)) %>%
  kableExtra::kbl() %>%
  kableExtra::kable_styling(full_width = FALSE)
| gender | n | percent |
|---|---|---|
| FEMALE | 58 | 34% |
| MALE | 115 | 66% |
theme_set(theme_minimal())
# Density of years_since_joining per gender, with one mean line per gender
# (each layer filters the data to that gender before taking the mean).
p1 <- ggplot(
  avengers_raw,
  aes(years_since_joining, fill = gender, colour = gender)
) +
  geom_density(alpha = 0.5) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8) +
  geom_vline(
    data = function(d) filter(d, gender == "FEMALE"),
    aes(xintercept = mean(years_since_joining), colour = gender)
  ) +
  geom_vline(
    data = function(d) filter(d, gender == "MALE"),
    aes(xintercept = mean(years_since_joining), colour = gender)
  ) +
  theme(axis.text.y = element_blank()) +
  scale_x_continuous(
    breaks = seq(
      from = min(avengers_raw$years_since_joining),
      to = max(avengers_raw$years_since_joining),
      by = 10
    )
  ) +
  labs(x = "")
# Boxplots of the same variable, one facet per gender.
p2 <- ggplot(
  avengers_raw,
  aes("", years_since_joining, fill = gender, colour = gender)
) +
  geom_boxplot() +
  facet_grid(. ~ gender) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8, alpha = 0.5) +
  labs(x = "", y = "") +
  scale_y_continuous(
    breaks = seq(
      from = min(avengers_raw$years_since_joining),
      to = max(avengers_raw$years_since_joining),
      by = 10
    )
  ) +
  theme(legend.position = "bottom")
ggpubr::ggarrange(
  p1, p2,
  ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom"
) +
  plot_annotation(title = "Avengers: years_since_joining vs. Gender")
# Density of appearances per gender, with one mean line per gender.
p1 <- ggplot(
  avengers_raw,
  aes(appearances, fill = gender, colour = gender)
) +
  geom_density(alpha = 0.5) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8) +
  geom_vline(
    data = function(d) filter(d, gender == "FEMALE"),
    aes(xintercept = mean(appearances), colour = gender)
  ) +
  geom_vline(
    data = function(d) filter(d, gender == "MALE"),
    aes(xintercept = mean(appearances), colour = gender)
  ) +
  theme(axis.text.y = element_blank()) +
  scale_x_continuous(
    breaks = seq(from = 0, to = max(avengers_raw$appearances), by = 500)
  ) +
  labs(x = "")
# Boxplots per gender; rows whose appearances value is reported as an outlier
# by boxplot.stats() within their gender are labelled with name_alias.
p2 <- ggplot(
  avengers_raw,
  aes("", appearances, fill = gender, colour = gender)
) +
  geom_boxplot() +
  ggrepel::geom_text_repel(
    data = function(d) {
      d %>%
        group_by(gender) %>%
        filter(appearances %in% boxplot.stats(appearances)$out)
    },
    aes(label = name_alias, y = appearances),
    size = 3
  ) +
  facet_grid(. ~ gender) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8, alpha = 0.5) +
  labs(x = "", y = "") +
  scale_y_continuous(
    breaks = seq(from = 0, to = max(avengers_raw$appearances), by = 500)
  ) +
  theme(legend.position = "bottom")
ggpubr::ggarrange(
  p1, p2,
  ncol = 2, nrow = 1, common.legend = TRUE, legend = "bottom"
) +
  plot_annotation(title = "Avengers: Appearances vs. Gender")
# Stand-alone version of the appearances boxplot: one facet per gender, rows
# flagged as outliers by boxplot.stats() within their gender are labelled with
# name_alias, and no legend is shown.
ggplot(
  avengers_raw,
  aes("", appearances, fill = gender, colour = gender)
) +
  geom_boxplot() +
  ggrepel::geom_text_repel(
    data = function(d) {
      d %>%
        group_by(gender) %>%
        filter(appearances %in% boxplot.stats(appearances)$out)
    },
    aes(label = name_alias, y = appearances),
    size = 3
  ) +
  facet_grid(. ~ gender) +
  scale_colour_viridis_d("", end = 0.8) +
  scale_fill_viridis_d("", end = 0.8, alpha = 0.5) +
  labs(x = "") +
  scale_y_continuous(
    breaks = seq(from = 0, to = max(avengers_raw$appearances), by = 500)
  ) +
  theme(legend.position = "none") +
  labs(title = "Avengers: Appearances vs. Gender")
Ao avaliar uma distribuição, observe:
E o que tudo isso tem a ver com Machine Learning?
Tudo! O que discutimos até aqui é o que os algoritmos de ML fazem de forma escalonada. E cabe a você, cientista de dados, garantir os ajustes necessários para que os modelos possam utilizar os dados da melhor forma, seja para inferência ou predição.
E lembre-se: você é a porta-voz dos dados, porém não se trata de você!
Toda essa teoria deve ser um meio e não um fim, então não perca o foco na (o) cliente final.