dados <- read_csv("speed-dating/speed-dating2.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   field = col_character(),
##   from = col_character(),
##   career = col_character(),
##   attr3_s = col_logical(),
##   sinc3_s = col_logical(),
##   intel3_s = col_logical(),
##   fun3_s = col_logical(),
##   amb3_s = col_logical(),
##   dec = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 10220 parsing failures.
##  row      col           expected actual                             file
## 1847 attr3_s  1/0/T/F/TRUE/FALSE  8.00  'speed-dating/speed-dating2.csv'
## 1847 sinc3_s  1/0/T/F/TRUE/FALSE  10.00 'speed-dating/speed-dating2.csv'
## 1847 intel3_s 1/0/T/F/TRUE/FALSE  9.00  'speed-dating/speed-dating2.csv'
## 1847 fun3_s   1/0/T/F/TRUE/FALSE  10    'speed-dating/speed-dating2.csv'
## 1847 amb3_s   1/0/T/F/TRUE/FALSE  10    'speed-dating/speed-dating2.csv'
## .... ........ .................. ...... ................................
## See problems(...) for more details.
glimpse(dados)
## Observations: 4,918
## Variables: 44
## $ iid      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ gender   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ order    <dbl> 4, 3, 10, 5, 7, 6, 1, 2, 8, 9, 10, 9, 6, 1, 3, 2, 7, 8, 4,...
## $ pid      <dbl> 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 11, 12, 13, 14, 15...
## $ int_corr <dbl> 0.14, 0.54, 0.16, 0.61, 0.21, 0.25, 0.34, 0.50, 0.28, -0.3...
## $ samerace <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1...
## $ age_o    <dbl> 27, 22, 22, 23, 24, 25, 30, 27, 28, 24, 27, 22, 22, 23, 24...
## $ age      <dbl> 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 24, 24, 24, 24, 24...
## $ field    <chr> "Law", "Law", "Law", "Law", "Law", "Law", "Law", "Law", "L...
## $ race     <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ from     <chr> "Chicago", "Chicago", "Chicago", "Chicago", "Chicago", "Ch...
## $ career   <chr> "lawyer", "lawyer", "lawyer", "lawyer", "lawyer", "lawyer"...
## $ sports   <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## $ tvsports <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ exercise <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...
## $ dining   <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, ...
## $ museums  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8...
## $ art      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6...
## $ hiking   <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## $ gaming   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5...
## $ clubbing <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8...
## $ reading  <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, ...
## $ tv       <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ theater  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9...
## $ movies   <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 8, ...
## $ concerts <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 7, 7, 7, 7, 7, 7, ...
## $ music    <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8...
## $ shopping <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## $ yoga     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ attr     <dbl> 6, 7, 5, 7, 5, 4, 7, 4, 7, 5, 5, 8, 5, 7, 6, 8, 7, 5, 7, 6...
## $ sinc     <dbl> 9, 8, 8, 6, 6, 9, 6, 9, 6, 6, 7, 5, 8, 9, 8, 7, 5, 8, 6, 7...
## $ intel    <dbl> 7, 7, 9, 8, 7, 7, 7, 7, 8, 6, 8, 6, 9, 7, 7, 8, 9, 7, 8, 8...
## $ fun      <dbl> 7, 8, 8, 7, 7, 4, 4, 6, 9, 8, 4, 6, 6, 6, 9, 3, 6, 5, 9, 7...
## $ amb      <dbl> 6, 5, 5, 6, 6, 6, 6, 5, 8, 10, 6, 9, 3, 5, 7, 6, 7, 9, 4, ...
## $ shar     <dbl> 5, 6, 7, 8, 6, 4, 7, 6, 8, 8, 3, 6, 4, 7, 8, 2, 9, 5, 5, 8...
## $ like     <dbl> 7, 7, 7, 7, 6, 6, 6, 6, 7, 6, 6, 7, 6, 7, 8, 6, 8, 5, 5, 8...
## $ prob     <dbl> 6, 5, NA, 6, 6, 5, 5, 7, 7, 6, 4, 3, 7, 8, 6, 5, 7, 6, 6, ...
## $ match_es <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## $ attr3_s  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ sinc3_s  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ intel3_s <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ fun3_s   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ amb3_s   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ dec      <chr> "yes", "yes", "yes", "yes", "yes", "no", "yes", "no", "yes...

Agora vamos analisar as variáveis pelos atributos principais, como idade, sexo, região, profissão.

dados1 <- dados %>% 
  mutate(gender = as.factor(gender),
         race = as.factor(race),
         dec = as.factor(dec))

dados1 %>% 
  group_by(gender) %>% 
  summarise(quantidade = n()) %>% 
  ggplot(mapping = aes(x = gender, y = quantidade, fill = gender)) +
  geom_bar(stat = "identity") + 
  scale_fill_discrete(labels = c("Masculino", "Feminino")) +
  labs(title = "Usuários por gênero",
       x = "Gênero",
       y = "Quantidade")

dados %>% 
  group_by(age) %>% 
  summarise(quantidade = n()) %>% 
  ggplot(mapping = aes(x = age, y = quantidade)) + 
  geom_bar(stat = "identity") + 
  labs(title = "Usuários por idade",
       x = "Idade",
       y = "Quantidade")
## Warning: Removed 1 rows containing missing values (position_stack).

dados %>% 
  group_by(career) %>% 
  summarise(quantidade = n()) %>% 
  filter(quantidade > 70) %>% 
  ggplot(mapping = aes(x = career, y = quantidade)) + 
  geom_bar(stat = "identity") + 
  labs(title = "Usuários por carreira",
       x = "Profissão",
       y = "Quantidade")

dados %>% 
  group_by(from) %>% 
  summarise(quantidade = n()) %>% 
  filter(quantidade > 85) %>% 
  ggplot(mapping = aes(x = from, y = quantidade)) + 
  geom_bar(stat = "identity") + 
  labs(title = "Usuário por local",
       x = "Local",
       y = "Quantidade")

Modelo de regressão logística

O modelo de regressão logística é usado para variáveis de natureza binária, ou seja, quando duas pessoas são necessárias para quantificar essa variável. Logo, usaremos as seguintes variáveis:

attr: Quão atraente era a outra pessoa fun: Quão divertido a outra pessoa foi shar: Nível de compartilhamento de intresses e hobbies like : no geral, quanto p1 gostou de p2? prob: Probabilidade de se encontrar novamente com a outra pessoa dec: Se deu match ou não

modelo  <- dados1 %>% 
  select(attr, fun, shar, like, prob, dec)
ggpairs(modelo)
## Warning: Removed 118 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 203 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 649 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 133 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 166 rows containing missing values
## Warning: Removed 118 rows containing non-finite values (stat_boxplot).
## Warning: Removed 203 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 662 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 209 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 242 rows containing missing values
## Warning: Removed 197 rows containing non-finite values (stat_boxplot).
## Warning: Removed 649 rows containing missing values (geom_point).
## Warning: Removed 662 rows containing missing values (geom_point).
## Warning: Removed 643 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 648 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 668 rows containing missing values
## Warning: Removed 643 rows containing non-finite values (stat_boxplot).
## Warning: Removed 133 rows containing missing values (geom_point).
## Warning: Removed 209 rows containing missing values (geom_point).
## Warning: Removed 648 rows containing missing values (geom_point).
## Warning: Removed 122 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 162 rows containing missing values
## Warning: Removed 122 rows containing non-finite values (stat_boxplot).
## Warning: Removed 166 rows containing missing values (geom_point).
## Warning: Removed 242 rows containing missing values (geom_point).
## Warning: Removed 668 rows containing missing values (geom_point).
## Warning: Removed 162 rows containing missing values (geom_point).
## Warning: Removed 156 rows containing non-finite values (stat_density).
## Warning: Removed 156 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 118 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 197 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 643 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 122 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 156 rows containing non-finite values (stat_bin).

modelo1 = glm(dec ~ attr + fun + shar + like + prob,
                  data = modelo,
                  family = "binomial")
tidy(modelo1, conf.int = TRUE, exponentiate = TRUE, conf.level = 0.95)

Ao ver os resultados, podemos concluir que com 95% de confiança o fato da outra pessoa de ser engraçada, atraente, compartilhar os mesmos hobbies, no geral gostar da pessoa e a probabilidade encontrar a pessoa novamente têm uma relação média com o fato da pessoa dar um match na outra. O fato da pessoa gostar da outra é o que mais influenciou na decisão de dar ou não match na pessoa e o quão a pessoa achou a outra engraçada foi a que menos influenciou.

O quanto o método explica os dados?

pR2(modelo1)
## fitting null model for pseudo-r2
##           llh       llhNull            G2      McFadden          r2ML 
## -2018.4172036 -2878.6843004  1720.5341935     0.2988404     0.3346995 
##          r2CU 
##     0.4496976

Utilizamos a métrica McFadden pseudo-r2, o modelo explica 30% dos dados, sendo essa uma avaliação ruim do modelo e essas variáveis podem não ser adequadas para explicar porque a pessoa deu match na outra.