dados <- read_csv("speed-dating/speed-dating2.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## field = col_character(),
## from = col_character(),
## career = col_character(),
## attr3_s = col_logical(),
## sinc3_s = col_logical(),
## intel3_s = col_logical(),
## fun3_s = col_logical(),
## amb3_s = col_logical(),
## dec = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 10220 parsing failures.
## row col expected actual file
## 1847 attr3_s 1/0/T/F/TRUE/FALSE 8.00 'speed-dating/speed-dating2.csv'
## 1847 sinc3_s 1/0/T/F/TRUE/FALSE 10.00 'speed-dating/speed-dating2.csv'
## 1847 intel3_s 1/0/T/F/TRUE/FALSE 9.00 'speed-dating/speed-dating2.csv'
## 1847 fun3_s 1/0/T/F/TRUE/FALSE 10 'speed-dating/speed-dating2.csv'
## 1847 amb3_s 1/0/T/F/TRUE/FALSE 10 'speed-dating/speed-dating2.csv'
## .... ........ .................. ...... ................................
## See problems(...) for more details.
glimpse(dados)
## Observations: 4,918
## Variables: 44
## $ iid <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ gender <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ order <dbl> 4, 3, 10, 5, 7, 6, 1, 2, 8, 9, 10, 9, 6, 1, 3, 2, 7, 8, 4,...
## $ pid <dbl> 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 11, 12, 13, 14, 15...
## $ int_corr <dbl> 0.14, 0.54, 0.16, 0.61, 0.21, 0.25, 0.34, 0.50, 0.28, -0.3...
## $ samerace <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1...
## $ age_o <dbl> 27, 22, 22, 23, 24, 25, 30, 27, 28, 24, 27, 22, 22, 23, 24...
## $ age <dbl> 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 24, 24, 24, 24, 24...
## $ field <chr> "Law", "Law", "Law", "Law", "Law", "Law", "Law", "Law", "L...
## $ race <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ from <chr> "Chicago", "Chicago", "Chicago", "Chicago", "Chicago", "Ch...
## $ career <chr> "lawyer", "lawyer", "lawyer", "lawyer", "lawyer", "lawyer"...
## $ sports <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## $ tvsports <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ exercise <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7...
## $ dining <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, ...
## $ museums <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8...
## $ art <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6...
## $ hiking <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## $ gaming <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5...
## $ clubbing <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8...
## $ reading <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, ...
## $ tv <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ theater <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9...
## $ movies <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 8, ...
## $ concerts <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 7, 7, 7, 7, 7, 7, ...
## $ music <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8...
## $ shopping <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## $ yoga <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ attr <dbl> 6, 7, 5, 7, 5, 4, 7, 4, 7, 5, 5, 8, 5, 7, 6, 8, 7, 5, 7, 6...
## $ sinc <dbl> 9, 8, 8, 6, 6, 9, 6, 9, 6, 6, 7, 5, 8, 9, 8, 7, 5, 8, 6, 7...
## $ intel <dbl> 7, 7, 9, 8, 7, 7, 7, 7, 8, 6, 8, 6, 9, 7, 7, 8, 9, 7, 8, 8...
## $ fun <dbl> 7, 8, 8, 7, 7, 4, 4, 6, 9, 8, 4, 6, 6, 6, 9, 3, 6, 5, 9, 7...
## $ amb <dbl> 6, 5, 5, 6, 6, 6, 6, 5, 8, 10, 6, 9, 3, 5, 7, 6, 7, 9, 4, ...
## $ shar <dbl> 5, 6, 7, 8, 6, 4, 7, 6, 8, 8, 3, 6, 4, 7, 8, 2, 9, 5, 5, 8...
## $ like <dbl> 7, 7, 7, 7, 6, 6, 6, 6, 7, 6, 6, 7, 6, 7, 8, 6, 8, 5, 5, 8...
## $ prob <dbl> 6, 5, NA, 6, 6, 5, 5, 7, 7, 6, 4, 3, 7, 8, 6, 5, 7, 6, 6, ...
## $ match_es <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## $ attr3_s <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ sinc3_s <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ intel3_s <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ fun3_s <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ amb3_s <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ dec <chr> "yes", "yes", "yes", "yes", "yes", "no", "yes", "no", "yes...
Agora vamos analisar as variáveis pelos atributos principais, como idade, sexo, região, profissão.
dados1 <- dados %>%
mutate(gender = as.factor(gender),
race = as.factor(race),
dec = as.factor(dec))
dados1 %>%
group_by(gender) %>%
summarise(quantidade = n()) %>%
ggplot(mapping = aes(x = gender, y = quantidade, fill = gender)) +
geom_bar(stat = "identity") +
scale_fill_discrete(labels = c("Masculino", "Feminino")) +
labs(title = "Usuários por gênero",
x = "Gênero",
y = "Quantidade")
dados %>%
group_by(age) %>%
summarise(quantidade = n()) %>%
ggplot(mapping = aes(x = age, y = quantidade)) +
geom_bar(stat = "identity") +
labs(title = "Usuários por idade",
x = "Idade",
y = "Quantidade")
## Warning: Removed 1 rows containing missing values (position_stack).
dados %>%
group_by(career) %>%
summarise(quantidade = n()) %>%
filter(quantidade > 70) %>%
ggplot(mapping = aes(x = career, y = quantidade)) +
geom_bar(stat = "identity") +
labs(title = "Usuários por carreira",
x = "Profissão",
y = "Quantidade")
dados %>%
group_by(from) %>%
summarise(quantidade = n()) %>%
filter(quantidade > 85) %>%
ggplot(mapping = aes(x = from, y = quantidade)) +
geom_bar(stat = "identity") +
labs(title = "Usuário por local",
x = "Local",
y = "Quantidade")
O modelo de regressão logística é usado para variáveis de natureza binária, ou seja, quando duas pessoas são necessárias para quantificar essa variável. Logo, usaremos as seguintes variáveis:
attr: Quão atraente era a outra pessoa fun: Quão divertido a outra pessoa foi shar: Nível de compartilhamento de intresses e hobbies like : no geral, quanto p1 gostou de p2? prob: Probabilidade de se encontrar novamente com a outra pessoa dec: Se deu match ou não
modelo <- dados1 %>%
select(attr, fun, shar, like, prob, dec)
ggpairs(modelo)
## Warning: Removed 118 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 203 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 649 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 133 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 166 rows containing missing values
## Warning: Removed 118 rows containing non-finite values (stat_boxplot).
## Warning: Removed 203 rows containing missing values (geom_point).
## Warning: Removed 197 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 662 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 209 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 242 rows containing missing values
## Warning: Removed 197 rows containing non-finite values (stat_boxplot).
## Warning: Removed 649 rows containing missing values (geom_point).
## Warning: Removed 662 rows containing missing values (geom_point).
## Warning: Removed 643 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 648 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 668 rows containing missing values
## Warning: Removed 643 rows containing non-finite values (stat_boxplot).
## Warning: Removed 133 rows containing missing values (geom_point).
## Warning: Removed 209 rows containing missing values (geom_point).
## Warning: Removed 648 rows containing missing values (geom_point).
## Warning: Removed 122 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 162 rows containing missing values
## Warning: Removed 122 rows containing non-finite values (stat_boxplot).
## Warning: Removed 166 rows containing missing values (geom_point).
## Warning: Removed 242 rows containing missing values (geom_point).
## Warning: Removed 668 rows containing missing values (geom_point).
## Warning: Removed 162 rows containing missing values (geom_point).
## Warning: Removed 156 rows containing non-finite values (stat_density).
## Warning: Removed 156 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 118 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 197 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 643 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 122 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 156 rows containing non-finite values (stat_bin).
modelo1 = glm(dec ~ attr + fun + shar + like + prob,
data = modelo,
family = "binomial")
tidy(modelo1, conf.int = TRUE, exponentiate = TRUE, conf.level = 0.95)
Ao ver os resultados, podemos concluir que com 95% de confiança o fato da outra pessoa de ser engraçada, atraente, compartilhar os mesmos hobbies, no geral gostar da pessoa e a probabilidade encontrar a pessoa novamente têm uma relação média com o fato da pessoa dar um match na outra. O fato da pessoa gostar da outra é o que mais influenciou na decisão de dar ou não match na pessoa e o quão a pessoa achou a outra engraçada foi a que menos influenciou.
pR2(modelo1)
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML
## -2018.4172036 -2878.6843004 1720.5341935 0.2988404 0.3346995
## r2CU
## 0.4496976
Utilizamos a métrica McFadden pseudo-r2, o modelo explica 30% dos dados, sendo essa uma avaliação ruim do modelo e essas variáveis podem não ser adequadas para explicar porque a pessoa deu match na outra.