Dating_App_Assignment

library(aod)
library(ggplot2)

# Read the dating-app picture data into a data frame.
# NOTE(review): this is an absolute path into one user's Desktop folder, so the
# script only runs unchanged on that machine.
mydata <- read.csv("/Users/yahavmanor/Desktop/ENP164/dating_app - Sheet1.csv")

mydata   #This command prints the whole data frame (all 60 rows, not just the first few) along with the variable names

##    Image Good_Picture Age Height Fish_NoFish Individual_Group Smile_Size
## 1      1            1  38     67           0                0          3
## 2      2            1  26     69           1                0          2
## 3      3            0  20     70           1                0          3
## 4      4            1  38     66           0                0          3
## 5      5            1  29     63           0                0          2
## 6      6            0  24     73           0                1          4
## 7      7            0  23     64           0                1          4
## 8      8            1  24     74           0                0          3
## 9      9            1  22     66           0                0          3
## 10    10            1  40     72           0                0          1
## 11    11            0  31     69           0                1          4
## 12    12            0  33     64           0                0          3
## 13    13            1  26     67           0                0          3
## 14    14            1  32     67           0                0          3
## 15    15            0  27     71           0                1          4
## 16    16            1  24     72           0                0          1
## 17    17            1  73     72           0                0          3
## 18    18            1  25     65           1                0          3
## 19    19            1  26     70           0                0          1
## 20    20            1  47     66           1                0          3
## 21    21            0  32     69           0                1          4
## 22    22            1  23     71           0                0          3
## 23    23            1  27     69           0                0          2
## 24    24            1  27     74           1                0          3
## 25    25            0  53     66           0                1          4
## 26    26            1  32     70           1                0          3
## 27    27            1  25     72           0                0          2
## 28    28            0  25     70           0                1          4
## 29    29            1  51     65           0                0          3
## 30    30            0  62     67           0                0          3
## 31    31            1  34     65           1                0          3
## 32    32            1  25     63           0                0          3
## 33    33            0  33     65           0                1          4
## 34    34            1  33     64           0                0          3
## 35    35            0  28     73           0                1          4
## 36    36            1  27     64           0                0          2
## 37    37            1  47     66           0                0          3
## 38    38            1  29     65           0                0          3
## 39    39            1  35     67           0                0          3
## 40    40            1  24     68           1                0          3
## 41    41            1  25     72           1                0          3
## 42    42            1  25     70           0                0          3
## 43    43            1  47     72           0                0          2
## 44    44            1  29     66           0                0          3
## 45    45            1  24     69           0                0          2
## 46    46            1  27     64           1                0          3
## 47    47            1  28     65           0                0          3
## 48    48            0  38     67           0                0          3
## 49    49            0  27     70           0                0          1
## 50    50            0  28     68           0                1          4
## 51    51            0  28     65           0                0          4
## 52    52            1  41     64           0                0          2
## 53    53            1  53     66           0                0          3
## 54    54            1  57     70           0                0          3
## 55    55            1  24     68           0                0          1
## 56    56            1  27     66           0                0          2
## 57    57            1  29     70           0                0          2
## 58    58            1  28     70           0                0          3
## 59    59            1  26     73           0                0          2
## 60    60            1  30     70           0                0          2

# Check that all of the data exists and was transferred over correctly.
# summary() shows the range, quartiles and mean of every column; the standard
# deviations and means printed further down are an additional check that the
# numbers in mydata were read in correctly.

summary(mydata)

##      Image        Good_Picture         Age            Height     
##  Min.   : 1.00   Min.   :0.0000   Min.   :20.00   Min.   :63.00  
##  1st Qu.:15.75   1st Qu.:0.0000   1st Qu.:25.00   1st Qu.:65.75  
##  Median :30.50   Median :1.0000   Median :28.00   Median :68.00  
##  Mean   :30.50   Mean   :0.7333   Mean   :32.35   Mean   :68.08  
##  3rd Qu.:45.25   3rd Qu.:1.0000   3rd Qu.:34.25   3rd Qu.:70.00  
##  Max.   :60.00   Max.   :1.0000   Max.   :73.00   Max.   :74.00  
##   Fish_NoFish     Individual_Group   Smile_Size   
##  Min.   :0.0000   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.0000   Median :0.0000   Median :3.000  
##  Mean   :0.1667   Mean   :0.1667   Mean   :2.817  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:3.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :4.000

# Standard deviation of every column, as a check that the values were read in
# as numbers. vapply() is used instead of sapply() so the result is always a
# named numeric vector with exactly one value per column.
vapply(mydata, sd, FUN.VALUE = numeric(1))

##            Image     Good_Picture              Age           Height 
##       17.4642492        0.4459485       10.8218627        3.0603626 
##      Fish_NoFish Individual_Group       Smile_Size 
##        0.3758230        0.3758230        0.8334463

# Mean of every column. vapply() is used instead of sapply() so the result is
# always a named numeric vector with exactly one value per column.
vapply(mydata, mean, FUN.VALUE = numeric(1))

##            Image     Good_Picture              Age           Height 
##       30.5000000        0.7333333       32.3500000       68.0833333 
##      Fish_NoFish Individual_Group       Smile_Size 
##        0.1666667        0.1666667        2.8166667

# Smile_Size is coded as follows: 1 means no smile, 2 means smile with no teeth, 3 means smile with teeth, and 4 means not available (in almost every case because there is more than one person in the picture -- Individual_Group = 1 -- so it is not clear whether the specific person is smiling or not; image 51 is the one picture coded 4 with Individual_Group = 0)

# This cross tab counts the pictures in every combination of Good_Picture and
# Smile_Size. Its purpose is to spot empty (zero-count) cells, which cause
# trouble once Smile_Size is used as a categorical predictor in the logistic
# regression. The counts add up to 60, the number of rows, so no row was left
# out of the table.

xtabs(~Good_Picture + Smile_Size, data = mydata)

##             Smile_Size
## Good_Picture  1  2  3  4
##            0  1  0  4 11
##            1  4 12 28  0

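# All 60 pictures are accounted for, but two cells are empty: no picture with Smile_Size 2 is a bad picture (Good_Picture = 0) and no picture with Smile_Size 4 is a good picture (Good_Picture = 1). Empty cells like these mean those smile-size levels predict the outcome perfectly in this sample, which is why the Smile_Size2 and Smile_Size4 estimates in the model below come out with very large standard errors.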

# Treat Smile_Size as categorical so that glm() estimates a separate
# coefficient for levels 2, 3 and 4 relative to the baseline level 1 (no smile).
mydata$Smile_Size <- factor(mydata$Smile_Size) #this makes a factor that can be used in my logistic regression model, noted by mylogit on the following line
# Logistic regression of Good_Picture on all of the picture attributes.
# NOTE(review): the summary below reports standard errors in the thousands for
# Individual_Group, Smile_Size2 and Smile_Size4 and needs 18 Fisher scoring
# iterations; together with the empty cells in the Good_Picture x Smile_Size
# cross tab this looks like (quasi-)complete separation, so those three
# estimates should not be read at face value -- confirm.
mylogit <- glm(Good_Picture ~ Age + Height + Fish_NoFish + Individual_Group + Smile_Size, data = mydata, family = "binomial")

# summary command will print out essential results of my logistic regression model
summary(mylogit)

## 
## Call:
## glm(formula = Good_Picture ~ Age + Height + Fish_NoFish + Individual_Group + 
##     Smile_Size, family = "binomial", data = mydata)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)
## (Intercept)      -4.327e+00  1.402e+01  -0.309    0.758
## Age              -2.312e-02  4.131e-02  -0.560    0.576
## Height            9.048e-02  2.015e-01   0.449    0.653
## Fish_NoFish      -8.580e-02  1.303e+00  -0.066    0.947
## Individual_Group -3.367e-01  1.127e+04   0.000    1.000
## Smile_Size2       1.844e+01  3.079e+03   0.006    0.995
## Smile_Size3       1.048e+00  1.579e+00   0.663    0.507
## Smile_Size4      -2.047e+01  1.075e+04  -0.002    0.998
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 69.590  on 59  degrees of freedom
## Residual deviance: 28.636  on 52  degrees of freedom
## AIC: 44.636
## 
## Number of Fisher Scoring iterations: 18

# 95% confidence intervals for the coefficients on the log-odds scale, computed
# from each estimate and its standard error.
confint.default(mylogit)

##                          2.5 %       97.5 %
## (Intercept)      -3.181206e+01 2.315885e+01
## Age              -1.040839e-01 5.785229e-02
## Height           -3.044643e-01 4.854335e-01
## Fish_NoFish      -2.639465e+00 2.467874e+00
## Individual_Group -2.208832e+04 2.208764e+04
## Smile_Size2      -6.017177e+03 6.054058e+03
## Smile_Size3      -2.047557e+00 4.142873e+00
## Smile_Size4      -2.109795e+04 2.105700e+04

# Overall test of the Smile_Size effect: coefficients 6-8 of the model are
# Smile_Size2, Smile_Size3 and Smile_Size4, so Terms = 6:8 jointly tests
# whether all three are zero. This shows how much smile size contributes to
# the predictions before reading the individual odds ratios further down.
wald.test(b = coef(mylogit), Sigma = vcov(mylogit), Terms = 6:8)

## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 0.44, df = 3, P(> X2) = 0.93

#stuck here: not sure why error is printing out when there are 5 total combinations that are possible

# Count the model coefficients (intercept included); a contrast matrix passed
# to wald.test() through L needs one column per coefficient.
length(coef(mylogit))

## [1] 8

# The model has 8 coefficients, so the contrast is a 1 x 8 matrix (one row,
# eight columns) with one entry per coefficient, in the order shown by
# summary(mylogit). The 1 and -1 sit in positions 4 and 5, i.e. Fish_NoFish and
# Individual_Group, so the test below asks whether those two coefficients are
# equal.
# NOTE(review): if the goal was to compare two smile-size levels (for example
# Smile_Size2 vs Smile_Size3), the 1 and -1 belong in positions 6 and 7
# instead -- confirm the intended contrast.
l <- cbind(0, 0, 0, 1, -1, 0, 0, 0)

wald.test(b = coef(mylogit), Sigma = vcov(mylogit), L = l)

## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 5e-10, df = 1, P(> X2) = 1.0

# Finding the odds ratios: exponentiating each log-odds coefficient gives the
# multiplicative change in the odds of Good_Picture = 1 for that term.
exp(coef(mylogit))

##      (Intercept)              Age           Height      Fish_NoFish 
##     1.321230e-02     9.771493e-01     1.094705e+00     9.177821e-01 
## Individual_Group      Smile_Size2      Smile_Size3      Smile_Size4 
##     7.141203e-01     1.019836e+08     2.850965e+00     1.283444e-09

# Odds ratios together with their 95% confidence limits: the coefficients and
# the confint.default() limits are bound into one table and exponentiated.
exp(cbind(OR = coef(mylogit), confint.default(mylogit)))

##                            OR        2.5 %       97.5 %
## (Intercept)      1.321230e-02 1.528256e-14 1.142249e+10
## Age              9.771493e-01 9.011497e-01 1.059558e+00
## Height           1.094705e+00 7.375184e-01 1.624879e+00
## Fish_NoFish      9.177821e-01 7.139948e-02 1.179734e+01
## Individual_Group 7.141203e-01 0.000000e+00          Inf
## Smile_Size2      1.019836e+08 0.000000e+00          Inf
## Smile_Size3      2.850965e+00 1.290497e-01 6.298349e+01
## Smile_Size4      1.283444e-09 0.000000e+00          Inf

# Drop in deviance from the intercept-only (null) model to the fitted model;
# this is the statistic for the overall model chi-squared test.
mylogit[["null.deviance"]] - mylogit[["deviance"]]

## [1] 40.95402

# Degrees of freedom for that test: the difference between the null and the
# residual degrees of freedom of the fitted model.
mylogit[["df.null"]] - mylogit[["df.residual"]]

## [1] 7

# Upper-tail chi-squared probability of the deviance drop, i.e. the p-value for
# the fitted model against the null model.
pchisq(
  mylogit[["null.deviance"]] - mylogit[["deviance"]],
  df = mylogit[["df.null"]] - mylogit[["df.residual"]],
  lower.tail = FALSE
)

## [1] 8.262185e-07

# Summarizing results of this data: These results tell us that the difference in deviance (a measure of how poorly a model fits) between a model with no variables and our model is 40.95402, with a difference of 7 degrees of freedom (our model estimates 7 coefficients beyond the intercept: Age, Height, Fish_NoFish, Individual_Group, and the three Smile_Size indicators), and that with a p-value of 8.262185e-07 the difference between our model and a model with no variables is statistically significant (at the 0.05 level).  In other words, our model provides a better prediction of whether a picture will be swiped left or right on (what makes a "good picture") than no model at all.

Dating_App_Assignment

Daniel Hannon. Edited by Yahav Manor

2025-04-15