library(aod)
library(ggplot2)
mydata <- read.csv("/Users/yahavmanor/Desktop/ENP164/dating_app - Sheet1.csv")
mydata #This command will show the first few rows of data and the variable names
## Image Good_Picture Age Height Fish_NoFish Individual_Group Smile_Size
## 1 1 1 38 67 0 0 3
## 2 2 1 26 69 1 0 2
## 3 3 0 20 70 1 0 3
## 4 4 1 38 66 0 0 3
## 5 5 1 29 63 0 0 2
## 6 6 0 24 73 0 1 4
## 7 7 0 23 64 0 1 4
## 8 8 1 24 74 0 0 3
## 9 9 1 22 66 0 0 3
## 10 10 1 40 72 0 0 1
## 11 11 0 31 69 0 1 4
## 12 12 0 33 64 0 0 3
## 13 13 1 26 67 0 0 3
## 14 14 1 32 67 0 0 3
## 15 15 0 27 71 0 1 4
## 16 16 1 24 72 0 0 1
## 17 17 1 73 72 0 0 3
## 18 18 1 25 65 1 0 3
## 19 19 1 26 70 0 0 1
## 20 20 1 47 66 1 0 3
## 21 21 0 32 69 0 1 4
## 22 22 1 23 71 0 0 3
## 23 23 1 27 69 0 0 2
## 24 24 1 27 74 1 0 3
## 25 25 0 53 66 0 1 4
## 26 26 1 32 70 1 0 3
## 27 27 1 25 72 0 0 2
## 28 28 0 25 70 0 1 4
## 29 29 1 51 65 0 0 3
## 30 30 0 62 67 0 0 3
## 31 31 1 34 65 1 0 3
## 32 32 1 25 63 0 0 3
## 33 33 0 33 65 0 1 4
## 34 34 1 33 64 0 0 3
## 35 35 0 28 73 0 1 4
## 36 36 1 27 64 0 0 2
## 37 37 1 47 66 0 0 3
## 38 38 1 29 65 0 0 3
## 39 39 1 35 67 0 0 3
## 40 40 1 24 68 1 0 3
## 41 41 1 25 72 1 0 3
## 42 42 1 25 70 0 0 3
## 43 43 1 47 72 0 0 2
## 44 44 1 29 66 0 0 3
## 45 45 1 24 69 0 0 2
## 46 46 1 27 64 1 0 3
## 47 47 1 28 65 0 0 3
## 48 48 0 38 67 0 0 3
## 49 49 0 27 70 0 0 1
## 50 50 0 28 68 0 1 4
## 51 51 0 28 65 0 0 4
## 52 52 1 41 64 0 0 2
## 53 53 1 53 66 0 0 3
## 54 54 1 57 70 0 0 3
## 55 55 1 24 68 0 0 1
## 56 56 1 27 66 0 0 2
## 57 57 1 29 70 0 0 2
## 58 58 1 28 70 0 0 3
## 59 59 1 26 73 0 0 2
## 60 60 1 30 70 0 0 2
# I want to be able to check that all my data exists and was transferred over correctly. I also want to check out the sd of each column in my data to make sure the numbers of mydata are read in correctly. I also want to look at the means of each column, because why not?!
summary(mydata)
## Image Good_Picture Age Height
## Min. : 1.00 Min. :0.0000 Min. :20.00 Min. :63.00
## 1st Qu.:15.75 1st Qu.:0.0000 1st Qu.:25.00 1st Qu.:65.75
## Median :30.50 Median :1.0000 Median :28.00 Median :68.00
## Mean :30.50 Mean :0.7333 Mean :32.35 Mean :68.08
## 3rd Qu.:45.25 3rd Qu.:1.0000 3rd Qu.:34.25 3rd Qu.:70.00
## Max. :60.00 Max. :1.0000 Max. :73.00 Max. :74.00
## Fish_NoFish Individual_Group Smile_Size
## Min. :0.0000 Min. :0.0000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.0000 Median :0.0000 Median :3.000
## Mean :0.1667 Mean :0.1667 Mean :2.817
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:3.000
## Max. :1.0000 Max. :1.0000 Max. :4.000
sapply(mydata, sd)
## Image Good_Picture Age Height
## 17.4642492 0.4459485 10.8218627 3.0603626
## Fish_NoFish Individual_Group Smile_Size
## 0.3758230 0.3758230 0.8334463
sapply(mydata, mean)
## Image Good_Picture Age Height
## 30.5000000 0.7333333 32.3500000 68.0833333
## Fish_NoFish Individual_Group Smile_Size
## 0.1666667 0.1666667 2.8166667
#Smile size indicates the following: 1 means no smile, 2 means smile with no teeth, 3 means smile with teeth, and 4 means not available (essentially meaning that there is more than one person in the picture-- Individual_Group = 1-- so it is not clear whether the specific person is smiling or not)
# This cross tab will check if there is data in each row and column of my dataset (making sure there are no NAs that may influence my data!)
xtabs(~Good_Picture + Smile_Size, data = mydata)
## Smile_Size
## Good_Picture 1 2 3 4
## 0 1 0 4 11
## 1 4 12 28 0
# All of the data is coming through and looks good!
mydata$Smile_Size <- factor(mydata$Smile_Size) #this makes a factor that can be used in my logistics model, noted by mylogit on the following line
mylogit <- glm(Good_Picture ~ Age + Height + Fish_NoFish + Individual_Group + Smile_Size, data = mydata, family = "binomial")
# summary command will print out essential results of my logistics model
summary(mylogit)
##
## Call:
## glm(formula = Good_Picture ~ Age + Height + Fish_NoFish + Individual_Group +
## Smile_Size, family = "binomial", data = mydata)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.327e+00 1.402e+01 -0.309 0.758
## Age -2.312e-02 4.131e-02 -0.560 0.576
## Height 9.048e-02 2.015e-01 0.449 0.653
## Fish_NoFish -8.580e-02 1.303e+00 -0.066 0.947
## Individual_Group -3.367e-01 1.127e+04 0.000 1.000
## Smile_Size2 1.844e+01 3.079e+03 0.006 0.995
## Smile_Size3 1.048e+00 1.579e+00 0.663 0.507
## Smile_Size4 -2.047e+01 1.075e+04 -0.002 0.998
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 69.590 on 59 degrees of freedom
## Residual deviance: 28.636 on 52 degrees of freedom
## AIC: 44.636
##
## Number of Fisher Scoring iterations: 18
confint.default(mylogit)
## 2.5 % 97.5 %
## (Intercept) -3.181206e+01 2.315885e+01
## Age -1.040839e-01 5.785229e-02
## Height -3.044643e-01 4.854335e-01
## Fish_NoFish -2.639465e+00 2.467874e+00
## Individual_Group -2.208832e+04 2.208764e+04
## Smile_Size2 -6.017177e+03 6.054058e+03
## Smile_Size3 -2.047557e+00 4.142873e+00
## Smile_Size4 -2.109795e+04 2.105700e+04
# terms need to look at how smile size influences ultimate predictions (in the eventual odds ratio to come), so only look at variables 6-8 in the model (denoted by Smile_Size2, Smile_Size3, and Smile_Size4)
wald.test(b = coef(mylogit), Sigma = vcov(mylogit), Terms = 6:8)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 0.44, df = 3, P(> X2) = 0.93
#stuck here: not sure why error is printing out when there are 5 total combinations that are possible
length(coef(mylogit))
## [1] 8
#Now, I know the length is 8, so I will want to create a 1x8 matrix with 8 columns
l <- cbind(0, 0, 0, 1, -1, 0, 0, 0)
wald.test(b = coef(mylogit), Sigma = vcov(mylogit), L = l)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 5e-10, df = 1, P(> X2) = 1.0
# Finding the odds ratios
exp(coef(mylogit))
## (Intercept) Age Height Fish_NoFish
## 1.321230e-02 9.771493e-01 1.094705e+00 9.177821e-01
## Individual_Group Smile_Size2 Smile_Size3 Smile_Size4
## 7.141203e-01 1.019836e+08 2.850965e+00 1.283444e-09
exp(cbind(OR = coef(mylogit), confint.default(mylogit)))
## OR 2.5 % 97.5 %
## (Intercept) 1.321230e-02 1.528256e-14 1.142249e+10
## Age 9.771493e-01 9.011497e-01 1.059558e+00
## Height 1.094705e+00 7.375184e-01 1.624879e+00
## Fish_NoFish 9.177821e-01 7.139948e-02 1.179734e+01
## Individual_Group 7.141203e-01 0.000000e+00 Inf
## Smile_Size2 1.019836e+08 0.000000e+00 Inf
## Smile_Size3 2.850965e+00 1.290497e-01 6.298349e+01
## Smile_Size4 1.283444e-09 0.000000e+00 Inf
with(mylogit, null.deviance - deviance)
## [1] 40.95402
with(mylogit, df.null - df.residual)
## [1] 7
with(mylogit, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 8.262185e-07
# Summarizing results of this data: These results tell us that the difference in the residuals of 40.95402 (otherwise known as errors) between our original model and a model with no variables, as a difference of 8 degrees of freedom (as there are 8 elements in the model that are being used to make the prediction) and that the difference between the model with 8 variables and a model with no variables is statistically significantly different (at the 0.05 level). In other words, our model provides a better prediction of whether a picture will be swiped left or right on (what makes a "good picture") rather than no model at all.