h/w3_Zaychenkova

0.0.1 marginal effects
0.0.2 visualization
0.0.3 specificity & sensitivity
0.0.4 multicollinearity

library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library (foreign) 
# Read the SPSS file into a data frame. use.value.labels = TRUE converts
# variables that carry value labels into factors with those labels as levels.
# TRUE is spelled out because the shorthand T is an ordinary, reassignable name.
data_hw3 <- read.spss("HBSC2014OAed1.1_F1.sav",
                      to.data.frame = TRUE,
                      use.value.labels = TRUE)

#Part 1 (3 points)

##1. Create the dependent variable from var “fight12m”. It must be a variable with two levels: 1 - success (>= 2 times) and 0 - failure (None, 1 time).

# Binary outcome: 0 = fought at most once in the last 12 months,
# 1 = fought two or more times. Missing or unrecognised answers fall through
# both membership tests and stay NA.
fight_freq <- data_hw3$fight12m
rarely_fights <- c("None", "1 time")
often_fights <- c("2 times", "3 times", "4 times or more")
data_hw3$fighting <- ifelse(fight_freq %in% rarely_fights, 0,
                            ifelse(fight_freq %in% often_fights, 1, NA))

# Frequencies of the original answer categories, to check the recode against.
table(data_hw3$fight12m)

## 
##            None          1 time         2 times         3 times 4 times or more 
##          138771           30061           14182            7146           13031

# Frequencies of the recoded binary outcome (0 = none or 1 time, 1 = 2+ times).
table(data_hw3$fighting)

## 
##      0      1 
## 168832  34359

#1
# Score the 7-point agreement item as 1 ("Very strongly disagree") to
# 7 ("Very strongly agree") using the factor's level position. The previous
# nested ifelse() produced the same numbers only because ifelse() silently
# coerces a factor to its integer codes; the conversion is now explicit and
# guarded against an unexpected level layout.
stopifnot(
  is.factor(data_hw3$friendhelp),
  nlevels(data_hw3$friendhelp) == 7L,
  identical(levels(data_hw3$friendhelp)[c(1, 7)],
            c("Very strongly disagree", "Very strongly agree"))
)
data_hw3$friendhelp_1 <- as.numeric(data_hw3$friendhelp)
class(data_hw3$friendhelp_1)

## [1] "numeric"

# Cross-tabulate the original labels against the numeric scores; every count
# should sit on the diagonal if the recode is correct.
table(data_hw3$friendhelp, data_hw3$friendhelp_1)

##                         
##                              1     2     3     4     5     6     7
##   Very strongly disagree 12487     0     0     0     0     0     0
##   2                          0  7754     0     0     0     0     0
##   3                          0     0  9577     0     0     0     0
##   4                          0     0     0 19666     0     0     0
##   5                          0     0     0     0 31006     0     0
##   6                          0     0     0     0     0 40134     0
##   Very strongly agree        0     0     0     0     0     0 67366

#2
# Score the 7-point agreement item as 1 ("Very strongly disagree") to
# 7 ("Very strongly agree") using the factor's level position. The previous
# nested ifelse() produced the same numbers only because ifelse() silently
# coerces a factor to its integer codes; the conversion is now explicit and
# guarded against an unexpected level layout.
stopifnot(
  is.factor(data_hw3$friendcounton),
  nlevels(data_hw3$friendcounton) == 7L,
  identical(levels(data_hw3$friendcounton)[c(1, 7)],
            c("Very strongly disagree", "Very strongly agree"))
)
data_hw3$friendcounton_1 <- as.numeric(data_hw3$friendcounton)
class(data_hw3$friendcounton_1)

## [1] "numeric"

# Cross-tabulate the original labels against the numeric scores; every count
# should sit on the diagonal if the recode is correct.
table(data_hw3$friendcounton, data_hw3$friendcounton_1)

##                         
##                              1     2     3     4     5     6     7
##   Very strongly disagree 12894     0     0     0     0     0     0
##   2                          0  8029     0     0     0     0     0
##   3                          0     0  9476     0     0     0     0
##   4                          0     0     0 16709     0     0     0
##   5                          0     0     0     0 26683     0     0
##   6                          0     0     0     0     0 37729     0
##   Very strongly agree        0     0     0     0     0     0 73056

#3
# Score the 7-point agreement item as 1 ("Very strongly disagree") to
# 7 ("Very strongly agree") using the factor's level position. The previous
# nested ifelse() produced the same numbers only because ifelse() silently
# coerces a factor to its integer codes; the conversion is now explicit and
# guarded against an unexpected level layout.
stopifnot(
  is.factor(data_hw3$friendshare),
  nlevels(data_hw3$friendshare) == 7L,
  identical(levels(data_hw3$friendshare)[c(1, 7)],
            c("Very strongly disagree", "Very strongly agree"))
)
data_hw3$friendshare_1 <- as.numeric(data_hw3$friendshare)
class(data_hw3$friendshare_1)

## [1] "numeric"

# Cross-tabulate the original labels against the numeric scores; every count
# should sit on the diagonal if the recode is correct.
table(data_hw3$friendshare, data_hw3$friendshare_1)

##                         
##                              1     2     3     4     5     6     7
##   Very strongly disagree 13307     0     0     0     0     0     0
##   2                          0  6619     0     0     0     0     0
##   3                          0     0  7134     0     0     0     0
##   4                          0     0     0 12651     0     0     0
##   5                          0     0     0     0 21805     0     0
##   6                          0     0     0     0     0 34367     0
##   Very strongly agree        0     0     0     0     0     0 91551

#4
# Score the 7-point agreement item as 1 ("Very strongly disagree") to
# 7 ("Very strongly agree") using the factor's level position. The previous
# nested ifelse() produced the same numbers only because ifelse() silently
# coerces a factor to its integer codes; the conversion is now explicit and
# guarded against an unexpected level layout.
stopifnot(
  is.factor(data_hw3$friendtalk),
  nlevels(data_hw3$friendtalk) == 7L,
  identical(levels(data_hw3$friendtalk)[c(1, 7)],
            c("Very strongly disagree", "Very strongly agree"))
)
data_hw3$friendtalk_1 <- as.numeric(data_hw3$friendtalk)
class(data_hw3$friendtalk_1)

## [1] "numeric"

# Cross-tabulate the original labels against the numeric scores; every count
# should sit on the diagonal if the recode is correct.
table(data_hw3$friendtalk, data_hw3$friendtalk_1)

##                         
##                              1     2     3     4     5     6     7
##   Very strongly disagree 17092     0     0     0     0     0     0
##   2                          0  8694     0     0     0     0     0
##   3                          0     0  9625     0     0     0     0
##   4                          0     0     0 16573     0     0     0
##   5                          0     0     0     0 23810     0     0
##   6                          0     0     0     0     0 34521     0
##   Very strongly agree        0     0     0     0     0     0 77027

# Friend-support index: per-respondent mean of the four numeric friend items.
# na.rm = TRUE averages over whichever items were answered, so a respondent is
# missing on the index only when all four items are missing.
friend_items <- c("friendhelp_1", "friendcounton_1",
                  "friendshare_1", "friendtalk_1")
friend_scores <- data_hw3[, friend_items]
data_hw3$frndsup <- rowMeans(friend_scores, na.rm = TRUE)
summary(data_hw3$frndsup)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.500   6.000   5.411   7.000   7.000   25083

# Convert the 11-level life-satisfaction factor to a number using its level
# position. The previous nested ifelse() produced the same numbers only because
# ifelse() silently coerces a factor to its integer codes; the conversion is
# now explicit and guarded against an unexpected level layout.
# NOTE(review): the result runs from 1 to 11, one point above the 0-10 labels
# ("0, worst possible life" -> 1, "10, best possible life" -> 11). Slopes are
# unaffected, but group means and model intercepts are shifted by +1; subtract
# 1 here if the original 0-10 scale is wanted.
stopifnot(
  is.factor(data_hw3$lifesat),
  nlevels(data_hw3$lifesat) == 11L,
  identical(levels(data_hw3$lifesat)[c(1, 11)],
            c("0, worst possible life", "10, best possible life"))
)
data_hw3$lifesat_1 <- as.numeric(data_hw3$lifesat)
table(data_hw3$lifesat, data_hw3$lifesat_1)

##                         
##                              1     2     3     4     5     6     7     8     9
##   0, worst possible life  1099     0     0     0     0     0     0     0     0
##   1                          0  1132     0     0     0     0     0     0     0
##   2                          0     0  1790     0     0     0     0     0     0
##   3                          0     0     0  3519     0     0     0     0     0
##   4                          0     0     0     0  6301     0     0     0     0
##   5                          0     0     0     0     0 15604     0     0     0
##   6                          0     0     0     0     0     0 17697     0     0
##   7                          0     0     0     0     0     0     0 35015     0
##   8                          0     0     0     0     0     0     0     0 48835
##   9                          0     0     0     0     0     0     0     0     0
##   10, best possible life     0     0     0     0     0     0     0     0     0
##                         
##                             10    11
##   0, worst possible life     0     0
##   1                          0     0
##   2                          0     0
##   3                          0     0
##   4                          0     0
##   5                          0     0
##   6                          0     0
##   7                          0     0
##   8                          0     0
##   9                      38724     0
##   10, best possible life     0 36237

##2. Use the predictors: sex, age, m96, lifesat, friend support (average of “friendhelp”, “friendcounton”, “friendshare”, “friendtalk”).

# Keep only the outcome and the five candidate predictors.
m_hw3 <- data_hw3 %>%
  select(fighting, frndsup, AGE, sex, m96, lifesat_1)

# Count missing values per column. vapply() pins the result to exactly one
# integer per column, whereas sapply() changes its return type with the input.
vapply(m_hw3, function(col) sum(is.na(col)), integer(1))

##  fighting   frndsup       AGE       sex       m96 lifesat_1 
##     10889     25083      1854         0     27764      8127

# Listwise deletion: drop every row that has a missing value in any of the
# selected columns.
hw3 <- na.omit(m_hw3)

##3. Do pairwise comparisons for the relationship between the dependent variable and each independent variable. Don’t forget about assumptions.

# Chi-squared test of independence between sex and fighting (full sample),
# with Yates' continuity correction for the 2x2 table.
chisq.test(hw3$sex, hw3$fighting,correct = TRUE)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  hw3$sex and hw3$fighting
## X-squared = 9405.3, df = 1, p-value < 2.2e-16

# Standardized residuals of the same test: the sign shows whether a
# sex-by-fighting cell holds more (+) or fewer (-) cases than independence
# would predict.
chisq.test(hw3$sex, hw3$fighting,correct = TRUE)$stdres

##        hw3$fighting
## hw3$sex         0         1
##    Boy  -96.98758  96.98758
##    Girl  96.98758 -96.98758

Interpretation: the p-value is less than 0.05 (< 2.2e-16), so we reject the null hypothesis of independence: there is a statistically significant relationship between fighting and sex. The standardized residuals show the direction of the gender difference: girls are under-represented among those who fight (-96.98758) and over-represented among those who do not (96.98758), while boys are over-represented among those who fight (96.98758).

As we are interested in the question “Why do girls fight?”, I propose to make a subset with only girls.

# Girls-only subset of the complete-case sample; all later tests and models
# are run on it.
hw3_g <- subset(x = hw3, subset = sex == "Girl")

# F test for equal variances of AGE between non-fighters (0) and fighters (1);
# its result decides whether the t-test may assume equal variances.
var.test(hw3_g$AGE ~ hw3_g$fighting)

## 
##  F test to compare two variances
## 
## data:  hw3_g$AGE by hw3_g$fighting
## F = 1.0714, num df = 76777, denom df = 6626, p-value = 0.0001637
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  1.033806 1.109859
## sample estimates:
## ratio of variances 
##           1.071445

# Shapiro-Wilk normality check of AGE among girls who fight. shapiro.test()
# accepts at most 5000 observations, so only the first 5000 values are tested.
# head() replaces the former 0:5000 index, whose 0 was silently ignored.
# NOTE(review): the first 5000 rows are not a random sample of the group.
shapiro.test(head(hw3_g$AGE[hw3_g$fighting == 1], 5000))

## 
##  Shapiro-Wilk normality test
## 
## data:  hw3_g$AGE[hw3_g$fighting == 1][0:5000]
## W = 0.93484, p-value < 2.2e-16

# Shapiro-Wilk normality check of AGE among girls who do not fight.
# shapiro.test() accepts at most 5000 observations, so only the first 5000
# values are tested. head() replaces the former 0:5000 index, whose 0 was
# silently ignored.
# NOTE(review): the first 5000 rows are not a random sample of the group.
shapiro.test(head(hw3_g$AGE[hw3_g$fighting == 0], 5000))

## 
##  Shapiro-Wilk normality test
## 
## data:  hw3_g$AGE[hw3_g$fighting == 0][0:5000]
## W = 0.90611, p-value < 2.2e-16

# Welch two-sample t-test of mean AGE between non-fighters and fighters.
# var.equal = FALSE because the F test rejected equal variances; FALSE is
# spelled out because the shorthand F is an ordinary, reassignable name.
t.test(hw3_g$AGE ~ hw3_g$fighting, var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  hw3_g$AGE by hw3_g$fighting
## t = 4.4907, df = 7902.4, p-value = 7.199e-06
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.05171002 0.13182667
## sample estimates:
## mean in group 0 mean in group 1 
##        13.58641        13.49464

Interpretation: the p-value of the t-test is lower than 0.05 (7.199e-06), so we reject the null hypothesis: there is a significant difference in mean age between girls who engage in fighting and those who do not. For non-fighters the mean is 13.58, which is slightly higher than for fighters (13.49). As for assumptions: the variance test’s p-value is lower than 0.05 (0.0001), meaning that the variances are not equal, which is why the Welch version of the t-test is used. Both Shapiro-Wilk tests have p-values lower than 0.05 (< 2.2e-16), meaning that age is not normally distributed; with groups this large the t-test is still reasonably robust to that violation.

# Chi-squared test of independence between m96 (four frequency categories)
# and fighting, girls only.
chisq.test(hw3_g$m96, hw3_g$fighting)

## 
##  Pearson's Chi-squared test
## 
## data:  hw3_g$m96 and hw3_g$fighting
## X-squared = 964.79, df = 3, p-value < 2.2e-16

# Standardized residuals of the same test: the sign shows whether an
# m96-by-fighting cell holds more (+) or fewer (-) cases than independence
# would predict.
chisq.test(hw3_g$m96, hw3_g$fighting)$stdres

##                                    hw3_g$fighting
## hw3_g$m96                                    0          1
##   Hardly ever or never               20.150847 -20.150847
##   Less than weekly                    1.555688  -1.555688
##   Weekly                             -9.795776   9.795776
##   Daily (specified in own variable) -27.269088  27.269088

Interpretation: p-value is lower than 0.05, meaning that there is an association between fighting and meeting up with friends after 8 pm (m96). As for stdres we can notice that non-fighters are more likely to meet with friends hardly ever or never after 8 pm (20.15), while fighters are more likely to meet up daily (27.26).

# F test for equal variances of life satisfaction between non-fighters (0)
# and fighters (1); its result decides whether the t-test may assume equal
# variances.
var.test(hw3_g$lifesat_1 ~ hw3_g$fighting)

## 
##  F test to compare two variances
## 
## data:  hw3_g$lifesat_1 by hw3_g$fighting
## F = 0.67975, num df = 76777, denom df = 6626, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.6558747 0.7041250
## sample estimates:
## ratio of variances 
##          0.6797543

# Shapiro-Wilk normality check of life satisfaction among girls who fight.
# shapiro.test() accepts at most 5000 observations, so only the first 5000
# values are tested. head() replaces the former 0:5000 index, whose 0 was
# silently ignored.
# NOTE(review): the first 5000 rows are not a random sample of the group.
shapiro.test(head(hw3_g$lifesat_1[hw3_g$fighting == 1], 5000))

## 
##  Shapiro-Wilk normality test
## 
## data:  hw3_g$lifesat_1[hw3_g$fighting == 1][0:5000]
## W = 0.94217, p-value < 2.2e-16

# Shapiro-Wilk normality check of life satisfaction among girls who do not
# fight. shapiro.test() accepts at most 5000 observations, so only the first
# 5000 values are tested. head() replaces the former 0:5000 index, whose 0 was
# silently ignored.
# NOTE(review): the first 5000 rows are not a random sample of the group.
shapiro.test(head(hw3_g$lifesat_1[hw3_g$fighting == 0], 5000))

## 
##  Shapiro-Wilk normality test
## 
## data:  hw3_g$lifesat_1[hw3_g$fighting == 0][0:5000]
## W = 0.81901, p-value < 2.2e-16

# Welch two-sample t-test of mean life satisfaction between non-fighters and
# fighters. var.equal = FALSE because the F test rejected equal variances;
# FALSE is spelled out because the shorthand F is an ordinary, reassignable
# name.
t.test(hw3_g$lifesat_1 ~ hw3_g$fighting, var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  hw3_g$lifesat_1 by hw3_g$fighting
## t = 30.129, df = 7424.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.8399244 0.9568246
## sample estimates:
## mean in group 0 mean in group 1 
##        8.566399        7.668025

Interpretation: the t-test p-value is lower than 0.05 (< 2.2e-16), so we reject the null hypothesis: there is a significant difference between fighters and non-fighters in terms of life satisfaction. For non-fighters the mean value is higher (~8.57) than for fighters (~7.67); note that lifesat_1 is coded 1–11, so these means are one point above the original 0–10 scale. As for assumptions: the variance test shows that variances are not equal (p-value < 2.2e-16), while the Shapiro-Wilk tests show that life satisfaction is not normally distributed (p-values < 2.2e-16).

# F test for equal variances of friend support between non-fighters (0) and
# fighters (1); its result decides whether the t-test may assume equal
# variances.
var.test(hw3_g$frndsup ~ hw3_g$fighting)

## 
##  F test to compare two variances
## 
## data:  hw3_g$frndsup by hw3_g$fighting
## F = 0.86532, num df = 76777, denom df = 6626, p-value = 3.416e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.8349243 0.8963466
## sample estimates:
## ratio of variances 
##          0.8653228

# Shapiro-Wilk normality check of friend support among girls who fight.
# shapiro.test() accepts at most 5000 observations, so only the first 5000
# values are tested. head() replaces the former 0:5000 index, whose 0 was
# silently ignored.
# NOTE(review): the first 5000 rows are not a random sample of the group.
shapiro.test(head(hw3_g$frndsup[hw3_g$fighting == 1], 5000))

## 
##  Shapiro-Wilk normality test
## 
## data:  hw3_g$frndsup[hw3_g$fighting == 1][0:5000]
## W = 0.82866, p-value < 2.2e-16

# Shapiro-Wilk normality check of friend support among girls who do not fight.
# shapiro.test() accepts at most 5000 observations, so only the first 5000
# values are tested. head() replaces the former 0:5000 index, whose 0 was
# silently ignored.
# NOTE(review): the first 5000 rows are not a random sample of the group.
shapiro.test(head(hw3_g$frndsup[hw3_g$fighting == 0], 5000))

## 
##  Shapiro-Wilk normality test
## 
## data:  hw3_g$frndsup[hw3_g$fighting == 0][0:5000]
## W = 0.77357, p-value < 2.2e-16

# Welch two-sample t-test of mean friend support between non-fighters and
# fighters. var.equal = FALSE because the F test rejected equal variances;
# FALSE is spelled out because the shorthand F is an ordinary, reassignable
# name.
t.test(hw3_g$frndsup ~ hw3_g$fighting, var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  hw3_g$frndsup by hw3_g$fighting
## t = 10.167, df = 7649.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.1840847 0.2720279
## sample estimates:
## mean in group 0 mean in group 1 
##        5.637090        5.409034

Interpretation: the t-test p-value is lower than 0.05 (< 2.2e-16), so we reject the null hypothesis: there is a significant difference between fighters and non-fighters in terms of friend support. For non-fighters the mean value is a bit higher (~5.64) than for fighters (~5.41). As for assumptions: the variance test shows that variances are not equal (p-value is 3.416e-16), while the Shapiro-Wilk tests show that friend support is not normally distributed (p-values < 2.2e-16).

#Part 2 (6 points)

##1. Make a regression analysis and choose the best model. Remember, an insignificant result is also a result.

# Baseline logistic regression (binomial family) of fighting on age, life
# satisfaction, friend support and m96, fitted on the girls-only subset.
md1 <- glm(fighting ~ AGE + lifesat_1 + frndsup + m96, data = hw3_g, family = binomial)
# Coefficients are on the log-odds scale.
summary(md1)

## 
## Call:
## glm(formula = fighting ~ AGE + lifesat_1 + frndsup + m96, family = binomial, 
##     data = hw3_g)
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           1.185758   0.132550   8.946  < 2e-16 ***
## AGE                                  -0.150103   0.008516 -17.626  < 2e-16 ***
## lifesat_1                            -0.203920   0.005910 -34.503  < 2e-16 ***
## frndsup                              -0.048568   0.007540  -6.442 1.18e-10 ***
## m96Less than weekly                   0.312644   0.035143   8.896  < 2e-16 ***
## m96Weekly                             0.624291   0.035069  17.802  < 2e-16 ***
## m96Daily (specified in own variable)  1.286391   0.041873  30.721  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 46279  on 83404  degrees of freedom
## Residual deviance: 44071  on 83398  degrees of freedom
## AIC: 44085
## 
## Number of Fisher Scoring iterations: 5

# Group sizes of m96 among girls; the first level listed is the reference
# category of the m96 coefficients in the models.
table(hw3_g$m96)

## 
##              Hardly ever or never                  Less than weekly 
##                             43959                             18010 
##                            Weekly Daily (specified in own variable) 
##                             15763                              5673

Interpretation: As we can notice, all of the predictors are significant (age, life satisfaction, friend support and meeting with friends). Age is negatively associated with the girl’s chances to engage into fighting. With the increase of age by 1 the log of odds of girl fighting decreases by 0.15.

Life satisfaction is negatively associated with girl’s chances to engage into fighting. With the increase of life satisfaction by 1 the log of odds of girl fighting decreases by 0.20.

Friend support is negatively associated with girl’s chances to engage into fighting. With the increase of friend support by 1 the log of odds of girl fighting decreases by ~ 0.05.

Meeting with friends after 8 pm less than weekly compared to hardly ever or never is positively associated with girl’s chances to engage into fighting. Meeting friends after 8 pm less than weekly increases the log of odds of girl fighting by 0.31 compared to meeting hardly ever or never. Meeting with friends after 8 pm weekly compared to hardly ever or never is positively associated with girl’s chances to engage into fighting. Meeting friends after 8 pm weekly increases the log of odds of girl fighting by 0.62 compared to meeting hardly ever or never. Meeting with friends after 8 pm daily compared to hardly ever or never is positively associated with girl’s chances to engage into fighting. Meeting friends after 8 pm daily increases the log of odds of girl fighting by ~ 1.29 compared to meeting hardly ever or never.

# Same specification as md1 except that age enters as log(AGE), to check
# whether a log transformation of age fits better.
md2 <- glm(fighting ~ log(AGE) + lifesat_1 + frndsup + m96, data = hw3_g, family = binomial)
summary(md2)

## 
## Call:
## glm(formula = fighting ~ log(AGE) + lifesat_1 + frndsup + m96, 
##     family = binomial, data = hw3_g)
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           4.296619   0.305126  14.081  < 2e-16 ***
## log(AGE)                             -1.979452   0.113882 -17.382  < 2e-16 ***
## lifesat_1                            -0.203784   0.005913 -34.464  < 2e-16 ***
## frndsup                              -0.048543   0.007540  -6.438 1.21e-10 ***
## m96Less than weekly                   0.311612   0.035153   8.865  < 2e-16 ***
## m96Weekly                             0.621245   0.035073  17.713  < 2e-16 ***
## m96Daily (specified in own variable)  1.283744   0.041869  30.661  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 46279  on 83404  degrees of freedom
## Residual deviance: 44082  on 83398  degrees of freedom
## AIC: 44096
## 
## Number of Fisher Scoring iterations: 5

# md1 plus the lifesat_1:frndsup interaction (lifesat_1 * frndsup expands to
# both main effects and their product term).
md3 <- glm(fighting ~ AGE + lifesat_1 * frndsup + m96, data = hw3_g, family = binomial)
summary(md3)

## 
## Call:
## glm(formula = fighting ~ AGE + lifesat_1 * frndsup + m96, family = binomial, 
##     data = hw3_g)
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           0.870626   0.177689   4.900  9.6e-07 ***
## AGE                                  -0.150454   0.008516 -17.666  < 2e-16 ***
## lifesat_1                            -0.161490   0.016864  -9.576  < 2e-16 ***
## frndsup                               0.013641   0.024411   0.559  0.57628    
## m96Less than weekly                   0.311378   0.035143   8.860  < 2e-16 ***
## m96Weekly                             0.621542   0.035083  17.716  < 2e-16 ***
## m96Daily (specified in own variable)  1.283850   0.041882  30.654  < 2e-16 ***
## lifesat_1:frndsup                    -0.008094   0.003010  -2.689  0.00718 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 46279  on 83404  degrees of freedom
## Residual deviance: 44063  on 83397  degrees of freedom
## AIC: 44079
## 
## Number of Fisher Scoring iterations: 5

# md1 and md2 have the same number of parameters and neither is nested in the
# other (AGE vs log(AGE)), so the deviance table has 0 df and does not give a
# likelihood-ratio test. AIC is reported as well because it remains valid for
# non-nested models fitted to the same data.
anova(md1, md2)
AIC(md1, md2)

## Analysis of Deviance Table
## 
## Model 1: fighting ~ AGE + lifesat_1 + frndsup + m96
## Model 2: fighting ~ log(AGE) + lifesat_1 + frndsup + m96
##   Resid. Df Resid. Dev Df Deviance
## 1     83398      44071            
## 2     83398      44082  0  -10.775

# md2 (log(AGE)) and md3 (AGE plus interaction) are not nested either, so the
# 1-df deviance difference is descriptive only, not a likelihood-ratio test.
# AIC is reported as well because it remains valid for non-nested models
# fitted to the same data.
anova(md2, md3)
AIC(md2, md3)

## Analysis of Deviance Table
## 
## Model 1: fighting ~ log(AGE) + lifesat_1 + frndsup + m96
## Model 2: fighting ~ AGE + lifesat_1 * frndsup + m96
##   Resid. Df Resid. Dev Df Deviance
## 1     83398      44082            
## 2     83397      44063  1   18.018

# md1 is nested in md3 (md3 only adds the lifesat_1:frndsup term), so a
# likelihood-ratio test applies. test = "Chisq" is requested explicitly so the
# table reports the p-value for the 1-df deviance difference.
anova(md1, md3, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: fighting ~ AGE + lifesat_1 + frndsup + m96
## Model 2: fighting ~ AGE + lifesat_1 * frndsup + m96
##   Resid. Df Resid. Dev Df Deviance
## 1     83398      44071            
## 2     83397      44063  1   7.2438

Interpretation: After comparing the three models we keep md1 as the working model. md1 and md2 have the same residual degrees of freedom and are not nested, so the deviance difference of -10.8 cannot be tested formally; md1 has the lower residual deviance (44071 vs 44082) and the lower AIC (44085 vs 44096), so the log transformation of age does not improve the model. Between md2 and md3 the deviance differs by 18.0 with 1 degree of freedom, md3 being better. Comparing md1 and md3, md1 has a slightly higher deviance (by 7.24 with 1 degree of freedom) and a slightly higher AIC (44085 vs 44079), so the interaction brings only a small improvement; we prefer the simpler md1, which is easier to interpret.

0.0.1 marginal effects

# Average marginal effects (AME) of each md1 predictor, with standard errors
# and confidence bounds.
margins::margins_summary(md1)

##                                factor     AME     SE        z      p   lower
##                                   AGE -0.0106 0.0006 -17.5031 0.0000 -0.0118
##                               frndsup -0.0034 0.0005  -6.4354 0.0000 -0.0045
##                             lifesat_1 -0.0144 0.0004 -33.6997 0.0000 -0.0153
##  m96Daily (specified in own variable)  0.1217 0.0053  22.9701 0.0000  0.1113
##                   m96Less than weekly  0.0196 0.0023   8.4545 0.0000  0.0151
##                             m96Weekly  0.0448 0.0028  16.0161 0.0000  0.0393
##    upper
##  -0.0094
##  -0.0024
##  -0.0136
##   0.1321
##   0.0242
##   0.0503

Interpretation: Increase in age by 1 on average decreases the probability of girls fighting by 0.01.

Increase in friend support by 1 on average decreases the probability of girls fighting by 0.003.

Increase in life satisfaction by 1 on average decreases the probability of girls fighting by ~ 0.014.

Meeting with friends daily on average increases the probability of girls fighting by 0.12 compared to meeting with friends hardly ever or never. Meeting with friends less than weekly on average increases the probability of girls fighting by ~ 0.02 compared to meeting with friends hardly ever or never. Meeting with friends weekly on average increases the probability of girls fighting by ~ 0.04 compared to meeting with friends hardly ever or never.

0.0.2 visualization

# Predicted probability of fighting across AGE from md1. title = "" is now a
# plot_model() argument; previously it sat inside c(), where it became a third
# element of axis.title and never reached plot_model().
sjPlot::plot_model(md1, type = "pred", terms = "AGE",
                   axis.title = c("AGE", "Probability of girls fighting"),
                   title = "")

## Data were 'prettified'. Consider using `terms="AGE [all]"` to get smooth
##   plots.

# Predicted probability of fighting across friend support from md1.
# title = "" is now a plot_model() argument; previously it sat inside c(),
# where it became a third element of axis.title and never reached plot_model().
sjPlot::plot_model(md1, type = "pred", terms = "frndsup",
                   axis.title = c("Friend support",
                                  "Probability of girls fighting"),
                   title = "")

## Data were 'prettified'. Consider using `terms="frndsup [all]"` to get
##   smooth plots.

# Predicted probability of fighting across life satisfaction from md1.
# title = "" is now a plot_model() argument; previously it sat inside c(),
# where it became a third element of axis.title and never reached plot_model().
sjPlot::plot_model(md1, type = "pred", terms = "lifesat_1",
                   axis.title = c("Life satisfaction",
                                  "Probability of girls fighting"),
                   title = "")

# Predicted probability of fighting for each m96 category from md1.
# title = "" is now a plot_model() argument; previously it sat inside c(),
# where it became a third element of axis.title and never reached plot_model().
sjPlot::plot_model(md1, type = "pred", terms = "m96",
                   axis.title = c("M96", "Probability of girls fighting"),
                   title = "")

Interpretation: On the first 2 plots we can see that the range of the y-axis is quite small: the highest fighting probability is 7-8%, which shows that on average even 10-year-old girls with low levels of friend support are not very likely to fight, and this probability decreases as age and friend support grow. The plot for life satisfaction covers a wider range of predicted probabilities, with the highest being around 20% at low levels of life satisfaction. On the plot for m96 (meeting friends after 8 pm) we see that on average the probability of girls fighting is highest in the daily category.

##2. Model diagnostics. ### pseudo-R2

# McFadden's pseudo-R2 for md1, selected by name instead of by position.
pscl::pR2(md1)["McFadden"]

## fitting null model for pseudo-r2

##   McFadden 
## 0.04772505

Interpretation: McFadden’s pseudo-R2 is about 0.05, which is a really low value; a well-fitting model should have a value of 0.2 to 0.4, and this model is very far from that.

0.0.3 specificity & sensitivity

# ROC curve for the selected model md1, evaluated on the girls-only data it
# was fitted to. The previous call scored md2 on hw3, which also contains
# boys, so the curve did not describe the model being diagnosed.
# NOTE(review): re-knit so the plotted curve reflects this change.
sens <- pROC::roc(hw3_g$fighting, predict(md1, hw3_g, type = "response"))

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

# Draw the ROC curve stored in sens.
plot(sens)

#2.1. Area under the curve
# AUC for the selected model md1 on the girls-only data it was fitted to. The
# previous code scored md2 on hw3, which also contains boys.
# NOTE(review): re-knit so the reported AUC reflects this change.
fitted <- predict(md1, hw3_g, type = "response")
pROC::auc(hw3_g$fighting, fitted)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## Area under the curve: 0.6287

# 2.2. the percent of correctly predicted:
# Classification table for md1 at the default 0.5 probability threshold,
# with the share of correct predictions overall and per outcome.
pscl::hitmiss(md1)

## Classification Threshold = 0.5 
##          y=0  y=1
## yhat=0 76768 6614
## yhat=1    10   13
## Percent Correctly Predicted = 92.06%
## Percent Correctly Predicted = 99.99%, for y = 0
## Percent Correctly Predicted = 0.1962%  for y = 1
## Null Model Correctly Predicts 92.05%

## [1] 92.0580301 99.9869754  0.1961672

Interpretation: Although the plot shows that the curve lies above the diagonal line, it is not far above it, so the model discriminates poorly. This conclusion is confirmed by the area under the curve, which is about 0.63; a good model should reach 0.7 to 0.8. Lastly, the overall percent of correctly predicted values looks good (92%), but it is almost identical to what the null model achieves (92.05%). If we look at the predictions separately for fighters and non-fighters, we notice that for y = 1 (fighters) only 0.19% of cases are predicted correctly, which is less than 1 percent.

0.0.4 multicollinearity

# Generalized variance inflation factors for md1's predictors; m96 has 3 df,
# so GVIF^(1/(2*Df)) is the value comparable across terms.
car::vif(md1)

##               GVIF Df GVIF^(1/(2*Df))
## AGE       1.144968  1        1.070032
## lifesat_1 1.070857  1        1.034822
## frndsup   1.033127  1        1.016429
## m96       1.107551  3        1.017171

Interpretation: we do not notice any multicollinearity, as all GVIF values are smaller than 4.

All in all, the model is very bad at predicting the girls fighting, while good at predicting non-fighting behavior. Thus, we need to try other predictors for fighting behavior of girls.

h/w3_Zaychenkova

Anastasia Zaychenkova

2024-04-24

0.0.1 marginal effects

0.0.2 visualization

0.0.3 specificity & sensitivity

0.0.4 multicollinearity