H/W1_Zaychenkova

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(ggplot2)
library(ggpubr)
library (foreign) 
library(corrplot)

## corrplot 0.92 loaded

library(sjPlot)

HBSC <- read.spss ("HBSC2014OAed1.1_F1.sav",to.data.frame=T, use.value.labels=T)

# victimization
HBSC$cbullpict_1[HBSC$cbullpict == "Haven't"] <- "1" 
HBSC$cbullpict_1[HBSC$cbullpict == "Once or twice"] <- "2" 
HBSC$cbullpict_1[HBSC$cbullpict == "2-3 times per month"] <- "3" 
HBSC$cbullpict_1[HBSC$cbullpict == "Once/week"] <- "4" 
HBSC$cbullpict_1[HBSC$cbullpict == "Several times/week"] <- "5"
HBSC$cbullpict_1 <- as.numeric(HBSC$cbullpict_1)

HBSC$cbullmess_1[HBSC$cbullmess == "Haven't"] <- "1" 
HBSC$cbullmess_1[HBSC$cbullmess == "Once or twice"] <- "2" 
HBSC$cbullmess_1[HBSC$cbullmess == "2-3 times per month"] <- "3" 
HBSC$cbullmess_1[HBSC$cbullmess == "Once/week"] <- "4" 
HBSC$cbullmess_1[HBSC$cbullmess == "Several times/week"] <- "5"
HBSC$cbullmess_1 <- as.numeric(HBSC$cbullmess_1)

HBSC$beenbullied_1[HBSC$beenbullied == "Haven't"] <- "1" 
HBSC$beenbullied_1[HBSC$beenbullied == "Once or twice"] <- "2" 
HBSC$beenbullied_1[HBSC$beenbullied == "2-3 times per month"] <- "3" 
HBSC$beenbullied_1[HBSC$beenbullied == "Once/week"] <- "4" 
HBSC$beenbullied_1[HBSC$beenbullied == "Several times/week"] <- "5"
HBSC$beenbullied_1 <- as.numeric(HBSC$beenbullied_1)

HBSC$victimization <- ifelse(HBSC$cbullpict_1 >= 3 | HBSC$cbullmess_1 >= 3 | HBSC$beenbullied_1 >= 3, "victim", "non-victim")

# lifesat
HBSC$lifesat_1 <- ifelse(HBSC$lifesat == "0, worst possible life", 1,
                 ifelse(HBSC$lifesat == "10, best possible life", 11, HBSC$lifesat))
table(HBSC$lifesat, HBSC$lifesat_1)

##                         
##                              1     2     3     4     5     6     7     8     9
##   0, worst possible life  1099     0     0     0     0     0     0     0     0
##   1                          0  1132     0     0     0     0     0     0     0
##   2                          0     0  1790     0     0     0     0     0     0
##   3                          0     0     0  3519     0     0     0     0     0
##   4                          0     0     0     0  6301     0     0     0     0
##   5                          0     0     0     0     0 15604     0     0     0
##   6                          0     0     0     0     0     0 17697     0     0
##   7                          0     0     0     0     0     0     0 35015     0
##   8                          0     0     0     0     0     0     0     0 48835
##   9                          0     0     0     0     0     0     0     0     0
##   10, best possible life     0     0     0     0     0     0     0     0     0
##                         
##                             10    11
##   0, worst possible life     0     0
##   1                          0     0
##   2                          0     0
##   3                          0     0
##   4                          0     0
##   5                          0     0
##   6                          0     0
##   7                          0     0
##   8                          0     0
##   9                      38724     0
##   10, best possible life     0 36237

# schoolpressure
class(HBSC$schoolpressure) #I'll leave it as factor, because the variable only has 4 categories. An thus it is not enough to make it numeric

## [1] "factor"

# device usage
HBSC$tvwd <- as.numeric(HBSC$tvwd)
HBSC$playgamewd <- as.numeric(HBSC$playgamewd)
HBSC$compusewd <- as.numeric(HBSC$compusewd)

HBSC$deviceusage <- rowMeans(HBSC[,c('tvwd', 'playgamewd', 'compusewd')], na.rm=TRUE)
summary(HBSC$deviceusage)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   2.667   3.667   3.848   4.667   9.000   19690

label(HBSC$victimization) = 'Victimization'
label(HBSC$lifesat_1) = 'Life satisfaction'
label(HBSC$schoolpressure) = 'Pressured by schoolwork'
label(HBSC$deviceusage) = 'Device usage, weekdays'

t1kable(table1(~lifesat_1 + victimization + 
         schoolpressure+ deviceusage, data = HBSC))%>%
  kable_styling(font_size = 16)

	Overall
	(N=214080)
Life satisfaction
Mean (SD)	8.63 (1.95)
Median [Min, Max]	9.00 [1.00, 11.0]
Missing	8127 (3.8%)
Victimization
non-victim	168307 (78.6%)
victim	25664 (12.0%)
Missing	20109 (9.4%)
Pressured by schoolwork
Not at all	44229 (20.7%)
A little	90123 (42.1%)
Some	46805 (21.9%)
A lot	23672 (11.1%)
Missing	9251 (4.3%)
Device usage, weekdays
Mean (SD)	3.85 (1.67)
Median [Min, Max]	3.67 [1.00, 9.00]
Missing	19690 (9.2%)

In the table above we can see that mean value of life satisfaction is quite high (8.63). In the victimization variable there a lot more non-victim cases (79%). Almost a half of the sample (42%) indicate being a little pressured by schoolwork, while ‘not at all’ and ‘some’ categories are almost equal. Mean for device usage on the weekdays is 3.85, while max is 9.

After that I create a separate dataset, so it will be more convenient to work, and I also delete NAs.

HBSC_1 <- select(HBSC, c(lifesat_1, victimization, schoolpressure, deviceusage))
dim(HBSC_1)

## [1] 214080      4

sapply(HBSC_1, function(x) sum(is.na(x)))

##      lifesat_1  victimization schoolpressure    deviceusage 
##           8127          20109           9251          19690

HBSC_1 <- na.omit(HBSC_1)
dim(HBSC_1)

## [1] 171650      4

Firstly, I propose to run a chi-squared test with a binary variable (victimization) and a categorical variable (schoolpressure) H0: the variables are distributed independently H1: the variables are not distributed independently

chisq_hw <- chisq.test(HBSC_1$victimization, HBSC_1$schoolpressure)

chisq.test(HBSC_1$victimization, HBSC_1$schoolpressure)$stdres

##                     HBSC_1$schoolpressure
## HBSC_1$victimization Not at all  A little      Some     A lot
##           non-victim   15.81626  16.44177 -10.05309 -32.26792
##           victim      -15.81626 -16.44177  10.05309  32.26792

chisq.test(HBSC_1$victimization, HBSC_1$schoolpressure)$expected

##                     HBSC_1$schoolpressure
## HBSC_1$victimization Not at all  A little      Some     A lot
##           non-victim  31248.693 66113.894 34776.335 17409.077
##           victim       4618.307  9771.106  5139.665  2572.923

chisq.test(HBSC_1$victimization, HBSC_1$schoolpressure)$observed

##                     HBSC_1$schoolpressure
## HBSC_1$victimization Not at all A little  Some A lot
##           non-victim      32141    67247 34187 15973
##           victim           3726     8638  5729  4009

corrplot(chisq_hw$stdres, is.cor = FALSE)

Interpretation: p-value is less than 0.05 (< 2.2e-16), thus we reject the null hypothesis, so there is a statistically significant relationship between two categories: victimization and school pressure. At the table with residuals the highest value is in the cell ‘victim’ and ‘a lot’ (32.27), which shows the positive association between these two variables, while the lowest value is the opposite ‘non-victim’ and ‘a lot’ (-32.27), meaning there is a strong negative association between them. While there is a high values in ‘non-victim’ and ‘not at all’ (15.82) and in ‘non-victim’ and ‘a little’ cells (16.44), which indicates an association between low school pressure and not being a victim. It may also be seen on the corrplot.

plot_xtab(HBSC_1$victimization, HBSC_1$schoolpressure, margin = "row", bar.pos = "stack",
         show.summary = TRUE) +  
  labs(
    x = "Victimization",
    fill="School pressure")

If we look at sjplot, it is noticeable that the proportion of people who note to be not pressured or pressured a little at school is higher among non-victims, while being under some or a lot pressure are higher for victimized group. Thus, the chi-squared test results indicate that victimization is positively associated with being under some or a lot pressure in school.

Secondly, I will conduct t-test with a binary variable (victimization) and numeric variable (lifesat_1). But before t-test, I will conduct the test to check the assumption regarding equality of variances, with H0: the life satisfaction variable variances are equal across levels of the victimization variable. And look at the disrtibution of lifesat_1 for each category of victimization. While for the t-test the null hypothesis is: there is no difference in mean values of life satisfaction between bullying victims and non-victims.

var.test(HBSC_1$lifesat_1~HBSC_1$victimization)

## 
##  F test to compare two variances
## 
## data:  HBSC_1$lifesat_1 by HBSC_1$victimization
## F = 0.60424, num df = 149547, denom df = 22101, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.5922443 0.6163840
## sample estimates:
## ratio of variances 
##           0.604239

hist(HBSC_1$lifesat_1[HBSC_1$victimization == 'victim'])

hist(HBSC_1$lifesat_1[HBSC_1$victimization == 'non-victim'])

t.test(HBSC_1$lifesat_1 ~ HBSC_1$victimization, var.equal = F)

## 
##  Welch Two Sample t-test
## 
## data:  HBSC_1$lifesat_1 by HBSC_1$victimization
## t = 57.088, df = 26194, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group non-victim and group victim is not equal to 0
## 95 percent confidence interval:
##  0.9147258 0.9797712
## sample estimates:
## mean in group non-victim     mean in group victim 
##                 8.727178                 7.779929

Interpretation: The variances are not equal (p-value is < 2.2e-16, thus we reject the null hypothesis) and lifesat_1 is not normally distributed. T-test: p-value is smaller than 0.05 (< 2.2e-16), meaning that null hypothesis should be rejected, meaning that there is a difference between mean values of life satisfaction between bullying victims and non-victims. For non-victim group the mean is 8.73, while for victims it is lower and equals to 7.78

I also propose to estimate the effect size for a t-test.

effectsize::cohens_d(HBSC_1$lifesat_1 ~ HBSC_1$victimization)

## Cohen's d |       95% CI
## ------------------------
## 0.50      | [0.54, 0.54]
## 
## - Estimated using pooled SD.

The effect size is equal to 0.5, which is a middle effect size.

ggplot(HBSC_1, aes(x=victimization, y=lifesat_1)) + 
  geom_boxplot()+
  theme_minimal(base_size = 16)+
  ylab('life satisfaction')+xlab('victimization')+
  ggpubr::stat_compare_means(method = "t.test")

On the plot we can see a visualized difference in the means between victims and non-victims, which complies with the formal test results. Thus, t-test results indicate that being a victim is associated with lower life satisfaction values, than being a non-victim.

Thirdly, I will conduct correlation test with two numeric variables (lifesat_1 and deviceusage). H0: there is no correlation between life satisfaction and device usage.

cor.test(HBSC_1$deviceusage, HBSC_1$lifesat_1)

## 
##  Pearson's product-moment correlation
## 
## data:  HBSC_1$deviceusage and HBSC_1$lifesat_1
## t = -50.208, df = 171648, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1249654 -0.1156409
## sample estimates:
##        cor 
## -0.1203058

ggscatter(HBSC_1, x = "deviceusage", y = "lifesat_1", 
          add = "reg.line", conf.int = TRUE, 
          cor.coef = TRUE, cor.method = "pearson",
          xlab = "deviceusage", ylab = "lifesat_1")

Interpretation: p-value is smaller than 0.05 (< 2.2e-16), meaning that null hypothesis is rejected, and there is a correlation between life satisfaction and device usage. The value of correlation coefficient (-0.12) indicates a weak negative correlation between life satisfaction and device usage. This results may be interpreted as following: the more satisfied a person with his life, the less he uses devices. The plot confirms the results, as a trend line shows the linear negative relationship between variables.

And lastly, I will conduct anova test using a numeric variable (lifesat_1) and a categorical variable with more than two categories (schoolpressure). First, we need to check variances and normality of data For Levene’s test H0: there is no difference in variances between categories of school work variable.

car::leveneTest(HBSC_1$lifesat_1~HBSC_1$schoolpressure)

## Levene's Test for Homogeneity of Variance (center = median)
##           Df F value    Pr(>F)    
## group      3   916.6 < 2.2e-16 ***
##       171646                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

hist(HBSC_1$lifesat_1[HBSC_1$schoolpressure == 'Not at all'])

hist(HBSC_1$lifesat_1[HBSC_1$schoolpressure == 'A little'])

hist(HBSC_1$lifesat_1[HBSC_1$schoolpressure == 'Some'])

hist(HBSC_1$lifesat_1[HBSC_1$schoolpressure == 'A lot'])

Interpretation: the p-value for Levene’s test is smaller than 0.05 (< 2.2e-16), thus we reject null hypothesis. And as we can see on histograms, life satisfaction is distributed non-normally in all of the categories of school pressure.

For anova test: H0: means of life satisfaction variable are equal across the groups. H1: means of life satisfaction variable are not equal across the groups.

plotmeans(lifesat_1 ~ schoolpressure, data = HBSC_1, 
          xlab = "Pressure in school", ylab = "life satisfaction",
          main="Mean Plot with 95% CI")

anova <- aov(HBSC_1$lifesat_1 ~ HBSC_1$schoolpressure)
summary(anova)

##                           Df Sum Sq Mean Sq F value Pr(>F)    
## HBSC_1$schoolpressure      3  33487   11162    3131 <2e-16 ***
## Residuals             171646 612037       4                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation: the p-value is smaller than 0.05 (<2e-16), which means that we reject the null hypothesis and there are some differences in the means of life satisfaction variable across school pressure categories. On the plot we can see that the highest mean value of life satisfaction is in ‘Not at all’ school work pressure category.

rstatix::eta_squared(aov(lifesat_1~schoolpressure, data = HBSC_1))

## schoolpressure 
##     0.05187622

The effect size (0.05) is considered medium.

pairwise.t.test(HBSC_1$lifesat_1, g = HBSC_1$schoolpressure, p.adj = "bonf")

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  HBSC_1$lifesat_1 and HBSC_1$schoolpressure 
## 
##          Not at all A little Some  
## A little <2e-16     -        -     
## Some     <2e-16     <2e-16   -     
## A lot    <2e-16     <2e-16   <2e-16
## 
## P value adjustment method: bonferroni

TukeyHSD(aov(lifesat_1~schoolpressure, data = HBSC_1))

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = lifesat_1 ~ schoolpressure, data = HBSC_1)
## 
## $schoolpressure
##                           diff        lwr        upr p adj
## A little-Not at all -0.4693614 -0.5004459 -0.4382769     0
## Some-Not at all     -0.9061153 -0.9414098 -0.8708209     0
## A lot-Not at all    -1.4822817 -1.5251052 -1.4394582     0
## Some-A little       -0.4367539 -0.4667488 -0.4067591     0
## A lot-A little      -1.0129203 -1.0514929 -0.9743477     0
## A lot-Some          -0.5761664 -0.6182056 -0.5341272     0

There is a significant differences between school pressure categories (p values are <2e-16), and Tukey multiple comparisons of means test also suggest the same, with adjusted p-value (0).

anova_comparisons <- list( c("Not at all", "A little"), c("Some", "A little"), c("A lot", "A little") , c("Not at all", "Some"), c("Not at all", "A lot"), c("A lot", "Some"))
ggplot(HBSC_1, aes(x=schoolpressure, y=lifesat_1)) + 
  geom_boxplot()+
  theme_minimal(base_size = 20)+
  ylab('Life satisfaction')+xlab('School pressure')+
  ggpubr::stat_compare_means(comparisons = anova_comparisons,label = "p.signif")+
  ggpubr::stat_compare_means(method = 'anova', label.y = 81)

Here we can visually see that each group significantly differs from others. Thus, the anova test results suggest that a lot of pressure is more associated with lower life satisfaction.

H/W1_Zaychenkova

Anastasia Zaychenkova

2024-02-15