library(haven)
library(tidyverse)
library(dplyr)
library(readr)
library(ggplot2)
library(graphics)
library(corrplot)
library(stats)
library(stringr)
library(forcats)
Reading in data
# Load the raw survey export; the path is resolved against the working directory.
data <- read.csv(file = "data.csv")
Cleaning data (NA and outlier values)
## Omit rows with missing data
data_clean <- na.omit(data)
## IQR
# Keep only responses whose completion time lies inside the 1.5 * IQR fences
# around the first and third quartiles.
duration <- data_clean[["Duration__in_seconds_"]]
iqr <- IQR(duration)
lower_limit <- quantile(duration, 0.25) - 1.5 * iqr
upper_limit <- quantile(duration, 0.75) + 1.5 * iqr
within_fences <- duration >= lower_limit & duration <= upper_limit
data_clean <- data_clean[within_fences, ]
# Drop the listed *_TEXT columns from the cleaned data.
free_text_columns <- c("Q5_3_TEXT", "Q29_8_TEXT", "Q14_8_TEXT")
data_clean <- data_clean[, !(names(data_clean) %in% free_text_columns)]
## Creating data frame
df_data <- data.frame(data_clean)
## Removing characters from age column
# Strip everything that is not a digit from the age answers and convert the
# remainder to integer in one step. Answers that contain no digit at all become
# NA. (as.numeric() has no `na.rm` argument, and trimming whitespace is
# unnecessary once every non-digit character has been removed.)
df_data$Q3 <- as.integer(gsub("[^0-9]", "", df_data$Q3))
## Age distribution
# Box plot of the cleaned age column.
ggplot(df_data, aes(x = Q3)) +
  geom_boxplot() +
  labs(title = "Az életkor megoszlása", x = "Életkor", y = "Frekvencia")
# Histogram of age with a dashed vertical line at the mean age.
# `linewidth` replaces the `size` aesthetic, which is deprecated for lines since
# ggplot2 3.4.0; `na.rm = TRUE` keeps the mean line defined even if an age
# answer could not be parsed and is NA.
ggplot(df_data, aes(x = Q3)) +
  geom_histogram(bins = 10) +
  geom_vline(
    aes(xintercept = mean(Q3, na.rm = TRUE)),
    color = "blue", linetype = "dashed", linewidth = 1
  ) +
  labs(title = "Életkori hisztogram", x = "Életkor", y = "Frekvencia")
## Gender
# NOTE(review): this prints the entire cleaned data set; consider
# head(data_clean) if only a quick look is intended.
data_clean
# Absolute frequencies of the Q4 answer codes.
table(data_clean$Q4)
##
## 1 2
## 84 135
n <- nrow(data_clean)
# Share of each Q4 code in percent of all cleaned responses.
(percent_gender <- table(data_clean$Q4)/n * 100)
##
## 1 2
## 38.35616 61.64384
# barplot(percent_gender,ylim=c(0,70), ylab="%",main="Biológiai nem eloszlása",names.arg=c("Férfi", "Nő"),col=c("darkblue","red"))
# Put the percentages into a real numeric column so that the y aesthetic is
# taken from the plot data instead of the global `table` object (which made
# ggplot2 guess a scale for an object of type <table>).
# Assumes code 1 = "Férfi" and code 2 = "Nő", as in the original labels.
gender_shares <- data.frame(
  category = c("Férfi", "Nő"),
  percent = as.numeric(percent_gender)
)
ggplot(gender_shares, aes(x = category, y = percent, fill = category)) +
  geom_col() +
  labs(y = "%", title = "Biológiai nem eloszlása", x = "", fill = "") +
  geom_text(aes(label = round(percent, 1)), vjust = 1, hjust = 0.5) +
  scale_y_continuous(limits = c(0, 70))
## Schooling
# Axis labels keyed by the Q7 answer code. Naming the labels (instead of
# relying on their position) keeps each label attached to its code even if a
# code does not occur in the data.
schooling_labels <- c(
  "1" = "Alapfokú végzettség",
  "2" = "Szakmunkás, szakiskola",
  "3" = "Érettségi",
  "4" = "Felsőfokú végzettség"
)
# Bar chart of schooling level counts, shaded from light to dark grey.
ggplot(df_data, aes(x = factor(Q7), fill = factor(Q7))) +
  geom_bar(stat = "count") +
  labs(
    title = "Iskolai végzettség eloszlása",
    x = "Iskolai végzettség",
    y = "Frekvencia"
  ) +
  theme(legend.position = "none") +
  scale_x_discrete(labels = schooling_labels) +
  scale_fill_grey(start = 0.85, end = 0.15)
## Level of affluence
# Answer labels, wrapped to at most 15 characters per line for the x axis.
affluence_levels <- c(
  "Sokkal rosszabb az átlagnál",
  "Valamivel rosszabb az átlagnál",
  "Átlagos",
  "Valamivel jobb az átlagnál",
  "Sokkal jobb az átlagnál",
  "Magasan a legjobb"
)
wrapped_labels <- str_wrap(affluence_levels, width = 15)
# Bar chart of Q8 counts with the factor levels shown in reversed order.
# NOTE(review): the labels are applied by position to the reversed levels;
# confirm that this matches the Q8 coding and that all six codes occur.
ggplot(df_data, aes(x = fct_rev(factor(Q8)), fill = factor(Q8))) +
  geom_bar(stat = "count") +
  scale_fill_grey(start = 0.15, end = 0.85) +
  theme(legend.position = "none") +
  labs(
    title = "Megítélése szerint összességében mennyire él jó körülmények között?",
    x = "Jóléti szintek",
    y = "Frekvencia"
  ) +
  scale_x_discrete(labels = wrapped_labels)
# Reverse-score four Q32 items by mapping each value x to (4 + 1) - x
# (presumably the items are answered on a 1-4 scale; verify against the
# questionnaire).
reversed_items <- c("Q32_1", "Q32_4", "Q32_7", "Q32_10")
df_data[reversed_items] <- (4 + 1) - df_data[reversed_items]
# Names of the questionnaire scales analysed below.
theme_groups <- c(
  "Internethasznalat",
  "Jatekhasznalat",
  "Kozossegimedia_hasznalat",
  "Narcizmus",
  "Onertekeles",
  "Jollet_kerdoiv",
  "Depresszio",
  "Maganyossag",
  "Erzelemszabalyozas",
  "Impulzivitas",
  "Tarsas_osszehasonlitas"
)
# Item (column) names belonging to each scale. Some columns carry a ".0"
# suffix in the data set, so the suffix is part of the generated name.
Internethasznalat <- paste0("Q23_", 1:9)
Jatekhasznalat <- paste0("Q24_", 1:10)
Kozossegimedia_hasznalat <- paste0("Q25_", 1:9)
Narcizmus <- paste0("Q27_", 1:9)
Onertekeles <- paste0("Q28_", 1:10, ".0")
Jollet_kerdoiv <- paste0("Q29_", 1:5, ".0")
Depresszio <- paste0("Q30_", 1:9)
Maganyossag <- paste0("Q31_", 1:6)
# Items 1-12 have the ".0" suffix, items 13-18 do not.
Erzelemszabalyozas <- c(paste0("Q26_", 1:12, ".0"), paste0("Q26_", 13:18))
Impulzivitas <- paste0("Q32_", 1:10)
Tarsas_osszehasonlitas <- paste0("Q33_", 1:6)
# For every scale: sum its items per respondent (`*_score`) and store the sum
# as an integer column `*_total` in df_data.
Internethasznalat_score <- rowSums(df_data[Internethasznalat])
df_data[["Internethasznalat_total"]] <- as.integer(Internethasznalat_score)

Jatekhasznalat_score <- rowSums(df_data[Jatekhasznalat])
df_data[["Jatekhasznalat_total"]] <- as.integer(Jatekhasznalat_score)

Kozossegimedia_hasznalat_score <- rowSums(df_data[Kozossegimedia_hasznalat])
df_data[["Kozossegimedia_hasznalat_total"]] <- as.integer(Kozossegimedia_hasznalat_score)

Narcizmus_score <- rowSums(df_data[Narcizmus])
df_data[["Narcizmus_total"]] <- as.integer(Narcizmus_score)

Onertekeles_score <- rowSums(df_data[Onertekeles])
df_data[["Onertekeles_total"]] <- as.integer(Onertekeles_score)

Jollet_kerdoiv_score <- rowSums(df_data[Jollet_kerdoiv])
df_data[["Jollet_kerdoiv_total"]] <- as.integer(Jollet_kerdoiv_score)

Depresszio_score <- rowSums(df_data[Depresszio])
df_data[["Depresszio_total"]] <- as.integer(Depresszio_score)

Maganyossag_score <- rowSums(df_data[Maganyossag])
df_data[["Maganyossag_total"]] <- as.integer(Maganyossag_score)

Erzelemszabalyozas_score <- rowSums(df_data[Erzelemszabalyozas])
df_data[["Erzelemszabalyozas_total"]] <- as.integer(Erzelemszabalyozas_score)

Impulzivitas_score <- rowSums(df_data[Impulzivitas])
df_data[["Impulzivitas_total"]] <- as.integer(Impulzivitas_score)

Tarsas_osszehasonlitas_score <- rowSums(df_data[Tarsas_osszehasonlitas])
df_data[["Tarsas_osszehasonlitas_total"]] <- as.integer(Tarsas_osszehasonlitas_score)
# Cut six of the total scores into three equal-width intervals over each
# score's observed range and label them low / medium / high. The result is
# stored as `binned_<scale>_total`.
binned_scales <- c(
  "Internethasznalat",
  "Jatekhasznalat",
  "Kozossegimedia_hasznalat",
  "Onertekeles",
  "Impulzivitas",
  "Tarsas_osszehasonlitas"
)
df_data[paste0("binned_", binned_scales, "_total")] <- lapply(
  df_data[paste0(binned_scales, "_total")],
  cut,
  breaks = 3,
  labels = c("Alacsony", "Kozepes", "Magas")
)
# df_data$binned_Internethasznalat_total <- recode(df_data$binned_Internethasznalat_total, "1"="Low", "2"="Medium", "3"="High")
Null hypothesis (H0): the row and the column variables of the contingency table are independent.
Alternative hypothesis (H1): the row and the column variables of the contingency table are dependent.
# Contingency table: binned internet-use score (rows) by schooling code Q7 (columns).
internet_and_schooling <- table(
  df_data[["binned_Internethasznalat_total"]],
  factor(df_data[["Q7"]])
)
print(internet_and_schooling)
##
## 1 2 3 4
## Alacsony 3 3 61 45
## Kozepes 3 2 61 29
## Magas 0 0 7 5
# Chi-squared test of independence with a Monte Carlo p-value.
# NOTE(review): no seed is set in the visible script, so the simulated p-value
# varies slightly between runs.
chisq_result_internet_and_schooling <- chisq.test(
  internet_and_schooling,
  simulate.p.value = TRUE
)
print(chisq_result_internet_and_schooling)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: internet_and_schooling
## X-squared = 3.022, df = NA, p-value = 0.8251
# Contingency table: binned game-use score (rows) by schooling code Q7 (columns).
games_and_schooling <- table(
  df_data[["binned_Jatekhasznalat_total"]],
  df_data[["Q7"]]
)
print(games_and_schooling)
##
## 1 2 3 4
## Alacsony 4 3 95 65
## Kozepes 1 2 28 11
## Magas 1 0 6 3
# Chi-squared test of independence with a Monte Carlo p-value.
# NOTE(review): no seed is set in the visible script, so the simulated p-value
# varies slightly between runs.
chisq_result_games_and_schooling <- chisq.test(
  games_and_schooling,
  simulate.p.value = TRUE
)
print(chisq_result_games_and_schooling)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: games_and_schooling
## X-squared = 5.7102, df = NA, p-value = 0.4378
# Contingency table: binned social-media-use score (rows) by schooling code Q7 (columns).
media_and_schooling <- table(
  df_data[["binned_Kozossegimedia_hasznalat_total"]],
  df_data[["Q7"]]
)
print(media_and_schooling)
##
## 1 2 3 4
## Alacsony 1 0 6 2
## Kozepes 0 0 31 8
## Magas 5 5 92 69
# Chi-squared test of independence with a Monte Carlo p-value.
# NOTE(review): no seed is set in the visible script, so the simulated p-value
# varies slightly between runs.
chisq_result_media_and_schooling <- chisq.test(
  media_and_schooling,
  simulate.p.value = TRUE
)
print(chisq_result_media_and_schooling)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: media_and_schooling
## X-squared = 12.412, df = NA, p-value = 0.07646
# Contingency table: binned internet-use score (rows) by binned impulsivity score (columns).
internet_and_impulsivity <- table(
  df_data[["binned_Internethasznalat_total"]],
  df_data[["binned_Impulzivitas_total"]]
)
print(internet_and_impulsivity)
##
## Alacsony Kozepes Magas
## Alacsony 54 40 18
## Kozepes 23 50 22
## Magas 2 6 4
# Chi-squared test of independence with a Monte Carlo p-value.
# NOTE(review): no seed is set in the visible script, so the simulated p-value
# varies slightly between runs.
chisq_result_internet_and_impulsivity <- chisq.test(
  internet_and_impulsivity,
  simulate.p.value = TRUE
)
print(chisq_result_internet_and_impulsivity)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: internet_and_impulsivity
## X-squared = 15.394, df = NA, p-value = 0.003498
# Contingency table: binned internet-use score (rows) by binned self-esteem score (columns).
internet_and_selfesteem <- table(
  df_data[["binned_Internethasznalat_total"]],
  df_data[["binned_Onertekeles_total"]]
)
print(internet_and_selfesteem)
##
## Alacsony Kozepes Magas
## Alacsony 31 72 9
## Kozepes 9 61 25
## Magas 0 9 3
# Chi-squared test of independence with a Monte Carlo p-value.
# NOTE(review): no seed is set in the visible script, so the simulated p-value
# varies slightly between runs.
chisq_result_internet_and_selfesteem <- chisq.test(
  internet_and_selfesteem,
  simulate.p.value = TRUE
)
print(chisq_result_internet_and_selfesteem)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: internet_and_selfesteem
## X-squared = 22.508, df = NA, p-value = 0.0004998
# Contingency table: binned internet-use score (rows) by binned social-comparison score (columns).
internet_and_socialcomparison <- table(
  df_data[["binned_Internethasznalat_total"]],
  df_data[["binned_Tarsas_osszehasonlitas_total"]]
)
print(internet_and_socialcomparison)
##
## Alacsony Kozepes Magas
## Alacsony 22 67 23
## Kozepes 5 37 53
## Magas 0 5 7
# Chi-squared test of independence with a Monte Carlo p-value.
# NOTE(review): no seed is set in the visible script, so the simulated p-value
# varies slightly between runs.
chisq_result_internet_and_socialcomparison <- chisq.test(
  internet_and_socialcomparison,
  simulate.p.value = TRUE
)
print(chisq_result_internet_and_socialcomparison)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: internet_and_socialcomparison
## X-squared = 33.108, df = NA, p-value = 0.0004998
Blue indicates that the observed count is higher than the count expected if the row and column variables were independent.
Red indicates that the observed count is lower than the count expected if the row and column variables were independent.
# Shaded mosaic plots of the six contingency tables; `shade = TRUE` colours the
# tiles by residual and `las = 1` draws the axis labels horizontally.
# The first title previously read "végzettés"; it now matches the spelling
# "végzettség" used in the other titles.
mosaicplot(internet_and_schooling, shade = TRUE, las = 1,
           main = "Internethasználat vs iskolai végzettség")
mosaicplot(games_and_schooling, shade = TRUE, las = 1,
           main = "Játékhasználat vs iskolai végzettség")
mosaicplot(media_and_schooling, shade = TRUE, las = 1,
           main = "Közösségi-média vs iskolai végzettség")
mosaicplot(internet_and_impulsivity, shade = TRUE, las = 1,
           main = "Internethasználat vs impulzivitás")
mosaicplot(internet_and_selfesteem, shade = TRUE, las = 1,
           main = "Internethasználat vs önértékelés")
mosaicplot(internet_and_socialcomparison, shade = TRUE, las = 1,
           main = "Internethasználat vs társas összehasonlítás")
Cells with the largest absolute Pearson residuals contribute the most to the total chi-square statistic.
# Print the Pearson residuals, (observed - expected) / sqrt(expected), of each
# test, rounded to three decimals and preceded by a heading.
# The first heading previously read "végzettés"; it now matches the spelling
# "végzettség" used in the other headings.
print("Internethasználat vs iskolai végzettség:")
## [1] "Internethasználat vs iskolai végzettség:"
round(chisq_result_internet_and_schooling$residuals, 3)
##
## 1 2 3 4
## Alacsony -0.039 0.277 -0.612 0.723
## Kozepes 0.246 -0.115 0.674 -0.900
## Magas -0.573 -0.523 -0.026 0.323
print("Játékhasználat vs iskolai végzettség:")
## [1] "Játékhasználat vs iskolai végzettség:"
round(chisq_result_games_and_schooling$residuals, 3)
##
## 1 2 3 4
## Alacsony -0.269 -0.416 -0.340 0.613
## Kozepes -0.140 1.063 0.655 -1.066
## Magas 1.387 -0.478 0.045 -0.320
print("Közösségi-média vs iskolai végzettség:")
## [1] "Közösségi-média vs iskolai végzettség:"
round(chisq_result_media_and_schooling$residuals, 3)
##
## 1 2 3 4
## Alacsony 1.517 -0.453 0.303 -0.692
## Kozepes -1.034 -0.944 1.675 -1.618
## Magas 0.146 0.555 -0.869 0.931
print("Internethasználat vs impulzivitás:")
## [1] "Internethasználat vs impulzivitás:"
round(chisq_result_internet_and_impulsivity$residuals, 3)
##
## Alacsony Kozepes Magas
## Alacsony 2.139 -1.298 -0.949
## Kozepes -1.925 1.295 0.667
## Magas -1.119 0.323 1.023
print("Internethasználat vs önértékelés:")
## [1] "Internethasználat vs önértékelés:"
round(chisq_result_internet_and_selfesteem$residuals, 3)
##
## Alacsony Kozepes Magas
## Alacsony 2.331 -0.073 -2.281
## Kozepes -2.005 -0.076 2.234
## Magas -1.480 0.437 0.683
print("Internethasználat vs társas összehasonlítás:")
## [1] "Internethasználat vs társas összehasonlítás:"
round(chisq_result_internet_and_socialcomparison$residuals, 3)
##
## Alacsony Kozepes Magas
## Alacsony 2.204 1.508 -2.985
## Kozepes -1.961 -1.495 2.832
## Magas -1.216 -0.398 1.150
For a given cell, the size of the circle is proportional to the magnitude of the value plotted for that cell. The sign of the Pearson residuals is also very important for interpreting the association between rows and columns.
Positive residuals are in blue. Positive values in cells specify an attraction (positive association) between the corresponding row and column variables.
Negative residuals are in red. This implies a repulsion (negative association) between the corresponding row and column variables.
# Draw one corrplot of the Pearson residuals per test, in the same order as
# the tests above. `is.cor = FALSE` because the matrices are not correlation
# matrices; `addCoef.col = 1` prints the cell values on the plot.
invisible(lapply(
  list(
    chisq_result_internet_and_schooling,
    chisq_result_games_and_schooling,
    chisq_result_media_and_schooling,
    chisq_result_internet_and_impulsivity,
    chisq_result_internet_and_selfesteem,
    chisq_result_internet_and_socialcomparison
  ),
  function(test_result) {
    corrplot(test_result$residuals, is.cor = FALSE, addCoef.col = 1, cl.cex = 0.5)
  }
))
# Percentage contribution of every cell to the chi-square statistic of a test:
# 100 * residual^2 / statistic.
contribution_pct <- function(test_result) {
  100 * test_result$residuals^2 / test_result$statistic
}

# Internet use vs schooling
contrib_1 <- contribution_pct(chisq_result_internet_and_schooling)
round(contrib_1, 3)
##
## 1 2 3 4
## Alacsony 0.051 2.539 12.402 17.317
## Kozepes 2.006 0.435 15.027 26.811
## Magas 10.879 9.066 0.022 3.444
corrplot(contrib_1, is.cor = FALSE, addCoef.col = 1, cl.cex = 0.5)

# Game use vs schooling
contrib_2 <- contribution_pct(chisq_result_games_and_schooling)
round(contrib_2, 3)
##
## 1 2 3 4
## Alacsony 1.267 3.034 2.022 6.581
## Kozepes 0.346 19.795 7.524 19.914
## Magas 33.693 3.998 0.036 1.791
corrplot(contrib_2, is.cor = FALSE, addCoef.col = 1, cl.cex = 0.5)

# Social-media use vs schooling
contrib_3 <- contribution_pct(chisq_result_media_and_schooling)
round(contrib_3, 3)
##
## 1 2 3 4
## Alacsony 18.547 1.655 0.742 3.856
## Kozepes 8.608 7.174 22.599 21.090
## Magas 0.171 2.478 6.090 6.989
corrplot(contrib_3, is.cor = FALSE, addCoef.col = 1, cl.cex = 0.5)

# Internet use vs impulsivity
contrib_4 <- contribution_pct(chisq_result_internet_and_impulsivity)
round(contrib_4, 3)
##
## Alacsony Kozepes Magas
## Alacsony 29.730 10.947 5.852
## Kozepes 24.073 10.892 2.888
## Magas 8.138 0.676 6.803
corrplot(contrib_4, is.cor = FALSE, addCoef.col = 1, cl.cex = 0.5)

# Internet use vs self-esteem
contrib_5 <- contribution_pct(chisq_result_internet_and_selfesteem)
round(contrib_5, 3)
##
## Alacsony Kozepes Magas
## Alacsony 24.143 0.024 23.116
## Kozepes 17.859 0.026 22.172
## Magas 9.738 0.849 2.073
corrplot(contrib_5, is.cor = FALSE, addCoef.col = 1, cl.cex = 0.5)

# Internet use vs social comparison
contrib_6 <- contribution_pct(chisq_result_internet_and_socialcomparison)
round(contrib_6, 3)
##
## Alacsony Kozepes Magas
## Alacsony 14.679 6.865 26.912
## Kozepes 11.619 6.755 24.231
## Magas 4.469 0.478 3.993
corrplot(contrib_6, is.cor = FALSE, addCoef.col = 1, cl.cex = 0.5)