Dataset: Student Mental Health and analyze them using correlation.
library(ggplot2)
df=read.csv("Data Carrard et al. 2022 MedTeach.csv")
df
str(df)
## 'data.frame': 886 obs. of 20 variables:
## $ id : int 2 4 9 10 13 14 17 21 23 24 ...
## $ age : int 18 26 21 21 21 26 23 23 23 22 ...
## $ year : int 1 4 3 2 3 5 5 4 4 2 ...
## $ sex : int 1 1 2 2 1 2 2 1 2 2 ...
## $ glang : int 120 1 1 1 1 1 1 1 1 1 ...
## $ part : int 1 1 0 0 1 1 1 1 1 1 ...
## $ job : int 0 0 0 1 0 1 0 1 1 0 ...
## $ stud_h : int 56 20 36 51 22 10 15 8 20 20 ...
## $ health : int 3 4 3 5 4 2 3 4 2 5 ...
## $ psyt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ jspe : int 88 109 106 101 102 102 117 118 118 108 ...
## $ qcae_cog : int 62 55 64 52 58 48 58 65 69 56 ...
## $ qcae_aff : int 27 37 39 33 28 37 38 40 46 36 ...
## $ amsp : int 17 22 17 18 21 17 23 32 23 22 ...
## $ erec_mean: num 0.738 0.69 0.69 0.833 0.69 ...
## $ cesd : int 34 7 25 17 14 14 45 6 43 11 ...
## $ stai_t : int 61 33 73 48 46 56 56 36 43 43 ...
## $ mbi_ex : int 17 14 24 16 22 18 28 11 26 18 ...
## $ mbi_cy : int 13 11 7 10 14 15 17 10 21 6 ...
## $ mbi_ea : int 20 26 23 21 23 18 16 27 22 23 ...
#Changing int into factor for categorical variables
df$sex = as.factor(df$sex) # Gender
df$glang = as.factor(df$glang) # Mother Tongue
df$part = as.factor(df$part) # Partenership Status
df$health = as.factor(df$health) # Health Status
df$psyt = as.factor(df$psyt) # Physiotherapy
df$year = as.factor(df$year) # year
df
str(df)
## 'data.frame': 886 obs. of 20 variables:
## $ id : int 2 4 9 10 13 14 17 21 23 24 ...
## $ age : int 18 26 21 21 21 26 23 23 23 22 ...
## $ year : Factor w/ 6 levels "1","2","3","4",..: 1 4 3 2 3 5 5 4 4 2 ...
## $ sex : Factor w/ 3 levels "1","2","3": 1 1 2 2 1 2 2 1 2 2 ...
## $ glang : Factor w/ 19 levels "1","15","20",..: 18 1 1 1 1 1 1 1 1 1 ...
## $ part : Factor w/ 2 levels "0","1": 2 2 1 1 2 2 2 2 2 2 ...
## $ job : int 0 0 0 1 0 1 0 1 1 0 ...
## $ stud_h : int 56 20 36 51 22 10 15 8 20 20 ...
## $ health : Factor w/ 5 levels "1","2","3","4",..: 3 4 3 5 4 2 3 4 2 5 ...
## $ psyt : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ jspe : int 88 109 106 101 102 102 117 118 118 108 ...
## $ qcae_cog : int 62 55 64 52 58 48 58 65 69 56 ...
## $ qcae_aff : int 27 37 39 33 28 37 38 40 46 36 ...
## $ amsp : int 17 22 17 18 21 17 23 32 23 22 ...
## $ erec_mean: num 0.738 0.69 0.69 0.833 0.69 ...
## $ cesd : int 34 7 25 17 14 14 45 6 43 11 ...
## $ stai_t : int 61 33 73 48 46 56 56 36 43 43 ...
## $ mbi_ex : int 17 14 24 16 22 18 28 11 26 18 ...
## $ mbi_cy : int 13 11 7 10 14 15 17 10 21 6 ...
## $ mbi_ea : int 20 26 23 21 23 18 16 27 22 23 ...
df %>%
summarise_all(list(~ sum(is.na(.))))
So no na values found!
summary(df)
## id age year sex glang part
## Min. : 2.0 Min. :17.00 1:245 1:275 1 :717 0:387
## 1st Qu.: 447.5 1st Qu.:20.00 2:135 2:606 90 : 45 1:499
## Median : 876.0 Median :22.00 3:143 3: 5 15 : 31
## Mean : 889.7 Mean :22.38 4:123 102 : 27
## 3rd Qu.:1341.8 3rd Qu.:24.00 5:127 20 : 22
## Max. :1790.0 Max. :49.00 6:113 121 : 13
## (Other): 31
## job stud_h health psyt jspe
## Min. :0.0000 Min. : 0.00 1: 37 0:687 Min. : 67.0
## 1st Qu.:0.0000 1st Qu.:12.00 2: 87 1:199 1st Qu.:101.0
## Median :0.0000 Median :25.00 3:136 Median :107.0
## Mean :0.3488 Mean :25.29 4:402 Mean :106.4
## 3rd Qu.:1.0000 3rd Qu.:36.00 5:224 3rd Qu.:113.0
## Max. :1.0000 Max. :70.00 Max. :125.0
##
## qcae_cog qcae_aff amsp erec_mean
## Min. :37.00 Min. :18.00 Min. : 6.00 Min. :0.3571
## 1st Qu.:54.00 1st Qu.:31.00 1st Qu.:20.00 1st Qu.:0.6667
## Median :58.00 Median :35.00 Median :23.00 Median :0.7262
## Mean :58.53 Mean :34.78 Mean :23.15 Mean :0.7201
## 3rd Qu.:63.00 3rd Qu.:39.00 3rd Qu.:26.75 3rd Qu.:0.7857
## Max. :76.00 Max. :48.00 Max. :35.00 Max. :0.9524
##
## cesd stai_t mbi_ex mbi_cy mbi_ea
## Min. : 0.00 Min. :20.0 Min. : 5.00 Min. : 4.00 Min. :10.00
## 1st Qu.: 9.00 1st Qu.:34.0 1st Qu.:13.00 1st Qu.: 6.00 1st Qu.:21.00
## Median :16.00 Median :43.0 Median :17.00 Median : 9.00 Median :24.00
## Mean :18.05 Mean :42.9 Mean :16.88 Mean :10.08 Mean :24.21
## 3rd Qu.:25.00 3rd Qu.:51.0 3rd Qu.:20.00 3rd Qu.:13.00 3rd Qu.:28.00
## Max. :56.00 Max. :77.0 Max. :30.00 Max. :24.00 Max. :36.00
##
df_num <- df %>%
select(age,stud_h,jspe,qcae_cog,qcae_aff,amsp,erec_mean,cesd,stai_t,mbi_ex,mbi_cy,mbi_ea)
df_num
# Load ggplot2 library
library(ggplot2)
# Scatter plot of two columns in the dataframe
ggplot(df_num, aes(x = cesd, y = mbi_ex)) +
geom_point() +
xlab("CES-D total score") +
ylab("MBI Emotional Exhaustion") +
ggtitle("Scatter plot of cesd and mbi_ex")
df_num %>%
select(mbi_ex, qcae_cog, stai_t) %>%
cor(use="pairwise.complete.obs", method = "pearson")
## mbi_ex qcae_cog stai_t
## mbi_ex 1.00000000 -0.02363016 0.53048589
## qcae_cog -0.02363016 1.00000000 -0.07786829
## stai_t 0.53048589 -0.07786829 1.00000000
df_num %>%
select(cesd, mbi_ea, stud_h) %>%
cor(use="pairwise.complete.obs", method = "pearson")
## cesd mbi_ea stud_h
## cesd 1.0000000 -0.453589 0.1740849
## mbi_ea -0.4535890 1.000000 0.1017320
## stud_h 0.1740849 0.101732 1.0000000
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
# Calculate the correlation matrix
cor_matrix <- cor(df_num)
# Melt the correlation matrix into a data frame for plotting
melted_matrix <- melt(cor_matrix)
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE
# Plot the heatmap
ggplot(melted_matrix, aes(X1, X2, fill = value)) +
geom_tile() +
scale_fill_gradient2(low = "yellow", high = "red", mid = "orange", midpoint = 0, limit = c(-1,1), space = "Lab", name="Correlation\nCoefficient") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), legend.position = "bottom") +
geom_text(aes(X1, X2, label = round(value, 2)), color = "white", size = 4,fontface = "bold", show.legend = FALSE) +
ggtitle("Correlation Heatmap")
High +ve Correlated -
cesd x stai_t = 0.72 cesd X mbi_ex = 0.61
mbi_ex X stai_t = 0.53 mbi_ex X mbi_cy = 0.51
High -ve Correlated -
mbi_ea X mbi_cy = -0.57 mbi_ea X mbi_ex = -0.47 mbi_ea X stai_t = -0.46 mbi_ea X cesd = -0.45
df_x <- df %>%
select(mbi_ex , stai_t ,cesd)
summary(df_x)
## mbi_ex stai_t cesd
## Min. : 5.00 Min. :20.0 Min. : 0.00
## 1st Qu.:13.00 1st Qu.:34.0 1st Qu.: 9.00
## Median :17.00 Median :43.0 Median :16.00
## Mean :16.88 Mean :42.9 Mean :18.05
## 3rd Qu.:20.00 3rd Qu.:51.0 3rd Qu.:25.00
## Max. :30.00 Max. :77.0 Max. :56.00
# Load ggplot2 library
library(ggplot2)
# Scatter plot of two columns in the dataframe
ggplot(df_num, aes(x = cesd, y = stai_t)) +
geom_point() +
xlab("CES-D total score (Depression)") +
ylab("STAI Score (Anxiety)") +
theme(plot.title = element_text(hjust = 0.5))+
ggtitle("Scatter plot of cesd and stai_t")+
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
# Scatter plot of two columns in the dataframe
ggplot(df_num, aes(x = cesd, y = stai_t)) +
geom_point(color='brown') +
xlab("CES-D total score (Depression)") +
ylab("MBI_EX ") +
theme(plot.title = element_text(hjust = 0.5))+
ggtitle("Scatter plot of cesd and mbi_ex")+
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
# Load ggplot2 library
library(ggplot2)
# Scatter plot of two columns in the dataframe
ggplot(df_num, aes(y = mbi_ex, x = stai_t)) +
geom_point(color='red') +
xlab("mbi_ea") +
ylab("stai_t") +
ggtitle("Scatter plot of stai_t and mbi_ex")+
geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
ggplot(df_num,aes(x=mbi_ex))+
geom_histogram(bins = 15, fill='grey',color='black') +
geom_vline(aes(xintercept = mean(mbi_ex), color = "mean"), linetype = "dashed", size = 1) +
ggtitle("Distribution of Maslach Burnout Inventory-Exhaustion scale of the participant" ) +
xlab("mbi_ex") +
ylab("Frequency") +
theme(plot.title = element_text(hjust = 0.5))+
scale_color_manual(name = "", values = c(mean = "blue"), labels = c("Mean"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
ggplot(df_num,aes(cesd))+
geom_histogram(bins = 15, fill='brown',color='black') +
geom_vline(aes(xintercept = mean(cesd), color = "mean"), linetype = "dashed", size = 1) +
ggtitle("Distribution of Center for Epidemiologic Studies Depression scale of the participant" ) +
xlab("cesd") +
ylab("Frequency") +
theme(plot.title = element_text(hjust = 0.5))+
scale_color_manual(name = "", values = c(mean = "blue"), labels = c("Mean"))
ggplot(df_num,aes(stai_t))+
geom_histogram(bins = 15, fill='orange',color='black') +
geom_vline(aes(xintercept = mean(stai_t), color = "mean"), linetype = "dashed", size = 1) +
ggtitle("Distribution of Center for Epidemiologic Studies Depression scale of the participant" ) +
xlab("cesd") +
ylab("Frequency") +
theme(plot.title = element_text(hjust = 0.5))+
scale_color_manual(name = "", values = c(mean = "blue"), labels = c("Mean"))
# Q-Q plots
ggplot(df_num, aes(sample= stai_t)) +
geom_qq() +
geom_qq_line() +
xlab("Theoritical") +
ylab("Sample")+
ggtitle("Q-Q plot for STAI")+
theme(plot.title = element_text(hjust = 0.5))
ggplot(df_num, aes(sample= cesd)) +
geom_qq() +
geom_qq_line() +
xlab("Theoritical") +
ylab("Sample")+
ggtitle("Q-Q plot for CESD")+
theme(plot.title = element_text(hjust = 0.5))
ggplot(df_num, aes(sample= mbi_ex)) +
geom_qq() +
geom_qq_line() +
xlab("Theoritical") +
ylab("Sample")+
ggtitle("Q-Q plot for MBI_EX")+
theme(plot.title = element_text(hjust = 0.5))
shapiro.test(df_num$mbi_ex)
##
## Shapiro-Wilk normality test
##
## data: df_num$mbi_ex
## W = 0.99046, p-value = 1.651e-05
length(df_num$mbi_ex)
## [1] 886
df_filtered <- df_num %>%
select(mbi_ex,stai_t,cesd)
df_filtered_t <- mutate(df_filtered,mbi_ex_t=log(mbi_ex+1),
stai_t_t=log(stai_t+1),
cesd_t=log(cesd+1))
shapiro.test(df_filtered_t$mbi_ex_t)
##
## Shapiro-Wilk normality test
##
## data: df_filtered_t$mbi_ex_t
## W = 0.95864, p-value = 4.085e-15
shapiro.test(df_filtered_t$stai_t_t)
##
## Shapiro-Wilk normality test
##
## data: df_filtered_t$stai_t_t
## W = 0.97987, p-value = 1.05e-09
shapiro.test(df_filtered_t$cesd_t)
##
## Shapiro-Wilk normality test
##
## data: df_filtered_t$cesd_t
## W = 0.94305, p-value < 2.2e-16
Transformations do not help to achieve normality. However, we assume normality as sample size is large .
All the three variables are interval data so we can use pearson’s correlation but for significance and CIs we need the variables of interest to be normal which even though are not from shapiro wilk test for normality but still can be assumed based on Central Limit Theorem
df_num %>%
select(mbi_ex,stai_t,cesd) %>%
cor(use="pairwise.complete.obs", method = "pearson")
## mbi_ex stai_t cesd
## mbi_ex 1.0000000 0.5304859 0.6056172
## stai_t 0.5304859 1.0000000 0.7157281
## cesd 0.6056172 0.7157281 1.0000000
df_3_var <- df_num %>%
select(cesd,mbi_ex,stai_t)
library(ggcorrplot)
ggcorrplot(cor(df_3_var),lab=TRUE)
# Testing significance of each correlation
cor.test(df_3_var$cesd,df_3_var$mbi_ex)
##
## Pearson's product-moment correlation
##
## data: df_3_var$cesd and df_3_var$mbi_ex
## t = 22.628, df = 884, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5621786 0.6457235
## sample estimates:
## cor
## 0.6056172
cor.test(df_3_var$stai_t,df_3_var$mbi_ex)
##
## Pearson's product-moment correlation
##
## data: df_3_var$stai_t and df_3_var$mbi_ex
## t = 18.606, df = 884, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4814446 0.5762160
## sample estimates:
## cor
## 0.5304859
cor.test(df_3_var$cesd,df_3_var$stai_t)
##
## Pearson's product-moment correlation
##
## data: df_3_var$cesd and df_3_var$stai_t
## t = 30.471, df = 884, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6820155 0.7464053
## sample estimates:
## cor
## 0.7157281
#Correlation using ggstatsplot::ggcormat which gives more information ( Adding the significance test too)
library(ggstatsplot)
## You can cite this package as:
## Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
## Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
ggstatsplot::ggcorrmat(df_3_var)
cor(df_3_var,use="pairwise.complete.obs", method = "pearson")^2
## cesd mbi_ex stai_t
## cesd 1.0000000 0.3667722 0.5122667
## mbi_ex 0.3667722 1.0000000 0.2814153
## stai_t 0.5122667 0.2814153 1.0000000
51.4% of the variance in cesd can be explained by the variance in stai_t, and vice versa.
We try to understand the relationship between cesd - depression and stai_t - anxiety by cantrolling mbi_ex - exhaustion
df_3_var
summary(df_3_var)
## cesd mbi_ex stai_t
## Min. : 0.00 Min. : 5.00 Min. :20.0
## 1st Qu.: 9.00 1st Qu.:13.00 1st Qu.:34.0
## Median :16.00 Median :17.00 Median :43.0
## Mean :18.05 Mean :16.88 Mean :42.9
## 3rd Qu.:25.00 3rd Qu.:20.00 3rd Qu.:51.0
## Max. :56.00 Max. :30.00 Max. :77.0
library(ggplot2)
# Fit linear models and extract residuals
model1 <- lm(cesd ~ mbi_ex, data = df_3_var)
model2 <- lm(stai_t ~ mbi_ex, data = df_3_var)
resid1 <- residuals(model1)
resid2 <- residuals(model2)
# Create a data frame with the residuals
resid_df <- data.frame(resid1, resid2)
# Create a scatterplot with a line of best fit
ggplot(resid_df, aes(x = resid1, y = resid2)) +
geom_point() +
geom_smooth(method = lm, se = FALSE) +
xlab("Residuals from regressing cesd on mbi_ex") +
ylab("Residuals from regressing stai_t on mbi_ex") +
ggtitle("Partial correlation between cesd and stai_t while controlling the effect of mbi_ex") +
theme(plot.title = element_text(hjust = 0.5))+
geom_hline(yintercept = 0, col = "red") +
geom_vline(xintercept = 0, col = "red")
## `geom_smooth()` using formula = 'y ~ x'
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
df_t <- df %>%
select(health,cesd,stai_t,mbi_ex)
GGally::ggpairs(df_t,
mapping = aes(color = health),
title = 'Pairwise Scatter Plot and Correlation of cesd, mbi_ex and stai_t')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(ggm)
obj <- pcor(c("cesd","stai_t","mbi_ex"),var(df_3_var))
obj
## [1] 0.5847636
#Sample size
sample_size <- count(df_3_var)
sample_size$n
## [1] 886
pcor.test(r=obj,q=1,n=sample_size$n)
## $tval
## [1] 21.42056
##
## $df
## [1] 883
##
## $pvalue
## [1] 2.646448e-82
The correlation between cesd(depression) and stai_t(anxiety) was this : data: df_3_var\(cesd and df_3_var\)stai_t t = 30.471, df = 884, p-value < 2.2e-16 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: 0.6820155 0.7464053 sample estimates: cor 0.7157281
After controlling the variable mbi_ex : the partial correlation dropped to 0.5847636 with the significance pcor.test(r=obj,q=1,n=sample_size$n) $tval [1] 21.42056 $df [1] 883 $pvalue [1] 2.646448e-82
df_3_var_2 <- df_num%>%
select(cesd,stai_t,mbi_ea)
cor(df_3_var_2)
## cesd stai_t mbi_ea
## cesd 1.0000000 0.7157281 -0.4535890
## stai_t 0.7157281 1.0000000 -0.4625348
## mbi_ea -0.4535890 -0.4625348 1.0000000
ggstatsplot::ggcorrmat(df_3_var_2)
obj1 <- pcor(c("cesd","stai_t","mbi_ea"),var(df_3_var_2))
obj1
## [1] 0.6402939
pcor.test(r=obj1,q=1,n=sample_size$n)
## $tval
## [1] 24.76994
##
## $df
## [1] 883
##
## $pvalue
## [1] 2.888227e-103