Dataset: Student Mental Health and analyze them using correlation.

Loading libraries

library(ggplot2)

Loading data

df=read.csv("Data Carrard et al. 2022 MedTeach.csv")
df

Structure of data

str(df)
## 'data.frame':    886 obs. of  20 variables:
##  $ id       : int  2 4 9 10 13 14 17 21 23 24 ...
##  $ age      : int  18 26 21 21 21 26 23 23 23 22 ...
##  $ year     : int  1 4 3 2 3 5 5 4 4 2 ...
##  $ sex      : int  1 1 2 2 1 2 2 1 2 2 ...
##  $ glang    : int  120 1 1 1 1 1 1 1 1 1 ...
##  $ part     : int  1 1 0 0 1 1 1 1 1 1 ...
##  $ job      : int  0 0 0 1 0 1 0 1 1 0 ...
##  $ stud_h   : int  56 20 36 51 22 10 15 8 20 20 ...
##  $ health   : int  3 4 3 5 4 2 3 4 2 5 ...
##  $ psyt     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ jspe     : int  88 109 106 101 102 102 117 118 118 108 ...
##  $ qcae_cog : int  62 55 64 52 58 48 58 65 69 56 ...
##  $ qcae_aff : int  27 37 39 33 28 37 38 40 46 36 ...
##  $ amsp     : int  17 22 17 18 21 17 23 32 23 22 ...
##  $ erec_mean: num  0.738 0.69 0.69 0.833 0.69 ...
##  $ cesd     : int  34 7 25 17 14 14 45 6 43 11 ...
##  $ stai_t   : int  61 33 73 48 46 56 56 36 43 43 ...
##  $ mbi_ex   : int  17 14 24 16 22 18 28 11 26 18 ...
##  $ mbi_cy   : int  13 11 7 10 14 15 17 10 21 6 ...
##  $ mbi_ea   : int  20 26 23 21 23 18 16 27 22 23 ...

Changing the types of data type based on documentation

#Changing int into factor for categorical variables

df$sex = as.factor(df$sex) # Gender
df$glang = as.factor(df$glang) # Mother Tongue
df$part = as.factor(df$part) # Partenership Status
df$health = as.factor(df$health) # Health Status
df$psyt = as.factor(df$psyt) # Physiotherapy
df$year = as.factor(df$year) # year
df
str(df)
## 'data.frame':    886 obs. of  20 variables:
##  $ id       : int  2 4 9 10 13 14 17 21 23 24 ...
##  $ age      : int  18 26 21 21 21 26 23 23 23 22 ...
##  $ year     : Factor w/ 6 levels "1","2","3","4",..: 1 4 3 2 3 5 5 4 4 2 ...
##  $ sex      : Factor w/ 3 levels "1","2","3": 1 1 2 2 1 2 2 1 2 2 ...
##  $ glang    : Factor w/ 19 levels "1","15","20",..: 18 1 1 1 1 1 1 1 1 1 ...
##  $ part     : Factor w/ 2 levels "0","1": 2 2 1 1 2 2 2 2 2 2 ...
##  $ job      : int  0 0 0 1 0 1 0 1 1 0 ...
##  $ stud_h   : int  56 20 36 51 22 10 15 8 20 20 ...
##  $ health   : Factor w/ 5 levels "1","2","3","4",..: 3 4 3 5 4 2 3 4 2 5 ...
##  $ psyt     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ jspe     : int  88 109 106 101 102 102 117 118 118 108 ...
##  $ qcae_cog : int  62 55 64 52 58 48 58 65 69 56 ...
##  $ qcae_aff : int  27 37 39 33 28 37 38 40 46 36 ...
##  $ amsp     : int  17 22 17 18 21 17 23 32 23 22 ...
##  $ erec_mean: num  0.738 0.69 0.69 0.833 0.69 ...
##  $ cesd     : int  34 7 25 17 14 14 45 6 43 11 ...
##  $ stai_t   : int  61 33 73 48 46 56 56 36 43 43 ...
##  $ mbi_ex   : int  17 14 24 16 22 18 28 11 26 18 ...
##  $ mbi_cy   : int  13 11 7 10 14 15 17 10 21 6 ...
##  $ mbi_ea   : int  20 26 23 21 23 18 16 27 22 23 ...

Checking for missing values

df %>%
  summarise_all(list(~ sum(is.na(.))))

So no na values found!

Summary ( central tendency of continous features and count of categorical features )

summary(df)
##        id              age        year    sex         glang     part   
##  Min.   :   2.0   Min.   :17.00   1:245   1:275   1      :717   0:387  
##  1st Qu.: 447.5   1st Qu.:20.00   2:135   2:606   90     : 45   1:499  
##  Median : 876.0   Median :22.00   3:143   3:  5   15     : 31          
##  Mean   : 889.7   Mean   :22.38   4:123           102    : 27          
##  3rd Qu.:1341.8   3rd Qu.:24.00   5:127           20     : 22          
##  Max.   :1790.0   Max.   :49.00   6:113           121    : 13          
##                                                   (Other): 31          
##       job             stud_h      health  psyt         jspe      
##  Min.   :0.0000   Min.   : 0.00   1: 37   0:687   Min.   : 67.0  
##  1st Qu.:0.0000   1st Qu.:12.00   2: 87   1:199   1st Qu.:101.0  
##  Median :0.0000   Median :25.00   3:136           Median :107.0  
##  Mean   :0.3488   Mean   :25.29   4:402           Mean   :106.4  
##  3rd Qu.:1.0000   3rd Qu.:36.00   5:224           3rd Qu.:113.0  
##  Max.   :1.0000   Max.   :70.00                   Max.   :125.0  
##                                                                  
##     qcae_cog        qcae_aff          amsp         erec_mean     
##  Min.   :37.00   Min.   :18.00   Min.   : 6.00   Min.   :0.3571  
##  1st Qu.:54.00   1st Qu.:31.00   1st Qu.:20.00   1st Qu.:0.6667  
##  Median :58.00   Median :35.00   Median :23.00   Median :0.7262  
##  Mean   :58.53   Mean   :34.78   Mean   :23.15   Mean   :0.7201  
##  3rd Qu.:63.00   3rd Qu.:39.00   3rd Qu.:26.75   3rd Qu.:0.7857  
##  Max.   :76.00   Max.   :48.00   Max.   :35.00   Max.   :0.9524  
##                                                                  
##       cesd           stai_t         mbi_ex          mbi_cy          mbi_ea     
##  Min.   : 0.00   Min.   :20.0   Min.   : 5.00   Min.   : 4.00   Min.   :10.00  
##  1st Qu.: 9.00   1st Qu.:34.0   1st Qu.:13.00   1st Qu.: 6.00   1st Qu.:21.00  
##  Median :16.00   Median :43.0   Median :17.00   Median : 9.00   Median :24.00  
##  Mean   :18.05   Mean   :42.9   Mean   :16.88   Mean   :10.08   Mean   :24.21  
##  3rd Qu.:25.00   3rd Qu.:51.0   3rd Qu.:20.00   3rd Qu.:13.00   3rd Qu.:28.00  
##  Max.   :56.00   Max.   :77.0   Max.   :30.00   Max.   :24.00   Max.   :36.00  
## 

EDA

df_num <- df %>%
  select(age,stud_h,jspe,qcae_cog,qcae_aff,amsp,erec_mean,cesd,stai_t,mbi_ex,mbi_cy,mbi_ea)
df_num

Checking the cesd vs mbi_ex

# Load ggplot2 library
library(ggplot2)

# Scatter plot of two columns in the dataframe
ggplot(df_num, aes(x = cesd, y = mbi_ex)) +
  geom_point() +
  xlab("CES-D total score") +
  ylab("MBI Emotional Exhaustion") +
  ggtitle("Scatter plot of cesd and mbi_ex")

Plotting the correlation matrix

df_num %>% 
  select(mbi_ex, qcae_cog, stai_t) %>% 
  cor(use="pairwise.complete.obs", method = "pearson")
##               mbi_ex    qcae_cog      stai_t
## mbi_ex    1.00000000 -0.02363016  0.53048589
## qcae_cog -0.02363016  1.00000000 -0.07786829
## stai_t    0.53048589 -0.07786829  1.00000000
df_num %>% 
  select(cesd, mbi_ea, stud_h) %>% 
  cor(use="pairwise.complete.obs", method = "pearson")
##              cesd    mbi_ea    stud_h
## cesd    1.0000000 -0.453589 0.1740849
## mbi_ea -0.4535890  1.000000 0.1017320
## stud_h  0.1740849  0.101732 1.0000000
library(reshape)
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
# Calculate the correlation matrix
cor_matrix <- cor(df_num)

# Melt the correlation matrix into a data frame for plotting
melted_matrix <- melt(cor_matrix)
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE

## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE
# Plot the heatmap
ggplot(melted_matrix, aes(X1, X2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "yellow", high = "red", mid = "orange", midpoint = 0, limit = c(-1,1), space = "Lab", name="Correlation\nCoefficient") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), legend.position = "bottom") +
  geom_text(aes(X1, X2, label = round(value, 2)), color = "white", size = 4,fontface = "bold", show.legend = FALSE) +
  ggtitle("Correlation Heatmap")

High +ve Correlated -

cesd x stai_t = 0.72 cesd X mbi_ex = 0.61

mbi_ex X stai_t = 0.53 mbi_ex X mbi_cy = 0.51

High -ve Correlated -

mbi_ea X mbi_cy = -0.57 mbi_ea X mbi_ex = -0.47 mbi_ea X stai_t = -0.46 mbi_ea X cesd = -0.45

Summary of mbi_ex , stai_t ,cesd

df_x <- df %>%
  select(mbi_ex , stai_t ,cesd)
summary(df_x)
##      mbi_ex          stai_t          cesd      
##  Min.   : 5.00   Min.   :20.0   Min.   : 0.00  
##  1st Qu.:13.00   1st Qu.:34.0   1st Qu.: 9.00  
##  Median :17.00   Median :43.0   Median :16.00  
##  Mean   :16.88   Mean   :42.9   Mean   :18.05  
##  3rd Qu.:20.00   3rd Qu.:51.0   3rd Qu.:25.00  
##  Max.   :30.00   Max.   :77.0   Max.   :56.00
# Load ggplot2 library
library(ggplot2)

# Scatter plot of two columns in the dataframe
ggplot(df_num, aes(x = cesd, y = stai_t)) +
  geom_point() +
  xlab("CES-D total score (Depression)") +
  ylab("STAI Score (Anxiety)") +
  theme(plot.title = element_text(hjust = 0.5))+
  ggtitle("Scatter plot of cesd and stai_t")+
  geom_smooth(method = "lm", se = FALSE) 
## `geom_smooth()` using formula = 'y ~ x'

# Scatter plot of two columns in the dataframe
ggplot(df_num, aes(x = cesd, y = stai_t)) +
  geom_point(color='brown') +
  xlab("CES-D total score (Depression)") +
  ylab("MBI_EX ") +
  theme(plot.title = element_text(hjust = 0.5))+
  ggtitle("Scatter plot of cesd and mbi_ex")+
  geom_smooth(method = "lm", se = FALSE) 
## `geom_smooth()` using formula = 'y ~ x'

# Load ggplot2 library
library(ggplot2)

# Scatter plot of two columns in the dataframe
ggplot(df_num, aes(y = mbi_ex, x = stai_t)) +
  geom_point(color='red') +
  xlab("mbi_ea") +
  ylab("stai_t") +
  ggtitle("Scatter plot of stai_t and mbi_ex")+
  geom_smooth(method = "lm", se = FALSE) 
## `geom_smooth()` using formula = 'y ~ x'

Plotting the distribution of the 3 variables

ggplot(df_num,aes(x=mbi_ex))+
  geom_histogram(bins = 15, fill='grey',color='black') + 
  geom_vline(aes(xintercept = mean(mbi_ex), color = "mean"), linetype = "dashed", size = 1) + 
  ggtitle("Distribution of Maslach Burnout Inventory-Exhaustion scale of the participant" ) +
  xlab("mbi_ex") +
  ylab("Frequency") +
  theme(plot.title = element_text(hjust = 0.5))+
  scale_color_manual(name = "", values = c(mean = "blue"), labels = c("Mean")) 
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

ggplot(df_num,aes(cesd))+
  geom_histogram(bins = 15, fill='brown',color='black') + 
  geom_vline(aes(xintercept = mean(cesd), color = "mean"), linetype = "dashed", size = 1) + 
  ggtitle("Distribution of Center for Epidemiologic Studies Depression scale of the participant" ) +
  xlab("cesd") +
  ylab("Frequency") +
  theme(plot.title = element_text(hjust = 0.5))+
  scale_color_manual(name = "", values = c(mean = "blue"), labels = c("Mean"))

ggplot(df_num,aes(stai_t))+
  geom_histogram(bins = 15, fill='orange',color='black') + 
  geom_vline(aes(xintercept = mean(stai_t), color = "mean"), linetype = "dashed", size = 1) + 
  ggtitle("Distribution of Center for Epidemiologic Studies Depression scale of the participant" ) +
  xlab("cesd") +
  ylab("Frequency") +
  theme(plot.title = element_text(hjust = 0.5))+
  scale_color_manual(name = "", values = c(mean = "blue"), labels = c("Mean"))

# Q-Q plots

ggplot(df_num, aes(sample= stai_t)) + 
  geom_qq() +
  geom_qq_line() + 
  xlab("Theoritical") + 
  ylab("Sample")+
  ggtitle("Q-Q plot for STAI")+
  theme(plot.title = element_text(hjust = 0.5))

ggplot(df_num, aes(sample= cesd)) + 
  geom_qq() +
  geom_qq_line() + 
  xlab("Theoritical") + 
  ylab("Sample")+
  ggtitle("Q-Q plot for CESD")+
  theme(plot.title = element_text(hjust = 0.5))

ggplot(df_num, aes(sample= mbi_ex)) + 
  geom_qq() +
  geom_qq_line() + 
  xlab("Theoritical") + 
  ylab("Sample")+
  ggtitle("Q-Q plot for MBI_EX")+
  theme(plot.title = element_text(hjust = 0.5))

Normality Tests

shapiro.test(df_num$mbi_ex)
## 
##  Shapiro-Wilk normality test
## 
## data:  df_num$mbi_ex
## W = 0.99046, p-value = 1.651e-05

Sample Size

length(df_num$mbi_ex)
## [1] 886

Checking if any transformations help

df_filtered <- df_num %>%
  select(mbi_ex,stai_t,cesd)
df_filtered_t <- mutate(df_filtered,mbi_ex_t=log(mbi_ex+1),
                       stai_t_t=log(stai_t+1),
                        cesd_t=log(cesd+1))
shapiro.test(df_filtered_t$mbi_ex_t)
## 
##  Shapiro-Wilk normality test
## 
## data:  df_filtered_t$mbi_ex_t
## W = 0.95864, p-value = 4.085e-15
shapiro.test(df_filtered_t$stai_t_t)
## 
##  Shapiro-Wilk normality test
## 
## data:  df_filtered_t$stai_t_t
## W = 0.97987, p-value = 1.05e-09
shapiro.test(df_filtered_t$cesd_t)
## 
##  Shapiro-Wilk normality test
## 
## data:  df_filtered_t$cesd_t
## W = 0.94305, p-value < 2.2e-16

Transformations do not help to achieve normality. However, we assume normality as sample size is large .

Type of data

All the three variables are interval data so we can use pearson’s correlation but for significance and CIs we need the variables of interest to be normal which even though are not from shapiro wilk test for normality but still can be assumed based on Central Limit Theorem

Correlation matrix of

mbi_ex - Maslach Burnout Inventory-Exhaustion scale of the participant.

stai_t State-Trait Anxiety Inventory scale of the participant.

cesd Center for Epidemiologic Studies Depression scale of the participant.

df_num %>%
  select(mbi_ex,stai_t,cesd) %>%
  cor(use="pairwise.complete.obs", method = "pearson")
##           mbi_ex    stai_t      cesd
## mbi_ex 1.0000000 0.5304859 0.6056172
## stai_t 0.5304859 1.0000000 0.7157281
## cesd   0.6056172 0.7157281 1.0000000
df_3_var <- df_num %>%
  select(cesd,mbi_ex,stai_t)

Heatmap to vizualize correlation

library(ggcorrplot)
ggcorrplot(cor(df_3_var),lab=TRUE)

# Testing significance of each correlation

cor.test(df_3_var$cesd,df_3_var$mbi_ex)
## 
##  Pearson's product-moment correlation
## 
## data:  df_3_var$cesd and df_3_var$mbi_ex
## t = 22.628, df = 884, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5621786 0.6457235
## sample estimates:
##       cor 
## 0.6056172
cor.test(df_3_var$stai_t,df_3_var$mbi_ex)
## 
##  Pearson's product-moment correlation
## 
## data:  df_3_var$stai_t and df_3_var$mbi_ex
## t = 18.606, df = 884, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4814446 0.5762160
## sample estimates:
##       cor 
## 0.5304859
cor.test(df_3_var$cesd,df_3_var$stai_t)
## 
##  Pearson's product-moment correlation
## 
## data:  df_3_var$cesd and df_3_var$stai_t
## t = 30.471, df = 884, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6820155 0.7464053
## sample estimates:
##       cor 
## 0.7157281

#Correlation using ggstatsplot::ggcormat which gives more information ( Adding the significance test too)

library(ggstatsplot)
## You can cite this package as:
##      Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
##      Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
ggstatsplot::ggcorrmat(df_3_var)

Coefficient of Determination ( R2)

cor(df_3_var,use="pairwise.complete.obs", method = "pearson")^2
##             cesd    mbi_ex    stai_t
## cesd   1.0000000 0.3667722 0.5122667
## mbi_ex 0.3667722 1.0000000 0.2814153
## stai_t 0.5122667 0.2814153 1.0000000

51.4% of the variance in cesd can be explained by the variance in stai_t, and vice versa.

Partial Correlation

We try to understand the relationship between cesd - depression and stai_t - anxiety by cantrolling mbi_ex - exhaustion

df_3_var

Summary of df_3_var

summary(df_3_var)
##       cesd           mbi_ex          stai_t    
##  Min.   : 0.00   Min.   : 5.00   Min.   :20.0  
##  1st Qu.: 9.00   1st Qu.:13.00   1st Qu.:34.0  
##  Median :16.00   Median :17.00   Median :43.0  
##  Mean   :18.05   Mean   :16.88   Mean   :42.9  
##  3rd Qu.:25.00   3rd Qu.:20.00   3rd Qu.:51.0  
##  Max.   :56.00   Max.   :30.00   Max.   :77.0

Partial Correlation using residuals

library(ggplot2)

# Fit linear models and extract residuals
model1 <- lm(cesd ~ mbi_ex, data = df_3_var)
model2 <- lm(stai_t ~ mbi_ex, data = df_3_var)
resid1 <- residuals(model1)
resid2 <- residuals(model2)

# Create a data frame with the residuals
resid_df <- data.frame(resid1, resid2)

# Create a scatterplot with a line of best fit
ggplot(resid_df, aes(x = resid1, y = resid2)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  xlab("Residuals from regressing cesd on mbi_ex") +
  ylab("Residuals from regressing stai_t on mbi_ex") +
  ggtitle("Partial correlation between cesd and stai_t while controlling the effect of mbi_ex") +
  theme(plot.title = element_text(hjust = 0.5))+
  geom_hline(yintercept = 0, col = "red") +
  geom_vline(xintercept = 0, col = "red")
## `geom_smooth()` using formula = 'y ~ x'

Plotting Correation/ distribution using better packages

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
df_t <- df %>%
  select(health,cesd,stai_t,mbi_ex)
GGally::ggpairs(df_t,
                mapping = aes(color = health),
                title = 'Pairwise Scatter Plot and Correlation of cesd, mbi_ex and stai_t')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The correlation between cesd and stat_i when controlling mbi_ex

library(ggm)
obj <- pcor(c("cesd","stai_t","mbi_ex"),var(df_3_var))
obj
## [1] 0.5847636

#Sample size

sample_size <- count(df_3_var)
sample_size$n
## [1] 886

Testing Significance of the partial correlation

pcor.test(r=obj,q=1,n=sample_size$n)
## $tval
## [1] 21.42056
## 
## $df
## [1] 883
## 
## $pvalue
## [1] 2.646448e-82

The correlation between cesd(depression) and stai_t(anxiety) was this : data: df_3_var\(cesd and df_3_var\)stai_t t = 30.471, df = 884, p-value < 2.2e-16 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: 0.6820155 0.7464053 sample estimates: cor 0.7157281

After controlling the variable mbi_ex : the partial correlation dropped to 0.5847636 with the significance pcor.test(r=obj,q=1,n=sample_size$n) $tval [1] 21.42056 $df [1] 883 $pvalue [1] 2.646448e-82

——— CHECKING FOR OTHER VARIABLES ————-

df_3_var_2 <- df_num%>%
  select(cesd,stai_t,mbi_ea)
cor(df_3_var_2)
##              cesd     stai_t     mbi_ea
## cesd    1.0000000  0.7157281 -0.4535890
## stai_t  0.7157281  1.0000000 -0.4625348
## mbi_ea -0.4535890 -0.4625348  1.0000000
ggstatsplot::ggcorrmat(df_3_var_2)

obj1 <- pcor(c("cesd","stai_t","mbi_ea"),var(df_3_var_2))
obj1
## [1] 0.6402939
pcor.test(r=obj1,q=1,n=sample_size$n)
## $tval
## [1] 24.76994
## 
## $df
## [1] 883
## 
## $pvalue
## [1] 2.888227e-103