Hypothesis testing

library(tidyverse)

# Directory holding the raw data (kept for reference, currently unused).
# dir <- "D:/wallpapers and photos/csv/r data"

# Read the mental-health survey from the local CSV copy.
mr_data <- read_csv(
  file = "D:/wallpapers and photos/csv/r data/mental_health_dataset.csv"
)

# Report, column by column, whether it was parsed as character data.
sapply(X = mr_data, FUN = is.character)

##                    age                 gender      employment_status 
##                  FALSE                   TRUE                   TRUE 
##       work_environment  mental_health_history        seeks_treatment 
##                   TRUE                   TRUE                   TRUE 
##           stress_level            sleep_hours physical_activity_days 
##                  FALSE                  FALSE                  FALSE 
##       depression_score          anxiety_score   social_support_score 
##                  FALSE                  FALSE                  FALSE 
##     productivity_score     mental_health_risk 
##                  FALSE                   TRUE

# List the distinct employment categories present in the data.
mr_data %>%
  pull(employment_status) %>%
  unique()

## [1] "Employed"      "Student"       "Self-employed" "Unemployed"

#' Nominal encoding: turn every character column into a factor.
#'
#' @param x A data frame (or tibble / named list of columns).
#' @return `x` with each character column replaced by `as.factor()` of
#'   itself; all other columns are returned untouched.
factor_change <- function(x) {
  # Names of the columns that currently hold character data.
  text_cols <- Filter(function(nm) is.character(x[[nm]]), names(x))

  for (nm in text_cols) {
    x[[nm]] <- as.factor(x[[nm]])
  }

  x
}

# Encode the categorical columns in one pipeline:
#   1. every character column becomes a factor (factor_change), then
#   2. the level order of two columns is fixed explicitly.
# Note: factor() is called without `ordered = TRUE`, so these stay unordered
# factors; only the order of their levels is set.
mr_data <- mr_data %>%
  factor_change() %>%
  mutate(
    mental_health_risk = factor(
      mental_health_risk,
      levels = c("High", "Medium", "Low")
    ),
    employment_status = factor(
      employment_status,
      levels = c("Self-employed", "Employed", "Student", "Unemployed")
    )
  )

I always try to keep a target variable and a guiding question in mind so that I stay on track throughout the analysis. For now, I have decided to work with mental_health_risk.

# Check for missing data ----
library(naniar)

# Missing values summarised per variable.
mr_data %>%
  gg_miss_var()

# Alternatively: missingness shown across the whole data frame.
mr_data %>%
  vis_miss()

Neither plot shows any missing data.

library(viridis)
library(hrbrthemes)
library(plotly)

# Anxiety score by gender, one facet per mental-health-risk level.
# NOTE(review): stat = "identity" on un-aggregated rows draws one bar per
# row, so bars at the same position cover each other; consider plotting a
# per-group summary instead.
plot_of_anxiety <- ggplot(
  mr_data,
  aes(x = gender, y = anxiety_score, fill = mental_health_risk)
) +
  geom_bar(position = "dodge", stat = "identity") +
  facet_wrap(~mental_health_risk) +
  # The complete theme must come first: adding theme_minimal() after theme()
  # replaces the whole theme and discards the background settings below.
  theme_minimal() +
  theme(
    # colour = NA keeps the rectangles borderless once they are re-enabled
    # on top of theme_minimal()'s blank backgrounds.
    panel.background = element_rect(fill = NA, colour = NA),
    plot.background = element_rect(fill = "white", colour = NA)
  ) +
  scale_fill_viridis(discrete = TRUE, option = "E")

plot_of_anxiety

# Anxiety score by mental-health-risk level, one facet per gender.
# NOTE(review): stat = "identity" on un-aggregated rows draws one bar per
# row, so bars at the same position cover each other; consider plotting a
# per-group summary instead.
plot_of_anxiety2 <- ggplot(
  mr_data,
  aes(x = mental_health_risk, y = anxiety_score, fill = mental_health_risk)
) +
  geom_bar(position = "dodge", stat = "identity") +
  facet_wrap(~gender) +
  # The complete theme must come first: adding theme_minimal() after theme()
  # replaces the whole theme and discards the background settings below.
  theme_minimal() +
  theme(
    # colour = NA keeps the rectangles borderless once they are re-enabled
    # on top of theme_minimal()'s blank backgrounds.
    panel.background = element_rect(fill = NA, colour = NA),
    plot.background = element_rect(fill = "white", colour = NA)
  ) +
  scale_fill_viridis(discrete = TRUE, option = "E")

plot_of_anxiety2

# Is there an association between mental health risk and gender?
# Cross-tabulate the two factors and run Pearson's chi-squared test.
mr_data %>%
  with(table(mental_health_risk, gender)) %>%
  chisq.test()

## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 3.2609, df = 6, p-value = 0.7755

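p > 0.05, so we fail to reject the null hypothesis of independence: there is no statistically significant evidence of an association between mental health risk and gender.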

# Same question for employment status and mental health risk:
# cross-tabulate the two factors and run Pearson's chi-squared test.
mr_data %>%
  with(table(mental_health_risk, employment_status)) %>%
  chisq.test()

## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 9.2337, df = 6, p-value = 0.1609

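p > 0.05, so we fail to reject the null hypothesis of independence: there is no statistically significant evidence of an association between mental health risk and employment status.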

# Anxiety score by gender. geom_col() stacks the rows, so each bar's height
# is the sum of the scores within that gender.
ggplot(mr_data, aes(x = gender, y = anxiety_score, fill = gender)) +
  geom_col() +
  scale_fill_hue(direction = 1) +
  theme_minimal()

# Depression score by gender, drawn the same way as the plot above.
ggplot(mr_data, aes(x = gender, y = depression_score, fill = gender)) +
  geom_col() +
  scale_fill_hue(direction = 1) +
  theme_minimal()

# Spread of physical-activity days within each mental-health-risk level.
ggplot(
  mr_data,
  aes(
    x = mental_health_risk,
    y = physical_activity_days,
    fill = mental_health_risk
  )
) +
  geom_boxplot() +
  scale_fill_viridis_d(option = "plasma", direction = 1) +
  theme_minimal()

#' Plot the distribution of every numeric column.
#'
#' Builds one histogram per numeric column of `data`, collects them in a list
#' named after the columns, and prints that list (which draws each plot).
#'
#' @param data A data frame; non-numeric columns are skipped.
#' @return The named list of ggplot objects, invisibly (the value of
#'   `print()`).
distribution <- function(data) {
  plots <- list()

  for (col in names(data)) {
    if (!is.numeric(data[[col]])) {
      next
    }

    # `.data[[...]]` replaces the deprecated aes_string() and also works for
    # non-syntactic column names. `!!col` inlines the current column name:
    # aes() is evaluated lazily, so a bare `col` would resolve to the loop's
    # final value when the plots are drawn after the loop.
    plots[[col]] <- ggplot(data, aes(x = .data[[!!col]])) +
      # bins = 30 is geom_histogram()'s default, spelled out to avoid the
      # "pick better value" message on every plot.
      geom_histogram(bins = 30, fill = "skyblue") +
      theme_ipsum()
  }

  print(plots)
}

# Draw a histogram for every numeric column of the survey data.
mr_data %>%
  distribution()

## $age

## 
## $stress_level

## 
## $sleep_hours

## 
## $physical_activity_days

## 
## $depression_score

## 
## $anxiety_score

## 
## $social_support_score

## 
## $productivity_score

# Is mental health history associated with gender?
# Cross-tabulate the two factors and run Pearson's chi-squared test.
mr_data %>%
  with(table(mental_health_history, gender)) %>%
  chisq.test()

## 
##  Pearson's Chi-squared test
## 
## data:  .
## X-squared = 2.262, df = 3, p-value = 0.5198

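p > 0.05, so there is no statistically significant evidence of an association between mental health history and gender.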

# Is mean sleep duration the same across all employment statuses?
# One-way ANOVA of sleep_hours on employment_status.
# aov() has no `alternative` argument (the F test has no one-/two-sided
# variant); the stray argument was forwarded to lm() and ignored with a
# warning, so it is removed. The preceding select() was also unnecessary
# because the formula already picks the two columns it needs.
mr_data %>%
  aov(sleep_hours ~ employment_status, data = .) %>%
  summary()

##                     Df Sum Sq Mean Sq F value Pr(>F)
## employment_status    3     11   3.639   1.676   0.17
## Residuals         9996  21712   2.172

Since p = 0.17 > 0.05, we do not have statistically significant evidence that mean sleep hours differ between employment statuses; in other words, no effect of employment status on sleep hours was detected.

# Histogram of sleep hours, one panel (and fill colour) per employment status.
ggplot(mr_data, aes(x = sleep_hours, fill = employment_status)) +
  geom_histogram(bins = 30L) +
  facet_wrap(~employment_status) +
  scale_fill_viridis_d(option = "cividis", direction = 1) +
  theme_minimal()

Hypothesis testing

omon das

2025-05-15