work by Fengkai Tian

This data was collected by “osmi.org” (OSMI means Open Source of Mental Illness) in 2014. This survey investigates the frequency of mental health disorders in tech companies. There are 27 variables and 1260 observations in this dataset. The variables are very attractive. For example, family_history, remote_work, work_interfere, age, and so on, and some of them may have a correlation relationship with the level of mental health or morbidity of mental illness. Moreover, the dataset also includes the information about responders’ countries and states. There are many aspects that we can deeply investigate in this data set, and there are three questions will be discussed in this article.

Is there any relationship between the responders’ location and the occurrence of mental illness?
Is the number of employees in the work place a factor of causing mental illness?
Which variable has a correlated relationship with the occurrence of mental illness?

rm(list=ls())
library("ggplot2")
library("corrplot")

## corrplot 0.84 loaded

library("tidyverse")

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## v purrr   0.3.4

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library("usmap")
library("plyr")

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:purrr':
## 
##     compact

survey <- read.csv(file = "C:/Users/ft7b6/Desktop/survey.csv", header = TRUE, sep = ",")

state <- levels(survey$state) %>% as.data.frame()
nlevels(survey$state)

## [1] 0

state_freq <- as.data.frame(table(survey$state))
colnames(state_freq) <- c("state","freq")
as_tibble(state_freq)

## # A tibble: 45 x 2
##    state  freq
##    <fct> <int>
##  1 AL        8
##  2 AZ        7
##  3 CA      138
##  4 CO        9
##  5 CT        4
##  6 DC        4
##  7 FL       15
##  8 GA       12
##  9 IA        4
## 10 ID        1
## # ... with 35 more rows

plot_usmap(data = state_freq, values = "freq",color = "black") +
  scale_fill_continuous(low = "white", high = "dark blue", name = "Responders")

d <- c(-1,-3,-5,-10,-27)
group <- summary(survey[,d]) %>% as.matrix(); summary(survey$Age)

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -1.726e+03  2.700e+01  3.100e+01  7.943e+07  3.600e+01  1.000e+11

true_age <- subset(survey$Age, 18<survey$Age&survey$Age<65)
boxplot(true_age,horizontal = FALSE)
text(x=40,"                                 mean  is 32.1")

survey$treatment <- as.factor(revalue(survey$treatment,c("No" = "0", "Yes" = "1")))
survey$family_history <- as.factor(revalue(survey$family_history,c("No" = "0", "Yes" = "1")))
survey$remote_work <- as.factor(revalue(survey$remote_work,c("No" = "0", "Yes" = "1")))
survey$tech_company <- as.factor(revalue(survey$tech_company,c("No" = "0", "Yes" = "1")))
survey$mental_health_interview <- as.factor(revalue(survey$mental_health_interview,c("No" = "0", "Yes" = "1","Maybe" = "0.5"),warn_missing = TRUE))
survey$remote_work <- as.factor(revalue(survey$remote_work,c("No" = "0", "Yes" = "1"),warn_missing = FALSE))
survey$mental_health_consequence <- as.factor(revalue(survey$mental_health_consequence,c("No" = "0", "Yes" = "1","Maybe" = "0.5")))
survey$phys_health_consequence <- as.factor(revalue(survey$phys_health_consequence,                                                    c("No" = "0", "Yes" = "1","Maybe" = "0.5")))
survey$seek_help <- as.factor(revalue(survey$seek_help, c("No" = "0", "Yes" = "1", "Don't know" = "NA")))
na.omit(survey$seek_help);

##    [1] 1  NA 0  0  NA NA 0  0  0  NA 0  0  0  NA NA 0  NA NA NA 0  0  0  0  NA
##   [25] 1  NA NA NA NA 0  0  0  0  1  0  0  0  0  NA 1  0  1  0  0  0  1  0  0 
##   [49] 0  NA 0  NA NA 0  0  0  0  0  0  0  NA 1  1  0  0  1  0  1  NA 0  0  NA
##   [73] 0  NA NA 0  1  0  0  NA 1  NA 0  1  0  0  0  NA 0  NA 1  0  0  0  NA NA
##   [97] NA 0  0  0  0  NA NA 1  NA 0  0  0  0  1  NA 1  1  NA NA NA 0  1  0  NA
##  [121] 1  0  NA 0  0  1  0  1  0  0  0  0  NA 0  0  0  0  0  NA 0  NA 1  NA 1 
##  [145] NA 1  0  0  1  NA 1  0  0  NA 1  NA NA 0  0  0  1  0  1  0  1  0  0  0 
##  [169] 0  NA NA 0  0  NA 0  NA 1  NA 0  0  0  0  NA NA NA 0  NA 0  0  0  0  0 
##  [193] 0  0  0  0  0  0  0  0  0  1  0  0  NA 0  0  0  NA 0  1  0  0  0  NA NA
##  [217] 0  NA NA 0  0  0  0  0  NA 0  0  1  NA 0  1  NA NA NA NA 0  NA 0  1  1 
##  [241] 0  0  1  0  NA 1  NA 0  0  NA NA NA NA NA NA 0  NA 1  0  0  0  0  1  0 
##  [265] 0  NA 0  NA 0  0  0  NA 1  0  0  1  0  0  NA NA 1  0  NA 0  0  0  NA NA
##  [289] 0  NA 0  NA NA NA NA NA NA 1  1  1  1  NA NA 1  1  NA 0  0  NA 1  NA 1 
##  [313] 0  NA NA NA NA 1  1  0  1  NA 1  1  0  NA 1  1  0  0  1  NA 0  0  0  NA
##  [337] NA 0  NA 1  NA 0  NA NA 0  0  0  1  NA NA 0  1  NA 1  0  NA 0  1  0  0 
##  [361] 0  NA NA 1  0  NA 1  0  NA 0  1  1  1  1  0  NA 0  0  1  1  NA 1  0  0 
##  [385] 0  NA 0  0  0  1  0  NA NA NA 0  NA 0  0  0  NA 0  0  NA 0  0  1  1  NA
##  [409] 0  0  0  NA NA 0  NA NA 0  0  0  NA 0  0  0  NA NA 1  0  NA 0  1  0  0 
##  [433] 0  0  0  NA NA NA NA 1  0  NA 1  0  0  NA NA 0  0  1  0  NA 0  0  NA 0 
##  [457] 0  0  0  0  NA 1  0  0  NA 0  0  0  0  0  1  1  0  1  0  1  0  NA NA 0 
##  [481] NA 1  NA 0  NA 0  0  0  0  0  NA 0  0  NA NA 0  0  0  1  NA 0  0  NA 1 
##  [505] 0  1  0  0  1  NA 1  0  NA 1  1  NA 1  1  0  0  1  NA 0  NA 0  1  NA 0 
##  [529] 1  NA 0  1  0  NA 0  0  1  NA 0  NA 0  0  0  NA 0  0  NA 1  1  0  0  0 
##  [553] 0  0  0  NA 1  NA NA NA 0  NA NA NA 0  0  NA 1  NA 0  0  1  1  0  0  NA
##  [577] 0  0  0  0  0  NA 1  0  0  0  0  0  0  0  0  0  0  1  1  NA 0  NA NA NA
##  [601] 1  1  0  1  NA 0  NA 0  0  0  0  0  0  0  0  1  NA 0  NA 0  0  0  0  1 
##  [625] 0  1  0  0  0  NA 0  0  0  NA NA 0  0  0  0  0  0  0  1  1  0  0  0  0 
##  [649] 1  0  1  NA 0  1  1  0  0  0  NA NA 1  NA NA 0  0  0  NA NA 0  0  NA NA
##  [673] 1  NA 0  NA 0  0  0  1  0  1  0  NA 0  NA 1  1  0  NA 0  0  NA NA 0  0 
##  [697] 0  0  NA NA NA 1  NA 0  1  0  0  NA NA 0  0  NA NA 1  NA 0  1  0  0  NA
##  [721] 0  0  0  NA 0  0  0  0  NA 0  0  0  0  1  0  0  1  0  1  NA 0  0  0  NA
##  [745] 0  NA NA 0  1  0  0  1  NA 0  1  NA NA NA 0  0  0  1  0  0  NA 0  1  NA
##  [769] 0  0  0  0  NA NA 0  1  NA 1  1  0  NA 0  NA 0  0  1  0  NA NA 1  0  NA
##  [793] 0  0  NA 0  NA 0  0  NA 0  0  0  1  NA 0  1  NA NA 1  0  0  0  0  NA NA
##  [817] 1  0  1  0  NA 0  0  NA 1  0  NA 0  0  0  1  NA 0  0  0  0  0  NA 0  1 
##  [841] NA 0  0  NA 0  NA NA 0  0  1  0  NA 0  0  0  1  1  0  0  0  0  NA NA 0 
##  [865] 1  0  0  0  0  NA 1  0  1  NA 0  NA 0  NA 1  NA NA NA NA NA NA 0  0  1 
##  [889] 0  1  1  0  NA 1  NA NA 1  0  0  NA 0  0  1  0  0  1  1  0  NA 1  0  1 
##  [913] 0  1  0  0  0  0  1  NA NA 1  0  0  NA 0  NA 1  0  NA 1  1  1  0  1  0 
##  [937] NA NA 1  1  NA 1  NA 1  NA 0  1  1  1  NA 0  0  0  NA 0  0  NA 1  NA NA
##  [961] 0  0  0  0  1  0  0  NA 0  1  0  0  NA 1  0  NA 1  0  0  0  0  0  1  NA
##  [985] 0  0  0  0  1  1  0  1  NA 0  0  NA 0  NA 0  0  NA NA 0  NA 0  NA 1  0 
## [1009] 0  0  1  NA NA NA 1  0  0  1  0  0  0  1  1  0  0  NA 1  0  0  NA 0  NA
## [1033] NA 0  NA 0  0  NA 0  0  0  1  1  0  0  0  0  0  1  1  0  0  0  NA NA NA
## [1057] NA 0  NA 1  0  0  0  0  0  0  NA NA 1  0  0  1  1  0  0  1  NA 1  NA 0 
## [1081] 0  1  0  NA 1  0  0  1  NA NA 0  NA 0  NA 1  0  0  NA NA NA 1  0  0  0 
## [1105] 0  NA 0  NA NA 0  NA 0  0  NA NA 0  NA NA 1  0  1  1  NA NA NA 0  NA 1 
## [1129] 0  0  0  0  0  1  1  0  0  0  0  NA 0  NA 0  0  1  0  NA 0  1  NA 1  0 
## [1153] NA 1  0  0  NA NA 0  NA 0  1  0  NA 0  NA 1  0  0  NA 0  1  1  0  0  0 
## [1177] 0  NA 0  0  0  1  1  0  0  NA NA 0  0  0  1  1  0  NA 0  0  0  0  NA NA
## [1201] 0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  1  0  0  1  0  NA 1  0 
## [1225] 1  0  NA 1  0  0  1  0  1  0  NA NA NA 0  1  0  0  0  NA 1  NA 1  1  0 
## [1249] NA 0  1  0  0  1  0  0  0  0  0 
## Levels: 0 1 NA

survey$mental_health_interview <- as.factor(revalue(survey$mental_health_interview,c("No" = "0", "Yes" = "1","Maybe" = "0.5"),warn_missing = FALSE))
survey$no_employees <- as.factor(revalue(survey$no_employees,c("1-5" = "1", "6-25" = "2", "26-100" = "3", "100-500" = "4", "500-1000" = "5", "More than 1000" = "6")))
survey$obs_consequence <- as.factor(revalue(survey$obs_consequence,c("No" = "0", "Yes" = "1")))
survey$work_interfere <- as.factor(revalue(survey$work_interfere,c("Never" = "0","Rarely" = "1", "Sometimes" = "2","Often" = "3")))
survey$treatment <- as.numeric(survey$treatment)
lm(treatment ~ no_employees, data=survey) %>% summary()

## 
## Call:
## lm(formula = treatment ~ no_employees, data = survey)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5617 -0.5177  0.4383  0.4810  0.5586 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.56173    0.03924  39.802   <2e-16 ***
## no_employees2 -0.12035    0.04899  -2.457   0.0142 *  
## no_employees3 -0.04270    0.04902  -0.871   0.3839    
## no_employees4 -0.02196    0.05438  -0.404   0.6864    
## no_employees5 -0.11173    0.07547  -1.480   0.1390    
## no_employees6 -0.04400    0.04923  -0.894   0.3717    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4994 on 1253 degrees of freedom
## Multiple R-squared:  0.006962,   Adjusted R-squared:  0.002999 
## F-statistic: 1.757 on 5 and 1253 DF,  p-value: 0.1189

survey$family_history <- as.numeric(survey$family_history)
survey$remote_work <- as.numeric(survey$remote_work)
survey$tech_company <- as.numeric(survey$tech_company)
survey$mental_health_interview <- as.numeric(survey$mental_health_interview)
survey$remote_work <- as.numeric(survey$remote_work)
survey$mental_health_consequence <- as.numeric(survey$mental_health_consequence)
survey$phys_health_consequence <- as.numeric(survey$phys_health_consequence)
survey$seek_help <- as.numeric(survey$seek_help)
survey$mental_health_interview <- as.numeric(survey$mental_health_interview)
survey$no_employees <-as.numeric(survey$no_employees)
survey$work_interfere <- as.numeric(survey$work_interfere)
survey$obs_consequence <- as.numeric(survey$obs_consequence)
nume <- c(2,7,8,9,11,12,16,19,20,23,26)
survey[,nume] %>% cor(use = "complete.obs") %>% corrplot(method = "pie")

ccc <- subset(survey, survey$state == "CA") 
ttt <- subset(survey, survey$state == "TX")
nnn <- subset(survey, survey$state == "NY")
www <- subset(survey, survey$state == "WA")
c<-as.numeric(as.matrix(table(ccc$treatment))[1,1])/nrow(ccc)
t<-as.numeric(as.matrix(table(ttt$treatment))[1,1])/nrow(ttt)
n<-as.numeric(as.matrix(table(nnn$treatment))[1,1])/nrow(nnn)
w<-as.numeric(as.matrix(table(www$treatment))[1,1])/nrow(www)
c1<-as.numeric(as.matrix(table(ccc$family_history))[1,1])/nrow(ccc)
t1<-as.numeric(as.matrix(table(ttt$family_history))[1,1])/nrow(ttt)
n1<-as.numeric(as.matrix(table(nnn$family_history))[1,1])/nrow(nnn)
w1<-as.numeric(as.matrix(table(www$family_history))[1,1])/nrow(www)
z <- c(c,t,n,w,c1,t1,n1,w1)
lll <- c("CA","TX","NY","WA")
qwq <-matrix(data = z, nrow =4, ncol = 2, byrow = FALSE)
rownames(qwq) <- lll
eqweqweqw <- c("por of treatment","por of family history")
colnames(qwq) <- eqweqweqw
qwq

##    por of treatment por of family history
## CA        0.3768116             0.5072464
## TX        0.4318182             0.7045455
## NY        0.4736842             0.6140351
## WA        0.4142857             0.5000000

funcc <- lm(treatment ~ family_history, data = survey, family = "binomial")

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'family' will be disregarded

summary(funcc)

## 
## Call:
## lm(formula = treatment ~ family_history, data = survey, family = "binomial")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7419 -0.3546  0.2581  0.2581  0.6454 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.96739    0.03944   24.53   <2e-16 ***
## family_history  0.38724    0.02676   14.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4633 on 1257 degrees of freedom
## Multiple R-squared:  0.1428, Adjusted R-squared:  0.1421 
## F-statistic: 209.4 on 1 and 1257 DF,  p-value: < 2.2e-16

Based on the above data analysis, we can conclude that mental illness has a little correlated relationship with the number of employees. Even though there is no individual variable perfectly correlated with the occurrence of mental illness, we can confirm that the level of mental health may be influenced by many variables. Moreover, it is possible that CA may have a lower proportion, and NY has a higher proportion of mental illness. To sum up, we need more data to make further confirmation about it.