# Step 1: Get familiar with the data set ----
# Load the raw survey export; the path is relative to the working directory.
# NOTE(review): "heath" in the file name looks like a typo for "health" --
# confirm against the file on disk before changing it.
data <- read.csv(file = "mental-heath-in-tech-2016_20161114.csv")
# a. Report the size of the data frame as (rows, columns).
c(nrow(data), ncol(data))
## [1] 1433 63
# Compact overview of every column: its type and its first few values.
str(object = data)
## 'data.frame': 1433 obs. of 63 variables:
## $ Are.you.self.employed. : int 0 0 0 1 0 0 0 0 0 1 ...
## $ How.many.employees.does.your.company.or.organization.have. : chr "26-100" "6-25" "6-25" "" ...
## $ Is.your.employer.primarily.a.tech.company.organization. : int 1 1 1 NA 0 1 1 1 0 NA ...
## $ Is.your.primary.role.within.your.company.related.to.tech.IT. : int NA NA NA NA 1 NA NA NA 1 NA ...
## $ Does.your.employer.provide.mental.health.benefits.as.part.of.healthcare.coverage. : chr "Not eligible for coverage / N/A" "No" "No" "" ...
## $ Do.you.know.the.options.for.mental.health.care.available.under.your.employer.provided.coverage. : chr "N/A" "Yes" "N/A" "" ...
## $ Has.your.employer.ever.formally.discussed.mental.health..for.example..as.part.of.a.wellness.campaign.or.other.official.communication.. : chr "No" "Yes" "No" "" ...
## $ Does.your.employer.offer.resources.to.learn.more.about.mental.health.concerns.and.options.for.seeking.help. : chr "No" "Yes" "No" "" ...
## $ Is.your.anonymity.protected.if.you.choose.to.take.advantage.of.mental.health.or.substance.abuse.treatment.resources.provided.by.your.employer. : chr "I don't know" "Yes" "I don't know" "" ...
## $ If.a.mental.health.issue.prompted.you.to.request.a.medical.leave.from.work..asking.for.that.leave.would.be. : chr "Very easy" "Somewhat easy" "Neither easy nor difficult" "" ...
## $ Do.you.think.that.discussing.a.mental.health.disorder.with.your.employer.would.have.negative.consequences. : chr "No" "No" "Maybe" "" ...
## $ Do.you.think.that.discussing.a.physical.health.issue.with.your.employer.would.have.negative.consequences. : chr "No" "No" "No" "" ...
## $ Would.you.feel.comfortable.discussing.a.mental.health.disorder.with.your.coworkers. : chr "Maybe" "Maybe" "Maybe" "" ...
## $ Would.you.feel.comfortable.discussing.a.mental.health.disorder.with.your.direct.supervisor.s.. : chr "Yes" "Yes" "Maybe" "" ...
## $ Do.you.feel.that.your.employer.takes.mental.health.as.seriously.as.physical.health. : chr "I don't know" "Yes" "I don't know" "" ...
## $ Have.you.heard.of.or.observed.negative.consequences.for.co.workers.who.have.been.open.about.mental.health.issues.in.your.workplace. : chr "No" "No" "No" "" ...
## $ Do.you.have.medical.coverage..private.insurance.or.state.provided..which.includes.treatment.of.Â.mental.health.issues. : int NA NA NA 1 NA NA NA NA NA 1 ...
## $ Do.you.know.local.or.online.resources.to.seek.help.for.a.mental.health.disorder. : chr "" "" "" "Yes, I know several" ...
## $ If.you.have.been.diagnosed.or.treated.for.a.mental.health.disorder..do.you.ever.reveal.this.to.clients.or.business.contacts. : chr "" "" "" "Sometimes, if it comes up" ...
## $ If.you.have.revealed.a.mental.health.issue.to.a.client.or.business.contact..do.you.believe.this.has.impacted.you.negatively. : chr "" "" "" "I'm not sure" ...
## $ If.you.have.been.diagnosed.or.treated.for.a.mental.health.disorder..do.you.ever.reveal.this.to.coworkers.or.employees. : chr "" "" "" "Sometimes, if it comes up" ...
## $ If.you.have.revealed.a.mental.health.issue.to.a.coworker.or.employee..do.you.believe.this.has.impacted.you.negatively. : chr "" "" "" "I'm not sure" ...
## $ Do.you.believe.your.productivity.is.ever.affected.by.a.mental.health.issue. : chr "" "" "" "Yes" ...
## $ If.yes..what.percentage.of.your.work.time..time.performing.primary.or.secondary.job.functions..is.affected.by.a.mental.health.issue. : chr "" "" "" "1-25%" ...
## $ Do.you.have.previous.employers. : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Have.your.previous.employers.provided.mental.health.benefits. : chr "No, none did" "Yes, they all did" "No, none did" "Some did" ...
## $ Were.you.aware.of.the.options.for.mental.health.care.provided.by.your.previous.employers. : chr "N/A (not currently aware)" "I was aware of some" "N/A (not currently aware)" "N/A (not currently aware)" ...
## $ Did.your.previous.employers.ever.formally.discuss.mental.health..as.part.of.a.wellness.campaign.or.other.official.communication.. : chr "I don't know" "None did" "None did" "None did" ...
## $ Did.your.previous.employers.provide.resources.to.learn.more.about.mental.health.issues.and.how.to.seek.help. : chr "None did" "Some did" "Some did" "None did" ...
## $ Was.your.anonymity.protected.if.you.chose.to.take.advantage.of.mental.health.or.substance.abuse.treatment.resources.with.previous.employers. : chr "I don't know" "Yes, always" "I don't know" "I don't know" ...
## $ Do.you.think.that.discussing.a.mental.health.disorder.with.previous.employers.would.have.negative.consequences. : chr "Some of them" "None of them" "I don't know" "Some of them" ...
## $ Do.you.think.that.discussing.a.physical.health.issue.with.previous.employers.would.have.negative.consequences. : chr "None of them" "None of them" "Some of them" "Some of them" ...
## $ Would.you.have.been.willing.to.discuss.a.mental.health.issue.with.your.previous.co.workers. : chr "Some of my previous employers" "No, at none of my previous employers" "Some of my previous employers" "Some of my previous employers" ...
## $ Would.you.have.been.willing.to.discuss.a.mental.health.issue.with.your.direct.supervisor.s.. : chr "Some of my previous employers" "Some of my previous employers" "I don't know" "Some of my previous employers" ...
## $ Did.you.feel.that.your.previous.employers.took.mental.health.as.seriously.as.physical.health. : chr "I don't know" "Some did" "I don't know" "I don't know" ...
## $ Did.you.hear.of.or.observe.negative.consequences.for.co.workers.with.mental.health.issues.in.your.previous.workplaces. : chr "None of them" "None of them" "Some of them" "Some of them" ...
## $ Would.you.be.willing.to.bring.up.a.physical.health.issue.with.a.potential.employer.in.an.interview. : chr "Maybe" "Maybe" "Yes" "Yes" ...
## $ Why.or.why.not. : chr "" "It would depend on the health issue. If there is a health issue that would not immediately affect my job perfor"| __truncated__ "They would provable need to know, to Judge if I can do my job or not. " "old back injury, doesn't cause me many issues but occasionally impacts my ability to work at desk " ...
## $ Would.you.bring.up.a.mental.health.issue.with.a.potential.employer.in.an.interview. : chr "Maybe" "No" "Yes" "Maybe" ...
## $ Why.or.why.not..1 : chr "" "While mental health has become a more prominent issue recently, I feel like there is still a lot of stigma surr"| __truncated__ "Stigma, mainly. " "would not if I was not 100% sure that the disclosure would not affect my application" ...
## $ Do.you.feel.that.being.identified.as.a.person.with.a.mental.health.issue.would.hurt.your.career. : chr "Maybe" "No, I don't think it would" "Maybe" "Yes, I think it would" ...
## $ Do.you.think.that.team.members.co.workers.would.view.you.more.negatively.if.they.knew.you.suffered.from.a.mental.health.issue. : chr "No, I don't think they would" "No, I don't think they would" "Maybe" "Maybe" ...
## $ How.willing.would.you.be.to.share.with.friends.and.family.that.you.have.a.mental.illness. : chr "Somewhat open" "Somewhat open" "Somewhat open" "Neutral" ...
## $ Have.you.observed.or.experienced.an.unsupportive.or.badly.handled.response.to.a.mental.health.issue.in.your.current.or.previous.workplace. : chr "No" "No" "Maybe/Not sure" "No" ...
## $ Have.your.observations.of.how.another.individual.who.discussed.a.mental.health.disorder.made.you.less.likely.to.reveal.a.mental.health.issue.yourself.in.your.current.workplace.: chr "" "" "Yes" "" ...
## $ Do.you.have.a.family.history.of.mental.illness. : chr "No" "Yes" "No" "No" ...
## $ Have.you.had.a.mental.health.disorder.in.the.past. : chr "Yes" "Yes" "Maybe" "Yes" ...
## $ Do.you.currently.have.a.mental.health.disorder. : chr "No" "Yes" "No" "Yes" ...
## $ If.yes..what.condition.s..have.you.been.diagnosed.with. : chr "" "Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, Bipolar Disorder, etc)" "" "Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, Bipolar Disorder, etc)" ...
## $ If.maybe..what.condition.s..do.you.believe.you.have. : chr "" "" "" "" ...
## $ Have.you.been.diagnosed.with.a.mental.health.condition.by.a.medical.professional. : chr "Yes" "Yes" "No" "Yes" ...
## $ If.so..what.condition.s..were.you.diagnosed.with. : chr "Anxiety Disorder (Generalized, Social, Phobia, etc)" "Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, Bipolar Disorder, etc)" "" "Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, Bipolar Disorder, etc)" ...
## $ Have.you.ever.sought.treatment.for.a.mental.health.issue.from.a.mental.health.professional. : int 0 1 1 1 1 1 0 1 1 1 ...
## $ If.you.have.a.mental.health.issue..do.you.feel.that.it.interferes.with.your.work.when.being.treated.effectively. : chr "Not applicable to me" "Rarely" "Not applicable to me" "Sometimes" ...
## $ If.you.have.a.mental.health.issue..do.you.feel.that.it.interferes.with.your.work.when.NOT.being.treated.effectively. : chr "Not applicable to me" "Sometimes" "Not applicable to me" "Sometimes" ...
## $ What.is.your.age. : int 39 29 38 43 43 42 30 37 44 30 ...
## $ What.is.your.gender. : chr "Male" "male" "Male " "male" ...
## $ What.country.do.you.live.in. : chr "United Kingdom" "United States of America" "United Kingdom" "United Kingdom" ...
## $ What.US.state.or.territory.do.you.live.in. : chr "" "Illinois" "" "" ...
## $ What.country.do.you.work.in. : chr "United Kingdom" "United States of America" "United Kingdom" "United Kingdom" ...
## $ What.US.state.or.territory.do.you.work.in. : chr "" "Illinois" "" "" ...
## $ Which.of.the.following.best.describes.your.work.position. : chr "Back-end Developer" "Back-end Developer|Front-end Developer" "Back-end Developer" "Supervisor/Team Lead" ...
## $ Do.you.work.remotely. : chr "Sometimes" "Never" "Always" "Sometimes" ...
# b. Summary statistics for every column of the data frame.
summary(object = data)
# c. Rename the long, question-style column names.
# dplyr provides the renaming verb used in this step.
library("dplyr")
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# List the current column names so each can be mapped to a short one.
colnames(data)
## [1] "Are.you.self.employed."
## [2] "How.many.employees.does.your.company.or.organization.have."
## [3] "Is.your.employer.primarily.a.tech.company.organization."
## [4] "Is.your.primary.role.within.your.company.related.to.tech.IT."
## [5] "Does.your.employer.provide.mental.health.benefits.as.part.of.healthcare.coverage."
## [6] "Do.you.know.the.options.for.mental.health.care.available.under.your.employer.provided.coverage."
## [7] "Has.your.employer.ever.formally.discussed.mental.health..for.example..as.part.of.a.wellness.campaign.or.other.official.communication.."
## [8] "Does.your.employer.offer.resources.to.learn.more.about.mental.health.concerns.and.options.for.seeking.help."
## [9] "Is.your.anonymity.protected.if.you.choose.to.take.advantage.of.mental.health.or.substance.abuse.treatment.resources.provided.by.your.employer."
## [10] "If.a.mental.health.issue.prompted.you.to.request.a.medical.leave.from.work..asking.for.that.leave.would.be."
## [11] "Do.you.think.that.discussing.a.mental.health.disorder.with.your.employer.would.have.negative.consequences."
## [12] "Do.you.think.that.discussing.a.physical.health.issue.with.your.employer.would.have.negative.consequences."
## [13] "Would.you.feel.comfortable.discussing.a.mental.health.disorder.with.your.coworkers."
## [14] "Would.you.feel.comfortable.discussing.a.mental.health.disorder.with.your.direct.supervisor.s.."
## [15] "Do.you.feel.that.your.employer.takes.mental.health.as.seriously.as.physical.health."
## [16] "Have.you.heard.of.or.observed.negative.consequences.for.co.workers.who.have.been.open.about.mental.health.issues.in.your.workplace."
## [17] "Do.you.have.medical.coverage..private.insurance.or.state.provided..which.includes.treatment.of.Â.mental.health.issues."
## [18] "Do.you.know.local.or.online.resources.to.seek.help.for.a.mental.health.disorder."
## [19] "If.you.have.been.diagnosed.or.treated.for.a.mental.health.disorder..do.you.ever.reveal.this.to.clients.or.business.contacts."
## [20] "If.you.have.revealed.a.mental.health.issue.to.a.client.or.business.contact..do.you.believe.this.has.impacted.you.negatively."
## [21] "If.you.have.been.diagnosed.or.treated.for.a.mental.health.disorder..do.you.ever.reveal.this.to.coworkers.or.employees."
## [22] "If.you.have.revealed.a.mental.health.issue.to.a.coworker.or.employee..do.you.believe.this.has.impacted.you.negatively."
## [23] "Do.you.believe.your.productivity.is.ever.affected.by.a.mental.health.issue."
## [24] "If.yes..what.percentage.of.your.work.time..time.performing.primary.or.secondary.job.functions..is.affected.by.a.mental.health.issue."
## [25] "Do.you.have.previous.employers."
## [26] "Have.your.previous.employers.provided.mental.health.benefits."
## [27] "Were.you.aware.of.the.options.for.mental.health.care.provided.by.your.previous.employers."
## [28] "Did.your.previous.employers.ever.formally.discuss.mental.health..as.part.of.a.wellness.campaign.or.other.official.communication.."
## [29] "Did.your.previous.employers.provide.resources.to.learn.more.about.mental.health.issues.and.how.to.seek.help."
## [30] "Was.your.anonymity.protected.if.you.chose.to.take.advantage.of.mental.health.or.substance.abuse.treatment.resources.with.previous.employers."
## [31] "Do.you.think.that.discussing.a.mental.health.disorder.with.previous.employers.would.have.negative.consequences."
## [32] "Do.you.think.that.discussing.a.physical.health.issue.with.previous.employers.would.have.negative.consequences."
## [33] "Would.you.have.been.willing.to.discuss.a.mental.health.issue.with.your.previous.co.workers."
## [34] "Would.you.have.been.willing.to.discuss.a.mental.health.issue.with.your.direct.supervisor.s.."
## [35] "Did.you.feel.that.your.previous.employers.took.mental.health.as.seriously.as.physical.health."
## [36] "Did.you.hear.of.or.observe.negative.consequences.for.co.workers.with.mental.health.issues.in.your.previous.workplaces."
## [37] "Would.you.be.willing.to.bring.up.a.physical.health.issue.with.a.potential.employer.in.an.interview."
## [38] "Why.or.why.not."
## [39] "Would.you.bring.up.a.mental.health.issue.with.a.potential.employer.in.an.interview."
## [40] "Why.or.why.not..1"
## [41] "Do.you.feel.that.being.identified.as.a.person.with.a.mental.health.issue.would.hurt.your.career."
## [42] "Do.you.think.that.team.members.co.workers.would.view.you.more.negatively.if.they.knew.you.suffered.from.a.mental.health.issue."
## [43] "How.willing.would.you.be.to.share.with.friends.and.family.that.you.have.a.mental.illness."
## [44] "Have.you.observed.or.experienced.an.unsupportive.or.badly.handled.response.to.a.mental.health.issue.in.your.current.or.previous.workplace."
## [45] "Have.your.observations.of.how.another.individual.who.discussed.a.mental.health.disorder.made.you.less.likely.to.reveal.a.mental.health.issue.yourself.in.your.current.workplace."
## [46] "Do.you.have.a.family.history.of.mental.illness."
## [47] "Have.you.had.a.mental.health.disorder.in.the.past."
## [48] "Do.you.currently.have.a.mental.health.disorder."
## [49] "If.yes..what.condition.s..have.you.been.diagnosed.with."
## [50] "If.maybe..what.condition.s..do.you.believe.you.have."
## [51] "Have.you.been.diagnosed.with.a.mental.health.condition.by.a.medical.professional."
## [52] "If.so..what.condition.s..were.you.diagnosed.with."
## [53] "Have.you.ever.sought.treatment.for.a.mental.health.issue.from.a.mental.health.professional."
## [54] "If.you.have.a.mental.health.issue..do.you.feel.that.it.interferes.with.your.work.when.being.treated.effectively."
## [55] "If.you.have.a.mental.health.issue..do.you.feel.that.it.interferes.with.your.work.when.NOT.being.treated.effectively."
## [56] "What.is.your.age."
## [57] "What.is.your.gender."
## [58] "What.country.do.you.live.in."
## [59] "What.US.state.or.territory.do.you.live.in."
## [60] "What.country.do.you.work.in."
## [61] "What.US.state.or.territory.do.you.work.in."
## [62] "Which.of.the.following.best.describes.your.work.position."
## [63] "Do.you.work.remotely."
# Replace the verbatim survey questions with short snake_case column names.
# rename() takes new_name = old_name pairs; column order is left unchanged.
# NOTE(review): the column starting "Do.you.have.medical.coverage.." is not
# part of this mapping and therefore keeps its original long name.
# NOTE(review): "previous_employers_anonymtity" is misspelt ("anonymity"); it
# is kept as-is because code outside this view may already use that name.
data <- rename(
  data,
  # Employment and current employer
  self_employed = Are.you.self.employed.,
  employees = How.many.employees.does.your.company.or.organization.have.,
  company_role = Is.your.employer.primarily.a.tech.company.organization.,
  primary_role = Is.your.primary.role.within.your.company.related.to.tech.IT.,
  mh_benefits = Does.your.employer.provide.mental.health.benefits.as.part.of.healthcare.coverage.,
  mh_benefit_options = Do.you.know.the.options.for.mental.health.care.available.under.your.employer.provided.coverage.,
  mh_discussion = Has.your.employer.ever.formally.discussed.mental.health..for.example..as.part.of.a.wellness.campaign.or.other.official.communication..,
  mh_resources = Does.your.employer.offer.resources.to.learn.more.about.mental.health.concerns.and.options.for.seeking.help.,
  mh_anonymity = Is.your.anonymity.protected.if.you.choose.to.take.advantage.of.mental.health.or.substance.abuse.treatment.resources.provided.by.your.employer.,
  mh_medical_leave = If.a.mental.health.issue.prompted.you.to.request.a.medical.leave.from.work..asking.for.that.leave.would.be.,
  mh_discussion_negative = Do.you.think.that.discussing.a.mental.health.disorder.with.your.employer.would.have.negative.consequences.,
  ph_discussion_negative = Do.you.think.that.discussing.a.physical.health.issue.with.your.employer.would.have.negative.consequences.,
  mh_discussion_coworkers = Would.you.feel.comfortable.discussing.a.mental.health.disorder.with.your.coworkers.,
  mh_discussion_supervisor = Would.you.feel.comfortable.discussing.a.mental.health.disorder.with.your.direct.supervisor.s..,
  mh_ph_serious = Do.you.feel.that.your.employer.takes.mental.health.as.seriously.as.physical.health.,
  mh_coworker_consequences = Have.you.heard.of.or.observed.negative.consequences.for.co.workers.who.have.been.open.about.mental.health.issues.in.your.workplace.,
  # Questions answered by self-employed respondents
  mh_local_online_resources = Do.you.know.local.or.online.resources.to.seek.help.for.a.mental.health.disorder.,
  mh_dx_reveal_contacts = If.you.have.been.diagnosed.or.treated.for.a.mental.health.disorder..do.you.ever.reveal.this.to.clients.or.business.contacts.,
  mh_dx_reveal_contacts_impact = If.you.have.revealed.a.mental.health.issue.to.a.client.or.business.contact..do.you.believe.this.has.impacted.you.negatively.,
  mh_dx_reveal_coworkers = If.you.have.been.diagnosed.or.treated.for.a.mental.health.disorder..do.you.ever.reveal.this.to.coworkers.or.employees.,
  mh_dx_reveal_coworkers_impact = If.you.have.revealed.a.mental.health.issue.to.a.coworker.or.employee..do.you.believe.this.has.impacted.you.negatively.,
  mh_productivity = Do.you.believe.your.productivity.is.ever.affected.by.a.mental.health.issue.,
  mh_productivity_percent = If.yes..what.percentage.of.your.work.time..time.performing.primary.or.secondary.job.functions..is.affected.by.a.mental.health.issue.,
  # Previous employers
  previous_employers = Do.you.have.previous.employers.,
  previous_employers_mhbenefits = Have.your.previous.employers.provided.mental.health.benefits.,
  previous_employers_mhbenefits_aware = Were.you.aware.of.the.options.for.mental.health.care.provided.by.your.previous.employers.,
  previous_employers_mhbenefits_discuss = Did.your.previous.employers.ever.formally.discuss.mental.health..as.part.of.a.wellness.campaign.or.other.official.communication..,
  previous_employers_resources = Did.your.previous.employers.provide.resources.to.learn.more.about.mental.health.issues.and.how.to.seek.help.,
  previous_employers_anonymtity = Was.your.anonymity.protected.if.you.chose.to.take.advantage.of.mental.health.or.substance.abuse.treatment.resources.with.previous.employers.,
  previous_employers_mh_discuss_impact = Do.you.think.that.discussing.a.mental.health.disorder.with.previous.employers.would.have.negative.consequences.,
  previous_employers_ph_discuss_impact = Do.you.think.that.discussing.a.physical.health.issue.with.previous.employers.would.have.negative.consequences.,
  previous_coworkers_mh_discuss = Would.you.have.been.willing.to.discuss.a.mental.health.issue.with.your.previous.co.workers.,
  previous_supervisor_mh_discuss = Would.you.have.been.willing.to.discuss.a.mental.health.issue.with.your.direct.supervisor.s..,
  previous_employers_mh_ph_serious = Did.you.feel.that.your.previous.employers.took.mental.health.as.seriously.as.physical.health.,
  previous_employers_mh_coworkers_consequences = Did.you.hear.of.or.observe.negative.consequences.for.co.workers.with.mental.health.issues.in.your.previous.workplaces.,
  # Interviews, career impact and openness
  ph_interview = Would.you.be.willing.to.bring.up.a.physical.health.issue.with.a.potential.employer.in.an.interview.,
  ph_interview_why = Why.or.why.not.,
  mh_interview = Would.you.bring.up.a.mental.health.issue.with.a.potential.employer.in.an.interview.,
  mh_interview_why = Why.or.why.not..1,
  mh_id_career_impact = Do.you.feel.that.being.identified.as.a.person.with.a.mental.health.issue.would.hurt.your.career.,
  mh_id_coworkers_impact = Do.you.think.that.team.members.co.workers.would.view.you.more.negatively.if.they.knew.you.suffered.from.a.mental.health.issue.,
  mh_family_friends = How.willing.would.you.be.to.share.with.friends.and.family.that.you.have.a.mental.illness.,
  mh_unsupportive_workplace = Have.you.observed.or.experienced.an.unsupportive.or.badly.handled.response.to.a.mental.health.issue.in.your.current.or.previous.workplace.,
  mh_other_impact = Have.your.observations.of.how.another.individual.who.discussed.a.mental.health.disorder.made.you.less.likely.to.reveal.a.mental.health.issue.yourself.in.your.current.workplace.,
  # Personal mental-health history, diagnosis and treatment
  mi_family = Do.you.have.a.family.history.of.mental.illness.,
  mh_past = Have.you.had.a.mental.health.disorder.in.the.past.,
  mh_current = Do.you.currently.have.a.mental.health.disorder.,
  mh_current_specify = If.yes..what.condition.s..have.you.been.diagnosed.with.,
  mh_maybe_specify = If.maybe..what.condition.s..do.you.believe.you.have.,
  mh_dx = Have.you.been.diagnosed.with.a.mental.health.condition.by.a.medical.professional.,
  mh_dx_specify = If.so..what.condition.s..were.you.diagnosed.with.,
  mh_treatment = Have.you.ever.sought.treatment.for.a.mental.health.issue.from.a.mental.health.professional.,
  mh_interfere_treated = If.you.have.a.mental.health.issue..do.you.feel.that.it.interferes.with.your.work.when.being.treated.effectively.,
  mh_interfere_not_treated = If.you.have.a.mental.health.issue..do.you.feel.that.it.interferes.with.your.work.when.NOT.being.treated.effectively.,
  # Demographics and work arrangement
  age = What.is.your.age.,
  gender = What.is.your.gender.,
  resident_country = What.country.do.you.live.in.,
  resident_state_territory = What.US.state.or.territory.do.you.live.in.,
  work_country = What.country.do.you.work.in.,
  work_state_territory = What.US.state.or.territory.do.you.work.in.,
  work_position = Which.of.the.following.best.describes.your.work.position.,
  work_remote = Do.you.work.remotely.
)
# d. Check for and remove duplicated rows ----
# Flag every row that is an exact repeat of an earlier row. The mask is
# computed once and reused, and only summaries of it are printed: echoing
# all 1433 logical values buries the answer in pages of output.
dup_rows <- duplicated(data)
sum(dup_rows) # number of duplicated rows
any(dup_rows)
## [1] FALSE
# Keep the first occurrence of each row; drop = FALSE guarantees the result
# stays a data frame.
data <- data[!dup_rows, , drop = FALSE]
# e. Cleaning irregular gender values: start by listing every distinct raw
#    response in the free-text gender column.
unique(data[["gender"]])
## [1] "Male"
## [2] "male"
## [3] "Male "
## [4] "Female"
## [5] "M"
## [6] "female"
## [7] "m"
## [8] "I identify as female."
## [9] "female "
## [10] "Bigender"
## [11] "non-binary"
## [12] "Female assigned at birth "
## [13] "F"
## [14] "Woman"
## [15] "man"
## [16] "fm"
## [17] "f"
## [18] "Cis female "
## [19] "Transitioned, M2F"
## [20] "Genderfluid (born female)"
## [21] "Other/Transfeminine"
## [22] "Female or Multi-Gender Femme"
## [23] "Female "
## [24] "woman"
## [25] "female/woman"
## [26] "Cis male"
## [27] "Male."
## [28] "Androgynous"
## [29] "male 9:1 female, roughly"
## [30] "N/A"
## [31] "Male (cis)"
## [32] "Other"
## [33] "nb masculine"
## [34] "Cisgender Female"
## [35] "Man"
## [36] "Sex is male"
## [37] "none of your business"
## [38] "genderqueer"
## [39] "cis male"
## [40] "Human"
## [41] "Genderfluid"
## [42] "Enby"
## [43] "Malr"
## [44] "genderqueer woman"
## [45] "mtf"
## [46] "Queer"
## [47] "Agender"
## [48] "Dude"
## [49] "Fluid"
## [50] "I'm a man why didn't you make this a drop down question. You should of asked sex? And I would of answered yes please. Seriously how much text can this take? "
## [51] "mail"
## [52] "M|"
## [53] "Male/genderqueer"
## [54] "fem"
## [55] "Nonbinary"
## [56] "male "
## [57] "human"
## [58] "Female (props for making this a freeform field, though)"
## [59] " Female"
## [60] "Unicorn"
## [61] "Cis Male"
## [62] "Male (trans, FtM)"
## [63] "Cis-woman"
## [64] "Genderqueer"
## [65] "cisdude"
## [66] "Genderflux demi-girl"
## [67] "female-bodied; no feelings about gender"
## [68] "cis man"
## [69] ""
## [70] "AFAB"
## [71] "Transgender woman"
## [72] "MALE"
# Frequency of each raw free-text gender response before recoding.
table(data$gender)
# Recode gender to a binary indicator: 1 = female, 0 = male, NA = anything else
# (non-binary, ambiguous, blank or joke answers).
#
# Responses are matched as WHOLE values on a normalised copy (trimmed and
# lower-cased). The previous substring replacement turned an answer such as
# "fm" into the invalid code 10 and discarded clear answers such as "Woman",
# "man" or "Cis female" as NA.
gender_norm <- tolower(trimws(as.character(data$gender)))
# Unambiguous spellings only; extend these vectors if more variants should count.
female_labels <- c(
  "f", "female", "woman", "female/woman", "cis female", "cis-woman",
  "cisgender female", "i identify as female."
)
male_labels <- c(
  "m", "male", "male.", "man", "cis male", "cis man", "male (cis)",
  "sex is male"
)
# Start from NA so that every unmatched response stays missing.
gender_code <- rep(NA_real_, length(gender_norm))
gender_code[gender_norm %in% female_labels] <- 1
gender_code[gender_norm %in% male_labels] <- 0
data$gender <- gender_code
# Print the recoded column for a visual check.
data$gender
## [1] 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 NA 0 1 0 0 0 0 0
## [25] 0 0 0 0 1 NA NA 0 0 0 0 0 0 0 0 0 0 0 NA 0 0 0 0 0
## [49] 0 1 0 0 0 0 0 NA 0 0 1 0 NA 0 0 0 0 0 1 0 0 0 0 0
## [73] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0
## [97] 1 0 0 1 0 0 0 10 0 0 1 1 0 0 0 0 0 0 1 0 0 1 1 0
## [121] 0 1 0 NA 1 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 NA 1 1 0
## [145] 1 0 0 0 NA 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0
## [169] 0 0 0 0 0 0 0 0 1 1 0 0 0 0 NA 0 0 0 0 1 0 0 0 0
## [193] 0 0 1 0 0 0 0 0 1 0 0 NA 0 0 0 0 0 0 0 0 1 0 0 0
## [217] 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0
## [241] 0 0 1 0 0 1 0 NA 0 0 1 1 1 0 0 0 0 0 1 0 0 1 0 1
## [265] 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 NA NA 0 1 0 0 0 1 0
## [289] 0 0 0 NA 0 0 1 0 0 1 1 0 NA 0 NA 0 1 0 0 1 0 0 1 0
## [313] 0 0 0 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 0 0 0 0
## [337] 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0
## [361] 0 1 0 0 0 1 0 1 NA 1 1 0 NA 0 0 0 1 1 1 1 0 0 1 0
## [385] 1 0 0 1 1 0 1 1 0 0 0 0 1 NA 0 1 0 0 0 1 0 0 0 0
## [409] 0 0 0 0 1 0 0 0 1 1 0 0 NA 1 0 0 0 0 0 1 0 NA 0 0
## [433] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0
## [457] 0 1 0 0 0 1 0 0 NA 0 0 1 0 0 0 0 0 0 1 1 0 NA 0 0
## [481] 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 NA 0
## [505] 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1
## [529] 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0
## [553] 0 0 0 0 0 0 1 1 0 NA 0 0 0 0 1 0 0 0 0 1 0 0 1 0
## [577] 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0
## [601] 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 NA 0 0 NA 0 0 1
## [625] NA 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 NA 0 0 0 1
## [649] NA 0 0 NA 0 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 1 1 1
## [673] 0 NA 0 0 1 0 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0
## [697] 1 1 NA 1 1 0 0 0 0 0 0 0 0 0 1 NA 1 0 0 1 0 0 1 0
## [721] 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 NA 1
## [745] 1 0 0 0 0 0 NA 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 0 0
## [769] 0 1 0 0 0 NA 1 0 0 0 0 1 1 0 1 1 0 0 NA NA 0 0 0 1
## [793] 0 0 0 1 0 NA 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## [817] 1 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0
## [841] NA 0 0 0 0 0 NA 0 1 0 0 0 0 0 0 NA 1 1 0 0 0 1 0 0
## [865] 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 1
## [889] 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0
## [913] 1 0 1 0 0 0 1 0 0 0 0 0 0 NA 1 1 0 0 NA 0 0 1 0 0
## [937] 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0
## [961] 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0
## [985] 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 NA 0 0 0
## [1009] 1 1 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1
## [1033] 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1
## [1057] 0 0 0 0 0 0 NA 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NA 0
## [1081] 0 0 1 1 0 1 0 NA 0 0 NA 1 0 0 0 0 0 0 1 0 0 1 NA 0
## [1105] 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0
## [1129] 1 0 0 0 1 0 1 0 1 1 0 0 NA 0 0 0 NA 0 0 0 0 1 0 0
## [1153] 0 0 1 0 0 1 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 NA 0
## [1177] 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0
## [1201] 0 0 1 0 0 0 0 1 1 1 0 0 1 0 1 1 0 0 0 0 1 0 0 0
## [1225] 1 0 0 NA 0 0 0 0 0 NA 0 NA NA 0 0 0 0 0 0 0 0 0 0 0
## [1249] 1 0 NA 1 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0
## [1273] 0 NA 0 0 NA 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 NA 0 1 0
## [1297] 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 NA NA 1 0 1 1 0 1 0
## [1321] 1 1 NA 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0
## [1345] 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 1 0 0
## [1369] 0 0 0 0 1 0 NA 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 0
## [1393] 1 NA 0 0 0 0 0 0 1 0 0 0 0 0 NA 0 0 0 0 1 0 1 1 1
## [1417] 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 NA
# Split respondents by employment type: self-employed rows go to df_self and
# company-employed rows stay in data.
df_self <- filter(data, self_employed == 1)
data <- filter(data, self_employed == 0)

# Columns removed from the company-employed frame: duplicated information or
# questions that only apply to self-employed respondents.
drop_from_company <- c(
  "mh_local_online_resources", "mh_dx_reveal_contacts",
  "mh_dx_reveal_contacts_impact", "mh_dx_reveal_coworkers",
  "mh_dx_reveal_coworkers_impact", "mh_productivity",
  "mh_productivity_percent", "mh_maybe_specify", "mh_dx_specify",
  "self_employed"
)
data <- select(data, -all_of(drop_from_company))

# Columns removed from the self-employed frame: duplicated information or
# questions that only apply to company-employed respondents.
drop_from_self <- c(
  "employees", "mh_benefits", "mh_benefit_options", "mh_discussion",
  "mh_resources", "mh_anonymity", "mh_medical_leave",
  "mh_discussion_negative", "ph_discussion_negative",
  "mh_discussion_coworkers", "mh_discussion_supervisor", "mh_ph_serious",
  "mh_coworker_consequences", "mh_maybe_specify", "mh_dx_specify",
  "self_employed"
)
df_self <- select(df_self, -all_of(drop_from_self))
# Total number of missing cells in the company-employed and self-employed frames.
sum(is.na(data))
## [1] 2088
sum(is.na(df_self))
## [1] 586
# Identify the rows with NAs (company employed).
# complete.cases() yields one logical per ROW, so it lines up with rownames().
# The previous apply(data, 2, anyNA) tested COLUMNS; that shorter logical vector
# was silently recycled along the row names and listed unrelated rows.
rownames(data)[!complete.cases(data)]
## [1] "3" "16" "30" "47" "56" "69" "83" "100" "109" "122"
## [11] "136" "153" "162" "175" "189" "206" "215" "228" "242" "259"
## [21] "268" "281" "295" "312" "321" "334" "348" "365" "374" "387"
## [31] "401" "418" "427" "440" "454" "471" "480" "493" "507" "524"
## [41] "533" "546" "560" "577" "586" "599" "613" "630" "639" "652"
## [51] "666" "683" "692" "705" "719" "736" "745" "758" "772" "789"
## [61] "798" "811" "825" "842" "851" "864" "878" "895" "904" "917"
## [71] "931" "948" "957" "970" "984" "1001" "1010" "1023" "1037" "1054"
## [81] "1063" "1076" "1090" "1107" "1116" "1129" "1143"
# Identify the rows with NAs (self employed).
# complete.cases() is evaluated per ROW; apply(df_self, 2, anyNA) was evaluated
# per column and then recycled over the row names, so it reported the wrong rows.
rownames(df_self)[!complete.cases(df_self)]
## [1] "1" "2" "41" "48" "49" "88" "95" "96" "135" "142" "143" "182"
## [13] "189" "190" "229" "236" "237" "276" "283" "284"
# Use colMeans() to compute the percentage of missing values in each column,
# then remove the columns where more than 50% of the values are missing.
#
# Columns are selected by the positions to KEEP instead of by negative
# indexing: when no column exceeds the threshold, which() returns integer(0)
# and `df[, -integer(0)]` selects zero columns, silently emptying the frame.
# drop = FALSE keeps the result a data frame even if one column survives.

## -- data (company employed)
# percentage of missing values for each column
missing_pct <- colMeans(is.na(data)) * 100
# positions of the columns with more than 50% missing values
cols_to_drop <- which(missing_pct > 50)
# keep every column that is not flagged
data_clean <- data[, setdiff(seq_along(data), cols_to_drop), drop = FALSE]

## -- df_self (self employed)
# percentage of missing values for each column
missing_pct <- colMeans(is.na(df_self)) * 100
# positions of the columns with more than 50% missing values
cols_to_drop <- which(missing_pct > 50)
# keep every column that is not flagged
df_self_clean <- df_self[, setdiff(seq_along(df_self), cols_to_drop), drop = FALSE]
# Count the missing values remaining in each column of the company-employed
# frame after the sparse columns were dropped.
data_clean %>%
  is.na() %>%
  colSums()
## employees
## 0
## company_role
## 0
## mh_benefits
## 0
## mh_benefit_options
## 0
## mh_discussion
## 0
## mh_resources
## 0
## mh_anonymity
## 0
## mh_medical_leave
## 0
## mh_discussion_negative
## 0
## ph_discussion_negative
## 0
## mh_discussion_coworkers
## 0
## mh_discussion_supervisor
## 0
## mh_ph_serious
## 0
## mh_coworker_consequences
## 0
## previous_employers
## 0
## previous_employers_mhbenefits
## 0
## previous_employers_mhbenefits_aware
## 0
## previous_employers_mhbenefits_discuss
## 0
## previous_employers_resources
## 0
## previous_employers_anonymtity
## 0
## previous_employers_mh_discuss_impact
## 0
## previous_employers_ph_discuss_impact
## 0
## previous_coworkers_mh_discuss
## 0
## previous_supervisor_mh_discuss
## 0
## previous_employers_mh_ph_serious
## 0
## previous_employers_mh_coworkers_consequences
## 0
## ph_interview
## 0
## ph_interview_why
## 1
## mh_interview
## 0
## mh_interview_why
## 0
## mh_id_career_impact
## 0
## mh_id_coworkers_impact
## 0
## mh_family_friends
## 0
## mh_unsupportive_workplace
## 0
## mh_other_impact
## 0
## mi_family
## 0
## mh_past
## 0
## mh_current
## 0
## mh_current_specify
## 0
## mh_dx
## 0
## mh_treatment
## 0
## mh_interfere_treated
## 0
## mh_interfere_not_treated
## 0
## age
## 0
## gender
## 58
## resident_country
## 0
## resident_state_territory
## 0
## work_country
## 0
## work_state_territory
## 0
## work_position
## 0
## work_remote
## 0
# Same per-column count of missing values for the self-employed frame.
df_self_clean %>%
  is.na() %>%
  colSums()
## Do.you.have.medical.coverage..private.insurance.or.state.provided..which.includes.treatment.of.Â.mental.health.issues.
## 0
## mh_local_online_resources
## 0
## mh_dx_reveal_contacts
## 0
## mh_dx_reveal_contacts_impact
## 0
## mh_dx_reveal_coworkers
## 0
## mh_dx_reveal_coworkers_impact
## 0
## mh_productivity
## 0
## mh_productivity_percent
## 0
## previous_employers
## 0
## previous_employers_mhbenefits
## 0
## previous_employers_mhbenefits_aware
## 0
## previous_employers_mhbenefits_discuss
## 0
## previous_employers_resources
## 0
## previous_employers_anonymtity
## 0
## previous_employers_mh_discuss_impact
## 0
## previous_employers_ph_discuss_impact
## 0
## previous_coworkers_mh_discuss
## 0
## previous_supervisor_mh_discuss
## 0
## previous_employers_mh_ph_serious
## 0
## previous_employers_mh_coworkers_consequences
## 0
## ph_interview
## 0
## ph_interview_why
## 0
## mh_interview
## 0
## mh_interview_why
## 0
## mh_id_career_impact
## 0
## mh_id_coworkers_impact
## 0
## mh_family_friends
## 0
## mh_unsupportive_workplace
## 0
## mh_other_impact
## 0
## mi_family
## 0
## mh_past
## 0
## mh_current
## 0
## mh_current_specify
## 0
## mh_dx
## 0
## mh_treatment
## 0
## mh_interfere_treated
## 0
## mh_interfere_not_treated
## 0
## age
## 0
## gender
## 12
## resident_country
## 0
## resident_state_territory
## 0
## work_country
## 0
## work_state_territory
## 0
## work_position
## 0
## work_remote
## 0
# Drop every row that still contains at least one NA, in both frames.
df_self_clean <- df_self_clean %>% na.omit()
data_clean <- data_clean %>% na.omit()
# Final check: both totals should now be zero.
df_self_clean %>%
  is.na() %>%
  sum()
## [1] 0
data_clean %>%
  is.na() %>%
  sum()
## [1] 0
# Write both cleaned data frames to CSV files using readr.
library(readr)
## Warning: package 'readr' was built under R version 4.1.3
## Warning: replacing previous import 'ellipsis::check_dots_unnamed' by
## 'rlang::check_dots_unnamed' when loading 'hms'
## Warning: replacing previous import 'ellipsis::check_dots_used' by
## 'rlang::check_dots_used' when loading 'hms'
## Warning: replacing previous import 'ellipsis::check_dots_empty' by
## 'rlang::check_dots_empty' when loading 'hms'
# Destination files (absolute paths on the author's machine).
self_employed_csv <- "C:/Users/User/Documents/UM Sem 2/WQD 7006 ML FOR DS/mental-health-in-tech-2016-self-employed-clean.csv"
company_csv <- "C:/Users/User/Documents/UM Sem 2/WQD 7006 ML FOR DS/mental-health-in-tech-2016-clean.csv"
write_csv(df_self_clean, self_employed_csv)
write_csv(data_clean, company_csv)