# Session setup and data import.
# The statements below were fused onto a single line that began with `#`,
# so R treated the whole line as a comment and none of them ran.

# Clear R environment
# NOTE(review): rm(list = ls()) only removes objects from the global
# environment; it does not detach packages or reset options. Prefer running
# the script in a fresh R session.
rm(list = ls())

# Working directory
# NOTE(review): this absolute path is machine-specific; the relative file name
# passed to read_csv() below is resolved against it.
setwd("C:/Users/USER/Desktop/Portfolio")

# Import dataset
library(readr)
Anxiety <- read_csv("Anxiety.csv")
Rows: 12000 Columns: 20
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): Gender, Occupation, Smoking, Family History of Anxiety, Dizziness,...
dbl (13): ID, Age, Sleep Hours, Physical Activity (hrs/week), Caffeine Intak...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
**Data cleaning**
# Convert the categorical columns of `Anxiety` to factors so that the
# frequency tables, cross-tabulations and plots treat them as categories.
# The seven assignments were fused onto one line with no separators, which is
# a syntax error; listing the columns once and converting them in a single
# step avoids repeating each column name twice.
factor_cols <- c(
  "Gender",
  "Occupation",
  "Smoking",
  "Family History of Anxiety",
  "Dizziness",
  "Medication",
  "Recent Major Life Event"
)
Anxiety[factor_cols] <- lapply(Anxiety[factor_cols], as.factor)
Exploratory Data Analysis for categorical variables
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ purrr 1.0.2
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.0 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(summarytools)
Attaching package: 'summarytools'
The following object is masked from 'package:tibble':
view
Univariate Analysis for categorical variables
Tables
# Frequency table of Gender: counts, percentages and cumulative percentages.
# report.nas = FALSE is passed so the table omits the missing-value reporting.
freq(Anxiety$Gender,report.nas =FALSE)
Frequencies
Anxiety$Gender
Type: Factor
Freq % % Cum.
------------ ------- -------- --------
Female 5809 48.41 48.41
Male 5723 47.69 96.10
Other 468 3.90 100.00
Total 12000 100.00 100.00
# Frequency table of Family History of Anxiety: counts, percentages and
# cumulative percentages.
# report.nas = FALSE is passed so the table omits the missing-value reporting.
freq(Anxiety$`Family History of Anxiety`,report.nas =FALSE)
Frequencies
Anxiety$`Family History of Anxiety`
Type: Factor
Freq % % Cum.
----------- ------- -------- --------
No 7179 59.82 59.82
Yes 4821 40.17 100.00
Total 12000 100.00 100.00
# Bar chart of the number of respondents in each level of
# Family History of Anxiety.
Counts <- table(Anxiety$`Family History of Anxiety`)
barplot(
  Counts,
  main = "Counts of Family History Anxiety",
  xlab = "Family History of Anxiety",
  ylab = "frequency",
  # The legend labels are the level names of the table being plotted. The
  # original passed rownames(count), but `count` is not the table built here,
  # so the legend was not derived from the data.
  legend.text = names(Counts),
  col = c("red", "blue")
)
# Bar chart of the number of respondents in each level of Dizziness.
# The original stored the Dizziness table in `counts` but then plotted
# `Counts` (the family-history table), so the chart titled "Dizziness" showed
# the wrong variable. The table built here is now the one that is plotted.
dizziness_counts <- table(Anxiety$Dizziness)
barplot(
  dizziness_counts,
  main = "Counts of Dizziness",
  xlab = "Dizziness",
  ylab = "frequency",
  # Legend labels come from the plotted table, not from the unrelated `count`.
  legend.text = names(dizziness_counts),
  col = c("red", "blue")
)
# Bar chart of the number of respondents in each level of Medication.
# The original assigned the Medication table to the misspelled name `ounts`
# and then plotted `Counts` (the family-history table), so the chart titled
# "Medication" showed the wrong variable. The table built here is now the one
# that is plotted.
medication_counts <- table(Anxiety$Medication)
barplot(
  medication_counts,
  main = "counts of Medication",
  xlab = "Medication",
  ylab = "frequency",
  # Legend labels come from the plotted table, not from the unrelated `count`.
  legend.text = names(medication_counts),
  col = c("red", "blue")
)
Bivariate Analysis
Association between categorical variables using the chi-square test of independence.
Outcome variable: Family history of anxiety
# Cross-tabulation of Gender (rows) by Family History of Anxiety (columns).
# chisq = TRUE adds the chi-square statistic, degrees of freedom and p-value
# beneath the table; headings = FALSE leaves out the heading section.
ctable(x=Anxiety$Gender,y=Anxiety$`Family History of Anxiety`,chisq =TRUE,headings=FALSE)
-------- --------------------------- -------------- -------------- ----------------
Family History of Anxiety No Yes Total
Gender
Female 3498 (60.2%) 2311 (39.8%) 5809 (100.0%)
Male 3426 (59.9%) 2297 (40.1%) 5723 (100.0%)
Other 255 (54.5%) 213 (45.5%) 468 (100.0%)
Total 7179 (59.8%) 4821 (40.2%) 12000 (100.0%)
-------- --------------------------- -------------- -------------- ----------------
----------------------------
Chi.squared df p.value
------------- ---- ---------
5.9228 2 0.0517
----------------------------
# Cross-tabulation of Occupation (rows) by Family History of Anxiety (columns).
# chisq = TRUE adds the chi-square statistic, degrees of freedom and p-value
# beneath the table; headings = FALSE leaves out the heading section.
ctable(x=Anxiety$Occupation,y=Anxiety$`Family History of Anxiety`,chisq =TRUE,headings=FALSE)
------------ --------------------------- -------------- -------------- ----------------
Family History of Anxiety No Yes Total
Occupation
Doctor 1186 (59.2%) 818 (40.8%) 2004 (100.0%)
Engineer 1178 (60.3%) 775 (39.7%) 1953 (100.0%)
Other 1184 (60.1%) 787 (39.9%) 1971 (100.0%)
Student 1172 (60.0%) 781 (40.0%) 1953 (100.0%)
Teacher 1164 (58.8%) 816 (41.2%) 1980 (100.0%)
Unemployed 1295 (60.5%) 844 (39.5%) 2139 (100.0%)
Total 7179 (59.8%) 4821 (40.2%) 12000 (100.0%)
------------ --------------------------- -------------- -------------- ----------------
----------------------------
Chi.squared df p.value
------------- ---- ---------
1.9637 5 0.8541
----------------------------
# Cross-tabulation of Smoking (rows) by Family History of Anxiety (columns).
# chisq = TRUE adds the chi-square statistic, degrees of freedom and p-value
# beneath the table; headings = FALSE leaves out the heading section.
ctable(x=Anxiety$Smoking,y=Anxiety$`Family History of Anxiety`,chisq =TRUE,headings=FALSE)
--------- --------------------------- -------------- -------------- ----------------
Family History of Anxiety No Yes Total
Smoking
No 5030 (59.8%) 3387 (40.2%) 8417 (100.0%)
Yes 2149 (60.0%) 1434 (40.0%) 3583 (100.0%)
Total 7179 (59.8%) 4821 (40.2%) 12000 (100.0%)
--------- --------------------------- -------------- -------------- ----------------
----------------------------
Chi.squared df p.value
------------- ---- ---------
0.0409 1 0.8397
----------------------------
# Cross-tabulation of Dizziness (rows) by Family History of Anxiety (columns).
# chisq = TRUE adds the chi-square statistic, degrees of freedom and p-value
# beneath the table; headings = FALSE leaves out the heading section.
ctable(x=Anxiety$Dizziness,y=Anxiety$`Family History of Anxiety`,chisq =TRUE,headings=FALSE)
----------- --------------------------- -------------- -------------- ----------------
Family History of Anxiety No Yes Total
Dizziness
No 4995 (59.4%) 3411 (40.6%) 8406 (100.0%)
Yes 2184 (60.8%) 1410 (39.2%) 3594 (100.0%)
Total 7179 (59.8%) 4821 (40.2%) 12000 (100.0%)
----------- --------------------------- -------------- -------------- ----------------
----------------------------
Chi.squared df p.value
------------- ---- ---------
1.8424 1 0.1747
----------------------------
# Cross-tabulation of Medication (rows) by Family History of Anxiety (columns).
# chisq = TRUE adds the chi-square statistic, degrees of freedom and p-value
# beneath the table; headings = FALSE leaves out the heading section.
ctable(x=Anxiety$Medication,y=Anxiety$`Family History of Anxiety`,chisq =TRUE,headings=FALSE)
------------ --------------------------- -------------- -------------- ----------------
Family History of Anxiety No Yes Total
Medication
No 5764 (60.0%) 3841 (40.0%) 9605 (100.0%)
Yes 1415 (59.1%) 980 (40.9%) 2395 (100.0%)
Total 7179 (59.8%) 4821 (40.2%) 12000 (100.0%)
------------ --------------------------- -------------- -------------- ----------------
----------------------------
Chi.squared df p.value
------------- ---- ---------
0.6502 1 0.42
----------------------------
Exploratory Data Analysis for continuous variables
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
EDA between categorical variables and continuous variables
# Box plots of Age for each Gender, drawn in one panel per level of
# Family History of Anxiety. The fill colour repeats the x-axis grouping, so
# the legend is switched off.
ggplot(data = Anxiety, mapping = aes(x = Gender, y = Age, fill = Gender)) +
  geom_boxplot(show.legend = FALSE) +
  facet_wrap(vars(`Family History of Anxiety`)) +
  ggtitle("Age and gender distribution among families with history of anxiety") +
  ylab("Age in years") +
  theme_bw()
# Box plots of Sleep Hours for each Gender, drawn in one panel per level of
# Family History of Anxiety. The fill colour repeats the x-axis grouping, so
# the legend is switched off.
ggplot(
  data = Anxiety,
  mapping = aes(x = Gender, y = `Sleep Hours`, fill = Gender)
) +
  geom_boxplot(show.legend = FALSE) +
  facet_wrap(vars(`Family History of Anxiety`)) +
  ggtitle("Sleeping hours and gender distribution among families with history of anxiety") +
  ylab("Sleeping hours") +
  theme_bw()