Start Date: 1 Nov 2024
Report Date: 30 November 2024
Welcome to the contest for November! The prompts can be found here, and the data is available here. The contest will run until the last day of November (November 30th), latest time zone on earth. The contest entry form is available here
knitr::opts_chunk$set(echo = T,message = F,warning = F)
setwd("C:/Users/alanh/Documents/R/Psi_Chi_R")
library(tidyverse)
data=read_csv('data.csv')
#access CSV directly through URL; local copy will be overwritten with a different month's data
#data= read.csv('https://osf.io/download/7uc9x/')
## EDA
names(data) = make.names(colnames(data))
skimr::skim(data)
Name | data |
Number of rows | 374 |
Number of columns | 12 |
_______________________ | |
Column type frequency: | |
character | 4 |
numeric | 8 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
Gender | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
Occupation | 0 | 1 | 5 | 20 | 0 | 11 | 0 |
Blood.Pressure | 0 | 1 | 6 | 6 | 0 | 25 | 0 |
Sleep.Disorder | 0 | 1 | 4 | 11 | 0 | 3 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Person.ID | 0 | 1 | 187.50 | 108.11 | 1.0 | 94.25 | 187.5 | 280.75 | 374.0 | ▇▇▇▇▇ |
Age | 0 | 1 | 42.18 | 8.67 | 27.0 | 35.25 | 43.0 | 50.00 | 59.0 | ▆▆▇▃▅ |
Sleep.Duration | 0 | 1 | 7.13 | 0.80 | 5.8 | 6.40 | 7.2 | 7.80 | 8.5 | ▇▆▇▇▆ |
Quality.of.Sleep | 0 | 1 | 7.31 | 1.20 | 4.0 | 6.00 | 7.0 | 8.00 | 9.0 | ▁▇▆▇▅ |
Physical.Activity.Level | 0 | 1 | 59.17 | 20.83 | 30.0 | 45.00 | 60.0 | 75.00 | 90.0 | ▇▇▇▇▇ |
Stress.Level | 0 | 1 | 5.52 | 3.12 | 3.0 | 4.00 | 5.0 | 7.00 | 55.0 | ▇▁▁▁▁ |
Heart.Rate | 0 | 1 | 70.17 | 4.14 | 65.0 | 68.00 | 70.0 | 72.00 | 86.0 | ▇▇▂▁▁ |
Daily.Steps | 0 | 1 | 6816.84 | 1617.92 | 3000.0 | 5600.00 | 7000.0 | 8000.00 | 10000.0 | ▁▅▇▆▂ |
Level 1:
Write a script to create a visualization showing the distribution of the ‘Quality of Sleep’ variable. Is ‘quality of sleep’ normally distributed?
#no, Quality of Sleep isn't normally distributed.
hist(data$Quality.of.Sleep)
Write a script to identify any outliers present in the ‘Stress Level’ variable. Are there any outliers present in ‘Stress Level?’
#Yes, there are outliers.
boxplot(data$Stress.Level)
title(ylab='Stress Level')
data %>%
ggplot(aes(y=Stress.Level,x=Person.ID))+
geom_boxplot(outlier.shape= NA)+
geom_jitter()+
theme_bw()
Level 2: If you identify any outliers present in the ‘Stress Level’ variable, write a script that removes the outliers.
data %>%
group_by(Stress.Level) %>%
count() %>%
arrange(desc(n))
data1 = data %>%
filter(Stress.Level < 55)
boxplot(data1$Stress.Level)
title(main = 'no outliers',ylab='Stress Level')
data1 %>%
ggplot(aes(y=Stress.Level,x=Person.ID))+
geom_boxplot(outlier.shape= NA)+
geom_jitter()+
theme_bw()
Find the min, median, max, average, and standard deviation of ‘Stress Level,’ ‘Quality of Sleep,’ and ‘Physical Activity Level.’
desc_stats = function(x){
c(min = min(x,na.rm=T),
median = median(x,na.rm=T),
median = median(x,na.rm=T),
max = max(x,na.rm=T),
mean = mean(x,na.rm=T),
sd = sd(x,na.rm=T))
}
desc_list = list("Stress.Level","Quality.of.Sleep","Physical.Activity.Level")
results= list()
for (x in desc_list){
results[[x]]=desc_stats(data1[[x]])
}
print(results)
## $Stress.Level
## min median median max mean sd
## 3.000000 5.000000 5.000000 8.000000 5.386059 1.776798
##
## $Quality.of.Sleep
## min median median max mean sd
## 4.000000 7.000000 7.000000 9.000000 7.313673 1.198454
##
## $Physical.Activity.Level
## min median median max mean sd
## 30.00000 60.00000 60.00000 90.00000 59.22252 20.83502
## using PURRR; stick with base
# results1 = map(desc_list,~desc_stats(data1[[.x]]))
#
# names(results1) = desc_list
#
# results1
Level 3: Plot the relationship between ‘Age’ and ‘Quality of Sleep.’
data1 %>%
ggplot(aes(x=Age,y=Quality.of.Sleep,col=Quality.of.Sleep))+
geom_point()+
geom_jitter()+
theme_bw()+
labs(col=' ',y='Sleep Quality',
title = 'Sleeping Beauty')+
theme(plot.title = element_text(hjust = .5))
data1 %>%
ggplot(aes(y=Occupation,x=rev(Quality.of.Sleep),fill=Gender))+
geom_col()+
theme_bw()+
labs(fill=' ',x='Sleep Quality',
title = 'Sleeping "On the Job"',
y='Occupation')+
theme(plot.title = element_text(hjust = .5))
data1 %>%
ggplot(aes(x=Age,y=Quality.of.Sleep,fill=Sleep.Disorder))+
geom_col()+
theme_bw()+
labs(fill=' ',y='Sleep Quality',
title = 'Sleeping Differently')+
theme(plot.title = element_text(hjust = .5))+
facet_grid(Gender~Sleep.Disorder)
Create a table to show the average ‘Stress Level’ by ‘Occupation.’
tab1 = data1 %>%
group_by(Occupation) %>%
summarise(Avg_Stress = mean(Stress.Level,na.rm = T)) %>%
arrange(desc(Avg_Stress))
print(tab1)
## # A tibble: 11 × 2
## Occupation Avg_Stress
## <chr> <dbl>
## 1 Sales Representative 8
## 2 Salesperson 7
## 3 Scientist 7
## 4 Doctor 6.73
## 5 Software Engineer 6
## 6 Nurse 5.55
## 7 Lawyer 5.06
## 8 Manager 5
## 9 Accountant 4.59
## 10 Teacher 4.51
## 11 Engineer 3.89
tab1 %>%
ggplot(aes(y=fct_inorder(Occupation),x=Avg_Stress,fill=Avg_Stress))+
geom_col()+
theme_bw()+
theme(plot.title = element_text(hjust = .5))+
labs(title="Who's the 'Meanest' (Stress) of Them All?",x='Mean Stress Levels',fill=" ",y='Occupation')
# dollar format
# dollars = function(x) {
# paste0("$", format(x,big.mark= ",",scientific=F))
# }
Level 4: Is there a relationship between ‘Quality of Sleep’ and ‘Stress Level?’
shapiro.test(data1$Quality.of.Sleep)
##
## Shapiro-Wilk normality test
##
## data: data1$Quality.of.Sleep
## W = 0.89346, p-value = 1.845e-15
shapiro.test(data1$Stress.Level)
##
## Shapiro-Wilk normality test
##
## data: data1$Stress.Level
## W = 0.89051, p-value = 1.098e-15
hist(data1$Quality.of.Sleep)
hist(data1$Stress.Level)
cor.test(data1$Quality.of.Sleep,data1$Stress.Level,method = "kendall")
##
## Kendall's rank correlation tau
##
## data: data1$Quality.of.Sleep and data1$Stress.Level
## z = -19.857, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## -0.8263603
#yes, there's a relationship