Capstone Project Code

R Markdown

This is the R Markdown document where you should write all the code you use for your final capstone project.

As a refresher, for help using R Markdown you can refer to Canvas:

https://canvas.cornell.edu/courses/75000/pages/watch-assembling-a-document-with-r-markdown?module_item_id=3022666.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

sldata <- read.csv("Health_Sleep_Statistics.csv")

Setting Variables

PAL <- sldata$Physical.Activity.Level
S_D <- sldata$Sleep.Disorders
SQ <- sldata$Sleep.Quality

Setting up the Side-by-Side box plot using Tidyverse

bplot <- ggplot(sldata, aes(x = Physical.Activity.Level, y = Sleep.Quality, fill = Sleep.Disorders)) + geom_boxplot()
bplot

Boxplot without tidyverse

blt <- boxplot(SQ ~ PAL+S_D)

blt

## $stats
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    8   NA    6   NA    4   NA
## [2,]    8   NA    6   NA    4   NA
## [3,]    9   NA    7   NA    5   NA
## [4,]    9   NA    8   NA    5   NA
## [5,]    9   NA    9   NA    5   NA
## attr(,"class")
##   high.no 
## "integer" 
## 
## $n
## [1] 36  0 38  0 26  0
## 
## $conf
##          [,1] [,2]    [,3] [,4]     [,5] [,6]
## [1,] 8.736667   NA 6.48738   NA 4.690137   NA
## [2,] 9.263333   NA 7.51262   NA 5.309863   NA
## 
## $out
## [1] 7
## 
## $group
## [1] 5
## 
## $names
## [1] "high.no"    "low.no"     "medium.no"  "high.yes"   "low.yes"   
## [6] "medium.yes"

Summaries for each variable

#Summary for Physical Activity Level
pPAL <- prop.table(table(PAL))


#Summary for Sleep Disorder
pS_D <- prop.table(table(S_D))

S_Ddata <- subset(sldata, Physical.Activity.Level == "low")

Uncertinaty Calculation using permutation

Null: Sleep Quality Level distribution of each Physical Activity Level are the same

Alternative: Sleep Quality Level distribution of each Physical Activity Level are different.

First we make the boxplot without the confounding variable of the orignal dataset that we can compare the permutated boxplot to.

SQuality.hg <- sldata$Sleep.Quality[sldata$Physical.Activity.Level == "high"]
SQuality.md <- sldata$Sleep.Quality[sldata$Physical.Activity.Level == "medium"]
SQuality.lw <- sldata$Sleep.Quality[sldata$Physical.Activity.Level == "low"]

#Create a boxplot of each Physical Activty Level

boxplot(SQuality.lw, SQuality.md, SQuality.hg, names = c("Low", "Medium", "High"), 
        ylab = 'Sleep Quality Score',
        main = 'Sleep Quality Score for each Physical Activity Level',
        col = ecBlack)

#Computing the mean of each group
group_means <- c(mean(SQuality.lw, na.rm=TRUE),
  mean(SQuality.md, na.rm=TRUE),
  mean(SQuality.hg, na.rm=TRUE))
# Computing the observed statistic
obs_stat = max(group_means) - min(group_means)
obs_stat

## [1] 3.940171

We shuffle the Sleep Quality data throughout all obeservations.

#Permutation
set.seed(1)
P = 10000
store_mean_diff = rep(0, P)

for (n in 1:P){

  dat.perm <- sldata
  perm.id = sample(1:100,size = 100, replace = FALSE)
  dat.perm$Sleep.Quality = sldata$Sleep.Quality[perm.id] 
  SQuality.hg.perm <- dat.perm$Sleep.Quality[dat.perm$Physical.Activity.Level == "high"]
  SQuality.md.perm <- dat.perm$Sleep.Quality[dat.perm$Physical.Activity.Level == "medium"]
  SQuality.lw.perm <- dat.perm$Sleep.Quality[dat.perm$Physical.Activity.Level == "low"]
  group_means.perm <- c(mean(SQuality.hg.perm),
                        mean(SQuality.md.perm),
                        mean(SQuality.lw.perm))
store_mean_diff[n] <- max(group_means.perm) - min (group_means.perm)
}


#Create Boxplot of one permutated Data

boxplot(SQuality.lw.perm, SQuality.md.perm, SQuality.hg.perm,names = c("Low", "Medium", "High"), 
        ylab = 'Sleep Quality Score',
        main = 'Sleep Quality Score for each Physical Activity Level',
        col = ecBlack)

#Create Histogram of all permutated Data:

hist(store_mean_diff, breaks = 20, freq = FALSE, col = ecBlack, border = 'white',
     xlab = 'Mean Diff of Sleep Quality Scores',
     main = 'Histogram of Sleep Quality Score Diff (Permuted Data)',
     xlim = c(0,4))
abline(v = obs_stat, col = skyBlue, lwd = 3)

#Calculate P-value
mean(abs(store_mean_diff) >= abs(obs_stat))

## [1] 0

Univariate Analysis

#Physical Activity Level Sumarization
P.tl <-table(factor(PAL, levels = c("low", "medium", "high")))
P.tl

## 
##    low medium   high 
##     26     38     36

P.tl.f <-prop.table(P.tl)
P.tl.f

## 
##    low medium   high 
##   0.26   0.38   0.36

#Physical Activity Level Bar plot
barplot(P.tl, 
        main = "Count of each Physical Activity Level",
        col = crimson,
        xlab = "Physical Activty Level",
        ylab = "Count",
        cex.main = 1,
        axes = FALSE)

axis(side = 2, at = seq(0, 40, by = 5), labels = seq(0, 40, by = 5))

#Sleep Quality Score Sumarization
SQm <- median(SQ)
SQmx <- max(SQ)
SQmn <- min(SQ)
SQq <- quantile(SQ, probs = c(0.25,0.75))
SQq

##  25%  75% 
## 5.75 8.25

#Sleep Quality Score Histogram
hist(SQ,
     main = "Distribution of Sleep Quality Scores",
     xlab = "Sleep Quality Score",
     col = crimson)

#Sleep Disorder Sumarization
S_D.tl <-table(S_D)
S_D.tl

## S_D
##  no yes 
##  74  26

S_D.tl.f <-prop.table(S_D.tl)
S_D.tl.f

## S_D
##   no  yes 
## 0.74 0.26

#Sleep Disorder Bar plot
barplot(S_D.tl, 
        main = "Count of Sleep Disorder",
        col = crimson,
        xlab = "Sleep Disorder Status",
        ylab = "Count",
        axes = FALSE)

axis(side = 2, at = seq(0, 75, by = 5), labels = seq(0, 75, by = 5))

Multivaraite Summary Statistic

#Create varaibles with each Activey level, Sleep Disorder combination
SQ.hg.no <- SQuality.hg[sldata$Sleep.Disorders == "no"]
SQ.hg.ys <- SQuality.hg[sldata$Sleep.Disorders == "yes"]
SQ.md.no <- SQuality.md[sldata$Sleep.Disorders == "no"]
SQ.md.ys <- SQuality.md[sldata$Sleep.Disorders == "yes"]
SQ.lw.no <- SQuality.lw[sldata$Sleep.Disorders == "no"]
SQ.lw.ys <- SQuality.lw[sldata$Sleep.Disorders == "yes"]

SQ.hg.no.mean <- mean(SQ.hg.no, na.rm = TRUE)
SQ.hg.no.median <- median( SQ.hg.no,na.rm = TRUE)

SQ.hg.ys.mean <-  mean(SQ.hg.ys, na.rm = TRUE)
SQ.hg.ys.median <- median( SQ.hg.ys,na.rm = TRUE)

SQ.md.no.mean <- mean(SQ.md.no, na.rm = TRUE)
SQ.md.no.median <- median( SQ.md.no,na.rm = TRUE)

SQ.md.ys.mean <-  mean(SQ.md.ys, na.rm = TRUE)
SQ.md.ys.median <- median( SQ.md.ys,na.rm = TRUE)

SQ.lw.no.mean <- mean(SQ.lw.no, na.rm = TRUE)
SQ.lw.no.median <- median( SQ.lw.no,na.rm = TRUE)

SQ.lw.ys.mean <-  mean(SQ.lw.ys, na.rm = TRUE)
SQ.lw.ys.median <- median( SQ.lw.ys,na.rm = TRUE)

SQ.hg.no.mean

## [1] 8.576923

SQ.hg.no.median

## [1] 9

SQ.hg.ys.mean

## [1] 8.5

SQ.hg.ys.median

## [1] 8.5

SQ.md.no.mean

## [1] 7.148148

SQ.md.no.median

## [1] 7

SQ.md.ys.mean

## [1] 7.181818

SQ.md.ys.median

## [1] 7

SQ.lw.no.mean

## [1] 4.722222

SQ.lw.no.median

## [1] 5

SQ.lw.ys.mean

## [1] 4.375

SQ.lw.ys.median

## [1] 4