library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.0.4
library(openintro)
library(statsr)
## Warning: package 'statsr' was built under R version 4.0.5
## Warning: package 'BayesFactor' was built under R version 4.0.5
## Warning: package 'coda' was built under R version 4.0.5
library(StMoSim)
## Warning: package 'StMoSim' was built under R version 4.0.5
library(RcppParallel)
## Warning: package 'RcppParallel' was built under R version 4.0.5
library(Rcpp)
library(ggplot2)
library(dplyr)
library(data.tree)
library(plotrix)
library(infer)
set.seed(110)

Step 1

Final Project:
Analyze Data

  1. Download the data
  2. View observations
  3. View summary data
  4. View the table of variables for the observations

Comments and Findings The original dataframe consists of 500 observations and 6 variables. To view the data, the structure, summary data, and a table of youth variables were generated as seen in the code below.

Problem
While trying to work with the original data, some of the tests generated errors because of incompatible data types. To resolve this, the original dataset was modified by converting the variables with character data types (Yes/No responses) to integers, with 0 = No and 1 = Yes. The Sleep variable was converted from character to integer by removing every character other than the digits. Worth noting: a Sleep value of 4 actually means “4 hours or less,” and a Sleep value of 10 actually means “10 hours or more.” After these modifications, the data file has 14 variables. The added columns include SleepNumeric, SmokeLife_Convert, SmokeDaily_Convert, MarijuaEver_Convert, and the X helper columns. The 6 original variables are Sleep, Sleep7, MarijuaEver, Age, SmokeLife, and SmokeDaily. Note that in the file as loaded, MarijuaEver holds the 0/1 integers and MarijuaEver_Convert holds the Yes/No text.

# The Data ----
# Work from the project folder so the CSV can be read by its bare file name.
setwd("C:/AarynZimmerman/Biostatistics Project")
youth <- read.csv(file = "YouthRisk2009AZ.csv")
destfile <- "youth.RData"  # file name only; not used by the code shown here
# Structure check: 500 observations; the modified file has 14 columns
# (the 6 original survey variables plus converted/helper columns).
str(youth)
## 'data.frame':    500 obs. of  14 variables:
##  $ X                  : int  6 7 24 25 34 39 43 49 62 74 ...
##  $ Sleep              : chr  "4 or less hours" "6 hours" "6 hours" "6 hours" ...
##  $ SleepNumeric       : int  4 6 6 6 6 5 6 6 6 6 ...
##  $ Sleep7             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X.1                : logi  NA NA NA NA NA NA ...
##  $ SmokeLife_Convert  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X.2                : logi  NA NA NA NA NA NA ...
##  $ SmokeDaily_Convert : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MarijuaEver        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X.3                : logi  NA NA NA NA NA NA ...
##  $ Age                : int  15 17 16 16 15 18 16 18 14 16 ...
##  $ SmokeLife          : chr  "No" "No" "No" "No" ...
##  $ SmokeDaily         : chr  "No" "No" "No" "No" ...
##  $ MarijuaEver_Convert: chr  "No" "No" "No" "No" ...
summary(youth) #view summary data
##        X            Sleep            SleepNumeric        Sleep7      
##  Min.   :  1.0   Length:500         Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:125.8   Class :character   1st Qu.: 6.000   1st Qu.:0.0000  
##  Median :250.5   Mode  :character   Median : 7.000   Median :1.0000  
##  Mean   :250.5                      Mean   : 6.184   Mean   :0.6637  
##  3rd Qu.:375.2                      3rd Qu.: 8.000   3rd Qu.:1.0000  
##  Max.   :500.0                      Max.   :10.000   Max.   :1.0000  
##                                                      NA's   :54      
##    X.1          SmokeLife_Convert   X.2          SmokeDaily_Convert
##  Mode:logical   Min.   :  0.000   Mode:logical   Min.   :  0.00    
##  NA's:500       1st Qu.:  0.000   NA's:500       1st Qu.:  0.00    
##                 Median :  0.000                  Median :  0.00    
##                 Mean   :  4.616                  Mean   :  6.68    
##                 3rd Qu.:  1.000                  3rd Qu.:  0.00    
##                 Max.   :100.000                  Max.   :100.00    
##                                                                    
##   MarijuaEver       X.3               Age         SmokeLife        
##  Min.   :0.0000   Mode:logical   Min.   :14.00   Length:500        
##  1st Qu.:0.0000   NA's:500       1st Qu.:15.00   Class :character  
##  Median :0.0000                  Median :16.00   Mode  :character  
##  Mean   :0.3608                  Mean   :16.08                     
##  3rd Qu.:1.0000                  3rd Qu.:17.00                     
##  Max.   :1.0000                  Max.   :18.00                     
##  NA's   :15                                                        
##   SmokeDaily        MarijuaEver_Convert
##  Length:500         Length:500         
##  Class :character   Class :character   
##  Mode  :character   Mode  :character   
##                                        
##                                        
##                                        
## 
view(youth)  # browse the full table of variables for the 500 observations
# Treat the categorical survey responses as factors so that summary()
# reports counts per level instead of "character" placeholders.
factor_cols <- c("Sleep", "Sleep7", "SmokeLife", "SmokeDaily", "MarijuaEver")
youth[factor_cols] <- lapply(youth[factor_cols], as.factor)
summary(youth)
##        X             Sleep      SleepNumeric     Sleep7      X.1         
##  Min.   :  1.0   7 hours:140   Min.   : 0.000   0   :150   Mode:logical  
##  1st Qu.:125.8   8 hours:116   1st Qu.: 6.000   1   :296   NA's:500      
##  Median :250.5   6 hours: 90   Median : 7.000   NA's: 54                 
##  Mean   :250.5          : 53   Mean   : 6.184                            
##  3rd Qu.:375.2   5 hours: 36   3rd Qu.: 8.000                            
##  Max.   :500.0   9 hours: 32   Max.   :10.000                            
##                  (Other): 33                                             
##  SmokeLife_Convert   X.2          SmokeDaily_Convert MarijuaEver   X.3         
##  Min.   :  0.000   Mode:logical   Min.   :  0.00     0   :310    Mode:logical  
##  1st Qu.:  0.000   NA's:500       1st Qu.:  0.00     1   :175    NA's:500      
##  Median :  0.000                  Median :  0.00     NA's: 15                  
##  Mean   :  4.616                  Mean   :  6.68                               
##  3rd Qu.:  1.000                  3rd Qu.:  0.00                               
##  Max.   :100.000                  Max.   :100.00                               
##                                                                                
##       Age        SmokeLife  SmokeDaily MarijuaEver_Convert
##  Min.   :14.00   No  :271   No  :427   Length:500         
##  1st Qu.:15.00   Yes :208   Yes : 40   Class :character   
##  Median :16.00   NA's: 21   NA's: 33   Mode  :character   
##  Mean   :16.08                                            
##  3rd Qu.:17.00                                            
##  Max.   :18.00                                            
## 
str(youth)
## 'data.frame':    500 obs. of  14 variables:
##  $ X                  : int  6 7 24 25 34 39 43 49 62 74 ...
##  $ Sleep              : Factor w/ 9 levels "","10  hours",..: 3 5 5 5 5 4 5 5 5 5 ...
##  $ SleepNumeric       : int  4 6 6 6 6 5 6 6 6 6 ...
##  $ Sleep7             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.1                : logi  NA NA NA NA NA NA ...
##  $ SmokeLife_Convert  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X.2                : logi  NA NA NA NA NA NA ...
##  $ SmokeDaily_Convert : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MarijuaEver        : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ X.3                : logi  NA NA NA NA NA NA ...
##  $ Age                : int  15 17 16 16 15 18 16 18 14 16 ...
##  $ SmokeLife          : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SmokeDaily         : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ MarijuaEver_Convert: chr  "No" "No" "No" "No" ...

Step 2

Final Project:
Definition of the type of study the dataset is associated with, definition of a research question, and data manipulations based on the study type and data.

Comments and Findings This is an observational study of students in grades 9-12 concerning health-risk behaviors. One research question posed is whether the health-risk behavior of smoking, including marijuana, is associated with differences in hours of sleep.

In this observational study, youth are surveyed as to their health-risk behaviors and sleep. No attempt is made to affect the outcome; no treatment is given to measure an outcome as this is not an experiment. In this study, researchers made no attempts to manipulate the study - instead, they simply observe. There are no procedures carried out to support, refute, or validate a hypothesis; there is no experimenting to provide insight into cause-and-effect. There is no demonstration of an outcome that possibly occurs when a factor is manipulated. Since this is an observational study, there is no control group or experimental groups. The variables do not cause or explain, however the variables can be related.

Question for the observational study: A youth’s health-risk behavior (smoking) is positively/negatively correlated to hours of sleep.

To answer this question, a series of data manipulations will be performed in R as follows:

  1. Review the distribution of sleep using histograms and a normal distribution curve based on calculated mean and standard deviation Question: Is it skewed, and if so, in what direction.

    Answer: The distribution of the variable sleep is more skewed to the left (under the mean of 6). According to this distribution, youth
    get more sleep than less, which is what I expected.
    Problem: As in previous assignments, I tried to add a normal distribution curve to my histogram in the first snippet of code, hoping to see how close the data are to “normal” - that is, most youth near the average amount of sleep, with a few below or above it. However, youth of these ages aren’t “normal” sleepers, and my code resulted in a flat line.
    The flat line is a scale mismatch: hist() plots counts by default, while dnorm() returns densities (values well below 1), so the curve hugs the bottom of the plot. To deal with this problem, I calculated the mean and standard deviation separately and then used ggplot with a density-scaled histogram.

  2. One way of calculating a 95% confidence interval for a sample mean is to add and subtract 1.96 standard errors from the point estimate. Doing this, even though the full population isn’t known, we can be 95% confident that the true average hours of sleep for youth lies between the lower and upper limits of 5.669229 and 6.730771, respectively. There are conditions for this interval to be valid: the sample mean must be approximately normally distributed with standard error $s/\sqrt{n}$. For these conditions to be met, the sample observations must be random, the sample size should be greater than 50, and the population distribution should not be greatly skewed to one side or the other. The confidence is in the method, not in a particular confidence interval: a 95% confidence level means that about 95% of intervals constructed this way from repeated samples would contain the true mean. Since the skew is not great in either direction, the conditions for a 95% confidence interval appear to be met. Worth noting, a Sleep value of “0” actually marks respondents who did not answer the question. See the data conversion problem in step 1.

    Problem: When attempting to plot the confidence interval, the code failed. See code below.

# Base-R histogram reference:
#   hist(x, main, xlab, xlim, ylim, breaks, col, border, freq)
#   x      - numeric vector to bin
#   main   - chart title
#   xlab   - x-axis label
#   xlim   - range of values shown on the x-axis
#   ylim   - range of values shown on the y-axis
#   breaks - break points between bars, or a suggested number of bars
#   col    - fill colour of the bars
#   border - outline colour of the bars
#   freq   - TRUE plots counts; FALSE plots densities (total area = 1)

# Draw the histogram on the density scale (freq = FALSE). dnorm() returns
# densities, so on a count-scale histogram the overlaid curve is squashed
# into a nearly flat line along the bottom of the plot.
hist(youth$SleepNumeric,
     freq = FALSE,
     main = "Health-Risk Behavior:  Distribution of Sleep",
     xlab = "Sleep",
     border = "Green",
     col = "Orange")

# Overlay the normal density that has the same mean and standard deviation
# as the data.
curve(dnorm(x, mean = mean(youth$SleepNumeric), sd = sd(youth$SleepNumeric)),
      add = TRUE, col = "red")

#Get mean and standard deviation of SleepNumeric
# NOTE(review): the write-up states that a SleepNumeric of 0 marks respondents
# who did not answer the question. Those zeros are included here, which pulls
# the mean down and the standard deviation up - confirm whether they should
# be excluded before these values are used for the normal curve.
# Neither call passes na.rm, so any NA in the column would make the result NA.
SNmean <- mean(youth$SleepNumeric)
SNsd   <- sd(youth$SleepNumeric)
SNmean
## [1] 6.184
SNsd
## [1] 2.484046
# Second view of the same variable, asking hist() for roughly 5 bars.
hist(youth$SleepNumeric,
     main = "Histogram - Youth Risk Habits, View 2 - Sleep",
     border = "Green",
     col = "Orange",
     breaks = 5)

# Plot SleepNumeric as a density-scaled histogram with a normal curve on top.
# after_stat(density) replaces the deprecated ..density.. notation, and the
# dnorm() parameters are passed as a list, which is what stat_function()
# documents for `args`. The geom_blank() layer drew nothing and was dropped.
# SNmean and SNsd are the mean and standard deviation computed earlier.
ggplot(data = youth, aes(x = SleepNumeric)) +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(fun = dnorm,
                args = list(mean = SNmean, sd = SNsd),
                col = "tomato")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Treat all recorded sleep values as the population and draw a simple random
# sample of 100 of them without replacement (reproducible via the seed set
# at the top of the script).
population <- youth$SleepNumeric
sampYouth <- sample(x = population, size = 100, replace = FALSE)
hist(sampYouth, breaks = 5, col = "Orange", border = "Green",
     main = "Histogram - Hours of Sleep")

# Point estimate: mean hours of sleep in the sample.
sample_mean <- mean(sampYouth)
sample_mean
## [1] 6.2
# 95% confidence interval for the sample mean: point estimate plus/minus
# 1.96 standard errors.
# The standard error uses the actual sample size instead of a hard-coded 100,
# so it stays correct if the size of the sample drawn above is changed.
se <- sd(sampYouth) / sqrt(length(sampYouth))
lower <- sample_mean - 1.96 * se
upper <- sample_mean + 1.96 * se
c(lower, upper)
## [1] 5.669229 6.730771
se
## [1] 0.2708013
sample_mean
## [1] 6.2
#Attempted to plot the confidence interval; code failed.
#plot_ci(lower, upper, mean(sampYouth))

Step 3

Final Project:
Understand the relationship between some of the other variables, not including SleepNumeric.

Comments and Findings SleepNumeric was excluded and two other variables, SmokeDaily and Age, were selected. The relationship between the other variables was analyzed using a boxplot as a visualization tool. A scatterplot was also used to further understand the relationship.

In looking at the average age of the youth vs the health-risk variable of smoking daily, the means of the two variables were compared and resulted in a very slight difference as seen below:

youth$SmokeDaily: No [1] 16.05621 ------------------------------------------------------------------------------------------------------ youth$SmokeDaily: Yes [1] 16.325

The tests were repeated for another set of variables, age and use of marijuana ever. The difference between the means is more than between age and daily smoking, but other testing is still needed to understand the relationship, if any.

youth$MarijuaEver: 0 [1] 15.90323 ------------------------------------------------------------------------------------------------------ youth$MarijuaEver: 1 [1] 16.37143

It appears a hypothesis test might be useful to see if the difference is statistically significant. By looking at the boxplot, it does not appear there is a correlation between age and daily smoking, nor is there one between age and marijuana smoking.

# The Data ----
# Read the survey file again from the project folder; this replaces the
# `youth` object, so earlier in-session changes to it are discarded.
setwd("C:/AarynZimmerman/Biostatistics Project")
youth <- read.csv(file = "YouthRisk2009AZ.csv")
destfile <- "youth.RData"  # file name only; not used by the code shown here
# Structure check: 500 observations of 14 variables.
str(youth)
## 'data.frame':    500 obs. of  14 variables:
##  $ X                  : int  6 7 24 25 34 39 43 49 62 74 ...
##  $ Sleep              : chr  "4 or less hours" "6 hours" "6 hours" "6 hours" ...
##  $ SleepNumeric       : int  4 6 6 6 6 5 6 6 6 6 ...
##  $ Sleep7             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X.1                : logi  NA NA NA NA NA NA ...
##  $ SmokeLife_Convert  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X.2                : logi  NA NA NA NA NA NA ...
##  $ SmokeDaily_Convert : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MarijuaEver        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X.3                : logi  NA NA NA NA NA NA ...
##  $ Age                : int  15 17 16 16 15 18 16 18 14 16 ...
##  $ SmokeLife          : chr  "No" "No" "No" "No" ...
##  $ SmokeDaily         : chr  "No" "No" "No" "No" ...
##  $ MarijuaEver_Convert: chr  "No" "No" "No" "No" ...
summary(youth) #view summary data
##        X            Sleep            SleepNumeric        Sleep7      
##  Min.   :  1.0   Length:500         Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:125.8   Class :character   1st Qu.: 6.000   1st Qu.:0.0000  
##  Median :250.5   Mode  :character   Median : 7.000   Median :1.0000  
##  Mean   :250.5                      Mean   : 6.184   Mean   :0.6637  
##  3rd Qu.:375.2                      3rd Qu.: 8.000   3rd Qu.:1.0000  
##  Max.   :500.0                      Max.   :10.000   Max.   :1.0000  
##                                                      NA's   :54      
##    X.1          SmokeLife_Convert   X.2          SmokeDaily_Convert
##  Mode:logical   Min.   :  0.000   Mode:logical   Min.   :  0.00    
##  NA's:500       1st Qu.:  0.000   NA's:500       1st Qu.:  0.00    
##                 Median :  0.000                  Median :  0.00    
##                 Mean   :  4.616                  Mean   :  6.68    
##                 3rd Qu.:  1.000                  3rd Qu.:  0.00    
##                 Max.   :100.000                  Max.   :100.00    
##                                                                    
##   MarijuaEver       X.3               Age         SmokeLife        
##  Min.   :0.0000   Mode:logical   Min.   :14.00   Length:500        
##  1st Qu.:0.0000   NA's:500       1st Qu.:15.00   Class :character  
##  Median :0.0000                  Median :16.00   Mode  :character  
##  Mean   :0.3608                  Mean   :16.08                     
##  3rd Qu.:1.0000                  3rd Qu.:17.00                     
##  Max.   :1.0000                  Max.   :18.00                     
##  NA's   :15                                                        
##   SmokeDaily        MarijuaEver_Convert
##  Length:500         Length:500         
##  Class :character   Class :character   
##  Mode  :character   Mode  :character   
##                                        
##                                        
##                                        
## 
view(youth)  #view table of variables for the 500 observations
# Distribution of age for youth who do / do not smoke daily.
# `vertical` is not a boxplot() argument (orientation is controlled by
# `horizontal`, which defaults to FALSE), so it has been dropped.
boxplot(Age ~ SmokeDaily,
        data = youth,
        main = "Age vs Smoke Daily",
        xlab = "Youth Smoke Daily",
        ylab = "Age of Youth",
        col = "orange",
        border = "brown",
        notch = FALSE)

# Mean age within each SmokeDaily group: by() splits Age on the grouping
# variable (Yes/No) and applies mean() to each piece.
by(youth$Age, youth$SmokeDaily, mean)
## youth$SmokeDaily: No
## [1] 16.05621
## ------------------------------------------------------------ 
## youth$SmokeDaily: Yes
## [1] 16.325
# Distribution of age for youth who have / have not ever used marijuana.
# `vertical` is not a boxplot() argument (orientation is controlled by
# `horizontal`, which defaults to FALSE), so it has been dropped.
boxplot(Age ~ MarijuaEver,
        data = youth,
        main = "Marijuana Ever vs Age",
        xlab = "Marijuana Ever",
        ylab = "Age",
        col = "orange",
        border = "brown",
        notch = FALSE)

# Mean age within each MarijuaEver group: by() splits Age on the grouping
# variable (0/1) and applies mean() to each piece.
by(youth$Age, youth$MarijuaEver, mean)
## youth$MarijuaEver: 0
## [1] 15.90323
## ------------------------------------------------------------ 
## youth$MarijuaEver: 1
## [1] 16.37143

Step 4

Final Project:
The fundamental question is whether or not the health-risk behavior (smoking) is positively/negatively correlated to hours of sleep. It seems that smoking, whether regular cigarettes or marijuana, might be disruptive to overall hours of sleep.

Comments and Findings This was tested using linear regression and a scatterplot to see if this appears to be the case. A scatterplot, in conjunction with jitter, was used.

Problem: The scatterplot did not provide a clear image of the individual points. My assumption is this is because they overlap too closely with like values. If the values were different, the jitter would have added “noise” to the x-axis variable, and the individual points on the plot would be more clear/visible.

The scatterplot was implemented multiple ways, but as seen below, the results are the same. My interpretation is that a scatterplot isn’t the best way of viewing this particular data set.

Worth noting, because of data type conflicts, I had to assign a value to all “blanks” in the data set. I chose a value that was very different from the other values, “100.” The impact this assigned value has on results is not clear.

str(youth)    #view 500 observations with 6 (original) variables
## 'data.frame':    500 obs. of  14 variables:
##  $ X                  : int  6 7 24 25 34 39 43 49 62 74 ...
##  $ Sleep              : chr  "4 or less hours" "6 hours" "6 hours" "6 hours" ...
##  $ SleepNumeric       : int  4 6 6 6 6 5 6 6 6 6 ...
##  $ Sleep7             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X.1                : logi  NA NA NA NA NA NA ...
##  $ SmokeLife_Convert  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X.2                : logi  NA NA NA NA NA NA ...
##  $ SmokeDaily_Convert : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MarijuaEver        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X.3                : logi  NA NA NA NA NA NA ...
##  $ Age                : int  15 17 16 16 15 18 16 18 14 16 ...
##  $ SmokeLife          : chr  "No" "No" "No" "No" ...
##  $ SmokeDaily         : chr  "No" "No" "No" "No" ...
##  $ MarijuaEver_Convert: chr  "No" "No" "No" "No" ...
summary(youth) #view summary data
##        X            Sleep            SleepNumeric        Sleep7      
##  Min.   :  1.0   Length:500         Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:125.8   Class :character   1st Qu.: 6.000   1st Qu.:0.0000  
##  Median :250.5   Mode  :character   Median : 7.000   Median :1.0000  
##  Mean   :250.5                      Mean   : 6.184   Mean   :0.6637  
##  3rd Qu.:375.2                      3rd Qu.: 8.000   3rd Qu.:1.0000  
##  Max.   :500.0                      Max.   :10.000   Max.   :1.0000  
##                                                      NA's   :54      
##    X.1          SmokeLife_Convert   X.2          SmokeDaily_Convert
##  Mode:logical   Min.   :  0.000   Mode:logical   Min.   :  0.00    
##  NA's:500       1st Qu.:  0.000   NA's:500       1st Qu.:  0.00    
##                 Median :  0.000                  Median :  0.00    
##                 Mean   :  4.616                  Mean   :  6.68    
##                 3rd Qu.:  1.000                  3rd Qu.:  0.00    
##                 Max.   :100.000                  Max.   :100.00    
##                                                                    
##   MarijuaEver       X.3               Age         SmokeLife        
##  Min.   :0.0000   Mode:logical   Min.   :14.00   Length:500        
##  1st Qu.:0.0000   NA's:500       1st Qu.:15.00   Class :character  
##  Median :0.0000                  Median :16.00   Mode  :character  
##  Mean   :0.3608                  Mean   :16.08                     
##  3rd Qu.:1.0000                  3rd Qu.:17.00                     
##  Max.   :1.0000                  Max.   :18.00                     
##  NA's   :15                                                        
##   SmokeDaily        MarijuaEver_Convert
##  Length:500         Length:500         
##  Class :character   Class :character   
##  Mode  :character   Mode  :character   
##                                        
##                                        
##                                        
## 
view(youth)  #view table of variables for the 500 observations
#scatterplot 

# The Data ----
# Read the survey file again from the project folder; this replaces the
# `youth` object, so earlier in-session changes to it are discarded.
setwd("C:/AarynZimmerman/Biostatistics Project")
youth <- read.csv(file = "YouthRisk2009AZ.csv")
destfile <- "youth.RData"  # file name only; not used by the code shown here
# Structure check: 500 observations of 14 variables.
str(youth)
## 'data.frame':    500 obs. of  14 variables:
##  $ X                  : int  6 7 24 25 34 39 43 49 62 74 ...
##  $ Sleep              : chr  "4 or less hours" "6 hours" "6 hours" "6 hours" ...
##  $ SleepNumeric       : int  4 6 6 6 6 5 6 6 6 6 ...
##  $ Sleep7             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X.1                : logi  NA NA NA NA NA NA ...
##  $ SmokeLife_Convert  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X.2                : logi  NA NA NA NA NA NA ...
##  $ SmokeDaily_Convert : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MarijuaEver        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X.3                : logi  NA NA NA NA NA NA ...
##  $ Age                : int  15 17 16 16 15 18 16 18 14 16 ...
##  $ SmokeLife          : chr  "No" "No" "No" "No" ...
##  $ SmokeDaily         : chr  "No" "No" "No" "No" ...
##  $ MarijuaEver_Convert: chr  "No" "No" "No" "No" ...
summary(youth) #view summary data
##        X            Sleep            SleepNumeric        Sleep7      
##  Min.   :  1.0   Length:500         Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:125.8   Class :character   1st Qu.: 6.000   1st Qu.:0.0000  
##  Median :250.5   Mode  :character   Median : 7.000   Median :1.0000  
##  Mean   :250.5                      Mean   : 6.184   Mean   :0.6637  
##  3rd Qu.:375.2                      3rd Qu.: 8.000   3rd Qu.:1.0000  
##  Max.   :500.0                      Max.   :10.000   Max.   :1.0000  
##                                                      NA's   :54      
##    X.1          SmokeLife_Convert   X.2          SmokeDaily_Convert
##  Mode:logical   Min.   :  0.000   Mode:logical   Min.   :  0.00    
##  NA's:500       1st Qu.:  0.000   NA's:500       1st Qu.:  0.00    
##                 Median :  0.000                  Median :  0.00    
##                 Mean   :  4.616                  Mean   :  6.68    
##                 3rd Qu.:  1.000                  3rd Qu.:  0.00    
##                 Max.   :100.000                  Max.   :100.00    
##                                                                    
##   MarijuaEver       X.3               Age         SmokeLife        
##  Min.   :0.0000   Mode:logical   Min.   :14.00   Length:500        
##  1st Qu.:0.0000   NA's:500       1st Qu.:15.00   Class :character  
##  Median :0.0000                  Median :16.00   Mode  :character  
##  Mean   :0.3608                  Mean   :16.08                     
##  3rd Qu.:1.0000                  3rd Qu.:17.00                     
##  Max.   :1.0000                  Max.   :18.00                     
##  NA's   :15                                                        
##   SmokeDaily        MarijuaEver_Convert
##  Length:500         Length:500         
##  Class :character   Class :character   
##  Mode  :character   Mode  :character   
##                                        
##                                        
##                                        
## 
view(youth)  #view table of variables for the 500 observations
# Make sure both plotting variables are numeric.
youth$SleepNumeric <- as.numeric(youth$SleepNumeric)
youth$SmokeLife_Convert <- as.numeric(youth$SmokeLife_Convert)

# The modified data file codes unanswered questions with sentinel values:
# 100 in SmokeLife_Convert and 0 in SleepNumeric. Plotting them stretches the
# x-axis to 0-100 and collapses the real 0/1 answers into one column of
# overlapping points, which is why the individual points could not be seen.
# Only rows with a genuine answer to both questions are plotted; the data
# frame itself is left unchanged.
answered <- which(youth$SmokeLife_Convert %in% c(0, 1) &
                    youth$SleepNumeric > 0)

# Scatterplot of hours of sleep against lifetime smoking (0 = No, 1 = Yes).
plot(SleepNumeric ~ SmokeLife_Convert,
     data = youth[answered, ],
     main = "Relationship Between Hours of Sleep and Smoke Life",
     ylab = "Hours of Sleep",
     xlab = "Smoke Life")

# Same plot with jitter added to the smoking variable so that respondents
# sharing the same answer are spread out horizontally.
plot(jitter(youth$SmokeLife_Convert[answered]), youth$SleepNumeric[answered],
     pch = 16,
     col = 'steelblue',
     main = "Hours of Sleep and Smoke Life with Jitter",
     ylab = "Hours of Sleep",
     xlab = "Smoke Life with Jitter")

Step 5

Final Project:
Test if a trend in the plot is something more than natural variation,

Comments and Findings A linear model (m_SmokeLifeConvert) was fitted to predict hours of sleep from SmokeLife_Convert, and the fitted line was added to the plot using abline. The following is the equation for the linear model along with an interpretation of the slope.

Equation for linear model, interpreted slope:

$\hat{y} = 6.347579 - 0.035437 \times \text{SmokeLife\_Convert}$

Smoke life is a statistically significant predictor of hours of sleep as the p-value is approximately 0 (8.098e-11).

Smoke life does not appear to be a practically significant predictor of hours of sleep. For every 1-point increase in SmokeLife_Convert, the model predicts a decrease of 0.035437 hours of sleep, which is not a meaningful change. In summary, although the relationship is statistically significant, smoke life does not appear to be a practically significant predictor of hours of sleep.

# Fit a simple linear regression of hours of sleep on SmokeLife_Convert,
# redraw the jittered scatterplot, and overlay the fitted line.
# NOTE(review): the summaries above show SmokeLife_Convert ranging up to 100
# and SleepNumeric down to 0; per the write-up, 100 was assigned to blank
# answers and 0 sleep marks respondents who did not answer. Those rows are
# included in the fit and in the correlation below, and because 100 lies far
# from the real 0/1 answers they may dominate the slope - consider refitting
# with the sentinel values treated as missing before interpreting the results.
m_SmokeLifeConvert <- lm(youth$SleepNumeric ~ youth$SmokeLife_Convert)
# Both axes are jittered for display only; the model uses the raw values.
plot(jitter(youth$SleepNumeric, factor = 1.5) ~ jitter(youth$SmokeLife_Convert,factor = 1.5))
abline(m_SmokeLifeConvert)

# Pearson correlation between the two variables (same caveat as above).
cor(youth$SleepNumeric,youth$SmokeLife_Convert)
## [1] -0.2852868
summary(m_SmokeLifeConvert)
## 
## Call:
## lm(formula = youth$SleepNumeric ~ youth$SmokeLife_Convert)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3476 -0.3476  0.6524  1.6524  5.1962 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              6.347579   0.109388  58.028  < 2e-16 ***
## youth$SmokeLife_Convert -0.035437   0.005335  -6.642  8.1e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.383 on 498 degrees of freedom
## Multiple R-squared:  0.08139,    Adjusted R-squared:  0.07954 
## F-statistic: 44.12 on 1 and 498 DF,  p-value: 8.098e-11

Step 6

Final Project:
Review estimated proportion of daily smokers out of the population of youth smokers.

Comments and Findings A new dataframe, smokers was created that contains only the rows from the youth dataset that are associated with smoker respondents. The proportion of daily smokers was then calculated.

The calculated proportion of daily smokers from the smoker life pool of responses = 0.2163462, which is approximately 22%.

# Keep only the respondents who reported ever smoking (SmokeLife == "Yes").
# which() drops rows where SmokeLife is missing, exactly as subset() does.
smokers <- youth[which(youth$SmokeLife == "Yes"), ]
summary(smokers)
##        X            Sleep            SleepNumeric        Sleep7      
##  Min.   :  2.0   Length:208         Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:112.2   Class :character   1st Qu.: 5.000   1st Qu.:0.0000  
##  Median :225.5   Mode  :character   Median : 7.000   Median :1.0000  
##  Mean   :238.4                      Mean   : 6.255   Mean   :0.5918  
##  3rd Qu.:365.0                      3rd Qu.: 7.000   3rd Qu.:1.0000  
##  Max.   :500.0                      Max.   :10.000   Max.   :1.0000  
##                                                      NA's   :12      
##    X.1          SmokeLife_Convert   X.2          SmokeDaily_Convert
##  Mode:logical   Min.   :1         Mode:logical   Min.   :  0.000   
##  NA's:208       1st Qu.:1         NA's:208       1st Qu.:  0.000   
##                 Median :1                        Median :  0.000   
##                 Mean   :1                        Mean   :  2.596   
##                 3rd Qu.:1                        3rd Qu.:  0.000   
##                 Max.   :1                        Max.   :100.000   
##                                                                    
##   MarijuaEver       X.3               Age        SmokeLife        
##  Min.   :0.0000   Mode:logical   Min.   :14.0   Length:208        
##  1st Qu.:0.0000   NA's:208       1st Qu.:15.0   Class :character  
##  Median :1.0000                  Median :16.0   Mode  :character  
##  Mean   :0.6798                  Mean   :16.2                     
##  3rd Qu.:1.0000                  3rd Qu.:17.0                     
##  Max.   :1.0000                  Max.   :18.0                     
##  NA's   :5                                                        
##   SmokeDaily        MarijuaEver_Convert
##  Length:208         Length:208         
##  Class :character   Class :character   
##  Mode  :character   Mode  :character   
##                                        
##                                        
##                                        
## 
# Proportion of daily smokers among youth who have ever smoked.

# Number of daily smokers. Count the TRUE comparisons directly: indexing the
# data frame with `smokers$SmokeDaily == "Yes"` keeps an all-NA row for every
# missing SmokeDaily answer, so nrow() on that subset over-counts. It
# previously gave 45, i.e. 40 "Yes" answers plus 5 missing ones, which is
# consistent with the 40 daily smokers in the full data set.
smokers_youth <- sum(smokers$SmokeDaily == "Yes", na.rm = TRUE)
smokers_youth
## [1] 40
# Number of respondents who have ever smoked (the denominator). This still
# includes the respondents whose SmokeDaily answer is missing.
smokers_rows <- nrow(smokers)
smokers_rows
## [1] 208
# Proportion of daily smokers out of all youth smokers.
smokers_youth / smokers_rows
## [1] 0.1923077

Step 7

Final Project:
The statistic above was computed from a sample of 500 youth. What is of interest, however, is the corresponding population parameter. The calculation tells us what proportion of youth smokers in the sample reported smoking daily; the question of what proportion of all youth who smoke do so daily can only be answered with an estimate of the parameter. For this, inference on a proportion is needed.

Comments and Findings The inferential tools for estimating a population proportion include the confidence interval and the hypothesis test. The conditions for inference can be checked by obtaining the group sizes, which are computed with the “by” command as follows: by(youth$SmokeLife, youth$SmokeDaily, length)

Below are the conditions for inference to construct a 95% confidence interval for the proportion of daily youth smokers in the United States:

  1. Random sample - If the sample was picked randomly across the US and its size is less than 10% of the population of youth smokers, the observations can be treated as independent, which is a necessary condition for inference. In the smokers data set, there are 208 observations. This number of observations is far below 10% of the US population of youth smokers, so the observations are independent.

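  2. Success/fail condition - In the sample of youth smokers, the success/fail condition is satisfied: Calc 1 - sample size of 208 * 0.2163462 = 45; Calc 2 - sample size of 208 * 0.8076923 = 168. Both calculations are greater than 10. (Note that these two proportions add up to more than 1: the 5 missing SmokeDaily responses are included in both counts. Excluding them gives 40 daily and 163 non-daily smokers, which also satisfy the condition.) The success/fail condition is met if there are at least 10 expected successes and 10 expected failures in the sample.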

  3. Expected number of successes is at least 10: np ≥ 10 Expected number of failures is at least 10: n(1-p) ≥ 10 where n is the sample size & p is the probability of success on a given trial.

All conditions for inference are met. Since the conditions for inference are reasonable, the standard error was calculated and the interval using the inference function was constructed.

Since the goal is to construct an interval estimate for a proportion, it’s necessary to specify what constitutes a “success”, which here is a response of “Yes” for smokes daily.

At 95% confidence, the interval estimate for the proportion runs from about 14% to 25%, i.e. a margin of error of roughly ± 5.5 percentage points.

Standard error = 0.0279; 95% confidence interval = (0.1423, 0.2518)

# Group sizes: the number of SmokeLife entries within each SmokeDaily level
# of the full youth data set.
by(data = youth$SmokeLife, INDICES = youth$SmokeDaily, FUN = length)
## youth$SmokeDaily: No
## [1] 427
## ------------------------------------------------------------ 
## youth$SmokeDaily: Yes
## [1] 40
# Proportion of non-daily smokers in the smokers data frame ----
# Count the "No" responses with sum(..., na.rm = TRUE). Row-subsetting on
# smokers$SmokeDaily == "No" followed by nrow() would also count one all-NA
# row per missing SmokeDaily value.
non_daily_smokers_youth <- sum(smokers$SmokeDaily == "No", na.rm = TRUE)
non_daily_smokers_youth
## [1] 163
# Share of non-daily-smoker responses out of all rows in smokers.
non_daily_smokers_youth / smokers_rows
## [1] 0.7836538
# Success/fail condition ----
# n * p_hat is the number of daily smokers and n * (1 - p_hat) the number of
# non-daily smokers in the sample; each must be at least 10. The comparison
# is made explicitly so that the output answers the question directly.
smokers_rows * (smokers_youth / smokers_rows) >= 10
## [1] TRUE
smokers_rows * (non_daily_smokers_youth / smokers_rows) >= 10
## [1] TRUE
#If the conditions for inference are reasonable, calculate the standard error and construct the interval using the inference function 


#inference(smokers$SmokeDaily, est = "proportion", type = "ci", method = "theoretical", success = "Yes")

Step 8

Final Project: Since gathering information on an entire population of youth would be very costly, time consuming, and likely impossible, a sample of the population is helpful for understanding the properties of the population.

Comments and Findings To estimate the proportion of youth smokers who smoke daily, the sample_n command is used to draw a random sample from the smokers data. A bar plot is used to visualize the distribution of smoking responses, and summary statistics help confirm that the properties of the sample are in line with those of the data it was drawn from.

The sample_n command draws a random sample of size 100 from the smokers dataset, and the result is assigned to youthSamp1. This is similar to randomly drawing responses from the entire population. It is easier to work with 100 responses than with all of those in the population.

By using a larger sample, a more accurate estimate of the population proportion is obtained (hence the choice of 100 rather than a smaller number). One hundred responses from the smoker subset of youth is easier to work with than the entire population.
When using the full set of youth smokers (treated here as the population), 78% do not smoke daily while 19% do (2% did not respond to the daily question). When using the random sample of 100, 77% of youth smokers do not smoke daily while 20% do (3% did not respond to the daily question).

# Bar chart of the SmokeDaily responses in the smokers data frame.
smokers %>%
  ggplot(mapping = aes(x = SmokeDaily)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Proportion of Youth Daily Smokers",
    x = "",
    y = ""
  )

# Count of each SmokeDaily response (missing values included) and its share
# of all rows.
mutate(count(smokers, SmokeDaily), p = n / sum(n))
##   SmokeDaily   n          p
## 1         No 163 0.78365385
## 2        Yes  40 0.19230769
## 3       <NA>   5 0.02403846
# Draw one random sample of 100 rows from the smokers data frame.
set.seed(1)
youthSamp1 <- sample_n(smokers, 100)
# Bar chart of the SmokeDaily responses within the sample.
# Re-seeding here does not alter youthSamp1, which has already been drawn.
set.seed(1)
youthSamp1 %>%
  ggplot(mapping = aes(x = SmokeDaily)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Sample of Youth Daily Smokers",
    x = "",
    y = ""
  )

# Count of each SmokeDaily response in the sample and its share of the
# 100 sampled rows.
set.seed(1)
mutate(count(youthSamp1, SmokeDaily), s = n / sum(n))
##   SmokeDaily  n    s
## 1         No 77 0.77
## 2        Yes 20 0.20
## 3       <NA>  3 0.03

Step 9

Final Project:

To accurately estimate the population proportion, it’s useful to get a sense of how much variability to expect. The distribution of sample proportions (i.e. sampling distribution of the proportion) is used to understand the variability.

Comments and Findings There are 15,000 elements (sample proportions) in sample_youth100. They are centered at around 8% (the equivalent of 1,200 out of 15,000) smoking daily. This is lower than the proportion seen in the smokers data above (approximately 20%) because the samples here are drawn from the full youth data set, in which 40 of the 500 respondents report smoking daily, rather than from the smokers subset. The distribution is also more concentrated than the graph above. Since a larger sample size gives a better estimate of the mean, a sample size of 100 was used instead of a smaller one. In order to make estimates that are close to the true value, a sampling distribution with a small spread is best. The larger sample size has a smaller spread and is preferable for a sampling distribution.

# Sampling distribution of the proportion of daily smokers ----
# rep_sample_n() repeats the sampling procedure `reps` times: rather than
# taking a single sample of size 100, it draws 15,000 samples of size 100
# (with replacement, replace = TRUE), labelled and grouped by `replicate`.
# Within each replicate the SmokeDaily responses are counted and turned into
# proportions (missing responses stay in the denominator); only the
# SmokeDaily == "Yes" row is kept, so p_hat is the share of daily smokers in
# that sample. A replicate with no "Yes" response contributes no row.
# NOTE(review): the samples are drawn from `youth` (all respondents), not
# from the `smokers` subset used in the earlier steps; confirm that this is
# the intended sampling frame.
sample_youth100 <- youth %>%
  rep_sample_n(size = 100, reps = 15000, replace = TRUE) %>%
  count(SmokeDaily) %>%
  mutate(p_hat = n / sum(n)) %>%
  filter(SmokeDaily == "Yes") %>%
  ungroup() # drop the per-replicate grouping so it does not leak downstream
# Histogram of the sample proportions stored in sample_youth100.
sample_youth100 %>%
  ggplot(mapping = aes(x = p_hat)) +
  geom_histogram(binwidth = 0.02) +
  labs(
    subtitle = "Sample size = 100, Number of samples = 15000",
    title = "Sampling distribution of p_hat",
    x = "p_hat (Smokes Daily)"
  )

# The same proportions drawn as a bar chart with the axes flipped.
set.seed(200)
sample_youth100 %>%
  ggplot(mapping = aes(x = p_hat)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Sample _youth100:  Sampling Distribution of p_hat",
    x = "p_hat   Smokes Daily",
    y = "Count"
  )

Problem After running most of the tests, it seems this data set is quite limited, especially in the number of variables. I would have liked to have had more variables so that I could take additional subsets and run additional tests. There appear to be weaknesses in this observational study. I would be interested to understand whether gender played a role in the observational study. It seems the response successes and failures could be better understood by further separating the groups into male/female.

