Safiya

Instructor Turner

11/17/2018

Assignment 12: Central Limit Theorem and Comparing Sampling Distributions

The question is: Do respondents aged 13 and 16 think about suicide to create a suicide plan?

The variable in this data set, age was purposely manipulated to reflect a continuous variable. Originally, 1= 12 years or younger and 7= 18 years or older. Variable is now exactly 1 equal distance apart from each other.

Steps 1-3:

library(readr)
library(dplyr)
library(knitr)
library(ggplot2)



read_csv("/Users/safiesaf/Downloads/YRBS1991_2017(2).csv")
## # A tibble: 203,663 x 191
##    sitecode sitename sitetype sitetypenum  year survyear weight stratum
##    <chr>    <chr>    <chr>          <int> <int>    <int>  <dbl>   <int>
##  1 XX       United … National           3  1991        1 0.264    12210
##  2 XX       United … National           3  1991        1 0.506    12310
##  3 XX       United … National           3  1991        1 0.320    11400
##  4 XX       United … National           3  1991        1 0.779    11200
##  5 XX       United … National           3  1991        1 1.39     12100
##  6 XX       United … National           3  1991        1 0.0344   19007
##  7 XX       United … National           3  1991        1 0.125    19006
##  8 XX       United … National           3  1991        1 0.165    11410
##  9 XX       United … National           3  1991        1 0.431    19005
## 10 XX       United … National           3  1991        1 0.416    19003
## # ... with 203,653 more rows, and 183 more variables: PSU <int>,
## #   record <int>, age <int>, sex <int>, grade <int>, race4 <int>,
## #   race7 <int>, stheight <chr>, stweight <chr>, bmi <chr>, bmipct <chr>,
## #   qnobese <chr>, qnowt <chr>, q67 <chr>, q66 <chr>, sexid <chr>,
## #   sexid2 <chr>, sexpart <chr>, sexpart2 <chr>, qn8 <int>, qn9 <int>,
## #   qn10 <chr>, qn11 <chr>, qn12 <int>, qn13 <chr>, qn14 <chr>,
## #   qn15 <chr>, qn16 <chr>, qn17 <int>, qn18 <chr>, qn19 <chr>,
## #   qn20 <chr>, qn21 <chr>, qn22 <chr>, qn23 <chr>, qn24 <chr>,
## #   qn25 <chr>, qn26 <int>, qn27 <int>, qn28 <int>, qn29 <int>,
## #   qn30 <int>, qn31 <chr>, qn32 <int>, qn33 <int>, qn34 <chr>,
## #   qn35 <chr>, qn36 <chr>, qn37 <chr>, qn38 <chr>, qn39 <chr>,
## #   qn40 <int>, qn41 <int>, qn42 <int>, qn43 <chr>, qn44 <chr>,
## #   qn45 <chr>, qn46 <int>, qn47 <int>, qn48 <int>, qn49 <int>,
## #   qn50 <chr>, qn51 <chr>, qn52 <chr>, qn53 <chr>, qn54 <chr>,
## #   qn55 <int>, qn56 <chr>, qn57 <chr>, qn58 <int>, qn59 <int>,
## #   qn60 <int>, qn61 <int>, qn62 <int>, qn63 <int>, qn64 <int>,
## #   qn65 <int>, qn68 <int>, qn69 <chr>, qn70 <chr>, qn71 <chr>,
## #   qn72 <chr>, qn73 <chr>, qn74 <chr>, qn75 <chr>, qn76 <chr>,
## #   qn77 <chr>, qn78 <chr>, qn79 <chr>, qn80 <chr>, qn81 <chr>,
## #   qn82 <int>, qn83 <chr>, qn84 <chr>, qn85 <chr>, qn86 <chr>,
## #   qn87 <chr>, qn88 <chr>, qn89 <chr>, qnfrcig <int>, …
ResearchData<-read_csv("/Users/safiesaf/Downloads/YRBS1991_2017(2).csv")

NewResearchData<-ResearchData%>%
  rename("SuicidePlan"=qn27,
         "AgeResp"=age)%>%
  
select(SuicidePlan,
         AgeResp)%>%
  
mutate(SuicidePlan=ifelse(SuicidePlan==1,"Yes",
                            ifelse(SuicidePlan==2,"No",NA)),
        AgeResp=ifelse(AgeResp==1,12,
                       ifelse(AgeResp==2,13,
                       ifelse(AgeResp==3,14,
                       ifelse(AgeResp==4,15,
                       ifelse(AgeResp==5,16,
                       ifelse(AgeResp==6,17,
                       ifelse(AgeResp==7,18,NA))))))))%>%
  
select(SuicidePlan,AgeResp)

kable(head(NewResearchData))
SuicidePlan AgeResp
No NA
Yes NA
No 15
No 16
NA 17
No NA
NewResearchData%>%
  filter(SuicidePlan=="Yes"|
        SuicidePlan=="No",
        AgeResp==12|
        AgeResp==13|
        AgeResp==14|
        AgeResp==15|
        AgeResp==16|
        AgeResp==17|
        AgeResp==18)%>%

group_by(SuicidePlan)%>%
summarize(AgeResp=mean(AgeResp,na.rm=TRUE))%>%
kable()
SuicidePlan AgeResp
No 16.16315
Yes 16.07716

Two data sets were created to differentiate suicide plans for teens. According to the data, 16 year olds typically have a suicide plan. In this data, 18 year olds rarely have suicide plans.

Step 4 & 5:

NewResearchData%>%
filter(SuicidePlan=="Yes")%>%  
ggplot()+
geom_histogram(aes(AgeResp),fill="blue")+
ggtitle("Teens who planned to commit suicide")

According to the data below, 17 year olds don’t have a suicide plan. 12 & 13 year olds don’t seem to plan for suicide at all. 18 year olds rarely have a suicide plan.

NewResearchData%>%
filter(SuicidePlan=="No")%>%  
ggplot()+
geom_histogram(aes(AgeResp),fill="tan")+
ggtitle("Teens who do not plan to commit suicide")

Step 6:

Highlights the central limit theorem used to display 10,000 samples of a sample size of 40 from the population

NewResearchData%>%
  group_by(SuicidePlan)%>%
  summarize(meanage=mean(AgeResp, na.rm=TRUE))  #this is a more concise way to get the mean for two of the groups that were under analysis.
## # A tibble: 3 x 2
##   SuicidePlan meanage
##   <chr>         <dbl>
## 1 No             16.2
## 2 Yes            16.1
## 3 <NA>           16.1
SuicideData<-NewResearchData%>%
filter(SuicidePlan=="Yes",
      !is.na(AgeResp))
sample(SuicideData$AgeResp,40)%>%
mean()
## [1] 16.05
replicate(10000,mean(sample(SuicideData$AgeResp, 40)))%>%
data.frame()%>%
rename("mean"=1)%>%
ggplot()+
geom_histogram(aes(x=mean),fill="purple",alpha=.5)

The sampling distribution looks differently than the population distribution because according to the Central Limit Theorem, if your population is not normally distributed. The Central Limit Theorem forces it to be a normal distribution by using large sample distributions. The more samples are taken from a population, the tendency that the sample will be more representative of the population.

NoSuicideData<-NewResearchData%>%
filter(SuicidePlan=="No",
     !is.na(AgeResp))

sample(NoSuicideData$AgeResp,40)%>%
mean()
## [1] 15.925
replicate(10000, mean(sample(NoSuicideData$AgeResp, 40)))%>%
data.frame()%>%
rename("mean"=1)%>%
ggplot()+
geom_histogram(aes(x=mean),fill="yellow",alpha=.5)