Assignment 12: Central Limit Theorem and Comparing Sampling Distributions
The question is: Do respondents aged 13 and 16 think about suicide to create a suicide plan?
The variable in this data set, age was purposely manipulated to reflect a continuous variable. Originally, 1= 12 years or younger and 7= 18 years or older. Variable is now exactly 1 equal distance apart from each other.
Steps 1-3:
library(readr)
library(dplyr)
library(knitr)
library(ggplot2)
read_csv("/Users/safiesaf/Downloads/YRBS1991_2017(2).csv")
## # A tibble: 203,663 x 191
## sitecode sitename sitetype sitetypenum year survyear weight stratum
## <chr> <chr> <chr> <int> <int> <int> <dbl> <int>
## 1 XX United … National 3 1991 1 0.264 12210
## 2 XX United … National 3 1991 1 0.506 12310
## 3 XX United … National 3 1991 1 0.320 11400
## 4 XX United … National 3 1991 1 0.779 11200
## 5 XX United … National 3 1991 1 1.39 12100
## 6 XX United … National 3 1991 1 0.0344 19007
## 7 XX United … National 3 1991 1 0.125 19006
## 8 XX United … National 3 1991 1 0.165 11410
## 9 XX United … National 3 1991 1 0.431 19005
## 10 XX United … National 3 1991 1 0.416 19003
## # ... with 203,653 more rows, and 183 more variables: PSU <int>,
## # record <int>, age <int>, sex <int>, grade <int>, race4 <int>,
## # race7 <int>, stheight <chr>, stweight <chr>, bmi <chr>, bmipct <chr>,
## # qnobese <chr>, qnowt <chr>, q67 <chr>, q66 <chr>, sexid <chr>,
## # sexid2 <chr>, sexpart <chr>, sexpart2 <chr>, qn8 <int>, qn9 <int>,
## # qn10 <chr>, qn11 <chr>, qn12 <int>, qn13 <chr>, qn14 <chr>,
## # qn15 <chr>, qn16 <chr>, qn17 <int>, qn18 <chr>, qn19 <chr>,
## # qn20 <chr>, qn21 <chr>, qn22 <chr>, qn23 <chr>, qn24 <chr>,
## # qn25 <chr>, qn26 <int>, qn27 <int>, qn28 <int>, qn29 <int>,
## # qn30 <int>, qn31 <chr>, qn32 <int>, qn33 <int>, qn34 <chr>,
## # qn35 <chr>, qn36 <chr>, qn37 <chr>, qn38 <chr>, qn39 <chr>,
## # qn40 <int>, qn41 <int>, qn42 <int>, qn43 <chr>, qn44 <chr>,
## # qn45 <chr>, qn46 <int>, qn47 <int>, qn48 <int>, qn49 <int>,
## # qn50 <chr>, qn51 <chr>, qn52 <chr>, qn53 <chr>, qn54 <chr>,
## # qn55 <int>, qn56 <chr>, qn57 <chr>, qn58 <int>, qn59 <int>,
## # qn60 <int>, qn61 <int>, qn62 <int>, qn63 <int>, qn64 <int>,
## # qn65 <int>, qn68 <int>, qn69 <chr>, qn70 <chr>, qn71 <chr>,
## # qn72 <chr>, qn73 <chr>, qn74 <chr>, qn75 <chr>, qn76 <chr>,
## # qn77 <chr>, qn78 <chr>, qn79 <chr>, qn80 <chr>, qn81 <chr>,
## # qn82 <int>, qn83 <chr>, qn84 <chr>, qn85 <chr>, qn86 <chr>,
## # qn87 <chr>, qn88 <chr>, qn89 <chr>, qnfrcig <int>, …
ResearchData<-read_csv("/Users/safiesaf/Downloads/YRBS1991_2017(2).csv")
NewResearchData<-ResearchData%>%
rename("SuicidePlan"=qn27,
"AgeResp"=age)%>%
select(SuicidePlan,
AgeResp)%>%
mutate(SuicidePlan=ifelse(SuicidePlan==1,"Yes",
ifelse(SuicidePlan==2,"No",NA)),
AgeResp=ifelse(AgeResp==1,12,
ifelse(AgeResp==2,13,
ifelse(AgeResp==3,14,
ifelse(AgeResp==4,15,
ifelse(AgeResp==5,16,
ifelse(AgeResp==6,17,
ifelse(AgeResp==7,18,NA))))))))%>%
select(SuicidePlan,AgeResp)
kable(head(NewResearchData))
| SuicidePlan | AgeResp |
|---|---|
| No | NA |
| Yes | NA |
| No | 15 |
| No | 16 |
| NA | 17 |
| No | NA |
NewResearchData%>%
filter(SuicidePlan=="Yes"|
SuicidePlan=="No",
AgeResp==12|
AgeResp==13|
AgeResp==14|
AgeResp==15|
AgeResp==16|
AgeResp==17|
AgeResp==18)%>%
group_by(SuicidePlan)%>%
summarize(AgeResp=mean(AgeResp,na.rm=TRUE))%>%
kable()
| SuicidePlan | AgeResp |
|---|---|
| No | 16.16315 |
| Yes | 16.07716 |
Two data sets were created to differentiate suicide plans for teens. According to the data, 16 year olds typically have a suicide plan. In this data, 18 year olds rarely have suicide plans.
Step 4 & 5:
NewResearchData%>%
filter(SuicidePlan=="Yes")%>%
ggplot()+
geom_histogram(aes(AgeResp),fill="blue")+
ggtitle("Teens who planned to commit suicide")
According to the data below, 17 year olds don’t have a suicide plan. 12 & 13 year olds don’t seem to plan for suicide at all. 18 year olds rarely have a suicide plan.
NewResearchData%>%
filter(SuicidePlan=="No")%>%
ggplot()+
geom_histogram(aes(AgeResp),fill="tan")+
ggtitle("Teens who do not plan to commit suicide")
Step 6:
Highlights the central limit theorem used to display 10,000 samples of a sample size of 40 from the population
NewResearchData%>%
group_by(SuicidePlan)%>%
summarize(meanage=mean(AgeResp, na.rm=TRUE)) #this is a more concise way to get the mean for two of the groups that were under analysis.
## # A tibble: 3 x 2
## SuicidePlan meanage
## <chr> <dbl>
## 1 No 16.2
## 2 Yes 16.1
## 3 <NA> 16.1
SuicideData<-NewResearchData%>%
filter(SuicidePlan=="Yes",
!is.na(AgeResp))
sample(SuicideData$AgeResp,40)%>%
mean()
## [1] 16.05
replicate(10000,mean(sample(SuicideData$AgeResp, 40)))%>%
data.frame()%>%
rename("mean"=1)%>%
ggplot()+
geom_histogram(aes(x=mean),fill="purple",alpha=.5)
The sampling distribution looks differently than the population distribution because according to the Central Limit Theorem, if your population is not normally distributed. The Central Limit Theorem forces it to be a normal distribution by using large sample distributions. The more samples are taken from a population, the tendency that the sample will be more representative of the population.
NoSuicideData<-NewResearchData%>%
filter(SuicidePlan=="No",
!is.na(AgeResp))
sample(NoSuicideData$AgeResp,40)%>%
mean()
## [1] 15.925
replicate(10000, mean(sample(NoSuicideData$AgeResp, 40)))%>%
data.frame()%>%
rename("mean"=1)%>%
ggplot()+
geom_histogram(aes(x=mean),fill="yellow",alpha=.5)