library(tidyr)
library(pander)
library(pwr)
library(ggplot2)
library(dplyr)
library(statsr)
library(gridExtra)
source('http://bit.ly/dasi_inference')
Make sure your data and R Markdown files are in the same directory. When loaded your data file will be called gss
. Delete this note when before you submit your work.
df1<-load("_5db435f06000e694f6050a2d43fc7be3_gss.Rdata")
gss<-tbl_df(gss)
dim(gss)
## [1] 57061 114
df2 <- gss %>% select(degree,attend,polviews, incom16, finrela)
df2<-na.omit(df2)
#df2 <- df2[complete.cases(df2),]
df2org<-df2
#df2<-na.fail(df2)
df2 <- df2 %>%
filter(incom16 == "Below Average" | incom16 == "Far Below Average") %>%
mutate(education = ifelse((degree == "Lt High School" | degree == "High School"), "No College Degree", "College Degree"), income_today = ifelse((finrela == "Far Above Average" | finrela == "Above Average"), "Above Average", "Not Above Average"), politics = ifelse((polviews == "Extrmly Conservative" | polviews == "Conservative"| polviews == "Slightly Conservative"), "conservative", "liberal"), religious = ifelse((attend == "More Thn Once Wk"|attend == "Every Week"| attend == "Nrly Every Week"|attend == "2-3X A Month"|attend == "Once A Month"),"Active","NotActive"))
df2$education <- factor(df2$education)
df2$income_today <- factor(df2$income_today)
df2$religious <- factor(df2$religious)
df2$politics <- factor(df2$politics)
# Select the new variables only
df2 <- droplevels(df2) %>% select(education,religious,politics,income_today)
# Take a glimpse of the data
glimpse(df2)
## Observations: 9,456
## Variables: 4
## $ education <fctr> No College Degree, College Degree, No College De...
## $ religious <fctr> NotActive, Active, Active, Active, Active, Activ...
## $ politics <fctr> conservative, conservative, liberal, liberal, li...
## $ income_today <fctr> Not Above Average, Above Average, Not Above Aver...
# Use ggplot to plot the difference in proportions between those who earned a college degree and those who did not
#ggplot(study, aes(x = education,fill= income_today)) +
#geom_bar(position = "fill") +
#scale_y_continuous(labels = c("0%", "25%", "50%", "75%", "100%")) +
labs(title = "Proportion of US Adults Raised in Below Average Income Families' \n Present Income by Education Level", y = "% Adults in Each Present Income Category", x = "Education Level", fill = "Present Income")
## $title
## [1] "Proportion of US Adults Raised in Below Average Income Families' \n Present Income by Education Level"
##
## $y
## [1] "% Adults in Each Present Income Category"
##
## $x
## [1] "Education Level"
##
## $fill
## [1] "Present Income"
##
## attr(,"class")
## [1] "labels"
##df22<-select(df2,income_today,education)
##df222<-na.omit(df22)
df3<-group_by(df2,education,income_today)
bbb<-summarise(df3,count=n())
bbb2<-bbb
bbb2<-group_by(bbb2,education)
bbb2<-summarise(bbb2,sum(count))
bbb2
## # A tibble: 2 x 2
## education `sum(count)`
## <fctr> <int>
## 1 College Degree 1972
## 2 No College Degree 7484
bbb<-mutate(bbb,percentage=ifelse(education=="College Degree",count*100/(as.numeric(bbb2[1,2])),count*100/(as.numeric(bbb2[2,2]))))
bbb$percentage<-round(bbb$percentage,digits = 1)
ggplot(bbb, aes(x = education, y=percentage, fill= income_today)) +
geom_col() +
scale_y_continuous(labels = c("0%", "25%", "50%", "75%", "100%")) +
labs(title = "Proportion of US Adults Raised in Below Average Income Families' \n Present Income by Education Level", y = "% Adults in Each Present Income Category", x = "Education Level", color = "Present Income")+
geom_text(aes(label = percentage),size=6,vjust=1.5,position ="stack" )+
theme(plot.title = element_text(hjust = .5))
#library(gridExtra)
#bcd<-tbl_df(as.data.frame(summary(df2)))
#bcd<-rename(bcd,Variable=Var2)
#bcd<-select(bcd,Variable,Freq)
grid.table(bbb)
df4<-group_by(df2,religious,income_today)
ccc<-summarise(df4,count=n())
ccc2<-ccc
ccc2<-group_by(ccc2,religious)
ccc2<-summarise(ccc2,sum(count))
ccc2
## # A tibble: 2 x 2
## religious `sum(count)`
## <fctr> <int>
## 1 Active 5862
## 2 NotActive 3594
ccc<-mutate(ccc,percentage=ifelse(religious=="Active",count*100/as.numeric(ccc2[1,2]),
count*100/as.numeric(ccc2[2,2])))
ccc$percentage<-round(ccc$percentage,digits = 1)
ggplot(ccc, aes(x = religious, y=percentage, fill= income_today)) +
geom_col() +
scale_y_continuous(labels = c("0%", "25%", "50%", "75%", "100%")) +
labs(title = "Proportion of US Adults Raised in Below Average Income Families' \n Present Religious Level", y = "% Adults in Each Present Income Category", x = "Religious Activity", fill = "Present Income")+
geom_text(aes(label = percentage),size=6,vjust=1.5,position ="stack" )+
theme(plot.title = element_text(hjust = .5))
grid.table(ccc)
df5<-group_by(df2,politics,income_today)
ddd<-summarise(df5,count=n())
ddd2<-ddd
ddd2<-group_by(ddd2,politics)
ddd2<-summarise(ddd2,sum(count))
ddd2
## # A tibble: 2 x 2
## politics `sum(count)`
## <fctr> <int>
## 1 conservative 3330
## 2 liberal 6126
ddd<-mutate(ddd,percentage=ifelse(politics=="conservative",
count*100/as.numeric(ddd2[1,2]),count*100/as.numeric(ddd2[2,2])))
ddd$percentage<-round(ddd$percentage,digits = 1)
ggplot(ddd, aes(x = politics, y=percentage, fill= income_today)) +
geom_col() +
scale_y_continuous(labels = c("0%", "25%", "50%", "75%", "100%")) +
labs(title = "Proportion of US Adults Raised in Below Average Income Families' \n Political View", y = "% Adults in Each Present Income Category", x = "Political View", fill = "Present Income")+
geom_text(aes(label = percentage),size=6,vjust=1.5,position ="stack" )+
theme(plot.title = element_text(hjust = .5))
grid.table(ddd)
inference(df3$income_today,df3$education,est = "proportion",type = "ci" ,method = "theoretical",success="Above Average")
## Response variable: categorical, Explanatory variable: categorical
## Difference between two proportions -- success: Above Average
## Summary statistics:
## x
## y College Degree No College Degree Sum
## Above Average 711 764 1475
## Not Above Average 1261 6720 7981
## Sum 1972 7484 9456
## Observed difference between proportions (College Degree-No College Degree) = 0.2585
## Check conditions:
## College Degree : number of successes = 711 ; number of failures = 1261
## No College Degree : number of successes = 764 ; number of failures = 6720
## Standard error = 0.0114
## 95 % Confidence interval = ( 0.2362 , 0.2807 )
inference(df3$income_today,df3$education,est = "proportion",type = "ht" ,method = "theoretical",success="Above Average",null = 0,alternative = "twosided")
## Response variable: categorical, Explanatory variable: categorical
## Difference between two proportions -- success: Above Average
## Summary statistics:
## x
## y College Degree No College Degree Sum
## Above Average 711 764 1475
## Not Above Average 1261 6720 7981
## Sum 1972 7484 9456
## Observed difference between proportions (College Degree-No College Degree) = 0.2585
## H0: p_College Degree - p_No College Degree = 0
## HA: p_College Degree - p_No College Degree != 0
## Pooled proportion = 0.156
## Check conditions:
## College Degree : number of expected successes = 308 ; number of expected failures = 1664
## No College Degree : number of expected successes = 1167 ; number of expected failures = 6317
## Standard error = 0.009
## Test statistic: Z = 28.142
## p-value = 0
inference(df3$income_today,df3$politics,est = "proportion",type = "ci" ,method = "theoretical",success="Above Average")
## Response variable: categorical, Explanatory variable: categorical
## Difference between two proportions -- success: Above Average
## Summary statistics:
## x
## y conservative liberal Sum
## Above Average 641 834 1475
## Not Above Average 2689 5292 7981
## Sum 3330 6126 9456
## Observed difference between proportions (conservative-liberal) = 0.0564
## Check conditions:
## conservative : number of successes = 641 ; number of failures = 2689
## liberal : number of successes = 834 ; number of failures = 5292
## Standard error = 0.0081
## 95 % Confidence interval = ( 0.0404 , 0.0723 )
inference(df3$income_today,df3$politics,est = "proportion",type = "ht" ,method = "theoretical",success="Above Average",null = 0,alternative = "twosided")
## Response variable: categorical, Explanatory variable: categorical
## Difference between two proportions -- success: Above Average
## Summary statistics:
## x
## y conservative liberal Sum
## Above Average 641 834 1475
## Not Above Average 2689 5292 7981
## Sum 3330 6126 9456
## Observed difference between proportions (conservative-liberal) = 0.0564
## H0: p_conservative - p_liberal = 0
## HA: p_conservative - p_liberal != 0
## Pooled proportion = 0.156
## Check conditions:
## conservative : number of expected successes = 519 ; number of expected failures = 2811
## liberal : number of expected successes = 956 ; number of expected failures = 5170
## Standard error = 0.008
## Test statistic: Z = 7.213
## p-value = 0
inference(df3$income_today,df3$religious,est = "proportion",type = "ci" ,method = "theoretical",success="Above Average")
## Response variable: categorical, Explanatory variable: categorical
## Difference between two proportions -- success: Above Average
## Summary statistics:
## x
## y Active NotActive Sum
## Above Average 897 578 1475
## Not Above Average 4965 3016 7981
## Sum 5862 3594 9456
## Observed difference between proportions (Active-NotActive) = -0.0078
## Check conditions:
## Active : number of successes = 897 ; number of failures = 4965
## NotActive : number of successes = 578 ; number of failures = 3016
## Standard error = 0.0077
## 95 % Confidence interval = ( -0.0229 , 0.0073 )
inference(df3$income_today,df3$religious,est = "proportion",type = "ht" ,method = "theoretical",success="Above Average",null = 0,alternative = "twosided")
## Response variable: categorical, Explanatory variable: categorical
## Difference between two proportions -- success: Above Average
## Summary statistics:
## x
## y Active NotActive Sum
## Above Average 897 578 1475
## Not Above Average 4965 3016 7981
## Sum 5862 3594 9456
## Observed difference between proportions (Active-NotActive) = -0.0078
## H0: p_Active - p_NotActive = 0
## HA: p_Active - p_NotActive != 0
## Pooled proportion = 0.156
## Check conditions:
## Active : number of expected successes = 914 ; number of expected failures = 4948
## NotActive : number of expected successes = 561 ; number of expected failures = 3033
## Standard error = 0.008
## Test statistic: Z = -1.015
## p-value = 0.31