install.packages("pander", repos = "http://cran.us.r-project.org/")
##
## The downloaded binary packages are in
## /var/folders/sp/nzv75y0j265chzqq39942g6w0000gn/T//RtmpAyfecS/downloaded_packages
library(pander)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.3
library(pwr)
## Warning: package 'pwr' was built under R version 3.4.3
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(statsr)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
source("http://bit.ly/dasi_inference")
Make sure your data and R Markdown files are in the same directory. When loaded your data file will be called gss. Delete this note when before you submit your work. ## Part 1: Data
load("/Users/shuangtan/statistical with R /Statistical inference GSS /_5db435f06000e694f6050a2d43fc7be3_gss.Rdata")
df1<-load("_5db435f06000e694f6050a2d43fc7be3_gss.Rdata")
dim(gss)
## [1] 57061 114
I am interested in what causes/influences people who grew up poor to improve their financial status as an adult - I am going to do some preliminary investigation into three potential predictor/expanatory variables : education, religious activity and political philosophy. I make a new “factor” variable for each of these categories which join the multiple potential survey responses into two categories (education into “college/no college”; religious activity into “active/not active” and political philosophy into “conservative/liberal” ). For any of these potential predictor variables that show promise in the initial Exploratory Analysis I will perform inference testing. My interest in this subject is to explore what types of life style choices could be effective in improving one’s life. ## Part 3: Exploratory data analysis Select variables to be explored
df2 <- gss %>% select(degree,attend,polviews, incom16, finrela)
df2<-na.omit(df2)
df2org<-df2
df2 <- df2 %>%
filter(incom16 == "Below Average" | incom16 == "Far Below Average") %>%
mutate(education = ifelse((degree == "Lt High School" | degree == "High School"), "No College Degree", "College Degree"), income_today = ifelse((finrela == "Far Above Average" | finrela == "Above Average"), "Above Average", "Not Above Average"), politics = ifelse((polviews == "Extrmly Conservative" | polviews == "Conservative"| polviews == "Slightly Conservative"), "conservative", "liberal"), religious = ifelse((attend == "More Thn Once Wk"|attend == "Every Week"| attend == "Nrly Every Week"|attend == "2-3X A Month"|attend == "Once A Month"),"Active","NotActive"))
df2$education <- factor(df2$education)
df2$income_today <- factor(df2$income_today)
df2$religious <- factor(df2$religious)
df2$politics <- factor(df2$politics)
df2 <- droplevels(df2) %>% select(education,religious,politics,income_today)
glimpse(df2)
## Observations: 9,456
## Variables: 4
## $ education <fct> No College Degree, College Degree, No College Deg...
## $ religious <fct> NotActive, Active, Active, Active, Active, Active...
## $ politics <fct> conservative, conservative, liberal, liberal, lib...
## $ income_today <fct> Not Above Average, Above Average, Not Above Avera...
Show a plot and table for explanatory variable “Education” and response variable “Current Income” - 36.1% of those with college degrees reported above average incomes vrs only 10.2% of those without college degrees - thus the potential for a strong association is found.
df22<-select(df2,income_today,education)
df222<-na.omit(df22)
df3<-group_by(df2,education,income_today)
bbb<-summarise(df3,count=n())
bbb2<-bbb
bbb2<-group_by(bbb2,education)
bbb2<-summarise(bbb2,sum(count))
bbb2
## # A tibble: 2 x 2
## education `sum(count)`
## <fct> <int>
## 1 College Degree 1972
## 2 No College Degree 7484
bbb<-mutate(bbb,percentage=ifelse(education=="College Degree",count*100/(as.numeric(bbb2[1,2])),count*100/(as.numeric(bbb2[2,2]))))
bbb$percentage<-round(bbb$percentage,digits = 1)
ggplot(bbb, aes(x = education, y=percentage, fill= income_today)) +
geom_col() +
scale_y_continuous(labels = c("0%", "25%", "50%", "75%", "100%")) +
labs(title = "Proportion of US Adults Raised in Below Average Income Families' \n Present Income by Education Level", y = "% Adults in Each Present Income Category", x = "Education Level", color = "Present Income")+
geom_text(aes(label = percentage),size=6,vjust=1.5,position ="stack" )+
theme(plot.title = element_text(hjust = .5))
bcd<-tbl_df(as.data.frame(summary(df2)))
bcd<-rename(bcd,Variable=Var2)
bcd<-select(bcd,Variable,Freq)
grid.table(bbb)
df4<-group_by(df2,religious,income_today)
ccc<-summarise(df4,count=n())
ccc2<-ccc
ccc2<-group_by(ccc2,religious)
ccc2<-summarise(ccc2,sum(count))
ccc2
## # A tibble: 2 x 2
## religious `sum(count)`
## <fct> <int>
## 1 Active 5862
## 2 NotActive 3594
ccc<-mutate(ccc,percentage=ifelse(religious=="Active",count*100/as.numeric(ccc2[1,2]),
count*100/as.numeric(ccc2[2,2])))
ccc$percentage<-round(ccc$percentage,digits = 1)
ggplot(ccc, aes(x = religious, y=percentage, fill= income_today)) +
geom_col() +
scale_y_continuous(labels = c("0%", "25%", "50%", "75%", "100%")) +
labs(title = "Proportion of US Adults Raised in Below Average Income Families' \n Present Religious Level", y = "% Adults in Each Present Income Category", x = "Religious Activity", fill = "Present Income")+
geom_text(aes(label = percentage),size=6,vjust=1.5,position ="stack" )+
theme(plot.title = element_text(hjust = .5))
grid.table(ccc)
### Show a plot and table for explanatory variable “Politics” and response variable “Current Income” - 19.2% of those “conservatives” reported above average incomes vrs only 13.6% of liberals - thus the potential for an association is found.
df5<-group_by(df2,politics,income_today)
ddd<-summarise(df5,count=n())
ddd2<-ddd
ddd2<-group_by(ddd2,politics)
ddd2<-summarise(ddd2,sum(count))
ddd2
## # A tibble: 2 x 2
## politics `sum(count)`
## <fct> <int>
## 1 conservative 3330
## 2 liberal 6126
ddd<-mutate(ddd,percentage=ifelse(politics=="conservative",
count*100/as.numeric(ddd2[1,2]),count*100/as.numeric(ddd2[2,2])))
ddd$percentage<-round(ddd$percentage,digits = 1)
ggplot(ddd, aes(x = politics, y=percentage, fill= income_today)) +
geom_col() +
scale_y_continuous(labels = c("0%", "25%", "50%", "75%", "100%")) +
labs(title = "Proportion of US Adults Raised in Below Average Income Families' \n Political View", y = "% Adults in Each Present Income Category", x = "Political View", fill = "Present Income")+
geom_text(aes(label = percentage),size=6,vjust=1.5,position ="stack" )+
theme(plot.title = element_text(hjust = .5))
grid.table(ddd)
### Part 4: Inference NOTE: Insert code chunks as needed by clicking on the “Insert a new code chunk” button (green button with orange arrow) above. Make sure that your code is visible in the project you submit. Delete this note when before you submit your work. 1.Inference tests with education level as a predictor of of people from modest childhood economic cirmanstances achieving above average adult incomes.
inference(df3$income_today,df3$education,est = "proportion",type = "ci" ,method = "theoretical",success="Above Average")
## Response variable: categorical, Explanatory variable: categorical
## Difference between two proportions -- success: Above Average
## Summary statistics:
## x
## y College Degree No College Degree Sum
## Above Average 711 764 1475
## Not Above Average 1261 6720 7981
## Sum 1972 7484 9456
## Observed difference between proportions (College Degree-No College Degree) = 0.2585
## Check conditions:
## College Degree : number of successes = 711 ; number of failures = 1261
## No College Degree : number of successes = 764 ; number of failures = 6720
## Standard error = 0.0114
## 95 % Confidence interval = ( 0.2362 , 0.2807 )
inference(df3$income_today,df3$education,est = "proportion",type = "ht" ,method = "theoretical",success="Above Average",null = 0,alternative = "twosided")
## Response variable: categorical, Explanatory variable: categorical
## Difference between two proportions -- success: Above Average
## Summary statistics:
## x
## y College Degree No College Degree Sum
## Above Average 711 764 1475
## Not Above Average 1261 6720 7981
## Sum 1972 7484 9456
## Observed difference between proportions (College Degree-No College Degree) = 0.2585
## H0: p_College Degree - p_No College Degree = 0
## HA: p_College Degree - p_No College Degree != 0
## Pooled proportion = 0.156
## Check conditions:
## College Degree : number of expected successes = 308 ; number of expected failures = 1664
## No College Degree : number of expected successes = 1167 ; number of expected failures = 6317
## Standard error = 0.009
## Test statistic: Z = 28.142
## p-value = 0
**The confidence interval test, “ci”, indicates almost a 26% difference in proportions“. The hypothesis test,”ht“, shows a high p-value which rounds to”0“. Thus both methods provide strong evidence is found to reject the null hypothesis and an association is found between having a college degree and reporting an above average income.** ### Inference tests with political view (conservative or liberal) as a predictor of of people from modest childhood economic cirmanstances achieving above average adult incomes.
inference(df3$income_today,df3$politics,est = "proportion",type = "ci" ,method = "theoretical",success="Above Average")
## Response variable: categorical, Explanatory variable: categorical
## Difference between two proportions -- success: Above Average
## Summary statistics:
## x
## y conservative liberal Sum
## Above Average 641 834 1475
## Not Above Average 2689 5292 7981
## Sum 3330 6126 9456
## Observed difference between proportions (conservative-liberal) = 0.0564
## Check conditions:
## conservative : number of successes = 641 ; number of failures = 2689
## liberal : number of successes = 834 ; number of failures = 5292
## Standard error = 0.0081
## 95 % Confidence interval = ( 0.0404 , 0.0723 )
inference(df3$income_today,df3$politics,est = "proportion",type = "ht" ,method = "theoretical",success="Above Average",null = 0,alternative = "twosided")
## Response variable: categorical, Explanatory variable: categorical
## Difference between two proportions -- success: Above Average
## Summary statistics:
## x
## y conservative liberal Sum
## Above Average 641 834 1475
## Not Above Average 2689 5292 7981
## Sum 3330 6126 9456
## Observed difference between proportions (conservative-liberal) = 0.0564
## H0: p_conservative - p_liberal = 0
## HA: p_conservative - p_liberal != 0
## Pooled proportion = 0.156
## Check conditions:
## conservative : number of expected successes = 519 ; number of expected failures = 2811
## liberal : number of expected successes = 956 ; number of expected failures = 5170
## Standard error = 0.008
## Test statistic: Z = 7.213
## p-value = 0
### The confidence interval test, “ci”, indicates almost a 6% difference in proportions. The hypothesis test, “ht”, shows a high p-value which rounds to “0”. Thus both methods provide strong evidence is found to reject the null hypothesis and an association is found between political view and reporting an above average income. ### Inference tests with religious activity as a predictor of of people from modest childhood economic cirmanstances achieving above average adult incomes.
inference(df3$income_today,df3$religious,est = "proportion",type = "ci" ,method = "theoretical",success="Above Average")
## Response variable: categorical, Explanatory variable: categorical
## Difference between two proportions -- success: Above Average
## Summary statistics:
## x
## y Active NotActive Sum
## Above Average 897 578 1475
## Not Above Average 4965 3016 7981
## Sum 5862 3594 9456
## Observed difference between proportions (Active-NotActive) = -0.0078
## Check conditions:
## Active : number of successes = 897 ; number of failures = 4965
## NotActive : number of successes = 578 ; number of failures = 3016
## Standard error = 0.0077
## 95 % Confidence interval = ( -0.0229 , 0.0073 )
inference(df3$income_today,df3$religious,est = "proportion",type = "ht" ,method = "theoretical",success="Above Average",null = 0,alternative = "twosided")
## Response variable: categorical, Explanatory variable: categorical
## Difference between two proportions -- success: Above Average
## Summary statistics:
## x
## y Active NotActive Sum
## Above Average 897 578 1475
## Not Above Average 4965 3016 7981
## Sum 5862 3594 9456
## Observed difference between proportions (Active-NotActive) = -0.0078
## H0: p_Active - p_NotActive = 0
## HA: p_Active - p_NotActive != 0
## Pooled proportion = 0.156
## Check conditions:
## Active : number of expected successes = 914 ; number of expected failures = 4948
## NotActive : number of expected successes = 561 ; number of expected failures = 3033
## Standard error = 0.008
## Test statistic: Z = -1.015
## p-value = 0.31
### The confidence interval test, “ci”, indicates less than a 1% difference in proportions. The hypothesis test, “ht”, shows a high p-value (.31) - both methods show significant evidence that the null hypothesis should not be rejected (no association is found).
Summary It was noted that the exploratory data analysis provided good information to help predict the potential for association between potential predictor variables and the chosen response variable. It should be noted that the exploratory data analysis for “political view”" as a predictor of above average income was not very strong (as compared to “education”) and that the inference analysis was particullarly important to prove this association. It is acknowledged that since the GSS survey is not an experiment or observational study and that confounding factors may impact the associations found in this analysis - thus a carefully constructed study will be required to determine actual associations and causality. A study that documented actual income vrs self reported income perception would be a major improvement. Also confirmation of actual college degrees including the student’s major, GPA, and any advanced degrees would be of interest and improve future analysis.