library(ggplot2)
library(dplyr)
library(statsr)load("gss.Rdata")What are the most important traits that people want in a job? Do these preferences change significantly with education?
GSS asked each respondent the following:Please look at this card and tell me which one thing on this list you would most prefer in a job? b. Which comes next? c. Which is third most important? d. Which is fourth most important?
- High income
- No danger of being fired
- Working hours are short, lots of free time
- Chances for advancement
- Work important and gives a feeling of accomplishment
Each respondent prioritized these traits accordingly, and the results are in jobinc, jobsec, jobhour, jobpromo, and jobmeans.
job_fct_rank to indicate the ranking score and factor to indicate which of the five factors being described.
gss_jobpref <- gss %>% filter(!is.na(jobinc), !is.na(jobsec), !is.na(jobhour), !is.na(jobpromo), !is.na(jobmeans), !is.na(degree)) %>% mutate(jobinc=as.integer(jobinc), jobsec=as.integer(jobsec), jobhour=as.integer(jobhour), jobpromo=as.integer(jobpromo), jobmeans=as.integer(jobmeans))
jobinc_resp <- cbind(gss_jobpref, gss_jobpref$jobinc, rep("jobinc"))
jobsec_resp <- cbind(gss_jobpref, gss_jobpref$jobsec, rep("jobsec"))
jobhour_resp <- cbind(gss_jobpref, gss_jobpref$jobhour, rep("jobhour"))
jobpromo_resp <- cbind(gss_jobpref, gss_jobpref$jobpromo, rep("jobpromo"))
jobmeans_resp <- cbind(gss_jobpref, gss_jobpref$jobmeans, rep("jobmeans"))
cn <- append(colnames(gss_jobpref),c("job_fct_rank","factor"))
colnames(jobinc_resp) <- colnames(jobsec_resp) <- colnames(jobhour_resp) <- colnames(jobpromo_resp) <- colnames(jobmeans_resp) <- cn
jobpref_tallform <- rbind(jobinc_resp,jobsec_resp,jobhour_resp,jobpromo_resp,jobmeans_resp)Let’s look at the overall distribution, first with frequencies and then with proportions:
jobpref_table <- table(jobpref_tallform %>% select(factor,job_fct_rank))
jobpref_table## job_fct_rank
## factor 1 2 3 4 5
## jobinc 4292 4884 5942 3504 1486
## jobsec 1672 2688 4023 6402 5323
## jobhour 926 1785 2260 4832 10305
## jobpromo 3737 6930 4723 3003 1715
## jobmeans 9481 3821 3160 2367 1279
sweep(jobpref_table[,], 1, rowSums(jobpref_table[,]), FUN="/")## job_fct_rank
## factor 1 2 3 4 5
## jobinc 0.21344738 0.24288840 0.29550428 0.17425900 0.07390093
## jobsec 0.08315098 0.13367814 0.20006962 0.31838074 0.26472051
## jobhour 0.04605132 0.08877064 0.11239308 0.24030237 0.51248259
## jobpromo 0.18584643 0.34463895 0.23488164 0.14934354 0.08528944
## jobmeans 0.47150388 0.19002387 0.15715138 0.11771434 0.06360652
It’s easier to interpret the proportions with bar charts. Each chart shows for a single factor what was the distribution of respondent rankings (1st, 2nd, 3rd, 4th, 5th):
jobpref_tallform %>% ggplot(aes(x=job_fct_rank,fill=factor)) + geom_histogram(aes(color=factor,y=..density..),position="dodge",binwidth = 1) + facet_wrap(~factor)Notice the extreme right skew of jobmeans and the extreme left skew of jobhour This indicates that meaning is most often selected first by a wide margin, and hours is most often selected last also by a wide margin. The other three factors, income, security, and promotions, have distributions that are a little bit more uniform with less extreme skew.
To simplify our analysis, we will focus on just one factor, jobinc, and explore how various respondent demographics affect the importance of income among factors they consider in assessing their job.
Ultimately we really want to look at how education (degree) affects job factor preferences, so let’s check that:
gss_jobpref %>% filter(!is.na(educ)) %>% group_by(degree) %>% summarise(n=n(), avg_yrs_school=mean(educ))## # A tibble: 5 x 3
## degree n avg_yrs_school
## <fctr> <int> <dbl>
## 1 Lt High School 5040 8.767857
## 2 High School 10551 12.558430
## 3 Junior College 826 14.369249
## 4 Bachelor 2464 16.340097
## 5 Graduate 1202 18.316140
By education (degree)
jobpref_inc <- jobpref_tallform %>% filter(factor=="jobinc")
jobpref_inc %>% ggplot(aes(x=job_fct_rank,fill=degree)) + geom_histogram(aes(color=degree,y=..density..),position="dodge",binwidth = 1) + facet_wrap(~degree)jobpref_inc_tb <- table(jobpref_inc %>% select(degree,job_fct_rank))
jobpref_inc_tb## job_fct_rank
## degree 1 2 3 4 5
## Lt High School 1453 1089 1155 979 376
## High School 2237 2633 3179 1811 701
## Junior College 155 221 252 137 62
## Bachelor 318 623 933 399 192
## Graduate 129 318 423 178 155
chisq.test(jobpref_inc_tb[,])##
## Pearson's Chi-squared test
##
## data: jobpref_inc_tb[, ]
## X-squared = 526.16, df = 16, p-value < 2.2e-16
By job satisfaction
jobpref_inc %>% ggplot(aes(x=job_fct_rank,fill=satjob)) + geom_histogram(aes(color=satjob,y=..density..),position="dodge",binwidth = 1) + facet_wrap(~satjob)jobpref_inc_tb <- table(jobpref_inc %>% select(satjob,job_fct_rank))
jobpref_inc_tb## job_fct_rank
## satjob 1 2 3 4 5
## Very Satisfied 1236 1761 2203 1266 511
## Mod. Satisfied 1384 1382 1635 900 391
## A Little Dissat 437 417 446 212 79
## Very Dissatisfied 167 186 179 102 28
chisq.test(jobpref_inc_tb[,])##
## Pearson's Chi-squared test
##
## data: jobpref_inc_tb[, ]
## X-squared = 155.31, df = 12, p-value < 2.2e-16
By region of interview
jobpref_inc %>% ggplot(aes(x=job_fct_rank,fill=region)) + geom_histogram(aes(color=region,y=..density..),position="dodge",binwidth = 1) + facet_wrap(~region)jobpref_inc_tb <- table(jobpref_inc %>% select(region,job_fct_rank))
jobpref_inc_tb## job_fct_rank
## region 1 2 3 4 5
## New England 185 251 284 162 67
## Middle Atlantic 698 754 853 501 203
## E. Nor. Central 730 945 1269 773 301
## W. Nor. Central 261 361 489 296 119
## South Atlantic 983 866 1042 585 271
## E. Sou. Central 388 328 340 220 87
## W. Sou. Central 411 419 487 277 135
## Mountain 163 265 374 203 95
## Pacific 473 695 804 487 208
chisq.test(jobpref_inc_tb[,])##
## Pearson's Chi-squared test
##
## data: jobpref_inc_tb[, ]
## X-squared = 229.46, df = 32, p-value < 2.2e-16
By religious preference (just Protestant vs Catholic)
jobpref_inc %>% filter(relig=="Protestant" | relig=="Catholic") %>% ggplot(aes(x=job_fct_rank,fill=relig)) + geom_histogram(aes(color=relig,y=..density..),position="dodge",binwidth = 1) + facet_wrap(~relig)jobpref_inc_tb <- table(jobpref_inc %>% filter(relig=="Protestant" | relig=="Catholic") %>% select(relig,job_fct_rank))
jobpref_inc_tb <- rbind(jobpref_inc_tb[1,],rbind(jobpref_inc_tb[2,]))
jobpref_inc_tb## 1 2 3 4 5
## [1,] 2695 2957 3734 2217 885
## [2,] 1018 1267 1482 880 320
chisq.test(jobpref_inc_tb[,])##
## Pearson's Chi-squared test
##
## data: jobpref_inc_tb[, ]
## X-squared = 9.0282, df = 4, p-value = 0.0604
By how often read newspaper
jobpref_inc %>% ggplot(aes(x=job_fct_rank,fill=news)) + geom_histogram(aes(color=news,y=..density..),position="dodge",binwidth = 1) + facet_wrap(~news)jobpref_inc_tb <- table(jobpref_inc %>% select(news,job_fct_rank))
jobpref_inc_tb## job_fct_rank
## news 1 2 3 4 5
## Everyday 955 1164 1458 893 379
## Few Times A Week 505 551 596 298 159
## Once A Week 307 297 323 191 100
## Less Than Once Wk 239 188 182 113 109
## Never 201 136 117 137 126
chisq.test(jobpref_inc_tb[,])##
## Pearson's Chi-squared test
##
## data: jobpref_inc_tb[, ]
## X-squared = 225.9, df = 16, p-value < 2.2e-16
By marital status
jobpref_inc %>% ggplot(aes(x=job_fct_rank,fill=marital)) + geom_histogram(aes(color=marital,y=..density..),position="dodge",binwidth = 1) + facet_wrap(~marital)jobpref_inc_tb <- table(jobpref_inc %>% select(marital,job_fct_rank))
jobpref_inc_tb## job_fct_rank
## marital 1 2 3 4 5
## Married 2323 2874 3632 1997 807
## Widowed 363 355 504 484 226
## Divorced 470 530 620 352 150
## Separated 220 166 159 103 40
## Never Married 916 957 1024 568 261
chisq.test(jobpref_inc_tb[,])##
## Pearson's Chi-squared test
##
## data: jobpref_inc_tb[, ]
## X-squared = 271.09, df = 16, p-value < 2.2e-16
We will perform a chi squared test for independence. Before proceeding, we build intuition by displaying the values of our charts as tables of proportions:
jobinc_prop <- sweep(jobpref_table[,,1], 1, rowSums(jobpref_table[,,1]), FUN="/")
jobsec_prop <- sweep(jobpref_table[,,2], 1, rowSums(jobpref_table[,,2]), FUN="/")
jobhours_prop <- sweep(jobpref_table[,,3], 1, rowSums(jobpref_table[,,3]), FUN="/")
jobpromo_prop <- sweep(jobpref_table[,,4], 1, rowSums(jobpref_table[,,4]), FUN="/")
jobmeans_prop <- sweep(jobpref_table[,,5], 1, rowSums(jobpref_table[,,5]), FUN="/")
jobinc_prop## job_fct_rank
## degree 1 2 3 4 5
## Lt High School 0.28760887 0.21555819 0.22862233 0.19378464 0.07442597
## High School 0.21181706 0.24931351 0.30101316 0.17147997 0.06637629
## Junior College 0.18742443 0.26723096 0.30471584 0.16565901 0.07496977
## Bachelor 0.12900609 0.25273834 0.37849899 0.16186613 0.07789047
## Graduate 0.10723192 0.26433915 0.35162095 0.14796342 0.12884456
jobsec_prop## job_fct_rank
## degree 1 2 3 4 5
## Lt High School 0.12509897 0.19714964 0.23357086 0.26346002 0.18072051
## High School 0.07669728 0.12432535 0.20263233 0.34172900 0.25461604
## Junior College 0.07496977 0.10157195 0.17775091 0.33736397 0.30834341
## Bachelor 0.04300203 0.07545639 0.15172414 0.32981744 0.40000000
## Graduate 0.05153782 0.09060682 0.15128845 0.30756442 0.39900249
jobhours_prop## job_fct_rank
## degree 1 2 3 4 5
## Lt High School 0.04552652 0.09718923 0.12509897 0.20526524 0.52692003
## High School 0.04204147 0.07726541 0.10084272 0.23454218 0.54530821
## Junior College 0.05441354 0.09673519 0.11487304 0.25876663 0.47521161
## Bachelor 0.05436105 0.10344828 0.12089249 0.29736308 0.42393509
## Graduate 0.06068163 0.11886949 0.14131338 0.30839568 0.37073982
jobpromo_prop## job_fct_rank
## degree 1 2 3 4 5
## Lt High School 0.20348377 0.30146477 0.23258116 0.15439430 0.10807601
## High School 0.20140138 0.35195531 0.22990247 0.14184263 0.07489821
## Junior College 0.17170496 0.33131802 0.25272068 0.14631197 0.09794438
## Bachelor 0.13265720 0.39391481 0.24178499 0.15294118 0.07870183
## Graduate 0.09393184 0.36990856 0.26184539 0.18869493 0.08561929
jobmeans_prop## job_fct_rank
## degree 1 2 3 4 5
## Lt High School 0.33828187 0.18863816 0.18012668 0.18309580 0.10985748
## High School 0.46804280 0.19714042 0.16560932 0.11040621 0.05880125
## Junior College 0.51148730 0.20314389 0.14993954 0.09189843 0.04353083
## Bachelor 0.64097363 0.17444219 0.10709939 0.05801217 0.01947262
## Graduate 0.68661679 0.15627598 0.09393184 0.04738155 0.01579385
Each table shows the preferences expressed for one of the five factors. Each row of each table corresponds to one of the 25 charts above, singling out how those with one specific education level have expressed their preferences for one specific job factor. These values are proportions and so each row adds up to one. So our question becomes: for each table above, are the proportions of 1- 2- 3- 4- and 5- responses independent of education, or is there strong statistical evidence suggesting a relationship between education and these values?
Income
chisq.test(jobpref_table[,,1])##
## Pearson's Chi-squared test
##
## data: jobpref_table[, , 1]
## X-squared = 526.16, df = 16, p-value < 2.2e-16
Security
chisq.test(jobpref_table[,,2])##
## Pearson's Chi-squared test
##
## data: jobpref_table[, , 2]
## X-squared = 959.07, df = 16, p-value < 2.2e-16
Hours
chisq.test(jobpref_table[,,3])##
## Pearson's Chi-squared test
##
## data: jobpref_table[, , 3]
## X-squared = 279.69, df = 16, p-value < 2.2e-16
Advancement
chisq.test(jobpref_table[,,4])##
## Pearson's Chi-squared test
##
## data: jobpref_table[, , 4]
## X-squared = 235.38, df = 16, p-value < 2.2e-16
Accomplishment/Meaning
chisq.test(jobpref_table[,,5])##
## Pearson's Chi-squared test
##
## data: jobpref_table[, , 5]
## X-squared = 1183.2, df = 16, p-value < 2.2e-16
These chi squared scores are extremely high. Something seems off. Let’s ignore selections for 3rd, 4th and 5th place, so that these low-ranking preferences do not affect our analysis.
jobpref_top <- jobpref_tallform %>% select(degree,job_fct_rank,factor) %>% mutate(job_fct_rank=ifelse(job_fct_rank>1,2,job_fct_rank))
jobpref_top %>% filter(factor=="jobinc") %>% ggplot(aes(x=job_fct_rank,fill=degree)) + geom_histogram(aes(color=degree,y=..density..),position="dodge",binwidth = 1) + facet_wrap(~degree) + ggtitle("Proportion of FIRST RANK for INCOME by Education (degree)")jobpref_top %>% filter(factor=="jobsec") %>% ggplot(aes(x=job_fct_rank,fill=degree)) + geom_histogram(aes(color=degree,y=..density..),position="dodge",binwidth = 1) + facet_wrap(~degree) + ggtitle("Proportion of FIRST RANK for SECURITY by Education (degree)")jobpref_top %>% filter(factor=="jobhour") %>% ggplot(aes(x=job_fct_rank,fill=degree)) + geom_histogram(aes(color=degree,y=..density..),position="dodge",binwidth = 1) + facet_wrap(~degree) + ggtitle("Proportion of FIRST RANK for HOURS by Education (degree)")jobpref_top %>% filter(factor=="jobpromo") %>% ggplot(aes(x=job_fct_rank,fill=degree)) + geom_histogram(aes(color=degree,y=..density..),position="dodge",binwidth = 1) + facet_wrap(~degree) + ggtitle("Proportion of FIRST RANK for ADVANCEMENT by Education (degree)")jobpref_top %>% filter(factor=="jobmeans") %>% ggplot(aes(x=job_fct_rank,fill=degree)) + geom_histogram(aes(color=degree,y=..density..),position="dodge",binwidth = 1) + facet_wrap(~degree) + ggtitle("Proportion of FIRST RANK for ACCOMPLISHMENT+MEANING by Education (degree)")Let’s check HOURS, where the differences seem LEAST significant
jobpref_toptbl <- table(jobpref_top)
chisq.test(jobpref_toptbl[,,3])##
## Pearson's Chi-squared test
##
## data: jobpref_toptbl[, , 3]
## X-squared = 14.949, df = 4, p-value = 0.004807
What exactly are the “significant” differences we are seeing?
jobinc_top <- sweep(jobpref_toptbl[,,1], 1, rowSums(jobpref_toptbl[,,1]), FUN="/")
jobsec_top <- sweep(jobpref_toptbl[,,2], 1, rowSums(jobpref_toptbl[,,2]), FUN="/")
jobhours_top <- sweep(jobpref_toptbl[,,3], 1, rowSums(jobpref_toptbl[,,3]), FUN="/")
jobpromo_top <- sweep(jobpref_toptbl[,,4], 1, rowSums(jobpref_toptbl[,,4]), FUN="/")
jobmeans_top <- sweep(jobpref_toptbl[,,5], 1, rowSums(jobpref_toptbl[,,5]), FUN="/")
jobhours_top## job_fct_rank
## degree 1 2
## Lt High School 0.04552652 0.95447348
## High School 0.04204147 0.95795853
## Junior College 0.05441354 0.94558646
## Bachelor 0.05436105 0.94563895
## Graduate 0.06068163 0.93931837
Some of these differences seem not too meaningful, even though statistically significant.