#
dta <- read.csv("data0505.csv", header = T)
options(digits = 3)
pacman::p_load(tidyverse, ggplot2, knitr, furniture, gmodels)
dta <- dta %>% mutate( Gender = relevel(Gender, ref = "女"),
Sector = relevel(Sector, ref = "私立"),
Field = relevel(Field, ref = "遊憩與運動學群"),
EduLv = factor(EduLv, levels=c("博士","碩士","普通大學","科技大學",
"技術學院","五專","三專",
"二專","高中","高職","國中")),
EduLv = relevel(EduLv, ref = "技術學院"),
Region = factor(Region, levels =c("宜花東離島","北北基","桃竹苗",
"中彰投","雲嘉南","高屏澎")),
Age = as.numeric(Age),
J_year = as.numeric(J_year),
JobZone = as.numeric(JobZone),
EduZone = as.numeric(EduZone),
JobZone_D = as.numeric(EduZone-JobZone),
Salary = as.numeric(Salary),
SubEduOver = relevel(SubEduOver, ref="符合工作要求"),
Core = recode_factor(as.factor(JobCor), "1" = "無關聯",
"2" = "部分關聯",
"3" = "核心關聯"),
SubEduOver = factor(SubEduOver,levels =c("符合工作要求","高於工作要求","低於工作要求")))
# data construction
glimpse(dta)
## Observations: 1,571
## Variables: 26
## $ SID <fctr> A1, A10, A100, A102, A103, A104, A105, A106, A107...
## $ Gender <fctr> 女, 女, 女, 女, 男, 女, 女, 女, 女, 女, 女, 男, 男, 男, 女, 男, 男...
## $ Sector <fctr> 國立(公立), 國立(公立), 私立, 國立(公立), 國立(公立), 國立(公立), 國立(公立...
## $ EduLv <fctr> 碩士, 碩士, 普通大學, 普通大學, 高職, 普通大學, 普通大學, 普通大學, 普通大學, 普...
## $ SubEduOver <fctr> 符合工作要求, 高於工作要求, 符合工作要求, 符合工作要求, 符合工作要求, 符合工作要求, 符...
## $ Require <fctr> 碩士, 高中/高職, 普通大學, 普通大學, 高中/高職, 普通大學, 普通大學, 普通大學, 普...
## $ Field <fctr> 教育學群, 資訊學群, 外語學群, 教育學群, 工程學群, 文史哲學群, 文史哲學群, 大眾傳播學...
## $ City <fctr> 臺南市, 高雄市, 苗栗縣, 新北市, 高雄市, 南投縣, 嘉義市, 臺北市, 臺北市, 南投縣,...
## $ Category <fctr> 受雇於公營機關, 受雇於公營機關, 受雇於公營機關, 受雇於公營機關, 受雇者於私營企業, 受雇於...
## $ Staff <fctr> 10-29人, 50-99人, 50-99人, 10-29人, 2-9人, 100-199人, 1...
## $ Hours <int> 48, 40, 70, 50, 57, 51, 64, 50, 50, 47, 50, 60, 45...
## $ J_year <dbl> 2, 8, 4, 1, 21, 1, 6, 0, 1, 1, 17, 7, 3, 23, 1, 2,...
## $ J_total <dbl> 2, 8, 4, 1, 30, 1, 6, 0, 2, 2, 28, 7, 30, 26, 1, 1...
## $ income <fctr> 2萬以下, 2-3萬以下, 3-4萬以下, 4-5萬以下, 3-4萬以下, 4-5萬以下, 2萬以...
## $ SubMismatch <int> 4, 2, 3, 4, 5, 4, 5, 4, 3, 3, 4, 4, 4, 4, 5, 5, 3,...
## $ JobSat <int> 6, 4, 3, 5, 5, 6, 7, 5, 3, 6, 3, 5, 4, 7, 3, 4, 4,...
## $ EduZone <dbl> 5, 5, 4, 4, 2, 4, 4, 4, 4, 4, 5, 5, 3, 5, 4, 4, 4,...
## $ Region <fctr> 雲嘉南, 高屏澎, 桃竹苗, 北北基, 高屏澎, 中彰投, 雲嘉南, 北北基, 北北基, 中彰投,...
## $ Salary <dbl> 20000, 25000, 35000, 45000, 35000, 45000, 20000, 3...
## $ Age <dbl> 26, 34, 30, 25, 62, 25, 21, 24, 25, 26, 57, 35, 54...
## $ JobZone <dbl> 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 3,...
## $ JobCor <int> 3, 1, 2, 3, 1, 2, 1, 3, 1, 1, 1, 3, 1, 1, 3, 1, 3,...
## $ Core <fctr> 核心關聯, 無關聯, 部分關聯, 核心關聯, 無關聯, 部分關聯, 無關聯, 核心關聯, 無關聯,...
## $ ObjOver <fctr> over, over, adequate, adequate, under, adequate, ...
## $ X <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ JobZone_D <dbl> 1, 2, 0, 0, -1, 0, 0, 0, 0, 0, 1, 1, -1, 0, 0, 0, ...
# check and pick out
lapply(dta[,c("Sector", "Field", "City", "Region","EduLv", "SubEduOver", "ObjOver")], levels)
## $Sector
## [1] "私立" "國外學校" "國立(公立)"
##
## $Field
## [1] "遊憩與運動學群" "大眾傳播學群" "工程學群" "文史哲學群"
## [5] "外語學群" "生命科學學群" "生物資源學群" "地球與環境學群"
## [9] "法政學群" "社會與心理學群" "建築與設計學群" "財經學群"
## [13] "教育學群" "資訊學群" "管理學群" "數理化學群"
## [17] "醫藥衛生學群" "藝術學群"
##
## $City
## [1] "宜蘭縣" "花蓮縣" "金門縣" "南投縣" "屏東縣" "苗栗縣" "桃園市"
## [8] "高雄市" "基隆市" "雲林縣" "新北市" "新竹市" "新竹縣" "嘉義市"
## [15] "嘉義縣" "彰化縣" "臺中市" "臺北市" "臺東縣" "臺南市" "澎湖縣"
##
## $Region
## [1] "宜花東離島" "北北基" "桃竹苗" "中彰投" "雲嘉南"
## [6] "高屏澎"
##
## $EduLv
## [1] "技術學院" "博士" "碩士" "普通大學" "科技大學" "五專"
## [7] "三專" "二專" "高中" "高職" "國中"
##
## $SubEduOver
## [1] "符合工作要求" "高於工作要求" "低於工作要求"
##
## $ObjOver
## [1] "adequate" "over" "under"
# pick out
names(dta)
## [1] "SID" "Gender" "Sector" "EduLv" "SubEduOver"
## [6] "Require" "Field" "City" "Category" "Staff"
## [11] "Hours" "J_year" "J_total" "income" "SubMismatch"
## [16] "JobSat" "EduZone" "Region" "Salary" "Age"
## [21] "JobZone" "JobCor" "Core" "ObjOver" "X"
## [26] "JobZone_D"
p <- dplyr::select(dta, -City, -income, -JobSat, -X)%>%
filter(Age <= 65 & Age >= 20 & Category != "自營者" )
p <- p[p$EduLv %in% c("博士","碩士", "普通大學", "科技大學", "技術學院"),]
p <- p %>% mutate( EduLv = factor(EduLv, levels=c("博士","碩士","普通大學","科技大學",
"技術學院")),
EduLv = relevel(EduLv, ref = "技術學院"),
Category = factor(Category,levels=c("受雇者於私營企業","受雇於公營機關")))
# hunt for missing values by variables
apply(apply(p, 1, is.na), 1, sum)
## SID Gender Sector EduLv SubEduOver Require
## 0 0 0 0 0 0
## Field Category Staff Hours J_year J_total
## 0 0 0 0 0 0
## SubMismatch EduZone Region Salary Age JobZone
## 0 0 0 0 0 0
## JobCor Core ObjOver JobZone_D
## 0 0 0 0
#
# define function
d <- function(x){
data.frame(n = t(t(table(x))),
prop = t(t(prop.table(table(x)))),
mean = aggregate(Salary ~ x, p, mean),
sd = aggregate(Salary ~ x, p, sd))
}
#
# table
des <- as.data.frame(rbind(d(p$Gender),d(p$Sector),d(p$EduLv),d(p$Field),d(p$Category),d(p$Region),d(p$SubEduOver),d(p$SubMismatch),d(p$ObjOver),d(p$Core)))%>%
select(c(1,3,6,8,10))
## Warning in `[<-.factor`(`*tmp*`, ri, value = 1:5): invalid factor level, NA
## generated
## Warning in `[<-.factor`(`*tmp*`, ri, value = 1:5): invalid factor level, NA
## generated
kable(des)
| 女 |
865 |
0.606 |
41376 |
16341 |
| 男 |
562 |
0.394 |
53817 |
32063 |
| 私立 |
568 |
0.398 |
40616 |
20534 |
| 國外學校 |
24 |
0.017 |
66250 |
41631 |
| 國立(公立) |
835 |
0.585 |
49551 |
25552 |
| 技術學院 |
39 |
0.027 |
42051 |
23944 |
| 博士 |
9 |
0.006 |
76111 |
14530 |
| 碩士 |
449 |
0.315 |
58742 |
31556 |
| 普通大學 |
672 |
0.471 |
41287 |
17390 |
| 科技大學 |
258 |
0.181 |
37171 |
16631 |
| 遊憩與運動學群 |
29 |
0.020 |
42241 |
21654 |
| 大眾傳播學群 |
45 |
0.032 |
35889 |
12937 |
| 工程學群 |
224 |
0.157 |
59866 |
32648 |
| 文史哲學群 |
86 |
0.060 |
40988 |
16066 |
| 外語學群 |
78 |
0.055 |
43782 |
16597 |
| 生命科學學群 |
35 |
0.025 |
44714 |
16713 |
| 生物資源學群 |
22 |
0.015 |
42273 |
15791 |
| 地球與環境學群 |
23 |
0.016 |
45870 |
12400 |
| 法政學群 |
63 |
0.044 |
42698 |
14751 |
| 社會與心理學群 |
125 |
0.088 |
38320 |
13228 |
| 建築與設計學群 |
48 |
0.034 |
43542 |
41794 |
| 財經學群 |
102 |
0.071 |
43529 |
23679 |
| 教育學群 |
103 |
0.072 |
45631 |
12321 |
| 資訊學群 |
110 |
0.077 |
50227 |
26513 |
| 管理學群 |
141 |
0.099 |
41738 |
22063 |
| 數理化學群 |
60 |
0.042 |
51000 |
18794 |
| 醫藥衛生學群 |
99 |
0.069 |
49293 |
33258 |
| 藝術學群 |
34 |
0.024 |
36324 |
14735 |
| 受雇者於私營企業 |
857 |
0.601 |
45163 |
27905 |
| 受雇於公營機關 |
570 |
0.399 |
47947 |
18319 |
| 宜花東離島 |
51 |
0.036 |
45294 |
16983 |
| 北北基 |
466 |
0.327 |
46406 |
21342 |
| 桃竹苗 |
207 |
0.145 |
55966 |
29606 |
| 中彰投 |
213 |
0.149 |
43638 |
27079 |
| 雲嘉南 |
246 |
0.172 |
43496 |
23814 |
| 高屏澎 |
244 |
0.171 |
43115 |
23455 |
| 符合工作要求 |
982 |
0.688 |
49038 |
27107 |
| 高於工作要求 |
299 |
0.210 |
41020 |
16417 |
| 低於工作要求 |
146 |
0.102 |
38459 |
15398 |
| 1 |
177 |
0.124 |
40819 |
24298 |
| 2 |
251 |
0.176 |
42530 |
18881 |
| 3 |
360 |
0.252 |
43819 |
22243 |
| 4 |
444 |
0.311 |
49212 |
25182 |
| 5 |
195 |
0.137 |
53897 |
30670 |
| adequate |
438 |
0.307 |
44680 |
24990 |
| over |
927 |
0.650 |
46882 |
24265 |
| under |
62 |
0.043 |
48468 |
25697 |
| 無關聯 |
873 |
0.612 |
43975 |
22626 |
| 部分關聯 |
188 |
0.132 |
51516 |
27493 |
| 核心關聯 |
366 |
0.256 |
49071 |
26678 |
kable(table1(p,SubEduOver,splitby = ~ ObjOver, row_wise=T, output = 'text2'))
|
n = 438 |
n = 927 |
n = 62 |
| SubEduOver |
|
|
|
| 符合工作要求 |
340 (34.6%) |
589 (60%) |
53 (5.4%) |
| 高於工作要求 |
62 (20.7%) |
232 (77.6%) |
5 (1.7%) |
| 低於工作要求 |
36 (24.7%) |
106 (72.6%) |
4 (2.7%) |
|
kable(table1(p,as.factor(SubMismatch),splitby = ~ Core, row_wise=T, output = 'text2'))
|
n = 873 |
n = 188 |
n = 366 |
| as.factor.SubMismatch. |
|
|
|
| 1 |
161 (91%) |
6 (3.4%) |
10 (5.6%) |
| 2 |
193 (76.9%) |
22 (8.8%) |
36 (14.3%) |
| 3 |
232 (64.4%) |
48 (13.3%) |
80 (22.2%) |
| 4 |
210 (47.3%) |
84 (18.9%) |
150 (33.8%) |
| 5 |
77 (39.5%) |
28 (14.4%) |
90 (46.2%) |
|
kable(table(p$SubEduOver,p$ObjOver))
| 符合工作要求 |
340 |
589 |
53 |
| 高於工作要求 |
62 |
232 |
5 |
| 低於工作要求 |
36 |
106 |
4 |
# relation
pc <- select(p, Salary,Age,Hours, J_year,J_total )
tableC(pc, cor_type="pearson")
## N = 1427
## Note: pearson correlation (p-value).
##
## ────────────────────────────────────────────────────────────────────────────
## [1] [2] [3] [4]
## [1]Salary 1.00
## [2]Age 0.364 (<.001) 1.00
## [3]Hours 0.112 (<.001) -0.041 (0.124) 1.00
## [4]J_year 0.26 (<.001) 0.721 (<.001) -0.048 (0.07) 1.00
## [5]J_total 0.277 (<.001) 0.886 (<.001) -0.047 (0.078) 0.792 (<.001)
## [5]
##
##
##
##
## 1.00
## ────────────────────────────────────────────────────────────────────────────
# normal way correlation
cor(pc, use="complete.obs", method="spearman")
## Salary Age Hours J_year J_total
## Salary 1.000 0.4390 0.1197 0.3338 0.3069
## Age 0.439 1.0000 -0.0283 0.6508 0.8293
## Hours 0.120 -0.0283 1.0000 -0.0227 -0.0339
## J_year 0.334 0.6508 -0.0227 1.0000 0.7259
## J_total 0.307 0.8293 -0.0339 0.7259 1.0000
#
# Chi-squared
# normal Pearson
chisq.test(table(p$SubEduOver, p$ObjOver))
##
## Pearson's Chi-squared test
##
## data: table(p$SubEduOver, p$ObjOver)
## X-squared = 40, df = 4, p-value = 2e-07
# Yate's correct
chisq.test(table(p$SubEduOver, p$ObjOver), correct = T)
##
## Pearson's Chi-squared test
##
## data: table(p$SubEduOver, p$ObjOver)
## X-squared = 40, df = 4, p-value = 2e-07
# fisher fisher.test(table(p$SubEduOver, p$ObjOver))
# Pearson's
library(vcd)
## Warning: package 'vcd' was built under R version 3.4.4
## Loading required package: grid
assocstats(table(p$SubEduOver, p$ObjOver))#詳細報表
## X^2 df P(> X^2)
## Likelihood Ratio 39.295 4 6.0553e-08
## Pearson 37.024 4 1.7812e-07
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.159
## Cramer's V : 0.114
independence_table(table(p$SubEduOver, p$ObjOver))#無關時的期望值
##
## adequate over under
## 符合工作要求 301.4 637.9 42.67
## 高於工作要求 91.8 194.2 12.99
## 低於工作要求 44.8 94.8 6.34
#
# plot
# age
ggplot(p , aes(x = Age))+
geom_bar(position="dodge")+
geom_vline(xintercept = mean(p$Age), color = "black", linetype = 2) +
scale_x_continuous(limits=c(20,65), breaks=seq(20,65, by = 5))+
scale_y_continuous(limits=c(0,180), breaks=seq(0,180, by = 20))+
labs(x = "年齡",y = "人數")+
theme_bw()

# salary
ggplot(p, aes(x = as.factor(Salary))) +
geom_bar(position="dodge", width =1)+
labs(x = "薪資",y = "人數") +
theme_bw() +
theme(axis.text.x = element_text(hjust = 1, angle =30))
