#
dta <- read.csv("data0505.csv", header = T)
options(digits = 3)
pacman::p_load(tidyverse, ggplot2, knitr, furniture, gmodels)
dta <- dta %>% mutate( Gender = relevel(Gender, ref = "女"),
                       Sector = relevel(Sector, ref = "私立"),
                       Field = relevel(Field, ref = "遊憩與運動學群"),
                       EduLv = factor(EduLv, levels=c("博士","碩士","普通大學","科技大學",
                                                      "技術學院","五專","三專",
                                                      "二專","高中","高職","國中")),
                       EduLv = relevel(EduLv, ref = "技術學院"),
                       Region = factor(Region, levels =c("宜花東離島","北北基","桃竹苗",
                                                         "中彰投","雲嘉南","高屏澎")),
                       Age = as.numeric(Age), 
                       J_year = as.numeric(J_year), 
                       JobZone = as.numeric(JobZone),
                       EduZone = as.numeric(EduZone),
                       JobZone_D = as.numeric(EduZone-JobZone),
                       Salary = as.numeric(Salary),
                       SubEduOver = relevel(SubEduOver, ref="符合工作要求"),
                       Core = recode_factor(as.factor(JobCor), "1" = "無關聯",
                                            "2" = "部分關聯",
                                            "3" = "核心關聯"),
                       SubEduOver = factor(SubEduOver,levels =c("符合工作要求","高於工作要求","低於工作要求")))

# data construction
glimpse(dta)
## Observations: 1,571
## Variables: 26
## $ SID         <fctr> A1, A10, A100, A102, A103, A104, A105, A106, A107...
## $ Gender      <fctr> 女, 女, 女, 女, 男, 女, 女, 女, 女, 女, 女, 男, 男, 男, 女, 男, 男...
## $ Sector      <fctr> 國立(公立), 國立(公立), 私立, 國立(公立), 國立(公立), 國立(公立), 國立(公立...
## $ EduLv       <fctr> 碩士, 碩士, 普通大學, 普通大學, 高職, 普通大學, 普通大學, 普通大學, 普通大學, 普...
## $ SubEduOver  <fctr> 符合工作要求, 高於工作要求, 符合工作要求, 符合工作要求, 符合工作要求, 符合工作要求, 符...
## $ Require     <fctr> 碩士, 高中/高職, 普通大學, 普通大學, 高中/高職, 普通大學, 普通大學, 普通大學, 普...
## $ Field       <fctr> 教育學群, 資訊學群, 外語學群, 教育學群, 工程學群, 文史哲學群, 文史哲學群, 大眾傳播學...
## $ City        <fctr> 臺南市, 高雄市, 苗栗縣, 新北市, 高雄市, 南投縣, 嘉義市, 臺北市, 臺北市, 南投縣,...
## $ Category    <fctr> 受雇於公營機關, 受雇於公營機關, 受雇於公營機關, 受雇於公營機關, 受雇者於私營企業, 受雇於...
## $ Staff       <fctr> 10-29人, 50-99人, 50-99人, 10-29人, 2-9人, 100-199人, 1...
## $ Hours       <int> 48, 40, 70, 50, 57, 51, 64, 50, 50, 47, 50, 60, 45...
## $ J_year      <dbl> 2, 8, 4, 1, 21, 1, 6, 0, 1, 1, 17, 7, 3, 23, 1, 2,...
## $ J_total     <dbl> 2, 8, 4, 1, 30, 1, 6, 0, 2, 2, 28, 7, 30, 26, 1, 1...
## $ income      <fctr> 2萬以下, 2-3萬以下, 3-4萬以下, 4-5萬以下, 3-4萬以下, 4-5萬以下, 2萬以...
## $ SubMismatch <int> 4, 2, 3, 4, 5, 4, 5, 4, 3, 3, 4, 4, 4, 4, 5, 5, 3,...
## $ JobSat      <int> 6, 4, 3, 5, 5, 6, 7, 5, 3, 6, 3, 5, 4, 7, 3, 4, 4,...
## $ EduZone     <dbl> 5, 5, 4, 4, 2, 4, 4, 4, 4, 4, 5, 5, 3, 5, 4, 4, 4,...
## $ Region      <fctr> 雲嘉南, 高屏澎, 桃竹苗, 北北基, 高屏澎, 中彰投, 雲嘉南, 北北基, 北北基, 中彰投,...
## $ Salary      <dbl> 20000, 25000, 35000, 45000, 35000, 45000, 20000, 3...
## $ Age         <dbl> 26, 34, 30, 25, 62, 25, 21, 24, 25, 26, 57, 35, 54...
## $ JobZone     <dbl> 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 3,...
## $ JobCor      <int> 3, 1, 2, 3, 1, 2, 1, 3, 1, 1, 1, 3, 1, 1, 3, 1, 3,...
## $ Core        <fctr> 核心關聯, 無關聯, 部分關聯, 核心關聯, 無關聯, 部分關聯, 無關聯, 核心關聯, 無關聯,...
## $ ObjOver     <fctr> over, over, adequate, adequate, under, adequate, ...
## $ X           <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ JobZone_D   <dbl> 1, 2, 0, 0, -1, 0, 0, 0, 0, 0, 1, 1, -1, 0, 0, 0, ...
# check and pick out
lapply(dta[,c("Sector", "Field", "City", "Region","EduLv", "SubEduOver", "ObjOver")], levels)
## $Sector
## [1] "私立"         "國外學校"     "國立(公立)"
## 
## $Field
##  [1] "遊憩與運動學群" "大眾傳播學群"   "工程學群"       "文史哲學群"    
##  [5] "外語學群"       "生命科學學群"   "生物資源學群"   "地球與環境學群"
##  [9] "法政學群"       "社會與心理學群" "建築與設計學群" "財經學群"      
## [13] "教育學群"       "資訊學群"       "管理學群"       "數理化學群"    
## [17] "醫藥衛生學群"   "藝術學群"      
## 
## $City
##  [1] "宜蘭縣" "花蓮縣" "金門縣" "南投縣" "屏東縣" "苗栗縣" "桃園市"
##  [8] "高雄市" "基隆市" "雲林縣" "新北市" "新竹市" "新竹縣" "嘉義市"
## [15] "嘉義縣" "彰化縣" "臺中市" "臺北市" "臺東縣" "臺南市" "澎湖縣"
## 
## $Region
## [1] "宜花東離島" "北北基"     "桃竹苗"     "中彰投"     "雲嘉南"    
## [6] "高屏澎"    
## 
## $EduLv
##  [1] "技術學院" "博士"     "碩士"     "普通大學" "科技大學" "五專"    
##  [7] "三專"     "二專"     "高中"     "高職"     "國中"    
## 
## $SubEduOver
## [1] "符合工作要求" "高於工作要求" "低於工作要求"
## 
## $ObjOver
## [1] "adequate" "over"     "under"
# pick out
names(dta)
##  [1] "SID"         "Gender"      "Sector"      "EduLv"       "SubEduOver" 
##  [6] "Require"     "Field"       "City"        "Category"    "Staff"      
## [11] "Hours"       "J_year"      "J_total"     "income"      "SubMismatch"
## [16] "JobSat"      "EduZone"     "Region"      "Salary"      "Age"        
## [21] "JobZone"     "JobCor"      "Core"        "ObjOver"     "X"          
## [26] "JobZone_D"
p <- dplyr::select(dta, -City, -income, -JobSat, -X)%>%
  filter(Age <= 65 & Age >= 20 & Category != "自營者" )
p <- p[p$EduLv %in% c("博士","碩士", "普通大學", "科技大學", "技術學院"),]
p <- p %>% mutate(   EduLv = factor(EduLv, levels=c("博士","碩士","普通大學","科技大學",
                                                      "技術學院")),
                       EduLv = relevel(EduLv, ref = "技術學院"),
                       Category = factor(Category,levels=c("受雇者於私營企業","受雇於公營機關")))

# hunt for missing values by variables
apply(apply(p, 1, is.na), 1, sum)
##         SID      Gender      Sector       EduLv  SubEduOver     Require 
##           0           0           0           0           0           0 
##       Field    Category       Staff       Hours      J_year     J_total 
##           0           0           0           0           0           0 
## SubMismatch     EduZone      Region      Salary         Age     JobZone 
##           0           0           0           0           0           0 
##      JobCor        Core     ObjOver   JobZone_D 
##           0           0           0           0
#
# define function
d <- function(x){
  data.frame(n = t(t(table(x))),
             prop = t(t(prop.table(table(x)))),
             mean = aggregate(Salary ~ x, p, mean), 
             sd = aggregate(Salary ~ x, p, sd))
}

#
# table
des <- as.data.frame(rbind(d(p$Gender),d(p$Sector),d(p$EduLv),d(p$Field),d(p$Category),d(p$Region),d(p$SubEduOver),d(p$SubMismatch),d(p$ObjOver),d(p$Core)))%>% 
  select(c(1,3,6,8,10))
## Warning in `[<-.factor`(`*tmp*`, ri, value = 1:5): invalid factor level, NA
## generated

## Warning in `[<-.factor`(`*tmp*`, ri, value = 1:5): invalid factor level, NA
## generated
kable(des)
n.x n.Freq prop.Freq mean.Salary sd.Salary
865 0.606 41376 16341
562 0.394 53817 32063
私立 568 0.398 40616 20534
國外學校 24 0.017 66250 41631
國立(公立) 835 0.585 49551 25552
技術學院 39 0.027 42051 23944
博士 9 0.006 76111 14530
碩士 449 0.315 58742 31556
普通大學 672 0.471 41287 17390
科技大學 258 0.181 37171 16631
遊憩與運動學群 29 0.020 42241 21654
大眾傳播學群 45 0.032 35889 12937
工程學群 224 0.157 59866 32648
文史哲學群 86 0.060 40988 16066
外語學群 78 0.055 43782 16597
生命科學學群 35 0.025 44714 16713
生物資源學群 22 0.015 42273 15791
地球與環境學群 23 0.016 45870 12400
法政學群 63 0.044 42698 14751
社會與心理學群 125 0.088 38320 13228
建築與設計學群 48 0.034 43542 41794
財經學群 102 0.071 43529 23679
教育學群 103 0.072 45631 12321
資訊學群 110 0.077 50227 26513
管理學群 141 0.099 41738 22063
數理化學群 60 0.042 51000 18794
醫藥衛生學群 99 0.069 49293 33258
藝術學群 34 0.024 36324 14735
受雇者於私營企業 857 0.601 45163 27905
受雇於公營機關 570 0.399 47947 18319
宜花東離島 51 0.036 45294 16983
北北基 466 0.327 46406 21342
桃竹苗 207 0.145 55966 29606
中彰投 213 0.149 43638 27079
雲嘉南 246 0.172 43496 23814
高屏澎 244 0.171 43115 23455
符合工作要求 982 0.688 49038 27107
高於工作要求 299 0.210 41020 16417
低於工作要求 146 0.102 38459 15398
1 177 0.124 40819 24298
2 251 0.176 42530 18881
3 360 0.252 43819 22243
4 444 0.311 49212 25182
5 195 0.137 53897 30670
adequate 438 0.307 44680 24990
over 927 0.650 46882 24265
under 62 0.043 48468 25697
無關聯 873 0.612 43975 22626
部分關聯 188 0.132 51516 27493
核心關聯 366 0.256 49071 26678
kable(table1(p,SubEduOver,splitby = ~ ObjOver, row_wise=T, output = 'text2'))
adequate over under
n = 438 n = 927 n = 62
SubEduOver
符合工作要求 340 (34.6%) 589 (60%) 53 (5.4%)
高於工作要求 62 (20.7%) 232 (77.6%) 5 (1.7%)
低於工作要求 36 (24.7%) 106 (72.6%) 4 (2.7%)
kable(table1(p,as.factor(SubMismatch),splitby = ~ Core, row_wise=T, output = 'text2'))
無關聯 部分關聯 核心關聯
n = 873 n = 188 n = 366
as.factor.SubMismatch.
1 161 (91%) 6 (3.4%) 10 (5.6%)
2 193 (76.9%) 22 (8.8%) 36 (14.3%)
3 232 (64.4%) 48 (13.3%) 80 (22.2%)
4 210 (47.3%) 84 (18.9%) 150 (33.8%)
5 77 (39.5%) 28 (14.4%) 90 (46.2%)
kable(table(p$SubEduOver,p$ObjOver))
adequate over under
符合工作要求 340 589 53
高於工作要求 62 232 5
低於工作要求 36 106 4
# relation
pc <- select(p, Salary,Age,Hours, J_year,J_total )
tableC(pc, cor_type="pearson")
## N = 1427
## Note: pearson correlation (p-value).
## 
## ────────────────────────────────────────────────────────────────────────────
##             [1]           [2]            [3]            [4]          
##  [1]Salary  1.00                                                     
##  [2]Age     0.364 (<.001) 1.00                                       
##  [3]Hours   0.112 (<.001) -0.041 (0.124) 1.00                        
##  [4]J_year  0.26 (<.001)  0.721 (<.001)  -0.048 (0.07)  1.00         
##  [5]J_total 0.277 (<.001) 0.886 (<.001)  -0.047 (0.078) 0.792 (<.001)
##  [5]  
##       
##       
##       
##       
##  1.00 
## ────────────────────────────────────────────────────────────────────────────
# normal way correlation
cor(pc, use="complete.obs", method="spearman")
##         Salary     Age   Hours  J_year J_total
## Salary   1.000  0.4390  0.1197  0.3338  0.3069
## Age      0.439  1.0000 -0.0283  0.6508  0.8293
## Hours    0.120 -0.0283  1.0000 -0.0227 -0.0339
## J_year   0.334  0.6508 -0.0227  1.0000  0.7259
## J_total  0.307  0.8293 -0.0339  0.7259  1.0000
#
# Chi-squared
# normal Pearson
chisq.test(table(p$SubEduOver, p$ObjOver))
## 
##  Pearson's Chi-squared test
## 
## data:  table(p$SubEduOver, p$ObjOver)
## X-squared = 40, df = 4, p-value = 2e-07
# Yate's correct
chisq.test(table(p$SubEduOver, p$ObjOver), correct = T)
## 
##  Pearson's Chi-squared test
## 
## data:  table(p$SubEduOver, p$ObjOver)
## X-squared = 40, df = 4, p-value = 2e-07
# fisher fisher.test(table(p$SubEduOver, p$ObjOver))

# Pearson's
library(vcd)
## Warning: package 'vcd' was built under R version 3.4.4
## Loading required package: grid
assocstats(table(p$SubEduOver, p$ObjOver))#詳細報表
##                     X^2 df   P(> X^2)
## Likelihood Ratio 39.295  4 6.0553e-08
## Pearson          37.024  4 1.7812e-07
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.159 
## Cramer's V        : 0.114
independence_table(table(p$SubEduOver, p$ObjOver))#無關時的期望值
##               
##                adequate  over under
##   符合工作要求    301.4 637.9 42.67
##   高於工作要求     91.8 194.2 12.99
##   低於工作要求     44.8  94.8  6.34
#
# plot
# age
ggplot(p , aes(x = Age))+
  geom_bar(position="dodge")+
  geom_vline(xintercept = mean(p$Age), color = "black",  linetype = 2) +
  scale_x_continuous(limits=c(20,65), breaks=seq(20,65, by = 5))+
  scale_y_continuous(limits=c(0,180), breaks=seq(0,180, by = 20))+
  labs(x = "年齡",y = "人數")+
  theme_bw()

# salary
ggplot(p, aes(x = as.factor(Salary))) + 
  geom_bar(position="dodge", width =1)+
  labs(x = "薪資",y = "人數") + 
  theme_bw() + 
  theme(axis.text.x = element_text(hjust = 1, angle =30))