data

#
# Attach the packages used throughout the script (pacman installs any that
# are missing before loading them).
pacman::p_load(tidyverse, ggplot2, knitr, furniture, gmodels)

# Print numbers with three significant digits from here on.
options(digits = 3)

# Read the survey data. `stringsAsFactors = TRUE` is spelled out because the
# relevel() calls below only accept factors, and read.csv() stopped converting
# strings to factors by default in R 4.0.0.
dta <- read.csv(
  "D:/EDU MIS/project-research/data0505.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)

# Set the reference level / level order of the categorical variables, coerce
# the numeric variables, and derive JobZone_D and Core.
dta <- dta %>%
  mutate(
    Gender = relevel(Gender, ref = "女"),
    Sector = relevel(Sector, ref = "私立"),
    Field = relevel(Field, ref = "遊憩與運動學群"),
    # Fix the level set first, then move the reference level to the front.
    EduLv = factor(
      EduLv,
      levels = c(
        "博士", "碩士", "普通大學", "科技大學",
        "技術學院", "五專", "三專",
        "二專", "高中", "高職", "國中"
      )
    ),
    EduLv = relevel(EduLv, ref = "技術學院"),
    Region = factor(
      Region,
      levels = c(
        "宜花東離島", "北北基", "桃竹苗",
        "中彰投", "雲嘉南", "高屏澎"
      )
    ),
    # NOTE(review): as.numeric() on a factor returns level codes, not the
    # printed values -- confirm these columns are read as numbers from the CSV.
    Age = as.numeric(Age),
    J_year = as.numeric(J_year),
    JobZone = as.numeric(JobZone),
    EduZone = as.numeric(EduZone),
    # Difference between the education zone and the job zone.
    JobZone_D = as.numeric(EduZone - JobZone),
    Salary = as.numeric(Salary),
    # Attach labels to the numeric JobCor codes.
    Core = recode_factor(
      as.factor(JobCor),
      "1" = "無關聯",
      "2" = "部分關聯",
      "3" = "核心關聯"
    ),
    # The explicit level order already puts the reference level first, so no
    # separate relevel() call is needed.
    SubEduOver = factor(
      SubEduOver,
      levels = c("符合工作要求", "高於工作要求", "低於工作要求")
    )
  )

# Data structure: one line per column with its type and first values.
dta %>% glimpse()
## Observations: 1,571
## Variables: 26
## $ SID         <fctr> A1, A10, A100, A102, A103, A104, A105, A106, A107...
## $ Gender      <fctr> 女, 女, 女, 女, 男, 女, 女, 女, 女, 女, 女, 男, 男, 男, 女, 男, 男...
## $ Sector      <fctr> 國立(公立), 國立(公立), 私立, 國立(公立), 國立(公立), 國立(公立), 國立(公立...
## $ EduLv       <fctr> 碩士, 碩士, 普通大學, 普通大學, 高職, 普通大學, 普通大學, 普通大學, 普通大學, 普...
## $ SubEduOver  <fctr> 符合工作要求, 高於工作要求, 符合工作要求, 符合工作要求, 符合工作要求, 符合工作要求, 符...
## $ Require     <fctr> 碩士, 高中/高職, 普通大學, 普通大學, 高中/高職, 普通大學, 普通大學, 普通大學, 普...
## $ Field       <fctr> 教育學群, 資訊學群, 外語學群, 教育學群, 工程學群, 文史哲學群, 文史哲學群, 大眾傳播學...
## $ City        <fctr> 臺南市, 高雄市, 苗栗縣, 新北市, 高雄市, 南投縣, 嘉義市, 臺北市, 臺北市, 南投縣,...
## $ Category    <fctr> 受雇於公營機關, 受雇於公營機關, 受雇於公營機關, 受雇於公營機關, 受雇者於私營企業, 受雇於...
## $ Staff       <fctr> 10-29人, 50-99人, 50-99人, 10-29人, 2-9人, 100-199人, 1...
## $ Hours       <int> 48, 40, 70, 50, 57, 51, 64, 50, 50, 47, 50, 60, 45...
## $ J_year      <dbl> 2, 8, 4, 1, 21, 1, 6, 0, 1, 1, 17, 7, 3, 23, 1, 2,...
## $ J_total     <dbl> 2, 8, 4, 1, 30, 1, 6, 0, 2, 2, 28, 7, 30, 26, 1, 1...
## $ income      <fctr> 2萬以下, 2-3萬以下, 3-4萬以下, 4-5萬以下, 3-4萬以下, 4-5萬以下, 2萬以...
## $ SubMismatch <int> 4, 2, 3, 4, 5, 4, 5, 4, 3, 3, 4, 4, 4, 4, 5, 5, 3,...
## $ JobSat      <int> 6, 4, 3, 5, 5, 6, 7, 5, 3, 6, 3, 5, 4, 7, 3, 4, 4,...
## $ EduZone     <dbl> 5, 5, 4, 4, 2, 4, 4, 4, 4, 4, 5, 5, 3, 5, 4, 4, 4,...
## $ Region      <fctr> 雲嘉南, 高屏澎, 桃竹苗, 北北基, 高屏澎, 中彰投, 雲嘉南, 北北基, 北北基, 中彰投,...
## $ Salary      <dbl> 20000, 25000, 35000, 45000, 35000, 45000, 20000, 3...
## $ Age         <dbl> 26, 34, 30, 25, 62, 25, 21, 24, 25, 26, 57, 35, 54...
## $ JobZone     <dbl> 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 3,...
## $ JobCor      <int> 3, 1, 2, 3, 1, 2, 1, 3, 1, 1, 1, 3, 1, 1, 3, 1, 3,...
## $ Core        <fctr> 核心關聯, 無關聯, 部分關聯, 核心關聯, 無關聯, 部分關聯, 無關聯, 核心關聯, 無關聯,...
## $ ObjOver     <fctr> over, over, adequate, adequate, under, adequate, ...
## $ X           <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ JobZone_D   <dbl> 1, 2, 0, 0, -1, 0, 0, 0, 0, 0, 1, 1, -1, 0, 0, 0, ...
# List the levels of the main categorical variables to check their coding.
dta[c(
  "Sector", "Field", "City", "Region",
  "EduLv", "SubEduOver", "ObjOver"
)] %>%
  lapply(levels)
## $Sector
## [1] "私立"         "國外學校"     "國立(公立)"
## 
## $Field
##  [1] "遊憩與運動學群" "大眾傳播學群"   "工程學群"       "文史哲學群"    
##  [5] "外語學群"       "生命科學學群"   "生物資源學群"   "地球與環境學群"
##  [9] "法政學群"       "社會與心理學群" "建築與設計學群" "財經學群"      
## [13] "教育學群"       "資訊學群"       "管理學群"       "數理化學群"    
## [17] "醫藥衛生學群"   "藝術學群"      
## 
## $City
##  [1] "宜蘭縣" "花蓮縣" "金門縣" "南投縣" "屏東縣" "苗栗縣" "桃園市"
##  [8] "高雄市" "基隆市" "雲林縣" "新北市" "新竹市" "新竹縣" "嘉義市"
## [15] "嘉義縣" "彰化縣" "臺中市" "臺北市" "臺東縣" "臺南市" "澎湖縣"
## 
## $Region
## [1] "宜花東離島" "北北基"     "桃竹苗"     "中彰投"     "雲嘉南"    
## [6] "高屏澎"    
## 
## $EduLv
##  [1] "技術學院" "博士"     "碩士"     "普通大學" "科技大學" "五專"    
##  [7] "三專"     "二專"     "高中"     "高職"     "國中"    
## 
## $SubEduOver
## [1] "符合工作要求" "高於工作要求" "低於工作要求"
## 
## $ObjOver
## [1] "adequate" "over"     "under"
# Column names, printed before choosing which variables to drop.
colnames(dta)
##  [1] "SID"         "Gender"      "Sector"      "EduLv"       "SubEduOver" 
##  [6] "Require"     "Field"       "City"        "Category"    "Staff"      
## [11] "Hours"       "J_year"      "J_total"     "income"      "SubMismatch"
## [16] "JobSat"      "EduZone"     "Region"      "Salary"      "Age"        
## [21] "JobZone"     "JobCor"      "Core"        "ObjOver"     "X"          
## [26] "JobZone_D"
# po: drop the unused columns, then keep respondents aged 20-65 whose
# Category is not "自營者".
po <- dta %>%
  dplyr::select(-City, -income, -JobSat, -X) %>%
  filter(Age >= 20, Age <= 65, Category != "自營者")

# p: analysis sample -- age 40 or younger with one of the five EduLv values
# listed below.
p <- po %>% filter(Age <= 40)
p <- p[p$EduLv %in% c("博士", "碩士", "普通大學", "科技大學", "技術學院"), ]

# Rebuild the factors so that only the retained levels remain; "技術學院" is
# listed first so that it is the reference level of EduLv.
p <- p %>%
  mutate(
    EduLv = factor(
      EduLv,
      levels = c("技術學院", "博士", "碩士", "普通大學", "科技大學")
    ),
    Category = factor(
      Category,
      levels = c("受雇者於私營企業", "受雇於公營機關")
    )
  )

# Count missing values per variable. Working column by column avoids
# coercing the whole data frame to a character matrix (as apply() does) and
# also works when `p` has zero rows. Returns a named integer vector.
vapply(p, function(column) sum(is.na(column)), integer(1))
##         SID      Gender      Sector       EduLv  SubEduOver     Require 
##           0           0           0           0           0           0 
##       Field    Category       Staff       Hours      J_year     J_total 
##           0           0           0           0           0           0 
## SubMismatch     EduZone      Region      Salary         Age     JobZone 
##           0           0           0           0           0           0 
##      JobCor        Core     ObjOver   JobZone_D 
##           0           0           0           0
summary(p$Salary)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20000   35000   45000   44736   55000  300000

table

table 1

#
#' Descriptive statistics of Salary by a grouping variable
#'
#' Builds one row per observed group with the group count, the group
#' proportion, and the mean and standard deviation of `Salary`.
#'
#' @param x Grouping vector with one value per row of `data`. It is converted
#'   to a factor and unused levels are dropped, so that `table()` and
#'   `aggregate()` always produce the same groups in the same order and every
#'   label column is a factor (which lets results for different variables be
#'   combined with `rbind()` without "invalid factor level" warnings).
#' @param data Data frame holding a `Salary` column. Defaults to the global
#'   `p`, which is what the function used implicitly before this argument
#'   existed.
#' @return A data frame with ten columns; positions 1, 3, 6, 8 and 10 are
#'   `n.x` (group label), `n.Freq` (count), `prop.Freq` (proportion),
#'   `mean.Salary` and `sd.Salary`.
d <- function(x, data = p) {
  stopifnot(
    is.data.frame(data),
    "Salary" %in% names(data),
    length(x) == nrow(data)
  )
  x <- droplevels(as.factor(x))
  counts <- table(x)
  data.frame(
    # t(t(.)) turns the 1-d table into a one-column 2-d table, which
    # data.frame() expands into label / Var2 / Freq columns.
    n = t(t(counts)),
    prop = t(t(prop.table(counts))),
    # `x` is found in this function's environment, `Salary` in `data`.
    mean = aggregate(Salary ~ x, data, mean),
    sd = aggregate(Salary ~ x, data, sd)
  )
}

#
# Table 1: count, proportion, mean and SD of Salary for every level of each
# grouping variable, stacked into one data frame.
# SubMismatch is an integer score; it is converted to a factor here so its
# group labels can be row-bound onto the factor label columns of the other
# variables without "invalid factor level, NA generated" warnings.
des <- rbind(
  d(p$Gender),
  d(p$Sector),
  d(p$EduLv),
  d(p$Field),
  d(p$Category),
  d(p$Region),
  d(p$SubEduOver),
  d(as.factor(p$SubMismatch)),
  d(p$ObjOver),
  d(p$Core)
) %>%
  # Keep the group label, n, proportion, mean salary and salary SD columns.
  dplyr::select(c(1, 3, 6, 8, 10))
kable(des)
n.x n.Freq prop.Freq mean.Salary sd.Salary
817 0.608 40306 15519
526 0.392 51616 28691
私立 534 0.398 39026 18068
國外學校 19 0.014 57632 41075
國立(公立) 790 0.588 48285 23474
技術學院 35 0.026 40857 24629
博士 5 0.004 71000 15166
碩士 415 0.309 56410 27934
普通大學 640 0.477 40234 16235
科技大學 248 0.185 36835 16548
遊憩與運動學群 27 0.020 39444 18415
大眾傳播學群 45 0.034 35889 12937
工程學群 213 0.159 57958 31424
文史哲學群 79 0.059 38861 14299
外語學群 69 0.051 40145 10501
生命科學學群 33 0.025 44091 16272
生物資源學群 22 0.016 42273 15791
地球與環境學群 21 0.016 44524 11170
法政學群 60 0.045 42417 14801
社會與心理學群 119 0.089 37437 11840
建築與設計學群 46 0.034 43696 42693
財經學群 97 0.072 42732 21759
教育學群 102 0.076 45833 12209
資訊學群 105 0.078 49000 25253
管理學群 129 0.096 38798 16462
數理化學群 58 0.043 51207 18995
醫藥衛生學群 87 0.065 45000 21861
藝術學群 31 0.023 33226 11147
受雇者於私營企業 822 0.612 44075 26847
受雇於公營機關 521 0.388 45777 12166
宜花東離島 49 0.036 44490 16840
北北基 441 0.328 44580 18868
桃竹苗 193 0.144 54378 28199
中彰投 206 0.153 43495 27264
雲嘉南 232 0.173 41185 16263
高屏澎 222 0.165 41577 22290
符合工作要求 911 0.678 47042 24476
高於工作要求 288 0.214 40694 16253
低於工作要求 144 0.107 38229 15196
1 169 0.126 39320 20463
2 238 0.177 41429 17779
3 341 0.254 42757 21788
4 413 0.308 46792 20122
5 182 0.136 53132 30886
adequate 413 0.308 42433 20377
over 872 0.649 45619 22868
under 58 0.043 47845 26376
無關聯 818 0.609 42586 21361
部分關聯 175 0.130 50171 27212
核心關聯 350 0.261 47043 21221

table 3

# Table 3: Pearson correlations (with p-values) among the numeric variables.
pc <- p %>%
  select(Salary, Age, Hours, J_year, J_total)
tableC(pc, cor_type = "pearson")
## N = 1343
## Note: pearson correlation (p-value).
## 
## ────────────────────────────────────────────────────────────────────────────
##             [1]           [2]            [3]            [4]          
##  [1]Salary  1.00                                                     
##  [2]Age     0.303 (<.001) 1.00                                       
##  [3]Hours   0.143 (<.001) -0.021 (0.438) 1.00                        
##  [4]J_year  0.178 (<.001) 0.617 (<.001)  -0.049 (0.075) 1.00         
##  [5]J_total 0.18 (<.001)  0.786 (<.001)  -0.044 (0.11)  0.747 (<.001)
##  [5]  
##       
##       
##       
##       
##  1.00 
## ────────────────────────────────────────────────────────────────────────────

table 6

# Table 6: SubEduOver split by ObjOver, with row-wise percentages.
p %>%
  table1(SubEduOver, splitby = ~ObjOver, row_wise = TRUE, output = "text2") %>%
  kable()
adequate over under
n = 413 n = 872 n = 58
SubEduOver
符合工作要求 318 (34.9%) 543 (59.6%) 50 (5.5%)
高於工作要求 60 (20.8%) 224 (77.8%) 4 (1.4%)
低於工作要求 35 (24.3%) 105 (72.9%) 4 (2.8%)

table 7

# Table 7: SubMismatch (treated as a factor) split by Core, with row-wise
# percentages.
p %>%
  table1(
    as.factor(SubMismatch),
    splitby = ~Core,
    row_wise = TRUE,
    output = "text2"
  ) %>%
  kable()
無關聯 部分關聯 核心關聯
n = 818 n = 175 n = 350
as.factor.SubMismatch.
1 154 (91.1%) 6 (3.6%) 9 (5.3%)
2 184 (77.3%) 19 (8%) 35 (14.7%)
3 219 (64.2%) 48 (14.1%) 74 (21.7%)
4 192 (46.5%) 76 (18.4%) 145 (35.1%)
5 69 (37.9%) 26 (14.3%) 87 (47.8%)

plot

# Salary: number of respondents at each distinct salary value.
ggplot(p) +
  geom_bar(aes(x = as.factor(Salary)), position = "dodge") +
  theme_bw() +
  # Tilt the x-axis labels so the salary values do not overlap.
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
  labs(x = "薪資", y = "人數")

# Age vs. salary, one panel per JobZone_D value.
# The aesthetics refer to columns by bare name so they are evaluated inside
# the layer data; `p$Age` / `p$Salary` bypass that data, which ggplot2
# discourages and which is fragile in combination with faceting.
ggplot(p, aes(x = Age, y = Salary)) +
  geom_point(alpha = .5, size = .8) +
  labs(x = "age", y = "salary") +
  facet_wrap(~JobZone_D)

# Age: number of respondents at each age, with a dashed line at the mean age.
# NOTE(review): with hard x limits, a bar centred exactly on a limit may be
# dropped because its edges fall outside the range -- confirm the rendered
# plot shows all ages.
ggplot(p, aes(x = Age)) +
  geom_bar(position = "dodge") +
  geom_vline(xintercept = mean(p$Age), linetype = 2, color = "black") +
  scale_y_continuous(breaks = seq(0, 180, by = 20), limits = c(0, 180)) +
  scale_x_continuous(breaks = seq(20, 65, by = 5), limits = c(20, 65)) +
  theme_bw() +
  labs(x = "年齡", y = "人數")