Description analysis

#
dta <- read.csv("data0505.csv", header = T)
options(digits = 3)
pacman::p_load(tidyverse, ggplot2, knitr, furniture, gmodels)
dta <- dta %>% mutate( Gender = relevel(Gender, ref = "女"),
                       Sector = relevel(Sector, ref = "私立"),
                       Field = relevel(Field, ref = "遊憩與運動學群"),
                       EduLv = factor(EduLv, levels=c("博士","碩士","普通大學","科技大學",
                                                      "技術學院","五專","三專",
                                                      "二專","高中","高職","國中")),
                       EduLv = relevel(EduLv, ref = "技術學院"),
                       Region = factor(Region, levels =c("宜花東離島","北北基","桃竹苗",
                                                         "中彰投","雲嘉南","高屏澎")),
                       Age = as.numeric(Age), 
                       J_year = as.numeric(J_year), 
                       JobZone = as.numeric(JobZone),
                       EduZone = as.numeric(EduZone),
                       JobZone_D = as.numeric(EduZone-JobZone),
                       Salary = as.numeric(Salary),
                       SubEduOver = relevel(SubEduOver, ref="符合工作要求"),
                       Core = recode_factor(as.factor(JobCor), "1" = "無關聯",
                                            "2" = "部分關聯",
                                            "3" = "核心關聯"),
                       SubEduOver = factor(SubEduOver,levels =c("符合工作要求","高於工作要求","低於工作要求")))

# data construction
glimpse(dta)

## Observations: 1,571
## Variables: 26
## $ SID         <fctr> A1, A10, A100, A102, A103, A104, A105, A106, A107...
## $ Gender      <fctr> 女, 女, 女, 女, 男, 女, 女, 女, 女, 女, 女, 男, 男, 男, 女, 男, 男...
## $ Sector      <fctr> 國立（公立）, 國立（公立）, 私立, 國立（公立）, 國立（公立）, 國立（公立）, 國立（公立...
## $ EduLv       <fctr> 碩士, 碩士, 普通大學, 普通大學, 高職, 普通大學, 普通大學, 普通大學, 普通大學, 普...
## $ SubEduOver  <fctr> 符合工作要求, 高於工作要求, 符合工作要求, 符合工作要求, 符合工作要求, 符合工作要求, 符...
## $ Require     <fctr> 碩士, 高中/高職, 普通大學, 普通大學, 高中/高職, 普通大學, 普通大學, 普通大學, 普...
## $ Field       <fctr> 教育學群, 資訊學群, 外語學群, 教育學群, 工程學群, 文史哲學群, 文史哲學群, 大眾傳播學...
## $ City        <fctr> 臺南市, 高雄市, 苗栗縣, 新北市, 高雄市, 南投縣, 嘉義市, 臺北市, 臺北市, 南投縣,...
## $ Category    <fctr> 受雇於公營機關, 受雇於公營機關, 受雇於公營機關, 受雇於公營機關, 受雇者於私營企業, 受雇於...
## $ Staff       <fctr> 10-29人, 50-99人, 50-99人, 10-29人, 2-9人, 100-199人, 1...
## $ Hours       <int> 48, 40, 70, 50, 57, 51, 64, 50, 50, 47, 50, 60, 45...
## $ J_year      <dbl> 2, 8, 4, 1, 21, 1, 6, 0, 1, 1, 17, 7, 3, 23, 1, 2,...
## $ J_total     <dbl> 2, 8, 4, 1, 30, 1, 6, 0, 2, 2, 28, 7, 30, 26, 1, 1...
## $ income      <fctr> 2萬以下, 2-3萬以下, 3-4萬以下, 4-5萬以下, 3-4萬以下, 4-5萬以下, 2萬以...
## $ SubMismatch <int> 4, 2, 3, 4, 5, 4, 5, 4, 3, 3, 4, 4, 4, 4, 5, 5, 3,...
## $ JobSat      <int> 6, 4, 3, 5, 5, 6, 7, 5, 3, 6, 3, 5, 4, 7, 3, 4, 4,...
## $ EduZone     <dbl> 5, 5, 4, 4, 2, 4, 4, 4, 4, 4, 5, 5, 3, 5, 4, 4, 4,...
## $ Region      <fctr> 雲嘉南, 高屏澎, 桃竹苗, 北北基, 高屏澎, 中彰投, 雲嘉南, 北北基, 北北基, 中彰投,...
## $ Salary      <dbl> 20000, 25000, 35000, 45000, 35000, 45000, 20000, 3...
## $ Age         <dbl> 26, 34, 30, 25, 62, 25, 21, 24, 25, 26, 57, 35, 54...
## $ JobZone     <dbl> 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 3,...
## $ JobCor      <int> 3, 1, 2, 3, 1, 2, 1, 3, 1, 1, 1, 3, 1, 1, 3, 1, 3,...
## $ Core        <fctr> 核心關聯, 無關聯, 部分關聯, 核心關聯, 無關聯, 部分關聯, 無關聯, 核心關聯, 無關聯,...
## $ ObjOver     <fctr> over, over, adequate, adequate, under, adequate, ...
## $ X           <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ JobZone_D   <dbl> 1, 2, 0, 0, -1, 0, 0, 0, 0, 0, 1, 1, -1, 0, 0, 0, ...

# check and pick out
lapply(dta[,c("Sector", "Field", "City", "Region","EduLv", "SubEduOver", "ObjOver")], levels)

## $Sector
## [1] "私立"         "國外學校"     "國立（公立）"
## 
## $Field
##  [1] "遊憩與運動學群" "大眾傳播學群"   "工程學群"       "文史哲學群"    
##  [5] "外語學群"       "生命科學學群"   "生物資源學群"   "地球與環境學群"
##  [9] "法政學群"       "社會與心理學群" "建築與設計學群" "財經學群"      
## [13] "教育學群"       "資訊學群"       "管理學群"       "數理化學群"    
## [17] "醫藥衛生學群"   "藝術學群"      
## 
## $City
##  [1] "宜蘭縣" "花蓮縣" "金門縣" "南投縣" "屏東縣" "苗栗縣" "桃園市"
##  [8] "高雄市" "基隆市" "雲林縣" "新北市" "新竹市" "新竹縣" "嘉義市"
## [15] "嘉義縣" "彰化縣" "臺中市" "臺北市" "臺東縣" "臺南市" "澎湖縣"
## 
## $Region
## [1] "宜花東離島" "北北基"     "桃竹苗"     "中彰投"     "雲嘉南"    
## [6] "高屏澎"    
## 
## $EduLv
##  [1] "技術學院" "博士"     "碩士"     "普通大學" "科技大學" "五專"    
##  [7] "三專"     "二專"     "高中"     "高職"     "國中"    
## 
## $SubEduOver
## [1] "符合工作要求" "高於工作要求" "低於工作要求"
## 
## $ObjOver
## [1] "adequate" "over"     "under"

# pick out
names(dta)

##  [1] "SID"         "Gender"      "Sector"      "EduLv"       "SubEduOver" 
##  [6] "Require"     "Field"       "City"        "Category"    "Staff"      
## [11] "Hours"       "J_year"      "J_total"     "income"      "SubMismatch"
## [16] "JobSat"      "EduZone"     "Region"      "Salary"      "Age"        
## [21] "JobZone"     "JobCor"      "Core"        "ObjOver"     "X"          
## [26] "JobZone_D"

p <- dplyr::select(dta, -City, -income, -JobSat, -X)%>%
  filter(Age <= 65 & Age >= 20 & Category != "自營者" )
p <- p[p$EduLv %in% c("博士","碩士", "普通大學", "科技大學", "技術學院"),]
p <- p %>% mutate(   EduLv = factor(EduLv, levels=c("博士","碩士","普通大學","科技大學",
                                                      "技術學院")),
                       EduLv = relevel(EduLv, ref = "技術學院"),
                       Category = factor(Category,levels=c("受雇者於私營企業","受雇於公營機關")))

# hunt for missing values by variables
apply(apply(p, 1, is.na), 1, sum)

##         SID      Gender      Sector       EduLv  SubEduOver     Require 
##           0           0           0           0           0           0 
##       Field    Category       Staff       Hours      J_year     J_total 
##           0           0           0           0           0           0 
## SubMismatch     EduZone      Region      Salary         Age     JobZone 
##           0           0           0           0           0           0 
##      JobCor        Core     ObjOver   JobZone_D 
##           0           0           0           0

#
# define function
d <- function(x){
  data.frame(n = t(t(table(x))),
             prop = t(t(prop.table(table(x)))),
             mean = aggregate(Salary ~ x, p, mean), 
             sd = aggregate(Salary ~ x, p, sd))
}

#
# table
des <- as.data.frame(rbind(d(p$Gender),d(p$Sector),d(p$EduLv),d(p$Field),d(p$Category),d(p$Region),d(p$SubEduOver),d(p$SubMismatch),d(p$ObjOver),d(p$Core)))%>% 
  select(c(1,3,6,8,10))

## Warning in `[<-.factor`(`*tmp*`, ri, value = 1:5): invalid factor level, NA
## generated

## Warning in `[<-.factor`(`*tmp*`, ri, value = 1:5): invalid factor level, NA
## generated

kable(des)

n.x	n.Freq	prop.Freq	mean.Salary	sd.Salary
女	865	0.606	41376	16341
男	562	0.394	53817	32063
私立	568	0.398	40616	20534
國外學校	24	0.017	66250	41631
國立（公立）	835	0.585	49551	25552
技術學院	39	0.027	42051	23944
博士	9	0.006	76111	14530
碩士	449	0.315	58742	31556
普通大學	672	0.471	41287	17390
科技大學	258	0.181	37171	16631
遊憩與運動學群	29	0.020	42241	21654
大眾傳播學群	45	0.032	35889	12937
工程學群	224	0.157	59866	32648
文史哲學群	86	0.060	40988	16066
外語學群	78	0.055	43782	16597
生命科學學群	35	0.025	44714	16713
生物資源學群	22	0.015	42273	15791
地球與環境學群	23	0.016	45870	12400
法政學群	63	0.044	42698	14751
社會與心理學群	125	0.088	38320	13228
建築與設計學群	48	0.034	43542	41794
財經學群	102	0.071	43529	23679
教育學群	103	0.072	45631	12321
資訊學群	110	0.077	50227	26513
管理學群	141	0.099	41738	22063
數理化學群	60	0.042	51000	18794
醫藥衛生學群	99	0.069	49293	33258
藝術學群	34	0.024	36324	14735
受雇者於私營企業	857	0.601	45163	27905
受雇於公營機關	570	0.399	47947	18319
宜花東離島	51	0.036	45294	16983
北北基	466	0.327	46406	21342
桃竹苗	207	0.145	55966	29606
中彰投	213	0.149	43638	27079
雲嘉南	246	0.172	43496	23814
高屏澎	244	0.171	43115	23455
符合工作要求	982	0.688	49038	27107
高於工作要求	299	0.210	41020	16417
低於工作要求	146	0.102	38459	15398
1	177	0.124	40819	24298
2	251	0.176	42530	18881
3	360	0.252	43819	22243
4	444	0.311	49212	25182
5	195	0.137	53897	30670
adequate	438	0.307	44680	24990
over	927	0.650	46882	24265
under	62	0.043	48468	25697
無關聯	873	0.612	43975	22626
部分關聯	188	0.132	51516	27493
核心關聯	366	0.256	49071	26678

kable(table1(p,SubEduOver,splitby = ~ ObjOver, row_wise=T, output = 'text2'))

	adequate	over	under
	n = 438	n = 927	n = 62
SubEduOver
符合工作要求	340 (34.6%)	589 (60%)	53 (5.4%)
高於工作要求	62 (20.7%)	232 (77.6%)	5 (1.7%)
低於工作要求	36 (24.7%)	106 (72.6%)	4 (2.7%)

kable(table1(p,as.factor(SubMismatch),splitby = ~ Core, row_wise=T, output = 'text2'))

	無關聯	部分關聯	核心關聯
	n = 873	n = 188	n = 366
as.factor.SubMismatch.
1	161 (91%)	6 (3.4%)	10 (5.6%)
2	193 (76.9%)	22 (8.8%)	36 (14.3%)
3	232 (64.4%)	48 (13.3%)	80 (22.2%)
4	210 (47.3%)	84 (18.9%)	150 (33.8%)
5	77 (39.5%)	28 (14.4%)	90 (46.2%)

kable(table(p$SubEduOver,p$ObjOver))

	adequate	over	under
符合工作要求	340	589	53
高於工作要求	62	232	5
低於工作要求	36	106	4

# relation
pc <- select(p, Salary,Age,Hours, J_year,J_total )
tableC(pc, cor_type="pearson")

## N = 1427
## Note: pearson correlation (p-value).

## 
## ────────────────────────────────────────────────────────────────────────────
##             [1]           [2]            [3]            [4]          
##  [1]Salary  1.00                                                     
##  [2]Age     0.364 (<.001) 1.00                                       
##  [3]Hours   0.112 (<.001) -0.041 (0.124) 1.00                        
##  [4]J_year  0.26 (<.001)  0.721 (<.001)  -0.048 (0.07)  1.00         
##  [5]J_total 0.277 (<.001) 0.886 (<.001)  -0.047 (0.078) 0.792 (<.001)
##  [5]  
##       
##       
##       
##       
##  1.00 
## ────────────────────────────────────────────────────────────────────────────

# normal way correlation
cor(pc, use="complete.obs", method="spearman")

##         Salary     Age   Hours  J_year J_total
## Salary   1.000  0.4390  0.1197  0.3338  0.3069
## Age      0.439  1.0000 -0.0283  0.6508  0.8293
## Hours    0.120 -0.0283  1.0000 -0.0227 -0.0339
## J_year   0.334  0.6508 -0.0227  1.0000  0.7259
## J_total  0.307  0.8293 -0.0339  0.7259  1.0000

#
# Chi-squared
# normal Pearson
chisq.test(table(p$SubEduOver, p$ObjOver))

## 
##  Pearson's Chi-squared test
## 
## data:  table(p$SubEduOver, p$ObjOver)
## X-squared = 40, df = 4, p-value = 2e-07

# Yate's correct
chisq.test(table(p$SubEduOver, p$ObjOver), correct = T)

## 
##  Pearson's Chi-squared test
## 
## data:  table(p$SubEduOver, p$ObjOver)
## X-squared = 40, df = 4, p-value = 2e-07

# fisher fisher.test(table(p$SubEduOver, p$ObjOver))

# Pearson's
library(vcd)

## Warning: package 'vcd' was built under R version 3.4.4

## Loading required package: grid

assocstats(table(p$SubEduOver, p$ObjOver))#詳細報表

##                     X^2 df   P(> X^2)
## Likelihood Ratio 39.295  4 6.0553e-08
## Pearson          37.024  4 1.7812e-07
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.159 
## Cramer's V        : 0.114

independence_table(table(p$SubEduOver, p$ObjOver))#無關時的期望值

##               
##                adequate  over under
##   符合工作要求    301.4 637.9 42.67
##   高於工作要求     91.8 194.2 12.99
##   低於工作要求     44.8  94.8  6.34

#
# plot
# age
ggplot(p , aes(x = Age))+
  geom_bar(position="dodge")+
  geom_vline(xintercept = mean(p$Age), color = "black",  linetype = 2) +
  scale_x_continuous(limits=c(20,65), breaks=seq(20,65, by = 5))+
  scale_y_continuous(limits=c(0,180), breaks=seq(0,180, by = 20))+
  labs(x = "年齡",y = "人數")+
  theme_bw()

# salary
ggplot(p, aes(x = as.factor(Salary))) + 
  geom_bar(position="dodge", width =1)+
  labs(x = "薪資",y = "人數") + 
  theme_bw() + 
  theme(axis.text.x = element_text(hjust = 1, angle =30))

Description analysis

Li

2018年5月7日