# 資料整理與串接 ----
#
# Load the plotting, data-wrangling and string packages through pacman.
pacman::p_load(ggplot2, tidyverse, stringr)

# The three input CSVs sit in the same folder; keep the folder in one place.
data_dir <- "C:/Users/USER/Desktop/EDU MIS/project-research"

# dta   : main data set (education, occupation, salary, self-ratings, ...)
# dta_s : field-of-study (類名稱) x occupation code lookup with 關聯編碼
# dta_z : occupation code -> 工作級區 (job zone) lookup
dta <- read.csv(file.path(data_dir, "project-research_data_data.csv"), header = TRUE)
dta_s <- read.csv(file.path(data_dir, "學類+職業.csv"), header = TRUE)
dta_z <- read.csv(file.path(data_dir, "職業+工作級區.csv"), header = TRUE)

# Inspect the structure of all three tables (the recorded output that follows
# lists three data frames).
str(dta)
str(dta_s)
str(dta_z)
## 'data.frame': 1295 obs. of 23 variables:
## $ sub : int 1 2 3 5 6 7 8 9 10 11 ...
## $ 性別 : Factor w/ 2 levels "女","男": 1 2 2 2 2 1 1 1 1 1 ...
## $ 出生年 : int 1992 1990 1989 1987 1987 1992 1993 1988 1984 1993 ...
## $ 年齡 : int 27 29 30 32 32 27 26 31 35 26 ...
## $ 學歷 : Factor w/ 10 levels "二專","五專",..: 10 9 10 9 10 9 9 10 10 9 ...
## $ 公私立 : Factor w/ 3 levels "私立","國外學校",..: 3 1 3 1 3 3 3 3 3 3 ...
## $ 學群 : Factor w/ 18 levels "大眾傳播學群",..: 12 2 12 7 2 9 3 2 13 12 ...
## $ 學類 : Factor w/ 160 levels "13-經營與財務運作相關職類",..: 103 128 91 147 12 67 21 128 120 91 ...
## $ 大類 : Factor w/ 30 levels "11-主管職類",..: 11 29 11 5 5 3 11 28 23 11 ...
## $ 中類 : Factor w/ 124 levels "","/","11-1000 高階主管",..: 25 82 25 13 13 7 25 71 60 25 ...
## $ 小類 : Factor w/ 373 levels "","11-1011.00 執行長(Chief Executives)",..: 118 282 125 67 67 28 118 256 227 122 ...
## $ 地區 : Factor w/ 21 levels "宜蘭縣","花蓮縣",..: 20 8 17 18 8 18 4 12 8 15 ...
## $ 就業性質: Factor w/ 3 levels "自營者","受雇於公營機關",..: 2 2 3 3 3 3 2 3 2 2 ...
## $ 規模 : Factor w/ 9 levels "","10-29人","100-199人",..: 2 8 5 9 9 9 7 6 8 7 ...
## $ 每週工時: num 48 40 40 40 40 50 40 50 40 40 ...
## $ 現職年資: num 2 1 1 3 5 1 3 5 8 1 ...
## $ 總年資 : int 2 3 2 4 6 1 3 5 8 2 ...
## $ 薪資 : Factor w/ 20 levels "10-11萬以下",..: 12 13 10 13 16 13 15 17 10 13 ...
## $ 自評過量: Factor w/ 3 levels "低於工作要求",..: 3 2 3 3 2 3 3 3 2 3 ...
## $ 自評需求: Factor w/ 12 levels "","二專","不需要接受學校正式教育",..: 12 12 11 11 6 11 11 11 7 11 ...
## $ 自評學用: int 4 3 5 2 2 4 3 5 2 5 ...
## $ 工作滿意: int 6 6 6 4 5 5 3 5 4 4 ...
## $ salary : int 20000 35000 25000 35000 55000 35000 45000 65000 25000 35000 ...
## 'data.frame': 1583 obs. of 9 variables:
## $ 群編碼 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ 群名稱 : Factor w/ 18 levels "大眾傳播學群",..: 13 13 13 13 13 13 13 13 13 13 ...
## $ 類編碼 : int 49 49 49 49 49 49 49 49 49 49 ...
## $ 類名稱 : Factor w/ 123 levels "人類/民族學",..: 87 87 87 87 87 87 87 87 87 87 ...
## $ 職業名稱: Factor w/ 558 levels "","Spa會館經理",..: 416 44 387 160 383 363 339 338 437 382 ...
## $ 職業關聯: Factor w/ 3 levels "","核心職業",..: 3 3 2 2 2 2 2 2 2 2 ...
## $ 關聯編碼: int 2 2 3 3 3 3 3 3 3 3 ...
## $ X : logi NA NA NA NA NA NA ...
## $ 職業碼 : Factor w/ 559 levels "","11-1011.00",..: 115 259 103 104 106 107 108 109 110 111 ...
## 'data.frame': 1110 obs. of 3 variables:
## $ 職業大類: Factor w/ 98 levels "1+163:2027-2000 各類工程師",..: 2 2 2 2 3 3 3 3 3 4 ...
## $ 職業碼 : Factor w/ 1110 levels "11-1011.00","11-1011.03",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ 工作級區: Factor w/ 6 levels "#N/A","1","2",..: 6 6 5 5 5 5 5 5 5 4 ...
# Map each education level (學歷) to an education zone (學歷級區) labelled
# "1" to "5". Levels that do not occur in the data are simply ignored.
# 軍警學校 (military/police academy) has no zone: it is mapped to a real
# missing value, because the string "NA" would create an ordinary factor
# level that happens to be labelled "NA".
# The result is a factor whose labels are digits; use
# as.numeric(as.character(...)) to obtain the numbers, since as.numeric()
# alone returns the internal level codes.
dta$學歷級區 <- recode_factor(
  dta$學歷,
  "國中" = "1",
  "高職" = "2",
  "高中" = "2",
  "二專" = "3",
  "三專" = "3",
  "五專" = "3",
  "技術學院" = "4",
  "科技大學" = "4",
  "普通大學" = "4",
  "碩士" = "5",
  "博士" = "5",
  "軍警學校" = NA_character_
)
# Clean the job-zone column of the occupation lookup: keep zones "5" to "1"
# (levels stored in that order) and turn the spreadsheet error marker "#N/A"
# into a real missing value instead of a factor level labelled "NA".
# The labels are digits, but as.numeric() on a factor returns level codes;
# go through as.character() when the numeric zone is needed.
dta_z$工作級區 <- recode_factor(
  dta_z$工作級區,
  "5" = "5",
  "4" = "4",
  "3" = "3",
  "2" = "2",
  "1" = "1",
  "#N/A" = NA_character_
)
## Build the join keys with string operations
# 小類 begins with a 10-character occupation code (e.g. "11-1011.00 ...").
# Keep that code as 職業碼, then paste the field of study (學類) in front of
# it to form the composite key 學類職業 used to look up the relatedness code.
dta <- dta %>%
  mutate(
    小類 = as.character(小類),
    職業碼 = str_sub(小類, 1, 10),
    學類職業 = str_c(學類, 職業碼)
  )
# Lookup table: build the same composite key (field-of-study name + occupation
# code) on the reference table, then keep only the relatedness code and the
# key. Columns are selected by name so a change in the CSV column order
# cannot silently keep the wrong ones.
dta_s$學類職業 <- str_c(dta_s$類名稱, dta_s$職業碼)
dta_sl <- dta_s[, c("關聯編碼", "學類職業")]
# Look up the relatedness code (關聯編碼) for every row of dta through the
# composite key. The key is named explicitly so the join cannot pick up other
# same-named columns by accident.
# NOTE(review): if 學類職業 is not unique in dta_sl, left_join() duplicates the
# matching rows of dta; confirm the lookup has one row per key.
ndta <- left_join(dta, dta_sl, by = "學類職業")
# Rows whose field/occupation pair is absent from the lookup get code 1.
ndta$關聯編碼[is.na(ndta$關聯編碼)] <- 1
# Look up the job zone (工作級區) for every row through the occupation code.
# The key is converted to character so both sides of the join have the same
# type, and it is named explicitly instead of relying on the natural join.
dta_z$職業碼 <- as.character(dta_z$職業碼)
ndta <- left_join(ndta, dta_z, by = "職業碼")
#
# Drop the columns that are not used afterwards. They are removed by name
# (they sit at positions 14, 18 and 28 in the recorded run) so that a change
# in column order cannot silently remove the wrong ones.
ndta <- ndta[, !(names(ndta) %in% c("規模", "薪資", "職業大類")), drop = FALSE]

# Convert a zone column to the number written in its label.
# as.numeric() applied directly to a factor returns the internal level codes,
# which depend on level order (e.g. levels "5","4","3","2","1" would become
# 1..5 reversed), so the conversion goes through as.character(). The
# placeholder labels "NA" and "#N/A" are turned into real missing values.
zone_to_numeric <- function(zone) {
  labels <- as.character(zone)
  labels[labels %in% c("NA", "#N/A")] <- NA_character_
  as.numeric(labels)
}

ndta$學歷級區 <- zone_to_numeric(ndta$學歷級區)
ndta$工作級區 <- zone_to_numeric(ndta$工作級區)
# 繪圖 ----
## Plots
# fig.1: bar chart of education zone minus job zone (axis label 客評過量),
# with the row count printed above each bar.
ggplot(ndta, aes(x = 學歷級區 - 工作級區)) +
  geom_bar() +
  # vjust is a fixed setting, not a mapping from data, so it is passed
  # outside aes().
  geom_text(stat = "count", aes(label = ..count.., y = ..count..), vjust = -0.75) +
  scale_x_continuous(limits = c(-5, 5), breaks = seq(-5, 5, by = 1)) +
  labs(x = "客評過量") +
  theme_bw()
## Warning: Removed 129 rows containing non-finite values (stat_count).
## Warning: Removed 129 rows containing non-finite values (stat_count).

# fig.2: bar chart of the relatedness code 關聯編碼 (axis label 客評相符),
# with the row count printed above each bar.
ggplot(ndta, aes(x = 關聯編碼)) +
  geom_bar() +
  # vjust is a fixed setting, not a mapping from data, so it is passed
  # outside aes().
  geom_text(stat = "count", aes(label = ..count.., y = ..count..), vjust = -0.75) +
  scale_x_continuous(limits = c(0, 4), breaks = seq(1, 3, by = 1)) +
  labs(x = "客評相符") +
  theme_bw()

# Order the education levels from highest degree downwards so the facets
# below appear in that order. Any value not listed here becomes NA.
ndta <- ndta %>%
  mutate(
    學歷 = factor(
      學歷,
      levels = c(
        "博士", "碩士", "普通大學", "科技大學", "技術學院",
        "五專", "二專", "高職", "高中", "軍警學校"
      )
    )
  )
# fig.3: density histogram of education zone minus job zone, one panel per
# education level; the dashed gray line marks zero (zones equal).
ndta %>%
  ggplot(aes(x = 學歷級區 - 工作級區)) +
  geom_histogram(aes(y = ..density..), binwidth = 0.5) +
  geom_vline(xintercept = 0, color = "gray", linetype = 2) +
  facet_wrap(~學歷) +
  scale_x_continuous(limits = c(-5, 5), breaks = -4:4) +
  labs(x = "客評過量") +
  theme_bw()
## Warning: Removed 129 rows containing non-finite values (stat_bin).

# fig.4: density histogram of the relatedness code 關聯編碼, one panel per
# education level.
ndta %>%
  ggplot(aes(x = 關聯編碼)) +
  geom_histogram(aes(y = ..density..), binwidth = 0.5) +
  facet_wrap(~學歷) +
  scale_x_continuous(limits = c(0, 4), breaks = 1:3) +
  labs(x = "客評相符") +
  theme_bw()
