# package
pacman::p_load(tidyverse, ggplot2, GGally, lme4, scales)
# read data
dta <- read.csv("data0402.csv", header = T)

Data

# set the ref & differ from zones
str(dta)
## 'data.frame':    1837 obs. of  31 variables:
##  $ SID          : Factor w/ 1769 levels "A1","A10","A100",..: 1 110 218 328 438 438 512 523 534 544 ...
##  $ Born         : int  1992 1990 1989 1988 1987 1987 1987 1992 1993 1988 ...
##  $ Gender       : Factor w/ 2 levels "女","男": 1 2 2 1 2 2 2 1 1 1 ...
##  $ Sector       : Factor w/ 3 levels "私立","國外學校",..: 3 1 3 3 1 1 3 3 3 3 ...
##  $ EduLv        : Factor w/ 11 levels "二專","三專",..: 11 10 11 11 10 10 11 10 10 11 ...
##  $ Field1       : Factor w/ 18 levels "大眾傳播學群",..: 12 2 12 12 7 7 2 9 3 2 ...
##  $ Field2       : Factor w/ 178 levels "","13-經營與財務運作相關職類",..: 109 140 95 95 162 162 9 71 19 140 ...
##  $ Job1         : Factor w/ 24 levels "","11-主管職類",..: 9 23 9 9 5 5 5 3 9 22 ...
##  $ Job2         : Factor w/ 130 levels "","/","11-1000 高階主管",..: 23 80 23 23 12 12 12 7 23 71 ...
##  $ Job3         : Factor w/ 464 levels "","11-1011.00 執行長(Chief Executives)",..: 136 336 143 141 74 74 74 33 136 305 ...
##  $ City         : Factor w/ 21 levels "宜蘭縣","花蓮縣",..: 20 8 17 7 18 18 8 18 4 12 ...
##  $ Category     : Factor w/ 4 levels "","自營者","受雇於公營機關",..: 3 3 4 3 4 4 4 4 3 4 ...
##  $ Staff        : Factor w/ 9 levels "","10-29人","100-199人",..: 2 8 5 8 9 9 9 9 7 6 ...
##  $ Hours        : num  48 40 40 50 40 40 40 50 40 50 ...
##  $ J_year       : Factor w/ 50 levels "1","1.5","10",..: 16 1 1 43 28 28 41 1 28 41 ...
##  $ J_total      : num  2 3 2 5 4 4 6 1 3 5 ...
##  $ income       : Factor w/ 21 levels "10-11萬以下",..: 13 14 11 16 14 14 17 14 16 18 ...
##  $ SubEduOver   : Factor w/ 3 levels "低於工作要求",..: 3 2 3 2 3 3 2 3 3 3 ...
##  $ Require      : Factor w/ 11 levels "二專","不需要接受學校正式教育",..: 11 11 10 11 10 10 5 10 10 10 ...
##  $ SubMismatch  : int  4 3 5 4 2 2 2 4 3 5 ...
##  $ JobSat       : int  6 6 6 6 4 4 5 5 3 5 ...
##  $ Email        : Factor w/ 1313 levels "","0901pin@gmail.com",..: 945 1053 829 1 1235 1235 1 1 1 1 ...
##  $ EduZone      : int  5 4 5 5 4 4 5 4 4 5 ...
##  $ Region       : Factor w/ 6 levels "中彰投","北北基",..: 6 5 1 4 2 2 5 2 1 4 ...
##  $ Salary       : num  20000 35000 25000 45000 35000 35000 55000 35000 45000 65000 ...
##  $ Age          : int  26 28 29 30 31 31 31 26 25 30 ...
##  $ JobCode      : Factor w/ 463 levels "","11-1011.00",..: 136 335 143 141 74 74 74 33 136 304 ...
##  $ Field_JobCode: Factor w/ 1168 levels "13-經營與財務運作相關職類13-2011.01",..: 663 926 585 583 1070 1070 17 437 72 919 ...
##  $ JobZone      : int  4 2 4 4 4 4 4 4 4 2 ...
##  $ 職業關聯     : Factor w/ 2 levels "核心職業","關連職業": 1 NA NA 1 2 2 1 NA NA NA ...
##  $ JobAsso      : int  3 1 3 3 2 2 3 1 1 1 ...
dta <- dta %>% mutate( Sector = relevel(Sector, ref = "私立"),
                       Region = relevel(Region, ref = "宜花東離島"),
                       EduLv = factor(EduLv, levels=c("國中","高職","高中", "二專",
                                                      "三專","五專","技術學院",
                                                      "科技大學","普通大學","碩士",
                                                      "博士","軍警學校" )),
                       J_year = as.numeric(J_year), 
                       JobZone = as.numeric(JobZone),
                       EduZone = as.numeric(EduZone),
                       ZoneD = EduZone - JobZone) %>%  
               filter( Age < 75 & Age > 20)



# hunt for missing values by variables
apply(apply(dta, 1, is.na), 1, sum)
##           SID          Born        Gender        Sector         EduLv 
##             0             0             0             0             0 
##        Field1        Field2          Job1          Job2          Job3 
##             0             0             0             0             0 
##          City      Category         Staff         Hours        J_year 
##             1             0             0             0             0 
##       J_total        income    SubEduOver       Require   SubMismatch 
##             0             0             0             0             0 
##        JobSat         Email       EduZone        Region        Salary 
##             0             0             9             1             0 
##           Age       JobCode Field_JobCode       JobZone      職業關聯 
##             0             0             0           169          1307 
##       JobAsso         ZoneD 
##             0           177
# pick out
names(dta)
##  [1] "SID"           "Born"          "Gender"        "Sector"       
##  [5] "EduLv"         "Field1"        "Field2"        "Job1"         
##  [9] "Job2"          "Job3"          "City"          "Category"     
## [13] "Staff"         "Hours"         "J_year"        "J_total"      
## [17] "income"        "SubEduOver"    "Require"       "SubMismatch"  
## [21] "JobSat"        "Email"         "EduZone"       "Region"       
## [25] "Salary"        "Age"           "JobCode"       "Field_JobCode"
## [29] "JobZone"       "職業關聯"      "JobAsso"       "ZoneD"
p <- select(dta, -2, -c(7:10), -Email, -c(27:28), -30 )


# subject with missing values
# dta[which(is.na(dta$JobZone)), ]

Description analysis

age

ggplot(p , aes(x = Age))+
  geom_bar(position="dodge")+
  scale_x_continuous(limits=c(20,75), breaks=seq(20,75, by = 5))+
  scale_y_continuous(limits=c(0,180), breaks=seq(0,180, by = 20))+
  labs(x = "年齡")+
  theme_bw()

salary

ggplot(p, aes(x = as.factor(Salary))) + 
  geom_bar(position="dodge")+
  labs(x = "Salary") + 
  theme_bw() + 
  theme(axis.text.x = element_text(hjust = 1, angle =30))

Region

ggplot(p, aes(x = Region)) + 
  geom_bar(position="dodge")+
  geom_text(stat = "count", aes(label = ..count.., y = ..count.. , vjust = -.5))+
  geom_text(stat = "count", aes(y = ..count.., label = sprintf("%.0f%%", round(..count../sum(..count..)*100))),
            position = position_dodge(width = 0.9), color = "white", vjust = 1)+
  labs(x = "地區") + 
  theme_bw() + 
  theme(axis.text.x = element_text(hjust = 1))

Edu Level

ggplot(p, aes(x = EduLv)) + 
  geom_bar(position="dodge")+
  geom_text(stat = "count", aes(label = ..count.., y = ..count.. , hjust = -.3))+
  labs(x = "學歷") + 
  coord_flip() +
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1))

Edu Zone

ggplot(p, aes(x = EduZone)) + 
  geom_bar(position="dodge")+
  geom_text(stat = "count", aes(label = ..count.., y = ..count.. , hjust =.5, vjust = -0.5))+
  scale_x_continuous(limits=c(0,6), breaks=seq(1,5, by = 1))+
  labs(x = "學歷級區") + 
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1))
## Warning: Removed 9 rows containing non-finite values (stat_count).

## Warning: Removed 9 rows containing non-finite values (stat_count).

Study Field

ggplot(p, aes(x = Field1)) + 
  geom_bar(position="dodge")+
  geom_text(stat = "count", aes(label = ..count.., y = ..count.. , hjust = -.3))+
  geom_text(stat = "count", aes(y = ..count.., label = sprintf("%.0f%%", round(..count../sum(..count..)*100))),
            position = position_dodge(width = .9), color = "white", hjust = 1)+
  coord_flip() +
  labs(x = "學群") + 
  theme_bw() +
  theme(axis.text.x = element_text(hjust = 1))

Difference between Edu&Job zone

# 軍警學校為NA因此不列入繪圖
# 
ggplot(data= filter(p, ZoneD!="軍警學校"), aes(x = ZoneD))+
  geom_histogram( binwidth = .5)+
  geom_text(stat = "count", aes(label = ..count.., y = ..count.. , vjust = -.75))+
  scale_x_continuous(limits=c(-5,5), breaks=seq(-4,4, by = 1))+
  labs(x = "客評過量")+
  theme_bw()

# 
ggplot(data= filter(p, ZoneD!="軍警學校"), aes(x =ZoneD))+
  geom_histogram(aes(y =..density..), binwidth = .5)+
  geom_text(aes( label = scales::percent(..prop..), y= ..prop.. ), stat= "count", size = 2.5, vjust = -2) +
  scale_x_continuous(limits=c(-5,5), breaks=seq(-4,4, by = 1))+
  geom_vline(xintercept = 0, color = "gray",  linetype = 2) +
  facet_wrap(~EduLv, nrow = 5)+
  labs(x = "客評過量")+
  theme_bw()

Corelation between Edu&Job

#軍警學校為NA因此不列入繪圖
#
ggplot(data= filter(p, ZoneD!="軍警學校"), aes(x = JobAsso))+
  geom_histogram( binwidth = .5)+
  geom_text(stat = "count", aes(label = ..count.., y = ..count.. , vjust = -.75))+
  scale_x_continuous(breaks=seq(1,3, by = 1))+
  labs(x = "客評關聯")+
  theme_bw()

#
ggplot(data= filter(p, ZoneD!="軍警學校"), aes(x = JobAsso))+
  geom_histogram(aes(y =..density..), binwidth = .5)+
  geom_text(aes( label = scales::percent(..prop..), y= ..prop.. ), stat= "count", size = 3, vjust = -2) +
  scale_x_continuous(breaks=seq(1,3, by = 1))+
  labs(x = "客評關聯")+
  facet_wrap(~EduLv, nrow = 2)+
  theme_bw()