# 資料整理 ----

# Load the raw survey responses; the first row of the CSV holds the column
# names.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where the file exists at exactly this location.
dta <- read.csv(
  "C:/Users/USER/Desktop/EDU MIS/project-research/project-research_data_data.csv",
  header = TRUE # spelled out: `T` is an ordinary variable that can be reassigned
)

# reorder levels
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
# Replace the numeric 學群 codes (1-18) with their names. recode_factor()
# returns a factor whose levels follow the order in which the replacements
# are listed here.
dta[["學群"]] <- recode_factor(
  dta[["學群"]],
  `1` = "資訊學群",
  `2` = "工程學群",
  `3` = "數理化學群",
  `4` = "地球與環境學群",
  `5` = "生物資源學群",
  `6` = "生命科學學群",
  `7` = "醫藥衛生學群",
  `8` = "社會與心理學群",
  `9` = "教育學群",
  `10` = "外語學群",
  `11` = "文史哲學群",
  `12` = "藝術學群",
  `13` = "大眾傳播學群",
  `14` = "建築與設計學群",
  `15` = "遊憩與運動學群",
  `16` = "法政學群",
  `17` = "財經學群",
  `18` = "管理學群"
)

# Replace the numeric 薪資 codes (1-21) with salary-bracket labels. The
# resulting factor keeps the brackets in ascending order because levels follow
# the order of the replacements below.
dta[["薪資"]] <- recode_factor(
  dta[["薪資"]],
  `1` = "2萬以下",
  `2` = "2-3萬以下",
  `3` = "3-4萬以下",
  `4` = "4-5萬以下",
  `5` = "5-6萬以下",
  `6` = "6-7萬以下",
  `7` = "7-8萬以下",
  `8` = "8-9萬以下",
  `9` = "9-10萬以下",
  `10` = "10-11萬以下",
  `11` = "11-12萬以下",
  `12` = "12-13萬以下",
  `13` = "13-14萬以下",
  `14` = "14-15萬以下",
  `15` = "15-16萬以下",
  `16` = "16-17萬以下",
  `17` = "17-18萬以下",
  `18` = "18-19萬以下",
  `19` = "19-20萬以下",
  `20` = "20-30萬以下",
  `21` = "30萬以上"
)


# Derive 薪資m: one representative amount per salary bracket, stored as a
# factor of numeric strings whose levels follow the order listed below.
#
# dta$薪資 has already been relabelled earlier in this script, so its levels
# are the bracket labels ("2萬以下", ...) and no longer the raw codes
# "1".."21". Keying this recode on the raw codes matches nothing and silently
# leaves the bracket labels in 薪資m, so the mapping is keyed on the labels.
#
# NOTE(review): "20-30萬以下" maps to 205000, which continues the +10000
# sequence rather than sitting inside a 20-30萬 bracket (e.g. 250000);
# confirm the intended value. It is left unchanged here.
dta$薪資m <- recode_factor(dta$薪資,
                         "2萬以下" = "20000",
                         "2-3萬以下" = "25000",
                         "3-4萬以下" = "35000",
                         "4-5萬以下" = "45000",
                         "5-6萬以下" = "55000",
                         "6-7萬以下" = "65000",
                         "7-8萬以下" = "75000",
                         "8-9萬以下" = "85000",
                         "9-10萬以下" = "95000",
                         "10-11萬以下" = "105000",
                         "11-12萬以下" = "115000",
                         "12-13萬以下" = "125000",
                         "13-14萬以下" = "135000",
                         "14-15萬以下" = "145000",
                         "15-16萬以下" = "155000",
                         "16-17萬以下" = "165000",
                         "17-18萬以下" = "175000",
                         "18-19萬以下" = "185000",
                         "19-20萬以下" = "195000",
                         "20-30萬以下" = "205000",
                         "30萬以上" = "350000")

# Fix the display order of 公私立. Values not listed in `levels` become NA.
dta[["公私立"]] <- factor(
  dta[["公私立"]],
  levels = c("國立(公立)", "私立", "國外學校")
)

# Fix the display order of 學歷. Values not listed in `levels` become NA.
dta[["學歷"]] <- factor(
  dta[["學歷"]],
  levels = c(
    "博士", "碩士", "普通大學", "科技大學", "技術學院",
    "五專", "二專", "高職", "高中", "軍警學校"
  )
)

# Label the 自評過量 codes 1-3, then convert the factor straight back to
# numbers. as.numeric() on a factor returns level positions, so for values
# matching one of the three codes the result is 1 = "低於工作要求",
# 2 = "符合工作要求", 3 = "高於工作要求".
dta[["自評過量"]] <- as.numeric(
  recode_factor(
    dta[["自評過量"]],
    `1` = "低於工作要求",
    `2` = "符合工作要求",
    `3` = "高於工作要求"
  )
)

# Age relative to 2018, followed by removal of invalid samples.
dta[["age"]] <- 2018 - dta[["出生年"]]
summary(dta$age)
## Recorded output:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.00   26.00   30.00   31.28   35.00   70.00

# Sort ascending by age, then drop the rows at positions 1486 and 1487 of the
# sorted data.
# NOTE(review): the rows are identified purely by position; confirm these
# positions still point at the intended invalid samples if the input changes.
dta <- arrange(dta, age)[-c(1486, 1487), ]

# Plot ----

# Frequency distribution of age: bar chart of counts per age value, with the
# x axis fixed to 20-75 (ticks every 5) and the y axis to 0-140 (ticks every 20).
library(ggplot2)
ggplot(data = dta, mapping = aes(x = age)) +
  geom_bar(position = "dodge") +
  scale_x_continuous(
    limits = c(20, 75),
    breaks = seq(from = 20, to = 75, by = 5)
  ) +
  scale_y_continuous(
    limits = c(0, 140),
    breaks = seq(from = 0, to = 140, by = 20)
  ) +
  xlab("年齡") +
  theme_bw()

# Relative distribution of age: histogram scaled to density, bins 0.8 wide,
# x axis fixed to 20-75 with ticks every 5.
# `..density..` is ggplot2's dot-dot notation for the computed density
# statistic; it is kept as-is so the script still runs on older ggplot2.
ggplot(data = dta, mapping = aes(x = age)) +
  geom_histogram(mapping = aes(y = ..density..), binwidth = 0.8) +
  scale_x_continuous(
    limits = c(20, 75),
    breaks = seq(from = 20, to = 75, by = 5)
  ) +
  xlab("年齡") +
  theme_bw()
## Recorded output:
## Warning: Removed 1 rows containing missing values (geom_bar).

# 自評過量 by 公私立: density-scaled histogram (bins 0.5 wide) of the 1-3
# rating, one panel per 公私立 level.
ggplot(data = dta, mapping = aes(x = 自評過量)) +
  geom_histogram(mapping = aes(y = ..density..), binwidth = 0.5) +
  facet_wrap(~公私立) +
  scale_x_continuous(breaks = 1:3) +
  xlab("自評過量") +
  ylab("Density") +
  theme_bw()

# 自評學用 by 公私立: density-scaled histogram (bins 0.5 wide), one panel per
# 公私立 level.
ggplot(data = dta, mapping = aes(x = 自評學用)) +
  geom_histogram(mapping = aes(y = ..density..), binwidth = 0.5) +
  facet_wrap(~公私立) +
  xlab("自評學用") +
  ylab("Density") +
  theme_bw()

# 自評過量 by 學歷: density-scaled histogram (bins 0.5 wide) of the 1-3
# rating, one panel per 學歷 level laid out in two rows, with the y axis fixed
# to 0-2 and ticks every 0.3.
ggplot(data = dta, mapping = aes(x = 自評過量)) +
  geom_histogram(mapping = aes(y = ..density..), binwidth = 0.5) +
  facet_wrap(~學歷, nrow = 2) +
  scale_x_continuous(breaks = 1:3) +
  scale_y_continuous(
    limits = c(0, 2),
    breaks = seq(from = 0, to = 2, by = 0.3)
  ) +
  xlab("自評過量") +
  ylab("Density") +
  theme_bw()

# 自評學用 by 學歷: density-scaled histogram (bins 0.5 wide), one panel per
# 學歷 level.
ggplot(data = dta, mapping = aes(x = 自評學用)) +
  geom_histogram(mapping = aes(y = ..density..), binwidth = 0.5) +
  facet_wrap(~學歷) +
  xlab("自評學用") +
  ylab("Density") +
  theme_bw()