The data

What does the data look like?

# List the column names of the raw job-postings table.
names(Data)
##  [1] "Title"                "Company"              "Company Registration"
##  [4] "Industry"             "Description"          "Experience"          
##  [7] "Benefits"             "Language"             "Working Hours"       
## [10] "Company Size"         "Dress Code"           "Company Description" 
## [13] "Why Join?"            "Address"              "Location"            
## [16] "Website"              "Phone"                "Date Posted"         
## [19] "Min Salary"           "Max Salary"           "Date"
# Print the first job description in full.
Data[["Description"]][1]
## [1] "BENEFITS: MEDICAL INSURANCE COVERAGE\n \nREQUIREMENTS:\n\nMIN 2 YEARS OF SALES EXPERIENCE AND INDUSTRY SUPPLY.\nDIPLOMA / DEGREE IN SALES OR MECHANICAL OR ANY FIELD OF ENGINEERING.\nGOOD COMMAND OF ORAL AND WRITTEN ENGLISH & MANDARIN.\nABLE TO WORK INDEPENDENTLY, AGGRESSIVE AND SELF-STARTER.\nGOOD INTERPERSONAL, P.R SKILL AND COMMUNICATION SKILL.\nPOSSESS OWN TRANSPORT AND ABLE TO TRAVELING.\n\n \nRESPONSIBILITIES:\n\nRESPONSIBLE TO PROMOTE AND SELL THE COMPANY’S RANGE OF PRODUCT AND SERVICES\nTO HANDLE / SERVICE EXISTING CUSTOMER AND DEVELOP/EXPAND NEW CUSTOMER BASE\nTO HANDLE ANY AD HOCK TASK AS AND WHEN ASSIGNED BY MANAGER\n\n\n \nSHIN-YO ENGINEERING SDN BHD (510968-K)\nNO. 86, JALAN MEGA MENDUNG, BANDAR PARK\nOFF JALAN KELANG LAMA, 58200 KUALA LUMPUR\nTEL: 03-7984 8401 EXT: 28 OR 26 OR 012-7980 263"

Creating dummies for education

# Flag the education levels mentioned in each job description.
# Each step_regex() call adds one indicator column (named by `result`) that
# marks whether `pattern` matches the Description text. A single posting can
# match several levels at once; the highest one is picked later on.
rec <- recipe(Data) %>%
  step_regex(Description, pattern = "(PRIMARY SCHOOL|HIGHER SECONDARY|SPM)", result = "ed.school") %>%
  step_regex(Description, pattern = "(STPM|DIPLOMA|CERTIFICATE)", result = "ed.preu") %>%
  step_regex(Description, pattern = "(BACHELOR|DEGREE)", result = "ed.bach") %>%
  step_regex(Description, pattern = "(MASTER|POSTGRADUATE DIPLOMA|PHD|DOCTORATE|DOCTOR|POST-DOC)", result = "ed.post")

# Estimate the recipe on the full data set, then apply it to the same data.
rec2 <- prep(rec, training = Data)
# The data is passed to bake() positionally: recipes renamed this argument
# from `newdata` to `new_data`, so naming it ties the script to one version.
with_dummies <- bake(rec2, Data)

Imputing years of schooling

# Impute years of schooling from the highest education level mentioned.
# Levels are written from lowest to highest so that a higher level overwrites
# a lower one; postings that mention no level keep NA.
schooling_years <- rep(NA_real_, nrow(with_dummies))
schooling_years[with_dummies$ed.school == 1] <- 12
schooling_years[with_dummies$ed.preu == 1] <- 14
schooling_years[with_dummies$ed.bach == 1] <- 17
schooling_years[with_dummies$ed.post == 1] <- 20

with_dummies$schooling <- schooling_years

Distribution of max salaries between 1000-30000

# Keep only the postings whose location mentions Malaysia.
is_malaysian <- grepl('MALAYSIA', with_dummies$Location)
jobsmy <- filter(with_dummies, is_malaysian)

# Histogram of maximum salaries strictly between 1000 and 30000.
max_salary <- jobsmy$`Max Salary`
in_range <- max_salary > 1000 & max_salary < 30000
ggplot(jobsmy[in_range, ], aes(`Max Salary`)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Salary data is clearly right-skewed,

so we take the logarithm of salaries to obtain a roughly normal distribution.

# Natural logarithm that yields NA wherever the input is missing, zero or
# negative. Plain log() would return NaN (with a warning) for negative values
# and -Inf for zero; a single -Inf would make the mean and standard deviation
# of the logged column non-finite.
#
# x: numeric vector.
# Returns a double vector of the same length as `x`.
log_positive <- function(x) {
  logged <- rep(NA_real_, length(x))
  positive <- !is.na(x) & x > 0
  logged[positive] <- log(x[positive])
  logged
}

# Log salaries; non-positive salary values become NA.
jobsmy$lmax <- log_positive(jobsmy$`Max Salary`)
jobsmy$lmin <- log_positive(jobsmy$`Min Salary`)
ggplot(jobsmy, aes(lmax)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 477 rows containing non-finite values (stat_bin).

Eliminate outliers

  • Log max salaries more than 3 standard deviations away from the mean (|zmax| > 3) are set to missing
  • Any experience above 10 years is assumed to be junk data and is set to missing

# Absolute z-score of the log maximum salary. scale() returns a one-column
# matrix, so it is flattened to a plain numeric vector before being stored as
# a column; otherwise the column is a matrix and row masks built from it are
# matrices too.
jobsmy$zmax <- abs(as.vector(scale(jobsmy$lmax)))

# Blank out log max salaries more than 3 standard deviations from the mean.
extreme_salary <- !is.na(jobsmy$zmax) & jobsmy$zmax > 3
jobsmy$lmax[extreme_salary] <- NA

# Experience above 10 years is treated as junk data and set to missing.
implausible_experience <- !is.na(jobsmy$Experience) & jobsmy$Experience > 10
jobsmy$Experience[implausible_experience] <- NA

# Summarise every column of the cleaned data.
skimr::skim(jobsmy) %>% skimr::kable(type = 'html')
## Skim summary statistics  
##  n obs: 24160    
##  n variables: 29    
## 
## Variable type: character
## 
## variable   missing   complete   n       min   max   empty   n_unique 
## ---------  --------  ---------  ------  ----  ----  ------  ---------
## zmax       0         24160      24160   3     19    0       340      
## 
## Variable type: Date
## 
## variable   missing   complete   n       min          max          median       n_unique 
## ---------  --------  ---------  ------  -----------  -----------  -----------  ---------
## Date       0         24160      24160   2017-11-20   2018-01-26   2018-01-15   35       
## 
## Variable type: factor
## 
## variable               missing   complete   n       n_unique   top_counts                                   ordered 
## ---------------------  --------  ---------  ------  ---------  -------------------------------------------  --------
## Address                5386      18774      24160   9454       NA: 5386, LEV: 198, MEN: 174, LEV: 107       FALSE   
## Benefits               5576      18584      24160   2844       NA: 5576, MED: 2642, MED: 1470, MED: 477     FALSE   
## Company                0         24160      24160   8282       COM: 886, COM: 408, REE: 354, HAY: 287       FALSE   
## Company Description    200       23960      24160   9708       REE: 350, AS : 229, NA: 200, AT : 138        FALSE   
## Company Registration   1751      22409      24160   7873       NA: 1751, 111: 354, 955: 287, 972: 229       FALSE   
## Company Size           1288      22872      24160   8          1 -: 7930, 51 : 5859, 201: 2758, MOR: 1850   FALSE   
## Date Posted            0         24160      24160   35         25-: 1761, 26-: 1470, 19-: 1324, 22-: 1284   FALSE   
## Description            0         24160      24160   23517      YOU: 14, YOU: 13, SAL: 11, ACQ: 9            FALSE   
## Dress Code             5161      18999      24160   477        BUS: 9177, NA: 5161, FOR: 3345, CAS: 2515    FALSE   
## Industry               1283      22877      24160   59         MAN: 2650, HUM: 2488, BAN: 1375, NA: 1283    FALSE   
## Language               4996      19164      24160   387        ENG: 14671, NA: 4996, CHI: 1237, MAL: 417    FALSE   
## Location               0         24160      24160   4639       MAL: 3630, MAL: 1481, MAL: 696, MAL: 672     FALSE   
## Phone                  9220      14940      24160   6192       NA: 9220, 03-: 229, +60: 173, 603: 133       FALSE   
## Title                  0         24160      24160   16695      SAL: 360, ACC: 254, MAR: 154, GRA: 115       FALSE   
## Website                10140     14020      24160   5127       NA: 10140, HTT: 173, HTT: 133, HTT: 123      FALSE   
## Why Join?              6474      17686      24160   6401       NA: 6474, WE : 229, WE : 201, WE : 143       FALSE   
## Working Hours          5092      19068      24160   1602       REG: 12769, NA: 5092, SAT: 992, 24/: 229     FALSE   
## 
## Variable type: numeric
## 
## variable     missing   complete   n       mean       sd        p0         p25    p50    p75    p100      hist     
## -----------  --------  ---------  ------  ---------  --------  ---------  -----  -----  -----  --------  ---------
## ed.bach      0         24160      24160   0.6        0.49      0          0      1      1      1         <U+2586><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2587> 
## ed.post      0         24160      24160   0.065      0.25      0          0      0      0      1         <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581> 
## ed.preu      0         24160      24160   0.58       0.49      0          0      1      1      1         <U+2586><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2587> 
## ed.school    0         24160      24160   0.15       0.35      0          0      0      0      1         <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2582> 
## Experience   3685      20475      24160   2.99       2.09      1          1      2      4      10        <U+2587><U+2583><U+2581><U+2582><U+2581><U+2581><U+2581><U+2581> 
## lmax         699       23461      24160   8.41       0.56      6.67       8.01   8.34   8.78   10.2      <U+2581><U+2581><U+2583><U+2587><U+2586><U+2583><U+2581><U+2581> 
## lmin         477       23683      24160   8.03       0.6       2.56       7.6    8.01   8.41   16.81     <U+2581><U+2581><U+2586><U+2587><U+2581><U+2581><U+2581><U+2581> 
## Max Salary   0         24160      24160   -1.8e+17   1.3e+18   -9.2e+18   3000   4200   6500   2.5e+07   <U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2587> 
## Min Salary   0         24160      24160   -1.8e+17   1.3e+18   -9.2e+18   2000   3000   4100   2e+07     <U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2587> 
## schooling    3197      20963      24160   16.2       1.93      12         14     17     17     20        <U+2581><U+2583><U+2581><U+2581><U+2587><U+2581><U+2581><U+2581>

# Scatter of log salary against imputed years of schooling.
# Rows with a missing log max salary are dropped explicitly; indexing with an
# NA condition would otherwise create all-NA rows.
plot_data <- jobsmy[!is.na(jobsmy$lmax) & jobsmy$lmax > 6, ]

# Colour is mapped inside aes() so that ggplot draws a legend; a colour set
# as a constant outside aes() is not tied to any scale, so scale labels would
# have nothing to apply to.
ggplot(plot_data) +
  geom_jitter(aes(schooling, lmax, color = 'Maximum')) +
  geom_jitter(aes(schooling, lmin, color = 'Minimum')) +
  xlab('Years of schooling') + ylab('Logarithm of salary') +
  scale_color_manual(name = NULL, values = c(Maximum = 'red', Minimum = 'blue')) +
  theme_light()

Can we do better?

## 
## Attaching package: 'ggdag'
## The following object is masked from 'package:stats':
## 
##     filter

Mincer wage regression

  • The Mincer wage regression is given by log wages ~ education + experience + experience^2
  • We estimate the returns to education using the cleaned education and experience data
  • Ability is not observed, so we only control for years of experience and its square:
library(estimatr)
## 
## Attaching package: 'estimatr'
## The following object is masked from 'package:broom':
## 
##     tidy
# Replace missing experience with the sample mean of the observed values.
mean_experience <- mean(jobsmy$Experience, na.rm = TRUE)
jobsmy$Experience <- replace(
  jobsmy$Experience,
  is.na(jobsmy$Experience),
  mean_experience
)

# Squared experience term of the Mincer specification.
jobsmy$exp2 <- jobsmy$Experience ^ 2

# Regress log max salary on schooling, experience and experience squared.
mincer <- lm_robust(lmax ~ schooling + Experience + exp2, data = jobsmy)
summary(mincer)
## 
## Call:
## lm_robust(formula = lmax ~ schooling + Experience + exp2, data = jobsmy)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##              Estimate Std. Error   Pr(>|t|)  CI Lower CI Upper    DF
## (Intercept)  6.575260   0.026375  0.000e+00  6.523564  6.62696 20513
## schooling    0.084358   0.001654  0.000e+00  0.081116  0.08760 20513
## Experience   0.163510   0.004459 3.322e-285  0.154770  0.17225 20513
## exp2        -0.002071   0.000485  1.961e-05 -0.003021 -0.00112 20513
## 
## Multiple R-squared:  0.4301 ,    Adjusted R-squared:   0.43 
## F-statistic:  5160 on 3 and 20513 DF,  p-value: < 2.2e-16
# Express the second coefficient (schooling) as a percentage, rounded to two
# decimals, and report it in a sentence.
schooling_return <- round(mincer$coefficients[2] * 100, 2)
paste(
  'We estimate a',
  schooling_return,
  '% increase in wages from an extra year of education. Does this make sense?'
)
## [1] "We estimate a 8.44 % increase in wages from an extra year of education. Does this make sense?"