Project Proposal

DATA Statistics and Probability for Data Analytics

CUNY MSDS DATA 606

Date: 2018/04/04
Author: Rose Koh

Introduction

Load packages

library(data.table)
library(psych)
library(ggplot2)

Data Preparation

# Load the PERM disclosure data.
# read.csv() keeps text columns as character vectors (stringsAsFactors = FALSE;
# spelled out because `F` is an ordinary variable that can be reassigned).
# The result is converted to a data.table for the processing steps that follow.
data <- as.data.table(
  read.csv(
    "./source_lib/perm/PERM_Disclosure_Data_FY17.csv",
    stringsAsFactors = FALSE
  )
)
# Print the number of rows and columns that were read.
dim(data)

Research question

Cases

Data collection

Type of study

Data Source

Response

Explanatory

Data manipulation

# Keep only the columns used in this analysis, then drop every row that has an
# NA in any of them.
# NOTE(review): na.omit() removes NA values only; empty strings in character
# columns are kept as they are.
data <- na.omit(
  subset(
    data,
    select = c(
      "CASE_NUMBER",
      "CASE_STATUS",
      "DECISION_DATE",
      "CASE_RECEIVED_DATE",
      "EMPLOYER_YR_ESTAB",
      "EMPLOYER_NUM_EMPLOYEES",
      "JOB_INFO_EXPERIENCE_NUM_MONTHS",
      "FW_INFO_REQ_EXPERIENCE",
      "PW_LEVEL_9089",
      "PW_AMOUNT_9089",
      "PW_UNIT_OF_PAY_9089",
      "FOREIGN_WORKER_INFO_EDUCATION"
    )
  )
)

# * processing.time (days)
# Parse both date columns. Without an explicit `format`, as.Date() only accepts
# "YYYY-MM-DD" or "YYYY/MM/DD" strings.
# NOTE(review): confirm the CSV stores the dates in one of those layouts.
data$DECISION_DATE <- as.Date(data$DECISION_DATE)
data$CASE_RECEIVED_DATE <- as.Date(data$CASE_RECEIVED_DATE)

# Days between receiving the case and deciding it, stored as a plain numeric.
# The difftime is converted directly with an explicit unit rather than going
# through a character representation.
data$processing.time <- as.numeric(
  data$DECISION_DATE - data$CASE_RECEIVED_DATE,
  units = "days"
)

# * length of business (years)
# EMPLOYER_YR_ESTAB: year the employer commenced business or incorporated. If
# the employer is a private household employing a household domestic worker,
# this question may be skipped.
# Years in business are counted up to 2018; rows whose year is not positive
# get 0.
data$establishment <- with(
  data,
  ifelse(EMPLOYER_YR_ESTAB > 0, 2018 - EMPLOYER_YR_ESTAB, 0)
)

# Build `pay`: the prevailing wage expressed as a yearly amount.
# First show how often each pay unit occurs (recorded output follows).
table(data$PW_UNIT_OF_PAY_9089)
## 
##        Hour Month  Week  Year 
##     1   515     8     4 54649
# Scale the amount by its pay unit:
#   Hour      -> amount * 40 * 52 (presumably 40 hours/week over 52 weeks)
#   Month     -> amount * 12
#   Week      -> amount * 52
#   Bi-Weekly -> amount * 26
# Any other unit (including "Year" and the blank unit) keeps the amount as is.
data$pay <- with(
  data,
  ifelse(
    PW_UNIT_OF_PAY_9089 == "Hour", PW_AMOUNT_9089 * 40 * 52,
    ifelse(
      PW_UNIT_OF_PAY_9089 == "Month", PW_AMOUNT_9089 * 12,
      ifelse(
        PW_UNIT_OF_PAY_9089 == "Week", PW_AMOUNT_9089 * 52,
        ifelse(
          PW_UNIT_OF_PAY_9089 == "Bi-Weekly", PW_AMOUNT_9089 * 26,
          PW_AMOUNT_9089
        )
      )
    )
  )
)

# Expose the employer head count under a shorter analysis name.
data$number.of.employees <- data[["EMPLOYER_NUM_EMPLOYEES"]]

# Work experience in months.
# When FW_INFO_REQ_EXPERIENCE is "Y" the job's required number of months is
# used; for every other value the experience is recorded as 0.
data$work.experience <- with(
  data,
  ifelse(FW_INFO_REQ_EXPERIENCE == "Y", JOB_INFO_EXPERIENCE_NUM_MONTHS, 0)
)

# Education: split the categories into master's degree or above ("y") versus
# below a master's degree ("n").
# Frequency of each education category (recorded output follows).
table(data$FOREIGN_WORKER_INFO_EDUCATION)
## 
##             Associate's  Bachelor's   Doctorate High School    Master's 
##           3         496       24152        1324        1368       22690 
##        None       Other 
##        3829        1315
# "Doctorate" and "Master's"                      -> "y"
# "Bachelor's", "Associate's" and "High School"   -> "n"
# Every other value ("None", "Other", blank)      -> NA
# NOTE(review): "None" stays NA rather than "n" - confirm this is intended.
data$edu <- with(
  data,
  ifelse(
    FOREIGN_WORKER_INFO_EDUCATION %in% c("Doctorate", "Master's"),
    "y",
    ifelse(
      FOREIGN_WORKER_INFO_EDUCATION %in%
        c("Bachelor's", "Associate's", "High School"),
      "n",
      NA
    )
  )
)

# Final analysis table: the case status plus the derived variables built above.
test.data <- subset(
  data,
  select = c(
    "CASE_STATUS",
    "processing.time",
    "establishment",
    "pay",
    "number.of.employees",
    "work.experience",
    "edu"
  )
)

Relevant summary statistics(Q1)

Relevant summary statistics(Q2)

Relevant summary statistics(Q3)

Relevant summary statistics(Q4)