CRISP-DM Process ###My Shiny Web:https://jingwang123.shinyapps.io/Hw1_jing/ ###upload data set from computer and install packages #### Data Exploration and Descriptive Statistics CRISP-DM Process *clean and upload data set
getwd()
## [1] "/Users/jingwang/Desktop"
setwd("~/Desktop")
employee <-read.csv(file="employee_attrition.csv",header=T, stringsAsFactors = T)
cleanSurvey<- na.omit(employee)
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Loading required package: grid
library(grid)
cleanSurvey$Age <-discretize(cleanSurvey$Age, method = "frequency", breaks = 3, lablels= c("low","medium","high"), order=T)
cleanSurvey$DailyRate<-discretize(cleanSurvey$DailyRate, method = "frequency", breaks = 3, lablels= c("low","medium","high"), order=T)
cleanSurvey$DistanceFromHome <-as.factor(c("near","median","far"))
cleanSurvey$Education <-as.factor(c("low","median","high"))
cleanSurvey[["EmployeeNumber"]] <-NULL
cleanSurvey[["EmployeeCount"]] <-NULL
cleanSurvey[["Over18"]]<- NULL
cleanSurvey[["OverTime"]]<-NULL
cleanSurvey[["NumCompaniesWorked"]] <- NULL
cleanSurvey[["StandardHoures"]] <-NULL
cleanSurvey$EnvironmentSatisfaction[cleanSurvey$EnvironmentSatisfaction >= 2] ="satisfaction"
cleanSurvey$EnvironmentSatisfaction[cleanSurvey$EnvironmentSatisfaction <2] = "unsatisfaction"
cleanSurvey$HourlyRate [cleanSurvey$HourlyRate >= 60]="high"
cleanSurvey$HourlyRate[cleanSurvey$HourlyRate <69]="low"
cleanSurvey$JobInvolvement <-as.factor(c("low","median","high"))
cleanSurvey$JobLevel <-as.factor(c("low","median","high"))
cleanSurvey$JobSatisfaction[cleanSurvey$JobSatisfaction <=3]="unsatisfaction"
cleanSurvey$JobSatisfaction[cleanSurvey$JobSatisfaction >3]="satisfaction"
cleanSurvey$MonthlyIncome[cleanSurvey$MonthlyIncome< 4941]="lower income"
cleanSurvey$MonthlyIncome[cleanSurvey$MonthlyIncome> 4941 | cleanSurvey$MonthlyIncome <8363] ="median income"
cleanSurvey$MonthlyIncome[cleanSurvey$MonthlyIncome > 8363]= "high income"
cleanSurvey$MonthlyRate <- discretize(cleanSurvey$MonthlyRate,method="frequency", breaks=3, labels=c("short","mediun","high"), order=T)
cleanSurvey$StockOptionLevel<- as.factor(c("low","median","high"))
cleanSurvey$TotalWorkingYears [cleanSurvey$TotalWorkingYears <10]="short"
cleanSurvey$TotalWorkingYears[cleanSurvey$TotalWorkingYears >11 | cleanSurvey$TotalWorkingYears <17]="median"
cleanSurvey$TotalWorkingYears[cleanSurvey$TotalWorkingYears >17]= "long"
cleanSurvey$TrainingTimesLastYear <-discretize(cleanSurvey$TrainingTimesLastYear, method = "frequency", breaks = 3, lablels= c("low","medium","high"), order=T)
cleanSurvey$YearsAtCompany <-discretize(cleanSurvey$YearsAtCompany,method="frequency", breaks=3, labels=c("short","mediun","high"), order=T)
cleanSurvey$YearsInCurrentRole <-discretize(cleanSurvey$YearsInCurrentRole,method="frequency", breaks=3, labels=c("short","mediun","high"), order=T)
cleanSurvey$YearsWithCurrManager<- discretize(cleanSurvey$YearsWithCurrManager,method="frequency", breaks=3, labels=c("short","mediun","high"), order=T)
cleanSurvey$WorkLifeBalance[cleanSurvey$WorkLifeBalance == 1] = "Min"
cleanSurvey$WorkLifeBalance[cleanSurvey$WorkLifeBalance == 2] = "1st Quantile"
cleanSurvey$WorkLifeBalance[cleanSurvey$WorkLifeBalance == 3] = "Medium"
cleanSurvey$WorkLifeBalance[cleanSurvey$WorkLifeBalance == 4] = "3th Quantile"
cleanSurvey$WorkLifeBalance[cleanSurvey$WorkLifeBalance == 5] = "Max"
cleanSurvey$YearsSinceLastPromotion[cleanSurvey$YearsSinceLastPromotion <= 1] = "Min"
cleanSurvey$YearsSinceLastPromotion[cleanSurvey$YearsSinceLastPromotion > 2 | cleanSurvey$YearsSinceLastPromotion < 5] = "Medium"
cleanSurvey$YearsSinceLastPromotion[cleanSurvey$YearsSinceLastPromotion > 5 ] = "Medium"
cleanSurvey <- as(data.frame(lapply(cleanSurvey, as.character), stringsAsFactors=T), "transactions")
frequent_items <- eclat(cleanSurvey, parameter = list(support = 0.7, minlen = 2))
## Eclat
##
## parameter specification:
## tidLists support minlen maxlen target ext
## FALSE 0.7 2 10 frequent itemsets FALSE
##
## algorithmic control:
## sparse sort verbose
## 7 -2 TRUE
##
## Absolute minimum support count: 816
##
## create itemset ...
## set transactions ...[99 item(s), 1167 transaction(s)] done [0.00s].
## sorting and recoding items ... [9 item(s)] done [0.00s].
## creating bit matrix ... [9 row(s), 1167 column(s)] done [0.00s].
## writing ... [182 set(s)] done [0.00s].
## Creating S4 object ... done [0.00s].
inspect(head(frequent_items, 2))
## items support count
## [1] {BusinessTravel=Travel_Rarely,
## JobSatisfaction=satisfaction,
## MonthlyIncome=high income,
## StandardHours=80,
## TotalWorkingYears=long,
## YearsSinceLastPromotion=Medium} 0.7137961 833
## [2] {BusinessTravel=Travel_Rarely,
## JobSatisfaction=satisfaction,
## MonthlyIncome=high income,
## StandardHours=80,
## TotalWorkingYears=long} 0.7137961 833
itemFrequencyPlot(cleanSurvey, topN = 10, type = "absolute", main = "Item frequency")
rules <- apriori(cleanSurvey, parameter = list(support = 0.01, confidence = 0.5,minlen=3), appearance = list(default="lhs",rhs=c("Attrition=Yes")),control = list(verbose=F))
quality(head(rules, 3))
## support confidence lift count
## 1 0.01285347 0.7142857 4.555035 15
## 2 0.01285347 0.5769231 3.679067 15
## 3 0.01371037 0.5714286 3.644028 16
inspect(head(sort(rules, by = "lift", decreasing = T), 7))
## lhs rhs support confidence lift count
## [1] {Age=[18,32),
## MaritalStatus=Single,
## MonthlyRate=mediun,
## YearsInCurrentRole=short} => {Attrition=Yes} 0.01028278 1 6.377049 12
## [2] {Age=[18,32),
## MaritalStatus=Single,
## StockOptionLevel=high,
## YearsInCurrentRole=short} => {Attrition=Yes} 0.01028278 1 6.377049 12
## [3] {Age=[18,32),
## JobLevel=high,
## MaritalStatus=Single,
## YearsInCurrentRole=short} => {Attrition=Yes} 0.01028278 1 6.377049 12
## [4] {Age=[18,32),
## JobInvolvement=high,
## MaritalStatus=Single,
## YearsInCurrentRole=short} => {Attrition=Yes} 0.01028278 1 6.377049 12
## [5] {Age=[18,32),
## Education=high,
## MaritalStatus=Single,
## YearsInCurrentRole=short} => {Attrition=Yes} 0.01028278 1 6.377049 12
## [6] {Age=[18,32),
## DistanceFromHome=far,
## MaritalStatus=Single,
## YearsInCurrentRole=short} => {Attrition=Yes} 0.01028278 1 6.377049 12
## [7] {Age=[18,32),
## BusinessTravel=Travel_Frequently,
## PerformanceRating=3,
## YearsInCurrentRole=short,
## YearsWithCurrManager=short} => {Attrition=Yes} 0.01113967 1 6.377049 13
subset_rules1 <- which(colSums(is.subset(rules)) > 1)
rules <- sort(rules[-subset_rules1], by = "lift", descreasing = T)
inspect(head(rules, 5))
## lhs rhs support confidence lift count
## [1] {Age=[18,32),
## EducationField=Technical Degree,
## MaritalStatus=Single} => {Attrition=Yes} 0.01199657 0.7368421 4.698878 14
## [2] {BusinessTravel=Travel_Frequently,
## JobRole=Sales Representative} => {Attrition=Yes} 0.01285347 0.7142857 4.555035 15
## [3] {Age=[18,32),
## BusinessTravel=Travel_Frequently,
## YearsAtCompany=short} => {Attrition=Yes} 0.02056555 0.7058824 4.501446 24
## [4] {Age=[18,32),
## BusinessTravel=Travel_Frequently,
## YearsWithCurrManager=short} => {Attrition=Yes} 0.01628106 0.7037037 4.487553 19
## [5] {Age=[18,32),
## MaritalStatus=Single,
## YearsWithCurrManager=short} => {Attrition=Yes} 0.02570694 0.6818182 4.347988 30
plot(rules, measure = c("support", "lift"), shading = "confidence")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
library(shiny) run_example(“01_hello”) ui <- fluidPage( titlePanel(“Dataset exploration”),
sidebarLayout( sidebarPanel( selectInput(inputId = “cleanSurvey”, label = “cleanSurvey”, choices = ls(name = “cleanSurvey”), selected = “Attrition”) ),
mainPanel(
textOutput(outputId = "DatasetName")
)
) )
server <- function(input, output){} shinyApp(ui = ui, server = server)
ui
Report
###Analysis Report
####According to the data analysis part, several factors are significant related with “Attrition=Yes” and “Attrition = No”, I have found that: ####which influences “Attrition = Yes”: ####"Age=[18,32),
##EducationField=Technical Degree,
##MaritalStatus=Single}
##They like to change the job
####Which influences “Attrition = No” ####“StockOptionLevel=1st Quantile” ####“YearsSinceLastPromotion=Medium” ####They do not like change the job