Predict the assignment group of an incident from its description in the ITSM database.

R Markdown is used: the analysis is saved as an .Rmd file and then published with knitr.

Motivation

Sometimes incidents are assigned to the wrong group, and a lot of time is wasted reassigning them to the right one. The idea is to assign each incident to the correct group automatically, using predictive modeling and text mining. This has a huge potential to reduce cost and improve customer satisfaction.

library(RODBC)
dbhandle <- odbcDriverConnect('driver={SQL Server};server=C44S03\\INST003;database=ITSM;trusted_connection=true;uid= , pwd=')
# change the year as needed
res_inc <- sqlQuery(dbhandle, "SELECT [Assignment_Group],TITLE FROM [ITSM].[dbo].[SN_Incident]
                               where ([Assignment_Group] like 'F161%' or [Assignment_Group] like 'F162%'
                                   or [Assignment_Group] like 'F163%' or [Assignment_Group] like 'F164%')
                               and Opened_time_year=2018")

odbcCloseAll()

Data collection and library imports

library(slam)
library(NLP)
library(tm)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(RColorBrewer)
library(wordcloud)
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.1
setwd("C:\\Users\\suman\\Desktop\\datasciencework\\PROJECTS")
res_inc=read.csv("res_inc.csv")

str(res_inc)
## 'data.frame':    3873 obs. of  3 variables:
##  $ X               : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Assignment_Group: Factor w/ 4 levels "F161 Sys. Mgm. EDW",..: 1 1 1 2 2 3 3 1 1 1 ...
##  $ TITLE           : Factor w/ 3790 levels "Change in ZW_FACT_KOR_2XXX_H  view",..: 594 1076 1154 1829 1628 2905 2256 1461 609 1001 ...
table(res_inc$Assignment_Group)
## 
##                F161 Sys. Mgm. EDW  F162 Sys. Mgm. Quality Assurance 
##                              1467                               706 
## F163 Sys. Mgm. EDW Infrastructure   F164 Sys. Mgm. EDW Architecture 
##                              1634                                66
res_inc=res_inc[,-1]

We have four classes with 1467, 706, 1634, and 66 incidents respectively, 3873 in total.

Data wrangling and word clouds

Replace "." with " " in the incident titles and clean the corpus.

Generate a word cloud for each assignment group (a sketch of this chunk follows the corpus inspection below).

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3
## 
## [1] JOB EWCFLK08 (EWPCPYLK08 0010 01191800 2624) ENDED IN ERROR 2624 tiludv
## [2] JOB EWCSHG08 (EWPSAKHG08 0010 01201800 1000) ENDED IN ERROR 1000 tiludv
## [3] JOB EWCSKS01 (EWPSAKKS01 0010 11131800 1625) ENDED IN ERROR 1625 tiludv
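The chunk that cleaned the corpus, drew the word clouds, and created the sms_raw_train/sms_raw_test and sms_corpus_train/sms_corpus_test objects used below is not echoed in this report. A minimal sketch of what it likely looked like, assuming the same cleaning steps and mystopword list as the SVM section later on; the 75/25 split, the seed, and the word cloud settings are assumptions:

# assumed reconstruction, not the original chunk
res_inc <- as.data.frame(lapply(res_inc, function(x) gsub(".", " ", x, fixed = TRUE)))
corp <- Corpus(VectorSource(res_inc$TITLE))
corpus_clean <- tm_map(corp, tolower)
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
mystopword <- c("a","an","the","job","error","automatic","batch","tiludv","auto","ended","jcl","jcli")
corpus_clean <- tm_map(corpus_clean, removeWords, mystopword)
# one word cloud per assignment group (min.freq is an assumed value)
for (grp in c("F161", "F162", "F163", "F164")) {
  wordcloud(corpus_clean[grepl(grp, res_inc$Assignment_Group)],
            min.freq = 20, random.order = FALSE, colors = brewer.pal(8, "Dark2"))
}
# 75/25 train/test split (proportion and seed are assumptions)
set.seed(123)
train_ind <- sample(seq_len(nrow(res_inc)), size = floor(0.75 * nrow(res_inc)))
sms_raw_train <- res_inc[train_ind, ]
sms_raw_test  <- res_inc[-train_ind, ]
sms_corpus_train <- corpus_clean[train_ind]
sms_corpus_test  <- corpus_clean[-train_ind]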

sms_train <- DocumentTermMatrix(sms_corpus_train)
sms_test <- DocumentTermMatrix(sms_corpus_test)
sms_train.m <- as.matrix(sms_train)
sms_test.m <- as.matrix(sms_test)
sms_train.df <- as.data.frame(sms_train.m)
sms_test.df <- as.data.frame(sms_test.m)

# Naive Bayes works with categorical features, but the DTM contains counts,
# so we convert each count to "Yes"/"No" (term present / absent)

convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  return(x)
}
sms_train.df <- apply(sms_train.df, MARGIN = 2, convert_counts)
sms_test.df <- apply(sms_test.df, MARGIN = 2, convert_counts)

Build the Naive Bayes model

sms_classifier <- naiveBayes(sms_train.df, as.factor(sms_raw_train$Assignment_Group),laplace=1)
sms_test_pred_e <- predict(sms_classifier, sms_test.df)

Performance of the model

a <- table(True = sms_raw_test$Assignment_Group, Pred = sms_test_pred_e)

# accuracy = correctly classified (diagonal cells) / all cells of the confusion matrix
I <- dim(a)[1]
J <- dim(a)[2]
true <- 0
false <- 0
for (i in 1:I){
  for (j in 1:J){
    if (i == j){true <- true + a[i, j]}}}
for (i in 1:I){
  for (j in 1:J){
    if (i != j){
      false <- false + a[i, j]}}}
accuracy <- true / (false + true)
print(accuracy)
## [1] 0.9329205
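The nested loops above just sum the diagonal (correct) and off-diagonal (incorrect) cells of the confusion matrix; the same accuracy can be computed in one line:

sum(diag(a)) / sum(a)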

Predict a single case

input<-"JOB EWCSCR41 (EWPSAKRW14.0010.01041800.JCL) ENDED IN ERROR JCL tiludv"
input<-gsub(".", " ", input,fixed = TRUE)
corp1<-Corpus(VectorSource(input))
corpus_clean1 <- tm_map(corp1, tolower)
corpus_clean1 <- tm_map(corpus_clean1, removeNumbers)
corpus_clean1 <- tm_map(corpus_clean1, removeWords, stopwords())
corpus_clean1 <- tm_map(corpus_clean1, removePunctuation)
corpus_clean1 <- tm_map(corpus_clean1, stripWhitespace)
corpus_clean1 <- tm_map(corpus_clean1, removeWords,mystopword)
sms_corpus_test1 <- corpus_clean1
sms_test1 <- DocumentTermMatrix(sms_corpus_test1)
sms_test1.m <- as.matrix(sms_test1)
sms_test1.df <- as.data.frame(sms_test1.m)
sms_test1.df <- apply(sms_test1.df, MARGIN = 2, convert_counts)
sms_test_pred <- predict(sms_classifier, sms_test1.df)
print(sms_test_pred)
## [1] F163 Sys  Mgm  EDW Infrastructure F163 Sys  Mgm  EDW Infrastructure
## 4 Levels: F161 Sys  Mgm  EDW ... F164 Sys  Mgm  EDW Architecture

Try with SVM

SVM builds a hyperplane with the maximum distance to the nearest points of the different categories, so it handles outliers on its own. It can also map the data into another dimension (e.g. x^2) to separate the classes effectively via the kernel trick (linear, polynomial, ...). SVM can be used for both regression and classification, but is mostly used for classification. Parameters: kernel, gamma (a higher value overfits) and C, the penalty factor (a trade-off between correct predictions and a smooth decision boundary). It is effective in high-dimensional spaces (p > n), but training is time-consuming.

library(RODBC,lib.loc="H:/RLibrary")
library(slam,lib.loc="H://RLibrary")
library(NLP,lib.loc="H://RLibrary")
library(tm,lib.loc="H://RLibrary")
library(SparseM,lib.loc="H:/RLibrary")
library(e1071,lib.loc="H://RLibrary")

dbhandle <- odbcDriverConnect('driver={SQL Server};server=C44S03\\INST003;database=ITSM;trusted_connection=true;uid=ba4205 , pwd=apr2017')
# change the year as needed
res_inc <- sqlQuery(dbhandle, "SELECT [Assignment_Group],TITLE FROM [ITSM].[dbo].[SN_Incident]
                               where ([Assignment_Group] like 'F161%' or [Assignment_Group] like 'F162%'
                                   or [Assignment_Group] like 'F163%' or [Assignment_Group] like 'F164%')
                               and Opened_time_year=2018")
odbcCloseAll()

# SVM cannot work with factor (Yes/No) features, so the convert_counts step
# that was required for Naive Bayes is removed here
res_inc$Assignment_Group <- as.character(res_inc$Assignment_Group)
res_inc$TITLE <- as.character(res_inc$TITLE)
res_inc$TITLE <- tolower(res_inc$TITLE)
res_inc$Assignment_Group <- as.factor(res_inc$Assignment_Group)
res_inc$TITLE <- as.factor(res_inc$TITLE)

# row 1073 contains Danish characters; they must be transliterated, otherwise
# they produce junk characters that will not match between training and test
to.plain <- function(s) {
  # 1-character substitutions
  old1 <- "åÅøä"
  new1 <- "aaoa"
  s1 <- chartr(old1, new1, s)
  # 2-character substitutions
  old2 <- c("ö", "ß", "æ", "ø")
  new2 <- c("oe", "ss", "ae", "oe")
  s2 <- s1
  for (i in seq_along(old2)) s2 <- gsub(old2[i], new2[i], s2, fixed = TRUE)
  s2
}
subset(res_inc, grepl("ew_wh", TITLE))
res_inc <- as.data.frame(lapply(res_inc, to.plain))
res_inc <- as.data.frame(lapply(res_inc, function(x) gsub(".", " ", x, fixed = TRUE)))

corp <- Corpus(VectorSource(res_inc$TITLE))
inspect(corp[1:3])
corpus_clean <- tm_map(corp, tolower)
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
mystopword <- c("a","an","the","job","error","automatic","batch","tiludv","auto","ended","jcl","jcli")
corpus_clean <- tm_map(corpus_clean, removeWords, mystopword)

set.seed(123)
library(RColorBrewer,lib.loc="H://RLibrary")
library(wordcloud,lib.loc="H://RLibrary")

sms_all <- DocumentTermMatrix(corpus_clean)
sms_all.m <- as.matrix(sms_all)
sms_all.df <- as.data.frame(sms_all.m)
# sort the columns in ascending order so that train and test columns are in the
# same order and the same terms are used in both, otherwise SVM will not work
cc <- colnames(sms_all.df)[order(colnames(sms_all.df))]
sms_all.df <- sms_all.df[, cc]

all <- as.factor(res_inc$Assignment_Group)
data <- as.data.frame(cbind(all, as.matrix(sms_all.df)))
smp_size <- floor(0.75 * nrow(data))
train_ind <- sample(seq_len(nrow(data)), size = smp_size)
data_train <- data[train_ind, ]   # 2859 rows, 1325 columns and 1 predictor
data_test <- data[-train_ind, ]   # 953 rows

sv <- svm(all ~ ., data_train, type = "C-classification", kernel = "linear", cost = 1)
a <- table(Pred = predict(sv, data_test[, -1]), True = data_test$all)   # 1325 terms

I <- dim(a)[1]
J <- dim(a)[2]
true <- 0
false <- 0
for (i in 1:I){
  for (j in 1:J){
    if (i == j){true <- true + a[i, j]}}}
for (i in 1:I){
  for (j in 1:J){
    if (i != j){
      false <- false + a[i, j]}}}
accuracy <- true / (false + true)

# predict a single case
# input <- "JOB EWCRRW14 (EWPSAKRW14.0010.01041800.JCL) ENDED IN ERROR JCL tiludv"
input <- "JOB ZWCLEGB3 (ZWCFAC.0010.01041800.JCL) ENDED IN ERROR JCL tiludv suman"
input <- gsub(".", " ", input, fixed = TRUE)
corp1 <- Corpus(VectorSource(input))
corpus_clean1 <- tm_map(corp1, tolower)
corpus_clean1 <- tm_map(corpus_clean1, removeNumbers)
corpus_clean1 <- tm_map(corpus_clean1, removeWords, stopwords())
corpus_clean1 <- tm_map(corpus_clean1, removePunctuation)
corpus_clean1 <- tm_map(corpus_clean1, stripWhitespace)
corpus_clean1 <- tm_map(corpus_clean1, removeWords, mystopword)
sms_corpus_test1 <- corpus_clean1
# use the same terms as in training, otherwise SVM will not work for a single prediction
sms_test1 <- DocumentTermMatrix(sms_corpus_test1, control = list(dictionary = Terms(sms_all)))
sms_test1.m <- as.matrix(sms_test1)
sms_test1.df <- as.data.frame(sms_test1.m)
# sort the columns in ascending order to get the same order as in training
sms_test1.df <- sms_test1.df[, cc]
sms_test1.df <- as.data.frame(as.matrix(sms_test1.df))
predict(sv, sms_test1.df)   # 1324 terms
# debugging note: without to.plain, object 'ewpmåned' is not found (only 'ewpmÃ¥ned' is),
# because that column is not present in the test data;
# check with colnames(sms_test1.df)[order(colnames(sms_test1.df))]
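
The kernel, gamma and cost parameters mentioned above can also be chosen by cross-validated grid search instead of fixing cost = 1. A minimal sketch using e1071::tune; the grid values are illustrative, not part of the original analysis:

# hedged sketch: 10-fold cross-validated grid search over cost for the linear kernel
sv_tune <- tune(svm, all ~ ., data = data_train,
                kernel = "linear", type = "C-classification",
                ranges = list(cost = c(0.1, 1, 10)))
summary(sv_tune)               # cross-validation error for each cost value
sv_best <- sv_tune$best.model  # model refitted with the best cost
table(Pred = predict(sv_best, data_test[, -1]), True = data_test$all)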