Sometimes incidents are assigned to wrong groups causing a lot of time wasted to reassign to particular group. So the idea was to make it automaticall assigned to correct group using predictive modeling and text mining. This has a huge potential to minisise cost and customer satisfaction
#library(RODBC)
library(slam)
library(NLP)
library(tm)
library(dplyr)
library(RColorBrewer)
library(wordcloud)
library(e1071)
#dbhandle <- odbcDriverConnect('driver={SQL #Server};server=C44S03\\INST003;database=ITSM;trusted_connection=true;uid=***
, pwd=***')
#chnage the year
#res_inc <- sqlQuery(dbhandle, "SELECT [Assignment_Group],TITLE
FROM [ITSM].[dbo].[SN_Incident] where
[Assignment_Group] like 'F161%' or
[Assignment_Group] like 'F162%' or
[Assignment_Group] like 'F163%' or
[Assignment_Group] like 'F164%'
and Opened_time_year=2018")
odbcCloseAll()
setwd("C:\\Users\\suman\\Desktop\\datasciencework\\PROJECTS")
res_inc=read.csv("res_inc.csv")
str(res_inc)
table(res_inc$Assignment_Group)
res_inc=res_inc[,-1]
we have 1467,706,1634,66 class total 3873
make “.” to " "
res_inc<-as.data.frame(lapply(res_inc ,function(x) gsub(".", " ",
x,fixed = TRUE)))
#create corpora and tdm
corp<-Corpus(VectorSource(res_inc$TITLE))
inspect(corp[1:3])
corpus_clean <- tm_map(corp, tolower)
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
mystopword<-c("a","an","the","job","error","automatic","batch","tiludv","auto","ended","jcl","jcli")
corpus_clean <- tm_map(corpus_clean, removeWords,mystopword)
sms_dtm <- DocumentTermMatrix(corpus_clean)
#sms_dtm <- DocumentTermMatrix(corpus_clean,control=list(weighting=weightTfIdf))
set.seed(123)
smp_size <- floor(0.75 * nrow(res_inc))
train_ind <- sample(seq_len(nrow(res_inc)), size = smp_size)
sms_raw_train <- res_inc[train_ind, ]
sms_raw_test <- res_inc[-train_ind, ]
sms_dtm_train <- sms_dtm[train_ind, ]
sms_dtm_test <- sms_dtm[-train_ind, ]
sms_corpus_train <- corpus_clean[train_ind]
sms_corpus_test <- corpus_clean[-train_ind]
# get wordcloud for diff proj code
ind<-which(sms_raw_train$Assignment_Group=="F161 Sys Mgm EDW")
sms_corpus_train_f161 <- sms_corpus_train[ind]
ind<-which(sms_raw_train$Assignment_Group=="F162 Sys Mgm Quality Assurance")
sms_corpus_train_f162 <- sms_corpus_train[ind]
ind<-which(sms_raw_train$Assignment_Group=="F163 Sys Mgm EDW Infrastructure")
sms_corpus_train_f163 <- sms_corpus_train[ind]
ind<-which(sms_raw_train$Assignment_Group=="F164 Sys Mgm EDW Architecture")
sms_corpus_train_f164 <- sms_corpus_train[ind]
wordcloud(sms_corpus_train_f161, min.freq = 5, random.order = FALSE)
wordcloud(sms_corpus_train_f162, min.freq = 5, random.order = FALSE)
wordcloud(sms_corpus_train_f163, min.freq = 5, random.order = FALSE)
wordcloud(sms_corpus_train_f164, min.freq = 5, random.order = FALSE)
wordcloud(sms_corpus_train, min.freq = 10, random.order = FALSE)
sms_train <- DocumentTermMatrix(sms_corpus_train)
sms_test <- DocumentTermMatrix(sms_corpus_test)
sms_train.m <- as.matrix(sms_train)
sms_test.m <- as.matrix(sms_test)
sms_train.df <- as.data.frame(sms_train.m)
sms_test.df <- as.data.frame(sms_test.m)
#naive bayes only works with catagorigal data but DTM contains count
#so we will change it to 1 or 0
convert_counts <- function(x) {
x <- ifelse(x > 0, 1, 0)
x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
return(x)
}
sms_train.df <- apply(sms_train.df, MARGIN = 2, convert_counts)
sms_test.df <- apply(sms_test.df, MARGIN = 2, convert_counts)
sms_classifier <- naiveBayes(sms_train.df, as.factor(sms_raw_train$Assignment_Group),laplace=1)
sms_test_pred_e <- predict(sms_classifier, sms_test.df)
a<-table(True=sms_raw_test$Assignment_Group,Pred=sms_test_pred_e)
I=dim(a)[1]
J=dim(a)[2]
true<-0
false<-0
for (i in 1:I){
for (j in 1:J){
if (i==j){true=true+a[i,j]}}}
for (i in 1:I){
for (j in 1:J){
if (i!=j){
false=false+a[i,j]}}}
accuracy=true/(false+true)
print(accuracy)
input<-"JOB EWCSCR41 (EWPSAKRW14.0010.01041800.JCL) ENDED IN ERROR JCL tiludv"
input<-gsub(".", " ", input,fixed = TRUE)
corp1<-Corpus(VectorSource(input))
corpus_clean1 <- tm_map(corp1, tolower)
corpus_clean1 <- tm_map(corpus_clean1, removeNumbers)
corpus_clean1 <- tm_map(corpus_clean1, removeWords, stopwords())
corpus_clean1 <- tm_map(corpus_clean1, removePunctuation)
corpus_clean1 <- tm_map(corpus_clean1, stripWhitespace)
corpus_clean1 <- tm_map(corpus_clean1, removeWords,mystopword)
sms_corpus_test1 <- corpus_clean1
sms_test1 <- DocumentTermMatrix(sms_corpus_test1)
sms_test1.m <- as.matrix(sms_test1)
sms_test1.df <- as.data.frame(sms_test1.m)
sms_test1.df <- apply(sms_test1.df, MARGIN = 2, convert_counts)
sms_test_pred <- predict(sms_classifier, sms_test1.df)
print(sms_test_pred)
n different catagory nearest point, it takes care of outliner #on its own, it also convert to another dimension like x^2 to seperate effectively by kernel trick(linear,poly). It can be used for regression and classification, #but mostly used for classification. Parameter: kernel,gamma(higer value overfit),c-penalty factor(adjestmnt for correct preditio #or smooth decision boundary),its effective for high dimensional spaces p>n . Its time taking library(RODBC,lib.loc=“H:/RLibrary”) library(slam,lib.loc=“H://RLibrary”) library(NLP,lib.loc=“H://RLibrary”) library(tm,lib.loc=“H://RLibrary”) library(SparseM,lib.loc=“H:/RLibrary”) library(e1071,lib.loc=“H://RLibrary”)
dbhandle <- odbcDriverConnect(‘driver={SQL Server};server=C44S03\INST003;database=ITSM;trusted_connection=true;uid=ba4205 , pwd=apr2017’) #change year res_inc <- sqlQuery(dbhandle, “SELECT [Assignment_Group],TITLE FROM [ITSM].[dbo].[SN_Incident] where [Assignment_Group] like ‘F161%’ or [Assignment_Group] like ‘F162%’ or [Assignment_Group] like ‘F163%’ or [Assignment_Group] like ‘F164%’ and Opened_time_year=2018”) odbcCloseAll() #SVm cannot work with factor(yes/no) so removed convert_counts methood which was required for naive bayes res_inc\(Assignment_Group<-as.character(res_inc\)Assignment_Group) res_inc\(TITLE<-as.character(res_inc\)TITLE) res_inc\(TITLE<-tolower(res_inc\)TITLE) res_inc\(Assignment_Group<-as.factor(res_inc\)Assignment_Group) res_inc\(TITLE<-as.factor(res_inc\)TITLE) #1073 has junk char as danish so that should be removed else those will create some junk char and wont match in test to.plain <- function(s) {
# 1 character substitutions old1 <- “åÅøä” new1 <- “aaoa” s1 <- chartr(old1, new1, s) # 2 character substitutions old2 <- c(“o”, “ß”, “æ”, “ø”) new2 <- c(“oe”, “ss”, “ae”, “oe”) s2 <- s1 for(i in seq_along(old2)) s2 <- gsub(old2[i], new2[i], s2, fixed = TRUE)
s2 } subset(res_inc, grepl(“ew_wh”, TITLE)) res_inc<-as.data.frame(lapply(res_inc ,to.plain)) res_inc<-as.data.frame(lapply(res_inc ,function(x) gsub(“.”, " “, x,fixed = TRUE))) corp<-Corpus(VectorSource(res_inc\(TITLE)) inspect(corp[1:3]) corpus_clean <- tm_map(corp, tolower) corpus_clean <- tm_map(corpus_clean, removeNumbers) corpus_clean <- tm_map(corpus_clean, removeWords, stopwords()) corpus_clean <- tm_map(corpus_clean, removePunctuation) corpus_clean <- tm_map(corpus_clean, stripWhitespace) mystopword<-c("a","an","the","job","error","automatic","batch","tiludv","auto","ended","jcl","jcli") corpus_clean <- tm_map(corpus_clean, removeWords,mystopword) set.seed(123) library(RColorBrewer,lib.loc="H://RLibrary") library(wordcloud,lib.loc="H://RLibrary") sms_all <- DocumentTermMatrix(corpus_clean) sms_all.m <- as.matrix(sms_all) sms_all.df <- as.data.frame(sms_all.m) #sort columns in ascending so that train and test columns are in same order, also same terms are used in train and test else svm wont work cc<-colnames(sms_all.df)[order(colnames(sms_all.df))] sms_all.df<-sms_all.df[,cc] all<-as.factor(res_inc\)Assignment_Group) data <- as.data.frame(cbind(all,as.matrix(sms_all.df))) smp_size <- floor(0.75 * nrow(data)) train_ind <- sample(seq_len(nrow(data)), size = smp_size) data_train<-data[train_ind,]#2859 1325 col and 1 predictor data_test<-data[-train_ind,]#953 sv <- svm(all~., data_train, type=”C-classification“, kernel=”linear“, cost=1) a<-table(Pred=predict(sv, data_test[,-1]) , True=data_test$all) #1325 I=dim(a)[1] J=dim(a)[2] true<-0 false<-0 for (i in 1:I){ for (j in 1:J){ if (i==j){true=true+a[i,j]} } }
for (i in 1:I){ for (j in 1:J){ if (i!=j){ false=false+a[i,j]} } }
accuracy=true/(false+true) #predict single case #input<-“JOB EWCRRW14 (EWPSAKRW14.0010.01041800.JCL) ENDED IN ERROR JCL tiludv” input<-“JOB ZWCLEGB3 (ZWCFAC.0010.01041800.JCL) ENDED IN ERROR JCL tiludv suman” input<-gsub(“.”, " “, input,fixed = TRUE) corp1<-Corpus(VectorSource(input)) corpus_clean1 <- tm_map(corp1, tolower) corpus_clean1 <- tm_map(corpus_clean1, removeNumbers) corpus_clean1 <- tm_map(corpus_clean1, removeWords, stopwords()) corpus_clean1 <- tm_map(corpus_clean1, removePunctuation) corpus_clean1 <- tm_map(corpus_clean1, stripWhitespace) corpus_clean1 <- tm_map(corpus_clean1, removeWords,mystopword) sms_corpus_test1 <- corpus_clean1 #use the same term as training otherwise for single prediction svm wont work sms_test1 <- DocumentTermMatrix(sms_corpus_test1,control = list(dictionary=Terms(sms_all)) ) sms_test1.m <- as.matrix(sms_test1) sms_test1.df <- as.data.frame(sms_test1.m) #sort columns in ascending to get right order as training sms_test1.df<-sms_test1.df[,cc] sms_test1.df<-as.data.frame(as.matrix(sms_test1.df)) predict(sv, sms_test1.df)#1324 #object ‘ewpmåned’ not found (but ewpmÃ¥ned found) colnames(sms_test1.df)[order(colnames(sms_test1.df))], #because this column is not present in test