rmarkdown is used and saved as .rmd and then published in kniter
Sometimes incidents are assigned to wrong groups causing a lot of time wasted to reassign to particular group. So the idea was to make it automaticall assigned to correct group using predictive modeling and text mining. This has a huge potential to minisise cost and customer satisfaction
library(RODBC) dbhandle <- odbcDriverConnect(‘driver={SQL Server};server=C44S03\INST003;database=ITSM;trusted_connection=true;uid= , pwd=’) chnage the year res_inc <- sqlQuery(dbhandle, “SELECT [Assignment_Group],TITLE FROM [ITSM].[dbo].[SN_Incident] where [Assignment_Group] like ‘F161%’ or [Assignment_Group] like ‘F162%’ or [Assignment_Group] like ‘F163%’ or [Assignment_Group] like ‘F164%’ and Opened_time_year=2018”)
odbcCloseAll()
library(slam)
library(NLP)
library(tm)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RColorBrewer)
library(wordcloud)
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.1
setwd("C:\\Users\\suman\\Desktop\\datasciencework\\PROJECTS")
res_inc=read.csv("res_inc.csv")
str(res_inc)
## 'data.frame': 3873 obs. of 3 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Assignment_Group: Factor w/ 4 levels "F161 Sys. Mgm. EDW",..: 1 1 1 2 2 3 3 1 1 1 ...
## $ TITLE : Factor w/ 3790 levels "Change in ZW_FACT_KOR_2XXX_H view",..: 594 1076 1154 1829 1628 2905 2256 1461 609 1001 ...
table(res_inc$Assignment_Group)
##
## F161 Sys. Mgm. EDW F162 Sys. Mgm. Quality Assurance
## 1467 706
## F163 Sys. Mgm. EDW Infrastructure F164 Sys. Mgm. EDW Architecture
## 1634 66
res_inc=res_inc[,-1]
we have 1467,706,1634,66 class total 3873
make “.” to " "
get wordcloud for diff proj code
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3
##
## [1] JOB EWCFLK08 (EWPCPYLK08 0010 01191800 2624) ENDED IN ERROR 2624 tiludv
## [2] JOB EWCSHG08 (EWPSAKHG08 0010 01201800 1000) ENDED IN ERROR 1000 tiludv
## [3] JOB EWCSKS01 (EWPSAKKS01 0010 11131800 1625) ENDED IN ERROR 1625 tiludv
sms_train <- DocumentTermMatrix(sms_corpus_train)
sms_test <- DocumentTermMatrix(sms_corpus_test)
sms_train.m <- as.matrix(sms_train)
sms_test.m <- as.matrix(sms_test)
sms_train.df <- as.data.frame(sms_train.m)
sms_test.df <- as.data.frame(sms_test.m)
#naive bayes only works with catagorigal data but DTM contains count
#so we will change it to 1 or 0
convert_counts <- function(x) {
x <- ifelse(x > 0, 1, 0)
x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
return(x)
}
sms_train.df <- apply(sms_train.df, MARGIN = 2, convert_counts)
sms_test.df <- apply(sms_test.df, MARGIN = 2, convert_counts)
sms_classifier <- naiveBayes(sms_train.df, as.factor(sms_raw_train$Assignment_Group),laplace=1)
sms_test_pred_e <- predict(sms_classifier, sms_test.df)
a<-table(True=sms_raw_test$Assignment_Group,Pred=sms_test_pred_e)
I=dim(a)[1]
J=dim(a)[2]
true<-0
false<-0
for (i in 1:I){
for (j in 1:J){
if (i==j){true=true+a[i,j]}}}
for (i in 1:I){
for (j in 1:J){
if (i!=j){
false=false+a[i,j]}}}
accuracy=true/(false+true)
print(accuracy)
## [1] 0.9329205
input<-"JOB EWCSCR41 (EWPSAKRW14.0010.01041800.JCL) ENDED IN ERROR JCL tiludv"
input<-gsub(".", " ", input,fixed = TRUE)
corp1<-Corpus(VectorSource(input))
corpus_clean1 <- tm_map(corp1, tolower)
corpus_clean1 <- tm_map(corpus_clean1, removeNumbers)
corpus_clean1 <- tm_map(corpus_clean1, removeWords, stopwords())
corpus_clean1 <- tm_map(corpus_clean1, removePunctuation)
corpus_clean1 <- tm_map(corpus_clean1, stripWhitespace)
corpus_clean1 <- tm_map(corpus_clean1, removeWords,mystopword)
sms_corpus_test1 <- corpus_clean1
sms_test1 <- DocumentTermMatrix(sms_corpus_test1)
sms_test1.m <- as.matrix(sms_test1)
sms_test1.df <- as.data.frame(sms_test1.m)
sms_test1.df <- apply(sms_test1.df, MARGIN = 2, convert_counts)
sms_test_pred <- predict(sms_classifier, sms_test1.df)
print(sms_test_pred)
## [1] F163 Sys Mgm EDW Infrastructure F163 Sys Mgm EDW Infrastructure
## 4 Levels: F161 Sys Mgm EDW ... F164 Sys Mgm EDW Architecture
n different catagory nearest point, it takes care of outliner #on its own, it also convert to another dimension like x^2 to seperate effectively by kernel trick(linear,poly). It can be used for regression and classification, #but mostly used for classification. Parameter: kernel,gamma(higer value overfit),c-penalty factor(adjestmnt for correct preditio #or smooth decision boundary),its effective for high dimensional spaces p>n . Its time taking library(RODBC,lib.loc=“H:/RLibrary”) library(slam,lib.loc=“H://RLibrary”) library(NLP,lib.loc=“H://RLibrary”) library(tm,lib.loc=“H://RLibrary”) library(SparseM,lib.loc=“H:/RLibrary”) library(e1071,lib.loc=“H://RLibrary”)
dbhandle <- odbcDriverConnect(‘driver={SQL Server};server=C44S03\INST003;database=ITSM;trusted_connection=true;uid=ba4205 , pwd=apr2017’) #change year res_inc <- sqlQuery(dbhandle, “SELECT [Assignment_Group],TITLE FROM [ITSM].[dbo].[SN_Incident] where [Assignment_Group] like ‘F161%’ or [Assignment_Group] like ‘F162%’ or [Assignment_Group] like ‘F163%’ or [Assignment_Group] like ‘F164%’ and Opened_time_year=2018”) odbcCloseAll() #SVm cannot work with factor(yes/no) so removed convert_counts methood which was required for naive bayes res_inc\(Assignment_Group<-as.character(res_inc\)Assignment_Group) res_inc\(TITLE<-as.character(res_inc\)TITLE) res_inc\(TITLE<-tolower(res_inc\)TITLE) res_inc\(Assignment_Group<-as.factor(res_inc\)Assignment_Group) res_inc\(TITLE<-as.factor(res_inc\)TITLE) #1073 has junk char as danish so that should be removed else those will create some junk char and wont match in test to.plain <- function(s) {
# 1 character substitutions old1 <- “åÅøä” new1 <- “aaoa” s1 <- chartr(old1, new1, s) # 2 character substitutions old2 <- c(“o”, “ß”, “æ”, “ø”) new2 <- c(“oe”, “ss”, “ae”, “oe”) s2 <- s1 for(i in seq_along(old2)) s2 <- gsub(old2[i], new2[i], s2, fixed = TRUE)
s2 } subset(res_inc, grepl(“ew_wh”, TITLE)) res_inc<-as.data.frame(lapply(res_inc ,to.plain)) res_inc<-as.data.frame(lapply(res_inc ,function(x) gsub(“.”, " “, x,fixed = TRUE))) corp<-Corpus(VectorSource(res_inc\(TITLE)) inspect(corp[1:3]) corpus_clean <- tm_map(corp, tolower) corpus_clean <- tm_map(corpus_clean, removeNumbers) corpus_clean <- tm_map(corpus_clean, removeWords, stopwords()) corpus_clean <- tm_map(corpus_clean, removePunctuation) corpus_clean <- tm_map(corpus_clean, stripWhitespace) mystopword<-c("a","an","the","job","error","automatic","batch","tiludv","auto","ended","jcl","jcli") corpus_clean <- tm_map(corpus_clean, removeWords,mystopword) set.seed(123) library(RColorBrewer,lib.loc="H://RLibrary") library(wordcloud,lib.loc="H://RLibrary") sms_all <- DocumentTermMatrix(corpus_clean) sms_all.m <- as.matrix(sms_all) sms_all.df <- as.data.frame(sms_all.m) #sort columns in ascending so that train and test columns are in same order, also same terms are used in train and test else svm wont work cc<-colnames(sms_all.df)[order(colnames(sms_all.df))] sms_all.df<-sms_all.df[,cc] all<-as.factor(res_inc\)Assignment_Group) data <- as.data.frame(cbind(all,as.matrix(sms_all.df))) smp_size <- floor(0.75 * nrow(data)) train_ind <- sample(seq_len(nrow(data)), size = smp_size) data_train<-data[train_ind,]#2859 1325 col and 1 predictor data_test<-data[-train_ind,]#953 sv <- svm(all~., data_train, type=”C-classification“, kernel=”linear“, cost=1) a<-table(Pred=predict(sv, data_test[,-1]) , True=data_test$all) #1325 I=dim(a)[1] J=dim(a)[2] true<-0 false<-0 for (i in 1:I){ for (j in 1:J){ if (i==j){true=true+a[i,j]} } }
for (i in 1:I){ for (j in 1:J){ if (i!=j){ false=false+a[i,j]} } }
accuracy=true/(false+true) #predict single case #input<-“JOB EWCRRW14 (EWPSAKRW14.0010.01041800.JCL) ENDED IN ERROR JCL tiludv” input<-“JOB ZWCLEGB3 (ZWCFAC.0010.01041800.JCL) ENDED IN ERROR JCL tiludv suman” input<-gsub(“.”, " “, input,fixed = TRUE) corp1<-Corpus(VectorSource(input)) corpus_clean1 <- tm_map(corp1, tolower) corpus_clean1 <- tm_map(corpus_clean1, removeNumbers) corpus_clean1 <- tm_map(corpus_clean1, removeWords, stopwords()) corpus_clean1 <- tm_map(corpus_clean1, removePunctuation) corpus_clean1 <- tm_map(corpus_clean1, stripWhitespace) corpus_clean1 <- tm_map(corpus_clean1, removeWords,mystopword) sms_corpus_test1 <- corpus_clean1 #use the same term as training otherwise for single prediction svm wont work sms_test1 <- DocumentTermMatrix(sms_corpus_test1,control = list(dictionary=Terms(sms_all)) ) sms_test1.m <- as.matrix(sms_test1) sms_test1.df <- as.data.frame(sms_test1.m) #sort columns in ascending to get right order as training sms_test1.df<-sms_test1.df[,cc] sms_test1.df<-as.data.frame(as.matrix(sms_test1.df)) predict(sv, sms_test1.df)#1324 #object ‘ewpmåned’ not found (but ewpmÃ¥ned found) colnames(sms_test1.df)[order(colnames(sms_test1.df))], #because this column is not present in test