library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rpart.plot)
## Loading required package: rpart
library(rpart)
#set parallel backend (Windows)
library(parallelMap)
library(parallel)
parallelStartSocket(cpus = detectCores())
## Starting parallelization in mode=socket with cpus=8.
library(tree)
#every ML (decision tree classification) problem can be solved using the following steps:
# Import the data
# Clean the dataset
# Create train/test set
# Build the model
# Make prediction
# Measure performance
sam<- read.csv('C:/Users/somy/Documents/merkel_sokrati_assignment/sample_user_data.csv')
nrow(sam)
## [1] 464704
#for getting the unique (first-time) visitors to the site, the visit number should be 1
levels(as.factor(sam$visitNumber))
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11"
## [12] "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22"
## [23] "23" "24" "25" "26" "27" "28" "29" "30" "31" "32" "33"
## [34] "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44"
## [45] "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55"
## [56] "56" "57" "58" "59" "60" "61" "62" "63" "64" "65" "66"
## [67] "67" "68" "69" "70" "71" "72" "73" "74" "75" "76" "77"
## [78] "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88"
## [89] "89" "90" "91" "92" "93" "94" "95" "96" "97" "98" "99"
## [100] "100" "101" "102" "103" "104" "105" "106" "107" "108" "109" "110"
## [111] "111" "112" "113" "114" "115" "116" "117" "118" "119" "120" "121"
## [122] "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143"
## [144] "144" "145" "146" "147" "148" "149" "150" "151" "152" "153" "154"
## [155] "155" "156" "157" "158" "159" "160" "161" "162" "163" "164" "165"
## [166] "166" "167" "168" "169" "170" "171" "172" "173" "174" "176" "177"
## [177] "178" "179" "180" "181" "182" "183" "184" "185" "186" "187" "188"
## [188] "189" "190" "191" "192" "193" "194" "195" "196" "197" "198" "199"
## [199] "200" "201" "202" "203" "204" "205" "206" "207" "208" "209" "210"
## [210] "211" "212" "213" "214" "215" "216" "217" "218" "219" "220" "221"
## [221] "222" "223" "224" "225" "226" "227" "228" "229" "230" "231" "232"
## [232] "233" "234" "235" "236" "237" "238" "239" "240" "241" "242" "243"
## [243] "244" "245" "246" "247" "248" "249" "250" "251" "252" "253" "254"
## [254] "255" "256" "257" "258" "259" "260" "261" "262" "263" "264" "265"
## [265] "266" "267" "268" "269" "270" "271" "272" "273" "274" "275" "276"
## [276] "277" "278" "279" "280" "281" "282" "283" "284" "285" "286" "287"
## [287] "288" "289" "290" "291" "292" "293" "294" "295" "296" "297" "298"
## [298] "299" "300" "301" "302" "303" "304" "305" "306" "307" "308" "309"
## [309] "310" "311" "312" "313" "314" "315" "316" "317" "318" "319" "320"
## [320] "321" "322" "323" "324" "325" "326" "327" "328" "329" "333" "334"
## [331] "335" "336" "337" "338" "339" "340" "341" "342" "343" "344" "345"
## [342] "346" "347" "348" "349" "350" "351" "352" "353" "354" "355" "356"
## [353] "357" "358" "359" "360" "361" "362" "363" "364" "369" "370" "371"
## [364] "372" "373" "374" "375" "376" "377" "378" "379" "383" "384" "385"
## [375] "386" "387" "388" "389" "390" "391" "393" "394" "395"
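#(sketch) a quick count of first-time visits, assuming visitNumber == 1 marks a user's
#first visit to the site
sum(sam$visitNumber == 1, na.rm = TRUE)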
levels(as.factor(sam$totals_transactions))
## [1] "1" "2" "3" "4" "7" "8" "10" "12" "15" "21" "25"
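#(sketch) totals_transactions appears to be NA for sessions without a purchase, so the
#number and share of transacting rows can be checked directly
sum(!is.na(sam$totals_transactions))
mean(!is.na(sam$totals_transactions))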
#let's do an initial classification using the raw csv
set.seed(123)
split<-sample(2,nrow(sam),prob = c(0.8,0.2),replace = T)
train_raw<-sam[split==1,]
test_raw<-sam[split==2,]
model_raw<-rpart(totals_transactions~.,data = train_raw)
rpart.plot(model_raw)

raw_predict<-predict(model_raw,test_raw)
plot(raw_predict)
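#(sketch) the raw model is a regression tree because totals_transactions is numeric, so its
#predictions are continuous counts rather than a yes/no label
summary(raw_predict)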

#we cannot infer whether a person will transact or not, so we need to modify and
#cleanse the data
#creating the list of users who have visited the site for the first time
r <- sam %>%
  filter(visitNumber == 1)
#now we mutate the r data frame to add one more column indicating whether a
#transaction happened: if totals_transactions holds a positive value, a transaction
#took place, otherwise it did not
#creating a data frame with the mutated column
t <- r %>%
  mutate(ton = totals_transactions)
levels(as.factor(t$totals_transactions))
## [1] "1" "2" "3" "7" "12" "21" "25"
levels(as.factor(t$ton))
## [1] "1" "2" "3" "7" "12" "21" "25"
#converting the NA values to 0
t$ton[is.na(t$ton)]<-0
#recoding the remaining values: 0 becomes 'no' and any positive count becomes 'yes'
#the logic behind the approach is that a user whose visitNumber is 1 is a
#first-time visitor to the site, and any positive integer in totals_transactions
#means the user has transacted on the site
for (i in 1:nrow(t)){
  if (t$ton[i] == 0){
    t$ton[i] <- 'no'
  } else {
    t$ton[i] <- 'yes'
  }
}
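#(sketch) the same NA handling and recoding could be done in one vectorized dplyr step;
#shown here on a separate data frame (t_alt, an illustrative name) so the values above
#are not recoded twice
t_alt <- r %>%
  mutate(ton = if_else(is.na(totals_transactions), 'no', 'yes'))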
#writing the data frame to csv
setwd('C:/Users/somy/Documents/merkel_sokrati_assignment')
#write.csv ignores the append and sep arguments, so they are dropped here
write.csv(t,file = "updated.csv")
#reading the updated csv back in
t<-read.csv('C:/Users/somy/Documents/merkel_sokrati_assignment/updated.csv')
#converting the NA to 0
t[is.na(t)]<-0
#removing the non-relevant columns
t<-t[,-c(1:3)]
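#(sketch) depending on the R version, read.csv may return ton as character rather than a
#factor; rpart's class method and caret's confusionMatrix expect a factor outcome, so it
#can be coerced explicitly here
t$ton <- as.factor(t$ton)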
set.seed(1234)
split<-sample(2,nrow(t),prob = c(0.8,0.2),replace = T)
train<-t[split==1,]
test<-t[split==2,]
t[is.na(t)]<-0
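#(sketch) since only a small share of sessions transact, a stratified split that preserves
#the yes/no proportion could be used instead; idx, train_alt and test_alt are illustrative
#names, and createDataPartition comes from caret (loaded further below)
idx <- caret::createDataPartition(t$ton, p = 0.8, list = FALSE)
train_alt <- t[idx, ]
test_alt <- t[-idx, ]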
#model creation
model<-rpart(ton~.,data = train,method = 'class')
#plotting the model
rpart.plot(model)

#plotting the same model differently
plot(model,margin=0.1)
text(model,use.n = TRUE,cex=0.8)
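#(sketch) the fit can be checked for over-growth using rpart's complexity table, pruning at
#the cp value with the lowest cross-validated error; pruned is an illustrative name
printcp(model)
pruned <- prune(model, cp = model$cptable[which.min(model$cptable[, "xerror"]), "CP"])
rpart.plot(pruned)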
#making prediction on test data set
predict_test<-predict(model,test,type = 'class')
#creating a table for the comparison
table(predict_test,test$ton)
##
## predict_test no yes
## no 70013 0
## yes 0 529
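#(sketch) overall accuracy can also be read directly off the comparison table; tab is an
#illustrative name
tab <- table(predict_test, test$ton)
sum(diag(tab)) / sum(tab)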
#creating a confusion matrix to analyze the above result
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2

confusionMatrix(predict_test,test$ton)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 70013 0
## yes 0 529
##
## Accuracy : 1
## 95% CI : (0.9999, 1)
## No Information Rate : 0.9925
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.9925
## Detection Rate : 0.9925
## Detection Prevalence : 0.9925
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : no
##
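#(sketch) since ton was derived from totals_transactions and that column is still among the
#predictors, the perfect accuracy is worth sanity-checking by inspecting which variables the
#tree actually splits on
model$variable.importance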
plot(predict_test)
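#shutting down the socket cluster started at the top of the script
parallelStop()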
