library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rpart.plot)
## Loading required package: rpart
library(rpart)
#set parallel backend (Windows)
library(parallelMap)
library(parallel)
parallelStartSocket(cpus = detectCores())
## Starting parallelization in mode=socket with cpus=8.
library(tree)
#every ML (decision tree classification) problem can be solved using the following steps:
# Import the data
# Clean the dataset
# Create train/test set
# Build the model
# Make prediction
# Measure performance
sam<- read.csv('C:/Users/somy/Documents/merkel_sokrati_assignment/sample_user_data.csv')
nrow(sam)
## [1] 464704
#for getting the unique (first-time) visitors to the site, the visit number should be 1
levels(as.factor(sam$visitNumber))
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11"
## [12] "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22"
## [23] "23" "24" "25" "26" "27" "28" "29" "30" "31" "32" "33"
## [34] "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44"
## [45] "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55"
## [56] "56" "57" "58" "59" "60" "61" "62" "63" "64" "65" "66"
## [67] "67" "68" "69" "70" "71" "72" "73" "74" "75" "76" "77"
## [78] "78" "79" "80" "81" "82" "83" "84" "85" "86" "87" "88"
## [89] "89" "90" "91" "92" "93" "94" "95" "96" "97" "98" "99"
## [100] "100" "101" "102" "103" "104" "105" "106" "107" "108" "109" "110"
## [111] "111" "112" "113" "114" "115" "116" "117" "118" "119" "120" "121"
## [122] "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143"
## [144] "144" "145" "146" "147" "148" "149" "150" "151" "152" "153" "154"
## [155] "155" "156" "157" "158" "159" "160" "161" "162" "163" "164" "165"
## [166] "166" "167" "168" "169" "170" "171" "172" "173" "174" "176" "177"
## [177] "178" "179" "180" "181" "182" "183" "184" "185" "186" "187" "188"
## [188] "189" "190" "191" "192" "193" "194" "195" "196" "197" "198" "199"
## [199] "200" "201" "202" "203" "204" "205" "206" "207" "208" "209" "210"
## [210] "211" "212" "213" "214" "215" "216" "217" "218" "219" "220" "221"
## [221] "222" "223" "224" "225" "226" "227" "228" "229" "230" "231" "232"
## [232] "233" "234" "235" "236" "237" "238" "239" "240" "241" "242" "243"
## [243] "244" "245" "246" "247" "248" "249" "250" "251" "252" "253" "254"
## [254] "255" "256" "257" "258" "259" "260" "261" "262" "263" "264" "265"
## [265] "266" "267" "268" "269" "270" "271" "272" "273" "274" "275" "276"
## [276] "277" "278" "279" "280" "281" "282" "283" "284" "285" "286" "287"
## [287] "288" "289" "290" "291" "292" "293" "294" "295" "296" "297" "298"
## [298] "299" "300" "301" "302" "303" "304" "305" "306" "307" "308" "309"
## [309] "310" "311" "312" "313" "314" "315" "316" "317" "318" "319" "320"
## [320] "321" "322" "323" "324" "325" "326" "327" "328" "329" "333" "334"
## [331] "335" "336" "337" "338" "339" "340" "341" "342" "343" "344" "345"
## [342] "346" "347" "348" "349" "350" "351" "352" "353" "354" "355" "356"
## [353] "357" "358" "359" "360" "361" "362" "363" "364" "369" "370" "371"
## [364] "372" "373" "374" "375" "376" "377" "378" "379" "383" "384" "385"
## [375] "386" "387" "388" "389" "390" "391" "393" "394" "395"
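#(sketch) a quick count of first-time visits, assuming visitNumber == 1 marks a user's
#first visit to the site
sum(sam$visitNumber == 1, na.rm = TRUE)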
levels(as.factor(sam$totals_transactions))
## [1] "1" "2" "3" "4" "7" "8" "10" "12" "15" "21" "25"
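#(sketch) totals_transactions appears to be NA for sessions without a purchase, so the
#number and share of transacting rows can be checked directly
sum(!is.na(sam$totals_transactions))
mean(!is.na(sam$totals_transactions))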
#let's do an initial classification using the raw csv
set.seed(123)
split<-sample(2,nrow(sam),prob = c(0.8,0.2),replace = T)
train_raw<-sam[split==1,]
test_raw<-sam[split==2,]
model_raw<-rpart(totals_transactions~.,data = train_raw)
rpart.plot(model_raw)

raw_predict<-predict(model_raw,test_raw)
plot(raw_predict)
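#(sketch) the raw model is a regression tree because totals_transactions is numeric, so its
#predictions are continuous counts rather than a yes/no label
summary(raw_predict)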

#we cannot infer whether a person will transact or not, so we need to modify and
#cleanse the data
#creating the list of users who have visited the site for the first time
r <- sam %>%
  filter(visitNumber == 1)
#now we mutate the r data frame to add one more column indicating whether a
#transaction happened: if totals_transactions holds a positive value, a transaction
#took place, otherwise it did not
#creating a data frame with the mutated column
t <- r %>%
  mutate(ton = totals_transactions)
levels(as.factor(t$totals_transactions))
## [1] "1" "2" "3" "7" "12" "21" "25"
levels(as.factor(t$ton))
## [1] "1" "2" "3" "7" "12" "21" "25"
#converting the NA values to 0
t$ton[is.na(t$ton)]<-0
#recoding the remaining values: 0 becomes 'no' and any positive count becomes 'yes'
#the logic behind the approach is that a user whose visitNumber is 1 is a
#first-time visitor to the site, and any positive integer in totals_transactions
#means the user has transacted on the site
for (i in 1:nrow(t)){
  if (t$ton[i] == 0){
    t$ton[i] <- 'no'
  } else {
    t$ton[i] <- 'yes'
  }
}
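#(sketch) the same NA handling and recoding could be done in one vectorized dplyr step;
#shown here on a separate data frame (t_alt, an illustrative name) so the values above
#are not recoded twice
t_alt <- r %>%
  mutate(ton = if_else(is.na(totals_transactions), 'no', 'yes'))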
#writing the data frame to csv
setwd('C:/Users/somy/Documents/merkel_sokrati_assignment')
#write.csv ignores the append and sep arguments, so they are dropped here
write.csv(t,file = "updated.csv")
#reading the updated csv back in
t<-read.csv('C:/Users/somy/Documents/merkel_sokrati_assignment/updated.csv')
#converting the NA to 0
t[is.na(t)]<-0
#removing the non-relevant columns
t<-t[,-c(1:3)]
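#(sketch) depending on the R version, read.csv may return ton as character rather than a
#factor; rpart's class method and caret's confusionMatrix expect a factor outcome, so it
#can be coerced explicitly here
t$ton <- as.factor(t$ton)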
set.seed(1234)
split<-sample(2,nrow(t),prob = c(0.8,0.2),replace = T)
train<-t[split==1,]
test<-t[split==2,]
t[is.na(t)]<-0
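#(sketch) since only a small share of sessions transact, a stratified split that preserves
#the yes/no proportion could be used instead; idx, train_alt and test_alt are illustrative
#names, and createDataPartition comes from caret (loaded further below)
idx <- caret::createDataPartition(t$ton, p = 0.8, list = FALSE)
train_alt <- t[idx, ]
test_alt <- t[-idx, ]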
#model creation
model<-rpart(ton~.,data = train,method = 'class')
#plotting the model
rpart.plot(model)

#plotting the same model differently
plot(model,margin=0.1)
text(model,use.n = TRUE,cex=0.8)
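#(sketch) the fit can be checked for over-growth using rpart's complexity table, pruning at
#the cp value with the lowest cross-validated error; pruned is an illustrative name
printcp(model)
pruned <- prune(model, cp = model$cptable[which.min(model$cptable[, "xerror"]), "CP"])
rpart.plot(pruned)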
#making prediction on test data set
predict_test<-predict(model,test,type = 'class')
#creating a table for the comparison
table(predict_test,test$ton)
##
## predict_test no yes
## no 70013 0
## yes 0 529
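#(sketch) overall accuracy can also be read directly off the comparison table; tab is an
#illustrative name
tab <- table(predict_test, test$ton)
sum(diag(tab)) / sum(tab)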
#creating a confusion matrix to analyze the above result
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2

confusionMatrix(predict_test,test$ton)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 70013 0
## yes 0 529
##
## Accuracy : 1
## 95% CI : (0.9999, 1)
## No Information Rate : 0.9925
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.9925
## Detection Rate : 0.9925
## Detection Prevalence : 0.9925
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : no
##
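#(sketch) since ton was derived from totals_transactions and that column is still among the
#predictors, the perfect accuracy is worth sanity-checking by inspecting which variables the
#tree actually splits on
model$variable.importance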
plot(predict_test)
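#shutting down the socket cluster started at the top of the script
parallelStop()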
