# Load the Amazon order data. Text following a "#" in the CSV is treated as a
# comment and ignored.
# stringsAsFactors is set explicitly: since R 4.0 read.csv() keeps text columns
# as character by default, while the modelling and evaluation code in this
# script expects categorical columns such as Customer.Segment.n to be factors.
Amazon <- read.csv(
  "~/Data_Science/MarketingAnalytics/Amazon.csv",
  comment.char = "#",
  stringsAsFactors = TRUE
)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.4
# Split the data 70/30 into training and hold-out sets ----
# Seed the RNG before drawing so the same rows are selected on every run.
set.seed(143)
smp_size <- floor(nrow(Amazon) * 0.7)
# Row indices of the training sample, drawn without replacement.
train_ind <- sample.int(nrow(Amazon), size = smp_size)
train <- Amazon[train_ind, ]
# Everything not drawn for training becomes the test set.
test <- Amazon[-train_ind, ]
# Classification and Regression Tree ----
library(rpart)
# Fit a classification tree for customer segment on the training rows.
# minsplit = 10 and a very small complexity parameter let the tree grow deep;
# the cp table printed below shows how cross-validated error changes with size.
amazon.rt <- rpart(
  Customer.Segment.n ~ Product.Container.n + Ship.Mode.n + Order.Priority.n +
    Product.Category.n + Product.Sub.Category.n + Province + Region +
    PRIN1 + PRIN2 + PRIN3 + PRIN4,
  data = train,
  method = "class",
  control = rpart.control(minsplit = 10, cp = 1e-05)
)
printcp(amazon.rt)
##
## Classification tree:
## rpart(formula = Customer.Segment.n ~ Product.Container.n + Ship.Mode.n +
## Order.Priority.n + Product.Category.n + Product.Sub.Category.n +
## Province + Region + PRIN1 + PRIN2 + PRIN3 + PRIN4, data = train,
## method = "class", control = rpart.control(minsplit = 10,
## cp = 1e-05))
##
## Variables actually used in tree construction:
## [1] Order.Priority.n PRIN1 PRIN2
## [4] PRIN3 PRIN4 Product.Category.n
## [7] Product.Container.n Product.Sub.Category.n Province
## [10] Ship.Mode.n
##
## Root node error: 2556/5879 = 0.43477
##
## n= 5879
##
## CP nsplit rel error xerror xstd
## 1 2.0866e-03 0 1.00000 1.0000 0.014871
## 2 1.9562e-03 38 0.87989 1.0685 0.014961
## 3 1.7606e-03 43 0.87011 1.0716 0.014964
## 4 1.6954e-03 49 0.85955 1.0728 0.014965
## 5 1.5649e-03 53 0.85211 1.0736 0.014966
## 6 1.3693e-03 95 0.76995 1.0814 0.014972
## 7 1.3414e-03 112 0.74452 1.0845 0.014975
## 8 1.1737e-03 130 0.71792 1.0923 0.014980
## 9 1.0955e-03 173 0.66393 1.0974 0.014983
## 10 1.0433e-03 181 0.65336 1.0872 0.014977
## 11 9.7809e-04 191 0.64280 1.0888 0.014978
## 12 9.3897e-04 227 0.60603 1.0869 0.014976
## 13 8.8028e-04 232 0.60133 1.0892 0.014978
## 14 7.8247e-04 244 0.59077 1.0962 0.014983
## 15 6.8466e-04 367 0.48357 1.0939 0.014981
## 16 6.5206e-04 371 0.48083 1.0974 0.014983
## 17 6.2598e-04 398 0.46088 1.0990 0.014984
## 18 5.8685e-04 408 0.45462 1.1005 0.014985
## 19 5.2165e-04 461 0.42175 1.1013 0.014985
## 20 5.0302e-04 493 0.39828 1.1025 0.014986
## 21 4.8905e-04 503 0.39202 1.1025 0.014986
## 22 3.9124e-04 508 0.38928 1.1025 0.014986
## 23 2.9343e-04 646 0.33490 1.0915 0.014980
## 24 2.6082e-04 650 0.33372 1.0947 0.014982
## 25 2.3474e-04 656 0.33216 1.0939 0.014981
## 26 1.9562e-04 661 0.33099 1.0959 0.014982
## 27 1.5649e-04 681 0.32707 1.0959 0.014982
## 28 1.3041e-04 686 0.32629 1.0955 0.014982
## 29 9.7809e-05 692 0.32551 1.0955 0.014982
## 30 1.0000e-05 696 0.32512 1.0955 0.014982
# Resubstitution check: predicted class for each training row (no newdata is
# supplied, so the predictions are for the data the tree was fitted on),
# cross-tabulated against the actual segment.
pred <- predict(object = amazon.rt, type = "class")
table(pred, train[["Customer.Segment.n"]])
##
## pred Consumer Corporate
## Consumer 2144 419
## Corporate 412 2904
library(rpart.plot)
#rpart.plot(amazon.rt, type = 1)
# library(randomForest)
# amazon.rf <- randomForest(Customer.Segment.n~Product.Container.n+Ship.Mode.n+Order.Priority.n+Product.Category.n+Product.Sub.Category.n+PRIN1+PRIN2+PRIN3+PRIN4, data=Amazon, mtry= 6, ntree=1000, importance = TRUE, na.action=na.omit)
# print(amazon.rf)
# importance(amazon.rf)
#### Calculate performance of CART on the held-out test set ----
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess

# Confusion matrix on the test rows, which were not used to fit the tree.
test_pred <- predict(amazon.rt, newdata = test, type = "class")
table(test_pred, test$Customer.Segment.n)

# Score every test row with the predicted class probabilities. A continuous
# score is needed for a real ROC / precision-recall curve; hard class labels
# would yield only a single operating point.
class_prob <- predict(amazon.rt, newdata = test, type = "prob")
# The ROC analysis below is binary; fail early if the response is not.
stopifnot(ncol(class_prob) == 2L)
# Probability of the second class level, which is the positive class below.
pred1 <- class_prob[, 2]
# Code the true labels as 1/2 in the model's own level order, so the second
# level is the higher value and ROCR treats it as the positive class.
actual <- as.numeric(
  factor(test$Customer.Segment.n, levels = colnames(class_prob))
)
ROCR_pred <- prediction(pred1, actual)

# Recall and Precision
RP.perf <- performance(ROCR_pred, "prec", "rec")
plot(RP.perf)

# ROC Curve
ROC.perf <- performance(ROCR_pred, "tpr", "fpr")
plot(ROC.perf)

# Area under the ROC curve
auc.tmp <- performance(ROCR_pred, "auc")
auc <- as.numeric(auc.tmp@y.values[[1]])
auc
# NOTE: the previously recorded value here (0.8563599) was computed on the
# training rows using hard class labels, i.e. an optimistic resubstitution
# estimate. Re-run the script to record the held-out AUC.