# Load the Amazon order data. Text following a "#" in the CSV is treated as a
# comment and skipped by the parser.
Amazon <- read.csv(
  file = "~/Data_Science/MarketingAnalytics/Amazon.csv",
  comment.char = "#"
)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.4
# Create train and test data: a seeded random 70/30 split of the rows.
smp_size <- floor(nrow(Amazon) * 0.7)
set.seed(143)
# Draw smp_size distinct row indices; the remaining rows form the test set.
train_ind <- sample.int(nrow(Amazon), size = smp_size)
train <- Amazon[train_ind, ]
test <- Amazon[-train_ind, ]
#Classification and Regression Tree
library(rpart)
# Fit a classification tree for customer segment on the training rows. The
# small minsplit and very small cp let the tree grow deep; printcp() then
# reports the complexity-parameter table with cross-validated error.
amazon.rt <- rpart(
  Customer.Segment.n ~ Product.Container.n + Ship.Mode.n + Order.Priority.n +
    Product.Category.n + Product.Sub.Category.n + Province + Region +
    PRIN1 + PRIN2 + PRIN3 + PRIN4,
  data = train,
  method = "class",
  control = rpart.control(minsplit = 10, cp = 1e-05)
)
printcp(amazon.rt)
## 
## Classification tree:
## rpart(formula = Customer.Segment.n ~ Product.Container.n + Ship.Mode.n + 
##     Order.Priority.n + Product.Category.n + Product.Sub.Category.n + 
##     Province + Region + PRIN1 + PRIN2 + PRIN3 + PRIN4, data = train, 
##     method = "class", control = rpart.control(minsplit = 10, 
##         cp = 1e-05))
## 
## Variables actually used in tree construction:
##  [1] Order.Priority.n       PRIN1                  PRIN2                 
##  [4] PRIN3                  PRIN4                  Product.Category.n    
##  [7] Product.Container.n    Product.Sub.Category.n Province              
## [10] Ship.Mode.n           
## 
## Root node error: 2556/5879 = 0.43477
## 
## n= 5879 
## 
##            CP nsplit rel error xerror     xstd
## 1  2.0866e-03      0   1.00000 1.0000 0.014871
## 2  1.9562e-03     38   0.87989 1.0685 0.014961
## 3  1.7606e-03     43   0.87011 1.0716 0.014964
## 4  1.6954e-03     49   0.85955 1.0728 0.014965
## 5  1.5649e-03     53   0.85211 1.0736 0.014966
## 6  1.3693e-03     95   0.76995 1.0814 0.014972
## 7  1.3414e-03    112   0.74452 1.0845 0.014975
## 8  1.1737e-03    130   0.71792 1.0923 0.014980
## 9  1.0955e-03    173   0.66393 1.0974 0.014983
## 10 1.0433e-03    181   0.65336 1.0872 0.014977
## 11 9.7809e-04    191   0.64280 1.0888 0.014978
## 12 9.3897e-04    227   0.60603 1.0869 0.014976
## 13 8.8028e-04    232   0.60133 1.0892 0.014978
## 14 7.8247e-04    244   0.59077 1.0962 0.014983
## 15 6.8466e-04    367   0.48357 1.0939 0.014981
## 16 6.5206e-04    371   0.48083 1.0974 0.014983
## 17 6.2598e-04    398   0.46088 1.0990 0.014984
## 18 5.8685e-04    408   0.45462 1.1005 0.014985
## 19 5.2165e-04    461   0.42175 1.1013 0.014985
## 20 5.0302e-04    493   0.39828 1.1025 0.014986
## 21 4.8905e-04    503   0.39202 1.1025 0.014986
## 22 3.9124e-04    508   0.38928 1.1025 0.014986
## 23 2.9343e-04    646   0.33490 1.0915 0.014980
## 24 2.6082e-04    650   0.33372 1.0947 0.014982
## 25 2.3474e-04    656   0.33216 1.0939 0.014981
## 26 1.9562e-04    661   0.33099 1.0959 0.014982
## 27 1.5649e-04    681   0.32707 1.0959 0.014982
## 28 1.3041e-04    686   0.32629 1.0955 0.014982
## 29 9.7809e-05    692   0.32551 1.0955 0.014982
## 30 1.0000e-05    696   0.32512 1.0955 0.014982
# Resubstitution check: predicted classes for the rows the tree was grown on.
# This confusion matrix is optimistic, because the tree has already seen these
# rows.
pred <- predict(amazon.rt, type = "class")
table(pred, train$Customer.Segment.n)
##            
## pred        Consumer Corporate
##   Consumer      2144       419
##   Corporate      412      2904

# Held-out check: score the test rows that were set aside by the train/test
# split. This is the honest estimate of how the tree generalises.
pred_test <- predict(amazon.rt, newdata = test, type = "class")
table(pred_test, test$Customer.Segment.n)
# Test-set accuracy; compare as character so factor and character columns
# are both handled.
mean(as.character(pred_test) == as.character(test$Customer.Segment.n))
library(rpart.plot)
#rpart.plot(amazon.rt, type = 1)
# library(randomForest)
# amazon.rf <- randomForest(Customer.Segment.n~Product.Container.n+Ship.Mode.n+Order.Priority.n+Product.Category.n+Product.Sub.Category.n+PRIN1+PRIN2+PRIN3+PRIN4, data=Amazon, mtry= 6, ntree=1000, importance = TRUE, na.action=na.omit)
# print(amazon.rf)
# importance(amazon.rf)


####Calculate performance of CART
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Score the tree on the held-out test set using predicted class probabilities.
# Hard class labels give ROCR only a single threshold, and scoring the rows
# the tree was grown on overstates performance (the printcp() cross-validated
# error never improves on the root node).
class_prob <- predict(amazon.rt, newdata = test, type = "prob")
class_levels <- colnames(class_prob)
stopifnot(length(class_levels) == 2L)  # ROCR handles binary problems only

# Encode the true labels as 1/2 in the model's level order. Going through
# factor() also works when the column was read as character (the read.csv
# default since R 4.0), where a bare as.numeric() would return NA.
actual <- as.numeric(factor(test$Customer.Segment.n, levels = class_levels))
# Probability of the second level, which ROCR treats as the positive class
# (the larger numeric label).
pred1 <- class_prob[, 2]
ROCR_pred <- prediction(pred1, actual)

# Precision-recall curve
RP.perf <- performance(ROCR_pred, "prec", "rec")
plot(RP.perf)

# ROC curve
ROC.perf <- performance(ROCR_pred, "tpr", "fpr")
plot(ROC.perf)

# Area under the ROC curve
auc.tmp <- performance(ROCR_pred, "auc")
auc <- auc.tmp@y.values[[1]]  # y.values is a list with one element per run
auc
# NOTE: the previously recorded value (0.8563599) was computed on the training
# data from hard class labels; re-run to obtain the held-out figure.