Data Mining Techniques: Part 1

Topics

Decision Trees

Decision Trees

# Fit a conditional inference tree (party::ctree) on a random split of iris.
library(party)

# Fix the RNG seed so the split below is reproducible.
set.seed(1234)

# Label every row 1 (training, probability 0.7) or 2 (test, probability 0.3).
ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
trainData <- iris[ind == 1, ]
testData <- iris[ind == 2, ]

# Model Species from the four sepal/petal measurements.
myFormula <- Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
iris_ctree <- ctree(myFormula, data = trainData)

Decision Trees

# Cross-tabulate the tree's fitted classes (rows) against the true
# training labels (columns).
table(predict(iris_ctree), trainData$Species)

             setosa versicolor virginica
  setosa         40          0         0
  versicolor      0         37         3
  virginica       0          1        31

Decision Trees

# Draw the fitted tree using the compact "simple" node layout.
plot(iris_ctree, type = "simple")

plot of chunk unnamed-chunk-3

Decision Trees

# Classify the held-out rows with the fitted tree.
testPred <- predict(iris_ctree, newdata = testData)

# Confusion matrix: predicted class (rows) vs. actual species (columns).
table(testPred, testData$Species)

testPred     setosa versicolor virginica
  setosa         10          0         0
  versicolor      0         12         2
  virginica       0          0        14

Random Forest

Random Forest

# Fit a 100-tree random forest on the same seeded split of iris.
library(randomForest)

# Same seed and sampling call as the decision-tree example.
set.seed(1234)

# Label every row 1 (training, probability 0.7) or 2 (test, probability 0.3).
ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
trainData <- iris[ind == 1, ]
testData <- iris[ind == 2, ]

# Use every remaining column as a predictor; keep the proximity matrix.
rf <- randomForest(Species ~ ., data = trainData, ntree = 100, proximity = TRUE)

# predict() without newdata gives the forest's out-of-bag predictions, so
# this table is predicted class (rows) vs. actual training species (columns).
table(predict(rf), trainData$Species)

             setosa versicolor virginica
  setosa         40          0         0
  versicolor      0         35         2
  virginica       0          3        32

Random Forest

# Print the model summary: the call, forest type, number of trees, variables
# tried per split, OOB error estimate and the per-class confusion matrix.
print(rf)

Call:
 randomForest(formula = Species ~ ., data = trainData, ntree = 100,      proximity = TRUE) 
               Type of random forest: classification
                     Number of trees: 100
No. of variables tried at each split: 2

        OOB estimate of  error rate: 4.46%
Confusion matrix:
           setosa versicolor virginica class.error
setosa         40          0         0  0.00000000
versicolor      0         35         3  0.07894737
virginica       0          2        32  0.05882353

Random Forest

# List the components stored on the fitted object ($names) and its S3
# classes ($class).
attributes(rf)
$names
 [1] "call"            "type"            "predicted"      
 [4] "err.rate"        "confusion"       "votes"          
 [7] "oob.times"       "classes"         "importance"     
[10] "importanceSD"    "localImportance" "proximity"      
[13] "ntree"           "mtry"            "forest"         
[16] "y"               "test"            "inbag"          
[19] "terms"          

$class
[1] "randomForest.formula" "randomForest"        

Random Forest

# Plot the fitted forest; for a randomForest object the plot method shows
# the error rates as a function of the number of trees.
plot(rf)

plot of chunk unnamed-chunk-8

Random Forest

# Show per-predictor variable importance (reported as MeanDecreaseGini).
importance(rf)
             MeanDecreaseGini
Sepal.Length         8.129141
Sepal.Width          1.253981
Petal.Length        31.235275
Petal.Width         33.198115

Random Forest

# Plot the variable-importance measures of the fitted forest.
varImpPlot(rf)

plot of chunk unnamed-chunk-10

Random Forest

# Classify the held-out rows with the fitted forest.
irisPred <- predict(rf, newdata = testData)

# Confusion matrix: predicted class (rows) vs. actual species (columns).
table(irisPred, testData$Species)

irisPred     setosa versicolor virginica
  setosa         10          0         0
  versicolor      0         12         2
  virginica       0          0        14

Random Forest

# Plot the margin of each training observation: the proportion of votes for
# its true class minus the largest proportion of votes for any other class.
# The margin is computed from the vote matrix stored in `rf`, which has one
# row per *training* observation, so the observed labels must be the training
# labels. The original passed testData$Species, whose length does not match
# the stored votes.
# NOTE(review): recent randomForest releases take the labels from the fitted
# object and appear to ignore this second argument -- confirm for the
# installed version; passing the training labels is correct either way.
plot(margin(rf, trainData$Species))

plot of chunk unnamed-chunk-12

References and links