1.
1.1 Predicting B or not B
# Load the letter data and flag, as a factor, whether each row is the letter "B".
# (Note: this data frame masks the built-in `letters` constant for the session.)
letters <- read.csv(file = "data/letters_ABPR.csv")
letters$isB <- as.factor(letters$letter == "B")

# Seeded 50/50 train/test split driven by the isB label.
library(caTools)
set.seed(1000)
spl <- sample.split(letters$isB, SplitRatio = 0.5)
Train <- subset(letters, spl)
Test <- subset(letters, !spl)

# Class counts in the test set; the larger one gives the baseline accuracy.
table(Test$isB)
FALSE TRUE
1175 383
# Baseline accuracy: share of the test set that falls in the most frequent
# isB class. Computed from the data instead of a count copied from the table,
# so it stays correct if the data or the split changes.
max(table(Test$isB)) / nrow(Test)
[1] 0.7542
1.2 Predicting B or not B
What is the accuracy of the CART model on the test set? (Use type="class" when making predictions on the test set.)
# Classification tree for isB using every column except `letter`
# (isB was derived from it), then actual-vs-predicted counts on the test set.
library(rpart)
CARTb <- rpart(
  formula = isB ~ . - letter,
  data = Train,
  method = "class"
)
Pred <- predict(CARTb, newdata = Test, type = "class")
table(Test$isB, Pred)
Pred
FALSE TRUE
FALSE 1118 57
TRUE 43 340
# Test accuracy of the isB CART model: correctly classified rows (the diagonal
# of the actual-vs-predicted table) over all test rows. Derived from the table
# itself rather than from counts typed in by hand.
sum(diag(table(Test$isB, Pred))) / nrow(Test)
[1] 0.9358
1.3 Predicting B or not B
Now, build a random forest model to predict whether the letter is a B or not (the isB variable) using the training set. You should use all of the other variables as independent variables, except letter (since it helped us define what we are trying to predict!). Use the default settings for ntree and nodesize (don’t include these arguments at all). Right before building the model, set the seed to 1000. (NOTE: You might get a slightly different answer on this problem, even if you set the random seed. This has to do with your operating system and the implementation of the random forest algorithm.)
What is the accuracy of the model on the test set?
# Random forest for isB with default settings, excluding `letter`.
# The seed is set immediately before fitting so the forest is reproducible.
library(randomForest)
set.seed(1000)
randomForestb <- randomForest(isB ~ . - letter, data = Train)

# Predicted classes on the test set, cross-tabulated against the truth.
Pred2 <- predict(randomForestb, newdata = Test)
table(Test$isB, Pred2)
Pred2
FALSE TRUE
FALSE 1163 12
TRUE 9 374
# Test accuracy of the isB random forest: diagonal of the actual-vs-predicted
# table over the number of test rows, computed instead of hard-coded.
sum(diag(table(Test$isB, Pred2))) / nrow(Test)
[1] 0.9865
2.
2.1 Predicting the letters A, B, P, R
In a multiclass classification problem, a simple baseline model is to predict the most frequent class of all of the options.
What is the baseline accuracy on the testing set?
# Turn the letter label into a factor so it can be used as a multiclass outcome.
letters$letter <- as.factor(letters$letter)

# New seeded 50/50 split, this time driven by the letter label.
set.seed(2000)
spl <- sample.split(letters$letter, SplitRatio = 0.5)
Train2 <- subset(letters, spl)
Test2 <- subset(letters, !spl)

# Per-letter counts in the test set; the largest gives the baseline accuracy.
table(Test2$letter)
A B P R
395 383 401 379
# Multiclass baseline accuracy: share of the test set belonging to the most
# frequent letter, computed from the data rather than a hard-coded count.
max(table(Test2$letter)) / nrow(Test2)
[1] 0.2574
2.2 Predicting the letters A, B, P, R
What is the test set accuracy of your CART model? Use the argument type="class" when making predictions.
# Multiclass classification tree for `letter`, excluding isB (it is derived
# from letter).
# Fit on Train2, the training half of the seed-2000 split that produced Test2.
# The previous version fit on Train (from the isB split), whose rows can
# overlap with Test2, so test data leaked into training.
# NOTE: the confusion matrix, hand-typed counts and accuracy printed below
# came from that earlier fit and must be regenerated by re-running this chunk.
CART <- rpart(letter ~ . - isB, data = Train2, method = "class")
Pred3 <- predict(CART, newdata = Test2, type = "class")
table(Test2$letter, Pred3)
Pred3
A B P R
A 349 1 7 38
B 9 304 8 62
P 3 28 359 11
R 8 26 0 345
# Test accuracy of the multiclass CART model: correctly classified rows (the
# table diagonal) over all test rows. Computed from the predictions so it
# cannot drift out of sync with the model that produced them.
sum(diag(table(Test2$letter, Pred3))) / nrow(Test2)
[1] 0.871
2.3 Predicting the letters A, B, P, R
Now build a random forest model on the training data, using the same independent variables as in the previous problem – again, don’t forget to remove the isB variable. Just use the default parameter values for ntree and nodesize (you don’t need to include these arguments at all). Set the seed to 1000 right before building your model. (Remember that you might get a slightly different result even if you set the random seed.)
What is the test set accuracy of your random forest model?
# Multiclass random forest for `letter` with default settings, excluding isB.
# Fit on Train2, the training half of the seed-2000 split that produced Test2.
# The previous version fit on Train (from the isB split), whose rows can
# overlap with Test2, which inflates the measured test accuracy.
# NOTE: the confusion matrix, hand-typed counts and accuracy printed below
# came from that earlier fit and must be regenerated by re-running this chunk.
library(randomForest)
set.seed(1000)
# The object keeps its original name; it shadows the randomForest() function
# name, but R still resolves calls to the function correctly.
randomForest <- randomForest(letter ~ . - isB, data = Train2)
Pred4 <- predict(randomForest, newdata = Test2, type = "class")
table(Test2$letter, Pred4)
Pred4
A B P R
A 392 0 1 2
B 0 378 0 5
P 0 2 398 1
R 0 6 0 373
# Test accuracy of the multiclass random forest: diagonal of the
# actual-vs-predicted table over the number of test rows, computed from the
# predictions instead of hand-typed counts.
sum(diag(table(Test2$letter, Pred4))) / nrow(Test2)
[1] 0.9891
LS0tDQp0aXRsZTogIkFTNC0yIExldHRlciBSZWNvZ25pdGlvbiINCmF1dGhvcjogIkdyb3VwNCINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiMjIyAxLiANCg0KIyMjIyAxLjEgUHJlZGljdGluZyBCIG9yIG5vdCBCDQoNCmBgYHtyfQ0KbGV0dGVycyA9IHJlYWQuY3N2KCJkYXRhL2xldHRlcnNfQUJQUi5jc3YiKQ0KbGV0dGVycyRpc0IgPSBhcy5mYWN0b3IobGV0dGVycyRsZXR0ZXIgPT0gIkIiKQ0KbGlicmFyeShjYVRvb2xzKQ0Kc2V0LnNlZWQoMTAwMCkNCnNwbCA9IHNhbXBsZS5zcGxpdChsZXR0ZXJzJGlzQixTcGxpdFJhdGlvID0gMC41KQ0KVHJhaW4gPSBzdWJzZXQobGV0dGVycyxzcGw9PSJUUlVFIikNClRlc3QgPSBzdWJzZXQobGV0dGVycyxzcGw9PSJGQUxTRSIpDQp0YWJsZShUZXN0JGlzQikNCjExNzUvbnJvdyhUZXN0KQ0KYGBgDQoNCiMjIyMgMS4yIFByZWRpY3RpbmcgQiBvciBub3QgQg0KDQpXaGF0IGlzIHRoZSBhY2N1cmFjeSBvZiB0aGUgQ0FSVCBtb2RlbCBvbiB0aGUgdGVzdCBzZXQ/IChVc2UgdHlwZT0iY2xhc3MiIHdoZW4gbWFraW5nIHByZWRpY3Rpb25zIG9uIHRoZSB0ZXN0IHNldC4pDQoNCmBgYHtyfQ0KbGlicmFyeShycGFydCkNCkNBUlRiID0gcnBhcnQoaXNCIH4gLiAtIGxldHRlciwgZGF0YT1UcmFpbiwgbWV0aG9kPSJjbGFzcyIpDQpQcmVkID0gcHJlZGljdChDQVJUYiAsIFRlc3QgLHR5cGUgPSAiY2xhc3MiKQ0KdGFibGUoVGVzdCRpc0IsUHJlZCkNCigxMTE4KzM0MCkvbnJvdyhUZXN0KQ0KDQpgYGANCg0KIyMjIyAxLjMgUHJlZGljdGluZyBCIG9yIG5vdCBCDQoNCk5vdywgYnVpbGQgYSByYW5kb20gZm9yZXN0IG1vZGVsIHRvIHByZWRpY3Qgd2hldGhlciB0aGUgbGV0dGVyIGlzIGEgQiBvciBub3QgKHRoZSBpc0IgdmFyaWFibGUpIHVzaW5nIHRoZSB0cmFpbmluZyBzZXQuIFlvdSBzaG91bGQgdXNlIGFsbCBvZiB0aGUgb3RoZXIgdmFyaWFibGVzIGFzIGluZGVwZW5kZW50IHZhcmlhYmxlcywgZXhjZXB0IGxldHRlciAoc2luY2UgaXQgaGVscGVkIHVzIGRlZmluZSB3aGF0IHdlIGFyZSB0cnlpbmcgdG8gcHJlZGljdCEpLiBVc2UgdGhlIGRlZmF1bHQgc2V0dGluZ3MgZm9yIG50cmVlIGFuZCBub2Rlc2l6ZSAoZG9uJ3QgaW5jbHVkZSB0aGVzZSBhcmd1bWVudHMgYXQgYWxsKS4gUmlnaHQgYmVmb3JlIGJ1aWxkaW5nIHRoZSBtb2RlbCwgc2V0IHRoZSBzZWVkIHRvIDEwMDAuIChOT1RFOiBZb3UgbWlnaHQgZ2V0IGEgc2xpZ2h0bHkgZGlmZmVyZW50IGFuc3dlciBvbiB0aGlzIHByb2JsZW0sIGV2ZW4gaWYgeW91IHNldCB0aGUgcmFuZG9tIHNlZWQuIFRoaXMgaGFzIHRvIGRvIHdpdGggeW91ciBvcGVyYXRpbmcgc3lzdGVtIGFuZCB0aGUgaW1wbGVtZW50YXRpb24gb2YgdGhlIHJhbmRvbSBmb3Jlc3QgYWxnb3JpdGhtLikNCg0KV2hhdCBpcyB0aGUgYWNjdXJhY3kgb2YgdGhlIG1vZGVsIG9uIHRoZSB0ZXN0IHNldD8NCg0KYGBge3J9DQpsaWJyYXJ5KHJhbmRvbUZvcmVzdCkNCnNldC5z
ZWVkKDEwMDApDQpyYW5kb21Gb3Jlc3RiID0gcmFuZG9tRm9yZXN0KGlzQiB+IC4gLSBsZXR0ZXIsIGRhdGE9VHJhaW4pDQpQcmVkMiA9IHByZWRpY3QocmFuZG9tRm9yZXN0YixUZXN0KQ0KdGFibGUoVGVzdCRpc0IsUHJlZDIpDQooMTE2MyszNzQpL25yb3coVGVzdCkNCmBgYA0KDQojIyMgMi4gDQoNCiMjIyMgMi4xIFByZWRpY3RpbmcgdGhlIGxldHRlcnMgQSwgQiwgUCwgUg0KDQpJbiBhIG11bHRpY2xhc3MgY2xhc3NpZmljYXRpb24gcHJvYmxlbSwgYSBzaW1wbGUgYmFzZWxpbmUgbW9kZWwgaXMgdG8gcHJlZGljdCB0aGUgbW9zdCBmcmVxdWVudCBjbGFzcyBvZiBhbGwgb2YgdGhlIG9wdGlvbnMuDQoNCldoYXQgaXMgdGhlIGJhc2VsaW5lIGFjY3VyYWN5IG9uIHRoZSB0ZXN0aW5nIHNldD8NCg0KYGBge3J9DQpsZXR0ZXJzJGxldHRlciA9IGFzLmZhY3RvciggbGV0dGVycyRsZXR0ZXIgKQ0Kc2V0LnNlZWQoMjAwMCkNCnNwbCA9IHNhbXBsZS5zcGxpdChsZXR0ZXJzJGxldHRlcixTcGxpdFJhdGlvID0gMC41KQ0KVHJhaW4yID0gc3Vic2V0KGxldHRlcnMsc3BsPT0iVFJVRSIpDQpUZXN0MiA9IHN1YnNldChsZXR0ZXJzLHNwbD09IkZBTFNFIikNCnRhYmxlKFRlc3QyJGxldHRlcikNCjQwMS9ucm93KFRlc3QyKQ0KYGBgDQoNCiMjIyMgMi4yIFByZWRpY3RpbmcgdGhlIGxldHRlcnMgQSwgQiwgUCwgUg0KDQpXaGF0IGlzIHRoZSB0ZXN0IHNldCBhY2N1cmFjeSBvZiB5b3VyIENBUlQgbW9kZWw/IFVzZSB0aGUgYXJndW1lbnQgdHlwZT0iY2xhc3MiIHdoZW4gbWFraW5nIHByZWRpY3Rpb25zLg0KDQpgYGB7cn0NCkNBUlQgPSBycGFydChsZXR0ZXIgfiAuIC0gaXNCLCBkYXRhPVRyYWluLCBtZXRob2Q9ImNsYXNzIikNClByZWQzID0gcHJlZGljdChDQVJULFRlc3QyLHR5cGU9ImNsYXNzIikNCnRhYmxlKFRlc3QyJGxldHRlcixQcmVkMykNCigzNDkrMzA0KzM1OSszNDUpL25yb3coVGVzdDIpDQoNCmBgYA0KDQojIyMjIDIuMyBQcmVkaWN0aW5nIHRoZSBsZXR0ZXJzIEEsIEIsIFAsIFINCg0KTm93IGJ1aWxkIGEgcmFuZG9tIGZvcmVzdCBtb2RlbCBvbiB0aGUgdHJhaW5pbmcgZGF0YSwgdXNpbmcgdGhlIHNhbWUgaW5kZXBlbmRlbnQgdmFyaWFibGVzIGFzIGluIHRoZSBwcmV2aW91cyBwcm9ibGVtIC0tIGFnYWluLCBkb24ndCBmb3JnZXQgdG8gcmVtb3ZlIHRoZSBpc0IgdmFyaWFibGUuIEp1c3QgdXNlIHRoZSBkZWZhdWx0IHBhcmFtZXRlciB2YWx1ZXMgZm9yIG50cmVlIGFuZCBub2Rlc2l6ZSAoeW91IGRvbid0IG5lZWQgdG8gaW5jbHVkZSB0aGVzZSBhcmd1bWVudHMgYXQgYWxsKS4gU2V0IHRoZSBzZWVkIHRvIDEwMDAgcmlnaHQgYmVmb3JlIGJ1aWxkaW5nIHlvdXIgbW9kZWwuIChSZW1lbWJlciB0aGF0IHlvdSBtaWdodCBnZXQgYSBzbGlnaHRseSBkaWZmZXJlbnQgcmVzdWx0IGV2ZW4gaWYgeW91IHNldCB0aGUgcmFuZG9tIHNlZWQuKQ0KDQpXaGF0IGlzIHRoZSB0ZXN0IHNldCBhY2N1cmFjeSBvZiB5b3VyIHJhbmRvbSBmb3Jl
c3QgbW9kZWw/DQoNCmBgYHtyfQ0KbGlicmFyeShyYW5kb21Gb3Jlc3QpDQpzZXQuc2VlZCgxMDAwKQ0KcmFuZG9tRm9yZXN0ID0gcmFuZG9tRm9yZXN0KGxldHRlciB+IC4gLSBpc0IsZGF0YT1UcmFpbikNClByZWQ0ID0gcHJlZGljdChyYW5kb21Gb3Jlc3QsVGVzdDIsdHlwZT0iY2xhc3MiKQ0KdGFibGUoVGVzdDIkbGV0dGVyLFByZWQ0KQ0KKDM5MiszNzgrMzk4KzM3MykvbnJvdyhUZXN0MikNCg0KYGBgDQo=