rm(list=ls(all=T))                # clear the workspace
options(digits=4, scipen=12)      # print 4 significant digits, avoid scientific notation
library(dplyr)                    # data manipulation
library(rpart)                    # CART trees
library(rpart.plot)               # tree plotting
library(caret)                    # confusion matrices and model utilities
library(randomForest)             # random forests
library(caTools)                  # sample.split
source('DPP.R')                   # local helper script
Problem 1.1 - Predicting B or not B
Before building models, let’s consider a baseline method that always predicts the most frequent outcome, which is “not B”. What is the accuracy of this baseline method on the test set?
letters = read.csv("data/letters_ABPR.csv")
letters$isB = as.factor(letters$letter == "B")
set.seed(1000)
library(caTools)
spl = sample.split(letters$isB, SplitRatio = 0.5)
train = subset(letters, spl == TRUE)
test = subset(letters, spl == FALSE)
table(test$isB)
FALSE TRUE
1175 383
1175/(1175+383)
[1] 0.7542
Problem 1.2 - Predicting B or not B
We use the default parameters in our CART model, so we don’t need to add the minbucket or cp arguments at all. We do add the argument method=“class”, since this is a classification problem.
What is the accuracy of the CART model on the test set? (Use type=“class” when making predictions on the test set.)
CARTb = rpart(isB ~ . - letter, data=train, method="class")
predictions = predict(CARTb, newdata=test, type="class")
table(test$isB, predictions)
predictions
FALSE TRUE
FALSE 1118 57
TRUE 43 340
(1118+340)/nrow(test)
[1] 0.9358
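Since rpart.plot is already loaded, the fitted tree can also be inspected visually. A minimal sketch, assuming the CARTb object built above:
prp(CARTb)   # plot the CART tree; shows which features drive the splits near the root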
Problem 1.3 - Predicting B or not B
Now, build a random forest model to predict whether the letter is a B or not (the isB variable) using the training set. You should use all of the other variables as independent variables, except letter (since it helped us define what we are trying to predict!). Use the default settings for ntree and nodesize (don’t include these arguments at all). Right before building the model, set the seed to 1000. (NOTE: You might get a slightly different answer on this problem, even if you set the random seed. This has to do with your operating system and the implementation of the random forest algorithm.)
What is the accuracy of the model on the test set?
set.seed(1000)
library(randomForest)
RFb = randomForest(isB ~ . - letter, data=train)
predictions = predict(RFb, newdata=test)
table(test$isB, predictions)
predictions
FALSE TRUE
FALSE 1163 12
TRUE 9 374
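The accuracy follows from the confusion matrix above: sum the diagonal and divide by the number of test observations.
(1163+374)/nrow(test)
[1] 0.9865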
Problem 2.1 - Predicting the letters A, B, P, R
Now, generate new training and testing sets of the letters data frame using letters$letter as the first input to the sample.split function. Before splitting, set your seed to 2000. Again put 50% of the data in the training set. (Why do we need to split the data again? Remember that sample.split balances the outcome variable in the training and testing sets. With a new outcome variable, we want to re-generate our split.)
In a multiclass classification problem, a simple baseline model is to predict the most frequent class of all of the options.
What is the baseline accuracy on the testing set?
set.seed(2000)
spl = sample.split(letters$letter, SplitRatio = 0.5)
train2 = subset(letters, spl == TRUE)
test2 = subset(letters, spl == FALSE)
table(train2$letter)
A B P R
394 383 402 379
401/nrow(test2)
[1] 0.2574
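The hard-coded 401 is the count of P in the test set; P is the most frequent class in the training set, so the baseline always predicts P. A small sketch that computes the same baseline without typing the count by hand, assuming the objects above:
table(test2$letter)["P"] / nrow(test2)   # test-set count of the most frequent training class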
Problem 2.2 - Predicting the letters A, B, P, R
Now build a classification tree to predict “letter”, using the training set to build your model. You should use all of the other variables as independent variables, except “isB”, since it is related to what we are trying to predict! Just use the default parameters in your CART model. Add the argument method=“class” since this is a classification problem. Even though we have multiple classes here, nothing changes in how we build the model from the binary case.
What is the test set accuracy of your CART model? Use the argument type=“class” when making predictions.
(HINT: When you are computing the test set accuracy using the confusion matrix, you want to add everything on the main diagonal and divide by the total number of observations in the test set, which can be computed with nrow(test), where test is the name of your test set).
CARTletter = rpart(letter ~ . - isB, data=train2, method="class")
predictLetter = predict(CARTletter, newdata=test2, type="class")
table(test2$letter, predictLetter)
predictLetter
A B P R
A 348 4 0 43
B 8 318 12 45
P 2 21 363 15
R 10 24 5 340
(348+318+363+340)/nrow(test2)
[1] 0.8787
Problem 2.3 - Predicting the letters A, B, P, R
Now build a random forest model on the training data, using the same independent variables as in the previous problem – again, don’t forget to remove the isB variable. Just use the default parameter values for ntree and nodesize (you don’t need to include these arguments at all). Set the seed to 1000 right before building your model. (Remember that you might get a slightly different result even if you set the random seed.)
What is the test set accuracy of your random forest model?
set.seed(1000)
RFletter = randomForest(letter ~ . - isB, data=train2)
predictLetter = predict(RFletter, newdata=test2)
table(test2$letter, predictLetter)
predictLetter
A B P R
A 391 0 3 1
B 0 380 1 2
P 0 6 394 1
R 3 14 0 362
(391+380+394+362)/nrow(test2)
[1] 0.9801
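Summing the diagonal by hand is easy to get wrong; since caret is loaded, confusionMatrix() reports the overall accuracy and per-class statistics directly. A sketch, assuming the predictLetter and test2 objects above:
confusionMatrix(predictLetter, test2$letter)   # overall accuracy is listed under "Accuracy"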