This is an R Markdown Notebook.

Example: Optical Character Recognition.

Step 1: Collecting Data.

Step 2: Exploring and preparing the data.

  • Read in the data and examine its structure.
# Load the letter-recognition data set and inspect its columns.
# `stringsAsFactors = TRUE` keeps `letter` a factor on R >= 4.0, where
# read.csv() no longer converts strings by default; ksvm() treats a factor
# response as a classification task.
Letters <- read.csv(
  "http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml11/letterdata.csv",
  stringsAsFactors = TRUE
)
str(Letters)
'data.frame':   20000 obs. of  17 variables:
 $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
 $ xbox  : int  2 5 4 7 2 4 4 1 2 11 ...
 $ ybox  : int  8 12 11 11 1 11 2 1 2 15 ...
 $ width : int  3 3 6 6 3 5 5 3 4 13 ...
 $ height: int  5 7 8 6 1 8 4 2 4 9 ...
 $ onpix : int  1 2 6 3 1 3 4 1 2 7 ...
 $ xbar  : int  8 10 10 5 8 8 8 8 10 13 ...
 $ ybar  : int  13 5 6 9 6 8 7 2 6 2 ...
 $ x2bar : int  0 5 2 4 6 6 6 2 2 6 ...
 $ y2bar : int  6 4 6 6 6 9 6 2 6 2 ...
 $ xybar : int  6 13 10 4 6 5 7 8 12 12 ...
 $ x2ybar: int  10 3 3 4 5 6 6 2 4 1 ...
 $ xy2bar: int  8 9 7 10 9 6 6 8 8 9 ...
 $ xedge : int  0 2 3 6 1 0 2 1 1 8 ...
 $ xedgey: int  8 8 7 10 7 8 8 6 6 1 ...
 $ yedge : int  0 4 3 2 5 9 7 2 1 1 ...
 $ yedgex: int  8 10 9 8 10 7 10 7 7 8 ...
  • Divide into training and test data.
# The first 16,000 rows train the model; rows 16,001-20,000 are held out
# for testing.
letters_train <- Letters[seq_len(16000), ]
letters_test <- Letters[seq(16001, 20000), ]

Step 3: Training a model on the data.

  • Begin by training a simple linear SVM.
# Fit a support vector classifier with the linear ("vanilladot") kernel,
# using every other column of the training data as a predictor of `letter`.
library(kernlab)
letter_classifier <- ksvm(
  letter ~ .,
  data = letters_train,
  kernel = "vanilladot"
)
 Setting default kernel parameters  
  • Look at basic information about the model.
# Auto-print the fitted model to display its summary.
letter_classifier
Support Vector Machine object of class "ksvm" 

SV type: C-svc  (classification) 
 parameter : cost C = 1 

Linear (vanilla) kernel function. 

Number of Support Vectors : 7037 

Objective Function Value : -14.1746 -20.0072 -23.5628 -6.2009 -7.5524 -32.7694 -49.9786 -18.1824 -62.1111 -32.7284 -16.2209 -32.2837 -28.9777 -51.2195 -13.276 -35.6217 -30.8612 -16.5256 -14.6811 -32.7475 -30.3219 -7.7956 -11.8138 -32.3463 -13.1262 -9.2692 -153.1654 -52.9678 -76.7744 -119.2067 -165.4437 -54.6237 -41.9809 -67.2688 -25.1959 -27.6371 -26.4102 -35.5583 -41.2597 -122.164 -187.9178 -222.0856 -21.4765 -10.3752 -56.3684 -12.2277 -49.4899 -9.3372 -19.2092 -11.1776 -100.2186 -29.1397 -238.0516 -77.1985 -8.3339 -4.5308 -139.8534 -80.8854 -20.3642 -13.0245 -82.5151 -14.5032 -26.7509 -18.5713 -23.9511 -27.3034 -53.2731 -11.4773 -5.12 -13.9504 -4.4982 -3.5755 -8.4914 -40.9716 -49.8182 -190.0269 -43.8594 -44.8667 -45.2596 -13.5561 -17.7664 -87.4105 -107.1056 -37.0245 -30.7133 -112.3218 -32.9619 -27.2971 -35.5836 -17.8586 -5.1391 -43.4094 -7.7843 -16.6785 -58.5103 -159.9936 -49.0782 -37.8426 -32.8002 -74.5249 -133.3423 -11.1638 -5.3575 -12.438 -30.9907 -141.6924 -54.2953 -179.0114 -99.8896 -10.288 -15.1553 -3.7815 -67.6123 -7.696 -88.9304 -47.6448 -94.3718 -70.2733 -71.5057 -21.7854 -12.7657 -7.4383 -23.502 -13.1055 -239.9708 -30.4193 -25.2113 -136.2795 -140.9565 -9.8122 -34.4584 -6.3039 -60.8421 -66.5793 -27.2816 -214.3225 -34.7796 -16.7631 -135.7821 -160.6279 -45.2949 -25.1023 -144.9059 -82.2352 -327.7154 -142.0613 -158.8821 -32.2181 -32.8887 -52.9641 -25.4937 -47.9936 -6.8991 -9.7293 -36.436 -70.3907 -187.7611 -46.9371 -89.8103 -143.4213 -624.3645 -119.2204 -145.4435 -327.7748 -33.3255 -64.0607 -145.4831 -116.5903 -36.2977 -66.3762 -44.8248 -7.5088 -217.9246 -12.9699 -30.504 -2.0369 -6.126 -14.4448 -21.6337 -57.3084 -20.6915 -184.3625 -20.1052 -4.1484 -4.5344 -0.828 -121.4411 -7.9486 -58.5604 -21.4878 -13.5476 -5.646 -15.629 -28.9576 -20.5959 -76.7111 -27.0119 -94.7101 -15.1713 -10.0222 -7.6394 -1.5784 -87.6952 -6.2239 -99.3711 -101.0906 -45.6639 -24.0725 -61.7702 -24.1583 -52.2368 -234.3264 -39.9749 -48.8556 -34.1464 -20.9664 -11.4525 -123.0277 -6.4903 -5.1865 
-8.8016 -9.4618 -21.7742 -24.2361 -123.3984 -31.4404 -88.3901 -30.0924 -13.8198 -9.2701 -3.0823 -87.9624 -6.3845 -13.968 -65.0702 -105.523 -13.7403 -13.7625 -50.4223 -2.933 -8.4289 -80.3381 -36.4147 -112.7485 -4.1711 -7.8989 -1.2676 -90.8037 -21.4919 -7.2235 -47.9557 -3.383 -20.433 -64.6138 -45.5781 -56.1309 -6.1345 -18.6307 -2.374 -72.2553 -111.1885 -106.7664 -23.1323 -19.3765 -54.9819 -34.2953 -64.4756 -20.4115 -6.689 -4.378 -59.141 -34.2468 -58.1509 -33.8665 -10.6902 -53.1387 -13.7478 -20.1987 -55.0923 -3.8058 -60.0382 -235.4841 -12.6837 -11.7407 -17.3058 -9.7167 -65.8498 -17.1051 -42.8131 -53.1054 -25.0437 -15.302 -44.0749 -16.9582 -62.9773 -5.204 -5.2963 -86.1704 -3.7209 -6.3445 -1.1264 -122.5771 -23.9041 -355.0145 -31.1013 -32.619 -4.9664 -84.1048 -134.5957 -72.8371 -23.9002 -35.3077 -11.7119 -22.2889 -1.8598 -59.2174 -8.8994 -150.742 -1.8533 -1.9711 -9.9676 -0.5207 -26.9229 -30.429 -5.6289 
Training error : 0.130062 

Step 4: Evaluating model performance.

  • Predictions on testing dataset.
# Classify the held-out rows with the linear model, then peek at the first
# six predicted letters.
letter_predictions <- predict(
  object = letter_classifier,
  newdata = letters_test
)
head(letter_predictions, n = 6L)
[1] U N V X N H
Levels: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
# Cross-tabulate predicted letters (rows) against the true test labels (columns).
table(letter_predictions, letters_test$letter)
                  
letter_predictions   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O   P   Q   R
                 A 144   0   0   0   0   0   0   0   0   1   0   0   1   2   2   0   5   0
                 B   0 121   0   5   2   0   1   2   0   0   1   0   1   0   0   2   2   3
                 C   0   0 120   0   4   0  10   2   2   0   1   3   0   0   2   0   0   0
                 D   2   2   0 156   0   1   3  10   4   3   4   3   0   5   5   3   1   4
                 E   0   0   5   0 127   3   1   1   0   0   3   4   0   0   0   0   2   0
                 F   0   0   0   0   0 138   2   2   6   0   0   0   0   0   0  16   0   0
                 G   1   1   2   1   9   2 123   2   0   0   1   2   1   0   1   2   8   2
                 H   0   0   0   1   0   1   0 102   0   2   3   2   3   4  20   0   2   3
                 I   0   1   0   0   0   1   0   0 141   8   0   0   0   0   0   1   0   0
                 J   0   1   0   0   0   1   0   2   5 128   0   0   0   0   1   1   3   0
                 K   1   1   9   0   0   0   2   5   0   0 118   0   0   2   0   1   0   7
                 L   0   0   0   0   2   0   1   1   0   0   0 133   0   0   0   0   1   0
                 M   0   0   1   1   0   0   1   1   0   0   0   0 135   4   0   0   0   0
                 N   0   0   0   0   0   1   0   1   0   0   0   0   0 145   0   0   0   3
                 O   1   0   2   1   0   0   1   2   0   1   0   0   0   1  99   3   3   0
                 P   0   0   0   1   0   2   1   0   0   0   0   0   0   0   2 130   0   0
                 Q   0   0   0   0   0   0   8   2   0   0   0   3   0   0   3   1 124   0
                 R   0   7   0   0   1   0   3   8   0   0  13   0   0   1   1   1   0 138
                 S   1   1   0   0   1   0   3   0   1   1   0   1   0   0   0   0  14   0
                 T   0   0   0   0   3   2   0   0   0   0   1   0   0   0   0   0   0   0
                 U   1   0   3   1   0   0   0   2   0   0   0   0   0   0   1   0   0   0
                 V   0   0   0   0   0   1   3   4   0   0   0   0   1   2   1   0   3   1
                 W   0   0   0   0   0   0   1   0   0   0   0   0   2   0   0   0   0   0
                 X   0   1   0   0   2   0   0   1   3   0   1   6   0   0   1   0   0   0
                 Y   3   0   0   0   0   0   0   1   0   0   0   0   0   0   0   7   0   0
                 Z   2   0   0   0   1   0   0   0   3   4   0   0   0   0   0   0   0   0
                  
letter_predictions   S   T   U   V   W   X   Y   Z
                 A   1   1   1   0   1   0   0   1
                 B   5   0   0   2   0   1   0   0
                 C   0   0   0   0   0   0   0   0
                 D   0   0   0   0   0   3   3   1
                 E  10   0   0   0   0   2   0   3
                 F   3   0   0   1   0   1   2   0
                 G   4   3   0   0   0   1   0   0
                 H   0   3   0   2   0   0   1   0
                 I   3   0   0   0   0   5   1   1
                 J   2   0   0   0   0   1   0   6
                 K   0   1   3   0   0   5   0   0
                 L   5   0   0   0   0   0   0   1
                 M   0   0   3   0   8   0   0   0
                 N   0   0   1   0   2   0   0   0
                 O   0   0   3   0   0   0   0   0
                 P   0   0   0   0   0   0   1   0
                 Q   5   0   0   0   0   0   2   0
                 R   0   1   0   1   0   0   0   0
                 S 101   3   0   0   0   2   0  10
                 T   3 133   1   0   0   0   2   2
                 U   0   0 152   0   0   1   1   0
                 V   0   0   0 126   1   0   4   0
                 W   0   0   4   4 127   0   0   0
                 X   1   0   0   0   0 137   1   1
                 Y   0   3   0   0   0   0 127   0
                 Z  18   3   0   0   0   0   0 132
  • Rows are the predicted letters and columns are the actual letters; counts on the diagonal are correct predictions and off-diagonal counts are errors. For example, an actual ‘O’ was misread as an ‘H’ 20 times, among others.
  • Look only at agreement vs. non-agreement.
  • Constructing a vector of TRUE/FALSE indicating correct/incorrect predictions.
# TRUE where the predicted letter equals the true letter, FALSE otherwise;
# then count how many predictions fall into each group.
agreement <- (letter_predictions == letters_test[["letter"]])
table(agreement)
agreement
FALSE  TRUE 
  643  3357 
# Express the agreement counts as proportions of all test predictions.
prop.table(table(agreement))
agreement
  FALSE    TRUE 
0.16075 0.83925 

Step 5: Improving model performance.

# Refit with the radial basis ("rbfdot") kernel and score it on the same
# held-out rows. The seed is fixed first so the run is repeatable
# (presumably because kernlab's automatic kernel-parameter estimate for
# rbfdot involves random sampling).
set.seed(12345)
letter_classifier_rbf <- ksvm(
  letter ~ .,
  data = letters_train,
  kernel = "rbfdot"
)
letter_predictions_rbf <- predict(letter_classifier_rbf, letters_test)
agreement_rbf <- (letter_predictions_rbf == letters_test$letter)
table(agreement_rbf)
agreement_rbf
FALSE  TRUE 
  275  3725 
# Express the RBF model's agreement counts as proportions of all test predictions.
prop.table(table(agreement_rbf))
agreement_rbf
  FALSE    TRUE 
0.06875 0.93125 
  • There’s an improvement in accuracy from about 84% to 93%.
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpUaGlzIGlzIGFuIFtSIE1hcmtkb3duXShodHRwOi8vcm1hcmtkb3duLnJzdHVkaW8uY29tKSBOb3RlYm9vay4gDQoNCiMjIEV4YW1wbGU6IE9wdGljYWwgQ2hhcmFjdGVyIFJlY29nbml0aW9uLg0KDQojI1N0ZXAgMTogQ29sbGVjdGluZyBEYXRhLg0KDQoNCiMjU3RlcCAyOiBFeHBsb3JpbmcgYW5kIHByZXBhcmluZyB0aGUgZGF0YS4NCg0KDQo+LSBSZWFkaW5nIGluIGRhdGEgYW5kIGV4YW1pbmUgc3RydWN0dXJlDQoNCmBgYHtyfQ0KbGV0dGVycyA8LSByZWFkLmNzdigiaHR0cDovL3d3dy5zY2kuY3N1ZWFzdGJheS5lZHUvfmVzdWVzcy9jbGFzc2VzL1N0YXRpc3RpY3NfNjYyMC9QcmVzZW50YXRpb25zL21sMTEvbGV0dGVyZGF0YS5jc3YiKQ0KYGBgDQoNCg0KYGBge3J9DQpzdHIobGV0dGVycykNCmBgYA0KDQoNCj4tIERpdmlkZSBpbnRvIHRyYWluaW5nIGFuZCB0ZXN0IGRhdGEuDQoNCg0KYGBge3J9DQpsZXR0ZXJzX3RyYWluIDwtIGxldHRlcnNbMToxNjAwMCwgXQ0KYGBgDQoNCg0KYGBge3J9DQpsZXR0ZXJzX3Rlc3QgIDwtIGxldHRlcnNbMTYwMDE6MjAwMDAsIF0NCmBgYA0KDQoNCg0KIyMgU3RlcCAzOiBUcmFpbmluZyBhIG1vZGVsIG9uIHRoZSBkYXRhLg0KDQo+LSBCZWdpbiBieSB0cmFpbmluZyBhIHNpbXBsZSBsaW5lYXIgU1ZNLg0KDQpgYGB7cn0NCmxpYnJhcnkoa2VybmxhYikNCmBgYA0KDQoNCmBgYHtyfQ0KbGV0dGVyX2NsYXNzaWZpZXIgPC0ga3N2bShsZXR0ZXIgfiAuLCBkYXRhID0gbGV0dGVyc190cmFpbiwNCiAgICAgICAgICAgICAgICAgICAgICAgICAga2VybmVsID0gInZhbmlsbGFkb3QiKQ0KYGBgDQoNCg0KPi0gTG9vayBhdCBiYXNpYyBpbmZvcm1hdGlvbiBhYm91dCB0aGUgbW9kZWwuDQoNCmBgYHtyfQ0KbGV0dGVyX2NsYXNzaWZpZXINCmBgYA0KDQoNCiMjIFN0ZXAgNDogRXZhbHVhdGluZyBtb2RlbCBwZXJmb3JtYW5jZS4NCg0KPi0gUHJlZGljdGlvbnMgb24gdGVzdGluZyBkYXRhc2V0Lg0KDQoNCmBgYHtyfQ0KbGV0dGVyX3ByZWRpY3Rpb25zIDwtIHByZWRpY3QobGV0dGVyX2NsYXNzaWZpZXIsIGxldHRlcnNfdGVzdCkNCmBgYA0KDQoNCmBgYHtyfQ0KaGVhZChsZXR0ZXJfcHJlZGljdGlvbnMpDQpgYGANCg0KDQpgYGB7cn0NCnRhYmxlKGxldHRlcl9wcmVkaWN0aW9ucywgbGV0dGVyc190ZXN0JGxldHRlcikNCmBgYA0KDQo+LSBUbyB0aGUgbGVmdCBvZiB0aGUgZGlhZ29uYWwsIGl0cyB0aGUgYWN0dWFsIGluZm9ybWF0aW9uLCBidXQgdG8gdGhlIHJpZ2h0LCBpcyB0aGUgcHJlZGl0aW9uLiBUYWtlIGEgbG9vayBhdCBob3cgd3JvbmdseSAnSCcgaGFzIGJlZW4gT0NSJ2QgYXMgYW4gJ08nLCBhbW9uZyBvdGhlcnMuDQoNCg0KPi0gTG9vayBvbmx5IGF0IGFncmVlbWVudCB2cy4gbm9uLWFncmVlbWVudC4NCg0KPi0gQ29uc3RydWN0aW5nIGEgdmVjdG9yIG9mIFRSVUUvRkFMU0UgaW5kaWNhdGluZyBj
b3JyZWN0L2luY29ycmVjdCBwcmVkaWN0aW9ucy4NCg0KDQpgYGB7cn0NCmFncmVlbWVudCA8LSBsZXR0ZXJfcHJlZGljdGlvbnMgPT0gbGV0dGVyc190ZXN0JGxldHRlcg0KYGBgDQoNCg0KYGBge3J9DQp0YWJsZShhZ3JlZW1lbnQpDQpgYGANCg0KYGBge3J9DQpwcm9wLnRhYmxlKHRhYmxlKGFncmVlbWVudCkpDQpgYGANCg0KDQojIyBTdGVwIDU6IEltcHJvdmluZyBtb2RlbCBwZXJmb3JtYW5jZS4NCg0KYGBge3J9DQpzZXQuc2VlZCgxMjM0NSkNCmBgYA0KDQoNCmBgYHtyfQ0KbGV0dGVyX2NsYXNzaWZpZXJfcmJmIDwtIGtzdm0obGV0dGVyIH4gLiwgZGF0YSA9IGxldHRlcnNfdHJhaW4sIGtlcm5lbCA9ICJyYmZkb3QiKQ0KYGBgDQoNCg0KDQpgYGB7cn0NCmxldHRlcl9wcmVkaWN0aW9uc19yYmYgPC0gcHJlZGljdChsZXR0ZXJfY2xhc3NpZmllcl9yYmYsIGxldHRlcnNfdGVzdCkNCmBgYA0KDQoNCmBgYHtyfQ0KYWdyZWVtZW50X3JiZiA8LSBsZXR0ZXJfcHJlZGljdGlvbnNfcmJmID09IGxldHRlcnNfdGVzdCRsZXR0ZXINCmBgYA0KDQoNCg0KYGBge3J9DQp0YWJsZShhZ3JlZW1lbnRfcmJmKQ0KYGBgDQoNCg0KDQpgYGB7cn0NCnByb3AudGFibGUodGFibGUoYWdyZWVtZW50X3JiZikpDQpgYGANCg0KPi0gKlRoZXJlJ3MgYW4gaW1wcm92ZW1lbnQgb2YgYWNjdXJhY3kgZnJvbSA4NyUgdG8gOTMlLioNCg==