Load the Credit Data Set
# read.csv() is base R, so no extra package is needed to load the data
creditData <- read.csv("creditData.csv")
View(creditData)
creditData$Creditability <- as.factor(creditData$Creditability)
sum(is.na(creditData))
## [1] 0
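Optionally, a quick sanity check of the structure and class balance before splitting; nothing below is required for the rest of the lab.
# optional sanity checks on the loaded data
str(creditData)
table(creditData$Creditability)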
Laboratory 2: Naïve Bayes Classifiers, Part 1
library(caTools)
set.seed(12345)
# shuffle the 1,000 rows, then take the first 750 for training and the last 250 for testing
creditDataRandom <- creditData[order(runif(1000)), ]
creditDataTrain <- creditDataRandom[1:750, ]
creditDataTest <- creditDataRandom[751:1000, ]
prop.table(table(creditDataTrain$Creditability))
prop.table(table(creditDataTest$Creditability))
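caTools is loaded above but not otherwise used; as an alternative sketch, its sample.split() produces a stratified 75/25 split that preserves the Creditability class ratio exactly (the object names below are illustrative only, not part of the original lab).
# illustrative stratified split with caTools::sample.split()
set.seed(12345)
splitIdx <- sample.split(creditData$Creditability, SplitRatio = 0.75)
stratTrain <- subset(creditData, splitIdx == TRUE)
stratTest  <- subset(creditData, splitIdx == FALSE)
prop.table(table(stratTrain$Creditability))
prop.table(table(stratTest$Creditability))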
library(naivebayes)
naive_model <- naive_bayes(Creditability ~ ., data = creditDataTrain)
naive_model
conf_nat <- table(predict(naive_model, creditDataTest), creditDataTest$Creditability)
Accuracy <- sum(diag(conf_nat)) / sum(conf_nat) * 100
Accuracy
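For richer diagnostics than a single accuracy figure, caret's confusionMatrix() (caret is loaded in Part 2 anyway) also reports sensitivity, specificity and kappa; a minimal sketch:
# optional: detailed confusion-matrix statistics via caret
library(caret)
confusionMatrix(predict(naive_model, creditDataTest),
                creditDataTest$Creditability)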
Laboratory 2: Naïve Bayes Classifiers, Part 2
library(caret)
credit_rand <- creditData[order(runif(1000)), ]
creditDataScaled <- scale(credit_rand[, 2:ncol(credit_rand)], center = TRUE, scale = TRUE)
m <- cor(creditDataScaled)
highlycor <- findCorrelation(m, 0.30)
highlycor
# +1 offsets for Creditability (column 1), which was excluded from the correlation matrix
filteredData <- credit_rand[, -(highlycor[5] + 1)]
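Only the fifth flagged column is dropped above; a hedged variant removes every flagged column instead (filteredAll is an illustrative name, not part of the original lab).
# illustrative: drop all highly correlated columns at once
filteredAll <- credit_rand[, -(highlycor + 1)]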
filteredTraining <- filteredData[1:750, ]
filteredTest <- filteredData[751:1000, ]
nb_model <- naive_bayes(Creditability ~ ., data=filteredTraining)
filteredTestPred <- predict(nb_model, newdata = filteredTest)
conf_nat2 <- table(filteredTestPred, filteredTest$Creditability)
conf_nat2
Accuracy <- sum(diag(conf_nat2))/sum(conf_nat2)*100
Accuracy
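A refinement worth trying: Laplace smoothing prevents categories that never appear in the training split from zeroing out the posterior (nb_model_lap and lapPred are illustrative names).
# illustrative: Naive Bayes with Laplace smoothing
nb_model_lap <- naive_bayes(Creditability ~ ., data = filteredTraining, laplace = 1)
lapPred <- predict(nb_model_lap, newdata = filteredTest)
sum(diag(table(lapPred, filteredTest$Creditability))) / nrow(filteredTest) * 100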
Laboratory 2: Support Vector Machine, Part 3
letters <- read.csv("letterdata.csv")
str(letters)
## 'data.frame': 20000 obs. of 17 variables:
## $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
## $ xbox : int 2 5 4 7 2 4 4 1 2 11 ...
## $ ybox : int 8 12 11 11 1 11 2 1 2 15 ...
## $ width : int 3 3 6 6 3 5 5 3 4 13 ...
## $ height: int 5 7 8 6 1 8 4 2 4 9 ...
## $ onpix : int 1 2 6 3 1 3 4 1 2 7 ...
## $ xbar : int 8 10 10 5 8 8 8 8 10 13 ...
## $ ybar : int 13 5 6 9 6 8 7 2 6 2 ...
## $ x2bar : int 0 5 2 4 6 6 6 2 2 6 ...
## $ y2bar : int 6 4 6 6 6 9 6 2 6 2 ...
## $ xybar : int 6 13 10 4 6 5 7 8 12 12 ...
## $ x2ybar: int 10 3 3 4 5 6 6 2 4 1 ...
## $ xy2bar: int 8 9 7 10 9 6 6 8 8 9 ...
## $ xedge : int 0 2 3 6 1 0 2 1 1 8 ...
## $ xedgey: int 8 8 7 10 7 8 8 6 6 1 ...
## $ yedge : int 0 4 3 2 5 9 7 2 1 1 ...
## $ yedgex: int 8 10 9 8 10 7 10 7 7 8 ...
letters_train <- letters[1:18000, ]
letters_test <- letters[18001:20000, ]
library(kernlab)
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "vanilladot")
## Setting default kernel parameters
letter_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 7886
##
## Objective Function Value : -15.3458 -21.3403 -25.7672 -6.8685 -8.8812 -35.9555 -59.5883 -18.1975 -65.6075 -41.5654 -18.8559 -39.3558 -36.9961 -60.3052 -15.1694 -42.144 -35.0941 -19.4069 -15.8234 -38.6718 -33.3013 -8.5298 -12.4387 -38.2194 -14.3682 -9.5508 -165.7154 -53.2778 -79.2163 -134.5053 -184.4809 -58.9285 -46.3252 -81.004 -28.1341 -29.6955 -27.5983 -38.1764 -47.2889 -137.0497 -208.1396 -239.2616 -23.8945 -10.9655 -64.228 -12.2139 -55.7818 -10.8001 -21.2407 -11.1795 -121.5639 -33.2229 -267.3926 -81.0708 -9.4937 -4.6577 -161.5171 -86.7114 -20.9146 -16.8272 -86.6582 -16.7205 -30.3036 -20.0054 -26.2331 -29.9289 -56.1072 -11.6335 -5.2564 -14.8153 -4.983 -4.8171 -8.5044 -43.2267 -55.9 -214.755 -47.0748 -49.6539 -50.2278 -18.3767 -19.1813 -97.6132 -113.6502 -42.4112 -32.5859 -127.4807 -33.7418 -30.7568 -40.0953 -18.6792 -5.4826 -49.3916 -10.6142 -20.0286 -63.8287 -183.8297 -57.0671 -43.3721 -35.2783 -85.4451 -145.9585 -11.8002 -6.1194 -12.5323 -33.5245 -155.2248 -57.2602 -194.0785 -111.0155 -10.8207 -16.7926 -3.7766 -77.3561 -7.9004 -106.5759 -52.523 -107.0402 -78.0148 -74.4773 -24.8166 -13.2372 -7.8706 -27.2788 -13.2342 -280.2869 -32.7288 -25.9531 -149.5447 -153.8495 -10.0146 -40.8917 -6.7333 -65.2053 -72.818 -35.1252 -246.7046 -38.0738 -16.9126 -158.18 -184.0021 -50.8427 -28.7686 -164.5969 -97.8359 -386.1426 -160.3188 -181.8759 -38.3648 -37.2272 -60.116 -28.2074 -53.7383 -7.8729 -12.3159 -37.8942 -72.6434 -211.8342 -58.5023 -105.1605 -176.7259 -685.8994 -142.8147 -159.635 -366.9437 -37.6409 -73.1357 -175.1906 -131.2833 -41.1464 -77.8404 -57.8131 -8.6365 -251.3728 -14.0836 -36.5144 -2.2292 -6.1598 -16.8011 -26.5165 -67.19 -21.3366 -221.4815 -22.9219 -4.2616 -4.7901 -0.8263 -134.7538 -8.8843 -83.1109 -23.1019 -14.4251 -5.7337 -17.5244 -29.7925 -23.9243 -88.9084 -28.6719 -106.0564 -16.4981 -10.6486 -7.9315 -1.5742 -91.1706 -7.3819 -118.2628 -117.5543 -48.5606 -26.6093 -71.2968 -30.4913 -63.5712 -279.2921 -46.3025 -50.4912 -37.9431 -21.5243 -11.6202 -134.9023 -7.516 -5.8131 -10.1595 -13.6329 -27.0293 -25.7282 -151.8511 -39.0524 -105.4861 -34.2434 -15.7051 -10.2304 -3.6687 -98.2094 -7.4666 -15.2668 -75.1283 -116.5382 -16.6429 -14.9215 -55.1062 -3.0636 -8.4262 -93.6829 -38.1162 -123.1859 -4.9078 -9.1612 -1.3077 -102.9021 -23.1138 -8.5262 -57.2623 -3.4297 -20.9579 -78.2019 -50.3741 -62.3531 -6.4908 -21.9308 -2.3736 -84.3835 -126.3997 -114.8723 -26.4109 -21.5589 -61.6405 -34.9162 -66.3243 -25.1148 -6.7203 -4.6695 -65.3518 -39.7924 -67.3505 -36.2154 -10.9031 -62.2195 -14.9491 -24.3238 -65.0847 -4.9657 -64.2797 -278.2873 -14.6902 -13.9198 -18.2059 -9.8972 -78.2645 -17.454 -49.5929 -55.7786 -28.7673 -15.9476 -47.531 -17.4379 -71.0516 -5.6899 -6.2519 -97.5508 -3.8196 -7.0502 -1.1238 -147.6952 -28.2018 -414.2586 -32.3275 -35.1191 -4.9605 -90.2307 -151.3409 -90.0329 -27.9491 -42.4688 -12.5118 -26.4828 -2.0045 -62.195 -9.1662 -178.4616 -1.9406 -1.9871 -11.3982 -0.5214 -29.6136 -35.0449 -6.7569
## Training error : 0.1335
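The linear kernel above leaves a training error of about 13%; a Gaussian (RBF) kernel is a common next step and often reduces the error, at the cost of longer training time (the object names below are illustrative).
# illustrative: retrain with an RBF kernel and check test accuracy
letter_classifier_rbf <- ksvm(letter ~ ., data = letters_train, kernel = "rbfdot")
letter_predictions_rbf <- predict(letter_classifier_rbf, letters_test)
sum(letter_predictions_rbf == letters_test$letter) / nrow(letters_test)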
letter_predictions <- predict(letter_classifier, letters_test)
table(letter_predictions, letters_test$letter)
##
## letter_predictions A B C D E F G H I J K L M N O P Q R S T
## A 73 0 0 0 0 0 0 0 0 1 0 0 0 0 3 0 4 0 0 1
## B 0 61 0 3 2 0 1 1 0 0 1 1 0 0 0 2 0 1 3 0
## C 0 0 64 0 2 0 4 2 1 0 1 2 0 0 1 0 0 0 0 0
## D 2 1 0 67 0 0 1 3 3 2 1 2 0 3 4 2 1 2 0 0
## E 0 0 1 0 64 1 1 0 0 0 2 2 0 0 0 0 2 0 6 0
## F 0 0 0 0 0 70 1 1 4 0 0 0 0 0 0 5 1 0 2 0
## G 1 1 2 1 3 2 68 1 0 0 0 1 0 0 0 0 4 1 3 2
## H 0 0 0 1 0 1 0 46 0 2 3 1 1 1 9 0 0 5 0 3
## I 0 0 0 0 0 0 0 0 65 3 0 0 0 0 0 0 0 0 2 0
## J 0 1 0 0 0 1 0 0 3 61 0 0 0 0 1 0 0 0 1 0
## K 0 1 4 0 0 0 0 5 0 0 56 0 0 2 0 0 0 4 0 1
## L 0 0 0 0 1 0 0 1 0 0 0 63 0 0 0 0 0 0 0 0
## M 0 0 1 0 0 0 1 0 0 0 0 0 70 2 0 0 0 0 0 0
## N 0 0 0 0 0 0 0 0 0 0 0 0 0 77 0 0 0 1 0 0
## O 0 0 1 1 0 0 0 1 0 1 0 0 0 0 49 1 2 0 0 0
## P 0 0 0 0 0 3 0 0 0 0 0 0 0 0 2 69 0 0 0 0
## Q 0 0 0 0 0 0 3 1 0 0 0 2 0 0 2 1 52 0 1 0
## R 0 4 0 0 1 0 0 3 0 0 3 0 0 0 1 0 0 64 0 1
## S 0 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 6 0 47 1
## T 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 83
## U 0 0 2 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0
## W 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## X 0 1 0 0 1 0 0 1 0 0 1 4 0 0 0 0 0 1 0 0
## Y 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 1
## Z 1 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 5 1
##
## letter_predictions U V W X Y Z
## A 2 0 1 0 0 0
## B 0 0 0 0 0 0
## C 0 0 0 0 0 0
## D 0 0 0 0 1 0
## E 0 0 0 1 0 0
## F 0 1 0 0 2 0
## G 0 0 0 0 0 0
## H 0 2 0 0 1 0
## I 0 0 0 2 1 0
## J 0 0 0 1 0 4
## K 2 0 0 4 0 0
## L 0 0 0 0 0 0
## M 1 0 6 0 0 0
## N 1 0 2 0 0 0
## O 1 0 0 0 0 0
## P 0 0 0 0 1 0
## Q 0 0 0 0 0 0
## R 0 1 0 0 0 0
## S 0 0 0 1 0 6
## T 1 0 0 0 2 2
## U 83 0 0 0 0 0
## V 0 64 1 0 1 0
## W 0 3 59 0 0 0
## X 0 0 0 76 1 0
## Y 0 0 0 1 58 0
## Z 0 0 0 0 0 70
agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE TRUE
## 321 1679
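The agreement table corresponds to a test-set accuracy of 1679/2000, roughly 84%; expressed as a proportion:
# overall test-set accuracy of the linear-kernel SVM
prop.table(table(agreement))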
Laboratory 2: News popularity, Part 4
Naive Bayes with the correlation-filtered features in Part 2 achieves higher accuracy than the unfiltered model in Part 1. Naive Bayes also delivers better accuracy than the linear-kernel SVM in Part 3, which classifies about 84% of the test letters correctly (1679 of 2000).