#Loading all required libraries
library(C50)
library(gmodels)
library(rpart.plot)
library(ca)
library(class)
library(ggplot2)
library(MASS)
library(colorspace)
library(bnlearn)
library(tm)
library(e1071)
library(kernlab)
library(SnowballC)
library(klaR)
library(corrplot)
library(caret)
library(leaps)
library(dplyr)
library(psych)
Method 1: Tree-based Classification
#Step 1: Collecting the Data
credit <- read.csv("credit.csv")
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
credit$Creditability <- as.factor(credit$Creditability)
#Step 2: Preparing the Data
summary(credit$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
table(credit$Creditability)
##
## 0 1
## 300 700
#Splitting the data into train and test
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
summary(credit_rand$Credit.Amount) #unchanged distribution: shuffling only reorders the rows
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.3088889 0.6911111
prop.table(table(credit_test$Creditability))
##
## 0 1
## 0.22 0.78
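#The class proportions drift between train (0.31 bad credits) and test (0.22)
#because the shuffle-and-slice split ignores the labels. A minimal sketch of a
#stratified alternative using createDataPartition() from the already-loaded
#caret package; credit_train2/credit_test2 are illustrative names, not from
#the original analysis:
set.seed(12345)
in_train <- createDataPartition(credit$Creditability, p = 0.9, list = FALSE)
credit_train2 <- credit[in_train, ]
credit_test2 <- credit[-in_train, ]
prop.table(table(credit_train2$Creditability)) #both splits stay near 0.30/0.70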
#Step 3: Training a Model on the Data
credit_model <- C5.0(x=credit_train[-1], y=credit_train$Creditability)
credit_model
##
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
##
## Classification Tree
## Number of samples: 900
## Number of predictors: 20
##
## Tree size: 85
##
## Non-standard options: attempt to group attributes
summary(credit_model)
##
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
##
##
## C5.0 [Release 2.07 GPL Edition] Sat Oct 05 12:39:33 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 900 cases (21 attributes) from undefined.data
##
## Decision tree:
##
## Account.Balance > 2:
## :...Concurrent.Credits > 2:
## : :...Age..years. > 33: 1 (179/11)
## : : Age..years. <= 33:
## : : :...Credit.Amount > 6681:
## : : :...Length.of.current.employment <= 2: 0 (4)
## : : : Length.of.current.employment > 2:
## : : : :...Payment.Status.of.Previous.Credit <= 3: 1 (4)
## : : : Payment.Status.of.Previous.Credit > 3: 0 (3/1)
## : : Credit.Amount <= 6681:
## : : :...Occupation > 2:
## : : :...Occupation <= 3: 1 (120/12)
## : : : Occupation > 3:
## : : : :...Duration.of.Credit..month. <= 33: 1 (9)
## : : : Duration.of.Credit..month. > 33: 0 (3)
## : : Occupation <= 2:
## : : :...No.of.Credits.at.this.Bank > 1: 1 (6)
## : : No.of.Credits.at.this.Bank <= 1:
## : : :...Most.valuable.available.asset > 1: 0 (3)
## : : Most.valuable.available.asset <= 1:
## : : :...Credit.Amount <= 1987: 1 (8/1)
## : : Credit.Amount > 1987: 0 (2)
## : Concurrent.Credits <= 2:
## : :...Guarantors > 1: 1 (4)
## : Guarantors <= 1:
## : :...Purpose <= 0:
## : :...Most.valuable.available.asset <= 2: 0 (5)
## : : Most.valuable.available.asset > 2:
## : : :...No.of.dependents <= 1: 1 (7/1)
## : : No.of.dependents > 1: 0 (2)
## : Purpose > 0:
## : :...Purpose <= 4: 1 (35/2)
## : Purpose > 4:
## : :...Length.of.current.employment <= 2: 0 (4)
## : Length.of.current.employment > 2:
## : :...No.of.dependents > 1: 0 (3/1)
## : No.of.dependents <= 1:
## : :...Length.of.current.employment > 3: 1 (4)
## : Length.of.current.employment <= 3:
## : :...Instalment.per.cent <= 2: 1 (2)
## : Instalment.per.cent > 2: 0 (2)
## Account.Balance <= 2:
## :...Payment.Status.of.Previous.Credit <= 1:
## :...Value.Savings.Stocks <= 2: 0 (49/10)
## : Value.Savings.Stocks > 2:
## : :...Credit.Amount <= 2064: 0 (3)
## : Credit.Amount > 2064: 1 (9/1)
## Payment.Status.of.Previous.Credit > 1:
## :...Credit.Amount > 7980:
## :...Value.Savings.Stocks > 4:
## : :...Payment.Status.of.Previous.Credit <= 2: 0 (4/1)
## : : Payment.Status.of.Previous.Credit > 2: 1 (3)
## : Value.Savings.Stocks <= 4:
## : :...Account.Balance > 1: 0 (15)
## : Account.Balance <= 1:
## : :...Concurrent.Credits <= 2: 0 (2)
## : Concurrent.Credits > 2:
## : :...Credit.Amount <= 10297: 0 (6)
## : Credit.Amount > 10297: 1 (3)
## Credit.Amount <= 7980:
## :...Duration.of.Credit..month. <= 11:
## :...Occupation > 3:
## : :...Concurrent.Credits <= 2: 1 (3)
## : : Concurrent.Credits > 2:
## : : :...Payment.Status.of.Previous.Credit <= 2: 1 (4/1)
## : : Payment.Status.of.Previous.Credit > 2: 0 (3)
## : Occupation <= 3:
## : :...Age..years. > 32: 1 (34)
## : Age..years. <= 32:
## : :...Most.valuable.available.asset <= 1: 1 (13/1)
## : Most.valuable.available.asset > 1:
## : :...Instalment.per.cent <= 3: 1 (6/1)
## : Instalment.per.cent > 3: 0 (6/1)
## Duration.of.Credit..month. > 11:
## :...Duration.of.Credit..month. > 36:
## :...Length.of.current.employment <= 1: 1 (3)
## : Length.of.current.employment > 1:
## : :...No.of.dependents > 1: 1 (5/1)
## : No.of.dependents <= 1:
## : :...Duration.in.Current.address <= 1: 1 (4/1)
## : Duration.in.Current.address > 1: 0 (23)
## Duration.of.Credit..month. <= 36:
## :...Guarantors > 2:
## :...Foreign.Worker <= 1: 1 (23/1)
## : Foreign.Worker > 1: 0 (2)
## Guarantors <= 2:
## :...Credit.Amount <= 1381:
## :...Telephone > 1:
## : :...Sex...Marital.Status > 3: 0 (2)
## : : Sex...Marital.Status <= 3:
## : : :...Duration.of.Credit..month. <= 16: 1 (7)
## : : Duration.of.Credit..month. > 16: 0 (3/1)
## : Telephone <= 1:
## : :...Concurrent.Credits <= 2: 0 (9)
## : Concurrent.Credits > 2:
## : :...Account.Balance <= 1: 0 (29/6)
## : Account.Balance > 1: [S1]
## Credit.Amount > 1381:
## :...Guarantors > 1:
## :...Foreign.Worker > 1: 1 (2)
## : Foreign.Worker <= 1:
## : :...Instalment.per.cent > 2: 0 (5)
## : Instalment.per.cent <= 2: [S2]
## Guarantors <= 1:
## :...Payment.Status.of.Previous.Credit > 3:
## :...Age..years. > 33: 1 (22)
## : Age..years. <= 33:
## : :...Purpose > 3: 1 (7)
## : Purpose <= 3: [S3]
## Payment.Status.of.Previous.Credit <= 3:
## :...Instalment.per.cent <= 2:
## :...No.of.dependents > 1:
## : :...Purpose <= 0: 1 (2)
## : : Purpose > 0: 0 (3)
## : No.of.dependents <= 1: [S4]
## Instalment.per.cent > 2:
## :...Concurrent.Credits <= 1: 1 (8/1)
## Concurrent.Credits > 1:
## :...Sex...Marital.Status <= 1: 0 (6/1)
## Sex...Marital.Status > 1:
## :...Account.Balance > 1: [S5]
## Account.Balance <= 1: [S6]
##
## SubTree [S1]
##
## Duration.in.Current.address > 3: 1 (8/1)
## Duration.in.Current.address <= 3:
## :...Purpose > 2: 0 (5)
## Purpose <= 2:
## :...Type.of.apartment <= 1: 0 (2)
## Type.of.apartment > 1: 1 (5/1)
##
## SubTree [S2]
##
## Duration.in.Current.address <= 2: 1 (2)
## Duration.in.Current.address > 2: 0 (4/1)
##
## SubTree [S3]
##
## Duration.of.Credit..month. <= 16: 1 (4)
## Duration.of.Credit..month. > 16:
## :...Length.of.current.employment <= 3: 0 (8)
## Length.of.current.employment > 3: 1 (6/1)
##
## SubTree [S4]
##
## Duration.in.Current.address > 1: 1 (41/6)
## Duration.in.Current.address <= 1:
## :...Value.Savings.Stocks > 3: 0 (2)
## Value.Savings.Stocks <= 3:
## :...Length.of.current.employment > 2: 1 (4)
## Length.of.current.employment <= 2:
## :...Instalment.per.cent <= 1: 0 (3)
## Instalment.per.cent > 1: 1 (3/1)
##
## SubTree [S5]
##
## Sex...Marital.Status > 3: 0 (2)
## Sex...Marital.Status <= 3:
## :...Length.of.current.employment > 3: 1 (10)
## Length.of.current.employment <= 3:
## :...Duration.in.Current.address <= 1: 1 (5)
## Duration.in.Current.address > 1:
## :...Length.of.current.employment <= 2: 0 (4)
## Length.of.current.employment > 2:
## :...Value.Savings.Stocks <= 1: 0 (3)
## Value.Savings.Stocks > 1: 1 (5)
##
## SubTree [S6]
##
## Payment.Status.of.Previous.Credit > 2: 0 (3)
## Payment.Status.of.Previous.Credit <= 2:
## :...Purpose <= 0: 0 (7/1)
## Purpose > 0:
## :...Most.valuable.available.asset <= 1: 0 (5/1)
## Most.valuable.available.asset > 1:
## :...Sex...Marital.Status <= 2: 1 (6)
## Sex...Marital.Status > 2:
## :...Length.of.current.employment > 4: 0 (5)
## Length.of.current.employment <= 4:
## :...Telephone > 1: 1 (3)
## Telephone <= 1:
## :...Length.of.current.employment <= 2: 0 (2)
## Length.of.current.employment > 2:
## :...Age..years. <= 28: 1 (4)
## Age..years. > 28: 0 (2)
##
##
## Evaluation on training data (900 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 85 70( 7.8%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 233 45 (a): class 0
## 25 597 (b): class 1
##
##
## Attribute usage:
##
## 100.00% Account.Balance
## 67.11% Credit.Amount
## 63.11% Concurrent.Credits
## 55.33% Payment.Status.of.Previous.Credit
## 50.33% Age..years.
## 45.44% Duration.of.Credit..month.
## 40.11% Guarantors
## 24.44% Occupation
## 18.33% Instalment.per.cent
## 15.56% Purpose
## 14.22% Length.of.current.employment
## 13.67% Duration.in.Current.address
## 12.67% Value.Savings.Stocks
## 12.22% No.of.dependents
## 9.33% Sex...Marital.Status
## 9.00% Telephone
## 8.78% Most.valuable.available.asset
## 4.22% Foreign.Worker
## 2.11% No.of.Credits.at.this.Bank
## 0.78% Type.of.apartment
##
##
## Time: 0.0 secs
#Step 4: Evaluating Model Performance
cred_pred <- predict(credit_model, credit_test)
CrossTable(credit_test$Creditability, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | Predicted Creditability
## Actual Creditability | 0 | 1 | Row Total |
## ---------------------|-----------|-----------|-----------|
## 0 | 8 | 14 | 22 |
## | 0.080 | 0.140 | |
## ---------------------|-----------|-----------|-----------|
## 1 | 17 | 61 | 78 |
## | 0.170 | 0.610 | |
## ---------------------|-----------|-----------|-----------|
## Column Total | 25 | 75 | 100 |
## ---------------------|-----------|-----------|-----------|
##
##
#14 applicants with bad credit (0) were predicted as creditworthy, and 17 creditworthy applicants were predicted as bad
#Which of these counts as the Type I vs Type II error depends on which class is treated as positive
conf_mat <- table(cred_pred, credit_test$Creditability)
accuracy <- sum(diag(conf_mat))/sum(conf_mat)*100
accuracy
## [1] 69
#Accuracy is 69% (8 + 61 correct of 100) and can be improved; one option is boosting (sketch below)
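#C5.0 supports adaptive boosting through its trials argument. A minimal sketch,
#assuming 10 boosting iterations as a starting point (credit_boost and
#boost_pred are illustrative names):
credit_boost <- C5.0(x = credit_train[-1], y = credit_train$Creditability, trials = 10)
boost_pred <- predict(credit_boost, credit_test)
CrossTable(credit_test$Creditability, boost_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))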
Method 2: Support Vector Machines
#Step 1: Collecting the Data
letters <- read.csv("letterdata.csv")
str(letters)
## 'data.frame': 20000 obs. of 17 variables:
## $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
## $ xbox : int 2 5 4 7 2 4 4 1 2 11 ...
## $ ybox : int 8 12 11 11 1 11 2 1 2 15 ...
## $ width : int 3 3 6 6 3 5 5 3 4 13 ...
## $ height: int 5 7 8 6 1 8 4 2 4 9 ...
## $ onpix : int 1 2 6 3 1 3 4 1 2 7 ...
## $ xbar : int 8 10 10 5 8 8 8 8 10 13 ...
## $ ybar : int 13 5 6 9 6 8 7 2 6 2 ...
## $ x2bar : int 0 5 2 4 6 6 6 2 2 6 ...
## $ y2bar : int 6 4 6 6 6 9 6 2 6 2 ...
## $ xybar : int 6 13 10 4 6 5 7 8 12 12 ...
## $ x2ybar: int 10 3 3 4 5 6 6 2 4 1 ...
## $ xy2bar: int 8 9 7 10 9 6 6 8 8 9 ...
## $ xedge : int 0 2 3 6 1 0 2 1 1 8 ...
## $ xedgey: int 8 8 7 10 7 8 8 6 6 1 ...
## $ yedge : int 0 4 3 2 5 9 7 2 1 1 ...
## $ yedgex: int 8 10 9 8 10 7 10 7 7 8 ...
#Step 2: Preparing the Data
letters_train <- letters[1:18000, ]
letters_test <- letters[18001:20000, ]
#Step 3: Training a Model on the Data
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "vanilladot")
## Setting default kernel parameters
letter_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 7886
##
## Objective Function Value : -15.3458 -21.3403 -25.7672 -6.8685 -8.8812 -35.9555 -59.5883 -18.1975 -65.6075 -41.5654 -18.8559 -39.3558 -36.9961 -60.3052 -15.1694 -42.144 -35.0941 -19.4069 -15.8234 -38.6718 -33.3013 -8.5298 -12.4387 -38.2194 -14.3682 -9.5508 -165.7154 -53.2778 -79.2163 -134.5053 -184.4809 -58.9285 -46.3252 -81.004 -28.1341 -29.6955 -27.5983 -38.1764 -47.2889 -137.0497 -208.1396 -239.2616 -23.8945 -10.9655 -64.228 -12.2139 -55.7818 -10.8001 -21.2407 -11.1795 -121.5639 -33.2229 -267.3926 -81.0708 -9.4937 -4.6577 -161.5171 -86.7114 -20.9146 -16.8272 -86.6582 -16.7205 -30.3036 -20.0054 -26.2331 -29.9289 -56.1072 -11.6335 -5.2564 -14.8153 -4.983 -4.8171 -8.5044 -43.2267 -55.9 -214.755 -47.0748 -49.6539 -50.2278 -18.3767 -19.1813 -97.6132 -113.6502 -42.4112 -32.5859 -127.4807 -33.7418 -30.7568 -40.0953 -18.6792 -5.4826 -49.3916 -10.6142 -20.0286 -63.8287 -183.8297 -57.0671 -43.3721 -35.2783 -85.4451 -145.9585 -11.8002 -6.1194 -12.5323 -33.5245 -155.2248 -57.2602 -194.0785 -111.0155 -10.8207 -16.7926 -3.7766 -77.3561 -7.9004 -106.5759 -52.523 -107.0402 -78.0148 -74.4773 -24.8166 -13.2372 -7.8706 -27.2788 -13.2342 -280.2869 -32.7288 -25.9531 -149.5447 -153.8495 -10.0146 -40.8917 -6.7333 -65.2053 -72.818 -35.1252 -246.7046 -38.0738 -16.9126 -158.18 -184.0021 -50.8427 -28.7686 -164.5969 -97.8359 -386.1426 -160.3188 -181.8759 -38.3648 -37.2272 -60.116 -28.2074 -53.7383 -7.8729 -12.3159 -37.8942 -72.6434 -211.8342 -58.5023 -105.1605 -176.7259 -685.8994 -142.8147 -159.635 -366.9437 -37.6409 -73.1357 -175.1906 -131.2833 -41.1464 -77.8404 -57.8131 -8.6365 -251.3728 -14.0836 -36.5144 -2.2292 -6.1598 -16.8011 -26.5165 -67.19 -21.3366 -221.4815 -22.9219 -4.2616 -4.7901 -0.8263 -134.7538 -8.8843 -83.1109 -23.1019 -14.4251 -5.7337 -17.5244 -29.7925 -23.9243 -88.9084 -28.6719 -106.0564 -16.4981 -10.6486 -7.9315 -1.5742 -91.1706 -7.3819 -118.2628 -117.5543 -48.5606 -26.6093 -71.2968 -30.4913 -63.5712 -279.2921 -46.3025 -50.4912 -37.9431 -21.5243 -11.6202 -134.9023 -7.516 -5.8131 -10.1595 -13.6329 -27.0293 -25.7282 -151.8511 -39.0524 -105.4861 -34.2434 -15.7051 -10.2304 -3.6687 -98.2094 -7.4666 -15.2668 -75.1283 -116.5382 -16.6429 -14.9215 -55.1062 -3.0636 -8.4262 -93.6829 -38.1162 -123.1859 -4.9078 -9.1612 -1.3077 -102.9021 -23.1138 -8.5262 -57.2623 -3.4297 -20.9579 -78.2019 -50.3741 -62.3531 -6.4908 -21.9308 -2.3736 -84.3835 -126.3997 -114.8723 -26.4109 -21.5589 -61.6405 -34.9162 -66.3243 -25.1148 -6.7203 -4.6695 -65.3518 -39.7924 -67.3505 -36.2154 -10.9031 -62.2195 -14.9491 -24.3238 -65.0847 -4.9657 -64.2797 -278.2873 -14.6902 -13.9198 -18.2059 -9.8972 -78.2645 -17.454 -49.5929 -55.7786 -28.7673 -15.9476 -47.531 -17.4379 -71.0516 -5.6899 -6.2519 -97.5508 -3.8196 -7.0502 -1.1238 -147.6952 -28.2018 -414.2586 -32.3275 -35.1191 -4.9605 -90.2307 -151.3409 -90.0329 -27.9491 -42.4688 -12.5118 -26.4828 -2.0045 -62.195 -9.1662 -178.4616 -1.9406 -1.9871 -11.3982 -0.5214 -29.6136 -35.0449 -6.7569
## Training error : 0.1335
#Step 4: Evaluating Model Performance
letter_predictions <- predict(letter_classifier, letters_test)
table(letter_predictions, letters_test$letter)
##
## letter_predictions A B C D E F G H I J K L M N O P Q R
## A 73 0 0 0 0 0 0 0 0 1 0 0 0 0 3 0 4 0
## B 0 61 0 3 2 0 1 1 0 0 1 1 0 0 0 2 0 1
## C 0 0 64 0 2 0 4 2 1 0 1 2 0 0 1 0 0 0
## D 2 1 0 67 0 0 1 3 3 2 1 2 0 3 4 2 1 2
## E 0 0 1 0 64 1 1 0 0 0 2 2 0 0 0 0 2 0
## F 0 0 0 0 0 70 1 1 4 0 0 0 0 0 0 5 1 0
## G 1 1 2 1 3 2 68 1 0 0 0 1 0 0 0 0 4 1
## H 0 0 0 1 0 1 0 46 0 2 3 1 1 1 9 0 0 5
## I 0 0 0 0 0 0 0 0 65 3 0 0 0 0 0 0 0 0
## J 0 1 0 0 0 1 0 0 3 61 0 0 0 0 1 0 0 0
## K 0 1 4 0 0 0 0 5 0 0 56 0 0 2 0 0 0 4
## L 0 0 0 0 1 0 0 1 0 0 0 63 0 0 0 0 0 0
## M 0 0 1 0 0 0 1 0 0 0 0 0 70 2 0 0 0 0
## N 0 0 0 0 0 0 0 0 0 0 0 0 0 77 0 0 0 1
## O 0 0 1 1 0 0 0 1 0 1 0 0 0 0 49 1 2 0
## P 0 0 0 0 0 3 0 0 0 0 0 0 0 0 2 69 0 0
## Q 0 0 0 0 0 0 3 1 0 0 0 2 0 0 2 1 52 0
## R 0 4 0 0 1 0 0 3 0 0 3 0 0 0 1 0 0 64
## S 0 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 6 0
## T 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0
## U 0 0 2 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
## W 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## X 0 1 0 0 1 0 0 1 0 0 1 4 0 0 0 0 0 1
## Y 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0
## Z 1 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0
##
## letter_predictions S T U V W X Y Z
## A 0 1 2 0 1 0 0 0
## B 3 0 0 0 0 0 0 0
## C 0 0 0 0 0 0 0 0
## D 0 0 0 0 0 0 1 0
## E 6 0 0 0 0 1 0 0
## F 2 0 0 1 0 0 2 0
## G 3 2 0 0 0 0 0 0
## H 0 3 0 2 0 0 1 0
## I 2 0 0 0 0 2 1 0
## J 1 0 0 0 0 1 0 4
## K 0 1 2 0 0 4 0 0
## L 0 0 0 0 0 0 0 0
## M 0 0 1 0 6 0 0 0
## N 0 0 1 0 2 0 0 0
## O 0 0 1 0 0 0 0 0
## P 0 0 0 0 0 0 1 0
## Q 1 0 0 0 0 0 0 0
## R 0 1 0 1 0 0 0 0
## S 47 1 0 0 0 1 0 6
## T 1 83 1 0 0 0 2 2
## U 0 0 83 0 0 0 0 0
## V 0 0 0 64 1 0 1 0
## W 0 0 0 3 59 0 0 0
## X 0 0 0 0 0 76 1 0
## Y 0 1 0 0 0 1 58 0
## Z 5 1 0 0 0 0 0 70
agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE TRUE
## 321 1679
#The classification was correct in 1679 out of 2000 test records
conf_mat <- table(letter_predictions, letters_test$letter)
accuracy <- sum(diag(conf_mat))/sum(conf_mat)*100
accuracy
## [1] 83.95
#Accuracy is a respectable 84% for a linear kernel; a kernel swap (sketch below) may do better
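#A possible next step: replace the linear kernel with a Gaussian one, which
#often lifts accuracy on this data set. A sketch using kernlab's built-in
#"rbfdot" kernel (letter_classifier_rbf and rbf_predictions are illustrative
#names):
letter_classifier_rbf <- ksvm(letter ~ ., data = letters_train, kernel = "rbfdot")
rbf_predictions <- predict(letter_classifier_rbf, letters_test)
mean(rbf_predictions == letters_test$letter) #proportion of correct predictions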
Method 3: Adding Regression to Trees
#Step 1: Collecting the Data
wine <- read.csv("whitewines.csv")
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
## $ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
## $ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
## $ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
## $ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
## $ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
## $ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
## $ density : num 0.993 0.999 0.995 0.991 0.993 ...
## $ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
## $ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
## $ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
## $ quality : int 5 6 6 4 6 6 6 6 6 7 ...
#Step 2: Preparing the Data
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
#Step 3: Training a Model on the Data
m.rpart <- rpart(quality ~ ., data=wine_train)
m.rpart
## n= 3750
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 3750 2945.53200 5.870933
## 2) alcohol< 10.85 2372 1418.86100 5.604975
## 4) volatile.acidity>=0.2275 1611 821.30730 5.432030
## 8) volatile.acidity>=0.3025 688 278.97670 5.255814 *
## 9) volatile.acidity< 0.3025 923 505.04230 5.563380 *
## 5) volatile.acidity< 0.2275 761 447.36400 5.971091 *
## 3) alcohol>=10.85 1378 1070.08200 6.328737
## 6) free.sulfur.dioxide< 10.5 84 95.55952 5.369048 *
## 7) free.sulfur.dioxide>=10.5 1294 892.13600 6.391036
## 14) alcohol< 11.76667 629 430.11130 6.173291
## 28) volatile.acidity>=0.465 11 10.72727 4.545455 *
## 29) volatile.acidity< 0.465 618 389.71680 6.202265 *
## 15) alcohol>=11.76667 665 403.99400 6.596992 *
rpart.plot(m.rpart, digits=3)

rpart.plot(m.rpart, digits=4, fallen.leaves = TRUE, type = 3, extra = 101)

#Step 4: Evaluating Model Performance
p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.545 5.563 5.971 5.893 6.202 6.597
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.901 6.000 9.000
cor(p.rpart, wine_test$quality)
## [1] 0.5369525
#The predictions span a much narrower range (4.5 to 6.6) than the actual scores (3 to 9), and a correlation of 0.54 between them is only moderate
conf_mat <- table(p.rpart, wine_test$quality)
accuracy <- sum(diag(conf_mat))/sum(conf_mat)*100
accuracy
## [1] 18.37979
#The 18% comes from diag() on a table whose rows (the tree's few distinct predicted values) and columns (integer quality scores) do not correspond, so exact-match "accuracy" is not a meaningful metric for a numeric target; see the MAE sketch below
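#For a numeric target, mean absolute error is a more informative measure than
#exact-match accuracy. A minimal sketch; MAE here is a hand-rolled helper, not
#from any loaded package:
MAE <- function(actual, predicted) { mean(abs(actual - predicted)) }
MAE(wine_test$quality, p.rpart)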
Method 4a: News Popularity using Tree-based Classification
#Step 1: Collecting the Data
news <- read.csv("OnlineNewsPopularity.csv")
newsShort <- news[, c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")]
#Recode shares as a binary popularity label: "yes" if at least 1400 shares, else "no"
newsShort$popular <- ifelse(newsShort$shares >= 1400, "yes", "no")
newsShort$shares <- newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
#Step 2: Preparing the Data
set.seed(12345)
#Note: order(runif(10000)) shuffles only the first 10,000 of the 39,644 rows; the remainder are discarded
news_rand <- newsShort[order(runif(10000)), ]
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]
prop.table(table(news_train$shares))
##
## no yes
## 0.4308889 0.5691111
prop.table(table(news_test$shares))
##
## no yes
## 0.414 0.586
#Step 3: Training a Model on the Data
news_model <- C5.0(x=news_train[-1], y=news_train$shares)
news_model
##
## Call:
## C5.0.default(x = news_train[-1], y = news_train$shares)
##
## Classification Tree
## Number of samples: 9000
## Number of predictors: 17
##
## Tree size: 2
##
## Non-standard options: attempt to group attributes
summary(news_model)
##
## Call:
## C5.0.default(x = news_train[-1], y = news_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Sat Oct 05 12:39:54 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 9000 cases (18 attributes) from undefined.data
##
## Decision tree:
##
## shares = no: no (3878)
## shares = yes: yes (5122)
##
##
## Evaluation on training data (9000 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 2 0( 0.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 3878 (a): class no
## 5122 (b): class yes
##
##
## Attribute usage:
##
## 100.00% shares
##
##
## Time: 0.1 secs
#Step 4: Evaluating Model Performance
news_pred <- predict(news_model, news_test)
CrossTable(news_test$shares, news_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Shares', 'Predicted Shares'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | Predicted Shares
## Actual Shares | no | yes | Row Total |
## --------------|-----------|-----------|-----------|
## no | 414 | 0 | 414 |
## | 0.414 | 0.000 | |
## --------------|-----------|-----------|-----------|
## yes | 0 | 586 | 586 |
## | 0.000 | 0.586 | |
## --------------|-----------|-----------|-----------|
## Column Total | 414 | 586 | 1000 |
## --------------|-----------|-----------|-----------|
##
##
#0 false negatives and 0 false positives: a perfect confusion matrix that should raise suspicion
conf_mat <- table(news_pred, news_test$shares)
accuracy <- sum(diag(conf_mat))/sum(conf_mat)*100
accuracy
## [1] 100
#Accuracy is 100% only because of target leakage: news_train[-1] drops just the first column, so the recoded shares (and popular) columns remain among the predictors and the tree simply splits on shares, as the attribute usage above shows; see the sketch below
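#A sketch of a leakage-free fit: drop the target-derived columns before
#training, so the tree must learn from the real predictors. Accuracy will fall
#to a realistic level once the model can no longer see the answer
#(news_predictors, news_model2, and news_pred2 are illustrative names):
news_predictors <- news_train[, !(names(news_train) %in% c("shares", "popular"))]
news_model2 <- C5.0(x = news_predictors, y = news_train$shares)
news_pred2 <- predict(news_model2, news_test)
mean(news_pred2 == news_test$shares) #realistic accuracy estimate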
Method 4b: News Popularity using Support Vector Machines
#Step 1: Collecting the Data
news <- read.csv("OnlineNewsPopularity.csv")
newsShort <- news[, c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")]
#Recode shares as a binary popularity label: "yes" if at least 1400 shares, else "no"
newsShort$popular <- ifelse(newsShort$shares >= 1400, "yes", "no")
newsShort$shares <- newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
#Step 2: Preparing the Data
#Note: rows are taken in file order here; a shuffled or stratified split would be safer
news_train <- newsShort[1:9000, ]
news_test <- newsShort[9001:10000, ]
#Step 3: Training a Model on the Data
news_classifier <- ksvm(shares ~ ., data = news_train, kernel = "vanilladot")
## Setting default kernel parameters
news_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 78
##
## Objective Function Value : -1
## Training error : 0
#Step 4: Evaluating Model Performance
news_pred <- predict(news_classifier, news_test)
table(news_pred, news_test$shares)
##
## news_pred no yes
## no 547 0
## yes 0 453
agreement <- news_pred == news_test$shares
table(agreement)
## agreement
## TRUE
## 1000
#The classification was correct in all 1000 test records, again a sign of leakage rather than skill
conf_mat <- table(news_pred, news_test$shares)
accuracy <- sum(diag(conf_mat))/sum(conf_mat)*100
accuracy
## [1] 100
#Accuracy is 100% only because the popular column, an exact copy of the target, is included as a predictor by shares ~ .; see the sketch below
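#A sketch of a leakage-free fit for the SVM: exclude the duplicated label from
#the formula so only the real predictors remain (news_classifier2 and
#news_pred2 are illustrative names):
news_classifier2 <- ksvm(shares ~ . - popular, data = news_train, kernel = "vanilladot")
news_pred2 <- predict(news_classifier2, news_test)
mean(news_pred2 == news_test$shares) #realistic accuracy estimate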