#Loading all required libraries
library(C50)
library(gmodels)
library(rpart.plot)
library(ca)
library(class)
library(ggplot2)
library(MASS)
library(colorspace)
library(bnlearn)
library(tm)
library(e1071)
library(kernlab)
library(SnowballC)
library(klaR)
library(corrplot)
library(caret)
library(leaps)
library(dplyr)
library(psych)
Method 1: Tree-based Classification
#Step 1: Collecting the Data
credit <- read.csv("credit.csv")
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
credit$Creditability <- as.factor(credit$Creditability)
#Step 2: Preparing the Data
summary(credit$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
table(credit$Creditability)
##
## 0 1
## 300 700
#Splitting the data into train and test
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
summary(credit_rand$Credit.Amount) #unchanged distribution: shuffling only reorders the rows
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.3088889 0.6911111
prop.table(table(credit_test$Creditability))
##
## 0 1
## 0.22 0.78
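#The class proportions drift between train (0.31 bad credits) and test (0.22)
#because the shuffle-and-slice split ignores the labels. A minimal sketch of a
#stratified alternative using createDataPartition() from the already-loaded
#caret package; credit_train2/credit_test2 are illustrative names, not from
#the original analysis:
set.seed(12345)
in_train <- createDataPartition(credit$Creditability, p = 0.9, list = FALSE)
credit_train2 <- credit[in_train, ]
credit_test2 <- credit[-in_train, ]
prop.table(table(credit_train2$Creditability)) #both splits stay near 0.30/0.70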
#Step 3: Training a Model on the Data
credit_model <- C5.0(x=credit_train[-1], y=credit_train$Creditability)
credit_model
##
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
##
## Classification Tree
## Number of samples: 900
## Number of predictors: 20
##
## Tree size: 85
##
## Non-standard options: attempt to group attributes
summary(credit_model)
##
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
##
##
## C5.0 [Release 2.07 GPL Edition] Sat Oct 05 12:39:33 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 900 cases (21 attributes) from undefined.data
##
## Decision tree:
##
## Account.Balance > 2:
## :...Concurrent.Credits > 2:
## : :...Age..years. > 33: 1 (179/11)
## : : Age..years. <= 33:
## : : :...Credit.Amount > 6681:
## : : :...Length.of.current.employment <= 2: 0 (4)
## : : : Length.of.current.employment > 2:
## : : : :...Payment.Status.of.Previous.Credit <= 3: 1 (4)
## : : : Payment.Status.of.Previous.Credit > 3: 0 (3/1)
## : : Credit.Amount <= 6681:
## : : :...Occupation > 2:
## : : :...Occupation <= 3: 1 (120/12)
## : : : Occupation > 3:
## : : : :...Duration.of.Credit..month. <= 33: 1 (9)
## : : : Duration.of.Credit..month. > 33: 0 (3)
## : : Occupation <= 2:
## : : :...No.of.Credits.at.this.Bank > 1: 1 (6)
## : : No.of.Credits.at.this.Bank <= 1:
## : : :...Most.valuable.available.asset > 1: 0 (3)
## : : Most.valuable.available.asset <= 1:
## : : :...Credit.Amount <= 1987: 1 (8/1)
## : : Credit.Amount > 1987: 0 (2)
## : Concurrent.Credits <= 2:
## : :...Guarantors > 1: 1 (4)
## : Guarantors <= 1:
## : :...Purpose <= 0:
## : :...Most.valuable.available.asset <= 2: 0 (5)
## : : Most.valuable.available.asset > 2:
## : : :...No.of.dependents <= 1: 1 (7/1)
## : : No.of.dependents > 1: 0 (2)
## : Purpose > 0:
## : :...Purpose <= 4: 1 (35/2)
## : Purpose > 4:
## : :...Length.of.current.employment <= 2: 0 (4)
## : Length.of.current.employment > 2:
## : :...No.of.dependents > 1: 0 (3/1)
## : No.of.dependents <= 1:
## : :...Length.of.current.employment > 3: 1 (4)
## : Length.of.current.employment <= 3:
## : :...Instalment.per.cent <= 2: 1 (2)
## : Instalment.per.cent > 2: 0 (2)
## Account.Balance <= 2:
## :...Payment.Status.of.Previous.Credit <= 1:
## :...Value.Savings.Stocks <= 2: 0 (49/10)
## : Value.Savings.Stocks > 2:
## : :...Credit.Amount <= 2064: 0 (3)
## : Credit.Amount > 2064: 1 (9/1)
## Payment.Status.of.Previous.Credit > 1:
## :...Credit.Amount > 7980:
## :...Value.Savings.Stocks > 4:
## : :...Payment.Status.of.Previous.Credit <= 2: 0 (4/1)
## : : Payment.Status.of.Previous.Credit > 2: 1 (3)
## : Value.Savings.Stocks <= 4:
## : :...Account.Balance > 1: 0 (15)
## : Account.Balance <= 1:
## : :...Concurrent.Credits <= 2: 0 (2)
## : Concurrent.Credits > 2:
## : :...Credit.Amount <= 10297: 0 (6)
## : Credit.Amount > 10297: 1 (3)
## Credit.Amount <= 7980:
## :...Duration.of.Credit..month. <= 11:
## :...Occupation > 3:
## : :...Concurrent.Credits <= 2: 1 (3)
## : : Concurrent.Credits > 2:
## : : :...Payment.Status.of.Previous.Credit <= 2: 1 (4/1)
## : : Payment.Status.of.Previous.Credit > 2: 0 (3)
## : Occupation <= 3:
## : :...Age..years. > 32: 1 (34)
## : Age..years. <= 32:
## : :...Most.valuable.available.asset <= 1: 1 (13/1)
## : Most.valuable.available.asset > 1:
## : :...Instalment.per.cent <= 3: 1 (6/1)
## : Instalment.per.cent > 3: 0 (6/1)
## Duration.of.Credit..month. > 11:
## :...Duration.of.Credit..month. > 36:
## :...Length.of.current.employment <= 1: 1 (3)
## : Length.of.current.employment > 1:
## : :...No.of.dependents > 1: 1 (5/1)
## : No.of.dependents <= 1:
## : :...Duration.in.Current.address <= 1: 1 (4/1)
## : Duration.in.Current.address > 1: 0 (23)
## Duration.of.Credit..month. <= 36:
## :...Guarantors > 2:
## :...Foreign.Worker <= 1: 1 (23/1)
## : Foreign.Worker > 1: 0 (2)
## Guarantors <= 2:
## :...Credit.Amount <= 1381:
## :...Telephone > 1:
## : :...Sex...Marital.Status > 3: 0 (2)
## : : Sex...Marital.Status <= 3:
## : : :...Duration.of.Credit..month. <= 16: 1 (7)
## : : Duration.of.Credit..month. > 16: 0 (3/1)
## : Telephone <= 1:
## : :...Concurrent.Credits <= 2: 0 (9)
## : Concurrent.Credits > 2:
## : :...Account.Balance <= 1: 0 (29/6)
## : Account.Balance > 1: [S1]
## Credit.Amount > 1381:
## :...Guarantors > 1:
## :...Foreign.Worker > 1: 1 (2)
## : Foreign.Worker <= 1:
## : :...Instalment.per.cent > 2: 0 (5)
## : Instalment.per.cent <= 2: [S2]
## Guarantors <= 1:
## :...Payment.Status.of.Previous.Credit > 3:
## :...Age..years. > 33: 1 (22)
## : Age..years. <= 33:
## : :...Purpose > 3: 1 (7)
## : Purpose <= 3: [S3]
## Payment.Status.of.Previous.Credit <= 3:
## :...Instalment.per.cent <= 2:
## :...No.of.dependents > 1:
## : :...Purpose <= 0: 1 (2)
## : : Purpose > 0: 0 (3)
## : No.of.dependents <= 1: [S4]
## Instalment.per.cent > 2:
## :...Concurrent.Credits <= 1: 1 (8/1)
## Concurrent.Credits > 1:
## :...Sex...Marital.Status <= 1: 0 (6/1)
## Sex...Marital.Status > 1:
## :...Account.Balance > 1: [S5]
## Account.Balance <= 1: [S6]
##
## SubTree [S1]
##
## Duration.in.Current.address > 3: 1 (8/1)
## Duration.in.Current.address <= 3:
## :...Purpose > 2: 0 (5)
## Purpose <= 2:
## :...Type.of.apartment <= 1: 0 (2)
## Type.of.apartment > 1: 1 (5/1)
##
## SubTree [S2]
##
## Duration.in.Current.address <= 2: 1 (2)
## Duration.in.Current.address > 2: 0 (4/1)
##
## SubTree [S3]
##
## Duration.of.Credit..month. <= 16: 1 (4)
## Duration.of.Credit..month. > 16:
## :...Length.of.current.employment <= 3: 0 (8)
## Length.of.current.employment > 3: 1 (6/1)
##
## SubTree [S4]
##
## Duration.in.Current.address > 1: 1 (41/6)
## Duration.in.Current.address <= 1:
## :...Value.Savings.Stocks > 3: 0 (2)
## Value.Savings.Stocks <= 3:
## :...Length.of.current.employment > 2: 1 (4)
## Length.of.current.employment <= 2:
## :...Instalment.per.cent <= 1: 0 (3)
## Instalment.per.cent > 1: 1 (3/1)
##
## SubTree [S5]
##
## Sex...Marital.Status > 3: 0 (2)
## Sex...Marital.Status <= 3:
## :...Length.of.current.employment > 3: 1 (10)
## Length.of.current.employment <= 3:
## :...Duration.in.Current.address <= 1: 1 (5)
## Duration.in.Current.address > 1:
## :...Length.of.current.employment <= 2: 0 (4)
## Length.of.current.employment > 2:
## :...Value.Savings.Stocks <= 1: 0 (3)
## Value.Savings.Stocks > 1: 1 (5)
##
## SubTree [S6]
##
## Payment.Status.of.Previous.Credit > 2: 0 (3)
## Payment.Status.of.Previous.Credit <= 2:
## :...Purpose <= 0: 0 (7/1)
## Purpose > 0:
## :...Most.valuable.available.asset <= 1: 0 (5/1)
## Most.valuable.available.asset > 1:
## :...Sex...Marital.Status <= 2: 1 (6)
## Sex...Marital.Status > 2:
## :...Length.of.current.employment > 4: 0 (5)
## Length.of.current.employment <= 4:
## :...Telephone > 1: 1 (3)
## Telephone <= 1:
## :...Length.of.current.employment <= 2: 0 (2)
## Length.of.current.employment > 2:
## :...Age..years. <= 28: 1 (4)
## Age..years. > 28: 0 (2)
##
##
## Evaluation on training data (900 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 85 70( 7.8%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 233 45 (a): class 0
## 25 597 (b): class 1
##
##
## Attribute usage:
##
## 100.00% Account.Balance
## 67.11% Credit.Amount
## 63.11% Concurrent.Credits
## 55.33% Payment.Status.of.Previous.Credit
## 50.33% Age..years.
## 45.44% Duration.of.Credit..month.
## 40.11% Guarantors
## 24.44% Occupation
## 18.33% Instalment.per.cent
## 15.56% Purpose
## 14.22% Length.of.current.employment
## 13.67% Duration.in.Current.address
## 12.67% Value.Savings.Stocks
## 12.22% No.of.dependents
## 9.33% Sex...Marital.Status
## 9.00% Telephone
## 8.78% Most.valuable.available.asset
## 4.22% Foreign.Worker
## 2.11% No.of.Credits.at.this.Bank
## 0.78% Type.of.apartment
##
##
## Time: 0.0 secs
#Step 4: Evaluating Model Performance
cred_pred <- predict(credit_model, credit_test)
CrossTable(credit_test$Creditability, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | Predicted Creditability
## Actual Creditability | 0 | 1 | Row Total |
## ---------------------|-----------|-----------|-----------|
## 0 | 8 | 14 | 22 |
## | 0.080 | 0.140 | |
## ---------------------|-----------|-----------|-----------|
## 1 | 17 | 61 | 78 |
## | 0.170 | 0.610 | |
## ---------------------|-----------|-----------|-----------|
## Column Total | 25 | 75 | 100 |
## ---------------------|-----------|-----------|-----------|
##
##
#14 applicants with bad credit (0) were predicted as creditworthy, and 17 creditworthy applicants were predicted as bad
#Which of these counts as the Type I vs Type II error depends on which class is treated as positive
conf_mat <- table(cred_pred, credit_test$Creditability)
accuracy <- sum(diag(conf_mat))/sum(conf_mat)*100
accuracy
## [1] 69
#Accuracy is 69% (8 + 61 correct of 100) and can be improved; one option is boosting (sketch below)
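#C5.0 supports adaptive boosting through its trials argument. A minimal sketch,
#assuming 10 boosting iterations as a starting point (credit_boost and
#boost_pred are illustrative names):
credit_boost <- C5.0(x = credit_train[-1], y = credit_train$Creditability, trials = 10)
boost_pred <- predict(credit_boost, credit_test)
CrossTable(credit_test$Creditability, boost_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))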
Method 2: Support Vector Machines
#Step 1: Collecting the Data
letters <- read.csv("letterdata.csv")
str(letters)
## 'data.frame': 20000 obs. of 17 variables:
## $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
## $ xbox : int 2 5 4 7 2 4 4 1 2 11 ...
## $ ybox : int 8 12 11 11 1 11 2 1 2 15 ...
## $ width : int 3 3 6 6 3 5 5 3 4 13 ...
## $ height: int 5 7 8 6 1 8 4 2 4 9 ...
## $ onpix : int 1 2 6 3 1 3 4 1 2 7 ...
## $ xbar : int 8 10 10 5 8 8 8 8 10 13 ...
## $ ybar : int 13 5 6 9 6 8 7 2 6 2 ...
## $ x2bar : int 0 5 2 4 6 6 6 2 2 6 ...
## $ y2bar : int 6 4 6 6 6 9 6 2 6 2 ...
## $ xybar : int 6 13 10 4 6 5 7 8 12 12 ...
## $ x2ybar: int 10 3 3 4 5 6 6 2 4 1 ...
## $ xy2bar: int 8 9 7 10 9 6 6 8 8 9 ...
## $ xedge : int 0 2 3 6 1 0 2 1 1 8 ...
## $ xedgey: int 8 8 7 10 7 8 8 6 6 1 ...
## $ yedge : int 0 4 3 2 5 9 7 2 1 1 ...
## $ yedgex: int 8 10 9 8 10 7 10 7 7 8 ...
#Step 2: Preparing the Data
letters_train <- letters[1:18000, ]
letters_test <- letters[18001:20000, ]
#Step 3: Training a Model on the Data
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "vanilladot")
## Setting default kernel parameters
letter_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 7886
##
## Objective Function Value : -15.3458 -21.3403 -25.7672 -6.8685 -8.8812 -35.9555 -59.5883 -18.1975 -65.6075 -41.5654 -18.8559 -39.3558 -36.9961 -60.3052 -15.1694 -42.144 -35.0941 -19.4069 -15.8234 -38.6718 -33.3013 -8.5298 -12.4387 -38.2194 -14.3682 -9.5508 -165.7154 -53.2778 -79.2163 -134.5053 -184.4809 -58.9285 -46.3252 -81.004 -28.1341 -29.6955 -27.5983 -38.1764 -47.2889 -137.0497 -208.1396 -239.2616 -23.8945 -10.9655 -64.228 -12.2139 -55.7818 -10.8001 -21.2407 -11.1795 -121.5639 -33.2229 -267.3926 -81.0708 -9.4937 -4.6577 -161.5171 -86.7114 -20.9146 -16.8272 -86.6582 -16.7205 -30.3036 -20.0054 -26.2331 -29.9289 -56.1072 -11.6335 -5.2564 -14.8153 -4.983 -4.8171 -8.5044 -43.2267 -55.9 -214.755 -47.0748 -49.6539 -50.2278 -18.3767 -19.1813 -97.6132 -113.6502 -42.4112 -32.5859 -127.4807 -33.7418 -30.7568 -40.0953 -18.6792 -5.4826 -49.3916 -10.6142 -20.0286 -63.8287 -183.8297 -57.0671 -43.3721 -35.2783 -85.4451 -145.9585 -11.8002 -6.1194 -12.5323 -33.5245 -155.2248 -57.2602 -194.0785 -111.0155 -10.8207 -16.7926 -3.7766 -77.3561 -7.9004 -106.5759 -52.523 -107.0402 -78.0148 -74.4773 -24.8166 -13.2372 -7.8706 -27.2788 -13.2342 -280.2869 -32.7288 -25.9531 -149.5447 -153.8495 -10.0146 -40.8917 -6.7333 -65.2053 -72.818 -35.1252 -246.7046 -38.0738 -16.9126 -158.18 -184.0021 -50.8427 -28.7686 -164.5969 -97.8359 -386.1426 -160.3188 -181.8759 -38.3648 -37.2272 -60.116 -28.2074 -53.7383 -7.8729 -12.3159 -37.8942 -72.6434 -211.8342 -58.5023 -105.1605 -176.7259 -685.8994 -142.8147 -159.635 -366.9437 -37.6409 -73.1357 -175.1906 -131.2833 -41.1464 -77.8404 -57.8131 -8.6365 -251.3728 -14.0836 -36.5144 -2.2292 -6.1598 -16.8011 -26.5165 -67.19 -21.3366 -221.4815 -22.9219 -4.2616 -4.7901 -0.8263 -134.7538 -8.8843 -83.1109 -23.1019 -14.4251 -5.7337 -17.5244 -29.7925 -23.9243 -88.9084 -28.6719 -106.0564 -16.4981 -10.6486 -7.9315 -1.5742 -91.1706 -7.3819 -118.2628 -117.5543 -48.5606 -26.6093 -71.2968 -30.4913 -63.5712 -279.2921 -46.3025 -50.4912 -37.9431 -21.5243 -11.6202 -134.9023 -7.516 -5.8131 -10.1595 -13.6329 -27.0293 -25.7282 -151.8511 -39.0524 -105.4861 -34.2434 -15.7051 -10.2304 -3.6687 -98.2094 -7.4666 -15.2668 -75.1283 -116.5382 -16.6429 -14.9215 -55.1062 -3.0636 -8.4262 -93.6829 -38.1162 -123.1859 -4.9078 -9.1612 -1.3077 -102.9021 -23.1138 -8.5262 -57.2623 -3.4297 -20.9579 -78.2019 -50.3741 -62.3531 -6.4908 -21.9308 -2.3736 -84.3835 -126.3997 -114.8723 -26.4109 -21.5589 -61.6405 -34.9162 -66.3243 -25.1148 -6.7203 -4.6695 -65.3518 -39.7924 -67.3505 -36.2154 -10.9031 -62.2195 -14.9491 -24.3238 -65.0847 -4.9657 -64.2797 -278.2873 -14.6902 -13.9198 -18.2059 -9.8972 -78.2645 -17.454 -49.5929 -55.7786 -28.7673 -15.9476 -47.531 -17.4379 -71.0516 -5.6899 -6.2519 -97.5508 -3.8196 -7.0502 -1.1238 -147.6952 -28.2018 -414.2586 -32.3275 -35.1191 -4.9605 -90.2307 -151.3409 -90.0329 -27.9491 -42.4688 -12.5118 -26.4828 -2.0045 -62.195 -9.1662 -178.4616 -1.9406 -1.9871 -11.3982 -0.5214 -29.6136 -35.0449 -6.7569
## Training error : 0.1335
#Step 4: Evaluating Model Performance
letter_predictions <- predict(letter_classifier, letters_test)
table(letter_predictions, letters_test$letter)
##
## letter_predictions A B C D E F G H I J K L M N O P Q R
## A 73 0 0 0 0 0 0 0 0 1 0 0 0 0 3 0 4 0
## B 0 61 0 3 2 0 1 1 0 0 1 1 0 0 0 2 0 1
## C 0 0 64 0 2 0 4 2 1 0 1 2 0 0 1 0 0 0
## D 2 1 0 67 0 0 1 3 3 2 1 2 0 3 4 2 1 2
## E 0 0 1 0 64 1 1 0 0 0 2 2 0 0 0 0 2 0
## F 0 0 0 0 0 70 1 1 4 0 0 0 0 0 0 5 1 0
## G 1 1 2 1 3 2 68 1 0 0 0 1 0 0 0 0 4 1
## H 0 0 0 1 0 1 0 46 0 2 3 1 1 1 9 0 0 5
## I 0 0 0 0 0 0 0 0 65 3 0 0 0 0 0 0 0 0
## J 0 1 0 0 0 1 0 0 3 61 0 0 0 0 1 0 0 0
## K 0 1 4 0 0 0 0 5 0 0 56 0 0 2 0 0 0 4
## L 0 0 0 0 1 0 0 1 0 0 0 63 0 0 0 0 0 0
## M 0 0 1 0 0 0 1 0 0 0 0 0 70 2 0 0 0 0
## N 0 0 0 0 0 0 0 0 0 0 0 0 0 77 0 0 0 1
## O 0 0 1 1 0 0 0 1 0 1 0 0 0 0 49 1 2 0
## P 0 0 0 0 0 3 0 0 0 0 0 0 0 0 2 69 0 0
## Q 0 0 0 0 0 0 3 1 0 0 0 2 0 0 2 1 52 0
## R 0 4 0 0 1 0 0 3 0 0 3 0 0 0 1 0 0 64
## S 0 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 6 0
## T 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0
## U 0 0 2 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
## W 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## X 0 1 0 0 1 0 0 1 0 0 1 4 0 0 0 0 0 1
## Y 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0
## Z 1 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0
##
## letter_predictions S T U V W X Y Z
## A 0 1 2 0 1 0 0 0
## B 3 0 0 0 0 0 0 0
## C 0 0 0 0 0 0 0 0
## D 0 0 0 0 0 0 1 0
## E 6 0 0 0 0 1 0 0
## F 2 0 0 1 0 0 2 0
## G 3 2 0 0 0 0 0 0
## H 0 3 0 2 0 0 1 0
## I 2 0 0 0 0 2 1 0
## J 1 0 0 0 0 1 0 4
## K 0 1 2 0 0 4 0 0
## L 0 0 0 0 0 0 0 0
## M 0 0 1 0 6 0 0 0
## N 0 0 1 0 2 0 0 0
## O 0 0 1 0 0 0 0 0
## P 0 0 0 0 0 0 1 0
## Q 1 0 0 0 0 0 0 0
## R 0 1 0 1 0 0 0 0
## S 47 1 0 0 0 1 0 6
## T 1 83 1 0 0 0 2 2
## U 0 0 83 0 0 0 0 0
## V 0 0 0 64 1 0 1 0
## W 0 0 0 3 59 0 0 0
## X 0 0 0 0 0 76 1 0
## Y 0 1 0 0 0 1 58 0
## Z 5 1 0 0 0 0 0 70
agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE TRUE
## 321 1679
#The classification was correct in 1679 out of 2000 test records
conf_mat <- table(letter_predictions, letters_test$letter)
accuracy <- sum(diag(conf_mat))/sum(conf_mat)*100
accuracy
## [1] 83.95
#Accuracy is a respectable 84% for a linear kernel; a kernel swap (sketch below) may do better
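#A possible next step: replace the linear kernel with a Gaussian one, which
#often lifts accuracy on this data set. A sketch using kernlab's built-in
#"rbfdot" kernel (letter_classifier_rbf and rbf_predictions are illustrative
#names):
letter_classifier_rbf <- ksvm(letter ~ ., data = letters_train, kernel = "rbfdot")
rbf_predictions <- predict(letter_classifier_rbf, letters_test)
mean(rbf_predictions == letters_test$letter) #proportion of correct predictions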
Method 3: Adding Regression to Trees
#Step 1: Collecting the Data
wine <- read.csv("whitewines.csv")
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
## $ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
## $ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
## $ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
## $ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
## $ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
## $ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
## $ density : num 0.993 0.999 0.995 0.991 0.993 ...
## $ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
## $ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
## $ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
## $ quality : int 5 6 6 4 6 6 6 6 6 7 ...
#Step 2: Preparing the Data
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
#Step 3: Training a Model on the Data
m.rpart <- rpart(quality ~ ., data=wine_train)
m.rpart
## n= 3750
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 3750 2945.53200 5.870933
## 2) alcohol< 10.85 2372 1418.86100 5.604975
## 4) volatile.acidity>=0.2275 1611 821.30730 5.432030
## 8) volatile.acidity>=0.3025 688 278.97670 5.255814 *
## 9) volatile.acidity< 0.3025 923 505.04230 5.563380 *
## 5) volatile.acidity< 0.2275 761 447.36400 5.971091 *
## 3) alcohol>=10.85 1378 1070.08200 6.328737
## 6) free.sulfur.dioxide< 10.5 84 95.55952 5.369048 *
## 7) free.sulfur.dioxide>=10.5 1294 892.13600 6.391036
## 14) alcohol< 11.76667 629 430.11130 6.173291
## 28) volatile.acidity>=0.465 11 10.72727 4.545455 *
## 29) volatile.acidity< 0.465 618 389.71680 6.202265 *
## 15) alcohol>=11.76667 665 403.99400 6.596992 *
rpart.plot(m.rpart, digits=3)

rpart.plot(m.rpart, digits=4, fallen.leaves = TRUE, type = 3, extra = 101)

#Step 4: Evaluating Model Performance
p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.545 5.563 5.971 5.893 6.202 6.597
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.901 6.000 9.000
cor(p.rpart, wine_test$quality)
## [1] 0.5369525
#The predictions span a much narrower range (4.5 to 6.6) than the actual scores (3 to 9), and a correlation of 0.54 between them is only moderate
conf_mat <- table(p.rpart, wine_test$quality)
accuracy <- sum(diag(conf_mat))/sum(conf_mat)*100
accuracy
## [1] 18.37979
#The 18% comes from diag() on a table whose rows (the tree's few distinct predicted values) and columns (integer quality scores) do not correspond, so exact-match "accuracy" is not a meaningful metric for a numeric target; see the MAE sketch below
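#For a numeric target, mean absolute error is a more informative measure than
#exact-match accuracy. A minimal sketch; MAE here is a hand-rolled helper, not
#from any loaded package:
MAE <- function(actual, predicted) { mean(abs(actual - predicted)) }
MAE(wine_test$quality, p.rpart)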
Method 4a: News Popularity using Tree-based Classification
#Step 1: Collecting the Data
news <- read.csv("OnlineNewsPopularity.csv")
newsShort <- news[, c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")]
#Recode shares as a binary popularity label: "yes" if at least 1400 shares, else "no"
newsShort$popular <- ifelse(newsShort$shares >= 1400, "yes", "no")
newsShort$shares <- newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
#Step 2: Preparing the Data
set.seed(12345)
#Note: order(runif(10000)) shuffles only the first 10,000 of the 39,644 rows; the remainder are discarded
news_rand <- newsShort[order(runif(10000)), ]
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]
prop.table(table(news_train$shares))
##
## no yes
## 0.4308889 0.5691111
prop.table(table(news_test$shares))
##
## no yes
## 0.414 0.586
#Step 3: Training a Model on the Data
news_model <- C5.0(x=news_train[-1], y=news_train$shares)
news_model
##
## Call:
## C5.0.default(x = news_train[-1], y = news_train$shares)
##
## Classification Tree
## Number of samples: 9000
## Number of predictors: 17
##
## Tree size: 2
##
## Non-standard options: attempt to group attributes
summary(news_model)
##
## Call:
## C5.0.default(x = news_train[-1], y = news_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Sat Oct 05 12:39:54 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 9000 cases (18 attributes) from undefined.data
##
## Decision tree:
##
## shares = no: no (3878)
## shares = yes: yes (5122)
##
##
## Evaluation on training data (9000 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 2 0( 0.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 3878 (a): class no
## 5122 (b): class yes
##
##
## Attribute usage:
##
## 100.00% shares
##
##
## Time: 0.1 secs
#Step 4: Evaluating Model Performance
news_pred <- predict(news_model, news_test)
CrossTable(news_test$shares, news_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Shares', 'Predicted Shares'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | Predicted Shares
## Actual Shares | no | yes | Row Total |
## --------------|-----------|-----------|-----------|
## no | 414 | 0 | 414 |
## | 0.414 | 0.000 | |
## --------------|-----------|-----------|-----------|
## yes | 0 | 586 | 586 |
## | 0.000 | 0.586 | |
## --------------|-----------|-----------|-----------|
## Column Total | 414 | 586 | 1000 |
## --------------|-----------|-----------|-----------|
##
##
#0 false negatives and 0 false positives: a perfect confusion matrix that should raise suspicion
conf_mat <- table(news_pred, news_test$shares)
accuracy <- sum(diag(conf_mat))/sum(conf_mat)*100
accuracy
## [1] 100
#Accuracy is 100% only because of target leakage: news_train[-1] drops just the first column, so the recoded shares (and popular) columns remain among the predictors and the tree simply splits on shares, as the attribute usage above shows; see the sketch below
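#A sketch of a leakage-free fit: drop the target-derived columns before
#training, so the tree must learn from the real predictors. Accuracy will fall
#to a realistic level once the model can no longer see the answer
#(news_predictors, news_model2, and news_pred2 are illustrative names):
news_predictors <- news_train[, !(names(news_train) %in% c("shares", "popular"))]
news_model2 <- C5.0(x = news_predictors, y = news_train$shares)
news_pred2 <- predict(news_model2, news_test)
mean(news_pred2 == news_test$shares) #realistic accuracy estimate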
Method 4b: News Popularity using Support Vector Machines
#Step 1: Collecting the Data
news <- read.csv("OnlineNewsPopularity.csv")
newsShort <- news[, c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")]
#Recode shares as a binary popularity label: "yes" if at least 1400 shares, else "no"
newsShort$popular <- ifelse(newsShort$shares >= 1400, "yes", "no")
newsShort$shares <- newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
#Step 2: Preparing the Data
#Note: rows are taken in file order here; a shuffled or stratified split would be safer
news_train <- newsShort[1:9000, ]
news_test <- newsShort[9001:10000, ]
#Step 3: Training a Model on the Data
news_classifier <- ksvm(shares ~ ., data = news_train, kernel = "vanilladot")
## Setting default kernel parameters
news_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 78
##
## Objective Function Value : -1
## Training error : 0
#Step 4: Evaluating Model Performance
news_pred <- predict(news_classifier, news_test)
table(news_pred, news_test$shares)
##
## news_pred no yes
## no 547 0
## yes 0 453
agreement <- news_pred == news_test$shares
table(agreement)
## agreement
## TRUE
## 1000
#The classification was correct in all 1000 test records, again a sign of leakage rather than skill
conf_mat <- table(news_pred, news_test$shares)
accuracy <- sum(diag(conf_mat))/sum(conf_mat)*100
accuracy
## [1] 100
#Accuracy is 100% only because the popular column, an exact copy of the target, is included as a predictor by shares ~ .; see the sketch below
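#A sketch of a leakage-free fit for the SVM: exclude the duplicated label from
#the formula so only the real predictors remain (news_classifier2 and
#news_pred2 are illustrative names):
news_classifier2 <- ksvm(shares ~ . - popular, data = news_train, kernel = "vanilladot")
news_pred2 <- predict(news_classifier2, news_test)
mean(news_pred2 == news_test$shares) #realistic accuracy estimate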