Method 1: Tree-Based classcification

Step 1: Collecting the data

credit <- read.csv("credit.csv")
str(credit)

## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...

step2: exploring the data

summary(credit$Credit.Amount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

table(credit$Creditability)

## 
##   0   1 
## 300 700

#Before dividing data to train and test set, we need to randomize the data

set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
summary(credit$ Credit.Amount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]

prop.table(table(credit_train$Creditability))

## 
##         0         1 
## 0.3088889 0.6911111

prop.table(table(credit_test$Creditability))

## 
##    0    1 
## 0.22 0.78

Step3: Training a model on the data

if (!require("C50")) {
install.packages("C50")
library(C50)
}

## Loading required package: C50

library(C50)
credit_train$Creditability <- as.factor(credit_train$Creditability)
credit_test$Creditability <- as.factor(credit_test$Creditability)

#now design a model in which its input is credit_model

credit_model <- C5.0(x = credit_train[-1], y = credit_train$Creditability)
summary(credit_model)

## 
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Dec 15 15:05:48 2019
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 900 cases (21 attributes) from undefined.data
## 
## Decision tree:
## 
## Account.Balance > 2:
## :...Concurrent.Credits > 2:
## :   :...Age..years. > 33: 1 (179/11)
## :   :   Age..years. <= 33:
## :   :   :...Credit.Amount > 6681:
## :   :       :...Length.of.current.employment <= 2: 0 (4)
## :   :       :   Length.of.current.employment > 2:
## :   :       :   :...Payment.Status.of.Previous.Credit <= 3: 1 (4)
## :   :       :       Payment.Status.of.Previous.Credit > 3: 0 (3/1)
## :   :       Credit.Amount <= 6681:
## :   :       :...Occupation > 2:
## :   :           :...Occupation <= 3: 1 (120/12)
## :   :           :   Occupation > 3:
## :   :           :   :...Duration.of.Credit..month. <= 33: 1 (9)
## :   :           :       Duration.of.Credit..month. > 33: 0 (3)
## :   :           Occupation <= 2:
## :   :           :...No.of.Credits.at.this.Bank > 1: 1 (6)
## :   :               No.of.Credits.at.this.Bank <= 1:
## :   :               :...Most.valuable.available.asset > 1: 0 (3)
## :   :                   Most.valuable.available.asset <= 1:
## :   :                   :...Credit.Amount <= 1987: 1 (8/1)
## :   :                       Credit.Amount > 1987: 0 (2)
## :   Concurrent.Credits <= 2:
## :   :...Guarantors > 1: 1 (4)
## :       Guarantors <= 1:
## :       :...Purpose <= 0:
## :           :...Most.valuable.available.asset <= 2: 0 (5)
## :           :   Most.valuable.available.asset > 2:
## :           :   :...No.of.dependents <= 1: 1 (7/1)
## :           :       No.of.dependents > 1: 0 (2)
## :           Purpose > 0:
## :           :...Purpose <= 4: 1 (35/2)
## :               Purpose > 4:
## :               :...Length.of.current.employment <= 2: 0 (4)
## :                   Length.of.current.employment > 2:
## :                   :...No.of.dependents > 1: 0 (3/1)
## :                       No.of.dependents <= 1:
## :                       :...Length.of.current.employment > 3: 1 (4)
## :                           Length.of.current.employment <= 3:
## :                           :...Instalment.per.cent <= 2: 1 (2)
## :                               Instalment.per.cent > 2: 0 (2)
## Account.Balance <= 2:
## :...Payment.Status.of.Previous.Credit <= 1:
##     :...Value.Savings.Stocks <= 2: 0 (49/10)
##     :   Value.Savings.Stocks > 2:
##     :   :...Credit.Amount <= 2064: 0 (3)
##     :       Credit.Amount > 2064: 1 (9/1)
##     Payment.Status.of.Previous.Credit > 1:
##     :...Credit.Amount > 7980:
##         :...Value.Savings.Stocks > 4:
##         :   :...Payment.Status.of.Previous.Credit <= 2: 0 (4/1)
##         :   :   Payment.Status.of.Previous.Credit > 2: 1 (3)
##         :   Value.Savings.Stocks <= 4:
##         :   :...Account.Balance > 1: 0 (15)
##         :       Account.Balance <= 1:
##         :       :...Concurrent.Credits <= 2: 0 (2)
##         :           Concurrent.Credits > 2:
##         :           :...Credit.Amount <= 10297: 0 (6)
##         :               Credit.Amount > 10297: 1 (3)
##         Credit.Amount <= 7980:
##         :...Duration.of.Credit..month. <= 11:
##             :...Occupation > 3:
##             :   :...Concurrent.Credits <= 2: 1 (3)
##             :   :   Concurrent.Credits > 2:
##             :   :   :...Payment.Status.of.Previous.Credit <= 2: 1 (4/1)
##             :   :       Payment.Status.of.Previous.Credit > 2: 0 (3)
##             :   Occupation <= 3:
##             :   :...Age..years. > 32: 1 (34)
##             :       Age..years. <= 32:
##             :       :...Most.valuable.available.asset <= 1: 1 (13/1)
##             :           Most.valuable.available.asset > 1:
##             :           :...Instalment.per.cent <= 3: 1 (6/1)
##             :               Instalment.per.cent > 3: 0 (6/1)
##             Duration.of.Credit..month. > 11:
##             :...Duration.of.Credit..month. > 36:
##                 :...Length.of.current.employment <= 1: 1 (3)
##                 :   Length.of.current.employment > 1:
##                 :   :...No.of.dependents > 1: 1 (5/1)
##                 :       No.of.dependents <= 1:
##                 :       :...Duration.in.Current.address <= 1: 1 (4/1)
##                 :           Duration.in.Current.address > 1: 0 (23)
##                 Duration.of.Credit..month. <= 36:
##                 :...Guarantors > 2:
##                     :...Foreign.Worker <= 1: 1 (23/1)
##                     :   Foreign.Worker > 1: 0 (2)
##                     Guarantors <= 2:
##                     :...Credit.Amount <= 1381:
##                         :...Telephone > 1:
##                         :   :...Sex...Marital.Status > 3: 0 (2)
##                         :   :   Sex...Marital.Status <= 3:
##                         :   :   :...Duration.of.Credit..month. <= 16: 1 (7)
##                         :   :       Duration.of.Credit..month. > 16: 0 (3/1)
##                         :   Telephone <= 1:
##                         :   :...Concurrent.Credits <= 2: 0 (9)
##                         :       Concurrent.Credits > 2:
##                         :       :...Account.Balance <= 1: 0 (29/6)
##                         :           Account.Balance > 1: [S1]
##                         Credit.Amount > 1381:
##                         :...Guarantors > 1:
##                             :...Foreign.Worker > 1: 1 (2)
##                             :   Foreign.Worker <= 1:
##                             :   :...Instalment.per.cent > 2: 0 (5)
##                             :       Instalment.per.cent <= 2: [S2]
##                             Guarantors <= 1:
##                             :...Payment.Status.of.Previous.Credit > 3:
##                                 :...Age..years. > 33: 1 (22)
##                                 :   Age..years. <= 33:
##                                 :   :...Purpose > 3: 1 (7)
##                                 :       Purpose <= 3: [S3]
##                                 Payment.Status.of.Previous.Credit <= 3:
##                                 :...Instalment.per.cent <= 2:
##                                     :...No.of.dependents > 1:
##                                     :   :...Purpose <= 0: 1 (2)
##                                     :   :   Purpose > 0: 0 (3)
##                                     :   No.of.dependents <= 1: [S4]
##                                     Instalment.per.cent > 2:
##                                     :...Concurrent.Credits <= 1: 1 (8/1)
##                                         Concurrent.Credits > 1:
##                                         :...Sex...Marital.Status <= 1: 0 (6/1)
##                                             Sex...Marital.Status > 1:
##                                             :...Account.Balance > 1: [S5]
##                                                 Account.Balance <= 1: [S6]
## 
## SubTree [S1]
## 
## Duration.in.Current.address > 3: 1 (8/1)
## Duration.in.Current.address <= 3:
## :...Purpose > 2: 0 (5)
##     Purpose <= 2:
##     :...Type.of.apartment <= 1: 0 (2)
##         Type.of.apartment > 1: 1 (5/1)
## 
## SubTree [S2]
## 
## Duration.in.Current.address <= 2: 1 (2)
## Duration.in.Current.address > 2: 0 (4/1)
## 
## SubTree [S3]
## 
## Duration.of.Credit..month. <= 16: 1 (4)
## Duration.of.Credit..month. > 16:
## :...Length.of.current.employment <= 3: 0 (8)
##     Length.of.current.employment > 3: 1 (6/1)
## 
## SubTree [S4]
## 
## Duration.in.Current.address > 1: 1 (41/6)
## Duration.in.Current.address <= 1:
## :...Value.Savings.Stocks > 3: 0 (2)
##     Value.Savings.Stocks <= 3:
##     :...Length.of.current.employment > 2: 1 (4)
##         Length.of.current.employment <= 2:
##         :...Instalment.per.cent <= 1: 0 (3)
##             Instalment.per.cent > 1: 1 (3/1)
## 
## SubTree [S5]
## 
## Sex...Marital.Status > 3: 0 (2)
## Sex...Marital.Status <= 3:
## :...Length.of.current.employment > 3: 1 (10)
##     Length.of.current.employment <= 3:
##     :...Duration.in.Current.address <= 1: 1 (5)
##         Duration.in.Current.address > 1:
##         :...Length.of.current.employment <= 2: 0 (4)
##             Length.of.current.employment > 2:
##             :...Value.Savings.Stocks <= 1: 0 (3)
##                 Value.Savings.Stocks > 1: 1 (5)
## 
## SubTree [S6]
## 
## Payment.Status.of.Previous.Credit > 2: 0 (3)
## Payment.Status.of.Previous.Credit <= 2:
## :...Purpose <= 0: 0 (7/1)
##     Purpose > 0:
##     :...Most.valuable.available.asset <= 1: 0 (5/1)
##         Most.valuable.available.asset > 1:
##         :...Sex...Marital.Status <= 2: 1 (6)
##             Sex...Marital.Status > 2:
##             :...Length.of.current.employment > 4: 0 (5)
##                 Length.of.current.employment <= 4:
##                 :...Telephone > 1: 1 (3)
##                     Telephone <= 1:
##                     :...Length.of.current.employment <= 2: 0 (2)
##                         Length.of.current.employment > 2:
##                         :...Age..years. <= 28: 1 (4)
##                             Age..years. > 28: 0 (2)
## 
## 
## Evaluation on training data (900 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      85   70( 7.8%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     233    45    (a): class 0
##      25   597    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% Account.Balance
##   67.11% Credit.Amount
##   63.11% Concurrent.Credits
##   55.33% Payment.Status.of.Previous.Credit
##   50.33% Age..years.
##   45.44% Duration.of.Credit..month.
##   40.11% Guarantors
##   24.44% Occupation
##   18.33% Instalment.per.cent
##   15.56% Purpose
##   14.22% Length.of.current.employment
##   13.67% Duration.in.Current.address
##   12.67% Value.Savings.Stocks
##   12.22% No.of.dependents
##    9.33% Sex...Marital.Status
##    9.00% Telephone
##    8.78% Most.valuable.available.asset
##    4.22% Foreign.Worker
##    2.11% No.of.Credits.at.this.Bank
##    0.78% Type.of.apartment
## 
## 
## Time: 0.0 secs

Step 4: Evaluating model performance

cred_pred <- predict(credit_model, credit_test)

# Method 1
if (!require("gmodels")) {
install.packages("gmodels")
library(gmodels)
}

## Loading required package: gmodels

CrossTable(credit_test$Creditability, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                      | Predicted Creditability 
## Actual Creditability |         0 |         1 | Row Total | 
## ---------------------|-----------|-----------|-----------|
##                    0 |         8 |        14 |        22 | 
##                      |     0.080 |     0.140 |           | 
## ---------------------|-----------|-----------|-----------|
##                    1 |        17 |        61 |        78 | 
##                      |     0.170 |     0.610 |           | 
## ---------------------|-----------|-----------|-----------|
##         Column Total |        25 |        75 |       100 | 
## ---------------------|-----------|-----------|-----------|
## 
##

(p <- table(cred_pred, credit_test$Creditability))

##          
## cred_pred  0  1
##         0  8 17
##         1 14 61

(Accuracy <- sum(diag(p))/sum(p)*100)

## [1] 69

# Q1- If you see an accuracy of 100%, what does it mean? Does this mean that we design a perfect model? This is some thing that needs more discussion. Write a few sentences about accuracy of 100%.

# This means that you evaluate your model on a part of your training data, i.e., you are doing in-sample evaluation. In-sample accuracy is a notoriously poor indicator to out-of-sample accuracy, and maximizing in-sample accuracy can lead to overfitting. Therefore, one should always evaluate a model on a true holdout sample that is completely independent of the training data.

Method 2: Random Forest

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

credit_train$Creditability <- as.factor(credit_train$Creditability)
random_model <- randomForest(Creditability ~ . , data= credit_train)
summary(random_model)

##                 Length Class  Mode     
## call               3   -none- call     
## type               1   -none- character
## predicted        900   factor numeric  
## err.rate        1500   -none- numeric  
## confusion          6   -none- numeric  
## votes           1800   matrix numeric  
## oob.times        900   -none- numeric  
## classes            2   -none- character
## importance        20   -none- numeric  
## importanceSD       0   -none- NULL     
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            14   -none- list     
## y                900   factor numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call

cred_pred <- predict(random_model, credit_test)
(p <- table(cred_pred, credit_test$Creditability))

##          
## cred_pred  0  1
##         0 11 10
##         1 11 68

(Accuracy <- sum(diag(p))/sum(p)*100)

## [1] 79

Q2- What are the three most important features in this model.

importance(random_model)

##                                   MeanDecreaseGini
## Account.Balance                          42.599355
## Duration.of.Credit..month.               37.502785
## Payment.Status.of.Previous.Credit        22.563009
## Purpose                                  23.774048
## Credit.Amount                            52.397155
## Value.Savings.Stocks                     19.388385
## Length.of.current.employment             20.221289
## Instalment.per.cent                      16.394636
## Sex...Marital.Status                     13.424449
## Guarantors                                7.475422
## Duration.in.Current.address              15.563685
## Most.valuable.available.asset            17.326842
## Age..years.                              37.377916
## Concurrent.Credits                        8.480725
## Type.of.apartment                         9.595344
## No.of.Credits.at.this.Bank                8.424006
## Occupation                               12.669816
## No.of.dependents                          5.774473
## Telephone                                 7.505291
## Foreign.Worker                            1.746964

Three most imprtant features are Credit Amount at 49.202906, Account Balance at 41.274824, and Age by years at 37.019536.

#Now, Change the random seed to 23458 and find the new accuracy of random forest.

set.seed(23458)
random_model1 <- randomForest(Creditability ~ (Credit.Amount + Account.Balance + Age..years.), data= credit_train)
cred_pred1 <- predict(random_model1, credit_test)
(p1 <- table(cred_pred1, credit_test$Creditability))

##           
## cred_pred1  0  1
##          0  5  4
##          1 17 74

(Accuracy <- sum(diag(p1))/sum(p1)*100)

## [1] 79

accuracy stays the same

#Alternative solution & Visualize the model

if (!require("rpart")) {
install.packages("rpart")
library(rpart)
}

## Loading required package: rpart

if (!require("ggplot2")) {
install.packages("ggplot2")
library(ggplot2)
}

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

if (!require("rpart.plot")) {
install.packages("rpart.plot")
library(rpart.plot)
}

## Loading required package: rpart.plot

DT <- rpart(Creditability ~ . , data= credit)
summary(DT)

## Call:
## rpart(formula = Creditability ~ ., data = credit)
##   n= 1000 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.11407052      0 1.0000000 1.0024350 0.02767435
## 2 0.03050152      1 0.8859295 0.8900108 0.02811237
## 3 0.02390597      2 0.8554280 0.8962677 0.03140547
## 4 0.01755837      3 0.8315220 0.8819218 0.03209717
## 5 0.01270887      4 0.8139636 0.8560143 0.03325238
## 6 0.01208012      8 0.7631281 0.8646798 0.03497459
## 7 0.01196152      9 0.7510480 0.8673536 0.03527687
## 8 0.01000000     10 0.7390865 0.8684636 0.03596495
## 
## Variable importance
##                   Account.Balance        Duration.of.Credit..month. 
##                                36                                15 
##                     Credit.Amount Payment.Status.of.Previous.Credit 
##                                13                                10 
##              Value.Savings.Stocks     Most.valuable.available.asset 
##                                10                                 7 
##                 Type.of.apartment                       Age..years. 
##                                 2                                 2 
##      Length.of.current.employment                        Occupation 
##                                 2                                 1 
##        No.of.Credits.at.this.Bank                           Purpose 
##                                 1                                 1 
## 
## Node number 1: 1000 observations,    complexity param=0.1140705
##   mean=0.7, MSE=0.21 
##   left son=2 (543 obs) right son=3 (457 obs)
##   Primary splits:
##       Account.Balance                   < 2.5     to the left,  improve=0.11407050, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  improve=0.04062411, (0 missing)
##       Value.Savings.Stocks              < 2.5     to the left,  improve=0.03525338, (0 missing)
##       Duration.of.Credit..month.        < 34.5    to the right, improve=0.03243225, (0 missing)
##       Credit.Amount                     < 3909.5  to the right, improve=0.02639923, (0 missing)
##   Surrogate splits:
##       Value.Savings.Stocks              < 2.5     to the left,  agree=0.611, adj=0.149, (0 split)
##       Payment.Status.of.Previous.Credit < 3.5     to the left,  agree=0.592, adj=0.107, (0 split)
##       Length.of.current.employment      < 4.5     to the left,  agree=0.554, adj=0.024, (0 split)
##       Age..years.                       < 30.5    to the left,  agree=0.554, adj=0.024, (0 split)
##       No.of.Credits.at.this.Bank        < 1.5     to the left,  agree=0.554, adj=0.024, (0 split)
## 
## Node number 2: 543 observations,    complexity param=0.03050152
##   mean=0.558011, MSE=0.2466347 
##   left son=4 (237 obs) right son=5 (306 obs)
##   Primary splits:
##       Duration.of.Credit..month.        < 22.5    to the right, improve=0.04782850, (0 missing)
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  improve=0.03604240, (0 missing)
##       Most.valuable.available.asset     < 1.5     to the right, improve=0.03427861, (0 missing)
##       Value.Savings.Stocks              < 2.5     to the left,  improve=0.03319374, (0 missing)
##       Credit.Amount                     < 8079    to the right, improve=0.02464583, (0 missing)
##   Surrogate splits:
##       Credit.Amount                 < 2805.5  to the right, agree=0.748, adj=0.422, (0 split)
##       Most.valuable.available.asset < 2.5     to the right, agree=0.646, adj=0.190, (0 split)
##       Type.of.apartment             < 2.5     to the right, agree=0.606, adj=0.097, (0 split)
##       Purpose                       < 8.5     to the right, agree=0.604, adj=0.093, (0 split)
##       Occupation                    < 3.5     to the right, agree=0.595, adj=0.072, (0 split)
## 
## Node number 3: 457 observations
##   mean=0.868709, MSE=0.1140537 
## 
## Node number 4: 237 observations,    complexity param=0.01755837
##   mean=0.4345992, MSE=0.2457227 
##   left son=8 (196 obs) right son=9 (41 obs)
##   Primary splits:
##       Value.Savings.Stocks       < 3.5     to the left,  improve=0.06331546, (0 missing)
##       Credit.Amount              < 1381.5  to the left,  improve=0.02824112, (0 missing)
##       Instalment.per.cent        < 2.5     to the right, improve=0.02633681, (0 missing)
##       Duration.of.Credit..month. < 43.5    to the right, improve=0.02202167, (0 missing)
##       Purpose                    < 0.5     to the left,  improve=0.02025099, (0 missing)
## 
## Node number 5: 306 observations,    complexity param=0.02390597
##   mean=0.6535948, MSE=0.2264086 
##   left son=10 (28 obs) right son=11 (278 obs)
##   Primary splits:
##       Payment.Status.of.Previous.Credit < 1.5     to the left,  improve=0.07246216, (0 missing)
##       Most.valuable.available.asset     < 1.5     to the right, improve=0.04031178, (0 missing)
##       Guarantors                        < 2.5     to the left,  improve=0.02729505, (0 missing)
##       Duration.of.Credit..month.        < 11.5    to the right, improve=0.02718298, (0 missing)
##       Credit.Amount                     < 7491.5  to the right, improve=0.02697302, (0 missing)
## 
## Node number 8: 196 observations,    complexity param=0.01196152
##   mean=0.377551, MSE=0.2350062 
##   left son=16 (36 obs) right son=17 (160 obs)
##   Primary splits:
##       Duration.of.Credit..month.  < 47.5    to the right, improve=0.05453435, (0 missing)
##       Instalment.per.cent         < 2.5     to the right, improve=0.02911869, (0 missing)
##       Credit.Amount               < 11788   to the right, improve=0.02731942, (0 missing)
##       Duration.in.Current.address < 1.5     to the right, improve=0.02154070, (0 missing)
##       Age..years.                 < 60.5    to the left,  improve=0.01787033, (0 missing)
##   Surrogate splits:
##       Credit.Amount < 13319.5 to the right, agree=0.837, adj=0.111, (0 split)
## 
## Node number 9: 41 observations,    complexity param=0.01208012
##   mean=0.7073171, MSE=0.2070196 
##   left son=18 (17 obs) right son=19 (24 obs)
##   Primary splits:
##       Account.Balance                   < 1.5     to the left,  improve=0.29887870, (0 missing)
##       Credit.Amount                     < 2079    to the left,  improve=0.19001440, (0 missing)
##       Payment.Status.of.Previous.Credit < 2.5     to the left,  improve=0.15172410, (0 missing)
##       Purpose                           < 0.5     to the left,  improve=0.09387971, (0 missing)
##       No.of.Credits.at.this.Bank        < 1.5     to the left,  improve=0.05785132, (0 missing)
##   Surrogate splits:
##       Credit.Amount                 < 1548    to the left,  agree=0.732, adj=0.353, (0 split)
##       Length.of.current.employment  < 4.5     to the right, agree=0.683, adj=0.235, (0 split)
##       Most.valuable.available.asset < 2.5     to the left,  agree=0.659, adj=0.176, (0 split)
##       Age..years.                   < 48.5    to the right, agree=0.659, adj=0.176, (0 split)
##       Type.of.apartment             < 1.5     to the left,  agree=0.659, adj=0.176, (0 split)
## 
## Node number 10: 28 observations
##   mean=0.25, MSE=0.1875 
## 
## Node number 11: 278 observations,    complexity param=0.01270887
##   mean=0.6942446, MSE=0.212269 
##   left son=22 (7 obs) right son=23 (271 obs)
##   Primary splits:
##       Credit.Amount                     < 7491.5  to the right, improve=0.03699609, (0 missing)
##       Duration.of.Credit..month.        < 11.5    to the right, improve=0.03254299, (0 missing)
##       Most.valuable.available.asset     < 1.5     to the right, improve=0.03041005, (0 missing)
##       Payment.Status.of.Previous.Credit < 2.5     to the left,  improve=0.02503007, (0 missing)
##       Guarantors                        < 2.5     to the left,  improve=0.02245610, (0 missing)
## 
## Node number 16: 36 observations
##   mean=0.1388889, MSE=0.1195988 
## 
## Node number 17: 160 observations
##   mean=0.43125, MSE=0.2452734 
## 
## Node number 18: 17 observations
##   mean=0.4117647, MSE=0.2422145 
## 
## Node number 19: 24 observations
##   mean=0.9166667, MSE=0.07638889 
## 
## Node number 22: 7 observations
##   mean=0.1428571, MSE=0.122449 
## 
## Node number 23: 271 observations,    complexity param=0.01270887
##   mean=0.7084871, MSE=0.2065331 
##   left son=46 (193 obs) right son=47 (78 obs)
##   Primary splits:
##       Duration.of.Credit..month.    < 11.5    to the right, improve=0.03708564, (0 missing)
##       Credit.Amount                 < 1373    to the left,  improve=0.03368634, (0 missing)
##       Most.valuable.available.asset < 1.5     to the right, improve=0.02387411, (0 missing)
##       Guarantors                    < 2.5     to the left,  improve=0.02052141, (0 missing)
##       Value.Savings.Stocks          < 2.5     to the left,  improve=0.02006311, (0 missing)
##   Surrogate splits:
##       Credit.Amount  < 527.5   to the right, agree=0.742, adj=0.103, (0 split)
##       Age..years.    < 66.5    to the left,  agree=0.720, adj=0.026, (0 split)
##       Foreign.Worker < 1.5     to the left,  agree=0.720, adj=0.026, (0 split)
## 
## Node number 46: 193 observations,    complexity param=0.01270887
##   mean=0.6528497, MSE=0.226637 
##   left son=92 (73 obs) right son=93 (120 obs)
##   Primary splits:
##       Credit.Amount        < 1387.5  to the left,  improve=0.06845664, (0 missing)
##       Account.Balance      < 1.5     to the left,  improve=0.02543376, (0 missing)
##       Guarantors           < 2.5     to the left,  improve=0.02248369, (0 missing)
##       Purpose              < 8.5     to the left,  improve=0.02244828, (0 missing)
##       Value.Savings.Stocks < 2.5     to the left,  improve=0.02166506, (0 missing)
##   Surrogate splits:
##       Instalment.per.cent        < 3.5     to the right, agree=0.658, adj=0.096, (0 split)
##       Occupation                 < 2.5     to the left,  agree=0.658, adj=0.096, (0 split)
##       Age..years.                < 21.5    to the left,  agree=0.653, adj=0.082, (0 split)
##       Duration.of.Credit..month. < 12.5    to the left,  agree=0.648, adj=0.068, (0 split)
##       Sex...Marital.Status       < 3.5     to the right, agree=0.637, adj=0.041, (0 split)
## 
## Node number 47: 78 observations
##   mean=0.8461538, MSE=0.1301775 
## 
## Node number 92: 73 observations,    complexity param=0.01270887
##   mean=0.4931507, MSE=0.2499531 
##   left son=184 (23 obs) right son=185 (50 obs)
##   Primary splits:
##       Most.valuable.available.asset < 2.5     to the right, improve=0.18755450, (0 missing)
##       Guarantors                    < 1.5     to the left,  improve=0.14116680, (0 missing)
##       No.of.Credits.at.this.Bank    < 1.5     to the left,  improve=0.08359529, (0 missing)
##       Purpose                       < 1       to the left,  improve=0.06560191, (0 missing)
##       Duration.in.Current.address   < 3.5     to the left,  improve=0.04374000, (0 missing)
##   Surrogate splits:
##       Type.of.apartment            < 2.5     to the right, agree=0.726, adj=0.130, (0 split)
##       No.of.Credits.at.this.Bank   < 2.5     to the right, agree=0.712, adj=0.087, (0 split)
##       Length.of.current.employment < 1.5     to the left,  agree=0.699, adj=0.043, (0 split)
##       Occupation                   < 1.5     to the left,  agree=0.699, adj=0.043, (0 split)
## 
## Node number 93: 120 observations
##   mean=0.75, MSE=0.1875 
## 
## Node number 184: 23 observations
##   mean=0.173913, MSE=0.1436673 
## 
## Node number 185: 50 observations
##   mean=0.64, MSE=0.2304

Method 3: Adding regression to trees

#Loading the data
wine <- read.csv("whitewines.csv")
View(wine)
str(wine)

## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
##  $ volatile.acidity    : num  0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
##  $ citric.acid         : num  0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
##  $ residual.sugar      : num  1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
##  $ chlorides           : num  0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
##  $ free.sulfur.dioxide : num  6 41 33 11 36 22 33 17 34 40 ...
##  $ total.sulfur.dioxide: num  62 113 123 74 119 95 152 67 90 130 ...
##  $ density             : num  0.993 0.999 0.995 0.991 0.993 ...
##  $ pH                  : num  3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
##  $ sulphates           : num  0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
##  $ alcohol             : num  10.4 8.9 10.1 11.2 10.9 ...
##  $ quality             : int  5 6 6 4 6 6 6 6 6 7 ...

#Exploring and Preparing the Data
wine_train <- wine[1:3750, ] 
wine_test <- wine[3751:4898, ]

#Training a Model on the Data
library(rpart)
m.rpart <- rpart(quality ~ ., data=wine_train)
summary(m.rpart)

## Call:
## rpart(formula = quality ~ ., data = wine_train)
##   n= 3750 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.15501053      0 1.0000000 1.0005637 0.02445885
## 2 0.05098911      1 0.8449895 0.8483850 0.02337511
## 3 0.02796998      2 0.7940004 0.8052179 0.02286010
## 4 0.01970128      3 0.7660304 0.7798982 0.02163249
## 5 0.01265926      4 0.7463291 0.7605653 0.02088461
## 6 0.01007193      5 0.7336698 0.7504350 0.02072098
## 7 0.01000000      6 0.7235979 0.7473579 0.02063472
## 
## Variable importance
##              alcohol              density     volatile.acidity 
##                   34                   21                   15 
##            chlorides total.sulfur.dioxide  free.sulfur.dioxide 
##                   11                    7                    6 
##       residual.sugar            sulphates          citric.acid 
##                    3                    1                    1 
## 
## Node number 1: 3750 observations,    complexity param=0.1550105
##   mean=5.870933, MSE=0.7854751 
##   left son=2 (2372 obs) right son=3 (1378 obs)
##   Primary splits:
##       alcohol              < 10.85    to the left,  improve=0.15501050, (0 missing)
##       density              < 0.992035 to the right, improve=0.10915940, (0 missing)
##       chlorides            < 0.0395   to the right, improve=0.07682258, (0 missing)
##       total.sulfur.dioxide < 158.5    to the right, improve=0.04089663, (0 missing)
##       citric.acid          < 0.235    to the left,  improve=0.03636458, (0 missing)
##   Surrogate splits:
##       density              < 0.991995 to the right, agree=0.869, adj=0.644, (0 split)
##       chlorides            < 0.0375   to the right, agree=0.757, adj=0.339, (0 split)
##       total.sulfur.dioxide < 103.5    to the right, agree=0.690, adj=0.155, (0 split)
##       residual.sugar       < 5.375    to the right, agree=0.667, adj=0.094, (0 split)
##       sulphates            < 0.345    to the right, agree=0.647, adj=0.038, (0 split)
## 
## Node number 2: 2372 observations,    complexity param=0.05098911
##   mean=5.604975, MSE=0.5981709 
##   left son=4 (1611 obs) right son=5 (761 obs)
##   Primary splits:
##       volatile.acidity    < 0.2275   to the right, improve=0.10585250, (0 missing)
##       free.sulfur.dioxide < 13.5     to the left,  improve=0.03390500, (0 missing)
##       citric.acid         < 0.235    to the left,  improve=0.03204075, (0 missing)
##       alcohol             < 10.11667 to the left,  improve=0.03136524, (0 missing)
##       chlorides           < 0.0585   to the right, improve=0.01633599, (0 missing)
##   Surrogate splits:
##       pH                   < 3.485    to the left,  agree=0.694, adj=0.047, (0 split)
##       sulphates            < 0.755    to the left,  agree=0.685, adj=0.020, (0 split)
##       total.sulfur.dioxide < 105.5    to the right, agree=0.683, adj=0.011, (0 split)
##       residual.sugar       < 0.75     to the right, agree=0.681, adj=0.007, (0 split)
##       chlorides            < 0.0285   to the right, agree=0.680, adj=0.003, (0 split)
## 
## Node number 3: 1378 observations,    complexity param=0.02796998
##   mean=6.328737, MSE=0.7765472 
##   left son=6 (84 obs) right son=7 (1294 obs)
##   Primary splits:
##       free.sulfur.dioxide  < 10.5     to the left,  improve=0.07699080, (0 missing)
##       alcohol              < 11.76667 to the left,  improve=0.06210660, (0 missing)
##       total.sulfur.dioxide < 67.5     to the left,  improve=0.04438619, (0 missing)
##       residual.sugar       < 1.375    to the left,  improve=0.02905351, (0 missing)
##       fixed.acidity        < 7.35     to the right, improve=0.02613259, (0 missing)
##   Surrogate splits:
##       total.sulfur.dioxide < 53.5     to the left,  agree=0.952, adj=0.214, (0 split)
##       volatile.acidity     < 0.875    to the right, agree=0.940, adj=0.024, (0 split)
## 
## Node number 4: 1611 observations,    complexity param=0.01265926
##   mean=5.43203, MSE=0.5098121 
##   left son=8 (688 obs) right son=9 (923 obs)
##   Primary splits:
##       volatile.acidity    < 0.3025   to the right, improve=0.04540111, (0 missing)
##       alcohol             < 10.05    to the left,  improve=0.03874403, (0 missing)
##       free.sulfur.dioxide < 13.5     to the left,  improve=0.03338886, (0 missing)
##       chlorides           < 0.0495   to the right, improve=0.02574623, (0 missing)
##       citric.acid         < 0.195    to the left,  improve=0.02327981, (0 missing)
##   Surrogate splits:
##       citric.acid          < 0.215    to the left,  agree=0.633, adj=0.141, (0 split)
##       free.sulfur.dioxide  < 20.5     to the left,  agree=0.600, adj=0.063, (0 split)
##       chlorides            < 0.0595   to the right, agree=0.593, adj=0.047, (0 split)
##       residual.sugar       < 1.15     to the left,  agree=0.583, adj=0.023, (0 split)
##       total.sulfur.dioxide < 219.25   to the right, agree=0.582, adj=0.022, (0 split)
## 
## Node number 5: 761 observations
##   mean=5.971091, MSE=0.5878633 
## 
## Node number 6: 84 observations
##   mean=5.369048, MSE=1.137613 
## 
## Node number 7: 1294 observations,    complexity param=0.01970128
##   mean=6.391036, MSE=0.6894405 
##   left son=14 (629 obs) right son=15 (665 obs)
##   Primary splits:
##       alcohol              < 11.76667 to the left,  improve=0.06504696, (0 missing)
##       chlorides            < 0.0395   to the right, improve=0.02758705, (0 missing)
##       fixed.acidity        < 7.35     to the right, improve=0.02750932, (0 missing)
##       pH                   < 3.055    to the left,  improve=0.02307356, (0 missing)
##       total.sulfur.dioxide < 191.5    to the right, improve=0.02186818, (0 missing)
##   Surrogate splits:
##       density              < 0.990885 to the right, agree=0.720, adj=0.424, (0 split)
##       volatile.acidity     < 0.2675   to the left,  agree=0.637, adj=0.253, (0 split)
##       chlorides            < 0.0365   to the right, agree=0.630, adj=0.238, (0 split)
##       residual.sugar       < 1.475    to the left,  agree=0.575, adj=0.126, (0 split)
##       total.sulfur.dioxide < 128.5    to the right, agree=0.574, adj=0.124, (0 split)
## 
## Node number 8: 688 observations
##   mean=5.255814, MSE=0.4054895 
## 
## Node number 9: 923 observations
##   mean=5.56338, MSE=0.5471747 
## 
## Node number 14: 629 observations,    complexity param=0.01007193
##   mean=6.173291, MSE=0.6838017 
##   left son=28 (11 obs) right son=29 (618 obs)
##   Primary splits:
##       volatile.acidity     < 0.465    to the right, improve=0.06897561, (0 missing)
##       total.sulfur.dioxide < 200      to the right, improve=0.04223066, (0 missing)
##       residual.sugar       < 0.975    to the left,  improve=0.03061714, (0 missing)
##       fixed.acidity        < 7.35     to the right, improve=0.02978501, (0 missing)
##       sulphates            < 0.575    to the left,  improve=0.02165970, (0 missing)
##   Surrogate splits:
##       citric.acid          < 0.045    to the left,  agree=0.986, adj=0.182, (0 split)
##       total.sulfur.dioxide < 279.25   to the right, agree=0.986, adj=0.182, (0 split)
## 
## Node number 15: 665 observations
##   mean=6.596992, MSE=0.6075098 
## 
## Node number 28: 11 observations
##   mean=4.545455, MSE=0.9752066 
## 
## Node number 29: 618 observations
##   mean=6.202265, MSE=0.6306098

rpart.plot(m.rpart, digits=3, type=1)

#Model evaluation
p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.545   5.563   5.971   5.893   6.202   6.597

summary(wine_test$quality)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.901   6.000   9.000

Q3- What is your interpretation about this amount of RMSE?

Asnwer:

if (!require("Metrics")) {
install.packages("Metrics")
library(Metrics)
}

## Loading required package: Metrics

rmse(wine_test$quality, p.rpart)

## [1] 0.7448093

The root-mean-square error is 0.745, the lower RMSE value, the more model fit the observed data, and 0.745 is still high.

Method #4. News Popularity

news <- read.csv("OnlineNewsPopularity_for_R.csv")
str(news)

## 'data.frame':    39644 obs. of  61 variables:
##  $ url                          : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ timedelta                    : num  731 731 731 731 731 731 731 731 731 731 ...
##  $ n_tokens_title               : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content             : num  219 255 211 531 1072 ...
##  $ n_unique_tokens              : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words             : num  1 1 1 1 1 ...
##  $ n_non_stop_unique_tokens     : num  0.815 0.792 0.664 0.666 0.541 ...
##  $ num_hrefs                    : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_self_hrefs               : num  2 1 1 0 19 2 20 20 0 1 ...
##  $ num_imgs                     : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                   : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length         : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                 : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ data_channel_is_lifestyle    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ data_channel_is_entertainment: num  1 0 0 1 0 0 0 0 0 0 ...
##  $ data_channel_is_bus          : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ data_channel_is_socmed       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ data_channel_is_tech         : num  0 0 0 0 1 1 0 1 1 0 ...
##  $ data_channel_is_world        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ kw_min_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_min                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_max                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_min_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_max_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ kw_avg_avg                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ self_reference_min_shares    : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares    : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess   : num  496 0 918 0 3151 ...
##  $ weekday_is_monday            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                       : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                       : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                       : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                       : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                       : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity          : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity    : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words   : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words   : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words          : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words          : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity        : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity        : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity        : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity        : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity        : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity        : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity           : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity     : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity       : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity : num  0.188 0 0 0 0.136 ...
##  $ shares                       : int  593 711 1500 1200 505 855 556 891 3600 710 ...

#minify instances
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)

colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
str(newsShort)

## 'data.frame':    39644 obs. of  17 variables:
##  $ n_tokens_title              : num  12 9 9 9 13 10 8 12 11 10 ...
##  $ n_tokens_content            : num  219 255 211 531 1072 ...
##  $ n_unique_tokens             : num  0.664 0.605 0.575 0.504 0.416 ...
##  $ n_non_stop_words            : num  1 1 1 1 1 ...
##  $ num_hrefs                   : num  4 3 3 9 19 2 21 20 2 4 ...
##  $ num_imgs                    : num  1 1 1 1 20 0 20 20 0 1 ...
##  $ num_videos                  : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ average_token_length        : num  4.68 4.91 4.39 4.4 4.68 ...
##  $ num_keywords                : num  5 4 6 7 7 9 10 9 7 5 ...
##  $ kw_max_max                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ global_sentiment_polarity   : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ avg_positive_polarity       : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ title_subjectivity          : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity    : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity      : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity: num  0.188 0 0 0 0.136 ...
##  $ shares                      : int  593 711 1500 1200 505 855 556 891 3600 710 ...

#Pre-processing
newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
     if(newsShort$shares[i] >= 1400) {
         newsShort$popular[i] = "1"} 
     else {newsShort$popular[i] = "0"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
set.seed(12345)

news_rand <- newsShort[order(runif(10000)), ]
#Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]

prop.table(table(news_train$shares))

## 
##         0         1 
## 0.4308889 0.5691111

prop.table(table(news_test$shares))

## 
##     0     1 
## 0.414 0.586

#Model training
library("C50", lib.loc="~/R/win-library/3.5")
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)

## 
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Dec 15 15:06:06 2019
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 9000 cases (18 attributes) from undefined.data
## 
## Decision tree:
## 
## popular = 0: 0 (3878)
## popular = 1: 1 (5122)
## 
## 
## Evaluation on training data (9000 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       2    0( 0.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##    3878          (a): class 0
##          5122    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% popular
## 
## 
## Time: 0.1 secs

#Evaluate the model
news_pred <- predict(news_model, news_test)
(p <- table(news_pred, news_test$shares))

##          
## news_pred   0   1
##         0 414   0
##         1   0 586

(Accuracy <- sum(diag(p))/sum(p)*100)

## [1] 100

Q4- Try decision tree and random forest and evaluate the model.

Answer: #Step 1: RANDOM FOREST

news <- read.csv("OnlineNewsPopularity_for_R.csv")
news <- news[,-(1:2)]

#Check for outliers
news=news[!news$n_unique_tokens==701,]

#minify instances
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)

colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")

#Standardize the dataset
for(i in ncol(news)-1){ 
  news[,i]<-scale(news[,i], center = TRUE, scale = TRUE)
}

#Define popular articles
newsShort$shares <- as.factor(ifelse(newsShort$shares > 1400,1,0))

set.seed(12346)

news_rand <- newsShort[order(runif(39643)), ]
news_train <- news_rand[1:3964, ]
news_test <- news_rand[3965:39643, ]
news_train$shares <- as.factor(news_train$shares)
random_modelNews <- randomForest(news_train$shares ~ . , data= news_train)

#Model training
cred_pridRF <- predict(random_modelNews, news_test)
(p2 <- table(cred_pridRF, news_test$shares))

##            
## cred_pridRF     0     1
##           0 10014  7264
##           1  8088 10313

#Accuracy
(Accuracy <- sum(diag(p2))/sum(p2)*100)

## [1] 56.97189

#importance
importance(random_modelNews)

##                              MeanDecreaseGini
## n_tokens_title                      110.95451
## n_tokens_content                    172.15879
## n_unique_tokens                     188.13602
## n_non_stop_words                    168.50493
## num_hrefs                           148.12312
## num_imgs                             90.89631
## num_videos                           52.40012
## average_token_length                195.31143
## num_keywords                         93.64084
## kw_max_max                           49.10667
## global_sentiment_polarity           214.07528
## avg_positive_polarity               187.35883
## title_subjectivity                   74.85371
## title_sentiment_polarity             82.44230
## abs_title_subjectivity               73.17472
## abs_title_sentiment_polarity         68.02149

Step2: Decision Tree

news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)

## 
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Dec 15 15:06:15 2019
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 3964 cases (17 attributes) from undefined.data
## 
## Decision tree:
## 
## global_sentiment_polarity > 0.1079453:
## :...num_hrefs > 12:
## :   :...n_tokens_title <= 10: 1 (436/135)
## :   :   n_tokens_title > 10:
## :   :   :...abs_title_subjectivity > 0.3416667: 1 (137/47)
## :   :       abs_title_subjectivity <= 0.3416667:
## :   :       :...n_unique_tokens <= 0.3832077: 1 (7)
## :   :           n_unique_tokens > 0.3832077: 0 (123/51)
## :   num_hrefs <= 12:
## :   :...abs_title_subjectivity > 0.1272727: 1 (1135/509)
## :       abs_title_subjectivity <= 0.1272727:
## :       :...num_imgs <= 1:
## :           :...n_unique_tokens > 0.4742152: 0 (255/87)
## :           :   n_unique_tokens <= 0.4742152:
## :           :   :...n_tokens_title <= 12: 1 (22/3)
## :           :       n_tokens_title > 12: 0 (5)
## :           num_imgs > 1:
## :           :...num_imgs <= 6: 1 (47/16)
## :               num_imgs > 6:
## :               :...kw_max_max <= 227300: 1 (6/1)
## :                   kw_max_max > 227300: 0 (44/15)
## global_sentiment_polarity <= 0.1079453:
## :...num_hrefs > 19: 1 (186/79)
##     num_hrefs <= 19:
##     :...n_non_stop_words <= 0.9999999: 1 (112/51)
##         n_non_stop_words > 0.9999999:
##         :...num_keywords <= 6:
##             :...num_hrefs <= 1: 0 (39/4)
##             :   num_hrefs > 1:
##             :   :...title_sentiment_polarity <= 0.1083333: 0 (449/140)
##             :       title_sentiment_polarity > 0.1083333:
##             :       :...num_keywords <= 5: 1 (74/28)
##             :           num_keywords > 5:
##             :           :...average_token_length <= 4.411043: 1 (9/1)
##             :               average_token_length > 4.411043: 0 (48/14)
##             num_keywords > 6:
##             :...num_imgs > 2: 1 (226/102)
##                 num_imgs <= 2:
##                 :...num_imgs > 0:
##                     :...kw_max_max <= 227300: 1 (25/10)
##                     :   kw_max_max > 227300: 0 (437/159)
##                     num_imgs <= 0:
##                     :...num_keywords <= 7: 1 (46/15)
##                         num_keywords > 7:
##                         :...num_videos > 15: 1 (5)
##                             num_videos <= 15:
##                             :...num_hrefs <= 2: 1 (10/2)
##                                 num_hrefs > 2: 0 (81/30)
## 
## 
## Evaluation on training data (3964 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      25 1499(37.8%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     981   999    (a): class 0
##     500  1484    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% num_hrefs
##  100.00% global_sentiment_polarity
##   44.93% abs_title_subjectivity
##   39.38% n_non_stop_words
##   36.55% num_keywords
##   30.50% num_imgs
##   18.42% n_tokens_title
##   14.63% title_sentiment_polarity
##   12.92% kw_max_max
##   10.39% n_unique_tokens
##    2.42% num_videos
##    1.44% average_token_length
## 
## 
## Time: 0.1 secs

#Model training
news_pred <- predict(news_model, news_test)
CrossTable(news_test$shares, news_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual shares', 'predicted shares'))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  35679 
## 
##  
##               | predicted shares 
## actual shares |         0 |         1 | Row Total | 
## --------------|-----------|-----------|-----------|
##             0 |      7692 |     10410 |     18102 | 
##               |     0.216 |     0.292 |           | 
## --------------|-----------|-----------|-----------|
##             1 |      5596 |     11981 |     17577 | 
##               |     0.157 |     0.336 |           | 
## --------------|-----------|-----------|-----------|
##  Column Total |     13288 |     22391 |     35679 | 
## --------------|-----------|-----------|-----------|
## 
##

(p3 <- table(news_pred, news_test$shares))

##          
## news_pred     0     1
##         0  7692  5596
##         1 10410 11981

#Accuracy
(Accuracy <- sum(diag(p3))/sum(p3)*100)

## [1] 55.13888

ANLY 530 Lab1

Yining Jia

12/15/2019