#We will use credit data file from this location : http://archive.ics.uci.edu/ml/ and built three classification models on this dataset after doing some preprocessing.
#Lets look at the structure and summary of the data after uploading it
credit <- read.csv("C:/Users/Priya/Desktop/ANLY 500/credit.csv")
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
summary(credit$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
table(credit$Creditability)
##
## 0 1
## 300 700
#Create a random number generator and randomize the data to create train and test datasets out of the credit dataset.
set.seed(56732)
credit_rand <- credit[order(runif(1000)), ]
credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]
#A simple test as shown below can tell us that the randomization has worked just fine. The ratios of 1’s and 0’s in both the train and test sets are very similar.
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.3 0.7
prop.table(table(credit_test$Creditability))
##
## 0 1
## 0.3 0.7
##Model 1 on Credit - TREE BASED CLASSIFICATION
library(C50)
## Warning: package 'C50' was built under R version 3.5.3
credit_train$Creditability <- as.factor(credit_train$Creditability)
credit_test$Creditability <- as.factor(credit_test$Creditability)
#Decision tree model is created and summarized by using the code below:
credit_model <- C5.0(x = credit_train[-1], y = credit_train$Creditability)
summary(credit_model)
##
## Call:
## C5.0.default(x = credit_train[-1], y = credit_train$Creditability)
##
##
## C5.0 [Release 2.07 GPL Edition] Fri Aug 30 15:54:36 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 900 cases (21 attributes) from undefined.data
##
## Decision tree:
##
## Account.Balance > 2:
## :...Concurrent.Credits > 2:
## : :...Guarantors > 1:
## : : :...Length.of.current.employment <= 2: 0 (3)
## : : : Length.of.current.employment > 2: 1 (16/2)
## : : Guarantors <= 1:
## : : :...Credit.Amount <= 5511: 1 (291/27)
## : : Credit.Amount > 5511:
## : : :...Length.of.current.employment <= 2: 0 (5/1)
## : : Length.of.current.employment > 2: 1 (32/2)
## : Concurrent.Credits <= 2:
## : :...Purpose > 4:
## : :...Instalment.per.cent <= 1: 1 (2)
## : : Instalment.per.cent > 1:
## : : :...Length.of.current.employment <= 3: 0 (8/1)
## : : Length.of.current.employment > 3: 1 (5/1)
## : Purpose <= 4:
## : :...Purpose > 0: 1 (33/2)
## : Purpose <= 0:
## : :...Duration.of.Credit..month. <= 15: 1 (5)
## : Duration.of.Credit..month. > 15:
## : :...No.of.Credits.at.this.Bank <= 1: 1 (4/1)
## : No.of.Credits.at.this.Bank > 1: 0 (6)
## Account.Balance <= 2:
## :...Payment.Status.of.Previous.Credit <= 1:
## :...Type.of.apartment <= 1: 0 (14)
## : Type.of.apartment > 1:
## : :...Guarantors > 1: 1 (3)
## : Guarantors <= 1:
## : :...Type.of.apartment > 2: 0 (13/1)
## : Type.of.apartment <= 2:
## : :...Value.Savings.Stocks > 2:
## : :...Credit.Amount <= 2064: 0 (2)
## : : Credit.Amount > 2064: 1 (7)
## : Value.Savings.Stocks <= 2:
## : :...Length.of.current.employment <= 2: 0 (6)
## : Length.of.current.employment > 2:
## : :...Duration.of.Credit..month. > 33: 0 (4)
## : Duration.of.Credit..month. <= 33:
## : :...Duration.in.Current.address <= 2: 1 (4)
## : Duration.in.Current.address > 2:
## : :...Concurrent.Credits > 2: 0 (2)
## : Concurrent.Credits <= 2:
## : :...Account.Balance <= 1: 0 (4/1)
## : Account.Balance > 1: 1 (3)
## Payment.Status.of.Previous.Credit > 1:
## :...Credit.Amount > 7966:
## :...Payment.Status.of.Previous.Credit <= 2: 0 (23/2)
## : Payment.Status.of.Previous.Credit > 2:
## : :...Value.Savings.Stocks <= 1: 0 (5/1)
## : Value.Savings.Stocks > 1: 1 (6/1)
## Credit.Amount <= 7966:
## :...Duration.of.Credit..month. <= 11: 1 (68/10)
## Duration.of.Credit..month. > 11:
## :...Duration.of.Credit..month. > 36:
## :...Length.of.current.employment <= 1: 1 (3)
## : Length.of.current.employment > 1:
## : :...No.of.dependents <= 1: 0 (21/1)
## : No.of.dependents > 1:
## : :...Account.Balance > 1: 1 (2)
## : Account.Balance <= 1:
## : :...Credit.Amount <= 4576: 0 (2)
## : Credit.Amount > 4576: 1 (2)
## Duration.of.Credit..month. <= 36:
## :...Guarantors > 2:
## :...Purpose <= 0: 0 (4/1)
## : Purpose > 0: 1 (20)
## Guarantors <= 2:
## :...Account.Balance > 1:
## :...Sex...Marital.Status <= 1:
## : :...Value.Savings.Stocks <= 2: 0 (9/2)
## : : Value.Savings.Stocks > 2: 1 (3)
## : Sex...Marital.Status > 1:
## : :...Length.of.current.employment > 3:
## : :...No.of.dependents <= 1: 1 (34/2)
## : : No.of.dependents > 1: [S1]
## : Length.of.current.employment <= 3:
## : :...Credit.Amount > 5511: 1 (10/1)
## : Credit.Amount <= 5511:
## : :...Value.Savings.Stocks > 3: 1 (16/2)
## : Value.Savings.Stocks <= 3:
## : :...Duration.of.Credit..month. > 21: [S2]
## : Duration.of.Credit..month. <= 21: [S3]
## Account.Balance <= 1:
## :...Occupation > 3:
## :...Payment.Status.of.Previous.Credit > 3: 1 (6)
## : Payment.Status.of.Previous.Credit <= 3:
## : :...Length.of.current.employment <= 4: 1 (11/1)
## : Length.of.current.employment > 4: 0 (3)
## Occupation <= 3:
## :...Instalment.per.cent <= 2:
## :...Duration.of.Credit..month. <= 16: [S4]
## : Duration.of.Credit..month. > 16:
## : :...Foreign.Worker > 1: 1 (2)
## : Foreign.Worker <= 1:
## : :...Telephone > 1: 0 (3)
## : Telephone <= 1: [S5]
## Instalment.per.cent > 2:
## :...Concurrent.Credits <= 1:
## :...Sex...Marital.Status > 2: 1 (5)
## : Sex...Marital.Status <= 2:
## : :...Purpose <= 1: 0 (2)
## : Purpose > 1: 1 (4/1)
## Concurrent.Credits > 1: [S6]
##
## SubTree [S1]
##
## Length.of.current.employment <= 4: 1 (5)
## Length.of.current.employment > 4: 0 (4)
##
## SubTree [S2]
##
## Duration.in.Current.address <= 1: 1 (7/2)
## Duration.in.Current.address > 1: 0 (13/1)
##
## SubTree [S3]
##
## Payment.Status.of.Previous.Credit > 2: 1 (12/1)
## Payment.Status.of.Previous.Credit <= 2:
## :...Duration.of.Credit..month. > 16: 1 (4)
## Duration.of.Credit..month. <= 16:
## :...Sex...Marital.Status > 2: 0 (5)
## Sex...Marital.Status <= 2:
## :...Purpose <= 1: 0 (3)
## Purpose > 1: 1 (3)
##
## SubTree [S4]
##
## Most.valuable.available.asset <= 2: 1 (8)
## Most.valuable.available.asset > 2:
## :...Credit.Amount <= 1941: 0 (2)
## Credit.Amount > 1941: 1 (3)
##
## SubTree [S5]
##
## Length.of.current.employment > 3: 1 (7/1)
## Length.of.current.employment <= 3:
## :...Credit.Amount <= 3416: 0 (10/1)
## Credit.Amount > 3416:
## :...Purpose > 2: 1 (2)
## Purpose <= 2:
## :...Credit.Amount <= 4057: 1 (2)
## Credit.Amount > 4057: 0 (2)
##
## SubTree [S6]
##
## Most.valuable.available.asset > 3: 0 (6)
## Most.valuable.available.asset <= 3:
## :...Payment.Status.of.Previous.Credit > 3:
## :...Guarantors > 1: 0 (2)
## : Guarantors <= 1:
## : :...Telephone > 1: 1 (6)
## : Telephone <= 1:
## : :...Duration.in.Current.address <= 2: 1 (2)
## : Duration.in.Current.address > 2:
## : :...Duration.in.Current.address <= 3: 0 (2)
## : Duration.in.Current.address > 3:
## : :...Credit.Amount <= 1123: 1 (2)
## : Credit.Amount > 1123: 0 (2)
## Payment.Status.of.Previous.Credit <= 3:
## :...Occupation > 2:
## :...Length.of.current.employment <= 4: 0 (27/3)
## : Length.of.current.employment > 4:
## : :...Sex...Marital.Status <= 2: 1 (3)
## : Sex...Marital.Status > 2: 0 (3/1)
## Occupation <= 2:
## :...Type.of.apartment <= 1: 0 (3)
## Type.of.apartment > 1:
## :...Occupation <= 1: 0 (2)
## Occupation > 1:
## :...No.of.dependents > 1: 1 (2)
## No.of.dependents <= 1:
## :...Age..years. > 42: 0 (3)
## Age..years. <= 42:
## :...Sex...Marital.Status <= 2: 0 (3/1)
## Sex...Marital.Status > 2: 1 (4)
##
##
## Evaluation on training data (900 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 80 75( 8.3%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 213 57 (a): class 0
## 18 612 (b): class 1
##
##
## Attribute usage:
##
## 100.00% Account.Balance
## 85.00% Credit.Amount
## 76.78% Guarantors
## 55.78% Concurrent.Credits
## 54.44% Payment.Status.of.Previous.Credit
## 47.33% Duration.of.Credit..month.
## 34.22% Length.of.current.employment
## 16.89% Sex...Marital.Status
## 16.00% Occupation
## 15.44% Instalment.per.cent
## 13.11% Value.Savings.Stocks
## 11.67% Purpose
## 9.44% Most.valuable.available.asset
## 9.11% No.of.dependents
## 8.78% Type.of.apartment
## 4.56% Duration.in.Current.address
## 4.44% Telephone
## 3.11% Foreign.Worker
## 1.11% Age..years.
## 1.11% No.of.Credits.at.this.Bank
##
##
## Time: 0.0 secs
#Now that we have our model ready, lets evaluate the performance
cred_pred <- predict(credit_model, credit_test)
#We will now look at the actual vs. predicted values of credibility with a crosstable command from the gmodels library
#method 1
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.5.3
CrossTable(credit_test$Creditability, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability', 'Predicted Creditability'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | Predicted Creditability
## Actual Creditability | 0 | 1 | Row Total |
## ---------------------|-----------|-----------|-----------|
## 0 | 19 | 11 | 30 |
## | 0.190 | 0.110 | |
## ---------------------|-----------|-----------|-----------|
## 1 | 13 | 57 | 70 |
## | 0.130 | 0.570 | |
## ---------------------|-----------|-----------|-----------|
## Column Total | 32 | 68 | 100 |
## ---------------------|-----------|-----------|-----------|
##
##
#As we can see the crosstable shows that there were 11 instances of type I error i.e. false positives and there were 13 instances out of 100 of type II error i.e. false negatives.
(p <- table(cred_pred, credit_test$Creditability))
##
## cred_pred 0 1
## 0 19 13
## 1 11 57
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 76
###Q1- If you see an accuracy of 100%, what does it mean? Does this mean that we design a perfect model? This is some thing that needs more discussion. Write a few sentences about accuracy of 100%.
###Answer - An accuracy of 100% does not mean anything on its own. It also does not say anything about the predictive power of the model. Accuracy along with other measures like precision and recall should be used collectively to make a decision about a model being good. At times model that have lower accuracy but better precision and recall can be a better model than the one with higher accuracy. This is called accuracy paradox. Lastly, a 100% accuracy could also be a result of an error in the creation of model.
##Model 2 on Credit - RANDOM FOREST
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##We will convert our credibility variable as factor and run the model by using the code below
credit_train$Creditability <- as.factor(credit_train$Creditability)
random_model <- randomForest(Creditability ~ . , data= credit_train)
summary(random_model)
## Length Class Mode
## call 3 -none- call
## type 1 -none- character
## predicted 900 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 1800 matrix numeric
## oob.times 900 -none- numeric
## classes 2 -none- character
## importance 20 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 900 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
##Lets evaluate model performance:
cred_pred <- predict(random_model, credit_test)
(p <- table(cred_pred, credit_test$Creditability))
##
## cred_pred 0 1
## 0 15 9
## 1 15 61
##Accuracy?
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 76
##This model shows an accuracy of 76% which is considered as good.
importance(random_model)
## MeanDecreaseGini
## Account.Balance 41.273729
## Duration.of.Credit..month. 36.219147
## Payment.Status.of.Previous.Credit 22.218414
## Purpose 22.892350
## Credit.Amount 52.060291
## Value.Savings.Stocks 18.769774
## Length.of.current.employment 19.391456
## Instalment.per.cent 15.984195
## Sex...Marital.Status 13.722533
## Guarantors 7.220445
## Duration.in.Current.address 15.004206
## Most.valuable.available.asset 16.829201
## Age..years. 38.250632
## Concurrent.Credits 8.778610
## Type.of.apartment 9.506198
## No.of.Credits.at.this.Bank 8.521766
## Occupation 12.419724
## No.of.dependents 5.903604
## Telephone 7.386394
## Foreign.Worker 1.287096
set.seed(23458)
random_model1 <- randomForest(Creditability ~ (Credit.Amount + Account.Balance + Age..years.), data= credit_train)
cred_pred1 <- predict(random_model1, credit_test)
(p1 <- table(cred_pred1, credit_test$Creditability))
##
## cred_pred1 0 1
## 0 6 4
## 1 24 66
(Accuracy <- sum(diag(p1))/sum(p1)*100)
## [1] 72
###Q2- What are the three most important features in this model.Now, Change the random seed to 23458 and find the new accuracy of random forest.
###Answer - The tree most important features in this model are Account balance, Credit amount and Age. After resetting the seed to 23458 and building the model with only important features gives an accuracy of 71%, dropped from the previous accuracy of 76%.
##Model 3 on Wines dataset - ADDING REGRESSION TO TREES
##Lets upload the wines dataset and look at its structure.
wine <- read.csv("C:/Users/Priya/Desktop/ANLY 500/wine.csv")
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
## $ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
## $ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
## $ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
## $ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
## $ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
## $ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
## $ density : num 0.993 0.999 0.995 0.991 0.993 ...
## $ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
## $ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
## $ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
## $ quality : int 5 6 6 4 6 6 6 6 6 7 ...
#Splitting into Train and Test sets
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
#Training a model on the data
library(rpart)
m.rpart <- rpart(quality ~ ., data=wine_train)
summary(m.rpart)
## Call:
## rpart(formula = quality ~ ., data = wine_train)
## n= 3750
##
## CP nsplit rel error xerror xstd
## 1 0.15501053 0 1.0000000 1.0008627 0.02446641
## 2 0.05098911 1 0.8449895 0.8458174 0.02333590
## 3 0.02796998 2 0.7940004 0.8051844 0.02284216
## 4 0.01970128 3 0.7660304 0.7808727 0.02167925
## 5 0.01265926 4 0.7463291 0.7616229 0.02092260
## 6 0.01007193 5 0.7336698 0.7494999 0.02069349
## 7 0.01000000 6 0.7235979 0.7425072 0.02053918
##
## Variable importance
## alcohol density volatile.acidity
## 34 21 15
## chlorides total.sulfur.dioxide free.sulfur.dioxide
## 11 7 6
## residual.sugar sulphates citric.acid
## 3 1 1
##
## Node number 1: 3750 observations, complexity param=0.1550105
## mean=5.870933, MSE=0.7854751
## left son=2 (2372 obs) right son=3 (1378 obs)
## Primary splits:
## alcohol < 10.85 to the left, improve=0.15501050, (0 missing)
## density < 0.992035 to the right, improve=0.10915940, (0 missing)
## chlorides < 0.0395 to the right, improve=0.07682258, (0 missing)
## total.sulfur.dioxide < 158.5 to the right, improve=0.04089663, (0 missing)
## citric.acid < 0.235 to the left, improve=0.03636458, (0 missing)
## Surrogate splits:
## density < 0.991995 to the right, agree=0.869, adj=0.644, (0 split)
## chlorides < 0.0375 to the right, agree=0.757, adj=0.339, (0 split)
## total.sulfur.dioxide < 103.5 to the right, agree=0.690, adj=0.155, (0 split)
## residual.sugar < 5.375 to the right, agree=0.667, adj=0.094, (0 split)
## sulphates < 0.345 to the right, agree=0.647, adj=0.038, (0 split)
##
## Node number 2: 2372 observations, complexity param=0.05098911
## mean=5.604975, MSE=0.5981709
## left son=4 (1611 obs) right son=5 (761 obs)
## Primary splits:
## volatile.acidity < 0.2275 to the right, improve=0.10585250, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.03390500, (0 missing)
## citric.acid < 0.235 to the left, improve=0.03204075, (0 missing)
## alcohol < 10.11667 to the left, improve=0.03136524, (0 missing)
## chlorides < 0.0585 to the right, improve=0.01633599, (0 missing)
## Surrogate splits:
## pH < 3.485 to the left, agree=0.694, adj=0.047, (0 split)
## sulphates < 0.755 to the left, agree=0.685, adj=0.020, (0 split)
## total.sulfur.dioxide < 105.5 to the right, agree=0.683, adj=0.011, (0 split)
## residual.sugar < 0.75 to the right, agree=0.681, adj=0.007, (0 split)
## chlorides < 0.0285 to the right, agree=0.680, adj=0.003, (0 split)
##
## Node number 3: 1378 observations, complexity param=0.02796998
## mean=6.328737, MSE=0.7765472
## left son=6 (84 obs) right son=7 (1294 obs)
## Primary splits:
## free.sulfur.dioxide < 10.5 to the left, improve=0.07699080, (0 missing)
## alcohol < 11.76667 to the left, improve=0.06210660, (0 missing)
## total.sulfur.dioxide < 67.5 to the left, improve=0.04438619, (0 missing)
## residual.sugar < 1.375 to the left, improve=0.02905351, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02613259, (0 missing)
## Surrogate splits:
## total.sulfur.dioxide < 53.5 to the left, agree=0.952, adj=0.214, (0 split)
## volatile.acidity < 0.875 to the right, agree=0.940, adj=0.024, (0 split)
##
## Node number 4: 1611 observations, complexity param=0.01265926
## mean=5.43203, MSE=0.5098121
## left son=8 (688 obs) right son=9 (923 obs)
## Primary splits:
## volatile.acidity < 0.3025 to the right, improve=0.04540111, (0 missing)
## alcohol < 10.05 to the left, improve=0.03874403, (0 missing)
## free.sulfur.dioxide < 13.5 to the left, improve=0.03338886, (0 missing)
## chlorides < 0.0495 to the right, improve=0.02574623, (0 missing)
## citric.acid < 0.195 to the left, improve=0.02327981, (0 missing)
## Surrogate splits:
## citric.acid < 0.215 to the left, agree=0.633, adj=0.141, (0 split)
## free.sulfur.dioxide < 20.5 to the left, agree=0.600, adj=0.063, (0 split)
## chlorides < 0.0595 to the right, agree=0.593, adj=0.047, (0 split)
## residual.sugar < 1.15 to the left, agree=0.583, adj=0.023, (0 split)
## total.sulfur.dioxide < 219.25 to the right, agree=0.582, adj=0.022, (0 split)
##
## Node number 5: 761 observations
## mean=5.971091, MSE=0.5878633
##
## Node number 6: 84 observations
## mean=5.369048, MSE=1.137613
##
## Node number 7: 1294 observations, complexity param=0.01970128
## mean=6.391036, MSE=0.6894405
## left son=14 (629 obs) right son=15 (665 obs)
## Primary splits:
## alcohol < 11.76667 to the left, improve=0.06504696, (0 missing)
## chlorides < 0.0395 to the right, improve=0.02758705, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02750932, (0 missing)
## pH < 3.055 to the left, improve=0.02307356, (0 missing)
## total.sulfur.dioxide < 191.5 to the right, improve=0.02186818, (0 missing)
## Surrogate splits:
## density < 0.990885 to the right, agree=0.720, adj=0.424, (0 split)
## volatile.acidity < 0.2675 to the left, agree=0.637, adj=0.253, (0 split)
## chlorides < 0.0365 to the right, agree=0.630, adj=0.238, (0 split)
## residual.sugar < 1.475 to the left, agree=0.575, adj=0.126, (0 split)
## total.sulfur.dioxide < 128.5 to the right, agree=0.574, adj=0.124, (0 split)
##
## Node number 8: 688 observations
## mean=5.255814, MSE=0.4054895
##
## Node number 9: 923 observations
## mean=5.56338, MSE=0.5471747
##
## Node number 14: 629 observations, complexity param=0.01007193
## mean=6.173291, MSE=0.6838017
## left son=28 (11 obs) right son=29 (618 obs)
## Primary splits:
## volatile.acidity < 0.465 to the right, improve=0.06897561, (0 missing)
## total.sulfur.dioxide < 200 to the right, improve=0.04223066, (0 missing)
## residual.sugar < 0.975 to the left, improve=0.03061714, (0 missing)
## fixed.acidity < 7.35 to the right, improve=0.02978501, (0 missing)
## sulphates < 0.575 to the left, improve=0.02165970, (0 missing)
## Surrogate splits:
## citric.acid < 0.045 to the left, agree=0.986, adj=0.182, (0 split)
## total.sulfur.dioxide < 279.25 to the right, agree=0.986, adj=0.182, (0 split)
##
## Node number 15: 665 observations
## mean=6.596992, MSE=0.6075098
##
## Node number 28: 11 observations
## mean=4.545455, MSE=0.9752066
##
## Node number 29: 618 observations
## mean=6.202265, MSE=0.6306098
##The rpart function has determined alcohol as the most important feature. Lets plot the tree using the code below
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.5.3
rpart.plot(m.rpart, digits=3, type=1)
##Another vizualization method for the tree with its splits and leaves is as follows,
rpart.plot(m.rpart, digits=4, fallen.leaves = TRUE, type = 3, extra = 101)
##Now lets evaluate the model performance:
p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.545 5.563 5.971 5.893 6.202 6.597
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.901 6.000 9.000
##From the performance evaluation above we can see that the model performs well for the center but not for extremes values.Now lets check the correlation,
cor(p.rpart, wine_test$quality)
## [1] 0.5369525
##A correlation value of 53% is obtained. Its not best but its acceptable. An RMSE of 84% was found.
###Q3- What is your interpretation about this amount of RMSE?
###Answer - A lower RMSE value indicates better fit. Our value of 0.84 is higher and indicates a poor fit of the model to prediction.
###News Popularity - Uploading the data and Preprocessing
###Question 4: Try decision tree and random forest and evaluate the model.
###Answer - See below…
news <- read.csv("C:/Users/Priya/Desktop/ANLY 500/news.csv")
#Check for missing data
sum(is.na(news))
## [1] 0
#remove non-predictive variables
news <- news[,-(1:2)]
#Check for outliers
news=news[!news$n_unique_tokens==701,]
#Keep variables that are meaningful for our model
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
#Standardize the dataset
for(i in ncol(news)-1){
news[,i]<-scale(news[,i], center = TRUE, scale = TRUE)
}
#Define articles with shares greater than 1400 as popular articles
newsShort$shares <- as.factor(ifelse(newsShort$shares > 1400,1,0))
##RANDOM FOREST
##Split the pre-processed data into train and test datasets.
set.seed(174004689)
news_rand <- newsShort[order(runif(39643)), ]
news_train <- news_rand[1:3964, ]
news_test <- news_rand[3965:39643, ]
##Build the Random Forest model using the code below:
news_train$shares <- as.factor(news_train$shares)
random_modelNews <- randomForest(shares ~ . , data= news_train)
##Lets evaluate the performance of the model
cred_predN <- predict(random_modelNews, news_test)
(p3 <- table(cred_predN, news_test$shares))
##
## cred_predN 0 1
## 0 10368 7501
## 1 7722 10088
##Accuracy?
(Accuracy <- sum(diag(p3))/sum(p3)*100)
## [1] 57.33345
##This model shows an accuracy of 57.33% which is considered as okay not great.
importance(random_modelNews)
## MeanDecreaseGini
## n_tokens_title 110.92619
## n_tokens_content 166.63691
## n_unique_tokens 191.46309
## n_non_stop_words 169.58711
## num_hrefs 144.24258
## num_imgs 96.59536
## num_videos 52.70675
## average_token_length 196.94057
## num_keywords 98.71184
## kw_max_max 50.17738
## global_sentiment_polarity 210.88191
## avg_positive_polarity 186.40270
## title_subjectivity 76.38895
## title_sentiment_polarity 78.75773
## abs_title_subjectivity 68.23585
## abs_title_sentiment_polarity 66.47343
##The most important features of our News Popularity model are: Global sentiment polarity, average token length and number of unique tokens.
###Decision Trees
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)
##
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Fri Aug 30 15:54:57 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 3964 cases (17 attributes) from undefined.data
##
## Decision tree:
##
## n_non_stop_words <= 0: 1 (119/39)
## n_non_stop_words > 0:
## :...global_sentiment_polarity <= 0.07169198:
## :...num_imgs > 3:
## : :...num_keywords <= 8: 0 (172/80)
## : : num_keywords > 8: 1 (67/23)
## : num_imgs <= 3:
## : :...num_hrefs <= 1: 0 (29/2)
## : num_hrefs > 1:
## : :...num_imgs <= 0:
## : :...n_tokens_title > 10: 1 (72/31)
## : : n_tokens_title <= 10:
## : : :...num_keywords > 7: 0 (31/6)
## : : num_keywords <= 7:
## : : :...kw_max_max <= 690400: 1 (31/13)
## : : kw_max_max > 690400: 0 (22/6)
## : num_imgs > 0:
## : :...n_tokens_content <= 232: 1 (87/41)
## : n_tokens_content > 232:
## : :...num_hrefs <= 10: 0 (380/101)
## : num_hrefs > 10:
## : :...n_tokens_content <= 634:
## : :...n_tokens_title <= 10: 0 (44/17)
## : : n_tokens_title > 10: 1 (46/14)
## : n_tokens_content > 634:
## : :...num_hrefs <= 11: 1 (7/1)
## : num_hrefs > 11: 0 (97/25)
## global_sentiment_polarity > 0.07169198:
## :...num_hrefs <= 10:
## :...kw_max_max <= 617900: 1 (308/128)
## : kw_max_max > 617900:
## : :...num_videos > 0:
## : :...kw_max_max <= 690400: 1 (60/19)
## : : kw_max_max > 690400:
## : : :...avg_positive_polarity <= 0.272849: 0 (30/7)
## : : avg_positive_polarity > 0.272849: 1 (418/187)
## : num_videos <= 0:
## : :...num_hrefs <= 5:
## : :...title_subjectivity <= 0.875: 0 (417/149)
## : : title_subjectivity > 0.875: 1 (31/11)
## : num_hrefs > 5:
## : :...num_imgs <= 11:
## : :...n_tokens_title <= 10: 1 (247/118)
## : : n_tokens_title > 10: 0 (200/77)
## : num_imgs > 11:
## : :...average_token_length <= 4.879552: 1 (24/3)
## : average_token_length > 4.879552: 0 (7/1)
## num_hrefs > 10:
## :...n_tokens_title > 11:
## :...num_keywords > 9: 1 (60/18)
## : num_keywords <= 9:
## : :...num_keywords > 5: 0 (163/69)
## : num_keywords <= 5:
## : :...title_subjectivity <= 0.5125: 1 (28/5)
## : title_subjectivity > 0.5125: 0 (6)
## n_tokens_title <= 11:
## :...kw_max_max <= 441000:
## :...n_tokens_content <= 1521: 0 (50/21)
## : n_tokens_content > 1521: 1 (6)
## kw_max_max > 441000:
## :...num_imgs > 1:
## :...num_keywords > 4: 1 (336/96)
## : num_keywords <= 4:
## : :...num_keywords <= 3: 1 (6/1)
## : num_keywords > 3: 0 (14/4)
## num_imgs <= 1:
## :...kw_max_max <= 617900: 1 (31/7)
## kw_max_max > 617900:
## :...num_imgs <= 0: 1 (48/16)
## num_imgs > 0:
## :...n_tokens_title <= 8:
## :...n_unique_tokens > 0.6453488: 0 (5)
## : n_unique_tokens <= 0.6453488:
## : :...avg_positive_polarity <= 0.4310074: 1 (68/15)
## : avg_positive_polarity > 0.4310074: 0 (17/6)
## n_tokens_title > 8:
## :...title_subjectivity > 0.5571429: 0 (31/7)
## title_subjectivity <= 0.5571429:
## :...num_keywords > 9:
## :...n_tokens_content > 616: 1 (29/3)
## : n_tokens_content <= 616:
## : :...n_tokens_content <= 459: 1 (5/1)
## : n_tokens_content > 459: 0 (9)
## num_keywords <= 9:
## :...num_keywords <= 7:
## :...num_hrefs <= 15: 1 (41/13)
## : num_hrefs > 15: 0 (33/12)
## num_keywords > 7:
## :...num_hrefs <= 14: 0 (18/2)
## num_hrefs > 14:
## :...num_keywords <= 8: 0 (7/2)
## num_keywords > 8: 1 (7/1)
##
##
## Evaluation on training data (3964 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 47 1398(35.3%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 1188 804 (a): class 0
## 594 1378 (b): class 1
##
##
## Attribute usage:
##
## 100.00% n_non_stop_words
## 97.00% global_sentiment_polarity
## 90.97% num_hrefs
## 64.48% kw_max_max
## 57.21% num_imgs
## 43.16% n_tokens_title
## 36.18% num_videos
## 27.37% num_keywords
## 19.17% n_tokens_content
## 16.70% title_subjectivity
## 13.45% avg_positive_polarity
## 2.27% n_unique_tokens
## 0.78% average_token_length
##
##
## Time: 0.1 secs
##From the output above we can see that the three most important features as identified by the decision tree model are number of non-stop words, global sentiment polarity and number of hrefs.
##Lets evaluate model performance and build a crosstable to see actual vs. predicted values
news_pred <- predict(news_model, news_test)
CrossTable(news_test$shares, news_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual shares', 'predicted shares'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 35679
##
##
## | predicted shares
## actual shares | 0 | 1 | Row Total |
## --------------|-----------|-----------|-----------|
## 0 | 8759 | 9331 | 18090 |
## | 0.245 | 0.262 | |
## --------------|-----------|-----------|-----------|
## 1 | 6931 | 10658 | 17589 |
## | 0.194 | 0.299 | |
## --------------|-----------|-----------|-----------|
## Column Total | 15690 | 19989 | 35679 |
## --------------|-----------|-----------|-----------|
##
##
##From CrossTable we can see that the type I error occurs 26% of times and Type II error occurs 19% of times from the total records of 35679.
(p4 <- table(news_pred, news_test$shares))
##
## news_pred 0 1
## 0 8759 6931
## 1 9331 10658
(Accuracy <- sum(diag(p4))/sum(p4)*100)
## [1] 54.42137
##We get an accuracy of 54% from our decision trees model which is almost the same as that of Random Forest Model. But the most important features for both the models arent exactly the same.