library(C50)
## Warning: package 'C50' was built under R version 3.5.1
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.5.1
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.5.1
## Loading required package: rpart
library(ca)
## Warning: package 'ca' was built under R version 3.5.1
library(class)
## Warning: package 'class' was built under R version 3.5.1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.1
library(MASS)
## Warning: package 'MASS' was built under R version 3.5.1
library(colorspace)
## Warning: package 'colorspace' was built under R version 3.5.1
library(bnlearn)
## Warning: package 'bnlearn' was built under R version 3.5.1
## 
## Attaching package: 'bnlearn'
## The following object is masked from 'package:stats':
## 
##     sigma
library(tm)
## Warning: package 'tm' was built under R version 3.5.1
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.1
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:bnlearn':
## 
##     impute
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(SnowballC)
library(klaR)
## Warning: package 'klaR' was built under R version 3.5.1
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.1
## corrplot 0.84 loaded
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
library(leaps)
## Warning: package 'leaps' was built under R version 3.5.1
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(colorspace)
library(naivebayes)
library(psych)
## Warning: package 'psych' was built under R version 3.5.1
## 
## Attaching package: 'psych'
## The following object is masked from 'package:kernlab':
## 
##     alpha
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

PART 1

(1) DATA COLLECTION

creditData <- read.csv("C:/Users/charl/Downloads/ANLY 530--Lab 2--creditData.csv")
str(creditData)
## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...

Note that “Account Balance” is ranked 1, “Duration of Credit…” is #2, and “Credit Amount” is ranked #3. These are the top three variables affecting the credit score function in R.

(2) Pre-processing

Preparing the Data for the Learning Process.

Find the missing vaues: for many functions to work correctly, the cannot be any missing values in the dataset

Convert the target to categorical

creditData$Creditability <- as.factor(creditData$Creditability)
sum(is.na(creditData))
## [1] 0

Pre-processing (continued)

Split data to 75% for training and 25% for testing (Note: the sum above is “0”)

# 75% means 750 for training and the rest for testing

set.seed(12345)

credit_rand <- creditData[order(runif(1000)), ]
credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]

Pre processing (continued)

Check the percentage target for both training and testing

prop.table(table(credit_train$Creditability))
## 
##         0         1 
## 0.3146667 0.6853333
prop.table(table(credit_test$Creditability))
## 
##     0     1 
## 0.256 0.744

Note: The percentage targets are pretty close

(3) Model training

Start to train a Naive Bayes model using the training data

naive_model <- naive_bayes(Creditability ~. , data = credit_train)
naive_model
## ===================== Naive Bayes ===================== 
## Call: 
## naive_bayes.formula(formula = Creditability ~ ., data = credit_train)
## 
## A priori probabilities: 
## 
##         0         1 
## 0.3146667 0.6853333 
## 
## Tables: 
##                
## Account.Balance        0        1
##            mean 1.923729 2.793774
##            sd   1.036826 1.252008
## 
##                           
## Duration.of.Credit..month.        0        1
##                       mean 24.46610 19.20039
##                       sd   13.82208 11.13433
## 
##                                  
## Payment.Status.of.Previous.Credit        0        1
##                              mean 2.161017 2.665370
##                              sd   1.071649 1.045219
## 
##        
## Purpose        0        1
##    mean 2.927966 2.803502
##    sd   2.944722 2.633253
## 
##              
## Credit.Amount        0        1
##          mean 3964.195 2984.177
##          sd   3597.093 2379.685
## 
## # ... and 15 more tables

Notice that “Account.Balance”and “Payment.Status.of.Previous.Credit” affect Creditability.

(4) Model evaluation

Evaluate the design model using both the confusion matrix and accuracy

(conf_nat <- table(predict(naive_model, credit_test), credit_test$Creditability))
##    
##       0   1
##   0  42  35
##   1  22 151
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 77.2

The model seems to be fine, with an accuracy of 77.2. However, this accuracy should be improved.

Part 2

Improve the performance

Data Exploration:

creditDataScaled <- scale(credit_rand[ ,2:ncol(credit_rand)], center = TRUE, scale = TRUE)
m <- cor(creditDataScaled)
(highlycor <- findCorrelation(m, 0.30))
## [1]  5 12 19 15  3

Let’s pick some observations by their values:

filteredData <- credit_rand[ , -(highlycor[4] + 1)]
filteredTraining <- filteredData[1:750, ]
filteredTest <- filteredData[751:1000, ]

We need to do more tests to check the quality of the data.

Training the data (Train a Naive Bayes Classifier (model))

nb_model <- naive_bayes(Creditability ~., data = filteredTraining)

Evaluate the Model (Evaluate the Naive Bayes Classifier)

filteredTestPred <- predict(nb_model, newdata = filteredTest)
table(filteredTestPred, filteredTest$Creditability)
##                 
## filteredTestPred   0   1
##                0  41  34
##                1  23 152

Results: The results are basically the same. However, each entry was either increased or decreased by one.

(conf_nat <- table(filteredTestPred, filteredTest$Creditability))
##                 
## filteredTestPred   0   1
##                0  41  34
##                1  23 152
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 77.2

The accuracy of the model is the same. No improvement.

Part 3: News Popularity

Data Preparation:

news <- read.csv("C:/Users/charl/Downloads/ANLY 530--OnlineNewsPopularity.csv")

Data Preparation

newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")

Data Preparation

newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
     if(newsShort$shares[i] >= 1400) {
         newsShort$popular[i] = "yes"} 
     else {newsShort$popular[i] = "no"}
}
newsShort$shares = newsShort$popular

newsShort$shares <- as.factor(newsShort$shares)

Data Preparation

news_rand <- newsShort[order(runif(10000)), ]
set.seed(12345)
#Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]

Model Design

nb_model <- naive_bayes(shares~., data = news_train)
nb_model
## ===================== Naive Bayes ===================== 
## Call: 
## naive_bayes.formula(formula = shares ~ ., data = news_train)
## 
## A priori probabilities: 
## 
##        no       yes 
## 0.4291111 0.5708889 
## 
## Tables: 
##               
## n_tokens_title       no      yes
##           mean 9.820559 9.695991
##           sd   1.929249 1.987754
## 
##                 
## n_tokens_content       no      yes
##             mean 452.2315 515.1051
##             sd   347.1779 450.0206
## 
##                
## n_unique_tokens        no       yes
##            mean 0.5702437 0.5542023
##            sd   0.1127776 0.1232687
## 
##                 
## n_non_stop_words         no        yes
##             mean 0.99404453 0.99124172
##             sd   0.07695147 0.09318398
## 
##          
## num_hrefs        no       yes
##      mean  9.147851 10.570650
##      sd    8.644083 11.540711
## 
## # ... and 12 more tables

Based on what we have, for a priori probabilities, there are more “Yes” tokens than “No” tokens. Also, based on the means, there seems to be more “Yes” tokens in content for both “n_token_content” and “num_hrefs;” although, the margins (or standard deviations) are relatively wide.

Evaluate the Model

news_Pred <- predict(nb_model, newdata = news_test)
(conf_nat <- table(news_Pred, news_test$shares))
##          
## news_Pred  no yes
##       no  430   0
##       yes   0 570

Should we be concerned about the confusion matrix above? Yes. However, the accuracy of the model seems to be 100%

(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 100