Part 1

In Part 1, we use a Naive Bayes classifier to train a model that predicts loan default. We split the 1,000 records into a training set of 750 cases and a testing set of 250 cases. On the testing set, the model achieves 77.2% accuracy, with 35 false positives and 22 false negatives out of the 250 test cases.

creditData <- read.csv("creditData.csv")
str(creditData)
## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...
creditData$Creditability <- as.factor(creditData$Creditability)
sum(is.na(creditData))
## [1] 0
set.seed(12345)
credit_rand <- creditData[order(runif(1000)), ]
credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]
prop.table(table(credit_train$Creditability))
## 
##         0         1 
## 0.3146667 0.6853333
prop.table(table(credit_test$Creditability))
## 
##     0     1 
## 0.256 0.744
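The class proportions differ somewhat between the two splits (31.5% vs. 25.6% defaulters in training vs. testing). If an exactly stratified split were preferred, caret::createDataPartition samples within each class; a minimal sketch, assuming the caret package is available (the *_strat objects are illustrative names and are not used below):

library(caret)
set.seed(12345)
# sample 75% within each Creditability class so both splits keep the same 0/1 mix
idx <- createDataPartition(creditData$Creditability, p = 0.75, list = FALSE)
credit_train_strat <- creditData[idx, ]
credit_test_strat  <- creditData[-idx, ]
prop.table(table(credit_train_strat$Creditability))
prop.table(table(credit_test_strat$Creditability))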
library(naivebayes)
naive_model <- naive_bayes(Creditability ~ ., data= credit_train)
naive_model
## ===================== Naive Bayes ===================== 
## Call: 
## naive_bayes.formula(formula = Creditability ~ ., data = credit_train)
## 
## A priori probabilities: 
## 
##         0         1 
## 0.3146667 0.6853333 
## 
## Tables: 
##                
## Account.Balance        0        1
##            mean 1.923729 2.793774
##            sd   1.036826 1.252008
## 
##                           
## Duration.of.Credit..month.        0        1
##                       mean 24.46610 19.20039
##                       sd   13.82208 11.13433
## 
##                                  
## Payment.Status.of.Previous.Credit        0        1
##                              mean 2.161017 2.665370
##                              sd   1.071649 1.045219
## 
##        
## Purpose        0        1
##    mean 2.927966 2.803502
##    sd   2.944722 2.633253
## 
##              
## Credit.Amount        0        1
##          mean 3964.195 2984.177
##          sd   3597.093 2379.685
## 
## # ... and 15 more tables
(conf_nat <- table(predict(naive_model, credit_test), credit_test$Creditability))
##    
##       0   1
##   0  42  35
##   1  22 151
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 77.2
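The false-positive and false-negative counts quoted at the top can be read directly off conf_nat; a short check, treating a default (class 0) as the positive outcome, as the summary above does:

# rows of conf_nat are predicted classes, columns are actual Creditability
fp <- conf_nat["0", "1"]  # predicted default for a creditworthy applicant (35)
fn <- conf_nat["1", "0"]  # predicted creditworthy for an applicant who defaulted (22)
c(false_positives = fp, false_negatives = fn,
  accuracy = sum(diag(conf_nat)) / sum(conf_nat))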

Part 2

In Part 2, we try to improve the accuracy of the model by filtering correlated features out of the dataset before training. We use caret::findCorrelation to flag predictors whose pairwise correlation exceeds 0.3 and drop one of the flagged features. This pre-processing decreases the false negatives by 2 cases and raises accuracy to 78%.

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(robustbase)
creditDataScaled <- scale(credit_rand[,2:ncol(credit_rand)], center=TRUE, scale = TRUE)
m <- cor(creditDataScaled)
(highlycor <- findCorrelation(m, 0.30))
## [1]  5 12 19 15  3
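The indices returned by findCorrelation refer to the scaled matrix, which starts at column 2 of credit_rand (Creditability is excluded), so they are offset by one; the flagged feature names can be checked as follows:

# shift the flagged indices by 1 to skip the Creditability column
colnames(credit_rand)[highlycor + 1]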
filteredData <- credit_rand[, -(highlycor[5]+1)]
filteredTraining <- filteredData[1:750, ]
filteredTest <- filteredData[751:1000, ]
nb_model <- naive_bayes(Creditability ~ ., data=filteredTraining)
filteredTestPred <- predict(nb_model, newdata = filteredTest)
table(filteredTestPred, filteredTest$Creditability)
##                 
## filteredTestPred   0   1
##                0  44  35
##                1  20 151
(conf_nat <- table(filteredTestPred, filteredTest$Creditability))
##                 
## filteredTestPred   0   1
##                0  44  35
##                1  20 151
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 78
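Only the last flagged feature is dropped above. Removing every flagged column is a small change; a sketch for comparison (the resulting accuracy was not part of the original run and is not reported here):

# drop all columns flagged by findCorrelation (indices shifted past Creditability)
filteredAll <- credit_rand[, -(highlycor + 1)]
filteredTrainingAll <- filteredAll[1:750, ]
filteredTestAll <- filteredAll[751:1000, ]
nb_model_all <- naive_bayes(Creditability ~ ., data = filteredTrainingAll)
mean(predict(nb_model_all, newdata = filteredTestAll) == filteredTestAll$Creditability)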

Part 3

In Part 3, we use a Naive Bayes classifier to train a model that predicts the popularity of news articles. We first create a “popular” variable that labels an article with 1400 or more shares as popular (“yes”) and the rest as unpopular (“no”), and recode the shares column with this label. We then shuffle the first 10,000 articles and split them into a training set of 9,000 and a testing set of 1,000. On the testing set the model scores 100% accuracy; this perfect score is expected, because the popular indicator, which is identical to the recoded shares target, remains in the data frame and is used as a predictor.

news <- read.csv("OnlineNewsPopularity.csv")
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
newsShort$popular = rep('na', nrow(newsShort))
for (i in 1:nrow(newsShort)) {
    if (newsShort$shares[i] >= 1400) {
        newsShort$popular[i] = "yes"
    } else {
        newsShort$popular[i] = "no"
    }
}
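The loop above can also be written as a single vectorized assignment; an equivalent sketch:

# same recoding without an explicit loop
newsShort$popular <- ifelse(newsShort$shares >= 1400, "yes", "no")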
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
set.seed(12345)
news_rand <- newsShort[order(runif(10000)), ]
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]
nb_model <- naive_bayes(shares ~ ., data=news_train)
nb_model
## ===================== Naive Bayes ===================== 
## Call: 
## naive_bayes.formula(formula = shares ~ ., data = news_train)
## 
## A priori probabilities: 
## 
##        no       yes 
## 0.4291111 0.5708889 
## 
## Tables: 
##               
## n_tokens_title       no      yes
##           mean 9.820559 9.695991
##           sd   1.929249 1.987754
## 
##                 
## n_tokens_content       no      yes
##             mean 452.2315 515.1051
##             sd   347.1779 450.0206
## 
##                
## n_unique_tokens        no       yes
##            mean 0.5702437 0.5542023
##            sd   0.1127776 0.1232687
## 
##                 
## n_non_stop_words         no        yes
##             mean 0.99404453 0.99124172
##             sd   0.07695147 0.09318398
## 
##          
## num_hrefs        no       yes
##      mean  9.147851 10.570650
##      sd    8.644083 11.540711
## 
## # ... and 12 more tables
news_Pred <- predict(nb_model, newdata = news_test)
(conf_nat <- table(news_Pred, news_test$shares))
##          
## news_Pred  no yes
##       no  430   0
##       yes   0 570
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 100
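To estimate accuracy without the duplicated label, the popular column can be dropped before training; a sketch (the resulting accuracy is not part of the original run and is not shown):

# remove the popular indicator so only the article features predict shares
news_clean <- news_rand[, setdiff(colnames(news_rand), "popular")]
news_train_clean <- news_clean[1:9000, ]
news_test_clean <- news_clean[9001:10000, ]
nb_model_clean <- naive_bayes(shares ~ ., data = news_train_clean)
mean(predict(nb_model_clean, newdata = news_test_clean) == news_test_clean$shares)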