library(C50)
## Warning: package 'C50' was built under R version 3.5.1
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.5.1
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.5.1
## Loading required package: rpart
library(ca)
## Warning: package 'ca' was built under R version 3.5.1
library(class)
## Warning: package 'class' was built under R version 3.5.1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.1
library(rpart)
library(MASS)
## Warning: package 'MASS' was built under R version 3.5.1
library(bnlearn)
## Warning: package 'bnlearn' was built under R version 3.5.1
##
## Attaching package: 'bnlearn'
## The following object is masked from 'package:stats':
##
## sigma
library(tm)
## Warning: package 'tm' was built under R version 3.5.1
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.1
##
## Attaching package: 'e1071'
## The following object is masked from 'package:bnlearn':
##
## impute
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(SnowballC)
library(klaR)
## Warning: package 'klaR' was built under R version 3.5.1
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.1
## corrplot 0.84 loaded
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
library(leaps)
## Warning: package 'leaps' was built under R version 3.5.1
library(colorspace)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(naivebayes)
library(psych)
## Warning: package 'psych' was built under R version 3.5.1
##
## Attaching package: 'psych'
## The following object is masked from 'package:kernlab':
##
## alpha
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
PART 1
(1) DATA COLLECTION
credit <- read.csv("C:/Users/charl/Downloads/ANLY 530--Lab 2--creditData.csv")
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
Note that “Account Balance” is ranked 1, “Duration of Credit…” is #2, and “Credit Amount” is ranked #3. These are the top three variables affecting the credit score function in R.
(2) Exploring the Data: Preparing the Data for the Learning Process.
summary(credit$Creditability)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 1.0 0.7 1.0 1.0
hist(credit$Creditability)

m1 <- glm(Creditability ~., family = "binomial", data = credit)
m1
##
## Call: glm(formula = Creditability ~ ., family = "binomial", data = credit)
##
## Coefficients:
## (Intercept) Account.Balance
## -3.9940768 0.5799270
## Duration.of.Credit..month. Payment.Status.of.Previous.Credit
## -0.0245701 0.3821907
## Purpose Credit.Amount
## 0.0315277 -0.0000934
## Value.Savings.Stocks Length.of.current.employment
## 0.2391122 0.1517308
## Instalment.per.cent Sex...Marital.Status
## -0.2983367 0.2573791
## Guarantors Duration.in.Current.address
## 0.3472739 -0.0141141
## Most.valuable.available.asset Age..years.
## -0.1828445 0.0089167
## Concurrent.Credits Type.of.apartment
## 0.2418915 0.2930602
## No.of.Credits.at.this.Bank Occupation
## -0.2435882 0.0188903
## No.of.dependents Telephone
## -0.1707594 0.2946784
## Foreign.Worker
## 1.1583058
##
## Degrees of Freedom: 999 Total (i.e. Null); 979 Residual
## Null Deviance: 1222
## Residual Deviance: 956.6 AIC: 998.6
Exploring the DATA and Randomizing the DATA
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
Divide DATA to Train and Test Sets. Classification Analysis.
credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]
Train and Test Sets- A good randomization should show Close percentages between Splits. Check the proportioj of Data after randomization:
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.3146667 0.6853333
prop.table(table(credit_test$Creditability))
##
## 0 1
## 0.256 0.744
(3) Training the Model
m.rpart1 <- rpart(Creditability ~., data = credit_train)
print(m.rpart1)
## n= 750
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 750 161.7387000 0.68533330
## 2) Account.Balance< 2.5 424 104.6415000 0.55660380
## 4) Duration.of.Credit..month.>=33 85 19.6941200 0.36470590
## 8) Value.Savings.Stocks< 4.5 73 15.3698600 0.30136990
## 16) Age..years.< 29.5 26 1.8461540 0.07692308 *
## 17) Age..years.>=29.5 47 11.4893600 0.42553190
## 34) Duration.in.Current.address>=3.5 24 3.9583330 0.20833330 *
## 35) Duration.in.Current.address< 3.5 23 5.2173910 0.65217390 *
## 9) Value.Savings.Stocks>=4.5 12 2.2500000 0.75000000 *
## 5) Duration.of.Credit..month.< 33 339 81.0324500 0.60471980
## 10) Payment.Status.of.Previous.Credit< 1.5 38 8.2105260 0.31578950 *
## 11) Payment.Status.of.Previous.Credit>=1.5 301 69.2491700 0.64119600
## 22) Credit.Amount>=10975.5 8 0.0000000 0.00000000 *
## 23) Credit.Amount< 10975.5 293 65.8703100 0.65870310
## 46) Duration.of.Credit..month.>=11.5 233 55.4592300 0.60944210
## 92) Credit.Amount< 1381.5 74 18.1621600 0.43243240
## 184) Guarantors< 1.5 63 14.3174600 0.34920630
## 368) Telephone< 1.5 52 10.2307700 0.26923080 *
## 369) Telephone>=1.5 11 2.1818180 0.72727270 *
## 185) Guarantors>=1.5 11 0.9090909 0.90909090 *
## 93) Credit.Amount>=1381.5 159 33.8993700 0.69182390 *
## 47) Duration.of.Credit..month.< 11.5 60 7.6500000 0.85000000 *
## 3) Account.Balance>=2.5 326 40.9325200 0.85276070
## 6) Purpose>=8.5 32 7.2187500 0.65625000
## 12) Length.of.current.employment< 2.5 8 0.8750000 0.12500000 *
## 13) Length.of.current.employment>=2.5 24 3.3333330 0.83333330 *
## 7) Purpose< 8.5 294 32.3435400 0.87414970 *
(4) Evaluate the Model
credit_pred <- predict(m.rpart1, credit_test)
(p <- table(credit_pred, credit_test$Creditability))
##
## credit_pred 0 1
## 0.0769230769230769 7 2
## 0.125 0 1
## 0.208333333333333 4 2
## 0.269230769230769 5 5
## 0.315789473684211 8 3
## 0.652173913043478 4 2
## 0.691823899371069 18 35
## 0.727272727272727 2 0
## 0.75 1 2
## 0.833333333333333 1 9
## 0.85 3 15
## 0.874149659863946 11 109
## 0.909090909090909 0 1
Since we are dealing with a 13 x 3 matrix dimension, we are unable to use the following accuracy model:
Accuracy <- sum(diag(p)) / (sum(p) * 100)
Visualization
The tree below shows that account balance, Duration of Credit, and Credit Amount appear to segment the loans well into different risk categories:
rpart.plot(m.rpart1, digits = 3, type = 1)

Our dichotomy is the following:
summary(credit_test$Creditability)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 0.744 1.000 1.000
Let’s look at the relation between Account.Balance and Duration.of.Credit..month. Two of the top three factors influencing the Credit Score Function in R:
xtabs(~ credit$Account.Balance + credit$Duration.of.Credit..month, data = credit)
## credit$Duration.of.Credit..month
## credit$Account.Balance 4 5 6 7 8 9 10 11 12 13 14 15 16 18 20 21 22
## 1 0 0 22 0 3 8 4 2 53 1 2 13 1 33 3 9 0
## 2 0 0 14 3 4 20 3 3 44 2 1 13 1 34 4 6 0
## 3 1 0 7 0 0 3 6 0 13 0 0 6 0 5 0 2 0
## 4 5 1 32 2 0 18 15 4 69 1 1 32 0 41 1 13 2
## credit$Duration.of.Credit..month
## credit$Account.Balance 24 26 27 28 30 33 36 39 40 42 45 47 48 54 60 72
## 1 56 0 3 1 11 1 22 1 1 5 2 1 14 0 2 0
## 2 37 1 5 0 14 0 24 2 0 2 3 0 21 1 6 1
## 3 11 0 0 0 3 0 4 0 0 2 0 0 0 0 0 0
## 4 80 0 5 2 12 2 33 2 0 2 0 0 13 1 5 0
Let’s change our class variables to factors:
credit$Duration <- as.factor(credit$Duration.of.Credit..month)
credit$Account <- as.factor(credit$Account.Balance)
credit$Creditability <- as.factor(credit$Creditability)
credit$Payment <- as.factor(credit$Payment.Status.of.Previous.Credit)
credit$Purpose <- as.factor(credit$Purpose)
credit$CreditA <- as.factor(credit$Credit.Amount)
credit$ValueS <- as.factor(credit$Value.Savings.Stocks)
credit$Length <- as.factor(credit$Length.of.current.employment)
credit$Instalment <- as.factor(credit$Instalment.per.cent)
credit$Sex <- as.factor(credit$Sex...Marital.Status)
credit$Guarantors <- as.factor(credit$Guarantors)
credit$Durat <- as.factor(credit$Duration.in.Current.address)
credit$Most <- as.factor(credit$Most.valuable.available.asset)
credit$Age <- as.factor(credit$Age..years.)
credit$Con <- as.factor(credit$Concurrent.Credits)
credit$Type <- as.factor(credit$Type.of.apartment)
credit$No <- as.factor(credit$No.of.Credits.at.this.Bank)
credit$Occupation <- as.factor(credit$Occupation)
credit$Number <- as.factor(credit$No.of.dependents)
credit$Telephone <- as.factor(credit$Telephone)
credit$Foreign <- as.factor(credit$Foreign.Worker)
As an alternative, let’s now create a density plot using Age..years and Account.Balance:
credit %>% ggplot(aes(x=Age..years., fill = credit$Account)) +
geom_density(alpha=0.8, color = "blue") + ggtitle("Density Plot")

Notice that the account balance was greatest around age 25 years.
Now, let’s create a density plot using only Creditability:
credit %>% ggplot(aes(x = Creditability, fill = credit$Creditability)) +
geom_density(alpha=0.8, color = "blue") + ggtitle("Density Plot")

(4) Model Evaluation
cred_pred <- predict(m.rpart1, credit_test)
head(cred_pred)
## 64 498 698 3 521 901
## 0.8741497 0.8741497 0.8741497 0.2692308 0.8741497 0.6918239
Let’s re-evaluate part of the model:
CrossTable(credit_test$Creditability, credit_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, prop.t = FALSE, dnn = c("predicted Creditability","actual Creditability"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## |-------------------------|
##
##
## Total Observations in Table: 250
##
##
## | actual Creditability
## predicted Creditability | 0.0769230769230769 | 0.125 | 0.208333333333333 | 0.269230769230769 | 0.315789473684211 | 0.652173913043478 | 0.691823899371069 | 0.727272727272727 | 0.75 | 0.833333333333333 | 0.85 | 0.874149659863946 | 0.909090909090909 | Row Total |
## ------------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## 0 | 7 | 0 | 4 | 5 | 8 | 4 | 18 | 2 | 1 | 1 | 3 | 11 | 0 | 64 |
## ------------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## 1 | 2 | 1 | 2 | 5 | 3 | 2 | 35 | 0 | 2 | 9 | 15 | 109 | 1 | 186 |
## ------------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## Column Total | 9 | 1 | 6 | 10 | 11 | 6 | 53 | 2 | 3 | 10 | 18 | 120 | 1 | 250 |
## ------------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
##
##
(5) Model Improvement
Data Partition
Let us use the naive_bayes() function on our earlier training data to build the Naive Bayes Classifier:
sum(is.na(credit))
## [1] 0
Report: Above, there were no missing values identified.
convert_counts <- function(credit_train) {credit_train <- ifelse (credit_train > 0, "Yes","No")}
Use the Naive Bayes function on our earlier Training Data
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
Below, we use the 75%/25% Split:
credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]
Naive Bayes Model
model1 <- naive_bayes(credit$Creditability ~ credit$Account, data = credit_train)
model1
## ===================== Naive Bayes =====================
## Call:
## naive_bayes.formula(formula = credit$Creditability ~ credit$Account,
## data = credit_train)
##
## A priori probabilities:
##
## 0 1
## 0.3 0.7
##
## Tables:
##
## credit$Account 0 1
## 1 0.45000000 0.19857143
## 2 0.35000000 0.23428571
## 3 0.04666667 0.07000000
## 4 0.15333333 0.49714286
model2 <- naive_bayes(credit$Creditability ~ credit$Duration+credit$Payment+credit$Length, data = credit_train)
model2
## ===================== Naive Bayes =====================
## Call:
## naive_bayes.formula(formula = credit$Creditability ~ credit$Duration +
## credit$Payment + credit$Length, data = credit_train)
##
## A priori probabilities:
##
## 0 1
## 0.3 0.7
##
## Tables:
##
## credit$Duration 0 1
## 4 0.000000000 0.008571429
## 5 0.000000000 0.001428571
## 6 0.030000000 0.094285714
## 7 0.000000000 0.007142857
## 8 0.003333333 0.008571429
## 9 0.046666667 0.050000000
## 10 0.010000000 0.035714286
## 11 0.000000000 0.012857143
## 12 0.163333333 0.185714286
## 13 0.000000000 0.005714286
## 14 0.003333333 0.004285714
## 15 0.040000000 0.074285714
## 16 0.003333333 0.001428571
## 18 0.140000000 0.101428571
## 20 0.003333333 0.010000000
## 21 0.030000000 0.030000000
## 22 0.000000000 0.002857143
## 24 0.186666667 0.182857143
## 26 0.000000000 0.001428571
## 27 0.016666667 0.011428571
## 28 0.003333333 0.002857143
## 30 0.043333333 0.038571429
## 33 0.003333333 0.002857143
## 36 0.123333333 0.065714286
## 39 0.003333333 0.005714286
## 40 0.003333333 0.000000000
## 42 0.010000000 0.011428571
## 45 0.013333333 0.001428571
## 47 0.000000000 0.001428571
## 48 0.093333333 0.028571429
## 54 0.003333333 0.001428571
## 60 0.020000000 0.010000000
## 72 0.003333333 0.000000000
##
##
## credit$Payment 0 1
## 0 0.08333333 0.02142857
## 1 0.09333333 0.03000000
## 2 0.56333333 0.51571429
## 3 0.09333333 0.08571429
## 4 0.16666667 0.34714286
##
##
## credit$Length 0 1
## 1 0.07666667 0.05571429
## 2 0.23333333 0.14571429
## 3 0.34666667 0.33571429
## 4 0.13000000 0.19285714
## 5 0.21333333 0.27000000
Evaluate the credit model’s performance:
# Predict:
credit_pred <- predict(model1, credit_test)
## Warning in t(log_sum) + log(prior): Recycling array of length 1 in array-vector arithmetic is deprecated.
## Use c() or as.vector() instead.
cat("\n", "Cross table of credit predictions", "\n\n")
##
## Cross table of credit predictions
head(cbind(credit_pred, train))
## credit_pred train
## [1,] 2 ?
# Predict with model1:
credit_pred1 <- predict(model1, credit_test)
## Warning in t(log_sum) + log(prior): Recycling array of length 1 in array-vector arithmetic is deprecated.
## Use c() or as.vector() instead.
cat("\n", "Cross table of credit predictions", "\n\n")
##
## Cross table of credit predictions
head(cbind(credit_pred1, train))
## credit_pred1 train
## [1,] 2 ?
# Predict:
credit_pred2 <- predict(model2, credit_test)
## Warning in t(log_sum) + log(prior): Recycling array of length 1 in array-vector arithmetic is deprecated.
## Use c() or as.vector() instead.
cat("\n", "Cross table of credit predictions", "\n\n")
##
## Cross table of credit predictions
head(cbind(credit_pred2, train))
## credit_pred2 train
## [1,] 2 ?
PART 2
Step 1: Exploring and Preparing the Data:
library(caret)
library(minqa)
## Warning: package 'minqa' was built under R version 3.5.1
library(nloptr)
## Warning: package 'nloptr' was built under R version 3.5.1
##
## Attaching package: 'nloptr'
## The following objects are masked from 'package:minqa':
##
## bobyqa, newuoa
library(MatrixModels)
## Warning: package 'MatrixModels' was built under R version 3.5.1
str(credit)
## 'data.frame': 1000 obs. of 37 variables:
## $ Creditability : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : Factor w/ 10 levels "0","1","2","3",..: 3 1 9 1 1 1 1 1 4 4 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
## $ Duration : Factor w/ 33 levels "4","5","6","7",..: 14 6 9 9 9 7 5 3 14 18 ...
## $ Account : Factor w/ 4 levels "1","2","3","4": 1 1 2 1 1 1 1 1 4 2 ...
## $ Payment : Factor w/ 5 levels "0","1","2","3",..: 5 5 3 5 5 5 5 5 5 3 ...
## $ CreditA : Factor w/ 923 levels "250","276","338",..: 117 529 73 399 412 424 605 219 126 653 ...
## $ ValueS : Factor w/ 5 levels "1","2","3","4",..: 1 1 2 1 1 1 1 1 1 3 ...
## $ Length : Factor w/ 5 levels "1","2","3","4",..: 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment : Factor w/ 4 levels "1","2","3","4": 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex : Factor w/ 4 levels "1","2","3","4": 2 3 2 3 3 3 3 3 2 2 ...
## $ Durat : Factor w/ 4 levels "1","2","3","4": 4 2 4 2 4 3 4 4 4 4 ...
## $ Most : Factor w/ 4 levels "1","2","3","4": 2 1 1 1 2 1 1 1 3 4 ...
## $ Age : Factor w/ 53 levels "19","20","21",..: 3 18 5 21 20 30 21 22 47 5 ...
## $ Con : Factor w/ 3 levels "1","2","3": 3 3 3 3 1 3 3 3 3 3 ...
## $ Type : Factor w/ 3 levels "1","2","3": 1 1 1 1 2 1 2 2 2 1 ...
## $ No : Factor w/ 4 levels "1","2","3","4": 1 2 1 2 2 2 2 1 2 1 ...
## $ Number : Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 1 ...
## $ Foreign : Factor w/ 2 levels "1","2": 1 1 1 2 2 2 2 2 1 1 ...
sum(is.na(credit))
## [1] 0
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
Let’s compute the correlation matrix:
## creditScaled <- scale(credit_rand[ ,2:ncol(credit_rand)], center = TRUE, scale = TRUE)
## m <- cor(creditScaled)
## highlycor <- findCorrelation(m, 0.30)
Part 3
News Popularity
Import the Data:
news <- read.csv("C:/Users/charl/Downloads/ANLY 530--OnlineNewsPopularity.csv")
str(news)
## 'data.frame': 39644 obs. of 61 variables:
## $ url : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ timedelta : int 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : int 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : int 219 255 211 531 1072 370 960 989 97 231 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : int 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : int 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : int 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : int 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : int 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : int 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: int 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : int 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : int 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : int 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : int 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : int 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : int 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : int 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : int 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
Import the Data
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
Pre-Processing the Data
newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
if(newsShort$shares[i] >= 1400) {
newsShort$popular[i] = "yes"}
else {newsShort$popular[i] = "no"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
set.seed(12345)
news_rand <- newsShort[order(runif(10000)), ]
Let us start the classification analysis:
#Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]
Check the proportion of Data after randomization:
prop.table(table(news_train$shares))
##
## no yes
## 0.4308889 0.5691111
prop.table(table(news_test$shares))
##
## no yes
## 0.414 0.586
Let us Train the Model:
news_model <- C5.0(news_train[-17], news_train$shares)
summary(news_model)
##
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Fri Jul 27 21:31:57 2018
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 9000 cases (18 attributes) from undefined.data
##
## Decision tree:
##
## popular = no: no (3878)
## popular = yes: yes (5122)
##
##
## Evaluation on training data (9000 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 2 0( 0.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 3878 (a): class no
## 5122 (b): class yes
##
##
## Attribute usage:
##
## 100.00% popular
##
##
## Time: 0.1 secs
Evaluate the Model:
news_pred <- predict(news_model, news_test)
(p <- table(news_pred, news_test$shares))
##
## news_pred no yes
## no 414 0
## yes 0 586
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 100
Since the accuracy is 100%, we conclude there is something wrong with the model.
Let us work on improving the Naive Bayes Classifier:
We, first, randomize the data:
news_rand <- newsShort[order(runif(10000)),]
Let’s scale the data:
## newsScaled <- scale(news_rand[2:ncol(news_rand)][-1], center = TRUE, scale = TRUE)