###PART 1
###Q1- (one thing to make sure you do!): Remember that the class variable needs to be a categorical data type in order to build a Naïve Bayes Classifier, so you will need to convert your class variable.
###Next, use a 75%/25% split for the training and test data, i.e. use 75% of the records for the training set and 25% of the records for the test set. Report the number of missing values you find in the data in your results report. Use the randomization seed 12345.
###ANSWER- Q1:
creditData <- read.csv("C:/Users/Priya/Downloads/creditData.csv")
sum(is.na(creditData))
## [1] 0
str(creditData)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
# Convert the class variable and all other categorical predictors to factors
factor_cols <- c("Creditability", "Account.Balance", "Payment.Status.of.Previous.Credit",
                 "Purpose", "Value.Savings.Stocks", "Length.of.current.employment",
                 "Instalment.per.cent", "Sex...Marital.Status", "Guarantors",
                 "Duration.in.Current.address", "Most.valuable.available.asset",
                 "Concurrent.Credits", "Type.of.apartment", "No.of.Credits.at.this.Bank",
                 "Occupation", "No.of.dependents", "Telephone", "Foreign.Worker")
creditData[factor_cols] <- lapply(creditData[factor_cols], as.factor)
str(creditData)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Account.Balance : Factor w/ 4 levels "1","2","3","4": 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: Factor w/ 5 levels "0","1","2","3",..: 5 5 3 5 5 5 5 5 5 3 ...
## $ Purpose : Factor w/ 10 levels "0","1","2","3",..: 3 1 9 1 1 1 1 1 4 4 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : Factor w/ 5 levels "1","2","3","4",..: 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : Factor w/ 5 levels "1","2","3","4",..: 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : Factor w/ 4 levels "1","2","3","4": 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : Factor w/ 4 levels "1","2","3","4": 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : Factor w/ 4 levels "1","2","3","4": 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : Factor w/ 4 levels "1","2","3","4": 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : Factor w/ 3 levels "1","2","3": 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : Factor w/ 3 levels "1","2","3": 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : Factor w/ 4 levels "1","2","3","4": 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : Factor w/ 2 levels "1","2": 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : Factor w/ 2 levels "1","2": 1 1 1 2 2 2 2 2 1 1 ...
set.seed(12345)
credit_rand <- creditData[order(runif(1000)), ]
credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]
###Q2- Compute the percentage of both classes similar to what you did in lab 1 and see if the distribution of both classes preserved for both training and testing data.
###ANSWER - Q2:
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.3088889 0.6911111
prop.table(table(credit_test$Creditability))
##
## 0 1
## 0.22 0.78
###From the proportion tables above, the training set is about 31% class 0 and 69% class 1, while the test set is 22% and 78%. The overall class imbalance carries over to both sets, but the test set skews somewhat more towards class 1 because the random split was not stratified.
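###If we wanted the class proportions preserved more closely, a stratified split is one option. A minimal sketch using caret::createDataPartition (not part of the original lab workflow; the variable names here are illustrative):
library(caret)                                # provides createDataPartition()
set.seed(12345)
idx <- createDataPartition(creditData$Creditability, p = 0.75, list = FALSE)
strat_train <- creditData[idx, ]              # ~75% of rows, class ratio preserved
strat_test  <- creditData[-idx, ]             # ~25% of rows, class ratio preserved
prop.table(table(strat_train$Creditability))  # both should closely match the full data
prop.table(table(strat_test$Creditability))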
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.5.3
## naivebayes 0.9.6 loaded
naive_model <- naive_bayes(as.character(Creditability) ~ ., data= credit_train)
naive_model
##
## ================================ Naive Bayes =================================
##
## Call:
## naive_bayes.formula(formula = as.character(Creditability) ~ .,
## data = credit_train)
##
## ------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ------------------------------------------------------------------------------
##
## A priori probabilities:
##
## 0 1
## 0.3088889 0.6911111
##
## ------------------------------------------------------------------------------
##
## Tables:
##
## ------------------------------------------------------------------------------
## ::: Account.Balance (Categorical)
## ------------------------------------------------------------------------------
##
## Account.Balance 0 1
## 1 0.44244604 0.20418006
## 2 0.35611511 0.22829582
## 3 0.05035971 0.07234727
## 4 0.15107914 0.49517685
##
## ------------------------------------------------------------------------------
## ::: Duration.of.Credit..month. (Gaussian)
## ------------------------------------------------------------------------------
##
## Duration.of.Credit..month. 0 1
## mean 24.91727 19.18489
## sd 13.41521 11.06282
##
## ------------------------------------------------------------------------------
## ::: Payment.Status.of.Previous.Credit (Categorical)
## ------------------------------------------------------------------------------
##
## Payment.Status.of.Previous.Credit 0 1
## 0 0.08273381 0.02090032
## 1 0.09352518 0.03215434
## 2 0.56115108 0.51929260
## 3 0.09352518 0.08038585
## 4 0.16906475 0.34726688
##
## ------------------------------------------------------------------------------
## ::: Purpose (Categorical)
## ------------------------------------------------------------------------------
##
## Purpose 0 1
## 0 0.298561151 0.200964630
## 1 0.057553957 0.118971061
## 2 0.197841727 0.181672026
## 3 0.201438849 0.319935691
## 4 0.014388489 0.009646302
## 5 0.025179856 0.016077170
## 6 0.079136691 0.043408360
## 8 0.003597122 0.011254019
## 9 0.104316547 0.088424437
## 10 0.017985612 0.009646302
##
## ------------------------------------------------------------------------------
## ::: Credit.Amount (Gaussian)
## ------------------------------------------------------------------------------
##
## Credit.Amount 0 1
## mean 3977.813 2962.659
## sd 3568.550 2383.757
##
## ------------------------------------------------------------------------------
##
## # ... and 15 more tables
##
## ------------------------------------------------------------------------------
conf_nat <- table(predict(naive_model, credit_test), credit_test$Creditability)
## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.
Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100
Accuracy
## [1] 80
###As shown above, the accuracy of the Naive Bayes model on the test set is 80%.
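###Accuracy alone can hide class-specific errors on imbalanced data. As an optional supplementary check (not required by the lab), caret::confusionMatrix accepts the table built above and also reports kappa, sensitivity, and specificity; a minimal sketch:
library(caret)
# conf_nat holds predictions in rows and true labels in columns, as built above
confusionMatrix(conf_nat)  # accuracy plus kappa and per-class sensitivity/specificity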
###PART 2
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
creditData1 <- read.csv("C:/Users/Priya/Downloads/creditData.csv")
# Note: no new seed is set in Part 2, so this continues the RNG stream from Part 1
credit_rand <- creditData1[order(runif(1000)), ]
# Scale all predictors (every column except Creditability) and flag highly
# correlated variables at a 0.30 cutoff
creditDataScaled <- scale(credit_rand[, 2:ncol(credit_rand)], center = TRUE, scale = TRUE)
C <- cor(creditDataScaled)
highlycor <- findCorrelation(C, 0.30)
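###To see which predictors findCorrelation flagged, the returned column indices can be mapped back to names; a quick check (its output was not shown in the original report):
colnames(creditDataScaled)[highlycor]  # names of the flagged highly correlated columns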
# Convert the class variable and the categorical predictors to factors,
# reusing the factor_cols vector defined in Part 1
creditData1[factor_cols] <- lapply(creditData1[factor_cols], as.factor)
credit_rand <- creditData1[order(runif(1000)), ]
# Drop the fifth flagged column; the +1 offsets for Creditability, which was
# excluded (column 1) when the scaled matrix was built
filteredData <- credit_rand[, -(highlycor[5]+1)]
filteredTraining <- filteredData[1:750, ]
filteredTest <- filteredData[751:1000, ]
library(naivebayes)
nb_model <- naive_bayes(Creditability ~ ., data=filteredTraining)
## Warning: naive_bayes(): Feature Purpose - zero probabilities are present.
## Consider Laplace smoothing.
filteredTestPred <- predict(nb_model, newdata = filteredTest)
## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.
conf_nat <- table(filteredTestPred, filteredTest$Creditability)
Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100
Accuracy
## [1] 74.4
###Q3- What is the accuracy this time? Be sure to include in your results report whether or not, after all this work, the performance of your Naïve Bayes Classifier was improved.
###ANSWER - Q3: As shown above, the accuracy has dropped to 74.4% from the previous 80%, so all this work did not improve the Naive Bayes classifier; the earlier model performed better. Note that Part 2 also uses a different split (750/250 rather than 900/100) and a different random ordering, so the two accuracies are only roughly comparable.
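###The warning printed when the model was fit suggests Laplace smoothing, because some levels of Purpose never occur for one class in the training data and therefore get zero probability. A minimal sketch of the refit (this was not run for the report, so no accuracy is claimed for it):
nb_model_laplace <- naive_bayes(Creditability ~ ., data = filteredTraining, laplace = 1)
laplacePred <- predict(nb_model_laplace, newdata = filteredTest)
sum(diag(table(laplacePred, filteredTest$Creditability))) /
  nrow(filteredTest) * 100  # accuracy (%) with add-one smoothing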
###PART 3
letterdata <- read.csv("C:/Users/Priya/Downloads/letterdata.csv")
str(letterdata)
## 'data.frame': 20000 obs. of 17 variables:
## $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
## $ xbox : int 2 5 4 7 2 4 4 1 2 11 ...
## $ ybox : int 8 12 11 11 1 11 2 1 2 15 ...
## $ width : int 3 3 6 6 3 5 5 3 4 13 ...
## $ height: int 5 7 8 6 1 8 4 2 4 9 ...
## $ onpix : int 1 2 6 3 1 3 4 1 2 7 ...
## $ xbar : int 8 10 10 5 8 8 8 8 10 13 ...
## $ ybar : int 13 5 6 9 6 8 7 2 6 2 ...
## $ x2bar : int 0 5 2 4 6 6 6 2 2 6 ...
## $ y2bar : int 6 4 6 6 6 9 6 2 6 2 ...
## $ xybar : int 6 13 10 4 6 5 7 8 12 12 ...
## $ x2ybar: int 10 3 3 4 5 6 6 2 4 1 ...
## $ xy2bar: int 8 9 7 10 9 6 6 8 8 9 ...
## $ xedge : int 0 2 3 6 1 0 2 1 1 8 ...
## $ xedgey: int 8 8 7 10 7 8 8 6 6 1 ...
## $ yedge : int 0 4 3 2 5 9 7 2 1 1 ...
## $ yedgex: int 8 10 9 8 10 7 10 7 7 8 ...
letters_train <- letterdata[1:18000, ]
letters_test <- letterdata[18001:20000, ]
library(kernlab)
## Warning: package 'kernlab' was built under R version 3.5.2
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "vanilladot")
## Setting default kernel parameters
summary(letter_classifier)
## Length Class Mode
## 1 ksvm S4
letter_predictions <- predict(letter_classifier, letters_test)
(p<- table(letter_predictions,letters_test$letter))
##
## letter_predictions A B C D E F G H I J K L M N O P Q R
## A 73 0 0 0 0 0 0 0 0 1 0 0 0 0 3 0 4 0
## B 0 61 0 3 2 0 1 1 0 0 1 1 0 0 0 2 0 1
## C 0 0 64 0 2 0 4 2 1 0 1 2 0 0 1 0 0 0
## D 2 1 0 67 0 0 1 3 3 2 1 2 0 3 4 2 1 2
## E 0 0 1 0 64 1 1 0 0 0 2 2 0 0 0 0 2 0
## F 0 0 0 0 0 70 1 1 4 0 0 0 0 0 0 5 1 0
## G 1 1 2 1 3 2 68 1 0 0 0 1 0 0 0 0 4 1
## H 0 0 0 1 0 1 0 46 0 2 3 1 1 1 9 0 0 5
## I 0 0 0 0 0 0 0 0 65 3 0 0 0 0 0 0 0 0
## J 0 1 0 0 0 1 0 0 3 61 0 0 0 0 1 0 0 0
## K 0 1 4 0 0 0 0 5 0 0 56 0 0 2 0 0 0 4
## L 0 0 0 0 1 0 0 1 0 0 0 63 0 0 0 0 0 0
## M 0 0 1 0 0 0 1 0 0 0 0 0 70 2 0 0 0 0
## N 0 0 0 0 0 0 0 0 0 0 0 0 0 77 0 0 0 1
## O 0 0 1 1 0 0 0 1 0 1 0 0 0 0 49 1 2 0
## P 0 0 0 0 0 3 0 0 0 0 0 0 0 0 2 69 0 0
## Q 0 0 0 0 0 0 3 1 0 0 0 2 0 0 2 1 52 0
## R 0 4 0 0 1 0 0 3 0 0 3 0 0 0 1 0 0 64
## S 0 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 6 0
## T 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0
## U 0 0 2 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
## W 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## X 0 1 0 0 1 0 0 1 0 0 1 4 0 0 0 0 0 1
## Y 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0
## Z 1 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0
##
## letter_predictions S T U V W X Y Z
## A 0 1 2 0 1 0 0 0
## B 3 0 0 0 0 0 0 0
## C 0 0 0 0 0 0 0 0
## D 0 0 0 0 0 0 1 0
## E 6 0 0 0 0 1 0 0
## F 2 0 0 1 0 0 2 0
## G 3 2 0 0 0 0 0 0
## H 0 3 0 2 0 0 1 0
## I 2 0 0 0 0 2 1 0
## J 1 0 0 0 0 1 0 4
## K 0 1 2 0 0 4 0 0
## L 0 0 0 0 0 0 0 0
## M 0 0 1 0 6 0 0 0
## N 0 0 1 0 2 0 0 0
## O 0 0 1 0 0 0 0 0
## P 0 0 0 0 0 0 1 0
## Q 1 0 0 0 0 0 0 0
## R 0 1 0 1 0 0 0 0
## S 47 1 0 0 0 1 0 6
## T 1 83 1 0 0 0 2 2
## U 0 0 83 0 0 0 0 0
## V 0 0 0 64 1 0 1 0
## W 0 0 0 3 59 0 0 0
## X 0 0 0 0 0 76 1 0
## Y 0 1 0 0 0 1 58 0
## Z 5 1 0 0 0 0 0 70
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 83.95
###Using the "vanilladot" (linear) kernel we get an accuracy of 83.95%. Let's change the kernel to "rbfdot" and "polydot" to see how the accuracy changes.
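###Before switching kernels, an optional aside: per-letter recall makes the confusion matrix above easier to scan. In the table p, rows are predictions and columns are the true letters, so:
round(sort(diag(p) / colSums(p)), 2)  # fraction of each true letter classified correctly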
###Q4- We may be able to do better than this by changing the Kernels. Try Polynomial and RBF kernels to improve the result.
###ANSWER - Q4:
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "rbfdot")
summary(letter_classifier)
## Length Class Mode
## 1 ksvm S4
letter_predictions <- predict(letter_classifier, letters_test)
(p<- table(letter_predictions,letters_test$letter))
##
## letter_predictions A B C D E F G H I J K L M N O P Q R
## A 75 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0
## B 0 67 0 2 0 1 0 0 0 0 0 1 0 1 1 2 1 1
## C 0 0 72 0 3 0 0 0 0 0 0 1 0 0 0 0 0 0
## D 1 1 0 71 0 0 1 2 2 2 1 0 0 0 0 2 1 1
## E 0 0 0 0 70 1 0 0 0 1 0 2 0 0 0 0 0 0
## F 0 0 0 0 0 77 0 0 3 0 0 0 0 0 0 6 0 0
## G 0 0 1 0 3 0 76 1 0 0 0 0 0 0 0 0 0 0
## H 0 0 0 1 0 0 1 58 0 1 1 1 1 0 0 0 1 1
## I 0 0 0 0 0 0 0 0 69 1 0 0 0 0 0 0 0 0
## J 0 0 0 0 0 0 0 0 2 66 0 0 0 0 0 0 0 0
## K 0 0 0 0 0 0 0 3 0 0 62 0 0 1 0 0 0 2
## L 0 0 0 0 0 0 1 0 0 0 0 69 0 0 0 0 0 0
## M 0 0 0 0 0 0 1 0 0 0 0 0 71 1 0 0 0 0
## N 0 0 0 0 0 1 0 0 0 0 0 0 0 78 0 0 0 0
## O 0 0 1 0 0 0 0 0 0 1 0 0 0 2 67 1 2 0
## P 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 72 0 0
## Q 0 0 0 0 0 0 0 1 0 0 0 0 0 0 3 1 65 0
## R 0 1 0 0 0 0 1 1 0 0 3 0 0 2 0 0 0 74
## S 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0
## T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## U 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## W 0 0 1 0 0 0 0 0 0 0 0 0 1 0 2 0 0 0
## X 0 1 0 0 0 0 0 0 0 0 2 4 0 0 0 0 0 0
## Y 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Z 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## letter_predictions S T U V W X Y Z
## A 0 1 0 0 0 0 0 0
## B 1 0 0 1 0 0 0 0
## C 0 0 0 0 0 0 0 0
## D 0 0 1 0 0 0 0 0
## E 0 0 0 0 0 0 0 0
## F 1 0 0 1 0 0 0 0
## G 0 0 0 0 0 0 0 0
## H 0 3 0 1 0 0 0 0
## I 0 0 0 0 0 2 0 0
## J 0 0 0 0 0 0 0 1
## K 0 0 0 0 0 0 0 0
## L 0 0 0 0 0 0 0 0
## M 0 0 0 0 2 0 0 0
## N 0 0 0 0 1 0 0 0
## O 0 0 0 0 0 0 0 0
## P 0 0 0 0 0 0 0 0
## Q 0 0 0 0 0 0 0 0
## R 0 1 0 0 0 0 0 0
## S 68 0 0 0 0 0 0 0
## T 0 88 0 0 0 0 1 0
## U 0 0 89 0 0 0 0 0
## V 0 0 0 68 0 0 1 0
## W 0 0 1 0 66 0 0 0
## X 0 0 0 0 0 84 1 0
## Y 0 1 0 0 0 0 65 0
## Z 1 0 0 0 0 0 0 81
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 93.4
###Q4 - Answer: The accuracy increases to 93.4% when using "rbfdot" as the kernel. Let's check the same for "polydot".
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "polydot")
## Setting default kernel parameters
summary(letter_classifier)
## Length Class Mode
## 1 ksvm S4
letter_predictions <- predict(letter_classifier, letters_test)
(p<- table(letter_predictions,letters_test$letter))
##
## letter_predictions A B C D E F G H I J K L M N O P Q R
## A 73 0 0 0 0 0 0 0 0 1 0 0 0 0 3 0 4 0
## B 0 61 0 3 2 0 1 1 0 0 1 1 0 0 0 2 0 1
## C 0 0 64 0 2 0 4 2 1 0 1 2 0 0 1 0 0 0
## D 2 1 0 67 0 0 1 3 3 2 1 2 0 3 4 2 1 2
## E 0 0 1 0 64 1 1 0 0 0 2 2 0 0 0 0 2 0
## F 0 0 0 0 0 70 1 1 4 0 0 0 0 0 0 5 1 0
## G 1 1 2 1 3 2 68 1 0 0 0 1 0 0 0 0 4 1
## H 0 0 0 1 0 1 0 46 0 2 3 1 1 1 9 0 0 5
## I 0 0 0 0 0 0 0 0 65 2 0 0 0 0 0 0 0 0
## J 0 1 0 0 0 1 0 0 3 62 0 0 0 0 1 0 0 0
## K 0 1 4 0 0 0 0 5 0 0 56 0 0 2 0 0 0 4
## L 0 0 0 0 1 0 0 1 0 0 0 63 0 0 0 0 0 0
## M 0 0 1 0 0 0 1 0 0 0 0 0 70 2 0 0 0 0
## N 0 0 0 0 0 0 0 0 0 0 0 0 0 77 0 0 0 1
## O 0 0 1 1 0 0 0 1 0 1 0 0 0 0 49 1 2 0
## P 0 0 0 0 0 3 0 0 0 0 0 0 0 0 2 69 0 0
## Q 0 0 0 0 0 0 3 1 0 0 0 2 0 0 2 1 52 0
## R 0 4 0 0 1 0 0 3 0 0 3 0 0 0 1 0 0 64
## S 0 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 6 0
## T 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0
## U 0 0 2 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
## W 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## X 0 1 0 0 1 0 0 1 0 0 1 4 0 0 0 0 0 1
## Y 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0
## Z 1 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0
##
## letter_predictions S T U V W X Y Z
## A 0 1 2 0 1 0 0 0
## B 3 0 0 0 0 0 0 0
## C 0 0 0 0 0 0 0 0
## D 0 0 0 0 0 0 1 0
## E 6 0 0 0 0 1 0 0
## F 2 0 0 1 0 0 2 0
## G 3 2 0 0 0 0 0 0
## H 0 3 0 2 0 0 1 0
## I 2 0 0 0 0 2 1 0
## J 1 0 0 0 0 1 0 4
## K 0 1 2 0 0 4 0 0
## L 0 0 0 0 0 0 0 0
## M 0 0 1 0 6 0 0 0
## N 0 0 1 0 2 0 0 0
## O 0 0 1 0 0 0 0 0
## P 0 0 0 0 0 0 1 0
## Q 1 0 0 0 0 0 0 0
## R 0 1 0 1 0 0 0 0
## S 47 1 0 0 0 1 0 6
## T 1 83 1 0 0 0 2 2
## U 0 0 83 0 0 0 0 0
## V 0 0 0 64 1 0 1 0
## W 0 0 0 3 59 0 0 0
## X 0 0 0 0 0 76 1 0
## Y 0 1 0 0 0 1 58 0
## Z 5 1 0 0 0 0 0 70
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 84
###Q4 - Answer: Using "polydot" as the kernel we get 84% accuracy, essentially the same as the first classifier built with "vanilladot"; with its default parameters the polynomial kernel has degree 1, which makes it effectively linear.
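###The RBF result could likely be pushed further by tuning the kernel width sigma and the cost parameter C. A minimal sketch of one such fit (these sigma and C values are illustrative guesses, not tuned results):
letter_tuned <- ksvm(letter ~ ., data = letters_train, kernel = "rbfdot",
                     kpar = list(sigma = 0.05), C = 5)  # illustrative hyperparameters
tuned_pred <- predict(letter_tuned, letters_test)
sum(tuned_pred == letters_test$letter) / nrow(letters_test) * 100  # accuracy (%)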
###PART 4
news <- read.csv("C:/Users/Priya/Desktop/ANLY 500/news.csv")
#Check for missing data
sum(is.na(news))
## [1] 0
#Remove the first two, non-predictive columns
news <- news[, -(1:2)]
#Check for outliers
news <- news[news$n_unique_tokens != 701, ]  # this rate variable should be at most 1, so 701 is a clear data error
#Keep variables that are meaningful for our model
keepVars <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words",
              "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords",
              "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity",
              "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity",
              "abs_title_sentiment_polarity", "shares")
newsShort <- news[, keepVars]
#Standardize the numeric predictors (every column except shares, the last one).
#Note: this scales `news`, while the model below is built on `newsShort`, so the
#scaling does not change the Naive Bayes results reported here.
for (i in 1:(ncol(news) - 1)) {
  news[, i] <- scale(news[, i], center = TRUE, scale = TRUE)
}
#Define articles with shares greater than 1400 as popular articles
newsShort$shares <- as.factor(ifelse(newsShort$shares > 1400,1,0))
###Applying Naive Bayes to the News Popularity dataset after pre-processing the original data as above.
set.seed(12345)
news_rand <- newsShort[order(runif(39643)), ]
news_train <- news_rand[1:35678, ]
news_test <- news_rand[35679:39643, ]
prop.table(table(news_train$shares))
##
## 0 1
## 0.506867 0.493133
prop.table(table(news_test$shares))
##
## 0 1
## 0.5039092 0.4960908
naive_modelNews <- naive_bayes(as.character(shares) ~ ., data= news_train)
naive_modelNews
##
## ================================ Naive Bayes =================================
##
## Call:
## naive_bayes.formula(formula = as.character(shares) ~ ., data = news_train)
##
## ------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ------------------------------------------------------------------------------
##
## A priori probabilities:
##
## 0 1
## 0.506867 0.493133
##
## ------------------------------------------------------------------------------
##
## Tables:
##
## ------------------------------------------------------------------------------
## ::: n_tokens_title (Gaussian)
## ------------------------------------------------------------------------------
##
## n_tokens_title 0 1
## mean 10.491650 10.305047
## sd 2.088112 2.131096
##
## ------------------------------------------------------------------------------
## ::: n_tokens_content (Gaussian)
## ------------------------------------------------------------------------------
##
## n_tokens_content 0 1
## mean 527.6581 565.0161
## sd 426.8231 509.6529
##
## ------------------------------------------------------------------------------
## ::: n_unique_tokens (Gaussian)
## ------------------------------------------------------------------------------
##
## n_unique_tokens 0 1
## mean 0.5378909 0.5237544
## sd 0.1309307 0.1421511
##
## ------------------------------------------------------------------------------
## ::: n_non_stop_words (Gaussian)
## ------------------------------------------------------------------------------
##
## n_non_stop_words 0 1
## mean 0.9745631 0.9664658
## sd 0.1574522 0.1800318
##
## ------------------------------------------------------------------------------
## ::: num_hrefs (Gaussian)
## ------------------------------------------------------------------------------
##
## num_hrefs 0 1
## mean 9.889847 11.861487
## sd 9.795279 12.502676
##
## ------------------------------------------------------------------------------
##
## # ... and 11 more tables
##
## ------------------------------------------------------------------------------
###Let's check the accuracy of the Naive Bayes model on the news dataset.
conf_natNews <- table(predict(naive_modelNews, news_test), news_test$shares)
## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.
Accuracy <- sum(diag(conf_natNews))/sum(conf_natNews)*100
Accuracy
## [1] 53.97226
###We get a much lower accuracy of about 54% using Naive Bayes on the news popularity dataset, barely above the ~50.4% majority-class baseline visible in the test proportions above. Let's try SVM models.
news_classifier <- ksvm(shares ~ ., data = news_train, kernel = "vanilladot")
## Setting default kernel parameters
summary(news_classifier)
## Length Class Mode
## 1 ksvm S4
news_predictions <- predict(news_classifier, news_test)
(p<- table(news_predictions,news_test$shares))
##
## news_predictions 0 1
## 0 1401 1167
## 1 597 800
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 55.51072
###Using vanilladot as the kernel we get a slightly improved accuracy of 55.51%. Let's try the other kernels.
news_classifier <- ksvm(shares ~ ., data = news_train, kernel = "rbfdot")
summary(news_classifier)
## Length Class Mode
## 1 ksvm S4
news_predictions <- predict(news_classifier, news_test)
(p<- table(news_predictions,news_test$shares))
##
## news_predictions 0 1
## 0 1326 949
## 1 672 1018
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 59.11728
###Using rbfdot as the kernel we get an accuracy of about 59%, the best of the models so far. Let's try the final model for this lab, using the polydot kernel.
news_classifier <- ksvm(shares ~ ., data = news_train, kernel = "polydot")
## Setting default kernel parameters
summary(news_classifier)
## Length Class Mode
## 1 ksvm S4
news_predictions <- predict(news_classifier, news_test)
(p<- table(news_predictions,news_test$shares))
##
## news_predictions 0 1
## 0 1401 1166
## 1 597 801
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 55.53594
###Using polydot as the SVM kernel we get an accuracy of 55.54%, essentially the same as vanilladot (consistent with the default degree-1 polynomial being effectively linear). The highest accuracy we obtained on the news popularity dataset was 59.12% with the rbfdot kernel, making it the best model so far.
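###For the results report, a compact side-by-side of the news-dataset models; a small sketch that simply collects the accuracies printed above:
results <- data.frame(
  model    = c("naive_bayes", "ksvm vanilladot", "ksvm rbfdot", "ksvm polydot"),
  accuracy = c(53.97, 55.51, 59.12, 55.54)  # percentages reported above
)
results[order(-results$accuracy), ]  # best model first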