library(readr)
creditData <- read_csv("creditData.csv")
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
head(creditData)
str(creditData)
## tibble [1,000 x 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Creditability : num [1:1000] 1 1 1 1 1 1 1 1 1 1 ...
## $ Account Balance : num [1:1000] 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration of Credit (month) : num [1:1000] 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment Status of Previous Credit: num [1:1000] 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : num [1:1000] 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit Amount : num [1:1000] 1049 2799 841 2122 2171 ...
## $ Value Savings/Stocks : num [1:1000] 1 1 2 1 1 1 1 1 1 3 ...
## $ Length of current employment : num [1:1000] 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment per cent : num [1:1000] 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex & Marital Status : num [1:1000] 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : num [1:1000] 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration in Current address : num [1:1000] 4 2 4 2 4 3 4 4 4 4 ...
## $ Most valuable available asset : num [1:1000] 2 1 1 1 2 1 1 1 3 4 ...
## $ Age (years) : num [1:1000] 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent Credits : num [1:1000] 3 3 3 3 1 3 3 3 3 3 ...
## $ Type of apartment : num [1:1000] 1 1 1 1 2 1 2 2 2 1 ...
## $ No of Credits at this Bank : num [1:1000] 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : num [1:1000] 3 3 2 2 2 2 2 2 1 1 ...
## $ No of dependents : num [1:1000] 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : num [1:1000] 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign Worker : num [1:1000] 1 1 1 2 2 2 2 2 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. Creditability = col_double(),
## .. `Account Balance` = col_double(),
## .. `Duration of Credit (month)` = col_double(),
## .. `Payment Status of Previous Credit` = col_double(),
## .. Purpose = col_double(),
## .. `Credit Amount` = col_double(),
## .. `Value Savings/Stocks` = col_double(),
## .. `Length of current employment` = col_double(),
## .. `Instalment per cent` = col_double(),
## .. `Sex & Marital Status` = col_double(),
## .. Guarantors = col_double(),
## .. `Duration in Current address` = col_double(),
## .. `Most valuable available asset` = col_double(),
## .. `Age (years)` = col_double(),
## .. `Concurrent Credits` = col_double(),
## .. `Type of apartment` = col_double(),
## .. `No of Credits at this Bank` = col_double(),
## .. Occupation = col_double(),
## .. `No of dependents` = col_double(),
## .. Telephone = col_double(),
## .. `Foreign Worker` = col_double()
## .. )
summary(creditData)
## Creditability Account Balance Duration of Credit (month)
## Min. :0.0 Min. :1.000 Min. : 4.0
## 1st Qu.:0.0 1st Qu.:1.000 1st Qu.:12.0
## Median :1.0 Median :2.000 Median :18.0
## Mean :0.7 Mean :2.577 Mean :20.9
## 3rd Qu.:1.0 3rd Qu.:4.000 3rd Qu.:24.0
## Max. :1.0 Max. :4.000 Max. :72.0
## Payment Status of Previous Credit Purpose Credit Amount
## Min. :0.000 Min. : 0.000 Min. : 250
## 1st Qu.:2.000 1st Qu.: 1.000 1st Qu.: 1366
## Median :2.000 Median : 2.000 Median : 2320
## Mean :2.545 Mean : 2.828 Mean : 3271
## 3rd Qu.:4.000 3rd Qu.: 3.000 3rd Qu.: 3972
## Max. :4.000 Max. :10.000 Max. :18424
## Value Savings/Stocks Length of current employment Instalment per cent
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:2.000
## Median :1.000 Median :3.000 Median :3.000
## Mean :2.105 Mean :3.384 Mean :2.973
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :4.000
## Sex & Marital Status Guarantors Duration in Current address
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000
## Median :3.000 Median :1.000 Median :3.000
## Mean :2.682 Mean :1.145 Mean :2.845
## 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.:4.000
## Max. :4.000 Max. :3.000 Max. :4.000
## Most valuable available asset Age (years) Concurrent Credits
## Min. :1.000 Min. :19.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.:27.00 1st Qu.:3.000
## Median :2.000 Median :33.00 Median :3.000
## Mean :2.358 Mean :35.54 Mean :2.675
## 3rd Qu.:3.000 3rd Qu.:42.00 3rd Qu.:3.000
## Max. :4.000 Max. :75.00 Max. :3.000
## Type of apartment No of Credits at this Bank Occupation No of dependents
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:1.000
## Median :2.000 Median :1.000 Median :3.000 Median :1.000
## Mean :1.928 Mean :1.407 Mean :2.904 Mean :1.155
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:1.000
## Max. :3.000 Max. :4.000 Max. :4.000 Max. :2.000
## Telephone Foreign Worker
## Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :1.000
## Mean :1.404 Mean :1.037
## 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :2.000 Max. :2.000
names(creditData) <- make.names(names(creditData))
##Naïve Bayes Classifiers, Part 1
Q1- (one thing to make sure you do!): Remember that the class variable needs to be a categorical data type in order to build a Naïve Bayes Classifier. This means that you’ll need to convert your class variable.
#Missing values
sum(is.na(creditData))
## [1] 0
summary(creditData$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
creditData$Creditability <- as.factor(creditData$Creditability)
Q2- Compute the percentage of both classes similar to what you did in lab 1 and see if the distribution of both classes preserved for both training and testing data.
set.seed(12345)
credit_rand <- creditData[order(runif(1000)), ]
summary(creditData$Credit.Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.3146667 0.6853333
prop.table(table(credit_test$Creditability))
##
## 0 1
## 0.256 0.744
table(credit_train$Creditability)
##
## 0 1
## 236 514
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 4.0.3
## naivebayes 0.9.7 loaded
credit_model <- naive_bayes(Creditability~.,data=credit_train)
credit_model
##
## ================================== Naive Bayes ==================================
##
## Call:
## naive_bayes.formula(formula = Creditability ~ ., data = credit_train)
##
## ---------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ---------------------------------------------------------------------------------
##
## A priori probabilities:
##
## 0 1
## 0.3146667 0.6853333
##
## ---------------------------------------------------------------------------------
##
## Tables:
##
## ---------------------------------------------------------------------------------
## ::: Account.Balance (Gaussian)
## ---------------------------------------------------------------------------------
##
## Account.Balance 0 1
## mean 1.923729 2.793774
## sd 1.036826 1.252008
##
## ---------------------------------------------------------------------------------
## ::: Duration.of.Credit..month. (Gaussian)
## ---------------------------------------------------------------------------------
##
## Duration.of.Credit..month. 0 1
## mean 24.46610 19.20039
## sd 13.82208 11.13433
##
## ---------------------------------------------------------------------------------
## ::: Payment.Status.of.Previous.Credit (Gaussian)
## ---------------------------------------------------------------------------------
##
## Payment.Status.of.Previous.Credit 0 1
## mean 2.161017 2.665370
## sd 1.071649 1.045219
##
## ---------------------------------------------------------------------------------
## ::: Purpose (Gaussian)
## ---------------------------------------------------------------------------------
##
## Purpose 0 1
## mean 2.927966 2.803502
## sd 2.944722 2.633253
##
## ---------------------------------------------------------------------------------
## ::: Credit.Amount (Gaussian)
## ---------------------------------------------------------------------------------
##
## Credit.Amount 0 1
## mean 3964.195 2984.177
## sd 3597.093 2379.685
##
## ---------------------------------------------------------------------------------
##
## # ... and 15 more tables
##
## ---------------------------------------------------------------------------------
68.53 % were credit worthy and 31.5% as not credit worthy
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.0.3
credit_pred <- predict(credit_model, credit_test)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
p <- table(credit_pred, credit_test$Creditability)
CrossTable(credit_test$Creditability, credit_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability ', 'Predicted Creditability'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 250
##
##
## | Predicted Creditability
## Actual Creditability | 0 | 1 | Row Total |
## ----------------------|-----------|-----------|-----------|
## 0 | 42 | 22 | 64 |
## | 0.168 | 0.088 | |
## ----------------------|-----------|-----------|-----------|
## 1 | 35 | 151 | 186 |
## | 0.140 | 0.604 | |
## ----------------------|-----------|-----------|-----------|
## Column Total | 77 | 173 | 250 |
## ----------------------|-----------|-----------|-----------|
##
##
Accuracy <- sum(diag(p))/sum(p)*100
Accuracy
## [1] 77.2
The table indicates that for the 250 records in our test set 22 cases were misclassified, i.e. false negatives or a Type II error, and 35 actual defaults were misclassified as favorite, i.e. false positives or a Type I error.We have got an accuracy of 77.2% which is good.
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.3
## corrplot 0.84 loaded
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.3
#Randomize the data
credit_rand2 <- creditData[order(runif(1000)), ]
#Scale the data
creditDataScaled <- scale(credit_rand2[,2:ncol(credit_rand)], center=TRUE, scale = TRUE)
m <- cor(creditDataScaled)
corrplot(m, method="color")
highlycor <- findCorrelation(m, 0.40)
highlycor
## [1] 5 3
filteredData <- credit_rand[, -(highlycor)]
filteredTraining <- filteredData[1:750, ]
filteredTest <- filteredData[751:1000, ]
I have used 0.40 as teh threshold value for the variable selection.it is taking out the variables which are less important to decide the credibility eg. purpose.
nb_model <- naive_bayes(Creditability ~ ., data=filteredTraining)
filteredTestPred <- predict(nb_model, newdata = filteredTest)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
p <- table(filteredTestPred, filteredTest$Creditability)
Accuracy <- sum(diag(p))/sum(p)*100
Accuracy
## [1] 76
CrossTable(filteredTest$Creditability, filteredTestPred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual Creditability ', 'Predicted Creditability'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 250
##
##
## | Predicted Creditability
## Actual Creditability | 0 | 1 | Row Total |
## ----------------------|-----------|-----------|-----------|
## 0 | 43 | 21 | 64 |
## | 0.172 | 0.084 | |
## ----------------------|-----------|-----------|-----------|
## 1 | 39 | 147 | 186 |
## | 0.156 | 0.588 | |
## ----------------------|-----------|-----------|-----------|
## Column Total | 82 | 168 | 250 |
## ----------------------|-----------|-----------|-----------|
##
##
Q3- What is the accuracy this time? Be sure to include in your results report whether or not, after all this work, the performance of your Naïve Bayes Classifier was improved.
The table indicates that for the 250 records in our test set 21 cases were misclassified, i.e. false negatives or a Type II error, and 39 actual defaults were misclassified as favorite, i.e. false positives or a Type I error.We have got an accuracy of 76% which is less than the preious method.
The accuracy is reduced ,this happened because when we remove the variables due to high correlation , these variables also account for some variability in the target variable.Since we remove them , it also does not account for overall variability in the model.
letters <- read.csv("letterdata.csv")
str(letters)
## 'data.frame': 20000 obs. of 17 variables:
## $ letter: chr "T" "I" "D" "N" ...
## $ xbox : int 2 5 4 7 2 4 4 1 2 11 ...
## $ ybox : int 8 12 11 11 1 11 2 1 2 15 ...
## $ width : int 3 3 6 6 3 5 5 3 4 13 ...
## $ height: int 5 7 8 6 1 8 4 2 4 9 ...
## $ onpix : int 1 2 6 3 1 3 4 1 2 7 ...
## $ xbar : int 8 10 10 5 8 8 8 8 10 13 ...
## $ ybar : int 13 5 6 9 6 8 7 2 6 2 ...
## $ x2bar : int 0 5 2 4 6 6 6 2 2 6 ...
## $ y2bar : int 6 4 6 6 6 9 6 2 6 2 ...
## $ xybar : int 6 13 10 4 6 5 7 8 12 12 ...
## $ x2ybar: int 10 3 3 4 5 6 6 2 4 1 ...
## $ xy2bar: int 8 9 7 10 9 6 6 8 8 9 ...
## $ xedge : int 0 2 3 6 1 0 2 1 1 8 ...
## $ xedgey: int 8 8 7 10 7 8 8 6 6 1 ...
## $ yedge : int 0 4 3 2 5 9 7 2 1 1 ...
## $ yedgex: int 8 10 9 8 10 7 10 7 7 8 ...
letters_train <- letters[1:18000, ]
letters_test <- letters[18001:20000, ]
library(kernlab)
## Warning: package 'kernlab' was built under R version 4.0.3
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
letter_classifier <- ksvm(as.factor(letter) ~ ., data = letters_train,kernel ="vanilladot")
## Setting default kernel parameters
letter_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 7886
##
## Objective Function Value : -15.3458 -21.3403 -25.7672 -6.8685 -8.8812 -35.9555 -59.5883 -18.1975 -65.6075 -41.5654 -18.8559 -39.3558 -36.9961 -60.3052 -15.1694 -42.144 -35.0941 -19.4069 -15.8234 -38.6718 -33.3013 -8.5298 -12.4387 -38.2194 -14.3682 -9.5508 -165.7154 -53.2778 -79.2163 -134.5053 -184.4809 -58.9285 -46.3252 -81.004 -28.1341 -29.6955 -27.5983 -38.1764 -47.2889 -137.0497 -208.1396 -239.2616 -23.8945 -10.9655 -64.228 -12.2139 -55.7818 -10.8001 -21.2407 -11.1795 -121.5639 -33.2229 -267.3926 -81.0708 -9.4937 -4.6577 -161.5171 -86.7114 -20.9146 -16.8272 -86.6582 -16.7205 -30.3036 -20.0054 -26.2331 -29.9289 -56.1072 -11.6335 -5.2564 -14.8153 -4.983 -4.8171 -8.5044 -43.2267 -55.9 -214.755 -47.0748 -49.6539 -50.2278 -18.3767 -19.1813 -97.6132 -113.6502 -42.4112 -32.5859 -127.4807 -33.7418 -30.7568 -40.0953 -18.6792 -5.4826 -49.3916 -10.6142 -20.0286 -63.8287 -183.8297 -57.0671 -43.3721 -35.2783 -85.4451 -145.9585 -11.8002 -6.1194 -12.5323 -33.5245 -155.2248 -57.2602 -194.0785 -111.0155 -10.8207 -16.7926 -3.7766 -77.3561 -7.9004 -106.5759 -52.523 -107.0402 -78.0148 -74.4773 -24.8166 -13.2372 -7.8706 -27.2788 -13.2342 -280.2869 -32.7288 -25.9531 -149.5447 -153.8495 -10.0146 -40.8917 -6.7333 -65.2053 -72.818 -35.1252 -246.7046 -38.0738 -16.9126 -158.18 -184.0021 -50.8427 -28.7686 -164.5969 -97.8359 -386.1426 -160.3188 -181.8759 -38.3648 -37.2272 -60.116 -28.2074 -53.7383 -7.8729 -12.3159 -37.8942 -72.6434 -211.8342 -58.5023 -105.1605 -176.7259 -685.8994 -142.8147 -159.635 -366.9437 -37.6409 -73.1357 -175.1906 -131.2833 -41.1464 -77.8404 -57.8131 -8.6365 -251.3728 -14.0836 -36.5144 -2.2292 -6.1598 -16.8011 -26.5165 -67.19 -21.3366 -221.4815 -22.9219 -4.2616 -4.7901 -0.8263 -134.7538 -8.8843 -83.1109 -23.1019 -14.4251 -5.7337 -17.5244 -29.7925 -23.9243 -88.9084 -28.6719 -106.0564 -16.4981 -10.6486 -7.9315 -1.5742 -91.1706 -7.3819 -118.2628 -117.5543 -48.5606 -26.6093 -71.2968 -30.4913 -63.5712 -279.2921 -46.3025 -50.4912 -37.9431 -21.5243 -11.6202 -134.9023 -7.516 -5.8131 -10.1595 -13.6329 -27.0293 -25.7282 -151.8511 -39.0524 -105.4861 -34.2434 -15.7051 -10.2304 -3.6687 -98.2094 -7.4666 -15.2668 -75.1283 -116.5382 -16.6429 -14.9215 -55.1062 -3.0636 -8.4262 -93.6829 -38.1162 -123.1859 -4.9078 -9.1612 -1.3077 -102.9021 -23.1138 -8.5262 -57.2623 -3.4297 -20.9579 -78.2019 -50.3741 -62.3531 -6.4908 -21.9308 -2.3736 -84.3835 -126.3997 -114.8723 -26.4109 -21.5589 -61.6405 -34.9162 -66.3243 -25.1148 -6.7203 -4.6695 -65.3518 -39.7924 -67.3505 -36.2154 -10.9031 -62.2195 -14.9491 -24.3238 -65.0847 -4.9657 -64.2797 -278.2873 -14.6902 -13.9198 -18.2059 -9.8972 -78.2645 -17.454 -49.5929 -55.7786 -28.7673 -15.9476 -47.531 -17.4379 -71.0516 -5.6899 -6.2519 -97.5508 -3.8196 -7.0502 -1.1238 -147.6952 -28.2018 -414.2586 -32.3275 -35.1191 -4.9605 -90.2307 -151.3409 -90.0329 -27.9491 -42.4688 -12.5118 -26.4828 -2.0045 -62.195 -9.1662 -178.4616 -1.9406 -1.9871 -11.3982 -0.5214 -29.6136 -35.0449 -6.7569
## Training error : 0.1335
letter_predictions <- predict(letter_classifier, letters_test)
p = table(letter_predictions, letters_test$letter)
Accuracy <- sum(diag(p))/sum(p)*100
Accuracy
## [1] 83.95
agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE TRUE
## 321 1679
This tells us that the classification was correct in 1,679 out of our 2000 test records. Now Accuracy is 83.95.
Q4- We may be able to do better than this by changing the Kernels. Try Polynomial and RBF kernels to improve the result.
In the below results shows that the RBF kernel is best with 93.35 accuracy.the classification was correct in 1,867 out of our 2000 test records.
#Polynomial kernels SVM
letter_classifier <- ksvm(as.factor(letter) ~ ., data = letters_train,kernel ="polydot")
## Setting default kernel parameters
letter_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Polynomial kernel function.
## Hyperparameters : degree = 1 scale = 1 offset = 1
##
## Number of Support Vectors : 7887
##
## Objective Function Value : -15.3458 -21.3403 -25.7672 -6.8685 -8.8812 -35.9555 -59.5883 -18.1975 -65.6075 -41.5654 -18.8559 -39.3558 -36.9961 -60.3052 -15.1694 -42.144 -35.0941 -19.4069 -15.8234 -38.6718 -33.3013 -8.5298 -12.4387 -38.2194 -14.3682 -9.5508 -165.7154 -53.2778 -79.2163 -134.5052 -184.481 -58.9285 -46.3252 -81.004 -28.1341 -29.6955 -27.5983 -38.1764 -47.2889 -137.0497 -208.1397 -239.2617 -23.8945 -10.9655 -64.2279 -12.2139 -55.7818 -10.8001 -21.2407 -11.1795 -121.5639 -33.2229 -267.3926 -81.0708 -9.4937 -4.6577 -161.5171 -86.7114 -20.9146 -16.8272 -86.6582 -16.7205 -30.3036 -20.0054 -26.233 -29.9289 -56.1072 -11.6335 -5.2564 -14.8153 -4.983 -4.8171 -8.5044 -43.2267 -55.9 -214.755 -47.0748 -49.6539 -50.2278 -18.3767 -19.1813 -97.6132 -113.6502 -42.4112 -32.5859 -127.4807 -33.7418 -30.7568 -40.0953 -18.6792 -5.4826 -49.3916 -10.6142 -20.0286 -63.8287 -183.8297 -57.0671 -43.3721 -35.2782 -85.4452 -145.9585 -11.8002 -6.1194 -12.5323 -33.5245 -155.2248 -57.2602 -194.0785 -111.0155 -10.8207 -16.7926 -3.7766 -77.3561 -7.9004 -106.5759 -52.523 -107.0402 -78.0148 -74.4773 -24.8166 -13.2372 -7.8706 -27.2788 -13.2342 -280.2868 -32.7288 -25.9531 -149.5447 -153.8495 -10.0146 -40.8917 -6.7333 -65.2053 -72.8179 -35.1252 -246.7046 -38.0738 -16.9126 -158.18 -184.0021 -50.8427 -28.7687 -164.597 -97.8359 -386.1426 -160.3188 -181.8759 -38.3648 -37.2272 -60.116 -28.2074 -53.7383 -7.8729 -12.3159 -37.8942 -72.6434 -211.8342 -58.5022 -105.1605 -176.7259 -685.8994 -142.8147 -159.635 -366.9437 -37.641 -73.1357 -175.1906 -131.2833 -41.1464 -77.8404 -57.8131 -8.6365 -251.3728 -14.0836 -36.5144 -2.2292 -6.1598 -16.8011 -26.5165 -67.19 -21.3366 -221.4815 -22.9219 -4.2616 -4.7901 -0.8263 -134.7538 -8.8843 -83.1109 -23.1019 -14.4251 -5.7337 -17.5244 -29.7925 -23.9243 -88.9083 -28.6719 -106.0564 -16.4981 -10.6486 -7.9315 -1.5742 -91.1706 -7.3819 -118.2628 -117.5543 -48.5606 -26.6093 -71.2968 -30.4913 -63.5712 -279.292 -46.3024 -50.4912 -37.9431 -21.5243 -11.6202 -134.9025 -7.516 -5.8131 -10.1595 -13.6329 -27.0294 -25.7282 -151.851 -39.0524 -105.4861 -34.2434 -15.7051 -10.2304 -3.6687 -98.2094 -7.4666 -15.2668 -75.1283 -116.5383 -16.6429 -14.9215 -55.1062 -3.0636 -8.4262 -93.6829 -38.1162 -123.1859 -4.9078 -9.1612 -1.3077 -102.9021 -23.1138 -8.5262 -57.2623 -3.4297 -20.9579 -78.2019 -50.3741 -62.3531 -6.4908 -21.9308 -2.3736 -84.3835 -126.3997 -114.8725 -26.4109 -21.5589 -61.6405 -34.9162 -66.3243 -25.1148 -6.7203 -4.6695 -65.3518 -39.7924 -67.3505 -36.2154 -10.9031 -62.2195 -14.9491 -24.3238 -65.0847 -4.9657 -64.2797 -278.2874 -14.6902 -13.9198 -18.2059 -9.8972 -78.2645 -17.454 -49.5929 -55.7786 -28.7673 -15.9476 -47.531 -17.4379 -71.0516 -5.6899 -6.2519 -97.5508 -3.8196 -7.0502 -1.1238 -147.6952 -28.2018 -414.2587 -32.3275 -35.1191 -4.9605 -90.2308 -151.3409 -90.0329 -27.9491 -42.4688 -12.5118 -26.4828 -2.0045 -62.195 -9.1662 -178.4617 -1.9406 -1.9871 -11.3982 -0.5214 -29.6136 -35.0449 -6.7569
## Training error : 0.1335
letter_predictions <- predict(letter_classifier, letters_test)
p = table(letter_predictions, letters_test$letter)
Accuracy <- sum(diag(p))/sum(p)*100
Accuracy
## [1] 84
agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE TRUE
## 320 1680
This tells us that the classification was correct in 1,680 out of our 2000 test records. The accuracy is 84%
#RBF kernels SVM
letter_classifier <- ksvm(as.factor(letter) ~ ., data = letters_train,kernel ="rbfdot")
letter_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0472856751333276
##
## Number of Support Vectors : 9522
##
## Objective Function Value : -45.068 -35.4757 -61.1237 -27.7907 -36.4603 -49.0627 -72.4834 -41.0048 -71.1378 -54.2509 -48.7404 -62.9416 -69.9208 -51.8761 -37.2656 -50.2077 -61.3152 -46.7269 -42.1529 -68.0496 -51.0068 -39.5543 -43.3192 -59.4686 -43.0489 -37.0312 -175.8505 -113.2333 -101.3396 -124.3197 -194.513 -85.2816 -82.3514 -106.6628 -67.7708 -66.3184 -60.4829 -74.7244 -86.8999 -105.7321 -255.1937 -202.9954 -54.6302 -43.7191 -83.1566 -44.631 -107.619 -41.4851 -64.9836 -35.1415 -156.9712 -46.4411 -233.9847 -85.4517 -36.736 -27.5488 -125.9081 -88.3635 -39.9353 -42.9862 -94.0438 -47.8249 -60.866 -50.3466 -53.5631 -63.0599 -91.8204 -38.5397 -27.7093 -51.5103 -28.9676 -29.688 -42.867 -77.6766 -86.7789 -229.0298 -78.5526 -93.4866 -87.7459 -43.4439 -54.9169 -126.981 -149.1979 -79.7537 -70.8292 -159.5355 -96.3539 -60.4871 -66.4617 -43.2212 -29.2534 -104.1644 -39.3303 -64.4109 -85.6034 -188.9203 -109.4053 -76.4357 -57.4581 -151.2605 -127.2811 -40.6934 -26.7698 -47.4028 -65.7925 -101.7349 -86.9243 -187.9294 -105.0938 -39.5062 -44.9652 -25.2312 -155.1584 -32.3489 -153.1351 -73.9524 -109.154 -103.7196 -91.8203 -55.0493 -40.5303 -34.2454 -53.6705 -38.7576 -242.8607 -44.7822 -58.9175 -136.1429 -143.3246 -36.5523 -61.5641 -32.2199 -84.9382 -82.8876 -55.8831 -174.4871 -62.5113 -51.4646 -130.9532 -127.289 -71.9613 -56.4582 -165.7605 -98.0346 -190.3282 -141.7568 -116.0067 -71.3192 -73.5917 -80.7562 -61.8935 -96.6942 -37.0531 -45.0699 -65.3472 -88.8936 -225.0344 -82.6592 -97.25 -183.8182 -228.777 -113.7945 -132.2202 -253.7498 -93.8184 -85.7501 -141.955 -96.7489 -66.3552 -142.9304 -67.299 -45.4005 -185.417 -50.4301 -63.0946 -20.1681 -28.8991 -39.4485 -63.743 -63.1508 -52.1296 -145.7413 -51.3411 -24.8724 -23.796 -14.9319 -133.4394 -31.292 -86.5325 -49.6102 -50.8745 -28.0592 -42.8325 -55.6295 -55.7383 -60.8545 -59.7283 -119.1918 -48.1552 -36.8772 -27.1907 -17.067 -99.2173 -32.0584 -89.8497 -101.8716 -67.3186 -67.1192 -62.2483 -53.5223 -63.1039 -195.2706 -78.4711 -71.501 -79.0982 -49.8285 -43.1706 -190.3124 -31.9702 -40.3842 -34.6775 -39.6438 -51.0436 -53.649 -93.3179 -86.2953 -104.2327 -53.7924 -39.4504 -33.9345 -24.2052 -91.3393 -30.5823 -51.2496 -105.6786 -70.8294 -42.6057 -41.8111 -80.5221 -28.5596 -35.9553 -97.5454 -71.6715 -128.2217 -27.5813 -33.9253 -16.5173 -110.1321 -55.923 -44.5816 -98.0165 -31.6628 -43.7738 -99.6683 -70.8738 -79.2039 -42.764 -49.5717 -22.8451 -84.0768 -210.4489 -101.4114 -57.7494 -49.0711 -101.9206 -58.5467 -81.8676 -79.1256 -29.7675 -30.6112 -67.198 -71.941 -74.9827 -64.5243 -36.8208 -75.6632 -49.5469 -51.9236 -78.3021 -29.4058 -88.5447 -108.1962 -42.3913 -48.9789 -43.8053 -37.8094 -83.1124 -49.4488 -67.7982 -94.1202 -60.4357 -49.4828 -75.9437 -51.6446 -105.2105 -31.0395 -39.2052 -106.5495 -31.5897 -36.3352 -18.8946 -160.026 -49.5644 -239.91 -61.561 -72.3322 -29.669 -92.6541 -166.4571 -80.3586 -67.2158 -79.2135 -50.4271 -55.907 -21.08 -113.6714 -34.7217 -192.5426 -19.4235 -20.4897 -43.8845 -11.1356 -47.3078 -107.6987 -36.577
## Training error : 0.049833
letter_predictions <- predict(letter_classifier, letters_test)
p = table(letter_predictions, letters_test$letter)
Accuracy <- sum(diag(p))/sum(p)*100
Accuracy
## [1] 93.35
agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE TRUE
## 133 1867
This tells us that the classification was correct in 1,867 out of our 2000 test records. The accuracy which gave 93.35 to me which is best so far.
library(readr)
onlineData <- read_csv("OnlineNewsPopularity_for_R.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## url = col_character()
## )
## See spec(...) for full column specifications.
str(onlineData)
## tibble [39,644 x 61] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ url : chr [1:39644] "http://mashable.com/2013/01/07/amazon-instant-video-browser/" "http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/" "http://mashable.com/2013/01/07/apple-40-billion-app-downloads/" "http://mashable.com/2013/01/07/astronaut-notre-dame-bcs/" ...
## $ timedelta : num [1:39644] 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num [1:39644] 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num [1:39644] 219 255 211 531 1072 ...
## $ n_unique_tokens : num [1:39644] 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num [1:39644] 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num [1:39644] 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num [1:39644] 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num [1:39644] 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num [1:39644] 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num [1:39644] 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num [1:39644] 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num [1:39644] 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num [1:39644] 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num [1:39644] 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num [1:39644] 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num [1:39644] 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num [1:39644] 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num [1:39644] 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num [1:39644] 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num [1:39644] 496 0 918 0 3151 ...
## $ weekday_is_monday : num [1:39644] 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num [1:39644] 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num [1:39644] 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num [1:39644] 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num [1:39644] 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num [1:39644] 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num [1:39644] 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num [1:39644] 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num [1:39644] 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num [1:39644] 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num [1:39644] 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num [1:39644] 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num [1:39644] 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num [1:39644] 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num [1:39644] 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num [1:39644] 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num [1:39644] -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num [1:39644] -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num [1:39644] -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num [1:39644] 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num [1:39644] -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num [1:39644] 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num [1:39644] 0.188 0 0 0 0.136 ...
## $ shares : num [1:39644] 593 711 1500 1200 505 855 556 891 3600 710 ...
## - attr(*, "spec")=
## .. cols(
## .. url = col_character(),
## .. timedelta = col_double(),
## .. n_tokens_title = col_double(),
## .. n_tokens_content = col_double(),
## .. n_unique_tokens = col_double(),
## .. n_non_stop_words = col_double(),
## .. n_non_stop_unique_tokens = col_double(),
## .. num_hrefs = col_double(),
## .. num_self_hrefs = col_double(),
## .. num_imgs = col_double(),
## .. num_videos = col_double(),
## .. average_token_length = col_double(),
## .. num_keywords = col_double(),
## .. data_channel_is_lifestyle = col_double(),
## .. data_channel_is_entertainment = col_double(),
## .. data_channel_is_bus = col_double(),
## .. data_channel_is_socmed = col_double(),
## .. data_channel_is_tech = col_double(),
## .. data_channel_is_world = col_double(),
## .. kw_min_min = col_double(),
## .. kw_max_min = col_double(),
## .. kw_avg_min = col_double(),
## .. kw_min_max = col_double(),
## .. kw_max_max = col_double(),
## .. kw_avg_max = col_double(),
## .. kw_min_avg = col_double(),
## .. kw_max_avg = col_double(),
## .. kw_avg_avg = col_double(),
## .. self_reference_min_shares = col_double(),
## .. self_reference_max_shares = col_double(),
## .. self_reference_avg_sharess = col_double(),
## .. weekday_is_monday = col_double(),
## .. weekday_is_tuesday = col_double(),
## .. weekday_is_wednesday = col_double(),
## .. weekday_is_thursday = col_double(),
## .. weekday_is_friday = col_double(),
## .. weekday_is_saturday = col_double(),
## .. weekday_is_sunday = col_double(),
## .. is_weekend = col_double(),
## .. LDA_00 = col_double(),
## .. LDA_01 = col_double(),
## .. LDA_02 = col_double(),
## .. LDA_03 = col_double(),
## .. LDA_04 = col_double(),
## .. global_subjectivity = col_double(),
## .. global_sentiment_polarity = col_double(),
## .. global_rate_positive_words = col_double(),
## .. global_rate_negative_words = col_double(),
## .. rate_positive_words = col_double(),
## .. rate_negative_words = col_double(),
## .. avg_positive_polarity = col_double(),
## .. min_positive_polarity = col_double(),
## .. max_positive_polarity = col_double(),
## .. avg_negative_polarity = col_double(),
## .. min_negative_polarity = col_double(),
## .. max_negative_polarity = col_double(),
## .. title_subjectivity = col_double(),
## .. title_sentiment_polarity = col_double(),
## .. abs_title_subjectivity = col_double(),
## .. abs_title_sentiment_polarity = col_double(),
## .. shares = col_double()
## .. )
#I have selected the columns which might be helpful in determining the newS popularity(same as in lab1)
onlineData <- onlineData[,c(3,4,5,7,10,11,12,13,14:19,24,26,30,31,39,45:49,54,58,57,61)]
dim(onlineData)
## [1] 39644 28
sum(is.na(onlineData))#No missing data
## [1] 0
onlineData$shares <- factor(ifelse(onlineData$shares>=1400, "favorite", "notfavorite"))
head(onlineData$shares)
## [1] notfavorite notfavorite favorite notfavorite notfavorite notfavorite
## Levels: favorite notfavorite
summary(onlineData$n_tokens_title)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 9.0 10.0 10.4 12.0 23.0
set.seed(12345)
online_rand <- onlineData[order(runif(24000)), ]
summary(onlineData$n_tokens_title)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 9.0 10.0 10.4 12.0 23.0
online_train <- online_rand[1:18000, ]
online_test <- online_rand[18001:24000, ]
prop.table(table(online_train$shares))
##
## favorite notfavorite
## 0.5461667 0.4538333
prop.table(table(online_test$shares))
##
## favorite notfavorite
## 0.551 0.449
online_model <- naive_bayes(shares~.,data=online_train)
online_model
##
## ================================== Naive Bayes ==================================
##
## Call:
## naive_bayes.formula(formula = shares ~ ., data = online_train)
##
## ---------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ---------------------------------------------------------------------------------
##
## A priori probabilities:
##
## favorite notfavorite
## 0.5461667 0.4538333
##
## ---------------------------------------------------------------------------------
##
## Tables:
##
## ---------------------------------------------------------------------------------
## ::: n_tokens_title (Gaussian)
## ---------------------------------------------------------------------------------
##
## n_tokens_title favorite notfavorite
## mean 9.960431 10.165626
## sd 2.018974 1.973344
##
## ---------------------------------------------------------------------------------
## ::: n_tokens_content (Gaussian)
## ---------------------------------------------------------------------------------
##
## n_tokens_content favorite notfavorite
## mean 561.3095 499.8847
## sd 494.7044 410.6161
##
## ---------------------------------------------------------------------------------
## ::: n_unique_tokens (Gaussian)
## ---------------------------------------------------------------------------------
##
## n_unique_tokens favorite notfavorite
## mean 0.5419214 0.5587277
## sd 0.1218511 0.1135802
##
## ---------------------------------------------------------------------------------
## ::: n_non_stop_unique_tokens (Gaussian)
## ---------------------------------------------------------------------------------
##
## n_non_stop_unique_tokens favorite notfavorite
## mean 0.6844829 0.7023427
## sd 0.1270336 0.1192500
##
## ---------------------------------------------------------------------------------
## ::: num_imgs (Gaussian)
## ---------------------------------------------------------------------------------
##
## num_imgs favorite notfavorite
## mean 4.990133 4.190231
## sd 8.853298 8.419466
##
## ---------------------------------------------------------------------------------
##
## # ... and 22 more tables
##
## ---------------------------------------------------------------------------------
online_pred <- predict(online_model, online_test)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
p <- table(online_pred, online_test$shares)
CrossTable(online_test$shares, online_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual shares ', 'Predicted shares'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 6000
##
##
## | Predicted shares
## Actual shares | favorite | notfavorite | Row Total |
## ---------------|-------------|-------------|-------------|
## favorite | 1308 | 1998 | 3306 |
## | 0.218 | 0.333 | |
## ---------------|-------------|-------------|-------------|
## notfavorite | 589 | 2105 | 2694 |
## | 0.098 | 0.351 | |
## ---------------|-------------|-------------|-------------|
## Column Total | 1897 | 4103 | 6000 |
## ---------------|-------------|-------------|-------------|
##
##
Accuracy <- sum(diag(p))/sum(p)*100
Accuracy
## [1] 56.88333
The table indicates that for the 6000 records in our test set 1998 cases were misclassified, i.e. false negatives or a Type II error, and 589 actual defaults were misclassified as favorite, i.e. false positives or a Type I error.We have got an accuracy of 56.88% which is okay,not so good.
#Randomize the data
online_rand2 <- onlineData[order(runif(24000)), ]
#Scale the data
onlineDataScaled <- scale(online_rand2[,1:27], center=TRUE, scale = TRUE)
m1 <- cor(onlineDataScaled)
highlycor <- findCorrelation(m1, 0.40)
highlycor
## [1] 24 4 3 21 17
corrplot(m1, method="color")
filteredData_online <- online_rand2[, -(highlycor)]
filteredTraining_online <- filteredData_online[1:18000, ]
filteredTest_online <- filteredData_online[18001:24000, ]
nb_online_model <- naive_bayes(shares ~ ., data=filteredTraining_online)
nb_online_model
##
## ================================== Naive Bayes ==================================
##
## Call:
## naive_bayes.formula(formula = shares ~ ., data = filteredTraining_online)
##
## ---------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ---------------------------------------------------------------------------------
##
## A priori probabilities:
##
## favorite notfavorite
## 0.5460556 0.4539444
##
## ---------------------------------------------------------------------------------
##
## Tables:
##
## ---------------------------------------------------------------------------------
## ::: n_tokens_title (Gaussian)
## ---------------------------------------------------------------------------------
##
## n_tokens_title favorite notfavorite
## mean 9.944043 10.148819
## sd 1.998759 1.977630
##
## ---------------------------------------------------------------------------------
## ::: n_tokens_content (Gaussian)
## ---------------------------------------------------------------------------------
##
## n_tokens_content favorite notfavorite
## mean 556.2243 498.1637
## sd 489.4359 402.0742
##
## ---------------------------------------------------------------------------------
## ::: num_imgs (Gaussian)
## ---------------------------------------------------------------------------------
##
## num_imgs favorite notfavorite
## mean 4.956048 4.211724
## sd 8.819731 8.380455
##
## ---------------------------------------------------------------------------------
## ::: num_videos (Gaussian)
## ---------------------------------------------------------------------------------
##
## num_videos favorite notfavorite
## mean 1.329332 1.179170
## sd 4.480784 4.191553
##
## ---------------------------------------------------------------------------------
## ::: average_token_length (Gaussian)
## ---------------------------------------------------------------------------------
##
## average_token_length favorite notfavorite
## mean 4.6207953 4.6450876
## sd 0.5757248 0.5212896
##
## ---------------------------------------------------------------------------------
##
## # ... and 17 more tables
##
## ---------------------------------------------------------------------------------
filteredPred <- predict(nb_online_model, newdata = filteredTest_online)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
p <- table(filteredPred, filteredTest_online$shares)
Accuracy <- sum(diag(p))/sum(p)*100
Accuracy
## [1] 59.43333
CrossTable(filteredTest_online$shares, filteredPred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('Actual shares ', 'Predicted shares'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 6000
##
##
## | Predicted shares
## Actual shares | favorite | notfavorite | Row Total |
## ---------------|-------------|-------------|-------------|
## favorite | 1425 | 1883 | 3308 |
## | 0.237 | 0.314 | |
## ---------------|-------------|-------------|-------------|
## notfavorite | 551 | 2141 | 2692 |
## | 0.092 | 0.357 | |
## ---------------|-------------|-------------|-------------|
## Column Total | 1976 | 4024 | 6000 |
## ---------------|-------------|-------------|-------------|
##
##
The table indicates that for the 6000 records in our test set 1823 cases were misclassified, i.e. false negatives or a Type II error, and 695 actual defaults were misclassified as favorite, i.e. false positives or a Type I error.We have got an accuracy of 59.43% which is okay,not so good but better than naive bayes.
library(kernlab)
online_classifier <- ksvm(shares ~ ., data = online_train,kernel ="vanilladot")
## Setting default kernel parameters
online_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 14980
##
## Objective Function Value : -14482.55
## Training error : 0.402278
We have a initial training error of 40%
online_predictions <- predict(online_classifier, online_test)
p= table(online_predictions, online_test$shares)
Accuracy <- sum(diag(p))/sum(p)*100
Accuracy
## [1] 60.03333
agreement <- online_predictions == online_test$shares
table(agreement)
## agreement
## FALSE TRUE
## 2398 3602
This tells us that the classification was correct in 3602 out of our 6000 test records. The accuracy is 60.33%
#Polynomial kernels SVM
online_classifier <- ksvm(shares ~ ., data = online_train,kernel ="polydot")
## Setting default kernel parameters
online_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Polynomial kernel function.
## Hyperparameters : degree = 1 scale = 1 offset = 1
##
## Number of Support Vectors : 14998
##
## Objective Function Value : -14482.55
## Training error : 0.402278
online_predictions <- predict(online_classifier, online_test)
p = table(online_predictions, online_test$shares)
Accuracy <- sum(diag(p))/sum(p)*100
Accuracy
## [1] 60.03333
agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE TRUE
## 133 1867
This tells us that the classification was correct in 1,680 out of our 2000 test records. The accuracy is 60%
#RBF kernels SVM
online_classifier <- ksvm(shares ~ ., data = online_train,kernel ="rbfdot")
online_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0268469279091559
##
## Number of Support Vectors : 14003
##
## Objective Function Value : -12995.36
## Training error : 0.3455
online_predictions <- predict(online_classifier, online_test)
p= table(online_predictions, online_test$shares)
Accuracy <- sum(diag(p))/sum(p)*100
Accuracy
## [1] 63.08333
agreement <- online_predictions == online_test$shares
table(agreement)
## agreement
## FALSE TRUE
## 2215 3785
This tells us that the classification was correct in 3785 out of our 6000 test records. The accuracy is 63.08%.This is the best accuracy with SVM
We have used decision tress and Random forest on the news popularity data set.Decision tress accounted for 63.33% of accuracy and Random forest accounted for 64.71% of accuracy.
In current study, Naive bayes accounted for 56.88 % accuracy Naive Bayes Classifier (with feature selection )accounted for 59.43% and the Support vector machine gave 63.08% accuracy with RBF .
So far, with respect to accuracy Random forest is the best model.Support vector machine with RBF also provides good accuracy.This can also be considered with such problems where we have risk factor involved as it provides a good classification method