library (C50)
## Warning: package 'C50' was built under R version 3.6.3
library (gmodels)
## Warning: package 'gmodels' was built under R version 3.6.3
library (rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.6.3
## Loading required package: rpart
library (ca)
## Warning: package 'ca' was built under R version 3.6.3
library (class)
## Warning: package 'class' was built under R version 3.6.3
library (ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library (MASS)
## Warning: package 'MASS' was built under R version 3.6.3
library (colorspace)
library (bnlearn)
## Warning: package 'bnlearn' was built under R version 3.6.3
##
## Attaching package: 'bnlearn'
## The following object is masked from 'package:stats':
##
## sigma
library (tm)
## Warning: package 'tm' was built under R version 3.6.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library (e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:bnlearn':
##
## impute
library (kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library (SnowballC)
## Warning: package 'SnowballC' was built under R version 3.6.3
library (klaR)
## Warning: package 'klaR' was built under R version 3.6.3
library (corrplot)
## corrplot 0.84 loaded
library (caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.3
library (leaps)
## Warning: package 'leaps' was built under R version 3.6.3
library (dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library (colorspace)
library (naivebayes)
## Warning: package 'naivebayes' was built under R version 3.6.3
## naivebayes 0.9.7 loaded
library (psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:kernlab':
##
## alpha
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
#Read Data/ Data Collection
creditData <- read.csv("C:/Users/punthakur/Documents/HU - ANALYTICS/530-Machine Learning/creditData.csv")
str(creditData)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
#Exploring and Preparing the Data
creditData$Creditability <- as.factor (creditData$Creditability)
sum (is.na(creditData)) #No missing values
## [1] 0
#Split data into 75% training and 25% testing (Note: there are no missing vales)
set.seed (12345)
credit_rand <- creditData[order(runif(1000)), ]
credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]
#Check percentages for both training and testing datasets
prop.table(table(credit_train$Creditability))
##
## 0 1
## 0.3146667 0.6853333
#Model Training
naive_model1 <- naive_bayes (Creditability ~. , data = credit_train)
naive_model1
##
## ================================== Naive Bayes ==================================
##
## Call:
## naive_bayes.formula(formula = Creditability ~ ., data = credit_train)
##
## ---------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ---------------------------------------------------------------------------------
##
## A priori probabilities:
##
## 0 1
## 0.3146667 0.6853333
##
## ---------------------------------------------------------------------------------
##
## Tables:
##
## ---------------------------------------------------------------------------------
## ::: Account.Balance (Gaussian)
## ---------------------------------------------------------------------------------
##
## Account.Balance 0 1
## mean 1.923729 2.793774
## sd 1.036826 1.252008
##
## ---------------------------------------------------------------------------------
## ::: Duration.of.Credit..month. (Gaussian)
## ---------------------------------------------------------------------------------
##
## Duration.of.Credit..month. 0 1
## mean 24.46610 19.20039
## sd 13.82208 11.13433
##
## ---------------------------------------------------------------------------------
## ::: Payment.Status.of.Previous.Credit (Gaussian)
## ---------------------------------------------------------------------------------
##
## Payment.Status.of.Previous.Credit 0 1
## mean 2.161017 2.665370
## sd 1.071649 1.045219
##
## ---------------------------------------------------------------------------------
## ::: Purpose (Gaussian)
## ---------------------------------------------------------------------------------
##
## Purpose 0 1
## mean 2.927966 2.803502
## sd 2.944722 2.633253
##
## ---------------------------------------------------------------------------------
## ::: Credit.Amount (Gaussian)
## ---------------------------------------------------------------------------------
##
## Credit.Amount 0 1
## mean 3964.195 2984.177
## sd 3597.093 2379.685
##
## ---------------------------------------------------------------------------------
##
## # ... and 15 more tables
##
## ---------------------------------------------------------------------------------
#Model evaluation
(conf_mat <- table(predict(naive_model1, credit_test), credit_test$Creditability))
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
##
## 0 1
## 0 42 35
## 1 22 151
(Accuracy <- sum(diag(conf_mat))/sum(conf_mat)*100)
## [1] 77.2
#The model has an accuracy of 77.2%.
#The model accurately predicts no-risk 42 times and inaccurately predicts no-risk 35 times, while accurately predicts rosk 151 times and inaccurately predicts risk 22 times.
#PART2
#Improve the Performance
#Exploring and preparing the Data
credit_rand <- creditData[order(runif(1000)), ]
creditDataScaled <- scale(credit_rand[,2:ncol(credit_rand)], center=TRUE, scale = TRUE)
m <- cor(creditDataScaled)
highlycor <- findCorrelation(m, 0.30)
filteredData <- credit_rand[, -(highlycor[5]+1)]
filteredTraining <- filteredData[1:750, ]
filteredTest <- filteredData[751:1000, ]
#Training a model
nb_model <- naive_bayes(Creditability ~ ., data=filteredTraining)
#Evaluate the Model
filteredTestPred <- predict(nb_model, newdata = filteredTest)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
table(filteredTestPred, filteredTest$Creditability)
##
## filteredTestPred 0 1
## 0 49 44
## 1 21 136
(conf_mat2 <- table(filteredTestPred, filteredTest$Creditability))
##
## filteredTestPred 0 1
## 0 49 44
## 1 21 136
(Accuracy <- sum(diag(conf_mat2))/sum(conf_mat2)*100)
## [1] 74
#The accuracy reduces to 74%
#Laboratory 2: Support Vector Machine, Part 3
letters <- read.csv("letterdata.csv")
str (letters)
## 'data.frame': 20000 obs. of 17 variables:
## $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
## $ xbox : int 2 5 4 7 2 4 4 1 2 11 ...
## $ ybox : int 8 12 11 11 1 11 2 1 2 15 ...
## $ width : int 3 3 6 6 3 5 5 3 4 13 ...
## $ height: int 5 7 8 6 1 8 4 2 4 9 ...
## $ onpix : int 1 2 6 3 1 3 4 1 2 7 ...
## $ xbar : int 8 10 10 5 8 8 8 8 10 13 ...
## $ ybar : int 13 5 6 9 6 8 7 2 6 2 ...
## $ x2bar : int 0 5 2 4 6 6 6 2 2 6 ...
## $ y2bar : int 6 4 6 6 6 9 6 2 6 2 ...
## $ xybar : int 6 13 10 4 6 5 7 8 12 12 ...
## $ x2ybar: int 10 3 3 4 5 6 6 2 4 1 ...
## $ xy2bar: int 8 9 7 10 9 6 6 8 8 9 ...
## $ xedge : int 0 2 3 6 1 0 2 1 1 8 ...
## $ xedgey: int 8 8 7 10 7 8 8 6 6 1 ...
## $ yedge : int 0 4 3 2 5 9 7 2 1 1 ...
## $ yedgex: int 8 10 9 8 10 7 10 7 7 8 ...
#Preparing the Data
letters_train <- letters[1:18000, ]
letters_test <- letters[18001:20000, ]
#Training a model
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "vanilladot")
## Setting default kernel parameters
letter_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 7886
##
## Objective Function Value : -15.3458 -21.3403 -25.7672 -6.8685 -8.8812 -35.9555 -59.5883 -18.1975 -65.6075 -41.5654 -18.8559 -39.3558 -36.9961 -60.3052 -15.1694 -42.144 -35.0941 -19.4069 -15.8234 -38.6718 -33.3013 -8.5298 -12.4387 -38.2194 -14.3682 -9.5508 -165.7154 -53.2778 -79.2163 -134.5053 -184.4809 -58.9285 -46.3252 -81.004 -28.1341 -29.6955 -27.5983 -38.1764 -47.2889 -137.0497 -208.1396 -239.2616 -23.8945 -10.9655 -64.228 -12.2139 -55.7818 -10.8001 -21.2407 -11.1795 -121.5639 -33.2229 -267.3926 -81.0708 -9.4937 -4.6577 -161.5171 -86.7114 -20.9146 -16.8272 -86.6582 -16.7205 -30.3036 -20.0054 -26.2331 -29.9289 -56.1072 -11.6335 -5.2564 -14.8153 -4.983 -4.8171 -8.5044 -43.2267 -55.9 -214.755 -47.0748 -49.6539 -50.2278 -18.3767 -19.1813 -97.6132 -113.6502 -42.4112 -32.5859 -127.4807 -33.7418 -30.7568 -40.0953 -18.6792 -5.4826 -49.3916 -10.6142 -20.0286 -63.8287 -183.8297 -57.0671 -43.3721 -35.2783 -85.4451 -145.9585 -11.8002 -6.1194 -12.5323 -33.5245 -155.2248 -57.2602 -194.0785 -111.0155 -10.8207 -16.7926 -3.7766 -77.3561 -7.9004 -106.5759 -52.523 -107.0402 -78.0148 -74.4773 -24.8166 -13.2372 -7.8706 -27.2788 -13.2342 -280.2869 -32.7288 -25.9531 -149.5447 -153.8495 -10.0146 -40.8917 -6.7333 -65.2053 -72.818 -35.1252 -246.7046 -38.0738 -16.9126 -158.18 -184.0021 -50.8427 -28.7686 -164.5969 -97.8359 -386.1426 -160.3188 -181.8759 -38.3648 -37.2272 -60.116 -28.2074 -53.7383 -7.8729 -12.3159 -37.8942 -72.6434 -211.8342 -58.5023 -105.1605 -176.7259 -685.8994 -142.8147 -159.635 -366.9437 -37.6409 -73.1357 -175.1906 -131.2833 -41.1464 -77.8404 -57.8131 -8.6365 -251.3728 -14.0836 -36.5144 -2.2292 -6.1598 -16.8011 -26.5165 -67.19 -21.3366 -221.4815 -22.9219 -4.2616 -4.7901 -0.8263 -134.7538 -8.8843 -83.1109 -23.1019 -14.4251 -5.7337 -17.5244 -29.7925 -23.9243 -88.9084 -28.6719 -106.0564 -16.4981 -10.6486 -7.9315 -1.5742 -91.1706 -7.3819 -118.2628 -117.5543 -48.5606 -26.6093 -71.2968 -30.4913 -63.5712 -279.2921 -46.3025 -50.4912 -37.9431 -21.5243 -11.6202 -134.9023 -7.516 -5.8131 -10.1595 -13.6329 -27.0293 -25.7282 -151.8511 -39.0524 -105.4861 -34.2434 -15.7051 -10.2304 -3.6687 -98.2094 -7.4666 -15.2668 -75.1283 -116.5382 -16.6429 -14.9215 -55.1062 -3.0636 -8.4262 -93.6829 -38.1162 -123.1859 -4.9078 -9.1612 -1.3077 -102.9021 -23.1138 -8.5262 -57.2623 -3.4297 -20.9579 -78.2019 -50.3741 -62.3531 -6.4908 -21.9308 -2.3736 -84.3835 -126.3997 -114.8723 -26.4109 -21.5589 -61.6405 -34.9162 -66.3243 -25.1148 -6.7203 -4.6695 -65.3518 -39.7924 -67.3505 -36.2154 -10.9031 -62.2195 -14.9491 -24.3238 -65.0847 -4.9657 -64.2797 -278.2873 -14.6902 -13.9198 -18.2059 -9.8972 -78.2645 -17.454 -49.5929 -55.7786 -28.7673 -15.9476 -47.531 -17.4379 -71.0516 -5.6899 -6.2519 -97.5508 -3.8196 -7.0502 -1.1238 -147.6952 -28.2018 -414.2586 -32.3275 -35.1191 -4.9605 -90.2307 -151.3409 -90.0329 -27.9491 -42.4688 -12.5118 -26.4828 -2.0045 -62.195 -9.1662 -178.4616 -1.9406 -1.9871 -11.3982 -0.5214 -29.6136 -35.0449 -6.7569
## Training error : 0.1335
#Evaluating Model Performance
letter_predictions <- predict(letter_classifier, letters_test)
table(letter_predictions, letters_test$letter)
##
## letter_predictions A B C D E F G H I J K L M N O P Q R S T
## A 73 0 0 0 0 0 0 0 0 1 0 0 0 0 3 0 4 0 0 1
## B 0 61 0 3 2 0 1 1 0 0 1 1 0 0 0 2 0 1 3 0
## C 0 0 64 0 2 0 4 2 1 0 1 2 0 0 1 0 0 0 0 0
## D 2 1 0 67 0 0 1 3 3 2 1 2 0 3 4 2 1 2 0 0
## E 0 0 1 0 64 1 1 0 0 0 2 2 0 0 0 0 2 0 6 0
## F 0 0 0 0 0 70 1 1 4 0 0 0 0 0 0 5 1 0 2 0
## G 1 1 2 1 3 2 68 1 0 0 0 1 0 0 0 0 4 1 3 2
## H 0 0 0 1 0 1 0 46 0 2 3 1 1 1 9 0 0 5 0 3
## I 0 0 0 0 0 0 0 0 65 3 0 0 0 0 0 0 0 0 2 0
## J 0 1 0 0 0 1 0 0 3 61 0 0 0 0 1 0 0 0 1 0
## K 0 1 4 0 0 0 0 5 0 0 56 0 0 2 0 0 0 4 0 1
## L 0 0 0 0 1 0 0 1 0 0 0 63 0 0 0 0 0 0 0 0
## M 0 0 1 0 0 0 1 0 0 0 0 0 70 2 0 0 0 0 0 0
## N 0 0 0 0 0 0 0 0 0 0 0 0 0 77 0 0 0 1 0 0
## O 0 0 1 1 0 0 0 1 0 1 0 0 0 0 49 1 2 0 0 0
## P 0 0 0 0 0 3 0 0 0 0 0 0 0 0 2 69 0 0 0 0
## Q 0 0 0 0 0 0 3 1 0 0 0 2 0 0 2 1 52 0 1 0
## R 0 4 0 0 1 0 0 3 0 0 3 0 0 0 1 0 0 64 0 1
## S 0 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 6 0 47 1
## T 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 83
## U 0 0 2 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0
## W 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## X 0 1 0 0 1 0 0 1 0 0 1 4 0 0 0 0 0 1 0 0
## Y 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 1
## Z 1 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 5 1
##
## letter_predictions U V W X Y Z
## A 2 0 1 0 0 0
## B 0 0 0 0 0 0
## C 0 0 0 0 0 0
## D 0 0 0 0 1 0
## E 0 0 0 1 0 0
## F 0 1 0 0 2 0
## G 0 0 0 0 0 0
## H 0 2 0 0 1 0
## I 0 0 0 2 1 0
## J 0 0 0 1 0 4
## K 2 0 0 4 0 0
## L 0 0 0 0 0 0
## M 1 0 6 0 0 0
## N 1 0 2 0 0 0
## O 1 0 0 0 0 0
## P 0 0 0 0 1 0
## Q 0 0 0 0 0 0
## R 0 1 0 0 0 0
## S 0 0 0 1 0 6
## T 1 0 0 0 2 2
## U 83 0 0 0 0 0
## V 0 64 1 0 1 0
## W 0 3 59 0 0 0
## X 0 0 0 76 1 0
## Y 0 0 0 1 58 0
## Z 0 0 0 0 0 70
agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE TRUE
## 321 1679
(conf_mat3 <- table(agreement, letter_predictions))
## letter_predictions
## agreement A B C D E F G H I J K L M N O P Q R S T U V W
## FALSE 12 15 13 28 16 17 22 30 8 12 23 2 11 4 8 6 10 14 20 9 4 4 4
## TRUE 73 61 64 67 64 70 68 46 65 61 56 63 70 77 49 69 52 64 47 83 83 64 59
## letter_predictions
## agreement X Y Z
## FALSE 10 8 11
## TRUE 76 58 70
(Accuracy <- sum(diag(conf_mat3))/sum(conf_mat3)*100)
## [1] 3.65
#Polynomial Kernel
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "polydot")
## Setting default kernel parameters
letter_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Polynomial kernel function.
## Hyperparameters : degree = 1 scale = 1 offset = 1
##
## Number of Support Vectors : 7887
##
## Objective Function Value : -15.3458 -21.3403 -25.7672 -6.8685 -8.8812 -35.9555 -59.5883 -18.1975 -65.6075 -41.5654 -18.8559 -39.3558 -36.9961 -60.3052 -15.1694 -42.144 -35.0941 -19.4069 -15.8234 -38.6718 -33.3013 -8.5298 -12.4387 -38.2194 -14.3682 -9.5508 -165.7154 -53.2778 -79.2163 -134.5052 -184.481 -58.9285 -46.3252 -81.004 -28.1341 -29.6955 -27.5983 -38.1764 -47.2889 -137.0497 -208.1397 -239.2617 -23.8945 -10.9655 -64.2279 -12.2139 -55.7818 -10.8001 -21.2407 -11.1795 -121.5639 -33.2229 -267.3926 -81.0708 -9.4937 -4.6577 -161.5171 -86.7114 -20.9146 -16.8272 -86.6582 -16.7205 -30.3036 -20.0054 -26.233 -29.9289 -56.1072 -11.6335 -5.2564 -14.8153 -4.983 -4.8171 -8.5044 -43.2267 -55.9 -214.755 -47.0748 -49.6539 -50.2278 -18.3767 -19.1813 -97.6132 -113.6502 -42.4112 -32.5859 -127.4807 -33.7418 -30.7568 -40.0953 -18.6792 -5.4826 -49.3916 -10.6142 -20.0286 -63.8287 -183.8297 -57.0671 -43.3721 -35.2782 -85.4452 -145.9585 -11.8002 -6.1194 -12.5323 -33.5245 -155.2248 -57.2602 -194.0785 -111.0155 -10.8207 -16.7926 -3.7766 -77.3561 -7.9004 -106.5759 -52.523 -107.0402 -78.0148 -74.4773 -24.8166 -13.2372 -7.8706 -27.2788 -13.2342 -280.2868 -32.7288 -25.9531 -149.5447 -153.8495 -10.0146 -40.8917 -6.7333 -65.2053 -72.8179 -35.1252 -246.7046 -38.0738 -16.9126 -158.18 -184.0021 -50.8427 -28.7687 -164.597 -97.8359 -386.1426 -160.3188 -181.8759 -38.3648 -37.2272 -60.116 -28.2074 -53.7383 -7.8729 -12.3159 -37.8942 -72.6434 -211.8342 -58.5022 -105.1605 -176.7259 -685.8994 -142.8147 -159.635 -366.9437 -37.641 -73.1357 -175.1906 -131.2833 -41.1464 -77.8404 -57.8131 -8.6365 -251.3728 -14.0836 -36.5144 -2.2292 -6.1598 -16.8011 -26.5165 -67.19 -21.3366 -221.4815 -22.9219 -4.2616 -4.7901 -0.8263 -134.7538 -8.8843 -83.1109 -23.1019 -14.4251 -5.7337 -17.5244 -29.7925 -23.9243 -88.9083 -28.6719 -106.0564 -16.4981 -10.6486 -7.9315 -1.5742 -91.1706 -7.3819 -118.2628 -117.5543 -48.5606 -26.6093 -71.2968 -30.4913 -63.5712 -279.292 -46.3024 -50.4912 -37.9431 -21.5243 -11.6202 -134.9025 -7.516 -5.8131 -10.1595 -13.6329 -27.0294 -25.7282 -151.851 -39.0524 -105.4861 -34.2434 -15.7051 -10.2304 -3.6687 -98.2094 -7.4666 -15.2668 -75.1283 -116.5383 -16.6429 -14.9215 -55.1062 -3.0636 -8.4262 -93.6829 -38.1162 -123.1859 -4.9078 -9.1612 -1.3077 -102.9021 -23.1138 -8.5262 -57.2623 -3.4297 -20.9579 -78.2019 -50.3741 -62.3531 -6.4908 -21.9308 -2.3736 -84.3835 -126.3997 -114.8725 -26.4109 -21.5589 -61.6405 -34.9162 -66.3243 -25.1148 -6.7203 -4.6695 -65.3518 -39.7924 -67.3505 -36.2154 -10.9031 -62.2195 -14.9491 -24.3238 -65.0847 -4.9657 -64.2797 -278.2874 -14.6902 -13.9198 -18.2059 -9.8972 -78.2645 -17.454 -49.5929 -55.7786 -28.7673 -15.9476 -47.531 -17.4379 -71.0516 -5.6899 -6.2519 -97.5508 -3.8196 -7.0502 -1.1238 -147.6952 -28.2018 -414.2587 -32.3275 -35.1191 -4.9605 -90.2308 -151.3409 -90.0329 -27.9491 -42.4688 -12.5118 -26.4828 -2.0045 -62.195 -9.1662 -178.4617 -1.9406 -1.9871 -11.3982 -0.5214 -29.6136 -35.0449 -6.7569
## Training error : 0.1335
#Evaluating Model Performance
letter_predictions <- predict(letter_classifier, letters_test)
table(letter_predictions, letters_test$letter)
##
## letter_predictions A B C D E F G H I J K L M N O P Q R S T
## A 73 0 0 0 0 0 0 0 0 1 0 0 0 0 3 0 4 0 0 1
## B 0 61 0 3 2 0 1 1 0 0 1 1 0 0 0 2 0 1 3 0
## C 0 0 64 0 2 0 4 2 1 0 1 2 0 0 1 0 0 0 0 0
## D 2 1 0 67 0 0 1 3 3 2 1 2 0 3 4 2 1 2 0 0
## E 0 0 1 0 64 1 1 0 0 0 2 2 0 0 0 0 2 0 6 0
## F 0 0 0 0 0 70 1 1 4 0 0 0 0 0 0 5 1 0 2 0
## G 1 1 2 1 3 2 68 1 0 0 0 1 0 0 0 0 4 1 3 2
## H 0 0 0 1 0 1 0 46 0 2 3 1 1 1 9 0 0 5 0 3
## I 0 0 0 0 0 0 0 0 65 2 0 0 0 0 0 0 0 0 2 0
## J 0 1 0 0 0 1 0 0 3 62 0 0 0 0 1 0 0 0 1 0
## K 0 1 4 0 0 0 0 5 0 0 56 0 0 2 0 0 0 4 0 1
## L 0 0 0 0 1 0 0 1 0 0 0 63 0 0 0 0 0 0 0 0
## M 0 0 1 0 0 0 1 0 0 0 0 0 70 2 0 0 0 0 0 0
## N 0 0 0 0 0 0 0 0 0 0 0 0 0 77 0 0 0 1 0 0
## O 0 0 1 1 0 0 0 1 0 1 0 0 0 0 49 1 2 0 0 0
## P 0 0 0 0 0 3 0 0 0 0 0 0 0 0 2 69 0 0 0 0
## Q 0 0 0 0 0 0 3 1 0 0 0 2 0 0 2 1 52 0 1 0
## R 0 4 0 0 1 0 0 3 0 0 3 0 0 0 1 0 0 64 0 1
## S 0 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 6 0 47 1
## T 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 83
## U 0 0 2 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0
## W 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## X 0 1 0 0 1 0 0 1 0 0 1 4 0 0 0 0 0 1 0 0
## Y 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 1
## Z 1 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 5 1
##
## letter_predictions U V W X Y Z
## A 2 0 1 0 0 0
## B 0 0 0 0 0 0
## C 0 0 0 0 0 0
## D 0 0 0 0 1 0
## E 0 0 0 1 0 0
## F 0 1 0 0 2 0
## G 0 0 0 0 0 0
## H 0 2 0 0 1 0
## I 0 0 0 2 1 0
## J 0 0 0 1 0 4
## K 2 0 0 4 0 0
## L 0 0 0 0 0 0
## M 1 0 6 0 0 0
## N 1 0 2 0 0 0
## O 1 0 0 0 0 0
## P 0 0 0 0 1 0
## Q 0 0 0 0 0 0
## R 0 1 0 0 0 0
## S 0 0 0 1 0 6
## T 1 0 0 0 2 2
## U 83 0 0 0 0 0
## V 0 64 1 0 1 0
## W 0 3 59 0 0 0
## X 0 0 0 76 1 0
## Y 0 0 0 1 58 0
## Z 0 0 0 0 0 70
agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE TRUE
## 320 1680
(conf_mat4 <- table(agreement, letter_predictions))
## letter_predictions
## agreement A B C D E F G H I J K L M N O P Q R S T U V W
## FALSE 12 15 13 28 16 17 22 30 7 12 23 2 11 4 8 6 10 14 20 9 4 4 4
## TRUE 73 61 64 67 64 70 68 46 65 62 56 63 70 77 49 69 52 64 47 83 83 64 59
## letter_predictions
## agreement X Y Z
## FALSE 10 8 11
## TRUE 76 58 70
(Accuracy <- sum(diag(conf_mat4))/sum(conf_mat4)*100)
## [1] 3.65
#RBF
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "rbfdot")
letter_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0472856751333276
##
## Number of Support Vectors : 9522
##
## Objective Function Value : -45.068 -35.4757 -61.1237 -27.7907 -36.4603 -49.0627 -72.4834 -41.0048 -71.1378 -54.2509 -48.7404 -62.9416 -69.9208 -51.8761 -37.2656 -50.2077 -61.3152 -46.7269 -42.1529 -68.0496 -51.0068 -39.5543 -43.3192 -59.4686 -43.0489 -37.0312 -175.8505 -113.2333 -101.3396 -124.3197 -194.513 -85.2816 -82.3514 -106.6628 -67.7708 -66.3184 -60.4829 -74.7244 -86.8999 -105.7321 -255.1937 -202.9954 -54.6302 -43.7191 -83.1566 -44.631 -107.619 -41.4851 -64.9836 -35.1415 -156.9712 -46.4411 -233.9847 -85.4517 -36.736 -27.5488 -125.9081 -88.3635 -39.9353 -42.9862 -94.0438 -47.8249 -60.866 -50.3466 -53.5631 -63.0599 -91.8204 -38.5397 -27.7093 -51.5103 -28.9676 -29.688 -42.867 -77.6766 -86.7789 -229.0298 -78.5526 -93.4866 -87.7459 -43.4439 -54.9169 -126.981 -149.1979 -79.7537 -70.8292 -159.5355 -96.3539 -60.4871 -66.4617 -43.2212 -29.2534 -104.1644 -39.3303 -64.4109 -85.6034 -188.9203 -109.4053 -76.4357 -57.4581 -151.2605 -127.2811 -40.6934 -26.7698 -47.4028 -65.7925 -101.7349 -86.9243 -187.9294 -105.0938 -39.5062 -44.9652 -25.2312 -155.1584 -32.3489 -153.1351 -73.9524 -109.154 -103.7196 -91.8203 -55.0493 -40.5303 -34.2454 -53.6705 -38.7576 -242.8607 -44.7822 -58.9175 -136.1429 -143.3246 -36.5523 -61.5641 -32.2199 -84.9382 -82.8876 -55.8831 -174.4871 -62.5113 -51.4646 -130.9532 -127.289 -71.9613 -56.4582 -165.7605 -98.0346 -190.3282 -141.7568 -116.0067 -71.3192 -73.5917 -80.7562 -61.8935 -96.6942 -37.0531 -45.0699 -65.3472 -88.8936 -225.0344 -82.6592 -97.25 -183.8182 -228.777 -113.7945 -132.2202 -253.7498 -93.8184 -85.7501 -141.955 -96.7489 -66.3552 -142.9304 -67.299 -45.4005 -185.417 -50.4301 -63.0946 -20.1681 -28.8991 -39.4485 -63.743 -63.1508 -52.1296 -145.7413 -51.3411 -24.8724 -23.796 -14.9319 -133.4394 -31.292 -86.5325 -49.6102 -50.8745 -28.0592 -42.8325 -55.6295 -55.7383 -60.8545 -59.7283 -119.1918 -48.1552 -36.8772 -27.1907 -17.067 -99.2173 -32.0584 -89.8497 -101.8716 -67.3186 -67.1192 -62.2483 -53.5223 -63.1039 -195.2706 -78.4711 -71.501 -79.0982 -49.8285 -43.1706 -190.3124 -31.9702 -40.3842 -34.6775 -39.6438 -51.0436 -53.649 -93.3179 -86.2953 -104.2327 -53.7924 -39.4504 -33.9345 -24.2052 -91.3393 -30.5823 -51.2496 -105.6786 -70.8294 -42.6057 -41.8111 -80.5221 -28.5596 -35.9553 -97.5454 -71.6715 -128.2217 -27.5813 -33.9253 -16.5173 -110.1321 -55.923 -44.5816 -98.0165 -31.6628 -43.7738 -99.6683 -70.8738 -79.2039 -42.764 -49.5717 -22.8451 -84.0768 -210.4489 -101.4114 -57.7494 -49.0711 -101.9206 -58.5467 -81.8676 -79.1256 -29.7675 -30.6112 -67.198 -71.941 -74.9827 -64.5243 -36.8208 -75.6632 -49.5469 -51.9236 -78.3021 -29.4058 -88.5447 -108.1962 -42.3913 -48.9789 -43.8053 -37.8094 -83.1124 -49.4488 -67.7982 -94.1202 -60.4357 -49.4828 -75.9437 -51.6446 -105.2105 -31.0395 -39.2052 -106.5495 -31.5897 -36.3352 -18.8946 -160.026 -49.5644 -239.91 -61.561 -72.3322 -29.669 -92.6541 -166.4571 -80.3586 -67.2158 -79.2135 -50.4271 -55.907 -21.08 -113.6714 -34.7217 -192.5426 -19.4235 -20.4897 -43.8845 -11.1356 -47.3078 -107.6987 -36.577
## Training error : 0.049833
#Evaluating Model Performance
letter_predictions <- predict(letter_classifier, letters_test)
table(letter_predictions, letters_test$letter)
##
## letter_predictions A B C D E F G H I J K L M N O P Q R S T
## A 75 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 1
## B 0 67 0 2 0 1 0 0 0 0 0 1 0 1 0 2 1 1 1 0
## C 0 0 72 0 3 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## D 1 1 0 71 0 0 1 2 2 2 1 0 0 0 0 2 1 1 0 0
## E 0 0 0 0 70 2 0 0 0 1 0 2 0 0 0 0 0 0 0 0
## F 0 0 0 0 0 76 0 0 3 0 0 0 0 0 0 6 0 0 1 0
## G 0 0 1 0 3 0 76 1 0 0 0 0 0 0 0 0 0 0 0 0
## H 0 0 0 1 0 0 1 58 0 1 0 1 1 0 0 0 1 1 0 3
## I 0 0 0 0 0 0 0 0 69 1 0 0 0 0 0 0 0 0 0 0
## J 0 0 0 0 0 0 0 0 2 66 0 0 0 0 0 0 0 0 0 0
## K 0 0 0 0 0 0 0 3 0 0 62 0 0 1 0 0 0 2 0 0
## L 0 0 0 0 0 0 1 0 0 0 0 69 0 0 0 0 0 0 0 0
## M 0 0 0 0 0 0 1 0 0 0 0 0 71 1 0 0 0 0 0 0
## N 0 0 0 0 0 1 0 0 0 0 0 0 0 78 0 0 0 0 0 0
## O 0 0 1 0 0 0 0 0 0 1 0 0 0 2 67 1 2 0 0 0
## P 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 72 0 0 0 0
## Q 0 0 0 0 0 0 0 1 0 0 0 0 0 0 3 1 65 0 0 0
## R 0 1 0 0 0 0 1 1 0 0 4 0 0 2 1 0 0 74 0 1
## S 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 68 0
## T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 88
## U 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## W 0 0 1 0 0 0 0 0 0 0 0 0 1 0 2 0 0 0 0 0
## X 0 1 0 0 0 0 0 0 0 0 2 4 0 0 0 0 0 0 0 0
## Y 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## Z 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
##
## letter_predictions U V W X Y Z
## A 0 0 0 0 0 0
## B 0 1 0 0 0 0
## C 0 0 0 0 0 0
## D 1 0 0 0 0 0
## E 0 0 0 0 0 0
## F 0 1 0 0 0 0
## G 0 0 0 0 0 0
## H 0 1 0 0 0 0
## I 0 0 0 2 0 0
## J 0 0 0 0 0 1
## K 0 0 0 0 0 0
## L 0 0 0 0 0 0
## M 0 0 2 0 0 0
## N 0 0 1 0 0 0
## O 0 0 0 0 0 0
## P 0 0 0 0 0 0
## Q 0 0 0 0 0 0
## R 0 0 0 0 0 0
## S 0 0 0 0 0 0
## T 0 0 0 0 1 0
## U 89 0 0 0 0 0
## V 0 68 0 0 1 0
## W 1 0 66 0 0 0
## X 0 0 0 84 1 0
## Y 0 0 0 0 65 0
## Z 0 0 0 0 0 81
agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE TRUE
## 133 1867
(conf_mat3 <- table(agreement, letter_predictions))
## letter_predictions
## agreement A B C D E F G H I J K L M N O P Q R S T U V W
## FALSE 3 11 4 15 5 11 5 11 3 3 6 1 4 2 7 0 5 11 3 1 1 1 5
## TRUE 75 67 72 71 70 76 76 58 69 66 62 69 71 78 67 72 65 74 68 88 89 68 66
## letter_predictions
## agreement X Y Z
## FALSE 8 4 3
## TRUE 84 65 81
(Accuracy <- sum(diag(conf_mat3))/sum(conf_mat3)*100)
## [1] 3.5
#Online News Popularity
news <- read.csv("OnlineNewsPopularity_for_R.csv")
str (news)
## 'data.frame': 39644 obs. of 61 variables:
## $ url : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
#Data Preparation
newsShort <- data.frame(news$n_tokens_title, news$n_tokens_content, news$n_unique_tokens, news$n_non_stop_words, news$num_hrefs, news$num_imgs, news$num_videos, news$average_token_length, news$num_keywords, news$kw_max_max, news$global_sentiment_polarity, news$avg_positive_polarity, news$title_subjectivity, news$title_sentiment_polarity, news$abs_title_subjectivity, news$abs_title_sentiment_polarity, news$shares)
colnames(newsShort) <- c("n_tokens_title", "n_tokens_content", "n_unique_tokens", "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos", "average_token_length", "num_keywords", "kw_max_max", "global_sentiment_polarity", "avg_positive_polarity", "title_subjectivity", "title_sentiment_polarity", "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares")
newsShort$popular = rep('na', nrow(newsShort))
for(i in 1:39644) {
if(newsShort$shares[i] >= 1400) {
newsShort$popular[i] = "yes"}
else {newsShort$popular[i] = "no"}
}
newsShort$shares = newsShort$popular
newsShort$shares <- as.factor(newsShort$shares)
#Data Prep
news_rand <- newsShort[order(runif(10000)), ]
set.seed(12345)
#Split the data into training and test datasets
news_train <- news_rand[1:9000, ]
news_test <- news_rand[9001:10000, ]
#Model Design
nb_model <- naive_bayes(shares~., data = news_train)
## Warning: naive_bayes(): Feature popular - zero probabilities are present.
## Consider Laplace smoothing.
nb_model
##
## ================================== Naive Bayes ==================================
##
## Call:
## naive_bayes.formula(formula = shares ~ ., data = news_train)
##
## ---------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ---------------------------------------------------------------------------------
##
## A priori probabilities:
##
## no yes
## 0.4248889 0.5751111
##
## ---------------------------------------------------------------------------------
##
## Tables:
##
## ---------------------------------------------------------------------------------
## ::: n_tokens_title (Gaussian)
## ---------------------------------------------------------------------------------
##
## n_tokens_title no yes
## mean 9.836820 9.719668
## sd 1.945381 1.985466
##
## ---------------------------------------------------------------------------------
## ::: n_tokens_content (Gaussian)
## ---------------------------------------------------------------------------------
##
## n_tokens_content no yes
## mean 455.5978 506.6084
## sd 355.7531 439.7857
##
## ---------------------------------------------------------------------------------
## ::: n_unique_tokens (Gaussian)
## ---------------------------------------------------------------------------------
##
## n_unique_tokens no yes
## mean 0.5693068 0.5552458
## sd 0.1133274 0.1241536
##
## ---------------------------------------------------------------------------------
## ::: n_non_stop_words (Gaussian)
## ---------------------------------------------------------------------------------
##
## n_non_stop_words no yes
## mean 0.99372384 0.99053322
## sd 0.07898349 0.09684504
##
## ---------------------------------------------------------------------------------
## ::: num_hrefs (Gaussian)
## ---------------------------------------------------------------------------------
##
## num_hrefs no yes
## mean 9.131015 10.495363
## sd 8.663762 11.341514
##
## ---------------------------------------------------------------------------------
##
## # ... and 12 more tables
##
## ---------------------------------------------------------------------------------
#Evaluate the Model
news_Pred <- predict(nb_model, newdata = news_test)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
(conf_nat <- table(news_Pred, news_test$shares))
##
## news_Pred no yes
## no 466 0
## yes 2 532
(Accuracy <- sum(diag(conf_nat))/sum(conf_nat)*100)
## [1] 99.8