library(readr)
credit <- read_csv("C:/Users/PinakKumar/Desktop/credit.csv")
## Parsed with column specification:
## cols(
##   checking_balance = col_character(),
##   months_loan_duration = col_integer(),
##   credit_history = col_character(),
##   purpose = col_character(),
##   amount = col_integer(),
##   savings_balance = col_character(),
##   employment_duration = col_character(),
##   percent_of_income = col_integer(),
##   years_at_residence = col_integer(),
##   age = col_integer(),
##   other_credit = col_character(),
##   housing = col_character(),
##   existing_loans_count = col_integer(),
##   job = col_character(),
##   dependents = col_integer(),
##   phone = col_character(),
##   default = col_character()
## )
View(credit)
str(credit)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1000 obs. of  17 variables:
##  $ checking_balance    : chr  "< 0 DM" "1 - 200 DM" "unknown" "< 0 DM" ...
##  $ months_loan_duration: int  6 48 12 42 24 36 24 36 12 30 ...
##  $ credit_history      : chr  "critical" "good" "critical" "good" ...
##  $ purpose             : chr  "furniture/appliances" "furniture/appliances" "education" "furniture/appliances" ...
##  $ amount              : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ savings_balance     : chr  "unknown" "< 100 DM" "< 100 DM" "< 100 DM" ...
##  $ employment_duration : chr  "> 7 years" "1 - 4 years" "4 - 7 years" "4 - 7 years" ...
##  $ percent_of_income   : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ years_at_residence  : int  4 2 3 4 4 4 4 2 4 2 ...
##  $ age                 : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ other_credit        : chr  "none" "none" "none" "none" ...
##  $ housing             : chr  "own" "own" "own" "other" ...
##  $ existing_loans_count: int  2 1 1 1 2 1 1 1 1 2 ...
##  $ job                 : chr  "skilled" "skilled" "unskilled" "skilled" ...
##  $ dependents          : int  1 1 2 2 2 2 1 1 1 1 ...
##  $ phone               : chr  "yes" "no" "no" "no" ...
##  $ default             : chr  "no" "yes" "no" "no" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 17
##   .. ..$ checking_balance    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ months_loan_duration: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ credit_history      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ purpose             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ amount              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ savings_balance     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ employment_duration : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ percent_of_income   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ years_at_residence  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ age                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ other_credit        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ housing             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ existing_loans_count: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ job                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ dependents          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ phone               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ default             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
##Step 2: Exploring and Preparing the Data
summary(credit$amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
table(credit$default)
## 
##  no yes 
## 700 300
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
summary(credit$amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
summary(credit_rand$amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
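An equivalent, more direct shuffle (a sketch, not run here) permutes the row indices with sample():
# same effect as ordering by runif(), without hard-coding 1000 rows
credit_rand <- credit[sample(nrow(credit)), ]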
credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]
prop.table(table(credit_train$default))
## 
##        no       yes 
## 0.7022222 0.2977778
prop.table(table(credit_test$default))
## 
##   no  yes 
## 0.68 0.32
str(credit_train$default)
##  chr [1:900] "yes" "no" "no" "no" "yes" "no" "yes" "yes" "no" "no" ...
credit_train$default<-as.factor(credit_train$default)
str(credit_train$default)
##  Factor w/ 2 levels "no","yes": 2 1 1 1 2 1 2 2 1 1 ...
##Step 3: Training a model on the data
library(C50)
## Warning: package 'C50' was built under R version 3.4.4
credit_model<-C5.0.default(x = credit_train[-17], y = credit_train$default)
## c50 code called exit with value 1
credit_model
## 
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
## 
## Classification Tree
## Number of samples: 900 
## Number of predictors: 16 
## 
## Tree size: 0 
## 
## Non-standard options: attempt to group attributes
summary(credit_model)
## 
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Mon Jun 18 10:28:06 2018
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## *** line 1 of `undefined.data': bad value of `c("< 0 DM", "1 - 200 DM", "1 - 200 DM", "< 0 DM", "1 - 200 DM", "unknown", "< 0 DM", "1 - 200 DM", "1 - 200 DM", "< 0 DM", "unknown", "> 200 DM", "> 200 DM", "1 - 200 DM", "unknown", "unknown", "unknown", "unknown", "unknown", "< 0 DM", "unknown", "< 0 DM", "> 200 DM", "unknown", "< 0 DM", "unknown", "> 200 DM", "1 - 200 DM", "unknown", "unknown", "unknown", "unknown", "< 0 DM", "1 - 200 DM", "unknown", "1 - 200 DM", "1 - 200 DM", "1 - 200 DM", "unknown", "< 0 DM", "unknown", "unknown", "< 0 DM",' for attribute `checking_balance'
## 
## Error limit exceeded
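The "c50 code called exit with value 1" message and the size-0 tree above occur because read_csv() leaves the categorical columns as character vectors, which the underlying C5.0 routine cannot split on. A minimal fix (a sketch, assuming the credit_train/credit_test split above) converts them to factors and retrains via the C5.0() generic:
# sketch: C5.0 requires factor predictors, so convert every character column
credit_train[] <- lapply(credit_train, function(col)
  if (is.character(col)) as.factor(col) else col)
credit_test[] <- lapply(credit_test, function(col)
  if (is.character(col)) as.factor(col) else col)
credit_model <- C5.0(x = credit_train[-17], y = credit_train$default)
With factor inputs, C5.0 builds a non-trivial tree and Step 4 below can evaluate it on credit_test.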
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.4.4

##Step 4: Evaluating Model Performance
cred_pred <- predict(credit_model, credit_test)
CrossTable(credit_test$default, cred_pred, prop.chisq = FALSE, prop.c = FALSE,
           prop.r = FALSE, dnn = c('actual default', 'predicted default'))
###Out of 100 test records, 10 non-defaulters were misclassified as defaulting (false positives, Type I errors), and 16 actual defaults were misclassified as not defaulting (false negatives, Type II errors).

Alternate Solution:

library(ggplot2)
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.4
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
DT <- rpart(default ~ checking_balance + credit_history + months_loan_duration, data = credit_train, method = "class", cp = 0)
summary(DT)
## Call:
## rpart(formula = default ~ checking_balance + credit_history + 
##     months_loan_duration, data = credit_train, method = "class", 
##     cp = 0)
##   n= 900 
## 
##             CP nsplit rel error    xerror       xstd
## 1 0.0410447761      0 1.0000000 1.0000000 0.05118820
## 2 0.0130597015      3 0.8768657 0.9440299 0.05032182
## 3 0.0074626866      5 0.8507463 0.9552239 0.05050180
## 4 0.0055970149      7 0.8358209 0.9589552 0.05056104
## 5 0.0037313433      9 0.8246269 0.9738806 0.05079426
## 6 0.0018656716     11 0.8171642 0.9402985 0.05026107
## 7 0.0012437811     13 0.8134328 0.9365672 0.05019994
## 8 0.0009328358     16 0.8097015 0.9440299 0.05032182
## 9 0.0000000000     20 0.8059701 0.9440299 0.05032182
## 
## Variable importance
##     checking_balance       credit_history months_loan_duration 
##                   48                   27                   24 
## 
## Node number 1: 900 observations,    complexity param=0.04104478
##   predicted class=no   expected loss=0.2977778  P(node) =1
##     class counts:   632   268
##    probabilities: 0.702 0.298 
##   left son=2 (414 obs) right son=3 (486 obs)
##   Primary splits:
##       checking_balance     splits as  RLRL,     improve=39.30076, (0 missing)
##       credit_history       splits as  LLRLR,    improve=14.01920, (0 missing)
##       months_loan_duration < 34.5 to the left,  improve=11.31821, (0 missing)
##   Surrogate splits:
##       credit_history       splits as  LRRRR,    agree=0.592, adj=0.114, (0 split)
##       months_loan_duration < 5.5  to the left,  agree=0.547, adj=0.014, (0 split)
## 
## Node number 2: 414 observations,    complexity param=0.001243781
##   predicted class=no   expected loss=0.1376812  P(node) =0.46
##     class counts:   357    57
##    probabilities: 0.862 0.138 
##   left son=4 (355 obs) right son=5 (59 obs)
##   Primary splits:
##       credit_history       splits as  LLRRR,    improve=1.869495, (0 missing)
##       months_loan_duration < 16.5 to the left,  improve=1.338274, (0 missing)
##       checking_balance     splits as  -R-L,     improve=1.155705, (0 missing)
## 
## Node number 3: 486 observations,    complexity param=0.04104478
##   predicted class=no   expected loss=0.4341564  P(node) =0.54
##     class counts:   275   211
##    probabilities: 0.566 0.434 
##   left son=6 (274 obs) right son=7 (212 obs)
##   Primary splits:
##       months_loan_duration < 22.5 to the left,  improve=10.423870, (0 missing)
##       credit_history       splits as  LLRLR,    improve= 8.598609, (0 missing)
##       checking_balance     splits as  R-L-,     improve= 3.119042, (0 missing)
##   Surrogate splits:
##       credit_history splits as  LLRRR, agree=0.611, adj=0.108, (0 split)
## 
## Node number 4: 355 observations
##   predicted class=no   expected loss=0.1183099  P(node) =0.3944444
##     class counts:   313    42
##    probabilities: 0.882 0.118 
## 
## Node number 5: 59 observations,    complexity param=0.001243781
##   predicted class=no   expected loss=0.2542373  P(node) =0.06555556
##     class counts:    44    15
##    probabilities: 0.746 0.254 
##   left son=10 (19 obs) right son=11 (40 obs)
##   Primary splits:
##       months_loan_duration < 16.5 to the left,  improve=0.5202498, (0 missing)
##       checking_balance     splits as  -R-L,     improve=0.2699402, (0 missing)
##       credit_history       splits as  --RLL,    improve=0.1328814, (0 missing)
##   Surrogate splits:
##       checking_balance splits as  -L-R, agree=0.712, adj=0.105, (0 split)
## 
## Node number 6: 274 observations,    complexity param=0.04104478
##   predicted class=no   expected loss=0.3430657  P(node) =0.3044444
##     class counts:   180    94
##    probabilities: 0.657 0.343 
##   left son=12 (249 obs) right son=13 (25 obs)
##   Primary splits:
##       credit_history       splits as  LLRLR,    improve=7.817224, (0 missing)
##       months_loan_duration < 11.5 to the left,  improve=3.919498, (0 missing)
##       checking_balance     splits as  R-L-,     improve=1.868613, (0 missing)
## 
## Node number 7: 212 observations,    complexity param=0.0130597
##   predicted class=yes  expected loss=0.4481132  P(node) =0.2355556
##     class counts:    95   117
##    probabilities: 0.448 0.552 
##   left son=14 (173 obs) right son=15 (39 obs)
##   Primary splits:
##       months_loan_duration < 47.5 to the left,  improve=2.635873, (0 missing)
##       credit_history       splits as  LRRLR,    improve=1.294771, (0 missing)
##       checking_balance     splits as  R-L-,     improve=1.058491, (0 missing)
## 
## Node number 10: 19 observations
##   predicted class=no   expected loss=0.1578947  P(node) =0.02111111
##     class counts:    16     3
##    probabilities: 0.842 0.158 
## 
## Node number 11: 40 observations,    complexity param=0.001243781
##   predicted class=no   expected loss=0.3  P(node) =0.04444444
##     class counts:    28    12
##    probabilities: 0.700 0.300 
##   left son=22 (31 obs) right son=23 (9 obs)
##   Primary splits:
##       months_loan_duration < 22.5 to the right, improve=1.516846, (0 missing)
##       credit_history       splits as  --RLR,    improve=0.122884, (0 missing)
## 
## Node number 12: 249 observations,    complexity param=0.0009328358
##   predicted class=no   expected loss=0.3052209  P(node) =0.2766667
##     class counts:   173    76
##    probabilities: 0.695 0.305 
##   left son=24 (72 obs) right son=25 (177 obs)
##   Primary splits:
##       months_loan_duration < 11.5 to the left,  improve=3.888912, (0 missing)
##       credit_history       splits as  LR-L-,    improve=2.096469, (0 missing)
##       checking_balance     splits as  R-L-,     improve=1.643716, (0 missing)
## 
## Node number 13: 25 observations
##   predicted class=yes  expected loss=0.28  P(node) =0.02777778
##     class counts:     7    18
##    probabilities: 0.280 0.720 
## 
## Node number 14: 173 observations,    complexity param=0.0130597
##   predicted class=yes  expected loss=0.4855491  P(node) =0.1922222
##     class counts:    84    89
##    probabilities: 0.486 0.514 
##   left son=28 (77 obs) right son=29 (96 obs)
##   Primary splits:
##       checking_balance     splits as  R-L-,     improve=0.9959275, (0 missing)
##       credit_history       splits as  LRRRR,    improve=0.8924092, (0 missing)
##       months_loan_duration < 31.5 to the left,  improve=0.3748116, (0 missing)
##   Surrogate splits:
##       credit_history       splits as  LRRLR,    agree=0.618, adj=0.143, (0 split)
##       months_loan_duration < 25   to the right, agree=0.566, adj=0.026, (0 split)
## 
## Node number 15: 39 observations
##   predicted class=yes  expected loss=0.2820513  P(node) =0.04333333
##     class counts:    11    28
##    probabilities: 0.282 0.718 
## 
## Node number 22: 31 observations
##   predicted class=no   expected loss=0.2258065  P(node) =0.03444444
##     class counts:    24     7
##    probabilities: 0.774 0.226 
## 
## Node number 23: 9 observations
##   predicted class=yes  expected loss=0.4444444  P(node) =0.01
##     class counts:     4     5
##    probabilities: 0.444 0.556 
## 
## Node number 24: 72 observations
##   predicted class=no   expected loss=0.1666667  P(node) =0.08
##     class counts:    60    12
##    probabilities: 0.833 0.167 
## 
## Node number 25: 177 observations,    complexity param=0.0009328358
##   predicted class=no   expected loss=0.3615819  P(node) =0.1966667
##     class counts:   113    64
##    probabilities: 0.638 0.362 
##   left son=50 (58 obs) right son=51 (119 obs)
##   Primary splits:
##       credit_history       splits as  LR-L-,    improve=2.4929420, (0 missing)
##       checking_balance     splits as  R-L-,     improve=2.0530640, (0 missing)
##       months_loan_duration < 19   to the right, improve=0.9713449, (0 missing)
##   Surrogate splits:
##       months_loan_duration < 19   to the right, agree=0.678, adj=0.017, (0 split)
## 
## Node number 28: 77 observations,    complexity param=0.005597015
##   predicted class=no   expected loss=0.4545455  P(node) =0.08555556
##     class counts:    42    35
##    probabilities: 0.545 0.455 
##   left son=56 (20 obs) right son=57 (57 obs)
##   Primary splits:
##       credit_history       splits as  RRRLL,    improve=1.2905900, (0 missing)
##       months_loan_duration < 26.5 to the left,  improve=0.1060606, (0 missing)
## 
## Node number 29: 96 observations,    complexity param=0.007462687
##   predicted class=yes  expected loss=0.4375  P(node) =0.1066667
##     class counts:    42    54
##    probabilities: 0.438 0.562 
##   left son=58 (78 obs) right son=59 (18 obs)
##   Primary splits:
##       credit_history       splits as  LLLRR,    improve=3.2500000, (0 missing)
##       months_loan_duration < 31.5 to the left,  improve=0.5105042, (0 missing)
## 
## Node number 50: 58 observations
##   predicted class=no   expected loss=0.2413793  P(node) =0.06444444
##     class counts:    44    14
##    probabilities: 0.759 0.241 
## 
## Node number 51: 119 observations,    complexity param=0.0009328358
##   predicted class=no   expected loss=0.4201681  P(node) =0.1322222
##     class counts:    69    50
##    probabilities: 0.580 0.420 
##   left son=102 (56 obs) right son=103 (63 obs)
##   Primary splits:
##       checking_balance     splits as  R-L-,     improve=0.8403361, (0 missing)
##       months_loan_duration < 19   to the right, improve=0.7629913, (0 missing)
##   Surrogate splits:
##       months_loan_duration < 14.5 to the left,  agree=0.538, adj=0.018, (0 split)
## 
## Node number 56: 20 observations
##   predicted class=no   expected loss=0.3  P(node) =0.02222222
##     class counts:    14     6
##    probabilities: 0.700 0.300 
## 
## Node number 57: 57 observations,    complexity param=0.005597015
##   predicted class=yes  expected loss=0.4912281  P(node) =0.06333333
##     class counts:    28    29
##    probabilities: 0.491 0.509 
##   left son=114 (18 obs) right son=115 (39 obs)
##   Primary splits:
##       credit_history       splits as  LRR--,    improve=0.2177238, (0 missing)
##       months_loan_duration < 26.5 to the left,  improve=0.2133124, (0 missing)
##   Surrogate splits:
##       months_loan_duration < 37.5 to the right, agree=0.754, adj=0.222, (0 split)
## 
## Node number 58: 78 observations,    complexity param=0.007462687
##   predicted class=no   expected loss=0.5  P(node) =0.08666667
##     class counts:    39    39
##    probabilities: 0.500 0.500 
##   left son=116 (16 obs) right son=117 (62 obs)
##   Primary splits:
##       credit_history       splits as  LRR--,    improve=0.6290323, (0 missing)
##       months_loan_duration < 31.5 to the left,  improve=0.4814815, (0 missing)
## 
## Node number 59: 18 observations
##   predicted class=yes  expected loss=0.1666667  P(node) =0.02
##     class counts:     3    15
##    probabilities: 0.167 0.833 
## 
## Node number 102: 56 observations
##   predicted class=no   expected loss=0.3571429  P(node) =0.06222222
##     class counts:    36    20
##    probabilities: 0.643 0.357 
## 
## Node number 103: 63 observations,    complexity param=0.0009328358
##   predicted class=no   expected loss=0.4761905  P(node) =0.07
##     class counts:    33    30
##    probabilities: 0.524 0.476 
##   left son=206 (38 obs) right son=207 (25 obs)
##   Primary splits:
##       months_loan_duration < 16.5 to the left,  improve=0.1590977, (0 missing)
## 
## Node number 114: 18 observations
##   predicted class=no   expected loss=0.4444444  P(node) =0.02
##     class counts:    10     8
##    probabilities: 0.556 0.444 
## 
## Node number 115: 39 observations,    complexity param=0.003731343
##   predicted class=yes  expected loss=0.4615385  P(node) =0.04333333
##     class counts:    18    21
##    probabilities: 0.462 0.538 
##   left son=230 (26 obs) right son=231 (13 obs)
##   Primary splits:
##       months_loan_duration < 33   to the left,  improve=0.2307692, (0 missing)
## 
## Node number 116: 16 observations
##   predicted class=no   expected loss=0.375  P(node) =0.01777778
##     class counts:    10     6
##    probabilities: 0.625 0.375 
## 
## Node number 117: 62 observations,    complexity param=0.001865672
##   predicted class=yes  expected loss=0.4677419  P(node) =0.06888889
##     class counts:    29    33
##    probabilities: 0.468 0.532 
##   left son=234 (46 obs) right son=235 (16 obs)
##   Primary splits:
##       months_loan_duration < 33   to the left,  improve=0.039446, (0 missing)
## 
## Node number 206: 38 observations
##   predicted class=no   expected loss=0.4473684  P(node) =0.04222222
##     class counts:    21    17
##    probabilities: 0.553 0.447 
## 
## Node number 207: 25 observations
##   predicted class=yes  expected loss=0.48  P(node) =0.02777778
##     class counts:    12    13
##    probabilities: 0.480 0.520 
## 
## Node number 230: 26 observations,    complexity param=0.003731343
##   predicted class=no   expected loss=0.5  P(node) =0.02888889
##     class counts:    13    13
##    probabilities: 0.500 0.500 
##   left son=460 (8 obs) right son=461 (18 obs)
##   Primary splits:
##       months_loan_duration < 25   to the right, improve=0.3611111, (0 missing)
##   Surrogate splits:
##       credit_history splits as  -RL--, agree=0.769, adj=0.25, (0 split)
## 
## Node number 231: 13 observations
##   predicted class=yes  expected loss=0.3846154  P(node) =0.01444444
##     class counts:     5     8
##    probabilities: 0.385 0.615 
## 
## Node number 234: 46 observations,    complexity param=0.001865672
##   predicted class=yes  expected loss=0.4782609  P(node) =0.05111111
##     class counts:    22    24
##    probabilities: 0.478 0.522 
##   left son=468 (9 obs) right son=469 (37 obs)
##   Primary splits:
##       months_loan_duration < 29   to the right, improve=0.1336989, (0 missing)
## 
## Node number 235: 16 observations
##   predicted class=yes  expected loss=0.4375  P(node) =0.01777778
##     class counts:     7     9
##    probabilities: 0.438 0.562 
## 
## Node number 460: 8 observations
##   predicted class=no   expected loss=0.375  P(node) =0.008888889
##     class counts:     5     3
##    probabilities: 0.625 0.375 
## 
## Node number 461: 18 observations
##   predicted class=yes  expected loss=0.4444444  P(node) =0.02
##     class counts:     8    10
##    probabilities: 0.444 0.556 
## 
## Node number 468: 9 observations
##   predicted class=no   expected loss=0.4444444  P(node) =0.01
##     class counts:     5     4
##    probabilities: 0.556 0.444 
## 
## Node number 469: 37 observations
##   predicted class=yes  expected loss=0.4594595  P(node) =0.04111111
##     class counts:    17    20
##    probabilities: 0.459 0.541
rpart.plot(DT, type=1, extra = 102)
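The alternate tree can be scored on the same hold-out set for comparison with the C5.0 approach (a sketch, assuming the credit_test split above):
# sketch: predict classes on the hold-out set and cross-tabulate
dt_pred <- predict(DT, credit_test, type = "class")
table(actual = credit_test$default, predicted = dt_pred)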

Method #2. Support vector machines

##Step 1: Collecting the Data
letters <- read.csv("letterdata.csv")
str(letters)
## 'data.frame':    20000 obs. of  17 variables:
##  $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
##  $ xbox  : int  2 5 4 7 2 4 4 1 2 11 ...
##  $ ybox  : int  8 12 11 11 1 11 2 1 2 15 ...
##  $ width : int  3 3 6 6 3 5 5 3 4 13 ...
##  $ height: int  5 7 8 6 1 8 4 2 4 9 ...
##  $ onpix : int  1 2 6 3 1 3 4 1 2 7 ...
##  $ xbar  : int  8 10 10 5 8 8 8 8 10 13 ...
##  $ ybar  : int  13 5 6 9 6 8 7 2 6 2 ...
##  $ x2bar : int  0 5 2 4 6 6 6 2 2 6 ...
##  $ y2bar : int  6 4 6 6 6 9 6 2 6 2 ...
##  $ xybar : int  6 13 10 4 6 5 7 8 12 12 ...
##  $ x2ybar: int  10 3 3 4 5 6 6 2 4 1 ...
##  $ xy2bar: int  8 9 7 10 9 6 6 8 8 9 ...
##  $ xedge : int  0 2 3 6 1 0 2 1 1 8 ...
##  $ xedgey: int  8 8 7 10 7 8 8 6 6 1 ...
##  $ yedge : int  0 4 3 2 5 9 7 2 1 1 ...
##  $ yedgex: int  8 10 9 8 10 7 10 7 7 8 ...
##Step 2: Preparing the Data
letters_train <- letters[1:18000, ]
letters_test <- letters[18001:20000, ]
##Step 3: Training a Model on the Data
library(kernlab)
## Warning: package 'kernlab' was built under R version 3.4.4
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "vanilladot")
##  Setting default kernel parameters
letter_classifier
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 7886 
## 
## Objective Function Value : -15.3458 -21.3403 -25.7672 -6.8685 -8.8812 -35.9555 -59.5883 -18.1975 -65.6075 -41.5654 -18.8559 -39.3558 -36.9961 -60.3052 -15.1694 -42.144 -35.0941 -19.4069 -15.8234 -38.6718 -33.3013 -8.5298 -12.4387 -38.2194 -14.3682 -9.5508 -165.7154 -53.2778 -79.2163 -134.5053 -184.4809 -58.9285 -46.3252 -81.004 -28.1341 -29.6955 -27.5983 -38.1764 -47.2889 -137.0497 -208.1396 -239.2616 -23.8945 -10.9655 -64.228 -12.2139 -55.7818 -10.8001 -21.2407 -11.1795 -121.5639 -33.2229 -267.3926 -81.0708 -9.4937 -4.6577 -161.5171 -86.7114 -20.9146 -16.8272 -86.6582 -16.7205 -30.3036 -20.0054 -26.2331 -29.9289 -56.1072 -11.6335 -5.2564 -14.8153 -4.983 -4.8171 -8.5044 -43.2267 -55.9 -214.755 -47.0748 -49.6539 -50.2278 -18.3767 -19.1813 -97.6132 -113.6502 -42.4112 -32.5859 -127.4807 -33.7418 -30.7568 -40.0953 -18.6792 -5.4826 -49.3916 -10.6142 -20.0286 -63.8287 -183.8297 -57.0671 -43.3721 -35.2783 -85.4451 -145.9585 -11.8002 -6.1194 -12.5323 -33.5245 -155.2248 -57.2602 -194.0785 -111.0155 -10.8207 -16.7926 -3.7766 -77.3561 -7.9004 -106.5759 -52.523 -107.0402 -78.0148 -74.4773 -24.8166 -13.2372 -7.8706 -27.2788 -13.2342 -280.2869 -32.7288 -25.9531 -149.5447 -153.8495 -10.0146 -40.8917 -6.7333 -65.2053 -72.818 -35.1252 -246.7046 -38.0738 -16.9126 -158.18 -184.0021 -50.8427 -28.7686 -164.5969 -97.8359 -386.1426 -160.3188 -181.8759 -38.3648 -37.2272 -60.116 -28.2074 -53.7383 -7.8729 -12.3159 -37.8942 -72.6434 -211.8342 -58.5023 -105.1605 -176.7259 -685.8994 -142.8147 -159.635 -366.9437 -37.6409 -73.1357 -175.1906 -131.2833 -41.1464 -77.8404 -57.8131 -8.6365 -251.3728 -14.0836 -36.5144 -2.2292 -6.1598 -16.8011 -26.5165 -67.19 -21.3366 -221.4815 -22.9219 -4.2616 -4.7901 -0.8263 -134.7538 -8.8843 -83.1109 -23.1019 -14.4251 -5.7337 -17.5244 -29.7925 -23.9243 -88.9084 -28.6719 -106.0564 -16.4981 -10.6486 -7.9315 -1.5742 -91.1706 -7.3819 -118.2628 -117.5543 -48.5606 -26.6093 -71.2968 -30.4913 -63.5712 -279.2921 -46.3025 -50.4912 -37.9431 -21.5243 -11.6202 -134.9023 -7.516 -5.8131 -10.1595 -13.6329 -27.0293 -25.7282 -151.8511 -39.0524 -105.4861 -34.2434 -15.7051 -10.2304 -3.6687 -98.2094 -7.4666 -15.2668 -75.1283 -116.5382 -16.6429 -14.9215 -55.1062 -3.0636 -8.4262 -93.6829 -38.1162 -123.1859 -4.9078 -9.1612 -1.3077 -102.9021 -23.1138 -8.5262 -57.2623 -3.4297 -20.9579 -78.2019 -50.3741 -62.3531 -6.4908 -21.9308 -2.3736 -84.3835 -126.3997 -114.8723 -26.4109 -21.5589 -61.6405 -34.9162 -66.3243 -25.1148 -6.7203 -4.6695 -65.3518 -39.7924 -67.3505 -36.2154 -10.9031 -62.2195 -14.9491 -24.3238 -65.0847 -4.9657 -64.2797 -278.2873 -14.6902 -13.9198 -18.2059 -9.8972 -78.2645 -17.454 -49.5929 -55.7786 -28.7673 -15.9476 -47.531 -17.4379 -71.0516 -5.6899 -6.2519 -97.5508 -3.8196 -7.0502 -1.1238 -147.6952 -28.2018 -414.2586 -32.3275 -35.1191 -4.9605 -90.2307 -151.3409 -90.0329 -27.9491 -42.4688 -12.5118 -26.4828 -2.0045 -62.195 -9.1662 -178.4616 -1.9406 -1.9871 -11.3982 -0.5214 -29.6136 -35.0449 -6.7569 
## Training error : 0.1335
##Step 4: Evaluating Model Performance
letter_predictions <- predict(letter_classifier, letters_test)
table(letter_predictions, letters_test$letter)
##                   
## letter_predictions  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R
##                  A 73  0  0  0  0  0  0  0  0  1  0  0  0  0  3  0  4  0
##                  B  0 61  0  3  2  0  1  1  0  0  1  1  0  0  0  2  0  1
##                  C  0  0 64  0  2  0  4  2  1  0  1  2  0  0  1  0  0  0
##                  D  2  1  0 67  0  0  1  3  3  2  1  2  0  3  4  2  1  2
##                  E  0  0  1  0 64  1  1  0  0  0  2  2  0  0  0  0  2  0
##                  F  0  0  0  0  0 70  1  1  4  0  0  0  0  0  0  5  1  0
##                  G  1  1  2  1  3  2 68  1  0  0  0  1  0  0  0  0  4  1
##                  H  0  0  0  1  0  1  0 46  0  2  3  1  1  1  9  0  0  5
##                  I  0  0  0  0  0  0  0  0 65  3  0  0  0  0  0  0  0  0
##                  J  0  1  0  0  0  1  0  0  3 61  0  0  0  0  1  0  0  0
##                  K  0  1  4  0  0  0  0  5  0  0 56  0  0  2  0  0  0  4
##                  L  0  0  0  0  1  0  0  1  0  0  0 63  0  0  0  0  0  0
##                  M  0  0  1  0  0  0  1  0  0  0  0  0 70  2  0  0  0  0
##                  N  0  0  0  0  0  0  0  0  0  0  0  0  0 77  0  0  0  1
##                  O  0  0  1  1  0  0  0  1  0  1  0  0  0  0 49  1  2  0
##                  P  0  0  0  0  0  3  0  0  0  0  0  0  0  0  2 69  0  0
##                  Q  0  0  0  0  0  0  3  1  0  0  0  2  0  0  2  1 52  0
##                  R  0  4  0  0  1  0  0  3  0  0  3  0  0  0  1  0  0 64
##                  S  0  1  0  0  1  1  1  0  1  1  0  0  0  0  0  0  6  0
##                  T  0  0  0  0  1  1  0  0  0  0  1  0  0  0  0  0  0  0
##                  U  0  0  2  1  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##                  V  0  0  0  0  0  0  0  0  0  0  0  0  1  0  1  0  0  0
##                  W  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0
##                  X  0  1  0  0  1  0  0  1  0  0  1  4  0  0  0  0  0  1
##                  Y  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  4  0  0
##                  Z  1  0  0  0  2  0  0  0  0  2  0  0  0  0  0  0  0  0
##                   
## letter_predictions  S  T  U  V  W  X  Y  Z
##                  A  0  1  2  0  1  0  0  0
##                  B  3  0  0  0  0  0  0  0
##                  C  0  0  0  0  0  0  0  0
##                  D  0  0  0  0  0  0  1  0
##                  E  6  0  0  0  0  1  0  0
##                  F  2  0  0  1  0  0  2  0
##                  G  3  2  0  0  0  0  0  0
##                  H  0  3  0  2  0  0  1  0
##                  I  2  0  0  0  0  2  1  0
##                  J  1  0  0  0  0  1  0  4
##                  K  0  1  2  0  0  4  0  0
##                  L  0  0  0  0  0  0  0  0
##                  M  0  0  1  0  6  0  0  0
##                  N  0  0  1  0  2  0  0  0
##                  O  0  0  1  0  0  0  0  0
##                  P  0  0  0  0  0  0  1  0
##                  Q  1  0  0  0  0  0  0  0
##                  R  0  1  0  1  0  0  0  0
##                  S 47  1  0  0  0  1  0  6
##                  T  1 83  1  0  0  0  2  2
##                  U  0  0 83  0  0  0  0  0
##                  V  0  0  0 64  1  0  1  0
##                  W  0  0  0  3 59  0  0  0
##                  X  0  0  0  0  0 76  1  0
##                  Y  0  1  0  0  0  1 58  0
##                  Z  5  1  0  0  0  0  0 70
agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE  TRUE 
##   321  1679
###The classification was correct 1679 times out of 2000 records.
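That is 1679/2000, roughly 84% accuracy. It can be expressed directly, and a common follow-up (a sketch, not run here) swaps the linear kernel for a Gaussian one, which often improves accuracy on this letter data:
prop.table(table(agreement))  # accuracy as a proportion, about 0.84
# sketch: the RBF (Gaussian) kernel often outperforms vanilladot here
letter_classifier_rbf <- ksvm(letter ~ ., data = letters_train, kernel = "rbfdot")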

Method #3. Naïve Bayes Algorithm

##Step 1: Collecting the Data

sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)

str(sms_raw)
## 'data.frame':    5559 obs. of  2 variables:
##  $ type: chr  "ham" "ham" "ham" "spam" ...
##  $ text: chr  "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline"| __truncated__ ...
head(sms_raw)
##   type
## 1  ham
## 2  ham
## 3  ham
## 4 spam
## 5 spam
## 6  ham
##                                                                                                                                                                text
## 1                                                                                                                 Hope you are having a good week. Just checking in
## 2                                                                                                                                           K..give back my thanks.
## 3                                                                                                                       Am also doing in cbe only. But have to pay.
## 4            complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out! Box434SK38WP150PPM18+
## 5 okmail: Dear Dave this is your final notice to collect your 4* Tenerife Holiday or #5000 CASH award! Call 09061743806 from landline. TCs SAE Box326 CW25WX 150ppm
## 6                                                                                                                Aiya we discuss later lar... Pick u up at 4 is it?
##Step 2: Exploring and Preparing the Data
library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
print(sms_corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 5559
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
strwrap(sms_corpus[[1]])
## [1] "Hope you are having a good week. Just checking in"
lapply(sms_corpus[10:15], as.character)
## $`10`
## [1] "fyi I'm at usf now, swing by the room whenever"
## 
## $`11`
## [1] "Sure thing big man. i have hockey elections at 6, shouldnâ<U+0082>¬Ë<U+009C>t go on longer than an hour though"
## 
## $`12`
## [1] "I anything lor..."
## 
## $`13`
## [1] "By march ending, i should be ready. But will call you for sure. The problem is that my capital never complete. How far with you. How's work and the ladies"
## 
## $`14`
## [1] "Hmm well, night night "
## 
## $`15`
## [1] "K I'll be sure to get up before noon and see what's what"
corpus_clean <- tm_map(sms_corpus, tolower)
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
strwrap(sms_corpus[[1]])
## [1] "Hope you are having a good week. Just checking in"
strwrap(corpus_clean[[1]])
## [1] "hope good week just checking"
corpus_clean <- tm_map(corpus_clean, PlainTextDocument)
sms_dtm <- DocumentTermMatrix(corpus_clean)
sms_raw_train <- sms_raw[1:4169, ]
sms_raw_test <- sms_raw[4170:5559, ]
sms_dtm_train <- sms_dtm[1:4169, ]
sms_dtm_test <- sms_dtm[4170:5559, ]
sms_corpus_train <- corpus_clean[1:4169]
sms_corpus_test <- corpus_clean[4170:5559]
prop.table(table(sms_raw_train$type))
## 
##       ham      spam 
## 0.8647158 0.1352842
prop.table(table(sms_raw_test$type))
## 
##       ham      spam 
## 0.8683453 0.1316547
##Creating Word Clouds
library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(sms_corpus_train, min.freq = 40, random.order = FALSE)
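To contrast the two classes, separate clouds can be drawn for spam and ham (a sketch, not run here; wordcloud() accepts a raw text vector and builds a corpus internally):
spam <- subset(sms_raw_train, type == "spam")
ham <- subset(sms_raw_train, type == "ham")
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))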

##Step 3: Training a Model on the Data
convert_count <- function(x) {
  # naiveBayes() works with categorical features, so convert raw term
  # counts into a binary "No"/"Yes" presence indicator
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  return(x)
}
five_times_words <- findFreqTerms(sms_dtm_train, 5)
length(five_times_words)
## [1] 1228
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
sms_train <- DocumentTermMatrix(sms_corpus_train, control=list(dictionary=five_times_words))
sms_test <- DocumentTermMatrix(sms_corpus_test, control=list(dictionary=five_times_words))
sms_train <- apply(sms_train, 2, convert_count)
sms_test <- apply(sms_test, 2, convert_count)
sms_classifier <- naiveBayes(sms_train, factor(sms_raw_train$type))
class(sms_classifier)
## [1] "naiveBayes"
##Step 4: Evaluating Model Performance
sms_test_pred <- predict(sms_classifier, newdata=sms_test)
table(sms_test_pred, sms_raw_test$type)
##              
## sms_test_pred  ham spam
##          ham  1202   31
##          spam    5  152
###Of the 1207 actual ham messages, 1202 (about 99.6%) were classified correctly; 31 of the 183 spam messages slipped through as ham.
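A common refinement (a sketch, not run here) is Laplace smoothing, which stops words seen in only one class from exerting veto power over a classification:
# sketch: laplace = 1 adds one pseudo-count to every word/class pair
sms_classifier2 <- naiveBayes(sms_train, factor(sms_raw_train$type), laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, newdata = sms_test)
table(sms_test_pred2, sms_raw_test$type)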

Method #4. Adding regression to trees

##Step 1: Collecting the Data
wine <- read.csv("whitewines.csv")
str(wine)
## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
##  $ volatile.acidity    : num  0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
##  $ citric.acid         : num  0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
##  $ residual.sugar      : num  1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
##  $ chlorides           : num  0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
##  $ free.sulfur.dioxide : num  6 41 33 11 36 22 33 17 34 40 ...
##  $ total.sulfur.dioxide: num  62 113 123 74 119 95 152 67 90 130 ...
##  $ density             : num  0.993 0.999 0.995 0.991 0.993 ...
##  $ pH                  : num  3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
##  $ sulphates           : num  0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
##  $ alcohol             : num  10.4 8.9 10.1 11.2 10.9 ...
##  $ quality             : int  5 6 6 4 6 6 6 6 6 7 ...
hist(wine$quality)

##Step 2: Exploring and Preparing the Data
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]

##Step 3: Training a Model on the Data
library(rpart)
m.rpart <- rpart(quality ~ ., data=wine_train)
m.rpart
## n= 3750 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3750 2945.53200 5.870933  
##    2) alcohol< 10.85 2372 1418.86100 5.604975  
##      4) volatile.acidity>=0.2275 1611  821.30730 5.432030  
##        8) volatile.acidity>=0.3025 688  278.97670 5.255814 *
##        9) volatile.acidity< 0.3025 923  505.04230 5.563380 *
##      5) volatile.acidity< 0.2275 761  447.36400 5.971091 *
##    3) alcohol>=10.85 1378 1070.08200 6.328737  
##      6) free.sulfur.dioxide< 10.5 84   95.55952 5.369048 *
##      7) free.sulfur.dioxide>=10.5 1294  892.13600 6.391036  
##       14) alcohol< 11.76667 629  430.11130 6.173291  
##         28) volatile.acidity>=0.465 11   10.72727 4.545455 *
##         29) volatile.acidity< 0.465 618  389.71680 6.202265 *
##       15) alcohol>=11.76667 665  403.99400 6.596992 *
library(rpart.plot)
rpart.plot(m.rpart, digits=3)

rpart.plot(m.rpart, digits=4, fallen.leaves = TRUE, type = 3, extra = 101)

##Step 4: Evaluating Model Performance
p.rpart <- predict(m.rpart,wine_test)
summary(p.rpart)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.545   5.563   5.971   5.893   6.202   6.597
summary(wine_test$quality)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.901   6.000   9.000
cor(p.rpart, wine_test$quality)
## [1] 0.5369525
###A correlation of 0.54 between predicted and actual quality is only moderate.
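Correlation only measures how well predictions track the actual values; the mean absolute error (a sketch, assuming the objects above) reports how far off they are on average:
# sketch: mean absolute error between predicted and actual quality
MAE <- function(actual, predicted) mean(abs(actual - predicted))
MAE(wine_test$quality, p.rpart)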

ONLINE NEWS POPULARITY DATA ANALYSIS

Tree based classification

Step 1: Collecting Data

news<-read.csv('OnlineNewsPopularity.csv',header = TRUE)
news<-news[, 29:61]
str(news)
## 'data.frame':    39644 obs. of  33 variables:
##  $ self_reference_min_shares   : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares   : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess  : num  496 0 918 0 3151 ...
##  $ weekday_is_monday           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                      : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                      : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                      : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                      : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                      : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity         : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity   : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words  : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words  : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words         : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words         : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity       : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity       : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity       : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity       : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity       : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity       : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity          : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity    : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity      : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity: num  0.188 0 0 0 0.136 ...
##  $ shares                      : int  593 711 1500 1200 505 855 556 891 3600 710 ...
##Step 2: Exploring and Preparing data
library(caTools)
set.seed(123)
news$popular <- ifelse(news$avg_positive_polarity > 0.5, 1, 0)
news <- news[, -23]
split <- sample.split(news$popular, SplitRatio = 0.8)
training_set <- subset(news, split == TRUE)
test_set <- subset(news, split == FALSE)
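As with the credit data, it is worth confirming (a sketch, not run here) that the split preserved the class balance:
prop.table(table(training_set$popular))
prop.table(table(test_set$popular))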

##Step 3: Training and Evaluating a Model on the Data
library(C50)
classifier <- glm(formula = popular ~ ., family = binomial, data = training_set)
classifier
## 
## Call:  glm(formula = popular ~ ., family = binomial, data = training_set)
## 
## Coefficients:
##                  (Intercept)     self_reference_min_shares  
##                   -1.830e+01                     1.673e-06  
##    self_reference_max_shares    self_reference_avg_sharess  
##                    1.346e-06                    -7.244e-07  
##            weekday_is_monday            weekday_is_tuesday  
##                   -1.572e-01                    -2.502e-01  
##         weekday_is_wednesday           weekday_is_thursday  
##                   -8.288e-02                    -2.687e-01  
##            weekday_is_friday           weekday_is_saturday  
##                   -1.470e-01                    -1.838e-01  
##            weekday_is_sunday                    is_weekend  
##                           NA                            NA  
##                       LDA_00                        LDA_01  
##                   -2.671e-01                    -8.843e-02  
##                       LDA_02                        LDA_03  
##                   -3.256e-01                    -7.457e-02  
##                       LDA_04           global_subjectivity  
##                   -8.800e-01                    -6.027e-01  
##    global_sentiment_polarity    global_rate_positive_words  
##                    2.711e+01                    -5.111e+01  
##   global_rate_negative_words           rate_positive_words  
##                    4.332e+01                    -3.067e+00  
##          rate_negative_words         min_positive_polarity  
##                    6.533e+00                     1.586e+01  
##        max_positive_polarity         avg_negative_polarity  
##                    1.089e+01                    -7.236e+00  
##        min_negative_polarity         max_negative_polarity  
##                    1.261e+00                     1.547e+00  
##           title_subjectivity      title_sentiment_polarity  
##                   -2.547e-01                    -3.948e-01  
##       abs_title_subjectivity  abs_title_sentiment_polarity  
##                    3.005e-02                     1.036e+00  
##                       shares  
##                   -2.013e-06  
## 
## Degrees of Freedom: 31715 Total (i.e. Null);  31685 Residual
## Null Deviance:       12660 
## Residual Deviance: 5585  AIC: 5647
prob_pred <- predict(classifier, newdata = test_set[-33], type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
y_pred <- ifelse(prob_pred > 0.5, 1, 0)
cm <- table(test_set[, 33], y_pred)
cm
##    y_pred
##        0    1
##   0 7478   51
##   1  200  199
###7478 records were correctly classified as not popular and 199 as popular; 200 popular articles were missed.
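Overall accuracy follows directly from the confusion matrix, (7478 + 199) / 7928, roughly 97% (a quick sketch using the cm object above):
sum(diag(cm)) / sum(cm)  # overall accuracy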
#Tree Visualization
library(rpart)
library(rpart.plot)
m.rpart <- rpart(popular ~ ., data = training_set)
m.rpart
## n= 31716 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 31716 1517.48500 0.05038466  
##    2) global_sentiment_polarity< 0.3182905 30960 1168.24000 0.03927649  
##      4) min_positive_polarity< 0.2678571 30271  886.40280 0.03019391  
##        8) global_sentiment_polarity< 0.240115 28139  583.37640 0.02118057  
##         16) min_positive_polarity< 0.1431818 26114  378.35340 0.01470476 *
##         17) min_positive_polarity>=0.1431818 2025  189.80540 0.10469140  
##           34) max_positive_polarity< 0.8416667 1463   37.96036 0.02665755 *
##           35) max_positive_polarity>=0.8416667 562  119.74560 0.30782920 *
##        9) global_sentiment_polarity>=0.240115 2132  270.56850 0.14915570  
##         18) rate_positive_words>=0.7639319 1883  155.47110 0.09081253 *
##         19) rate_positive_words< 0.7639319 249   60.21687 0.59036140 *
##      5) min_positive_polarity>=0.2678571 689  169.62840 0.43831640  
##       10) max_positive_polarity< 0.575 292    0.00000 0.00000000 *
##       11) max_positive_polarity>=0.575 397   72.26700 0.76070530 *
##    3) global_sentiment_polarity>=0.3182905 756  188.97880 0.50529100  
##      6) max_positive_polarity< 0.825 185   27.75135 0.18378380 *
##      7) max_positive_polarity>=0.825 571  135.90890 0.60945710 *
rpart.plot(m.rpart, digits=1)

rpart.plot(m.rpart, digits=1, fallen.leaves=TRUE, type=3, extra=101)

summary(test_set$max_positive_polarity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.6000  0.8000  0.7532  1.0000  1.0000
summary(test_set$popular)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05033 0.00000 1.00000
cor(test_set$max_positive_polarity, test_set$popular)
## [1] 0.1760677
summary(test_set$global_sentiment_polarity)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.37766  0.05737  0.12044  0.11976  0.17808  0.57374
summary(test_set$popular)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05033 0.00000 1.00000
cor(test_set$global_sentiment_polarity, test_set$popular)
## [1] 0.2417969
###A correlation of 0.24 indicates only a weak relationship with popularity.

Method - Support Vector Machines

This method reuses the preprocessed training_set and test_set from the tree-based classification above.

library(kernlab)

svm_classifier <- ksvm(popular ~ ., data = training_set, kernel = "vanilladot")
svm_classifier
svmpred <- predict(svm_classifier, test_set)
svmpred1 <- ifelse(svmpred > 0.5, 1, 0)
svmc1 <- table(svmpred1, test_set$popular)
svmc1
###Almost all of the unpopular articles are predicted correctly, but not the popular ones.