library(readr)
# Load the credit data. Prefer a copy in the working directory (the way the
# other data sets in this script are loaded) so the script is not tied to one
# machine; fall back to the original author's absolute path otherwise.
credit_path <- "credit.csv"
if (!file.exists(credit_path)) {
  credit_path <- "C:/Users/PinakKumar/Desktop/credit.csv"
}
credit <- read_csv(credit_path)
## Parsed with column specification:
## cols(
## checking_balance = col_character(),
## months_loan_duration = col_integer(),
## credit_history = col_character(),
## purpose = col_character(),
## amount = col_integer(),
## savings_balance = col_character(),
## employment_duration = col_character(),
## percent_of_income = col_integer(),
## years_at_residence = col_integer(),
## age = col_integer(),
## other_credit = col_character(),
## housing = col_character(),
## existing_loans_count = col_integer(),
## job = col_character(),
## dependents = col_integer(),
## phone = col_character(),
## default = col_character()
## )
View(credit)
str(credit)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 17 variables:
## $ checking_balance : chr "< 0 DM" "1 - 200 DM" "unknown" "< 0 DM" ...
## $ months_loan_duration: int 6 48 12 42 24 36 24 36 12 30 ...
## $ credit_history : chr "critical" "good" "critical" "good" ...
## $ purpose : chr "furniture/appliances" "furniture/appliances" "education" "furniture/appliances" ...
## $ amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ savings_balance : chr "unknown" "< 100 DM" "< 100 DM" "< 100 DM" ...
## $ employment_duration : chr "> 7 years" "1 - 4 years" "4 - 7 years" "4 - 7 years" ...
## $ percent_of_income : int 4 2 2 2 3 2 3 2 2 4 ...
## $ years_at_residence : int 4 2 3 4 4 4 4 2 4 2 ...
## $ age : int 67 22 49 45 53 35 53 35 61 28 ...
## $ other_credit : chr "none" "none" "none" "none" ...
## $ housing : chr "own" "own" "own" "other" ...
## $ existing_loans_count: int 2 1 1 1 2 1 1 1 1 2 ...
## $ job : chr "skilled" "skilled" "unskilled" "skilled" ...
## $ dependents : int 1 1 2 2 2 2 1 1 1 1 ...
## $ phone : chr "yes" "no" "no" "no" ...
## $ default : chr "no" "yes" "no" "no" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 17
## .. ..$ checking_balance : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ months_loan_duration: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ credit_history : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ purpose : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ amount : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ savings_balance : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ employment_duration : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ percent_of_income : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ years_at_residence : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ age : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ other_credit : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ housing : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ existing_loans_count: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ job : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ dependents : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ phone : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ default : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
##Step 2: Exploring the data
summary(credit$amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
table(credit$default)
##
## no yes
## 700 300
# Shuffle the rows reproducibly: fix the seed, draw one uniform random number
# per row, and reorder the rows by those draws. The number of draws follows
# the data (1000 rows here) instead of being hard-coded.
set.seed(12345)
credit_rand <- credit[order(runif(nrow(credit))), ]
summary(credit$amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
summary(credit_rand$amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
# Use the first 90% of the shuffled rows for training and hold out the rest
# for testing. Deriving the cut point from the row count (900 for the 1000-row
# credit data) avoids out-of-range row indices if the data set changes size.
n_train <- floor(0.9 * nrow(credit_rand))
credit_train <- credit_rand[seq_len(n_train), ]
credit_test <- credit_rand[-seq_len(n_train), ]
prop.table(table(credit_train$default))
##
## no yes
## 0.7022222 0.2977778
prop.table(table(credit_test$default))
##
## no yes
## 0.68 0.32
str(credit_train$default)
## chr [1:900] "yes" "no" "no" "no" "yes" "no" "yes" "yes" "no" "no" ...
credit_train$default<-as.factor(credit_train$default)
str(credit_train$default)
## Factor w/ 2 levels "no","yes": 2 1 1 1 2 1 2 2 1 1 ...
##Step 3: Training a model on the data
library(C50)
## Warning: package 'C50' was built under R version 3.4.4
# Train the C5.0 tree on the 16 predictors (column 17 is the `default` label).
# The predictors come from read_csv(), i.e. a tibble with character columns;
# passed as-is, a whole column was serialised as a single value (see the
# "bad value of `c(...)`" message in the summary output) and the tree came out
# with size 0. Hand C5.0 a plain data.frame with factor predictors instead.
credit_predictors <- as.data.frame(credit_train[-17])
is_text <- vapply(credit_predictors, is.character, logical(1))
credit_predictors[is_text] <- lapply(credit_predictors[is_text], factor)
credit_model <- C5.0(x = credit_predictors, y = credit_train$default)
## c50 code called exit with value 1
credit_model
##
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
##
## Classification Tree
## Number of samples: 900
## Number of predictors: 16
##
## Tree size: 0
##
## Non-standard options: attempt to group attributes
summary(credit_model)
##
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
##
##
## C5.0 [Release 2.07 GPL Edition] Mon Jun 18 10:28:06 2018
## -------------------------------
##
## Class specified by attribute `outcome'
##
## *** line 1 of `undefined.data': bad value of `c("< 0 DM", "1 - 200 DM", "1 - 200 DM", "< 0 DM", "1 - 200 DM", "unknown", "< 0 DM", "1 - 200 DM", "1 - 200 DM", "< 0 DM", "unknown", "> 200 DM", "> 200 DM", "1 - 200 DM", "unknown", "unknown", "unknown", "unknown", "unknown", "< 0 DM", "unknown", "< 0 DM", "> 200 DM", "unknown", "< 0 DM", "unknown", "> 200 DM", "1 - 200 DM", "unknown", "unknown", "unknown", "unknown", "< 0 DM", "1 - 200 DM", "unknown", "1 - 200 DM", "1 - 200 DM", "1 - 200 DM", "unknown", "< 0 DM", "unknown", "unknown", "< 0 DM",' for attribute `checking_balance'
##
## Error limit exceeded
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.4.4
##Step 4: Evaluating Model Performance
# Give predict() the test predictors as a plain data.frame with factor
# columns rather than the tibble of character columns produced by read_csv().
credit_test_df <- as.data.frame(credit_test)
text_cols <- vapply(credit_test_df, is.character, logical(1))
credit_test_df[text_cols] <- lapply(credit_test_df[text_cols], factor)
cred_pred <- predict(credit_model, credit_test_df)
# Cross-tabulate actual against predicted defaults; the chi-square, column and
# row proportions are switched off so each cell shows the count and its share
# of the whole table.
CrossTable(credit_test$default, cred_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c("actual default", "predicted default"))
###Author's reading of the table: out of 100 test records, 10 non-defaults
###were misclassified as defaults, i.e. false positives or a Type I error, and
###16 actual defaults were misclassified as not defaulting, i.e. false
###negatives or a Type II error.
###NOTE(review): no CrossTable output is shown in this file; re-check these
###counts against the table once the chunk has been run.
## Alternate Solution:
library(ggplot2)
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.4
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
# Fit a classification tree for `default` from three predictors. cp = 0
# removes the complexity penalty, so the tree is grown without cost-complexity
# pruning; summary() then prints the CP table and per-node details.
DT <- rpart(
  formula = default ~ checking_balance + credit_history + months_loan_duration,
  data = credit_train,
  method = "class",
  cp = 0
)
summary(DT)
## Call:
## rpart(formula = default ~ checking_balance + credit_history +
## months_loan_duration, data = credit_train, method = "class",
## cp = 0)
## n= 900
##
## CP nsplit rel error xerror xstd
## 1 0.0410447761 0 1.0000000 1.0000000 0.05118820
## 2 0.0130597015 3 0.8768657 0.9440299 0.05032182
## 3 0.0074626866 5 0.8507463 0.9552239 0.05050180
## 4 0.0055970149 7 0.8358209 0.9589552 0.05056104
## 5 0.0037313433 9 0.8246269 0.9738806 0.05079426
## 6 0.0018656716 11 0.8171642 0.9402985 0.05026107
## 7 0.0012437811 13 0.8134328 0.9365672 0.05019994
## 8 0.0009328358 16 0.8097015 0.9440299 0.05032182
## 9 0.0000000000 20 0.8059701 0.9440299 0.05032182
##
## Variable importance
## checking_balance credit_history months_loan_duration
## 48 27 24
##
## Node number 1: 900 observations, complexity param=0.04104478
## predicted class=no expected loss=0.2977778 P(node) =1
## class counts: 632 268
## probabilities: 0.702 0.298
## left son=2 (414 obs) right son=3 (486 obs)
## Primary splits:
## checking_balance splits as RLRL, improve=39.30076, (0 missing)
## credit_history splits as LLRLR, improve=14.01920, (0 missing)
## months_loan_duration < 34.5 to the left, improve=11.31821, (0 missing)
## Surrogate splits:
## credit_history splits as LRRRR, agree=0.592, adj=0.114, (0 split)
## months_loan_duration < 5.5 to the left, agree=0.547, adj=0.014, (0 split)
##
## Node number 2: 414 observations, complexity param=0.001243781
## predicted class=no expected loss=0.1376812 P(node) =0.46
## class counts: 357 57
## probabilities: 0.862 0.138
## left son=4 (355 obs) right son=5 (59 obs)
## Primary splits:
## credit_history splits as LLRRR, improve=1.869495, (0 missing)
## months_loan_duration < 16.5 to the left, improve=1.338274, (0 missing)
## checking_balance splits as -R-L, improve=1.155705, (0 missing)
##
## Node number 3: 486 observations, complexity param=0.04104478
## predicted class=no expected loss=0.4341564 P(node) =0.54
## class counts: 275 211
## probabilities: 0.566 0.434
## left son=6 (274 obs) right son=7 (212 obs)
## Primary splits:
## months_loan_duration < 22.5 to the left, improve=10.423870, (0 missing)
## credit_history splits as LLRLR, improve= 8.598609, (0 missing)
## checking_balance splits as R-L-, improve= 3.119042, (0 missing)
## Surrogate splits:
## credit_history splits as LLRRR, agree=0.611, adj=0.108, (0 split)
##
## Node number 4: 355 observations
## predicted class=no expected loss=0.1183099 P(node) =0.3944444
## class counts: 313 42
## probabilities: 0.882 0.118
##
## Node number 5: 59 observations, complexity param=0.001243781
## predicted class=no expected loss=0.2542373 P(node) =0.06555556
## class counts: 44 15
## probabilities: 0.746 0.254
## left son=10 (19 obs) right son=11 (40 obs)
## Primary splits:
## months_loan_duration < 16.5 to the left, improve=0.5202498, (0 missing)
## checking_balance splits as -R-L, improve=0.2699402, (0 missing)
## credit_history splits as --RLL, improve=0.1328814, (0 missing)
## Surrogate splits:
## checking_balance splits as -L-R, agree=0.712, adj=0.105, (0 split)
##
## Node number 6: 274 observations, complexity param=0.04104478
## predicted class=no expected loss=0.3430657 P(node) =0.3044444
## class counts: 180 94
## probabilities: 0.657 0.343
## left son=12 (249 obs) right son=13 (25 obs)
## Primary splits:
## credit_history splits as LLRLR, improve=7.817224, (0 missing)
## months_loan_duration < 11.5 to the left, improve=3.919498, (0 missing)
## checking_balance splits as R-L-, improve=1.868613, (0 missing)
##
## Node number 7: 212 observations, complexity param=0.0130597
## predicted class=yes expected loss=0.4481132 P(node) =0.2355556
## class counts: 95 117
## probabilities: 0.448 0.552
## left son=14 (173 obs) right son=15 (39 obs)
## Primary splits:
## months_loan_duration < 47.5 to the left, improve=2.635873, (0 missing)
## credit_history splits as LRRLR, improve=1.294771, (0 missing)
## checking_balance splits as R-L-, improve=1.058491, (0 missing)
##
## Node number 10: 19 observations
## predicted class=no expected loss=0.1578947 P(node) =0.02111111
## class counts: 16 3
## probabilities: 0.842 0.158
##
## Node number 11: 40 observations, complexity param=0.001243781
## predicted class=no expected loss=0.3 P(node) =0.04444444
## class counts: 28 12
## probabilities: 0.700 0.300
## left son=22 (31 obs) right son=23 (9 obs)
## Primary splits:
## months_loan_duration < 22.5 to the right, improve=1.516846, (0 missing)
## credit_history splits as --RLR, improve=0.122884, (0 missing)
##
## Node number 12: 249 observations, complexity param=0.0009328358
## predicted class=no expected loss=0.3052209 P(node) =0.2766667
## class counts: 173 76
## probabilities: 0.695 0.305
## left son=24 (72 obs) right son=25 (177 obs)
## Primary splits:
## months_loan_duration < 11.5 to the left, improve=3.888912, (0 missing)
## credit_history splits as LR-L-, improve=2.096469, (0 missing)
## checking_balance splits as R-L-, improve=1.643716, (0 missing)
##
## Node number 13: 25 observations
## predicted class=yes expected loss=0.28 P(node) =0.02777778
## class counts: 7 18
## probabilities: 0.280 0.720
##
## Node number 14: 173 observations, complexity param=0.0130597
## predicted class=yes expected loss=0.4855491 P(node) =0.1922222
## class counts: 84 89
## probabilities: 0.486 0.514
## left son=28 (77 obs) right son=29 (96 obs)
## Primary splits:
## checking_balance splits as R-L-, improve=0.9959275, (0 missing)
## credit_history splits as LRRRR, improve=0.8924092, (0 missing)
## months_loan_duration < 31.5 to the left, improve=0.3748116, (0 missing)
## Surrogate splits:
## credit_history splits as LRRLR, agree=0.618, adj=0.143, (0 split)
## months_loan_duration < 25 to the right, agree=0.566, adj=0.026, (0 split)
##
## Node number 15: 39 observations
## predicted class=yes expected loss=0.2820513 P(node) =0.04333333
## class counts: 11 28
## probabilities: 0.282 0.718
##
## Node number 22: 31 observations
## predicted class=no expected loss=0.2258065 P(node) =0.03444444
## class counts: 24 7
## probabilities: 0.774 0.226
##
## Node number 23: 9 observations
## predicted class=yes expected loss=0.4444444 P(node) =0.01
## class counts: 4 5
## probabilities: 0.444 0.556
##
## Node number 24: 72 observations
## predicted class=no expected loss=0.1666667 P(node) =0.08
## class counts: 60 12
## probabilities: 0.833 0.167
##
## Node number 25: 177 observations, complexity param=0.0009328358
## predicted class=no expected loss=0.3615819 P(node) =0.1966667
## class counts: 113 64
## probabilities: 0.638 0.362
## left son=50 (58 obs) right son=51 (119 obs)
## Primary splits:
## credit_history splits as LR-L-, improve=2.4929420, (0 missing)
## checking_balance splits as R-L-, improve=2.0530640, (0 missing)
## months_loan_duration < 19 to the right, improve=0.9713449, (0 missing)
## Surrogate splits:
## months_loan_duration < 19 to the right, agree=0.678, adj=0.017, (0 split)
##
## Node number 28: 77 observations, complexity param=0.005597015
## predicted class=no expected loss=0.4545455 P(node) =0.08555556
## class counts: 42 35
## probabilities: 0.545 0.455
## left son=56 (20 obs) right son=57 (57 obs)
## Primary splits:
## credit_history splits as RRRLL, improve=1.2905900, (0 missing)
## months_loan_duration < 26.5 to the left, improve=0.1060606, (0 missing)
##
## Node number 29: 96 observations, complexity param=0.007462687
## predicted class=yes expected loss=0.4375 P(node) =0.1066667
## class counts: 42 54
## probabilities: 0.438 0.562
## left son=58 (78 obs) right son=59 (18 obs)
## Primary splits:
## credit_history splits as LLLRR, improve=3.2500000, (0 missing)
## months_loan_duration < 31.5 to the left, improve=0.5105042, (0 missing)
##
## Node number 50: 58 observations
## predicted class=no expected loss=0.2413793 P(node) =0.06444444
## class counts: 44 14
## probabilities: 0.759 0.241
##
## Node number 51: 119 observations, complexity param=0.0009328358
## predicted class=no expected loss=0.4201681 P(node) =0.1322222
## class counts: 69 50
## probabilities: 0.580 0.420
## left son=102 (56 obs) right son=103 (63 obs)
## Primary splits:
## checking_balance splits as R-L-, improve=0.8403361, (0 missing)
## months_loan_duration < 19 to the right, improve=0.7629913, (0 missing)
## Surrogate splits:
## months_loan_duration < 14.5 to the left, agree=0.538, adj=0.018, (0 split)
##
## Node number 56: 20 observations
## predicted class=no expected loss=0.3 P(node) =0.02222222
## class counts: 14 6
## probabilities: 0.700 0.300
##
## Node number 57: 57 observations, complexity param=0.005597015
## predicted class=yes expected loss=0.4912281 P(node) =0.06333333
## class counts: 28 29
## probabilities: 0.491 0.509
## left son=114 (18 obs) right son=115 (39 obs)
## Primary splits:
## credit_history splits as LRR--, improve=0.2177238, (0 missing)
## months_loan_duration < 26.5 to the left, improve=0.2133124, (0 missing)
## Surrogate splits:
## months_loan_duration < 37.5 to the right, agree=0.754, adj=0.222, (0 split)
##
## Node number 58: 78 observations, complexity param=0.007462687
## predicted class=no expected loss=0.5 P(node) =0.08666667
## class counts: 39 39
## probabilities: 0.500 0.500
## left son=116 (16 obs) right son=117 (62 obs)
## Primary splits:
## credit_history splits as LRR--, improve=0.6290323, (0 missing)
## months_loan_duration < 31.5 to the left, improve=0.4814815, (0 missing)
##
## Node number 59: 18 observations
## predicted class=yes expected loss=0.1666667 P(node) =0.02
## class counts: 3 15
## probabilities: 0.167 0.833
##
## Node number 102: 56 observations
## predicted class=no expected loss=0.3571429 P(node) =0.06222222
## class counts: 36 20
## probabilities: 0.643 0.357
##
## Node number 103: 63 observations, complexity param=0.0009328358
## predicted class=no expected loss=0.4761905 P(node) =0.07
## class counts: 33 30
## probabilities: 0.524 0.476
## left son=206 (38 obs) right son=207 (25 obs)
## Primary splits:
## months_loan_duration < 16.5 to the left, improve=0.1590977, (0 missing)
##
## Node number 114: 18 observations
## predicted class=no expected loss=0.4444444 P(node) =0.02
## class counts: 10 8
## probabilities: 0.556 0.444
##
## Node number 115: 39 observations, complexity param=0.003731343
## predicted class=yes expected loss=0.4615385 P(node) =0.04333333
## class counts: 18 21
## probabilities: 0.462 0.538
## left son=230 (26 obs) right son=231 (13 obs)
## Primary splits:
## months_loan_duration < 33 to the left, improve=0.2307692, (0 missing)
##
## Node number 116: 16 observations
## predicted class=no expected loss=0.375 P(node) =0.01777778
## class counts: 10 6
## probabilities: 0.625 0.375
##
## Node number 117: 62 observations, complexity param=0.001865672
## predicted class=yes expected loss=0.4677419 P(node) =0.06888889
## class counts: 29 33
## probabilities: 0.468 0.532
## left son=234 (46 obs) right son=235 (16 obs)
## Primary splits:
## months_loan_duration < 33 to the left, improve=0.039446, (0 missing)
##
## Node number 206: 38 observations
## predicted class=no expected loss=0.4473684 P(node) =0.04222222
## class counts: 21 17
## probabilities: 0.553 0.447
##
## Node number 207: 25 observations
## predicted class=yes expected loss=0.48 P(node) =0.02777778
## class counts: 12 13
## probabilities: 0.480 0.520
##
## Node number 230: 26 observations, complexity param=0.003731343
## predicted class=no expected loss=0.5 P(node) =0.02888889
## class counts: 13 13
## probabilities: 0.500 0.500
## left son=460 (8 obs) right son=461 (18 obs)
## Primary splits:
## months_loan_duration < 25 to the right, improve=0.3611111, (0 missing)
## Surrogate splits:
## credit_history splits as -RL--, agree=0.769, adj=0.25, (0 split)
##
## Node number 231: 13 observations
## predicted class=yes expected loss=0.3846154 P(node) =0.01444444
## class counts: 5 8
## probabilities: 0.385 0.615
##
## Node number 234: 46 observations, complexity param=0.001865672
## predicted class=yes expected loss=0.4782609 P(node) =0.05111111
## class counts: 22 24
## probabilities: 0.478 0.522
## left son=468 (9 obs) right son=469 (37 obs)
## Primary splits:
## months_loan_duration < 29 to the right, improve=0.1336989, (0 missing)
##
## Node number 235: 16 observations
## predicted class=yes expected loss=0.4375 P(node) =0.01777778
## class counts: 7 9
## probabilities: 0.438 0.562
##
## Node number 460: 8 observations
## predicted class=no expected loss=0.375 P(node) =0.008888889
## class counts: 5 3
## probabilities: 0.625 0.375
##
## Node number 461: 18 observations
## predicted class=yes expected loss=0.4444444 P(node) =0.02
## class counts: 8 10
## probabilities: 0.444 0.556
##
## Node number 468: 9 observations
## predicted class=no expected loss=0.4444444 P(node) =0.01
## class counts: 5 4
## probabilities: 0.556 0.444
##
## Node number 469: 37 observations
## predicted class=yes expected loss=0.4594595 P(node) =0.04111111
## class counts: 17 20
## probabilities: 0.459 0.541
rpart.plot(DT, type=1, extra = 102)
##Step 1: Collecting the Data
# Read the letter-recognition data. Strings are converted to factors
# explicitly so that `letter` is a factor (as in the str() output below) on
# every R version; since R 4.0.0 read.csv() no longer does this by default.
letters <- read.csv("letterdata.csv", stringsAsFactors = TRUE)
str(letters)
## 'data.frame': 20000 obs. of 17 variables:
## $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
## $ xbox : int 2 5 4 7 2 4 4 1 2 11 ...
## $ ybox : int 8 12 11 11 1 11 2 1 2 15 ...
## $ width : int 3 3 6 6 3 5 5 3 4 13 ...
## $ height: int 5 7 8 6 1 8 4 2 4 9 ...
## $ onpix : int 1 2 6 3 1 3 4 1 2 7 ...
## $ xbar : int 8 10 10 5 8 8 8 8 10 13 ...
## $ ybar : int 13 5 6 9 6 8 7 2 6 2 ...
## $ x2bar : int 0 5 2 4 6 6 6 2 2 6 ...
## $ y2bar : int 6 4 6 6 6 9 6 2 6 2 ...
## $ xybar : int 6 13 10 4 6 5 7 8 12 12 ...
## $ x2ybar: int 10 3 3 4 5 6 6 2 4 1 ...
## $ xy2bar: int 8 9 7 10 9 6 6 8 8 9 ...
## $ xedge : int 0 2 3 6 1 0 2 1 1 8 ...
## $ xedgey: int 8 8 7 10 7 8 8 6 6 1 ...
## $ yedge : int 0 4 3 2 5 9 7 2 1 1 ...
## $ yedgex: int 8 10 9 8 10 7 10 7 7 8 ...
##Step 2: Preparing the Data
# Rows 1 to 18000 form the training set; rows 18001 to 20000 are kept for
# testing.
letters_train <- letters[seq_len(18000), ]
letters_test <- letters[seq(from = 18001, to = 20000), ]
##Step 3: Training a Model on the Data
library(kernlab)
## Warning: package 'kernlab' was built under R version 3.4.4
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "vanilladot")
## Setting default kernel parameters
letter_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 7886
##
## Objective Function Value : -15.3458 -21.3403 -25.7672 -6.8685 -8.8812 -35.9555 -59.5883 -18.1975 -65.6075 -41.5654 -18.8559 -39.3558 -36.9961 -60.3052 -15.1694 -42.144 -35.0941 -19.4069 -15.8234 -38.6718 -33.3013 -8.5298 -12.4387 -38.2194 -14.3682 -9.5508 -165.7154 -53.2778 -79.2163 -134.5053 -184.4809 -58.9285 -46.3252 -81.004 -28.1341 -29.6955 -27.5983 -38.1764 -47.2889 -137.0497 -208.1396 -239.2616 -23.8945 -10.9655 -64.228 -12.2139 -55.7818 -10.8001 -21.2407 -11.1795 -121.5639 -33.2229 -267.3926 -81.0708 -9.4937 -4.6577 -161.5171 -86.7114 -20.9146 -16.8272 -86.6582 -16.7205 -30.3036 -20.0054 -26.2331 -29.9289 -56.1072 -11.6335 -5.2564 -14.8153 -4.983 -4.8171 -8.5044 -43.2267 -55.9 -214.755 -47.0748 -49.6539 -50.2278 -18.3767 -19.1813 -97.6132 -113.6502 -42.4112 -32.5859 -127.4807 -33.7418 -30.7568 -40.0953 -18.6792 -5.4826 -49.3916 -10.6142 -20.0286 -63.8287 -183.8297 -57.0671 -43.3721 -35.2783 -85.4451 -145.9585 -11.8002 -6.1194 -12.5323 -33.5245 -155.2248 -57.2602 -194.0785 -111.0155 -10.8207 -16.7926 -3.7766 -77.3561 -7.9004 -106.5759 -52.523 -107.0402 -78.0148 -74.4773 -24.8166 -13.2372 -7.8706 -27.2788 -13.2342 -280.2869 -32.7288 -25.9531 -149.5447 -153.8495 -10.0146 -40.8917 -6.7333 -65.2053 -72.818 -35.1252 -246.7046 -38.0738 -16.9126 -158.18 -184.0021 -50.8427 -28.7686 -164.5969 -97.8359 -386.1426 -160.3188 -181.8759 -38.3648 -37.2272 -60.116 -28.2074 -53.7383 -7.8729 -12.3159 -37.8942 -72.6434 -211.8342 -58.5023 -105.1605 -176.7259 -685.8994 -142.8147 -159.635 -366.9437 -37.6409 -73.1357 -175.1906 -131.2833 -41.1464 -77.8404 -57.8131 -8.6365 -251.3728 -14.0836 -36.5144 -2.2292 -6.1598 -16.8011 -26.5165 -67.19 -21.3366 -221.4815 -22.9219 -4.2616 -4.7901 -0.8263 -134.7538 -8.8843 -83.1109 -23.1019 -14.4251 -5.7337 -17.5244 -29.7925 -23.9243 -88.9084 -28.6719 -106.0564 -16.4981 -10.6486 -7.9315 -1.5742 -91.1706 -7.3819 -118.2628 -117.5543 -48.5606 -26.6093 -71.2968 -30.4913 -63.5712 -279.2921 -46.3025 -50.4912 -37.9431 -21.5243 -11.6202 -134.9023 
-7.516 -5.8131 -10.1595 -13.6329 -27.0293 -25.7282 -151.8511 -39.0524 -105.4861 -34.2434 -15.7051 -10.2304 -3.6687 -98.2094 -7.4666 -15.2668 -75.1283 -116.5382 -16.6429 -14.9215 -55.1062 -3.0636 -8.4262 -93.6829 -38.1162 -123.1859 -4.9078 -9.1612 -1.3077 -102.9021 -23.1138 -8.5262 -57.2623 -3.4297 -20.9579 -78.2019 -50.3741 -62.3531 -6.4908 -21.9308 -2.3736 -84.3835 -126.3997 -114.8723 -26.4109 -21.5589 -61.6405 -34.9162 -66.3243 -25.1148 -6.7203 -4.6695 -65.3518 -39.7924 -67.3505 -36.2154 -10.9031 -62.2195 -14.9491 -24.3238 -65.0847 -4.9657 -64.2797 -278.2873 -14.6902 -13.9198 -18.2059 -9.8972 -78.2645 -17.454 -49.5929 -55.7786 -28.7673 -15.9476 -47.531 -17.4379 -71.0516 -5.6899 -6.2519 -97.5508 -3.8196 -7.0502 -1.1238 -147.6952 -28.2018 -414.2586 -32.3275 -35.1191 -4.9605 -90.2307 -151.3409 -90.0329 -27.9491 -42.4688 -12.5118 -26.4828 -2.0045 -62.195 -9.1662 -178.4616 -1.9406 -1.9871 -11.3982 -0.5214 -29.6136 -35.0449 -6.7569
## Training error : 0.1335
##Step 4: Evaluating Model Performance
letter_predictions <- predict(letter_classifier, letters_test)
table(letter_predictions, letters_test$letter)
##
## letter_predictions A B C D E F G H I J K L M N O P Q R
## A 73 0 0 0 0 0 0 0 0 1 0 0 0 0 3 0 4 0
## B 0 61 0 3 2 0 1 1 0 0 1 1 0 0 0 2 0 1
## C 0 0 64 0 2 0 4 2 1 0 1 2 0 0 1 0 0 0
## D 2 1 0 67 0 0 1 3 3 2 1 2 0 3 4 2 1 2
## E 0 0 1 0 64 1 1 0 0 0 2 2 0 0 0 0 2 0
## F 0 0 0 0 0 70 1 1 4 0 0 0 0 0 0 5 1 0
## G 1 1 2 1 3 2 68 1 0 0 0 1 0 0 0 0 4 1
## H 0 0 0 1 0 1 0 46 0 2 3 1 1 1 9 0 0 5
## I 0 0 0 0 0 0 0 0 65 3 0 0 0 0 0 0 0 0
## J 0 1 0 0 0 1 0 0 3 61 0 0 0 0 1 0 0 0
## K 0 1 4 0 0 0 0 5 0 0 56 0 0 2 0 0 0 4
## L 0 0 0 0 1 0 0 1 0 0 0 63 0 0 0 0 0 0
## M 0 0 1 0 0 0 1 0 0 0 0 0 70 2 0 0 0 0
## N 0 0 0 0 0 0 0 0 0 0 0 0 0 77 0 0 0 1
## O 0 0 1 1 0 0 0 1 0 1 0 0 0 0 49 1 2 0
## P 0 0 0 0 0 3 0 0 0 0 0 0 0 0 2 69 0 0
## Q 0 0 0 0 0 0 3 1 0 0 0 2 0 0 2 1 52 0
## R 0 4 0 0 1 0 0 3 0 0 3 0 0 0 1 0 0 64
## S 0 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 6 0
## T 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0
## U 0 0 2 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
## W 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## X 0 1 0 0 1 0 0 1 0 0 1 4 0 0 0 0 0 1
## Y 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0
## Z 1 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0
##
## letter_predictions S T U V W X Y Z
## A 0 1 2 0 1 0 0 0
## B 3 0 0 0 0 0 0 0
## C 0 0 0 0 0 0 0 0
## D 0 0 0 0 0 0 1 0
## E 6 0 0 0 0 1 0 0
## F 2 0 0 1 0 0 2 0
## G 3 2 0 0 0 0 0 0
## H 0 3 0 2 0 0 1 0
## I 2 0 0 0 0 2 1 0
## J 1 0 0 0 0 1 0 4
## K 0 1 2 0 0 4 0 0
## L 0 0 0 0 0 0 0 0
## M 0 0 1 0 6 0 0 0
## N 0 0 1 0 2 0 0 0
## O 0 0 1 0 0 0 0 0
## P 0 0 0 0 0 0 1 0
## Q 1 0 0 0 0 0 0 0
## R 0 1 0 1 0 0 0 0
## S 47 1 0 0 0 1 0 6
## T 1 83 1 0 0 0 2 2
## U 0 0 83 0 0 0 0 0
## V 0 0 0 64 1 0 1 0
## W 0 0 0 3 59 0 0 0
## X 0 0 0 0 0 76 1 0
## Y 0 1 0 0 0 1 58 0
## Z 5 1 0 0 0 0 0 70
# Mark each test record TRUE when the true letter and the predicted letter
# coincide, then count the misses (FALSE) and hits (TRUE).
agreement <- letters_test$letter == letter_predictions
table(agreement)
## agreement
## FALSE TRUE
## 321 1679
###The classification was correct 1679 times out of 2000 records.
##Step 1: Collecting the Data
sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)
str(sms_raw)
## 'data.frame': 5559 obs. of 2 variables:
## $ type: chr "ham" "ham" "ham" "spam" ...
## $ text: chr "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline"| __truncated__ ...
head(sms_raw)
## type
## 1 ham
## 2 ham
## 3 ham
## 4 spam
## 5 spam
## 6 ham
## text
## 1 Hope you are having a good week. Just checking in
## 2 K..give back my thanks.
## 3 Am also doing in cbe only. But have to pay.
## 4 complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out! Box434SK38WP150PPM18+
## 5 okmail: Dear Dave this is your final notice to collect your 4* Tenerife Holiday or #5000 CASH award! Call 09061743806 from landline. TCs SAE Box326 CW25WX 150ppm
## 6 Aiya we discuss later lar... Pick u up at 4 is it?
##Step 2: Exploring and Preparing the Data
library (tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
print(sms_corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 5559
# sms_corpus was already built from sms_raw$text above; rebuilding it here
# only repeated the same work, so just inspect the first message.
strwrap(sms_corpus[[1]])
## [1] "Hope you are having a good week. Just checking in"
lapply(sms_corpus[10:15], as.character)
## $`10`
## [1] "fyi I'm at usf now, swing by the room whenever"
##
## $`11`
## [1] "Sure thing big man. i have hockey elections at 6, shouldnâ<U+0082>¬Ë<U+009C>t go on longer than an hour though"
##
## $`12`
## [1] "I anything lor..."
##
## $`13`
## [1] "By march ending, i should be ready. But will call you for sure. The problem is that my capital never complete. How far with you. How's work and the ladies"
##
## $`14`
## [1] "Hmm well, night night "
##
## $`15`
## [1] "K I'll be sure to get up before noon and see what's what"
# Normalise the message text: lower-case it, drop digits and punctuation,
# remove the words in tm's default stop-word list, then collapse the leftover
# runs of whitespace.
# tolower() is a base function that returns a bare character vector, so it is
# wrapped in content_transformer() to keep every corpus element a tm text
# document; the remaining steps are tm transformations and are passed directly.
corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
strwrap(sms_corpus[[1]])
## [1] "Hope you are having a good week. Just checking in"
strwrap(corpus_clean[[1]])
## [1] "hope good week just checking"
corpus_clean <- tm_map(corpus_clean, PlainTextDocument)
sms_dtm <- DocumentTermMatrix(corpus_clean)
# Split the raw data frame, the document-term matrix and the cleaned corpus at
# the same position: messages 1-4169 for training, 4170-5559 for testing.
train_idx <- 1:4169
test_idx <- 4170:5559

sms_raw_train <- sms_raw[train_idx, ]
sms_dtm_train <- sms_dtm[train_idx, ]
sms_corpus_train <- corpus_clean[train_idx]

sms_raw_test <- sms_raw[test_idx, ]
sms_dtm_test <- sms_dtm[test_idx, ]
sms_corpus_test <- corpus_clean[test_idx]
prop.table(table(sms_raw_train$type))
##
## ham spam
## 0.8647158 0.1352842
prop.table(table(sms_raw_test$type))
##
## ham spam
## 0.8683453 0.1316547
##Creating Word Clouds
library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(sms_corpus_train, min.freq = 40, random.order = FALSE)
##Step 3: Training a Model on the Data
# Recode a vector of counts as a two-level factor: "Yes" where the count is
# positive, "No" otherwise (NA stays NA). Names on the input are kept.
convert_count <- function(x) {
  present <- x > 0
  factor(present, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
}
five_times_words <- findFreqTerms(sms_dtm_train, 5)
length(five_times_words)
## [1] 1228
library( e1071)
## Warning: package 'e1071' was built under R version 3.4.4
# Rebuild the train and test document-term matrices restricted to the frequent
# terms, recode every column of counts to "No"/"Yes" with convert_count(), and
# fit a naive Bayes classifier on the training indicators and message types.
dict_control <- list(dictionary = five_times_words)
sms_train <- DocumentTermMatrix(sms_corpus_train, control = dict_control)
sms_test <- DocumentTermMatrix(sms_corpus_test, control = dict_control)
sms_train <- apply(sms_train, MARGIN = 2, FUN = convert_count)
sms_test <- apply(sms_test, MARGIN = 2, FUN = convert_count)
sms_classifier <- naiveBayes(x = sms_train, y = factor(sms_raw_train$type))
class(sms_classifier)
## [1] "naiveBayes"
##Step 4: Evaluating Model Performance
sms_test_pred <- predict(sms_classifier, newdata=sms_test)
table(sms_test_pred, sms_raw_test$type)
##
## sms_test_pred ham spam
## ham 1202 31
## spam 5 152
###Ham was correctly classified about 99.6% of the time (1202 of 1207 ham messages).
##Step 1: Collecting the Data
wine <- read.csv("whitewines.csv")
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
## $ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
## $ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
## $ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
## $ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
## $ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
## $ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
## $ density : num 0.993 0.999 0.995 0.991 0.993 ...
## $ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
## $ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
## $ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
## $ quality : int 5 6 6 4 6 6 6 6 6 7 ...
hist(wine$quality)
##Step 2: Exploring and Preparing the Data
# The first 3750 wines train the model; rows 3751 through 4898 are held out
# for evaluation.
wine_train <- wine[seq_len(3750), ]
wine_test <- wine[seq(from = 3751, to = 4898), ]
##Step 3: Training a Model on the Data
library(rpart)
m.rpart <- rpart(quality ~ ., data=wine_train)
m.rpart
## n= 3750
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 3750 2945.53200 5.870933
## 2) alcohol< 10.85 2372 1418.86100 5.604975
## 4) volatile.acidity>=0.2275 1611 821.30730 5.432030
## 8) volatile.acidity>=0.3025 688 278.97670 5.255814 *
## 9) volatile.acidity< 0.3025 923 505.04230 5.563380 *
## 5) volatile.acidity< 0.2275 761 447.36400 5.971091 *
## 3) alcohol>=10.85 1378 1070.08200 6.328737
## 6) free.sulfur.dioxide< 10.5 84 95.55952 5.369048 *
## 7) free.sulfur.dioxide>=10.5 1294 892.13600 6.391036
## 14) alcohol< 11.76667 629 430.11130 6.173291
## 28) volatile.acidity>=0.465 11 10.72727 4.545455 *
## 29) volatile.acidity< 0.465 618 389.71680 6.202265 *
## 15) alcohol>=11.76667 665 403.99400 6.596992 *
library(rpart.plot)
# Draw the fitted wine tree twice: once with default layout, once with the
# extra layout options.
rpart.plot(m.rpart, digits = 3)
rpart.plot(m.rpart, type = 3, extra = 101, fallen.leaves = TRUE, digits = 4)
## Step 4: Evaluating Model Performance
# Predict quality for the held-out rows and compare the spread of the
# predictions with the spread of the true values.
p.rpart <- predict(m.rpart, newdata = wine_test)
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.545 5.563 5.971 5.893 6.202 6.597
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.901 6.000 9.000
# Correlation between predicted and actual quality.
cor(x = p.rpart, y = wine_test$quality)
## [1] 0.5369525
## A correlation of about 0.54 between predicted and actual quality is only moderate
# Load the Online News Popularity CSV, keep only columns 29 through 61, and
# show the structure of what remains.
news <- read.csv(file = "OnlineNewsPopularity.csv", header = TRUE)
news <- news[, seq(from = 29, to = 61)]
str(news)
## 'data.frame': 39644 obs. of 33 variables:
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity: num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
## Step 2: Exploring and Preparing data
library(caTools)
# Fix the RNG seed so that sample.split() below is reproducible.
set.seed(123)
# Binary label: 1 when avg_positive_polarity exceeds 0.5, otherwise 0.
news$popular <- ifelse(news$avg_positive_polarity > 0.5, 1, 0)
# Drop the column the label was derived from so it cannot be used as a
# predictor. It is removed by name rather than by position (it was column 23),
# so the step does not silently drop a different column if the layout changes.
news <- news[, names(news) != "avg_positive_polarity"]
# Split on the label with SplitRatio = 0.8: rows flagged TRUE go to training,
# the rest to test.
split <- sample.split(news$popular, SplitRatio = 0.8)
training_set <- subset(news, split == TRUE)
test_set <- subset(news, split == FALSE)
## Step 3: Training a Model on the Data and Evaluating
library(C50)
# Logistic regression of popular on all remaining columns. The printed fit
# reports NA coefficients for weekday_is_sunday and is_weekend (the model
# matrix is rank-deficient).
classifier <- glm(
  formula = popular ~ .,
  family = binomial,
  data = training_set
)
classifier
##
## Call: glm(formula = popular ~ ., family = binomial, data = training_set)
##
## Coefficients:
## (Intercept) self_reference_min_shares
## -1.830e+01 1.673e-06
## self_reference_max_shares self_reference_avg_sharess
## 1.346e-06 -7.244e-07
## weekday_is_monday weekday_is_tuesday
## -1.572e-01 -2.502e-01
## weekday_is_wednesday weekday_is_thursday
## -8.288e-02 -2.687e-01
## weekday_is_friday weekday_is_saturday
## -1.470e-01 -1.838e-01
## weekday_is_sunday is_weekend
## NA NA
## LDA_00 LDA_01
## -2.671e-01 -8.843e-02
## LDA_02 LDA_03
## -3.256e-01 -7.457e-02
## LDA_04 global_subjectivity
## -8.800e-01 -6.027e-01
## global_sentiment_polarity global_rate_positive_words
## 2.711e+01 -5.111e+01
## global_rate_negative_words rate_positive_words
## 4.332e+01 -3.067e+00
## rate_negative_words min_positive_polarity
## 6.533e+00 1.586e+01
## max_positive_polarity avg_negative_polarity
## 1.089e+01 -7.236e+00
## min_negative_polarity max_negative_polarity
## 1.261e+00 1.547e+00
## title_subjectivity title_sentiment_polarity
## -2.547e-01 -3.948e-01
## abs_title_subjectivity abs_title_sentiment_polarity
## 3.005e-02 1.036e+00
## shares
## -2.013e-06
##
## Degrees of Freedom: 31715 Total (i.e. Null); 31685 Residual
## Null Deviance: 12660
## Residual Deviance: 5585 AIC: 5647
# Predicted probabilities for the held-out rows. The label column is removed
# by name instead of by the hard-coded position 33, and newdata is passed by
# name so the call does not depend on argument order.
prob_pred <- predict(
  classifier,
  newdata = test_set[names(test_set) != "popular"],
  type = "response"
)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Classify as 1 when the predicted probability exceeds 0.5, otherwise 0.
y_pred <- ifelse(prob_pred > 0.5, 1, 0)
# Confusion matrix: rows are the actual labels, columns are the predictions.
cm <- table(test_set$popular, y_pred)
cm
## y_pred
## 0 1
## 0 7478 51
## 1 200 199
## 7478 of the 7529 non-popular articles were accurately classified as non-popular
# Tree Visualization
library(rpart)
library(rpart.plot)
# Fit a tree for popular using every other column of the training set.
# popular is a numeric 0/1 column here, and the printed tree reports deviance
# and fractional yval per node, i.e. a regression-style fit rather than a
# class-label tree.
m.rpart <- rpart(formula = popular ~ ., data = training_set)
print(m.rpart)
## n= 31716
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 31716 1517.48500 0.05038466
## 2) global_sentiment_polarity< 0.3182905 30960 1168.24000 0.03927649
## 4) min_positive_polarity< 0.2678571 30271 886.40280 0.03019391
## 8) global_sentiment_polarity< 0.240115 28139 583.37640 0.02118057
## 16) min_positive_polarity< 0.1431818 26114 378.35340 0.01470476 *
## 17) min_positive_polarity>=0.1431818 2025 189.80540 0.10469140
## 34) max_positive_polarity< 0.8416667 1463 37.96036 0.02665755 *
## 35) max_positive_polarity>=0.8416667 562 119.74560 0.30782920 *
## 9) global_sentiment_polarity>=0.240115 2132 270.56850 0.14915570
## 18) rate_positive_words>=0.7639319 1883 155.47110 0.09081253 *
## 19) rate_positive_words< 0.7639319 249 60.21687 0.59036140 *
## 5) min_positive_polarity>=0.2678571 689 169.62840 0.43831640
## 10) max_positive_polarity< 0.575 292 0.00000 0.00000000 *
## 11) max_positive_polarity>=0.575 397 72.26700 0.76070530 *
## 3) global_sentiment_polarity>=0.3182905 756 188.97880 0.50529100
## 6) max_positive_polarity< 0.825 185 27.75135 0.18378380 *
## 7) max_positive_polarity>=0.825 571 135.90890 0.60945710 *
# Draw the news tree twice: once with default layout, once with the extra
# layout options.
rpart.plot(m.rpart, digits = 1)
rpart.plot(m.rpart, type = 3, extra = 101, fallen.leaves = TRUE, digits = 1)
# Compare max_positive_polarity with the label on the test rows.
with(test_set, summary(max_positive_polarity))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.6000 0.8000 0.7532 1.0000 1.0000
with(test_set, summary(popular))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05033 0.00000 1.00000
with(test_set, cor(max_positive_polarity, popular))
## [1] 0.1760677
# Same comparison for global_sentiment_polarity.
with(test_set, summary(global_sentiment_polarity))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.37766 0.05737 0.12044 0.11976 0.17808 0.57374
with(test_set, summary(popular))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.05033 0.00000 1.00000
with(test_set, cor(global_sentiment_polarity, popular))
## [1] 0.2417969
### A correlation of about 0.24 with the label is weak
library(kernlab)
# Support vector machine with a linear ("vanilladot") kernel, fitted on the
# same training set as the models above.
# NOTE(review): popular is numeric, so ksvm presumably fits a regression-type
# SVM here, which is why the predictions are thresholded at 0.5 below - confirm.
svm_classifier <- ksvm(popular ~ ., data = training_set, kernel = "vanilladot")
svm_classifier
# Predict on the held-out rows and turn the numeric predictions into 0/1.
svmpred <- predict(svm_classifier, test_set)
svmpred1 <- ifelse(svmpred > 0.5, 1, 0)
# Confusion matrix: rows are the predictions, columns are the actual labels.
svmc1 <- table(svmpred1, test_set$popular)
svmc1
## All the unpopular news are predicted correctly but not the popular ones