knitr::opts_chunk$set(echo = TRUE)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks.

Method #1. Tree-based classification

Step 1: Collecting the Data

credit <- read.csv("credit.csv")
str(credit)
## 'data.frame':    1000 obs. of  17 variables:
##  $ checking_balance    : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
##  $ months_loan_duration: int  6 48 12 42 24 36 24 36 12 30 ...
##  $ credit_history      : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
##  $ purpose             : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
##  $ amount              : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ savings_balance     : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
##  $ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
##  $ percent_of_income   : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ years_at_residence  : int  4 2 3 4 4 4 4 2 4 2 ...
##  $ age                 : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ other_credit        : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ housing             : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
##  $ existing_loans_count: int  2 1 1 1 2 1 1 1 1 2 ...
##  $ job                 : Factor w/ 4 levels "management","skilled",..: 2 2 4 2 2 4 2 1 4 1 ...
##  $ dependents          : int  1 1 2 2 2 2 1 1 1 1 ...
##  $ phone               : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 2 1 1 ...
##  $ default             : Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 2 ...

Step 2: Exploring the Data

summary(credit$amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
table(credit$default)
## 
##  no yes 
## 700 300
set.seed(12345)  # fix the random seed so the shuffle is reproducible
credit_rand <- credit[order(runif(1000)), ]  # put the 1000 rows in random order
summary(credit$amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
summary(credit_rand$amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
credit_train <- credit_rand[1:900, ]
credit_test <- credit_rand[901:1000, ]
prop.table(table(credit_train$default))
## 
##        no       yes 
## 0.7022222 0.2977778
prop.table(table(credit_test$default))
## 
##   no  yes 
## 0.68 0.32
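
The shuffle above uses order(runif(1000)), which sorts the rows by random numbers. A more direct sketch with sample() follows (credit_train2 and credit_test2 are illustrative names); the exact rows selected would differ from the runif() version, but the class proportions should stay similar.

# Alternative shuffle-and-split: sample(nrow(credit)) returns a
# random permutation of the row indices
set.seed(12345)
shuffled <- sample(nrow(credit))
credit_train2 <- credit[shuffled[1:900], ]
credit_test2 <- credit[shuffled[901:1000], ]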

Step 3: Training a Model on the Data

library(C50)
credit_model <- C5.0.default(x = credit_train[-17], y = credit_train$default)
credit_model
## 
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
## 
## Classification Tree
## Number of samples: 900 
## Number of predictors: 16 
## 
## Tree size: 67 
## 
## Non-standard options: attempt to group attributes
summary(credit_model)
## 
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun May 27 14:50:00 2018
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 900 cases (17 attributes) from undefined.data
## 
## Decision tree:
## 
## checking_balance = unknown: no (358/44)
## checking_balance in {< 0 DM,> 200 DM,1 - 200 DM}:
## :...credit_history in {perfect,very good}:
##     :...dependents > 1: yes (10/1)
##     :   dependents <= 1:
##     :   :...savings_balance = < 100 DM: yes (39/11)
##     :       savings_balance in {> 1000 DM,500 - 1000 DM,unknown}: no (8/1)
##     :       savings_balance = 100 - 500 DM:
##     :       :...checking_balance = < 0 DM: no (1)
##     :           checking_balance in {> 200 DM,1 - 200 DM}: yes (5/1)
##     credit_history in {critical,good,poor}:
##     :...months_loan_duration <= 11: no (87/14)
##         months_loan_duration > 11:
##         :...savings_balance = > 1000 DM: no (13)
##             savings_balance in {< 100 DM,100 - 500 DM,500 - 1000 DM,unknown}:
##             :...checking_balance = > 200 DM:
##                 :...dependents > 1: yes (3)
##                 :   dependents <= 1:
##                 :   :...credit_history in {good,poor}: no (23/3)
##                 :       credit_history = critical:
##                 :       :...amount <= 2337: yes (3)
##                 :           amount > 2337: no (6)
##                 checking_balance = 1 - 200 DM:
##                 :...savings_balance = unknown: no (34/6)
##                 :   savings_balance in {< 100 DM,100 - 500 DM,500 - 1000 DM}:
##                 :   :...months_loan_duration > 45: yes (11/1)
##                 :       months_loan_duration <= 45:
##                 :       :...other_credit = store:
##                 :           :...age <= 35: yes (4)
##                 :           :   age > 35: no (2)
##                 :           other_credit = bank:
##                 :           :...years_at_residence <= 1: no (3)
##                 :           :   years_at_residence > 1:
##                 :           :   :...existing_loans_count <= 1: yes (5)
##                 :           :       existing_loans_count > 1:
##                 :           :       :...percent_of_income <= 2: no (4/1)
##                 :           :           percent_of_income > 2: yes (3)
##                 :           other_credit = none:
##                 :           :...job = unemployed: no (1)
##                 :               job = management:
##                 :               :...amount <= 7511: no (10/3)
##                 :               :   amount > 7511: yes (7)
##                 :               job = unskilled: [S1]
##                 :               job = skilled:
##                 :               :...dependents <= 1: no (55/15)
##                 :                   dependents > 1:
##                 :                   :...age <= 34: no (3)
##                 :                       age > 34: yes (4)
##                 checking_balance = < 0 DM:
##                 :...job = management: no (26/6)
##                     job = unemployed: yes (4/1)
##                     job = unskilled:
##                     :...employment_duration in {4 - 7 years,
##                     :   :                       unemployed}: no (4)
##                     :   employment_duration = < 1 year:
##                     :   :...other_credit = bank: no (1)
##                     :   :   other_credit in {none,store}: yes (11/2)
##                     :   employment_duration = > 7 years:
##                     :   :...other_credit in {bank,none}: no (5/1)
##                     :   :   other_credit = store: yes (2)
##                     :   employment_duration = 1 - 4 years:
##                     :   :...age <= 39: no (14/3)
##                     :       age > 39:
##                     :       :...credit_history in {critical,good}: yes (3)
##                     :           credit_history = poor: no (1)
##                     job = skilled:
##                     :...credit_history = poor:
##                         :...savings_balance in {< 100 DM,100 - 500 DM,
##                         :   :                   500 - 1000 DM}: yes (8)
##                         :   savings_balance = unknown: no (1)
##                         credit_history = critical:
##                         :...other_credit = store: no (0)
##                         :   other_credit = bank: yes (4)
##                         :   other_credit = none:
##                         :   :...savings_balance in {100 - 500 DM,
##                         :       :                   unknown}: no (1)
##                         :       savings_balance = 500 - 1000 DM: yes (1)
##                         :       savings_balance = < 100 DM:
##                         :       :...months_loan_duration <= 13:
##                         :           :...percent_of_income <= 3: yes (3)
##                         :           :   percent_of_income > 3: no (3/1)
##                         :           months_loan_duration > 13:
##                         :           :...amount <= 5293: no (10/1)
##                         :               amount > 5293: yes (2)
##                         credit_history = good:
##                         :...existing_loans_count > 1: yes (5)
##                             existing_loans_count <= 1:
##                             :...other_credit = store: no (2)
##                                 other_credit = bank:
##                                 :...percent_of_income <= 2: yes (2)
##                                 :   percent_of_income > 2: no (6/1)
##                                 other_credit = none: [S2]
## 
## SubTree [S1]
## 
## employment_duration in {< 1 year,1 - 4 years}: yes (11/3)
## employment_duration in {> 7 years,4 - 7 years,unemployed}: no (8)
## 
## SubTree [S2]
## 
## savings_balance = 100 - 500 DM: yes (3)
## savings_balance = 500 - 1000 DM: no (1)
## savings_balance = unknown:
## :...phone = no: yes (9/1)
## :   phone = yes: no (3/1)
## savings_balance = < 100 DM:
## :...percent_of_income <= 1: no (4)
##     percent_of_income > 1:
##     :...phone = yes: yes (10/1)
##         phone = no:
##         :...purpose in {business,car0,education,renovations}: yes (3)
##             purpose = car:
##             :...percent_of_income <= 3: no (2)
##             :   percent_of_income > 3: yes (6/1)
##             purpose = furniture/appliances:
##             :...years_at_residence <= 1: no (4)
##                 years_at_residence > 1:
##                 :...housing = other: no (1)
##                     housing = rent: yes (2)
##                     housing = own:
##                     :...amount <= 1778: no (3)
##                         amount > 1778:
##                         :...years_at_residence <= 3: yes (6)
##                             years_at_residence > 3: no (3/1)
## 
## 
## Evaluation on training data (900 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      66  125(13.9%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     609    23    (a): class no
##     102   166    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% checking_balance
##   60.22% credit_history
##   53.22% months_loan_duration
##   49.44% savings_balance
##   30.89% job
##   25.89% other_credit
##   17.78% dependents
##    9.67% existing_loans_count
##    7.22% percent_of_income
##    6.67% employment_duration
##    5.78% phone
##    5.56% amount
##    3.78% years_at_residence
##    3.44% age
##    3.33% purpose
##    1.67% housing
## 
## 
## Time: 0.0 secs

Step 4: Evaluating Model Performance

cred_pred <- predict(credit_model, credit_test)
library(gmodels)
CrossTable(credit_test$default, cred_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE, dnn = c('actual default', 'predicted default'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                 | predicted default 
##  actual default |        no |       yes | Row Total | 
## ----------------|-----------|-----------|-----------|
##              no |        57 |        11 |        68 | 
##                 |     0.570 |     0.110 |           | 
## ----------------|-----------|-----------|-----------|
##             yes |        16 |        16 |        32 | 
##                 |     0.160 |     0.160 |           | 
## ----------------|-----------|-----------|-----------|
##    Column Total |        73 |        27 |       100 | 
## ----------------|-----------|-----------|-----------|
## 
## 

Out of 100 test records, 27 were misclassified, for 73% accuracy. Treating a default ('yes') as the positive class, 11 non-defaulters were incorrectly flagged as defaulting (false positives, a Type I error), while 16 actual defaults were missed (false negatives, a Type II error).
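
The accuracy can also be computed directly from the prediction vector; a one-line sketch using the objects defined above:

# Proportion of test cases classified correctly: (57 + 16) / 100 = 0.73
mean(cred_pred == credit_test$default)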

Method #2. Support vector machines

Step 1: Collecting the Data

letters <- read.csv("letterdata.csv")
str(letters)
## 'data.frame':    20000 obs. of  17 variables:
##  $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
##  $ xbox  : int  2 5 4 7 2 4 4 1 2 11 ...
##  $ ybox  : int  8 12 11 11 1 11 2 1 2 15 ...
##  $ width : int  3 3 6 6 3 5 5 3 4 13 ...
##  $ height: int  5 7 8 6 1 8 4 2 4 9 ...
##  $ onpix : int  1 2 6 3 1 3 4 1 2 7 ...
##  $ xbar  : int  8 10 10 5 8 8 8 8 10 13 ...
##  $ ybar  : int  13 5 6 9 6 8 7 2 6 2 ...
##  $ x2bar : int  0 5 2 4 6 6 6 2 2 6 ...
##  $ y2bar : int  6 4 6 6 6 9 6 2 6 2 ...
##  $ xybar : int  6 13 10 4 6 5 7 8 12 12 ...
##  $ x2ybar: int  10 3 3 4 5 6 6 2 4 1 ...
##  $ xy2bar: int  8 9 7 10 9 6 6 8 8 9 ...
##  $ xedge : int  0 2 3 6 1 0 2 1 1 8 ...
##  $ xedgey: int  8 8 7 10 7 8 8 6 6 1 ...
##  $ yedge : int  0 4 3 2 5 9 7 2 1 1 ...
##  $ yedgex: int  8 10 9 8 10 7 10 7 7 8 ...

Step 2: Preparing the Data

letters_train <- letters[1:18000, ]
letters_test <- letters[18001:20000, ]

Step 3: Training a Model on the Data

library(kernlab)
letter_classifier <- ksvm(letter ~ ., data = letters_train, kernel = "vanilladot")
##  Setting default kernel parameters
letter_classifier
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 7886 
## 
## Objective Function Value : -15.3458 -21.3403 -25.7672 -6.8685 -8.8812 -35.9555 -59.5883 -18.1975 -65.6075 -41.5654 -18.8559 -39.3558 -36.9961 -60.3052 -15.1694 -42.144 -35.0941 -19.4069 -15.8234 -38.6718 -33.3013 -8.5298 -12.4387 -38.2194 -14.3682 -9.5508 -165.7154 -53.2778 -79.2163 -134.5053 -184.4809 -58.9285 -46.3252 -81.004 -28.1341 -29.6955 -27.5983 -38.1764 -47.2889 -137.0497 -208.1396 -239.2616 -23.8945 -10.9655 -64.228 -12.2139 -55.7818 -10.8001 -21.2407 -11.1795 -121.5639 -33.2229 -267.3926 -81.0708 -9.4937 -4.6577 -161.5171 -86.7114 -20.9146 -16.8272 -86.6582 -16.7205 -30.3036 -20.0054 -26.2331 -29.9289 -56.1072 -11.6335 -5.2564 -14.8153 -4.983 -4.8171 -8.5044 -43.2267 -55.9 -214.755 -47.0748 -49.6539 -50.2278 -18.3767 -19.1813 -97.6132 -113.6502 -42.4112 -32.5859 -127.4807 -33.7418 -30.7568 -40.0953 -18.6792 -5.4826 -49.3916 -10.6142 -20.0286 -63.8287 -183.8297 -57.0671 -43.3721 -35.2783 -85.4451 -145.9585 -11.8002 -6.1194 -12.5323 -33.5245 -155.2248 -57.2602 -194.0785 -111.0155 -10.8207 -16.7926 -3.7766 -77.3561 -7.9004 -106.5759 -52.523 -107.0402 -78.0148 -74.4773 -24.8166 -13.2372 -7.8706 -27.2788 -13.2342 -280.2869 -32.7288 -25.9531 -149.5447 -153.8495 -10.0146 -40.8917 -6.7333 -65.2053 -72.818 -35.1252 -246.7046 -38.0738 -16.9126 -158.18 -184.0021 -50.8427 -28.7686 -164.5969 -97.8359 -386.1426 -160.3188 -181.8759 -38.3648 -37.2272 -60.116 -28.2074 -53.7383 -7.8729 -12.3159 -37.8942 -72.6434 -211.8342 -58.5023 -105.1605 -176.7259 -685.8994 -142.8147 -159.635 -366.9437 -37.6409 -73.1357 -175.1906 -131.2833 -41.1464 -77.8404 -57.8131 -8.6365 -251.3728 -14.0836 -36.5144 -2.2292 -6.1598 -16.8011 -26.5165 -67.19 -21.3366 -221.4815 -22.9219 -4.2616 -4.7901 -0.8263 -134.7538 -8.8843 -83.1109 -23.1019 -14.4251 -5.7337 -17.5244 -29.7925 -23.9243 -88.9084 -28.6719 -106.0564 -16.4981 -10.6486 -7.9315 -1.5742 -91.1706 -7.3819 -118.2628 -117.5543 -48.5606 -26.6093 -71.2968 -30.4913 -63.5712 -279.2921 -46.3025 -50.4912 -37.9431 -21.5243 -11.6202 -134.9023 -7.516 -5.8131 -10.1595 -13.6329 -27.0293 -25.7282 -151.8511 -39.0524 -105.4861 -34.2434 -15.7051 -10.2304 -3.6687 -98.2094 -7.4666 -15.2668 -75.1283 -116.5382 -16.6429 -14.9215 -55.1062 -3.0636 -8.4262 -93.6829 -38.1162 -123.1859 -4.9078 -9.1612 -1.3077 -102.9021 -23.1138 -8.5262 -57.2623 -3.4297 -20.9579 -78.2019 -50.3741 -62.3531 -6.4908 -21.9308 -2.3736 -84.3835 -126.3997 -114.8723 -26.4109 -21.5589 -61.6405 -34.9162 -66.3243 -25.1148 -6.7203 -4.6695 -65.3518 -39.7924 -67.3505 -36.2154 -10.9031 -62.2195 -14.9491 -24.3238 -65.0847 -4.9657 -64.2797 -278.2873 -14.6902 -13.9198 -18.2059 -9.8972 -78.2645 -17.454 -49.5929 -55.7786 -28.7673 -15.9476 -47.531 -17.4379 -71.0516 -5.6899 -6.2519 -97.5508 -3.8196 -7.0502 -1.1238 -147.6952 -28.2018 -414.2586 -32.3275 -35.1191 -4.9605 -90.2307 -151.3409 -90.0329 -27.9491 -42.4688 -12.5118 -26.4828 -2.0045 -62.195 -9.1662 -178.4616 -1.9406 -1.9871 -11.3982 -0.5214 -29.6136 -35.0449 -6.7569 
## Training error : 0.1335
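
The linear (vanilladot) kernel is a simple baseline. A common refinement, sketched here but not run above, is to swap in a Gaussian kernel via kernlab's rbfdot option; the interface is otherwise unchanged.

# Hypothetical variant: Gaussian (RBF) kernel instead of the linear one;
# results will differ from the vanilladot model shown above
letter_classifier_rbf <- ksvm(letter ~ ., data = letters_train, kernel = "rbfdot")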

Step 4: Evaluating Model Performance

letter_predictions <- predict(letter_classifier, letters_test)
table(letter_predictions, letters_test$letter)
##                   
## letter_predictions  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R
##                  A 73  0  0  0  0  0  0  0  0  1  0  0  0  0  3  0  4  0
##                  B  0 61  0  3  2  0  1  1  0  0  1  1  0  0  0  2  0  1
##                  C  0  0 64  0  2  0  4  2  1  0  1  2  0  0  1  0  0  0
##                  D  2  1  0 67  0  0  1  3  3  2  1  2  0  3  4  2  1  2
##                  E  0  0  1  0 64  1  1  0  0  0  2  2  0  0  0  0  2  0
##                  F  0  0  0  0  0 70  1  1  4  0  0  0  0  0  0  5  1  0
##                  G  1  1  2  1  3  2 68  1  0  0  0  1  0  0  0  0  4  1
##                  H  0  0  0  1  0  1  0 46  0  2  3  1  1  1  9  0  0  5
##                  I  0  0  0  0  0  0  0  0 65  3  0  0  0  0  0  0  0  0
##                  J  0  1  0  0  0  1  0  0  3 61  0  0  0  0  1  0  0  0
##                  K  0  1  4  0  0  0  0  5  0  0 56  0  0  2  0  0  0  4
##                  L  0  0  0  0  1  0  0  1  0  0  0 63  0  0  0  0  0  0
##                  M  0  0  1  0  0  0  1  0  0  0  0  0 70  2  0  0  0  0
##                  N  0  0  0  0  0  0  0  0  0  0  0  0  0 77  0  0  0  1
##                  O  0  0  1  1  0  0  0  1  0  1  0  0  0  0 49  1  2  0
##                  P  0  0  0  0  0  3  0  0  0  0  0  0  0  0  2 69  0  0
##                  Q  0  0  0  0  0  0  3  1  0  0  0  2  0  0  2  1 52  0
##                  R  0  4  0  0  1  0  0  3  0  0  3  0  0  0  1  0  0 64
##                  S  0  1  0  0  1  1  1  0  1  1  0  0  0  0  0  0  6  0
##                  T  0  0  0  0  1  1  0  0  0  0  1  0  0  0  0  0  0  0
##                  U  0  0  2  1  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##                  V  0  0  0  0  0  0  0  0  0  0  0  0  1  0  1  0  0  0
##                  W  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0
##                  X  0  1  0  0  1  0  0  1  0  0  1  4  0  0  0  0  0  1
##                  Y  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  4  0  0
##                  Z  1  0  0  0  2  0  0  0  0  2  0  0  0  0  0  0  0  0
##                   
## letter_predictions  S  T  U  V  W  X  Y  Z
##                  A  0  1  2  0  1  0  0  0
##                  B  3  0  0  0  0  0  0  0
##                  C  0  0  0  0  0  0  0  0
##                  D  0  0  0  0  0  0  1  0
##                  E  6  0  0  0  0  1  0  0
##                  F  2  0  0  1  0  0  2  0
##                  G  3  2  0  0  0  0  0  0
##                  H  0  3  0  2  0  0  1  0
##                  I  2  0  0  0  0  2  1  0
##                  J  1  0  0  0  0  1  0  4
##                  K  0  1  2  0  0  4  0  0
##                  L  0  0  0  0  0  0  0  0
##                  M  0  0  1  0  6  0  0  0
##                  N  0  0  1  0  2  0  0  0
##                  O  0  0  1  0  0  0  0  0
##                  P  0  0  0  0  0  0  1  0
##                  Q  1  0  0  0  0  0  0  0
##                  R  0  1  0  1  0  0  0  0
##                  S 47  1  0  0  0  1  0  6
##                  T  1 83  1  0  0  0  2  2
##                  U  0  0 83  0  0  0  0  0
##                  V  0  0  0 64  1  0  1  0
##                  W  0  0  0  3 59  0  0  0
##                  X  0  0  0  0  0 76  1  0
##                  Y  0  1  0  0  0  1 58  0
##                  Z  5  1  0  0  0  0  0 70
agreement <- letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE  TRUE 
##   321  1679

The classifier was correct 1679 times out of 2000 test records, an accuracy of roughly 84%.
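
The same result as a proportion, a one-line sketch:

# 1679 / 2000 = 0.8395, i.e. roughly 84% accuracy
prop.table(table(agreement))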

Method #3. Naïve Bayes Algorithm

Step 1: Collecting the Data

sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)

str(sms_raw)
## 'data.frame':    5559 obs. of  2 variables:
##  $ type: chr  "ham" "ham" "ham" "spam" ...
##  $ text: chr  "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline"| __truncated__ ...
head(sms_raw)
##   type
## 1  ham
## 2  ham
## 3  ham
## 4 spam
## 5 spam
## 6  ham
##                                                                                                                                                                text
## 1                                                                                                                 Hope you are having a good week. Just checking in
## 2                                                                                                                                           K..give back my thanks.
## 3                                                                                                                       Am also doing in cbe only. But have to pay.
## 4            complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out! Box434SK38WP150PPM18+
## 5 okmail: Dear Dave this is your final notice to collect your 4* Tenerife Holiday or #5000 CASH award! Call 09061743806 from landline. TCs SAE Box326 CW25WX 150ppm
## 6                                                                                                                Aiya we discuss later lar... Pick u up at 4 is it?

Step 2: Exploring and Preparing the Data

library(tm)
## Loading required package: NLP
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
print(sms_corpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 5559
strwrap(sms_corpus[[1]])
## [1] "Hope you are having a good week. Just checking in"
lapply(sms_corpus[10:15], as.character)
## $`10`
## [1] "fyi I'm at usf now, swing by the room whenever"
## 
## $`11`
## [1] "Sure thing big man. i have hockey elections at 6, shouldn€˜t go on longer than an hour though"
## 
## $`12`
## [1] "I anything lor..."
## 
## $`13`
## [1] "By march ending, i should be ready. But will call you for sure. The problem is that my capital never complete. How far with you. How's work and the ladies"
## 
## $`14`
## [1] "Hmm well, night night "
## 
## $`15`
## [1] "K I'll be sure to get up before noon and see what's what"
corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))  # wrap base tolower so the result remains a tm corpus
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
strwrap(sms_corpus[[1]])
## [1] "Hope you are having a good week. Just checking in"
strwrap(corpus_clean[[1]])
## [1] "hope good week just checking"
sms_dtm <- DocumentTermMatrix(corpus_clean)
sms_raw_train <- sms_raw[1:4169, ]
sms_raw_test <- sms_raw[4170:5559, ]
sms_dtm_train <- sms_dtm[1:4169, ]
sms_dtm_test <- sms_dtm[4170:5559, ]
sms_corpus_train <- corpus_clean[1:4169]
sms_corpus_test <- corpus_clean[4170:5559]
prop.table(table(sms_raw_train$type))
## 
##       ham      spam 
## 0.8647158 0.1352842
prop.table(table(sms_raw_test$type))
## 
##       ham      spam 
## 0.8683453 0.1316547

Creating Word Clouds

library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(sms_corpus_train, min.freq = 40, random.order = FALSE)

Step 3: Training a Model on the Data

convert_count <- function(x) {
 x <- ifelse(x > 0, 1, 0)
 x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
 return(x)
}
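
To see what convert_count does, here is a quick sketch on a made-up count vector (the input values are illustrative only):

# Raw term counts become "No"/"Yes" presence indicators
convert_count(c(0, 3, 0, 1))
## [1] No  Yes No  Yes
## Levels: No Yes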
five_times_words <- findFreqTerms(sms_dtm_train, 5)
length(five_times_words)
## [1] 1228
library(e1071)
# Rebuild the document-term matrices using only terms that appear at least five times
sms_train <- DocumentTermMatrix(sms_corpus_train, control = list(dictionary = five_times_words))
sms_test <- DocumentTermMatrix(sms_corpus_test, control = list(dictionary = five_times_words))
# Convert the raw counts to "No"/"Yes" presence indicators, column by column
sms_train <- apply(sms_train, 2, convert_count)
sms_test <- apply(sms_test, 2, convert_count)
sms_classifier <- naiveBayes(sms_train, factor(sms_raw_train$type))
class(sms_classifier)
## [1] "naiveBayes"

Step 4: Evaluating Model Performance

sms_test_pred <- predict(sms_classifier, newdata=sms_test)
table(sms_test_pred, sms_raw_test$type)
##              
## sms_test_pred  ham spam
##          ham  1202   31
##          spam    5  152

Ham was correctly classified 1202 out of 1207 times (99.6%), and spam 152 out of 183 times (83.1%), for an overall accuracy of about 97.4%.
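
A standard refinement worth trying (a sketch, not run here) is Laplace smoothing, which e1071's naiveBayes exposes through its laplace argument; it keeps words that never appear in one class from zeroing out that class's probability.

# Hypothetical retrain with Laplace smoothing
sms_classifier2 <- naiveBayes(sms_train, factor(sms_raw_train$type), laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, newdata = sms_test)
table(sms_test_pred2, sms_raw_test$type)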

Method #4. Adding regression to trees

Step 1: Collecting the Data

wine <- read.csv("whitewines.csv")
str(wine)
## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
##  $ volatile.acidity    : num  0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
##  $ citric.acid         : num  0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
##  $ residual.sugar      : num  1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
##  $ chlorides           : num  0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
##  $ free.sulfur.dioxide : num  6 41 33 11 36 22 33 17 34 40 ...
##  $ total.sulfur.dioxide: num  62 113 123 74 119 95 152 67 90 130 ...
##  $ density             : num  0.993 0.999 0.995 0.991 0.993 ...
##  $ pH                  : num  3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
##  $ sulphates           : num  0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
##  $ alcohol             : num  10.4 8.9 10.1 11.2 10.9 ...
##  $ quality             : int  5 6 6 4 6 6 6 6 6 7 ...
hist(wine$quality)

Step 2: Exploring and Preparing the Data

wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]

Step 3: Training a Model on the Data

library(rpart)
m.rpart <- rpart(quality ~ ., data=wine_train)
m.rpart
## n= 3750 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3750 2945.53200 5.870933  
##    2) alcohol< 10.85 2372 1418.86100 5.604975  
##      4) volatile.acidity>=0.2275 1611  821.30730 5.432030  
##        8) volatile.acidity>=0.3025 688  278.97670 5.255814 *
##        9) volatile.acidity< 0.3025 923  505.04230 5.563380 *
##      5) volatile.acidity< 0.2275 761  447.36400 5.971091 *
##    3) alcohol>=10.85 1378 1070.08200 6.328737  
##      6) free.sulfur.dioxide< 10.5 84   95.55952 5.369048 *
##      7) free.sulfur.dioxide>=10.5 1294  892.13600 6.391036  
##       14) alcohol< 11.76667 629  430.11130 6.173291  
##         28) volatile.acidity>=0.465 11   10.72727 4.545455 *
##         29) volatile.acidity< 0.465 618  389.71680 6.202265 *
##       15) alcohol>=11.76667 665  403.99400 6.596992 *
library(rpart.plot)
rpart.plot(m.rpart, digits=3)

rpart.plot(m.rpart, digits=4, fallen.leaves = TRUE, type = 3, extra = 101)

Step 4: Evaluating Model Performance

p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.545   5.563   5.971   5.893   6.202   6.597
summary(wine_test$quality)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.901   6.000   9.000
cor(p.rpart, wine_test$quality)
## [1] 0.5369525

A correlation of about 0.54 between the predicted and actual quality scores indicates only a moderate relationship, so the model explains a limited share of the variation in wine quality.
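
Correlation only measures how the predictions and true values move together. The mean absolute error gives a more direct sense of how far the predictions are from the true scores on average; a small sketch using the objects above (MAE is a hypothetical helper name):

# Mean absolute difference between actual and predicted quality ratings
MAE <- function(actual, predicted) mean(abs(actual - predicted))
MAE(wine_test$quality, p.rpart)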

ONLINE NEWS POPULARITY DATA ANALYSIS

This analysis uses the Online News Popularity dataset of Mashable articles, shared by the UCI Machine Learning Repository. The data are split into training and test sets, and several methods are applied: logistic regression, tree-based classification, and support vector machines.

Tree based classification

Step 1: Collecting the Data

news <- read.csv('OnlineNewsPopularity.csv', header = TRUE)
news <- news[, 29:61]  # keep the self-reference, weekday, LDA, and sentiment features plus shares
str(news)
## 'data.frame':    39644 obs. of  33 variables:
##  $ self_reference_min_shares   : num  496 0 918 0 545 8500 545 545 0 0 ...
##  $ self_reference_max_shares   : num  496 0 918 0 16000 8500 16000 16000 0 0 ...
##  $ self_reference_avg_sharess  : num  496 0 918 0 3151 ...
##  $ weekday_is_monday           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weekday_is_tuesday          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_wednesday        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_thursday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_friday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_saturday         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday_is_sunday           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_weekend                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LDA_00                      : num  0.5003 0.7998 0.2178 0.0286 0.0286 ...
##  $ LDA_01                      : num  0.3783 0.05 0.0333 0.4193 0.0288 ...
##  $ LDA_02                      : num  0.04 0.0501 0.0334 0.4947 0.0286 ...
##  $ LDA_03                      : num  0.0413 0.0501 0.0333 0.0289 0.0286 ...
##  $ LDA_04                      : num  0.0401 0.05 0.6822 0.0286 0.8854 ...
##  $ global_subjectivity         : num  0.522 0.341 0.702 0.43 0.514 ...
##  $ global_sentiment_polarity   : num  0.0926 0.1489 0.3233 0.1007 0.281 ...
##  $ global_rate_positive_words  : num  0.0457 0.0431 0.0569 0.0414 0.0746 ...
##  $ global_rate_negative_words  : num  0.0137 0.01569 0.00948 0.02072 0.01213 ...
##  $ rate_positive_words         : num  0.769 0.733 0.857 0.667 0.86 ...
##  $ rate_negative_words         : num  0.231 0.267 0.143 0.333 0.14 ...
##  $ avg_positive_polarity       : num  0.379 0.287 0.496 0.386 0.411 ...
##  $ min_positive_polarity       : num  0.1 0.0333 0.1 0.1364 0.0333 ...
##  $ max_positive_polarity       : num  0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
##  $ avg_negative_polarity       : num  -0.35 -0.119 -0.467 -0.37 -0.22 ...
##  $ min_negative_polarity       : num  -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
##  $ max_negative_polarity       : num  -0.2 -0.1 -0.133 -0.167 -0.05 ...
##  $ title_subjectivity          : num  0.5 0 0 0 0.455 ...
##  $ title_sentiment_polarity    : num  -0.188 0 0 0 0.136 ...
##  $ abs_title_subjectivity      : num  0 0.5 0.5 0.5 0.0455 ...
##  $ abs_title_sentiment_polarity: num  0.188 0 0 0 0.136 ...
##  $ shares                      : int  593 711 1500 1200 505 855 556 891 3600 710 ...

Step 2: Exploring and Preparing the Data

library(caTools)
set.seed(123)
news$popular <- ifelse(news$avg_positive_polarity > 0.5, 1, 0)  # flag articles with high average positive polarity
news <- news[, -23]  # drop avg_positive_polarity, which now defines the label
split <- sample.split(news$popular, SplitRatio = 0.8)
training_set <- subset(news, split == TRUE)
test_set <- subset(news, split == FALSE)

Step 3: Training a Model on the Data and Evaluating

# Logistic regression on all remaining features
classifier <- glm(formula = popular ~ ., family = binomial, data = training_set)
classifier
## 
## Call:  glm(formula = popular ~ ., family = binomial, data = training_set)
## 
## Coefficients:
##                  (Intercept)     self_reference_min_shares  
##                   -1.830e+01                     1.673e-06  
##    self_reference_max_shares    self_reference_avg_sharess  
##                    1.346e-06                    -7.244e-07  
##            weekday_is_monday            weekday_is_tuesday  
##                   -1.572e-01                    -2.502e-01  
##         weekday_is_wednesday           weekday_is_thursday  
##                   -8.288e-02                    -2.687e-01  
##            weekday_is_friday           weekday_is_saturday  
##                   -1.470e-01                    -1.838e-01  
##            weekday_is_sunday                    is_weekend  
##                           NA                            NA  
##                       LDA_00                        LDA_01  
##                   -2.671e-01                    -8.843e-02  
##                       LDA_02                        LDA_03  
##                   -3.256e-01                    -7.457e-02  
##                       LDA_04           global_subjectivity  
##                   -8.800e-01                    -6.027e-01  
##    global_sentiment_polarity    global_rate_positive_words  
##                    2.711e+01                    -5.111e+01  
##   global_rate_negative_words           rate_positive_words  
##                    4.332e+01                    -3.067e+00  
##          rate_negative_words         min_positive_polarity  
##                    6.533e+00                     1.586e+01  
##        max_positive_polarity         avg_negative_polarity  
##                    1.089e+01                    -7.236e+00  
##        min_negative_polarity         max_negative_polarity  
##                    1.261e+00                     1.547e+00  
##           title_subjectivity      title_sentiment_polarity  
##                   -2.547e-01                    -3.948e-01  
##       abs_title_subjectivity  abs_title_sentiment_polarity  
##                    3.005e-02                     1.036e+00  
##                       shares  
##                   -2.013e-06  
## 
## Degrees of Freedom: 31715 Total (i.e. Null);  31685 Residual
## Null Deviance:       12660 
## Residual Deviance: 5585  AIC: 5647
prob_pred <- predict(classifier, newdata = test_set[-33], type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
y_pred <- ifelse(prob_pred > 0.5, 1, 0)
cm <- table(test_set[, 33], y_pred)
cm
##    y_pred
##        0    1
##   0 7478   51
##   1  200  199
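
The NA coefficients for weekday_is_sunday and is_weekend are expected: the seven weekday dummies always sum to one and is_weekend equals weekday_is_saturday + weekday_is_sunday, so the design matrix is rank-deficient, which is what the predict() warning refers to. The overall accuracy can be read off the confusion matrix; a one-line sketch:

# (7478 + 199) / 7928, i.e. roughly 97% accuracy
sum(diag(cm)) / sum(cm)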

Tree Visualization

library(rpart)
library(rpart.plot)
m.rpart <- rpart(popular ~. , data=training_set)
m.rpart
## n= 31716 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 31716 1517.48500 0.05038466  
##    2) global_sentiment_polarity< 0.3182905 30960 1168.24000 0.03927649  
##      4) min_positive_polarity< 0.2678571 30271  886.40280 0.03019391  
##        8) global_sentiment_polarity< 0.240115 28139  583.37640 0.02118057  
##         16) min_positive_polarity< 0.1431818 26114  378.35340 0.01470476 *
##         17) min_positive_polarity>=0.1431818 2025  189.80540 0.10469140  
##           34) max_positive_polarity< 0.8416667 1463   37.96036 0.02665755 *
##           35) max_positive_polarity>=0.8416667 562  119.74560 0.30782920 *
##        9) global_sentiment_polarity>=0.240115 2132  270.56850 0.14915570  
##         18) rate_positive_words>=0.7639319 1883  155.47110 0.09081253 *
##         19) rate_positive_words< 0.7639319 249   60.21687 0.59036140 *
##      5) min_positive_polarity>=0.2678571 689  169.62840 0.43831640  
##       10) max_positive_polarity< 0.575 292    0.00000 0.00000000 *
##       11) max_positive_polarity>=0.575 397   72.26700 0.76070530 *
##    3) global_sentiment_polarity>=0.3182905 756  188.97880 0.50529100  
##      6) max_positive_polarity< 0.825 185   27.75135 0.18378380 *
##      7) max_positive_polarity>=0.825 571  135.90890 0.60945710 *
rpart.plot(m.rpart, digits=1)

rpart.plot(m.rpart, digits=1, fallen.leaves=TRUE, type=3, extra=101)

summary(test_set$max_positive_polarity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.6000  0.8000  0.7532  1.0000  1.0000
summary(test_set$popular)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05033 0.00000 1.00000
cor(test_set$max_positive_polarity, test_set$popular)
## [1] 0.1760677
summary(test_set$global_sentiment_polarity)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.37766  0.05737  0.12044  0.11976  0.17808  0.57374
summary(test_set$popular)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.05033 0.00000 1.00000
cor(test_set$global_sentiment_polarity, test_set$popular)
## [1] 0.2417969

A correlation of about 0.24 between global_sentiment_polarity and popular is weak, so no single feature explains popularity well on its own.

Method: Support Vector Machines

The same preprocessed training and test sets are reused for the SVM.

library(kernlab) 
svm_classifier <- ksvm(popular ~ ., data = training_set, kernel = "vanilladot")
##  Setting default kernel parameters
svm_classifier
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 3626 
## 
## Objective Function Value : -6945.975 
## Training error : 0.984
svmpred <- predict(svm_classifier, test_set)
svmpred1 <- ifelse(svmpred > 0.5, 1, 0)
svmc1 <- table(svmpred1, test_set$popular)
svmc1
##         
## svmpred1    0    1
##        0 7529  399

Every test article was predicted as unpopular: the 7529 unpopular articles are classified correctly, but all 399 popular ones are missed. Because popular is numeric, ksvm fit a regression (eps-svr, as shown above) whose thresholded predictions never exceed 0.5; a classification version is sketched below.
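
A sketch of the classification variant: converting popular to a factor makes ksvm choose C-svc classification instead of eps-svr regression (training_fac, svm_classifier2, and svmpred2 are illustrative names; results are not shown above).

# Hypothetical fix: treat 'popular' as a class label rather than a number
training_fac <- training_set
training_fac$popular <- factor(training_fac$popular)
svm_classifier2 <- ksvm(popular ~ ., data = training_fac, kernel = "vanilladot")
svmpred2 <- predict(svm_classifier2, test_set)  # returns predicted class labels
table(svmpred2, test_set$popular)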