# Load the German credit data set.
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0, where
# the read.csv() default changed to FALSE. The C5.0() call further down needs
# a factor outcome, and the recorded str() output shows factor columns.
credit <- read.csv("credit.csv", stringsAsFactors = TRUE)
# Show the type and first few values of every column.
str(credit)
## 'data.frame': 1000 obs. of 17 variables:
## $ checking_balance : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
## $ months_loan_duration: int 6 48 12 42 24 36 24 36 12 30 ...
## $ credit_history : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
## $ purpose : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
## $ amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ savings_balance : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
## $ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
## $ percent_of_income : int 4 2 2 2 3 2 3 2 2 4 ...
## $ years_at_residence : int 4 2 3 4 4 4 4 2 4 2 ...
## $ age : int 67 22 49 45 53 35 53 35 61 28 ...
## $ other_credit : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ housing : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
## $ existing_loans_count: int 2 1 1 1 2 1 1 1 1 2 ...
## $ job : Factor w/ 4 levels "management","skilled",..: 2 2 4 2 2 4 2 1 4 1 ...
## $ dependents : int 1 1 2 2 2 2 1 1 1 1 ...
## $ phone : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 2 1 1 ...
## $ default : Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 2 ...
# Five-number summary (plus the mean) of the loan amounts.
summary(credit[["amount"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
# Count how many loans did and did not default.
table(credit[["default"]])
##
## no yes
## 700 300
# Shuffle the rows reproducibly before splitting into train and test sets.
# One uniform random key is drawn per row; using nrow(credit) rather than a
# literal 1000 keeps the shuffle correct if the file has a different number of
# rows (for a 1000-row file the result is unchanged).
set.seed(12345)
credit_rand <- credit[order(runif(nrow(credit))), ]
# Summary of the unshuffled amounts, for comparison with the shuffled copy.
summary(credit$amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
# Same summary on the shuffled data frame.
summary(credit_rand[["amount"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
# Split the shuffled data: the first 900 rows train the model, every remaining
# row is held out for testing. Taking "all rows after the first 900" instead of
# the literal range 901:1000 avoids NA rows or silently dropped rows when the
# file does not have exactly 1000 rows (identical result when it does).
credit_train <- credit_rand[seq_len(900), ]
credit_test <- credit_rand[-seq_len(900), ]
# Class balance of the outcome in the training set.
prop.table(table(credit_train$default))
##
## no yes
## 0.7022222 0.2977778
# Class balance of the outcome in the held-out test set.
prop.table(table(credit_test[["default"]]))
##
## no yes
## 0.68 0.32
library(C50)
# Fit a C5.0 decision tree on the training rows. Column 17 is the `default`
# outcome (last column in the str() listing), so it is dropped from the
# predictors and passed separately as the class vector.
credit_model <- C5.0(x = credit_train[-17], y = credit_train$default)
# Print the fitted tree, the training-set confusion matrix and attribute usage.
summary(credit_model)
##
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Jul 1 01:34:20 2018
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 900 cases (17 attributes) from undefined.data
##
## Decision tree:
##
## checking_balance = unknown: no (358/44)
## checking_balance in {< 0 DM,> 200 DM,1 - 200 DM}:
## :...credit_history in {perfect,very good}:
## :...dependents > 1: yes (10/1)
## : dependents <= 1:
## : :...savings_balance = < 100 DM: yes (39/11)
## : savings_balance in {> 1000 DM,500 - 1000 DM,unknown}: no (8/1)
## : savings_balance = 100 - 500 DM:
## : :...checking_balance = < 0 DM: no (1)
## : checking_balance in {> 200 DM,1 - 200 DM}: yes (5/1)
## credit_history in {critical,good,poor}:
## :...months_loan_duration <= 11: no (87/14)
## months_loan_duration > 11:
## :...savings_balance = > 1000 DM: no (13)
## savings_balance in {< 100 DM,100 - 500 DM,500 - 1000 DM,unknown}:
## :...checking_balance = > 200 DM:
## :...dependents > 1: yes (3)
## : dependents <= 1:
## : :...credit_history in {good,poor}: no (23/3)
## : credit_history = critical:
## : :...amount <= 2337: yes (3)
## : amount > 2337: no (6)
## checking_balance = 1 - 200 DM:
## :...savings_balance = unknown: no (34/6)
## : savings_balance in {< 100 DM,100 - 500 DM,500 - 1000 DM}:
## : :...months_loan_duration > 45: yes (11/1)
## : months_loan_duration <= 45:
## : :...other_credit = store:
## : :...age <= 35: yes (4)
## : : age > 35: no (2)
## : other_credit = bank:
## : :...years_at_residence <= 1: no (3)
## : : years_at_residence > 1:
## : : :...existing_loans_count <= 1: yes (5)
## : : existing_loans_count > 1:
## : : :...percent_of_income <= 2: no (4/1)
## : : percent_of_income > 2: yes (3)
## : other_credit = none:
## : :...job = unemployed: no (1)
## : job = management:
## : :...amount <= 7511: no (10/3)
## : : amount > 7511: yes (7)
## : job = unskilled: [S1]
## : job = skilled:
## : :...dependents <= 1: no (55/15)
## : dependents > 1:
## : :...age <= 34: no (3)
## : age > 34: yes (4)
## checking_balance = < 0 DM:
## :...job = management: no (26/6)
## job = unemployed: yes (4/1)
## job = unskilled:
## :...employment_duration in {4 - 7 years,
## : : unemployed}: no (4)
## : employment_duration = < 1 year:
## : :...other_credit = bank: no (1)
## : : other_credit in {none,store}: yes (11/2)
## : employment_duration = > 7 years:
## : :...other_credit in {bank,none}: no (5/1)
## : : other_credit = store: yes (2)
## : employment_duration = 1 - 4 years:
## : :...age <= 39: no (14/3)
## : age > 39:
## : :...credit_history in {critical,good}: yes (3)
## : credit_history = poor: no (1)
## job = skilled:
## :...credit_history = poor:
## :...savings_balance in {< 100 DM,100 - 500 DM,
## : : 500 - 1000 DM}: yes (8)
## : savings_balance = unknown: no (1)
## credit_history = critical:
## :...other_credit = store: no (0)
## : other_credit = bank: yes (4)
## : other_credit = none:
## : :...savings_balance in {100 - 500 DM,
## : : unknown}: no (1)
## : savings_balance = 500 - 1000 DM: yes (1)
## : savings_balance = < 100 DM:
## : :...months_loan_duration <= 13:
## : :...percent_of_income <= 3: yes (3)
## : : percent_of_income > 3: no (3/1)
## : months_loan_duration > 13:
## : :...amount <= 5293: no (10/1)
## : amount > 5293: yes (2)
## credit_history = good:
## :...existing_loans_count > 1: yes (5)
## existing_loans_count <= 1:
## :...other_credit = store: no (2)
## other_credit = bank:
## :...percent_of_income <= 2: yes (2)
## : percent_of_income > 2: no (6/1)
## other_credit = none: [S2]
##
## SubTree [S1]
##
## employment_duration in {< 1 year,1 - 4 years}: yes (11/3)
## employment_duration in {> 7 years,4 - 7 years,unemployed}: no (8)
##
## SubTree [S2]
##
## savings_balance = 100 - 500 DM: yes (3)
## savings_balance = 500 - 1000 DM: no (1)
## savings_balance = unknown:
## :...phone = no: yes (9/1)
## : phone = yes: no (3/1)
## savings_balance = < 100 DM:
## :...percent_of_income <= 1: no (4)
## percent_of_income > 1:
## :...phone = yes: yes (10/1)
## phone = no:
## :...purpose in {business,car0,education,renovations}: yes (3)
## purpose = car:
## :...percent_of_income <= 3: no (2)
## : percent_of_income > 3: yes (6/1)
## purpose = furniture/appliances:
## :...years_at_residence <= 1: no (4)
## years_at_residence > 1:
## :...housing = other: no (1)
## housing = rent: yes (2)
## housing = own:
## :...amount <= 1778: no (3)
## amount > 1778:
## :...years_at_residence <= 3: yes (6)
## years_at_residence > 3: no (3/1)
##
##
## Evaluation on training data (900 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 66 125(13.9%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 609 23 (a): class no
## 102 166 (b): class yes
##
##
## Attribute usage:
##
## 100.00% checking_balance
## 60.22% credit_history
## 53.22% months_loan_duration
## 49.44% savings_balance
## 30.89% job
## 25.89% other_credit
## 17.78% dependents
## 9.67% existing_loans_count
## 7.22% percent_of_income
## 6.67% employment_duration
## 5.78% phone
## 5.56% amount
## 3.78% years_at_residence
## 3.44% age
## 3.33% purpose
## 1.67% housing
##
##
## Time: 0.0 secs
# Predict the default class for every held-out test row.
cred_pred <- predict(credit_model, credit_test)
library(gmodels)
# Cross-tabulate actual vs. predicted classes, showing counts and table
# proportions only (chi-square, row and column proportions are switched off).
# The row label previously read 'actua l default' (a stray space inside the
# string); it is corrected here so the printed table header is spelled right.
CrossTable(
  credit_test$default, cred_pred,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("actual default", "predicted default")
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | predicted default
## actua l default | no | yes | Row Total |
## ----------------|-----------|-----------|-----------|
## no | 57 | 11 | 68 |
## | 0.570 | 0.110 | |
## ----------------|-----------|-----------|-----------|
## yes | 16 | 16 | 32 |
## | 0.160 | 0.160 | |
## ----------------|-----------|-----------|-----------|
## Column Total | 73 | 27 | 100 |
## ----------------|-----------|-----------|-----------|
##
##
# Load the letter-recognition data.
# stringsAsFactors = TRUE makes `letter` a factor on R >= 4.0 as well (the
# read.csv() default changed to FALSE there); the classifier fitted below
# treats it as the class label, and the recorded str() output shows a factor.
# NOTE(review): this variable masks base R's built-in `letters` constant for
# the rest of the session.
letters <- read.csv("letterdata.csv", stringsAsFactors = TRUE)
# Show the type and first few values of every column.
str(letters)
## 'data.frame': 20000 obs. of 17 variables:
## $ letter: Factor w/ 26 levels "A","B","C","D",..: 20 9 4 14 7 19 2 1 10 13 ...
## $ xbox : int 2 5 4 7 2 4 4 1 2 11 ...
## $ ybox : int 8 12 11 11 1 11 2 1 2 15 ...
## $ width : int 3 3 6 6 3 5 5 3 4 13 ...
## $ height: int 5 7 8 6 1 8 4 2 4 9 ...
## $ onpix : int 1 2 6 3 1 3 4 1 2 7 ...
## $ xbar : int 8 10 10 5 8 8 8 8 10 13 ...
## $ ybar : int 13 5 6 9 6 8 7 2 6 2 ...
## $ x2bar : int 0 5 2 4 6 6 6 2 2 6 ...
## $ y2bar : int 6 4 6 6 6 9 6 2 6 2 ...
## $ xybar : int 6 13 10 4 6 5 7 8 12 12 ...
## $ x2ybar: int 10 3 3 4 5 6 6 2 4 1 ...
## $ xy2bar: int 8 9 7 10 9 6 6 8 8 9 ...
## $ xedge : int 0 2 3 6 1 0 2 1 1 8 ...
## $ xedgey: int 8 8 7 10 7 8 8 6 6 1 ...
## $ yedge : int 0 4 3 2 5 9 7 2 1 1 ...
## $ yedgex: int 8 10 9 8 10 7 10 7 7 8 ...
# Rows 1-18000 train the classifier; rows 18001-20000 are held out.
letters_train <- letters[seq_len(18000), ]
letters_test <- letters[seq(18001, 20000), ]
library(kernlab)
# Support vector machine with a linear ("vanilladot") kernel, predicting
# `letter` from all other columns.
letter_classifier <- ksvm(
  letter ~ .,
  data = letters_train,
  kernel = "vanilladot"
)
## Setting default kernel parameters
# Display the fitted model's description.
print(letter_classifier)
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 7886
##
## Objective Function Value : -15.3458 -21.3403 -25.7672 -6.8685 -8.8812 -35.9555 -59.5883 -18.1975 -65.6075 -41.5654 -18.8559 -39.3558 -36.9961 -60.3052 -15.1694 -42.144 -35.0941 -19.4069 -15.8234 -38.6718 -33.3013 -8.5298 -12.4387 -38.2194 -14.3682 -9.5508 -165.7154 -53.2778 -79.2163 -134.5053 -184.4809 -58.9285 -46.3252 -81.004 -28.1341 -29.6955 -27.5983 -38.1764 -47.2889 -137.0497 -208.1396 -239.2616 -23.8945 -10.9655 -64.228 -12.2139 -55.7818 -10.8001 -21.2407 -11.1795 -121.5639 -33.2229 -267.3926 -81.0708 -9.4937 -4.6577 -161.5171 -86.7114 -20.9146 -16.8272 -86.6582 -16.7205 -30.3036 -20.0054 -26.2331 -29.9289 -56.1072 -11.6335 -5.2564 -14.8153 -4.983 -4.8171 -8.5044 -43.2267 -55.9 -214.755 -47.0748 -49.6539 -50.2278 -18.3767 -19.1813 -97.6132 -113.6502 -42.4112 -32.5859 -127.4807 -33.7418 -30.7568 -40.0953 -18.6792 -5.4826 -49.3916 -10.6142 -20.0286 -63.8287 -183.8297 -57.0671 -43.3721 -35.2783 -85.4451 -145.9585 -11.8002 -6.1194 -12.5323 -33.5245 -155.2248 -57.2602 -194.0785 -111.0155 -10.8207 -16.7926 -3.7766 -77.3561 -7.9004 -106.5759 -52.523 -107.0402 -78.0148 -74.4773 -24.8166 -13.2372 -7.8706 -27.2788 -13.2342 -280.2869 -32.7288 -25.9531 -149.5447 -153.8495 -10.0146 -40.8917 -6.7333 -65.2053 -72.818 -35.1252 -246.7046 -38.0738 -16.9126 -158.18 -184.0021 -50.8427 -28.7686 -164.5969 -97.8359 -386.1426 -160.3188 -181.8759 -38.3648 -37.2272 -60.116 -28.2074 -53.7383 -7.8729 -12.3159 -37.8942 -72.6434 -211.8342 -58.5023 -105.1605 -176.7259 -685.8994 -142.8147 -159.635 -366.9437 -37.6409 -73.1357 -175.1906 -131.2833 -41.1464 -77.8404 -57.8131 -8.6365 -251.3728 -14.0836 -36.5144 -2.2292 -6.1598 -16.8011 -26.5165 -67.19 -21.3366 -221.4815 -22.9219 -4.2616 -4.7901 -0.8263 -134.7538 -8.8843 -83.1109 -23.1019 -14.4251 -5.7337 -17.5244 -29.7925 -23.9243 -88.9084 -28.6719 -106.0564 -16.4981 -10.6486 -7.9315 -1.5742 -91.1706 -7.3819 -118.2628 -117.5543 -48.5606 -26.6093 -71.2968 -30.4913 -63.5712 -279.2921 -46.3025 -50.4912 -37.9431 -21.5243 -11.6202 -134.9023 
## -7.516 -5.8131 -10.1595 -13.6329 -27.0293 -25.7282 -151.8511 -39.0524 -105.4861 -34.2434 -15.7051 -10.2304 -3.6687 -98.2094 -7.4666 -15.2668 -75.1283 -116.5382 -16.6429 -14.9215 -55.1062 -3.0636 -8.4262 -93.6829 -38.1162 -123.1859 -4.9078 -9.1612 -1.3077 -102.9021 -23.1138 -8.5262 -57.2623 -3.4297 -20.9579 -78.2019 -50.3741 -62.3531 -6.4908 -21.9308 -2.3736 -84.3835 -126.3997 -114.8723 -26.4109 -21.5589 -61.6405 -34.9162 -66.3243 -25.1148 -6.7203 -4.6695 -65.3518 -39.7924 -67.3505 -36.2154 -10.9031 -62.2195 -14.9491 -24.3238 -65.0847 -4.9657 -64.2797 -278.2873 -14.6902 -13.9198 -18.2059 -9.8972 -78.2645 -17.454 -49.5929 -55.7786 -28.7673 -15.9476 -47.531 -17.4379 -71.0516 -5.6899 -6.2519 -97.5508 -3.8196 -7.0502 -1.1238 -147.6952 -28.2018 -414.2586 -32.3275 -35.1191 -4.9605 -90.2307 -151.3409 -90.0329 -27.9491 -42.4688 -12.5118 -26.4828 -2.0045 -62.195 -9.1662 -178.4616 -1.9406 -1.9871 -11.3982 -0.5214 -29.6136 -35.0449 -6.7569
## Training error : 0.1335
# Classify the held-out rows with the fitted SVM.
letter_predictions <- predict(
  object = letter_classifier,
  newdata = letters_test
)
# Confusion matrix: predicted letter (rows) against true letter (columns).
table(letter_predictions, letters_test[["letter"]])
##
## letter_predictions A B C D E F G H I J K L M N O P Q R
## A 73 0 0 0 0 0 0 0 0 1 0 0 0 0 3 0 4 0
## B 0 61 0 3 2 0 1 1 0 0 1 1 0 0 0 2 0 1
## C 0 0 64 0 2 0 4 2 1 0 1 2 0 0 1 0 0 0
## D 2 1 0 67 0 0 1 3 3 2 1 2 0 3 4 2 1 2
## E 0 0 1 0 64 1 1 0 0 0 2 2 0 0 0 0 2 0
## F 0 0 0 0 0 70 1 1 4 0 0 0 0 0 0 5 1 0
## G 1 1 2 1 3 2 68 1 0 0 0 1 0 0 0 0 4 1
## H 0 0 0 1 0 1 0 46 0 2 3 1 1 1 9 0 0 5
## I 0 0 0 0 0 0 0 0 65 3 0 0 0 0 0 0 0 0
## J 0 1 0 0 0 1 0 0 3 61 0 0 0 0 1 0 0 0
## K 0 1 4 0 0 0 0 5 0 0 56 0 0 2 0 0 0 4
## L 0 0 0 0 1 0 0 1 0 0 0 63 0 0 0 0 0 0
## M 0 0 1 0 0 0 1 0 0 0 0 0 70 2 0 0 0 0
## N 0 0 0 0 0 0 0 0 0 0 0 0 0 77 0 0 0 1
## O 0 0 1 1 0 0 0 1 0 1 0 0 0 0 49 1 2 0
## P 0 0 0 0 0 3 0 0 0 0 0 0 0 0 2 69 0 0
## Q 0 0 0 0 0 0 3 1 0 0 0 2 0 0 2 1 52 0
## R 0 4 0 0 1 0 0 3 0 0 3 0 0 0 1 0 0 64
## S 0 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 6 0
## T 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0
## U 0 0 2 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
## W 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## X 0 1 0 0 1 0 0 1 0 0 1 4 0 0 0 0 0 1
## Y 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0
## Z 1 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0
##
## letter_predictions S T U V W X Y Z
## A 0 1 2 0 1 0 0 0
## B 3 0 0 0 0 0 0 0
## C 0 0 0 0 0 0 0 0
## D 0 0 0 0 0 0 1 0
## E 6 0 0 0 0 1 0 0
## F 2 0 0 1 0 0 2 0
## G 3 2 0 0 0 0 0 0
## H 0 3 0 2 0 0 1 0
## I 2 0 0 0 0 2 1 0
## J 1 0 0 0 0 1 0 4
## K 0 1 2 0 0 4 0 0
## L 0 0 0 0 0 0 0 0
## M 0 0 1 0 6 0 0 0
## N 0 0 1 0 2 0 0 0
## O 0 0 1 0 0 0 0 0
## P 0 0 0 0 0 0 1 0
## Q 1 0 0 0 0 0 0 0
## R 0 1 0 1 0 0 0 0
## S 47 1 0 0 0 1 0 6
## T 1 83 1 0 0 0 2 2
## U 0 0 83 0 0 0 0 0
## V 0 0 0 64 1 0 1 0
## W 0 0 0 3 59 0 0 0
## X 0 0 0 0 0 76 1 0
## Y 0 1 0 0 0 1 58 0
## Z 5 1 0 0 0 0 0 70
# TRUE where the predicted letter equals the true letter.
agreement <- letters_test$letter == letter_predictions
# Tally of incorrect (FALSE) and correct (TRUE) predictions.
table(agreement)
## agreement
## FALSE TRUE
## 321 1679
# Load the white-wine data set and inspect its columns.
wine <- read.csv(file = "whitewines.csv")
str(object = wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
## $ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
## $ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
## $ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
## $ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
## $ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
## $ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
## $ density : num 0.993 0.999 0.995 0.991 0.993 ...
## $ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
## $ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
## $ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
## $ quality : int 5 6 6 4 6 6 6 6 6 7 ...
# Distribution of the quality ratings.
hist(wine$quality)
# Rows 1-3750 train the tree; rows 3751-4898 are held out for testing.
wine_train <- wine[seq_len(3750), ]
wine_test <- wine[seq(3751, 4898), ]
library(rpart)
library(rpart.plot)
# Regression tree predicting `quality` from every other column.
m.rpart <- rpart(formula = quality ~ ., data = wine_train)
# Print the node-by-node description of the fitted tree.
print(m.rpart)
## n= 3750
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 3750 2945.53200 5.870933
## 2) alcohol< 10.85 2372 1418.86100 5.604975
## 4) volatile.acidity>=0.2275 1611 821.30730 5.432030
## 8) volatile.acidity>=0.3025 688 278.97670 5.255814 *
## 9) volatile.acidity< 0.3025 923 505.04230 5.563380 *
## 5) volatile.acidity< 0.2275 761 447.36400 5.971091 *
## 3) alcohol>=10.85 1378 1070.08200 6.328737
## 6) free.sulfur.dioxide< 10.5 84 95.55952 5.369048 *
## 7) free.sulfur.dioxide>=10.5 1294 892.13600 6.391036
## 14) alcohol< 11.76667 629 430.11130 6.173291
## 28) volatile.acidity>=0.465 11 10.72727 4.545455 *
## 29) volatile.acidity< 0.465 618 389.71680 6.202265 *
## 15) alcohol>=11.76667 665 403.99400 6.596992 *
# Basic tree diagram.
rpart.plot(m.rpart, digits = 3)
# Second diagram with extra layout options and more digits.
rpart.plot(
  m.rpart,
  type = 3,
  extra = 101,
  fallen.leaves = TRUE,
  digits = 4
)
# Predicted quality for the held-out wines, and the spread of those predictions.
p.rpart <- predict(object = m.rpart, newdata = wine_test)
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.545 5.563 5.971 5.893 6.202 6.597
# Spread of the true quality ratings in the test set, for comparison.
summary(wine_test[["quality"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.901 6.000 9.000
# Correlation between predicted and actual quality on the test set.
cor(x = p.rpart, y = wine_test[["quality"]])
## [1] 0.5369525
# Load the online news popularity data set and inspect its columns.
news <- read.csv(file = "OnlineNewsPopularity.csv")
str(object = news)
## 'data.frame': 39644 obs. of 61 variables:
## $ url : Factor w/ 39644 levels "http://mashable.com/2013/01/07/amazon-instant-video-browser/",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ timedelta : num 731 731 731 731 731 731 731 731 731 731 ...
## $ n_tokens_title : num 12 9 9 9 13 10 8 12 11 10 ...
## $ n_tokens_content : num 219 255 211 531 1072 ...
## $ n_unique_tokens : num 0.664 0.605 0.575 0.504 0.416 ...
## $ n_non_stop_words : num 1 1 1 1 1 ...
## $ n_non_stop_unique_tokens : num 0.815 0.792 0.664 0.666 0.541 ...
## $ num_hrefs : num 4 3 3 9 19 2 21 20 2 4 ...
## $ num_self_hrefs : num 2 1 1 0 19 2 20 20 0 1 ...
## $ num_imgs : num 1 1 1 1 20 0 20 20 0 1 ...
## $ num_videos : num 0 0 0 0 0 0 0 0 0 1 ...
## $ average_token_length : num 4.68 4.91 4.39 4.4 4.68 ...
## $ num_keywords : num 5 4 6 7 7 9 10 9 7 5 ...
## $ data_channel_is_lifestyle : num 0 0 0 0 0 0 1 0 0 0 ...
## $ data_channel_is_entertainment: num 1 0 0 1 0 0 0 0 0 0 ...
## $ data_channel_is_bus : num 0 1 1 0 0 0 0 0 0 0 ...
## $ data_channel_is_socmed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_channel_is_tech : num 0 0 0 0 1 1 0 1 1 0 ...
## $ data_channel_is_world : num 0 0 0 0 0 0 0 0 0 1 ...
## $ kw_min_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_min : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_min_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_max_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ kw_avg_avg : num 0 0 0 0 0 0 0 0 0 0 ...
## $ self_reference_min_shares : num 496 0 918 0 545 8500 545 545 0 0 ...
## $ self_reference_max_shares : num 496 0 918 0 16000 8500 16000 16000 0 0 ...
## $ self_reference_avg_sharess : num 496 0 918 0 3151 ...
## $ weekday_is_monday : num 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday_is_tuesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_wednesday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_thursday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_friday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_saturday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday_is_sunday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ is_weekend : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LDA_00 : num 0.5003 0.7998 0.2178 0.0286 0.0286 ...
## $ LDA_01 : num 0.3783 0.05 0.0333 0.4193 0.0288 ...
## $ LDA_02 : num 0.04 0.0501 0.0334 0.4947 0.0286 ...
## $ LDA_03 : num 0.0413 0.0501 0.0333 0.0289 0.0286 ...
## $ LDA_04 : num 0.0401 0.05 0.6822 0.0286 0.8854 ...
## $ global_subjectivity : num 0.522 0.341 0.702 0.43 0.514 ...
## $ global_sentiment_polarity : num 0.0926 0.1489 0.3233 0.1007 0.281 ...
## $ global_rate_positive_words : num 0.0457 0.0431 0.0569 0.0414 0.0746 ...
## $ global_rate_negative_words : num 0.0137 0.01569 0.00948 0.02072 0.01213 ...
## $ rate_positive_words : num 0.769 0.733 0.857 0.667 0.86 ...
## $ rate_negative_words : num 0.231 0.267 0.143 0.333 0.14 ...
## $ avg_positive_polarity : num 0.379 0.287 0.496 0.386 0.411 ...
## $ min_positive_polarity : num 0.1 0.0333 0.1 0.1364 0.0333 ...
## $ max_positive_polarity : num 0.7 0.7 1 0.8 1 0.6 1 1 0.8 0.5 ...
## $ avg_negative_polarity : num -0.35 -0.119 -0.467 -0.37 -0.22 ...
## $ min_negative_polarity : num -0.6 -0.125 -0.8 -0.6 -0.5 -0.4 -0.5 -0.5 -0.125 -0.5 ...
## $ max_negative_polarity : num -0.2 -0.1 -0.133 -0.167 -0.05 ...
## $ title_subjectivity : num 0.5 0 0 0 0.455 ...
## $ title_sentiment_polarity : num -0.188 0 0 0 0.136 ...
## $ abs_title_subjectivity : num 0 0.5 0.5 0.5 0.0455 ...
## $ abs_title_sentiment_polarity : num 0.188 0 0 0 0.136 ...
## $ shares : int 593 711 1500 1200 505 855 556 891 3600 710 ...
# Keep the 16 predictors of interest plus the `shares` outcome (column 17).
news_keep_cols <- c(
  "n_tokens_title", "n_tokens_content", "n_unique_tokens",
  "n_non_stop_words", "num_hrefs", "num_imgs", "num_videos",
  "average_token_length", "num_keywords", "kw_max_max",
  "global_sentiment_polarity", "avg_positive_polarity",
  "title_subjectivity", "title_sentiment_polarity",
  "abs_title_subjectivity", "abs_title_sentiment_polarity", "shares"
)
newsShort <- news[, news_keep_cols]

# Recode the share count into a two-level factor: "yes" when an article has at
# least 1400 shares, "no" otherwise. Vectorised, so it works for any number of
# rows (the earlier loop was hard-coded to 39644 iterations).
newsShort$shares <- factor(
  ifelse(newsShort$shares >= 1400, "yes", "no"),
  levels = c("no", "yes")
)

# Shuffle reproducibly, then hold out the last 10% of rows for testing.
# The earlier version drew 49000 random keys and indexed rows 1:44000 and
# 44001:49000 on a 39644-row data frame; the out-of-range indices produced
# all-NA rows, which C5.0 then reported as cases with bad or unknown class.
# Sizing everything from nrow() removes those phantom rows.
# NOTE: console output recorded below this block came from the earlier split
# and will differ once the script is re-run.
set.seed(12345)
news_n <- nrow(newsShort)
news_rand <- newsShort[order(runif(news_n)), ]
news_train_n <- floor(0.9 * news_n)
news_train <- news_rand[seq_len(news_train_n), ]
news_test <- news_rand[-seq_len(news_train_n), ]
# Class balance of the outcome in the training set.
prop.table(table(news_train$shares))
##
## no yes
## 0.4662568 0.5337432
# Class balance of the outcome in the held-out test set.
prop.table(table(news_test[["shares"]]))
##
## no yes
## 0.4676778 0.5323222
library(C50)
# Fit a C5.0 decision tree on the news training rows. Column 17 is the recoded
# `shares` outcome, so it is dropped from the predictors and passed separately
# as the class vector.
news_model <- C5.0(news_train[-17], news_train$shares)
# Print the fitted tree and its training-set evaluation.
summary(news_model)
##
## Call:
## C5.0.default(x = news_train[-17], y = news_train$shares)
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Jul 1 01:34:44 2018
## -------------------------------
##
## Class specified by attribute `outcome'
## *** ignoring cases with bad or unknown class
##
## Read 35622 cases (17 attributes) from undefined.data
##
## Decision tree:
##
## num_imgs > 3:
## :...kw_max_max <= 15000:
## : :...n_tokens_content <= 1111: no (20/2)
## : : n_tokens_content > 1111: yes (7/2)
## : kw_max_max > 15000:
## : :...num_keywords <= 5:
## : :...n_unique_tokens > 0.5483304:
## : : :...num_videos > 0:
## : : : :...kw_max_max <= 310800: no (8/3)
## : : : : kw_max_max > 310800: yes (87/21)
## : : : num_videos <= 0:
## : : : :...average_token_length > 5.22907: yes (13)
## : : : average_token_length <= 5.22907:
## : : : :...global_sentiment_polarity <= 0.04: no (27/8)
## : : : global_sentiment_polarity > 0.04: yes (125/42)
## : : n_unique_tokens <= 0.5483304:
## : : :...n_tokens_title > 9:
## : : :...kw_max_max <= 690400: yes (78/33)
## : : : kw_max_max > 690400: no (545/228)
## : : n_tokens_title <= 9:
## : : :...num_videos <= 1: yes (267/108)
## : : num_videos > 1:
## : : :...num_imgs > 7: no (7)
## : : num_imgs <= 7:
## : : :...num_videos <= 2: yes (7)
## : : num_videos > 2: no (6/1)
## : num_keywords > 5:
## : :...num_hrefs > 17:
## : :...n_tokens_content > 2691: yes (53/2)
## : : n_tokens_content <= 2691:
## : : :...n_tokens_content <= 342: yes (661/156)
## : : n_tokens_content > 342:
## : : :...n_tokens_title > 9:
## : : :...num_imgs <= 12:
## : : : :...num_videos <= 6: yes (592/189)
## : : : : num_videos > 6:
## : : : : :...n_tokens_title <= 11: yes (15/3)
## : : : : n_tokens_title > 11: no (28/8)
## : : : num_imgs > 12:
## : : : :...kw_max_max > 310800:
## : : : :...n_tokens_title <= 13: yes (716/287)
## : : : : n_tokens_title > 13: no (61/28)
## : : : kw_max_max <= 310800:
## : : : :...kw_max_max <= 17100: no (8)
## : : : kw_max_max > 17100:
## : : : :...num_hrefs <= 22: yes (8)
## : : : num_hrefs > 22: no (31/12)
## : : n_tokens_title <= 9:
## : : :...abs_title_sentiment_polarity <= 0.425:
## : : :...num_imgs <= 28: yes (645/187)
## : : : num_imgs > 28:
## : : : :...average_token_length > 4.696252: yes (34/6)
## : : : average_token_length <= 4.696252:
## : : : :...kw_max_max <= 617900: yes (7/2)
## : : : kw_max_max > 617900: no (33/6)
## : : abs_title_sentiment_polarity > 0.425:
## : : :...n_tokens_content > 457: yes (209/31)
## : : n_tokens_content <= 457:
## : : :...n_tokens_title <= 6: yes (6)
## : : n_tokens_title > 6:
## : : :...kw_max_max <= 690400: no (6)
## : : kw_max_max > 690400:
## : : :...num_keywords > 9: no (12/3)
## : : num_keywords <= 9: [S1]
## : num_hrefs <= 17:
## : :...title_sentiment_polarity > 0.4: yes (658/198)
## : title_sentiment_polarity <= 0.4:
## : :...kw_max_max <= 617900: yes (479/157)
## : kw_max_max > 617900:
## : :...n_tokens_title > 10:
## : :...num_hrefs > 6: yes (1088/444)
## : : num_hrefs <= 6:
## : : :...num_videos <= 0: no (503/235)
## : : num_videos > 0:
## : : :...n_unique_tokens <= 0.3534247: yes (37/3)
## : : n_unique_tokens > 0.3534247:
## : : :...avg_positive_polarity > 0.3814904: yes (127/35)
## : : avg_positive_polarity <= 0.3814904:
## : : :...num_hrefs <= 5: no (142/60)
## : : num_hrefs > 5: [S2]
## : n_tokens_title <= 10:
## : :...global_sentiment_polarity > 0.09483681: yes (1135/393)
## : global_sentiment_polarity <= 0.09483681:
## : :...n_tokens_content > 1575:
## : :...title_sentiment_polarity <= -0.1461111: yes (4/1)
## : : title_sentiment_polarity > -0.1461111: no (26/1)
## : n_tokens_content <= 1575:
## : :...kw_max_max <= 690400:
## : :...n_tokens_title > 7: no (32/9)
## : : n_tokens_title <= 7: [S3]
## : kw_max_max > 690400:
## : :...num_imgs > 6: yes (520/196)
## : num_imgs <= 6:
## : :...num_hrefs <= 9:
## : :...num_videos > 1: no (6)
## : : num_videos <= 1: [S4]
## : num_hrefs > 9: [S5]
## num_imgs <= 3:
## :...n_unique_tokens > 0.4304388:
## :...kw_max_max <= 617900:
## : :...n_tokens_content > 1204: yes (37/4)
## : : n_tokens_content <= 1204:
## : : :...kw_max_max > 69100: yes (1747/638)
## : : kw_max_max <= 69100:
## : : :...kw_max_max <= 41600: no (374/170)
## : : kw_max_max > 41600: yes (1411/638)
## : kw_max_max > 617900:
## : :...global_sentiment_polarity <= 0.08261218:
## : :...num_imgs <= 0:
## : : :...num_videos > 25:
## : : : :...num_hrefs <= 52: no (71/18)
## : : : : num_hrefs > 52: yes (4)
## : : : num_videos <= 25:
## : : : :...kw_max_max <= 690400: yes (250/107)
## : : : kw_max_max > 690400:
## : : : :...num_videos > 15: yes (103/37)
## : : : num_videos <= 15:
## : : : :...num_keywords <= 6: no (360/141)
## : : : num_keywords > 6: yes (596/292)
## : : num_imgs > 0:
## : : :...n_unique_tokens > 0.7682927:
## : : :...title_sentiment_polarity <= -0.3: no (3)
## : : : title_sentiment_polarity > -0.3: yes (45/13)
## : : n_unique_tokens <= 0.7682927:
## : : :...num_hrefs <= 1:
## : : :...kw_max_max > 690400: no (163/26)
## : : : kw_max_max <= 690400:
## : : : :...num_keywords > 6: yes (3)
## : : : num_keywords <= 6:
## : : : :...num_keywords <= 5: yes (5/1)
## : : : num_keywords > 5: no (7)
## : : num_hrefs > 1:
## : : :...num_hrefs > 20:
## : : :...num_keywords <= 6: no (150/64)
## : : : num_keywords > 6: yes (212/91)
## : : num_hrefs <= 20:
## : : :...average_token_length > 4.747073:
## : : :...num_hrefs > 4:
## : : : :...n_tokens_content <= 164: yes (32/7)
## : : : : n_tokens_content > 164: no (2129/730)
## : : : num_hrefs <= 4:
## : : : :...kw_max_max <= 690400:
## : : : :...n_tokens_title <= 9: no (22/7)
## : : : : n_tokens_title > 9: yes (21/5)
## : : : kw_max_max > 690400:
## : : : :...num_imgs <= 2: no (435/110)
## : : : num_imgs > 2:
## : : : :...n_unique_tokens <= 0.4491979: yes (4)
## : : : n_unique_tokens > 0.4491979: no (30/10)
## : : average_token_length <= 4.747073:
## : : :...num_imgs > 2:
## : : :...num_hrefs <= 3:
## : : : :...n_unique_tokens > 0.6126984: yes (3)
## : : : : n_unique_tokens <= 0.6126984: [S6]
## : : : num_hrefs > 3:
## : : : :...num_keywords > 6: yes (64/16)
## : : : num_keywords <= 6: [S7]
## : : num_imgs <= 2:
## : : :...num_keywords <= 5: no (750/284)
## : : num_keywords > 5:
## : : :...num_imgs <= 1: no (1317/581)
## : : num_imgs > 1: [S8]
## : global_sentiment_polarity > 0.08261218:
## : :...num_hrefs > 15:
## : :...n_tokens_title <= 9: yes (582/189)
## : : n_tokens_title > 9:
## : : :...abs_title_subjectivity > 0.09583333: yes (748/329)
## : : abs_title_subjectivity <= 0.09583333:
## : : :...num_imgs <= 0: yes (43/18)
## : : num_imgs > 0:
## : : :...num_keywords <= 5: no (36/6)
## : : num_keywords > 5:
## : : :...num_videos > 0: no (46/11)
## : : num_videos <= 0:
## : : :...num_imgs > 1: yes (4)
## : : num_imgs <= 1: [S9]
## : num_hrefs <= 15:
## : :...num_imgs > 1:
## : :...n_unique_tokens > 0.6486486:
## : : :...global_sentiment_polarity <= 0.3318182: yes (159/35)
## : : : global_sentiment_polarity > 0.3318182: no (10/2)
## : : n_unique_tokens <= 0.6486486:
## : : :...num_keywords <= 9: yes (1404/654)
## : : num_keywords > 9:
## : : :...num_videos > 1: yes (17/3)
## : : num_videos <= 1:
## : : :...num_videos <= 0:
## : : :...num_hrefs <= 8: yes (74/20)
## : : : num_hrefs > 8: [S10]
## : : num_videos > 0:
## : : :...avg_positive_polarity <= 0.3091198: yes (19/5)
## : : avg_positive_polarity > 0.3091198:
## : : :...num_hrefs > 9: yes (20/1)
## : : num_hrefs <= 9:
## : : :...num_hrefs <= 6: no (14/2)
## : : num_hrefs > 6: yes (22/10)
## : num_imgs <= 1:
## : :...num_videos > 0:
## : :...kw_max_max <= 690400:
## : : :...num_imgs <= 0:
## : : : :...num_keywords <= 7:
## : : : : :...n_tokens_title <= 7: no (19/8)
## : : : : : n_tokens_title > 7: yes (212/66)
## : : : : num_keywords > 7:
## : : : : :...n_unique_tokens <= 0.6703297: no (61/24)
## : : : : n_unique_tokens > 0.6703297: yes (22/4)
## : : : num_imgs > 0:
## : : : :...average_token_length <= 4.39404: yes (29/4)
## : : : average_token_length > 4.39404:
## : : : :...num_hrefs > 9: yes (44/13)
## : : : num_hrefs <= 9:
## : : : :...num_videos > 3: yes (12/3)
## : : : num_videos <= 3:
## : : : :...num_keywords > 6: no (69/17)
## : : : num_keywords <= 6: [S11]
## : : kw_max_max > 690400:
## : : :...num_videos > 10: yes (191/71)
## : : num_videos <= 10:
## : : :...n_tokens_title <= 9: no (771/370)
## : : n_tokens_title > 9:
## : : :...num_hrefs <= 9: yes (1684/755)
## : : num_hrefs > 9:
## : : :...num_keywords <= 8:
## : : :...num_keywords <= 5: [S12]
## : : : num_keywords > 5: [S13]
## : : num_keywords > 8:
## : : :...num_imgs <= 0: no (44/19)
## : : num_imgs > 0: [S14]
## : num_videos <= 0:
## : :...n_tokens_content > 668: yes (759/324)
## : n_tokens_content <= 668:
## : :...title_sentiment_polarity > -0.008333334:
## : :...num_imgs > 0:
## : : :...n_tokens_content <= 121: yes (101/36)
## : : : n_tokens_content > 121: no (4008/1746)
## : : num_imgs <= 0:
## : : :...n_tokens_content > 141:
## : : :...num_keywords <= 5: no (73/31)
## : : : num_keywords > 5: yes (411/158)
## : : n_tokens_content <= 141:
## : : :...n_tokens_title > 9:
## : : :...n_unique_tokens <= 0.8265306: no (104/21)
## : : : n_unique_tokens > 0.8265306: yes (8/1)
## : : n_tokens_title <= 9:
## : : :...kw_max_max <= 690400: yes (35/10)
## : : kw_max_max > 690400: [S15]
## : title_sentiment_polarity <= -0.008333334:
## : :...n_unique_tokens > 0.71875:
## : :...n_tokens_title > 12: no (89/3)
## : : n_tokens_title <= 12:
## : : :...n_unique_tokens <= 0.7894737: no (69/11)
## : : n_unique_tokens > 0.7894737: yes (14/5)
## : n_unique_tokens <= 0.71875:
## : :...n_tokens_title > 12: no (89/25)
## : n_tokens_title <= 12:
## : :...num_imgs > 0: no (303/124)
## : num_imgs <= 0: [S16]
## n_unique_tokens <= 0.4304388:
## :...num_hrefs > 19: yes (387/93)
## num_hrefs <= 19:
## :...num_videos > 8:
## :...num_keywords <= 8: no (40/10)
## : num_keywords > 8:
## : :...average_token_length <= 4.819527: yes (8)
## : average_token_length > 4.819527: no (2)
## num_videos <= 8:
## :...kw_max_max <= 663600: yes (180/46)
## kw_max_max > 663600:
## :...global_sentiment_polarity > 0.154227: yes (399/112)
## global_sentiment_polarity <= 0.154227:
## :...average_token_length > 4.729216:
## :...num_videos <= 1:
## : :...num_hrefs <= 14: no (166/50)
## : : num_hrefs > 14:
## : : :...global_sentiment_polarity <= 0.1335317: no (40/19)
## : : global_sentiment_polarity > 0.1335317: yes (9)
## : num_videos > 1:
## : :...title_sentiment_polarity <= -0.1402778: no (5)
## : title_sentiment_polarity > -0.1402778:
## : :...avg_positive_polarity <= 0.2935334: no (5/1)
## : avg_positive_polarity > 0.2935334: yes (19/3)
## average_token_length <= 4.729216:
## :...num_imgs > 2: yes (99/32)
## num_imgs <= 2:
## :...kw_max_max <= 690400: yes (85/34)
## kw_max_max > 690400:
## :...num_hrefs > 15: no (55/22)
## num_hrefs <= 15:
## :...num_hrefs > 10: yes (138/39)
## num_hrefs <= 10:
## :...n_tokens_content > 1113:
## :...n_unique_tokens > 0.4021739: no (50/12)
## : n_unique_tokens <= 0.4021739:
## : :...num_videos > 2: no (4)
## : num_videos <= 2: [S17]
## n_tokens_content <= 1113:
## :...num_imgs > 1: yes (37/10)
## num_imgs <= 1:
## :...num_imgs > 0: [S18]
## num_imgs <= 0: [S19]
##
## SubTree [S1]
##
## n_tokens_content <= 422: yes (15/1)
## n_tokens_content > 422: no (7/2)
##
## SubTree [S2]
##
## title_sentiment_polarity <= -0.325: no (4)
## title_sentiment_polarity > -0.325: yes (38/12)
##
## SubTree [S3]
##
## global_sentiment_polarity <= 0.02484504: no (4/1)
## global_sentiment_polarity > 0.02484504: yes (7)
##
## SubTree [S4]
##
## n_tokens_content <= 436: yes (27/10)
## n_tokens_content > 436: no (66/15)
##
## SubTree [S5]
##
## average_token_length > 5.011019: no (5)
## average_token_length <= 5.011019:
## :...num_videos <= 3: yes (52/14)
## num_videos > 3: no (5/1)
##
## SubTree [S6]
##
## global_sentiment_polarity <= 0.06216631: no (19/1)
## global_sentiment_polarity > 0.06216631:
## :...global_sentiment_polarity <= 0.07848307: yes (5)
## global_sentiment_polarity > 0.07848307: no (3)
##
## SubTree [S7]
##
## average_token_length <= 4.407609: yes (6)
## average_token_length > 4.407609:
## :...n_tokens_title <= 14: no (48/19)
## n_tokens_title > 14: yes (3)
##
## SubTree [S8]
##
## n_tokens_content <= 623: yes (216/101)
## n_tokens_content > 623:
## :...avg_positive_polarity <= 0.3597222: no (59/9)
## avg_positive_polarity > 0.3597222:
## :...num_keywords > 9: yes (4)
## num_keywords <= 9:
## :...global_sentiment_polarity <= 0.07012397: no (18/5)
## global_sentiment_polarity > 0.07012397: yes (5)
##
## SubTree [S9]
##
## abs_title_subjectivity > 0.01136364: no (23/9)
## abs_title_subjectivity <= 0.01136364:
## :...avg_positive_polarity <= 0.2931903: no (2)
## avg_positive_polarity > 0.2931903: yes (18/3)
##
## SubTree [S10]
##
## abs_title_subjectivity <= 0.4125: yes (28/8)
## abs_title_subjectivity > 0.4125: no (28/9)
##
## SubTree [S11]
##
## global_sentiment_polarity <= 0.2202445: no (35/16)
## global_sentiment_polarity > 0.2202445: yes (6)
##
## SubTree [S12]
##
## global_sentiment_polarity <= 0.2365167: no (43/14)
## global_sentiment_polarity > 0.2365167: yes (4)
##
## SubTree [S13]
##
## abs_title_sentiment_polarity <= 0.7: no (152/58)
## abs_title_sentiment_polarity > 0.7: yes (8/3)
##
## SubTree [S14]
##
## average_token_length > 4.955523: no (13/2)
## average_token_length <= 4.955523:
## :...num_videos <= 9: yes (71/20)
## num_videos > 9: no (5/1)
##
## SubTree [S15]
##
## n_tokens_content <= 96: no (19/4)
## n_tokens_content > 96:
## :...title_subjectivity <= 0.6625: yes (10)
## title_subjectivity > 0.6625: no (4/1)
##
## SubTree [S16]
##
## abs_title_subjectivity > 0.425: no (5)
## abs_title_subjectivity <= 0.425:
## :...n_tokens_title > 10: yes (23/4)
## n_tokens_title <= 10:
## :...n_tokens_title <= 9: yes (12/3)
## n_tokens_title > 9:
## :...n_tokens_content <= 168: yes (2)
## n_tokens_content > 168: no (8)
##
## SubTree [S17]
##
## global_sentiment_polarity <= 0.1441667: yes (66/25)
## global_sentiment_polarity > 0.1441667: no (7)
##
## SubTree [S18]
##
## n_non_stop_words <= 0: yes (63/19)
## n_non_stop_words > 0:
## :...global_sentiment_polarity <= 0.02890873: no (8/1)
## global_sentiment_polarity > 0.02890873:
## :...n_unique_tokens <= 0.3853428: yes (18/1)
## n_unique_tokens > 0.3853428:
## :...num_hrefs <= 5: no (27/8)
## num_hrefs > 5: yes (54/19)
##
## SubTree [S19]
##
## n_tokens_title > 14: no (28/10)
## n_tokens_title <= 14:
## :...num_videos > 0: yes (602/253)
## num_videos <= 0:
## :...n_tokens_title > 13: yes (5)
## n_tokens_title <= 13:
## :...num_keywords <= 7: no (15/1)
## num_keywords > 7:
## :...num_keywords <= 9: yes (14/3)
## num_keywords > 9:
## :...abs_title_subjectivity <= 0.09415584: yes (2)
## abs_title_subjectivity > 0.09415584: no (4)
##
##
## Evaluation on training data (35622 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 190 13393(37.6%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 8682 7927 (a): class no
## 5466 13547 (b): class yes
##
##
## Attribute usage:
##
## 100.00% num_imgs
## 98.77% kw_max_max
## 82.81% num_hrefs
## 78.13% n_unique_tokens
## 67.85% global_sentiment_polarity
## 48.03% n_tokens_content
## 45.46% num_keywords
## 44.31% num_videos
## 37.22% n_tokens_title
## 29.28% title_sentiment_polarity
## 20.81% average_token_length
## 3.18% abs_title_sentiment_polarity
## 2.90% abs_title_subjectivity
## 1.45% avg_positive_polarity
## 0.48% n_non_stop_words
## 0.04% title_subjectivity
##
##
## Time: 0.5 secs
# Score the held-out articles with the tree model, then cross-tabulate the
# predicted class (rows) against the recorded shares label (columns).
news_pred <- predict(news_model, newdata = news_test)
p <- table(news_pred, news_test$shares)
p
##
## news_pred no yes
## no 867 674
## yes 1014 1467
# Accuracy in percent: correctly classified cases (the diagonal of the
# confusion table) as a fraction of all tabulated cases.
Accuracy <- (sum(diag(p)) / sum(p)) * 100
Accuracy
## [1] 58.03083
# Fit a support vector machine with a linear ("vanilladot") kernel,
# predicting shares from every other column of the training set.
library(kernlab)
news_model2 <- ksvm(
  shares ~ .,
  data = news_train,
  kernel = "vanilladot"
)
## Setting default kernel parameters
# Generic summary of the fitted S4 object.
summary(news_model2)
## Length Class Mode
## 1 ksvm S4
# news_test contains rows with missing values (summary(news_test$shares)
# reports NA's). For a formula-based ksvm model, predict() builds a model
# matrix from the new data, which omits rows containing NA, so the
# prediction vector comes back shorter than news_test$shares and table()
# stops with "all arguments must have the same length".
# Score only the complete rows so predictions and labels stay aligned.
news_test_complete <- news_test[complete.cases(news_test), ]
news_pred2 <- predict(news_model2, news_test_complete)

# Confusion table for the SVM: predicted class (rows) vs. actual label
# (columns). Stored separately so the tree model's table `p` is kept.
(p2 <- table(news_pred2, news_test_complete$shares))
# Grow a recursive-partitioning tree for shares using all remaining
# columns of the training set as candidate predictors, then print the
# detailed per-node report (CP table, variable importance, splits).
library(rpart)
m.rpart <- rpart(
  formula = shares ~ .,
  data = news_train
)
summary(m.rpart)
## Call:
## rpart(formula = shares ~ ., data = news_train)
## n=35622 (8378 observations deleted due to missingness)
##
## CP nsplit rel error xerror xstd
## 1 0.03251249 0 1.0000000 1.0000000 0.005668843
## 2 0.01216208 2 0.9349750 0.9400325 0.005638366
## 3 0.01000000 3 0.9228129 0.9347944 0.005634845
##
## Variable importance
## num_imgs global_sentiment_polarity
## 44 22
## kw_max_max avg_positive_polarity
## 11 6
## num_hrefs n_non_stop_words
## 5 5
## n_tokens_content average_token_length
## 4 2
## n_unique_tokens
## 2
##
## Node number 1: 35622 observations, complexity param=0.03251249
## predicted class=yes expected loss=0.4662568 P(node) =1
## class counts: 16609 19013
## probabilities: 0.466 0.534
## left son=2 (26313 obs) right son=3 (9309 obs)
## Primary splits:
## num_imgs < 3.5 to the left, improve=204.43790, (0 missing)
## num_hrefs < 13.5 to the left, improve=163.26770, (0 missing)
## global_sentiment_polarity < 0.09269021 to the left, improve=130.72360, (0 missing)
## n_unique_tokens < 0.4396835 to the right, improve= 90.54954, (0 missing)
## num_keywords < 6.5 to the left, improve= 78.65559, (0 missing)
## Surrogate splits:
## num_hrefs < 18.5 to the left, agree=0.768, adj=0.114, (0 split)
## n_non_stop_words < 1 to the left, agree=0.757, adj=0.070, (0 split)
## n_tokens_content < 1339.5 to the left, agree=0.753, adj=0.057, (0 split)
## global_sentiment_polarity < -0.1855226 to the right, agree=0.740, adj=0.004, (0 split)
## average_token_length < 5.626769 to the left, agree=0.740, adj=0.004, (0 split)
##
## Node number 2: 26313 observations, complexity param=0.03251249
## predicted class=yes expected loss=0.4981188 P(node) =0.7386727
## class counts: 13107 13206
## probabilities: 0.498 0.502
## left son=4 (10534 obs) right son=5 (15779 obs)
## Primary splits:
## global_sentiment_polarity < 0.09269021 to the left, improve=99.22453, (0 missing)
## n_unique_tokens < 0.4304826 to the right, improve=78.06604, (0 missing)
## kw_max_max < 677000 to the right, improve=76.51894, (0 missing)
## num_hrefs < 19.5 to the left, improve=56.02928, (0 missing)
## average_token_length < 4.781043 to the right, improve=54.85230, (0 missing)
## Surrogate splits:
## avg_positive_polarity < 0.3012278 to the left, agree=0.702, adj=0.256, (0 split)
## n_unique_tokens < 0.1845133 to the left, agree=0.630, adj=0.077, (0 split)
## average_token_length < 3.659021 to the left, agree=0.630, adj=0.077, (0 split)
## n_tokens_content < 9 to the left, agree=0.630, adj=0.077, (0 split)
## n_non_stop_words < 0.5 to the left, agree=0.630, adj=0.077, (0 split)
##
## Node number 3: 9309 observations
## predicted class=yes expected loss=0.3761951 P(node) =0.2613273
## class counts: 3502 5807
## probabilities: 0.376 0.624
##
## Node number 4: 10534 observations, complexity param=0.01216208
## predicted class=no expected loss=0.4487374 P(node) =0.2957161
## class counts: 5807 4727
## probabilities: 0.551 0.449
## left son=8 (9288 obs) right son=9 (1246 obs)
## Primary splits:
## kw_max_max < 654150 to the right, improve=49.48608, (0 missing)
## n_non_stop_words < 1 to the right, improve=48.04450, (0 missing)
## n_tokens_content < 161.5 to the right, improve=47.03210, (0 missing)
## num_imgs < 0.5 to the right, improve=45.60699, (0 missing)
## average_token_length < 4.729654 to the right, improve=45.30676, (0 missing)
## Surrogate splits:
## n_unique_tokens < 0.9755102 to the left, agree=0.882, adj=0.002, (0 split)
##
## Node number 5: 15779 observations
## predicted class=yes expected loss=0.4626402 P(node) =0.4429566
## class counts: 7300 8479
## probabilities: 0.463 0.537
##
## Node number 8: 9288 observations
## predicted class=no expected loss=0.4309862 P(node) =0.2607377
## class counts: 5285 4003
## probabilities: 0.569 0.431
##
## Node number 9: 1246 observations
## predicted class=yes expected loss=0.4189406 P(node) =0.03497838
## class counts: 522 724
## probabilities: 0.419 0.581
# Draw the fitted rpart tree (type 1 layout, values shown to 3 digits).
library(rpart.plot)
rpart.plot(m.rpart, type = 1, digits = 3)

# Predict on the test set with predict()'s default output type and
# summarise each column of the result.
p.rpart <- predict(m.rpart, newdata = news_test)
summary(p.rpart)
## no yes
## Min. :0.3762 Min. :0.4310
## 1st Qu.:0.4626 1st Qu.:0.5374
## Median :0.4626 Median :0.5374
## Mean :0.4659 Mean :0.5341
## 3rd Qu.:0.4626 3rd Qu.:0.5374
## Max. :0.5690 Max. :0.6238
# Distribution of the actual labels in the test set, for comparison with
# the predictions summarised above.
summary(news_test[["shares"]])
## no yes NA's
## 1881 2141 978