Method #1 Tree-based classification

Step 1: Collecting the data

library(readr)
## Warning: package 'readr' was built under R version 3.4.4
credit <- read_csv("C:/Users/Bkulkarni/credit.csv")
## Parsed with column specification:
## cols(
##   checking_balance = col_character(),
##   months_loan_duration = col_integer(),
##   credit_history = col_character(),
##   purpose = col_character(),
##   amount = col_integer(),
##   savings_balance = col_character(),
##   employment_duration = col_character(),
##   percent_of_income = col_integer(),
##   years_at_residence = col_integer(),
##   age = col_integer(),
##   other_credit = col_character(),
##   housing = col_character(),
##   existing_loans_count = col_integer(),
##   job = col_character(),
##   dependents = col_integer(),
##   phone = col_character(),
##   default = col_character()
## )
str(credit)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1000 obs. of  17 variables:
##  $ checking_balance    : chr  "< 0 DM" "1 - 200 DM" "unknown" "< 0 DM" ...
##  $ months_loan_duration: int  6 48 12 42 24 36 24 36 12 30 ...
##  $ credit_history      : chr  "critical" "good" "critical" "good" ...
##  $ purpose             : chr  "furniture/appliances" "furniture/appliances" "education" "furniture/appliances" ...
##  $ amount              : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ savings_balance     : chr  "unknown" "< 100 DM" "< 100 DM" "< 100 DM" ...
##  $ employment_duration : chr  "> 7 years" "1 - 4 years" "4 - 7 years" "4 - 7 years" ...
##  $ percent_of_income   : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ years_at_residence  : int  4 2 3 4 4 4 4 2 4 2 ...
##  $ age                 : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ other_credit        : chr  "none" "none" "none" "none" ...
##  $ housing             : chr  "own" "own" "own" "other" ...
##  $ existing_loans_count: int  2 1 1 1 2 1 1 1 1 2 ...
##  $ job                 : chr  "skilled" "skilled" "unskilled" "skilled" ...
##  $ dependents          : int  1 1 2 2 2 2 1 1 1 1 ...
##  $ phone               : chr  "yes" "no" "no" "no" ...
##  $ default             : chr  "no" "yes" "no" "no" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 17
##   .. ..$ checking_balance    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ months_loan_duration: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ credit_history      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ purpose             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ amount              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ savings_balance     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ employment_duration : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ percent_of_income   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ years_at_residence  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ age                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ other_credit        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ housing             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ existing_loans_count: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ job                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ dependents          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ phone               : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ default             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

Step 2: Exploring the data

summary(credit$amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
table(credit$default)
## 
##  no yes 
## 700 300
set.seed(12345)
credit_rand<- credit[order(runif(1000)),]
summary(credit$amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
summary(credit_rand$amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
credit_train<-credit_rand[1:900,]
credit_test<-credit_rand[901:1000,]
prop.table(table(credit_train$default))
## 
##        no       yes 
## 0.7022222 0.2977778
prop.table(table(credit_test$default))
## 
##   no  yes 
## 0.68 0.32

Step 3: Training a model on the data

library(C50)
## Warning: package 'C50' was built under R version 3.4.4
credit_train$default<- as.factor(credit_train$default)
str(credit_train$default)
##  Factor w/ 2 levels "no","yes": 2 1 1 1 2 1 2 2 1 1 ...
credit_model <- C5.0( x=credit_train[-17], y=credit_train$default)
## c50 code called exit with value 1
summary(credit_model)
## 
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Mon May 28 20:01:30 2018
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## *** line 1 of `undefined.data': bad value of `c("< 0 DM", "1 - 200 DM", "1 - 200 DM", "< 0 DM", "1 - 200 DM", "unknown", "< 0 DM", "1 - 200 DM", "1 - 200 DM", "< 0 DM", "unknown", "> 200 DM", "> 200 DM", "1 - 200 DM", "unknown", "unknown", "unknown", "unknown", "unknown", "< 0 DM", "unknown", "< 0 DM", "> 200 DM", "unknown", "< 0 DM", "unknown", "> 200 DM", "1 - 200 DM", "unknown", "unknown", "unknown", "unknown", "< 0 DM", "1 - 200 DM", "unknown", "1 - 200 DM", "1 - 200 DM", "1 - 200 DM", "unknown", "< 0 DM", "unknown", "unknown", "< 0 DM",' for attribute `checking_balance'
## 
## Error limit exceeded

Step 4: Evaluating Model Performance

Method #2. KNN Algorithm

Step 1: Collecting the Data

library(readr)
wbcd <- read.csv("C:/Users/Bkulkarni/wisc_bc_data.csv", stringsAsFactors = FALSE)
str(wbcd)
## 'data.frame':    569 obs. of  32 variables:
##  $ id               : int  87139402 8910251 905520 868871 9012568 906539 925291 87880 862989 89827 ...
##  $ diagnosis        : chr  "B" "B" "B" "B" ...
##  $ radius_mean      : num  12.3 10.6 11 11.3 15.2 ...
##  $ texture_mean     : num  12.4 18.9 16.8 13.4 13.2 ...
##  $ perimeter_mean   : num  78.8 69.3 70.9 73 97.7 ...
##  $ area_mean        : num  464 346 373 385 712 ...
##  $ smoothness_mean  : num  0.1028 0.0969 0.1077 0.1164 0.0796 ...
##  $ compactness_mean : num  0.0698 0.1147 0.078 0.1136 0.0693 ...
##  $ concavity_mean   : num  0.0399 0.0639 0.0305 0.0464 0.0339 ...
##  $ points_mean      : num  0.037 0.0264 0.0248 0.048 0.0266 ...
##  $ symmetry_mean    : num  0.196 0.192 0.171 0.177 0.172 ...
##  $ dimension_mean   : num  0.0595 0.0649 0.0634 0.0607 0.0554 ...
##  $ radius_se        : num  0.236 0.451 0.197 0.338 0.178 ...
##  $ texture_se       : num  0.666 1.197 1.387 1.343 0.412 ...
##  $ perimeter_se     : num  1.67 3.43 1.34 1.85 1.34 ...
##  $ area_se          : num  17.4 27.1 13.5 26.3 17.7 ...
##  $ smoothness_se    : num  0.00805 0.00747 0.00516 0.01127 0.00501 ...
##  $ compactness_se   : num  0.0118 0.03581 0.00936 0.03498 0.01485 ...
##  $ concavity_se     : num  0.0168 0.0335 0.0106 0.0219 0.0155 ...
##  $ points_se        : num  0.01241 0.01365 0.00748 0.01965 0.00915 ...
##  $ symmetry_se      : num  0.0192 0.035 0.0172 0.0158 0.0165 ...
##  $ dimension_se     : num  0.00225 0.00332 0.0022 0.00344 0.00177 ...
##  $ radius_worst     : num  13.5 11.9 12.4 11.9 16.2 ...
##  $ texture_worst    : num  15.6 22.9 26.4 15.8 15.7 ...
##  $ perimeter_worst  : num  87 78.3 79.9 76.5 104.5 ...
##  $ area_worst       : num  549 425 471 434 819 ...
##  $ smoothness_worst : num  0.139 0.121 0.137 0.137 0.113 ...
##  $ compactness_worst: num  0.127 0.252 0.148 0.182 0.174 ...
##  $ concavity_worst  : num  0.1242 0.1916 0.1067 0.0867 0.1362 ...
##  $ points_worst     : num  0.0939 0.0793 0.0743 0.0861 0.0818 ...
##  $ symmetry_worst   : num  0.283 0.294 0.3 0.21 0.249 ...
##  $ dimension_worst  : num  0.0677 0.0759 0.0788 0.0678 0.0677 ...
wbcd<- wbcd[-1]

Step 2: Exploring the Data

table(wbcd$diagnosis)
## 
##   B   M 
## 357 212
wbcd$diagnosis<-factor(wbcd$diagnosis, levels = c("B","M"), labels =c("Benigh","Malignant"))
round(prop.table(table(wbcd$diagnosis))*100, digits = 1)
## 
##    Benigh Malignant 
##      62.7      37.3
normalize<-function(x){return((x-min(x))/(max(x)-min(x)))}
wbcd_n<-as.data.frame(lapply(wbcd[2:31],normalize))
summary(wbcd_n$area_mean)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1174  0.1729  0.2169  0.2711  1.0000
wbcd_train<-wbcd_n[1:469,]
wbcd_test<-wbcd_n[470:569,]
wbcd_train_labels<-wbcd[1:469,1]
wbcd_test_labels<-wbcd[470:569,1]

Step 3: Training a Model on the Data

library(class)
wbcd_test_pred<-knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 21)

Step 4: Evaluating Model Performance

Method #3. Support vector machine

Step 1: Collecting the Data

letters <- read_csv("C:/Users/Bkulkarni/letterdata.csv")
## Parsed with column specification:
## cols(
##   letter = col_character(),
##   xbox = col_integer(),
##   ybox = col_integer(),
##   width = col_integer(),
##   height = col_integer(),
##   onpix = col_integer(),
##   xbar = col_integer(),
##   ybar = col_integer(),
##   x2bar = col_integer(),
##   y2bar = col_integer(),
##   xybar = col_integer(),
##   x2ybar = col_integer(),
##   xy2bar = col_integer(),
##   xedge = col_integer(),
##   xedgey = col_integer(),
##   yedge = col_integer(),
##   yedgex = col_integer()
## )
str(letters)
## Classes 'tbl_df', 'tbl' and 'data.frame':    20000 obs. of  17 variables:
##  $ letter: chr  "T" "I" "D" "N" ...
##  $ xbox  : int  2 5 4 7 2 4 4 1 2 11 ...
##  $ ybox  : int  8 12 11 11 1 11 2 1 2 15 ...
##  $ width : int  3 3 6 6 3 5 5 3 4 13 ...
##  $ height: int  5 7 8 6 1 8 4 2 4 9 ...
##  $ onpix : int  1 2 6 3 1 3 4 1 2 7 ...
##  $ xbar  : int  8 10 10 5 8 8 8 8 10 13 ...
##  $ ybar  : int  13 5 6 9 6 8 7 2 6 2 ...
##  $ x2bar : int  0 5 2 4 6 6 6 2 2 6 ...
##  $ y2bar : int  6 4 6 6 6 9 6 2 6 2 ...
##  $ xybar : int  6 13 10 4 6 5 7 8 12 12 ...
##  $ x2ybar: int  10 3 3 4 5 6 6 2 4 1 ...
##  $ xy2bar: int  8 9 7 10 9 6 6 8 8 9 ...
##  $ xedge : int  0 2 3 6 1 0 2 1 1 8 ...
##  $ xedgey: int  8 8 7 10 7 8 8 6 6 1 ...
##  $ yedge : int  0 4 3 2 5 9 7 2 1 1 ...
##  $ yedgex: int  8 10 9 8 10 7 10 7 7 8 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 17
##   .. ..$ letter: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ xbox  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ ybox  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ width : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ height: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ onpix : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ xbar  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ ybar  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ x2bar : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ y2bar : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ xybar : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ x2ybar: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ xy2bar: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ xedge : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ xedgey: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ yedge : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ yedgex: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

Step 2: Preparing the Data

letters_train<- letters[1:18000,]
letters_test<- letters[18001:20000,]

Step3: Training a Model on the Data

library(kernlab)
## Warning: package 'kernlab' was built under R version 3.4.4
letter_classifier<-ksvm(letter~., data=letters_train, kernel="vanilladot")
##  Setting default kernel parameters
letter_classifier
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 7886 
## 
## Objective Function Value : -15.3458 -21.3403 -25.7672 -6.8685 -8.8812 -35.9555 -59.5883 -18.1975 -65.6075 -41.5654 -18.8559 -39.3558 -36.9961 -60.3052 -15.1694 -42.144 -35.0941 -19.4069 -15.8234 -38.6718 -33.3013 -8.5298 -12.4387 -38.2194 -14.3682 -9.5508 -165.7154 -53.2778 -79.2163 -134.5053 -184.4809 -58.9285 -46.3252 -81.004 -28.1341 -29.6955 -27.5983 -38.1764 -47.2889 -137.0497 -208.1396 -239.2616 -23.8945 -10.9655 -64.228 -12.2139 -55.7818 -10.8001 -21.2407 -11.1795 -121.5639 -33.2229 -267.3926 -81.0708 -9.4937 -4.6577 -161.5171 -86.7114 -20.9146 -16.8272 -86.6582 -16.7205 -30.3036 -20.0054 -26.2331 -29.9289 -56.1072 -11.6335 -5.2564 -14.8153 -4.983 -4.8171 -8.5044 -43.2267 -55.9 -214.755 -47.0748 -49.6539 -50.2278 -18.3767 -19.1813 -97.6132 -113.6502 -42.4112 -32.5859 -127.4807 -33.7418 -30.7568 -40.0953 -18.6792 -5.4826 -49.3916 -10.6142 -20.0286 -63.8287 -183.8297 -57.0671 -43.3721 -35.2783 -85.4451 -145.9585 -11.8002 -6.1194 -12.5323 -33.5245 -155.2248 -57.2602 -194.0785 -111.0155 -10.8207 -16.7926 -3.7766 -77.3561 -7.9004 -106.5759 -52.523 -107.0402 -78.0148 -74.4773 -24.8166 -13.2372 -7.8706 -27.2788 -13.2342 -280.2869 -32.7288 -25.9531 -149.5447 -153.8495 -10.0146 -40.8917 -6.7333 -65.2053 -72.818 -35.1252 -246.7046 -38.0738 -16.9126 -158.18 -184.0021 -50.8427 -28.7686 -164.5969 -97.8359 -386.1426 -160.3188 -181.8759 -38.3648 -37.2272 -60.116 -28.2074 -53.7383 -7.8729 -12.3159 -37.8942 -72.6434 -211.8342 -58.5023 -105.1605 -176.7259 -685.8994 -142.8147 -159.635 -366.9437 -37.6409 -73.1357 -175.1906 -131.2833 -41.1464 -77.8404 -57.8131 -8.6365 -251.3728 -14.0836 -36.5144 -2.2292 -6.1598 -16.8011 -26.5165 -67.19 -21.3366 -221.4815 -22.9219 -4.2616 -4.7901 -0.8263 -134.7538 -8.8843 -83.1109 -23.1019 -14.4251 -5.7337 -17.5244 -29.7925 -23.9243 -88.9084 -28.6719 -106.0564 -16.4981 -10.6486 -7.9315 -1.5742 -91.1706 -7.3819 -118.2628 -117.5543 -48.5606 -26.6093 -71.2968 -30.4913 -63.5712 -279.2921 -46.3025 -50.4912 -37.9431 -21.5243 -11.6202 -134.9023 -7.516 -5.8131 -10.1595 -13.6329 -27.0293 -25.7282 -151.8511 -39.0524 -105.4861 -34.2434 -15.7051 -10.2304 -3.6687 -98.2094 -7.4666 -15.2668 -75.1283 -116.5382 -16.6429 -14.9215 -55.1062 -3.0636 -8.4262 -93.6829 -38.1162 -123.1859 -4.9078 -9.1612 -1.3077 -102.9021 -23.1138 -8.5262 -57.2623 -3.4297 -20.9579 -78.2019 -50.3741 -62.3531 -6.4908 -21.9308 -2.3736 -84.3835 -126.3997 -114.8723 -26.4109 -21.5589 -61.6405 -34.9162 -66.3243 -25.1148 -6.7203 -4.6695 -65.3518 -39.7924 -67.3505 -36.2154 -10.9031 -62.2195 -14.9491 -24.3238 -65.0847 -4.9657 -64.2797 -278.2873 -14.6902 -13.9198 -18.2059 -9.8972 -78.2645 -17.454 -49.5929 -55.7786 -28.7673 -15.9476 -47.531 -17.4379 -71.0516 -5.6899 -6.2519 -97.5508 -3.8196 -7.0502 -1.1238 -147.6952 -28.2018 -414.2586 -32.3275 -35.1191 -4.9605 -90.2307 -151.3409 -90.0329 -27.9491 -42.4688 -12.5118 -26.4828 -2.0045 -62.195 -9.1662 -178.4616 -1.9406 -1.9871 -11.3982 -0.5214 -29.6136 -35.0449 -6.7569 
## Training error : 0.1335

Step 4: Evaluating Model Performance

letter_predictions<- predict(letter_classifier, letters_test)
table(letter_predictions, letters_test$letter)
##                   
## letter_predictions  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R
##                  A 73  0  0  0  0  0  0  0  0  1  0  0  0  0  3  0  4  0
##                  B  0 61  0  3  2  0  1  1  0  0  1  1  0  0  0  2  0  1
##                  C  0  0 64  0  2  0  4  2  1  0  1  2  0  0  1  0  0  0
##                  D  2  1  0 67  0  0  1  3  3  2  1  2  0  3  4  2  1  2
##                  E  0  0  1  0 64  1  1  0  0  0  2  2  0  0  0  0  2  0
##                  F  0  0  0  0  0 70  1  1  4  0  0  0  0  0  0  5  1  0
##                  G  1  1  2  1  3  2 68  1  0  0  0  1  0  0  0  0  4  1
##                  H  0  0  0  1  0  1  0 46  0  2  3  1  1  1  9  0  0  5
##                  I  0  0  0  0  0  0  0  0 65  3  0  0  0  0  0  0  0  0
##                  J  0  1  0  0  0  1  0  0  3 61  0  0  0  0  1  0  0  0
##                  K  0  1  4  0  0  0  0  5  0  0 56  0  0  2  0  0  0  4
##                  L  0  0  0  0  1  0  0  1  0  0  0 63  0  0  0  0  0  0
##                  M  0  0  1  0  0  0  1  0  0  0  0  0 70  2  0  0  0  0
##                  N  0  0  0  0  0  0  0  0  0  0  0  0  0 77  0  0  0  1
##                  O  0  0  1  1  0  0  0  1  0  1  0  0  0  0 49  1  2  0
##                  P  0  0  0  0  0  3  0  0  0  0  0  0  0  0  2 69  0  0
##                  Q  0  0  0  0  0  0  3  1  0  0  0  2  0  0  2  1 52  0
##                  R  0  4  0  0  1  0  0  3  0  0  3  0  0  0  1  0  0 64
##                  S  0  1  0  0  1  1  1  0  1  1  0  0  0  0  0  0  6  0
##                  T  0  0  0  0  1  1  0  0  0  0  1  0  0  0  0  0  0  0
##                  U  0  0  2  1  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##                  V  0  0  0  0  0  0  0  0  0  0  0  0  1  0  1  0  0  0
##                  W  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0
##                  X  0  1  0  0  1  0  0  1  0  0  1  4  0  0  0  0  0  1
##                  Y  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  4  0  0
##                  Z  1  0  0  0  2  0  0  0  0  2  0  0  0  0  0  0  0  0
##                   
## letter_predictions  S  T  U  V  W  X  Y  Z
##                  A  0  1  2  0  1  0  0  0
##                  B  3  0  0  0  0  0  0  0
##                  C  0  0  0  0  0  0  0  0
##                  D  0  0  0  0  0  0  1  0
##                  E  6  0  0  0  0  1  0  0
##                  F  2  0  0  1  0  0  2  0
##                  G  3  2  0  0  0  0  0  0
##                  H  0  3  0  2  0  0  1  0
##                  I  2  0  0  0  0  2  1  0
##                  J  1  0  0  0  0  1  0  4
##                  K  0  1  2  0  0  4  0  0
##                  L  0  0  0  0  0  0  0  0
##                  M  0  0  1  0  6  0  0  0
##                  N  0  0  1  0  2  0  0  0
##                  O  0  0  1  0  0  0  0  0
##                  P  0  0  0  0  0  0  1  0
##                  Q  1  0  0  0  0  0  0  0
##                  R  0  1  0  1  0  0  0  0
##                  S 47  1  0  0  0  1  0  6
##                  T  1 83  1  0  0  0  2  2
##                  U  0  0 83  0  0  0  0  0
##                  V  0  0  0 64  1  0  1  0
##                  W  0  0  0  3 59  0  0  0
##                  X  0  0  0  0  0 76  1  0
##                  Y  0  1  0  0  0  1 58  0
##                  Z  5  1  0  0  0  0  0 70
agreement<-letter_predictions == letters_test$letter
table(agreement)
## agreement
## FALSE  TRUE 
##   321  1679

Method #4. Naive Bayes Algorithm

Step 1: Collecting the Data

sms_raw <- read_csv("C:/Users/Bkulkarni/sms_spam.csv")
## Parsed with column specification:
## cols(
##   type = col_character(),
##   text = col_character()
## )
str(sms_raw)
## Classes 'tbl_df', 'tbl' and 'data.frame':    5559 obs. of  2 variables:
##  $ type: chr  "ham" "ham" "ham" "spam" ...
##  $ text: chr  "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline "| __truncated__ ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 2
##   .. ..$ type: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ text: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
head(sms_raw)
## # A tibble: 6 x 2
##   type  text                                                              
##   <chr> <chr>                                                             
## 1 ham   Hope you are having a good week. Just checking in                 
## 2 ham   K..give back my thanks.                                           
## 3 ham   Am also doing in cbe only. But have to pay.                       
## 4 spam  complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URG~
## 5 spam  okmail: Dear Dave this is your final notice to collect your 4* Te~
## 6 ham   Aiya we discuss later lar... Pick u up at 4 is it?

Step 2: Exploring and Preparing the Data

library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
sms_corpus<- Corpus(VectorSource(sms_raw$text))
print(sms_corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5559
sms_corpus<- Corpus(VectorSource(sms_raw$text))
strwrap(sms_corpus[[1]])
## [1] "Hope you are having a good week. Just checking in"
lapply(sms_corpus[10:15], as.character)
## [[1]]
## [1] "fyi I'm at usf now, swing by the room whenever"
## 
## [[2]]
## [1] "Sure thing big man. i have hockey elections at 6, shouldn€˜t go on longer than an hour though"
## 
## [[3]]
## [1] "I anything lor..."
## 
## [[4]]
## [1] "By march ending, i should be ready. But will call you for sure. The problem is that my capital never complete. How far with you. How's work and the ladies"
## 
## [[5]]
## [1] "Hmm well, night night"
## 
## [[6]]
## [1] "K I'll be sure to get up before noon and see what's what"
corpus_clean<-tm_map(sms_corpus, tolower)
corpus_clean<-tm_map(corpus_clean, removeNumbers)
corpus_clean<-tm_map(corpus_clean, removePunctuation)
corpus_clean<-tm_map(corpus_clean, removeWords, stopwords())
corpus_clean<-tm_map(corpus_clean, stripWhitespace)
strwrap(sms_corpus[[1]])
## [1] "Hope you are having a good week. Just checking in"
strwrap(corpus_clean[[1]])
## [1] "hope good week just checking"
corpus_clean <- tm_map(corpus_clean, PlainTextDocument)
sms_dtm <- DocumentTermMatrix(sms_corpus, control = list(tolower=TRUE, removeNumbers = TRUE, stopwords = TRUE, removePunctuation = TRUE ))
sms_raw_train<-sms_raw[1:4169,]
sms_raw_test<-sms_raw[4170:5559,]
sms_dtm_train<-sms_dtm[1:4169,]
sms_dtm_test<-sms_dtm[4160:5559,]
sms_corpus_train<-corpus_clean[1:4169]
sms_corpus_test<-corpus_clean[4170:5559]
prop.table(table(sms_raw_train$type))
## 
##       ham      spam 
## 0.8647158 0.1352842
prop.table(table(sms_raw_test$type))
## 
##       ham      spam 
## 0.8683453 0.1316547

Creating Word Clouds

library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.4
## Loading required package: RColorBrewer
wordcloud(sms_corpus_train, min.freq = 40, random.order = FALSE)

##Step 3: Training a Model on the Data

five_times_words<-findFreqTerms(sms_dtm_train,5)
length(five_times_words)
## [1] 1257
convert_count<-function(x){
  x<-ifelse(x>0,1,0) 
  x<-factor(x,levels = c(0,1),labels = c("No","Yes")) 
  return(x)
  }
sms_train<-DocumentTermMatrix(sms_corpus_train, control = list(dictionary=five_times_words))
sms_test<-DocumentTermMatrix(sms_corpus_test, control = list(dictionary=five_times_words))
sms_train<-apply(sms_train,2,convert_count)
sms_test<-apply(sms_test,2,convert_count)
library(naivebayes)
sms_classifier<-naive_bayes(sms_train, factor(sms_raw_train$type))
class(sms_classifier)
## [1] "naive_bayes"

Step 4: Evaluating Model Performance

sms_test_pred<-predict(sms_classifier, newdata = sms_test)
table(sms_test_pred, sms_raw_test$type)
##              
## sms_test_pred  ham spam
##          ham  1207  183
##          spam    0    0

Method #5. Adding regression to trees

Step 1: Collecting the Data

library(readr)
wine <- read_csv("C:/Users/Bkulkarni/whitewines.csv")
## Parsed with column specification:
## cols(
##   `fixed acidity` = col_double(),
##   `volatile acidity` = col_double(),
##   `citric acid` = col_double(),
##   `residual sugar` = col_double(),
##   chlorides = col_double(),
##   `free sulfur dioxide` = col_double(),
##   `total sulfur dioxide` = col_double(),
##   density = col_double(),
##   pH = col_double(),
##   sulphates = col_double(),
##   alcohol = col_double(),
##   quality = col_integer()
## )
str(wine)
## Classes 'tbl_df', 'tbl' and 'data.frame':    4898 obs. of  12 variables:
##  $ fixed acidity       : num  6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
##  $ volatile acidity    : num  0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
##  $ citric acid         : num  0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
##  $ residual sugar      : num  1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
##  $ chlorides           : num  0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
##  $ free sulfur dioxide : num  6 41 33 11 36 22 33 17 34 40 ...
##  $ total sulfur dioxide: num  62 113 123 74 119 95 152 67 90 130 ...
##  $ density             : num  0.993 0.999 0.995 0.991 0.993 ...
##  $ pH                  : num  3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
##  $ sulphates           : num  0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
##  $ alcohol             : num  10.4 8.9 10.1 11.2 10.9 ...
##  $ quality             : int  5 6 6 4 6 6 6 6 6 7 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 12
##   .. ..$ fixed acidity       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ volatile acidity    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ citric acid         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ residual sugar      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ chlorides           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ free sulfur dioxide : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ total sulfur dioxide: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ density             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ pH                  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ sulphates           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ alcohol             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ quality             : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
hist(wine$quality)

##Step 2: Exploring and Preparing the Data

wine_train<-wine[1:3750,]
wine_test<-wine[3751:4898,]

Step 3: Training a Model on the Data

library(rpart)
m.rpart<-rpart(quality~.,data = wine_train)
m.rpart
## n= 3750 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3750 2945.53200 5.870933  
##    2) alcohol< 10.85 2372 1418.86100 5.604975  
##      4) volatile acidity>=0.2275 1611  821.30730 5.432030  
##        8) volatile acidity>=0.3025 688  278.97670 5.255814 *
##        9) volatile acidity< 0.3025 923  505.04230 5.563380 *
##      5) volatile acidity< 0.2275 761  447.36400 5.971091 *
##    3) alcohol>=10.85 1378 1070.08200 6.328737  
##      6) free sulfur dioxide< 10.5 84   95.55952 5.369048 *
##      7) free sulfur dioxide>=10.5 1294  892.13600 6.391036  
##       14) alcohol< 11.76667 629  430.11130 6.173291  
##         28) volatile acidity>=0.465 11   10.72727 4.545455 *
##         29) volatile acidity< 0.465 618  389.71680 6.202265 *
##       15) alcohol>=11.76667 665  403.99400 6.596992 *
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
rpart.plot(m.rpart, digits = 3)

rpart.plot(m.rpart, digits = 4, fallen.leaves = TRUE, type = 3,extra = 101)

#Performance

p.rpart<-predict(m.rpart, wine_test)
summary(p.rpart)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.545   5.563   5.971   5.893   6.202   6.597
summary(wine_test$quality)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.901   6.000   9.000
cor(p.rpart, wine_test$quality)
## [1] 0.5369525