library(readr)
## Warning: package 'readr' was built under R version 3.4.4
# Load the credit data with readr. As the parse messages below show, text
# columns come back as character (not factor) and the result is a tibble.
credit <- read_csv("C:/Users/Bkulkarni/credit.csv")
## Parsed with column specification:
## cols(
## checking_balance = col_character(),
## months_loan_duration = col_integer(),
## credit_history = col_character(),
## purpose = col_character(),
## amount = col_integer(),
## savings_balance = col_character(),
## employment_duration = col_character(),
## percent_of_income = col_integer(),
## years_at_residence = col_integer(),
## age = col_integer(),
## other_credit = col_character(),
## housing = col_character(),
## existing_loans_count = col_integer(),
## job = col_character(),
## dependents = col_integer(),
## phone = col_character(),
## default = col_character()
## )
# Inspect the column types and the first few values of each column.
str(credit)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 17 variables:
## $ checking_balance : chr "< 0 DM" "1 - 200 DM" "unknown" "< 0 DM" ...
## $ months_loan_duration: int 6 48 12 42 24 36 24 36 12 30 ...
## $ credit_history : chr "critical" "good" "critical" "good" ...
## $ purpose : chr "furniture/appliances" "furniture/appliances" "education" "furniture/appliances" ...
## $ amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ savings_balance : chr "unknown" "< 100 DM" "< 100 DM" "< 100 DM" ...
## $ employment_duration : chr "> 7 years" "1 - 4 years" "4 - 7 years" "4 - 7 years" ...
## $ percent_of_income : int 4 2 2 2 3 2 3 2 2 4 ...
## $ years_at_residence : int 4 2 3 4 4 4 4 2 4 2 ...
## $ age : int 67 22 49 45 53 35 53 35 61 28 ...
## $ other_credit : chr "none" "none" "none" "none" ...
## $ housing : chr "own" "own" "own" "other" ...
## $ existing_loans_count: int 2 1 1 1 2 1 1 1 1 2 ...
## $ job : chr "skilled" "skilled" "unskilled" "skilled" ...
## $ dependents : int 1 1 2 2 2 2 1 1 1 1 ...
## $ phone : chr "yes" "no" "no" "no" ...
## $ default : chr "no" "yes" "no" "no" ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 17
## .. ..$ checking_balance : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ months_loan_duration: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ credit_history : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ purpose : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ amount : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ savings_balance : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ employment_duration : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ percent_of_income : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ years_at_residence : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ age : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ other_credit : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ housing : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ existing_loans_count: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ job : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ dependents : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ phone : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ default : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Five-number summary (plus mean) of the loan amount.
summary(credit$amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
# Count of each outcome class in the `default` column.
table(credit$default)
##
## no yes
## 700 300
# Shuffle the rows reproducibly so the later positional train/test split is
# a random one.
set.seed(12345)
# One uniform draw per row, then reorder by those draws. The draw count is
# taken from nrow(credit) instead of a hard-coded 1000, so no rows are dropped
# or turned into NA rows if the file ever has a different length.
credit_rand <- credit[order(runif(nrow(credit))), ]
# Sanity check: shuffling only reorders rows, so the summary of `amount`
# must be the same before and after.
summary(credit$amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
summary(credit_rand$amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
# Train on the first 900 shuffled rows; hold out rows 901 to 1000 for testing.
credit_train <- credit_rand[seq_len(900), ]
credit_test <- credit_rand[901:1000, ]
# Proportion of each outcome class among the training rows.
prop.table(table(credit_train$default))
##
## no yes
## 0.7022222 0.2977778
# Proportion of each outcome class among the held-out test rows.
prop.table(table(credit_test$default))
##
## no yes
## 0.68 0.32
library(C50)
## Warning: package 'C50' was built under R version 3.4.4
# C5.0 needs the outcome as a factor.
credit_train$default <- as.factor(credit_train$default)
str(credit_train$default)
## Factor w/ 2 levels "no","yes": 2 1 1 1 2 1 2 2 1 1 ...
# read_csv() returns a tibble whose text columns are character vectors.
# Passing that straight to C5.0 makes the underlying C code abort with
# "bad value ... for attribute `checking_balance'". Give it a plain
# data.frame whose text predictors are factors instead.
credit_predictors <- as.data.frame(credit_train[-17])
is_text <- vapply(credit_predictors, is.character, logical(1))
credit_predictors[is_text] <- lapply(credit_predictors[is_text], factor)
# Column 17 (`default`) is the outcome, so it is excluded from the predictors.
credit_model <- C5.0(x = credit_predictors, y = credit_train$default)
## c50 code called exit with value 1
# Print the summary of the fitted C5.0 model object.
summary(credit_model)
##
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
##
##
## C5.0 [Release 2.07 GPL Edition] Mon May 28 20:01:30 2018
## -------------------------------
##
## Class specified by attribute `outcome'
##
## *** line 1 of `undefined.data': bad value of `c("< 0 DM", "1 - 200 DM", "1 - 200 DM", "< 0 DM", "1 - 200 DM", "unknown", "< 0 DM", "1 - 200 DM", "1 - 200 DM", "< 0 DM", "unknown", "> 200 DM", "> 200 DM", "1 - 200 DM", "unknown", "unknown", "unknown", "unknown", "unknown", "< 0 DM", "unknown", "< 0 DM", "> 200 DM", "unknown", "< 0 DM", "unknown", "> 200 DM", "1 - 200 DM", "unknown", "unknown", "unknown", "unknown", "< 0 DM", "1 - 200 DM", "unknown", "1 - 200 DM", "1 - 200 DM", "1 - 200 DM", "unknown", "< 0 DM", "unknown", "unknown", "< 0 DM",' for attribute `checking_balance'
##
## Error limit exceeded
library(readr)
# Load wisc_bc_data.csv with base read.csv(), keeping text columns as
# character rather than converting them to factors.
wbcd <- read.csv("C:/Users/Bkulkarni/wisc_bc_data.csv", stringsAsFactors = FALSE)
# Inspect the column types and the first few values of each column.
str(wbcd)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 87139402 8910251 905520 868871 9012568 906539 925291 87880 862989 89827 ...
## $ diagnosis : chr "B" "B" "B" "B" ...
## $ radius_mean : num 12.3 10.6 11 11.3 15.2 ...
## $ texture_mean : num 12.4 18.9 16.8 13.4 13.2 ...
## $ perimeter_mean : num 78.8 69.3 70.9 73 97.7 ...
## $ area_mean : num 464 346 373 385 712 ...
## $ smoothness_mean : num 0.1028 0.0969 0.1077 0.1164 0.0796 ...
## $ compactness_mean : num 0.0698 0.1147 0.078 0.1136 0.0693 ...
## $ concavity_mean : num 0.0399 0.0639 0.0305 0.0464 0.0339 ...
## $ points_mean : num 0.037 0.0264 0.0248 0.048 0.0266 ...
## $ symmetry_mean : num 0.196 0.192 0.171 0.177 0.172 ...
## $ dimension_mean : num 0.0595 0.0649 0.0634 0.0607 0.0554 ...
## $ radius_se : num 0.236 0.451 0.197 0.338 0.178 ...
## $ texture_se : num 0.666 1.197 1.387 1.343 0.412 ...
## $ perimeter_se : num 1.67 3.43 1.34 1.85 1.34 ...
## $ area_se : num 17.4 27.1 13.5 26.3 17.7 ...
## $ smoothness_se : num 0.00805 0.00747 0.00516 0.01127 0.00501 ...
## $ compactness_se : num 0.0118 0.03581 0.00936 0.03498 0.01485 ...
## $ concavity_se : num 0.0168 0.0335 0.0106 0.0219 0.0155 ...
## $ points_se : num 0.01241 0.01365 0.00748 0.01965 0.00915 ...
## $ symmetry_se : num 0.0192 0.035 0.0172 0.0158 0.0165 ...
## $ dimension_se : num 0.00225 0.00332 0.0022 0.00344 0.00177 ...
## $ radius_worst : num 13.5 11.9 12.4 11.9 16.2 ...
## $ texture_worst : num 15.6 22.9 26.4 15.8 15.7 ...
## $ perimeter_worst : num 87 78.3 79.9 76.5 104.5 ...
## $ area_worst : num 549 425 471 434 819 ...
## $ smoothness_worst : num 0.139 0.121 0.137 0.137 0.113 ...
## $ compactness_worst: num 0.127 0.252 0.148 0.182 0.174 ...
## $ concavity_worst : num 0.1242 0.1916 0.1067 0.0867 0.1362 ...
## $ points_worst : num 0.0939 0.0793 0.0743 0.0861 0.0818 ...
## $ symmetry_worst : num 0.283 0.294 0.3 0.21 0.249 ...
## $ dimension_worst : num 0.0677 0.0759 0.0788 0.0678 0.0677 ...
# Drop the first column (`id`); it is an identifier, not a feature.
wbcd <- wbcd[, -1]
# Count of each diagnosis code.
table(wbcd$diagnosis)
##
## B M
## 357 212
# Recode the diagnosis codes as a factor with readable labels
# (the "B" label was misspelled "Benigh"; it is now "Benign").
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"),
                         labels = c("Benign", "Malignant"))
# Percentage of each class, rounded to one decimal place.
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)
##
## Benign Malignant
## 62.7 37.3
# Min-max rescale a numeric vector onto the [0, 1] interval.
#
# x:     numeric vector.
# na.rm: if TRUE, missing values are ignored when finding the range and stay
#        NA in the result. The default FALSE keeps the original behaviour,
#        where any NA in x makes the whole result NA.
# Returns a numeric vector the same length as x. A constant vector has a
# range of zero; it is mapped to 0 rather than the NaN that 0 / 0 produces.
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # isTRUE() keeps an NA span (NA in x with na.rm = FALSE) out of this branch.
  if (isTRUE(span == 0)) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# Rescale each of the 30 feature columns (columns 2 to 31) and collect the
# results in a new data frame.
wbcd_n <- as.data.frame(lapply(wbcd[, 2:31], normalize))
# Check one rescaled column.
summary(wbcd_n[["area_mean"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1174 0.1729 0.2169 0.2711 1.0000
# Rows 1 to 469 train the classifier; rows 470 to 569 are held out.
wbcd_train <- wbcd_n[seq_len(469), ]
wbcd_test <- wbcd_n[470:569, ]
# Matching class labels, taken from column 1 (`diagnosis`) of the unscaled data.
wbcd_train_labels <- wbcd[seq_len(469), 1]
wbcd_test_labels <- wbcd[470:569, 1]
library(class)
# Classify each test row from its k = 21 nearest training rows.
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test,
                      cl = wbcd_train_labels, k = 21)
# Load the letter-recognition data with readr.
# NOTE(review): naming this object `letters` masks base R's built-in
# `letters` constant for the rest of the session.
letters <- read_csv("C:/Users/Bkulkarni/letterdata.csv")
## Parsed with column specification:
## cols(
## letter = col_character(),
## xbox = col_integer(),
## ybox = col_integer(),
## width = col_integer(),
## height = col_integer(),
## onpix = col_integer(),
## xbar = col_integer(),
## ybar = col_integer(),
## x2bar = col_integer(),
## y2bar = col_integer(),
## xybar = col_integer(),
## x2ybar = col_integer(),
## xy2bar = col_integer(),
## xedge = col_integer(),
## xedgey = col_integer(),
## yedge = col_integer(),
## yedgex = col_integer()
## )
# Inspect the column types and the first few values of each column.
str(letters)
## Classes 'tbl_df', 'tbl' and 'data.frame': 20000 obs. of 17 variables:
## $ letter: chr "T" "I" "D" "N" ...
## $ xbox : int 2 5 4 7 2 4 4 1 2 11 ...
## $ ybox : int 8 12 11 11 1 11 2 1 2 15 ...
## $ width : int 3 3 6 6 3 5 5 3 4 13 ...
## $ height: int 5 7 8 6 1 8 4 2 4 9 ...
## $ onpix : int 1 2 6 3 1 3 4 1 2 7 ...
## $ xbar : int 8 10 10 5 8 8 8 8 10 13 ...
## $ ybar : int 13 5 6 9 6 8 7 2 6 2 ...
## $ x2bar : int 0 5 2 4 6 6 6 2 2 6 ...
## $ y2bar : int 6 4 6 6 6 9 6 2 6 2 ...
## $ xybar : int 6 13 10 4 6 5 7 8 12 12 ...
## $ x2ybar: int 10 3 3 4 5 6 6 2 4 1 ...
## $ xy2bar: int 8 9 7 10 9 6 6 8 8 9 ...
## $ xedge : int 0 2 3 6 1 0 2 1 1 8 ...
## $ xedgey: int 8 8 7 10 7 8 8 6 6 1 ...
## $ yedge : int 0 4 3 2 5 9 7 2 1 1 ...
## $ yedgex: int 8 10 9 8 10 7 10 7 7 8 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 17
## .. ..$ letter: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ xbox : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ ybox : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ width : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ height: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ onpix : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ xbar : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ ybar : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ x2bar : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ y2bar : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ xybar : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ x2ybar: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ xy2bar: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ xedge : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ xedgey: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ yedge : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ yedgex: list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Rows 1 to 18000 train the model; rows 18001 to 20000 are held out.
letters_train <- letters[seq_len(18000), ]
letters_test <- letters[18001:20000, ]
library(kernlab)
## Warning: package 'kernlab' was built under R version 3.4.4
# Fit a support vector machine with a linear ("vanilladot") kernel that
# predicts `letter` from all remaining columns.
letter_classifier <- ksvm(letter ~ ., data = letters_train,
                          kernel = "vanilladot")
## Setting default kernel parameters
# Print the fitted model.
letter_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 7886
##
## Objective Function Value : -15.3458 -21.3403 -25.7672 -6.8685 -8.8812 -35.9555 -59.5883 -18.1975 -65.6075 -41.5654 -18.8559 -39.3558 -36.9961 -60.3052 -15.1694 -42.144 -35.0941 -19.4069 -15.8234 -38.6718 -33.3013 -8.5298 -12.4387 -38.2194 -14.3682 -9.5508 -165.7154 -53.2778 -79.2163 -134.5053 -184.4809 -58.9285 -46.3252 -81.004 -28.1341 -29.6955 -27.5983 -38.1764 -47.2889 -137.0497 -208.1396 -239.2616 -23.8945 -10.9655 -64.228 -12.2139 -55.7818 -10.8001 -21.2407 -11.1795 -121.5639 -33.2229 -267.3926 -81.0708 -9.4937 -4.6577 -161.5171 -86.7114 -20.9146 -16.8272 -86.6582 -16.7205 -30.3036 -20.0054 -26.2331 -29.9289 -56.1072 -11.6335 -5.2564 -14.8153 -4.983 -4.8171 -8.5044 -43.2267 -55.9 -214.755 -47.0748 -49.6539 -50.2278 -18.3767 -19.1813 -97.6132 -113.6502 -42.4112 -32.5859 -127.4807 -33.7418 -30.7568 -40.0953 -18.6792 -5.4826 -49.3916 -10.6142 -20.0286 -63.8287 -183.8297 -57.0671 -43.3721 -35.2783 -85.4451 -145.9585 -11.8002 -6.1194 -12.5323 -33.5245 -155.2248 -57.2602 -194.0785 -111.0155 -10.8207 -16.7926 -3.7766 -77.3561 -7.9004 -106.5759 -52.523 -107.0402 -78.0148 -74.4773 -24.8166 -13.2372 -7.8706 -27.2788 -13.2342 -280.2869 -32.7288 -25.9531 -149.5447 -153.8495 -10.0146 -40.8917 -6.7333 -65.2053 -72.818 -35.1252 -246.7046 -38.0738 -16.9126 -158.18 -184.0021 -50.8427 -28.7686 -164.5969 -97.8359 -386.1426 -160.3188 -181.8759 -38.3648 -37.2272 -60.116 -28.2074 -53.7383 -7.8729 -12.3159 -37.8942 -72.6434 -211.8342 -58.5023 -105.1605 -176.7259 -685.8994 -142.8147 -159.635 -366.9437 -37.6409 -73.1357 -175.1906 -131.2833 -41.1464 -77.8404 -57.8131 -8.6365 -251.3728 -14.0836 -36.5144 -2.2292 -6.1598 -16.8011 -26.5165 -67.19 -21.3366 -221.4815 -22.9219 -4.2616 -4.7901 -0.8263 -134.7538 -8.8843 -83.1109 -23.1019 -14.4251 -5.7337 -17.5244 -29.7925 -23.9243 -88.9084 -28.6719 -106.0564 -16.4981 -10.6486 -7.9315 -1.5742 -91.1706 -7.3819 -118.2628 -117.5543 -48.5606 -26.6093 -71.2968 -30.4913 -63.5712 -279.2921 -46.3025 -50.4912 -37.9431 -21.5243 -11.6202 -134.9023 
## -7.516 -5.8131 -10.1595 -13.6329 -27.0293 -25.7282 -151.8511 -39.0524 -105.4861 -34.2434 -15.7051 -10.2304 -3.6687 -98.2094 -7.4666 -15.2668 -75.1283 -116.5382 -16.6429 -14.9215 -55.1062 -3.0636 -8.4262 -93.6829 -38.1162 -123.1859 -4.9078 -9.1612 -1.3077 -102.9021 -23.1138 -8.5262 -57.2623 -3.4297 -20.9579 -78.2019 -50.3741 -62.3531 -6.4908 -21.9308 -2.3736 -84.3835 -126.3997 -114.8723 -26.4109 -21.5589 -61.6405 -34.9162 -66.3243 -25.1148 -6.7203 -4.6695 -65.3518 -39.7924 -67.3505 -36.2154 -10.9031 -62.2195 -14.9491 -24.3238 -65.0847 -4.9657 -64.2797 -278.2873 -14.6902 -13.9198 -18.2059 -9.8972 -78.2645 -17.454 -49.5929 -55.7786 -28.7673 -15.9476 -47.531 -17.4379 -71.0516 -5.6899 -6.2519 -97.5508 -3.8196 -7.0502 -1.1238 -147.6952 -28.2018 -414.2586 -32.3275 -35.1191 -4.9605 -90.2307 -151.3409 -90.0329 -27.9491 -42.4688 -12.5118 -26.4828 -2.0045 -62.195 -9.1662 -178.4616 -1.9406 -1.9871 -11.3982 -0.5214 -29.6136 -35.0449 -6.7569
## Training error : 0.1335
# Predict the letter for every held-out row.
letter_predictions <- predict(letter_classifier, newdata = letters_test)
# Cross-tabulate predicted letters (rows) against true letters (columns).
table(letter_predictions, letters_test$letter)
##
## letter_predictions A B C D E F G H I J K L M N O P Q R
## A 73 0 0 0 0 0 0 0 0 1 0 0 0 0 3 0 4 0
## B 0 61 0 3 2 0 1 1 0 0 1 1 0 0 0 2 0 1
## C 0 0 64 0 2 0 4 2 1 0 1 2 0 0 1 0 0 0
## D 2 1 0 67 0 0 1 3 3 2 1 2 0 3 4 2 1 2
## E 0 0 1 0 64 1 1 0 0 0 2 2 0 0 0 0 2 0
## F 0 0 0 0 0 70 1 1 4 0 0 0 0 0 0 5 1 0
## G 1 1 2 1 3 2 68 1 0 0 0 1 0 0 0 0 4 1
## H 0 0 0 1 0 1 0 46 0 2 3 1 1 1 9 0 0 5
## I 0 0 0 0 0 0 0 0 65 3 0 0 0 0 0 0 0 0
## J 0 1 0 0 0 1 0 0 3 61 0 0 0 0 1 0 0 0
## K 0 1 4 0 0 0 0 5 0 0 56 0 0 2 0 0 0 4
## L 0 0 0 0 1 0 0 1 0 0 0 63 0 0 0 0 0 0
## M 0 0 1 0 0 0 1 0 0 0 0 0 70 2 0 0 0 0
## N 0 0 0 0 0 0 0 0 0 0 0 0 0 77 0 0 0 1
## O 0 0 1 1 0 0 0 1 0 1 0 0 0 0 49 1 2 0
## P 0 0 0 0 0 3 0 0 0 0 0 0 0 0 2 69 0 0
## Q 0 0 0 0 0 0 3 1 0 0 0 2 0 0 2 1 52 0
## R 0 4 0 0 1 0 0 3 0 0 3 0 0 0 1 0 0 64
## S 0 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 6 0
## T 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0
## U 0 0 2 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## V 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
## W 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## X 0 1 0 0 1 0 0 1 0 0 1 4 0 0 0 0 0 1
## Y 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0
## Z 1 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0
##
## letter_predictions S T U V W X Y Z
## A 0 1 2 0 1 0 0 0
## B 3 0 0 0 0 0 0 0
## C 0 0 0 0 0 0 0 0
## D 0 0 0 0 0 0 1 0
## E 6 0 0 0 0 1 0 0
## F 2 0 0 1 0 0 2 0
## G 3 2 0 0 0 0 0 0
## H 0 3 0 2 0 0 1 0
## I 2 0 0 0 0 2 1 0
## J 1 0 0 0 0 1 0 4
## K 0 1 2 0 0 4 0 0
## L 0 0 0 0 0 0 0 0
## M 0 0 1 0 6 0 0 0
## N 0 0 1 0 2 0 0 0
## O 0 0 1 0 0 0 0 0
## P 0 0 0 0 0 0 1 0
## Q 1 0 0 0 0 0 0 0
## R 0 1 0 1 0 0 0 0
## S 47 1 0 0 0 1 0 6
## T 1 83 1 0 0 0 2 2
## U 0 0 83 0 0 0 0 0
## V 0 0 0 64 1 0 1 0
## W 0 0 0 3 59 0 0 0
## X 0 0 0 0 0 76 1 0
## Y 0 1 0 0 0 1 58 0
## Z 5 1 0 0 0 0 0 70
# TRUE where the predicted letter equals the true letter.
agreement <- letter_predictions == letters_test$letter
# Count of incorrect (FALSE) and correct (TRUE) predictions.
table(agreement)
## agreement
## FALSE TRUE
## 321 1679
# Load the SMS data with readr; both columns (`type`, `text`) are read as
# character, per the parse messages below.
sms_raw <- read_csv("C:/Users/Bkulkarni/sms_spam.csv")
## Parsed with column specification:
## cols(
## type = col_character(),
## text = col_character()
## )
# Inspect the column types and the first few values of each column.
str(sms_raw)
## Classes 'tbl_df', 'tbl' and 'data.frame': 5559 obs. of 2 variables:
## $ type: chr "ham" "ham" "ham" "spam" ...
## $ text: chr "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline "| __truncated__ ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 2
## .. ..$ type: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ text: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Show the first six messages with their labels.
head(sms_raw)
## # A tibble: 6 x 2
## type text
## <chr> <chr>
## 1 ham Hope you are having a good week. Just checking in
## 2 ham K..give back my thanks.
## 3 ham Am also doing in cbe only. But have to pay.
## 4 spam complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URG~
## 5 spam okmail: Dear Dave this is your final notice to collect your 4* Te~
## 6 ham Aiya we discuss later lar... Pick u up at 4 is it?
library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
# Build a corpus holding one document per SMS message.
sms_corpus <- Corpus(VectorSource(sms_raw$text))
print(sms_corpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5559
# The corpus used to be rebuilt here with an identical second Corpus() call;
# that repeat was redundant and has been dropped.
# Show the text of the first message.
strwrap(sms_corpus[[1]])
## [1] "Hope you are having a good week. Just checking in"
# Show messages 10 to 15 of the corpus as character strings.
lapply(sms_corpus[10:15], as.character)
## [[1]]
## [1] "fyi I'm at usf now, swing by the room whenever"
##
## [[2]]
## [1] "Sure thing big man. i have hockey elections at 6, shouldn€˜t go on longer than an hour though"
##
## [[3]]
## [1] "I anything lor..."
##
## [[4]]
## [1] "By march ending, i should be ready. But will call you for sure. The problem is that my capital never complete. How far with you. How's work and the ladies"
##
## [[5]]
## [1] "Hmm well, night night"
##
## [[6]]
## [1] "K I'll be sure to get up before noon and see what's what"
# Standardise the text: lower-case it, then strip digits, punctuation,
# stop words and surplus whitespace.
# tolower() is a base R function rather than a tm transformation, so it is
# wrapped in content_transformer(), as the tm documentation recommends. That
# keeps the documents valid whichever corpus class Corpus() returns.
corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
corpus_clean <- tm_map(corpus_clean, removeNumbers)
corpus_clean <- tm_map(corpus_clean, removePunctuation)
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords())
corpus_clean <- tm_map(corpus_clean, stripWhitespace)
# First message before cleaning, for comparison with the cleaned version.
strwrap(sms_corpus[[1]])
## [1] "Hope you are having a good week. Just checking in"
# First message after cleaning.
strwrap(corpus_clean[[1]])
## [1] "hope good week just checking"
# The former `tm_map(corpus_clean, PlainTextDocument)` step was removed.
# On a SimpleCorpus, tm_map() passes the whole vector of messages to the
# function at once, so PlainTextDocument() wrapped every message into a single
# document object instead of converting them one by one.
# NOTE(review): that is a likely cause of the degenerate all-"ham"
# predictions further down; confirm by re-running.
# Document-term matrix for the full corpus; cleaning is applied via `control`.
sms_dtm <- DocumentTermMatrix(
  sms_corpus,
  control = list(tolower = TRUE, removeNumbers = TRUE, stopwords = TRUE,
                 removePunctuation = TRUE)
)
# Rows 1 to 4169 are used for training, rows 4170 to 5559 for testing.
sms_raw_train <- sms_raw[1:4169, ]
sms_raw_test <- sms_raw[4170:5559, ]
sms_dtm_train <- sms_dtm[1:4169, ]
# Was 4160:5559, which overlapped the last ten training rows and gave a test
# matrix that did not line up with sms_raw_test.
sms_dtm_test <- sms_dtm[4170:5559, ]
sms_corpus_train <- corpus_clean[1:4169]
sms_corpus_test <- corpus_clean[4170:5559]
# Proportion of ham and spam in the training rows.
prop.table(table(sms_raw_train$type))
##
## ham spam
## 0.8647158 0.1352842
# Proportion of ham and spam in the test rows.
prop.table(table(sms_raw_test$type))
##
## ham spam
## 0.8683453 0.1316547
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.4
## Loading required package: RColorBrewer
# Word cloud of the training corpus: only words occurring at least 40 times
# are drawn, and they are placed in decreasing order of frequency rather than
# at random.
wordcloud(sms_corpus_train, min.freq = 40, random.order = FALSE)
##Step 3: Training a Model on the Data
# Terms that occur at least five times in the training matrix.
five_times_words <- findFreqTerms(sms_dtm_train, lowfreq = 5)
# Number of terms kept.
length(five_times_words)
## [1] 1257
# Turn term counts into a presence/absence factor.
#
# x: numeric vector of counts.
# Returns a factor with levels "No" (count not above zero) and "Yes" (count
# above zero); NA counts stay NA.
convert_count <- function(x) {
  term_present <- x > 0
  factor(term_present, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
}
# Rebuild the train and test matrices restricted to the frequent terms.
sms_train <- DocumentTermMatrix(
  sms_corpus_train,
  control = list(dictionary = five_times_words)
)
sms_test <- DocumentTermMatrix(
  sms_corpus_test,
  control = list(dictionary = five_times_words)
)
# Convert each term column from counts to "No"/"Yes" indicators.
sms_train <- apply(sms_train, MARGIN = 2, FUN = convert_count)
sms_test <- apply(sms_test, MARGIN = 2, FUN = convert_count)
library(naivebayes)
# Fit a naive Bayes classifier on the indicator features, with the message
# type as the class label.
sms_classifier <- naive_bayes(x = sms_train, y = factor(sms_raw_train$type))
# Confirm the class of the fitted object.
class(sms_classifier)
## [1] "naive_bayes"
# Predict a class for every test message.
sms_test_pred <- predict(sms_classifier, newdata = sms_test)
# Cross-tabulate predicted class (rows) against true class (columns).
table(sms_test_pred, sms_raw_test$type)
##
## sms_test_pred ham spam
## ham 1207 183
## spam 0 0
library(readr)
# Load the white wine data with readr; per the parse messages below, every
# column is numeric and `quality` is an integer.
wine <- read_csv("C:/Users/Bkulkarni/whitewines.csv")
## Parsed with column specification:
## cols(
## `fixed acidity` = col_double(),
## `volatile acidity` = col_double(),
## `citric acid` = col_double(),
## `residual sugar` = col_double(),
## chlorides = col_double(),
## `free sulfur dioxide` = col_double(),
## `total sulfur dioxide` = col_double(),
## density = col_double(),
## pH = col_double(),
## sulphates = col_double(),
## alcohol = col_double(),
## quality = col_integer()
## )
# Inspect the column types and the first few values of each column.
str(wine)
## Classes 'tbl_df', 'tbl' and 'data.frame': 4898 obs. of 12 variables:
## $ fixed acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
## $ volatile acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
## $ citric acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
## $ residual sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
## $ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
## $ free sulfur dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
## $ total sulfur dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
## $ density : num 0.993 0.999 0.995 0.991 0.993 ...
## $ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
## $ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
## $ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
## $ quality : int 5 6 6 4 6 6 6 6 6 7 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 12
## .. ..$ fixed acidity : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ volatile acidity : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ citric acid : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ residual sugar : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ chlorides : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ free sulfur dioxide : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ total sulfur dioxide: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ density : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ pH : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ sulphates : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ alcohol : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ quality : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Histogram of the quality scores.
hist(wine$quality)
##Step 2: Exploring and Preparing the Data
# Rows 1 to 3750 train the model; rows 3751 to 4898 are held out.
wine_train <- wine[seq_len(3750), ]
wine_test <- wine[3751:4898, ]
library(rpart)
# Regression tree predicting `quality` from all remaining columns.
m.rpart <- rpart(quality ~ ., data = wine_train)
# Print the fitted tree.
m.rpart
## n= 3750
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 3750 2945.53200 5.870933
## 2) alcohol< 10.85 2372 1418.86100 5.604975
## 4) volatile acidity>=0.2275 1611 821.30730 5.432030
## 8) volatile acidity>=0.3025 688 278.97670 5.255814 *
## 9) volatile acidity< 0.3025 923 505.04230 5.563380 *
## 5) volatile acidity< 0.2275 761 447.36400 5.971091 *
## 3) alcohol>=10.85 1378 1070.08200 6.328737
## 6) free sulfur dioxide< 10.5 84 95.55952 5.369048 *
## 7) free sulfur dioxide>=10.5 1294 892.13600 6.391036
## 14) alcohol< 11.76667 629 430.11130 6.173291
## 28) volatile acidity>=0.465 11 10.72727 4.545455 *
## 29) volatile acidity< 0.465 618 389.71680 6.202265 *
## 15) alcohol>=11.76667 665 403.99400 6.596992 *
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
# Default tree diagram, with numbers shown to three significant digits.
rpart.plot(m.rpart, digits = 3)
# Same tree with adjusted layout options.
rpart.plot(
  m.rpart,
  digits = 4,
  fallen.leaves = TRUE,
  type = 3,
  extra = 101
)
# Performance: predict quality for the held-out wines.
p.rpart <- predict(m.rpart, newdata = wine_test)
# Distribution of the predicted values.
summary(p.rpart)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.545 5.563 5.971 5.893 6.202 6.597
# Distribution of the true quality scores, for comparison with the
# predictions.
summary(wine_test$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.901 6.000 9.000
# Correlation between predicted and true quality.
cor(p.rpart, wine_test$quality)
## [1] 0.5369525