library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## 필요한 패키지를 로딩중입니다: ggplot2
## 필요한 패키지를 로딩중입니다: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## 다음의 패키지를 부착합니다: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Load the customer feature tables and the training labels (EUC-KR encoded).
x_train <- read.csv("X_train.csv", fileEncoding = "euc-kr")
x_test <- read.csv("X_test.csv", fileEncoding = "euc-kr")
y_train <- read.csv("y_train.csv", fileEncoding = "euc-kr")

# Attach the labels to the training features by customer id, and tag every
# row with the table it came from so the two can be stacked and split again.
train <- x_train %>%
  left_join(y_train, by = "cust_id") %>%
  mutate(index = "train")
test <- mutate(x_test, index = "test")

data <- bind_rows(train, test)
data %>% glimpse()
## Rows: 5,982
## Columns: 12
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ 총구매액 <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900…
## $ 최대구매액 <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,…
## $ 환불금액 <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,…
## $ 주구매상품 <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자…
## $ 주구매지점 <chr> "강남점", "잠실점", "관악점", "광주점", "본 점", "일산…
## $ 내점일수 <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152…
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666…
## $ 주말방문비율 <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000…
## $ 구매주기 <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, …
## $ gender <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0…
## $ index <chr> "train", "train", "train", "train", "train", "train", "…
# Number of missing values in each column of the stacked table
data %>% is.na() %>% colSums()
## cust_id 총구매액 최대구매액 환불금액 주구매상품
## 0 0 0 3906 0
## 주구매지점 내점일수 내점당구매건수 주말방문비율 구매주기
## 0 0 0 0 0
## gender index
## 2482 0
# Load the travel-insurance data and hold out roughly 25% of rows for testing.
df <- read.csv("travel_data.csv")
set.seed(1357) # fixed seed so the partition below is repeatable
# NOTE(review): TravelInsurance is read as an integer column, so
# createDataPartition() handles it as a numeric outcome - confirm whether a
# class-stratified split is wanted (pass a factor if so).
train_list <- createDataPartition(
  y = df$TravelInsurance,
  p = 0.75,
  list = FALSE
)
df_train <- df[train_list, ]
df_test <- df[-train_list, ]
df_train %>% glimpse()
## Rows: 1,491
## Columns: 10
## $ INDEX <int> 2, 3, 4, 7, 8, 9, 10, 11, 13, 16, 17, 18, 19, 20, …
## $ Age <int> 34, 28, 28, 31, 28, 33, 31, 26, 31, 28, 28, 29, 34…
## $ Employment.Type <chr> "Private Sector/Self Employed", "Private Sector/Se…
## $ GraduateOrNot <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "…
## $ AnnualIncome <int> 500000, 700000, 700000, 1350000, 1450000, 800000, …
## $ FamilyMembers <int> 4, 3, 8, 3, 6, 3, 9, 5, 6, 4, 7, 5, 2, 6, 3, 4, 9,…
## $ ChronicDiseases <int> 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,…
## $ FrequentFlyer <chr> "No", "No", "Yes", "Yes", "Yes", "Yes", "No", "Yes…
## $ EverTravelledAbroad <chr> "No", "No", "No", "Yes", "Yes", "No", "No", "Yes",…
## $ TravelInsurance <int> 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,…
# Structure of the held-out partition
df_test %>% glimpse()
## Rows: 496
## Columns: 10
## $ INDEX <int> 0, 1, 5, 6, 12, 14, 15, 27, 33, 37, 38, 39, 43, 46…
## $ Age <int> 31, 31, 25, 31, 32, 31, 34, 28, 32, 34, 34, 33, 28…
## $ Employment.Type <chr> "Government Sector", "Private Sector/Self Employed…
## $ GraduateOrNot <chr> "Yes", "Yes", "No", "Yes", "Yes", "Yes", "Yes", "Y…
## $ AnnualIncome <int> 400000, 1250000, 1150000, 1300000, 850000, 400000,…
## $ FamilyMembers <int> 6, 7, 4, 4, 6, 3, 7, 2, 3, 4, 2, 5, 4, 4, 3, 3, 9,…
## $ ChronicDiseases <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ FrequentFlyer <chr> "No", "No", "No", "No", "No", "No", "No", "Yes", "…
## $ EverTravelledAbroad <chr> "No", "No", "No", "No", "No", "No", "No", "No", "N…
## $ TravelInsurance <int> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,…
# Missing values per column in the training partition
df_train %>% is.na() %>% colSums()
## INDEX Age Employment.Type GraduateOrNot
## 0 0 0 0
## AnnualIncome FamilyMembers ChronicDiseases FrequentFlyer
## 0 0 0 0
## EverTravelledAbroad TravelInsurance
## 0 0
# Missing values per column in the test partition
df_test %>% is.na() %>% colSums()
## INDEX Age Employment.Type GraduateOrNot
## 0 0 0 0
## AnnualIncome FamilyMembers ChronicDiseases FrequentFlyer
## 0 0 0 0
## EverTravelledAbroad TravelInsurance
## 0 0
# Tag each partition with its origin and turn the outcome into a factor,
# then stack both partitions so the preprocessing is applied to them together.
df_train <- df_train %>%
  mutate(index = "train", TravelInsurance = as.factor(TravelInsurance))
df_test <- df_test %>%
  mutate(index = "test", TravelInsurance = as.factor(TravelInsurance))

data <- bind_rows(df_train, df_test)
data %>% is.na() %>% colSums()
## INDEX Age Employment.Type GraduateOrNot
## 0 0 0 0
## AnnualIncome FamilyMembers ChronicDiseases FrequentFlyer
## 0 0 0 0
## EverTravelledAbroad TravelInsurance index
## 0 0 0
# Relabel the outcome (0 -> '미가입', anything else -> '가입') and convert
# the Yes/No text columns to factors.
data <- data %>%
  mutate(
    TravelInsurance = as.factor(ifelse(TravelInsurance == 0, "미가입", "가입")),
    across(c(GraduateOrNot, FrequentFlyer, EverTravelledAbroad), as.factor)
  )

# Undo the stacking: recover the two partitions and drop the helper column.
train <- data %>%
  filter(index == "train") %>%
  select(-index)
test <- data %>%
  filter(index == "test") %>%
  select(-index)

summary(train)
## INDEX Age Employment.Type GraduateOrNot
## Min. : 2.0 Min. :25.00 Length:1491 No : 220
## 1st Qu.: 500.5 1st Qu.:28.00 Class :character Yes:1271
## Median :1016.0 Median :29.00 Mode :character
## Mean :1002.9 Mean :29.62
## 3rd Qu.:1502.5 3rd Qu.:32.00
## Max. :1985.0 Max. :35.00
## AnnualIncome FamilyMembers ChronicDiseases FrequentFlyer
## Min. : 300000 Min. :2.000 Min. :0.0000 No :1174
## 1st Qu.: 600000 1st Qu.:4.000 1st Qu.:0.0000 Yes: 317
## Median : 900000 Median :5.000 Median :0.0000
## Mean : 930550 Mean :4.728 Mean :0.2763
## 3rd Qu.:1250000 3rd Qu.:6.000 3rd Qu.:1.0000
## Max. :1800000 Max. :9.000 Max. :1.0000
## EverTravelledAbroad TravelInsurance
## No :1212 가입 :530
## Yes: 279 미가입:961
##
##
##
##
# Structure of the preprocessed training set
train %>% glimpse()
## Rows: 1,491
## Columns: 10
## $ INDEX <int> 2, 3, 4, 7, 8, 9, 10, 11, 13, 16, 17, 18, 19, 20, …
## $ Age <int> 34, 28, 28, 31, 28, 33, 31, 26, 31, 28, 28, 29, 34…
## $ Employment.Type <chr> "Private Sector/Self Employed", "Private Sector/Se…
## $ GraduateOrNot <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, …
## $ AnnualIncome <int> 500000, 700000, 700000, 1350000, 1450000, 800000, …
## $ FamilyMembers <int> 4, 3, 8, 3, 6, 3, 9, 5, 6, 4, 7, 5, 2, 6, 3, 4, 9,…
## $ ChronicDiseases <int> 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,…
## $ FrequentFlyer <fct> No, No, Yes, Yes, Yes, Yes, No, Yes, Yes, No, No, …
## $ EverTravelledAbroad <fct> No, No, No, Yes, Yes, No, No, Yes, Yes, No, No, No…
## $ TravelInsurance <fct> 가입, 미가입, 미가입, 가입, 가입, 미가입, 미가입, …
# Structure of the preprocessed test set
test %>% glimpse()
## Rows: 496
## Columns: 10
## $ INDEX <int> 0, 1, 5, 6, 12, 14, 15, 27, 33, 37, 38, 39, 43, 46…
## $ Age <int> 31, 31, 25, 31, 32, 31, 34, 28, 32, 34, 34, 33, 28…
## $ Employment.Type <chr> "Government Sector", "Private Sector/Self Employed…
## $ GraduateOrNot <fct> Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Y…
## $ AnnualIncome <int> 400000, 1250000, 1150000, 1300000, 850000, 400000,…
## $ FamilyMembers <int> 6, 7, 4, 4, 6, 3, 7, 2, 3, 4, 2, 5, 4, 4, 3, 3, 9,…
## $ ChronicDiseases <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ FrequentFlyer <fct> No, No, No, No, No, No, No, Yes, No, No, No, Yes, …
## $ EverTravelledAbroad <fct> No, No, No, No, No, No, No, No, No, No, No, Yes, N…
## $ TravelInsurance <fct> 미가입, 미가입, 미가입, 미가입, 가입, 미가입, 미가…
# Missing values per column in the preprocessed training set
train %>% is.na() %>% colSums()
## INDEX Age Employment.Type GraduateOrNot
## 0 0 0 0
## AnnualIncome FamilyMembers ChronicDiseases FrequentFlyer
## 0 0 0 0
## EverTravelledAbroad TravelInsurance
## 0 0
# Missing values per column in the preprocessed test set
test %>% is.na() %>% colSums()
## INDEX Age Employment.Type GraduateOrNot
## 0 0 0 0
## AnnualIncome FamilyMembers ChronicDiseases FrequentFlyer
## 0 0 0 0
## EverTravelledAbroad TravelInsurance
## 0 0
# Fit a k-nearest-neighbours classifier for TravelInsurance on standardised
# predictors, using cross-validation with ROC as the selection metric.
# NOTE(review): the formula `~ .` uses every other column of `train`,
# including the INDEX row identifier - confirm that is intended.
model <- train(
  TravelInsurance ~ .,
  data = train,
  method = "knn",
  metric = "ROC",
  preProcess = c("scale", "center"),
  trControl = trainControl(
    method = "cv",
    summaryFunction = twoClassSummary,
    classProbs = TRUE
  )
)
model
## k-Nearest Neighbors
##
## 1491 samples
## 9 predictor
## 2 classes: '가입', '미가입'
##
## Pre-processing: scaled (9), centered (9)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1342, 1342, 1342, 1342, 1342, 1342, ...
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 5 0.7312324 0.5264151 0.8876503
## 7 0.7470022 0.5264151 0.9094931
## 9 0.7524415 0.5226415 0.9209729
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
# Class probabilities for the held-out rows (one column per outcome level)
result <- predict(model, newdata = test, type = "prob")
# Confusion matrix averaged over the cross-validation resamples
model %>% confusionMatrix()
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction 가입 미가입
## 가입 18.6 5.1
## 미가입 17.0 59.4
##
## Accuracy (average) : 0.7793
# Rename the first probability column to y_pred, then keep only the row
# identifier, that probability and the true label for inspection.
colnames(result)[1] <- "y_pred"
df <- test %>%
  bind_cols(result) %>%
  select(INDEX, y_pred, TravelInsurance)
print(df)
## INDEX y_pred TravelInsurance
## 1 0 0.2222222 미가입
## 2 1 0.1111111 미가입
## 3 5 0.2222222 미가입
## 4 6 0.0000000 미가입
## 5 12 0.3333333 가입
## 6 14 0.1111111 미가입
## 7 15 0.6666667 미가입
## 8 27 0.2222222 미가입
## 9 33 0.1111111 미가입
## 10 37 0.5555556 미가입
## 11 38 0.1111111 미가입
## 12 39 0.8888889 가입
## 13 43 0.1111111 가입
## 14 46 0.8888889 가입
## 15 48 0.2222222 미가입
## 16 56 0.2222222 가입
## 17 64 0.7777778 가입
## 18 65 0.2222222 미가입
## 19 72 0.2222222 미가입
## 20 74 0.1111111 미가입
## 21 83 0.5555556 가입
## 22 85 0.7777778 가입
## 23 92 0.1111111 미가입
## 24 94 0.2222222 미가입
## 25 97 0.3333333 미가입
## 26 99 0.2222222 미가입
## 27 101 0.7777778 가입
## 28 105 0.2222222 미가입
## 29 106 0.3333333 미가입
## 30 110 0.0000000 미가입
## 31 111 0.0000000 미가입
## 32 112 0.1111111 미가입
## 33 125 0.2222222 가입
## 34 127 0.1111111 미가입
## 35 128 0.6666667 가입
## 36 133 0.8888889 가입
## 37 135 0.3333333 가입
## 38 140 0.1111111 미가입
## 39 154 0.0000000 미가입
## 40 155 0.4444444 가입
## 41 158 0.1111111 가입
## 42 161 0.5555556 가입
## 43 162 0.5555556 가입
## 44 171 0.3333333 미가입
## 45 176 0.5555556 가입
## 46 178 0.1111111 미가입
## 47 183 0.5555556 미가입
## 48 184 0.2222222 미가입
## 49 187 0.3333333 미가입
## 50 188 0.3333333 미가입
## 51 195 0.8888889 가입
## 52 198 0.4444444 가입
## 53 200 0.1111111 가입
## 54 210 0.3333333 미가입
## 55 212 0.3333333 미가입
## 56 213 0.8888889 가입
## 57 222 0.2222222 미가입
## 58 223 0.1111111 미가입
## 59 224 0.1111111 미가입
## 60 233 0.4444444 미가입
## 61 234 0.2222222 미가입
## 62 238 0.2222222 미가입
## 63 242 0.1111111 미가입
## 64 243 0.2222222 미가입
## 65 249 0.1111111 미가입
## 66 250 0.5555556 가입
## 67 257 0.4444444 가입
## 68 266 0.0000000 미가입
## 69 267 0.1111111 가입
## 70 269 0.1111111 미가입
## 71 270 0.6666667 가입
## 72 272 0.2222222 미가입
## 73 273 0.4444444 가입
## 74 283 0.0000000 미가입
## 75 286 0.2222222 미가입
## 76 298 0.8888889 가입
## 77 300 0.5555556 가입
## 78 301 0.1111111 미가입
## 79 306 0.3333333 가입
## 80 309 0.0000000 가입
## 81 312 0.5555556 미가입
## 82 320 0.4444444 미가입
## 83 321 0.2222222 미가입
## 84 323 0.6666667 가입
## 85 324 0.1111111 미가입
## 86 334 0.2222222 가입
## 87 337 0.1111111 미가입
## 88 339 0.1111111 미가입
## 89 343 0.2222222 미가입
## 90 347 0.3333333 미가입
## 91 352 0.6666667 가입
## 92 356 0.3333333 미가입
## 93 359 0.2222222 미가입
## 94 361 0.6666667 가입
## 95 362 0.2222222 미가입
## 96 365 0.7777778 가입
## 97 366 0.2222222 미가입
## 98 369 0.6666667 미가입
## 99 372 0.2222222 미가입
## 100 378 0.8888889 가입
## 101 392 0.3333333 가입
## 102 394 0.0000000 가입
## 103 403 0.2222222 미가입
## 104 404 0.4444444 가입
## 105 414 0.7777778 가입
## 106 422 0.8888889 가입
## 107 423 1.0000000 가입
## 108 425 0.2222222 미가입
## 109 428 0.2222222 미가입
## 110 429 0.1111111 미가입
## 111 430 0.2222222 가입
## 112 431 0.2222222 미가입
## 113 443 0.2222222 가입
## 114 449 0.1111111 가입
## 115 456 0.8888889 가입
## 116 463 0.1111111 미가입
## 117 464 1.0000000 가입
## 118 468 0.2222222 미가입
## 119 469 0.8888889 가입
## 120 470 0.2222222 미가입
## 121 472 0.0000000 미가입
## 122 473 0.7777778 가입
## 123 479 0.3333333 미가입
## 124 482 0.6666667 미가입
## 125 484 0.3333333 가입
## 126 486 0.3333333 미가입
## 127 490 0.2222222 미가입
## 128 499 0.0000000 미가입
## 129 505 0.2222222 미가입
## 130 512 0.1111111 미가입
## 131 515 0.1111111 미가입
## 132 518 0.8888889 가입
## 133 528 0.2222222 미가입
## 134 529 0.1111111 미가입
## 135 530 0.1111111 미가입
## 136 532 0.2222222 가입
## 137 533 0.1111111 미가입
## 138 534 0.0000000 가입
## 139 536 0.1111111 미가입
## 140 538 0.3333333 미가입
## 141 547 0.7777778 가입
## 142 549 0.0000000 미가입
## 143 550 0.3333333 미가입
## 144 551 0.7777778 가입
## 145 553 1.0000000 미가입
## 146 554 0.3333333 미가입
## 147 559 0.1111111 가입
## 148 561 0.3333333 미가입
## 149 562 0.2222222 미가입
## 150 563 0.2222222 가입
## 151 565 0.1111111 미가입
## 152 570 0.3333333 미가입
## 153 584 0.1111111 미가입
## 154 586 0.0000000 미가입
## 155 593 0.3333333 미가입
## 156 594 0.5555556 미가입
## 157 598 0.6666667 가입
## 158 600 0.8888889 가입
## 159 606 0.2222222 미가입
## 160 610 0.4444444 미가입
## 161 611 0.2222222 미가입
## 162 614 0.3333333 미가입
## 163 615 0.3333333 미가입
## 164 616 0.0000000 미가입
## 165 617 0.2222222 가입
## 166 619 0.2222222 가입
## 167 623 0.1111111 미가입
## 168 625 0.2222222 미가입
## 169 628 0.2222222 미가입
## 170 633 0.2222222 미가입
## 171 642 0.0000000 미가입
## 172 654 0.3333333 미가입
## 173 655 0.2222222 미가입
## 174 659 0.3333333 미가입
## 175 662 0.2222222 미가입
## 176 667 0.7777778 미가입
## 177 675 0.1111111 미가입
## 178 676 0.0000000 미가입
## 179 677 0.1111111 미가입
## 180 681 0.1111111 미가입
## 181 682 0.8888889 가입
## 182 688 0.2222222 미가입
## 183 689 0.0000000 미가입
## 184 690 0.7777778 미가입
## 185 698 0.8888889 가입
## 186 699 0.3333333 미가입
## 187 703 0.0000000 미가입
## 188 708 0.2222222 미가입
## 189 713 0.2222222 가입
## 190 717 0.2222222 미가입
## 191 719 0.0000000 미가입
## 192 724 0.3333333 미가입
## 193 731 0.2222222 미가입
## 194 738 0.1111111 미가입
## 195 741 0.5555556 가입
## 196 742 0.4444444 가입
## 197 744 0.4444444 미가입
## 198 746 0.3333333 미가입
## 199 749 0.6666667 가입
## 200 750 0.2222222 미가입
## 201 752 0.2222222 가입
## 202 754 0.1111111 미가입
## 203 756 0.0000000 미가입
## 204 758 0.7777778 가입
## 205 759 0.5555556 가입
## 206 761 1.0000000 가입
## 207 771 0.3333333 가입
## 208 775 0.1111111 미가입
## 209 778 0.1111111 미가입
## 210 779 1.0000000 가입
## 211 781 0.2222222 가입
## 212 784 0.8888889 가입
## 213 791 0.3333333 가입
## 214 795 0.1111111 미가입
## 215 801 0.1111111 가입
## 216 804 0.3333333 미가입
## 217 805 0.2222222 미가입
## 218 807 0.3333333 가입
## 219 819 0.2222222 미가입
## 220 822 0.2222222 미가입
## 221 823 0.2222222 미가입
## 222 830 0.5555556 가입
## 223 833 0.7777778 가입
## 224 837 0.1111111 미가입
## 225 853 0.4444444 미가입
## 226 856 0.6666667 가입
## 227 859 0.3333333 미가입
## 228 863 0.2222222 미가입
## 229 865 0.8888889 가입
## 230 876 0.1111111 미가입
## 231 879 0.3333333 가입
## 232 885 0.0000000 미가입
## 233 886 0.8888889 가입
## 234 889 0.3333333 미가입
## 235 892 0.3333333 미가입
## 236 893 0.1111111 미가입
## 237 894 0.3333333 가입
## 238 896 0.0000000 미가입
## 239 902 0.6666667 가입
## 240 903 0.3333333 미가입
## 241 904 0.0000000 미가입
## 242 909 0.7777778 가입
## 243 913 1.0000000 가입
## 244 924 0.1111111 미가입
## 245 926 0.3333333 가입
## 246 927 0.8888889 가입
## 247 938 0.3333333 가입
## 248 939 0.2222222 가입
## 249 947 0.2222222 미가입
## 250 948 0.1111111 미가입
## 251 952 1.0000000 가입
## 252 959 0.2222222 미가입
## 253 965 0.6666667 미가입
## 254 968 0.1111111 미가입
## 255 973 1.0000000 가입
## 256 975 0.6666667 가입
## 257 978 0.2222222 미가입
## 258 981 0.2222222 미가입
## 259 982 0.2222222 미가입
## 260 983 0.6666667 가입
## 261 988 0.4444444 미가입
## 262 994 0.4444444 미가입
## 263 995 0.2222222 미가입
## 264 999 0.5555556 가입
## 265 1000 0.1111111 미가입
## 266 1003 0.1111111 미가입
## 267 1004 0.0000000 미가입
## 268 1005 0.2222222 미가입
## 269 1006 0.2222222 미가입
## 270 1011 0.4444444 가입
## 271 1013 0.1111111 미가입
## 272 1017 0.4444444 미가입
## 273 1019 0.5555556 가입
## 274 1022 0.1111111 미가입
## 275 1031 0.2222222 미가입
## 276 1037 0.2222222 가입
## 277 1044 0.5555556 가입
## 278 1046 0.3333333 미가입
## 279 1050 0.5555556 미가입
## 280 1053 0.1111111 가입
## 281 1055 1.0000000 가입
## 282 1057 0.3333333 미가입
## 283 1063 0.2222222 미가입
## 284 1067 0.1111111 가입
## 285 1079 0.0000000 미가입
## 286 1080 0.1111111 미가입
## 287 1082 0.2222222 미가입
## 288 1087 0.5555556 미가입
## 289 1090 0.1111111 미가입
## 290 1092 0.6666667 가입
## 291 1095 0.6666667 가입
## 292 1098 0.1111111 미가입
## 293 1106 0.1111111 미가입
## 294 1110 1.0000000 미가입
## 295 1111 0.2222222 미가입
## 296 1112 0.2222222 미가입
## 297 1135 0.1111111 가입
## 298 1140 0.7777778 미가입
## 299 1146 0.2222222 미가입
## 300 1147 1.0000000 가입
## 301 1150 0.3333333 가입
## 302 1151 0.7777778 가입
## 303 1153 0.2222222 미가입
## 304 1156 0.1111111 가입
## 305 1158 0.1111111 미가입
## 306 1162 0.3333333 가입
## 307 1169 0.7777778 가입
## 308 1172 0.3333333 가입
## 309 1173 0.5555556 가입
## 310 1175 0.1111111 미가입
## 311 1182 0.2222222 미가입
## 312 1186 0.2222222 미가입
## 313 1188 0.1111111 미가입
## 314 1190 0.1111111 미가입
## 315 1193 0.1111111 미가입
## 316 1195 0.0000000 미가입
## 317 1198 0.0000000 미가입
## 318 1202 1.0000000 가입
## 319 1206 0.1111111 가입
## 320 1207 0.2222222 미가입
## 321 1209 0.8888889 가입
## 322 1210 0.3333333 가입
## 323 1212 0.3333333 미가입
## 324 1213 0.3333333 가입
## 325 1219 0.3333333 가입
## 326 1230 0.6666667 가입
## 327 1235 0.3333333 미가입
## 328 1245 1.0000000 가입
## 329 1247 0.1111111 미가입
## 330 1257 0.4444444 미가입
## 331 1263 0.1111111 미가입
## 332 1264 0.2222222 미가입
## 333 1271 0.6666667 가입
## 334 1272 0.2222222 가입
## 335 1274 0.0000000 미가입
## 336 1275 0.1111111 미가입
## 337 1282 1.0000000 가입
## 338 1284 0.2222222 미가입
## 339 1297 0.1111111 미가입
## 340 1302 0.4444444 가입
## 341 1308 0.2222222 미가입
## 342 1309 1.0000000 가입
## 343 1319 0.1111111 미가입
## 344 1335 0.1111111 미가입
## 345 1337 0.2222222 미가입
## 346 1346 0.3333333 미가입
## 347 1349 1.0000000 가입
## 348 1352 0.0000000 미가입
## 349 1354 0.7777778 가입
## 350 1358 0.8888889 가입
## 351 1366 0.1111111 미가입
## 352 1369 0.5555556 미가입
## 353 1374 0.0000000 미가입
## 354 1379 0.5555556 미가입
## 355 1381 0.8888889 가입
## 356 1383 0.8888889 가입
## 357 1385 0.1111111 미가입
## 358 1389 0.2222222 가입
## 359 1391 1.0000000 미가입
## 360 1400 0.1111111 미가입
## 361 1401 0.0000000 미가입
## 362 1407 0.3333333 미가입
## 363 1414 0.3333333 미가입
## 364 1416 0.4444444 미가입
## 365 1418 0.3333333 가입
## 366 1420 0.2222222 미가입
## 367 1421 0.7777778 가입
## 368 1427 0.2222222 가입
## 369 1428 0.7000000 가입
## 370 1431 0.2222222 가입
## 371 1440 0.3333333 미가입
## 372 1449 0.1111111 미가입
## 373 1459 0.8888889 가입
## 374 1460 0.1111111 미가입
## 375 1462 0.4444444 미가입
## 376 1464 0.1111111 미가입
## 377 1466 0.1111111 미가입
## 378 1469 0.3333333 미가입
## 379 1472 0.1111111 미가입
## 380 1479 0.8888889 가입
## 381 1487 0.1111111 미가입
## 382 1491 1.0000000 가입
## 383 1493 0.5555556 미가입
## 384 1496 0.2222222 미가입
## 385 1501 0.3333333 미가입
## 386 1506 0.1111111 미가입
## 387 1510 0.1111111 미가입
## 388 1511 0.1111111 미가입
## 389 1516 0.3333333 미가입
## 390 1523 0.1111111 미가입
## 391 1527 0.4444444 가입
## 392 1537 0.2222222 가입
## 393 1538 0.1111111 미가입
## 394 1540 0.3333333 미가입
## 395 1541 0.3333333 미가입
## 396 1545 0.7777778 가입
## 397 1549 0.2222222 미가입
## 398 1555 0.7777778 가입
## 399 1556 1.0000000 가입
## 400 1559 0.0000000 미가입
## 401 1566 0.8888889 가입
## 402 1568 0.8888889 미가입
## 403 1572 0.5555556 미가입
## 404 1585 0.7777778 가입
## 405 1588 0.0000000 미가입
## 406 1589 0.8888889 가입
## 407 1607 0.1111111 가입
## 408 1615 0.3333333 미가입
## 409 1617 0.2222222 미가입
## 410 1618 0.0000000 미가입
## 411 1620 0.2222222 미가입
## 412 1630 0.0000000 가입
## 413 1640 0.0000000 미가입
## 414 1647 0.1111111 미가입
## 415 1648 0.2222222 미가입
## 416 1650 0.6666667 가입
## 417 1651 1.0000000 가입
## 418 1656 0.1111111 미가입
## 419 1658 0.6666667 미가입
## 420 1661 0.2222222 미가입
## 421 1662 0.1111111 미가입
## 422 1670 0.6666667 가입
## 423 1671 0.6666667 가입
## 424 1675 0.4444444 가입
## 425 1678 1.0000000 가입
## 426 1689 0.0000000 미가입
## 427 1691 1.0000000 가입
## 428 1692 0.2222222 미가입
## 429 1714 0.1111111 미가입
## 430 1722 0.7777778 가입
## 431 1727 0.2222222 미가입
## 432 1732 1.0000000 미가입
## 433 1734 0.1111111 미가입
## 434 1736 0.6666667 가입
## 435 1738 0.2222222 미가입
## 436 1742 0.2222222 미가입
## 437 1747 1.0000000 가입
## 438 1750 0.1111111 미가입
## 439 1754 0.6666667 가입
## 440 1756 0.2222222 가입
## 441 1764 0.4444444 미가입
## 442 1765 0.3333333 가입
## 443 1767 0.4444444 가입
## 444 1769 0.2222222 미가입
## 445 1770 0.1111111 미가입
## 446 1773 0.3333333 미가입
## 447 1774 0.1111111 미가입
## 448 1785 0.5555556 미가입
## 449 1791 0.1111111 미가입
## 450 1793 1.0000000 가입
## 451 1798 1.0000000 가입
## 452 1804 0.2222222 미가입
## 453 1806 0.0000000 가입
## 454 1809 0.2222222 미가입
## 455 1812 0.4444444 미가입
## 456 1813 1.0000000 가입
## 457 1818 0.1111111 미가입
## 458 1827 0.3333333 가입
## 459 1828 0.1111111 미가입
## 460 1831 0.0000000 미가입
## 461 1834 0.1111111 미가입
## 462 1835 0.0000000 미가입
## 463 1848 0.0000000 미가입
## 464 1858 0.1111111 가입
## 465 1863 0.4444444 미가입
## 466 1865 0.8888889 가입
## 467 1873 0.4444444 미가입
## 468 1882 0.7777778 가입
## 469 1886 0.3333333 미가입
## 470 1887 0.0000000 미가입
## 471 1893 0.3333333 미가입
## 472 1896 1.0000000 가입
## 473 1899 0.0000000 미가입
## 474 1901 0.2222222 미가입
## 475 1909 0.0000000 미가입
## 476 1912 0.4444444 미가입
## 477 1918 0.8888889 가입
## 478 1920 0.1111111 미가입
## 479 1921 0.0000000 미가입
## 480 1922 0.4444444 미가입
## 481 1930 0.2222222 미가입
## 482 1937 0.7777778 미가입
## 483 1938 0.6666667 미가입
## 484 1940 0.8888889 미가입
## 485 1942 0.0000000 미가입
## 486 1949 0.4444444 가입
## 487 1951 0.4444444 미가입
## 488 1953 0.4444444 가입
## 489 1957 0.1111111 미가입
## 490 1958 0.4444444 미가입
## 491 1959 0.3333333 미가입
## 492 1975 0.7777778 가입
## 493 1976 0.2222222 미가입
## 494 1980 0.1111111 미가입
## 495 1983 0.8888889 미가입
## 496 1986 0.4444444 미가입
# part4 3: customer segmentation data, supplied as separate train/test files
# NOTE(review): the recorded test glimpse shows empty-string Profession
# values; read.csv() keeps those as "" rather than NA, so the is.na() checks
# further down will not count them - confirm how they should be handled.
train <- read.csv("Insurance_train_10.csv")
test <- read.csv("Insurance_test_10.csv")
train %>% glimpse()
## Rows: 6,969
## Columns: 9
## $ Gender <chr> "Male", "Female", "Male", "Male", "Male", "Female", "F…
## $ Ever_Married <chr> "No", "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", "…
## $ Age <int> 22, 67, 67, 56, 32, 33, 61, 55, 26, 19, 58, 41, 32, 31…
## $ Graduated <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "Yes", …
## $ Profession <chr> "Healthcare", "Engineer", "Lawyer", "Artist", "Healthc…
## $ Work_Experience <int> 1, 1, 0, 0, 1, 1, 0, 1, 1, 4, 0, 1, 9, 1, 1, 0, 12, 3,…
## $ Spending_Score <chr> "Low", "Low", "High", "Average", "Low", "Low", "Low", …
## $ Family_Size <int> 4, 1, 2, 2, 3, 3, 3, 4, 3, 4, 1, 2, 5, 6, 4, 1, 1, 4, …
## $ Segmentation <int> 4, 2, 2, 3, 3, 4, 4, 3, 1, 4, 2, 3, 4, 2, 2, 3, 1, 4, …
# Structure of the segmentation test file
test %>% glimpse()
## Rows: 2,267
## Columns: 9
## $ X <int> 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ Gender <chr> "Female", "Male", "Female", "Male", "Male", "Male", "F…
## $ Ever_Married <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"…
## $ Age <int> 36, 37, 69, 59, 47, 61, 47, 50, 19, 22, 22, 50, 27, 18…
## $ Graduated <chr> "Yes", "Yes", "No", "No", "Yes", "Yes", "Yes", "Yes", …
## $ Profession <chr> "Engineer", "Healthcare", "", "Executive", "Doctor", "…
## $ Work_Experience <int> 0, 8, 0, 11, 0, 5, 1, 2, 0, 0, 0, 1, 8, 0, 0, 1, 1, 8,…
## $ Spending_Score <chr> "Low", "Average", "Low", "High", "High", "Low", "Avera…
## $ Family_Size <int> 1, 4, 1, 2, 5, 3, 3, 4, 4, 3, 6, 5, 3, 3, 1, 3, 2, 1, …
# NA count per column in the segmentation training file
train %>% is.na() %>% colSums()
## Gender Ever_Married Age Graduated Profession
## 0 0 0 0 0
## Work_Experience Spending_Score Family_Size Segmentation
## 0 0 0 0
# NA count per column in the segmentation test file
test %>% is.na() %>% colSums()
## X Gender Ever_Married Age Graduated
## 0 0 0 0 0
## Profession Work_Experience Spending_Score Family_Size
## 0 0 0 0
# Treat the segment code as a class label, then fit a k-nearest-neighbours
# classifier on centred and scaled predictors with cross-validation.
train <- train %>% mutate(Segmentation = as.factor(Segmentation))
model <- train(
  Segmentation ~ .,
  data = train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv")
)
model
## k-Nearest Neighbors
##
## 6969 samples
## 8 predictor
## 4 classes: '1', '2', '3', '4'
##
## Pre-processing: centered (19), scaled (19)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 6272, 6273, 6274, 6272, 6272, 6272, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.4874497 0.3152952
## 7 0.4884457 0.3165998
## 9 0.4930420 0.3224582
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
# Confusion matrix averaged over the cross-validation resamples
model %>% confusionMatrix()
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction 1 2 3 4
## 1 9.9 5.5 2.8 5.3
## 2 5.3 7.3 5.4 2.0
## 3 3.9 7.8 14.0 1.3
## 4 5.2 2.8 3.2 18.1
##
## Accuracy (average) : 0.493
# Predict the segment of every test customer. The earlier call passed
# `prob=TRUE`, which is not an argument of predict() for caret models and had
# no effect (the recorded result is a factor of class labels); class
# probabilities would be requested with type = "prob" instead.
result <- predict(model, newdata = test)
# Attach the predictions under an explicit column name rather than binding an
# unnamed vector, which was auto-named `...10`.
df <- test %>% mutate(Segmentation = result)
glimpse(test)
## Rows: 2,267
## Columns: 9
## $ X <int> 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ Gender <chr> "Female", "Male", "Female", "Male", "Male", "Male", "F…
## $ Ever_Married <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"…
## $ Age <int> 36, 37, 69, 59, 47, 61, 47, 50, 19, 22, 22, 50, 27, 18…
## $ Graduated <chr> "Yes", "Yes", "No", "No", "Yes", "Yes", "Yes", "Yes", …
## $ Profession <chr> "Engineer", "Healthcare", "", "Executive", "Doctor", "…
## $ Work_Experience <int> 0, 8, 0, 11, 0, 5, 1, 2, 0, 0, 0, 1, 8, 0, 0, 1, 1, 8,…
## $ Spending_Score <chr> "Low", "Average", "Low", "High", "High", "Low", "Avera…
## $ Family_Size <int> 1, 4, 1, 2, 5, 3, 3, 4, 4, 3, 6, 5, 3, 3, 1, 3, 2, 1, …
glimpse(df)
## Rows: 2,267
## Columns: 10
## $ X <int> 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ Gender <chr> "Female", "Male", "Female", "Male", "Male", "Male", "F…
## $ Ever_Married <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"…
## $ Age <int> 36, 37, 69, 59, 47, 61, 47, 50, 19, 22, 22, 50, 27, 18…
## $ Graduated <chr> "Yes", "Yes", "No", "No", "Yes", "Yes", "Yes", "Yes", …
## $ Profession <chr> "Engineer", "Healthcare", "", "Executive", "Doctor", "…
## $ Work_Experience <int> 0, 8, 0, 11, 0, 5, 1, 2, 0, 0, 0, 1, 8, 0, 0, 1, 1, 8,…
## $ Spending_Score <chr> "Low", "Average", "Low", "High", "High", "Low", "Avera…
## $ Family_Size <int> 1, 4, 1, 2, 5, 3, 3, 4, 4, 3, 6, 5, 3, 3, 1, 3, 2, 1, …
## $ Segmentation <fct> 2, 1, 2, 3, 3, 1, 3, 3, 4, 4, 4, 3, 4, 4, 2, 2, 3, 2, …
# part4 5: restaurant data from nyc.csv (the code below models Price)
df <- read.csv("nyc.csv")
df %>% glimpse()
## Rows: 165
## Columns: 9
## $ Case <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ Restaurant <chr> "Daniella Ristorante", "Tello's Ristorante", "Biricchino", …
## $ Price <int> 43, 32, 34, 41, 54, 52, 34, 34, 39, 44, 45, 47, 52, 35, 47,…
## $ Food <int> 22, 20, 21, 20, 24, 22, 22, 20, 22, 21, 19, 21, 21, 19, 20,…
## $ Decor <int> 18, 19, 13, 20, 19, 22, 16, 18, 19, 17, 17, 19, 19, 17, 18,…
## $ Service <int> 20, 19, 18, 17, 21, 21, 21, 21, 22, 19, 20, 21, 20, 19, 21,…
## $ East <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ latitude <dbl> 40.74683, 40.74342, 40.74886, 40.74848, 40.73958, 40.74069,…
## $ longitude <dbl> -73.99676, -73.99954, -73.99552, -74.00331, -73.99591, -73.…
# Per-column summary statistics
df %>% summary()
## Case Restaurant Price Food
## Min. : 1.0 Length:165 Min. :19.00 Min. :16.00
## 1st Qu.: 43.0 Class :character 1st Qu.:36.00 1st Qu.:19.00
## Median : 84.0 Mode :character Median :43.00 Median :21.00
## Mean : 84.5 Mean :42.67 Mean :20.59
## 3rd Qu.:127.0 3rd Qu.:50.00 3rd Qu.:22.00
## Max. :168.0 Max. :65.00 Max. :25.00
## Decor Service East latitude
## Min. : 6.00 Min. :14.00 Min. :0.0000 Min. :40.71
## 1st Qu.:16.00 1st Qu.:18.00 1st Qu.:0.0000 1st Qu.:40.76
## Median :18.00 Median :20.00 Median :1.0000 Median :40.76
## Mean :17.68 Mean :19.39 Mean :0.6303 Mean :40.76
## 3rd Qu.:19.00 3rd Qu.:21.00 3rd Qu.:1.0000 3rd Qu.:40.77
## Max. :25.00 Max. :24.00 Max. :1.0000 Max. :40.80
## longitude
## Min. :-74.01
## 1st Qu.:-73.98
## Median :-73.97
## Mean :-73.97
## 3rd Qu.:-73.96
## Max. :-73.93
# Split the restaurants roughly 70/30 on Price, drop the identifier columns
# (and the outcome from the test set), then fit a random forest regression
# with cross-validation.
# NOTE(review): no seed is set in this section, so the split and the folds
# depend on the RNG state left by the code that ran before it.
train_list <- createDataPartition(df$Price, p = 0.7, list = FALSE)
train <- df[train_list, ] %>%
  select(-Restaurant, -Case)
test <- df[-train_list, ] %>%
  select(-Price, -Restaurant, -Case)

model <- train(
  Price ~ .,
  data = train,
  method = "rf",
  trControl = trainControl(method = "cv")
)
model
## Random Forest
##
## 118 samples
## 6 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 107, 105, 105, 105, 107, 107, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 6.254265 0.5847555 4.897332
## 4 6.267707 0.5852535 4.931151
## 6 6.339397 0.5826628 4.946240
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
# Predicted prices for the held-out restaurants; show the first few
result <- predict(model, newdata = test)
result %>% head()
## 5 7 8 9 10 12
## 49.40921 43.50968 44.39563 49.24984 39.01449 44.16476
# part4 2.3: data from wbc.csv (diagnosis label plus numeric measurements)
df <- read.csv("wbc.csv")
df %>% glimpse()
## Rows: 569
## Columns: 33
## $ id <int> 842302, 842517, 84300903, 84348301, 84358402, …
## $ diagnosis <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
## $ radius_mean <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, 12.450…
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70, 19.9…
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 82.57, …
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 477.1, …
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10030, 0…
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13280, 0…
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19800, 0…
## $ concave.points_mean <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10430, 0…
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, 0.2087…
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05883, 0…
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, 0.3345…
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, 0.8902…
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217, 3.18…
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.19, 53.…
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.009110, 0.0114…
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.074580, 0.0246…
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05688, 0…
## $ concave.points_se <dbl> 0.015870, 0.013400, 0.020580, 0.018670, 0.0188…
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01756, 0…
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.009208, 0.0051…
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47, 22.8…
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75, 27.6…
## $ perimeter_worst <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 103.40,…
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 741.6, …
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, 0.1791…
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, 0.5249…
## $ concavity_worst <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40000, 0…
## $ concave.points_worst <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16250, 0…
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, 0.3985…
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07678, 0…
## $ X <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
# Missing values per column
df %>% is.na() %>% colSums()
## id diagnosis radius_mean
## 0 0 0
## texture_mean perimeter_mean area_mean
## 0 0 0
## smoothness_mean compactness_mean concavity_mean
## 0 0 0
## concave.points_mean symmetry_mean fractal_dimension_mean
## 0 0 0
## radius_se texture_se perimeter_se
## 0 0 0
## area_se smoothness_se compactness_se
## 0 0 0
## concavity_se concave.points_se symmetry_se
## 0 0 0
## fractal_dimension_se radius_worst texture_worst
## 0 0 0
## perimeter_worst area_worst smoothness_worst
## 0 0 0
## compactness_worst concavity_worst concave.points_worst
## 0 0 0
## symmetry_worst fractal_dimension_worst X
## 0 0 569
# Drop the identifier and the X column, split roughly 80/20 on diagnosis,
# remove the label from the test rows, then fit a k-nearest-neighbours
# classifier on centred and scaled predictors with cross-validation.
# NOTE(review): no seed is set in this section, so the split and the folds
# depend on the RNG state left by the code that ran before it.
df <- df %>% select(-c(id, X))
train_list <- createDataPartition(df$diagnosis, p = 0.8, list = FALSE)
train_df <- df[train_list, ]
test_df <- df[-train_list, ] %>%
  select(-diagnosis)

model <- train(
  diagnosis ~ .,
  data = train_df,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv")
)
model
## k-Nearest Neighbors
##
## 456 samples
## 30 predictor
## 2 classes: 'B', 'M'
##
## Pre-processing: centered (30), scaled (30)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 410, 410, 411, 411, 411, 410, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9692754 0.9331713
## 7 0.9648792 0.9230639
## 9 0.9648792 0.9240188
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
# Class probabilities (one column per diagnosis level) for the held-out rows
result <- predict(model, newdata = test_df, type = "prob")
result %>% head()
## B M
## 1 0.2 0.8
## 2 0.0 1.0
## 3 0.2 0.8
## 4 1.0 0.0
## 5 0.0 1.0
## 6 0.0 1.0