rm(list=ls())
ls()
## character(0)
#10
x<-sample(1:20,10);y<-c(1,2,3,4,5,37,41,42,44,10)
a<-data.frame(x,y)

quantile(a$y)              
##    0%   25%   50%   75%  100% 
##  1.00  3.25  7.50 40.00 44.00
df<-quantile(a$y)              
df1<-abs(df[[2]]-df[[4]])
df1
## [1] 36.75
df1<-floor(df1)
df1
## [1] 36
cat(df1)
## 36
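# Hedged alternative sketch: stats::IQR() returns the same Q3 - Q1 spread
# directly (default quantile type 7, like quantile() above).
iqr_alt <- floor(IQR(a$y))
cat(iqr_alt)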
#11
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
getwd()
## [1] "C:/Users/user/Desktop"
setwd("c:/data")
df<-read.csv("facebook.csv")
df %>% glimpse
## Rows: 6,997
## Columns: 16
## $ status_id          <chr> "246675545449582_1649696485147474", "24667554544958…
## $ status_type        <chr> "video", "photo", "video", "photo", "photo", "photo…
## $ status_published   <chr> "4/22/2018 6:00", "4/21/2018 22:45", "4/21/2018 6:1…
## $ num_reactions      <int> 529, 150, 227, 111, 213, 217, 503, 295, 203, 170, 2…
## $ num_comments       <int> 512, 0, 236, 0, 0, 6, 614, 453, 1, 9, 2, 4, 4, 4, 1…
## $ num_shares         <int> 262, 0, 57, 0, 0, 0, 72, 53, 0, 1, 3, 0, 2, 0, 0, 3…
## $ num_likes          <int> 432, 150, 204, 111, 204, 211, 418, 260, 198, 167, 2…
## $ num_loves          <int> 92, 0, 21, 0, 9, 5, 70, 32, 5, 3, 7, 5, 6, 8, 10, 2…
## $ num_wows           <int> 3, 0, 1, 0, 0, 1, 10, 1, 0, 0, 1, 4, 2, 1, 1, 1, 0,…
## $ num_hahas          <int> 1, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 5, 0, …
## $ num_sads           <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ num_angrys         <int> 0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ react_comment_r    <dbl> 1.0332031, 0.0000000, 0.9618644, 0.0000000, 0.00000…
## $ react_share_r      <dbl> 2.019084, 0.000000, 3.982456, 0.000000, 0.000000, 0…
## $ postive_reactions  <int> 527, 150, 226, 111, 213, 217, 498, 293, 203, 170, 2…
## $ negative_reactions <int> 2, 0, 1, 0, 0, 0, 5, 2, 0, 0, 0, 0, 0, 0, 0, 5, 0, …
colSums(is.na(df))
##          status_id        status_type   status_published      num_reactions 
##                  0                  0                  0                  0 
##       num_comments         num_shares          num_likes          num_loves 
##                  0                  0                  0                  0 
##           num_wows          num_hahas           num_sads         num_angrys 
##                  0                  0                  0                  0 
##    react_comment_r      react_share_r  postive_reactions negative_reactions 
##                120                117                  0                  0
df %>% mutate(ratio=(num_loves+num_wows)/(num_reactions)) %>%
  filter(ratio<0.5&ratio>0.4) %>% filter(status_type=="video") %>%
  NROW
## [1] 90
df1<-(df %>% mutate(ratio=(num_loves+num_wows)/(num_reactions)) %>%
        filter(ratio<0.5&ratio>0.4) %>% filter(status_type=="video") %>%
        NROW)
cat(df1)  
## 90
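# Hedged alternative sketch: the same count can be taken in one pipeline by
# summing the logical condition instead of filtering and calling NROW.
df %>%
  mutate(ratio = (num_loves + num_wows) / num_reactions) %>%
  summarise(n = sum(ratio > 0.4 & ratio < 0.5 & status_type == "video", na.rm = TRUE))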
#12
library(dplyr)
df<-read.csv("netflix.csv")
df %>% glimpse
## Rows: 8,807
## Columns: 11
## $ show_id      <chr> "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s1…
## $ type         <chr> "Movie", "TV Show", "TV Show", "TV Show", "TV Show", "TV …
## $ title        <chr> "Dick Johnson Is Dead", "Blood & Water", "Ganglands", "Ja…
## $ director     <chr> "Kirsten Johnson", "", "Julien Leclercq", "", "", "Mike F…
## $ cast         <chr> "", "Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Mola…
## $ country      <chr> "United States", "South Africa", "", "", "India", "", "",…
## $ date_added   <chr> "25-Sep-21", "24-Sep-21", "24-Sep-21", "24-Sep-21", "24-S…
## $ release_year <int> 2020, 2021, 2021, 2021, 2021, 2021, 2021, 1993, 2021, 202…
## $ rating       <chr> "PG-13", "TV-MA", "TV-MA", "TV-MA", "TV-MA", "TV-MA", "PG…
## $ duration     <chr> "90 min", "2 Seasons", "1 Season", "1 Season", "2 Seasons…
## $ listed_in    <chr> "Documentaries", "International TV Shows, TV Dramas, TV M…
colSums(is.na(df))
##      show_id         type        title     director         cast      country 
##            0            0            0            0            0            0 
##   date_added release_year       rating     duration    listed_in 
##            0            0            0            0            0
library(lubridate)
## Loading required package: timechange
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
df %>% filter(country=="United Kingdom") %>% 
  mutate(ymd=dmy(date_added)) %>% select(ymd) %>% 
  filter(ymd>='2018-01-01'&ymd<='2018-01-30')
## Warning: 15 failed to parse.
##          ymd
## 1 2018-01-30
## 2 2018-01-18
## 3 2018-01-01
## 4 2018-01-15
## 5 2018-01-01
df %>% filter(country=="United Kingdom") %>% count(date_added)
##             date_added  n
## 1                       1
## 2        April 4, 2017  1
## 3     December 1, 2018  1
## 4    December 15, 2017  1
## 5    December 15, 2018  1
## 6     December 2, 2017  1
## 7     February 1, 2019  1
## 8      January 1, 2018  1
## 9        July 26, 2019  1
## 10      March 16, 2016  1
## 11      March 31, 2017  1
## 12      March 31, 2018  2
## 13     October 1, 2019  2
## 14   September 1, 2019  1
## 15           01-Apr-17  2
## 16           01-Apr-18  2
## 17           01-Apr-19  1
## 18           01-Apr-20  3
## 19           01-Apr-21  2
## 20           01-Aug-16  6
## 21           01-Aug-17 11
## 22           01-Aug-18  1
## 23           01-Aug-19  4
## 24           01-Aug-20  1
## 25           01-Dec-17  2
## 26           01-Dec-18  2
## 27           01-Dec-19  1
## 28           01-Dec-20  2
## 29           01-Feb-18  2
## 30           01-Feb-19 11
## 31           01-Feb-20  1
## 32           01-Feb-21  1
## 33           01-Jan-18  2
## 34           01-Jan-19  2
## 35           01-Jan-20  2
## 36           01-Jan-21  1
## 37           01-Jul-17  1
## 38           01-Jul-20  1
## 39           01-Jul-21  1
## 40           01-Jun-16  1
## 41           01-Jun-17  4
## 42           01-Mar-17  7
## 43           01-Mar-18  2
## 44           01-Mar-19  2
## 45           01-May-17  1
## 46           01-May-18  2
## 47           01-May-19  3
## 48           01-May-21  1
## 49           01-Nov-16  1
## 50           01-Nov-18  1
## 51           01-Nov-19  3
## 52           01-Oct-16  1
## 53           01-Oct-17  4
## 54           01-Oct-18  2
## 55           01-Oct-19  1
## 56           01-Sep-16  5
## 57           01-Sep-17  1
## 58           01-Sep-19  1
## 59           01-Sep-20  1
## 60           02-Dec-20  1
## 61           02-Feb-19  1
## 62           02-Jan-19  1
## 63           02-Jun-17  1
## 64           02-Jun-21  3
## 65           02-Nov-16  1
## 66           02-Oct-18 10
## 67           02-Oct-19  3
## 68           03-Dec-18  1
## 69           03-Dec-19  1
## 70           03-Mar-20  1
## 71           03-May-19  1
## 72           03-Sep-20  1
## 73           04-Jan-20  1
## 74           04-Jun-21  1
## 75           04-Nov-17  1
## 76           04-Nov-19  1
## 77           04-Oct-19  1
## 78           05-Apr-19  2
## 79           05-Aug-21  1
## 80           05-Feb-19  1
## 81           05-Jan-19  1
## 82           05-Jun-19  1
## 83           05-Mar-20  1
## 84           05-Nov-19  1
## 85           05-Oct-18  1
## 86           05-Sep-20  1
## 87           06-Jul-21  5
## 88           06-Oct-20  1
## 89           06-Sep-16  1
## 90           06-Sep-17  1
## 91           07-Sep-21  1
## 92           08-Aug-21  1
## 93           08-Feb-19  1
## 94           08-Jan-19  1
## 95           08-Jan-21  1
## 96           08-May-18  1
## 97           08-Nov-19  1
## 98           09-Dec-16  1
## 99           09-Jan-20  1
## 100          09-Mar-17  1
## 101          09-Nov-16  1
## 102          09-Sep-16  1
## 103          10-Apr-18  1
## 104          10-Jan-20  2
## 105          10-Jul-18  1
## 106          10-Oct-15  1
## 107          10-Sep-19  2
## 108          11-Jan-19  1
## 109          11-Sep-17  1
## 110          11-Sep-20  1
## 111          12-Dec-17  1
## 112          12-Dec-19  1
## 113          12-Feb-21  1
## 114          12-Jan-21  1
## 115          12-Jul-19  4
## 116          12-Mar-21  1
## 117          12-May-17  1
## 118          13-Aug-17  1
## 119          13-Feb-21  1
## 120          13-Jul-17  1
## 121          13-Mar-18  1
## 122          13-May-21  1
## 123          13-Oct-17  1
## 124          13-Oct-20  1
## 125          14-Aug-20  1
## 126          14-Feb-17  1
## 127          14-May-17  1
## 128          14-Sep-21  1
## 129          15-Apr-17  1
## 130          15-Apr-20  1
## 131          15-Aug-16  3
## 132          15-Dec-16  2
## 133          15-Dec-18  2
## 134          15-Feb-19  1
## 135          15-Feb-21  1
## 136          15-Jan-18  1
## 137          15-Jun-15  1
## 138          15-Jun-17  1
## 139          15-Mar-17  1
## 140          15-Mar-19  2
## 141          15-Mar-21  1
## 142          15-May-16  1
## 143          15-May-17  1
## 144          15-May-19  2
## 145          15-May-20  2
## 146          15-May-21  1
## 147          15-Oct-18  1
## 148          15-Oct-20  1
## 149          15-Sep-17  1
## 150          15-Sep-18  2
## 151          15-Sep-20  2
## 152          15-Sep-21  1
## 153          16-Apr-19  1
## 154          16-Aug-19  1
## 155          16-Dec-16  1
## 156          16-Dec-20  1
## 157          16-Mar-17  1
## 158          16-May-18  1
## 159          16-May-19  1
## 160          16-Sep-16  1
## 161          17-Apr-19  1
## 162          17-Feb-21  1
## 163          17-May-19  1
## 164          17-Nov-16  1
## 165          17-Oct-20  1
## 166          17-Sep-17  1
## 167          17-Sep-19  1
## 168          17-Sep-20  1
## 169          17-Sep-21  1
## 170          18-Jan-18  1
## 171          18-Jan-19  1
## 172          18-Mar-16  2
## 173          18-Mar-20  1
## 174          18-May-20  1
## 175          18-Nov-16  1
## 176          18-Oct-19  3
## 177          18-Sep-17  1
## 178          19-Dec-17  1
## 179          19-Feb-19  1
## 180          19-Jan-19  1
## 181          19-Jul-20  1
## 182          19-Jun-19  1
## 183          19-Jun-21  1
## 184          19-Mar-21  1
## 185          19-Oct-18  1
## 186          20-Aug-19  2
## 187          20-Dec-18  1
## 188          20-Dec-20  1
## 189          20-Feb-18  1
## 190          20-Feb-21  1
## 191          20-Jul-18  1
## 192          20-Jun-16  1
## 193          20-Jun-17  1
## 194          20-Mar-20  2
## 195          20-May-20  1
## 196          21-Feb-19  1
## 197          21-Jul-17  1
## 198          21-Jul-20  1
## 199          21-Oct-19  1
## 200          21-Oct-20  1
## 201          22-Apr-15  1
## 202          22-Apr-17  2
## 203          22-Apr-20  1
## 204          22-Dec-18  2
## 205          22-Dec-20  2
## 206          22-Feb-17  2
## 207          22-Feb-18  1
## 208          22-Feb-19  2
## 209          22-Jul-19  1
## 210          22-Jun-18  1
## 211          22-Mar-19  1
## 212          22-May-17  1
## 213          22-May-20  1
## 214          22-Oct-20  1
## 215          23-Dec-17  1
## 216          23-Feb-17  2
## 217          23-Feb-21  1
## 218          23-Jul-21  1
## 219          23-Jun-18  2
## 220          23-Mar-18  1
## 221          23-Nov-18  1
## 222          23-Sep-20  1
## 223          24-Apr-20  1
## 224          24-Aug-18  1
## 225          24-Aug-21  1
## 226          24-Jun-20  1
## 227          24-Mar-18  1
## 228          24-Oct-17  2
## 229          24-Sep-21  1
## 230          25-Aug-16  1
## 231          25-Dec-17  1
## 232          25-Jan-19  1
## 233          25-May-20  1
## 234          25-Nov-20  1
## 235          25-Oct-17  1
## 236          25-Sep-17  1
## 237          26-Apr-20  1
## 238          26-Aug-20  1
## 239          26-Dec-17  1
## 240          26-Jan-19  1
## 241          26-Jul-18  2
## 242          26-Jul-19  6
## 243          26-Mar-21  2
## 244          26-May-16  1
## 245          27-Aug-19  1
## 246          27-Dec-17  1
## 247          27-Feb-18  1
## 248          27-Jan-17  1
## 249          27-Jun-19  1
## 250          27-Mar-18  1
## 251          27-Nov-17  1
## 252          28-Apr-16  1
## 253          28-Dec-18  1
## 254          28-Jul-17  1
## 255          28-Oct-19  1
## 256          29-Apr-20  1
## 257          29-Aug-19  1
## 258          29-Dec-17  1
## 259          29-Jan-20  1
## 260          29-Jan-21  1
## 261          29-Jun-18  2
## 262          29-Mar-19  1
## 263          29-Sep-17  1
## 264          30-Dec-20  1
## 265          30-Jan-18  1
## 266          30-Jan-20  1
## 267          30-Jul-21  1
## 268          30-Jun-16  1
## 269          30-Mar-18  1
## 270          30-Mar-19  1
## 271          30-Nov-17  1
## 272          30-Oct-20  1
## 273          30-Sep-17  1
## 274          30-Sep-19  1
## 275          31-Dec-17  1
## 276          31-Dec-18  7
## 277          31-Jan-19  1
## 278          31-Jul-20  1
## 279          31-Mar-17 12
## 280          31-Mar-18  2
## 281          31-Mar-21  1
## 282          31-May-17  1
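# The count above shows date_added in two formats, "01-Apr-17" style (d-b-y) and
# "April 4, 2017" style (m-d-y), which is what caused the 15 parse failures.
# A hedged sketch that tries both orders with lubridate::parse_date_time:
df %>%
  filter(country == "United Kingdom") %>%
  mutate(ymd = as.Date(parse_date_time(date_added, orders = c("dmy", "mdy")))) %>%
  filter(ymd >= as.Date("2018-01-01"), ymd <= as.Date("2018-01-30")) %>%
  count()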
# Chapter 3 expected exam questions (p.247)
#1
library(dplyr)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(recipes)
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
x_test<-read.csv('X_test.csv',
                 fileEncoding = "euc-kr", na="")
x_train<-read.csv('X_train.csv',
                  fileEncoding = "euc-kr", na="")
y_train<-read.csv('y_train.csv',
                  fileEncoding = "euc-kr", na="")
x_train %>% glimpse
## Rows: 3,500
## Columns: 10
## $ cust_id        <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ 총구매액       <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900…
## $ 최대구매액     <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,…
## $ 환불금액       <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,…
## $ 주구매상품     <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자…
## $ 주구매지점     <chr> "강남점", "잠실점", "관악점", "광주점", "본  점", "일산…
## $ 내점일수       <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152…
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666…
## $ 주말방문비율   <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000…
## $ 구매주기       <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, …
y_train %>% glimpse
## Rows: 3,500
## Columns: 2
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ gender  <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,…
left_join(x_train,y_train,by='cust_id') %>% mutate(index='train')->train
train %>% glimpse
## Rows: 3,500
## Columns: 12
## $ cust_id        <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ 총구매액       <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900…
## $ 최대구매액     <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,…
## $ 환불금액       <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,…
## $ 주구매상품     <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자…
## $ 주구매지점     <chr> "강남점", "잠실점", "관악점", "광주점", "본  점", "일산…
## $ 내점일수       <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152…
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666…
## $ 주말방문비율   <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000…
## $ 구매주기       <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, …
## $ gender         <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0…
## $ index          <chr> "train", "train", "train", "train", "train", "train", "…
x_test %>% mutate(index='test')->test
test %>% glimpse
## Rows: 2,482
## Columns: 11
## $ cust_id        <int> 3500, 3501, 3502, 3503, 3504, 3505, 3506, 3507, 3508, 3…
## $ 총구매액       <dbl> 70900400, 310533100, 305264140, 7594080, 1795790, 13000…
## $ 최대구매액     <int> 22000000, 38558000, 14825000, 5225000, 1411200, 2160000…
## $ 환불금액       <int> 4050000, 48034700, 30521000, NA, NA, NA, 39566000, NA, …
## $ 주구매상품     <chr> "골프", "농산물", "가공식품", "주방용품", "수산품", "화…
## $ 주구매지점     <chr> "부산본점", "잠실점", "본  점", "부산본점", "청량리점",…
## $ 내점일수       <int> 13, 90, 101, 5, 3, 5, 144, 1, 1, 28, 21, 3, 23, 30, 3, …
## $ 내점당구매건수 <dbl> 1.461538, 2.433333, 14.623762, 2.000000, 2.666667, 2.20…
## $ 주말방문비율   <dbl> 0.78947368, 0.36986301, 0.08327691, 0.00000000, 0.12500…
## $ 구매주기       <int> 26, 3, 3, 47, 8, 61, 2, 0, 0, 12, 14, 2, 15, 11, 112, 2…
## $ index          <chr> "test", "test", "test", "test", "test", "test", "test",…
bind_rows(train,test)->full
full$gender<-ifelse(full$gender==0,"남성","여성")
full$gender<-as.factor(full$gender)
full$index<-as.factor(full$index)
names(full)
##  [1] "cust_id"        "총구매액"       "최대구매액"     "환불금액"      
##  [5] "주구매상품"     "주구매지점"     "내점일수"       "내점당구매건수"
##  [9] "주말방문비율"   "구매주기"       "gender"         "index"
data<-full %>% rename(total="총구매액",
                      max="최대구매액",
                      refund="환불금액",
                      product="주구매상품",
                      store="주구매지점",
                      day="내점일수",
                      count="내점당구매건수",
                      week="주말방문비율",
                      cycle="구매주기") %>% 
select(cust_id,index,gender,total,max,refund,product,store,day,count,week,cycle)  
colSums(is.na(data))
## cust_id   index  gender   total     max  refund product   store     day   count 
##       0       0    2482       0       0    3906       0       0       0       0 
##    week   cycle 
##       0       0
data$refund<-ifelse(is.na(data$refund),0,data$refund)
colSums(is.na(data))
## cust_id   index  gender   total     max  refund product   store     day   count 
##       0       0    2482       0       0       0       0       0       0       0 
##    week   cycle 
##       0       0
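# Equivalent dplyr sketch for the same NA-to-zero step: coalesce() keeps the
# original value and substitutes 0 only where refund is NA (a no-op at this
# point, since refund was already imputed above).
data %>% mutate(refund = coalesce(refund, 0)) %>% summarise(missing_refund = sum(is.na(refund)))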
library(recipes)
recipe(gender~.,data=data) %>% 
  step_YeoJohnson(total,max,refund,day,count,week,cycle) %>% 
  step_scale(total,max,refund,day,count,week,cycle) %>% 
  step_center(total,max,refund,day,count,week,cycle) %>% 
  prep() %>% juice()->data1
data1 %>% glimpse
## Rows: 5,982
## Columns: 12
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ index   <fct> train, train, train, train, train, train, train, train, train,…
## $ total   <dbl> -0.1109616, -0.5964776, -0.5864340, -0.4794213, -0.3820895, -0…
## $ max     <dbl> -0.25879992, -0.58005471, -0.59931645, -0.47681619, 0.15253819…
## $ refund  <dbl> 1.3776676, 1.2130535, -0.7281455, -0.7281455, -0.7281455, 1.23…
## $ product <fct> 기타, 스포츠, 남성 캐주얼, 기타, 보석, 디자이너, 시티웨어, 명…
## $ store   <fct> 강남점, 잠실점, 관악점, 광주점, 본  점, 일산점, 강남점, 본  점…
## $ day     <dbl> 0.6267964, -0.9872986, -0.9872986, 0.5877041, -0.9872986, -0.7…
## $ count   <dbl> 0.92059492, -0.89611526, -0.32407144, 0.06726813, -0.89611526,…
## $ week    <dbl> 0.96145636, -1.31805060, -1.31805060, 0.32838074, -1.31805060,…
## $ cycle   <dbl> 0.28905563, -1.19528222, -1.19528222, 0.24219137, 1.78728765, …
## $ gender  <fct> 남성, 남성, 여성, 여성, 남성, 남성, 남성, 남성, 남성, 여성, 남…
data1 %>% filter(index=="train") %>% select(-index)->train
data1 %>% filter(index=="test") %>% select(-index)->test
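# Note: the recipe above was prepped on the combined train+test rows, so the test
# rows influence the Yeo-Johnson/scale/center estimates. A hedged sketch that
# learns the transforms from the training rows only and applies them to the test rows:
raw_train <- data %>% filter(index == "train") %>% select(-index)
raw_test  <- data %>% filter(index == "test") %>% select(-index)
rec <- recipe(gender ~ ., data = raw_train) %>%
  step_YeoJohnson(total, max, refund, day, count, week, cycle) %>%
  step_scale(total, max, refund, day, count, week, cycle) %>%
  step_center(total, max, refund, day, count, week, cycle) %>%
  prep()
train_b <- bake(rec, new_data = NULL)      # transformed training rows
test_b  <- bake(rec, new_data = raw_test)  # test rows use the training estimates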

library(caret)
ctrl<-trainControl(method='cv',number=10,
                   summaryFunction = twoClassSummary,
                   classProbs = TRUE)
train(gender~.,data=train,
      method='rpart',
      metric="ROC",
      trControl=ctrl)->rffit
train(gender~.,data=train,
      method='glm',family=binomial,
      metric="ROC",
      trControl=ctrl)->rffit1
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
rffit
## CART 
## 
## 3500 samples
##   10 predictor
##    2 classes: '남성', '여성' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 3149, 3150, 3150, 3150, 3150, 3150, ... 
## Resampling results across tuning parameters:
## 
##   cp           ROC        Sens       Spec     
##   0.005319149  0.6194238  0.8452704  0.2918517
##   0.006838906  0.6183363  0.8122806  0.3358894
##   0.007598784  0.6183363  0.8122806  0.3358894
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.005319149.
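# rffit1 (the glm model) was trained above but never inspected; a hedged sketch
# comparing both models' cross-validated ROC with caret::resamples. The folds are
# only strictly identical if both train() calls used the same seed or index.
resamps <- resamples(list(rpart = rffit, glm = rffit1))
summary(resamps)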
predict(rffit,test,type="prob")->pred_fit1
head(pred_fit1)
##        남성      여성
## 1 0.7364290 0.2635710
## 2 0.7364290 0.2635710
## 3 0.7364290 0.2635710
## 4 0.4471698 0.5528302
## 5 0.4471698 0.5528302
## 6 0.6927711 0.3072289
predict(rffit,test,type="raw")->pred_fit2
head(pred_fit2)
## [1] 남성 남성 남성 여성 여성 남성
## Levels: 남성 여성
head(pred_fit1)
##        남성      여성
## 1 0.7364290 0.2635710
## 2 0.7364290 0.2635710
## 3 0.7364290 0.2635710
## 4 0.4471698 0.5528302
## 5 0.4471698 0.5528302
## 6 0.6927711 0.3072289
names(pred_fit1)[1]<-"gender"
head(pred_fit1)
##      gender      여성
## 1 0.7364290 0.2635710
## 2 0.7364290 0.2635710
## 3 0.7364290 0.2635710
## 4 0.4471698 0.5528302
## 5 0.4471698 0.5528302
## 6 0.6927711 0.3072289
bind_cols(x_test,pred_fit1) %>% select(cust_id,gender)->df
head(df)
##   cust_id    gender
## 1    3500 0.7364290
## 2    3501 0.7364290
## 3    3502 0.7364290
## 4    3503 0.4471698
## 5    3504 0.4471698
## 6    3505 0.6927711
write.csv(df,"2022.csv",row.names=FALSE)
read.csv("2022.csv") %>% head
##   cust_id    gender
## 1    3500 0.7364290
## 2    3501 0.7364290
## 3    3502 0.7364290
## 4    3503 0.4471698
## 5    3504 0.4471698
## 6    3505 0.6927711
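# pROC was loaded for this problem but not used above; a hedged sketch of
# estimating holdout AUC for the rpart model before submitting (the 70/30 split
# and seed are illustrative, not from the source).
set.seed(123)
idx_v  <- createDataPartition(train$gender, p = 0.7, list = FALSE)
tr_v   <- train[idx_v, ]
te_v   <- train[-idx_v, ]
fit_v  <- train(gender ~ ., data = tr_v, method = "rpart",
                metric = "ROC", trControl = ctrl)
prob_v <- predict(fit_v, te_v, type = "prob")[, "여성"]
roc(te_v$gender, prob_v) %>% auc()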
#2 p259
library(dplyr)
library(caret)
library(recipes)
library(pROC)
df<-read.csv("travel_data.csv")
set.seed(1357)
train_list<-createDataPartition(y=df$TravelInsurance,p=0.75,list=FALSE)
df_train<-df[train_list,]
df_test<-df[-train_list,]
NROW(df_train)
## [1] 1491
NROW(df_test)
## [1] 496
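# The split above is only the setup; a hedged sketch of one possible continuation,
# assuming TravelInsurance is coded 0/1 and the remaining columns (not shown in
# the source) can be used as predictors as-is.
df_train$TravelInsurance <- factor(df_train$TravelInsurance, levels = c(0, 1), labels = c("No", "Yes"))
df_test$TravelInsurance  <- factor(df_test$TravelInsurance, levels = c(0, 1), labels = c("No", "Yes"))
ctrl2 <- trainControl(method = "cv", number = 5,
                      summaryFunction = twoClassSummary, classProbs = TRUE)
fit2 <- train(TravelInsurance ~ ., data = df_train, method = "glm",
              family = binomial, metric = "ROC", trControl = ctrl2)
prob2 <- predict(fit2, df_test, type = "prob")[, "Yes"]
roc(df_test$TravelInsurance, prob2) %>% auc()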
#3 p270
train<-read.csv("insurance_train_10.csv")
test<-read.csv("insurance_test_10.csv")
train %>% glimpse
## Rows: 6,969
## Columns: 9
## $ Gender          <chr> "Male", "Female", "Male", "Male", "Male", "Female", "F…
## $ Ever_Married    <chr> "No", "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", "…
## $ Age             <int> 22, 67, 67, 56, 32, 33, 61, 55, 26, 19, 58, 41, 32, 31…
## $ Graduated       <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "Yes", …
## $ Profession      <chr> "Healthcare", "Engineer", "Lawyer", "Artist", "Healthc…
## $ Work_Experience <int> 1, 1, 0, 0, 1, 1, 0, 1, 1, 4, 0, 1, 9, 1, 1, 0, 12, 3,…
## $ Spending_Score  <chr> "Low", "Low", "High", "Average", "Low", "Low", "Low", …
## $ Family_Size     <int> 4, 1, 2, 2, 3, 3, 3, 4, 3, 4, 1, 2, 5, 6, 4, 1, 1, 4, …
## $ Segmentation    <int> 4, 2, 2, 3, 3, 4, 4, 3, 1, 4, 2, 3, 4, 2, 2, 3, 1, 4, …
colSums(is.na(train))
##          Gender    Ever_Married             Age       Graduated      Profession 
##               0               0               0               0               0 
## Work_Experience  Spending_Score     Family_Size    Segmentation 
##               0               0               0               0
train$Segmentation<-as.factor(train$Segmentation)
library(caret)
ctrl<-trainControl(method="cv",number=10)
train(Segmentation~.,data=train,
      method='knn',trControl=ctrl,
      preProcess=c("center","scale"))->knn_fit
knn_fit
## k-Nearest Neighbors 
## 
## 6969 samples
##    8 predictor
##    4 classes: '1', '2', '3', '4' 
## 
## Pre-processing: centered (19), scaled (19) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 6273, 6272, 6273, 6272, 6273, 6271, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.4858711  0.3132567
##   7  0.4898932  0.3184834
##   9  0.4904680  0.3190329
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
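# caret only tried k = 5, 7, 9 by default; a hedged sketch that searches a wider,
# explicit grid (the candidate values are illustrative, not from the source).
knn_grid <- expand.grid(k = seq(5, 25, by = 2))
train(Segmentation ~ ., data = train, method = "knn", trControl = ctrl,
      preProcess = c("center", "scale"), tuneGrid = knn_grid) -> knn_fit_grid
knn_fit_grid$bestTune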
confusionMatrix(knn_fit)
## Cross-Validated (10 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction    1    2    3    4
##          1  9.5  5.6  2.6  5.5
##          2  5.6  7.5  5.5  1.9
##          3  3.9  7.7 14.1  1.4
##          4  5.4  2.7  3.3 18.0
##                             
##  Accuracy (average) : 0.4905
predict(knn_fit,test)->pred_fit
head(pred_fit)
## [1] 2 1 1 3 3 1
## Levels: 1 2 3 4
NROW(pred_fit)
## [1] 2267
test %>% glimpse
## Rows: 2,267
## Columns: 9
## $ X               <int> 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ Gender          <chr> "Female", "Male", "Female", "Male", "Male", "Male", "F…
## $ Ever_Married    <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"…
## $ Age             <int> 36, 37, 69, 59, 47, 61, 47, 50, 19, 22, 22, 50, 27, 18…
## $ Graduated       <chr> "Yes", "Yes", "No", "No", "Yes", "Yes", "Yes", "Yes", …
## $ Profession      <chr> "Engineer", "Healthcare", "", "Executive", "Doctor", "…
## $ Work_Experience <int> 0, 8, 0, 11, 0, 5, 1, 2, 0, 0, 0, 1, 8, 0, 0, 1, 1, 8,…
## $ Spending_Score  <chr> "Low", "Average", "Low", "High", "High", "Low", "Avera…
## $ Family_Size     <int> 1, 4, 1, 2, 5, 3, 3, 4, 4, 3, 6, 5, 3, 3, 1, 3, 2, 1, …
bind_cols(test,pred_fit)->df
## New names:
## • `` -> `...10`
names(df)[10]<-"Segmentation_pred"   # the prediction is column 10 ("...10"); column 9 is Family_Size
df %>% select(Segmentation_pred)->df1
write.csv(df1,"2022.csv",row.names = FALSE)
set.seed(12345)
IDX<-createDataPartition(train$Segmentation,p=0.7,list=FALSE)
train_t<-train[IDX,]
test_v<-train[-IDX,]
train_t$Segmentation<-as.factor(train_t$Segmentation)
test_v$Segmentation<-as.factor(test_v$Segmentation)
ctrl<-trainControl(method="cv",number=10)
train(Segmentation~.,data=train_t,
      method='knn',trControl=ctrl,
      preProcess=c("center","scale"))->knn_fit1
predict(knn_fit1,newdata=test_v)->test_pred
confusionMatrix(test_pred,test_v$Segmentation,mode="prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2   3   4
##          1 207 114  62 115
##          2 120 152 123  53
##          3  71 163 284  33
##          4 109  60  64 358
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4794          
##                  95% CI : (0.4578, 0.5011)
##     No Information Rate : 0.2677          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3047          
##                                           
##  Mcnemar's Test P-Value : 0.009816        
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4
## Precision             0.41566   0.3393   0.5154   0.6058
## Recall                0.40828   0.3108   0.5328   0.6404
## F1                    0.41194   0.3244   0.5240   0.6226
## Prevalence            0.24282   0.2342   0.2553   0.2677
## Detection Rate        0.09914   0.0728   0.1360   0.1715
## Detection Prevalence  0.23851   0.2146   0.2639   0.2830
## Balanced Accuracy     0.61211   0.5629   0.6806   0.7440
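# The per-class table above gives Precision/Recall/F1 for each segment; a small
# sketch of summarising them as unweighted macro averages across the 4 classes.
cm <- confusionMatrix(test_pred, test_v$Segmentation, mode = "prec_recall")
colMeans(cm$byClass[, c("Precision", "Recall", "F1")])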