rm(list=ls())
ls()
## character(0)
#10
x<-sample(1:20,10);y<-c(1,2,3,4,5,37,41,42,44,10)
a<-data.frame(x,y)
quantile(a$y)
## 0% 25% 50% 75% 100%
## 1.00 3.25 7.50 40.00 44.00
df<-quantile(a$y)
df1<-abs(df[[2]]-df[[4]])
df1
## [1] 36.75
df1<-floor(df1)
df1
## [1] 36
cat(df1)
## 36
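# The same interquartile range can be obtained directly with IQR(), which also
# uses quantile type 7 by default (a sketch, not part of the original answer;
# it should reproduce the value above):
floor(IQR(a$y))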
#11
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
getwd()
## [1] "C:/Users/user/Desktop"
setwd("c:/data")
df<-read.csv("facebook.csv")
df %>% glimpse
## Rows: 6,997
## Columns: 16
## $ status_id <chr> "246675545449582_1649696485147474", "24667554544958…
## $ status_type <chr> "video", "photo", "video", "photo", "photo", "photo…
## $ status_published <chr> "4/22/2018 6:00", "4/21/2018 22:45", "4/21/2018 6:1…
## $ num_reactions <int> 529, 150, 227, 111, 213, 217, 503, 295, 203, 170, 2…
## $ num_comments <int> 512, 0, 236, 0, 0, 6, 614, 453, 1, 9, 2, 4, 4, 4, 1…
## $ num_shares <int> 262, 0, 57, 0, 0, 0, 72, 53, 0, 1, 3, 0, 2, 0, 0, 3…
## $ num_likes <int> 432, 150, 204, 111, 204, 211, 418, 260, 198, 167, 2…
## $ num_loves <int> 92, 0, 21, 0, 9, 5, 70, 32, 5, 3, 7, 5, 6, 8, 10, 2…
## $ num_wows <int> 3, 0, 1, 0, 0, 1, 10, 1, 0, 0, 1, 4, 2, 1, 1, 1, 0,…
## $ num_hahas <int> 1, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 5, 0, …
## $ num_sads <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ num_angrys <int> 0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ react_comment_r <dbl> 1.0332031, 0.0000000, 0.9618644, 0.0000000, 0.00000…
## $ react_share_r <dbl> 2.019084, 0.000000, 3.982456, 0.000000, 0.000000, 0…
## $ postive_reactions <int> 527, 150, 226, 111, 213, 217, 498, 293, 203, 170, 2…
## $ negative_reactions <int> 2, 0, 1, 0, 0, 0, 5, 2, 0, 0, 0, 0, 0, 0, 0, 5, 0, …
colSums(is.na(df))
## status_id status_type status_published num_reactions
## 0 0 0 0
## num_comments num_shares num_likes num_loves
## 0 0 0 0
## num_wows num_hahas num_sads num_angrys
## 0 0 0 0
## react_comment_r react_share_r postive_reactions negative_reactions
## 120 117 0 0
df %>% mutate(ratio=(num_loves+num_wows)/(num_reactions)) %>%
filter(ratio<0.5&ratio>0.4) %>% filter(status_type=="video") %>%
NROW
## [1] 90
df1<-(df %>% mutate(ratio=(num_loves+num_wows)/(num_reactions)) %>%
filter(ratio<0.5&ratio>0.4) %>% filter(status_type=="video") %>%
NROW)
cat(df1)
## 90
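# An equivalent single-pipe version of the count above (a sketch; it uses the
# same strict inequalities, so it should return the same number of rows):
df %>%
  filter(status_type == "video",
         (num_loves + num_wows) / num_reactions > 0.4,
         (num_loves + num_wows) / num_reactions < 0.5) %>%
  nrow()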
#12
library(dplyr)
df<-read.csv("netflix.csv")
df %>% glimpse
## Rows: 8,807
## Columns: 11
## $ show_id <chr> "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s1…
## $ type <chr> "Movie", "TV Show", "TV Show", "TV Show", "TV Show", "TV …
## $ title <chr> "Dick Johnson Is Dead", "Blood & Water", "Ganglands", "Ja…
## $ director <chr> "Kirsten Johnson", "", "Julien Leclercq", "", "", "Mike F…
## $ cast <chr> "", "Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Mola…
## $ country <chr> "United States", "South Africa", "", "", "India", "", "",…
## $ date_added <chr> "25-Sep-21", "24-Sep-21", "24-Sep-21", "24-Sep-21", "24-S…
## $ release_year <int> 2020, 2021, 2021, 2021, 2021, 2021, 2021, 1993, 2021, 202…
## $ rating <chr> "PG-13", "TV-MA", "TV-MA", "TV-MA", "TV-MA", "TV-MA", "PG…
## $ duration <chr> "90 min", "2 Seasons", "1 Season", "1 Season", "2 Seasons…
## $ listed_in <chr> "Documentaries", "International TV Shows, TV Dramas, TV M…
colSums(is.na(df))
## show_id type title director cast country
## 0 0 0 0 0 0
## date_added release_year rating duration listed_in
## 0 0 0 0 0
library(lubridate)
## Loading required package: timechange
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
df %>% filter(country=="United Kingdom") %>%
mutate(ymd=dmy(date_added)) %>% select(ymd) %>%
filter(ymd>='2018-01-01'&ymd<='2018-01-30')
## Warning: 15 failed to parse.
## ymd
## 1 2018-01-30
## 2 2018-01-18
## 3 2018-01-01
## 4 2018-01-15
## 5 2018-01-01
df %>% filter(country=="United Kingdom") %>% count(date_added)
## date_added n
## 1 1
## 2 April 4, 2017 1
## 3 December 1, 2018 1
## 4 December 15, 2017 1
## 5 December 15, 2018 1
## 6 December 2, 2017 1
## 7 February 1, 2019 1
## 8 January 1, 2018 1
## 9 July 26, 2019 1
## 10 March 16, 2016 1
## 11 March 31, 2017 1
## 12 March 31, 2018 2
## 13 October 1, 2019 2
## 14 September 1, 2019 1
## 15 01-Apr-17 2
## 16 01-Apr-18 2
## 17 01-Apr-19 1
## 18 01-Apr-20 3
## 19 01-Apr-21 2
## 20 01-Aug-16 6
## 21 01-Aug-17 11
## 22 01-Aug-18 1
## 23 01-Aug-19 4
## 24 01-Aug-20 1
## 25 01-Dec-17 2
## 26 01-Dec-18 2
## 27 01-Dec-19 1
## 28 01-Dec-20 2
## 29 01-Feb-18 2
## 30 01-Feb-19 11
## 31 01-Feb-20 1
## 32 01-Feb-21 1
## 33 01-Jan-18 2
## 34 01-Jan-19 2
## 35 01-Jan-20 2
## 36 01-Jan-21 1
## 37 01-Jul-17 1
## 38 01-Jul-20 1
## 39 01-Jul-21 1
## 40 01-Jun-16 1
## 41 01-Jun-17 4
## 42 01-Mar-17 7
## 43 01-Mar-18 2
## 44 01-Mar-19 2
## 45 01-May-17 1
## 46 01-May-18 2
## 47 01-May-19 3
## 48 01-May-21 1
## 49 01-Nov-16 1
## 50 01-Nov-18 1
## 51 01-Nov-19 3
## 52 01-Oct-16 1
## 53 01-Oct-17 4
## 54 01-Oct-18 2
## 55 01-Oct-19 1
## 56 01-Sep-16 5
## 57 01-Sep-17 1
## 58 01-Sep-19 1
## 59 01-Sep-20 1
## 60 02-Dec-20 1
## 61 02-Feb-19 1
## 62 02-Jan-19 1
## 63 02-Jun-17 1
## 64 02-Jun-21 3
## 65 02-Nov-16 1
## 66 02-Oct-18 10
## 67 02-Oct-19 3
## 68 03-Dec-18 1
## 69 03-Dec-19 1
## 70 03-Mar-20 1
## 71 03-May-19 1
## 72 03-Sep-20 1
## 73 04-Jan-20 1
## 74 04-Jun-21 1
## 75 04-Nov-17 1
## 76 04-Nov-19 1
## 77 04-Oct-19 1
## 78 05-Apr-19 2
## 79 05-Aug-21 1
## 80 05-Feb-19 1
## 81 05-Jan-19 1
## 82 05-Jun-19 1
## 83 05-Mar-20 1
## 84 05-Nov-19 1
## 85 05-Oct-18 1
## 86 05-Sep-20 1
## 87 06-Jul-21 5
## 88 06-Oct-20 1
## 89 06-Sep-16 1
## 90 06-Sep-17 1
## 91 07-Sep-21 1
## 92 08-Aug-21 1
## 93 08-Feb-19 1
## 94 08-Jan-19 1
## 95 08-Jan-21 1
## 96 08-May-18 1
## 97 08-Nov-19 1
## 98 09-Dec-16 1
## 99 09-Jan-20 1
## 100 09-Mar-17 1
## 101 09-Nov-16 1
## 102 09-Sep-16 1
## 103 10-Apr-18 1
## 104 10-Jan-20 2
## 105 10-Jul-18 1
## 106 10-Oct-15 1
## 107 10-Sep-19 2
## 108 11-Jan-19 1
## 109 11-Sep-17 1
## 110 11-Sep-20 1
## 111 12-Dec-17 1
## 112 12-Dec-19 1
## 113 12-Feb-21 1
## 114 12-Jan-21 1
## 115 12-Jul-19 4
## 116 12-Mar-21 1
## 117 12-May-17 1
## 118 13-Aug-17 1
## 119 13-Feb-21 1
## 120 13-Jul-17 1
## 121 13-Mar-18 1
## 122 13-May-21 1
## 123 13-Oct-17 1
## 124 13-Oct-20 1
## 125 14-Aug-20 1
## 126 14-Feb-17 1
## 127 14-May-17 1
## 128 14-Sep-21 1
## 129 15-Apr-17 1
## 130 15-Apr-20 1
## 131 15-Aug-16 3
## 132 15-Dec-16 2
## 133 15-Dec-18 2
## 134 15-Feb-19 1
## 135 15-Feb-21 1
## 136 15-Jan-18 1
## 137 15-Jun-15 1
## 138 15-Jun-17 1
## 139 15-Mar-17 1
## 140 15-Mar-19 2
## 141 15-Mar-21 1
## 142 15-May-16 1
## 143 15-May-17 1
## 144 15-May-19 2
## 145 15-May-20 2
## 146 15-May-21 1
## 147 15-Oct-18 1
## 148 15-Oct-20 1
## 149 15-Sep-17 1
## 150 15-Sep-18 2
## 151 15-Sep-20 2
## 152 15-Sep-21 1
## 153 16-Apr-19 1
## 154 16-Aug-19 1
## 155 16-Dec-16 1
## 156 16-Dec-20 1
## 157 16-Mar-17 1
## 158 16-May-18 1
## 159 16-May-19 1
## 160 16-Sep-16 1
## 161 17-Apr-19 1
## 162 17-Feb-21 1
## 163 17-May-19 1
## 164 17-Nov-16 1
## 165 17-Oct-20 1
## 166 17-Sep-17 1
## 167 17-Sep-19 1
## 168 17-Sep-20 1
## 169 17-Sep-21 1
## 170 18-Jan-18 1
## 171 18-Jan-19 1
## 172 18-Mar-16 2
## 173 18-Mar-20 1
## 174 18-May-20 1
## 175 18-Nov-16 1
## 176 18-Oct-19 3
## 177 18-Sep-17 1
## 178 19-Dec-17 1
## 179 19-Feb-19 1
## 180 19-Jan-19 1
## 181 19-Jul-20 1
## 182 19-Jun-19 1
## 183 19-Jun-21 1
## 184 19-Mar-21 1
## 185 19-Oct-18 1
## 186 20-Aug-19 2
## 187 20-Dec-18 1
## 188 20-Dec-20 1
## 189 20-Feb-18 1
## 190 20-Feb-21 1
## 191 20-Jul-18 1
## 192 20-Jun-16 1
## 193 20-Jun-17 1
## 194 20-Mar-20 2
## 195 20-May-20 1
## 196 21-Feb-19 1
## 197 21-Jul-17 1
## 198 21-Jul-20 1
## 199 21-Oct-19 1
## 200 21-Oct-20 1
## 201 22-Apr-15 1
## 202 22-Apr-17 2
## 203 22-Apr-20 1
## 204 22-Dec-18 2
## 205 22-Dec-20 2
## 206 22-Feb-17 2
## 207 22-Feb-18 1
## 208 22-Feb-19 2
## 209 22-Jul-19 1
## 210 22-Jun-18 1
## 211 22-Mar-19 1
## 212 22-May-17 1
## 213 22-May-20 1
## 214 22-Oct-20 1
## 215 23-Dec-17 1
## 216 23-Feb-17 2
## 217 23-Feb-21 1
## 218 23-Jul-21 1
## 219 23-Jun-18 2
## 220 23-Mar-18 1
## 221 23-Nov-18 1
## 222 23-Sep-20 1
## 223 24-Apr-20 1
## 224 24-Aug-18 1
## 225 24-Aug-21 1
## 226 24-Jun-20 1
## 227 24-Mar-18 1
## 228 24-Oct-17 2
## 229 24-Sep-21 1
## 230 25-Aug-16 1
## 231 25-Dec-17 1
## 232 25-Jan-19 1
## 233 25-May-20 1
## 234 25-Nov-20 1
## 235 25-Oct-17 1
## 236 25-Sep-17 1
## 237 26-Apr-20 1
## 238 26-Aug-20 1
## 239 26-Dec-17 1
## 240 26-Jan-19 1
## 241 26-Jul-18 2
## 242 26-Jul-19 6
## 243 26-Mar-21 2
## 244 26-May-16 1
## 245 27-Aug-19 1
## 246 27-Dec-17 1
## 247 27-Feb-18 1
## 248 27-Jan-17 1
## 249 27-Jun-19 1
## 250 27-Mar-18 1
## 251 27-Nov-17 1
## 252 28-Apr-16 1
## 253 28-Dec-18 1
## 254 28-Jul-17 1
## 255 28-Oct-19 1
## 256 29-Apr-20 1
## 257 29-Aug-19 1
## 258 29-Dec-17 1
## 259 29-Jan-20 1
## 260 29-Jan-21 1
## 261 29-Jun-18 2
## 262 29-Mar-19 1
## 263 29-Sep-17 1
## 264 30-Dec-20 1
## 265 30-Jan-18 1
## 266 30-Jan-20 1
## 267 30-Jul-21 1
## 268 30-Jun-16 1
## 269 30-Mar-18 1
## 270 30-Mar-19 1
## 271 30-Nov-17 1
## 272 30-Oct-20 1
## 273 30-Sep-17 1
## 274 30-Sep-19 1
## 275 31-Dec-17 1
## 276 31-Dec-18 7
## 277 31-Jan-19 1
## 278 31-Jul-20 1
## 279 31-Mar-17 12
## 280 31-Mar-18 2
## 281 31-Mar-21 1
## 282 31-May-17 1
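# The count above shows that date_added mixes "dd-Mon-yy" entries with
# "Month d, yyyy" entries, which explains the 15 dmy() parse failures.
# parse_date_time() with both candidate orders would handle the mixed formats
# (a sketch, not part of the original answer):
df %>%
  filter(country == "United Kingdom") %>%
  mutate(ymd = as.Date(parse_date_time(date_added, orders = c("dmy", "mdy")))) %>%
  filter(ymd >= "2018-01-01", ymd <= "2018-01-30") %>%
  nrow()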
# Chapter 3 expected exam questions (p. 247)
#1
library(dplyr)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(recipes)
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
x_test<-read.csv('X_test.csv',
fileEncoding = "euc-kr", na="")
x_train<-read.csv('X_train.csv',
fileEncoding = "euc-kr", na="")
y_train<-read.csv('y_train.csv',
fileEncoding = "euc-kr", na="")
x_train %>% glimpse
## Rows: 3,500
## Columns: 10
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ 총구매액 <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900…
## $ 최대구매액 <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,…
## $ 환불금액 <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,…
## $ 주구매상품 <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자…
## $ 주구매지점 <chr> "강남점", "잠실점", "관악점", "광주점", "본 점", "일산…
## $ 내점일수 <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152…
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666…
## $ 주말방문비율 <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000…
## $ 구매주기 <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, …
y_train %>% glimpse
## Rows: 3,500
## Columns: 2
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ gender <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,…
left_join(x_train,y_train,by='cust_id') %>% mutate(index='train')->train
train %>% glimpse
## Rows: 3,500
## Columns: 12
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ 총구매액 <dbl> 68282840, 2136000, 3197000, 16077620, 29050000, 1137900…
## $ 최대구매액 <int> 11264000, 2136000, 1639000, 4935000, 24000000, 9552000,…
## $ 환불금액 <int> 6860000, 300000, NA, NA, NA, 462000, 4582000, 29524000,…
## $ 주구매상품 <chr> "기타", "스포츠", "남성 캐주얼", "기타", "보석", "디자…
## $ 주구매지점 <chr> "강남점", "잠실점", "관악점", "광주점", "본 점", "일산…
## $ 내점일수 <int> 19, 2, 2, 18, 2, 3, 5, 63, 18, 1, 25, 3, 2, 27, 84, 152…
## $ 내점당구매건수 <dbl> 3.894737, 1.500000, 2.000000, 2.444444, 1.500000, 1.666…
## $ 주말방문비율 <dbl> 0.52702703, 0.00000000, 0.00000000, 0.31818182, 0.00000…
## $ 구매주기 <int> 17, 1, 1, 16, 85, 42, 42, 5, 15, 0, 13, 89, 16, 10, 4, …
## $ gender <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0…
## $ index <chr> "train", "train", "train", "train", "train", "train", "…
x_test %>% mutate(index='test')->test
test %>% glimpse
## Rows: 2,482
## Columns: 11
## $ cust_id <int> 3500, 3501, 3502, 3503, 3504, 3505, 3506, 3507, 3508, 3…
## $ 총구매액 <dbl> 70900400, 310533100, 305264140, 7594080, 1795790, 13000…
## $ 최대구매액 <int> 22000000, 38558000, 14825000, 5225000, 1411200, 2160000…
## $ 환불금액 <int> 4050000, 48034700, 30521000, NA, NA, NA, 39566000, NA, …
## $ 주구매상품 <chr> "골프", "농산물", "가공식품", "주방용품", "수산품", "화…
## $ 주구매지점 <chr> "부산본점", "잠실점", "본 점", "부산본점", "청량리점",…
## $ 내점일수 <int> 13, 90, 101, 5, 3, 5, 144, 1, 1, 28, 21, 3, 23, 30, 3, …
## $ 내점당구매건수 <dbl> 1.461538, 2.433333, 14.623762, 2.000000, 2.666667, 2.20…
## $ 주말방문비율 <dbl> 0.78947368, 0.36986301, 0.08327691, 0.00000000, 0.12500…
## $ 구매주기 <int> 26, 3, 3, 47, 8, 61, 2, 0, 0, 12, 14, 2, 15, 11, 112, 2…
## $ index <chr> "test", "test", "test", "test", "test", "test", "test",…
bind_rows(train,test)->full
full$gender<-ifelse(full$gender==0,"남성","여성")
full$gender<-as.factor(full$gender)
full$index<-as.factor(full$index)
names(full)
## [1] "cust_id" "총구매액" "최대구매액" "환불금액"
## [5] "주구매상품" "주구매지점" "내점일수" "내점당구매건수"
## [9] "주말방문비율" "구매주기" "gender" "index"
data<-full %>% rename(total="총구매액",
max="최대구매액",
refund="환불금액",
product="주구매상품",
store="주구매지점",
day="내점일수",
count="내점당구매건수",
week="주말방문비율",
cycle="구매주기") %>%
select(cust_id,index,gender,total,max,refund,product,store,day,count,week,cycle)
colSums(is.na(data))
## cust_id index gender total max refund product store day count
## 0 0 2482 0 0 3906 0 0 0 0
## week cycle
## 0 0
data$refund<-ifelse(is.na(data$refund),0,data$refund)
colSums(is.na(data))
## cust_id index gender total max refund product store day count
## 0 0 2482 0 0 0 0 0 0 0
## week cycle
## 0 0
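# An equivalent imputation using tidyr (a sketch; tidyr is assumed to be
# installed, and re-running it is harmless because refund is already filled):
data <- tidyr::replace_na(data, list(refund = 0))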
library(recipes)
recipe(gender~.,data=data) %>%
step_YeoJohnson(total,max,refund,day,count,week,cycle) %>%
step_scale(total,max,refund,day,count,week,cycle) %>%
step_center(total,max,refund,day,count,week,cycle) %>%
prep() %>% juice()->data1
data1 %>% glimpse
## Rows: 5,982
## Columns: 12
## $ cust_id <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ index <fct> train, train, train, train, train, train, train, train, train,…
## $ total <dbl> -0.1109616, -0.5964776, -0.5864340, -0.4794213, -0.3820895, -0…
## $ max <dbl> -0.25879992, -0.58005471, -0.59931645, -0.47681619, 0.15253819…
## $ refund <dbl> 1.3776676, 1.2130535, -0.7281455, -0.7281455, -0.7281455, 1.23…
## $ product <fct> 기타, 스포츠, 남성 캐주얼, 기타, 보석, 디자이너, 시티웨어, 명…
## $ store <fct> 강남점, 잠실점, 관악점, 광주점, 본 점, 일산점, 강남점, 본 점…
## $ day <dbl> 0.6267964, -0.9872986, -0.9872986, 0.5877041, -0.9872986, -0.7…
## $ count <dbl> 0.92059492, -0.89611526, -0.32407144, 0.06726813, -0.89611526,…
## $ week <dbl> 0.96145636, -1.31805060, -1.31805060, 0.32838074, -1.31805060,…
## $ cycle <dbl> 0.28905563, -1.19528222, -1.19528222, 0.24219137, 1.78728765, …
## $ gender <fct> 남성, 남성, 여성, 여성, 남성, 남성, 남성, 남성, 남성, 여성, 남…
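# Quick sanity check that the Yeo-Johnson / scale / center steps produced
# columns with mean ~ 0 and sd ~ 1 (a sketch, not part of the original solution):
data1 %>% summarise(across(c(total, max, day, cycle), list(mean = mean, sd = sd)))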
data1 %>% filter(index=="train") %>% select(-index)->train
data1 %>% filter(index=="test") %>% select(-index)->test
library(caret)
ctrl<-trainControl(method='cv',number=10,
summaryFunction = twoClassSummary,
classProbs = TRUE)
train(gender~.,data=train,
method='rpart',
metric="ROC",
trControl=ctrl)->rffit
train(gender~.,data=train,
method='glm',family=binomial,
metric="ROC",
trControl=ctrl)->rffit1
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
rffit
## CART
##
## 3500 samples
## 10 predictor
## 2 classes: '남성', '여성'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 3149, 3150, 3150, 3150, 3150, 3150, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.005319149 0.6194238 0.8452704 0.2918517
## 0.006838906 0.6183363 0.8122806 0.3358894
## 0.007598784 0.6183363 0.8122806 0.3358894
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.005319149.
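# Both caret models were fit with the same 10-fold CV setup, so their resampled
# ROC values can be put side by side (a sketch; for a strict paired comparison
# the same fold indices should be supplied via trainControl(index = ...)):
summary(resamples(list(CART = rffit, GLM = rffit1)))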
predict(rffit,test,type="prob")->pred_fit1
head(pred_fit1)
## 남성 여성
## 1 0.7364290 0.2635710
## 2 0.7364290 0.2635710
## 3 0.7364290 0.2635710
## 4 0.4471698 0.5528302
## 5 0.4471698 0.5528302
## 6 0.6927711 0.3072289
predict(rffit,test,type="raw")->pred_fit2
head(pred_fit2)
## [1] 남성 남성 남성 여성 여성 남성
## Levels: 남성 여성
head(pred_fit1)
## 남성 여성
## 1 0.7364290 0.2635710
## 2 0.7364290 0.2635710
## 3 0.7364290 0.2635710
## 4 0.4471698 0.5528302
## 5 0.4471698 0.5528302
## 6 0.6927711 0.3072289
names(pred_fit1)[1]<-"gender"
head(pred_fit1)
## gender 여성
## 1 0.7364290 0.2635710
## 2 0.7364290 0.2635710
## 3 0.7364290 0.2635710
## 4 0.4471698 0.5528302
## 5 0.4471698 0.5528302
## 6 0.6927711 0.3072289
bind_cols(x_test,pred_fit1) %>% select(cust_id,gender)->df
head(df)
## cust_id gender
## 1 3500 0.7364290
## 2 3501 0.7364290
## 3 3502 0.7364290
## 4 3503 0.4471698
## 5 3504 0.4471698
## 6 3505 0.6927711
write.csv(df,"2022.csv",row.names=FALSE)
read.csv("2022.csv") %>% head
## cust_id gender
## 1 3500 0.7364290
## 2 3501 0.7364290
## 3 3502 0.7364290
## 4 3503 0.4471698
## 5 3504 0.4471698
## 6 3505 0.6927711
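# pROC is loaded above but never used; a hold-out AUC for the CART model could
# be estimated like this (a sketch on the preprocessed training data, not part
# of the original solution):
set.seed(123)
idx <- createDataPartition(train$gender, p = 0.7, list = FALSE)[, 1]
tr_part <- train[idx, ]
va_part <- train[-idx, ]
fit_t <- train(gender ~ ., data = tr_part,
               method = "rpart", metric = "ROC", trControl = ctrl)
prob_v <- predict(fit_t, va_part, type = "prob")
roc(va_part$gender, prob_v[["여성"]])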
#2 p259
library(dplyr)
library(caret)
library(recipes)
library(pROC)
df<-read.csv("travel_data.csv")
set.seed(1357)
train_list<-createDataPartition(y=df$TravelInsurance,p=0.75,list=FALSE)
df_train<-df[train_list,]
df_test<-df[-train_list,]
NROW(df_train)
## [1] 1491
NROW(df_test)
## [1] 496
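# Only the train/test split is shown for this exercise; a minimal sketch of how
# it might continue (assumed: TravelInsurance is a 0/1 target and the remaining
# columns are usable as predictors):
df_train$TravelInsurance <- as.factor(df_train$TravelInsurance)
df_test$TravelInsurance <- as.factor(df_test$TravelInsurance)
fit_glm <- glm(TravelInsurance ~ ., data = df_train, family = binomial)
prob_glm <- predict(fit_glm, newdata = df_test, type = "response")
roc(df_test$TravelInsurance, prob_glm)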
#3 p270
train<-read.csv("insurance_train_10.csv")
test<-read.csv("insurance_test_10.csv")
train %>% glimpse
## Rows: 6,969
## Columns: 9
## $ Gender <chr> "Male", "Female", "Male", "Male", "Male", "Female", "F…
## $ Ever_Married <chr> "No", "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", "…
## $ Age <int> 22, 67, 67, 56, 32, 33, 61, 55, 26, 19, 58, 41, 32, 31…
## $ Graduated <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "Yes", …
## $ Profession <chr> "Healthcare", "Engineer", "Lawyer", "Artist", "Healthc…
## $ Work_Experience <int> 1, 1, 0, 0, 1, 1, 0, 1, 1, 4, 0, 1, 9, 1, 1, 0, 12, 3,…
## $ Spending_Score <chr> "Low", "Low", "High", "Average", "Low", "Low", "Low", …
## $ Family_Size <int> 4, 1, 2, 2, 3, 3, 3, 4, 3, 4, 1, 2, 5, 6, 4, 1, 1, 4, …
## $ Segmentation <int> 4, 2, 2, 3, 3, 4, 4, 3, 1, 4, 2, 3, 4, 2, 2, 3, 1, 4, …
colSums(is.na(train))
## Gender Ever_Married Age Graduated Profession
## 0 0 0 0 0
## Work_Experience Spending_Score Family_Size Segmentation
## 0 0 0 0
train$Segmentation<-as.factor(train$Segmentation)
library(caret)
ctrl<-trainControl(method="cv",number=10)
train(Segmentation~.,data=train,
method='knn',trControl=ctrl,
preProcess=c("center","scale"))->knn_fit
knn_fit
## k-Nearest Neighbors
##
## 6969 samples
## 8 predictor
## 4 classes: '1', '2', '3', '4'
##
## Pre-processing: centered (19), scaled (19)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 6273, 6272, 6273, 6272, 6273, 6271, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.4858711 0.3132567
## 7 0.4898932 0.3184834
## 9 0.4904680 0.3190329
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
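# caret's default grid only tries k = 5, 7, 9; a wider search could be run like
# this (a sketch, not part of the original solution):
train(Segmentation ~ ., data = train,
      method = "knn", trControl = ctrl,
      preProcess = c("center", "scale"),
      tuneGrid = expand.grid(k = seq(5, 31, by = 2))) -> knn_fit_wide
knn_fit_wide$bestTune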
confusionMatrix(knn_fit)
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction 1 2 3 4
## 1 9.5 5.6 2.6 5.5
## 2 5.6 7.5 5.5 1.9
## 3 3.9 7.7 14.1 1.4
## 4 5.4 2.7 3.3 18.0
##
## Accuracy (average) : 0.4905
predict(knn_fit,test)->pred_fit
head(pred_fit)
## [1] 2 1 1 3 3 1
## Levels: 1 2 3 4
NROW(pred_fit)
## [1] 2267
test %>% glimpse
## Rows: 2,267
## Columns: 9
## $ X <int> 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ Gender <chr> "Female", "Male", "Female", "Male", "Male", "Male", "F…
## $ Ever_Married <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes"…
## $ Age <int> 36, 37, 69, 59, 47, 61, 47, 50, 19, 22, 22, 50, 27, 18…
## $ Graduated <chr> "Yes", "Yes", "No", "No", "Yes", "Yes", "Yes", "Yes", …
## $ Profession <chr> "Engineer", "Healthcare", "", "Executive", "Doctor", "…
## $ Work_Experience <int> 0, 8, 0, 11, 0, 5, 1, 2, 0, 0, 0, 1, 8, 0, 0, 1, 1, 8,…
## $ Spending_Score <chr> "Low", "Average", "Low", "High", "High", "Low", "Avera…
## $ Family_Size <int> 1, 4, 1, 2, 5, 3, 3, 4, 4, 3, 6, 5, 3, 3, 1, 3, 2, 1, …
bind_cols(test,pred_fit)->df
## New names:
## • `` -> `...10`
names(df)[10]<-"Segmentation_pred"
df %>% select(10)->df1
write.csv(df1,"2022.csv",row.names = FALSE)
set.seed(12345)
IDX<-createDataPartition(train$Segmentation,p=0.7,list=FALSE)
train_t<-train[IDX,]
test_v<-train[-IDX,]
train_t$Segmentation<-as.factor(train_t$Segmentation)
test_v$Segmentation<-as.factor(test_v$Segmentation)
ctrl<-trainControl(method="cv",number=10)
train(Segmentation~.,data=train_t,
method='knn',trControl=ctrl,
preProcess=c("center","scale"))->knn_fit1
predict(knn_fit1,newdata=test_v)->test_pred
confusionMatrix(test_pred,test_v$Segmentation,mode="prec_recall")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4
## 1 207 114 62 115
## 2 120 152 123 53
## 3 71 163 284 33
## 4 109 60 64 358
##
## Overall Statistics
##
## Accuracy : 0.4794
## 95% CI : (0.4578, 0.5011)
## No Information Rate : 0.2677
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3047
##
## Mcnemar's Test P-Value : 0.009816
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4
## Precision 0.41566 0.3393 0.5154 0.6058
## Recall 0.40828 0.3108 0.5328 0.6404
## F1 0.41194 0.3244 0.5240 0.6226
## Prevalence 0.24282 0.2342 0.2553 0.2677
## Detection Rate 0.09914 0.0728 0.1360 0.1715
## Detection Prevalence 0.23851 0.2146 0.2639 0.2830
## Balanced Accuracy 0.61211 0.5629 0.6806 0.7440
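# A macro-averaged F1 across the four classes can be pulled straight from the
# confusionMatrix object (a sketch, not part of the original solution):
cm <- confusionMatrix(test_pred, test_v$Segmentation, mode = "prec_recall")
mean(cm$byClass[, "F1"], na.rm = TRUE)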