data <- read.csv("C:/Users/gyeon/OneDrive/바탕 화면/data/car.csv", header = TRUE, fileEncoding="UTF-8-BOM")
library(caret)
## 필요한 패키지를 로딩중입니다: ggplot2
## 필요한 패키지를 로딩중입니다: lattice
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## 다음의 패키지를 부착합니다: 'plyr'
##
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:purrr':
##
## compact
library(dplyr)
library(arules)
## 필요한 패키지를 로딩중입니다: Matrix
##
## 다음의 패키지를 부착합니다: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## 다음의 패키지를 부착합니다: 'arules'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
# 필요한 칼럼만 선택(소비선택습관, 자동차고려성향, 보유차량유형, 연료유형, 신규차량구매여부, 성별, 연령, 가구소득)
data <- data %>%
select(SPND_CHOICE_HABIT_ONE_TY,
SPND_CHOICE_HABIT_TWO_TY,
SPND_CHOICE_HABIT_THREE_TY,
SPND_CHOICE_HABIT_EIGHT_TY,
CAR_CNSDR_INCLN_ONE_TY,
CAR_CNSDR_INCLN_TWO_TY,
CAR_CNSDR_INCLN_THREE_TY,
CAR_CNSDR_INCLN_FOUR_TY,
CAR_CNSDR_INCLN_FIVE_TY,
HOLD_VHCLE_FUEL_TY,
NEW_VHCLE_PRCHS_AT_VALUE,
SEXDSTN_FLAG_NM,
AGRDE_FLAG_NM,
HSHLD_INCOME_DGREE_NM)
# 모든 공백 문자열과 특정 값을 0으로 변환
data[data == "" | data == " " | data == "모름"] <- 0
sum(is.na(data)) # 523 -> 보유 차량 유형에서 응답되지 않은 값이 매핑 과정 거치면서 NA값이 된 것으로 예상
## [1] 0
cleaned_data <- na.omit(data) # NA값을 제거
sum(is.na(cleaned_data))
## [1] 0
table(cleaned_data$HOLD_VHCLE_FUEL_TY) # 연료 유형별 분포 확인
##
## 0 LPG
## 523 1214
## 경유(디젤) 수소(연료전지)차
## 6446 86
## 전기차 (100% 전기만 충전/사용) 하이브리드 (연료+배터리)
## 745 2147
## 휘발유
## 11995
cleaned_data <- cleaned_data %>%
filter(!(HOLD_VHCLE_FUEL_TY == "수소(연료전지)차")) # 차량 유형이 기타이거나 연료 유형이 수소인 행 제거
# 결과 확인 (연료 종류)
table(cleaned_data$HOLD_VHCLE_FUEL_TY)
##
## 0 LPG
## 523 1214
## 경유(디젤) 전기차 (100% 전기만 충전/사용)
## 6446 745
## 하이브리드 (연료+배터리) 휘발유
## 2147 11995
# "0"을 NA로 변경
cleaned_data[cleaned_data == "0"] <- NA
# NA가 포함된 행 삭제
cleaned_data <- na.omit(cleaned_data)
# 데이터 구조 확인용 코드드
#View(cleaned_data)
#lapply(cleaned_data, unique)
##############################################################################################
# 원하는 컬럼만 선택
cleaned_data <- cleaned_data %>%
select(HSHLD_INCOME_DGREE_NM,
SPND_CHOICE_HABIT_ONE_TY,
SPND_CHOICE_HABIT_TWO_TY,
SPND_CHOICE_HABIT_THREE_TY,
SPND_CHOICE_HABIT_EIGHT_TY,
CAR_CNSDR_INCLN_TWO_TY,
CAR_CNSDR_INCLN_FOUR_TY,
CAR_CNSDR_INCLN_FIVE_TY,
AGRDE_FLAG_NM,
HOLD_VHCLE_FUEL_TY)
spending_mapping <- c(
"유명 상표의 제품을 주로 산다" = '1',
"새로운 상품이 나오면 먼저 사는 편이다" = '2',
"꼭 필요한 물건만 산다" = '3',
"디자인이 좋지 않으면 사지 않는다" = '4',
"제품을 선택할 때 품질을 제일 중요하게 고려한다" = '5',
"새로운 패션이나 유행을 비교적 빨리 받아 들인다" = '6',
"가격이 비싸더라도 되도록 신기술이 적용된 제품을 산다" = '7',
"상품을 구입하기 전에 관련 정보를 많이 알아 본다" = '8'
)
car_consideration_mapping_two <- c(
"나의 사회적 지위를 표현하는 상징물이다" = '9',
"나의 사회적 지위를 표현하는 상징물은 아니다" = '10'
)
car_consideration_mapping_four <- c(
"생활의 편리성을 주는 교통수단일 뿐이다" = '11',
"생활의 편리성을 주는 교통수단일 뿐은 아니다" = '12'
)
car_consideration_mapping_five <- c(
"반드시 내 소유의 차량이어야 한다" = '13',
"반드시 내 소유의 차량이어야 할 필요는 없다" = '14'
)
car_hold_vehicle_fuel <- c(
"경유(디젤)" = '15',
"하이브리드 (연료+배터리)" = '16',
"휘발유" = '17',
"LPG" = '18',
"전기차 (100% 전기만 충전/사용)" = '19'
)
agrde_falg_nm <- c(
"20대" = '20',
"30대" = '21',
"40대" = '22',
"50대" = '23',
"60대 이상" = '24'
)
hshld_income_dgree_nm <- c(
"100만원 이하" = '25',
"200만원 내외" = '26',
"300만원 내외" = '27',
"400만원 내외" = '28',
"500만원 내외" = '29',
"600만원 내외" = '30',
"700만원 내외" = '31',
"800만원 내외" = '32',
"900만원 내외" = '33',
"1,000만원 내외" = '34',
"1,100~1,500만원 내외" = '35',
"1,600~2,000만원 내외" = '36',
"2,100만원 이상" = '37'
)
# 라벨 인코딩 매핑 적용
cleaned_data <- cleaned_data %>%
mutate(
SPND_CHOICE_HABIT_ONE_TY = spending_mapping[SPND_CHOICE_HABIT_ONE_TY],
SPND_CHOICE_HABIT_TWO_TY = spending_mapping[SPND_CHOICE_HABIT_TWO_TY],
SPND_CHOICE_HABIT_THREE_TY = spending_mapping[SPND_CHOICE_HABIT_THREE_TY],
SPND_CHOICE_HABIT_EIGHT_TY = spending_mapping[SPND_CHOICE_HABIT_EIGHT_TY],
CAR_CNSDR_INCLN_TWO_TY = car_consideration_mapping_two[CAR_CNSDR_INCLN_TWO_TY],
CAR_CNSDR_INCLN_FOUR_TY = car_consideration_mapping_four[CAR_CNSDR_INCLN_FOUR_TY],
CAR_CNSDR_INCLN_FIVE_TY = car_consideration_mapping_five[CAR_CNSDR_INCLN_FIVE_TY],
HOLD_VHCLE_FUEL_TY = car_hold_vehicle_fuel[HOLD_VHCLE_FUEL_TY],
AGRDE_FLAG_NM = agrde_falg_nm[AGRDE_FLAG_NM],
HSHLD_INCOME_DGREE_NM = hshld_income_dgree_nm[HSHLD_INCOME_DGREE_NM]
)
# 모든 열을 결합하도록 columns_to_combine 정의
columns_to_combine <- names(cleaned_data)
# 행별로 데이터를 결합하여 새로운 열 생성
cleaned_data$Grouped_Info_rererenew <- apply(cleaned_data[columns_to_combine], 1, function(row) paste(row, collapse = ","))
head(cleaned_data)
## HSHLD_INCOME_DGREE_NM SPND_CHOICE_HABIT_ONE_TY SPND_CHOICE_HABIT_TWO_TY
## 63 30 6 4
## 75 35 4 5
## 79 29 4 7
## 83 35 2 5
## 85 36 4 8
## 138 29 8 7
## SPND_CHOICE_HABIT_THREE_TY SPND_CHOICE_HABIT_EIGHT_TY
## 63 8 1
## 75 3 2
## 79 6 3
## 83 1 4
## 85 3 5
## 138 5 1
## CAR_CNSDR_INCLN_TWO_TY CAR_CNSDR_INCLN_FOUR_TY CAR_CNSDR_INCLN_FIVE_TY
## 63 9 11 13
## 75 9 11 13
## 79 9 11 13
## 83 9 12 13
## 85 9 12 14
## 138 9 11 13
## AGRDE_FLAG_NM HOLD_VHCLE_FUEL_TY Grouped_Info_rererenew
## 63 22 15 30,6,4,8,1,9,11,13,22,15
## 75 22 16 35,4,5,3,2,9,11,13,22,16
## 79 22 17 29,4,7,6,3,9,11,13,22,17
## 83 21 17 35,2,5,1,4,9,12,13,21,17
## 85 23 17 36,4,8,3,5,9,12,14,23,17
## 138 22 18 29,8,7,5,1,9,11,13,22,18
# 데이터를 CSV 파일로 저장
write.csv(cleaned_data[c("Grouped_Info_rererenew")],
"Grouped_Info_rererenew.csv",
row.names = FALSE,
quote = FALSE)
##############################################################################################
# CSV 파일 경로 (본인 로컬에 맞게 수정하세요~)
file_path <- "C:/Users/gyeon/OneDrive/Documents/Grouped_Info_rererenew.csv"
# 파일의 첫 5줄 확인
print(readLines("C:/Users/gyeon/OneDrive/Documents/Grouped_Info_rererenew.csv", n = 5))
## [1] "Grouped_Info_rererenew" "30,6,4,8,1,9,11,13,22,15"
## [3] "35,4,5,3,2,9,11,13,22,16" "29,4,7,6,3,9,11,13,22,17"
## [5] "35,2,5,1,4,9,12,13,21,17"
# 거래 데이터 로드
txn <- read.transactions(
file = file_path,
rm.duplicates = TRUE,
format = "basket",
sep = ",",
cols = NULL
)
# 데이터 확인
summary(txn)
## transactions as itemMatrix in sparse format with
## 364 rows (elements/itemsets/transactions) and
## 38 columns (items) and a density of 0.2625072
##
## most frequent items:
## 13 9 11 1 8 (Other)
## 263 237 223 199 198 2511
##
## element (itemset/transaction) length distribution:
## sizes
## 1 10
## 1 363
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 10.000 10.000 9.975 10.000 10.000
##
## includes extended item information - examples:
## labels
## 1 1
## 2 10
## 3 11
inspect(head(txn, 5)) # 상위 5개의 거래 보기
## items
## [1] {Grouped_Info_rererenew}
## [2] {1, 11, 13, 15, 22, 30, 4, 6, 8, 9}
## [3] {11, 13, 16, 2, 22, 3, 35, 4, 5, 9}
## [4] {11, 13, 17, 22, 29, 3, 4, 6, 7, 9}
## [5] {1, 12, 13, 17, 2, 21, 35, 4, 5, 9}
# 연관 규칙 생성
rules <- apriori(txn, parameter = list(supp = 0.01, conf = 0.5)) # 최소 지지도와 신뢰도 설정
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 3
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[38 item(s), 364 transaction(s)] done [0.00s].
## sorting and recoding items ... [36 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 done [0.01s].
## writing ... [45270 rule(s)] done [0.01s].
## creating S4 object ... done [0.01s].
# 생성된 규칙 확인
summary(rules)
## set of 45270 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4 5 6 7 8
## 8 253 2753 11325 17595 10986 2262 88
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 5.000 4.959 6.000 8.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01099 Min. :0.5000 Min. :0.01099 Min. : 0.692
## 1st Qu.:0.01099 1st Qu.:0.5714 1st Qu.:0.01648 1st Qu.: 1.051
## Median :0.01374 Median :0.6667 Median :0.02198 Median : 1.226
## Mean :0.02233 Mean :0.6928 Mean :0.03454 Mean : 1.313
## 3rd Qu.:0.02198 3rd Qu.:0.8000 3rd Qu.:0.03571 3rd Qu.: 1.444
## Max. :0.72253 Max. :1.0000 Max. :1.00000 Max. :15.826
## count
## Min. : 4.000
## 1st Qu.: 4.000
## Median : 5.000
## Mean : 8.129
## 3rd Qu.: 8.000
## Max. :263.000
##
## mining info:
## data ntransactions support confidence
## txn 364 0.01 0.5
## call
## apriori(data = txn, parameter = list(supp = 0.01, conf = 0.5))
# 여기부터 번호가 아닌 한글 시각화를 위한 역맵핑
# 역 매핑 생성
reverse_mapping <- c(
spending_mapping,
car_consideration_mapping_two,
car_consideration_mapping_four,
car_consideration_mapping_five,
car_hold_vehicle_fuel,
agrde_falg_nm,
hshld_income_dgree_nm
)
# 역 매핑: 숫자 -> 한글
reverse_mapping <- setNames(names(reverse_mapping), unlist(reverse_mapping))
# 규칙 항목 레이블 역매핑 함수
apply_reverse_mapping <- function(rules, mapping) {
item_labels <- itemLabels(rules)
translated_labels <- sapply(item_labels, function(item) mapping[item])
itemLabels(rules) <- translated_labels
return(rules)
}
# 시각화 함수
plot_with_labels <- function(rules, mapping, max_rules = 100) {
# 규칙 레이블을 역 매핑하여 한글로 변환
item_labels <- itemLabels(rules)
translated_labels <- sapply(item_labels, function(item) mapping[item])
itemLabels(rules) <- translated_labels
# 시각화 (최대 규칙 개수 설정)
plot(rules, method = "graph", engine = "htmlwidget", control = list(max = max_rules))
}
# 전체 시각화 실행
# 전부 다 하면 프로그램 터져서 대략 1000개만, 알아서 리프트 값 높은 순으로 해줌
plot_with_labels(rules, reverse_mapping, max_rules = 1000)
## Warning: Too many rules supplied. Only plotting the best 1000 using 'lift'
## (change control parameter max if needed).
# 규칙 요약 정보 확인
summary(rules)
## set of 45270 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4 5 6 7 8
## 8 253 2753 11325 17595 10986 2262 88
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 5.000 4.959 6.000 8.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01099 Min. :0.5000 Min. :0.01099 Min. : 0.692
## 1st Qu.:0.01099 1st Qu.:0.5714 1st Qu.:0.01648 1st Qu.: 1.051
## Median :0.01374 Median :0.6667 Median :0.02198 Median : 1.226
## Mean :0.02233 Mean :0.6928 Mean :0.03454 Mean : 1.313
## 3rd Qu.:0.02198 3rd Qu.:0.8000 3rd Qu.:0.03571 3rd Qu.: 1.444
## Max. :0.72253 Max. :1.0000 Max. :1.00000 Max. :15.826
## count
## Min. : 4.000
## 1st Qu.: 4.000
## Median : 5.000
## Mean : 8.129
## 3rd Qu.: 8.000
## Max. :263.000
##
## mining info:
## data ntransactions support confidence
## txn 364 0.01 0.5
## call
## apriori(data = txn, parameter = list(supp = 0.01, conf = 0.5))
# 1. 네트워크 그래프 방식
# lhs에 16또는 19이 포함된 규칙 필터링 및 시각화
lhs_related_rules <- subset(rules, lhs %in% c("16", "19"))
lhs_related_rules_translated <- apply_reverse_mapping(lhs_related_rules, reverse_mapping)
plot(lhs_related_rules_translated, method = "graph", engine = "htmlwidget")
## Warning: Too many rules supplied. Only plotting the best 100 using 'lift'
## (change control parameter max if needed).
# rhs에 16 또는 19이 포함된 규칙 필터링 및 시각화화
rhs_related_rules <- subset(rules, rhs %in% c("16", "19"))
rhs_related_rules_translated <- apply_reverse_mapping(rhs_related_rules, reverse_mapping)
plot(rhs_related_rules_translated, method = "graph", engine = "htmlwidget")
# 2. 병렬좌표 방식식
# RHS에 하이브리드(16) 포함된 규칙 필터링 및 시각화
rhs_related_rules <- subset(rules, rhs %in% c("16"))
rhs_related_rules_translated <- apply_reverse_mapping(rhs_related_rules, reverse_mapping)
plot(rhs_related_rules_translated, method = "paracoord", control = list(alpha = 0.8))

# RHS에 전기차(19) 포함된 규칙 필터링 및 시각회회
rhs_related_rules <- subset(rules, rhs %in% c("19"))
rhs_related_rules_translated <- apply_reverse_mapping(rhs_related_rules, reverse_mapping)
plot(rhs_related_rules_translated, method = "paracoord", control = list(alpha = 0.8))
