library(tidyverse)
Tmall_train<-read.table(file="天猫_Train.txt",header=TRUE,sep=",")
Tmall_train %>% tibble()
## # A tibble: 860 × 9
## Id brandN buyBrandN actDateN buyDateN hitN buyN storeN cartN
## <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 42000 106 3 47 3 191 3 0 0
## 2 154500 125 4 56 5 323 7 0 0
## 3 200250 43 5 31 5 173 11 1 0
## 4 239250 62 7 37 6 112 7 0 0
## 5 299750 106 2 52 2 138 2 6 0
## 6 341000 94 2 50 2 156 2 1 0
## 7 393250 43 3 22 3 127 3 1 0
## 8 417250 34 3 15 3 78 4 0 0
## 9 472000 29 2 31 2 143 2 0 0
## 10 563500 139 3 47 3 240 3 20 0
## # … with 850 more rows
## # ℹ Use `print(n = ...)` to see more rows
Tmall_test<-read.table(file="天猫_Test.txt",header=TRUE,sep=",")
Tmall_test %>% tibble()
## # A tibble: 831 × 9
## Id brandN buyBrandN actDateN buyDateN hitN buyN storeN cartN
## <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 42000 46 0 17 0 64 0 0 0
## 2 69500 127 3 26 2 361 3 0 0
## 3 154500 63 3 28 4 180 7 0 0
## 4 200250 13 1 10 1 25 1 1 0
## 5 239250 23 2 13 2 43 3 0 0
## 6 267250 11 0 6 0 27 0 0 0
## 7 418500 1 1 1 1 0 1 0 0
## 8 439000 39 1 13 2 94 2 0 0
## 9 448250 8 1 6 1 20 3 0 0
## 10 563500 39 2 15 1 52 2 2 0
## # … with 821 more rows
## # ℹ Use `print(n = ...)` to see more rows
依据原始数据计算各种转换率 \[BuyHit=buyN/hitN,活动有效度=购买量/点击量 \] \[BuyBBrand=buyBrandN/brandN , 成交有效度=成交品牌量/总品牌量 \] \[ActDNTotalDN=actDateN/days , 活跃度=活动天数/研究周期天数\] \[BuyDNactDN=buyDateN/actDateN , 消费活跃度=购买天数/活动天数 \] \[BuyOrNot ,是否有成交 \]
days<-92
Tmall_train<-Tmall_train %>%
mutate(BuyHit=ifelse(hitN!=0,round(buyN/hitN*100,2),NA),
BuyBBrand=ifelse(brandN!=0,round(buyBrandN/brandN*100,2),NA) ,
ActDNTotalDN=round(actDateN/days*100,2),
BuyDNactDN=ifelse(actDateN!=0,round(buyDateN/actDateN*100,2),NA),
BuyOrNot=sapply(buyN,FUN=function(x) ifelse(x!=0,"1","0")))
tibble(Tmall_train)
## # A tibble: 860 × 14
## Id brandN buyBr…¹ actDa…² buyDa…³ hitN buyN storeN cartN BuyHit BuyBB…⁴
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <dbl> <dbl>
## 1 42000 106 3 47 3 191 3 0 0 1.57 2.83
## 2 154500 125 4 56 5 323 7 0 0 2.17 3.2
## 3 200250 43 5 31 5 173 11 1 0 6.36 11.6
## 4 239250 62 7 37 6 112 7 0 0 6.25 11.3
## 5 299750 106 2 52 2 138 2 6 0 1.45 1.89
## 6 341000 94 2 50 2 156 2 1 0 1.28 2.13
## 7 393250 43 3 22 3 127 3 1 0 2.36 6.98
## 8 417250 34 3 15 3 78 4 0 0 5.13 8.82
## 9 472000 29 2 31 2 143 2 0 0 1.4 6.9
## 10 563500 139 3 47 3 240 3 20 0 1.25 2.16
## # … with 850 more rows, 3 more variables: ActDNTotalDN <dbl>, BuyDNactDN <dbl>,
## # BuyOrNot <chr>, and abbreviated variable names ¹buyBrandN, ²actDateN,
## # ³buyDateN, ⁴BuyBBrand
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
days<-31
Tmall_test<-Tmall_test %>%
mutate(BuyHit=ifelse(hitN!=0,round(buyN/hitN*100,2),NA),
BuyBBrand=ifelse(brandN!=0,round(buyBrandN/brandN*100,2),NA) ,
ActDNTotalDN=round(actDateN/days*100,2),
BuyDNactDN=ifelse(actDateN!=0,round(buyDateN/actDateN*100,2),NA),
BuyOrNot=sapply(buyN,FUN=function(x) ifelse(x!=0,"1","0")))
tibble(Tmall_test)
## # A tibble: 831 × 14
## Id brandN buyBr…¹ actDa…² buyDa…³ hitN buyN storeN cartN BuyHit BuyBB…⁴
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <dbl> <dbl>
## 1 42000 46 0 17 0 64 0 0 0 0 0
## 2 69500 127 3 26 2 361 3 0 0 0.83 2.36
## 3 154500 63 3 28 4 180 7 0 0 3.89 4.76
## 4 200250 13 1 10 1 25 1 1 0 4 7.69
## 5 239250 23 2 13 2 43 3 0 0 6.98 8.7
## 6 267250 11 0 6 0 27 0 0 0 0 0
## 7 418500 1 1 1 1 0 1 0 0 NA 100
## 8 439000 39 1 13 2 94 2 0 0 2.13 2.56
## 9 448250 8 1 6 1 20 3 0 0 15 12.5
## 10 563500 39 2 15 1 52 2 2 0 3.85 5.13
## # … with 821 more rows, 3 more variables: ActDNTotalDN <dbl>, BuyDNactDN <dbl>,
## # BuyOrNot <chr>, and abbreviated variable names ¹buyBrandN, ²actDateN,
## # ³buyDateN, ⁴BuyBBrand
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
write.table(Tmall_train[,-(1:9)],
file="天猫_Train_1.txt",
sep=",",quote=FALSE,append=FALSE,
row.names=FALSE,col.names=TRUE)
write.table(Tmall_test[,-(1:9)],
file="天猫_Test_1.txt",
sep=",",quote=FALSE,
append=FALSE,row.names=FALSE,
col.names=TRUE)
# 去除有缺失值的样本,只取完整观测数据
Tmall_train1<-Tmall_train[complete.cases(Tmall_train),-(1:9)]
Tmall_test1<-Tmall_test[complete.cases(Tmall_test),-(1:9)]
dim(Tmall_train1)
## [1] 855 5
dim(Tmall_test1)
## [1] 820 5
定义剔除第i个变量后的knn模型的预测误差公式 \[FI_{(i)}=e_{i}+\frac{1}{p}\]
第i个输出变量的重要性定义为 \[\omega_{i}=\frac{FI_{(i)}}{\sum_{j}{FI_{(j)}}}\]
Rstudio绘图不能显示中文解决方案
library(showtext)
showtext_auto()
利用旁置法选取活动有效度、成交有效度、活跃度以及消 费活跃度为自变量,是否有成交作为目标变量建立knn
head(Tmall_train1)
## BuyHit BuyBBrand ActDNTotalDN BuyDNactDN BuyOrNot
## 1 1.57 2.83 51.09 6.38 1
## 2 2.17 3.20 60.87 8.93 1
## 3 6.36 11.63 33.70 16.13 1
## 4 6.25 11.29 40.22 16.22 1
## 5 1.45 1.89 56.52 3.85 1
## 6 1.28 2.13 54.35 4.00 1
head(Tmall_test1)
## BuyHit BuyBBrand ActDNTotalDN BuyDNactDN BuyOrNot
## 1 0.00 0.00 54.84 0.00 0
## 2 0.83 2.36 83.87 7.69 1
## 3 3.89 4.76 90.32 14.29 1
## 4 4.00 7.69 32.26 10.00 1
## 5 6.98 8.70 41.94 15.38 1
## 6 0.00 0.00 19.35 0.00 0
library(class)
errRatio<-vector()
for(i in 1:30){
fit<-knn(train=Tmall_train1,
test=Tmall_test1,
cl=Tmall_train1[,5],k=i)
CT<-table(Tmall_test1[,5],fit)
errRatio<-c(errRatio,(1-sum(diag(CT))/sum(CT))*100)
}
#绘制分类错误率随k变化图
plot(errRatio,type="l",xlab="参数K",
ylab="错判率(%)",
main="参数K与错判率",cex.main=0.8)
points(1:30,errRatio,pch=12,col=2)
points(c(which.min(errRatio),7),errRatio[c(which.min(errRatio),7)],pch=3,col=1)
#计算剔除第i个变量后knn算法的分类错误率
errDelteX<-errRatio[7]
for(i in 1:4){
fit<-knn(train=Tmall_train1[,-i],
test=Tmall_test1[,-i],
cl=Tmall_train1[,5],k=7)
CT<-table(Tmall_test1[,5],fit)
errDelteX<-c(errDelteX,
(1-sum(diag(CT))/sum(CT))*100)
}
plot(errDelteX,type="l",xlab="剔除变量",
ylab="剔除错判率(%)",
main="剔除变量与错判率(K=7)",cex.main=0.8)
xTitle=c("1:全体变量","2:活动有效度",
"3:成交有效度","4:活跃度",
"5:消费活跃度")
legend("topleft",legend=xTitle,title="变量说明",
lty=1,cex=0.6)
#计算各个变量重要性FI
FI<-errDelteX[-1]+1/4
FI
## [1] 3.542683 5.250000 0.250000 7.323171
#计算各个输入变量权重wi
wi<-FI/sum(FI)
wi
## [1] 0.21646796 0.32078987 0.01527571 0.44746647
#绘制各个输入变量重要性权重饼图
GLabs<-paste(c("活动有效度","成交有效度",
"活跃度","消费活跃度"),
round(wi,2),sep=":")
wi %>% pie(labels=GLabs,clockwise=TRUE,main="输入变量权重",cex.main=0.8)
#绘制成交有效度与消费活跃度的二维散点图
ColPch=as.integer(as.vector(Tmall_test1[,5]))+1
Tmall_test1[,c(4,1)] %>% plot(pch=ColPch,cex=0.7,
xlim=c(0,50),ylim=c(0,50),col=ColPch,
xlab="消费活跃度",ylab="成交有效度",
main="二维特征空间中的观测",cex.main=0.8)
weightdist<-function(vec1,vec2){
if(length(vec1)!=length(vec2))
{
disp('长度不一样,不符合要求!');
} else {
ans=wi*(vec1-vec2)^2
weightdist<-ans %>% sum() %>% sqrt()
}
return(weightdist)
}
# 五折交叉检验
data<-Tmall_train1
len = length(data[,1])
size = round(len/5)
set.seed(1234)
Id = sample(1:len) #随机打乱数据
testGroup = list()
trainGroup = list()
for (i in 1:5) {
if(i==5){
addTestGroup = data[Id[(1+(i-1)*size):len],]
addTrainGroup = data[-Id[(1+(i-1)*size):len],]
}else{
addTestGroup = data[Id[(1+(i-1)*size):(i*size)],]
addTrainGroup = data[-Id[(1+(i-1)*size):(i*size)],]
}
testGroup[[i]] = addTestGroup
trainGroup[[i]] = addTrainGroup
}
names(testGroup) = c("testGroup1", "testGroup2",
"testGroup3", "testGroup4",
"testGroup5")
names(trainGroup) = c("trainGroup1", "trainGroup2",
"trainGroup3", "trainGroup4",
"trainGroup5")
err = vector()
for (i in 1:30) {
errRatioVector<-vector()
for (j in 1:5) {
testGroupData = testGroup[[j]]
trainGroupData = trainGroup[[j]]
fit<-knn(train=trainGroupData,
test=testGroupData,
cl=trainGroupData[,5],k=i,prob=FALSE)
CT<-table(testGroupData[,5],fit)
errRatio<-(1-sum(diag(CT))/sum(CT))*100
errRatioVector<-c(errRatioVector,errRatio)
}
index = mean(errRatioVector)
err = c(err, index)
}
err
## [1] 0.5847953 1.0526316 0.7017544 0.5847953 0.7017544 0.7017544 0.8187135
## [8] 0.9356725 1.0526316 0.9356725 0.8187135 0.8187135 0.8187135 0.8187135
## [15] 0.8187135 0.8187135 0.9356725 0.9356725 0.9356725 0.9356725 0.9356725
## [22] 0.8187135 0.9356725 0.8187135 0.9356725 0.9356725 0.9356725 0.9356725
## [29] 0.9356725 0.9356725
#绘制分类错误率随k变化图
plot(err,type="l",xlab="参数K",
ylab="错判率(%)",
main="参数K与错判率",cex.main=0.8)
points(1:30,err,pch=15,col=2)
# 选择k=5
fit<-knn(train=Tmall_train1,
test=Tmall_test1,
cl=Tmall_train1[,5],k=5)
CT<-table(Tmall_test1[,5],fit)
errRatio<-(1-sum(diag(CT))/sum(CT))*100
errRatio
## [1] 3.292683
# 选择k=7
fit1<-knn(train=Tmall_train1,
test=Tmall_test1,
cl=Tmall_train1[,5],k=7)
CT1<-table(Tmall_test1[,5],fit1)
errRatio1<-(1-sum(diag(CT1))/sum(CT1))*100
errRatio1
## [1] 3.292683
e1071包中gknn函数参数设置:
gknn(formula, data, k=1, na.action = na.pass, scale = TRUE, method=c(“Euclidean”,“Manhattan”,“Jaccard”))
predict(object, newdata, type = c(“class”, “votes”, “prob”), na.action = na.pass)
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.2
Tmall_train1$BuyOrNot<-as.factor(Tmall_train1$BuyOrNot)
Tmall_test1$BuyOrNot<-as.factor(Tmall_test1$BuyOrNot)
model <-gknn(BuyOrNot ~., data=Tmall_train1, k = 5,
method ="Euclidean" )
fit<-predict(model, Tmall_test1, type = "class")
CT<-table(Tmall_test1[,5],fit)
errRatio<-(1-sum(diag(CT))/sum(CT))*100
errRatio
## [1] 2.804878
#加权KNN
model<-gknn(BuyOrNot ~., data=Tmall_train1, k = 5,
method =weightdist)
fit<-predict(model,Tmall_test1,type='class')
CT<-table(Tmall_test1[,5],fit)
errRatio<-(1-sum(diag(CT))/sum(CT))*100
errRatio
## [1] 0.3658537
kknn包中kknn()函数的语法和参数如下:
kknn(formula = formula(train),train, test, na.action = na.omit(), k= 7, distance = 2, kernel = “optimal”, ykernel = NULL, scale=TRUE, contrasts= c(‘unordered’ = “contr.dummy”, ordered =“contr.ordinal”))
formula 公式:分类变量~特征变量;
train 指定训练样本集;
test 指定测试样本集;
na.action 缺失值处理,默认为去掉缺失值;
k近邻数值选择,默认为7;
distance 闵可夫斯基距离参数,p=2时为欧氏距离
library(kknn)
getwd()
## [1] "/Users/huanghuilin/Desktop/360安全云盘同步版/2022/2022/2022/2022年数据挖掘课程/上机/第5次上机 KNN和加权KNN算法实现/上机演示/knn算法的R语言实现上机演示"
head(Tmall_train1)
## BuyHit BuyBBrand ActDNTotalDN BuyDNactDN BuyOrNot
## 1 1.57 2.83 51.09 6.38 1
## 2 2.17 3.20 60.87 8.93 1
## 3 6.36 11.63 33.70 16.13 1
## 4 6.25 11.29 40.22 16.22 1
## 5 1.45 1.89 56.52 3.85 1
## 6 1.28 2.13 54.35 4.00 1
head(Tmall_test1)
## BuyHit BuyBBrand ActDNTotalDN BuyDNactDN BuyOrNot
## 1 0.00 0.00 54.84 0.00 0
## 2 0.83 2.36 83.87 7.69 1
## 3 3.89 4.76 90.32 14.29 1
## 4 4.00 7.69 32.26 10.00 1
## 5 6.98 8.70 41.94 15.38 1
## 6 0.00 0.00 19.35 0.00 0
Tmall_train1$BuyOrNot<-as.factor(Tmall_train1$BuyOrNot)
fit<-train.kknn(formula=BuyOrNot~.,
data=Tmall_train1, kmax=11,distance=2,
kernel=c("rectangular","triangular","gaussian"),na.action=na.omit())
plot(fit$MISCLASS[,1]*100,type="l",
main="不同核函数和近邻个数K下的错判率曲线图",
cex.main=0.8,xlab="近邻个数K",ylab="错判率(%)")
lines(fit$MISCLASS[,2]*100,lty=2,col=1)
lines(fit$MISCLASS[,3]*100,lty=3,col=2)
#给出图例
legend("topleft",
legend=c("rectangular","triangular","gaussian"),
lty=c(1,2,3),col=c(1,1,2),cex=0.7)
Tmall_test1$BuyOrNot<-as.factor(Tmall_test1$BuyOrNot)
fit<-kknn(formula=BuyOrNot~.,train=Tmall_train1,
test=Tmall_test1,k=7,distance=2,
kernel="gaussian",na.action=na.omit())
CT<-table(Tmall_test1[,5],fit$fitted.values)
errRatio<-(1-sum(diag(CT))/sum(CT))*100
errRatio
## [1] 2.804878
CT
##
## 0 1
## 0 274 23
## 1 0 523
library(class)
fit<-knn(train=Tmall_train1,test=Tmall_test1,
cl=Tmall_train1$BuyOrNot,k=7)
CT<-table(Tmall_test1[,5],fit)
errRatio<-c(errRatio,(1-sum(diag(CT))/sum(CT))*100)
errRatio
## [1] 2.804878 3.292683
errGraph<-barplot(errRatio,
main="加权K近邻法与K近邻法的错判率对比图(K=7)",
cex.main=0.8,xlab="分类方法",
ylab="错判率(%)",axes=FALSE)
axis(side=1,at=c(0,errGraph,3),
labels=c("","加权K-近邻法","K-近邻法",""),
tcl=0.25)
axis(side=2,tcl=0.25)