由于本问题是要以80次抽样的重复情况来推测抽样框大小,所以这80次抽样其实只能看作是一个样本,记为(【1】:50,【2】:6,【3】:3,【4】:1,【5】:1),此结果是本问题的基线抽样结果,无对应数字组数,为方便后续探索性数据分析,首先需要建立并扩充样本集。
一次抽样:由于每个数字出现的机会相同,所以可视为简单随机抽样。以80个数字为一组,通过函数生成随机数,并转换为整数。此为一次抽样。
循环递增抽样:从整数61开始,每个整数将上述抽样过程执行200次,到500为止。此时样本集中有88000个抽样结果。
# N <- 61
# for (N in 60:500) {
# #循环控制变量
# vl <- 1
# N <- N + 1
# for (vl in 1:200) {
# t <- table(table(floor(runif(80)*N)))
# vdf <- data.frame(t)
# names(vdf) <- c("Rep", "Freq")
# id <- rep(N,length(vdf$Freq))
# new_vdf <- cbind(id, vdf)
# if (vl == 1 && N == 61) ndf = new_vdf else ndf <- rbind(ndf, new_vdf)
# vl <- vl + 1
# }
# }
ndf <- read.csv("H:/workspace/R/genTest/genTestData4.csv")
library(Hmisc)
library(hexbin)
par(mfrow=c(1,1))
smoothScatter(ndf[which(ndf$Rep == 1),c("id", "Freq")], pch = 1, xlab = "组数", ylab = "频次", main = "重复1次的组数频次分布")
abline(h=50, lwd = 2, col = "#333333")
#添加次要刻度线
minor.tick(nx = 10, ny = 5, tick.ratio = 0.5)
这是不同组数下,不重复的数字个数的重抽样结果。我们可由图得出在不同组数下抽样数字不重复的个数的分布状况。 颜色由浅入深代表数据点的密集程度 从x轴引辅助竖线我们可以推测出指定数字组数抽样不重复个数的上下限。 从y轴引辅助横线(如图y=50),观察贯穿情况,我们可以推测出有哪些组数可能产生不重复次数为50这个结果。
另外四个散点图含义同上
par(mfrow=c(2,2))
smoothScatter(ndf[which(ndf$Rep == 2),c("id", "Freq")], pch = 1, xlab = "组数", ylab = "频次", main = "重复2次的组数频次分布")
abline(h=6, lwd = 2, col = "#333333")
#添加次要刻度线
minor.tick(nx = 10, ny = 5, tick.ratio = 0.5)
smoothScatter(ndf[which(ndf$Rep == 3),c("id", "Freq")], pch = 1, xlab = "组数", ylab = "频次", main = "重复3次的组数频次分布")
abline(h=3, lwd = 2, col = "#333333")
#添加次要刻度线
minor.tick(nx = 10, ny = 5, tick.ratio = 0.5)
smoothScatter(ndf[which(ndf$Rep == 4),c("id", "Freq")], pch = 1, xlab = "组数", ylab = "频次", main = "重复4次的组数频次分布")
abline(h=1, lwd = 2, col = "#333333")
#添加次要刻度线
minor.tick(nx = 10, ny = 5, tick.ratio = 0.5)
smoothScatter(ndf[which(ndf$Rep == 5),c("id", "Freq")], pch = 1, xlab = "组数", ylab = "频次", main = "重复5次的组数频次分布")
abline(h=1, lwd = 2, col = "#333333")
#添加次要刻度线
minor.tick(nx = 10, ny = 5, tick.ratio = 0.5)
par(mfrow = c(1,1))
#对重复次数为1进行分析
ndf1_new = ndf[which(ndf$Rep == 1 & ndf$Freq == 50),c("id")]
h1 <- hist(ndf1_new,
breaks = 50,
col = "#C1CDC1",
xlab = "组数",
ylab = "重复出现次数",
main = "重复1次的数字为50个的组数频次分布")
#添加核密度曲线
rug(jitter(ndf1_new))
#添加次要刻度线
minor.tick(nx = 10, ny = 5, tick.ratio = 0.5)
#添加正态拟合线
xfit <- seq(min(ndf1_new), max(ndf1_new), length = 40)
yfit <- dnorm(xfit, mean = mean(ndf1_new), sd = sd(ndf1_new))
yfit <- yfit * diff(h1$mids[1:2]) * length(ndf1_new)
lines(xfit, yfit, col = "#20B2AA", lwd = 2)
#添加均值和中位数线
abline(v = mean(ndf1_new), lwd = 3, col = "#333333")
abline(v = median(ndf1_new), lwd = 2, col = "#333333")
#添加显著水平为0.05的置信区间辅助线
abline(v = quantile(ndf1_new, probs = c(0.025,0.975)), col = "#20B2AA", lwd = "1", lty = 1)
#添加均值和二倍标准差的标注文字
text(400,20,paste("θ=", as.character(round(mean(ndf1_new),0))))
text(400,15,paste("2σ=", as.character(round(2*sd(ndf1_new),0))))
上面的每个散点图我们都做了一条辅助横线,将辅助线与散点密度图的相交处单独拆出来,就是左图的直方图。它的含义是如果80次抽样中有50次不重复,那么对应的组数是如何分布。
蓝色曲线是正态分布拟合线,两条黑色竖线是均值线和中位数线,两条蓝色竖线是2倍标准差的标志线,由此图我们可以推断,当不重复次数为50时,我们有95%的把握推断,数字组数是在【189-96,189+96】这个区间。
另外四个分布图含义相同。
par(mfrow = c(2,2))
#对重复次数为2进行分析
ndf2_new = ndf[which(ndf$Rep == 2 & ndf$Freq == 6),c("id")]
h2 <- hist(ndf2_new,
breaks = 50,
col = "#C1CDC1",
xlab = "组数",
ylab = "重复出现次数",
main = "重复2次的数字为6个的组数频次分布")
#添加核密度曲线
rug(jitter(ndf2_new))
#添加次要刻度线
minor.tick(nx = 10, ny = 5, tick.ratio = 0.5)
#添加正态拟合线
xfit <- seq(min(ndf2_new), max(ndf2_new), length = 40)
yfit <- dnorm(xfit, mean = mean(ndf2_new), sd = sd(ndf2_new))
yfit <- yfit * diff(h2$mids[1:2]) * length(ndf2_new)
lines(xfit, yfit, col = "#20B2AA", lwd = 2)
#添加均值和中位数线
abline(v = mean(ndf2_new), lwd = 3, col = "#333333")
abline(v = median(ndf2_new), lwd = 2, col = "#333333")
#添加显著水平为0.05的置信区间辅助线
abline(v = quantile(ndf2_new, probs = c(0.025,0.975)), col = "#20B2AA", lwd = "1", lty = 1)
#添加均值和二倍标准差的标注文字
text(150,90,paste("θ=", as.character(round(mean(ndf2_new),0))))
text(150,70,paste("2σ=", as.character(round(2*sd(ndf2_new),0))))
#对重复次数为3进行分析
ndf3_new = ndf[which(ndf$Rep == 3 & ndf$Freq == 3),c("id")]
h3 <- hist(ndf3_new,
breaks = 100,
col = "#C1CDC1",
xlab = "组数",
ylab = "重复出现次数",
main = "重复3次的数字为3个的组数频次分布")
#添加核密度曲线
rug(jitter(ndf3_new))
#添加次要刻度线
minor.tick(nx = 10, ny = 5, tick.ratio = 0.5)
#添加正态拟合线
xfit <- seq(min(ndf3_new), max(ndf3_new), length = 100)
yfit <- dnorm(xfit, mean = mean(ndf3_new), sd = sd(ndf3_new))
yfit <- yfit * diff(h3$mids[1:2]) * length(ndf3_new)
lines(xfit, yfit, col = "#20B2AA", lwd = 2)
#添加均值和中位数线
abline(v = mean(ndf3_new), lwd = 3, col = "#333333")
abline(v = median(ndf3_new), lwd = 2, col = "#333333")
#添加显著水平为0.05的置信区间辅助线
abline(v = quantile(ndf3_new, probs = c(0.025,0.975)), col = "#20B2AA", lwd = "1", lty = 1)
text(400,40,paste("θ=", as.character(round(mean(ndf3_new),0))))
text(400,30,paste("2σ=", as.character(round(2*sd(ndf3_new),0))))
#对重复次数为4进行分析
ndf4_new = ndf[which(ndf$Rep == 4 & ndf$Freq == 1),c("id")]
h4 <- hist(ndf4_new,
breaks = 100,
col = "#C1CDC1",
xlab = "组数",
ylab = "重复出现次数",
main = "重复4次的数字为1个的组数频次分布")
#添加核密度曲线
rug(jitter(ndf4_new))
#添加次要刻度线
minor.tick(nx = 10, ny = 5, tick.ratio = 0.5)
#添加正态拟合线
xfit <- seq(min(ndf4_new), max(ndf4_new), length = 100)
yfit <- dnorm(xfit, mean = mean(ndf3_new), sd = sd(ndf3_new))
yfit <- yfit * diff(h4$mids[1:2]) * length(ndf4_new)
lines(xfit, yfit, col = "#20B2AA", lwd = 2)
#添加均值和中位数线
abline(v = mean(ndf4_new), lwd = 3, col = "#333333")
abline(v = median(ndf4_new), lwd = 2, col = "#333333")
#添加显著水平为0.05的置信区间辅助线
abline(v = quantile(ndf4_new, probs = c(0.025,0.975)), col = "#20B2AA", lwd = "1", lty = 1)
text(400,75,paste("θ=", as.character(round(mean(ndf4_new),0))))
text(400,60,paste("2σ=", as.character(round(2*sd(ndf4_new),0))))
#对重复次数为5进行分析
ndf5_new = ndf[which(ndf$Rep == 5 & ndf$Freq == 1),c("id")]
h5 <- hist(ndf5_new,
breaks = 100,
col = "#C1CDC1",
xlab = "组数",
ylab = "重复出现次数",
main = "重复4次的数字为1个时的组数频次分布")
#添加核密度曲线
rug(jitter(ndf5_new))
#添加次要刻度线
minor.tick(nx = 10, ny = 5, tick.ratio = 0.5)
#添加正态拟合线
xfit <- seq(min(ndf5_new), max(ndf5_new), length = 100)
yfit <- dnorm(xfit, mean = mean(ndf3_new), sd = sd(ndf3_new))
yfit <- yfit * diff(h5$mids[1:2]) * length(ndf5_new)
lines(xfit, yfit, col = "#20B2AA", lwd = 2)
#添加均值和中位数线
abline(v = mean(ndf5_new), lwd = 3, col = "#333333")
abline(v = median(ndf5_new), lwd = 2, col = "#333333")
#添加显著水平为0.05的置信区间辅助线
abline(v = quantile(ndf5_new, probs = c(0.025,0.975)), col = "#20B2AA", lwd = "1", lty = 1)
text(400,60,paste("θ=", as.character(round(mean(ndf5_new),0))))
text(400,50,paste("2σ=", as.character(round(2*sd(ndf5_new),0))))
ndf_new = ndf[which(ndf$Rep == 1 & ndf$Freq == 50),c("id", "Rep")]
ndf_new <- rbind(ndf_new, ndf[which(ndf$Rep == 2 & ndf$Freq == 6),c("id", "Rep")])
ndf_new <- rbind(ndf_new, ndf[which(ndf$Rep == 3 & ndf$Freq == 3),c("id", "Rep")])
ndf_new <- rbind(ndf_new, ndf[which(ndf$Rep == 4 & ndf$Freq == 1),c("id", "Rep")])
ndf_new <- rbind(ndf_new, ndf[which(ndf$Rep == 5 & ndf$Freq == 1),c("id", "Rep")])
library(ggplot2)
qplot(id, data = ndf_new, geom = c("density"),
main = "概率密度图",
facets = Rep~.,
fill = Rep)
我们观察图一/三/四/五发现概率密度图的分布都偏左,对于右侧一个数字比如400,上述五个事件同时发生的概率微乎其微。
n <- 61
for (n in 61:500) {
if (n == 61) {
tdf = data.frame(n,
length(ndf1_new[which(ndf1_new == n)])/length(ndf1_new),
length(ndf2_new[which(ndf2_new == n)])/length(ndf2_new),
length(ndf3_new[which(ndf3_new == n)])/length(ndf3_new),
length(ndf4_new[which(ndf4_new == n)])/length(ndf4_new),
length(ndf5_new[which(ndf5_new == n)])/length(ndf5_new)
)
names(tdf) <- c("id","p1","p2","p3","p4","p5")
} else {
temp <- data.frame(n,
length(ndf1_new[which(ndf1_new == n)])/length(ndf1_new),
length(ndf2_new[which(ndf2_new == n)])/length(ndf2_new),
length(ndf3_new[which(ndf3_new == n)])/length(ndf3_new),
length(ndf4_new[which(ndf4_new == n)])/length(ndf4_new),
length(ndf5_new[which(ndf5_new == n)])/length(ndf5_new))
names(temp) <- c("id","p1","p2","p3","p4","p5")
tdf <- rbind(tdf,temp)
}
}
#计算绝对概率
psum <- tdf$p1 * tdf$p2 * tdf$p3 * tdf$p4 * tdf$p5
#计算条件概率
pcon <- tdf$p1 * tdf$p2 * tdf$p3 * tdf$p4 * tdf$p5 / sum(tdf$p1 * tdf$p2 * tdf$p3 * tdf$p4 * tdf$p5)
tdf <- cbind(tdf, psum, pcon)
tdf <- tdf[order(-pcon),]
head(tdf)
## id p1 p2 p3 p4 p5
## 95 155 0.014640356 0.0003496096 0.006152043 0.004102051 0.002798880
## 62 122 0.004455761 0.0002330731 0.007616816 0.005902951 0.006797281
## 82 142 0.010184596 0.0003496096 0.005566134 0.005702851 0.002798880
## 89 149 0.010184596 0.0004661461 0.007470338 0.004302151 0.001999200
## 102 162 0.012730745 0.0004661461 0.006005566 0.004102051 0.001999200
## 69 129 0.007001910 0.0003496096 0.006737952 0.005202601 0.003198721
## psum pcon
## 95 3.615262e-13 0.04866576
## 62 3.173890e-13 0.04272435
## 82 3.163423e-13 0.04258346
## 89 3.050339e-13 0.04106122
## 102 2.922720e-13 0.03934331
## 69 2.744888e-13 0.03694948
par(mfrow = c(1,1))
#推测结果概率气泡图
symbols(tdf$id, tdf$pcon,
circles = tdf$pcon,
inches = 0.35,
fg = "white",
bg = "lightblue",
xlab = "数据组数",
ylab = "问题描述事件发生时的反推条件概率",
main = "推测结果概率气泡图")
#添加条件概率大于2%的点对应的数字组数
text(tdf[which(tdf$pcon > 0.02),]$id,
tdf[which(tdf$pcon > 0.02),]$pcon,
tdf[which(tdf$pcon > 0.02),]$id,
cex = 0.6,
col = "#222222")