第六章样本相似性可视化课堂练习

Author

221572216程子桃

1 案例数据

1.1 iris：鸢尾花数据集

R语言内置的 iris 数据集（鸢尾花数据集）是统计学和机器学习领域最经典的案例数据集之一，由英国统计学家 Ronald Fisher 在1936年首次提出。它包含了3种鸢尾花的测量数据，每个类别有50个样本，共150行数据。

变量名	数据类型	单位	说明
Sepal.Length	numeric	cm	花萼（萼片）长度，即花朵最外层绿色叶状结构的长度
Sepal.Width	numeric	cm	花萼（萼片）宽度
Petal.Length	numeric	cm	花瓣长度，即花朵内部彩色叶状结构的长度
Petal.Width	numeric	cm	花瓣宽度
Species	factor	-	鸢尾花种类（分类标签），包含3个水平： • `setosa`山鸢尾 • `versicolor`变色鸢尾 • `virginica`维吉尼亚鸢尾

setosa 类的花通常较小，花瓣短而宽，与其他两类差异明显；versicolor 和 virginica 在部分特征上有重叠，分类难度更高。

2 平行坐标图和雷达图

2.1 平行坐标图

ggplot(group=id)+geom_line+geom_point 绘图
Species 映射为颜色

2.1.1 图形观察和代码编写的心得体会

平行坐标图通过多变量平行轴线展示数据分布，用颜色区分Specie，能直观比较不同类别在各指标上的差异

2.2 雷达图

采用ggiraphExtra::ggRadar 绘图

2.2.1 图形观察和代码编写的心得体会

雷达图通过极坐标系展示多变量数据，能清晰对比不同Species在各指标上的轮廓差异，但需注意变量顺序和避免过度拥挤。

3 星图和脸谱图

3.1 星图

采用stars 函数，对标准化数据绘制圆弧星图
需要先将四个数值变量转化为矩阵，并将Species 作为矩阵的行名；
设置图例在合适位置，能完整显示；将图形分为10行。

function (x, full = TRUE, scale = TRUE, radius = TRUE, labels = dimnames(x)[[1L]], 
    locations = NULL, nrow = NULL, ncol = NULL, len = 1, key.loc = NULL, 
    key.labels = dimnames(x)[[2L]], key.xpd = TRUE, xlim = NULL, 
    ylim = NULL, flip.labels = NULL, draw.segments = FALSE, col.segments = 1L:n.seg, 
    col.stars = NA, col.lines = NA, axes = FALSE, frame.plot = axes, 
    main = NULL, sub = NULL, xlab = "", ylab = "", cex = 0.8, 
    lwd = 0.25, lty = par("lty"), xpd = FALSE, mar = pmin(par("mar"), 
        1.1 + c(2 * axes + (xlab != ""), 2 * axes + (ylab != 
            ""), 1, 0)), add = FALSE, plot = TRUE, ...) 
{
    if (is.data.frame(x)) 
        x <- data.matrix(x)
    else if (!is.matrix(x)) 
        stop("'x' must be a matrix or a data frame")
    if (!is.numeric(x)) 
        stop("data in 'x' must be numeric")
    n.loc <- nrow(x)
    n.seg <- ncol(x)
    if (is.null(locations)) {
        if (is.null(nrow)) 
            nrow <- ceiling(if (!is.numeric(ncol)) sqrt(n.loc) else n.loc/ncol)
        if (is.null(ncol)) 
            ncol <- ceiling(n.loc/nrow)
        if (nrow * ncol < n.loc) 
            stop("'nrow * ncol' is less than the number of observations")
        ff <- if (!is.null(labels)) 
            2.3
        else 2.1
        locations <- expand.grid(ff * 1L:ncol, ff * nrow:1)[1L:n.loc, 
            ]
        if (!is.null(labels) && (missing(flip.labels) || !is.logical(flip.labels))) 
            flip.labels <- ncol * mean(nchar(labels, type = "c")) > 
                30
    }
    else {
        if (is.numeric(locations) && length(locations) == 2) {
            locations <- cbind(rep.int(locations[1L], n.loc), 
                rep.int(locations[2L], n.loc))
            if (!missing(labels) && n.loc > 1) 
                warning("labels do not make sense for a single location")
            else labels <- NULL
        }
        else {
            if (is.data.frame(locations)) 
                locations <- data.matrix(locations)
            if (!is.matrix(locations) || ncol(locations) != 2) 
                stop("'locations' must be a 2-column matrix.")
            if (n.loc != nrow(locations)) 
                stop("number of rows of 'locations' and 'x' must be equal.")
        }
        if (missing(flip.labels) || !is.logical(flip.labels)) 
            flip.labels <- FALSE
    }
    xloc <- locations[, 1]
    yloc <- locations[, 2]
    angles <- if (full) 
        seq.int(0, 2 * pi, length.out = n.seg + 1)[-(n.seg + 
            1)]
    else if (draw.segments) 
        seq.int(0, pi, length.out = n.seg + 1)[-(n.seg + 1)]
    else seq.int(0, pi, length.out = n.seg)
    if (length(angles) != n.seg) 
        stop("length of 'angles' must equal 'ncol(x)'")
    if (scale) {
        x <- apply(x, 2L, function(x) (x - min(x, na.rm = TRUE))/diff(range(x, 
            na.rm = TRUE)))
    }
    x[is.na(x)] <- 0
    mx <- max(x <- x * len)
    if (is.null(xlim)) 
        xlim <- range(xloc) + c(-mx, mx)
    if (is.null(ylim)) 
        ylim <- range(yloc) + c(-mx, mx)
    deg <- pi/180
    op <- par(mar = mar, xpd = xpd)
    on.exit(par(op))
    dev.hold()
    on.exit(dev.flush(), add = TRUE)
    if (plot && !add) 
        plot(0, type = "n", ..., xlim = xlim, ylim = ylim, main = main, 
            sub = sub, xlab = xlab, ylab = ylab, asp = 1, axes = axes)
    if (!plot) 
        return(locations)
    s.x <- xloc + x * rep.int(cos(angles), rep.int(n.loc, n.seg))
    s.y <- yloc + x * rep.int(sin(angles), rep.int(n.loc, n.seg))
    if (draw.segments) {
        aangl <- c(angles, if (full) 2 * pi else pi)
        for (i in 1L:n.loc) {
            px <- py <- numeric()
            for (j in 1L:n.seg) {
                k <- seq.int(from = aangl[j], to = aangl[j + 
                  1], by = 1 * deg)
                px <- c(px, xloc[i], s.x[i, j], x[i, j] * cos(k) + 
                  xloc[i], NA)
                py <- c(py, yloc[i], s.y[i, j], x[i, j] * sin(k) + 
                  yloc[i], NA)
            }
            polygon(px, py, col = col.segments, lwd = lwd, lty = lty)
        }
    }
    else {
        for (i in 1L:n.loc) {
            polygon(s.x[i, ], s.y[i, ], lwd = lwd, lty = lty, 
                col = col.stars[i])
            polygon(s.x[i, ], s.y[i, ], lwd = lwd, lty = lty, 
                border = col.lines[i], col = col.stars[i])
            if (radius) 
                segments(rep.int(xloc[i], n.seg), rep.int(yloc[i], 
                  n.seg), s.x[i, ], s.y[i, ], lwd = lwd, lty = lty)
        }
    }
    if (!is.null(labels)) {
        y.off <- mx * (if (full) 
            1
        else 0.1)
        if (flip.labels) 
            y.off <- y.off + cex * par("cxy")[2L] * ((1L:n.loc)%%2 - 
                if (full) 
                  0.4
                else 0)
        text(xloc, yloc - y.off, labels, cex = cex, adj = c(0.5, 
            1))
    }
    if (!is.null(key.loc)) {
        par(xpd = key.xpd)
        key.x <- len * cos(angles) + key.loc[1L]
        key.y <- len * sin(angles) + key.loc[2L]
        if (draw.segments) {
            px <- py <- numeric()
            for (j in 1L:n.seg) {
                k <- seq.int(from = aangl[j], to = aangl[j + 
                  1], by = 1 * deg)
                px <- c(px, key.loc[1L], key.x[j], len * cos(k) + 
                  key.loc[1L], NA)
                py <- c(py, key.loc[2L], key.y[j], len * sin(k) + 
                  key.loc[2L], NA)
            }
            polygon(px, py, col = col.segments, lwd = lwd, lty = lty)
        }
        else {
            polygon(key.x, key.y, lwd = lwd, lty = lty)
            if (radius) 
                segments(rep.int(key.loc[1L], n.seg), rep.int(key.loc[2L], 
                  n.seg), key.x, key.y, lwd = lwd, lty = lty)
        }
        lab.angl <- angles + if (draw.segments) 
            (angles[2L] - angles[1L])/2
        else 0
        label.x <- 1.1 * len * cos(lab.angl) + key.loc[1L]
        label.y <- 1.1 * len * sin(lab.angl) + key.loc[2L]
        for (k in 1L:n.seg) {
            text.adj <- c(if (lab.angl[k] < 90 * deg || lab.angl[k] > 
                270 * deg) 0 else if (lab.angl[k] > 90 * deg && 
                lab.angl[k] < 270 * deg) 1 else 0.5, if (lab.angl[k] <= 
                90 * deg) (1 - lab.angl[k]/(90 * deg))/2 else if (lab.angl[k] <= 
                270 * deg) (lab.angl[k] - 90 * deg)/(180 * deg) else 1 - 
                (lab.angl[k] - 270 * deg)/(180 * deg))
            text(label.x[k], label.y[k], labels = key.labels[k], 
                cex = cex, adj = text.adj)
        }
    }
    if (frame.plot) 
        box(...)
    invisible(locations)
}
<bytecode: 0x0000021119d0f990>
<environment: namespace:graphics>

3.1.1 图形观察和代码编写的心得体会

圆弧星图通过极坐标分段展示多变量数据，适合直观比较Species不同类别在各指标上的相对大小，但需注意类别过多时易重叠。

3.2 脸谱图

采用aplpack::faces 函数，作Species 三个类别的脸谱图
需要先将四个数值变量转化为矩阵，并将Species 作为矩阵的行名；
设置图例在合适位置，能完整显示；将图形分为12列。

effect of variables:
 modified item       Var           
 "height of face   " "Sepal.Length"
 "width of face    " "Sepal.Width" 
 "structure of face" "Petal.Length"
 "height of mouth  " "Petal.Width" 
 "width of mouth   " "Sepal.Length"
 "smiling          " "Sepal.Width" 
 "height of eyes   " "Petal.Length"
 "width of eyes    " "Petal.Width" 
 "height of hair   " "Sepal.Length"
 "width of hair   "  "Sepal.Width" 
 "style of hair   "  "Petal.Length"
 "height of nose  "  "Petal.Width" 
 "width of nose   "  "Sepal.Length"
 "width of ear    "  "Sepal.Width" 
 "height of ear   "  "Petal.Length"