第三章 数据描述性分析 习题

3.1 女生血清蛋白分析

读入数据

w <- as.vector(unlist(read.table("exec0301.data")))
w
##   [1]  4.3 79.5 75.0 73.5 75.8 70.4 73.5 67.2 75.8 73.5 78.8 75.6 73.5 75.0
##  [15] 75.8 72.0 79.5 76.5 73.5 79.5 68.8 75.0 78.8 72.0 68.8 76.5 73.5 72.7
##  [29] 75.0 70.4 78.0 78.8 74.3 64.3 76.5 74.3 74.7 70.4 73.5 76.5 70.4 72.0
##  [43] 75.8 75.8 70.4 76.5 65.0 77.2 73.5 72.7 80.5 72.0 65.0 80.3 71.2 77.6
##  [57] 76.5 68.8 73.5 77.2 80.5 72.0 74.3 69.7 81.2 67.3 81.6 67.3 72.7 84.3
##  [71] 69.7 74.3 71.2 74.3 75.0 72.0 75.4 67.3 81.6 75.0 71.2 71.2 69.7 73.5
##  [85] 70.4 75.0 72.7 67.3 70.3 76.5 73.5 72.0 68.0 73.5 68.0 74.3 72.7 72.7
##  [99] 74.3 70.4

计算均值(mean)、方差标准差(std_dev)、极差®、标准误(sm)、变异系数(cv)、偏度(Skewness)、峰度(Kurtosis)。

source("../R-Book-Demo/ch3/data_outline.R", echo = TRUE, max.deparse.length = 3000)
## 
## > data_outline <- function(x) {
## +     n <- length(x)
## +     m <- mean(x)
## +     v <- var(x)
## +     s <- sd(x)
## +     me <- median(x)
## +     cv <- 100 * s/m
## +     css <- sum((x - m)^2)
## +     uss <- sum(x^2)
## +     R <- max(x) - min(x)
## +     R1 <- quantile(x, 3/4) - quantile(x, 1/4)
## +     sm <- s/sqrt(n)
## +     g1 <- n/((n - 1) * (n - 2)) * sum((x - m)^3)/s^3
## +     g2 <- ((n * (n + 1))/((n - 1) * (n - 2) * (n - 3)) * sum((x - 
## +         m)^4)/s^4 - (3 * (n - 1)^2)/((n - 2) * (n - 3)))
## +     data.frame(N = n, Mean = m, Var = v, std_dev = s, Median = me, 
## +         std_mean = sm, CV = cv, CSS = css, USS = uss, R = R, 
## +         R1 = R1, Skewness = g1, Kurtosis = g2, row.names = 1)
## + }
data_outline(w)
##     N  Mean   Var std_dev Median std_mean    CV  CSS    USS  R  R1
## 1 100 72.97 63.62   7.976   73.5   0.7976 10.93 6299 538731 80 4.8
##   Skewness Kurtosis
## 1   -6.504    56.06

3.5 小白鼠伤寒杆菌实验数据的箱线图

输入数据

dm <- data.frame(J1 = c(2, 4, 3, 2, 4, 7, 7, 2, 2, 5, 4, NA), J2 = c(5, 6, 8, 
    5, 10, 7, 12, 12, 6, 6, NA, NA), J3 = c(7, 11, 6, 6, 7, 9, 5, 5, 10, 6, 
    3, 10))
t(dm)
##    [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## J1    2    4    3    2    4    7    7    2    2     5     4    NA
## J2    5    6    8    5   10    7   12   12    6     6    NA    NA
## J3    7   11    6    6    7    9    5    5   10     6     3    10

使用boxplot画箱线图

boxplot(dm$J1, dm$J2, dm$J3, names = c("J1", "J2", "J3"))

plot of chunk unnamed-chunk-4

使用plot画箱线图

plot(factor(c(rep(1, length(dm$J1)), rep(2, length(dm$J2)), rep(3, length(dm$J3)))), 
    unlist(dm))

plot of chunk unnamed-chunk-5

结论,J1菌型平均存活天数明显短于J2,J3菌型。

3.7 学生的四项指标散点图

读入数据

# 从exam0203.txt直接读取数据,有表头
dt <- read.table("../R-Book-Demo/ch2/exam0203.txt", head = TRUE)
dt
##       Name Sex Age Height Weight
## 1    Alice   F  13   56.5   84.0
## 2    Becka   F  13   65.3   98.0
## 3     Gail   F  14   64.3   90.0
## 4    Karen   F  12   56.3   77.0
## 5    Kathy   F  12   59.8   84.5
## 6     Mary   F  15   66.5  112.0
## 7    Sandy   F  11   51.3   50.5
## 8   Sharon   F  15   62.5  112.5
## 9    Tammy   F  14   62.8  102.5
## 10  Alfred   M  14   69.0  112.5
## 11    Duke   M  14   63.5  102.5
## 12   Guido   M  15   67.0  133.0
## 13   James   M  12   57.3   83.0
## 14 Jeffrey   M  13   62.5   84.0
## 15    John   M  12   59.0   99.5
## 16  Philip   M  16   72.0  150.0
## 17  Robert   M  12   64.8  128.0
## 18  Thomas   M  11   57.5   85.0
## 19 William   M  15   66.5  112.0

体重对于身高的散点图

plot(dt$Weight, dt$Height, xlab = "Weight", ylab = "Height")

plot of chunk unnamed-chunk-7

不同性别情况下,体重对于身高的散点图

coplot(dt$Weight ~ dt$Height | dt$Sex, xlab = "Weight", ylab = "Height")

plot of chunk unnamed-chunk-8

不同年龄段下,体重对于身高的散点图

coplot(dt$Weight ~ dt$Height | dt$Age, xlab = "Weight", ylab = "Height")

plot of chunk unnamed-chunk-9

不同性别,不同年龄段下,体重对于身高的散点图

coplot(dt$Weight ~ dt$Height | dt$Age + dt$Sex, xlab = "Weight", ylab = "Height")

plot of chunk unnamed-chunk-10

3.8 函数的三维网格曲面和二维等值线

定义函数

f <- function(x, y) x^4 - 2 * (x^2) * y + x^2 - 2 * x * y + 2 * (y^2) + 9/2 * 
    x - 4 * y + 4

定义x,y

x <- seq(-2, 3, 0.05)
y <- seq(-1, 7, 0.05)

计算函数值

z <- outer(x, y, f)

函数的二维等值线图

contour(x, y, z, levels = c(0, 1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50, 60, 80, 
    100))

plot of chunk unnamed-chunk-14

函数的三维网格曲面图

persp(x, y, z, theta = -30, phi = 25, col = "yellow")

plot of chunk unnamed-chunk-15

3.10 求职者问题的星图

读入求职者数据

pushd = getwd()
setwd("../R-Book-Demo/ch3")
source("group.R", echo = TRUE)
## 
## > rt <- read.table("applicant.data")
## 
## > attach(rt)
## 
## > rt$G1 <- (SC + LC + SMS + DRV + AMB + GSP + POT)/7
## 
## > rt$G2 <- (FL + EXP + SUIT)/3
## 
## > rt$G3 <- (LA + HON + KJ)/3
## 
## > rt$G4 <- AA
## 
## > rt$G5 <- APP
## 
## > AVG <- apply(rt[, 16:20], 1, mean)
## 
## > sort(AVG, decreasing = TRUE)
##     8    40    39     7    23     9     2    22    24    16    46     5 
## 9.000 8.971 8.914 8.619 8.390 8.210 8.067 8.057 8.038 7.571 7.533 7.314 
##    10    20    13    44    17    14    12     6    27    45     3    21 
## 7.305 7.219 7.210 7.095 7.076 7.048 7.029 7.019 6.914 6.905 6.733 6.695 
##    15    11     4    26    18    25    38    19    37     1    36    41 
## 6.619 6.495 6.390 6.352 6.219 6.200 6.152 5.952 5.924 5.886 5.686 5.362 
##    43    31    30    32    47    33    35    42    48    34    28    29 
## 5.314 5.048 5.010 4.648 4.524 4.438 4.257 4.257 4.190 4.048 3.267 2.514
setwd(pushd)

以15项自变量画星图

stars(rt[1:15], draw.segments = TRUE)

plot of chunk unnamed-chunk-17

以G1,G2,G3,G4,G5为轴画星图

stars(rt[16:20], draw.segments = TRUE)

plot of chunk unnamed-chunk-18

由图可看到明显7,8,9,2,39,40较佳,但是还是没有按照sort结果精确。

3.11 求职者问题的调和曲线图

定义调和曲线函数

pushd = getwd()
setwd("../R-Book-Demo/ch3")
source("unison.R", echo = TRUE, max.deparse.length = 3000)
## 
## > unison <- function(x) {
## +     if (is.data.frame(x) == TRUE) 
## +         x <- as.matrix(x)
## +     t <- seq(-pi, pi, pi/30)
## +     m <- nrow(x)
## +     n <- ncol(x)
## +     f <- array(0, c(m, length(t)))
## +     for (i in 1:m) {
## +         f[i, ] <- x[i, 1]/sqrt(2)
## +         for (j in 2:n) {
## +             if (j%%2 == 0) 
## +                 f[i, ] <- f[i, ] + x[i, j] * sin(j/2 * t)
## +             else f[i, ] <- f[i, ] + x[i, j] * cos(j%/%2 * t)
## +         }
## +     }
## +     plot(c(-pi, pi), c(min(f), max(f)), type = "n", main = "The Unison graph of Data", 
## +         xlab = "t", ylab = "f(t)")
## +     for (i in 1:m) lines(t, f[i, ], col = i)
## + }
setwd(pushd)

以G1,G2,G3,G4,G5为自变量画调和曲线图

unison(rt[16:20])

plot of chunk unnamed-chunk-20