读入数据
w <- as.vector(unlist(read.table("exec0301.data")))
w
## [1] 74.3 79.5 75.0 73.5 75.8 70.4 73.5 67.2 75.8 73.5 78.8 75.6 73.5 75.0
## [15] 75.8 72.0 79.5 76.5 73.5 79.5 68.8 75.0 78.8 72.0 68.8 76.5 73.5 72.7
## [29] 75.0 70.4 78.0 78.8 74.3 64.3 76.5 74.3 74.7 70.4 73.5 76.5 70.4 72.0
## [43] 75.8 75.8 70.4 76.5 65.0 77.2 73.5 72.7 80.5 72.0 65.0 80.3 71.2 77.6
## [57] 76.5 68.8 73.5 77.2 80.5 72.0 74.3 69.7 81.2 67.3 81.6 67.3 72.7 84.3
## [71] 69.7 74.3 71.2 74.3 75.0 72.0 75.4 67.3 81.6 75.0 71.2 71.2 69.7 73.5
## [85] 70.4 75.0 72.7 67.3 70.3 76.5 73.5 72.0 68.0 73.5 68.0 74.3 72.7 72.7
## [99] 74.3 70.4
计算均值(mean)、方差标准差(std_dev)、极差®、标准误(sm)、变异系数(cv)、偏度(Skewness)、峰度(Kurtosis)。
source("../R-Book-Demo/ch3/data_outline.R", echo = TRUE, max.deparse.length = 3000)
##
## > data_outline <- function(x) {
## + n <- length(x)
## + m <- mean(x)
## + v <- var(x)
## + s <- sd(x)
## + me <- median(x)
## + cv <- 100 * s/m
## + css <- sum((x - m)^2)
## + uss <- sum(x^2)
## + R <- max(x) - min(x)
## + R1 <- quantile(x, 3/4) - quantile(x, 1/4)
## + sm <- s/sqrt(n)
## + g1 <- n/((n - 1) * (n - 2)) * sum((x - m)^3)/s^3
## + g2 <- ((n * (n + 1))/((n - 1) * (n - 2) * (n - 3)) * sum((x -
## + m)^4)/s^4 - (3 * (n - 1)^2)/((n - 2) * (n - 3)))
## + data.frame(N = n, Mean = m, Var = v, std_dev = s, Median = me,
## + std_mean = sm, CV = cv, CSS = css, USS = uss, R = R,
## + R1 = R1, Skewness = g1, Kurtosis = g2, row.names = 1)
## + }
dow <- data_outline(w)
print(dow, digits = 4)
## N Mean Var std_dev Median std_mean CV CSS USS R R1
## 1 100 73.67 15.52 3.939 73.5 0.3939 5.347 1536 544233 20 4.6
## Skewness Kurtosis
## 1 0.05406 0.03702
直方图及密度估计曲线(绿色)以及正态密度曲线(红色)
hist(w, 20, col = "blue", freq = F)
lines(density(w), lwd = 2, col = "green")
x <- min(w):max(w)
lines(x, dnorm(x, mean(w), sd(w)), col = "red")
经验分布曲线与正态分布曲线(红色)
plot(ecdf(w), verticals = T, do.p = F)
lines(x, pnorm(x, mean(w), sd(w), log.p = F), col = "red")
正态QQ图和相应直线
qqnorm(w)
qqline(w)
stem(w, scale = 1)
##
## The decimal point is at the |
##
## 64 | 300
## 66 | 23333
## 68 | 00888777
## 70 | 344444442222
## 72 | 0000000777777555555555555
## 74 | 33333333700000004688888
## 76 | 5555555226
## 78 | 0888555
## 80 | 355266
## 82 |
## 84 | 3
boxplot(w, outline = F)
fivenum(w, na.rm = F)
## [1] 64.3 71.2 73.5 75.8 84.3
W检验方法
shapiro.test(w)
##
## Shapiro-Wilk normality test
##
## data: w
## W = 0.9901, p-value = 0.6708
Kolmogorov-Smirnov检验方法
ks.test(w, "pnorm", mean(w), sd(w))
## Warning: ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: w
## D = 0.073, p-value = 0.6611
## alternative hypothesis: two-sided
p值大于0.05,故数据可以认为是正态分布的。
读入数据
# 从exam0203.txt直接读取数据,有表头
dt <- read.table("../R-Book-Demo/ch2/exam0203.txt", head = TRUE)
dt
## Name Sex Age Height Weight
## 1 Alice F 13 56.5 84.0
## 2 Becka F 13 65.3 98.0
## 3 Gail F 14 64.3 90.0
## 4 Karen F 12 56.3 77.0
## 5 Kathy F 12 59.8 84.5
## 6 Mary F 15 66.5 112.0
## 7 Sandy F 11 51.3 50.5
## 8 Sharon F 15 62.5 112.5
## 9 Tammy F 14 62.8 102.5
## 10 Alfred M 14 69.0 112.5
## 11 Duke M 14 63.5 102.5
## 12 Guido M 15 67.0 133.0
## 13 James M 12 57.3 83.0
## 14 Jeffrey M 13 62.5 84.0
## 15 John M 12 59.0 99.5
## 16 Philip M 16 72.0 150.0
## 17 Robert M 12 64.8 128.0
## 18 Thomas M 11 57.5 85.0
## 19 William M 15 66.5 112.0
检验
cor.test(dt$Height, dt$Weight, methods = "Pearson")
##
## Pearson's product-moment correlation
##
## data: dt$Height and dt$Weight
## t = 7.555, df = 17, p-value = 7.887e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7044 0.9523
## sample estimates:
## cor
## 0.8778
因为p-value < 0.05,所以相关。
散点图及相关性
x = c(5.1, 3.5, 7.1, 6.2, 8.8, 7.8, 4.5, 5.6, 8, 6.4)
y = c(1907, 1287, 2700, 2373, 3260, 3000, 1947, 2273, 3113, 2493)
plot(x, y)
cor.test(x, y, methods = "Pearson")
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 18.91, df = 8, p-value = 6.33e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9525 0.9975
## sample estimates:
## cor
## 0.989
因为p-value < 0.05 所以相关。
求Y关于X的一元线性回归方程
d61 <- data.frame(x, y)
lm.sol <- lm(y ~ x, data = d61)
summary(lm.sol)
##
## Call:
## lm(formula = y ~ x, data = d61)
##
## Residuals:
## Min 1Q Median 3Q Max
## -128.59 -70.98 -3.73 49.26 167.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 141.0 125.1 1.13 0.29
## x 364.2 19.3 18.91 6.3e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 96.4 on 8 degrees of freedom
## Multiple R-squared: 0.978, Adjusted R-squared: 0.975
## F-statistic: 358 on 1 and 8 DF, p-value: 6.33e-08
回归方程:y=141+364.2x
因为p-value < 0.05,所以认为通过显著性检验。
预测今年数据
new <- data.frame(x = 7)
predict(lm.sol, new, interval = "prediction", level = 0.95)
## fit lwr upr
## 1 2690 2455 2925