library(plyr)
regress = function(df) {
lm(Sepal.Length ~ Sepal.Width, data = df)
}
(lms = dlply(iris, .(Species), regress))
## $setosa
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = df)
##
## Coefficients:
## (Intercept) Sepal.Width
## 2.64 0.69
##
##
## $versicolor
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = df)
##
## Coefficients:
## (Intercept) Sepal.Width
## 3.540 0.865
##
##
## $virginica
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = df)
##
## Coefficients:
## (Intercept) Sepal.Width
## 3.907 0.902
##
##
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
## Species
## 1 setosa
## 2 versicolor
## 3 virginica
# 以模型1为例,查看回归模型结果
summary(lms[1][[1]])
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5248 -0.1629 0.0217 0.1383 0.4443
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.6390 0.3100 8.51 3.7e-11 ***
## Sepal.Width 0.6905 0.0899 7.68 6.7e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.239 on 48 degrees of freedom
## Multiple R-squared: 0.551, Adjusted R-squared: 0.542
## F-statistic: 59 on 1 and 48 DF, p-value: 6.71e-10
# 查看回归模型所包括的变量
attributes(summary(lms[1][[1]]))
## $names
## [1] "call" "terms" "residuals" "coefficients"
## [5] "aliased" "sigma" "df" "r.squared"
## [9] "adj.r.squared" "fstatistic" "cov.unscaled"
##
## $class
## [1] "summary.lm"
names(summary(lms[1][[1]])) #可以引用的参数名
## [1] "call" "terms" "residuals" "coefficients"
## [5] "aliased" "sigma" "df" "r.squared"
## [9] "adj.r.squared" "fstatistic" "cov.unscaled"
summary(lms[1][[1]])$r.squared #R^2
## [1] 0.5514
coef(summary(lms[1][[1]])) #回归系数及标准差、t-value、p-value
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.6390 0.3100 8.513 3.742e-11
## Sepal.Width 0.6905 0.0899 7.681 6.710e-10
coef(summary(lms[1][[1]]))[, 4] #回归系数的p-value
## (Intercept) Sepal.Width
## 3.742e-11 6.710e-10
(fstat = summary(lms[1][[1]])$fstatistic) #F统计量
## value numdf dendf
## 58.99 1.00 48.00
1 - splat(pf)(rename(fstat, c(value = "q", numdf = "df1", dendf = "df2"))) #模型的p-value
## [1] 6.71e-10
# 根据以上观察,取出各个模型的R^2和显著性水平
# 取出系数及系数的p-value
coefs <- function(x) {
mat = coef(summary(x))
b0 = mat[1, 1]
pr.b0 = mat[1, 4]
b1 = mat[2, 1]
pr.b1 = mat[2, 4]
c(intercept = b0, pr.intercept = pr.b0, slope = b1, pr.slope = pr.b1)
}
# 取出R^2
rsq <- function(x) summary(x)$r.squared
# 取出模型的p-value
pr.model <- function(x) {
fstat = summary(x)$fstatistic
1 - splat(pf)(rename(fstat, c(value = "q", numdf = "df1", dendf = "df2")))
}
# 汇总三个模型的拟合效果
ldply(lms, function(x) c(coefs(x), rsquare = rsq(x), pr.model = pr.model(x)))
## Species intercept pr.intercept slope pr.slope rsquare pr.model
## 1 setosa 2.639 3.742e-11 0.6905 6.710e-10 0.5514 6.710e-10
## 2 versicolor 3.540 9.069e-08 0.8651 8.772e-05 0.2766 8.772e-05
## 3 virginica 3.907 4.656e-06 0.9015 8.435e-04 0.2091 8.435e-04
head(baseball)
## id year stint team lg g ab r h X2b X3b hr rbi sb cs bb so
## 4 ansonca01 1871 1 RC1 25 120 29 39 11 3 0 16 6 2 2 1
## 44 forceda01 1871 1 WS3 32 162 45 45 9 4 0 29 8 0 4 0
## 68 mathebo01 1871 1 FW1 19 89 15 24 3 1 0 10 2 1 2 0
## 99 startjo01 1871 1 NY2 33 161 35 58 5 1 1 34 4 2 3 0
## 102 suttoez01 1871 1 CL1 29 128 35 45 3 7 3 23 3 1 1 0
## 106 whitede01 1871 1 CL1 29 146 40 47 6 5 1 21 2 2 4 1
## ibb hbp sh sf gidp
## 4 NA NA NA NA NA
## 44 NA NA NA NA NA
## 68 NA NA NA NA NA
## 99 NA NA NA NA NA
## 102 NA NA NA NA NA
## 106 NA NA NA NA NA
# 按team和year分组计算rbi的均值
data = ddply(baseball, .(team, year), summarize, mean.rbi = mean(rbi))
# 为了将参赛年份太少的球队排除出去,按team分组,增加一列参赛年数ys
data1 = ddply(data, .(team), transform, ys = length(year))
head(data1)
## team year mean.rbi ys
## 1 ALT 1884 NA 1
## 2 ANA 1997 18.333 8
## 3 ANA 1998 20.182 8
## 4 ANA 1999 10.714 8
## 5 ANA 2000 4.000 8
## 6 ANA 2001 5.333 8
# 只选择参赛年数多于3年的球队,并且去掉那些平均击球得分值不存在的行
data2 = subset(data1, ys > 3 & !is.na(mean.rbi))
head(data2)
## team year mean.rbi ys
## 2 ANA 1997 18.333 8
## 3 ANA 1998 20.182 8
## 4 ANA 1999 10.714 8
## 5 ANA 2000 4.000 8
## 6 ANA 2001 5.333 8
## 7 ANA 2002 0.000 8
# 设定画图边框的x、y轴位置
xlim <- range(data2$year, na.rm = TRUE)
ylim <- range(data2$mean.rbi, na.rm = TRUE)
# 绘图函数:将mean.rbi对year做线图
plotpattern <- function(df) {
require(ggplot2)
qplot(year, mean.rbi, data = df, geom = "line", main = df$team) + scale_x_continuous(limits = xlim,
breaks = seq(floor(xlim[1]/10) * 10, ceiling(xlim[2]/10) * 10, 10)) +
scale_y_continuous(limits = ylim, breaks = seq(-10, 110, 10))
}
# 测试绘图函数
plotpattern(subset(data2, team == "CHN"))
## Loading required package: ggplot2
# 绘制各个球队的曲线图,并保存在pdf文件中
pdf("E:\\A 学习\\1课程与书目\\2014春\\Dataguru-plyr\\3\\作业\\rbi.pdf", width = 8,
height = 4)
# d_ply函数本身没有输出,但可以将结果图打印到绘图设备(.print=T)
# 按不同的球队分组(球队的排序按照mean(mean.rbi)值从低到高),分别画出各个选手的图
d_ply(data2, .(reorder(team, mean.rbi)), failwith(NA, plotpattern), .print = TRUE)
dev.off()