plyr exercise 03

1.对于R的内置数据集iris，对不同的Species构建Sepal.Length~Sepal.Width的线性模型并比较三个线性回归模型的拟合效果。

library(plyr)

regress = function(df) {
    lm(Sepal.Length ~ Sepal.Width, data = df)
}

(lms = dlply(iris, .(Species), regress))

## $setosa
## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = df)
## 
## Coefficients:
## (Intercept)  Sepal.Width  
##        2.64         0.69  
## 
## 
## $versicolor
## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = df)
## 
## Coefficients:
## (Intercept)  Sepal.Width  
##       3.540        0.865  
## 
## 
## $virginica
## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = df)
## 
## Coefficients:
## (Intercept)  Sepal.Width  
##       3.907        0.902  
## 
## 
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
##      Species
## 1     setosa
## 2 versicolor
## 3  virginica


# 以模型1为例，查看回归模型结果
summary(lms[1][[1]])

## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5248 -0.1629  0.0217  0.1383  0.4443 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.6390     0.3100    8.51  3.7e-11 ***
## Sepal.Width   0.6905     0.0899    7.68  6.7e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.239 on 48 degrees of freedom
## Multiple R-squared:  0.551,  Adjusted R-squared:  0.542 
## F-statistic:   59 on 1 and 48 DF,  p-value: 6.71e-10


# 查看回归模型所包括的变量
attributes(summary(lms[1][[1]]))

## $names
##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled" 
## 
## $class
## [1] "summary.lm"

names(summary(lms[1][[1]]))  #可以引用的参数名

##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled"

summary(lms[1][[1]])$r.squared  #R^2

## [1] 0.5514

coef(summary(lms[1][[1]]))  #回归系数及标准差、t-value、p-value

##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)   2.6390     0.3100   8.513 3.742e-11
## Sepal.Width   0.6905     0.0899   7.681 6.710e-10

coef(summary(lms[1][[1]]))[, 4]  #回归系数的p-value

## (Intercept) Sepal.Width 
##   3.742e-11   6.710e-10

(fstat = summary(lms[1][[1]])$fstatistic)  #F统计量

## value numdf dendf 
## 58.99  1.00 48.00

1 - splat(pf)(rename(fstat, c(value = "q", numdf = "df1", dendf = "df2")))  #模型的p-value

## [1] 6.71e-10


# 根据以上观察，取出各个模型的R^2和显著性水平

# 取出系数及系数的p-value
coefs <- function(x) {
    mat = coef(summary(x))
    b0 = mat[1, 1]
    pr.b0 = mat[1, 4]
    b1 = mat[2, 1]
    pr.b1 = mat[2, 4]
    c(intercept = b0, pr.intercept = pr.b0, slope = b1, pr.slope = pr.b1)
}

# 取出R^2
rsq <- function(x) summary(x)$r.squared

# 取出模型的p-value
pr.model <- function(x) {
    fstat = summary(x)$fstatistic
    1 - splat(pf)(rename(fstat, c(value = "q", numdf = "df1", dendf = "df2")))
}

# 汇总三个模型的拟合效果
ldply(lms, function(x) c(coefs(x), rsquare = rsq(x), pr.model = pr.model(x)))

##      Species intercept pr.intercept  slope  pr.slope rsquare  pr.model
## 1     setosa     2.639    3.742e-11 0.6905 6.710e-10  0.5514 6.710e-10
## 2 versicolor     3.540    9.069e-08 0.8651 8.772e-05  0.2766 8.772e-05
## 3  virginica     3.907    4.656e-06 0.9015 8.435e-04  0.2091 8.435e-04

2.对于plyr中的数据集baseball，计算每个队伍(team变量)的每一年的人均击球得分，并画出每个队伍的人均击球得分的时间序列图。

head(baseball)

##            id year stint team lg  g  ab  r  h X2b X3b hr rbi sb cs bb so
## 4   ansonca01 1871     1  RC1    25 120 29 39  11   3  0  16  6  2  2  1
## 44  forceda01 1871     1  WS3    32 162 45 45   9   4  0  29  8  0  4  0
## 68  mathebo01 1871     1  FW1    19  89 15 24   3   1  0  10  2  1  2  0
## 99  startjo01 1871     1  NY2    33 161 35 58   5   1  1  34  4  2  3  0
## 102 suttoez01 1871     1  CL1    29 128 35 45   3   7  3  23  3  1  1  0
## 106 whitede01 1871     1  CL1    29 146 40 47   6   5  1  21  2  2  4  1
##     ibb hbp sh sf gidp
## 4    NA  NA NA NA   NA
## 44   NA  NA NA NA   NA
## 68   NA  NA NA NA   NA
## 99   NA  NA NA NA   NA
## 102  NA  NA NA NA   NA
## 106  NA  NA NA NA   NA


# 按team和year分组计算rbi的均值
data = ddply(baseball, .(team, year), summarize, mean.rbi = mean(rbi))

# 为了将参赛年份太少的球队排除出去，按team分组，增加一列参赛年数ys
data1 = ddply(data, .(team), transform, ys = length(year))
head(data1)

##   team year mean.rbi ys
## 1  ALT 1884       NA  1
## 2  ANA 1997   18.333  8
## 3  ANA 1998   20.182  8
## 4  ANA 1999   10.714  8
## 5  ANA 2000    4.000  8
## 6  ANA 2001    5.333  8


# 只选择参赛年数多于3年的球队，并且去掉那些平均击球得分值不存在的行
data2 = subset(data1, ys > 3 & !is.na(mean.rbi))
head(data2)

##   team year mean.rbi ys
## 2  ANA 1997   18.333  8
## 3  ANA 1998   20.182  8
## 4  ANA 1999   10.714  8
## 5  ANA 2000    4.000  8
## 6  ANA 2001    5.333  8
## 7  ANA 2002    0.000  8


# 设定画图边框的x、y轴位置
xlim <- range(data2$year, na.rm = TRUE)
ylim <- range(data2$mean.rbi, na.rm = TRUE)

# 绘图函数：将mean.rbi对year做线图
plotpattern <- function(df) {
    require(ggplot2)
    qplot(year, mean.rbi, data = df, geom = "line", main = df$team) + scale_x_continuous(limits = xlim, 
        breaks = seq(floor(xlim[1]/10) * 10, ceiling(xlim[2]/10) * 10, 10)) + 
        scale_y_continuous(limits = ylim, breaks = seq(-10, 110, 10))
}

# 测试绘图函数
plotpattern(subset(data2, team == "CHN"))

## Loading required package: ggplot2

plot of chunk unnamed-chunk-2

# 绘制各个球队的曲线图，并保存在pdf文件中
pdf("E:\\A 学习\\1课程与书目\\2014春\\Dataguru-plyr\\3\\作业\\rbi.pdf", width = 8, 
    height = 4)
# d_ply函数本身没有输出，但可以将结果图打印到绘图设备（.print=T）
# 按不同的球队分组（球队的排序按照mean(mean.rbi)值从低到高），分别画出各个选手的图
d_ply(data2, .(reorder(team, mean.rbi)), failwith(NA, plotpattern), .print = TRUE)
dev.off()