notes

第1章多元统计分析概述

X=matrix(1:20,10,2)
X

##       [,1] [,2]
##  [1,]    1   11
##  [2,]    2   12
##  [3,]    3   13
##  [4,]    4   14
##  [5,]    5   15
##  [6,]    6   16
##  [7,]    7   17
##  [8,]    8   18
##  [9,]    9   19
## [10,]   10   20

x=rnorm(100)#产生100个标准正态分布随机数
x

##   [1]  0.50004507  1.46773649  0.89908609 -0.44076450 -0.01943443
##   [6] -0.21289645 -1.40586848  0.78254826 -0.22203210  0.96791077
##  [11]  0.13402647  1.76918599 -0.91254099  0.74271781  0.70999072
##  [16] -0.61818885  1.68648950  0.16834679 -1.04886828 -1.29707930
##  [21]  1.13559290 -0.85227718  1.07760431 -1.24772250 -0.31498643
##  [26]  0.37235728 -0.48250220 -2.18963544 -0.65130278 -0.54361045
##  [31]  1.16003852 -0.09250449 -0.39585485 -0.86323445 -0.14611100
##  [36] -0.65601751 -0.41133070 -0.84982965 -1.50257417 -0.60946256
##  [41] -1.59371706 -0.38664190  0.92465777 -0.49111688  0.73551881
##  [46] -0.61234433 -1.38849905  0.22220757 -0.44307734  0.64588686
##  [51] -0.20535238 -1.01849811  0.98657954 -0.04260383 -0.44246464
##  [56]  1.49241284 -0.07323067 -0.60024476 -1.38231590 -1.03977229
##  [61]  0.09707323  0.67549212 -2.92765502 -1.14404648  1.03898238
##  [66] -0.05616122 -0.94582967  1.83842730 -0.26522375 -0.20266148
##  [71]  0.31541506 -0.23537082 -0.26907570 -0.41467273  1.61586293
##  [76] -0.11457830  0.69736258 -0.39956188 -1.03299561 -0.03515792
##  [81] -0.75708439  1.91107646  1.62754340 -0.81432850  1.32146026
##  [86]  1.03751034 -0.53821456  0.24921928  0.71920170 -0.11678561
##  [91] -2.19645217  0.53184495  2.07016109  0.35925740  0.22802105
##  [96]  1.26762758 -1.40609909  0.22341295  0.46947248  0.17849808

hist(x,prob=T)#做x这组数据的直方图，其中纵轴表示频率
curve(dnorm(x),add=T)#添加直方图拟合的密度函数曲线

第2章多元数据的数学表达及R使用

x1=c(171,175,159,155,152,158,154,164,168,166,159,164)#创建一个向量
x2=c(57,64,41,38,35,44,41,51,57,49,47,46)
length(x1)#向量的长度

## [1] 12

mode(x1)#数据的类型

## [1] "numeric"

rbind(x1,x2)#按行合并

##    [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## x1  171  175  159  155  152  158  154  164  168   166   159   164
## x2   57   64   41   38   35   44   41   51   57    49    47    46

cbind(x1,x2)#按列合并

##        x1 x2
##  [1,] 171 57
##  [2,] 175 64
##  [3,] 159 41
##  [4,] 155 38
##  [5,] 152 35
##  [6,] 158 44
##  [7,] 154 41
##  [8,] 164 51
##  [9,] 168 57
## [10,] 166 49
## [11,] 159 47
## [12,] 164 46

matrix(x1,nrow=3,ncol=4)#利用x1数据创建矩阵

##      [,1] [,2] [,3] [,4]
## [1,]  171  155  154  166
## [2,]  175  152  164  159
## [3,]  159  158  168  164

matrix(x1,nrow=4,ncol=3)#创建行数列数发生变化的矩阵

##      [,1] [,2] [,3]
## [1,]  171  152  168
## [2,]  175  158  166
## [3,]  159  154  159
## [4,]  155  164  164

matrix(x1,nrow=4,ncol=3,byrow=T)#创建按照行排列的矩阵

##      [,1] [,2] [,3]
## [1,]  171  175  159
## [2,]  155  152  158
## [3,]  154  164  168
## [4,]  166  159  164

A=matrix(1:12,nrow=3,ncol=4)#创建矩阵
t(A)#求矩阵转置

##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
## [3,]    7    8    9
## [4,]   10   11   12

A=B=matrix(1:12,nrow=3,ncol=4)#创建两个相同的矩
A+B#矩阵加法

##      [,1] [,2] [,3] [,4]
## [1,]    2    8   14   20
## [2,]    4   10   16   22
## [3,]    6   12   18   24

A-B#矩阵减法

##      [,1] [,2] [,3] [,4]
## [1,]    0    0    0    0
## [2,]    0    0    0    0
## [3,]    0    0    0    0

A=matrix(1:12,nrow=3,ncol=4)#创建矩阵
B=matrix(1:12,nrow=4,ncol=3)#创建矩阵
A%*%B#求矩阵的乘积

##      [,1] [,2] [,3]
## [1,]   70  158  246
## [2,]   80  184  288
## [3,]   90  210  330

A=matrix(1:16,nrow=4,ncol=4)#创建行列数相等矩阵
diag(A)#获得矩阵对角线元素

## [1]  1  6 11 16

diag(diag(A))#利用对角线元素创建对角矩阵

##      [,1] [,2] [,3] [,4]
## [1,]    1    0    0    0
## [2,]    0    6    0    0
## [3,]    0    0   11    0
## [4,]    0    0    0   16

diag(3)#创建3阶单位矩阵

##      [,1] [,2] [,3]
## [1,]    1    0    0
## [2,]    0    1    0
## [3,]    0    0    1

A=matrix(rnorm(16),4,4)#创建矩阵
A

##            [,1]        [,2]       [,3]          [,4]
## [1,] -0.7413759 -2.86248551  1.4534779  0.5867999069
## [2,]  0.1100988  2.04144332 -1.4332151 -0.5420695305
## [3,]  0.5326002 -2.26919486  0.2347451  0.0008461842
## [4,]  0.5958555 -0.02321432 -1.4678842 -0.8202258850

solve(A)#求矩阵的逆

##            [,1]       [,2]       [,3]       [,4]
## [1,] -1.1032243 -0.4707900  0.9730097 -0.4771218
## [2,] -0.3491463 -0.5572976 -0.5028235  0.1180035
## [3,] -0.8748148 -4.3459010 -2.8291284  2.2433398
## [4,]  0.7740181  7.4512325  5.7841123 -5.5838251

(A=diag(4)+1)#创建矩阵

##      [,1] [,2] [,3] [,4]
## [1,]    2    1    1    1
## [2,]    1    2    1    1
## [3,]    1    1    2    1
## [4,]    1    1    1    2

(A.e=eigen(A,symmetric=T))#求矩阵的特征值与特征向量

## $values
## [1] 5 1 1 1
## 
## $vectors
##      [,1]       [,2]       [,3]       [,4]
## [1,] -0.5  0.8660254  0.0000000  0.0000000
## [2,] -0.5 -0.2886751 -0.5773503 -0.5773503
## [3,] -0.5 -0.2886751 -0.2113249  0.7886751
## [4,] -0.5 -0.2886751  0.7886751 -0.2113249

A.e$vectors%*%diag(A.e$values)%*%t(A.e$vectors)#特征向量矩阵U和特征值矩阵D与原矩阵A的关系A=UDU'

##      [,1] [,2] [,3] [,4]
## [1,]    2    1    1    1
## [2,]    1    2    1    1
## [3,]    1    1    2    1
## [4,]    1    1    1    2

(A.c=chol(A))#矩阵的Choleskey分解

##          [,1]      [,2]      [,3]      [,4]
## [1,] 1.414214 0.7071068 0.7071068 0.7071068
## [2,] 0.000000 1.2247449 0.4082483 0.4082483
## [3,] 0.000000 0.0000000 1.1547005 0.2886751
## [4,] 0.000000 0.0000000 0.0000000 1.1180340

t(A.c)%*%A.c#Choleskey分解矩阵V与原矩阵A.c的关系A.c=V'V

##      [,1] [,2] [,3] [,4]
## [1,]    2    1    1    1
## [2,]    1    2    1    1
## [3,]    1    1    2    1
## [4,]    1    1    1    2

(A=matrix(1:18,3,6))#创建矩阵

##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    4    7   10   13   16
## [2,]    2    5    8   11   14   17
## [3,]    3    6    9   12   15   18

(A.s=svd(A))#矩阵的奇异值分解

## $d
## [1] 4.589453e+01 1.640705e+00 1.366522e-15
## 
## $u
##            [,1]        [,2]       [,3]
## [1,] -0.5290354  0.74394551  0.4082483
## [2,] -0.5760715  0.03840487 -0.8164966
## [3,] -0.6231077 -0.66713577  0.4082483
## 
## $v
##             [,1]        [,2]       [,3]
## [1,] -0.07736219 -0.71960032 -0.4076688
## [2,] -0.19033085 -0.50893247  0.5745647
## [3,] -0.30329950 -0.29826463 -0.0280114
## [4,] -0.41626816 -0.08759679  0.2226621
## [5,] -0.52923682  0.12307105 -0.6212052
## [6,] -0.64220548  0.33373889  0.2596585

A.s$u%*%diag(A.s$d)%*%t(A.s$v)#矩阵的奇异值分解结果与原矩阵A的关系A=UDV'

##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    4    7   10   13   16
## [2,]    2    5    8   11   14   17
## [3,]    3    6    9   12   15   18

(A=matrix(1:16,4,4))#创建矩阵

##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
## [3,]    3    7   11   15
## [4,]    4    8   12   16

qr(A)#矩阵的QR分解

## $qr
##            [,1]        [,2]          [,3]          [,4]
## [1,] -5.4772256 -12.7801930 -2.008316e+01 -2.738613e+01
## [2,]  0.3651484  -3.2659863 -6.531973e+00 -9.797959e+00
## [3,]  0.5477226  -0.3781696  1.601186e-15  2.217027e-15
## [4,]  0.7302967  -0.9124744 -5.547002e-01 -1.478018e-15
## 
## $rank
## [1] 2
## 
## $qraux
## [1] 1.182574e+00 1.156135e+00 1.832050e+00 1.478018e-15
## 
## $pivot
## [1] 1 2 3 4
## 
## attr(,"class")
## [1] "qr"

(A=matrix(1:4,2,2))#创建矩阵

##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4

(B=matrix(rep(1,4),2,2))#创建矩阵

##      [,1] [,2]
## [1,]    1    1
## [2,]    1    1

kronecker(A,B)#矩阵的kronecker积

##      [,1] [,2] [,3] [,4]
## [1,]    1    1    3    3
## [2,]    1    1    3    3
## [3,]    2    2    4    4
## [4,]    2    2    4    4

(X=data.frame(x1,x2))#产生由X1和X2构建的数据框

##     x1 x2
## 1  171 57
## 2  175 64
## 3  159 41
## 4  155 38
## 5  152 35
## 6  158 44
## 7  154 41
## 8  164 51
## 9  168 57
## 10 166 49
## 11 159 47
## 12 164 46

(X=data.frame('身高'=x1,'体重'=x2))据框新的列标签

##    身高 体重
## 1   171   57
## 2   175   64
## 3   159   41
## 4   155   38
## 5   152   35
## 6   158   44
## 7   154   41
## 8   164   51
## 9   168   57
## 10  166   49
## 11  159   47
## 12  164   46

apply(X为矩阵,MARGIN=1表示对行运算/=2表示对列预算,FIN用来指定运算函数,…)

A=matrix(1:12,3,4)#创建矩阵
dim(A)#矩阵的维数

## [1] 3 4

nrow(A)#矩阵的行数

## [1] 3

ncol(A)#矩阵的列数

## [1] 4

rowSums(A)#矩阵按行求和

## [1] 22 26 30

rowMeans(A)#矩阵按行求均值

## [1] 5.5 6.5 7.5

colSums(A)#矩阵按列求和

## [1]  6 15 24 33

colMeans(A)#矩阵按列求均值

## [1]  2  5  8 11

apply(A,1,sum)#矩阵按行求和

## [1] 22 26 30

apply(A,1,mean)#矩阵按行求均值

## [1] 5.5 6.5 7.5

apply(A,2,sum)#矩阵按列求和

## [1]  6 15 24 33

apply(A,2,mean)#矩阵按列求均值

## [1]  2  5  8 11

A=matrix(rnorm(100),20,5)#创建矩阵
apply(A,2,var)#矩阵按列求方差

## [1] 0.8380330 1.0014935 1.6928737 1.1800251 0.6701847

apply(A,2,function(x,a)x*a,a=2)#矩阵按列求函数结果

##             [,1]       [,2]       [,3]        [,4]       [,5]
##  [1,]  1.6849429 -0.4705260  1.7202792  1.11696729  1.1895191
##  [2,]  2.2467405 -0.1679803  0.2850114  0.70654885  0.2211689
##  [3,] -0.8005214  3.0963894 -2.1206206 -0.57409623 -1.5118071
##  [4,] -2.2196454  1.7805190 -0.4658638  0.21723989  3.1251585
##  [5,] -2.8726198  1.7046809  5.1375556  3.41780925  1.3096301
##  [6,] -0.3690779  3.0400686  5.7055706  2.36362412  1.7530150
##  [7,] -3.1351343  2.3625934  0.5885855 -0.08682032 -1.9947425
##  [8,] -0.1023500  0.4797240  3.4513281  0.26224780  0.8777756
##  [9,] -2.5504198  1.0925203 -1.6792587  1.18982543  0.8248051
## [10,]  0.1610013 -1.9089909  1.2408879 -1.45690524 -0.3419066
## [11,]  0.9678919 -0.2212908 -1.0023266 -3.16162704  2.8794111
## [12,]  0.1345222  3.8792559 -0.6621181  1.29030125 -2.6970918
## [13,]  0.3435619 -1.4836508 -2.9163719  0.63825222  0.2638939
## [14,]  0.4611490  1.4811701 -3.5187995 -0.63330142  0.4933913
## [15,] -0.5067010 -0.2271658 -2.0478149 -0.71247620 -0.5878192
## [16,]  0.1306488 -0.5617005  2.9284372 -2.35807615  0.8060922
## [17,]  3.6815116  3.3292026  2.9298623  5.44525945  1.6418616
## [18,] -0.6502591 -4.0517691 -2.1130174  0.02383377 -1.7659428
## [19,] -3.2965374  1.1335201  0.8974002 -3.26297045  1.4564526
## [20,]  0.8752568 -0.8393604  0.6941933  3.28867428 -2.0086394

X=read.csv(“textdata.csv”)#读取名为textdata的csv格式文档 library(RODBC)#加载RODBC软件包 Rcode<-odbcConnectExcel(“Rcode.xls”)#读取名为Rcode的Excel工作薄 (data<-sqlFetch(Rcode,“data”))#显示Rcode中名为data表单的数据 (codedata <- sqlFetch(Rcode,“codedata”))#显示Rcode中名为codata表单的数据 x1=c(171,175,159,155,152,158,154,164,168,166,159,164)#身高 x2=c(57,64,41,38,35,44,41,51,57,49,47,46)#体重 hist(x1) #做出身高的直方图 plot(x1,x2) #做出身高和体重的散点图

table(年龄)#一维列联表 barplot(table(年龄),col=1:7)#条形图 pie(table(结果))#饼图 table(年龄,性别) #二维列联表 barplot(table(年龄,性别),beside=T,col=1:7)#以性别分组的年龄条图 barplot(table(性别,年龄),beside=T,col=1:2)#以年龄分组的性别条图 ftable(年龄,性别,结果) #以年龄、性别排列的结果频数三维列联表 ftable(性别,年龄,结果)#以性别、年龄排列的结果频数三维列联表 (ft=ftable(性别,结果,年龄))#显示以性别、结果排列的年龄频数三维列联表 rowSums(ft)#求行和 colSums(ft)#求列和 sum(ft)#求总和

第3章多多元数据的直观表示及R使用

setwd("C:/Users/lenovo/Desktop")
d3.1=read.table("d3.1.txt",header=T) 
barplot(apply(d3.1,1,mean),las=3)#按行做均值条形图

barplot(apply(d3.1,2,mean))#按列做均值图条形图

barplot(apply(d3.1[,2:8],2,mean))#去掉第一列后的数据按列做均值条形图

barplot(apply(d3.1,2,median))#按列做中位数条形

pie(apply(d3.1,2,mean))#按列做均值饼图

boxplot(d3.1)#按列做箱线图

boxplot(d3.1,horizontal=T)#箱线图中图形按水平放置

stars(d3.1,full=T,key.loc=c(13,1.5))#具有图例的360度星相图

stars(d3.1,full=F,key.loc=c(13,1.5))#具有图例的180度星相图

stars(d3.1,full=T,draw.segments=T,key.loc=c(13,1.5))#具有图例的360度彩色圆形星相图

stars(d3.1,full=F,draw.segments=T,key.loc=c(13,1.5))#具有图例的180度彩色圆形星相图

library(aplpack)#加载aplpack包

## Warning: package 'aplpack' was built under R version 3.1.3

## Loading required package: tcltk

faces(d3.1,ncol.plot=7)#按每行7个做脸谱图

## effect of variables:
##  modified item       Var   
##  "height of face   " "食品"
##  "width of face    " "衣着"
##  "structure of face" "设备"
##  "height of mouth  " "医疗"
##  "width of mouth   " "交通"
##  "smiling          " "教育"
##  "height of eyes   " "居住"
##  "width of eyes    " "杂项"
##  "height of hair   " "食品"
##  "width of hair   "  "衣着"
##  "style of hair   "  "设备"
##  "height of nose  "  "医疗"
##  "width of nose   "  "交通"
##  "width of ear    "  "教育"
##  "height of ear   "  "居住"

faces(d3.1[,2:8],ncol.plot=7)#去掉第一个变量按每行7个做脸谱图

## effect of variables:
##  modified item       Var   
##  "height of face   " "衣着"
##  "width of face    " "设备"
##  "structure of face" "医疗"
##  "height of mouth  " "交通"
##  "width of mouth   " "教育"
##  "smiling          " "居住"
##  "height of eyes   " "杂项"
##  "width of eyes    " "衣着"
##  "height of hair   " "设备"
##  "width of hair   "  "医疗"
##  "style of hair   "  "交通"
##  "height of nose  "  "教育"
##  "width of nose   "  "居住"
##  "width of ear    "  "杂项"
##  "height of ear   "  "衣着"

faces(d3.1[c(1,9,19,28,29,30),])#选择第1,9,19,28,29,30个观测的多元数据做脸谱图

## effect of variables:
##  modified item       Var   
##  "height of face   " "食品"
##  "width of face    " "衣着"
##  "structure of face" "设备"
##  "height of mouth  " "医疗"
##  "width of mouth   " "交通"
##  "smiling          " "教育"
##  "height of eyes   " "居住"
##  "width of eyes    " "杂项"
##  "height of hair   " "食品"
##  "width of hair   "  "衣着"
##  "style of hair   "  "设备"
##  "height of nose  "  "医疗"
##  "width of nose   "  "交通"
##  "width of ear    "  "教育"
##  "height of ear   "  "居住"

library(mvstats)#加载mvstats包
plot.andrews(d3.1)#绘制调和曲线图

plot.andrews(d3.1[c(1,9,19,28,29,30),])#选择第1,9,19,28,29,30个观测的多元数据做调和曲线图

第4章多元相关与回归分析及R使用

plot(x1,x2)#做散点图

lxy<-function(x,y){n=length(x);sum(x*y)-sum(x)*sum(y)/n}#建立离均差乘积和函数
lxy(x1,x1)#x1的离均差平方和

## [1] 556.9167

lxy(x2,x2)#x2的离均差平方和

## [1] 813

lxy(x1,x2)#x1的离均差乘积和

## [1] 645.5

(r=lxy(x1,x2)/sqrt(lxy(x1,x1)*lxy(x2,x2)))#显示用离均差乘积和计算的相关系数

## [1] 0.9593031

cor(x1,x2)#计算相关系数

## [1] 0.9593031

n=length(x1)#向量的长度
tr=r/sqrt((1-r^2)/(n-2))#相关系数假设检验t统计量
tr

## [1] 10.74298

cor.test(x1,x2)#相关系数假设检验

## 
##  Pearson's product-moment correlation
## 
## data:  x1 and x2
## t = 10.743, df = 10, p-value = 8.21e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8574875 0.9888163
## sample estimates:
##       cor 
## 0.9593031

setwd("C:/Users/lenovo/Desktop")
yx=read.table("d4.3.txt",header=T)#加载例4.3数据 
fm=lm(y~x,data=yx)#一元线性回归模型
fm

## 
## Call:
## lm(formula = y ~ x, data = yx)
## 
## Coefficients:
## (Intercept)            x  
##      -1.197        1.116

##             [,1]       [,2]       [,3]        [,4]     [,5]      [,6]
## 北京   -6271.737 -472.14218 -1478.7720  -47.721642 670.0846 -197.4040
## 天津   -4892.687  -80.69336 -1680.0065  468.804807 642.5301 -440.8240
## 河北   -3248.461 -142.58568 -1052.8677  161.459560 730.6497 -325.9395
## 山西   -3176.387 -287.80663  -952.9766   27.908060 602.0532 -487.2531
## 内蒙古 -3512.959 -480.82319 -1140.9499 -208.459846 700.9511 -473.6798
## 辽宁   -3895.944  128.15536 -1424.9970  192.599398 818.0546 -381.6057
## 吉林   -3284.255 -251.59105 -1253.5557  143.034021 790.3206 -451.8751
## 黑龙江 -2967.350 -142.87870 -1189.9996    8.172715 651.5781 -302.3414
## 上海   -7641.196  165.85432 -1230.5422 -188.637492 583.0494 -407.1886
## 江苏   -4560.977   35.31018 -1425.7811   18.149052 319.9015 -395.0486
## 浙江   -6117.230  -80.43122 -1145.7491 -245.495957 692.1764 -330.7203
## 安徽   -3668.009  222.85883 -1365.1952   -6.957738 467.6136 -398.1544
## 福建   -4886.053  391.30589 -1233.3928   86.680723 557.8603 -659.8796
## 江西   -3349.249  305.33829 -1342.2211 -119.420965 424.4819 -460.7248
## 山东   -3856.906 -227.33267 -1061.3222  -63.375078 712.8425 -495.6177
## 河南   -3095.495 -119.36938 -1120.7817  -53.671806 602.3980 -388.7157
## 湖北   -3718.126  226.61657 -1406.8402  -88.134677 538.4257 -470.1937
## 湖南   -3702.661  -43.40191 -1309.8025  -18.580840 457.1074 -367.0418
## 广东   -6444.785  253.07907  -608.0827  206.794612 572.2519 -427.8501
## 广西   -3626.489  411.41245 -1245.4843  127.464766 415.1013 -315.9280
## 海南   -3879.410  654.40143  -830.3841  228.024851 548.1847 -268.6813
## 重庆   -4101.280   64.21307 -1443.1986  -11.589613 651.8433 -438.0735
## 四川   -3832.917  397.68453 -1287.4281 -140.710939 575.5561 -321.1221
## 贵州   -3387.344  263.03119 -1175.8815 -167.166245 428.9832 -428.4575
## 云南   -3629.139  609.53498 -1244.3362   -3.613962 857.7951 -220.6917
## 西藏   -3603.868 1032.87564 -1388.8594 -193.249168 902.7327 -452.9071
## 陕西   -3467.587  -36.18118 -1290.7885   40.858539 421.1680 -313.2432
## 甘肃   -3209.422  -33.92919 -1146.2330  -53.411235 470.2348 -357.5851
## 青海   -3089.786   46.95751 -1181.3101  -55.270695 508.7677 -241.8020
## 宁夏   -3113.219  -51.16864 -1116.2861   61.118194 670.9278 -448.3887
## 新疆   -3125.600  -99.47641 -1142.9285 -188.221320 711.5164 -388.0790
##            [,7]          [,8]
## 北京   338.1326  -46.93540207
## 天津   262.7464  -68.06335480
## 河北   329.1848  -88.64297392
## 山西   200.4013 -153.73040440
## 内蒙古 223.8099  -13.10436469
## 辽宁   170.3246  -55.07060619
## 吉林   134.9042  -52.41184623
## 黑龙江 136.3281  -90.64088177
## 上海   199.0403   13.60840001
## 江苏   224.6134 -113.97505321
## 浙江   141.7991 -216.26974579
## 安徽   159.9425 -107.00704251
## 福建   220.8554  -95.38599704
## 江西   325.3652  -43.25605156
## 山东   354.3797 -106.08449797
## 河南   311.9963  -57.90190734
## 湖北   283.9291 -165.29453809
## 湖南   262.1497 -104.05884628
## 广东   264.6169  -80.14799721
## 广西   206.5708  -70.16690022
## 海南   307.4828  -61.60926534
## 重庆   413.6344 -167.50485784
## 四川   333.1896  -80.74317703
## 贵州   206.5979 -110.47488954
## 云南   213.7375 -194.85291383
## 西藏   230.0083   -0.08183429
## 陕西   177.1975  -76.73062051
## 甘肃   211.8872  -19.80288724
## 青海   246.9396  -17.17355476
## 宁夏   252.4040  -60.43164552
## 新疆   275.6191  -56.70631902

plot(y~x,data=yx)#做散点图
abline(fm)#添加回归线

anova(fm)#模型方差分析

## Analysis of Variance Table
## 
## Response: y
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## x          1 712077  712077   27428 < 2.2e-16 ***
## Residuals 29    753      26                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(fm)#回归系数t检验

## 
## Call:
## lm(formula = y ~ x, data = yx)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.630 -3.692 -1.535  5.338 11.432 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.19656    1.16125   -1.03    0.311    
## x            1.11623    0.00674  165.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.095 on 29 degrees of freedom
## Multiple R-squared:  0.9989, Adjusted R-squared:  0.9989 
## F-statistic: 2.743e+04 on 1 and 29 DF,  p-value: < 2.2e-16

yX=read.table("d4.4.txt",header=T)#加载例4.4数据 
(fm=lm(y~x1+x2+x3+x4,data=yX))#显示多元线性回归模型

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = yX)
## 
## Coefficients:
## (Intercept)           x1           x2           x3           x4  
##  23.5321088   -0.0033866    1.1641150    0.0002919   -0.0437416

coef.sd(fm)#标准化偏回归系数结果

## $coef.sd
##            x1            x2            x3            x4 
## -0.0174513678  1.0423522972  0.0009628564 -0.0371053994

anova(fm)#多元线性回归模型方差分析

## Analysis of Variance Table
## 
## Response: y
##           Df Sum Sq Mean Sq    F value    Pr(>F)    
## x1         1 694627  694627 89259.0016 < 2.2e-16 ***
## x2         1  17803   17803  2287.6286 < 2.2e-16 ***
## x3         1     24      24     3.0569    0.0922 .  
## x4         1    174     174    22.2954 7.005e-05 ***
## Residuals 26    202       8                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(fm)#多元线性回归系数t检验

## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = yX)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0229 -2.1354  0.3297  1.2639  6.9690 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 23.5321088  4.5990714   5.117 2.47e-05 ***
## x1          -0.0033866  0.0080749  -0.419    0.678    
## x2           1.1641150  0.0404889  28.751  < 2e-16 ***
## x3           0.0002919  0.0085527   0.034    0.973    
## x4          -0.0437416  0.0092638  -4.722 7.00e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.79 on 26 degrees of freedom
## Multiple R-squared:  0.9997, Adjusted R-squared:  0.9997 
## F-statistic: 2.289e+04 on 4 and 26 DF,  p-value: < 2.2e-16

cor(yX)#多元数据相关系数矩阵

##            t         y        x1        x2        x3        x4
## t  1.0000000 0.8141349 0.8842687 0.8308056 0.8234883 0.9742470
## y  0.8141349 1.0000000 0.9871498 0.9994718 0.9912053 0.6956619
## x1 0.8842687 0.9871498 1.0000000 0.9907018 0.9867664 0.7818066
## x2 0.8308056 0.9994718 0.9907018 1.0000000 0.9917094 0.7154297
## x3 0.8234883 0.9912053 0.9867664 0.9917094 1.0000000 0.7073820
## x4 0.9742470 0.6956619 0.7818066 0.7154297 0.7073820 1.0000000

pairs(yX)#多元数据散点图

corr.test(yX)#多元数据相关系数检验

## corr test: 
##         t       y     x1     x2    x3 x4
## t   0.000   0.000  0.000  0.000 0.000  0
## y   7.550   0.000  0.000  0.000 0.000  0
## x1 10.197  33.267  0.000  0.000 0.000  0
## x2  8.039 165.614 39.214  0.000 0.000  0
## x3  7.817  40.336 32.772 41.560 0.000  0
## x4 23.268   5.215  6.752  5.514 5.389  0
## lower is t value，upper is p value

(R2=summary(fm)$r.sq)#显示多元线性回归模型决定系数

## [1] 0.9997

(R=sqrt(R2))#显示多元数据复相关系数

## [1] 0.9999

library(leaps)#加载leaps包
varsel=regsubsets(y~x1+x2+x3+x4,data=yX)#多元数据线性回归变量选择模型
result=summary(varsel)#变量选择方法结果           
data.frame(result$outmat,RSS=result$rss,R2=result$rsq)#RSS和决定系数准则结果展示

##          x1 x2 x3 x4   RSS     R2
## 1  ( 1 )     *       752.9 0.9989
## 2  ( 1 )     *     * 203.9 0.9997
## 3  ( 1 )  *  *     * 202.3 0.9997
## 4  ( 1 )  *  *  *  * 202.3 0.9997

data.frame(result$outmat,adjR2=result$adjr2,Cp=result$cp,BIC=result$bic)#调整决定系数,Cp和BIC准则结果展示

##          x1 x2 x3 x4  adjR2     Cp    BIC
## 1  ( 1 )     *       0.9989 69.745 -205.6
## 2  ( 1 )     *     * 0.9997  1.199 -242.6
## 3  ( 1 )  *  *     * 0.9997  3.001 -239.4
## 4  ( 1 )  *  *  *  * 0.9997  5.000 -236.0

fm.step=step(fm,direction="forward")#向前引入法变量选择结果

## Start:  AIC=68.15
## y ~ x1 + x2 + x3 + x4

fm.step=step(fm,direction="backward")#向后剔除法变量选择结果

## Start:  AIC=68.15
## y ~ x1 + x2 + x3 + x4
## 
##        Df Sum of Sq  RSS   AIC
## - x3    1         0  202  66.2
## - x1    1         1  204  66.4
## <none>               202  68.2
## - x4    1       174  376  85.4
## - x2    1      6433 6635 174.4
## 
## Step:  AIC=66.16
## y ~ x1 + x2 + x4
## 
##        Df Sum of Sq  RSS   AIC
## - x1    1         2  204  64.4
## <none>               202  66.2
## - x4    1       197  400  85.3
## - x2    1      7382 7585 176.5
## 
## Step:  AIC=64.39
## y ~ x2 + x4
## 
##        Df Sum of Sq    RSS   AIC
## <none>                 204  64.4
## - x4    1       549    753 102.9
## - x2    1    367655 367859 294.8

fm.step=step(fm,direction="both")#逐步筛选法变量选择结果

## Start:  AIC=68.15
## y ~ x1 + x2 + x3 + x4
## 
##        Df Sum of Sq  RSS   AIC
## - x3    1         0  202  66.2
## - x1    1         1  204  66.4
## <none>               202  68.2
## - x4    1       174  376  85.4
## - x2    1      6433 6635 174.4
## 
## Step:  AIC=66.16
## y ~ x1 + x2 + x4
## 
##        Df Sum of Sq  RSS   AIC
## - x1    1         2  204  64.4
## <none>               202  66.2
## + x3    1         0  202  68.2
## - x4    1       197  400  85.3
## - x2    1      7382 7585 176.5
## 
## Step:  AIC=64.39
## y ~ x2 + x4
## 
##        Df Sum of Sq    RSS   AIC
## <none>                 204  64.4
## + x1    1         2    202  66.2
## + x3    1         0    204  66.4
## - x4    1       549    753 102.9
## - x2    1    367655 367859 294.8

par(mar=c(4,4,2,1),cex=0.75)
x=-10:10
plot(x,1+2*x+3*x^2,"o",ylab="y=1+2x+3x^2")#多项式曲线

x=1:20
par(mfrow=c(1,2),cex=0.75)
plot(x,3+2*log(x),"o",ylab="y=3+2log(x)")# b>0对数函数曲线
plot(x,3-2*log(x),"o",ylab="y=3-2log(x)")# b<0对数函数曲线

par(mfrow=c(1,1))
par(mfrow=c(1,2),cex=0.75)
plot(x,3*exp(0.2*x),"o",ylab="y=3exp(0.2x)")#递增指数函数曲线
plot(x,3*exp(0.2/x),"o",ylab="y=3*exp(0.2/x)")#递减指数函数曲线

par(mfrow=c(1,1))
par(mfrow=c(1,2),cex=0.75)
plot(x,3*x^2,"o",ylab="y=3x^2")#b>0幂函数曲线
plot(x,3*x^-2,"o",ylab="y=3x^-2")#b<0幂函数曲线

par(mfrow=c(1,1))
par(mfrow=c(1,2),cex=0.75)
plot(x,3+2/x,"o",ylab="y=3+2/x")#b>0双曲线函数
plot(x,3-2/x,"o",ylab="y=3-2/x")#b<0双曲线函数

par(mfrow=c(1,1))
x=c(1.5,2.8,4.5,7.5,10.5,13.5,15.1,16.5,19.5,22.5,24.5,26.5)#例4.6数据
y=c(7.0,5.5,4.6,3.6,2.9,2.7,2.5,2.4,2.2,2.1,1.9,1.8)
plot(x,y)#做散点图   
lm.1=lm(y~x)#一元线性回归模型
summary(lm.1)$coef#一元线性回归模型系数

##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)    5.603    0.43474  12.889 1.488e-07
## x             -0.170    0.02719  -6.254 9.456e-05

summary(lm.1)$r.sq#一元线性回归模型决定系数

## [1] 0.7964

plot(x,y)#做散点图  
abline(lm.1)#添加一元线性回归线

x1=x
x2=x^2
lm.2=lm(y~x1+x2)#二次函数回归模型 
summary(lm.2)$coef#二次函数回归模型系数

##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)  6.91469   0.331987  20.828 6.346e-09
## x1          -0.46563   0.056969  -8.173 1.864e-05
## x2           0.01076   0.002009   5.353 4.604e-04

summary(lm.2)$r.sq#二次函数回归模型决定系数

## [1] 0.9513

plot(x,y)#做散点图  
lines(x,fitted(lm.2))#添加二次函数回归线

lm.log=lm(y~log(x))#对数函数回归模型 
summary(lm.log)$coef#对数函数回归模型系数

##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)    7.364     0.1688   43.64 9.596e-13
## log(x)        -1.757     0.0677  -25.95 1.660e-10

summary(lm.log)$r.sq#对数函数回归模型决定系数

## [1] 0.9854

plot(x,y)#做散点图  
lines(x,fitted(lm.log))#添加对数函数回归线

lm.exp=lm(log(y)~x)#指数函数回归模型
summary(lm.exp)$coef#指数函数回归模型系数

##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)  1.75966   0.075101   23.43 4.543e-10
## x           -0.04881   0.004697  -10.39 1.116e-06

summary(lm.exp)$r.sq#指数函数回归模型决定系数

## [1] 0.9153

plot(x,y)#做散点图  
lines(x,exp(fitted(lm.exp)))#添加指数函数回归线

lm.pow=lm(log(y)~log(x))#幂函数回归模型
summary(lm.pow)$coef#幂函数回归模型系数

##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)   2.1907    0.02951   74.23 4.806e-15
## log(x)       -0.4724    0.01184  -39.90 2.337e-12

summary(lm.pow)$r.sq#幂函数回归模型决定系数

## [1] 0.9938

plot(x,y)#做散点图  
lines(x,exp(fitted(lm.pow)))#添加幂函数回归线

(S1=nls(y~a+b*x,start=list(a=5,b=-0.1)))#非线性回归函数的线性回归模型

## Nonlinear regression model
##   model: y ~ a + b * x
##    data: parent.frame()
##     a     b 
##  5.60 -0.17 
##  residual sum-of-squares: 5.93
## 
## Number of iterations to convergence: 1 
## Achieved convergence tolerance: 3.32e-09

plot(x,y)
lines(x,fitted(S1))

(S2=nls(y~a+b*log(x),start=list(a=5,b=-0.1)))#对数回归模型

## Nonlinear regression model
##   model: y ~ a + b * log(x)
##    data: parent.frame()
##     a     b 
##  7.36 -1.76 
##  residual sum-of-squares: 0.426
## 
## Number of iterations to convergence: 1 
## Achieved convergence tolerance: 1.47e-07

plot(x,y)
lines(x,fitted(S2))#添加对数回归线

(S3=nls(y~a*exp(b*x),start=list(a=5,b=-0.1)))#指数回归模型

## Nonlinear regression model
##   model: y ~ a * exp(b * x)
##    data: parent.frame()
##       a       b 
##  6.6312 -0.0613 
##  residual sum-of-squares: 2.49
## 
## Number of iterations to convergence: 8 
## Achieved convergence tolerance: 1.91e-06

plot(x,y)
lines(x,fitted(S3))#添加指数回归线

(S4=nls(y~a*(x^b),start=list(a=5,b=-0.1)))#幂函数回归模型

## Nonlinear regression model
##   model: y ~ a * (x^b)
##    data: parent.frame()
##      a      b 
##  8.609 -0.452 
##  residual sum-of-squares: 0.164
## 
## Number of iterations to convergence: 5 
## Achieved convergence tolerance: 6.07e-07

plot(x,y)
lines(x,fitted(S4))#添加幂函数回归线

d4.8=read.table("d4.8.txt",header=T)#选取例4.8数据
model=nls(Y~A0*(exp(m*t))*(L^a)*(K^b),data=d4.8,start=list(A0=0.45,m=0,a=0.5,b=0.5))#生产函数回归模型
model#回归结果

## Nonlinear regression model
##   model: Y ~ A0 * (exp(m * t)) * (L^a) * (K^b)
##    data: d4.8
##     A0      m      a      b 
## 0.7199 0.0437 0.4080 0.7119 
##  residual sum-of-squares: 8.92
## 
## Number of iterations to convergence: 22 
## Achieved convergence tolerance: 9.37e-07

summary(model)#模型检验

## 
## Formula: Y ~ A0 * (exp(m * t)) * (L^a) * (K^b)
## 
## Parameters:
##    Estimate Std. Error t value Pr(>|t|)    
## A0   0.7199     0.3461    2.08   0.0711 .  
## m    0.0437     0.0111    3.92   0.0044 ** 
## a    0.4080     0.1720    2.37   0.0451 *  
## b    0.7119     0.0428   16.65  1.7e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.06 on 8 degrees of freedom
## 
## Number of iterations to convergence: 22 
## Achieved convergence tolerance: 9.37e-07

0.71987*exp(0.04369*13)*75^0.40798*135^0.71187#按公式预测结果

## [1] 242.9

predict(model,data.frame(t=13,L=75,K=135))#按模型预测结果

## [1] 242.9

b=lxy(x,y)/lxy(x,x)#线性回归方程斜率 a=mean(y)-bmean(x)#线性回归方程截距 c(a=a,b=b)#显示线性回归方程估计值 plot(x,y)#做散点图 lines(x,a+bx)#添加估计方程线 SST=lxy(y,y)#因变量的离均差平方和 SSR=b*lxy(x,y)#回归平方和 SSE=SST-SSR#误差平方和 MSR=SSR/1#回归均方 MSE=SSE/(n-2)#误差均方 F= MSR/MSE#F统计量 c(SST=SST,SSR=SSR,SSE=SSE,MSR=MSR,MSE=MSE,F=F)#显示结果 sy.x=sqrt(MSE)#估计标准差 sb=sy.x/sqrt(lxy(x,x))#离均差平方和 t=b/sb#t统计量 ta=qt(1-0.05/2,n-2)#t分位数 c(sy.x=sy.x,sb=sb,t=t,ta=ta)#显示结果

第5章线性与非线性模型及R使用

d5.1=read.table("d5.1.txt",header=T)#读取例5.1数据 
logit.glm<-glm(y~x1+x2+x3,family=binomial,data=d5.1)#Logistic回归模型
summary(logit.glm)#Logistic回归模型结果

## 
## Call:
## glm(formula = y ~ x1 + x2 + x3, family = binomial, data = d5.1)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.564  -0.913  -0.789   0.964   1.600  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   0.5976     0.8948    0.67    0.504  
## x1           -1.4961     0.7049   -2.12    0.034 *
## x2           -0.0016     0.0168   -0.10    0.924  
## x3            0.3159     0.7011    0.45    0.652  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 62.183  on 44  degrees of freedom
## Residual deviance: 57.026  on 41  degrees of freedom
## AIC: 65.03
## 
## Number of Fisher Scoring iterations: 4

logit.step<-step(logit.glm,direction="both")#逐步筛选法变量选择

## Start:  AIC=65.03
## y ~ x1 + x2 + x3
## 
##        Df Deviance  AIC
## - x2    1     57.0 63.0
## - x3    1     57.2 63.2
## <none>        57.0 65.0
## - x1    1     61.9 67.9
## 
## Step:  AIC=63.03
## y ~ x1 + x3
## 
##        Df Deviance  AIC
## - x3    1     57.2 61.2
## <none>        57.0 63.0
## + x2    1     57.0 65.0
## - x1    1     62.0 66.0
## 
## Step:  AIC=61.24
## y ~ x1
## 
##        Df Deviance  AIC
## <none>        57.2 61.2
## + x3    1     57.0 63.0
## + x2    1     57.2 63.2
## - x1    1     62.2 64.2

summary(logit.step)#逐步筛选法变量选择结果

## 
## Call:
## glm(formula = y ~ x1, family = binomial, data = d5.1)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.449  -0.878  -0.878   0.928   1.510  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    0.619      0.469    1.32    0.187  
## x1            -1.373      0.635   -2.16    0.031 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 62.183  on 44  degrees of freedom
## Residual deviance: 57.241  on 43  degrees of freedom
## AIC: 61.24
## 
## Number of Fisher Scoring iterations: 4

pre1<-predict(logit.step,data.frame(x1=1))#预测视力正常司机Logistic回归结果
p1<-exp(pre1)/(1+exp(pre1))#预测视力正常司机发生事故概率
pre2<-predict(logit.step,data.frame(x1=0))#预测视力有问题的司机Logistic回归结果
p2<-exp(pre2)/(1+exp(pre2))#预测视力有问题的司机发生事故概率
c(p1,p2)#结果显示

##    1    1 
## 0.32 0.65

d5.2=read.table("d5.2.txt",header=T)#读取例5.2数据 
log.glm<-glm(y~x1+x2,family=poisson(link=log),data=d5.2)#多元对数线性模型
summary(log.glm)#多元对数线性模型结果

## 
## Call:
## glm(formula = y ~ x1 + x2, family = poisson(link = log), data = d5.2)
## 
## Deviance Residuals: 
##      1       2       3       4       5       6  
## -10.78   14.44   -8.47   -2.62    4.96   -3.14  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   6.1569     0.1420   43.37   <2e-16 ***
## x1            0.1291     0.0437    2.96   0.0031 ** 
## x2           -1.1257     0.0826  -13.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 662.84  on 5  degrees of freedom
## Residual deviance: 437.97  on 3  degrees of freedom
## AIC: 482
## 
## Number of Fisher Scoring iterations: 5

d5.3=read.table("d5.3.txt",header=T)#读取例5.3数据 
anova(lm(Y~factor(A),data=d5.3))#完全随机设计模型方差分析

## Analysis of Variance Table
## 
## Response: Y
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## factor(A)  2 0.1222  0.0611    40.5 8.9e-07 ***
## Residuals 15 0.0226  0.0015                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

d5.4=read.table("d5.4.txt",header=T)#读取例5.4数据
anova(lm(Y~factor(A)+factor(B),data=d5.4))#随机单位组设计模型方差分析

## Analysis of Variance Table
## 
## Response: Y
##           Df Sum Sq Mean Sq F value Pr(>F)
## factor(A)  3  15759    5253    0.43   0.74
## factor(B)  2  22385   11192    0.92   0.45
## Residuals  6  73198   12200

d5.5=read.table("d5.5.txt",header=T)#读取例5.5数据
anova(lm(Y~A+B+A:B,data=d5.5))#析因设计模型方差分析

## Analysis of Variance Table
## 
## Response: Y
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## A          1   1600    1600    28.4 0.00018 ***
## B          1   2500    2500    44.4 2.3e-05 ***
## A:B        1    729     729    12.9 0.00366 ** 
## Residuals 12    676      56                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

d5.6=read.table("d5.6.txt",header=T)#读取例5.6数据
anova(lm(Y~A+B+A*B+C+D,data=d5.6))#正交实验设计模型方差分析

## Analysis of Variance Table
## 
## Response: Y
##           Df Sum Sq Mean Sq F value Pr(>F)  
## A          1    8.0     8.0     3.2  0.216  
## B          1   18.0    18.0     7.2  0.115  
## C          1   60.5    60.5    24.2  0.039 *
## D          1    4.5     4.5     1.8  0.312  
## A:B        1   50.0    50.0    20.0  0.047 *
## Residuals  2    5.0     2.5                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1