# Compactly display the structure of Data: its class, dimensions, and the
# type and first few values of every column.
str(Data)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# List the rows of Data that contain at least one missing value.
# The trailing comma is required: a logical index without it is applied to
# the columns of the data frame, not to its rows.
Data[!complete.cases(Data), ]
## [1] Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <0 rows> (or 0-length row.names)
# Per-column summary of Data: quartiles and mean for numeric columns,
# level counts for factor columns.
summary(Data)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Load ggplot2 with library() so a missing package stops the script here;
# require() would only return FALSE and let the next call fail instead.
library(ggplot2)
# Scatter plot of petal width against petal length, coloured by species.
# ggplot() replaces the deprecated qplot(), and the plot now reads from Data,
# the same data frame the regression models in this script are fitted on.
ggplot(Data, aes(x = Petal.Length, y = Petal.Width, color = Species)) +
  geom_point()
# Simple linear regression: petal length explained by petal width alone.
RegModel.2 <- lm(
  Petal.Length ~ Petal.Width,
  data = Data
)
# Coefficient table, residual summary and goodness-of-fit statistics.
summary(RegModel.2)
##
## Call:
## lm(formula = Petal.Length ~ Petal.Width, data = Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.33542 -0.30347 -0.02955 0.25776 1.39453
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.08356 0.07297 14.85 <2e-16 ***
## Petal.Width 2.22994 0.05140 43.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4782 on 148 degrees of freedom
## Multiple R-squared: 0.9271, Adjusted R-squared: 0.9266
## F-statistic: 1882 on 1 and 148 DF, p-value: < 2.2e-16
# Multiple regression: petal length explained by the three other
# measurements (petal width, sepal length, sepal width).
RegModel.3 <- lm(
  Petal.Length ~ Petal.Width + Sepal.Length + Sepal.Width,
  data = Data
)
# Coefficient table, residual summary and goodness-of-fit statistics.
summary(RegModel.3)
##
## Call:
## lm(formula = Petal.Length ~ Petal.Width + Sepal.Length + Sepal.Width,
## data = Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.99333 -0.17656 -0.01004 0.18558 1.06909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.26271 0.29741 -0.883 0.379
## Petal.Width 1.44679 0.06761 21.399 <2e-16 ***
## Sepal.Length 0.72914 0.05832 12.502 <2e-16 ***
## Sepal.Width -0.64601 0.06850 -9.431 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.319 on 146 degrees of freedom
## Multiple R-squared: 0.968, Adjusted R-squared: 0.9674
## F-statistic: 1473 on 3 and 146 DF, p-value: < 2.2e-16
# Multiple regression: sepal length explained by sepal width, petal length
# and petal width. This model is the subject of the diagnostics that follow.
iris.lm <- lm(
  Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
  data = Data
)
# Coefficient table, residual summary and goodness-of-fit statistics.
summary(iris.lm)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
## data = Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.82816 -0.21989 0.01875 0.19709 0.84570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.85600 0.25078 7.401 9.85e-12 ***
## Sepal.Width 0.65084 0.06665 9.765 < 2e-16 ***
## Petal.Length 0.70913 0.05672 12.502 < 2e-16 ***
## Petal.Width -0.55648 0.12755 -4.363 2.41e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3145 on 146 degrees of freedom
## Multiple R-squared: 0.8586, Adjusted R-squared: 0.8557
## F-statistic: 295.5 on 3 and 146 DF, p-value: < 2.2e-16
建立完迴歸模型以後,可以看到右邊的「模型」套用了iris.lm。
# Variance inflation factor of each predictor in iris.lm.
# NOTE(review): vif() is not a base R function; presumably car::vif, loaded
# outside the visible part of this script. Confirm the package is attached.
vif(iris.lm)
## Sepal.Width Petal.Length Petal.Width
## 1.270815 15.097572 14.234335
以變異數膨脹因子(VIF)是否大於10來判斷:數值越大,表示自變數容忍度越小,也越有共線性的問題。以此例而言,Petal.Length(15.10)與Petal.Width(14.23)的VIF皆大於10,表示這兩個自變數之間有共線性的問題。
# Correlations of parameter estimates: convert the coefficient covariance
# matrix of iris.lm to a correlation matrix and round to 3 decimals.
round(cov2cor(vcov(iris.lm)), digits = 3)
## (Intercept) Sepal.Width Petal.Length Petal.Width
## (Intercept) 1.000 -0.953 -0.511 0.359
## Sepal.Width -0.953 1.000 0.302 -0.190
## Petal.Length -0.511 0.302 1.000 -0.959
## Petal.Width 0.359 -0.190 -0.959 1.000
# Switch to a 2 x 2 panel layout with a 3-line outer margin on top, keeping
# the previous settings in oldpar so they can be restored afterwards.
oldpar <- par(
  oma = c(0, 0, 3, 0),
  mfrow = c(2, 2)
)
# Draw the standard diagnostic plots for the fitted model.
plot(iris.lm)
1.Residuals vs Fitted圖(RVF圖)用於觀察殘差值與擬合值之間是否有曲線關係,以檢驗迴歸模型的線性假設。以此圖為例,殘差未呈現明顯的曲線型態,符合線性假設。
2.Normal Q-Q圖用於觀察圖上之點是否大致落在參考直線(Y=X)上,以檢驗殘差的常態性(正態性)。以此圖為例,則滿足正態假設。
3.SL圖首先觀察圖上之點是否平均分布在水平線的周圍,凸顯回歸統計的同方差性。以此圖為例,則滿足不變方差假設。
4.RVL圖用於觀察資料中的離群點、高槓桿點以及強影響點。 Cook’s distance代表單個樣本對整個模型的影響程度。
# Restore the graphical parameters that were saved in oldpar.
par(oldpar)
# Quantile-comparison plot for iris.lm; the id argument asks for 2 points
# to be labelled, and their row indices are returned.
qqPlot(
  iris.lm,
  simulate = TRUE,
  id = list(method = "y", n = 2)
)
## [1] 107 136
# Component + residual plots for each predictor of iris.lm, with a smoother
# of span 0.5, used to judge whether each relationship is linear.
crPlots(
  iris.lm,
  smooth = list(span = 0.5)
)
觀察自變數與因變數是否線性
# Added-variable plots for iris.lm, labelling 2 points per panel chosen by
# the "mahal" method.
avPlots(
  iris.lm,
  id = list(method = "mahal", n = 2)
)
# Influence plot for iris.lm; the labelled observations are returned with
# their studentized residual, hat value and Cook's distance.
influencePlot(
  iris.lm,
  id = list(method = "noteworthy", n = 2)
)
## StudRes Hat CookD
## 107 -2.7276922 0.02722650 0.049861338
## 118 -0.5192061 0.09097358 0.006778547
## 132 0.4839369 0.09313111 0.006044379
## 135 -2.1300181 0.06473577 0.076651528
## 136 2.7805934 0.02196762 0.041501940
## 142 2.2910061 0.05723172 0.077404543
Studentized residual>2或是Studentized residual<-2的點(即絕對值大於2)稱為離群點
Hat-Values上的兩條虛線則表示Hat均值的2倍與3倍,以此圖為例,可發現118與132為高槓桿點。