head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# multiple linear regression: Sepal.Length on the other numeric iris columns
lm.fit <- lm(Sepal.Length ~ ., data = iris[, -5])
# polynomial regression
fit2<-lm(dist~speed+I(speed^2),cars)
fit2
##
## Call:
## lm(formula = dist ~ speed + I(speed^2), data = cars)
##
## Coefficients:
## (Intercept) speed I(speed^2)
## 2.47014 0.91329 0.09996
fit3<-lm(dist~poly(speed,5),cars)
fit3
##
## Call:
## lm(formula = dist ~ poly(speed, 5), data = cars)
##
## Coefficients:
## (Intercept) poly(speed, 5)1 poly(speed, 5)2 poly(speed, 5)3
## 42.980 145.552 22.996 13.797
## poly(speed, 5)4 poly(speed, 5)5
## 18.345 5.881
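The poly() terms above are orthogonal polynomials by default, so their coefficients are not directly comparable to the speed + I(speed^2) parameterisation. A minimal sketch of the raw parameterisation, which should reproduce the fit2 coefficients (fit2_raw is an illustrative name):
fit2_raw <- lm(dist ~ poly(speed, 2, raw = TRUE), data = cars)
coef(fit2_raw)   # same estimates as fit2: intercept, speed, speed^2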
# dummy-variable (contrast) coding for the Species factor
contrasts(iris[,5])
## versicolor virginica
## setosa 0 0
## versicolor 1 0
## virginica 0 1
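To see how these contrasts enter a regression as dummy columns, fit on the factor and inspect the design matrix; a minimal sketch (lm.sp is an illustrative name):
lm.sp <- lm(Sepal.Length ~ Species, data = iris)   # Species expands to two dummy variables
head(model.matrix(lm.sp))                          # (Intercept), Speciesversicolor, Speciesvirginica
coef(lm.sp)                                        # setosa is the baseline (reference) level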
Computing SSR, SSE, and SST for the regression model
anova(lm.fit)
## Analysis of Variance Table
##
## Response: Sepal.Length
## Df Sum Sq Mean Sq F value Pr(>F)
## Sepal.Width 1 1.412 1.412 14.274 0.0002296 ***
## Petal.Length 1 84.427 84.427 853.309 < 2.2e-16 ***
## Petal.Width 1 1.883 1.883 19.035 2.413e-05 ***
## Residuals 146 14.445 0.099
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
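The same decomposition can be computed directly from the fitted model; a minimal sketch that reproduces the anova() sums of squares and the R-squared and F statistic reported by summary() below:
y   <- iris$Sepal.Length
SSE <- sum(residuals(lm.fit)^2)        # residual sum of squares, 14.445 above
SST <- sum((y - mean(y))^2)            # total sum of squares
SSR <- SST - SSE                       # regression sum of squares = 1.412 + 84.427 + 1.883
c(SSR = SSR, SSE = SSE, SST = SST)
SSR / SST                              # multiple R-squared, about 0.8586
(SSR / 3) / (SSE / 146)                # F statistic on 3 and 146 df, about 295.5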
summary(lm.fit)
##
## Call:
## lm(formula = Sepal.Length ~ ., data = iris[, -5])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.82816 -0.21989 0.01875 0.19709 0.84570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.85600 0.25078 7.401 9.85e-12 ***
## Sepal.Width 0.65084 0.06665 9.765 < 2e-16 ***
## Petal.Length 0.70913 0.05672 12.502 < 2e-16 ***
## Petal.Width -0.55648 0.12755 -4.363 2.41e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3145 on 146 degrees of freedom
## Multiple R-squared: 0.8586, Adjusted R-squared: 0.8557
## F-statistic: 295.5 on 3 and 146 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lm.fit)
shapiro.test, nortest's ad.test (Anderson-Darling), and ks.test are used to check normality of the residuals; ks.test is a nonparametric test.
ncvTest tests for constant error variance (homoscedasticity), and dwtest tests for independence (no autocorrelation) of the residuals.
library(nortest)  # ad.test
library(car)      # ncvTest, vif, crPlots, spreadLevelPlot, influencePlot, outlierTest
library(lmtest)   # dwtest
shapiro.test(lm.fit$residuals)
##
## Shapiro-Wilk normality test
##
## data: lm.fit$residuals
## W = 0.99559, p-value = 0.9349
nortest::ad.test(lm.fit$residuals)
##
## Anderson-Darling normality test
##
## data: lm.fit$residuals
## A = 0.24732, p-value = 0.7493
ncvTest(lm.fit)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 4.448612, Df = 1, p = 0.03493
ks.test(lm.fit$residuals,'pnorm',mean=0,sd=1)
## Warning in ks.test(lm.fit$residuals, "pnorm", mean = 0, sd = 1): ties
## should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: lm.fit$residuals
## D = 0.27446, p-value = 3.065e-10
## alternative hypothesis: two-sided
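The strong rejection above is largely an artifact of the reference distribution: the residuals are compared against N(0, 1), but their spread is about 0.31 (the residual standard error). A minimal sketch using the residuals' estimated scale instead (the ties warning will still appear):
ks.test(lm.fit$residuals, 'pnorm', mean = 0, sd = sd(lm.fit$residuals))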
dwtest(lm.fit)
##
## Durbin-Watson test
##
## data: lm.fit
## DW = 2.0604, p-value = 0.6013
## alternative hypothesis: true autocorrelation is greater than 0
# manual Durbin-Watson statistic: sum((e_t - e_{t-1})^2) / sum(e_t^2)
e_t <- lm.fit$residuals
e_t1 <- c(NA, lm.fit$residuals[-length(lm.fit$residuals)])  # residuals lagged by one
sum((e_t[-1] - e_t1[-1])^2) / sum(e_t^2)                    # matches dwtest's DW = 2.0604
## [1] 2.060382
vif(lm.fit)
## Sepal.Width Petal.Length Petal.Width
## 1.270815 15.097572 14.234335
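VIF values above about 10 (here Petal.Length and Petal.Width) point to multicollinearity. The statistic is 1/(1 - R^2) from regressing one predictor on the others; a minimal sketch for Petal.Length:
r2 <- summary(lm(Petal.Length ~ Sepal.Width + Petal.Width, data = iris))$r.squared
1 / (1 - r2)   # should match vif(lm.fit)["Petal.Length"], about 15.1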
crPlots(lm.fit)
spreadLevelPlot(lm.fit)
##
## Suggested power transformation: -0.1805994
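If the mild heteroscedasticity flagged by ncvTest were a concern, the suggested power could be applied to the response; a minimal sketch (lm.fit.tr is an illustrative name, and -0.18 is rounded from the output above):
lm.fit.tr <- lm(I(Sepal.Length^(-0.18)) ~ Sepal.Width + Petal.Length + Petal.Width, data = iris)
ncvTest(lm.fit.tr)   # recheck the constant-variance assumption on the transformed fit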
influencePlot(lm.fit)
## StudRes Hat CookD
## 107 -2.7276922 0.02722650 0.049861338
## 118 -0.5192061 0.09097358 0.006778547
## 132 0.4839369 0.09313111 0.006044379
## 135 -2.1300181 0.06473577 0.076651528
## 136 2.7805934 0.02196762 0.041501940
## 142 2.2910061 0.05723172 0.077404543
car::outlierTest(lm.fit)
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 136 2.780593 0.0061467 0.922
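The Bonferroni p-value can be reproduced by hand from the studentized residuals; a minimal sketch (the deleted-residual t test uses n - p - 1 degrees of freedom, with p the number of coefficients):
rs <- rstudent(lm.fit)
n <- length(rs); p <- length(coef(lm.fit))
p_unadj <- 2 * pt(-max(abs(rs)), df = n - p - 1)   # about 0.0061 for observation 136
min(1, n * p_unadj)                                # Bonferroni-adjusted p, about 0.92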
Outlier: an observation that the regression model predicts poorly (i.e., one with a very large positive or negative residual).
High leverage point: an observation with an unusual combination of predictor values; it can be viewed as an outlier on the predictor side. The response value is not used when computing an observation's leverage.
Influential observation: an observation that has a disproportionate effect on the estimated model coefficients; it can be identified with the Cook's distance statistic.
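These quantities can be extracted directly from the fitted model; a minimal sketch using common rule-of-thumb cutoffs (2(p+1)/n for leverage and 4/(n-p-1) for Cook's distance; the cutoffs are conventions, not part of the output above):
n <- nrow(iris); p <- length(coef(lm.fit)) - 1   # 150 observations, 3 predictors
lev  <- hatvalues(lm.fit)                        # leverage (hat values)
cook <- cooks.distance(lm.fit)                   # Cook's distance
which(lev > 2 * (p + 1) / n)                     # candidate high-leverage points
which(cook > 4 / (n - p - 1))                    # candidate influential observations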
Reference: https://rstudio-pubs-static.s3.amazonaws.com/190997_40fa09db8e344b19b14a687ea5de914b.html