迴歸分析期中期末程式碼

所有使用過之套件、程式碼

yujen
2022-06-15

簡報

簡報網址(請複製到分頁後開啟): “https://www.canva.com/design/DAFBtoqJOrc/35DgosMR7EJKAdaI9U-9Sg/view?utm_content=DAFBtoqJOrc&utm_campaign=designshare&utm_medium=link2&utm_source=sharebutton

套件及載入資料

library(readxl)
library("olsrr")
library("tidyverse")
library(ggplot2)
library(ggfortify)
library(ggpmisc)
library(lindia)
library(MASS)
library(car)
library(nortest)
library(lmtest)
library(lawstat)
library(gvlma)
data1  = read_excel("C:/Users/yujen/Desktop/data.xlsx")
df = read_excel("finaltest.xlsx")

期中簡單線性迴歸分析

盒鬚圖

#盒鬚圖x
ggplot(data1 ,aes(y=rate))+
  geom_boxplot(col="navy",
    outlier.colour ="steel blue"
               ,outlier.shape = 8
               ,outlier.size = 4)+coord_flip()+
  scale_x_discrete() + 
  ylab("確診率")      

#盒鬚圖Y
summary(data1$netflixpremium)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   9.26   13.54   14.20   15.14   17.68   20.32 
library(ggplot2)
ggplot(data1 ,aes(y=netflixpremium),col = "blue")+
  geom_boxplot( col="navy",
    outlier.colour ="steel blue"
                ,outlier.shape = 8
                ,outlier.size = 4)+coord_flip()+
  scale_x_discrete() + 
  ylab("Netflix_premium")

模型參數估計、模型適合度檢定

lm = lm(formula =netflixpremium~rate,data = data1)
summary(lm) #完整的表

Call:
lm(formula = netflixpremium ~ rate, data = data1)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.7604 -1.8029  0.3785  1.5189  4.3338 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  12.5031     0.6446  19.396  < 2e-16 ***
rate         12.7490     2.6022   4.899 1.13e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.506 on 48 degrees of freedom
Multiple R-squared:  0.3334,    Adjusted R-squared:  0.3195 
F-statistic:    24 on 1 and 48 DF,  p-value: 1.135e-05

散布圖

library(ggplot2)
ggplot(data1 ,aes(x =rate ,y =netflixpremium)) +
  geom_point(color = "steel blue",size = 2)

迴歸線散布圖

ggplot(data1, aes(x =rate, y =netflixpremium)) + 
  geom_point(color = "steel blue") +
  labs( x="確診率", y="Netflix premium價格",
        title="散布圖",subtitle="紅色為迴歸線",size =2) +
  theme(plot.title=element_text(color = 'black', size = 12)) + 
  theme(axis.title=element_text(color='steel blue', size=12,)) + 
  theme(axis.line = element_line(colour="black", size = 0.5)) +
  geom_smooth(method ='lm',formula =y~x,se=FALSE, color='red') +
  stat_poly_eq(formula = y~x, eq.with.lhs = "italic(hat(y))~`=`~",
               aes(label = paste(..eq.label.., ..rr.label.., sep = "*plain (\",\")~")),
               parse=TRUE) 

殘差分析

data1$residuals =residuals(lm)
autoplot(lm)

檢定

#Kolmogorov-Smirnov Test
lillie.test(data1$residuals)

    Lilliefors (Kolmogorov-Smirnov) normality test

data:  data1$residuals
D = 0.089007, p-value = 0.414
#ncvtest、bptest
ncvTest(lm)
Non-constant Variance Score Test 
Variance formula: ~ fitted.values 
Chisquare = 0.8661806, Df = 1, p = 0.35201
bptest(lm)

    studentized Breusch-Pagan test

data:  lm
BP = 1.2033, df = 1, p-value = 0.2727
#dwtest
dwtest(lm,alternative = "two.sided")

    Durbin-Watson test

data:  lm
DW = 1.9278, p-value = 0.8058
alternative hypothesis: true autocorrelation is not 0
durbinWatsonTest(lm)
 lag Autocorrelation D-W Statistic p-value
   1     -0.01901518      1.927758   0.826
 Alternative hypothesis: rho != 0

多元迴歸分析

模型參數估計、模型適合度檢定

sr_1 = lm(y~1,data = df)
sr2 = lm(formula = y~x1+D1+x2+x3+x4+x5+x6+x7+
          x1:D1+x2:D1+x3:D1+x5:D1+x6:D1+x7:D1,data = df)
sr = lm(formula =y~x1+D1+x2+x3+x4+x5+x6+x7,data = df)
anova(sr, sr2)#檢定
Analysis of Variance Table

Model 1: y ~ x1 + D1 + x2 + x3 + x4 + x5 + x6 + x7
Model 2: y ~ x1 + D1 + x2 + x3 + x4 + x5 + x6 + x7 + x1:D1 + x2:D1 + x3:D1 + 
    x5:D1 + x6:D1 + x7:D1
  Res.Df    RSS Df Sum of Sq      F Pr(>F)
1     31 41.794                           
2     25 32.434  6    9.3601 1.2025 0.3376
anova(sr)#檢定
Analysis of Variance Table

Response: y
          Df  Sum Sq Mean Sq  F value    Pr(>F)    
x1         1 173.411 173.411 128.6252 1.461e-12 ***
D1         1  17.690  17.690  13.1217   0.00103 ** 
x2         1  42.376  42.376  31.4316 3.775e-06 ***
x3         1   9.919   9.919   7.3573   0.01080 *  
x4         1   0.047   0.047   0.0352   0.85233    
x5         1   1.390   1.390   1.0313   0.31772    
x6         1   0.742   0.742   0.5502   0.46383    
x7         1   0.058   0.058   0.0431   0.83687    
Residuals 31  41.794   1.348                       
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(sr)#完整的表

Call:
lm(formula = y ~ x1 + D1 + x2 + x3 + x4 + x5 + x6 + x7, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.2694 -0.7087  0.0573  0.7950  1.6789 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 12.03518    1.99408   6.035 1.11e-06 ***
x1           0.02438    0.01185   2.057 0.048194 *  
D1          -1.93081    0.52917  -3.649 0.000959 ***
x2           0.30583    0.22537   1.357 0.184583    
x3          -0.27785    0.11349  -2.448 0.020216 *  
x4          -0.41932    3.59133  -0.117 0.907803    
x5          -0.44076    0.36655  -1.202 0.238287    
x6           0.22390    0.29123   0.769 0.447835    
x7          -0.41963    2.02094  -0.208 0.836867    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.161 on 31 degrees of freedom
Multiple R-squared:  0.8546,    Adjusted R-squared:  0.8171 
F-statistic: 22.77 on 8 and 31 DF,  p-value: 6.003e-11
coef(sr) #係數
(Intercept)          x1          D1          x2          x3 
12.03517809  0.02438157 -1.93081071  0.30582900 -0.27785317 
         x4          x5          x6          x7 
-0.41932383 -0.44076144  0.22389801 -0.41963160 

逐步回歸

##逐步法
stepAIC(sr_1,direction = "both",
        scope = list(upper=sr)) #aic

stepAIC(sr_1,direction = "both",scope = list(upper=sr),
        k = log(nrow(df))) #bic
##向前法
stepAIC(sr_1,direction = "forward",
        scope = list(lower=sr_1,upper=sr)) #aic

stepAIC(sr_1,direction = "forward",
        scope = list(lower=sr_1,upper=sr),k = log(nrow(df))) #bic
##向後法
stepAIC(sr, direction = "backward",
        scope = list(upper=sr)) #aic

stepAIC(sr, direction = "backward",
        scope = list(upper=sr),k = log(nrow(df))) #bic

全子集迴歸

#全子集迴歸
allp = ols_step_all_possible(sr)
as.data.frame(allp)
plot(allp)  

#最佳子集迴歸
best = ols_step_best_subset(sr)
best
plot(best)  

篩選後模型、參數估計、模型適合度檢定

lmfit = lm(y~x1+D1+x2+x3,data =df)
coef(lmfit)
(Intercept)          x1          D1          x2          x3 
10.78579807  0.01961141 -2.03666535  0.45364282 -0.28968680 
anova(lmfit)
Analysis of Variance Table

Response: y
          Df  Sum Sq Mean Sq  F value    Pr(>F)    
x1         1 173.411 173.411 137.8415 1.085e-13 ***
D1         1  17.690  17.690  14.0619 0.0006393 ***
x2         1  42.376  42.376  33.6838 1.401e-06 ***
x3         1   9.919   9.919   7.8845 0.0080986 ** 
Residuals 35  44.031   1.258                       
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

多元共線姓

vif(lmfit)
      x1       D1       x2       x3 
3.511804 1.055721 3.242370 1.126611 

殘差分析圖

autoplot(lmfit)

gvlma檢定

summary(gvlma(lmfit))

Call:
lm(formula = y ~ x1 + D1 + x2 + x3, data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.52079 -0.64468  0.01549  0.89354  1.69804 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.78580    1.04684  10.303 3.86e-12 ***
x1           0.01961    0.01014   1.933 0.061335 .  
D1          -2.03667    0.47956  -4.247 0.000152 ***
x2           0.45364    0.08347   5.435 4.30e-06 ***
x3          -0.28969    0.10317  -2.808 0.008099 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.122 on 35 degrees of freedom
Multiple R-squared:  0.8468,    Adjusted R-squared:  0.8293 
F-statistic: 48.37 on 4 and 35 DF,  p-value: 8.726e-14


ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
Level of Significance =  0.05 

Call:
 gvlma(x = lmfit) 

                    Value p-value                Decision
Global Stat        2.5428  0.6370 Assumptions acceptable.
Skewness           1.6800  0.1949 Assumptions acceptable.
Kurtosis           0.1716  0.6787 Assumptions acceptable.
Link Function      0.4103  0.5218 Assumptions acceptable.
Heteroscedasticity 0.2809  0.5961 Assumptions acceptable.

其他

##殘差分析
df$residuals =residuals(lmfit)
shapiro.test(df$residuals) #常態檢定p>0.05
lillie.test(df$residuals)

durbinWatsonTest(lmfit)#殘差獨立性檢定 dw test介於0~4 1.5~2.5 p>0.05

ncvTest(lmfit)  #變異數同質性 p>0.05
bptest(lmfit) #變異數同質性  p>0.05

##boxcox
library("trafo") 
bc = boxcox(lmfit, lambda="estim",method= "ml", lambdarange = c(-5,5),plotit = T)
bc

##Cook D
gg_cooksd(lmfit,label = T,show.threshold = T,
          threshold = "convention",scale.factor = 0.5)