library(tidyverse)
library(car)

輸入資料

DATE <- c("324", "325", "326", "327", "328", "329", "330", "331", "401", "402", "403", "404", "405", "406", "407", "408", "409", "410", "411", "412", "413", "414", "415", "416", "417", "418", "419")
Domestic <- c(15, 14, 21, 83, 34, 33, 56, 87, 104, 160, 183, 133, 216, 281, 382, 384, 442, 431, 439, 551, 744, 874, 1209, 1199, 1210, 1390, 1626) 
Oversea <- c(124, 122, 82, 120, 93, 63, 107, 152, 132, 244, 97, 142, 65, 78, 149, 123, 136, 144, 191, 112, 189, 108, 75, 152, 93, 90, 101)
Weekday <- c("Y", "Y", "N", "N", "Y", "Y", "Y", "Y", "Y", "N", "N", "Y", "Y", "Y", "Y", "Y", "N", "N", "Y", "Y", "Y", "Y", "Y", "N", "N", "Y", "Y")

建立一個資料名為cvdta

cvdta <- data.frame(DATE, Domestic, Oversea, Weekday)

檢視前六筆資料

head(cvdta)
##   DATE Domestic Oversea Weekday
## 1  324       15     124       Y
## 2  325       14     122       Y
## 3  326       21      82       N
## 4  327       83     120       N
## 5  328       34      93       Y
## 6  329       33      63       Y

檢視資料結構

str(cvdta)
## 'data.frame':    27 obs. of  4 variables:
##  $ DATE    : chr  "324" "325" "326" "327" ...
##  $ Domestic: num  15 14 21 83 34 33 56 87 104 160 ...
##  $ Oversea : num  124 122 82 120 93 63 107 152 132 244 ...
##  $ Weekday : chr  "Y" "Y" "N" "N" ...

將DATE命名成一個類別變項

cvdta$DATE <- as.factor(cvdta$DATE)

將WEEKDAY命名成一個類別變項

cvdta$Weekday <- as.factor(cvdta$Weekday)

看Oversea與Domestic對應Weekday的分布

ggplot(aes(y = Domestic, x = Oversea, color = Weekday), data = cvdta) +
  geom_point() +
  geom_smooth(method = lm, se = F) + 
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

可看出Oversea與Domestic的斜率有差異,在沒有Weekday的狀況下Oversea與Domestic接近0相關

cvmod <- lm(Domestic ~ Oversea + Weekday, data = cvdta)
summary(cvmod)
## 
## Call:
## lm(formula = Domestic ~ Oversea + Weekday, data = cvdta)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -506.0 -364.0 -175.5  236.4 1156.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  623.923    367.071   1.700    0.102
## Oversea       -1.182      2.410  -0.490    0.628
## WeekdayY     -34.906    214.547  -0.163    0.872
## 
## Residual standard error: 499.8 on 24 degrees of freedom
## Multiple R-squared:  0.01013,    Adjusted R-squared:  -0.07236 
## F-statistic: 0.1228 on 2 and 24 DF,  p-value: 0.885

因為t value =/-0.163/< /-2/且p value=0.872> 0.05,所以在95%的信心水準下,Oversea與Domestic間關係在weekday與not weekday的組間平均數差異不顯著。

因為F value=0.1228< 1且p value=0.885< 0.05,所以在95%的信心水準下,結果不顯著。

決定係數=0.01013表示依變項Domestic僅有1.013%的變異量可被自變項解釋,低於20%,因此預測率不佳。

得出公式 : Domestic = 623.923- 1.182 Oversea- 34.906 Weekday+ 499.8

#對迴歸進行診斷,藉找出最小的殘差進而找到最佳的迴歸線

畫直方圖檢視殘差是否常態分配

hist( x = residuals(cvmod),
      xlab = "Value of residual",
      main = "",
      breaks = 20)

由圖可知,殘差非常態分佈,表示預測值-觀察值的誤差不集中在0),因此model的誤差大。

畫Normal Q-Q圖再次檢視殘差是否呈常態分布

plot(cvmod, which = 2)

由圖可知殘差並不為常態分佈(平均數不等於0),因為點皆不落在線上,尤其24、26、27的點為outlier,表示有變異量不可被此model預測。

畫圖看預測值與殘差間關係

plot(cvmod, which  = 1)

若此圖呈水平線且分布呈離散代表預測值與殘差間無關係,差不會隨預測值改變,但此圖不落在水平線上且24、26、27為outlier,表示有變異量未能被模型解釋。

畫出個別自變項與殘差間關係

residualPlots(cvmod)

##            Test stat Pr(>|Test stat|)
## Oversea      -0.1894           0.8514
## Weekday                              
## Tukey test   -0.3492           0.7269

畫出預測值變異數與殘差變異數間關係

plot(cvmod, which = 3)

由圖可知分布並不為水平線和離散,所以有些變異數不能被此模型解釋。