1 Load packages

library('lattice')
library('ggplot2')
library('pastecs')
## Warning: package 'pastecs' was built under R version 4.0.4

2 Input data

# Read the data file into a data frame; the first line of the file supplies
# the column names. `header = TRUE` is written out in full: the previous
# `h = T` depended on partial argument matching and on `T`, which is an
# ordinary variable that can be reassigned.
# NOTE(review): the absolute, machine-specific path is kept as-is; adjust it
# when running on another machine.
dta <- read.table("C:/Users/pc/Desktop/ch411.txt", header = TRUE)

3 Descriptive Statistics

# First six rows. Note from the original author: this data file was
# re-imported after the regression equation had been obtained, so it already
# contains the residual (res) and fitted-value (y.hat) columns.
head(dta, n = 6L)
##          res  X  Y   y.hat
## 1  2.6666667  5 63 60.3335
## 2 -0.3333333  8 67 67.3334
## 3 -0.3333333 11 74 74.3333
## 4 -1.0000000  7 64 65.0001
## 5 -4.0000000 13 75 78.9999
## 6 -7.6666667 12 69 76.6666
# Per-column summary (min, quartiles, median, mean, max).
summary(dta)
##       res                X               Y             y.hat      
##  Min.   :-7.6667   Min.   : 5.00   Min.   :60.00   Min.   :60.33  
##  1st Qu.:-3.0000   1st Qu.: 6.75   1st Qu.:63.75   1st Qu.:64.42  
##  Median :-0.6667   Median : 9.50   Median :68.00   Median :70.83  
##  Mean   : 0.0000   Mean   : 9.25   Mean   :70.25   Mean   :70.25  
##  3rd Qu.: 0.4167   3rd Qu.:12.00   3rd Qu.:74.25   3rd Qu.:76.67  
##  Max.   :13.3333   Max.   :13.00   Max.   :90.00   Max.   :79.00
# Extended descriptive statistics from pastecs: basic counts and descriptive
# measures are requested, normality statistics are switched off, and the
# confidence interval on the mean uses p = 0.95.
stat.desc(
  dta,
  basic = TRUE,
  desc = TRUE,
  norm = FALSE,
  p = 0.95
)
##                        res          X           Y      y.hat
## nbr.val       8.000000e+00  8.0000000   8.0000000   8.000000
## nbr.null      0.000000e+00  0.0000000   0.0000000   0.000000
## nbr.na        0.000000e+00  0.0000000   0.0000000   0.000000
## min          -7.666667e+00  5.0000000  60.0000000  60.333500
## max           1.333333e+01 13.0000000  90.0000000  78.999900
## range         2.100000e+01  8.0000000  30.0000000  18.666400
## sum          -7.771561e-16 74.0000000 562.0000000 562.000200
## median       -6.666667e-01  9.5000000  68.0000000  70.833350
## mean         -9.714451e-17  9.2500000  70.2500000  70.250025
## SE.mean       2.187628e+00  1.0978876   3.3687111   2.561701
## CI.mean.0.95  5.172917e+00  2.5960916   7.9657359   6.057461
## var           3.828571e+01  9.6428571  90.7857143  52.498500
## std.dev       6.187545e+00  3.1052950   9.5281538   7.245585
## coef.var     -6.369423e+16  0.3357076   0.1356321   0.103140

4 X,Y Correlation

# Keep only the predictor (X) and the response (Y) for the correlation.
# The columns are selected by name instead of by position (2, 3) so the
# result does not silently change if the column order of the input changes.
mydata <- dta[, c("X", "Y")]

# Inspect the first 6 rows of the subset.
head(mydata, 6)
##    X  Y
## 1  5 63
## 2  8 67
## 3 11 74
## 4  7 64
## 5 13 75
## 6 12 69
# Correlation matrix of X and Y. Note that this `res` object is the
# correlation matrix; it shares its name with the `res` (residual) column
# inside `dta` but is a separate object.
res <- cor(mydata)

# Display the matrix rounded to four decimal places.
round(res, 4)
##        X      Y
## X 1.0000 0.7605
## Y 0.7605 1.0000

5 Scatter diagram

5.1 Y ~ X

# Scatter plot of Y against X; the `type` vector asks lattice for
# points ("p"), a background grid ("g") and a fitted regression line ("r").
xyplot(
  Y ~ X,
  data = dta,
  type = c("p", "g", "r"),
  xlab = "X",
  ylab = "Y"
)

5.2 res ~ X

# Residual plot against the predictor X: points ("p"), grid ("g") and a
# regression line ("r") through the residuals.
# The y-axis label previously read "resdual" (typo); corrected to "residual".
xyplot(res ~ X, data = dta,
       ylab = "residual",
       xlab = "X",
       type = c("p", "g", "r"))

5.3 res ~ y.hat

# Residuals plotted against the fitted values (y.hat), drawn with
# points ("p"), a grid ("g") and a regression line ("r").
xyplot(
  res ~ y.hat,
  data = dta,
  type = c("p", "g", "r"),
  xlab = "y.hat",
  ylab = "res"
)

6 Linear models

# Simple linear regression of Y on X, fitted by least squares.
model <- lm(
  formula = Y ~ X,
  data = dta
)
# Coefficient table, residual summary, R-squared and overall F-test.
summary(model)
## 
## Call:
## lm(formula = Y ~ X, data = dta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6667 -3.0000 -0.6667  0.4167 13.3333 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  48.6667     7.8869   6.171 0.000832 ***
## X             2.3333     0.8135   2.868 0.028487 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.683 on 6 degrees of freedom
## Multiple R-squared:  0.5783, Adjusted R-squared:  0.508 
## F-statistic: 8.228 on 1 and 6 DF,  p-value: 0.02849
# Residuals of the fitted model. Original author's note: the residuals
# obtained in this step are the ones merged back into the source data file.
residuals(model)
##          1          2          3          4          5          6          7 
##  2.6666667 -0.3333333 -0.3333333 -1.0000000 -4.0000000 -7.6666667 13.3333333 
##          8 
## -2.6666667

7 Data visualisation

# Scatter plot of Y against X with a linear-model smoother drawn underneath
# the points. Both layers share the same x/y mapping, so it is declared once
# in ggplot() instead of per layer.
ggplot(dta, aes(x = X, y = Y)) +
  geom_smooth(method = "lm") +
  geom_point()
## `geom_smooth()` using formula 'y ~ x'

8 The end