View data

# Load the built-in Anscombe's quartet data set and inspect its structure.
data(anscombe)
# Open the spreadsheet viewer only in interactive sessions: View() needs a
# GUI/display, so calling it unconditionally can fail or block when this
# script is run or knitted non-interactively.
if (interactive()) {
  View(anscombe)
}
dim(anscombe) # number of rows, then number of columns
## [1] 11  8
sum(is.na(anscombe)) # total count of NA values (0 means nothing is missing)
## [1] 0
str(anscombe)
## 'data.frame':    11 obs. of  8 variables:
##  $ x1: num  10 8 13 9 11 14 6 4 12 7 ...
##  $ x2: num  10 8 13 9 11 14 6 4 12 7 ...
##  $ x3: num  10 8 13 9 11 14 6 4 12 7 ...
##  $ x4: num  8 8 8 8 8 8 8 19 8 8 ...
##  $ y1: num  8.04 6.95 7.58 8.81 8.33 ...
##  $ y2: num  9.14 8.14 8.74 8.77 9.26 8.1 6.13 3.1 9.13 7.26 ...
##  $ y3: num  7.46 6.77 12.74 7.11 7.81 ...
##  $ y4: num  6.58 5.76 7.71 8.84 8.47 7.04 5.25 12.5 5.56 7.91 ...
summary(anscombe)
##        x1             x2             x3             x4           y1        
##  Min.   : 4.0   Min.   : 4.0   Min.   : 4.0   Min.   : 8   Min.   : 4.260  
##  1st Qu.: 6.5   1st Qu.: 6.5   1st Qu.: 6.5   1st Qu.: 8   1st Qu.: 6.315  
##  Median : 9.0   Median : 9.0   Median : 9.0   Median : 8   Median : 7.580  
##  Mean   : 9.0   Mean   : 9.0   Mean   : 9.0   Mean   : 9   Mean   : 7.501  
##  3rd Qu.:11.5   3rd Qu.:11.5   3rd Qu.:11.5   3rd Qu.: 8   3rd Qu.: 8.570  
##  Max.   :14.0   Max.   :14.0   Max.   :14.0   Max.   :19   Max.   :10.840  
##        y2              y3              y4        
##  Min.   :3.100   Min.   : 5.39   Min.   : 5.250  
##  1st Qu.:6.695   1st Qu.: 6.25   1st Qu.: 6.170  
##  Median :8.140   Median : 7.11   Median : 7.040  
##  Mean   :7.501   Mean   : 7.50   Mean   : 7.501  
##  3rd Qu.:8.950   3rd Qu.: 7.98   3rd Qu.: 8.190  
##  Max.   :9.260   Max.   :12.74   Max.   :12.500

Plot

# Draw one box plot per variable: x1..x4 first, then y1..y4.
# Each plot is titled with the variable name and filled with its own colour.
# local() keeps the helper names out of the global environment.
local({
  fill_by_variable <- c(
    # x variables
    x1 = "#4682b4",
    x2 = "#4169e1",
    x3 = "#6495ed",
    x4 = "#483d8b",
    # y variables
    y1 = "#cd5c5c",
    y2 = "#f08080",
    y3 = "#ff7f50",
    y4 = "#ff8c00"
  )
  for (variable in names(fill_by_variable)) {
    boxplot(
      anscombe[[variable]],
      main = variable,
      col = fill_by_variable[[variable]]
    )
  }
})

四組Correlation

# Pearson correlation of each (x_i, y_i) pair, rounded to three decimals.
round(with(anscombe, cor(x1, y1)), digits = 3)
## [1] 0.816
round(with(anscombe, cor(x2, y2)), digits = 3)
## [1] 0.816
round(with(anscombe, cor(x3, y3)), digits = 3)
## [1] 0.816
round(with(anscombe, cor(x4, y4)), digits = 3)
## [1] 0.817

八個變項的散佈圖

# Scatter plot of every Anscombe pair (x_i, y_i) with its least-squares
# regression line drawn in blue.
#
# All four panels share the same axis limits so they can be compared directly.
# The x axis runs to 20 (not 15) because x4 contains the value 19; with
# c(0, 15) that single outlier, which determines the x4/y4 fit, was clipped
# out of the plot.
x_limits <- c(0, 20)
y_limits <- c(0, 15)

for (i in seq_len(4)) {
  x_name <- paste0("x", i)
  y_name <- paste0("y", i)

  plot(
    x = anscombe[[x_name]], y = anscombe[[y_name]],
    xlab = x_name,
    ylab = y_name,
    main = paste0(x_name, " ", y_name, "散佈圖"),
    xlim = x_limits, ylim = y_limits
  )
  # Fit y_i ~ x_i on the data frame and overlay the fitted line.
  abline(lm(reformulate(x_name, response = y_name), data = anscombe),
         col = "blue")
}

如上圖，儘管四組資料的散佈圖型態相差很遠，四組的相關係數及線性回歸線卻非常接近。這是因為相關係數與回歸線只是摘要統計量，無法反映資料實際的分佈型態；例如少數離群值（outlier）的存在就足以明顯影響線性回歸線。因此，在進行數據分析前先繪製分佈圖以理解數據，是很關鍵且重要的流程。