# View data
# Load Anscombe's quartet from the built-in datasets package and inspect it.
# Lines starting with `##` are the console output recorded from a run.
data(anscombe)

# View() opens a GUI data viewer, so only call it in an interactive session;
# this keeps the script runnable under Rscript / knitr.
if (interactive()) {
  View(anscombe)
}

dim(anscombe) # number of rows, then number of columns
## [1] 11 8
sum(is.na(anscombe)) # total count of missing values (0 means none)
## [1] 0
str(anscombe)
## 'data.frame': 11 obs. of 8 variables:
## $ x1: num 10 8 13 9 11 14 6 4 12 7 ...
## $ x2: num 10 8 13 9 11 14 6 4 12 7 ...
## $ x3: num 10 8 13 9 11 14 6 4 12 7 ...
## $ x4: num 8 8 8 8 8 8 8 19 8 8 ...
## $ y1: num 8.04 6.95 7.58 8.81 8.33 ...
## $ y2: num 9.14 8.14 8.74 8.77 9.26 8.1 6.13 3.1 9.13 7.26 ...
## $ y3: num 7.46 6.77 12.74 7.11 7.81 ...
## $ y4: num 6.58 5.76 7.71 8.84 8.47 7.04 5.25 12.5 5.56 7.91 ...
summary(anscombe)
## x1 x2 x3 x4 y1
## Min. : 4.0 Min. : 4.0 Min. : 4.0 Min. : 8 Min. : 4.260
## 1st Qu.: 6.5 1st Qu.: 6.5 1st Qu.: 6.5 1st Qu.: 8 1st Qu.: 6.315
## Median : 9.0 Median : 9.0 Median : 9.0 Median : 8 Median : 7.580
## Mean : 9.0 Mean : 9.0 Mean : 9.0 Mean : 9 Mean : 7.501
## 3rd Qu.:11.5 3rd Qu.:11.5 3rd Qu.:11.5 3rd Qu.: 8 3rd Qu.: 8.570
## Max. :14.0 Max. :14.0 Max. :14.0 Max. :19 Max. :10.840
## y2 y3 y4
## Min. :3.100 Min. : 5.39 Min. : 5.250
## 1st Qu.:6.695 1st Qu.: 6.25 1st Qu.: 6.170
## Median :8.140 Median : 7.11 Median : 7.040
## Mean :7.501 Mean : 7.50 Mean : 7.501
## 3rd Qu.:8.950 3rd Qu.: 7.98 3rd Qu.: 8.190
## Max. :9.260 Max. :12.74 Max. :12.500
# Box plots
# Box plots of the four x columns: one figure per column, titled with the
# column name, each with its own shade of blue.
invisible(Map(
  function(column, fill) boxplot(anscombe[[column]], main = column, col = fill),
  c("x1", "x2", "x3", "x4"),
  c("#4682b4", "#4169e1", "#6495ed", "#483d8b")
))

# Box plots of the four y columns: same layout, red/orange shades.
invisible(Map(
  function(column, fill) boxplot(anscombe[[column]], main = column, col = fill),
  c("y1", "y2", "y3", "y4"),
  c("#cd5c5c", "#f08080", "#ff7f50", "#ff8c00")
))


# 四組資料的相關係數 (Correlation)
# Pearson correlation of each (x, y) pair, rounded to three decimals.
# The `##` lines are the recorded console output.
with(anscombe, round(cor(x1, y1), digits = 3))
## [1] 0.816
with(anscombe, round(cor(x2, y2), digits = 3))
## [1] 0.816
with(anscombe, round(cor(x3, y3), digits = 3))
## [1] 0.816
with(anscombe, round(cor(x4, y4), digits = 3))
## [1] 0.817
# 八個變項(四組 x、y 配對)的散佈圖與線性回歸線
# Scatter plot of each (x, y) pair with its least-squares regression line.
#
# All four figures share the same axis limits so they can be compared
# directly. The x range goes up to 20 (not 15) because x4 contains the value
# 19; with an upper limit of 15 that high-leverage point would be drawn
# outside the plotting region and the fourth figure would hide the very
# outlier that determines its regression line.
scatter_xlim <- c(0, 20)
scatter_ylim <- c(0, 15)

for (set_id in 1:4) {
  x_name <- paste0("x", set_id)
  y_name <- paste0("y", set_id)
  x <- anscombe[[x_name]]
  y <- anscombe[[y_name]]

  plot(
    x = x, y = y,
    xlab = x_name,
    ylab = y_name,
    main = paste0(x_name, " ", y_name, "散佈圖"),
    xlim = scatter_xlim, ylim = scatter_ylim
  )
  # Overlay the fitted simple linear regression of y on x.
  abline(lm(y ~ x), col = "blue")
}


# 如上圖,儘管四組資料的散佈圖形態相差很遠,但四組的相關係數及線性回歸線卻非常接近。這是因為相關係數與線性回歸只描述整體的線性趨勢:第二組其實是曲線關係,第三、四組則是少數離群值(outlier)大幅影響了線性回歸線。因此,在進行數據分析前先繪製分佈圖以理解數據,是很關鍵且重要的流程。