library(lattice)
data("environmental")
# Scatter plot of temperature (x-axis) against wind speed (y-axis).
with(
  environmental,
  plot(temperature, wind, xlab = "溫度", ylab = "風速")
)
# Base-graphics histograms of ozone, radiation, temperature and wind, drawn in
# that order. Each entry holds the column to plot, the plot title and the
# x-axis label; local() keeps the helper variables out of the workspace.
local({
  histogram_specs <- list(
    list(column = "ozone", title = "臭氧的直方圖", label = "臭氧"),
    list(column = "radiation", title = "輻射的直方圖", label = "輻射"),
    list(column = "temperature", title = "溫度的直方圖", label = "溫度"),
    list(column = "wind", title = "風速的直方圖", label = "風速")
  )
  for (spec in histogram_specs) {
    hist(
      environmental[[spec[["column"]]]],
      main = spec[["title"]],
      xlab = spec[["label"]],
      ylab = "頻率",
      col = "purple"
    )
  }
})
# Lattice density histograms of ozone, radiation, temperature and wind.
# - Each trellis object is print()ed explicitly: lattice only draws a plot when
#   the object is printed, and auto-printing does not happen inside a loop or
#   when the script is run through source().
# - The y-axis is labelled "機率密度" (probability density) because
#   type = "density" plots densities, not probabilities.
# local() keeps the helper variables out of the workspace.
local({
  axis_labels <- c(
    ozone = "臭氧",
    radiation = "輻射",
    temperature = "溫度",
    wind = "風速"
  )
  for (column in names(axis_labels)) {
    print(
      histogram(
        reformulate(column), # builds the one-sided formula, e.g. ~ ozone
        data = environmental,
        xlab = axis_labels[[column]],
        ylab = "機率密度",
        type = "density",
        col = "brown"
      )
    )
  }
})
# Scatter-plot matrix of the four variables. Only the upper triangle is drawn
# (lower.panel = NULL) and each panel is rendered by panel.smooth.
# environmental_scores is kept as a top-level object because later steps
# (correlation matrix, correlation test) reuse it.
environmental_scores <- environmental[
  c("ozone", "radiation", "temperature", "wind")
]
pairs(
  environmental_scores,
  col = "orange",
  pch = ".",
  lower.panel = NULL,
  upper.panel = panel.smooth
)
# First six rows of the data
head(x = environmental, n = 6L)
## ozone radiation temperature wind
## 1 41 190 67 7.4
## 2 36 118 72 8.0
## 3 12 149 74 12.6
## 4 18 313 62 11.5
## 5 23 299 65 8.6
## 6 19 99 59 13.8
# Structure of the data (number of observations, column names and types)
str(object = environmental)
## 'data.frame': 111 obs. of 4 variables:
## $ ozone : num 41 36 12 18 23 19 8 16 11 14 ...
## $ radiation : num 190 118 149 313 299 99 19 256 290 274 ...
## $ temperature: num 67 72 74 62 65 59 61 69 66 68 ...
## $ wind : num 7.4 8 12.6 11.5 8.6 13.8 20.1 9.7 9.2 10.9 ...
# Minimum, maximum, mean, median and quartiles of every column
summary(object = environmental)
## ozone radiation temperature wind
## Min. : 1.0 Min. : 7.0 Min. :57.00 Min. : 2.300
## 1st Qu.: 18.0 1st Qu.:113.5 1st Qu.:71.00 1st Qu.: 7.400
## Median : 31.0 Median :207.0 Median :79.00 Median : 9.700
## Mean : 42.1 Mean :184.8 Mean :77.79 Mean : 9.939
## 3rd Qu.: 62.0 3rd Qu.:255.5 3rd Qu.:84.50 3rd Qu.:11.500
## Max. :168.0 Max. :334.0 Max. :97.00 Max. :20.700
# Install the 'moments' package only when it is missing, then load it.
# The package is requested by name from a CRAN mirror instead of a hard-coded
# Windows binary URL built for R 4.1, so the script also works on other
# platforms and R versions and does not reinstall the package on every run.
if (!requireNamespace("moments", quietly = TRUE)) {
  install.packages("moments", repos = "https://cloud.r-project.org")
}
library(moments)
# Compute the mean, standard deviation, skewness and kurtosis of one variable.
#
# Args:
#   x: A numeric vector; missing values are dropped (na.rm = TRUE).
#
# Returns:
#   A named numeric vector with the elements "mean", "sd", "skewness" and
#   "kurtosis", in that order.
my_summary <- function(x) {
  # The moments functions are called through the package namespace instead of
  # require(): require() merely returns FALSE when the package is missing, so
  # the failure would only surface later as "could not find function".
  funs <- list(
    mean = mean,
    sd = sd,
    skewness = moments::skewness,
    kurtosis = moments::kurtosis
  )
  # vapply() enforces exactly one number per statistic and keeps the names of
  # `funs`, so the rows of the resulting table are labelled.
  vapply(funs, function(f) f(x, na.rm = TRUE), numeric(1))
}
# Summarise all four variables at once: one column per variable, one row per
# statistic.
sapply(environmental[, 1:4], my_summary)
## ozone radiation temperature wind
## mean 42.099099 184.8018018 77.7927928 9.9387387
## sd 33.275969 91.1523021 9.5299691 3.5592178
## skewness 1.248104 -0.4862466 -0.2250959 0.4537189
## kurtosis 4.204408 2.0707751 2.3319550 3.2808871
# Pairwise correlation between ozone and radiation, rounded to 3 decimals
with(environmental, round(cor(ozone, radiation), digits = 3))
## [1] 0.348
# Correlation matrix of all four variables, rounded to 3 decimals
round(cor(environmental_scores), digits = 3)
## ozone radiation temperature wind
## ozone 1.000 0.348 0.699 -0.613
## radiation 0.348 1.000 0.294 -0.127
## temperature 0.699 0.294 1.000 -0.497
## wind -0.613 -0.127 -0.497 1.000
# Test whether the ozone-radiation correlation differs significantly from 0;
# the output also reports the 95% confidence interval
cor.test(~ ozone + radiation, data = environmental_scores)
##
## Pearson's product-moment correlation
##
## data: ozone and radiation
## t = 3.8798, df = 109, p-value = 0.0001793
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.173194 0.502132
## sample estimates:
## cor
## 0.3483417
# Scatter plot of ozone (x-axis) against radiation (y-axis).
# A single shade of the blues9 palette is used for every point: passing the
# whole nine-colour palette recycles its shades by row order, which encodes
# nothing and leaves the palest points almost invisible on a white background.
plot(
  environmental$ozone, environmental$radiation,
  col = blues9[7], pch = 16, xlab = "臭氧", ylab = "輻射"
)
# 根據相關係數矩陣,臭氧與溫度的相關最強(r = 0.699,正相關),與風速次之(r = -0.613,負相關),與輻射最弱(r = 0.348);因此可以推論氣象變項中以溫度和風速與臭氧的相關程度較高。
# Load the Anscombe quartet data set
data("anscombe")
# First six rows of the data
head(x = anscombe, n = 6L)
## x1 x2 x3 x4 y1 y2 y3 y4
## 1 10 10 10 8 8.04 9.14 7.46 6.58
## 2 8 8 8 8 6.95 8.14 6.77 5.76
## 3 13 13 13 8 7.58 8.74 12.74 7.71
## 4 9 9 9 8 8.81 8.77 7.11 8.84
## 5 11 11 11 8 8.33 9.26 7.81 8.47
## 6 14 14 14 8 9.96 8.10 8.84 7.04
# Structure of the data (number of observations, column names and types)
str(object = anscombe)
## 'data.frame': 11 obs. of 8 variables:
## $ x1: num 10 8 13 9 11 14 6 4 12 7 ...
## $ x2: num 10 8 13 9 11 14 6 4 12 7 ...
## $ x3: num 10 8 13 9 11 14 6 4 12 7 ...
## $ x4: num 8 8 8 8 8 8 8 19 8 8 ...
## $ y1: num 8.04 6.95 7.58 8.81 8.33 ...
## $ y2: num 9.14 8.14 8.74 8.77 9.26 8.1 6.13 3.1 9.13 7.26 ...
## $ y3: num 7.46 6.77 12.74 7.11 7.81 ...
## $ y4: num 6.58 5.76 7.71 8.84 8.47 7.04 5.25 12.5 5.56 7.91 ...
# Minimum, maximum, mean, median and quartiles of every column
summary(object = anscombe)
## x1 x2 x3 x4 y1
## Min. : 4.0 Min. : 4.0 Min. : 4.0 Min. : 8 Min. : 4.260
## 1st Qu.: 6.5 1st Qu.: 6.5 1st Qu.: 6.5 1st Qu.: 8 1st Qu.: 6.315
## Median : 9.0 Median : 9.0 Median : 9.0 Median : 8 Median : 7.580
## Mean : 9.0 Mean : 9.0 Mean : 9.0 Mean : 9 Mean : 7.501
## 3rd Qu.:11.5 3rd Qu.:11.5 3rd Qu.:11.5 3rd Qu.: 8 3rd Qu.: 8.570
## Max. :14.0 Max. :14.0 Max. :14.0 Max. :19 Max. :10.840
## y2 y3 y4
## Min. :3.100 Min. : 5.39 Min. : 5.250
## 1st Qu.:6.695 1st Qu.: 6.25 1st Qu.: 6.170
## Median :8.140 Median : 7.11 Median : 7.040
## Mean :7.501 Mean : 7.50 Mean : 7.501
## 3rd Qu.:8.950 3rd Qu.: 7.98 3rd Qu.: 8.190
## Max. :9.260 Max. :12.74 Max. :12.500
# Compute the mean, standard deviation, skewness and kurtosis of one variable.
#
# Args:
#   x: A numeric vector; missing values are dropped (na.rm = TRUE).
#
# Returns:
#   A named numeric vector with the elements "mean", "sd", "skewness" and
#   "kurtosis", in that order.
my_summary2 <- function(x) {
  # The moments functions are called through the package namespace instead of
  # require(): require() merely returns FALSE when the package is missing, so
  # the failure would only surface later as "could not find function".
  funs <- list(
    mean = mean,
    sd = sd,
    skewness = moments::skewness,
    kurtosis = moments::kurtosis
  )
  # vapply() enforces exactly one number per statistic and keeps the names of
  # `funs`, so the rows of the resulting table are labelled.
  vapply(funs, function(f) f(x, na.rm = TRUE), numeric(1))
}
# Summarise all eight columns at once: one column per variable, one row per
# statistic.
sapply(anscombe[, 1:8], my_summary2)
## x1 x2 x3 x4 y1 y2 y3
## mean 9.000000 9.000000 9.000000 9.000000 7.50090909 7.500909 7.500000
## sd 3.316625 3.316625 3.316625 3.316625 2.03156814 2.031657 2.030424
## skewness 0.000000 0.000000 0.000000 2.846050 -0.05580807 -1.129108 1.592231
## kurtosis 1.780000 1.780000 1.780000 9.100000 2.17906136 3.007674 5.130453
## y4
## mean 7.500909
## sd 2.030579
## skewness 1.293025
## kurtosis 4.390789
# Correlation matrix of all eight columns, rounded to 3 decimals
round(cor(anscombe), digits = 3)
## x1 x2 x3 x4 y1 y2 y3 y4
## x1 1.000 1.000 1.000 -0.500 0.816 0.816 0.816 -0.314
## x2 1.000 1.000 1.000 -0.500 0.816 0.816 0.816 -0.314
## x3 1.000 1.000 1.000 -0.500 0.816 0.816 0.816 -0.314
## x4 -0.500 -0.500 -0.500 1.000 -0.529 -0.718 -0.345 0.817
## y1 0.816 0.816 0.816 -0.529 1.000 0.750 0.469 -0.489
## y2 0.816 0.816 0.816 -0.718 0.750 1.000 0.588 -0.478
## y3 0.816 0.816 0.816 -0.345 0.469 0.588 1.000 -0.155
## y4 -0.314 -0.314 -0.314 0.817 -0.489 -0.478 -0.155 1.000
# Correlation within pair (x1, y1)
with(anscombe, round(cor(x1, y1), digits = 3))
## [1] 0.816
# Correlation within pair (x2, y2)
with(anscombe, round(cor(x2, y2), digits = 3))
## [1] 0.816
# Correlation within pair (x3, y3)
with(anscombe, round(cor(x3, y3), digits = 3))
## [1] 0.816
# Correlation within pair (x4, y4)
with(anscombe, round(cor(x4, y4), digits = 3))
## [1] 0.817
# 四組的相關係數皆約為 0.816,屬於高度正相關,且彼此幾乎相同;若只看相關係數,會推論四組資料的散佈情形很接近,但相關係數相同並不保證散佈型態相同,仍須以下方的散佈圖加以確認。
# Scatter plots of the four Anscombe pairs (x1, y1) ... (x4, y4), drawn in
# that order with the column names as axis labels.
# A single shade of the blues9 palette is used for every point: passing the
# whole nine-colour palette recycles its shades by row order, which encodes
# nothing and leaves the palest points almost invisible on a white background.
# local() keeps the loop variables out of the workspace.
local({
  for (i in seq_len(4)) {
    x_name <- paste0("x", i)
    y_name <- paste0("y", i)
    plot(
      anscombe[[x_name]], anscombe[[y_name]],
      col = blues9[7], pch = 16, xlab = x_name, ylab = y_name
    )
  }
})
# Scatter-plot matrix of all eight columns. Only the upper triangle is drawn
# (lower.panel = NULL) and each panel is rendered by panel.smooth.
pairs(
  anscombe,
  col = "brown",
  pch = ".",
  lower.panel = NULL,
  upper.panel = panel.smooth
)
# 我發現雖然四組的相關係數非常接近,但因為 (x4, y4) 有一筆資料比較極端(x4 = 19),所以它的散佈圖看起來與其他三組差異甚大。另外,四組的相關係數皆為正值,即皆為正相關。