CW3a

CW3a-1. 變項的分布情形

library(lattice)
data("environmental")
# Base-graphics overview of the raw variables. with() evaluates each call
# inside `environmental`, so bare column names refer to its columns.
with(environmental, {
  # Scatter plot of wind speed against temperature
  plot(temperature, wind, xlab = "溫度", ylab = "風速")

  # Frequency histogram of ozone
  hist(ozone, main = "臭氧的直方圖", xlab = "臭氧", ylab = "頻率",
       col = "purple")

  # Frequency histogram of radiation
  hist(radiation, main = "輻射的直方圖", xlab = "輻射", ylab = "頻率",
       col = "purple")

  # Frequency histogram of temperature
  hist(temperature, main = "溫度的直方圖", xlab = "溫度", ylab = "頻率",
       col = "purple")

  # Frequency histogram of wind speed
  hist(wind, main = "風速的直方圖", xlab = "風速", ylab = "頻率",
       col = "purple")
})

# Lattice density histograms of the four variables.
# type = "density" scales the bars so that their total area is 1, so the
# y axis shows a probability density (not a probability) and is labelled
# accordingly.

# Density histogram - ozone
histogram(~ ozone, data = environmental, xlab = "臭氧", ylab = "機率密度",
          type = "density", col = "brown")

# Density histogram - radiation
histogram(~ radiation, data = environmental, xlab = "輻射", ylab = "機率密度",
          type = "density", col = "brown")

# Density histogram - temperature
histogram(~ temperature, data = environmental, xlab = "溫度", ylab = "機率密度",
          type = "density", col = "brown")

# Density histogram - wind speed
histogram(~ wind, data = environmental, xlab = "風速", ylab = "機率密度",
          type = "density", col = "brown")

# Scatter-plot matrix of the four measurements. Only the upper triangle is
# drawn (lower.panel = NULL); panel.smooth adds a smoothed curve to each panel.
environmental_scores <- environmental[
  c("ozone", "radiation", "temperature", "wind")
]
pairs(
  environmental_scores,
  upper.panel = panel.smooth,
  lower.panel = NULL,
  pch = ".",
  col = "orange"
)

CW3a-2. 基本統計量、相關,以及平均數、變異數之檢定

# Preview the first six observations
head(environmental, n = 6L)
##   ozone radiation temperature wind
## 1    41       190          67  7.4
## 2    36       118          72  8.0
## 3    12       149          74 12.6
## 4    18       313          62 11.5
## 5    23       299          65  8.6
## 6    19        99          59 13.8
# Inspect the structure of the data frame (dimensions and column types)
str(object = environmental)
## 'data.frame':    111 obs. of  4 variables:
##  $ ozone      : num  41 36 12 18 23 19 8 16 11 14 ...
##  $ radiation  : num  190 118 149 313 299 99 19 256 290 274 ...
##  $ temperature: num  67 72 74 62 65 59 61 69 66 68 ...
##  $ wind       : num  7.4 8 12.6 11.5 8.6 13.8 20.1 9.7 9.2 10.9 ...
# Minimum, maximum, mean, median and quartiles of every column
summary(object = environmental)
##      ozone         radiation      temperature         wind       
##  Min.   :  1.0   Min.   :  7.0   Min.   :57.00   Min.   : 2.300  
##  1st Qu.: 18.0   1st Qu.:113.5   1st Qu.:71.00   1st Qu.: 7.400  
##  Median : 31.0   Median :207.0   Median :79.00   Median : 9.700  
##  Mean   : 42.1   Mean   :184.8   Mean   :77.79   Mean   : 9.939  
##  3rd Qu.: 62.0   3rd Qu.:255.5   3rd Qu.:84.50   3rd Qu.:11.500  
##  Max.   :168.0   Max.   :334.0   Max.   :97.00   Max.   :20.700
# Install 'moments' only when it is missing. Installing by package name lets
# install.packages() pick the build that matches the running R version and
# platform, so the script is not tied to a Windows binary built for R 4.1
# and does not reinstall the package on every run.
if (!requireNamespace("moments", quietly = TRUE)) {
  install.packages("moments", repos = "https://cloud.r-project.org")
}
## 將程式套件安載入 'C:/Users/user/Documents/R/win-library/4.1'
## (因為 'lib' 沒有被指定)
## package 'moments' successfully unpacked and MD5 sums checked
library(moments)
## Warning: 套件 'moments' 是用 R 版本 4.1.1 來建造的
# Compute the mean, standard deviation, skewness and kurtosis of one variable.
#
# x: a numeric vector; missing values are dropped before each statistic.
# Returns a named numeric vector (mean, sd, skewness, kurtosis), so that
# sapply() over several columns yields a matrix with labelled rows.
# Stops with an informative error when the 'moments' package is unavailable.
my_summary <- function(x) {
  # require() only warns and returns FALSE when the package is missing, which
  # would surface later as "object 'skewness' not found"; fail early instead.
  if (!requireNamespace("moments", quietly = TRUE)) {
    stop("Package 'moments' is required for skewness and kurtosis.",
         call. = FALSE)
  }
  funs <- list(
    mean = mean,
    sd = stats::sd,
    skewness = moments::skewness,
    kurtosis = moments::kurtosis
  )
  # vapply() guarantees one numeric value per statistic and keeps the names.
  vapply(funs, function(f) f(x, na.rm = TRUE), numeric(1))
}
# Apply the summary to each of the four columns (one result column per variable)
sapply(environmental[1:4], my_summary)
##          ozone   radiation temperature      wind
## [1,] 42.099099 184.8018018  77.7927928 9.9387387
## [2,] 33.275969  91.1523021   9.5299691 3.5592178
## [3,]  1.248104  -0.4862466  -0.2250959 0.4537189
## [4,]  4.204408   2.0707751   2.3319550 3.2808871
# Pairwise correlation between ozone and radiation, rounded to 3 decimals
with(environmental, round(cor(ozone, radiation), digits = 3))
## [1] 0.348
# Correlation matrix of all four variables, rounded to 3 decimals
round(cor(environmental_scores), digits = 3)
##              ozone radiation temperature   wind
## ozone        1.000     0.348       0.699 -0.613
## radiation    0.348     1.000       0.294 -0.127
## temperature  0.699     0.294       1.000 -0.497
## wind        -0.613    -0.127      -0.497  1.000
# Test whether the ozone-radiation correlation differs from zero; the printed
# result also reports the confidence interval.
cor.test(formula = ~ ozone + radiation, data = environmental_scores)
## 
##  Pearson's product-moment correlation
## 
## data:  ozone and radiation
## t = 3.8798, df = 109, p-value = 0.0001793
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.173194 0.502132
## sample estimates:
##       cor 
## 0.3483417
# Scatter plot of radiation against ozone.
# A single dark blue is used for every point: passing the whole blues9 palette
# recycles its nine shades by row order, which encodes no information and
# leaves the lightest points almost invisible on a white background.
plot(environmental$ozone, environmental$radiation,
     col = blues9[7], pch = 16, xlab = "臭氧", ylab = "輻射")

CW3a-3. 提出看法

根據相關矩陣,臭氧與天氣變項中的溫度相關最強(r = 0.699),其次為風速(r = -0.613,負相關),與輻射的相關最弱(r = 0.348)。

CW3b

CW3b-1. 計算各個變項的基本統計量、相關

# Load Anscombe's quartet and preview the first six observations
data("anscombe")
head(anscombe, n = 6L)
##   x1 x2 x3 x4   y1   y2    y3   y4
## 1 10 10 10  8 8.04 9.14  7.46 6.58
## 2  8  8  8  8 6.95 8.14  6.77 5.76
## 3 13 13 13  8 7.58 8.74 12.74 7.71
## 4  9  9  9  8 8.81 8.77  7.11 8.84
## 5 11 11 11  8 8.33 9.26  7.81 8.47
## 6 14 14 14  8 9.96 8.10  8.84 7.04
# Inspect the structure of the data frame (dimensions and column types)
str(object = anscombe)
## 'data.frame':    11 obs. of  8 variables:
##  $ x1: num  10 8 13 9 11 14 6 4 12 7 ...
##  $ x2: num  10 8 13 9 11 14 6 4 12 7 ...
##  $ x3: num  10 8 13 9 11 14 6 4 12 7 ...
##  $ x4: num  8 8 8 8 8 8 8 19 8 8 ...
##  $ y1: num  8.04 6.95 7.58 8.81 8.33 ...
##  $ y2: num  9.14 8.14 8.74 8.77 9.26 8.1 6.13 3.1 9.13 7.26 ...
##  $ y3: num  7.46 6.77 12.74 7.11 7.81 ...
##  $ y4: num  6.58 5.76 7.71 8.84 8.47 7.04 5.25 12.5 5.56 7.91 ...
# Minimum, maximum, mean, median and quartiles of every column
summary(object = anscombe)
##        x1             x2             x3             x4           y1        
##  Min.   : 4.0   Min.   : 4.0   Min.   : 4.0   Min.   : 8   Min.   : 4.260  
##  1st Qu.: 6.5   1st Qu.: 6.5   1st Qu.: 6.5   1st Qu.: 8   1st Qu.: 6.315  
##  Median : 9.0   Median : 9.0   Median : 9.0   Median : 8   Median : 7.580  
##  Mean   : 9.0   Mean   : 9.0   Mean   : 9.0   Mean   : 9   Mean   : 7.501  
##  3rd Qu.:11.5   3rd Qu.:11.5   3rd Qu.:11.5   3rd Qu.: 8   3rd Qu.: 8.570  
##  Max.   :14.0   Max.   :14.0   Max.   :14.0   Max.   :19   Max.   :10.840  
##        y2              y3              y4        
##  Min.   :3.100   Min.   : 5.39   Min.   : 5.250  
##  1st Qu.:6.695   1st Qu.: 6.25   1st Qu.: 6.170  
##  Median :8.140   Median : 7.11   Median : 7.040  
##  Mean   :7.501   Mean   : 7.50   Mean   : 7.501  
##  3rd Qu.:8.950   3rd Qu.: 7.98   3rd Qu.: 8.190  
##  Max.   :9.260   Max.   :12.74   Max.   :12.500
# Compute the mean, standard deviation, skewness and kurtosis of one variable.
#
# x: a numeric vector; missing values are dropped before each statistic.
# Returns a named numeric vector (mean, sd, skewness, kurtosis), so that
# sapply() over several columns yields a matrix with labelled rows.
# Stops with an informative error when the 'moments' package is unavailable.
my_summary2 <- function(x) {
  # require() only warns and returns FALSE when the package is missing, which
  # would surface later as "object 'skewness' not found"; fail early instead.
  if (!requireNamespace("moments", quietly = TRUE)) {
    stop("Package 'moments' is required for skewness and kurtosis.",
         call. = FALSE)
  }
  funs <- list(
    mean = mean,
    sd = stats::sd,
    skewness = moments::skewness,
    kurtosis = moments::kurtosis
  )
  # vapply() guarantees one numeric value per statistic and keeps the names.
  vapply(funs, function(f) f(x, na.rm = TRUE), numeric(1))
}
# Apply the summary to each of the eight columns (one result column per variable)
sapply(anscombe[1:8], my_summary2)
##            x1       x2       x3       x4          y1        y2       y3
## [1,] 9.000000 9.000000 9.000000 9.000000  7.50090909  7.500909 7.500000
## [2,] 3.316625 3.316625 3.316625 3.316625  2.03156814  2.031657 2.030424
## [3,] 0.000000 0.000000 0.000000 2.846050 -0.05580807 -1.129108 1.592231
## [4,] 1.780000 1.780000 1.780000 9.100000  2.17906136  3.007674 5.130453
##            y4
## [1,] 7.500909
## [2,] 2.030579
## [3,] 1.293025
## [4,] 4.390789
# Correlation matrix of all eight variables, rounded to 3 decimals
round(cor(anscombe), digits = 3)
##        x1     x2     x3     x4     y1     y2     y3     y4
## x1  1.000  1.000  1.000 -0.500  0.816  0.816  0.816 -0.314
## x2  1.000  1.000  1.000 -0.500  0.816  0.816  0.816 -0.314
## x3  1.000  1.000  1.000 -0.500  0.816  0.816  0.816 -0.314
## x4 -0.500 -0.500 -0.500  1.000 -0.529 -0.718 -0.345  0.817
## y1  0.816  0.816  0.816 -0.529  1.000  0.750  0.469 -0.489
## y2  0.816  0.816  0.816 -0.718  0.750  1.000  0.588 -0.478
## y3  0.816  0.816  0.816 -0.345  0.469  0.588  1.000 -0.155
## y4 -0.314 -0.314 -0.314  0.817 -0.489 -0.478 -0.155  1.000

CW3b-2. 四組的相關係數、散佈情形

# Correlation within the pair (x1, y1)
with(anscombe, round(cor(x1, y1), digits = 3))
## [1] 0.816
# Correlation within the pair (x2, y2)
with(anscombe, round(cor(x2, y2), digits = 3))
## [1] 0.816
# Correlation within the pair (x3, y3)
with(anscombe, round(cor(x3, y3), digits = 3))
## [1] 0.816
# Correlation within the pair (x4, y4)
with(anscombe, round(cor(x4, y4), digits = 3))
## [1] 0.817

四組的相關係數幾乎相同(約 0.816),皆屬高度正相關。若僅依相關係數判斷,會推論四組資料的散佈情形很接近;但相關係數只反映線性關聯的強度,實際的散佈情形仍需以散佈圖確認。

CW3b-3. 繪製八個變項的散佈圖

# Scatter plots of the four Anscombe pairs.
# Every point is drawn in one dark blue: passing the whole blues9 palette
# recycles its nine shades by row order, which encodes no information and,
# with only 11 observations, renders rows 1, 2, 10 and 11 almost invisible.

# Scatter plot of y1 against x1
plot(anscombe$x1, anscombe$y1, col = blues9[7], pch = 16,
     xlab = "x1", ylab = "y1")

# Scatter plot of y2 against x2
plot(anscombe$x2, anscombe$y2, col = blues9[7], pch = 16,
     xlab = "x2", ylab = "y2")

# Scatter plot of y3 against x3
plot(anscombe$x3, anscombe$y3, col = blues9[7], pch = 16,
     xlab = "x3", ylab = "y3")

# Scatter plot of y4 against x4
plot(anscombe$x4, anscombe$y4, col = blues9[7], pch = 16,
     xlab = "x4", ylab = "y4")

# Scatter-plot matrix of all eight variables. Only the upper triangle is
# drawn (lower.panel = NULL); panel.smooth adds a smoothed curve to each panel.
pairs(
  anscombe,
  upper.panel = panel.smooth,
  lower.panel = NULL,
  pch = ".",
  col = "brown"
)

CW3b-4. 看法

我發現雖然四組的相關係數非常接近,但因為(x4, y4)有一筆資料比較極端,所以其散佈圖看起來與其他三組的散佈圖差異甚大。另外,四組皆為正相關。