# Switch to a Traditional Chinese locale (presumably so that the Chinese
# labels and titles used below render correctly).
# "cht" is a Windows-only locale name; other platforms are asked for the
# POSIX-style name instead of failing with a warning.
Sys.setlocale(
  category = "LC_ALL",
  locale = if (.Platform$OS.type == "windows") "cht" else "zh_TW.UTF-8"
)
## [1] "LC_COLLATE=Chinese (Traditional)_Taiwan.950;LC_CTYPE=Chinese (Traditional)_Taiwan.950;LC_MONETARY=Chinese (Traditional)_Taiwan.950;LC_NUMERIC=C;LC_TIME=Chinese (Traditional)_Taiwan.950"
library(faraway)
## Warning: 套件 'faraway' 是用 R 版本 4.1.3 來建造的
data(hsb)
類別變項的描述統計 Frequencies: the number of observations in each category. Proportions: the fraction of the whole that each category accounts for.
str(hsb)
## 'data.frame': 200 obs. of 11 variables:
## $ id : int 70 121 86 141 172 113 50 11 84 48 ...
## $ gender : Factor w/ 2 levels "female","male": 2 1 2 2 2 2 2 2 2 2 ...
## $ race : Factor w/ 4 levels "african-amer",..: 4 4 4 4 4 4 1 3 4 1 ...
## $ ses : Factor w/ 3 levels "high","low","middle": 2 3 1 1 3 3 3 3 3 3 ...
## $ schtyp : Factor w/ 2 levels "private","public": 2 2 2 2 2 2 2 2 2 2 ...
## $ prog : Factor w/ 3 levels "academic","general",..: 2 3 2 3 1 1 2 1 2 1 ...
## $ read : int 57 68 44 63 47 44 50 34 63 57 ...
## $ write : int 52 59 33 44 52 52 59 46 57 55 ...
## $ math : int 41 53 54 47 57 51 42 45 54 52 ...
## $ science: int 47 63 58 53 53 63 53 39 58 50 ...
## $ socst : int 57 61 31 56 61 61 61 36 51 51 ...
# Frequencies: number of students in each gender category.
table(hsb[["gender"]])
##
## female male
## 109 91
# Two-way frequency table: gender in rows, race in columns.
table(hsb[["gender"]], hsb[["race"]])
##
## african-amer asian hispanic white
## female 13 8 11 77
## male 7 3 13 68
# Flat three-way frequency table: gender and race index the rows,
# ses indexes the columns.
ftable(hsb[["gender"]], hsb[["race"]], hsb[["ses"]])
## high low middle
##
## female african-amer 2 9 2
## asian 3 2 3
## hispanic 1 6 4
## white 23 15 39
## male african-amer 1 2 4
## asian 0 1 2
## hispanic 3 3 7
## white 25 9 34
# Proportions: share of the sample that falls in each gender category.
prop.table(table(hsb[["gender"]]))
##
## female male
## 0.545 0.455
# Cell proportions of the gender x race x ses table
# (each cell count divided by the grand total).
prop.table(
  ftable(hsb[["gender"]], hsb[["race"]], hsb[["ses"]])
)
## high low middle
##
## female african-amer 0.010 0.045 0.010
## asian 0.015 0.010 0.015
## hispanic 0.005 0.030 0.020
## white 0.115 0.075 0.195
## male african-amer 0.005 0.010 0.020
## asian 0.000 0.005 0.010
## hispanic 0.015 0.015 0.035
## white 0.125 0.045 0.170
# Same cell proportions, rounded to three decimal places.
round(
  prop.table(ftable(hsb[["gender"]], hsb[["race"]], hsb[["ses"]])),
  digits = 3
)
## high low middle
##
## female african-amer 0.010 0.045 0.010
## asian 0.015 0.010 0.015
## hispanic 0.005 0.030 0.020
## white 0.115 0.075 0.195
## male african-amer 0.005 0.010 0.020
## asian 0.000 0.005 0.010
## hispanic 0.015 0.015 0.035
## white 0.125 0.045 0.170
連續變項與類別變項
# Mean reading score for each gender.
aggregate(
  read ~ gender,
  data = hsb,
  FUN = mean
)
## gender read
## 1 female 51.73394
## 2 male 52.82418
# Standard deviation of the reading score for each gender.
aggregate(
  read ~ gender,
  data = hsb,
  FUN = sd
)
## gender read
## 1 female 10.05783
## 2 male 10.50671
兩個類別
library(lattice)
## Warning: 套件 'lattice' 是用 R 版本 4.1.3 來建造的
##
## 載入套件:'lattice'
## 下列物件被遮斷自 'package:faraway':
##
## melanoma
# Compare the reading-score distributions of the two genders.
# `lty` follows the factor level order: female = solid (1), male = dashed (2),
# which is what the title states.
# The stray empty argument after `type` was removed and `F` spelled out as
# `FALSE` (`F` is an ordinary variable that can be reassigned).
densityplot(
  ~ read,
  groups = gender, data = hsb, xlab = '分數', lty = c(1, 2),
  plot.points = FALSE, type = "g",
  main = '性別上閱讀分數的差異 (male = 虛線, female = 實線)'
)
# Box plot of the reading score by gender.
boxplot(
  read ~ gender,
  data = hsb,
  col = "skyblue", border = "purple"
)
# Q-Q plot comparing the reading-score quantiles of the two genders.
# lattice::qq() plots the first factor level (female) on the x-axis and the
# second level (male) on the y-axis, so the axis labels follow that order.
qq(gender ~ read, data = hsb, type = c('p', 'g'), pch = 8, aspect = 1,
   xlab = '閱讀分數 (女生)', ylab = '閱讀分數 (男生)')
超過兩個類別
# Mean reading score for each race.
aggregate(
  read ~ race,
  data = hsb,
  FUN = mean
)
## race read
## 1 african-amer 46.80000
## 2 asian 51.90909
## 3 hispanic 46.66667
## 4 white 53.92414
# Standard deviation of the reading score for each race.
aggregate(
  read ~ race,
  data = hsb,
  FUN = sd
)
## race read
## 1 african-amer 7.120024
## 2 asian 7.660999
## 3 hispanic 10.239169
## 4 white 10.276783
##多個類別
# Histogram of reading scores, one panel per race, laid out in a single row.
# type = 'density' scales the bars to densities rather than probabilities,
# so the y-axis is labelled as density.
histogram(~ read | race, data = hsb, xlab = '閱讀分數', ylab = '密度',
          type = 'density', layout = c(4, 1))
# Box plot of the reading score by race, one shade of blue per race.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
boxplot(
  read ~ race,
  data = hsb,
  xlab = "種族", ylab = "閱讀分數", frame = FALSE,
  col = c("#CCDDFF", "#99BBFF", "#5599FF", "#0066FF")
)
# Mean reading score for each race-by-gender group.
aggregate(
  read ~ race + gender,
  data = hsb,
  FUN = mean
)
## race gender read
## 1 african-amer female 46.76923
## 2 asian female 51.75000
## 3 hispanic female 45.90909
## 4 white female 53.40260
## 5 african-amer male 46.85714
## 6 asian male 52.33333
## 7 hispanic male 47.30769
## 8 white male 54.51471
# Standard deviation of the reading score for each race-by-gender group.
aggregate(
  read ~ race + gender,
  data = hsb,
  FUN = sd
)
## race gender read
## 1 african-amer female 7.2933057
## 2 asian female 9.1456469
## 3 hispanic female 12.2429943
## 4 white female 9.7969648
## 5 african-amer male 7.3581830
## 6 asian male 0.5773503
## 7 hispanic male 8.6639542
## 8 white male 10.8373217
# Standard error of the mean reading score for each race-by-gender group:
# group standard deviation divided by the square root of the group size.
aggregate(
  read ~ race + gender,
  data = hsb,
  FUN = function(scores) sd(scores) / sqrt(length(scores))
)
## race gender read
## 1 african-amer female 2.0227990
## 2 asian female 3.2334745
## 3 hispanic female 3.6914017
## 4 white female 1.1164678
## 5 african-amer male 2.7811318
## 6 asian male 0.3333333
## 7 hispanic male 2.4029486
## 8 white male 1.3142183
# Scatter plot of writing score against reading score.
# A single colour is used: passing the whole `blues9` palette recycles its
# nine shades over the rows in data order, which encodes nothing and makes
# the palest points hard to see on a light background.
plot(hsb$read, hsb$write, col = blues9[7], pch = 16,
     xlab = "閱讀分數", ylab = "寫作分數")
# Check whether the writing-reading relationship looks similar across races:
# one panel per race showing a grid, the points and a fitted regression line.
xyplot(
  write ~ read | race,
  data = hsb,
  xlab = '閱讀分數', ylab = '寫作分數',
  type = c("g", "p", "r"), cex = 0.1, layout = c(4, 1)
)
類別變項-t檢定和ANOVA 以t檢定比較不同性別的學生閱讀分數差異
# Welch's t-test (the default, unequal variances) of reading score by gender.
# A t-test can only compare a categorical variable that has two levels.
t.test(
  read ~ gender,
  data = hsb
)
##
## Welch Two Sample t-test
##
## data: read by gender
## t = -0.74506, df = 188.46, p-value = 0.4572
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
## -3.976725 1.796263
## sample estimates:
## mean in group female mean in group male
## 51.73394 52.82418
# Classical two-sample t-test that assumes equal variances in both groups.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
t.test(read ~ gender, data = hsb, var.equal = TRUE)
##
## Two Sample t-test
##
## data: read by gender
## t = -0.74801, df = 198, p-value = 0.4553
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
## -3.964459 1.783998
## sample estimates:
## mean in group female mean in group male
## 51.73394 52.82418
ANOVA
# One-way ANOVA of reading score on gender.
aov(
  read ~ gender,
  data = hsb
)
## Call:
## aov(formula = read ~ gender, data = hsb)
##
## Terms:
## gender Residuals
## Sum of Squares 58.949 20860.471
## Deg. of Freedom 1 198
##
## Residual standard error: 10.2643
## Estimated effects may be unbalanced
# One-way ANOVA of reading score on race.
aov(
  read ~ race,
  data = hsb
)
## Call:
## aov(formula = read ~ race, data = hsb)
##
## Terms:
## race Residuals
## Sum of Squares 1749.812 19169.608
## Deg. of Freedom 3 196
##
## Residual standard error: 9.889597
## Estimated effects may be unbalanced
# ANOVA table (F statistic and p-value) for the effect of gender on reading.
summary(
  aov(read ~ gender, data = hsb)
)
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 59 58.95 0.56 0.455
## Residuals 198 20860 105.36
# ANOVA table (F statistic and p-value) for the effect of race on reading.
summary(
  aov(read ~ race, data = hsb)
)
## Df Sum Sq Mean Sq F value Pr(>F)
## race 3 1750 583.3 5.964 0.000654 ***
## Residuals 196 19170 97.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
結論 由t檢定及變異數分析的結果可知,不同性別的閱讀分數並無顯著差異(p = 0.455 > 0.05);由變異數分析結果可知,不同種族的閱讀分數有顯著差異(p < 0.001)。