# Switch every locale category to Traditional Chinese, presumably so that the
# Chinese plot labels used later render correctly.
# NOTE(review): "cht" looks like a Windows-style locale name -- confirm before
# running this on another platform.
Sys.setlocale("LC_ALL", locale = "cht")
## [1] "LC_COLLATE=Chinese (Traditional)_Taiwan.950;LC_CTYPE=Chinese (Traditional)_Taiwan.950;LC_MONETARY=Chinese (Traditional)_Taiwan.950;LC_NUMERIC=C;LC_TIME=Chinese (Traditional)_Taiwan.950"
library(faraway)
## Warning: 套件 'faraway' 是用 R 版本 4.1.3 來建造的
data(hsb)

類別變項的描述統計。Frequencies: the number of observations in a particular category. Proportions: the fraction of the whole that each category accounts for.

# Show the structure of the hsb data frame (column types and first values)
utils::str(hsb)
## 'data.frame':    200 obs. of  11 variables:
##  $ id     : int  70 121 86 141 172 113 50 11 84 48 ...
##  $ gender : Factor w/ 2 levels "female","male": 2 1 2 2 2 2 2 2 2 2 ...
##  $ race   : Factor w/ 4 levels "african-amer",..: 4 4 4 4 4 4 1 3 4 1 ...
##  $ ses    : Factor w/ 3 levels "high","low","middle": 2 3 1 1 3 3 3 3 3 3 ...
##  $ schtyp : Factor w/ 2 levels "private","public": 2 2 2 2 2 2 2 2 2 2 ...
##  $ prog   : Factor w/ 3 levels "academic","general",..: 2 3 2 3 1 1 2 1 2 1 ...
##  $ read   : int  57 68 44 63 47 44 50 34 63 57 ...
##  $ write  : int  52 59 33 44 52 52 59 46 57 55 ...
##  $ math   : int  41 53 54 47 57 51 42 45 54 52 ...
##  $ science: int  47 63 58 53 53 63 53 39 58 50 ...
##  $ socst  : int  57 61 31 56 61 61 61 36 51 51 ...
# Frequencies: count the observations in each gender category
table(hsb[["gender"]])
## 
## female   male 
##    109     91
# Two-way frequency table: gender in rows, race in columns
table(hsb[["gender"]], hsb[["race"]])
##         
##          african-amer asian hispanic white
##   female           13     8       11    77
##   male              7     3       13    68
# Flat three-way frequency table: gender and race in rows, ses in columns
ftable(hsb[["gender"]], hsb[["race"]], hsb[["ses"]])
##                      high low middle
##                                     
## female african-amer     2   9      2
##        asian            3   2      3
##        hispanic         1   6      4
##        white           23  15     39
## male   african-amer     1   2      4
##        asian            0   1      2
##        hispanic         3   3      7
##        white           25   9     34
# Proportions: share of all observations that falls in each gender category
prop.table(table(hsb[["gender"]]))
## 
## female   male 
##  0.545  0.455
# Cell proportions of the gender x race x ses table (each cell / grand total)
prop.table(ftable(hsb[["gender"]], hsb[["race"]], hsb[["ses"]]))
##                       high   low middle
##                                        
## female african-amer  0.010 0.045  0.010
##        asian         0.015 0.010  0.015
##        hispanic      0.005 0.030  0.020
##        white         0.115 0.075  0.195
## male   african-amer  0.005 0.010  0.020
##        asian         0.000 0.005  0.010
##        hispanic      0.015 0.015  0.035
##        white         0.125 0.045  0.170
# Same cell proportions, rounded to three decimal places
round(
  prop.table(ftable(hsb[["gender"]], hsb[["race"]], hsb[["ses"]])),
  digits = 3
)
##                       high   low middle
##                                        
## female african-amer  0.010 0.045  0.010
##        asian         0.015 0.010  0.015
##        hispanic      0.005 0.030  0.020
##        white         0.115 0.075  0.195
## male   african-amer  0.005 0.010  0.020
##        asian         0.000 0.005  0.010
##        hispanic      0.015 0.015  0.035
##        white         0.125 0.045  0.170

連續變項與類別變項

# Mean reading score for each gender
aggregate(read ~ gender, data = hsb, FUN = mean)
##   gender     read
## 1 female 51.73394
## 2   male 52.82418
# Standard deviation of the reading score for each gender
aggregate(read ~ gender, data = hsb, FUN = sd)
##   gender     read
## 1 female 10.05783
## 2   male 10.50671

兩個類別

library(lattice)
## Warning: 套件 'lattice' 是用 R 版本 4.1.3 來建造的
## 
## 載入套件:'lattice'
## 下列物件被遮斷自 'package:faraway':
## 
##     melanoma
# Compare the reading-score distributions of the two genders with overlaid
# density curves. lty = c(1, 2) is assigned to the groups in factor-level
# order (female, then male), which is the mapping the title spells out.
# The raw data points are not drawn (plot.points = FALSE).
densityplot(~ read, groups = gender, data = hsb, xlab = '分數',
            lty = c(1, 2), plot.points = FALSE, type = "g",
            main = '性別上閱讀分數的差異 (male = 虛線, female = 實線)')

# Box plots of the reading score, one box per gender
boxplot(read ~ gender, data = hsb, border = "purple", col = "skyblue")

# Two-sample Q-Q plot of the reading scores by gender. lattice::qq() plots the
# quantiles of the first factor level (female) on the x axis and those of the
# second level (male) on the y axis, so the axis labels follow that order.
qq(gender ~ read, data = hsb, type = c('p', 'g'), pch = 8, aspect = 1,
   xlab = '閱讀分數 (女生)', ylab = '閱讀分數(男生)')

超過兩個類別

# Mean reading score for each race
aggregate(read ~ race, data = hsb, FUN = mean)
##           race     read
## 1 african-amer 46.80000
## 2        asian 51.90909
## 3     hispanic 46.66667
## 4        white 53.92414
# Standard deviation of the reading score for each race
aggregate(read ~ race, data = hsb, FUN = sd)
##           race      read
## 1 african-amer  7.120024
## 2        asian  7.660999
## 3     hispanic 10.239169
## 4        white 10.276783
# More than two categories
# Histogram of the reading scores for each race, one panel per race laid out
# in a single row of four. type = 'density' scales the bars as densities, so
# the y axis is labelled as density rather than probability.
histogram(~ read | race, data = hsb, xlab = '閱讀分數', ylab = '密度',
          type = 'density', layout = c(4, 1))

# Box plots of the reading score by race, one fill colour per race and no
# frame drawn around the plot region.
boxplot(read ~ race, data = hsb, xlab = "種族", ylab = "閱讀分數",
        frame = FALSE,
        col = c("#CCDDFF", "#99BBFF", "#5599FF", "#0066FF"))

# Mean reading score for each combination of race and gender
aggregate(read ~ race + gender, data = hsb, FUN = mean)
##           race gender     read
## 1 african-amer female 46.76923
## 2        asian female 51.75000
## 3     hispanic female 45.90909
## 4        white female 53.40260
## 5 african-amer   male 46.85714
## 6        asian   male 52.33333
## 7     hispanic   male 47.30769
## 8        white   male 54.51471
# Standard deviation of the reading score for each race x gender combination
aggregate(read ~ race + gender, data = hsb, FUN = sd)
##           race gender       read
## 1 african-amer female  7.2933057
## 2        asian female  9.1456469
## 3     hispanic female 12.2429943
## 4        white female  9.7969648
## 5 african-amer   male  7.3581830
## 6        asian   male  0.5773503
## 7     hispanic   male  8.6639542
## 8        white   male 10.8373217
# Standard error of the mean reading score for each race x gender group:
# the group standard deviation divided by the square root of the group size
aggregate(read ~ race + gender, data = hsb,
          FUN = function(scores) sd(scores) / sqrt(length(scores)))
##           race gender      read
## 1 african-amer female 2.0227990
## 2        asian female 3.2334745
## 3     hispanic female 3.6914017
## 4        white female 1.1164678
## 5 african-amer   male 2.7811318
## 6        asian   male 0.3333333
## 7     hispanic   male 2.4029486
## 8        white   male 1.3142183
# Scatter plot of the writing score against the reading score. One colour is
# used for every point: passing the whole nine-colour blues9 palette recycles
# it by row order, which encodes no variable and makes the palest points hard
# to see.
plot(hsb$read, hsb$write, col = blues9[7], pch = 16,
     xlab = "閱讀分數", ylab = "寫作分數")

# Check whether the relationship between writing and reading scores looks
# similar across races: one panel per race, each with a grid, the points and
# a fitted regression line.
xyplot(write ~ read | race, data = hsb,
       type = c("g", "p", "r"), cex = 0.1, layout = c(4, 1),
       xlab = '閱讀分數', ylab = '寫作分數')

類別變項-t檢定和ANOVA 以t檢定比較不同性別的學生閱讀分數差異

# Two-sample t-test of the reading score by gender. t.test() defaults to the
# Welch test (unequal variances); the default is spelled out here.
# A t-test can only compare a categorical variable that has two levels.
t.test(read ~ gender, data = hsb, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  read by gender
## t = -0.74506, df = 188.46, p-value = 0.4572
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
##  -3.976725  1.796263
## sample estimates:
## mean in group female   mean in group male 
##             51.73394             52.82418
# Classical two-sample t-test that assumes equal variances in both groups
t.test(read ~ gender, data = hsb, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  read by gender
## t = -0.74801, df = 198, p-value = 0.4553
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
##  -3.964459  1.783998
## sample estimates:
## mean in group female   mean in group male 
##             51.73394             52.82418

ANOVA

# One-way ANOVA of the reading score on gender
aov(formula = read ~ gender, data = hsb)
## Call:
##    aov(formula = read ~ gender, data = hsb)
## 
## Terms:
##                    gender Residuals
## Sum of Squares     58.949 20860.471
## Deg. of Freedom         1       198
## 
## Residual standard error: 10.2643
## Estimated effects may be unbalanced
# One-way ANOVA of the reading score on race
aov(formula = read ~ race, data = hsb)
## Call:
##    aov(formula = read ~ race, data = hsb)
## 
## Terms:
##                      race Residuals
## Sum of Squares   1749.812 19169.608
## Deg. of Freedom         3       196
## 
## Residual standard error: 9.889597
## Estimated effects may be unbalanced
# ANOVA table (F test) for the effect of gender on the reading score
summary(aov(read ~ gender, data = hsb))
##              Df Sum Sq Mean Sq F value Pr(>F)
## gender        1     59   58.95    0.56  0.455
## Residuals   198  20860  105.36
# ANOVA table (F test) for the effect of race on the reading score
summary(aov(read ~ race, data = hsb))
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## race          3   1750   583.3   5.964 0.000654 ***
## Residuals   196  19170    97.8                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

結論 由t檢定及變異數分析的結果可知,性別的不同在閱讀分數上並無顯著的差異(p>0.05);由變異數分析結果可知,種族的不同在閱讀分數上有顯著的差異(p<0.001)。