教学目标

  1. 处理组别变量。
  2. 进行单样本、独立样本和相关(配对)样本的t检验。

因子factor

  1. 因子用来表示定性变量(如性别)或顺序变量(如年级)
  2. 因子的值称为水平levels,水平可以有说明标签labels

定性变量转化为因子的函数

  • factor(变量名, levels = c(水平1的值, 水平2的值, ...), labels = c('水平1的说明', '水平2的说明', ...))
# Recode gender as a factor: 0 means boy, 1 means girl.
# The factor could reuse the original variable name, but that is discouraged:
# an overwrite in R cannot be undone, so a mistake means restarting the
# analysis from the beginning.
# Load the saved workspace before the first run; this line is normally not
# needed afterwards.
load(".RData")

eee_data[["gender_f"]] <- factor(
  eee_data[["gender"]],
  levels = c(0, 1),
  labels = c("boy", "girl")
)
head(eee_data[["gender"]])
## [1] 1 1 1 1 1 1
head(eee_data[["gender_f"]])
## [1] girl girl girl girl girl girl
## Levels: boy girl
# Recode birth_Jan_June the same way: 1 = born January-June,
# 0 = born July-December.
eee_data[["birth_Jan_June_f"]] <- factor(
  eee_data[["birth_Jan_June"]],
  levels = c(1, 0),
  labels = c("Jan-Jun", "Jul-Dec")
)
head(eee_data[["birth_Jan_June"]])
## [1] 1 1 0 1 0 1
head(eee_data[["birth_Jan_June_f"]])
## [1] Jan-Jun Jan-Jun Jul-Dec Jan-Jun Jul-Dec Jan-Jun
## Levels: Jan-Jun Jul-Dec
# Recode hometown: 0 = south, 1 = north.
eee_data[["hometown_f"]] <- factor(
  eee_data[["hometown"]],
  levels = c(0, 1),
  labels = c("south", "north")
)
head(eee_data[["hometown"]])
## [1] 0 0 0 0 0 0
head(eee_data[["hometown_f"]])
## [1] south south south south south south
## Levels: south north
  • 顺序变量转化为因子的函数ordered
# Convert the ordinal variable zhiyuan into an ordered factor with
# ordered(): 1 = first choice < 2 = second choice < 3 = third choice.
# The third label previously read 'thrid choice' (typo); it is corrected here
# and in the echoed output below.
eee_data$zhiyuan_f <- ordered(
  eee_data$zhiyuan,
  levels = c(1, 2, 3),
  labels = c("first choice", "second choice", "third choice")
)
head(eee_data$zhiyuan)
## [1] 3 3 1 2 3 3
head(eee_data$zhiyuan_f)
## [1] third choice  third choice  first choice  second choice third choice 
## [6] third choice 
## Levels: first choice < second choice < third choice

t检验

单样本t检验

# Example 1: uses the eee_data data set.
# Test whether the mean height could plausibly be 1.50.
# Some heights were entered in the wrong unit: values above 100 are
# multiplied by 0.01, all other values are kept as they are.
eee_data$height_c <- ifelse(
  eee_data$height > 100,
  0.01 * eee_data$height,
  eee_data$height
)

# One-sample t-test with t.test(); mu is the hypothesised mean.
t.test(eee_data$height_c, mu = 1.5)
## 
##  One Sample t-test
## 
## data:  eee_data$height_c
## t = 32.05, df = 174, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 1.5
## 95 percent confidence interval:
##  1.627571 1.644314
## sample estimates:
## mean of x 
##  1.635943
# Effect size: Cohen's d against the same hypothesised mean
library(lsr)
cohensD(eee_data$height_c, mu = 1.5)
## [1] 2.422771
# Example 2: the in-class example.
# Import the SPSS data file.
library(haven)
e1_vocabulary <- read_sav(
  "https://gitee.com/vv_victorwei/r-language-data-analysis/raw/master/%E5%9D%87%E5%80%BC%E5%88%86%E6%9E%90/4.5%20E1%20children%20vocabulary%20test.sav"
)
head(e1_vocabulary)
## # A tibble: 6 × 2
##   student vocabulary
##     <dbl>      <dbl>
## 1     101         47
## 2     102         48
## 3     103         49
## 4     104         49
## 5     105         51
## 6     106         52
# One-sample t-test with t.test(); the hypothesised mean is 50.
t.test(e1_vocabulary$vocabulary, mu = 50)
## 
##  One Sample t-test
## 
## data:  e1_vocabulary$vocabulary
## t = 2.2804, df = 19, p-value = 0.0343
## alternative hypothesis: true mean is not equal to 50
## 95 percent confidence interval:
##  50.15611 53.64389
## sample estimates:
## mean of x 
##      51.9
# Effect size: Cohen's d against the same hypothesised mean
library(lsr)
cohensD(e1_vocabulary$vocabulary, mu = 50)
## [1] 0.5099094

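例1结果显示t = 32.05, df = 174, p-value < 2.2e-16,结果显著,均值与1.50有显著差异,不大可能等于1.50,身高的95%置信区间为1.628~1.644,效应量Cohen's d = 2.42。例2结果显示t = 2.2804, df = 19, p-value = 0.0343,结果显著,词汇测验的均值(51.9)与50有显著差异,95%置信区间为50.156~53.644,效应量Cohen's d = 0.51。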

独立样本t检验

考察南方和北方同学的身高有没有差异。

  • 自变量:hometown;因变量:height_c
  • 用height_c~hometown表示因变量和自变量的关系
  • 第一步,方差齐性检验
# F test comparing the variance of height_c between the two hometown groups
# (two-sided alternative).
var.test(
  height_c ~ hometown,
  data = eee_data,
  alternative = "two.sided"
)
## 
##  F test to compare two variances
## 
## data:  height_c by hometown
## F = 0.80177, num df = 102, denom df = 71, p-value = 0.3046
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.5164539 1.2239023
## sample estimates:
## ratio of variances 
##           0.801773

结果不显著(p = 0.3046 > 0.05),不能拒绝方差齐性的假设,可以认为两组方差相等

  • 第二步,独立样本t检验
# The variance test in the previous step was not significant, so the t-test is
# told var.equal = TRUE; had it been significant, use var.equal = FALSE.
t.test(
  height_c ~ hometown,
  data = eee_data,
  var.equal = TRUE
)
## 
##  Two Sample t-test
## 
## data:  height_c by hometown
## t = -0.41546, df = 173, p-value = 0.6783
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.02064290  0.01346383
## sample estimates:
## mean in group 0 mean in group 1 
##        1.634466        1.638056
# Effect size: Cohen's d for the same formula
library(lsr)
cohensD(height_c ~ hometown, data = eee_data)
## [1] 0.06382035

结果显示,t = -0.41546, df = 173, p-value = 0.6783,结果不显著,说明南北方同学的身高没有显著差异

# In-class example: compare score between the two gender groups.
library(haven)
e2_gender <- read_sav(
  "https://gitee.com/vv_victorwei/r-language-data-analysis/raw/master/%E5%9D%87%E5%80%BC%E5%88%86%E6%9E%90/4.5%20E2%20gender%20difference%20.sav"
)
head(e2_gender)
## # A tibble: 6 × 2
##   group     score
##   <dbl+lbl> <dbl>
## 1 1 [boy]    70  
## 2 1 [boy]    68.3
## 3 1 [boy]    86.7
## 4 1 [boy]    70  
## 5 1 [boy]    75.0
## 6 1 [boy]    58.0
# Recode the group variable as a factor: 1 = male, 2 = female.
e2_gender[["group_f"]] <- factor(
  e2_gender[["group"]],
  levels = c(1, 2),
  labels = c("male", "female")
)

# F test for equality of variances
var.test(
  score ~ group_f,
  data = e2_gender,
  alternative = "two.sided"
)
## 
##  F test to compare two variances
## 
## data:  score by group_f
## F = 1.167, num df = 19, denom df = 19, p-value = 0.7398
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.4619204 2.9484167
## sample estimates:
## ratio of variances 
##           1.167019
# Independent-samples t-test, treating the two variances as equal
t.test(
  score ~ group_f,
  data = e2_gender,
  var.equal = TRUE
)
## 
##  Two Sample t-test
## 
## data:  score by group_f
## t = -1.8412, df = 38, p-value = 0.0734
## alternative hypothesis: true difference in means between group male and group female is not equal to 0
## 95 percent confidence interval:
##  -13.4992797   0.6396388
## sample estimates:
##   mean in group male mean in group female 
##             67.13784             73.56766
# Effect size: Cohen's d
library(lsr)
cohensD(score ~ group_f, data = e2_gender)
## [1] 0.5822476

配对样本t检验

假设a1 - a6反映了专业认同的第一个维度A,而a7 - a13反映了专业认同的第二个维度B,那么维度A和维度B的得分有没有差异呢?

  • 每个人都有维度A和B的得分,属于一一对应的配对数据
  • 因变量:专业认同得分
  • 自变量:维度(两个水平A和B)
# Score dimensions A and B as row means, ignoring missing item responses.
# NOTE(review): columns are selected by position (19-24 and 25-31); presumably
# these are items a1-a6 and a7-a13 -- confirm against the column order of
# eee_data, since positional selection breaks if columns are moved.
eee_data$zyrt_A <- rowMeans(eee_data[, 19:24], na.rm = TRUE)
eee_data$zyrt_B <- rowMeans(eee_data[, 25:31], na.rm = TRUE)

# Paired-samples t-test with t.test(paired = TRUE)
t.test(
  eee_data$zyrt_A,
  eee_data$zyrt_B,
  paired = TRUE
)
## 
##  Paired t-test
## 
## data:  eee_data$zyrt_A and eee_data$zyrt_B
## t = -0.82072, df = 174, p-value = 0.4129
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.09635435  0.03975571
## sample estimates:
## mean difference 
##     -0.02829932
# Effect size: Cohen's d for paired data
library(lsr)
cohensD(
  eee_data$zyrt_A,
  eee_data$zyrt_B,
  method = "paired"
)
## [1] 0.0620406

结果显示,t = -0.82072, df = 174, p-value = 0.4129,结果不显著,认为维度A和维度B的得分没有显著差异。

# In-class example: compare pretest and posttest scores.
library(haven)
e3_test <- read_sav(
  "https://gitee.com/vv_victorwei/r-language-data-analysis/raw/master/%E5%9D%87%E5%80%BC%E5%88%86%E6%9E%90/4.5%20E3%20pretest%20and%20posttest.sav"
)
head(e3_test)
## # A tibble: 6 × 3
##    学号 pretest posttest
##   <dbl>   <dbl>    <dbl>
## 1     4    25.8       30
## 2    96    41.5       43
## 3    12    45.5       50
## 4    13    45.7       50
## 5    77    49.4       55
## 6     6    50.4       56
# Paired-samples t-test with t.test(paired = TRUE)
t.test(
  e3_test$pretest,
  e3_test$posttest,
  paired = TRUE
)
## 
##  Paired t-test
## 
## data:  e3_test$pretest and e3_test$posttest
## t = -4.7746, df = 99, p-value = 6.234e-06
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -6.128281 -2.530084
## sample estimates:
## mean difference 
##       -4.329182
# Effect size: Cohen's d for paired data
library(lsr)
cohensD(
  e3_test$pretest,
  e3_test$posttest,
  method = "paired"
)
## [1] 0.4774635