教学目标

处理组别变量。
进行单样本t检验、独立样本t检验和相关样本（配对样本）t检验。

因子factor

因子用来表示定性变量（如性别）或顺序变量（如年级）
因子的值称为水平levels，水平可以有说明标签labels

定性变量转化为因子的函数

factor(变量名, levels = c(水平1的值, 水平2的值, ...), labels = c('水平1的说明', '水平2的说明', ...))

# Load the saved workspace (presumably the one that provides eee_data).
# This is only needed before the code is run for the first time; in normal
# use this line can be skipped.
load(".RData")

# Convert gender into a factor: 0 is labelled "boy", 1 is labelled "girl".
# The factor is stored under a new name (gender_f). Reusing the original
# variable name also works but is not recommended: R has no undo, so if
# something goes wrong the whole analysis has to be restarted from scratch.
eee_data$gender_f <- factor(
  eee_data$gender,
  levels = c(0, 1),
  labels = c("boy", "girl")
)
head(eee_data$gender)

## [1] 1 1 1 1 1 1

head(eee_data$gender_f)

## [1] girl girl girl girl girl girl
## Levels: boy girl

# Convert birth_Jan_June into a factor in the same way:
# 1 means born in January-June, 0 means born in July-December.
eee_data$birth_Jan_June_f <- factor(
  eee_data$birth_Jan_June,
  levels = c(1, 0),
  labels = c("Jan-Jun", "Jul-Dec")
)
head(eee_data$birth_Jan_June)

## [1] 1 1 0 1 0 1

head(eee_data$birth_Jan_June_f)

## [1] Jan-Jun Jan-Jun Jul-Dec Jan-Jun Jul-Dec Jan-Jun
## Levels: Jan-Jun Jul-Dec

# Convert hometown into a factor: 0 is labelled "south", 1 is labelled "north".
eee_data$hometown_f <- factor(
  eee_data$hometown,
  levels = c(0, 1),
  labels = c("south", "north")
)
head(eee_data$hometown)

## [1] 0 0 0 0 0 0

head(eee_data$hometown_f)

## [1] south south south south south south
## Levels: south north

顺序变量转化为因子的函数ordered

# Convert zhiyuan into an ordered factor with ordered(): the codes 1, 2, 3
# become the levels first choice < second choice < third choice, in that order.
eee_data$zhiyuan_f <- ordered(
  eee_data$zhiyuan,
  levels = c(1, 2, 3),
  labels = c("first choice", "second choice", "third choice")
)
head(eee_data$zhiyuan)

## [1] 3 3 1 2 3 3

head(eee_data$zhiyuan_f)

## [1] third choice  third choice  first choice  second choice third choice 
## [6] third choice 
## Levels: first choice < second choice < third choice

t检验

单样本t检验

# Example 1: uses the eee_data data set.
# Tests whether the mean height could plausibly be 1.50.
# Some heights were entered in the wrong unit: values at or below 100 are
# kept as they are, values above 100 are multiplied by 0.01.
eee_data$height_c <- ifelse(
  eee_data$height <= 100,
  eee_data$height,
  eee_data$height * 0.01
)

# One-sample t-test with t.test()
t.test(eee_data$height_c, mu = 1.5)

## 
##  One Sample t-test
## 
## data:  eee_data$height_c
## t = 32.05, df = 174, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 1.5
## 95 percent confidence interval:
##  1.627571 1.644314
## sample estimates:
## mean of x 
##  1.635943

# Effect size: Cohen's d
library(lsr)
cohensD(eee_data$height_c, mu = 1.5)

## [1] 2.422771

# Example 2: the classroom example.
# Import the data (an SPSS .sav file read with haven).
library(haven)
e1_vocabulary <- read_sav(
  "https://gitee.com/vv_victorwei/r-language-data-analysis/raw/master/%E5%9D%87%E5%80%BC%E5%88%86%E6%9E%90/4.5%20E1%20children%20vocabulary%20test.sav"
)
head(e1_vocabulary)

## # A tibble: 6 × 2
##   student vocabulary
##     <dbl>      <dbl>
## 1     101         47
## 2     102         48
## 3     103         49
## 4     104         49
## 5     105         51
## 6     106         52

# One-sample t-test with t.test(): is the mean vocabulary score equal to 50?
t.test(e1_vocabulary$vocabulary, mu = 50)

## 
##  One Sample t-test
## 
## data:  e1_vocabulary$vocabulary
## t = 2.2804, df = 19, p-value = 0.0343
## alternative hypothesis: true mean is not equal to 50
## 95 percent confidence interval:
##  50.15611 53.64389
## sample estimates:
## mean of x 
##      51.9

# Effect size: Cohen's d
library(lsr)
cohensD(e1_vocabulary$vocabulary, mu = 50)

## [1] 0.5099094

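例1（身高）的结果显示 t = 32.05, df = 174, p-value < 2.2e-16，结果显著，说明平均身高与1.50有显著差异，总体均值不大可能等于1.50，平均身高的95%置信区间为1.628~1.644。例2（词汇测验）的结果显示 t = 2.2804, df = 19, p-value = 0.0343，在0.05水平上显著，说明平均词汇成绩（51.9）与50有显著差异，95%置信区间为50.16~53.64。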

独立样本t检验

考察南方和北方同学的身高有没有差异。

自变量：hometown；因变量：height_c
用height_c~hometown表示因变量和自变量的关系
第一步，方差齐性检验

# F test for homogeneity of variance: compares the variance of height_c
# between the two hometown groups (two-sided alternative).
var.test(
  height_c ~ hometown,
  data = eee_data,
  alternative = "two.sided"
)

## 
##  F test to compare two variances
## 
## data:  height_c by hometown
## F = 0.80177, num df = 102, denom df = 71, p-value = 0.3046
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.5164539 1.2239023
## sample estimates:
## ratio of variances 
##           0.801773

结果不显著（p-value = 0.3046），不能拒绝两组方差相等的假设，可以认为满足方差齐性

第二步，独立样本t检验

# The variance homogeneity test in the previous step was not significant,
# so the t-test is told var.equal = TRUE; otherwise use var.equal = FALSE.
t.test(
  height_c ~ hometown,
  data = eee_data,
  var.equal = TRUE
)

## 
##  Two Sample t-test
## 
## data:  height_c by hometown
## t = -0.41546, df = 173, p-value = 0.6783
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.02064290  0.01346383
## sample estimates:
## mean in group 0 mean in group 1 
##        1.634466        1.638056

# Effect size: Cohen's d
library(lsr)
cohensD(height_c ~ hometown, data = eee_data)

## [1] 0.06382035

结果显示，t = -0.41546, df = 173, p-value = 0.6783，结果不显著，说明南北方同学的身高没有显著差异

# Classroom example: import the data (an SPSS .sav file read with haven).
library(haven)
e2_gender <- read_sav(
  "https://gitee.com/vv_victorwei/r-language-data-analysis/raw/master/%E5%9D%87%E5%80%BC%E5%88%86%E6%9E%90/4.5%20E2%20gender%20difference%20.sav"
)
head(e2_gender)

## # A tibble: 6 × 2
##   group     score
##   <dbl+lbl> <dbl>
## 1 1 [boy]    70  
## 2 1 [boy]    68.3
## 3 1 [boy]    86.7
## 4 1 [boy]    70  
## 5 1 [boy]    75.0
## 6 1 [boy]    58.0

# Convert the gender grouping variable into a factor:
# 1 is labelled "male", 2 is labelled "female".
e2_gender$group_f <- factor(
  e2_gender$group,
  levels = c(1, 2),
  labels = c("male", "female")
)

# F test for homogeneity of variance between the two groups
var.test(
  score ~ group_f,
  data = e2_gender,
  alternative = "two.sided"
)

## 
##  F test to compare two variances
## 
## data:  score by group_f
## F = 1.167, num df = 19, denom df = 19, p-value = 0.7398
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.4619204 2.9484167
## sample estimates:
## ratio of variances 
##           1.167019

# Independent-samples t-test, treating the variances as equal
t.test(
  score ~ group_f,
  data = e2_gender,
  var.equal = TRUE
)

## 
##  Two Sample t-test
## 
## data:  score by group_f
## t = -1.8412, df = 38, p-value = 0.0734
## alternative hypothesis: true difference in means between group male and group female is not equal to 0
## 95 percent confidence interval:
##  -13.4992797   0.6396388
## sample estimates:
##   mean in group male mean in group female 
##             67.13784             73.56766

# Effect size: Cohen's d
library(lsr)
cohensD(score ~ group_f, data = e2_gender)

## [1] 0.5822476

配对样本t检验

假设a1 - a6反映了专业认同的第一个维度A，而a7 - a13反映了专业认同的第二个维度B，那么维度A和维度B的得分有没有差异呢？

每个人都有维度A和B的得分，属于一一对应的配对数据
因变量：专业认同得分
自变量：维度（两个水平A和B）

# Compute each person's score on dimension A and dimension B as the row mean
# of the corresponding item columns, ignoring missing answers (na.rm = TRUE).
# NOTE(review): columns 19:24 and 25:31 are selected by position; presumably
# they hold items a1-a6 and a7-a13 -- confirm against names(eee_data).
eee_data$zyrt_A <- rowMeans(eee_data[19:24], na.rm = TRUE)
eee_data$zyrt_B <- rowMeans(eee_data[25:31], na.rm = TRUE)

# Paired-samples t-test with t.test(): compares the two scores of each person
t.test(eee_data$zyrt_A, eee_data$zyrt_B, paired = TRUE)

## 
##  Paired t-test
## 
## data:  eee_data$zyrt_A and eee_data$zyrt_B
## t = -0.82072, df = 174, p-value = 0.4129
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.09635435  0.03975571
## sample estimates:
## mean difference 
##     -0.02829932

# Effect size: Cohen's d for paired data
library(lsr)
cohensD(eee_data$zyrt_A, eee_data$zyrt_B, method = "paired")

## [1] 0.0620406

结果显示，t = -0.82072, df = 174, p-value = 0.4129，结果不显著，认为维度A和维度B的得分没有显著差异。

# Classroom example: import the data (an SPSS .sav file read with haven).
library(haven)
e3_test <- read_sav(
  "https://gitee.com/vv_victorwei/r-language-data-analysis/raw/master/%E5%9D%87%E5%80%BC%E5%88%86%E6%9E%90/4.5%20E3%20pretest%20and%20posttest.sav"
)
head(e3_test)

## # A tibble: 6 × 3
##    学号 pretest posttest
##   <dbl>   <dbl>    <dbl>
## 1     4    25.8       30
## 2    96    41.5       43
## 3    12    45.5       50
## 4    13    45.7       50
## 5    77    49.4       55
## 6     6    50.4       56

# Paired-samples t-test with t.test(): pretest versus posttest
t.test(e3_test$pretest, e3_test$posttest, paired = TRUE)

## 
##  Paired t-test
## 
## data:  e3_test$pretest and e3_test$posttest
## t = -4.7746, df = 99, p-value = 6.234e-06
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -6.128281 -2.530084
## sample estimates:
## mean difference 
##       -4.329182

# Effect size: Cohen's d for paired data
library(lsr)
cohensD(e3_test$pretest, e3_test$posttest, method = "paired")

## [1] 0.4774635

专题二、t检验

Wei Wei

2022-10-09