# Turn the numeric gender code into a factor: 0 is labelled "boy", 1 "girl".
# The factor is written to a new column instead of overwriting gender. Reusing
# the same name is possible but not recommended: an overwrite in R cannot be
# undone, so a mistake would mean restarting the analysis from scratch.
# Load the saved workspace before the first run; in normal use this line is
# not needed.
load(".RData")
eee_data[["gender_f"]] <- factor(
  eee_data[["gender"]],
  levels = c(0, 1),
  labels = c("boy", "girl")
)
head(eee_data[["gender"]])
## [1] 1 1 1 1 1 1
head(eee_data[["gender_f"]])
## [1] girl girl girl girl girl girl
## Levels: boy girl
# Factorize birth_Jan_June the same way: 1 = born January-June,
# 0 = born July-December. The levels are given as c(1, 0), so "Jan-Jun" is
# the first level of the resulting factor.
eee_data[["birth_Jan_June_f"]] <- factor(
  eee_data[["birth_Jan_June"]],
  levels = c(1, 0),
  labels = c("Jan-Jun", "Jul-Dec")
)
head(eee_data[["birth_Jan_June"]])
## [1] 1 1 0 1 0 1
head(eee_data[["birth_Jan_June_f"]])
## [1] Jan-Jun Jan-Jun Jul-Dec Jan-Jun Jul-Dec Jan-Jun
## Levels: Jan-Jun Jul-Dec
# Factorize hometown: 0 = south, 1 = north.
eee_data[["hometown_f"]] <- factor(
  eee_data[["hometown"]],
  levels = c(0, 1),
  labels = c("south", "north")
)
head(eee_data[["hometown"]])
## [1] 0 0 0 0 0 0
head(eee_data[["hometown_f"]])
## [1] south south south south south south
## Levels: south north
# Convert zhiyuan into an ordered factor: codes 1, 2, 3 are labelled as
# first, second and third choice, with first < second < third.
# The third label was previously misspelled as "thrid choice"; any code that
# matched the old spelling must be updated to "third choice".
eee_data$zhiyuan_f <- ordered(
  eee_data$zhiyuan,
  levels = c(1, 2, 3),
  labels = c("first choice", "second choice", "third choice")
)
head(eee_data$zhiyuan)
## [1] 3 3 1 2 3 3
head(eee_data$zhiyuan_f)
## [1] third choice third choice first choice second choice third choice
## [6] third choice
## Levels: first choice < second choice < third choice
# Example 1: uses the eee_data data set.
# Question: could the mean height plausibly be 1.50?
# Some heights were entered in the wrong unit; any value above 100 is
# multiplied by 0.01, all other values are kept as they are.
eee_data[["height_c"]] <- with(
  eee_data,
  ifelse(height > 100, 0.01 * height, height)
)
# One-sample t-test with t.test().
t.test(eee_data$height_c, mu = 1.50)
##
## One Sample t-test
##
## data: eee_data$height_c
## t = 32.05, df = 174, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 1.5
## 95 percent confidence interval:
## 1.627571 1.644314
## sample estimates:
## mean of x
## 1.635943
# Effect size: Cohen's d.
library(lsr)
cohensD(eee_data[["height_c"]], mu = 1.50)
## [1] 2.422771
# Example 2: the classroom example.
# Import the data (an SPSS .sav file read with haven::read_sav()).
library(haven)
e1_vocabulary <- read_sav(
  "https://gitee.com/vv_victorwei/r-language-data-analysis/raw/master/%E5%9D%87%E5%80%BC%E5%88%86%E6%9E%90/4.5%20E1%20children%20vocabulary%20test.sav"
)
head(e1_vocabulary)
## # A tibble: 6 × 2
## student vocabulary
## <dbl> <dbl>
## 1 101 47
## 2 102 48
## 3 103 49
## 4 104 49
## 5 105 51
## 6 106 52
# One-sample t-test of the vocabulary scores against mu = 50.
t.test(e1_vocabulary$vocabulary, mu = 50)
##
## One Sample t-test
##
## data: e1_vocabulary$vocabulary
## t = 2.2804, df = 19, p-value = 0.0343
## alternative hypothesis: true mean is not equal to 50
## 95 percent confidence interval:
## 50.15611 53.64389
## sample estimates:
## mean of x
## 51.9
# Effect size: Cohen's d.
library(lsr)
cohensD(e1_vocabulary[["vocabulary"]], mu = 50)
## [1] 0.5099094
例1的结果显示:t = 32.05,df = 174,p-value < 2.2e-16,结果显著,说明平均身高与1.50有显著差异,不大可能等于1.50;平均身高的95%置信区间为1.628~1.644。
考察南方和北方同学的身高有没有差异。
# Two-sided F test comparing the variance of height_c between the two
# hometown groups.
var.test(
  height_c ~ hometown,
  alternative = "two.sided",
  data = eee_data
)
##
## F test to compare two variances
##
## data: height_c by hometown
## F = 0.80177, num df = 102, denom df = 71, p-value = 0.3046
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.5164539 1.2239023
## sample estimates:
## ratio of variances
## 0.801773
结果不显著(p = 0.3046),没有证据表明两组方差不等,可以认为满足方差齐性。
# The variance-homogeneity test in the previous step was not significant, so
# t.test() is told var.equal = TRUE; otherwise it would be set to FALSE.
t.test(
  height_c ~ hometown,
  data = eee_data,
  var.equal = TRUE
)
##
## Two Sample t-test
##
## data: height_c by hometown
## t = -0.41546, df = 173, p-value = 0.6783
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.02064290 0.01346383
## sample estimates:
## mean in group 0 mean in group 1
## 1.634466 1.638056
# Effect size: Cohen's d.
library(lsr)
cohensD(
  height_c ~ hometown,
  data = eee_data
)
## [1] 0.06382035
结果显示,t = -0.41546, df = 173, p-value = 0.6783,结果不显著,说明南北方同学的身高没有显著差异。
# Classroom example: compare score between the two groups.
library(haven)
e2_gender <- read_sav(
  "https://gitee.com/vv_victorwei/r-language-data-analysis/raw/master/%E5%9D%87%E5%80%BC%E5%88%86%E6%9E%90/4.5%20E2%20gender%20difference%20.sav"
)
head(e2_gender)
## # A tibble: 6 × 2
## group score
## <dbl+lbl> <dbl>
## 1 1 [boy] 70
## 2 1 [boy] 68.3
## 3 1 [boy] 86.7
## 4 1 [boy] 70
## 5 1 [boy] 75.0
## 6 1 [boy] 58.0
# Factorize the group (gender) code: 1 is labelled "male", 2 "female".
e2_gender[["group_f"]] <- factor(
  e2_gender[["group"]],
  levels = c(1, 2),
  labels = c("male", "female")
)
# Test for homogeneity of variance (two-sided F test).
var.test(
  score ~ group_f,
  alternative = "two.sided",
  data = e2_gender
)
##
## F test to compare two variances
##
## data: score by group_f
## F = 1.167, num df = 19, denom df = 19, p-value = 0.7398
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.4619204 2.9484167
## sample estimates:
## ratio of variances
## 1.167019
# Independent-samples t-test with equal variances assumed.
t.test(
  score ~ group_f,
  data = e2_gender,
  var.equal = TRUE
)
##
## Two Sample t-test
##
## data: score by group_f
## t = -1.8412, df = 38, p-value = 0.0734
## alternative hypothesis: true difference in means between group male and group female is not equal to 0
## 95 percent confidence interval:
## -13.4992797 0.6396388
## sample estimates:
## mean in group male mean in group female
## 67.13784 73.56766
# Effect size: Cohen's d.
library(lsr)
cohensD(
  score ~ group_f,
  data = e2_gender
)
## [1] 0.5822476
假设a1 - a6反映了专业认同的第一个维度A,而a7 - a13反映了专业认同的第二个维度B,那么维度A和维度B的得分有没有差异呢?
# Score dimensions A and B as row means computed with rowMeans(); missing
# values are dropped within each row (na.rm = TRUE).
# NOTE(review): columns are picked by position (19:24 and 25:31); presumably
# these are items a1-a6 and a7-a13 -- confirm against the column order of
# eee_data.
eee_data[["zyrt_A"]] <- rowMeans(eee_data[19:24], na.rm = TRUE)
eee_data[["zyrt_B"]] <- rowMeans(eee_data[25:31], na.rm = TRUE)
# Paired-samples t-test with t.test(paired = TRUE).
t.test(
  eee_data$zyrt_A,
  eee_data$zyrt_B,
  paired = TRUE
)
##
## Paired t-test
##
## data: eee_data$zyrt_A and eee_data$zyrt_B
## t = -0.82072, df = 174, p-value = 0.4129
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.09635435 0.03975571
## sample estimates:
## mean difference
## -0.02829932
# Effect size: Cohen's d for paired data.
library(lsr)
cohensD(
  eee_data[["zyrt_A"]],
  eee_data[["zyrt_B"]],
  method = "paired"
)
## [1] 0.0620406
结果显示,t = -0.82072, df = 174, p-value = 0.4129,结果不显著,认为维度A和维度B的得分没有显著差异。
# Classroom example: pretest versus posttest scores of the same students.
library(haven)
e3_test <- read_sav(
  "https://gitee.com/vv_victorwei/r-language-data-analysis/raw/master/%E5%9D%87%E5%80%BC%E5%88%86%E6%9E%90/4.5%20E3%20pretest%20and%20posttest.sav"
)
head(e3_test)
## # A tibble: 6 × 3
## 学号 pretest posttest
## <dbl> <dbl> <dbl>
## 1 4 25.8 30
## 2 96 41.5 43
## 3 12 45.5 50
## 4 13 45.7 50
## 5 77 49.4 55
## 6 6 50.4 56
# Paired-samples t-test with t.test(paired = TRUE).
t.test(
  e3_test$pretest,
  e3_test$posttest,
  paired = TRUE
)
##
## Paired t-test
##
## data: e3_test$pretest and e3_test$posttest
## t = -4.7746, df = 99, p-value = 6.234e-06
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -6.128281 -2.530084
## sample estimates:
## mean difference
## -4.329182
# Effect size: Cohen's d for paired data.
library(lsr)
cohensD(
  e3_test[["pretest"]],
  e3_test[["posttest"]],
  method = "paired"
)
## [1] 0.4774635