1.数据准备和描述

# 导入数据
# 如果联网,从Gitee上导入数据,或者下载后导入
library(readr)
ldt_data <- read_csv("https://gitee.com/vv_victorwei/statistics_for_undergraduate/raw/master/dataset/ldt_data.csv")
## Rows: 100 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): id
## dbl (6): age, language, rt_word, rt_nonword, acc_word, acc_nonword
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 导入ggplot2
library(ggplot2)

# 如果想要在统计图中使用中文,可以导入显示汉字的库
library(showtext)
## Warning: 程辑包'showtext'是用R版本4.3.1 来建造的
## 载入需要的程辑包:sysfonts
## Warning: 程辑包'sysfonts'是用R版本4.3.1 来建造的
## 载入需要的程辑包:showtextdb
## Warning: 程辑包'showtextdb'是用R版本4.3.1 来建造的
showtext_auto() 

#此处自动使用字体,如果想要修改字体,可以参考:https://rpubs.com/maomaoworm/679092
# 数据变量描述
# •Participant information:
# – id: participant ID
# – age: age
# • One between-subjects independent variable (IV):
# – language: language group (1 = monolingual,
# 2 = bilingual)
# • Four columns for the two dependent variables
# (DVs) of reaction time (RT) and accuracy, crossed
# by the within-subjects IV of condition:
# – rt_word: reaction time (milliseconds) for
# word trials
# – rt_nonword: reaction time (milliseconds) for
# nonword trials
# – acc_word: accuracy for word trials
# – acc_nonword: accuracy for nonword trials.

# 修改language的变量类型为factor
ldt_data$language<-factor(ldt_data$language)

2.各种类型统计图的绘制

散点图

# 最基本的散点图,x和y如果按照顺序,可以省略x=和y=
ggplot(ldt_data, aes(x = age, y = rt_word)) +
  geom_point()

ggplot(ldt_data, aes(x = age, y = rt_word)) +
  geom_point(aes(colour='red'))

ggplot(ldt_data, aes(x = age, y = rt_word)) +
    geom_point(shape = 3)

ggplot(ldt_data, aes(x = age, y = rt_word)) +
    geom_point(shape = '点')

# 绘制年龄和阅读词语时间的散点图

ggplot(ldt_data, aes(x = age, y = rt_word)) +
  geom_point() +
  geom_smooth(method = "lm",se=FALSE,colour='red')
## `geom_smooth()` using formula = 'y ~ x'

#如果没有指定直线,会拟合曲线
ggplot(ldt_data, aes(x = age, y = rt_word)) +
  geom_point() +
  geom_smooth(se=FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# 分组(单语,双语),绘制年龄和阅读词语时间的散点图

ggplot(ldt_data, aes(x = age, y = rt_word, color=language)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

#修改坐标系的变量名以及分组变量的名称及颜色,使用scale_colour_manual

ggplot(ldt_data, aes(x = age, y = rt_word, color=language)) +
  geom_point() +
  geom_smooth(method = "lm")+
  xlab("年龄")+
  ylab("反应时")+
  scale_colour_manual(name = "语言类型",labels = c("单语", "双语"),values=c('red','blue'))
## `geom_smooth()` using formula = 'y ~ x'

#使用预先设定的颜色方案,使用scale_color_brewer

ggplot(ldt_data, aes(x = age, y = rt_word, color=language)) +
  geom_point() +
  geom_smooth(method = "lm")+
  xlab("年龄(岁)")+
  ylab("反应时(毫秒)")+
  scale_color_brewer(palette = "Dark2",name = "语言类型",labels = c("单语", "双语"))
## `geom_smooth()` using formula = 'y ~ x'

#采用多个面板呈现多个相似图
# ~language表明分组/板变量,labeller修改分组变量水平(为了避免这种修改,可以将变量值直接设定为单语和双语)

ggplot(ldt_data, aes(x = age, y = rt_word)) +
  geom_point() +
  geom_smooth(method = "lm")+
  xlab("年龄(岁)")+
  ylab("反应时(毫秒)")+
  facet_wrap(~language,
             labeller = labeller(language = c('1' = "单语",'2' = "双语")))
## `geom_smooth()` using formula = 'y ~ x'

#采用patchwork直接用+进行展示
library(patchwork)
## Warning: 程辑包'patchwork'是用R版本4.3.2 来建造的
p1<-ggplot(ldt_data, aes(x = age, y = rt_word, color=language)) +
  geom_point() +
  geom_smooth(method = "lm")
p2<-ggplot(ldt_data, aes(x = age, y = rt_word)) +
    geom_point(shape = '点')
p1+p2+p1+p2
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

线图

# 读取在线数据,或下载后导入
library(readxl)
url <- "https://gitee.com/vv_victorwei/statistics_for_undergraduate/raw/master/dataset/shmetro.xlsx"
destfile <- "shmetro.xlsx"
curl::curl_download(url, destfile)
shmetro <- read_excel(destfile, col_names = FALSE)
## New names:
## • `` -> `...1`
## • `` -> `...2`
# 修改变量名
names(shmetro)<-c('date','count')

# 绘制线图,在指定时间,上海地铁客运量变化
ggplot(shmetro,aes(date,count))+
  geom_line(color='red')

直方图

#直方图展示单个变量的分布情况

ggplot(ldt_data, aes(x = rt_word)) +
  geom_histogram(binwidth = 10, fill = "white", colour = "black") +
  scale_x_continuous(name = "反应时(毫秒)")

ggplot(ldt_data, aes(x = rt_word)) +
  scale_x_continuous(name = "反应时(毫秒)")+
  geom_freqpoly(binwidth = 10)

#分组呈现直方图
ggplot(ldt_data, aes(x = rt_word,fill = language, color=language)) +
  geom_histogram(binwidth = 10,alpha=0.5) +
  scale_x_continuous(name = "反应时(毫秒)")

# 使用两个面板
ggplot(ldt_data, aes(x = rt_word)) +
  geom_histogram(binwidth = 10, fill = "white", colour = "black") +
  facet_wrap(~language)+
  scale_x_continuous(name = "反应时(毫秒)")

条形图

# x和y采用不同类型变量,一个是discrete离散,一个是连续的。离散可以指定x轴名称,连续可以指定间隔

# 直接一个变量的话,默认统计频次
ggplot(ldt_data,aes(x = language)) +
  geom_bar()+
  scale_x_discrete(name = "语言类型", 
                   labels = c("单语", "双语"))+
  scale_y_continuous(name='数量',
                     breaks = c(0,10,20,30,40,50))

#可以指定显示language各组的均值
ggplot(ldt_data,aes(x = language,y=rt_word)) +
  geom_bar(stat = 'summary_bin',fun=mean)+

ggplot(ldt_data,aes(x = language,y=rt_word)) +
  stat_summary(fun = "mean", geom = "bar", size = 3) 
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

##箱线图或小提琴图

#单个变量的箱线图
ggplot(ldt_data, aes(rt_word)) + geom_boxplot()

# 分组的箱线图
ggplot(ldt_data, aes(language, rt_word)) + geom_boxplot()

ggplot(ldt_data, aes(language, rt_word)) + geom_violin()

包含均值和标准误的差异比较图

# ggplot一般只对数据直接进行绘图,如提供均值可以绘制图形。但如果需要对原始数据进行计算后的统计量制图,需要进行统计操作

df<-data.frame(age=c(4,5,6,4,5,6),
              condition=rep(c('congruent','incongruent'),each=3),
              mean=c(1055,704,730,1238,769,803),
              sd=c(15,13,15,19,14,13) 
               )

ggplot(df, aes(x=factor(age), y=mean, color=condition)) + 
    geom_line(aes(group = condition,colour=condition)) + 
    geom_point(size=3) + 
    geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd),width=0.2)+
    labs(x="年龄",y="反应时间(ms)")+ 
    scale_y_continuous(limits = c(500,1300))+
    scale_x_discrete(labels=c('4岁','5岁','6岁'))+
    scale_color_brewer(palette = "Dark2",name = "朝向类型",labels = c("一致", "不一致"))

ggplot(ldt_data, aes(x = language, y = rt_word)) +
  stat_summary(fun = "mean", geom = "point", size = 3) +
  stat_summary(fun.data = "mean_se", geom = "errorbar", width = .2) +
  scale_x_discrete(name = "语言类型", 
                   labels = c("单语", "双语"))+
  scale_y_continuous(name="反应时",
                     limits = c(330,360))
## Warning: Removed 73 rows containing non-finite values (`stat_summary()`).
## Removed 73 rows containing non-finite values (`stat_summary()`).

# 包含被试间变量和被试内变量的均值和标准误的差异比较图

# 首先需要整理数据,将短宽型转化为长窄型,即被试内变量作为个体的不同水平,以下演示不用tidyverse的做法

# s1 rt_word rt_nonword 改为
# s1 condition1 rt1
# s1 condition2 rt2
# 将被试间变量重复一遍,增加condition变量,分别为word和nonword,然后将两种条件下的rt和acc合并即可
long_data<-data.frame(id=rep(ldt_data$id,2),
                      age=rep(ldt_data$age,2),
                      language=rep(ldt_data$language,2),
                      condition=rep(c('word','nonword'),each=dim(ldt_data)[1]),
                      rt=c(ldt_data$rt_word,ldt_data$rt_nonword),
                      acc=c(ldt_data$acc_word,ldt_data$acc_nonword)
                      )

ggplot(long_data, aes(x = condition, y = rt, 
                     shape = language,
                     group = language,
                     color = language)) +
  stat_summary(fun = "mean", geom = "point", size = 3) +
  stat_summary(fun = "mean", geom = "line") +
  stat_summary(fun.data = "mean_se", geom = "errorbar", width = .2) +
  scale_color_manual(values = c("blue", "darkorange")) +
  scale_x_discrete(name = "语言类型", 
                   labels = c("单语", "双语"))+
  theme_classic()

# 对于被试内变量,往往采用折线反映不同水平上的变化,尤其是对于重复测量数据中,使用更频繁
ggplot(long_data, aes(x = condition, y = rt, group = language, shape = language)) +
  geom_point(aes(colour = language),alpha = .2) +
  geom_line(aes(group = id, colour = language), alpha = .2) +
   stat_summary(fun = "mean", geom = "point", size = 2, colour = "black") +
  stat_summary(fun = "mean", geom = "line", colour = "black") +
  stat_summary(fun.data = "mean_se", geom = "errorbar", width = .2, colour = "black") +
  theme_minimal()