1.数据准备和描述

# 导入数据
# 如果联网，从Gitee上导入数据，或者下载后导入
library(readr)
ldt_data <- read_csv("https://gitee.com/vv_victorwei/statistics_for_undergraduate/raw/master/dataset/ldt_data.csv")

## Rows: 100 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): id
## dbl (6): age, language, rt_word, rt_nonword, acc_word, acc_nonword
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# 导入ggplot2
library(ggplot2)

# 如果想要在统计图中使用中文，可以导入显示汉字的库
library(showtext)

## Warning: 程辑包'showtext'是用R版本4.3.1 来建造的

## 载入需要的程辑包：sysfonts

## Warning: 程辑包'sysfonts'是用R版本4.3.1 来建造的

## 载入需要的程辑包：showtextdb

## Warning: 程辑包'showtextdb'是用R版本4.3.1 来建造的

showtext_auto() 

#此处自动使用字体，如果想要修改字体，可以参考：https://rpubs.com/maomaoworm/679092

# 数据变量描述
# •Participant information:
# – id: participant ID
# – age: age
# • One between-subjects independent variable (IV):
# – language: language group (1 = monolingual,
# 2 = bilingual)
# • Four columns for the two dependent variables
# (DVs) of reaction time (RT) and accuracy, crossed
# by the within-subjects IV of condition:
# – rt_word: reaction time (milliseconds) for
# word trials
# – rt_nonword: reaction time (milliseconds) for
# nonword trials
# – acc_word: accuracy for word trials
# – acc_nonword: accuracy for nonword trials.

# 修改language的变量类型为factor
ldt_data$language<-factor(ldt_data$language)

2.各种类型统计图的绘制

散点图

# 最基本的散点图，x和y如果按照顺序，可以省略x=和y=
ggplot(ldt_data, aes(x = age, y = rt_word)) +
  geom_point()

ggplot(ldt_data, aes(x = age, y = rt_word)) +
  geom_point(aes(colour='red'))

ggplot(ldt_data, aes(x = age, y = rt_word)) +
    geom_point(shape = 3)

ggplot(ldt_data, aes(x = age, y = rt_word)) +
    geom_point(shape = '点')

# 绘制年龄和阅读词语时间的散点图

ggplot(ldt_data, aes(x = age, y = rt_word)) +
  geom_point() +
  geom_smooth(method = "lm",se=FALSE,colour='red')

## `geom_smooth()` using formula = 'y ~ x'

#如果没有指定直线，会拟合曲线
ggplot(ldt_data, aes(x = age, y = rt_word)) +
  geom_point() +
  geom_smooth(se=FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# 分组（单语，双语），绘制年龄和阅读词语时间的散点图

ggplot(ldt_data, aes(x = age, y = rt_word, color=language)) +
  geom_point() +
  geom_smooth(method = "lm")

## `geom_smooth()` using formula = 'y ~ x'

#修改坐标系的变量名以及分组变量的名称及颜色,使用scale_colour_manual

ggplot(ldt_data, aes(x = age, y = rt_word, color=language)) +
  geom_point() +
  geom_smooth(method = "lm")+
  xlab("年龄")+
  ylab("反应时")+
  scale_colour_manual(name = "语言类型",labels = c("单语", "双语"),values=c('red','blue'))

## `geom_smooth()` using formula = 'y ~ x'

#使用预先设定的颜色方案,使用scale_color_brewer

ggplot(ldt_data, aes(x = age, y = rt_word, color=language)) +
  geom_point() +
  geom_smooth(method = "lm")+
  xlab("年龄（岁）")+
  ylab("反应时（毫秒）")+
  scale_color_brewer(palette = "Dark2",name = "语言类型",labels = c("单语", "双语"))

## `geom_smooth()` using formula = 'y ~ x'

#采用多个面板呈现多个相似图
# ~language表明分组/板变量，labeller修改分组变量水平（为了避免这种修改，可以将变量值直接设定为单语和双语）

ggplot(ldt_data, aes(x = age, y = rt_word)) +
  geom_point() +
  geom_smooth(method = "lm")+
  xlab("年龄（岁）")+
  ylab("反应时（毫秒）")+
  facet_wrap(~language,
             labeller = labeller(language = c('1' = "单语",'2' = "双语")))

## `geom_smooth()` using formula = 'y ~ x'

#采用patchwork直接用+进行展示
library(patchwork)

## Warning: 程辑包'patchwork'是用R版本4.3.2 来建造的

p1<-ggplot(ldt_data, aes(x = age, y = rt_word, color=language)) +
  geom_point() +
  geom_smooth(method = "lm")
p2<-ggplot(ldt_data, aes(x = age, y = rt_word)) +
    geom_point(shape = '点')
p1+p2+p1+p2

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

线图

# 读取在线数据，或下载后导入
library(readxl)
url <- "https://gitee.com/vv_victorwei/statistics_for_undergraduate/raw/master/dataset/shmetro.xlsx"
destfile <- "shmetro.xlsx"
curl::curl_download(url, destfile)
shmetro <- read_excel(destfile, col_names = FALSE)

## New names:
## • `` -> `...1`
## • `` -> `...2`

# 修改变量名
names(shmetro)<-c('date','count')

# 绘制线图,在指定时间，上海地铁客运量变化
ggplot(shmetro,aes(date,count))+
  geom_line(color='red')

直方图

#直方图展示单个变量的分布情况

ggplot(ldt_data, aes(x = rt_word)) +
  geom_histogram(binwidth = 10, fill = "white", colour = "black") +
  scale_x_continuous(name = "反应时（毫秒）")

ggplot(ldt_data, aes(x = rt_word)) +
  scale_x_continuous(name = "反应时（毫秒）")+
  geom_freqpoly(binwidth = 10)

#分组呈现直方图
ggplot(ldt_data, aes(x = rt_word,fill = language, color=language)) +
  geom_histogram(binwidth = 10,alpha=0.5) +
  scale_x_continuous(name = "反应时（毫秒）")

# 使用两个面板
ggplot(ldt_data, aes(x = rt_word)) +
  geom_histogram(binwidth = 10, fill = "white", colour = "black") +
  facet_wrap(~language)+
  scale_x_continuous(name = "反应时（毫秒）")

条形图

# x和y采用不同类型变量，一个是discrete离散，一个是连续的。离散可以指定x轴名称，连续可以指定间隔

# 直接一个变量的话，默认统计频次
ggplot(ldt_data,aes(x = language)) +
  geom_bar()+
  scale_x_discrete(name = "语言类型", 
                   labels = c("单语", "双语"))+
  scale_y_continuous(name='数量',
                     breaks = c(0,10,20,30,40,50))

#可以指定显示language各组的均值
ggplot(ldt_data,aes(x = language,y=rt_word)) +
  geom_bar(stat = 'summary_bin',fun=mean)+

ggplot(ldt_data,aes(x = language,y=rt_word)) +
  stat_summary(fun = "mean", geom = "bar", size = 3)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

##箱线图或小提琴图

#单个变量的箱线图
ggplot(ldt_data, aes(rt_word)) + geom_boxplot()

# 分组的箱线图
ggplot(ldt_data, aes(language, rt_word)) + geom_boxplot()

ggplot(ldt_data, aes(language, rt_word)) + geom_violin()

包含均值和标准误的差异比较图

# ggplot一般只对数据直接进行绘图，如提供均值可以绘制图形。但如果需要对原始数据进行计算后的统计量制图，需要进行统计操作

df<-data.frame(age=c(4,5,6,4,5,6),
              condition=rep(c('congruent','incongruent'),each=3),
              mean=c(1055,704,730,1238,769,803),
              sd=c(15,13,15,19,14,13) 
               )

ggplot(df, aes(x=factor(age), y=mean, color=condition)) + 
    geom_line(aes(group = condition,colour=condition)) + 
    geom_point(size=3) + 
    geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd),width=0.2)+
    labs(x="年龄",y="反应时间(ms)")+ 
    scale_y_continuous(limits = c(500,1300))+
    scale_x_discrete(labels=c('4岁','5岁','6岁'))+
    scale_color_brewer(palette = "Dark2",name = "朝向类型",labels = c("一致", "不一致"))

ggplot(ldt_data, aes(x = language, y = rt_word)) +
  stat_summary(fun = "mean", geom = "point", size = 3) +
  stat_summary(fun.data = "mean_se", geom = "errorbar", width = .2) +
  scale_x_discrete(name = "语言类型", 
                   labels = c("单语", "双语"))+
  scale_y_continuous(name="反应时",
                     limits = c(330,360))

## Warning: Removed 73 rows containing non-finite values (`stat_summary()`).
## Removed 73 rows containing non-finite values (`stat_summary()`).

# 包含被试间变量和被试内变量的均值和标准误的差异比较图

# 首先需要整理数据，将短宽型转化为长窄型，即被试内变量作为个体的不同水平，以下演示不用tidyverse的做法

# s1 rt_word rt_nonword 改为
# s1 condition1 rt1
# s1 condition2 rt2
# 将被试间变量重复一遍，增加condition变量，分别为word和nonword，然后将两种条件下的rt和acc合并即可
long_data<-data.frame(id=rep(ldt_data$id,2),
                      age=rep(ldt_data$age,2),
                      language=rep(ldt_data$language,2),
                      condition=rep(c('word','nonword'),each=dim(ldt_data)[1]),
                      rt=c(ldt_data$rt_word,ldt_data$rt_nonword),
                      acc=c(ldt_data$acc_word,ldt_data$acc_nonword)
                      )

ggplot(long_data, aes(x = condition, y = rt, 
                     shape = language,
                     group = language,
                     color = language)) +
  stat_summary(fun = "mean", geom = "point", size = 3) +
  stat_summary(fun = "mean", geom = "line") +
  stat_summary(fun.data = "mean_se", geom = "errorbar", width = .2) +
  scale_color_manual(values = c("blue", "darkorange")) +
  scale_x_discrete(name = "语言类型", 
                   labels = c("单语", "双语"))+
  theme_classic()

# 对于被试内变量，往往采用折线反映不同水平上的变化，尤其是对于重复测量数据中，使用更频繁
ggplot(long_data, aes(x = condition, y = rt, group = language, shape = language)) +
  geom_point(aes(colour = language),alpha = .2) +
  geom_line(aes(group = id, colour = language), alpha = .2) +
   stat_summary(fun = "mean", geom = "point", size = 2, colour = "black") +
  stat_summary(fun = "mean", geom = "line", colour = "black") +
  stat_summary(fun.data = "mean_se", geom = "errorbar", width = .2, colour = "black") +
  theme_minimal()

ggplot2_learning

Victor

2023-11-16

1.数据准备和描述

2.各种类型统计图的绘制

散点图

线图

直方图

条形图

包含均值和标准误的差异比较图