lab2

one of HW solutions

library(tidyverse)
theme_set(theme_minimal())

students_data <- read_csv("~/r_viz_eu/lab1/data/students_data.csv")
students_data |> head()

## # A tibble: 6 × 10
##   profile     days_in_a_row use_days finish_late start_early correct mean_time
##   <chr>               <dbl>    <dbl>       <dbl>       <dbl>   <dbl>     <dbl>
## 1 disengaged              1        3          11           1   0.600      44.4
## 2 engaged                 4       13           8           1   0.829      25.8
## 3 utilitarian             3        1           2           2   0.738      26.6
## 4 engaged                 4        7          14           3   0.934      36.1
## 5 utilitarian             4        2           3           3   0.556      34.9
## 6 engaged                 7       14          13           2   0.637      18.6
## # … with 3 more variables: min_time <dbl>, max_time <dbl>, attemps <dbl>

students_data |>
  # mutate_all(as.integer) |>
  pivot_longer(c(everything(),-profile),
               names_to = "measures",
               values_to = "values") -> df_to_plt
df_to_plt |> 
  head()

## # A tibble: 6 × 3
##   profile    measures      values
##   <chr>      <chr>          <dbl>
## 1 disengaged days_in_a_row  1    
## 2 disengaged use_days       3    
## 3 disengaged finish_late   11    
## 4 disengaged start_early    1    
## 5 disengaged correct        0.600
## 6 disengaged mean_time     44.4

df_to_plt |> 
  ggplot(aes(y = values, x = measures, color = profile))+
  geom_jitter(position = position_jitter(seed = 2019, width = 0.3, height = 0), alpha = 0.3)

df_to_plt |> 
  filter(measures != "attemps") |> 
  ggplot(aes(y = values, x = measures, color = profile))+
  geom_point(position = position_jitter(seed = 2019, width = 0.3, height = 0), alpha = 0.3)

df_to_plt |> 
  filter(measures != "attemps") |> 
  ggplot(aes(y = values, x = measures, color = profile))+
  geom_point(position = position_jitter(seed = 2019, width = 0.3, height = 0), alpha = 0.3)

students_data |>
  mutate(across(where(is.numeric), ~ . - mean(.)))

## # A tibble: 190 × 10
##    profile     days_in_a_row use_days finish_late start_early correct mean_time
##    <chr>               <dbl>    <dbl>       <dbl>       <dbl>   <dbl>     <dbl>
##  1 disengaged         -2.55    -3.31        2.87        -3.36 -0.0731     11.5 
##  2 engaged             0.453    6.69       -0.126       -3.36  0.156      -7.15
##  3 utilitarian        -0.547   -5.31       -6.13        -2.36  0.0653     -6.29
##  4 engaged             0.453    0.689       5.87        -1.36  0.261       3.20
##  5 utilitarian         0.453   -4.31       -5.13        -1.36 -0.117       1.96
##  6 engaged             3.45     7.69        4.87        -2.36 -0.0361    -14.3 
##  7 engaged             2.45    -0.311      -0.126       -4.36  0.0943     -8.95
##  8 utilitarian         0.453   -3.31       -5.13        -2.36  0.191      22.0 
##  9 disengaged         -2.55    -5.31        8.87         5.64 -0.149      22.6 
## 10 disengaged         -1.55    -4.31        7.87         7.64 -0.212      10.5 
## # … with 180 more rows, and 3 more variables: min_time <dbl>, max_time <dbl>,
## #   attemps <dbl>

students_data |>
  mutate(across(where(is.numeric), ~ (. / max(.)) |> round(3))) |> 
  pivot_longer(c(everything(),-profile),
               names_to = "measures",
               values_to = "values") -> df_to_plt_poms

df_to_plt_poms

## # A tibble: 1,710 × 3
##    profile    measures      values
##    <chr>      <chr>          <dbl>
##  1 disengaged days_in_a_row  0.125
##  2 disengaged use_days       0.176
##  3 disengaged finish_late    0.478
##  4 disengaged start_early    0.05 
##  5 disengaged correct        0.601
##  6 disengaged mean_time      0.573
##  7 disengaged min_time       0.368
##  8 disengaged max_time       0.2  
##  9 disengaged attemps        0.068
## 10 engaged    days_in_a_row  0.5  
## # … with 1,700 more rows

df_to_plt_poms |> 
    ggplot(aes(y = values, x = measures, color = profile))+
  geom_point(position = position_jitter(seed = 2019, width = 0.3, height = 0), alpha = 0.3)

df_to_plt_poms |> 
  ggplot(aes(y = values, x = measures, color = profile))+
  geom_jitter(position = position_jitter(seed = 2019, width = 0.3, height = 0.00), alpha = 0.2)+
  geom_boxplot(alpha = 0.5, outlier.shape = NA, coef = 0)+
  ggtitle("Latent Profiles of Engagement" )+
  scale_color_brewer(palette = "Set2")+
  xlab("")+
  scale_x_discrete(guide = guide_axis(n.dodge=2))+
  theme(legend.position = c(0.8, 0.8),
      axis.title = element_text(size = 16),
      axis.text.x = element_text(family = "Roboto Mono", size = 12),
      axis.text.y = element_text(family = "Roboto Mono", size = 12),
      legend.text = element_text(family = "Roboto Mono", size = 10, color = "grey42"),
      legend.title = element_text(family = "Roboto Mono", size = 12, color = "grey52"),
      # panel.grid = element_blank(),
      legend.key = element_rect(colour = "transparent", fill = "transparent")
)+
  ylim(c(0, 1))+
    annotate(
    "text", x = 1.5, y = 0.85, family = "Poppins", size = 3, color = "gray20",
    label = "Maximum: \n 4090 attempts"
  )+
  annotate(
    "text", x = 5, y = 0.15, family = "Poppins", size = 3, color = "gray20",
    label = "2 days before \n the exam"
  )+
  annotate(
    "text", x = 6, y = 0.7, family = "Poppins", size = 3, color = "gray20",
    label = "13% of attempts \n in gamified modes"
  )+
  geom_curve(
    data = data.frame(
      x1 = c(1.5, 5, 6),
      y1 = c(0.9, 0.2 ,0.75),
      x2 = c(1.2, 6 ,5.2),
      y2 = c(1, 0.38, 0.867)
    ),
    aes(
      x = x1,
      y = y1,
      xend = x2,
      yend = y2
    ),
    arrow = arrow(length = unit(0.07, "inch")),
    size = 0.4,
    color = "gray20",
    curvature = 0.4
  ) -> poms_plot

poms_plot

ggsave(filename = "LPA_GGPLOT4_fixed.png", width = 10, height = 5)

use some of this code to improve your hm graph (10 minutes)

highlight your stuff

Introduction to gghighlight (r-project.org)

library(gghighlight)

df_to_plt_poms |> 
    ggplot(aes(y = values, x = measures, fill = profile))+
geom_violin(draw_quantiles = c(0.25, 0.5, 0.75))+
  gghighlight(measures == "mean_time")

add proper dotplot

df_to_plt_poms |>
  ggplot(aes(y = values, x = measures, fill = profile)) +
  geom_dotplot(
    binaxis = 'y',
    stackdir = 'center',
    position = position_dodge(0.8),
    binwidth = 0.01
  ) +
  geom_boxplot(alpha = 0.7)+
  gghighlight(measures == "mean_time")

df_to_plt_poms |>
  ggplot(aes(x = measures, y = values)) +
  geom_boxplot(aes(fill = profile))+
  facet_wrap(~profile) +
  gghighlight()

iris |> 
  ggplot(aes(Species, Sepal.Length)) +
  geom_boxplot(aes(fill = Species)) +
  facet_wrap(~Species) +
  gghighlight() +
  theme(legend.position = "top") +
  labs(title = "Example 4 - highlight for box plot")

more statistics

ggplot2 Based Plots with Statistical Details • ggstatsplot (indrajeetpatil.github.io)

# library(ggstatsplot)
# ## since the confidence intervals for the effect sizes are computed using
# ## bootstrapping, important to set a seed for reproducibility
# set.seed(123)
# 
# ## function call
# ggbetweenstats(
#   data = dplyr::filter(gapminder::gapminder, year == 2007, continent != "Oceania"),
#   x = continent,
#   y = lifeExp
# )

ridges

library(ggridges)

df_to_plt_poms |> 
  ggplot(aes(x = values, y = measures))+
  geom_density_ridges()

df_to_plt_poms |> 
  ggplot(aes(x = values, y = measures, fill = profile))+
  geom_density_ridges_gradient()

df_to_plt_poms |> 
  ggplot(aes(x = values, y = measures,fill = stat(x)))+
  geom_density_ridges_gradient()+
  scale_fill_viridis_c(name = "Temp. [F]", option = "C")

df_to_plt_poms |> 
  ggplot(aes(x = values, y = measures))+
  geom_density_ridges(
    jittered_points = TRUE,
    position = position_points_jitter(width = 0.05, height = 0),
    point_shape = '|', point_size = 3, point_alpha = 1, alpha = 0.9,
  )

uncertainty visualization

library(ggdist)
library(beeswarm)

set.seed(1234)
x = rnorm(100)

make_plot = function(layout) {
  expand.grid(
    x = x,
    side = c("topright", "both", "bottomleft"),
    stringsAsFactors = FALSE
  ) |> 
    ggplot(aes(side = side, x = x)) +
    stat_dotsinterval(layout = layout) +
    facet_grid(~ side, labeller = "label_both") +
    labs(
      subtitle = paste0("stat_dotsinterval(layout = '", layout, "')"),
      x = NULL,
      y = NULL
    )
}

make_plot("bin")

make_plot("weave")

make_plot("swarm")

set.seed(1234)

abc_df = data.frame(
  value = rnorm(300, mean = c(1,2,3), sd = c(1,2,2)),
  abc = c("a", "b", "c")
)

abc_df %>%
  ggplot(aes(x = abc, y = value)) +
  stat_dots(side = "both") +
  ggtitle('stat_dots(side = "both")')

df_to_plt_poms$measures |> as.factor() |>  summary()

##       attemps       correct days_in_a_row   finish_late      max_time 
##           190           190           190           190           190 
##     mean_time      min_time   start_early      use_days 
##           190           190           190           190

df_to_plt_poms |> 
  filter(measures %in% c("attemps", "correct", "min_time")) |> 
  ggplot(aes(y = measures, x = values, fill = measures)) +
  stat_slab(aes(thickness = stat(pdf*n)), scale = 1) +
  stat_dotsinterval(side = "bottom", scale = 1, slab_size = NA) +
  scale_fill_brewer(palette = "Set2") +
  ggtitle("rainplot")

df_to_plt_poms |> 
  filter(measures %in% c("attemps", "correct", "min_time")) |> 
  ggplot(aes(y = measures, x = values, fill = profile)) +
  stat_slab(aes(thickness = stat(pdf*n)), scale = 1) +
  stat_dotsinterval(side = "bottom", scale = 1, slab_size = NA) +
  scale_fill_brewer(palette = "Set2") +
  ggtitle("rainplot by profile")

Likert vizualization

likert = read_csv("data/likert.csv") |> mutate(value = value |> factor(levels = rev(
    c(
      "Strongly agree",
      "Agree",
      "NAND",
      "Disagree",
      "Strongly disagree"
    )
  )))

cols = c("Strongly agree" = "#0571b0", "Agree" = "#92c5de",  "NAND" = "#f7f7f7","Disagree" = "#f4a582", "Strongly disagree" = "#ca0020")

likert |> 
ggplot()+
geom_bar(aes(x = reorder(slider, stga_value), y = value_count, fill = value), 
         position="fill", stat = "identity")+
  coord_flip()+
  ggthemes::theme_tufte()+
  ggtitle(label = "Likert scale", subtitle = "Exchange priorities")+
  ylab("Percentage")+
  xlab("Scale")+
  theme(legend.position = "bottom")+
  scale_fill_manual(values = cols)

likert %>% filter(value%in%c("Agree", "Strongly agree")) |> 
    mutate(value = value %>% factor(levels = c("Strongly agree", "Agree"), ordered = TRUE)) -> likert_a
likert %>% filter(!value%in%c("Agree", "Strongly agree")) |>
  mutate(value = value %>% factor(levels = c("Strongly disagree", "Disagree", "NAND"), ordered = TRUE))->likert_d



ggplot()+
  geom_bar(
    data = likert_a,
    aes(
      x = reorder(slider, stga_value),
      y = -value_count,
      fill = value
    ),
    stat = "identity"
  ) +
  geom_bar(
    data = likert_d,
    aes(
      x = reorder(slider, stga_value),
      y = value_count,
      fill = value
    ),
    stat = "identity"
  )+
  geom_hline(yintercept = 0, color =c("white")) +
  coord_flip()+
  ggthemes::theme_tufte()+
  ggtitle(label = "Likert scale", subtitle = "Exchange priorities")+
  ylab("Count")+
  xlab("Scale")+
  theme(legend.position = "bottom")+
  scale_fill_manual(values = cols)

hw2

поменяйте дату рождения на свою дату рождения или дату рождения вашего любимого актёра или актрисы

my_bd = "28/08/1998"

my_seed = my_bd |> lubridate::dmy() |> as.numeric()

elements = c("ggridges", "anotate with arrows", "change fonts", 
             "gghighlight", "ggdist", "ggExtra", 
             "ggpattern", "ggstatsplot or ggpubr", "gganimate")

print("В HW2 вам нужно изспользовать 3 из 4 пакетов или слоёв на графике")

## [1] "В HW2 вам нужно изспользовать 3 из 4 пакетов или слоёв на графике"

print("Вот они слева направо:")

## [1] "Вот они слева направо:"

set.seed(my_seed)
elements |> 
  sample(4) |> 
  paste(collapse = ", ")

## [1] "ggpattern, anotate with arrows, gghighlight, ggdist"

Вы можете использовать свои данные, или поисследовать самолёты.

Про самолёты есть набор вопросов, которые могут задать вам направление для анализа.

На вопросы необязательно отвечать, тем более их много и для части из них нужно делать текстовый анализ. Используйте их как вдохновение, либо действительно попробуйте ответить на них, но главное используйте пакеты из предыдущего чанка.

Не в каждом графике нужно использовать все пакеты сразу

можно делать в парах, сдаёте одну работу, указываете авторов

сдавать сюда

данные

https://github.com/quankiquanki/skytrax-reviews-dataset/blob/master/README.md

airline <- read_csv("~/r_viz_eu/lab2/data/airline.csv")
airport <- read_csv("~/r_viz_eu/lab2/data/airport.csv")
lounge <- read_csv("~/r_viz_eu/lab2/data/lounge.csv")
seat <- read_csv("~/r_viz_eu/lab2/data/seat.csv")

library(R3PO)
R3PO:::hw1_questions |> kableExtra::kable()

…1	Question	Comment
1	-В какой день недели чаще всего пишут отзывы про авиалинии	1 таблица
2	-Пишут ли люди более добрые отзывы на авиалинии в выходные (доброту определяем через overall_rating)	1 таблица
3	-Пишут ли люди более добрые отзывы на аэропорты в выходные (доброту определяем через overall_rating)	1 таблица
4	-Пишут ли люди более добрые отзывы на лаунж-зоны в выходные (доброту определяем через overall_rating)	1 таблица
5	-Люди из каких стран выше всего оценили свои перелеты?	1 таблица
6	-Люди из каких стран ниже всего оценивают wi-fi в аэропортах?	1 таблица
7	-Люди из каких стран ниже всего оценивают wi-fi в лаунж-зонах?	1 таблица
8	-Какие типы путешественников (type_traveller) поставили самые высокие оценки по критерию оценок за очереди?	1 таблица
9	-В каких самолетах (aircraft) самое комфортное пространство для ног?	1 таблица
10	-Правда ли оценка мест для хранения (seat_storage) выше, если перелет был в летнее время?	1 таблица
11	-Есть ли в тиблице с отзывами на аэропорты выбросы?	1 таблица
12	-Менялся ли общий рейтинг (overall_rating) с 2013 года по 2017?	1 таблица
13	-Зависит ли оценка удобства пространства для ног от упоминания сложных перелётов (их называют leg)?	с текстами
14	-На какую оценку сильнее влияет упоминание грязи или чистоты в лаунж зоне?	с текстами
15	-Где чаще летают на экономе в BOEING или в AIRBUS?	с текстами
16	-Сколько в нашей базе маршрутов с пересадками?	с текстами
17	-Какие маршруты рекомендуются чаще — прямые или с пересадками?	с текстами
18	-В каких самолетах более высокие оценки удобства пространства для ног — те, в которых два прохода или один?	с текстами
19	-В каких самолетах более высокие оценки ширины кресла — те, в которых два прохода или один?	с текстами
20	-Пользователи каких стран чаще употребляют слово "good" (но не "not good") в отрицательных отзывах на авиалинии? Т.е. находят что-то хорошее, но в целом не рекомендуют.	с текстами
21	-Пользователи каких стран чаще употребляют слово "good" (но не "not good") в отрицательных отзывах на аэропорты? Т.е. находят что-то хорошее, но в целом не рекомендуют.	с текстами
22	-Пользователи каких стран чаще употребляют слово "good" (но не "not good") в отрицательных отзывах на лаунж-зоны? Т.е. находят что-то хорошее, но в целом не рекомендуют.	с текстами
23	-Отличается ли использование слов "not good", "bad", "awful" в отзывах на авиалинии у путешественников из разных классов (cabin_flown)?	с текстами
24	-Отличается ли использование слов "not good", "bad", "awful" в отзывах на авиалинии у путешественников из разных классов (cabin_flown)?	с текстами
25	-Какая взимосвязь между общим рейтингом аэропорта и средним рейтингом лаунж зон в аэропорту? Правда ли, что впечатление от аэропорта сказывается на оценке лаунж-зон?	2 таблицы
26	-В какой стране больше всего самолётов AIRBUS? а в какой BOEING?	2 таблицы
27	-Какие аэропорты рекомендуют чаще, те в которых есть лаунджи бизнесс класса или те, в которых их нет?	2 таблицы
28	-Какие лаунж-зоны оценены по критерию оценки персонала выше, чем аэропорты, в которых они расположены?	2 таблицы
29	-Какие лаунж-зоны оценены по качеству wi-fi выше, чем аэропорты, в которых они расположены?	2 таблицы
30	-Какие лаунж-зоны оценены по чистоте (cleanliness) выше, чем аэропорты, в которых они расположены?	2 таблицы
31	-Есть ли взаимосвязь между средним рейтингом разных моделей самолета (aircraft) по критерию комфортности кресла (seat_comfort) и средней оценкой моделей самолетов по пространству для ног (seat_legroom)?	2 таблицы
32	-Есть ли взаимосвязь между средним рейтингом разных моделей самолета (aircraft) по критерию комфортности кресла (seat_comfort) и средней оценкой моделей самолетов по местам для хранения (seat_storage)?	2 таблицы

additional materials

Pu, X., & Kay, M. (2020, April). A probabilistic grammar of graphics. In Proceedings of the 2020 CHI Conference on Human Factors in Computing Systems (pp. 1-13).