library(tidyverse)
components
What is this dataset?
How familiar are you with mtcars?
Do you hate them?
# Open the help page for the built-in dataset, then show its first rows.
?mtcars
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
Let's warm up your R (tidyverse) skills and make use of Nastay’s lecture
# Baseline scatter plot: two variables mapped to the two position axes.
ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  labs(title = "2 Dimensions") +
  theme_minimal()
What is wrong with the following visualization?
# Third variable on colour. `gear` is passed as-is (not converted to a factor);
# this chunk is the "what is wrong?" example and is intentionally left flawed.
ggplot(mtcars, aes(wt, mpg, color = gear)) +
  geom_point() +
  labs(title = "3 Dimensions, continious?") +
  theme_minimal()
What is the difference between a categorical variable and a factor?
# Same plot, but `gear` is converted to a factor inside the mapping so the
# colour legend shows one entry per gear value.
ggplot(mtcars, aes(wt, mpg, color = as_factor(gear))) +
  geom_point() +
  labs(title = "3 Dimensions") +
  theme_minimal()
How easy is it to read the following graph?
# Four variables at once: position (wt, mpg), colour (gear as factor) and
# point size (cyl, left numeric here).
ggplot(
  mtcars,
  aes(wt, mpg, color = as_factor(gear), size = cyl)
) +
  geom_point() +
  labs(title = "4 Dimensions") +
  theme_minimal()
What is the difference between these two graphs? How different will your story be?
# Same four variables, with `cyl` now a factor for size and additionally used
# to split the plot into one panel per cylinder count.
ggplot(
  mtcars,
  aes(wt, mpg, color = as_factor(gear), size = as.factor(cyl))
) +
  geom_point() +
  facet_wrap(~cyl) +
  labs(
    title = "4 Dimensions",
    subtitle = "distinct by Number of cylinders"
  ) +
  theme_minimal()
Try to fix this on your own
# Exercise chunk: adds a linear-model smoother on top of the faceted plot.
# The result is deliberately flawed (see the subtitle); do not "fix" it here.
ggplot(
  mtcars,
  aes(wt, mpg, color = as_factor(gear), size = as.factor(cyl))
) +
  geom_point() +
  stat_smooth(method = "lm") +
  facet_wrap(~cyl) +
  labs(
    title = "4 Dimensions",
    subtitle = "with statistics, but something is wrong"
  ) +
  theme_minimal()
Or look at the solution right away
What is the goal for adding statistics on graphs?
# Solution: only colour stays in the plot-level mapping, so the smoother is
# fitted per gear group; the size mapping is local to the point layer.
ggplot(mtcars, aes(wt, mpg, color = as_factor(gear))) +
  geom_smooth(method = "lm") +
  geom_point(aes(size = as.factor(cyl))) +
  # facet_wrap(~cyl) +   # left disabled, as in the original chunk
  labs(title = "4 Dimensions", subtitle = "with statistics") +
  theme_minimal()
Complex layers can be built from simpler ones, which gives us more room for customization
# Build the smoother from two simpler layers: geom_smooth() contributes the
# faint confidence ribbon (its own line is given size 0), and geom_line() with
# stat = "smooth" draws the fitted line with its own width and transparency.
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line geoms in favour of
# `linewidth`; `size` is kept so the chunk behaves as before on the installed
# version -- confirm the ggplot2 version before switching.
ggplot(mtcars, aes(wt, mpg, color = as_factor(gear))) +
  geom_smooth(method = "lm", alpha = 0.1, size = 0) +
  geom_line(stat = "smooth", method = "lm", size = 1.5, alpha = 0.5) +
  geom_point(aes(size = as.factor(cyl))) +
  labs(
    title = "4 Dimensions",
    subtitle = "with statistics from basic attributes"
  ) +
  theme_minimal()

# Same composition with the ColorBrewer "Set1" palette for the gear colours.
ggplot(mtcars, aes(wt, mpg, color = as_factor(gear))) +
  geom_smooth(method = "lm", alpha = 0.1, size = 0) +
  geom_line(stat = "smooth", method = "lm", size = 1.5, alpha = 0.5) +
  geom_point(aes(size = as.factor(cyl))) +
  labs(
    title = "4 Dimensions",
    subtitle = "with statistics from basic attributes"
  ) +
  scale_color_brewer(palette = "Set1") +
  theme_minimal()
See the `see` package for more color palettes
library(see)
# https://github.com/easystats/see
# Same ribbon + line + points composition, coloured with a palette from the
# `see` package.
# NOTE(review): `size` on the line geoms is deprecated in ggplot2 >= 3.4.0
# (use `linewidth`); kept unchanged to preserve behavior.
ggplot(mtcars, aes(wt, mpg, color = as_factor(gear))) +
  geom_smooth(method = "lm", alpha = 0.1, size = 0) +
  geom_line(stat = "smooth", method = "lm", size = 1.5, alpha = 0.5) +
  geom_point(aes(size = as.factor(cyl))) +
  labs(
    title = "4 Dimensions",
    subtitle = "with statistics from basic attributes"
  ) +
  # Alternative palette: see::scale_color_material_d()
  see::scale_color_social() +
  theme_minimal()
# Load the bike-sharing data (path is relative to the working directory) and
# preview the first rows.
bikes <- read_csv(file = "data/Bike Sharing Demand.csv")
head(bikes)
## # A tibble: 6 × 12
## datetime season holiday workingday weather temp atemp humidity
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2011-01-01 00:00:00 1 0 0 1 9.84 14.4 81
## 2 2011-01-01 01:00:00 1 0 0 1 9.02 13.6 80
## 3 2011-01-01 02:00:00 1 0 0 1 9.02 13.6 80
## 4 2011-01-01 03:00:00 1 0 0 1 9.84 14.4 75
## 5 2011-01-01 04:00:00 1 0 0 1 9.84 14.4 75
## 6 2011-01-01 05:00:00 1 0 0 2 9.84 12.9 75
## # … with 4 more variables: windspeed <dbl>, casual <dbl>, registered <dbl>,
## # count <dbl>
# Number of rows (hourly records) per value of the holiday flag.
ggplot(bikes, aes(holiday)) +
  geom_bar() +
  theme_minimal()
You could try to turn the counts into percentages by following this tutorial:
How to plot a ‘percentage plot’ with ggplot2 – Sebastian Sauer Stats Blog
# Total rentals per value of the holiday flag. The data are summarised first,
# so bar heights are the pre-computed sums; geom_col() is the idiomatic layer
# for "use y as given" (equivalent to geom_bar(stat = "identity")).
bikes |>
  group_by(holiday) |>
  summarise(count = sum(count)) |>
  ggplot(aes(x = holiday, y = count)) +
  geom_col() +
  theme_minimal()
# Distribution of hourly counts, split by the holiday flag recoded as
# TRUE/FALSE so the x axis is discrete.
bikes |>
  mutate(holiday = as.logical(holiday)) |>
  ggplot(aes(holiday, count)) +
  geom_boxplot() +
  theme_minimal()
Why do we have almost the same median for holidays and non-holidays?
# Boxplot with the raw observations overlaid as faint jittered points, to show
# how many records sit behind each box.
bikes |>
  mutate(holiday = as.logical(holiday)) |>
  ggplot(aes(holiday, count)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.1) +
  theme_minimal()

# Violin variant: quartile lines drawn inside the density shape, tails not
# trimmed, same jittered points on top.
bikes |>
  mutate(holiday = as.logical(holiday)) |>
  ggplot(aes(holiday, count)) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75), trim = FALSE) +
  geom_jitter(alpha = 0.1) +
  theme_minimal()
library(ggpubr)
# Boxplot of hourly counts by holiday flag, built with ggpubr and stored so
# that test annotations can be added on top.
p <- ggboxplot(
  bikes,
  x = "holiday", y = "count",
  color = "holiday", palette = c("#00AFBB", "#E7B800")
)
p

# Groups to compare pairwise. Entries must be x-axis level names (or 1-based
# group indices); the holiday levels are "0" and "1", so they are given as
# strings -- a numeric 0 is neither a level name nor a valid index.
my_comparisons <- list(c("0", "1"))

p +
  stat_compare_means(comparisons = my_comparisons) +  # pairwise p-value
  stat_compare_means(label.y = 50)                    # global p-value
# Boxplot of hourly counts by season, built with ggpubr and stored so that
# test annotations can be added on top.
p2 <- ggboxplot(
  bikes,
  x = "season", y = "count",
  color = "season", palette = "npg"
)
p2

# Every unordered pair of the four seasons (six comparisons), as a list of
# length-2 vectors. `simplify = FALSE` keeps the list form that
# stat_compare_means() expects.
my_comparisons <- combn(c(1, 2, 3, 4), m = 2, simplify = FALSE)

p2 +
  stat_compare_means(comparisons = my_comparisons) +  # pairwise p-values
  stat_compare_means(label.y = 1600)                  # global p-value
# "Feels-like" temperature against hourly count, with a linear fit.
ggplot(bikes, aes(atemp, count)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal()

# Same plot with a log10 y axis.
ggplot(bikes, aes(atemp, count)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal() +
  scale_y_log10()

# Zoom in on the 30-45 range of atemp: rows are filtered first, and the x axis
# limits are pinned to the same interval.
bikes |>
  filter(atemp >= 30, atemp <= 45) |>
  ggplot(aes(atemp, count)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10() +
  xlim(30, 45) +
  theme_minimal()

# Full range again, with slightly jittered, semi-transparent points to reduce
# overplotting.
ggplot(bikes, aes(atemp, count)) +
  geom_jitter(width = 0.2, height = 0.2, alpha = 0.4) +
  geom_smooth(method = "lm") +
  scale_y_log10() +
  theme_minimal()

# Repeat of the previous chunk (the original document renders it twice).
ggplot(bikes, aes(atemp, count)) +
  geom_jitter(width = 0.2, height = 0.2, alpha = 0.4) +
  geom_smooth(method = "lm") +
  scale_y_log10() +
  theme_minimal()
# Open the help page for the diamonds dataset, then show its first rows.
?diamonds
head(diamonds)
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# Price against carat on the raw scales.
ggplot(diamonds, aes(carat, price)) +
  geom_point() +
  theme_minimal()

# Same relationship with both axes on a log10 scale.
ggplot(diamonds, aes(carat, price)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  theme_minimal()

# Log-log plot coloured by cut, with highly transparent points to cope with
# overplotting.
# Optional down-sampling step, left disabled: sample_frac(0.1)
ggplot(diamonds, aes(carat, price, color = cut)) +
  geom_point(alpha = 0.1) +
  scale_x_log10() +
  scale_y_log10() +
  theme_minimal()
Есть данные и два варианта задания:
# Load the students dataset (path is relative to the working directory) and
# print a per-column summary with skimr.
df_lp <- read_csv(file = "data/students_data.csv")
skimr::skim(df_lp)
Name | df_lp |
Number of rows | 190 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
character | 1 |
numeric | 9 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
profile | 0 | 1 | 7 | 11 | 0 | 3 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
days_in_a_row | 0 | 1 | 3.55 | 1.85 | 0.00 | 2.00 | 4.00 | 5.00 | 8.00 | ▅▇▅▇▁ |
use_days | 0 | 1 | 6.31 | 4.49 | 0.00 | 2.25 | 5.00 | 10.00 | 17.00 | ▇▂▅▃▁ |
finish_late | 0 | 1 | 8.13 | 5.71 | 0.00 | 3.00 | 7.50 | 13.00 | 23.00 | ▇▅▃▅▁ |
start_early | 0 | 1 | 4.36 | 3.61 | 0.00 | 2.00 | 3.00 | 4.00 | 20.00 | ▇▁▁▁▁ |
correct | 0 | 1 | 0.67 | 0.18 | 0.28 | 0.52 | 0.66 | 0.82 | 1.00 | ▂▇▇▆▇ |
mean_time | 0 | 1 | 32.92 | 16.73 | 0.04 | 20.78 | 32.46 | 43.41 | 77.44 | ▅▆▇▃▁ |
min_time | 0 | 1 | 23.28 | 7.37 | 4.70 | 18.70 | 22.20 | 27.28 | 59.50 | ▁▇▃▁▁ |
max_time | 0 | 1 | 103.80 | 37.52 | 2.70 | 82.42 | 110.50 | 132.08 | 173.90 | ▁▃▅▇▃ |
attemps | 0 | 1 | 611.21 | 572.78 | 10.00 | 149.25 | 293.50 | 1021.00 | 2214.00 | ▇▂▂▂▁ |
Это почти совсем как настоящий датасет из образовательной аналитики. На одном курсе по юриспруденции в Нидерландах студентам предложили воспользоваться мобильным приложением, где они могли отвечать на вопросы про курс и лучше подготовиться к экзамену. Эти данные кластеризовали каким-то гауссовским алгоритмом и отдали вам для визуализации.