Self-test: making a picture with plots

load the packages needed

library(tidyverse)

read in the dino data

Note: the full dataset contains a lot of different pictures (all with the same variance!)

# Load the full dataset (every picture, not just the dino) from the CSV file
plot_data <- read_csv("data/dino.csv")

reproduce this plot

We have to take ONLY the dino data so we make the correct picture

# Keep only the rows belonging to the "dino" dataset; plotting every dataset
# at once would not give the dinosaur picture.
dinodata <- plot_data %>%
  filter(dataset == "dino")

# Scatter plot of the dino points. y is mapped to colour as well as position,
# so the points are shaded along a gradient from red (low y) to blue (high y).
# Change the two colour names to get a different gradient.
dinoplot <- ggplot(dinodata) +
  geom_point(aes(x = x, y = y, color = y)) +
  scale_color_gradient(low = "red", high = "blue")

print(dinoplot)

Changing the looks

1. Themes

Using theme_minimal() gives us a nice, clean background
There’s also theme_dark(), theme_classic() …etc.

2. Titles

ggtitle() allows for customised titles for the plot

3. Legend

guides(color = "none") customises the legend; in this case, it removes it

# Rebuild the dino scatter plot with the styling layers added:
#   theme_minimal()        - clean, uncluttered background
#   ggtitle()              - custom title and subtitle
#   guides(color = "none") - hides the colour legend
dinoplot <- ggplot(dinodata) +
  geom_point(aes(x = x, y = y, color = y)) +
  scale_color_gradient(low = "red", high = "blue") +
  theme_minimal() +
  ggtitle(label = "Bye Dinosaur", subtitle = "you were great") +
  guides(color = "none")

print(dinoplot)

extra challenge

Can you write code to show that the mean, variance, and correlation between x and y are the same for each of the datasets? HINT: this is a group_by and summarise problem

library(tidyverse)

# Load the full dataset (all pictures) from the CSV file
plot_data <- read_csv("data/dino.csv")

## Rows: 1846 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): dataset
## dbl (2): x, y
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Summarise every dataset in one grouped pass: the mean and variance of x and
# of y, plus the correlation between x and y. If the datasets share their
# summary statistics, each column should be (nearly) constant down the rows.
summary_comparison <- plot_data %>%
  group_by(dataset) %>%
  summarise(
    mean_x = mean(x),
    mean_y = mean(y),
    variance_x = var(x),
    variance_y = var(y),
    correlation_xy = cor(x, y)
  )

# Per-variable variance tables, kept under their original names and column
# layout (dataset, variance) so the prints that follow still work unchanged.
xvariance_comparison <- summary_comparison %>%
  select(dataset, variance = variance_x)

yvariance_comparison <- summary_comparison %>%
  select(dataset, variance = variance_y)

# Print the full comparison first, then the x-variance table on its own
print(summary_comparison)
print(xvariance_comparison)

## # A tibble: 13 × 2
##    dataset    variance
##    <chr>         <dbl>
##  1 away           281.
##  2 bullseye       281.
##  3 circle         281.
##  4 dino           281.
##  5 dots           281.
##  6 h_lines        281.
##  7 high_lines     281.
##  8 slant_down     281.
##  9 slant_up       281.
## 10 star           281.
## 11 v_lines        281.
## 12 wide_lines     281.
## 13 x_shape        281.

# Print the per-dataset variance of y
print(yvariance_comparison)

## # A tibble: 13 × 2
##    dataset    variance
##    <chr>         <dbl>
##  1 away           726.
##  2 bullseye       726.
##  3 circle         725.
##  4 dino           726.
##  5 dots           725.
##  6 h_lines        726.
##  7 high_lines     726.
##  8 slant_down     726.
##  9 slant_up       726.
## 10 star           725.
## 11 v_lines        726.
## 12 wide_lines     726.
## 13 x_shape        725.

# One-way ANOVA of x, and then of y, across the datasets. ANOVA compares the
# group means; it is used here instead of a t-test because there are more than
# two groups (13 datasets), not more than two grouping variables.
xanova_result <- aov(x ~ dataset, data = plot_data)
yanova_result <- aov(y ~ dataset, data = plot_data)

# Print the fitted ANOVA for x (sums of squares and degrees of freedom).
# NOTE(review): summary(xanova_result) would also report the F statistic and
# p-value.
print(xanova_result)

## Call:
##    aov(formula = x ~ dataset, data = plot_data)
## 
## Terms:
##                  dataset Residuals
## Sum of Squares       0.0  515353.5
## Deg. of Freedom       12      1833
## 
## Residual standard error: 16.76762
## Estimated effects may be unbalanced

# Print the fitted ANOVA for y (sums of squares and degrees of freedom)
print(yanova_result)

## Call:
##    aov(formula = y ~ dataset, data = plot_data)
## 
## Terms:
##                 dataset Residuals
## Sum of Squares        0   1329881
## Deg. of Freedom      12      1833
## 
## Residual standard error: 26.9355
## Estimated effects may be unbalanced

w2 self test

April

07 June, 2024