library(tidyverse)
Note: the full dataset contains a lot of different pictures (all with the same variance!)
# Load the combined CSV (all datasets stacked in one table) into a tibble.
plot_data <- read_csv("data/dino.csv")
We have to keep ONLY the rows of the dino dataset so that the plot shows the correct picture
# Keep only the rows of the "dino" dataset; mixing in the other datasets
# would not give the dinosaur picture.
dinodata <- plot_data %>%
  filter(dataset == "dino")

# Scatter plot of the dino points. Colour is mapped to y, so the points shade
# from red (low y) to blue (high y); change `low`/`high` for another gradient.
dinoplot <- ggplot(dinodata) +
  geom_point(aes(x, y, color = y)) +
  scale_color_gradient(low = "red", high = "blue")
print(dinoplot)
1. Themes
Using theme_minimal() gives us a nice, clean background
There are also theme_dark(), theme_classic(), etc.
2. Titles
3. Legend
# Rebuild the dino scatter plot with three cosmetic changes:
# a minimal theme, a title/subtitle, and no colour legend.
dinoplot <- ggplot(dinodata) +
  geom_point(aes(x, y, color = y)) +  # colour varies with y
  scale_color_gradient(low = "red", high = "blue") +
  theme_minimal() +  # clean background
  labs(title = "Bye Dinosaur", subtitle = "you were great") +
  guides(color = "none")  # hide the colour legend
print(dinoplot)
Can you write code to show that the means and variances of x and y, and the correlation between x and y, are the same for each of the datasets? HINT: this is a group_by and summarise problem
library(tidyverse)
# Load the full table (columns: dataset, x, y) from the CSV file.
plot_data <- read_csv("data/dino.csv")
## Rows: 1846 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): dataset
## dbl (2): x, y
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-dataset summary statistics, computed in a single grouped pass:
# the means and variances of x and y, plus the correlation between x and y.
# Comparing the rows of this table answers the question for all three
# statistics at once, instead of variance only.
summary_comparison <- plot_data %>%
  group_by(dataset) %>%
  summarise(
    mean_x = mean(x),
    mean_y = mean(y),
    var_x = var(x),
    var_y = var(y),
    cor_xy = cor(x, y)
  )
print(summary_comparison)

# Variance-only tables, one per coordinate. They keep their original names and
# columns (dataset, variance) because they are printed further down.
xvariance_comparison <- summary_comparison %>%
  select(dataset, variance = var_x)
yvariance_comparison <- summary_comparison %>%
  select(dataset, variance = var_y)
# Print the variance of x for each dataset
print(xvariance_comparison)
## # A tibble: 13 × 2
## dataset variance
## <chr> <dbl>
## 1 away 281.
## 2 bullseye 281.
## 3 circle 281.
## 4 dino 281.
## 5 dots 281.
## 6 h_lines 281.
## 7 high_lines 281.
## 8 slant_down 281.
## 9 slant_up 281.
## 10 star 281.
## 11 v_lines 281.
## 12 wide_lines 281.
## 13 x_shape 281.
# Print the variance of y for each dataset
print(yvariance_comparison)
## # A tibble: 13 × 2
## dataset variance
## <chr> <dbl>
## 1 away 726.
## 2 bullseye 726.
## 3 circle 725.
## 4 dino 726.
## 5 dots 725.
## 6 h_lines 726.
## 7 high_lines 726.
## 8 slant_down 726.
## 9 slant_up 726.
## 10 star 725.
## 11 v_lines 726.
## 12 wide_lines 726.
## 13 x_shape 725.
# One-way ANOVA of x, and separately of y, against `dataset`. A t-test is not
# used here because `dataset` has more than two groups, not just two.
# NOTE(review): ANOVA compares the group means only; it does not test whether
# the variances or the x-y correlation match across datasets.
xanova_result <- aov(x ~ dataset, data = plot_data)
yanova_result <- aov(y ~ dataset, data = plot_data)
# Print the ANOVA result for x (this is an ANOVA fit, not a t-test)
print(xanova_result)
## Call:
## aov(formula = x ~ dataset, data = plot_data)
##
## Terms:
## dataset Residuals
## Sum of Squares 0.0 515353.5
## Deg. of Freedom 12 1833
##
## Residual standard error: 16.76762
## Estimated effects may be unbalanced
# Print the ANOVA result for y
print(yanova_result)
## Call:
## aov(formula = y ~ dataset, data = plot_data)
##
## Terms:
## dataset Residuals
## Sum of Squares 0 1329881
## Deg. of Freedom 12 1833
##
## Residual standard error: 26.9355
## Estimated effects may be unbalanced