knitr::opts_chunk$set(message = FALSE, warning = FALSE, results='asis')
library(readxl)
library(tidyverse)
# Set personal working directory
setwd("/Users/jessicahenderson/Desktop/psychology/PSYC3361/face_lab/group_data")
# Read excel file into R
normative <- read_excel("normative.xlsx")
I have made notes where adjustments have been made and more comments below the histogram.
#creating assigning the normative data set to the name fig2_data
fig2_data <- normative %>%
select(`Overall (%)`) %>% # selecting overall normative accuracy column
# plott starts here
ggplot(data = ., mapping = aes(x = `Overall (%)`)) +
geom_histogram(
mapping = aes(y = after_stat(count/sum(count))), #changed from after_stat(density) to after_stat(count/sum(count))
binwidth = 2,
fill = "red",
color = "white",
closed = "left" # we have to add in closed left
) +
# coord_cartesian function specifically adjusts the visible portion of the plot's x and y axes while maintaining the original data range.
coord_cartesian(xlim = c(40, 100)) +
scale_y_continuous(limits = c(0, 0.20)) +
labs(
x = "UNSW Face Test Score (percent correct)",
y = "Proportion of respondents") +
# normative distribution line
stat_function(
fun = dnorm,
args = list(
mean = mean(normative$`Overall (%)`),
sd = sd(normative$`Overall (%)`)),
color= "black", lwd = 1) +
#reference line for Chance accuracy
geom_vline(
xintercept = 50, color = "black", linetype = "dotted") +
geom_text(
aes(x = 50, y = 0, label = "CHANCE"),
color = "black", size = 3, angle = 90, vjust = -0.5, hjust = -6) +
#threshold line for Mean + 2 SD
geom_vline(
xintercept =
mean(normative$`Overall (%)`) +
2 * sd(normative$`Overall (%)`),
color = "black", linetype = "dotted") +
geom_text(aes(
x = mean(normative$`Overall (%)`) +
2 * sd(normative$`Overall (%)`), y = 0, label = "MEAN +2SD"),
color = "black", size = 3, angle = 90, vjust = -0.5, hjust = -4.2) +
#adding N=290 label for participant count
geom_text(
x = 90, y = 0.10, label = paste("N =", nrow(normative)),
color = "red", size = 6) +
theme_minimal()
print(fig2_data)
To create this Figure we have to first create a frequency table to plot the histogram. Even though we fixed the bin height we can’t fix the normative distribution without a frequency table.
James said he actually didn’t plot the raw data - he plotted a frequency table he made from the raw data. The frequency table shows where the bins are and so even though we have specified binwidth = 2 it looks like the binwidth = 2.5. James confirmed that he used binwidth = 2. so it should be 45-75 but if you look at the frequency table I have made below from the data we are using it shows 45 to 47.5 James showed me his freqency table there should be 5 participants between 45-75 giving a frequency of 5/290 = 0.0172
library(gt)
# Create frequency table
freq_table <- table(normative$`Overall (%)`)
# Convert frequency table to data frame
freq_df <- data.frame("Overall (%)" = as.numeric(names(freq_table)), "Frequency" = as.numeric(freq_table))
# Create gt table object
gt_table <- gt(freq_df, caption = "Frequency Table")
# Display the gt table
gt_table
| Overall.... | Frequency |
|---|---|
| 45.00000 | 1 |
| 45.83333 | 2 |
| 46.66667 | 2 |
| 47.50000 | 1 |
| 48.33333 | 6 |
| 49.16667 | 8 |
| 50.00000 | 3 |
| 50.83333 | 5 |
| 51.66667 | 12 |
| 52.50000 | 8 |
| 53.33333 | 8 |
| 54.16667 | 15 |
| 55.00000 | 17 |
| 55.83333 | 13 |
| 56.66667 | 10 |
| 57.50000 | 9 |
| 58.33333 | 16 |
| 59.16667 | 15 |
| 60.00000 | 23 |
| 60.83333 | 17 |
| 61.66667 | 13 |
| 62.50000 | 18 |
| 63.33333 | 11 |
| 64.16667 | 14 |
| 65.00000 | 7 |
| 65.83333 | 4 |
| 66.66667 | 5 |
| 67.50000 | 6 |
| 68.33333 | 7 |
| 69.16667 | 6 |
| 70.00000 | 1 |
| 70.83333 | 2 |
| 71.66667 | 2 |
| 72.50000 | 2 |
| 75.00000 | 1 |
When I spoke to James he suggested multiplying the density scaling factor by 2 . But that seemed to easy so I thought it was cheating. However chatgpt says this:
No, multiplying the probability density function (PDF) by a scaling factor of 2 is not considered cheating. It is a common practice to adjust the scale of the PDF to ensure that the area under the curve integrates to 1, which is a property of a valid probability density function.
The area under the curve represents the probability, and it should sum up to 1 for a valid probability distribution. By scaling the PDF, you are adjusting its height to ensure that the total area under the curve is equal to 1. This scaling factor does not affect the shape or characteristics of the distribution, but only ensures that it meets the criteria of a proper probability density function.
Therefore, multiplying the PDF by 2 in this context is a valid technique to normalize the density and ensure the correct interpretation of probabilities.
library(ggplot2)
# Rename "Overall (%)" to "Overall"
colnames(normative)[colnames(normative) == "Overall (%)"] <- "Overall"
# Create frequency table
freq_table <- table(normative$Overall)
# Convert frequency table to data frame
freq_df <- data.frame("Overall" = as.numeric(names(freq_table)), "Frequency" = as.numeric(freq_table))
# Plot the histogram and adjusted normal distribution
ggplot(normative, aes(x = Overall)) +
geom_histogram(aes(y = after_stat(count/sum(count))),
binwidth = 2,
fill = "red",
color = "white",
closed = "left") +
stat_function(
fun = function(x) 2 * dnorm(x, mean = mean(normative$Overall), sd = sd(normative$Overall)),
color = "black", lwd = 1) +
labs(x = "Overall", y = "Proportion of respondents", title = "Histogram with Normal Distribution") +
geom_vline(xintercept = 50, color = "black", linetype = "dotted") +
geom_text(aes(x = 50, y = 0, label = "CHANCE"), color = "black", size = 3, angle = 90, vjust = -0.5, hjust = -4) +
geom_vline(xintercept = mean(normative$Overall) + 2 * sd(normative$Overall), color = "black", linetype = "dotted") +
geom_text(aes(x = mean(normative$Overall) + 2 * sd(normative$Overall), y = 0, label = "MEAN + 2SD"), color = "black", size = 3, angle = 90, vjust = -0.5, hjust = -3.2) +
geom_text(x = 90, y = 0.10, label = paste("N =", nrow(normative)), color = "red", size = 6) +
coord_cartesian(xlim = c(40, 100)) +
scale_y_continuous(limits = c(0, 0.20)) +
theme_minimal()