knitr::opts_chunk$set(message = FALSE, warning = FALSE, results='asis')
library(readxl)
library(tidyverse)

# Set personal working directory
setwd("/Users/jessicahenderson/Desktop/psychology/PSYC3361/face_lab/group_data")

# Read excel file into R
  normative <- read_excel("normative.xlsx")

This is my code for figure 2

More comments below the graph!

I have made notes where adjustments have been made and more comments below the histogram.

#creating assigning the normative data set to the name fig2_data
  fig2_data <- normative %>% 
  select(`Overall (%)`) %>% # selecting overall normative accuracy column
  
# plott starts here
  ggplot(data = ., mapping = aes(x = `Overall (%)`)) +
  geom_histogram(
    mapping = aes(y = after_stat(count/sum(count))),   #changed from after_stat(density) to after_stat(count/sum(count))
    binwidth = 2,
    fill = "red", 
    color = "white", 
    closed = "left"   # we have to add in closed left
    ) + 

# coord_cartesian function specifically adjusts the visible portion of the plot's x and y axes while maintaining the original data range.
  coord_cartesian(xlim = c(40, 100)) +  
  scale_y_continuous(limits = c(0, 0.20)) +
  labs(
    x = "UNSW Face Test Score (percent correct)",
    y = "Proportion of respondents") +
  
# normative distribution line
  stat_function(
    fun = dnorm, 
    args = list(
      mean = mean(normative$`Overall (%)`), 
      sd = sd(normative$`Overall (%)`)), 
    color= "black", lwd = 1) +

#reference line for Chance accuracy 
  geom_vline(
    xintercept = 50, color = "black", linetype = "dotted") +
  
  geom_text(
    aes(x = 50, y = 0, label = "CHANCE"), 
    color = "black", size = 3, angle = 90, vjust = -0.5, hjust = -6) + 

#threshold line for Mean + 2 SD
  geom_vline(
    xintercept = 
     mean(normative$`Overall (%)`) + 
     2 * sd(normative$`Overall (%)`), 
    color = "black", linetype = "dotted") +
  
  geom_text(aes(
    x = mean(normative$`Overall (%)`) + 
    2 * sd(normative$`Overall (%)`), y = 0, label = "MEAN +2SD"), 
    color = "black", size = 3, angle = 90, vjust = -0.5, hjust = -4.2) +  
  
#adding N=290 label for participant count 
  geom_text(
    x = 90, y = 0.10, label = paste("N =", nrow(normative)), 
    color = "red", size = 6) +
  theme_minimal() 

print(fig2_data)

Why is the plot like that?

As you can see the graph height is now where it needs to be - (happy)! However the normative distribution is not - (sad) !

To create this Figure we have to first create a frequency table to plot the histogram. Even though we fixed the bin height we can’t fix the normative distribution without a frequency table.

James said he actually didn’t plot the raw data - he plotted a frequency table he made from the raw data. The frequency table shows where the bins are and so even though we have specified binwidth = 2 it looks like the binwidth = 2.5. James confirmed that he used binwidth = 2. so it should be 45-75 but if you look at the frequency table I have made below from the data we are using it shows 45 to 47.5 James showed me his freqency table there should be 5 participants between 45-75 giving a frequency of 5/290 = 0.0172

library(gt)

# Create frequency table
freq_table <- table(normative$`Overall (%)`)

# Convert frequency table to data frame
freq_df <- data.frame("Overall (%)" = as.numeric(names(freq_table)), "Frequency" = as.numeric(freq_table))

# Create gt table object
gt_table <- gt(freq_df, caption = "Frequency Table")

# Display the gt table
gt_table
Frequency Table
Overall.... Frequency
45.00000 1
45.83333 2
46.66667 2
47.50000 1
48.33333 6
49.16667 8
50.00000 3
50.83333 5
51.66667 12
52.50000 8
53.33333 8
54.16667 15
55.00000 17
55.83333 13
56.66667 10
57.50000 9
58.33333 16
59.16667 15
60.00000 23
60.83333 17
61.66667 13
62.50000 18
63.33333 11
64.16667 14
65.00000 7
65.83333 4
66.66667 5
67.50000 6
68.33333 7
69.16667 6
70.00000 1
70.83333 2
71.66667 2
72.50000 2
75.00000 1

Ta Da : A graph for all

When I spoke to James he suggested multiplying the density scaling factor by 2 . But that seemed to easy so I thought it was cheating. However chatgpt says this:

No, multiplying the probability density function (PDF) by a scaling factor of 2 is not considered cheating. It is a common practice to adjust the scale of the PDF to ensure that the area under the curve integrates to 1, which is a property of a valid probability density function.

The area under the curve represents the probability, and it should sum up to 1 for a valid probability distribution. By scaling the PDF, you are adjusting its height to ensure that the total area under the curve is equal to 1. This scaling factor does not affect the shape or characteristics of the distribution, but only ensures that it meets the criteria of a proper probability density function.

Therefore, multiplying the PDF by 2 in this context is a valid technique to normalize the density and ensure the correct interpretation of probabilities.

library(ggplot2)

# Rename "Overall (%)" to "Overall"
colnames(normative)[colnames(normative) == "Overall (%)"] <- "Overall"

# Create frequency table
freq_table <- table(normative$Overall)

# Convert frequency table to data frame
freq_df <- data.frame("Overall" = as.numeric(names(freq_table)), "Frequency" = as.numeric(freq_table))

# Plot the histogram and adjusted normal distribution
ggplot(normative, aes(x = Overall)) +
  geom_histogram(aes(y = after_stat(count/sum(count))), 
                 binwidth = 2,
                 fill = "red", 
                 color = "white", 
                 closed = "left") +
  stat_function(
    fun = function(x) 2 * dnorm(x, mean = mean(normative$Overall), sd = sd(normative$Overall)), 
    color = "black", lwd = 1) +
  labs(x = "Overall", y = "Proportion of respondents", title = "Histogram with Normal Distribution") +
  geom_vline(xintercept = 50, color = "black", linetype = "dotted") +
  geom_text(aes(x = 50, y = 0, label = "CHANCE"), color = "black", size = 3, angle = 90, vjust = -0.5, hjust = -4) +
  geom_vline(xintercept = mean(normative$Overall) + 2 * sd(normative$Overall), color = "black", linetype = "dotted") +
  geom_text(aes(x = mean(normative$Overall) + 2 * sd(normative$Overall), y = 0, label = "MEAN + 2SD"), color = "black", size = 3, angle = 90, vjust = -0.5, hjust = -3.2) +
  geom_text(x = 90, y = 0.10, label = paste("N =", nrow(normative)), color = "red", size = 6) +
  coord_cartesian(xlim = c(40, 100)) +
  scale_y_continuous(limits = c(0, 0.20)) +
  theme_minimal()

The End