#Exercise 3 for Intro to Survey methods
#Sara Schodt
#Oct 25 2025

#import .csv file
music_data <- read.csv("/Users/saraschodt/Desktop/Fall 2025/Survey Methods/Assignments/Exercise 3 CCT Data/80data_reliability.csv")

# 1.    Calculate a total score for each person as the sum of their responses 
#(i.e., number of correctly identified songs for each person)

#print column names
print(colnames(music_data))

music_data$total_score <- rowSums(music_data[, c( "LL", "GNR", "GM_Flash", "Go_Gos", "Human_League", "The_Police", "New_Order",
                                       "Rob_Base", "INXS", "REO", "J_Cougar", "Flock", "TTD", "Wham",       
                                        "Marley")], na.rm = TRUE)
print(music_data)

#2. Calculate summary statistics of the total scores (minimum, maximum, mean, and standard
#deviation) 

# retroactively designating some variables to use later for kelleys

min_total <- min(music_data$total_score)
max_total <- max(music_data$total_score)
mean_total <- mean(music_data$total_score)
sd_total   <- sd(music_data$total_score)  

summary(music_data$total_score)
sd_total_scores <- sd(music_data$total_score)
print(sd_total_scores)

#Summary statistics of the total scores:
#Min. 1st Qu.  Median   Mean   3rd Qu.   Max.  SD
#0     2.00    4.50     5.35   8.00    13.00   3.64


#Histogram showing the distribution of total scores.

hist(music_data$total_score,
     main = "Histogram of Total Music Score Values",
     xlab = "Total Music Score",
     ylab = "Frequency",
     col = "lightblue",
     breaks = 7, 
     xlim = c(0, 15),
     ylim = c(0, 15))

#This histogram is pretty right skewed, suggesting that many people in this sample 
#can not identify many songs from the 80's, but the mean of 5.35 and SD of 3.64 also suggests that there is a 
#fair amount of variability in the number of songs that the group can identify.

#3. Use Cronbach’s alpha to estimate the internal consistency reliability of the total scores. (Please
#show or explain how you calculated your estimate.) What is the estimated reliability of the total
#scores and how should it be interpreted? How would you interpret the estimate for the 80’s test?
#(I got this code from Googling)

#Separate the item responses (exclude the total_scores column because we just need to know how many
#test items)
items <- music_data[ , setdiff(names(music_data), "total_score")]

#Count items
k <- ncol(items)

#Compute item variances(Alpha uses the sum of item variances in its numerator term) and total-score variance
#(bc the formula divides by the variance of the total test score (the sum of items))
item_vars <- apply(items, 2, var)
total_score_var <- var(music_data$total_score)

#Cronbach’s alpha formula (manual calculation)
alpha <- (k / (k - 1)) * (1 - sum(item_vars) / total_score_var)

#Print result
alpha

#Interpretation: Alpha estimates the proportion of total-score variance attributable 
#to a common, consistent part across items (also known as reliability). In this case, the alpha is 
#0.83 which is a pretty good internal consistency, meaning that the items do a god job 
#of measuring 80's music literacy

#4. What is the standard error of measurement (SEM) for total scores on the 80’s music test? How
#should it be interpreted?

#Calculation for SEM
SEM <- sd_total_scores * sqrt(1 - alpha)
SEM

#The SEM is 1.51 which means that a person's observed total score would be expected to vary by 
#about ±1.51 points from their true (unknown) score due to measurement error, if they took 
# the test (as though it was for the first time) many times over.

# 5. Create a table that shows the mean (sometimes called the “difficulty”) and the item-total
#correlation (sometimes called the “discrimination”) for each of the 15 test items. Which songs
#were the hardest for people to identify? Which songs were the easiest to identify? Which songs
#stood out as the most/least discriminating? How can you tell?

## a) Extract the item matrix (exclude the person total column)
test_items <- music_data[ , setdiff(names(music_data), "total_score")]

## b) Item “difficulty” (mean proportion correct)
item_means <- colMeans(test_items, na.rm = TRUE)

## c) Corrected item–total correlation for each item
# total score from items (to be explicit and to match "corrected" definition)
total_from_items <- rowSums(test_items, na.rm = TRUE)

# For each item j: cor( item_j , (total_from_items - item_j) )
item_total_corr <- sapply(names(test_items), function(j) {
  rest_total <- total_from_items - test_items[[j]]
  cor(test_items[[j]], rest_total, use = "pairwise.complete.obs")
})

## d) Put into a table
item_table <- data.frame(
  item            = names(test_items),
  mean            = as.numeric(item_means),
  corrected_item_total = as.numeric(item_total_corr),
  row.names = NULL
)

## e) sort to see hardest/easiest or most/least discriminating
hardest_first <- item_table[order(item_table$mean, decreasing = FALSE), ]
most_discriminating <- item_table[order(item_table$corrected_item_total, decreasing = TRUE), ]

## f) quick looks, from the r code I was trying out above in e)
head(hardest_first, 5)
head(most_discriminating, 5)

#hardest 5 items
#.      item   mean         corrected_item_total
#         LL 0.1086957          -0.08111331
#  New_Order 0.1086957           0.13402499
#   GM_Flash 0.1304348           0.29041971
#        TTD 0.2173913           0.56105628
#        REO 0.2391304           0.54458721
#most discriminating 5 items
#         item     mean        corrected_item_total
#          INXS 0.4565217            0.7817399
#        Go_Gos 0.3478261            0.6866643
#  Human_League 0.3913043            0.6372413
#          Wham 0.2608696            0.6277446
#         Flock 0.3478261            0.5803042

#Item Table:
print(item_table)

#.          item    mean     corrected_item_total
#            LL 0.1086957          -0.08111331
#           GNR 0.6086957           0.23603876
#      GM_Flash 0.1304348           0.29041971
#        Go_Gos 0.3478261           0.68666430
#  Human_League 0.3913043           0.63724130
#    The_Police 0.6304348           0.51697724
#     New_Order 0.1086957           0.13402499
#      Rob_Base 0.5652174           0.35730563
#          INXS 0.4565217           0.78173995
#           REO 0.2391304           0.54458721
#      J_Cougar 0.6521739           0.50811734
#         Flock 0.3478261           0.58030422
#           TTD 0.2173913           0.56105628
#          Wham 0.2608696           0.62774456
#        Marley 0.2826087           0.18186207
> 

#An item’s difficulty is the proportion correct between 0 and 1 where higher = easier.
#Easiest items (highest means) included J_Cougar (~0.65), The_Police (~0.63), and GNR (~0.61). 
#Hardest items (lowest means) were LL and New_Order (~0.11), and GM_Flash (~0.13).
#Most discriminating items were INXS (~0.78), Go_Gos (~0.69), Human_League (~0.64), Wham (~0.63) 
#because they have the highest item-total correlation scores, so people who did badly on 
#them did badly on the rest of the test and people who got them right tended to also score well
#on the rest of the items.
#The item with the smallest (negative) discrimination was LL (~−0.08) meaning that respondents who 
#got this one right tended to do *worse* on the test overall.


#Compute, for each test-taker, an estimate of their “true score” using Kelley’s
#equation. Compare the distribution of Kelley true score estimates to the distribution of observed
#total scores. What do you notice?
  
kelley <- mean_total + alpha * (music_data$total_score - mean_total)
#print Kelley true score estimate
print(kelley)
#print observed total scores
print(music_data$total_score)

c(obs_mean = mean_total, obs_sd = sd_total,
  kelley_mean = mean(kelley), kelley_sd = sd(kelley))
#They have the same mean but the kelley score has a smaller standard deviation.
#I didn´t understand why this was the case, so I looked back into the slides
#and then googled it, and from what I now understand, Kelley’s true-score estimate 
#is like a less noisy version of the observed total, that pulls extreme scores a bit 
#toward the class average depending on how reliable the test is 
#(we use reliability as part of the equation) such that greater reliability pulls 
#the scores less and lower reliability pulls the scores more. 
#Compared to the observed totals, the average stays the same but the standard deviation 
#is smaller because the extreme scores are pulled toward the center. I learned 
#that this is useful because it helps to avid over-interpreting extreme scores that 
#might just be due to random error on a particular testing occasion and can provide 
#a possible more accurate measure of how reliable the test is.
## Error in parse(text = input): <text>:163:1: unexpected '>'
## 162: #        Marley 0.2826087           0.18186207
## 163: >
##      ^