#Exercise 3 for Intro to Survey Methods
#Sara Schodt
#Oct 25 2025
#import .csv file
music_data <- read.csv("/Users/saraschodt/Desktop/Fall 2025/Survey Methods/Assignments/Exercise 3 CCT Data/80data_reliability.csv")
# 1. Calculate a total score for each person as the sum of their responses
#(i.e., number of correctly identified songs for each person)
#print column names
print(colnames(music_data))
music_data$total_score <- rowSums(music_data[, c("LL", "GNR", "GM_Flash", "Go_Gos",
                                                 "Human_League", "The_Police", "New_Order",
                                                 "Rob_Base", "INXS", "REO", "J_Cougar",
                                                 "Flock", "TTD", "Wham", "Marley")],
                                  na.rm = TRUE)
print(music_data)
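#Quick sanity check (my own addition, not required by the assignment): assuming each
#item is scored 0/1, every total score should fall between 0 and 15.
range(music_data$total_score)  # should be within 0..15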
#2. Calculate summary statistics of the total scores (minimum, maximum, mean, and standard
#deviation)
# store summary statistics in variables to reuse later (e.g., for Kelley's equation)
min_total <- min(music_data$total_score)
max_total <- max(music_data$total_score)
mean_total <- mean(music_data$total_score)
sd_total <- sd(music_data$total_score)
summary(music_data$total_score)
print(sd_total)
#Summary statistics of the total scores:
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max.      SD
#  0.00    2.00    4.50    5.35    8.00   13.00    3.64
#Histogram showing the distribution of total scores.
hist(music_data$total_score,
     main = "Histogram of Total Music Score Values",
     xlab = "Total Music Score",
     ylab = "Frequency",
     col = "lightblue",
     breaks = 7,
     xlim = c(0, 15),
     ylim = c(0, 15))
#This histogram is clearly right-skewed (mean 5.35 > median 4.50), suggesting that many
#people in this sample cannot identify many songs from the 80's, but the SD of 3.64 also
#suggests a fair amount of variability in the number of songs the group can identify.
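#As a numeric check of the skew (my own addition, not required by the assignment): the
#standardized third moment is a simple skewness estimate, and a positive value confirms
#the right skew seen in the histogram.
z <- scale(music_data$total_score)  # standardize the total scores
mean(z^3)  # positive => right-skewed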
#3. Use Cronbach’s alpha to estimate the internal consistency reliability of the total scores. (Please
#show or explain how you calculated your estimate.) What is the estimated reliability of the total
#scores and how should it be interpreted? How would you interpret the estimate for the 80’s test?
#(I got this code from Googling)
#Separate the item responses (exclude the total_score column, since alpha should be
#computed from the test items only)
items <- music_data[ , setdiff(names(music_data), "total_score")]
#Count items
k <- ncol(items)
#Compute the item variances (alpha's formula sums these) and the total-score variance
#(the formula divides the sum of item variances by the variance of the total test score,
#i.e., the sum of the items)
item_vars <- apply(items, 2, var)
total_score_var <- var(music_data$total_score)
#Cronbach’s alpha formula (manual calculation):
#alpha = (k / (k - 1)) * (1 - sum(item variances) / total-score variance)
alpha <- (k / (k - 1)) * (1 - sum(item_vars) / total_score_var)
#Print result
alpha
#Interpretation: Alpha estimates the proportion of total-score variance attributable
#to a common, consistent component shared across items (also known as reliability).
#In this case, alpha is 0.83, which is pretty good internal consistency, meaning that
#the items do a good job of measuring 80's music literacy.
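#As a cross-check (my own addition, assuming the psych package is installed), the
#psych::alpha() function should report the same raw alpha for the item matrix:
if (requireNamespace("psych", quietly = TRUE)) {
  print(psych::alpha(items)$total$raw_alpha)  # should match the manual alpha above
}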
#4. What is the standard error of measurement (SEM) for total scores on the 80’s music test? How
#should it be interpreted?
#Calculation for SEM: SEM = SD of total scores * sqrt(1 - reliability)
SEM <- sd_total * sqrt(1 - alpha)
SEM
#The SEM is 1.51, which means that if a person took the test many times over (each time
#as though it was the first time), their observed total scores would be expected to
#scatter around their true (unknown) score with a standard deviation of about 1.51
#points due to measurement error.
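#As an illustration (my own sketch, not part of the assignment), the SEM can be used to
#put an approximate 95% band around an observed score, assuming normally distributed error:
obs <- music_data$total_score[1]  # first test-taker's observed total
c(lower = obs - 1.96 * SEM, upper = obs + 1.96 * SEM)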
# 5. Create a table that shows the mean (sometimes called the “difficulty”) and the item-total
#correlation (sometimes called the “discrimination”) for each of the 15 test items. Which songs
#were the hardest for people to identify? Which songs were the easiest to identify? Which songs
#stood out as the most/least discriminating? How can you tell?
## a) Extract the item matrix (exclude the person total column)
test_items <- music_data[ , setdiff(names(music_data), "total_score")]
## b) Item “difficulty” (proportion correct, i.e., the item mean)
item_means <- colMeans(test_items, na.rm = TRUE)
## c) Corrected item–total correlation for each item
# total score from items (to be explicit and to match "corrected" definition)
total_from_items <- rowSums(test_items, na.rm = TRUE)
# For each item j: cor( item_j , (total_from_items - item_j) )
item_total_corr <- sapply(names(test_items), function(j) {
  rest_total <- total_from_items - test_items[[j]]  # total excluding item j
  cor(test_items[[j]], rest_total, use = "pairwise.complete.obs")
})
## d) Put into a table
item_table <- data.frame(
  item = names(test_items),
  mean = as.numeric(item_means),
  corrected_item_total = as.numeric(item_total_corr),
  row.names = NULL
)
## e) sort to see hardest/easiest or most/least discriminating
hardest_first <- item_table[order(item_table$mean, decreasing = FALSE), ]
most_discriminating <- item_table[order(item_table$corrected_item_total, decreasing = TRUE), ]
## f) quick looks at the top of each sorted table from e)
head(hardest_first, 5)
head(most_discriminating, 5)
#hardest 5 items:
#         item      mean corrected_item_total
#           LL 0.1086957          -0.08111331
#    New_Order 0.1086957           0.13402499
#     GM_Flash 0.1304348           0.29041971
#          TTD 0.2173913           0.56105628
#          REO 0.2391304           0.54458721
#most discriminating 5 items:
#         item      mean corrected_item_total
#         INXS 0.4565217            0.7817399
#       Go_Gos 0.3478261            0.6866643
# Human_League 0.3913043            0.6372413
#         Wham 0.2608696            0.6277446
#        Flock 0.3478261            0.5803042
#Item Table:
print(item_table)
#         item      mean corrected_item_total
#           LL 0.1086957          -0.08111331
#          GNR 0.6086957           0.23603876
#     GM_Flash 0.1304348           0.29041971
#       Go_Gos 0.3478261           0.68666430
# Human_League 0.3913043           0.63724130
#   The_Police 0.6304348           0.51697724
#    New_Order 0.1086957           0.13402499
#     Rob_Base 0.5652174           0.35730563
#         INXS 0.4565217           0.78173995
#          REO 0.2391304           0.54458721
#     J_Cougar 0.6521739           0.50811734
#        Flock 0.3478261           0.58030422
#          TTD 0.2173913           0.56105628
#         Wham 0.2608696           0.62774456
#       Marley 0.2826087           0.18186207
#An item’s difficulty is the proportion correct, between 0 and 1, where higher = easier.
#Easiest items (highest means) included J_Cougar (~0.65), The_Police (~0.63), and GNR (~0.61).
#Hardest items (lowest means) were LL and New_Order (~0.11), and GM_Flash (~0.13).
#Most discriminating items were INXS (~0.78), Go_Gos (~0.69), Human_League (~0.64), and
#Wham (~0.63): they have the highest item-total correlations, so people who got them
#right tended to also score well on the rest of the items, and people who got them
#wrong tended to do badly on the rest of the test.
#The item with the smallest (negative) discrimination was LL (~−0.08), meaning that
#respondents who got this one right tended to do *worse* on the rest of the test overall.
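#As a quick follow-up (my own sketch, beyond what the assignment asks), weakly
#discriminating items can be flagged programmatically; the 0.20 cutoff below is a
#common rule of thumb, not a fixed standard:
weak_items <- subset(item_table, corrected_item_total < 0.20)
print(weak_items)  # LL, New_Order, and Marley fall below this cutoff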
#6. Compute, for each test-taker, an estimate of their “true score” using Kelley’s
#equation. Compare the distribution of Kelley true score estimates to the distribution
#of observed total scores. What do you notice?
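#Kelley’s equation: true-score estimate = group mean + reliability * (observed - group mean)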
kelley <- mean_total + alpha * (music_data$total_score - mean_total)
#print Kelley true score estimate
print(kelley)
#print observed total scores
print(music_data$total_score)
c(obs_mean = mean_total, obs_sd = sd_total,
  kelley_mean = mean(kelley), kelley_sd = sd(kelley))
#They have the same mean but the Kelley estimates have a smaller standard deviation.
#I didn't understand why this was the case, so I looked back into the slides
#and then googled it, and from what I now understand, Kelley’s true-score estimate
#is like a less noisy version of the observed total: it pulls extreme scores
#toward the group average by an amount that depends on how reliable the test is
#(reliability is part of the equation), so greater reliability pulls the scores
#less and lower reliability pulls them more.
#Compared to the observed totals, the average stays the same but the standard deviation
#is smaller because the extreme scores are pulled toward the center. I learned
#that this is useful because it helps avoid over-interpreting extreme scores that
#might just be due to random error on a particular testing occasion, and it can give
#a possibly more accurate estimate of each person's true score.
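#A quick visual comparison (my own sketch, beyond what the assignment asks): side-by-side
#boxplots make the shrinkage of the Kelley estimates toward the mean easy to see.
boxplot(list(Observed = music_data$total_score, Kelley = kelley),
        main = "Observed Totals vs. Kelley True-Score Estimates",
        ylab = "Score",
        col = c("lightblue", "lightgreen"))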