Problem Statement

The class had completed a quiz of 10 questions. The result was analysed to better understand how the students performed in class and whether there was any question that students needed help with.

Load packages

loading the packages at the start allows you to code with ease

pacman::p_load(readxl,tidyverse, reshape, lubridate, ggthemes, tinytex)

Import data files

2 data-sets were given - answer key (“key”) and the students’ answers (“response”).

response <- read_excel("D:/Documents/Family/TKC/20180818_MTEBA/Academic/2019Sem1/EBA5001/3/Lab/assignment/data/response.xlsx", sheet =  "Responses")
key <- read_excel("D:/Documents/Family/TKC/20180818_MTEBA/Academic/2019Sem1/EBA5001/3/Lab/assignment/data/key.xlsx", sheet =  "key")

Explore the data frames

#view the data structure of the students' responses
response.factor <- response %>% 
  mutate_all(as.factor)
summary(response.factor)

##    Student ID    Q-1          Q-2        Q-3        Q-4     Q-5    
##  AX1001 : 1   FALSE:18   Integer: 1   FALSE: 6   FALSE:68   NA:75  
##  AX1002 : 1   TRUE :57   Numeric:74   TRUE :69   TRUE : 7          
##  AX1003 : 1                                                        
##  AX1004 : 1                                                        
##  AX1005 : 1                                                        
##  AX1006 : 1                                                        
##  (Other):69                                                        
##     Q-6        Q-7        Q-8              Q-9             Q-10   
##  Error:65   Error:75   0    : 1   str(df)    : 2   data.frame:70  
##  Even : 1              Error:74   summary(df):73   Error     : 2  
##  Odd  : 9                                          list      : 2  
##                                                    matrix    : 1  
##                                                                   
##                                                                   
##

#view actual students' response data
knitr::kable(
  response[1:5, ], 
  caption = "Students' Answers")

Students’ Answers
Student ID	Q-1	Q-2	Q-3	Q-4	Q-5	Q-6	Q-7	Q-8	Q-9	Q-10
AX1001	TRUE	Numeric	TRUE	TRUE	NA	Error	Error	Error	summary(df)	data.frame
AX1002	TRUE	Numeric	TRUE	TRUE	NA	Error	Error	Error	summary(df)	data.frame
AX1003	TRUE	Numeric	TRUE	FALSE	NA	Error	Error	Error	summary(df)	data.frame
AX1004	TRUE	Numeric	TRUE	FALSE	NA	Error	Error	Error	summary(df)	data.frame
AX1005	TRUE	Numeric	TRUE	FALSE	NA	Error	Error	Error	summary(df)	data.frame

#view the data structure of the key
key.factor <- key %>% 
  mutate_all(as.factor)
summary(key.factor)

##    Q-1         Q-2      Q-3       Q-4    Q-5       Q-6       Q-7   
##  TRUE:1   Numeric:1   TRUE:1   FALSE:1   NA:1   Error:1   Error:1  
##     Q-8             Q-9            Q-10  
##  Error:1   summary(df):1   data.frame:1

#view actual key data
knitr::kable(
  key[1, ], 
  caption = "Answer Key")

Answer Key
Q-1	Q-2	Q-3	Q-4	Q-5	Q-6	Q-7	Q-8	Q-9	Q-10
TRUE	Numeric	TRUE	FALSE	NA	Error	Error	Error	summary(df)	data.frame

Check if there was any missing data

It was important to take note of any missing data as it might affect the analysis or interpretation

missing = response %>%
  filter(!complete.cases(.))

nrow(missing)/nrow(response) #no missing data

## [1] 0

Reshape the data from wide to long format

Reshaping made it into a long format suitable for visualisation

response.reshaped <- response %>%
  gather(key="question", value="answer", -`Student ID`)

key.reshaped <- key %>%
  gather(key="question", value="model.answer")

response.reshaped <- response.reshaped %>%
  left_join(key.reshaped) %>%
  mutate(correctness=ifelse(answer==model.answer,"Correct","Wrong")) %>%
  select(-contains("answer"))

## Joining, by = "question"

str(response.reshaped) #check data structure before reshaping

## Classes 'tbl_df', 'tbl' and 'data.frame':    750 obs. of  3 variables:
##  $ Student ID : chr  "AX1001" "AX1002" "AX1003" "AX1004" ...
##  $ question   : chr  "Q-1" "Q-1" "Q-1" "Q-1" ...
##  $ correctness: chr  "Correct" "Correct" "Correct" "Correct" ...

response.reshaped <- response.reshaped %>%
  mutate(question=as.factor(question),
         question=factor(question,levels=c("Q-1","Q-2","Q-3","Q-4","Q-5","Q-6","Q-7","Q-8","Q-9","Q-10"))) #order the questions

Chart: details of scores by individual students using - lollipop chart

How did each student perform?

#individual scores showing questions and individual (grouped by their scoring)
agg.ind.score <- response.reshaped %>%
  group_by(`Student ID`,correctness) %>%
  summarise(n.count=n()) %>%
  filter(!correctness=="Wrong") %>%
  select(-correctness) %>%
  arrange(desc(n.count)) %>%
  ungroup()

#change the student id into a factor
agg.ind.score <- agg.ind.score %>%
  mutate(`Student ID` = factor(`Student ID`, levels = .$`Student ID`)) %>%
  arrange(desc(n.count))

#recode the factor so that it can be re-arranged according to the score
agg.ind.score$`Student ID` = with(agg.ind.score, factor(`Student ID`, levels = rev(levels(`Student ID`))))

# lollipop chart
ggplot(agg.ind.score, aes(n.count, reorder(`Student ID`,n.count))) +
  geom_segment(aes(x = 0, y = `Student ID`, xend = n.count, yend = `Student ID`), color = "#adadad" , size=1) +
  geom_point(color="#014d64", size=3, alpha=0.6) + 
  geom_point(data=agg.ind.score[67:75,], aes(x=n.count, y=`Student ID`), colour="red", alpha=0.5 , size=3) +
  theme_economist() +
  theme(panel.grid.major.y = element_blank(),
        panel.border = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_text(size=7)) +
  labs(x= "Student Score", y= "Student ID", 
       title = "Individual Student Score", subtitle = "Identify students who may require help")

Insight: 9 students might need help with their work and they have been identified.

Chart: summary of scores by student group - bar chart

What were the categories that the students belonged to?

#create score band
agg.ind.score <- agg.ind.score %>%
  mutate(score_band=case_when(n.count==10 ~ "10_points",
                              n.count==9 ~ "9_points",
                              TRUE ~ "8_&_below_points"))

#bar plot ======
agg.ind.score %>% 
  group_by(score_band) %>%
  summarise(n.count=n()) %>%
  mutate(percent=round(n.count/nrow(agg.ind.score)*100,1),
         score_band=as.factor(score_band)) %>%
  arrange(desc(percent)) %>%
  ggplot(aes(x=reorder(score_band,percent), y=percent, fill=score_band)) +
  geom_bar(stat="identity", alpha=0.8, width = 0.6)+
  #geom_bar(stat="identity", alpha=0.8)+
  scale_fill_manual(values=c("#adadad","#7c260b" ,"#adadad"))+
  geom_text(aes(label=paste0(n.count," ","(",percent,"%",")")), hjust=1.2 , vjust=0.5, color="white", size=6)+
  theme_economist() +
  coord_flip() +
  theme(panel.grid.major.y = element_blank(),
        panel.border = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_text(size=12,hjust=1),
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        legend.position="none") +
  labs(x= "Score Band", y= "Percentage of Students (%)", 
       title = "% of Students Banded by Scores", subtitle = "Identify the potential size of students who may need help")

Insight: Generally, the class performed well - 9 or 10 marks. There were 9 students who might need help.

Chart: score distribution by question (summary) bar chart

What were the questions that students needed help?

#bar plot ======
#question level =====
agg.qn.score <- response.reshaped %>%
  group_by(question,correctness) %>%
  summarise(n.count=n()) %>%
  filter(!correctness=="Wrong") %>%
  ungroup() %>%
  mutate(percent=round(n.count/75*100,1)) %>%
  arrange(desc(percent)) %>%
  select(-correctness)

#create grouping flag to highlight the bar chart accordingly
agg.qn.score <- agg.qn.score %>%
  mutate(question = factor(question, levels = .$question),
         red_flag=case_when(percent<97 ~ "bad",
                            percent==100 ~ "excellent",
                            TRUE ~ "good")) 

ggplot(agg.qn.score, aes(x=reorder(question,percent), y=percent, fill=red_flag)) +
  geom_bar(stat="identity", alpha=0.8, width = 0.6)+
  #geom_bar(stat="identity", alpha=0.8)+
  scale_fill_manual(values=c("#7c260b","#76c0c1","#adadad"))+
  geom_text(aes(label=paste0(n.count," ","(",percent,"%",")")), hjust=1.2 , vjust=0.5, color="white", size=6)+
  theme_economist() +
  coord_flip() +
  theme(panel.grid.major.y = element_blank(),
        panel.border = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_text(size=12,hjust=1.2),
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        legend.position="none") +
  labs(x= "Question", y= "Percentage of Students (%)", 
       title = "Percentage of Correct Answers by Questions", subtitle = "Identify questions that the students may need revision")

Insight: All students knew how to answer Questions 5 and 7. Q1, Q3, Q4, Q6 and Q10 were the questions the students might need revision in.

Student Score Analysis

Kwan Chet

April 03, 2019