Data Visualization Practice #9

Task1: Specify working directory and read the file

# Point R at the practice folder so the CSV below can be read by its bare
# file name.
setwd("D:/Class Materials & Work/ED_Psych 521- Data Management and Visualiztion/R Stat Package/R_Lesson 7- Advance Visualization_GGplot2/Practice 9")

# 1.1 Read the file; the first line of the CSV supplies the column names.
# header is spelled TRUE because T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
PIRLS_Data_Merged <- read.csv("PIRLS2011G4_MergedSample_ggplot2.csv", header = TRUE)

Task2: Recode the variable EngagedCat to an ordinal variable (Apply Label)

# Turn the numeric EngagedCat codes into an ordered factor:
# 1 < 2 < 3 become Engaged < Somewhat Engaged < Not Engaged.
PIRLS_Data_Merged$EngagedCat.Labeled <- factor(
  PIRLS_Data_Merged$EngagedCat,
  levels = c(1, 2, 3),
  labels = c("Engaged", "Somewhat Engaged", "Not Engaged"),
  ordered = TRUE
)

Task3: Recode the variable BulliedCat to an ordinal variable (Apply Label)

# Turn the numeric BulliedCat codes into an ordered factor:
# 1 < 2 < 3 become Never < Monthly < Weekly.
PIRLS_Data_Merged$BulliedCat.Labeled <- factor(
  PIRLS_Data_Merged$BulliedCat,
  levels = c(1, 2, 3),
  labels = c("Never", "Monthly", "Weekly"),
  ordered = TRUE
)

# 3.1 Label the 0/1 codes of 'Female' as an unordered Gender factor
# (0 = Male, 1 = Female).
PIRLS_Data_Merged$Gender <- factor(
  PIRLS_Data_Merged$Female,
  levels = c(0, 1),
  labels = c("Male", "Female")
)

# Listwise deletion: drop every row that has a missing value in any column.
PIRLS.nomiss <- na.omit(PIRLS_Data_Merged)

Task4: Grouped bar chart to display the average reading performance [Reverse label]

#4.1 Aggregate the data 'EngagedCat' and 'Gender' by computing average reading score for each subgroup

#Activate packages
library(ggrepel)
library(ggpubr)
library(ggcorrplot)
library(tidyverse)
library(GGally)

# Mean reading score for every Gender x Engagement subgroup.
# PIRLS.nomiss is already the result of na.omit(), so it is piped in directly
# instead of being filtered a second time. ungroup() removes the Gender
# grouping that summarise() leaves on its result, so the table reaches the
# plotting code as a plain, ungrouped data frame.
PIRLS.agg.Reading.by.Gender.EngagedCat <- PIRLS.nomiss %>%
  group_by(Gender, EngagedCat.Labeled) %>%
  summarise(Reading.by.Gender.EngagedCat = mean(Reading)) %>%
  ungroup()

# Task 4 chart: dodged bars of the subgroup means, one bar per gender within
# each engagement level.
ggplot(
  PIRLS.agg.Reading.by.Gender.EngagedCat,
  aes(x = EngagedCat.Labeled, y = Reading.by.Gender.EngagedCat)
) +
  geom_bar(
    aes(colour = Gender, fill = Gender),
    stat = "identity",
    position = "dodge"
  ) +

  # Print the mean (rounded to 2 decimals) as a label for each bar
  geom_text_repel(
    aes(label = paste(round(Reading.by.Gender.EngagedCat, 2))),
    vjust = -0.4,
    direction = "x"
  ) +

  # Dashed horizontal reference line at the mean of the plotted subgroup means
  geom_hline(
    aes(yintercept = mean(Reading.by.Gender.EngagedCat, na.rm = TRUE)),
    linetype = "dashed",
    size = 1
  ) +

  # Same legend title and palette for bar fill and bar outline
  scale_fill_manual(name = "Gender of Students", values = c("Salmon", "Seagreen2")) +
  scale_colour_manual(name = "Gender of Students", values = c("Salmon", "Seagreen2")) +

  # Axis names and title
  labs(
    x = "Student Engagement in Reading",
    y = "Average PIRLS Reading",
    title = "PIRLS Reading Score by Gender and Engagement"
  ) +

  # Title styling, legend placed inside the panel, grid lines removed
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5, colour = "darkred"),
    legend.position = c(0.20, 0.15),
    legend.box.just = "bottom",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

Task5: Overlapping Histogram of Reading for three engagement groups

# Mean 'Reading' score of each engagement group; these means are used for the
# vertical reference lines (V-lines) of the histogram.
# PIRLS.nomiss is already the result of na.omit(), so it is piped in directly
# instead of being filtered a second time.
PIRLS.agg.reading.by.Engagement <- PIRLS.nomiss %>%
  group_by(EngagedCat.Labeled) %>%
  summarise(Reading.by.EngagedCat = mean(Reading))

# Task 5 chart: overlapping, semi-transparent histograms of Reading, one per
# engagement group (position = "identity" overlays them instead of stacking).
ggplot(PIRLS.nomiss, aes(Reading, colour = EngagedCat.Labeled, fill = EngagedCat.Labeled)) +
  geom_histogram(alpha = 0.2, position = "identity", binwidth = 30) +
  scale_color_manual(name = "Student Engagement", values = c("seagreen2", "salmon", "deepskyblue")) +
  scale_fill_manual(name = "Student Engagement", values = c("seagreen2", "salmon", "deepskyblue")) +

  # The x axis carries the reading score itself (engagement is shown through
  # colour), so the axis is named after the score
  xlab("PIRLS Reading Score") +

  # Insert the mean vertical reference line of each engagement group
  geom_vline(
    data = PIRLS.agg.reading.by.Engagement,
    aes(xintercept = Reading.by.EngagedCat, linetype = EngagedCat.Labeled),
    size = 0.5
  ) +
  scale_linetype_manual(name = "Student Engagement", values = c(1, 2, 3)) +

  # Customize the title
  ggtitle("Histogram of Reading Score by Student Engagement") +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5, colour = "darkred")) +

  # Adjust legend and background
  theme(legend.position = c(0.16, 0.5), legend.box.just = "bottom") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

Task6: Scatterplot between students’ engagement in reading and Reading Performance

# Task 6 chart: reading engagement (x) against reading performance (y)
ggplot(PIRLS.nomiss, aes(x = Engaged, y = Reading)) +
  geom_point(colour = "salmon") +

  # Pattern from linear regression: the relationship is slightly positive;
  # student reading performance increases along with reading engagement
  geom_smooth(aes(colour = "lm"), method = "lm", se = FALSE) +

  # Pattern from loess, drawn on top for comparison
  geom_smooth(aes(colour = "loess"), method = "loess", se = FALSE) +

  # Pearson correlation, placed at x = 1.5, y = 680 on the data scale
  stat_cor(
    method = "pearson",
    label.x = 1.5,
    label.y = 680,
    colour = "darkred"
  ) +

  # Legend that tells the two smoothing methods apart
  scale_colour_manual(values = c(lm = "seagreen2", loess = "darkorchid")) +

  # Axis names, title and legend title
  labs(
    x = "Students Engagement in reading",
    y = "Students Performance in Reading",
    title = "Reading Engagement vs Reading Performance",
    colour = "Method"
  ) +

  # Title styling, legend placed inside the panel, grid lines removed
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5, colour = "darkred"),
    legend.position = c(0.1, 0.9),
    legend.box.just = "bottom",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

The pattern from the Loess method differs slightly from the one given by the linear regression method: there is a dip in the middle, followed by a steady increase, and the curve ends with a slight drop.

Task7: Scatterplot of Engagement and Reading by BulliedCat

# Task 7 chart: engagement vs reading, with points, regression lines and
# correlations split by bullying frequency.
# PIRLS.nomiss already has the missing data removed (it is the na.omit()
# result), so it is plotted directly.
ggplot(PIRLS.nomiss, aes(Engaged, Reading)) +
  xlab("Students Engagement in reading") +
  ylab("Students Performance in Reading") +

  # Use linear regression to identify the pattern within each subgroup
  geom_point(aes(colour = BulliedCat.Labeled), size = 2, shape = 16, alpha = 0.5) +
  geom_smooth(
    aes(colour = BulliedCat.Labeled, linetype = BulliedCat.Labeled),
    method = "lm",
    se = FALSE,
    size = 1.2
  ) +

  # Custom colours and line types; both scales share one legend title
  scale_colour_manual(
    name = "Frequency of being bullied",
    values = c("salmon", "seagreen2", "darkorchid")
  ) +
  scale_linetype_manual(name = "Frequency of being bullied", values = c(1, 2, 3)) +

  # Title is set once; the second, identical title call and the repeated
  # title/grid theme block of the earlier version had no effect on the plot
  ggtitle("Reading Engagement vs Reading Performance BY Bullied Category") +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5, colour = "darkred")) +

  # Adjust legend and background
  theme(legend.position = c(0.15, 0.10), legend.box.just = "bottom") +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +

  # Correlation; mapping colour to the subgroup variable divides it by subgroup
  stat_cor(
    mapping = aes(colour = BulliedCat.Labeled),
    method = "pearson",
    label.x = 2
  )

## `geom_smooth()` using formula 'y ~ x'

According to the linear regression (lm) fits, the relationship between the two variables is slightly positive in every bullied category. That means the two variables tend to move in the same direction.

Task8: ANOVA computation

# Task 8 chart: the Task 4 bar chart again, with ANOVA results attached
ggplot(
  PIRLS.agg.Reading.by.Gender.EngagedCat,
  aes(x = EngagedCat.Labeled, y = Reading.by.Gender.EngagedCat)
) +
  geom_bar(
    aes(colour = Gender, fill = Gender),
    stat = "identity",
    position = "dodge"
  ) +

  # Print the mean (rounded to 2 decimals) as a label for each bar
  geom_text_repel(
    aes(label = paste(round(Reading.by.Gender.EngagedCat, 2))),
    vjust = -0.4,
    direction = "x"
  ) +

  # Dashed horizontal reference line at the mean of the plotted subgroup means
  geom_hline(
    aes(yintercept = mean(Reading.by.Gender.EngagedCat, na.rm = TRUE)),
    linetype = "dashed",
    size = 1
  ) +

  # Overall ANOVA: goes back to the row-level dataset (PIRLS.nomiss), not the
  # aggregated means, and tests Reading across the engagement levels
  stat_compare_means(
    mapping = aes(x = EngagedCat.Labeled, y = Reading),
    data = PIRLS.nomiss,
    method = "anova",
    size = 6,
    colour = "darkorchid"
  ) +

  # ANOVA of Engagement by Gender: same data, with colour mapped to Gender
  stat_compare_means(
    mapping = aes(x = EngagedCat.Labeled, y = Reading, colour = Gender),
    data = PIRLS.nomiss,
    method = "anova",
    size = 4,
    label.y = 200
  ) +

  # Same legend title and palette for bar fill and bar outline
  scale_fill_manual(name = "Gender of Students", values = c("Salmon", "Seagreen2")) +
  scale_colour_manual(name = "Gender of Students", values = c("Salmon", "Seagreen2")) +

  # Axis names and title
  labs(
    x = "Student Engagement in Reading",
    y = "Average PIRLS Reading",
    title = "PIRLS Reading Score by Gender and Engagement"
  ) +

  # Title styling, legend placed inside the panel, grid lines removed
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5, colour = "darkred"),
    legend.position = c(0.80, 0.90),
    legend.box.just = "bottom",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

Task9: Visualize correlation results between Reading, Confidence, Engaged, and LikeReading (Multivariate continuous data)

We will use package ggcorrplot to create this graph

# Variables that enter the correlation matrix
variables.to.use <- c("Reading", "Confidence", "Engaged", "LikeReading")

# Pearson correlation between every pair of the selected variables
PIRLS.corr <- cor(
  PIRLS.nomiss[variables.to.use],
  method = "pearson",
  use = "pairwise.complete.obs"
)

# Visualize the correlation matrix.
# The palette is passed through the full argument name `colors`; the earlier
# spelling `color` only reached it through partial argument matching.
ggcorrplot(
  PIRLS.corr,
  p.mat = cor_pmat(PIRLS.nomiss[variables.to.use]),
  hc.order = TRUE,
  type = "lower",
  colors = c("Salmon", "Seagreen2", "dodgerblue"), # for low, medium, high correlations
  outline.color = "darkgoldenrod1", # frame color
  lab = TRUE,
  legend.title = "Correlation",
  pch = 4, # shape attached to non-significant cells; 4 is a cross
  pch.cex = 12, # size of that shape
  lab_size = 6 # label size
) +
  labs(title = "Correlations between continuous variables") +

  # Customize title
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5, colour = "darkred")) +

  # Customize legend
  theme(legend.position = c(0.10, 0.80), legend.box.just = "bottom") +

  # Customize background grid
  theme(
    panel.grid.major = element_line(color = "firebrick"),
    panel.grid.minor = element_line(color = "firebrick"),
    panel.border = element_rect(fill = NA, color = "blue")
  )

Task10: A Scatter plot matrix of variables by subgroups with GGally

# Columns shown in the matrix; Gender is included so it can drive the colour
variables.to.use.task10 <- c("Reading", "Confidence", "Engaged", "LikeReading", "Gender")

# Scatter plot matrix of the selected columns, coloured by Gender
ggpairs(
  PIRLS.nomiss[variables.to.use.task10],
  mapping = aes(colour = Gender)
) +
  theme_bw() +
  ggtitle("PIRLS matrix") +

  # Grid lines removed and title styled on top of the black-and-white theme
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5, colour = "darkred")
  )

Data Visualization Practice #9_Advance-GGplot

Tarid Wongvorachan

June 8th, 2019