# NOTE(review): setwd() with an absolute, machine-specific path makes this
# script non-portable. It is kept because the relative file name below is
# resolved against it; prefer launching R from the project directory.
setwd("D:/Class Materials & Work/ED_Psych 521- Data Management and Visualiztion/R Stat Package/R_Lesson 7- Advance Visualization_GGplot2/Practice 9")

# 1.1 Read the file (first row holds the column names)
PIRLS_Data_Merged <- read.csv(
  "PIRLS2011G4_MergedSample_ggplot2.csv",
  header = TRUE
)

# Ordered, labelled versions of the 1/2/3 category codes
PIRLS_Data_Merged$EngagedCat.Labeled <- ordered(
  PIRLS_Data_Merged$EngagedCat,
  levels = c(1, 2, 3),
  labels = c("Engaged", "Somewhat Engaged", "Not Engaged")
)
PIRLS_Data_Merged$BulliedCat.Labeled <- ordered(
  PIRLS_Data_Merged$BulliedCat,
  levels = c(1, 2, 3),
  labels = c("Never", "Monthly", "Weekly")
)

# 3.1 Labelled factor for the 0/1 'Female' indicator
PIRLS_Data_Merged$Gender <- factor(
  PIRLS_Data_Merged$Female,
  levels = c(0, 1),
  labels = c("Male", "Female")
)

# Listwise deletion: drop every row that has a missing value in any column
PIRLS.nomiss <- na.omit(PIRLS_Data_Merged)
#4.1 Aggregate the data 'EngagedCat' and 'Gender' by computing average reading score for each subgroup
#Activate packages
library(ggrepel)
library(ggpubr)
library(ggcorrplot)
library(tidyverse)
library(GGally)
# Mean reading score for every Gender x Engagement subgroup.
# PIRLS.nomiss is already the result of na.omit(), so a second NA pass is
# unnecessary. summarise() only peels off the last grouping variable, so the
# result would stay grouped by Gender; ungroup() returns a plain tibble.
PIRLS.agg.Reading.by.Gender.EngagedCat <- PIRLS.nomiss %>%
  group_by(Gender, EngagedCat.Labeled) %>%
  summarise(Reading.by.Gender.EngagedCat = mean(Reading)) %>%
  ungroup()
# Task 4 chart: dodged bars of mean reading score by engagement level,
# split by gender.
ggplot(
  PIRLS.agg.Reading.by.Gender.EngagedCat,
  aes(x = EngagedCat.Labeled, y = Reading.by.Gender.EngagedCat)
) +
  geom_bar(
    aes(color = Gender, fill = Gender),
    stat = "identity",
    position = "dodge"
  ) +
  # Label each bar with its mean, rounded to two decimals
  geom_text_repel(
    aes(label = paste(round(Reading.by.Gender.EngagedCat, 2))),
    vjust = -0.4,
    direction = "x"
  ) +
  # Dashed horizontal reference line. Note: this is the mean of the plotted
  # subgroup means (each subgroup weighted equally), computed from the
  # aggregated table rather than from the student-level scores.
  geom_hline(
    aes(yintercept = mean(Reading.by.Gender.EngagedCat, na.rm = TRUE)),
    linetype = "dashed",
    size = 1
  ) +
  # Axis labels and title
  labs(
    x = "Student Engagement in Reading",
    y = "Average PIRLS Reading",
    title = "PIRLS Reading Score by Gender and Engagement"
  ) +
  # Same name and palette on both scales so fill and outline share one legend
  scale_fill_manual(
    name = "Gender of Students",
    values = c("Salmon", "Seagreen2")
  ) +
  scale_colour_manual(
    name = "Gender of Students",
    values = c("Salmon", "Seagreen2")
  ) +
  # Title styling, legend placement inside the panel, and no grid lines
  theme(
    plot.title = element_text(
      face = "bold", size = 14, hjust = 0.5, colour = "darkred"
    ),
    legend.position = c(0.20, 0.15),
    legend.box.just = "bottom",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
# Mean reading score per engagement level; these means are drawn as the
# vertical reference lines on the histogram.
PIRLS.agg.reading.by.Engagement <- PIRLS.nomiss %>%
  na.omit() %>%
  group_by(EngagedCat.Labeled) %>%
  summarise(Reading.by.EngagedCat = mean(Reading))
# Overlaid histograms of the reading score, one per engagement level, with a
# vertical line at each level's mean.
ggplot(
  PIRLS.nomiss,
  aes(Reading, colour = EngagedCat.Labeled, fill = EngagedCat.Labeled)
) +
  # position = "identity" overlays the groups instead of stacking them;
  # the low alpha keeps overlapping bars readable
  geom_histogram(alpha = 0.2, position = "identity", binwidth = 30) +
  # Mean reference line per engagement level, taken from the aggregated table
  geom_vline(
    data = PIRLS.agg.reading.by.Engagement,
    aes(xintercept = Reading.by.EngagedCat, linetype = EngagedCat.Labeled),
    size = 0.5
  ) +
  # All three scales share one name so they are shown as a single legend
  scale_color_manual(
    name = "Student Engagement",
    values = c("seagreen2", "salmon", "deepskyblue")
  ) +
  scale_fill_manual(
    name = "Student Engagement",
    values = c("seagreen2", "salmon", "deepskyblue")
  ) +
  scale_linetype_manual(name = "Student Engagement", values = c(1, 2, 3)) +
  # The x axis is the reading score itself; engagement is encoded by
  # colour/fill/linetype, so the axis must not be labelled as engagement.
  labs(
    x = "PIRLS Reading Score",
    title = "Histogram of Reading Score by Student Engagement"
  ) +
  # Title styling, legend placement inside the panel, and no grid lines
  theme(
    plot.title = element_text(
      face = "bold", size = 14, hjust = 0.5, colour = "darkred"
    ),
    legend.position = c(0.16, 0.5),
    legend.box.just = "bottom",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
# Scatterplot of reading engagement against reading performance with a
# linear and a loess trend line plus the Pearson correlation.
ggplot(PIRLS.nomiss, aes(Engaged, Reading)) +
  geom_point(colour = "salmon") +
  # Linear regression trend. The constant colour = "lm" inside aes() exists
  # only to create a legend entry for the method.
  # The relationship is slightly positive: reading performance increases
  # along with reading engagement.
  geom_smooth(aes(colour = "lm"), method = "lm", se = FALSE) +
  # Loess trend, labelled "loess" in the same legend
  geom_smooth(aes(colour = "loess"), method = "loess", se = FALSE) +
  # Pearson correlation annotation at a fixed position
  stat_cor(
    method = "pearson",
    label.x = 1.5,
    label.y = 680,
    colour = "darkred"
  ) +
  # Axis labels, title and legend title
  labs(
    x = "Students Engagement in reading",
    y = "Students Performance in Reading",
    title = "Reading Engagement vs Reading Performance",
    colour = "Method"
  ) +
  # Named values tie each colour to its method explicitly instead of relying
  # on the alphabetical order of the legend keys
  scale_colour_manual(values = c(lm = "seagreen2", loess = "darkorchid")) +
  # Title styling, legend placement inside the panel, and no grid lines
  theme(
    plot.title = element_text(
      face = "bold", size = 14, hjust = 0.5, colour = "darkred"
    ),
    legend.position = c(0.1, 0.9),
    legend.box.just = "bottom",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# The pattern from the loess method differs slightly from the linear regression line: there is a dip in the middle, followed by a steady increase, and the curve ends with a slight drop.
### Task 7: Scatterplot of Engagement and Reading by BulliedCat
# Scatterplot of reading engagement against reading performance, with points,
# linear trend lines and Pearson correlations split by bullied category.
# PIRLS.nomiss is already the result of na.omit(), so it is used directly.
ggplot(PIRLS.nomiss, aes(Engaged, Reading)) +
  geom_point(
    aes(colour = BulliedCat.Labeled),
    size = 2,
    shape = 16,
    alpha = 0.5
  ) +
  # One linear regression line per bullied category
  geom_smooth(
    aes(colour = BulliedCat.Labeled, linetype = BulliedCat.Labeled),
    method = "lm",
    se = FALSE,
    size = 1.2
  ) +
  # Mapping colour to the category makes stat_cor() report one correlation
  # per subgroup
  stat_cor(
    method = "pearson",
    label.x = 2,
    aes(colour = BulliedCat.Labeled)
  ) +
  # Colour and linetype share one name so they appear as a single legend
  scale_colour_manual(
    "Frequency of being bullied",
    values = c("salmon", "seagreen2", "darkorchid")
  ) +
  scale_linetype_manual("Frequency of being bullied", values = c(1, 2, 3)) +
  # Axis labels and title (set once; the original set the same title twice)
  labs(
    x = "Students Engagement in reading",
    y = "Students Performance in Reading",
    title = "Reading Engagement vs Reading Performance BY Bullied Category"
  ) +
  # Title styling, legend placement inside the panel, and no grid lines.
  # A second theme() call repeating the same title/grid settings was dropped:
  # it left colour unset, so the earlier darkred title colour was kept anyway.
  theme(
    plot.title = element_text(
      face = "bold", size = 14, hjust = 0.5, colour = "darkred"
    ),
    legend.position = c(0.15, 0.10),
    legend.box.just = "bottom",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
## `geom_smooth()` using formula 'y ~ x'
# Based on the linear regression (lm) fits, the relationship between the two variables is slightly positive in every bullied category. That means the two variables tend to move in the same direction.
### Task 8: ANOVA computation
# Chart from task 4 (mean reading score by engagement and gender), extended
# with ANOVA results computed on the student-level data.
ggplot(
  PIRLS.agg.Reading.by.Gender.EngagedCat,
  aes(x = EngagedCat.Labeled, y = Reading.by.Gender.EngagedCat)
) +
  geom_bar(
    aes(color = Gender, fill = Gender),
    stat = "identity",
    position = "dodge"
  ) +
  # Label each bar with its mean, rounded to two decimals
  geom_text_repel(
    aes(label = paste(round(Reading.by.Gender.EngagedCat, 2))),
    vjust = -0.4,
    direction = "x"
  ) +
  # Dashed horizontal reference line at the mean of the plotted subgroup means
  geom_hline(
    aes(yintercept = mean(Reading.by.Gender.EngagedCat, na.rm = TRUE)),
    linetype = "dashed",
    size = 1
  ) +
  # Overall ANOVA of Reading across engagement levels. The layer gets its own
  # data and mapping because the test needs the individual scores in
  # PIRLS.nomiss, not the aggregated means the bars are drawn from.
  stat_compare_means(
    method = "anova",
    data = PIRLS.nomiss,
    mapping = aes(x = EngagedCat.Labeled, y = Reading),
    size = 6,
    colour = "darkorchid"
  ) +
  # ANOVA of Reading across engagement levels, separately for each gender
  # (colour = Gender splits the test by subgroup)
  stat_compare_means(
    method = "anova",
    data = PIRLS.nomiss,
    mapping = aes(x = EngagedCat.Labeled, y = Reading, colour = Gender),
    size = 4,
    label.y = 200
  ) +
  # Axis labels and title
  labs(
    x = "Student Engagement in Reading",
    y = "Average PIRLS Reading",
    title = "PIRLS Reading Score by Gender and Engagement"
  ) +
  # Same name and palette on both scales so fill and outline share one legend
  scale_fill_manual(
    name = "Gender of Students",
    values = c("Salmon", "Seagreen2")
  ) +
  scale_colour_manual(
    name = "Gender of Students",
    values = c("Salmon", "Seagreen2")
  ) +
  # Title styling, legend placement inside the panel, and no grid lines
  theme(
    plot.title = element_text(
      face = "bold", size = 14, hjust = 0.5, colour = "darkred"
    ),
    legend.position = c(0.80, 0.90),
    legend.box.just = "bottom",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
# We will use the ggcorrplot package to create this graph.
# Continuous variables to correlate
variables.to.use <- c("Reading", "Confidence", "Engaged", "LikeReading")

# Pearson correlation matrix of the selected variables
PIRLS.corr <- cor(
  PIRLS.nomiss[variables.to.use],
  method = "pearson",
  use = "pairwise.complete.obs"
)

# Visualize the correlation matrix
ggcorrplot(
  PIRLS.corr,
  # p-values used to flag non-significant correlations
  p.mat = cor_pmat(PIRLS.nomiss[variables.to.use]),
  hc.order = TRUE,
  type = "lower",
  # Colours for low, medium and high correlations. The argument is `colors`;
  # the original `color =` only worked through partial argument matching.
  colors = c("Salmon", "Seagreen2", "dodgerblue"),
  outline.color = "darkgoldenrod1", # frame colour of each tile
  lab = TRUE,
  legend.title = "Correlation",
  pch = 4, # shape drawn on non-significant cells; 4 is a cross
  pch.cex = 12, # size of that shape
  lab_size = 6 # size of the coefficient labels
) +
  labs(title = "Correlations between continuous variables") +
  # Title styling, legend placement inside the panel, coloured grid and border
  theme(
    plot.title = element_text(
      face = "bold", size = 14, hjust = 0.5, colour = "darkred"
    ),
    legend.position = c(0.10, 0.80),
    legend.box.just = "bottom",
    panel.grid.major = element_line(color = "firebrick"),
    panel.grid.minor = element_line(color = "firebrick"),
    panel.border = element_rect(fill = NA, color = "blue")
  )
# Columns shown in the pairs matrix: the four scores plus Gender
variables.to.use.task10 <- c(
  "Reading", "Confidence", "Engaged", "LikeReading", "Gender"
)

# Pairs matrix of the selected columns, coloured by gender
ggpairs(
  PIRLS.nomiss[variables.to.use.task10],
  mapping = aes(colour = Gender)
) +
  theme_bw() +
  ggtitle("PIRLS matrix") +
  # No grid lines; bold, centred, dark red title
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.title = element_text(
      face = "bold", size = 14, hjust = 0.5, colour = "darkred"
    )
  )