This analysis is based on Annual School Census 2016 and Private School Census 2016 prepared by the Punjab Education Sector Reform Programme (PESRP) of the School Education Department Punjab.Thsi data was made publically available in August 2017.
We start with loading the two census files:
library(readxl)
library(plyr)
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
library(grid)
library(gridExtra)
pub_schools = read_excel("Census_Oct_2018.xlsx")
prv_schools = read_excel("private_schools_census_2016.xlsx")
The first data contitioning step that we perform here is to convert Year columns as ‘Factors’
pub_schools$est_year = as.factor(pub_schools$est_year)
prv_schools$EstablishmentYear = as.factor(prv_schools$EstablishmentYear)
Next we calculate yearly count of both type of school and merge the data set together.
d1 = ddply(pub_schools, "est_year", summarise, schools = length(school_id))
names(d1) = c("Year", "Schools_Established")
d1$SchoolType = 'Public'
pub_schools_peryear = subset(d1, (Year!=0 & Year!='NULL'))
pub_schools_peryear = droplevels(pub_schools_peryear)
d2 = ddply(prv_schools, "EstablishmentYear", summarise, schools = length(emiscode))
names(d2) = c("Year", "Schools_Established")
d2$SchoolType = 'Private'
pvt_schools_peryear = d2
allSchools_perYear = merge(pub_schools_peryear, pvt_schools_peryear, all=TRUE)
Create a special function to be called for evenly pacing the x axis titles
every_nth = function(n) {
return(function(x) {x[c(TRUE, rep(FALSE, n - 1))]})
}
We create our first chart by the title “Growth of Private and Public Schools over the years”
chart_yearly = ggplot(allSchools_perYear, aes(x=Year, y=Schools_Established, fill=SchoolType)) +
geom_bar(position = position_dodge(), stat="identity") +
scale_y_continuous("No of New Schools") +
theme(axis.text.x = element_text(angle=90, vjust = 0.5)) +
theme(legend.position=c(0,1), legend.justification=c(0,1), legend.background=element_blank()) +
labs(fill = "School Type") + scale_x_discrete("Year", breaks=every_nth(n = 10)) +
annotate("text", x="1947", y = 2000, label="1947 - Independence") +
annotate("segment", x="1947", xend="1947", y=0, yend=2000)+
scale_fill_brewer(palette ="Set1") +
ggtitle("Growth of Private and Public Schools over the years") +
theme(plot.title = element_text(size = 8),
axis.title.x=element_blank())
We now move to the smaller visualizations:
school_Totals= ddply(allSchools_perYear, "SchoolType", summarise, total_schools = sum(Schools_Established))
chart_numSchools = ggplot(school_Totals,aes(x=SchoolType, y=total_schools, fill=SchoolType)) +
geom_bar(position = position_dodge(),stat="identity") +
geom_text(aes(y=total_schools-(total_schools*.10), label=total_schools), colour="black", size=3) +
scale_fill_brewer(palette ="Set1") +
coord_flip() +
ggtitle("Number of Schools") +
theme(legend.position="none") +
theme(plot.title = element_text(size = 8),
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank())
For getting student and teacher counts we do the following steps:
d3 = ddply(pub_schools, "est_year", summarise, total = sum(Teachers))#, students = sum(enrollment))
names(d3) = c("Year", "Totals")
d3$Category = 'Teachers'
d3$SchoolType = 'Public'
d4 = ddply(pub_schools, "est_year", summarise, total = sum(enrollment))
names(d4) = c("Year", "Totals")
d4$Category = 'Students'
d4$SchoolType = 'Public'
d5 = ddply(prv_schools, "EstablishmentYear", summarise, total = sum(`Teaching Staff`))#, students = sum(Enrollment))
names(d5) = c("Year", "Totals")
d5$Category = 'Teachers'
d5$SchoolType = 'Private'
d6 = ddply(prv_schools, "EstablishmentYear", summarise, total = sum(Enrollment))
names(d6) = c("Year", "Totals")
d6$Category = 'Students'
d6$SchoolType = 'Private'
d7 = merge(d3, d4, all=TRUE)
d8 = merge(d5, d6, all=TRUE)
school_enrollments_1 = merge(d7, d8, all=TRUE)
school_enrollments_1 = subset(school_enrollments_1, (Year!=0 & Year!='NULL'))
school_enrollments_1 = droplevels(school_enrollments_1)
school_enrollments_2 = ddply(school_enrollments_1, c("SchoolType", "Category"), summarise, Total = sum(Totals, na.rm=T ))
school_enrollments_Students = subset(school_enrollments_2, (Category== 'Students'))
school_enrollments_Teachers = subset(school_enrollments_2, (Category== 'Teachers'))
Now that we have the necessary dataset for Teacher / Student count visuals we can create the charts:
chart_students = ggplot(school_enrollments_Students, aes(x=SchoolType, y=Total, fill=SchoolType)) +
geom_bar(position = position_dodge(), stat="identity") +
geom_text(aes(y=Total-(Total*.10), label=Total), colour="black", size=3) +
scale_fill_brewer(palette ="Set1") +
coord_flip() +
ggtitle("Number of Students Enrolled") +
theme(legend.position="none") +
theme(plot.title = element_text(size = 8),
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank())
chart_teachers = ggplot(school_enrollments_Teachers, aes(x=SchoolType, y=Total, fill=SchoolType)) +
geom_bar(position = position_dodge(), stat="identity") +
geom_text(aes(y=Total-(Total*0), label=Total), colour="black", size=3) +
scale_fill_brewer(palette ="Set1") +
coord_flip() +
ggtitle("Number of Teachers") +
theme(legend.position="none") +
theme(plot.title = element_text(size = 8),
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank())
Number of medals is not available in the original dataset hence we create a data frame.
NumberofMedalsin2017 <- data.frame("SchoolType" = c("Public", "Private"), "Medals" = c(38, 102))
#str(NumberofMedalsin2017)
chart_medals = ggplot(NumberofMedalsin2017, aes(x=SchoolType, y="Medals", color=SchoolType, size=Medals)) +
geom_point() +
scale_size(range = c(12, 25)) +
theme(legend.position="none") +
scale_color_brewer(palette="Set1") +
geom_text(aes(y="Medals", label=Medals), colour="black", size=6) +
ggtitle("Total medals won by public and private schools in SSC/Matric Exam 2017 in Punjab") +
theme(plot.title = element_text(size = 8),
axis.title.x=element_blank(),
#axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank(),
axis.text.y=element_blank())
We finally bring all the charts together:
tg1 = textGrob("There have been a mushroom growth in private schools; \nOver 40,000 schools have been established since 2006.", gp=gpar(fontfamily = "Calibri", cex=.8), just="center")
tg2 = textGrob("The most surprising figure is the teacher to student ratio \nin public vs private schools", gp=gpar(fontfamily = "Calibri", cex=.8), just="center")
tg3 = textGrob("In 2007 HSSC examination, it was observed that private \nschool students score 250% more medals than public \nschool students.", gp=gpar(fontfamily = "Calibri", cex=.8), just="center")
grid.arrange(tg1, chart_numSchools, chart_students, tg2, chart_teachers, tg3, chart_medals, chart_yearly, nrow = 7, ncol = 2,
heights = c(1,1,1,1,1,1,2),
layout_matrix = rbind(c(1,8), c(2,8), c(3,8), c(4,8), c(5,8), c(6,8), c(7,8)))
** note about the visualization: There are a few things that I could not fix given the time – TextGrobs could not be aligned to the left and I believe that they are adding significantly to the clutter.