R Notebook

# Load the semicolon-delimited grade sheet, keeping text columns as character
# vectors (spelled FALSE rather than F, because F can be reassigned).
df <- read.csv(
  "https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/High_school_Grade.csv",
  sep = ";",
  stringsAsFactors = FALSE
)

# Rename the columns. Fail loudly if the file does not have the four columns
# the names below assume, instead of silently producing NA column names.
stopifnot(ncol(df) == 4L)
names(df) <- c("ID", "Student_Name", "Venue", "Grade")

# Fix misspelled student names: the words "Literature" and "Math" appear in
# some names where "VAN" and "TOAN" are meant. The patterns are plain text,
# so match them literally rather than as regular expressions.
df$Student_Name <- gsub("Literature", "VAN", df$Student_Name, fixed = TRUE)
df$Student_Name <- gsub("Math", "TOAN", df$Student_Name, fixed = TRUE)

head(df)

##          ID        Student_Name               Venue
## 1 DTN000001           LO THI AN Nong Lam University
## 2 DTN000002 BUI NGUYEN THAO ANH Nong Lam University
## 3 DTN000003     CHU THI LAN ANH Nong Lam University
## 4 DTN000004   DUONG THI VAN ANH Nong Lam University
## 5 DTN000005   DOAN THI NGOC ANH Nong Lam University
## 6 DTN000006       HOANG MAI ANH Nong Lam University
##                                                                                      Grade
## 1 Math:   4.25   Literature:   7.50   Physics:   6.80   Chemitry:   6.00   Biology:   6.00
## 2                   Math:   5.75   Literature:   6.00   Chemitry:   6.00   Biology:   5.80
## 3                  Math:   2.25   Literature:   7.75   History:   4.50   Geography:   8.00
## 4                  Math:   3.25   Literature:   7.75   History:   7.50   Geography:   7.25
## 5                  Math:   1.75   Literature:   8.25   History:   4.50   Geography:   7.75
## 6 Math:   8.75   Literature:   7.00   Physics:   8.40   Chemitry:   6.60   English:   6.78

Rewrite the text in the Grade column into a more regular pattern (each "Subject:   mark" pair becomes "Subject_mark")

library(tidyr)

## Warning: package 'tidyr' was built under R version 3.2.5

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.2.5

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(rebus)
library(stringr)

## Warning: package 'stringr' was built under R version 3.2.5

## 
## Attaching package: 'stringr'

## The following object is masked from 'package:rebus':
## 
##     regex

# Add Grade1: a copy of Grade in which every colon plus the whitespace that
# follows it is collapsed to "_", giving "Subject_mark" tokens.
df1 <- dplyr::mutate(
  df,
  Grade1 = str_replace_all(
    Grade,
    pattern = ":" %R% one_or_more(SPC),
    "_"
  )
)

head(df1)

##          ID        Student_Name               Venue
## 1 DTN000001           LO THI AN Nong Lam University
## 2 DTN000002 BUI NGUYEN THAO ANH Nong Lam University
## 3 DTN000003     CHU THI LAN ANH Nong Lam University
## 4 DTN000004   DUONG THI VAN ANH Nong Lam University
## 5 DTN000005   DOAN THI NGOC ANH Nong Lam University
## 6 DTN000006       HOANG MAI ANH Nong Lam University
##                                                                                      Grade
## 1 Math:   4.25   Literature:   7.50   Physics:   6.80   Chemitry:   6.00   Biology:   6.00
## 2                   Math:   5.75   Literature:   6.00   Chemitry:   6.00   Biology:   5.80
## 3                  Math:   2.25   Literature:   7.75   History:   4.50   Geography:   8.00
## 4                  Math:   3.25   Literature:   7.75   History:   7.50   Geography:   7.25
## 5                  Math:   1.75   Literature:   8.25   History:   4.50   Geography:   7.75
## 6 Math:   8.75   Literature:   7.00   Physics:   8.40   Chemitry:   6.60   English:   6.78
##                                                                      Grade1
## 1 Math_4.25   Literature_7.50   Physics_6.80   Chemitry_6.00   Biology_6.00
## 2                Math_5.75   Literature_6.00   Chemitry_6.00   Biology_5.80
## 3               Math_2.25   Literature_7.75   History_4.50   Geography_8.00
## 4               Math_3.25   Literature_7.75   History_7.50   Geography_7.25
## 5               Math_1.75   Literature_8.25   History_4.50   Geography_7.75
## 6 Math_8.75   Literature_7.00   Physics_8.40   Chemitry_6.60   English_6.78

Based on the Grade1 variable, we will split each student's record into multiple rows, one per subject

# Split Grade1 on runs of whitespace so that every "Subject_mark" token gets
# its own row; the other columns are repeated for each new row.
df2 <- separate_rows(df1, Grade1, sep = one_or_more(SPC))

head(df2)

##          ID        Student_Name               Venue
## 1 DTN000001           LO THI AN Nong Lam University
## 2 DTN000001           LO THI AN Nong Lam University
## 3 DTN000001           LO THI AN Nong Lam University
## 4 DTN000001           LO THI AN Nong Lam University
## 5 DTN000001           LO THI AN Nong Lam University
## 6 DTN000002 BUI NGUYEN THAO ANH Nong Lam University
##                                                                                      Grade
## 1 Math:   4.25   Literature:   7.50   Physics:   6.80   Chemitry:   6.00   Biology:   6.00
## 2 Math:   4.25   Literature:   7.50   Physics:   6.80   Chemitry:   6.00   Biology:   6.00
## 3 Math:   4.25   Literature:   7.50   Physics:   6.80   Chemitry:   6.00   Biology:   6.00
## 4 Math:   4.25   Literature:   7.50   Physics:   6.80   Chemitry:   6.00   Biology:   6.00
## 5 Math:   4.25   Literature:   7.50   Physics:   6.80   Chemitry:   6.00   Biology:   6.00
## 6                   Math:   5.75   Literature:   6.00   Chemitry:   6.00   Biology:   5.80
##            Grade1
## 1       Math_4.25
## 2 Literature_7.50
## 3    Physics_6.80
## 4   Chemitry_6.00
## 5    Biology_6.00
## 6       Math_5.75

Separate subject and mark into two independent variables

# Break each "Subject_mark" token at the underscore into two columns, then
# keep only those two columns for plotting.
df3 <- df2 %>%
  separate(col = Grade1, into = c("Subject", "Mark"), sep = "_") %>%
  select(Subject, Mark)

head(df3)

##      Subject Mark
## 1       Math 4.25
## 2 Literature 7.50
## 3    Physics 6.80
## 4   Chemitry 6.00
## 5    Biology 6.00
## 6       Math 5.75

Use ggplot2 to produce a histogram of the marks

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.2.5

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:rebus':
## 
##     alpha

# The marks were split out of text, so convert them to numbers before plotting.
df3[["Mark"]] <- as.numeric(as.character(df3[["Mark"]]))

# Histogram of all marks in 0.5-wide bins from 0 to 10, with a centred title.
ggplot(data = df3, aes(x = Mark)) +
  geom_histogram(breaks = seq(0, 10, by = 0.5), color = 4, fill = 3) +
  ggtitle("Distribution of High School Grades") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

# Density of high school grades: the same 0.5-wide bins, but with bar heights
# scaled to density so that a kernel density curve can be drawn on top.
# `..density..` is the legacy spelling of this computed variable; newer
# ggplot2 releases also accept after_stat(density).
ggplot(data = df3, aes(x = Mark)) +
  geom_histogram(
    aes(y = ..density..),
    breaks = seq(0, 10, by = 0.5),
    color = 4,
    fill = 3
  ) +
  ggtitle("Proportion of High School Grades") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_density(color = 2)

High school grades by subject

# One histogram panel per subject, using the same 0.5-wide bins from 0 to 10.
ggplot(data = df3, aes(x = Mark)) +
  geom_histogram(breaks = seq(0, 10, by = 0.5), color = 4, fill = 3) +
  ggtitle("Distribution of High School Grades") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5)) +
  facet_wrap(~Subject) +
  ylab("Number of Students")