# Load the semicolon-separated high school grade records from GitHub.
# Strings are kept as character vectors (not factors) so that they can be
# edited with gsub() below.
df <- read.csv(
  "https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/High_school_Grade.csv",
  sep = ";",
  stringsAsFactors = FALSE
)

# Rename the columns
names(df) <- c("ID", "Student_Name", "Venue", "Grade")

# Fix misspelled student names: the subject words "Literature" and "Math"
# found in the name column are replaced with "VAN" and "TOAN".
# fixed = TRUE matches the words literally instead of as regular expressions.
df$Student_Name <- gsub("Literature", "VAN", df$Student_Name, fixed = TRUE)
df$Student_Name <- gsub("Math", "TOAN", df$Student_Name, fixed = TRUE)
head(df)
## ID Student_Name Venue
## 1 DTN000001 LO THI AN Nong Lam University
## 2 DTN000002 BUI NGUYEN THAO ANH Nong Lam University
## 3 DTN000003 CHU THI LAN ANH Nong Lam University
## 4 DTN000004 DUONG THI VAN ANH Nong Lam University
## 5 DTN000005 DOAN THI NGOC ANH Nong Lam University
## 6 DTN000006 HOANG MAI ANH Nong Lam University
## Grade
## 1 Math: 4.25 Literature: 7.50 Physics: 6.80 Chemitry: 6.00 Biology: 6.00
## 2 Math: 5.75 Literature: 6.00 Chemitry: 6.00 Biology: 5.80
## 3 Math: 2.25 Literature: 7.75 History: 4.50 Geography: 8.00
## 4 Math: 3.25 Literature: 7.75 History: 7.50 Geography: 7.25
## 5 Math: 1.75 Literature: 8.25 History: 4.50 Geography: 7.75
## 6 Math: 8.75 Literature: 7.00 Physics: 8.40 Chemitry: 6.60 English: 6.78
Reformat the text in the Grade column so that each subject and its mark form a single token (for example, Math_4.25)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rebus)
library(stringr)
## Warning: package 'stringr' was built under R version 3.2.5
##
## Attaching package: 'stringr'
## The following object is masked from 'package:rebus':
##
## regex
# Add a Grade1 column in which every colon together with the whitespace that
# follows it is replaced by an underscore, so each subject/mark pair becomes
# a single token such as "Math_4.25".
df1 <- df %>%
  dplyr::mutate(
    Grade1 = str_replace_all(
      Grade,
      pattern = ":" %R% one_or_more(SPC),
      replacement = "_"
    )
  )
head(df1)
## ID Student_Name Venue
## 1 DTN000001 LO THI AN Nong Lam University
## 2 DTN000002 BUI NGUYEN THAO ANH Nong Lam University
## 3 DTN000003 CHU THI LAN ANH Nong Lam University
## 4 DTN000004 DUONG THI VAN ANH Nong Lam University
## 5 DTN000005 DOAN THI NGOC ANH Nong Lam University
## 6 DTN000006 HOANG MAI ANH Nong Lam University
## Grade
## 1 Math: 4.25 Literature: 7.50 Physics: 6.80 Chemitry: 6.00 Biology: 6.00
## 2 Math: 5.75 Literature: 6.00 Chemitry: 6.00 Biology: 5.80
## 3 Math: 2.25 Literature: 7.75 History: 4.50 Geography: 8.00
## 4 Math: 3.25 Literature: 7.75 History: 7.50 Geography: 7.25
## 5 Math: 1.75 Literature: 8.25 History: 4.50 Geography: 7.75
## 6 Math: 8.75 Literature: 7.00 Physics: 8.40 Chemitry: 6.60 English: 6.78
## Grade1
## 1 Math_4.25 Literature_7.50 Physics_6.80 Chemitry_6.00 Biology_6.00
## 2 Math_5.75 Literature_6.00 Chemitry_6.00 Biology_5.80
## 3 Math_2.25 Literature_7.75 History_4.50 Geography_8.00
## 4 Math_3.25 Literature_7.75 History_7.50 Geography_7.25
## 5 Math_1.75 Literature_8.25 History_4.50 Geography_7.75
## 6 Math_8.75 Literature_7.00 Physics_8.40 Chemitry_6.60 English_6.78
Based on the Grade1 variable, we split each student's record into multiple rows, one row per subject
# Split Grade1 on runs of whitespace so that every subject/mark token gets
# its own row; the other columns are repeated for each new row.
df2 <- df1 %>%
  separate_rows(
    Grade1,
    sep = one_or_more(SPC)
  )
head(df2)
## ID Student_Name Venue
## 1 DTN000001 LO THI AN Nong Lam University
## 2 DTN000001 LO THI AN Nong Lam University
## 3 DTN000001 LO THI AN Nong Lam University
## 4 DTN000001 LO THI AN Nong Lam University
## 5 DTN000001 LO THI AN Nong Lam University
## 6 DTN000002 BUI NGUYEN THAO ANH Nong Lam University
## Grade
## 1 Math: 4.25 Literature: 7.50 Physics: 6.80 Chemitry: 6.00 Biology: 6.00
## 2 Math: 4.25 Literature: 7.50 Physics: 6.80 Chemitry: 6.00 Biology: 6.00
## 3 Math: 4.25 Literature: 7.50 Physics: 6.80 Chemitry: 6.00 Biology: 6.00
## 4 Math: 4.25 Literature: 7.50 Physics: 6.80 Chemitry: 6.00 Biology: 6.00
## 5 Math: 4.25 Literature: 7.50 Physics: 6.80 Chemitry: 6.00 Biology: 6.00
## 6 Math: 5.75 Literature: 6.00 Chemitry: 6.00 Biology: 5.80
## Grade1
## 1 Math_4.25
## 2 Literature_7.50
## 3 Physics_6.80
## 4 Chemitry_6.00
## 5 Biology_6.00
## 6 Math_5.75
Separate subject and mark into two independent variables
# Break each Grade1 token at the underscore into a Subject column and a Mark
# column, then keep only those two columns.
df3 <- df2 %>%
  separate(
    col = Grade1,
    into = c("Subject", "Mark"),
    sep = "_"
  ) %>%
  select(Subject, Mark)
head(df3)
## Subject Mark
## 1 Math 4.25
## 2 Literature 7.50
## 3 Physics 6.80
## 4 Chemitry 6.00
## 5 Biology 6.00
## 6 Math 5.75
Use ggplot2 to produce a histogram of the marks
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:rebus':
##
## alpha
# Convert Mark to numeric for plotting (going through character first also
# copes with a factor column).
df3[["Mark"]] <- as.numeric(as.character(df3[["Mark"]]))

# Count histogram of all marks, using half-point bins from 0 to 10 and a
# centred title.
ggplot(data = df3, mapping = aes(x = Mark)) +
  geom_histogram(
    breaks = seq(0, 10, by = 0.5),
    color = 4,
    fill = 3
  ) +
  ggtitle("Distribution of High School Grades") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

# Density of high school grades: the same bins scaled to density, with a
# kernel density curve drawn on top.
ggplot(data = df3, mapping = aes(x = Mark)) +
  geom_histogram(
    mapping = aes(y = ..density..),
    breaks = seq(0, 10, by = 0.5),
    color = 4,
    fill = 3
  ) +
  ggtitle("Proportion of High School Grades") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_density(color = 2)
High school grades by subject
# Count histogram of marks drawn separately for each subject (one panel per
# Subject value), using half-point bins from 0 to 10.
ggplot(data = df3, mapping = aes(x = Mark)) +
  geom_histogram(
    breaks = seq(0, 10, by = 0.5),
    color = 4,
    fill = 3
  ) +
  ggtitle("Distribution of High School Grades") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5)) +
  facet_wrap(~Subject) +
  ylab("Number of Students")