The dataset provided contains student information on 395 students including their grades in the class.
This assignment was accomplished by utilizing these packages for both data analysis and visualizations.
library("tidyr")
library("dplyr")
library("kableExtra")
library("ggplot2")
library("stringr")
library("lubridate")
The data is captured in the .csv format and updated into GitHub. You will see below that the data is not in a very clean form to conduct analysis easily, therefore this data set needed to be tidy.
theURL <- "https://raw.githubusercontent.com/DataScienceAR/Cuny-Assignments/master/Data-607/Data-Sets/Students.csv"
RawFile <-data.frame(read.csv(file = theURL,header = TRUE))
#Table Structure
glimpse(RawFile)
## Observations: 395
## Variables: 33
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Student.ID <int> 3093, 4344, 5967, 6467, 3331, 6082, 5943, 793, 727...
## $ sex <fct> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, F, F,...
## $ dob <fct> 6/5/2000, 11/25/1999, 2/2/1998, 12/20/1997, 10/4/1...
## $ address <fct> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,...
## $ famsize <fct> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, GT3, ...
## $ Pstatus <fct> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, T, T,...
## $ Medu <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4,...
## $ Fedu <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4,...
## $ Mjob <fct> at_home, at_home, at_home, health, other, services...
## $ Fjob <fct> teacher, other, other, services, other, other, oth...
## $ reason <fct> course, course, other, home, home, reputation, hom...
## $ guardian <fct> mother, father, mother, mother, father, mother, mo...
## $ traveltime <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1,...
## $ studytime <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3,...
## $ failures <int> 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ schoolsup <fct> yes, no, yes, no, no, no, no, yes, no, no, no, no,...
## $ famsup <fct> no, yes, no, yes, yes, yes, no, yes, yes, yes, yes...
## $ paid <fct> no, no, yes, yes, yes, yes, no, no, yes, yes, yes,...
## $ activities <fct> no, no, no, yes, no, yes, no, no, no, yes, no, yes...
## $ nursery <fct> yes, no, yes, yes, yes, yes, yes, yes, yes, yes, y...
## $ higher <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, ...
## $ internet <fct> no, yes, yes, yes, no, yes, yes, no, yes, yes, yes...
## $ romantic <fct> no, no, no, yes, no, no, no, no, no, no, no, no, n...
## $ famrel <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3,...
## $ freetime <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2,...
## $ goout <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3,...
## $ Dalc <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ Walc <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2,...
## $ health <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2,...
## $ nurse_visit <fct> 4/10/2014 14:59, 3/12/2015 14:59, 9/21/2015 14:59,...
## $ absences <int> 6, 4, 10, 2, 4, 10, 0, 6, 0, 0, 0, 4, 2, 2, 0, 4, ...
## $ Grades <fct> 5/6/2006, 5/5/2006, 7/8/2010, 15/14/15, 6/10/2010,...
#Top 6 rows of the table
head(RawFile)
## X Student.ID sex dob address famsize Pstatus Medu Fedu Mjob
## 1 1 3093 F 6/5/2000 U GT3 A 4 4 at_home
## 2 2 4344 F 11/25/1999 U GT3 T 1 1 at_home
## 3 3 5967 F 2/2/1998 U LE3 T 1 1 at_home
## 4 4 6467 F 12/20/1997 U GT3 T 4 2 health
## 5 5 3331 F 10/4/1998 U GT3 T 3 3 other
## 6 6 6082 M 6/16/1999 U LE3 T 4 3 services
## Fjob reason guardian traveltime studytime failures schoolsup
## 1 teacher course mother 2 2 0 yes
## 2 other course father 1 2 0 no
## 3 other other mother 1 2 3 yes
## 4 services home mother 1 3 0 no
## 5 other home father 1 2 0 no
## 6 other reputation mother 1 2 0 no
## famsup paid activities nursery higher internet romantic famrel freetime
## 1 no no no yes yes no no 4 3
## 2 yes no no no yes yes no 5 3
## 3 no yes no yes yes yes no 4 3
## 4 yes yes yes yes yes yes yes 3 2
## 5 yes yes no yes yes no no 4 3
## 6 yes yes yes yes yes yes no 5 4
## goout Dalc Walc health nurse_visit absences Grades
## 1 4 1 1 3 4/10/2014 14:59 6 5/6/2006
## 2 3 1 1 3 3/12/2015 14:59 4 5/5/2006
## 3 2 2 3 3 9/21/2015 14:59 10 7/8/2010
## 4 2 1 1 5 9/3/2015 14:59 2 15/14/15
## 5 2 1 2 5 4/7/2015 14:59 4 6/10/2010
## 6 2 1 2 5 11/15/2013 14:59 10 15/15/15
The data needs to be cleaned and manipulated for it to be presentable for analysis.
# Transform the data type of some of columns to right data type
RawFile$Student.ID <- as.character(RawFile$Student.ID)
RawFile$Medu <- as.factor(RawFile$Medu)
RawFile$Fedu <- as.factor(RawFile$Fedu)
RawFile$nurse_visit <- as.character(RawFile$nurse_visit)
RawFile$Student.ID <-str_pad(RawFile$Student.ID, width = 4,side="left",pad = "1")
names(RawFile)[names(RawFile) =='Student.ID'] <- "Student_ID"
RawFile$sex <- str_replace_all(RawFile$sex,"M","Male")
RawFile$sex <- str_replace_all(RawFile$sex,"F","Female")
head(RawFile)
## X Student_ID sex dob address famsize Pstatus Medu Fedu
## 1 1 3093 Female 6/5/2000 U GT3 A 4 4
## 2 2 4344 Female 11/25/1999 U GT3 T 1 1
## 3 3 5967 Female 2/2/1998 U LE3 T 1 1
## 4 4 6467 Female 12/20/1997 U GT3 T 4 2
## 5 5 3331 Female 10/4/1998 U GT3 T 3 3
## 6 6 6082 Male 6/16/1999 U LE3 T 4 3
## Mjob Fjob reason guardian traveltime studytime failures
## 1 at_home teacher course mother 2 2 0
## 2 at_home other course father 1 2 0
## 3 at_home other other mother 1 2 3
## 4 health services home mother 1 3 0
## 5 other other home father 1 2 0
## 6 services other reputation mother 1 2 0
## schoolsup famsup paid activities nursery higher internet romantic famrel
## 1 yes no no no yes yes no no 4
## 2 no yes no no no yes yes no 5
## 3 yes no yes no yes yes yes no 4
## 4 no yes yes yes yes yes yes yes 3
## 5 no yes yes no yes yes no no 4
## 6 no yes yes yes yes yes yes no 5
## freetime goout Dalc Walc health nurse_visit absences Grades
## 1 3 4 1 1 3 4/10/2014 14:59 6 5/6/2006
## 2 3 3 1 1 3 3/12/2015 14:59 4 5/5/2006
## 3 3 2 2 3 3 9/21/2015 14:59 10 7/8/2010
## 4 2 2 1 1 5 9/3/2015 14:59 2 15/14/15
## 5 3 2 1 2 5 4/7/2015 14:59 4 6/10/2010
## 6 4 2 1 2 5 11/15/2013 14:59 10 15/15/15
TidyFile <-RawFile %>% select(Student_ID,sex,Mjob,Fjob,guardian,traveltime,studytime,freetime,absences)
dim(TidyFile)
## [1] 395 9
head(RawFile,10)
## X Student_ID sex dob address famsize Pstatus Medu Fedu
## 1 1 3093 Female 6/5/2000 U GT3 A 4 4
## 2 2 4344 Female 11/25/1999 U GT3 T 1 1
## 3 3 5967 Female 2/2/1998 U LE3 T 1 1
## 4 4 6467 Female 12/20/1997 U GT3 T 4 2
## 5 5 3331 Female 10/4/1998 U GT3 T 3 3
## 6 6 6082 Male 6/16/1999 U LE3 T 4 3
## 7 7 5943 Male 9/20/1997 U LE3 T 2 2
## 8 8 1793 Female 7/5/1998 U GT3 A 4 4
## 9 9 7274 Male 3/18/2000 U LE3 A 3 2
## 10 10 4845 Male 3/18/1997 U GT3 T 3 4
## Mjob Fjob reason guardian traveltime studytime failures
## 1 at_home teacher course mother 2 2 0
## 2 at_home other course father 1 2 0
## 3 at_home other other mother 1 2 3
## 4 health services home mother 1 3 0
## 5 other other home father 1 2 0
## 6 services other reputation mother 1 2 0
## 7 other other home mother 1 2 0
## 8 other teacher home mother 2 2 0
## 9 services other home mother 1 2 0
## 10 other other home mother 1 2 0
## schoolsup famsup paid activities nursery higher internet romantic
## 1 yes no no no yes yes no no
## 2 no yes no no no yes yes no
## 3 yes no yes no yes yes yes no
## 4 no yes yes yes yes yes yes yes
## 5 no yes yes no yes yes no no
## 6 no yes yes yes yes yes yes no
## 7 no no no no yes yes yes no
## 8 yes yes no no yes yes no no
## 9 no yes yes no yes yes yes no
## 10 no yes yes yes yes yes yes no
## famrel freetime goout Dalc Walc health nurse_visit absences
## 1 4 3 4 1 1 3 4/10/2014 14:59 6
## 2 5 3 3 1 1 3 3/12/2015 14:59 4
## 3 4 3 2 2 3 3 9/21/2015 14:59 10
## 4 3 2 2 1 1 5 9/3/2015 14:59 2
## 5 4 3 2 1 2 5 4/7/2015 14:59 4
## 6 5 4 2 1 2 5 11/15/2013 14:59 10
## 7 4 4 4 1 1 3 9/20/2015 14:59 0
## 8 4 1 4 1 1 1 2/1/2015 14:59 6
## 9 4 2 2 1 1 1 4/12/2015 14:59 0
## 10 5 5 1 1 1 5 12/30/2014 14:59 0
## Grades
## 1 5/6/2006
## 2 5/5/2006
## 3 7/8/2010
## 4 15/14/15
## 5 6/10/2010
## 6 15/15/15
## 7 12/12/2011
## 8 6/5/2006
## 9 16/18/19
## 10 14/15/15
head(TidyFile,10)
## Student_ID sex Mjob Fjob guardian traveltime studytime
## 1 3093 Female at_home teacher mother 2 2
## 2 4344 Female at_home other father 1 2
## 3 5967 Female at_home other mother 1 2
## 4 6467 Female health services mother 1 3
## 5 3331 Female other other father 1 2
## 6 6082 Male services other mother 1 2
## 7 5943 Male other other mother 1 2
## 8 1793 Female other teacher mother 2 2
## 9 7274 Male services other mother 1 2
## 10 4845 Male other other mother 1 2
## freetime absences
## 1 3 6
## 2 3 4
## 3 3 10
## 4 2 2
## 5 3 4
## 6 4 10
## 7 4 0
## 8 1 6
## 9 2 0
## 10 5 0
subset_absence<-TidyFile %>% select(sex,guardian,absences) %>% group_by(sex,guardian) %>% summarise(Total=sum(absences))
barplot(subset_absence$Total)