Overview

The dataset provided contains student information on 395 students including their grades in the class.


R Packages Used

This assignment was accomplished by utilizing these packages for both data analysis and visualizations.

library("tidyr")
library("dplyr")
library("kableExtra")
library("ggplot2")
library("stringr")
library("lubridate")

The DataSet

The data is captured in the .csv format and updated into GitHub. You will see below that the data is not in a very clean form to conduct analysis easily, therefore this data set needed to be tidy.

theURL <- "https://raw.githubusercontent.com/DataScienceAR/Cuny-Assignments/master/Data-607/Data-Sets/Students.csv"
RawFile <-data.frame(read.csv(file = theURL,header = TRUE))

#Table Structure
glimpse(RawFile)
## Observations: 395
## Variables: 33
## $ X           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Student.ID  <int> 3093, 4344, 5967, 6467, 3331, 6082, 5943, 793, 727...
## $ sex         <fct> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, F, F,...
## $ dob         <fct> 6/5/2000, 11/25/1999, 2/2/1998, 12/20/1997, 10/4/1...
## $ address     <fct> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,...
## $ famsize     <fct> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, GT3, ...
## $ Pstatus     <fct> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, T, T,...
## $ Medu        <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4,...
## $ Fedu        <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4,...
## $ Mjob        <fct> at_home, at_home, at_home, health, other, services...
## $ Fjob        <fct> teacher, other, other, services, other, other, oth...
## $ reason      <fct> course, course, other, home, home, reputation, hom...
## $ guardian    <fct> mother, father, mother, mother, father, mother, mo...
## $ traveltime  <int> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1,...
## $ studytime   <int> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3,...
## $ failures    <int> 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ schoolsup   <fct> yes, no, yes, no, no, no, no, yes, no, no, no, no,...
## $ famsup      <fct> no, yes, no, yes, yes, yes, no, yes, yes, yes, yes...
## $ paid        <fct> no, no, yes, yes, yes, yes, no, no, yes, yes, yes,...
## $ activities  <fct> no, no, no, yes, no, yes, no, no, no, yes, no, yes...
## $ nursery     <fct> yes, no, yes, yes, yes, yes, yes, yes, yes, yes, y...
## $ higher      <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, ...
## $ internet    <fct> no, yes, yes, yes, no, yes, yes, no, yes, yes, yes...
## $ romantic    <fct> no, no, no, yes, no, no, no, no, no, no, no, no, n...
## $ famrel      <int> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3,...
## $ freetime    <int> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2,...
## $ goout       <int> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3,...
## $ Dalc        <int> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ Walc        <int> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2,...
## $ health      <int> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2,...
## $ nurse_visit <fct> 4/10/2014 14:59, 3/12/2015 14:59, 9/21/2015 14:59,...
## $ absences    <int> 6, 4, 10, 2, 4, 10, 0, 6, 0, 0, 0, 4, 2, 2, 0, 4, ...
## $ Grades      <fct> 5/6/2006, 5/5/2006, 7/8/2010, 15/14/15, 6/10/2010,...
#Top 6 rows of the table

head(RawFile)
##   X Student.ID sex        dob address famsize Pstatus Medu Fedu     Mjob
## 1 1       3093   F   6/5/2000       U     GT3       A    4    4  at_home
## 2 2       4344   F 11/25/1999       U     GT3       T    1    1  at_home
## 3 3       5967   F   2/2/1998       U     LE3       T    1    1  at_home
## 4 4       6467   F 12/20/1997       U     GT3       T    4    2   health
## 5 5       3331   F  10/4/1998       U     GT3       T    3    3    other
## 6 6       6082   M  6/16/1999       U     LE3       T    4    3 services
##       Fjob     reason guardian traveltime studytime failures schoolsup
## 1  teacher     course   mother          2         2        0       yes
## 2    other     course   father          1         2        0        no
## 3    other      other   mother          1         2        3       yes
## 4 services       home   mother          1         3        0        no
## 5    other       home   father          1         2        0        no
## 6    other reputation   mother          1         2        0        no
##   famsup paid activities nursery higher internet romantic famrel freetime
## 1     no   no         no     yes    yes       no       no      4        3
## 2    yes   no         no      no    yes      yes       no      5        3
## 3     no  yes         no     yes    yes      yes       no      4        3
## 4    yes  yes        yes     yes    yes      yes      yes      3        2
## 5    yes  yes         no     yes    yes       no       no      4        3
## 6    yes  yes        yes     yes    yes      yes       no      5        4
##   goout Dalc Walc health      nurse_visit absences    Grades
## 1     4    1    1      3  4/10/2014 14:59        6  5/6/2006
## 2     3    1    1      3  3/12/2015 14:59        4  5/5/2006
## 3     2    2    3      3  9/21/2015 14:59       10  7/8/2010
## 4     2    1    1      5   9/3/2015 14:59        2  15/14/15
## 5     2    1    2      5   4/7/2015 14:59        4 6/10/2010
## 6     2    1    2      5 11/15/2013 14:59       10  15/15/15

Data Manipulation

The data needs to be cleaned and manipulated for it to be presentable for analysis.


Cleaning the table

  • Transform the data type of some of columns to right data type
# Transform the data type of some of columns to right data type
RawFile$Student.ID <- as.character(RawFile$Student.ID)
RawFile$Medu <- as.factor(RawFile$Medu)
RawFile$Fedu <- as.factor(RawFile$Fedu)
RawFile$nurse_visit <- as.character(RawFile$nurse_visit)

  • Padding the student id to make is 4 digit number
RawFile$Student.ID <-str_pad(RawFile$Student.ID, width = 4,side="left",pad = "1")
names(RawFile)[names(RawFile) =='Student.ID'] <- "Student_ID"

  • Replace the codes with descriptions
RawFile$sex <- str_replace_all(RawFile$sex,"M","Male")
RawFile$sex <- str_replace_all(RawFile$sex,"F","Female")

  • Select the required variables for analysis
head(RawFile)
##   X Student_ID    sex        dob address famsize Pstatus Medu Fedu
## 1 1       3093 Female   6/5/2000       U     GT3       A    4    4
## 2 2       4344 Female 11/25/1999       U     GT3       T    1    1
## 3 3       5967 Female   2/2/1998       U     LE3       T    1    1
## 4 4       6467 Female 12/20/1997       U     GT3       T    4    2
## 5 5       3331 Female  10/4/1998       U     GT3       T    3    3
## 6 6       6082   Male  6/16/1999       U     LE3       T    4    3
##       Mjob     Fjob     reason guardian traveltime studytime failures
## 1  at_home  teacher     course   mother          2         2        0
## 2  at_home    other     course   father          1         2        0
## 3  at_home    other      other   mother          1         2        3
## 4   health services       home   mother          1         3        0
## 5    other    other       home   father          1         2        0
## 6 services    other reputation   mother          1         2        0
##   schoolsup famsup paid activities nursery higher internet romantic famrel
## 1       yes     no   no         no     yes    yes       no       no      4
## 2        no    yes   no         no      no    yes      yes       no      5
## 3       yes     no  yes         no     yes    yes      yes       no      4
## 4        no    yes  yes        yes     yes    yes      yes      yes      3
## 5        no    yes  yes         no     yes    yes       no       no      4
## 6        no    yes  yes        yes     yes    yes      yes       no      5
##   freetime goout Dalc Walc health      nurse_visit absences    Grades
## 1        3     4    1    1      3  4/10/2014 14:59        6  5/6/2006
## 2        3     3    1    1      3  3/12/2015 14:59        4  5/5/2006
## 3        3     2    2    3      3  9/21/2015 14:59       10  7/8/2010
## 4        2     2    1    1      5   9/3/2015 14:59        2  15/14/15
## 5        3     2    1    2      5   4/7/2015 14:59        4 6/10/2010
## 6        4     2    1    2      5 11/15/2013 14:59       10  15/15/15
TidyFile <-RawFile %>% select(Student_ID,sex,Mjob,Fjob,guardian,traveltime,studytime,freetime,absences)
dim(TidyFile)
## [1] 395   9

Data Analysis

Untidy DataSet


head(RawFile,10)
##     X Student_ID    sex        dob address famsize Pstatus Medu Fedu
## 1   1       3093 Female   6/5/2000       U     GT3       A    4    4
## 2   2       4344 Female 11/25/1999       U     GT3       T    1    1
## 3   3       5967 Female   2/2/1998       U     LE3       T    1    1
## 4   4       6467 Female 12/20/1997       U     GT3       T    4    2
## 5   5       3331 Female  10/4/1998       U     GT3       T    3    3
## 6   6       6082   Male  6/16/1999       U     LE3       T    4    3
## 7   7       5943   Male  9/20/1997       U     LE3       T    2    2
## 8   8       1793 Female   7/5/1998       U     GT3       A    4    4
## 9   9       7274   Male  3/18/2000       U     LE3       A    3    2
## 10 10       4845   Male  3/18/1997       U     GT3       T    3    4
##        Mjob     Fjob     reason guardian traveltime studytime failures
## 1   at_home  teacher     course   mother          2         2        0
## 2   at_home    other     course   father          1         2        0
## 3   at_home    other      other   mother          1         2        3
## 4    health services       home   mother          1         3        0
## 5     other    other       home   father          1         2        0
## 6  services    other reputation   mother          1         2        0
## 7     other    other       home   mother          1         2        0
## 8     other  teacher       home   mother          2         2        0
## 9  services    other       home   mother          1         2        0
## 10    other    other       home   mother          1         2        0
##    schoolsup famsup paid activities nursery higher internet romantic
## 1        yes     no   no         no     yes    yes       no       no
## 2         no    yes   no         no      no    yes      yes       no
## 3        yes     no  yes         no     yes    yes      yes       no
## 4         no    yes  yes        yes     yes    yes      yes      yes
## 5         no    yes  yes         no     yes    yes       no       no
## 6         no    yes  yes        yes     yes    yes      yes       no
## 7         no     no   no         no     yes    yes      yes       no
## 8        yes    yes   no         no     yes    yes       no       no
## 9         no    yes  yes         no     yes    yes      yes       no
## 10        no    yes  yes        yes     yes    yes      yes       no
##    famrel freetime goout Dalc Walc health      nurse_visit absences
## 1       4        3     4    1    1      3  4/10/2014 14:59        6
## 2       5        3     3    1    1      3  3/12/2015 14:59        4
## 3       4        3     2    2    3      3  9/21/2015 14:59       10
## 4       3        2     2    1    1      5   9/3/2015 14:59        2
## 5       4        3     2    1    2      5   4/7/2015 14:59        4
## 6       5        4     2    1    2      5 11/15/2013 14:59       10
## 7       4        4     4    1    1      3  9/20/2015 14:59        0
## 8       4        1     4    1    1      1   2/1/2015 14:59        6
## 9       4        2     2    1    1      1  4/12/2015 14:59        0
## 10      5        5     1    1    1      5 12/30/2014 14:59        0
##        Grades
## 1    5/6/2006
## 2    5/5/2006
## 3    7/8/2010
## 4    15/14/15
## 5   6/10/2010
## 6    15/15/15
## 7  12/12/2011
## 8    6/5/2006
## 9    16/18/19
## 10   14/15/15

Tidy DataSet


head(TidyFile,10)
##    Student_ID    sex     Mjob     Fjob guardian traveltime studytime
## 1        3093 Female  at_home  teacher   mother          2         2
## 2        4344 Female  at_home    other   father          1         2
## 3        5967 Female  at_home    other   mother          1         2
## 4        6467 Female   health services   mother          1         3
## 5        3331 Female    other    other   father          1         2
## 6        6082   Male services    other   mother          1         2
## 7        5943   Male    other    other   mother          1         2
## 8        1793 Female    other  teacher   mother          2         2
## 9        7274   Male services    other   mother          1         2
## 10       4845   Male    other    other   mother          1         2
##    freetime absences
## 1         3        6
## 2         3        4
## 3         3       10
## 4         2        2
## 5         3        4
## 6         4       10
## 7         4        0
## 8         1        6
## 9         2        0
## 10        5        0

Analysis


subset_absence<-TidyFile %>% select(sex,guardian,absences) %>% group_by(sex,guardian) %>% summarise(Total=sum(absences))

barplot(subset_absence$Total)