Introduction

The dataset contains the student achievement in secondary education of two Portuguese schools. The objective of this project is to build classifiers to predict whether different attributes/factors like demographic, social and school related features affect the final year grades in Math subject of the students in future. The whole project is divided into two phases. Phase 1 focuses on the data preprocessing and the exploration of different attributes. Phase 2 mainly focuses on the model building for the prediction.

Data Set

The dataset is collected from the UCI Machine Learning Repository, which provides a single dataset for this analysis. There are 33 attributes and 395 observations in the dataset. Out of the 33 attributes, there are 32 descriptive features and one target feature. In phase 1, we work with this single dataset. In phase 2, we will split it into a training dataset and a test dataset.

Target Feature

3 Data Pre-processing

Preliminary

The R packages used for the analysis are the following.

library(knitr)
library(mlr)
## Loading required package: ParamHelpers
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:GGally':
## 
##     nasa
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)

The data was semicolon delimited, so an appropriate function was used to bring it into a readable format in R. The string values are read as characters, so we will later convert those variables to factors.

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   age = col_double(),
##   Medu = col_double(),
##   Fedu = col_double(),
##   traveltime = col_double(),
##   studytime = col_double(),
##   failures = col_double(),
##   famrel = col_double(),
##   freetime = col_double(),
##   goout = col_double(),
##   Dalc = col_double(),
##   Walc = col_double(),
##   health = col_double(),
##   absences = col_double(),
##   G1 = col_double(),
##   G2 = col_double(),
##   G3 = col_double()
## )
## See spec(...) for full column specifications.
##  [1] "school"     "sex"        "age"        "address"    "famsize"   
##  [6] "Pstatus"    "Medu"       "Fedu"       "Mjob"       "Fjob"      
## [11] "reason"     "guardian"   "traveltime" "studytime"  "failures"  
## [16] "schoolsup"  "famsup"     "paid"       "activities" "nursery"   
## [21] "higher"     "internet"   "romantic"   "famrel"     "freetime"  
## [26] "goout"      "Dalc"       "Walc"       "health"     "absences"  
## [31] "G1"         "G2"         "G3"

3.2 Data Cleaning and Transformation

Using str and summarizeColumns we figured out that:

-> No white spaces were encountered in the character column values. -> Apparently there were no typos or any missing values especially in the character columns

# Display the structure of student_mat: one line per column with its type
# and first few values.
str(student_mat)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 395 obs. of  33 variables:
##  $ school    : chr  "GP" "GP" "GP" "GP" ...
##  $ sex       : chr  "F" "F" "F" "F" ...
##  $ age       : num  18 17 15 15 16 16 16 17 15 15 ...
##  $ address   : chr  "U" "U" "U" "U" ...
##  $ famsize   : chr  "GT3" "GT3" "LE3" "GT3" ...
##  $ Pstatus   : chr  "A" "T" "T" "T" ...
##  $ Medu      : num  4 1 1 4 3 4 2 4 3 3 ...
##  $ Fedu      : num  4 1 1 2 3 3 2 4 2 4 ...
##  $ Mjob      : chr  "at_home" "at_home" "at_home" "health" ...
##  $ Fjob      : chr  "teacher" "other" "other" "services" ...
##  $ reason    : chr  "course" "course" "other" "home" ...
##  $ guardian  : chr  "mother" "father" "mother" "mother" ...
##  $ traveltime: num  2 1 1 1 1 1 1 2 1 1 ...
##  $ studytime : num  2 2 2 3 2 2 2 2 2 2 ...
##  $ failures  : num  0 0 3 0 0 0 0 0 0 0 ...
##  $ schoolsup : chr  "yes" "no" "yes" "no" ...
##  $ famsup    : chr  "no" "yes" "no" "yes" ...
##  $ paid      : chr  "no" "no" "yes" "yes" ...
##  $ activities: chr  "no" "no" "no" "yes" ...
##  $ nursery   : chr  "yes" "no" "yes" "yes" ...
##  $ higher    : chr  "yes" "yes" "yes" "yes" ...
##  $ internet  : chr  "no" "yes" "yes" "yes" ...
##  $ romantic  : chr  "no" "no" "no" "yes" ...
##  $ famrel    : num  4 5 4 3 4 5 4 4 4 5 ...
##  $ freetime  : num  3 3 3 2 3 4 4 1 2 5 ...
##  $ goout     : num  4 3 2 2 2 2 4 4 2 1 ...
##  $ Dalc      : num  1 1 2 1 1 1 1 1 1 1 ...
##  $ Walc      : num  1 1 3 1 2 2 1 1 1 1 ...
##  $ health    : num  3 3 3 5 5 5 3 1 1 5 ...
##  $ absences  : num  6 4 10 2 4 10 0 6 0 0 ...
##  $ G1        : num  5 5 7 15 6 15 12 6 16 14 ...
##  $ G2        : num  6 5 8 14 10 15 12 5 18 15 ...
##  $ G3        : num  6 6 10 15 10 15 11 6 19 15 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   school = col_character(),
##   ..   sex = col_character(),
##   ..   age = col_double(),
##   ..   address = col_character(),
##   ..   famsize = col_character(),
##   ..   Pstatus = col_character(),
##   ..   Medu = col_double(),
##   ..   Fedu = col_double(),
##   ..   Mjob = col_character(),
##   ..   Fjob = col_character(),
##   ..   reason = col_character(),
##   ..   guardian = col_character(),
##   ..   traveltime = col_double(),
##   ..   studytime = col_double(),
##   ..   failures = col_double(),
##   ..   schoolsup = col_character(),
##   ..   famsup = col_character(),
##   ..   paid = col_character(),
##   ..   activities = col_character(),
##   ..   nursery = col_character(),
##   ..   higher = col_character(),
##   ..   internet = col_character(),
##   ..   romantic = col_character(),
##   ..   famrel = col_double(),
##   ..   freetime = col_double(),
##   ..   goout = col_double(),
##   ..   Dalc = col_double(),
##   ..   Walc = col_double(),
##   ..   health = col_double(),
##   ..   absences = col_double(),
##   ..   G1 = col_double(),
##   ..   G2 = col_double(),
##   ..   G3 = col_double()
##   .. )
# Per-column summary of the raw data, rendered as a captioned table.
knitr::kable(
  summarizeColumns(student_mat),
  caption = "Feature Summary before Data Processing"
)
Feature Summary before Data Processing
name type na mean disp median mad min max nlevs
school character 0 NA 0.1164557 NA NA 46 349 2
sex character 0 NA 0.4734177 NA NA 187 208 2
age numeric 0 16.6962025 1.2760427 17 1.4826 15 22 0
address character 0 NA 0.2227848 NA NA 88 307 2
famsize character 0 NA 0.2886076 NA NA 114 281 2
Pstatus character 0 NA 0.1037975 NA NA 41 354 2
Medu numeric 0 2.7493671 1.0947351 3 1.4826 0 4 0
Fedu numeric 0 2.5215190 1.0882005 2 1.4826 0 4 0
Mjob character 0 NA 0.6430380 NA NA 34 141 5
Fjob character 0 NA 0.4506329 NA NA 18 217 5
reason character 0 NA 0.6329114 NA NA 36 145 4
guardian character 0 NA 0.3088608 NA NA 32 273 3
traveltime numeric 0 1.4481013 0.6975048 1 0.0000 1 4 0
studytime numeric 0 2.0354430 0.8392403 2 0.0000 1 4 0
failures numeric 0 0.3341772 0.7436510 0 0.0000 0 3 0
schoolsup character 0 NA 0.1291139 NA NA 51 344 2
famsup character 0 NA 0.3873418 NA NA 153 242 2
paid character 0 NA 0.4582278 NA NA 181 214 2
activities character 0 NA 0.4911392 NA NA 194 201 2
nursery character 0 NA 0.2050633 NA NA 81 314 2
higher character 0 NA 0.0506329 NA NA 20 375 2
internet character 0 NA 0.1670886 NA NA 66 329 2
romantic character 0 NA 0.3341772 NA NA 132 263 2
famrel numeric 0 3.9443038 0.8966586 4 1.4826 1 5 0
freetime numeric 0 3.2354430 0.9988620 3 1.4826 1 5 0
goout numeric 0 3.1088608 1.1132782 3 1.4826 1 5 0
Dalc numeric 0 1.4810127 0.8907414 1 0.0000 1 5 0
Walc numeric 0 2.2911392 1.2878966 2 1.4826 1 5 0
health numeric 0 3.5544304 1.3903034 4 1.4826 1 5 0
absences numeric 0 5.7088608 8.0030957 4 5.9304 0 75 0
G1 numeric 0 10.9088608 3.3191947 11 4.4478 3 19 0
G2 numeric 0 10.7139241 3.7615047 11 2.9652 0 19 0
G3 numeric 0 10.4151899 4.5814426 11 4.4478 0 20 0

We created the level table for the character features. The following insights came up:

  1. Majority of students belong to GP school and only one seventh to MS school.
  2. The number of males and females is nearly same.
  3. Most of the students are urban residents.
  4. The number of students with family size greater than 3 is more than twice the number of students with family size less than or equal to 3.
  5. School provides extra education support to very few students.
  6. Almost all the students studying Mathematics are interested in going for higher education.
# Frequency table of the levels of every character column.
# vapply()/lapply() are used instead of sapply() so the result is always a
# named list of tables; sapply() would simplify to a matrix whenever all
# columns happened to have the same number of levels.
lapply(
  student_mat[vapply(student_mat, is.character, logical(1))],
  table
)
## $school
## 
##  GP  MS 
## 349  46 
## 
## $sex
## 
##   F   M 
## 208 187 
## 
## $address
## 
##   R   U 
##  88 307 
## 
## $famsize
## 
## GT3 LE3 
## 281 114 
## 
## $Pstatus
## 
##   A   T 
##  41 354 
## 
## $Mjob
## 
##  at_home   health    other services  teacher 
##       59       34      141      103       58 
## 
## $Fjob
## 
##  at_home   health    other services  teacher 
##       20       18      217      111       29 
## 
## $reason
## 
##     course       home      other reputation 
##        145        109         36        105 
## 
## $guardian
## 
## father mother  other 
##     90    273     32 
## 
## $schoolsup
## 
##  no yes 
## 344  51 
## 
## $famsup
## 
##  no yes 
## 153 242 
## 
## $paid
## 
##  no yes 
## 214 181 
## 
## $activities
## 
##  no yes 
## 194 201 
## 
## $nursery
## 
##  no yes 
##  81 314 
## 
## $higher
## 
##  no yes 
##  20 375 
## 
## $internet
## 
##  no yes 
##  66 329 
## 
## $romantic
## 
##  no yes 
## 263 132

Just to be double sure we have used the “trimws” function to trim the white spaces present in the character column values.

# Strip leading/trailing whitespace from every character column.
# lapply() returns one vector per column, so the replacement always has the
# same shape as the selection. sapply() instead builds a character matrix
# whose shape depends on the data (e.g. it simplifies to a plain vector for a
# single-row input).
chr_cols <- vapply(student_mat, is.character, logical(1))
student_mat[chr_cols] <- lapply(student_mat[chr_cols], trimws)

We modified four columns in original dataset.

  1. G3 : The target variable G3 contained numeric values from 0-20 but is based on a 5-level classification system, viz. “fail”, “sufficient”, “satisfactory”, “good”, “excellent” (the top level is labelled “Very Good” in the code). So the new column Grade was created to assign these levels to the corresponding value in the G3 column (final grade/target variable).

  2. guardian: There were three levels to this viz. “mother”, “father”, “other”. We modified it to two level viz. parents/others and stored the values into new column named guardian1.

  3. We combined two columns Dalc (workday alcohol consumption) and Walc (weekend alcohol consumption) by averaging the value in both columns to get alcohol consumption during whole week. The averaged value was stored in column named alc. Further the average value was categorized based on 5 level classification system viz.“Very Low”, “Low”, “Nominal”, “High”, “Very High” and stored in a new column alc1

  4. absences: This column had numeric data within the range 0 to 93. We changed it into the intervals [0, 10], (10, 30], (30, 50], (50, 93], named them “Very Low”, “Low”, “Medium”, “High” and stored them in the new column absences1.

# Names of the five grade bands, lowest to highest.
labels <- c("Fail", "Sufficient", "Satisfactory", "Good", "Very Good")

# Bin the final mark G3 into the bands [0,9], (9,11], (11,13], (13,15], (15,20].
# G3 is passed to cut() exactly once. Piping it in AND passing it again
# positionally binds the second copy to `include.lowest`, which then only
# works by accident when the first G3 value is non-zero.
# include.lowest = TRUE is required so that a mark of exactly 0 is classed as
# "Fail" instead of becoming NA.
student_mat$Grade <- cut(
  student_mat$G3,
  breaks = c(0, 9, 11, 13, 15, 20),
  labels = labels,
  include.lowest = TRUE,
  right = TRUE
)

#student_mat$G11 <- student_mat$G1 %>% cut(student_mat$G1, breaks = c(0,9,11,13,15,20),labels = labels,    right = TRUE)

#student_mat$G21 <- student_mat$G2 %>% cut(student_mat$G2, breaks = c(0,9,11,13,15,20),labels = labels,    right = TRUE)
# Collapse guardian into two levels: "Parents" (father or mother) and "other".
# Any value matching neither rule is passed through unchanged.
student_mat <- student_mat %>%
  mutate(
    guardian1 = case_when(
      guardian %in% c("father", "mother") ~ "Parents",
      grepl("other", guardian) ~ "other",
      TRUE ~ guardian
    )
  )
student_mat
## # A tibble: 395 x 35
##    school sex     age address famsize Pstatus  Medu  Fedu Mjob  Fjob 
##    <chr>  <chr> <dbl> <chr>   <chr>   <chr>   <dbl> <dbl> <chr> <chr>
##  1 GP     F        18 U       GT3     A           4     4 at_h~ teac~
##  2 GP     F        17 U       GT3     T           1     1 at_h~ other
##  3 GP     F        15 U       LE3     T           1     1 at_h~ other
##  4 GP     F        15 U       GT3     T           4     2 heal~ serv~
##  5 GP     F        16 U       GT3     T           3     3 other other
##  6 GP     M        16 U       LE3     T           4     3 serv~ other
##  7 GP     M        16 U       LE3     T           2     2 other other
##  8 GP     F        17 U       GT3     A           4     4 other teac~
##  9 GP     M        15 U       LE3     A           3     2 serv~ other
## 10 GP     M        15 U       GT3     T           3     4 other other
## # ... with 385 more rows, and 25 more variables: reason <chr>,
## #   guardian <chr>, traveltime <dbl>, studytime <dbl>, failures <dbl>,
## #   schoolsup <chr>, famsup <chr>, paid <chr>, activities <chr>,
## #   nursery <chr>, higher <chr>, internet <chr>, romantic <chr>,
## #   famrel <dbl>, freetime <dbl>, goout <dbl>, Dalc <dbl>, Walc <dbl>,
## #   health <dbl>, absences <dbl>, G1 <dbl>, G2 <dbl>, G3 <dbl>,
## #   Grade <fct>, guardian1 <chr>
# Weekly alcohol consumption: mean of workday (Dalc) and weekend (Walc) scores.
student_mat$alc <- (student_mat$Dalc + student_mat$Walc) / 2

# Bin the weekly average into five levels: [0,1], (1,2], (2,3], (3,4], (4,5].
# The vector is passed to cut() once only; a second positional copy would be
# matched to `include.lowest`.
labels <- c("Very Low", "Low", "Nominal", "High", "Very High")
student_mat$alc1 <- cut(
  student_mat$alc,
  breaks = c(0, 1, 2, 3, 4, 5),
  labels = labels,
  include.lowest = TRUE,
  right = TRUE
)

# Bin absences into [0,10], (10,30], (30,50], (50,93].
# include.lowest = TRUE keeps students with 0 absences in "Very Low" instead
# of turning them into NA.
labels <- c("Very Low", "Low", "Medium", "High")
student_mat$absences1 <- cut(
  student_mat$absences,
  breaks = c(0, 10, 30, 50, 93),
  labels = labels,
  include.lowest = TRUE,
  right = TRUE
)

Converting the numerically coded ordinal variables to factors

# Numerically coded ordinal columns that should be treated as categorical.
cols <- c(
  "Medu", "Fedu", "famrel", "freetime", "goout",
  "Dalc", "Walc", "health", "traveltime", "studytime"
)
# Convert each column to a factor individually. apply() would first coerce the
# columns to a matrix and hand back a character matrix, and data.frame() only
# turns strings into factors when stringsAsFactors = TRUE (no longer the
# default since R 4.0), so that route silently yields character columns.
student_mat[cols] <- lapply(student_mat[cols], as.factor)

Summary statistics after data pre processing.

# Per-column summary of the processed data, rendered as a captioned table.
kable(
  summarizeColumns(student_mat),
  caption = "Summary Statistics after data pre-processing"
)
Summary Statistics after data pre-processing
name type na mean disp median mad min max nlevs
school character 0 NA 0.1164557 NA NA 46 349 2
sex character 0 NA 0.4734177 NA NA 187 208 2
age numeric 0 16.6962025 1.2760427 17.0 1.4826 15 22 0
address character 0 NA 0.2227848 NA NA 88 307 2
famsize character 0 NA 0.2886076 NA NA 114 281 2
Pstatus character 0 NA 0.1037975 NA NA 41 354 2
Medu factor 0 NA 0.6683544 NA NA 3 131 5
Fedu factor 0 NA 0.7088608 NA NA 2 115 5
Mjob character 0 NA 0.6430380 NA NA 34 141 5
Fjob character 0 NA 0.4506329 NA NA 18 217 5
reason character 0 NA 0.6329114 NA NA 36 145 4
guardian character 0 NA 0.3088608 NA NA 32 273 3
traveltime factor 0 NA 0.3493671 NA NA 8 257 4
studytime factor 0 NA 0.4987342 NA NA 27 198 4
failures numeric 0 0.3341772 0.7436510 0.0 0.0000 0 3 0
schoolsup character 0 NA 0.1291139 NA NA 51 344 2
famsup character 0 NA 0.3873418 NA NA 153 242 2
paid character 0 NA 0.4582278 NA NA 181 214 2
activities character 0 NA 0.4911392 NA NA 194 201 2
nursery character 0 NA 0.2050633 NA NA 81 314 2
higher character 0 NA 0.0506329 NA NA 20 375 2
internet character 0 NA 0.1670886 NA NA 66 329 2
romantic character 0 NA 0.3341772 NA NA 132 263 2
famrel factor 0 NA 0.5063291 NA NA 8 195 5
freetime factor 0 NA 0.6025316 NA NA 19 157 5
goout factor 0 NA 0.6708861 NA NA 23 130 5
Dalc factor 0 NA 0.3012658 NA NA 9 276 5
Walc factor 0 NA 0.6177215 NA NA 28 151 5
health factor 0 NA 0.6303797 NA NA 45 146 5
absences numeric 0 5.7088608 8.0030957 4.0 5.9304 0 75 0
G1 numeric 0 10.9088608 3.3191947 11.0 4.4478 3 19 0
G2 numeric 0 10.7139241 3.7615047 11.0 2.9652 0 19 0
G3 numeric 0 10.4151899 4.5814426 11.0 4.4478 0 20 0
Grade factor 0 NA 0.6708861 NA NA 40 130 5
guardian1 character 0 NA 0.0810127 NA NA 32 363 2
alc numeric 0 1.8860759 0.9921947 1.5 0.7413 1 5 0
alc1 factor 0 NA 0.6202532 NA NA 13 150 5
absences1 factor 0 NA 0.1670886 NA NA 2 329 4
#student_mat %>% select(student_G3) %>% summary() %>% kable(caption='Summary Statistics after data pre- processing')

4 Data Exploration

Each feature was explored individually and was split by defined classes of target feature. Further we did multivariate visualization.

4.1 Univariate Visualizations

** 4.1.1 Numerical Features **

4.1.1.1 Age: The age of students in the high school varies between 15 to 22. With maximum students of age 16. We see that the “Very Good” performance is shown by the students of age 16. 100% failure occurs in the age group 21 and 22. Maximum number of Best performances (“Good” and “Very Good”) is given by the students of age group 20. As a general trend the number of failures increases with increase in age.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Percentage of each grade band within every age (each row sums to ~100).
# NOTE(review): this variable shadows base::table for data lookups; later
# table(...) calls still resolve to the function.
table <- round(
  prop.table(table(student_mat$age, student_mat$Grade), margin = 1),
  3
) * 100
library(RColorBrewer)
# Grouped bars (one bar per age within each grade), default palette indices.
barplot(
  table,
  main = "Percentage comparison of Grades by age",
  ylab = "Percentage by age",
  legend.text = rownames(table),
  beside = TRUE,
  xlab = "Grade",
  col = 1:8,
  ylim = c(0, 100)
)

# c("#7fc6bc","#083642","#b1df01","#cdef9c","#466b5d")
# Same chart drawn with a custom eight-colour palette.
barplot(
  table,
  main = "Percentage comparison of Grades by age",
  ylab = "Percentage by age",
  legend.text = rownames(table),
  beside = TRUE,
  xlab = "Grade",
  ylim = c(0, 100),
  col = c(
    "#99ffcc", "#ccff99", "#ff9966", "#d27979",
    "#ffcccc", "#e0e0d1", "#ffff99", "#c2c2d6"
  )
)

4.1.2 Categorical Features

4.1.2.1 School The data has been collected from two schools viz. GP and MS. Most of the students study in GP. The performance of students studying in GP is much better than those studying in MS. This justifies why the strength of students is much more in GP than in MS.

4.1.2.2 Address Type (Urban/ Rural) Majority of students stay in Urban areas. And the performance of urban students is better than those of rural area. Rural students have higher number of failures (below satisfactory performance).

4.1.2.3 Family size Most of the students belong to families with size greater than 3. There is not very clear implication from this relationship as though below average performance is more common in students with family size greater than 3 but many students from such families have given “Good” performances also.

# Count of students per family size.
# Columns are mapped by bare name inside aes(); `student_mat$col` inside aes()
# is discouraged by ggplot2 because it bypasses the `data` argument.
ggplot(data = student_mat, aes(x = famsize)) +
  geom_bar(fill = "blue") +
  xlab("Family size") +
  ggtitle("Number of students from different family size") +
  scale_x_discrete(labels = c("LE3" = "<= 3", "GT3" = ">3"))

# Percentage of each grade band within each family-size group.
table3 <- round(
  prop.table(table(student_mat$famsize, student_mat$Grade), margin = 1),
  3
) * 100

# `legend.text` is spelled out rather than relying on partial matching.
barplot(
  table3,
  main = "Percentage comparison of Grades by family size",
  ylab = "Percentage by family size",
  legend.text = c("Greater than 3", "Less than equal to 3"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)

4.1.2.4 Cohabitation status of parents

Most of the students have parents staying together. But this doesn’t seem to indicate their performance. No clear insight can be drawn from it.

# Count of students per parents' cohabitation status.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
ggplot(data = student_mat, aes(x = Pstatus)) +
  geom_bar(fill = "blue") +
  xlab("Parent Status") +
  ggtitle("Number of students from different parent cohabitation status") +
  scale_x_discrete(labels = c("A" = "Apart", "T" = "Together"))

# Percentage of each grade band within each cohabitation status.
table4 <- round(
  prop.table(table(student_mat$Pstatus, student_mat$Grade), margin = 1),
  3
) * 100

# The y label names the variable actually plotted (it previously said
# "family size"), and the legend uses "Apart" to match the count plot above.
barplot(
  table4,
  main = "Percentage comparison of Grades by Parents' cohabitation status",
  ylab = "Percentage by parents' cohabitation status",
  legend.text = c("Apart", "Together"),
  beside = TRUE,
  xlab = "Grade",
  col = c("blue", "green"),
  ylim = c(0, 100)
)

4.1.2.5 Mother’s and Father’s education

# Count of students per mother's education level.
# The column is mapped by bare name inside aes() rather than student_mat$Medu,
# which ggplot2 discourages.
ggplot(data = student_mat, aes(x = Medu)) +
  geom_bar(fill = "blue") +
  xlab("Mother's Education") +
  ggtitle("Count of students with different mother's education") +
  scale_x_discrete(
    labels = c(
      "0" = "Uneducated",
      "1" = "up to 4th grade",
      "2" = "5th to 9th grade",
      "3" = "Secondary education",
      "4" = "Higher education"
    )
  )

4.1.2.6 Father’s education Fathers of majority of students have received at least middle level education (5th to 9th grade). Only few have education level below this level. With regards to higher education mothers’ number is more in comparison to that of fathers’.

# Count of students per father's education level.
# The column is mapped by bare name inside aes() rather than student_mat$Fedu,
# which ggplot2 discourages.
ggplot(data = student_mat, aes(x = Fedu)) +
  geom_bar(fill = "blue") +
  xlab("Father's Education") +
  ggtitle("Count of students with different father's education") +
  scale_x_discrete(
    labels = c(
      "0" = "Uneducated",
      "1" = "up to 4th grade",
      "2" = "5th to 9th grade",
      "3" = "Secondary education",
      "4" = "Higher education"
    )
  )

Count of students based on Mother’s job

# Percentage of each grade band within each mother's-job category.
table7 <- round(
  prop.table(table(student_mat$Mjob, student_mat$Grade), margin = 1),
  3
) * 100

Difference in performance of students based on Mother’s job

 # Grouped bars: one bar per mother's-job category within each grade band.
 barplot(
   table7,
   main = "Percentage comparison of grades by mother's job",
   ylab = "Percentage by Mother's job",
   legend.text = c("Homemaker", "Healthcare related", "Others", "Civil Services", "Teacher"),
   beside = TRUE,
   xlab = "Grade",
   col = c("black", "red", "blue", "green", "dark blue"),
   ylim = c(0, 100)
 )

# +scale_color_manual(labels = c("Rural", "Urban"))
# topo.colors(12)

Count of students based on Father’s job

# Percentage of each grade band within each father's-job category.
table8 <- round(
  prop.table(table(student_mat$Fjob, student_mat$Grade), margin = 1),
  3
) * 100

Difference in performance of students based on Father’s job

# Grouped bars: one bar per father's-job category within each grade band.
table8 %>%
  barplot(
    main = "Percentage comparison of grades by Father's job",
    ylab = "Percentage by Father's job",
    legend.text = c("At home", "Healthcare related", "Others", "Civil Services", "Teacher"),
    beside = TRUE,
    xlab = "Grade",
    col = c("black", "red", "blue", "green", "dark blue"),
    ylim = c(0, 100)
  )

# +scale_color_manual(labels = c("Rural", "Urban"))
# topo.colors(12)
# Stack the mother's-job and father's-job charts in a 2 x 1 layout.
# The previous graphics settings are saved and restored afterwards so later
# base plots are not silently drawn in the 2 x 1 grid.
old_par <- par(mfrow = c(2, 1))

barplot(
  table7,
  main = "Percentage comparison of grades by Mother and Father Job",
  ylab = "% by Mother's job",
  beside = TRUE,
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)

# The legend is shared by both panels and placed via args.legend.
barplot(
  table8,
  ylab = "% by Father's job",
  legend.text = c("Home", "Health", "Other", "CS", "Teacher"),
  args.legend = list(x = "bottom", bty = "n", inset = c(-0.08, 0.25)),
  beside = TRUE,
  xlab = "Grade",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)

par(old_par)

# Count of students per guardian type (Parents / other).
# The column is mapped by bare name inside aes() rather than
# student_mat$guardian1, which ggplot2 discourages.
ggplot(data = student_mat, aes(x = guardian1)) +
  geom_bar(fill = "blue") +
  xlab("Guardian") +
  ggtitle("Count of students based on guardian status")

Difference in performance of students based on guardian type

# Percentage of each grade band within each guardian type.
table9 <- round(
  prop.table(table(student_mat$guardian1, student_mat$Grade), margin = 1),
  3
) * 100
# The legend is taken from the table's row names. The row order of
# "other"/"Parents" depends on the locale's collation of upper vs lower case,
# so a hard-coded legend order can label the bars the wrong way round.
barplot(
  table9,
  main = "Percentage comparison of grades by guardian type",
  ylab = "Percentage by guardian's type",
  beside = TRUE,
  xlab = "Grade",
  col = c("blue", "green"),
  legend.text = rownames(table9),
  ylim = c(0, 100)
)

# +scale_color_manual(labels = c("Rural", "Urban"))
# topo.colors(12)

Count of students based on travel time

# Count of students per home-to-school travel time category.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
ggplot(data = student_mat, aes(x = traveltime)) +
  geom_bar(fill = "blue") +
  xlab("Travel Time") +
  ggtitle("Count of students based on travel time") +
  scale_x_discrete(
    labels = c("1" = "< 15 min", "2" = "15-30 min", "3" = "30 min- 1 hr", "4" = "> 1 hr")
  )

# Percentage of each grade band within each travel-time category.
table10 <- round(
  prop.table(table(student_mat$traveltime, student_mat$Grade), margin = 1),
  3
) * 100
barplot(
  table10,
  main = "Percentage comparison of Grades by travel time",
  ylab = "Percentage by travel time",
  legend.text = c("< 15 min", "15-30 min", "30 min- 1 hr", "> 1 hr"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue", "red", "black"),
  ylim = c(0, 100)
)

# +scale_color_manual(labels = c("Rural", "Urban"))

Count of students based on study time

# Count of students per weekly study time category.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
ggplot(data = student_mat, aes(x = studytime)) +
  geom_bar(fill = "blue") +
  xlab("Study Time") +
  ggtitle("Count of students based on study time") +
  scale_x_discrete(
    labels = c("1" = "< 2 hrs", "2" = "2-5 hrs", "3" = "5-10 hrs", "4" = "> 10 hrs")
  )

# Percentage of each grade band within each study-time category.
# This must be built from `studytime`; tabulating `traveltime` here would
# just redraw the travel-time chart under a study-time title.
table11 <- round(
  prop.table(table(student_mat$studytime, student_mat$Grade), margin = 1),
  3
) * 100
barplot(
  table11,
  main = "Percentage comparison of Grades by study time",
  ylab = "Percentage by study time",
  legend.text = c("< 2 hrs", "2-5 hrs", "5-10 hrs", "> 10 hrs"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue", "red", "black"),
  ylim = c(0, 100)
)

# +scale_color_manual(labels = c("Rural", "Urban"))
# Share of each sex within every study-time category (rows sum to ~100).
# NOTE(review): the y label reads as "distribution of study time within each
# sex", which would need margin = 2 -- confirm which one is intended.
table12 <- round(
  prop.table(table(student_mat$studytime, student_mat$sex), margin = 1),
  3
) * 100
barplot(
  table12,
  main = "Percentage comparison of no. of study hours with sex",
  ylab = "% of male/females with different study times",
  legend.text = c("< 2 hrs", "2-5 hrs", "5-10 hrs", "> 10 hrs"),
  beside = TRUE,
  xlab = "Gender",
  col = c("green", "blue", "red", "black"),
  ylim = c(0, 100)
)

# +scale_color_manual(labels = c("Rural", "Urban"))

Count of students based on failures

# Count of students per number of past class failures.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
ggplot(data = student_mat, aes(x = failures)) +
  geom_bar(fill = "blue") +
  xlab("failures") +
  ggtitle("Count of students based on past class failures")

# Percentage of each grade band within each failure count.
table13 <- round(
  prop.table(table(student_mat$failures, student_mat$Grade), margin = 1),
  3
) * 100
barplot(
  table13,
  main = "Percentage comparison of Grades by no. of past failures",
  ylab = "Percentage by past failures",
  legend.text = c("0", "1", "2", "3"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue", "red", "black"),
  ylim = c(0, 100)
)

# +scale_color_manual(labels = c("Rural", "Urban"))

Count of students based on extra educational support

# Count of students with / without extra educational support from school.
# The column is mapped by bare name inside aes() rather than
# student_mat$schoolsup, which ggplot2 discourages.
ggplot(data = student_mat, aes(x = schoolsup)) +
  geom_bar(fill = "blue") +
  xlab("extra educational support") +
  ggtitle("Count of students based on extra educational support")

#table14 %>% barplot(main="Percentage comparison of Grades by extra educational support", ylab=" % by educational suport", legend= c("No", "Yes"), beside=TRUE, xlab="Grade", col=c("green", "blue"), ylim=c(0,100))
# +scale_color_manual(labels = c("Rural", "Urban"))
# Percentage of each grade band for students with / without family support.
table15 <- round(
  prop.table(table(student_mat$famsup, student_mat$Grade), margin = 1),
  3
) * 100
barplot(
  table15,
  main = "Percentage comparison of Grades by educational support from family",
  ylab = " % by educational suport",
  legend.text = c("No", "Yes"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)

# +scale_color_manual(labels = c("Rural", "Urban"))

Count of students based on paid educational support

# Count of students with / without paid educational support.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
ggplot(data = student_mat, aes(x = paid)) +
  geom_bar(fill = "blue") +
  xlab("paid educational support") +
  ggtitle("Count of students based on paid educational support")

# Percentage of each grade band for students with / without paid support.
table16 <- round(
  prop.table(table(student_mat$paid, student_mat$Grade), margin = 1),
  3
) * 100
barplot(
  table16,
  main = "Percentage comparison of Grades by paid educational support",
  ylab = " % by educational suport",
  legend.text = c("No", "Yes"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)

# +scale_color_manual(labels = c("Rural", "Urban"))

Count of students based on extra educational support from school or family

#par(mfrow=c(3,1))
#table14 %>% barplot(main="Percentage comparison of Grades by extra educational support", ylab=" % by educational suport", legend= c("No", "Yes"), beside=TRUE, xlab="Grade", col=c("green", "blue"), ylim=c(0,100))
# Family-support chart, drawn without a legend.
barplot(
  table15,
  main = "Percentage comparison of Grades by educational support from family",
  ylab = " % by educational suport",
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)

# Paid-support chart, drawn without a legend.
barplot(
  table16,
  main = "Percentage comparison of Grades by paid educational support",
  ylab = " % by educational suport",
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)

Count of students based on whether interested in higher education

# Count of students who do / do not want to pursue higher education.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
ggplot(data = student_mat, aes(x = higher)) +
  geom_bar(fill = "blue") +
  xlab("Interest in higher education") +
  ggtitle("Count of students based on interest in higher education")

# Percentage of each grade band by interest in higher education.
table17 <- round(
  prop.table(table(student_mat$higher, student_mat$Grade), margin = 1),
  3
) * 100
barplot(
  table17,
  main = "Percentage comparison of Grades by interest in higher education",
  ylab = " % by interest in higher education",
  legend.text = c("No", "Yes"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)

# +scale_color_manual(labels = c("Rural", "Urban"))

Count of students based on whether done nursery or not

# Count of students who did / did not attend nursery school.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
ggplot(data = student_mat, aes(x = nursery)) +
  geom_bar(fill = "blue") +
  xlab("Attended nursery school") +
  ggtitle("Count of students based on whether done nursery or not")

# Percentage of each grade band by nursery attendance.
table18 <- round(
  prop.table(table(student_mat$nursery, student_mat$Grade), margin = 1),
  3
) * 100
barplot(
  table18,
  main = "Percentage comparison of Grades by whether \n attended nursery school or not",
  ylab = " % by nursery",
  legend.text = c("No", "Yes"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)

# +scale_color_manual(labels = c("Rural", "Urban"))

Count of students based on whether having internet connection or not

# Count of students with / without an internet connection.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
ggplot(data = student_mat, aes(x = internet)) +
  geom_bar(fill = "blue") +
  xlab("Internet connection") +
  ggtitle("Count of students based on whether having internet connection or not")

# Percentage of each grade band by internet access.
table19 <- round(
  prop.table(table(student_mat$internet, student_mat$Grade), margin = 1),
  3
) * 100
barplot(
  table19,
  main = "Percentage comparison of Grades by whether \n have internet connection or not",
  ylab = " % by internet connection",
  legend.text = c("No", "Yes"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)

# +scale_color_manual(labels = c("Rural", "Urban"))

Count of students based on whether having romantic relationship or not

# Count of students with / without a romantic relationship.
# The column is mapped by bare name inside aes() rather than
# student_mat$romantic, which ggplot2 discourages.
ggplot(data = student_mat, aes(x = romantic)) +
  geom_bar(fill = "blue") +
  xlab("Romantic Relationship") +
  ggtitle("Count of students based on whether having romantic relationship or not")

Count of students based on the quality of family relationships

# Count of students per family-relationship quality score.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
ggplot(data = student_mat, aes(x = famrel)) +
  geom_bar(fill = "blue") +
  xlab("family relationship quality") +
  ggtitle("Count of students based on type of family relationships")

# Percentage of each grade band within each family-relationship score.
table21 <- round(
  prop.table(table(student_mat$famrel, student_mat$Grade), margin = 1),
  3
) * 100
# The y label names family relationships (it previously said "romantic").
barplot(
  table21,
  main = "Percentage comparison of Grades by quality of family relations",
  ylab = " % by quality of family relationships",
  legend.text = c("Very Bad", "Bad", "Fair", "Good", "Excellent"),
  beside = TRUE,
  xlab = "Grade",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)

# Share of each cohabitation status within each family-relationship score.
table22 <- round(
  prop.table(table(student_mat$famrel, student_mat$Pstatus), margin = 1),
  3
) * 100
# The bar groups are the Pstatus columns of table22, so the x label names
# the cohabitation status rather than "Grade".
barplot(
  table22,
  main = "Relationship of Parents cohabitation status \n and quality of family relations",
  ylab = " % by Parents",
  legend.text = c("Very Bad", "Bad", "Fair", "Good", "Excellent"),
  args.legend = list(x = "topleft", bty = "n", inset = c(0, 0.1)),
  beside = TRUE,
  xlab = "Parents' cohabitation status",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)

Count of students based on the amount of free time

# Count of students per free-time level.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
ggplot(data = student_mat, aes(x = freetime)) +
  geom_bar(fill = "blue") +
  xlab("freetime") +
  ggtitle("Count of students based on their free time") +
  scale_x_discrete(
    labels = c("1" = "Very Low", "2" = "Low", "3" = "Nominal", "4" = "High", "5" = "Very High")
  )

# Percentage of each grade band within each free-time level.
table23 <- round(
  prop.table(table(student_mat$freetime, student_mat$Grade), margin = 1),
  3
) * 100
barplot(
  table23,
  main = "Effect of freetime on grade of students",
  ylab = " % by freetime",
  legend.text = c("Very Low", "Low", "Nominal", "High", "Very High"),
  args.legend = list(x = "topleft", bty = "n", inset = c(0, 0.1)),
  beside = TRUE,
  xlab = "Grade",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)

# Count of students who do / do not take part in extra-curricular activities.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
# The x label describes `activities` (it previously said "Internet connection").
ggplot(data = student_mat, aes(x = activities)) +
  geom_bar(fill = "blue") +
  xlab("Extra-curricular activities") +
  ggtitle("Count of students based on participation in extra curricular activity")

# Percentage of each grade band by participation in activities.
table24 <- round(
  prop.table(table(student_mat$activities, student_mat$Grade), margin = 1),
  3
) * 100
barplot(
  table24,
  main = "Percentage comparison of Grades by whether \n participating in extra curricular activities",
  ylab = " % by participation in activities",
  legend.text = c("No", "Yes"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)

# +scale_color_manual(labels = c("Rural", "Urban"))

Count of students based on level of hanging out with friends

# Count of students per level of going out with friends.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
# The axis label and title describe `goout` (they previously repeated the
# free-time wording).
ggplot(data = student_mat, aes(x = goout)) +
  geom_bar(fill = "blue") +
  xlab("Going out with friends") +
  ggtitle("Count of students based on how often they go out with friends") +
  scale_x_discrete(
    labels = c("1" = "Very Low", "2" = "Low", "3" = "Nominal", "4" = "High", "5" = "Very High")
  )

# Percentage of each grade band within each going-out level.
table25 <- round(
  prop.table(table(student_mat$goout, student_mat$Grade), margin = 1),
  3
) * 100
barplot(
  table25,
  main = "Effect of hanging out on grade of students",
  ylab = " % by hanging out frequency",
  legend.text = c("Very Low", "Low", "Nominal", "High", "Very High"),
  args.legend = list(x = "topleft", bty = "n", inset = c(0, 0.1)),
  beside = TRUE,
  xlab = "Grade",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)

# Count of students per self-reported health level.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
ggplot(data = student_mat, aes(x = health)) +
  geom_bar(fill = "blue") +
  xlab("health") +
  ggtitle("Count of students based on their health") +
  scale_x_discrete(
    labels = c("1" = "Very Bad", "2" = "Bad", "3" = "Fine", "4" = "Good", "5" = "Very Good")
  )

# Percentage of each grade band within each health level.
table26 <- round(
  prop.table(table(student_mat$health, student_mat$Grade), margin = 1),
  3
) * 100
barplot(
  table26,
  main = "Effect of health on grade of students",
  ylab = " % by health",
  legend.text = c("Very Bad", "Bad", "Fine", "Good", "Very Good"),
  args.legend = list(x = "topleft", bty = "n", inset = c(0, 0.1)),
  beside = TRUE,
  xlab = "Grade",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)

# Histogram of the number of absences.
# The column is mapped by bare name inside aes() rather than
# student_mat$absences. `bins = 30` is stated explicitly: it is the value
# stat_bin() otherwise falls back to, so the chart is unchanged and the
# "Pick better value with `binwidth`" message is no longer emitted.
ggplot(data = student_mat, aes(x = absences)) +
  geom_histogram(fill = "blue", bins = 30) +
  xlab("Absenteeism") +
  ggtitle("Count of students based on their absenteeism")

# Count of students per grade band, split by absenteeism level.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
# The x-axis shows Grade and the fill shows absences1, so the axis label is
# "Grade" and the legend title is "Absenteeism" (they were swapped before).
ggplot(data = student_mat, aes(x = Grade, fill = absences1)) +
  scale_fill_manual(values = c("#7fc6bc", "#083642", "#b1df01", "#cdef9c")) +
  geom_bar(stat = "count", position = "dodge") +
  guides(fill = guide_legend(title = "Absenteeism", title.position = "top")) +
  ggtitle("Grades by absences") +
  xlab("Grade")

# Count of students per weekly alcohol-consumption level.
# Columns are mapped by bare name inside aes() rather than student_mat$col.
ggplot(data = student_mat, aes(x = alc1)) +
  geom_bar(fill = "blue") +
  xlab("Alcohol Consumption") +
  ggtitle("Count of students based on their alcohol consumption")

# Percentage of each grade band within each alcohol-consumption level.
table27 <- round(
  prop.table(table(student_mat$alc1, student_mat$Grade), margin = 1),
  3
) * 100
# The legend uses "Nominal" so it matches the alc1 factor level shown on the
# count plot above (it previously said "Normal").
barplot(
  table27,
  main = "Effect of alcohol consumption on grade of students",
  ylab = " % by alcohol consumption",
  legend.text = c("Very Low", "Low", "Nominal", "High", "Very High"),
  args.legend = list(x = "topleft", bty = "n", inset = c(0, 0.1)),
  beside = TRUE,
  xlab = "Grade",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)

#table28 <- table(student_mat$G11,student_mat$Grade) %>% prop.table(margin=1) %>% round(3)*100
# Scatter plot of first-term grade (G1) against final grade (G3).
# Columns are mapped by bare name inside aes() rather than student_mat$col.
p1 <- ggplot(data = student_mat, aes(x = G1, y = G3)) +
  geom_point() +
  xlab("First Term Grade") +
  ylab("Final grade") +
  ggtitle("Relation between First term grade and final grade")

# Scatter plot of second-term grade (G2) against final grade (G3).
# Every layer, including ggtitle(), must be chained with `+`. Without it the
# title is a separate statement that is printed to the console and never
# attached to p2.
p2 <- ggplot(data = student_mat, aes(x = G2, y = G3)) +
  geom_point() +
  xlab("Second Term Grade") +
  ylab("Final Term Grade") +
  ggtitle("Relation between Second term grade and Final grade")
library(grid)

# Build a viewport that occupies row `x`, column `y` of the enclosing layout.
vplayout <- function(x, y) {
  viewport(layout.pos.row = x, layout.pos.col = y)
}

grid.newpage()

# Page layout: a single column with two rows of equal relative height.
pushViewport(
  viewport(
    layout = grid.layout(nrow = 2, ncol = 1, heights = unit(c(3.5, 3.5), "null"))
  )
)
# p1 goes in the top cell, p2 in the bottom cell.
print(p1, vp = vplayout(1, 1))
print(p2, vp = vplayout(2, 1))