The dataset contains the student achievement in secondary education of two Portuguese schools. The objective of this project is to build classifiers that predict students' final-year grades in the Math subject from demographic, social and school-related attributes, and to see which of these factors affect the grades. The whole project is divided into two phases. Phase 1 focuses on the data preprocessing and the exploration of different attributes. Phase 2 mainly focuses on the model building for the prediction.
The dataset was collected from the UCI Machine Learning Repository, which provides a single dataset for this analysis. There are 33 attributes and 395 observations in the dataset. Out of the 33 attributes, there are 32 descriptive features and one target feature. In phase 1, we have only one dataset to work with. In phase 2, we will build a training dataset and a test dataset.
The R packages used for the analysis are the following.
library(knitr)
library(mlr)
## Loading required package: ParamHelpers
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
The data was semicolon delimited, so an appropriate function was used to read it into R. The string values are read as characters, so we will later convert those variables to factors.
## Parsed with column specification:
## cols(
## .default = col_character(),
## age = col_double(),
## Medu = col_double(),
## Fedu = col_double(),
## traveltime = col_double(),
## studytime = col_double(),
## failures = col_double(),
## famrel = col_double(),
## freetime = col_double(),
## goout = col_double(),
## Dalc = col_double(),
## Walc = col_double(),
## health = col_double(),
## absences = col_double(),
## G1 = col_double(),
## G2 = col_double(),
## G3 = col_double()
## )
## See spec(...) for full column specifications.
## [1] "school" "sex" "age" "address" "famsize"
## [6] "Pstatus" "Medu" "Fedu" "Mjob" "Fjob"
## [11] "reason" "guardian" "traveltime" "studytime" "failures"
## [16] "schoolsup" "famsup" "paid" "activities" "nursery"
## [21] "higher" "internet" "romantic" "famrel" "freetime"
## [26] "goout" "Dalc" "Walc" "health" "absences"
## [31] "G1" "G2" "G3"
Using str and summarizeColumns we figured out that:
-> No white spaces were encountered in the character column values. -> Apparently there were no typos or any missing values especially in the character columns
# Show the structure of the loaded data: one line per column with its type
# and first few values.
str(student_mat)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 395 obs. of 33 variables:
## $ school : chr "GP" "GP" "GP" "GP" ...
## $ sex : chr "F" "F" "F" "F" ...
## $ age : num 18 17 15 15 16 16 16 17 15 15 ...
## $ address : chr "U" "U" "U" "U" ...
## $ famsize : chr "GT3" "GT3" "LE3" "GT3" ...
## $ Pstatus : chr "A" "T" "T" "T" ...
## $ Medu : num 4 1 1 4 3 4 2 4 3 3 ...
## $ Fedu : num 4 1 1 2 3 3 2 4 2 4 ...
## $ Mjob : chr "at_home" "at_home" "at_home" "health" ...
## $ Fjob : chr "teacher" "other" "other" "services" ...
## $ reason : chr "course" "course" "other" "home" ...
## $ guardian : chr "mother" "father" "mother" "mother" ...
## $ traveltime: num 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : num 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : num 0 0 3 0 0 0 0 0 0 0 ...
## $ schoolsup : chr "yes" "no" "yes" "no" ...
## $ famsup : chr "no" "yes" "no" "yes" ...
## $ paid : chr "no" "no" "yes" "yes" ...
## $ activities: chr "no" "no" "no" "yes" ...
## $ nursery : chr "yes" "no" "yes" "yes" ...
## $ higher : chr "yes" "yes" "yes" "yes" ...
## $ internet : chr "no" "yes" "yes" "yes" ...
## $ romantic : chr "no" "no" "no" "yes" ...
## $ famrel : num 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : num 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : num 4 3 2 2 2 2 4 4 2 1 ...
## $ Dalc : num 1 1 2 1 1 1 1 1 1 1 ...
## $ Walc : num 1 1 3 1 2 2 1 1 1 1 ...
## $ health : num 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : num 6 4 10 2 4 10 0 6 0 0 ...
## $ G1 : num 5 5 7 15 6 15 12 6 16 14 ...
## $ G2 : num 6 5 8 14 10 15 12 5 18 15 ...
## $ G3 : num 6 6 10 15 10 15 11 6 19 15 ...
## - attr(*, "spec")=
## .. cols(
## .. school = col_character(),
## .. sex = col_character(),
## .. age = col_double(),
## .. address = col_character(),
## .. famsize = col_character(),
## .. Pstatus = col_character(),
## .. Medu = col_double(),
## .. Fedu = col_double(),
## .. Mjob = col_character(),
## .. Fjob = col_character(),
## .. reason = col_character(),
## .. guardian = col_character(),
## .. traveltime = col_double(),
## .. studytime = col_double(),
## .. failures = col_double(),
## .. schoolsup = col_character(),
## .. famsup = col_character(),
## .. paid = col_character(),
## .. activities = col_character(),
## .. nursery = col_character(),
## .. higher = col_character(),
## .. internet = col_character(),
## .. romantic = col_character(),
## .. famrel = col_double(),
## .. freetime = col_double(),
## .. goout = col_double(),
## .. Dalc = col_double(),
## .. Walc = col_double(),
## .. health = col_double(),
## .. absences = col_double(),
## .. G1 = col_double(),
## .. G2 = col_double(),
## .. G3 = col_double()
## .. )
# Per-column summary (type, NA count, centre/spread, min/max, number of levels)
# rendered as a captioned table via knitr::kable().
summarizeColumns(student_mat) %>% knitr::kable(caption='Feature Summary before Data Processing')
name | type | na | mean | disp | median | mad | min | max | nlevs |
---|---|---|---|---|---|---|---|---|---|
school | character | 0 | NA | 0.1164557 | NA | NA | 46 | 349 | 2 |
sex | character | 0 | NA | 0.4734177 | NA | NA | 187 | 208 | 2 |
age | numeric | 0 | 16.6962025 | 1.2760427 | 17 | 1.4826 | 15 | 22 | 0 |
address | character | 0 | NA | 0.2227848 | NA | NA | 88 | 307 | 2 |
famsize | character | 0 | NA | 0.2886076 | NA | NA | 114 | 281 | 2 |
Pstatus | character | 0 | NA | 0.1037975 | NA | NA | 41 | 354 | 2 |
Medu | numeric | 0 | 2.7493671 | 1.0947351 | 3 | 1.4826 | 0 | 4 | 0 |
Fedu | numeric | 0 | 2.5215190 | 1.0882005 | 2 | 1.4826 | 0 | 4 | 0 |
Mjob | character | 0 | NA | 0.6430380 | NA | NA | 34 | 141 | 5 |
Fjob | character | 0 | NA | 0.4506329 | NA | NA | 18 | 217 | 5 |
reason | character | 0 | NA | 0.6329114 | NA | NA | 36 | 145 | 4 |
guardian | character | 0 | NA | 0.3088608 | NA | NA | 32 | 273 | 3 |
traveltime | numeric | 0 | 1.4481013 | 0.6975048 | 1 | 0.0000 | 1 | 4 | 0 |
studytime | numeric | 0 | 2.0354430 | 0.8392403 | 2 | 0.0000 | 1 | 4 | 0 |
failures | numeric | 0 | 0.3341772 | 0.7436510 | 0 | 0.0000 | 0 | 3 | 0 |
schoolsup | character | 0 | NA | 0.1291139 | NA | NA | 51 | 344 | 2 |
famsup | character | 0 | NA | 0.3873418 | NA | NA | 153 | 242 | 2 |
paid | character | 0 | NA | 0.4582278 | NA | NA | 181 | 214 | 2 |
activities | character | 0 | NA | 0.4911392 | NA | NA | 194 | 201 | 2 |
nursery | character | 0 | NA | 0.2050633 | NA | NA | 81 | 314 | 2 |
higher | character | 0 | NA | 0.0506329 | NA | NA | 20 | 375 | 2 |
internet | character | 0 | NA | 0.1670886 | NA | NA | 66 | 329 | 2 |
romantic | character | 0 | NA | 0.3341772 | NA | NA | 132 | 263 | 2 |
famrel | numeric | 0 | 3.9443038 | 0.8966586 | 4 | 1.4826 | 1 | 5 | 0 |
freetime | numeric | 0 | 3.2354430 | 0.9988620 | 3 | 1.4826 | 1 | 5 | 0 |
goout | numeric | 0 | 3.1088608 | 1.1132782 | 3 | 1.4826 | 1 | 5 | 0 |
Dalc | numeric | 0 | 1.4810127 | 0.8907414 | 1 | 0.0000 | 1 | 5 | 0 |
Walc | numeric | 0 | 2.2911392 | 1.2878966 | 2 | 1.4826 | 1 | 5 | 0 |
health | numeric | 0 | 3.5544304 | 1.3903034 | 4 | 1.4826 | 1 | 5 | 0 |
absences | numeric | 0 | 5.7088608 | 8.0030957 | 4 | 5.9304 | 0 | 75 | 0 |
G1 | numeric | 0 | 10.9088608 | 3.3191947 | 11 | 4.4478 | 3 | 19 | 0 |
G2 | numeric | 0 | 10.7139241 | 3.7615047 | 11 | 2.9652 | 0 | 19 | 0 |
G3 | numeric | 0 | 10.4151899 | 4.5814426 | 11 | 4.4478 | 0 | 20 | 0 |
We created the level table for the character features. The following insights came up:
# Frequency table of the levels of every character column.
# vapply() guarantees a logical mask and lapply() always returns a named list;
# sapply() would silently collapse the result to a matrix if every column
# happened to have the same number of levels.
lapply(student_mat[vapply(student_mat, is.character, logical(1))], table)
## $school
##
## GP MS
## 349 46
##
## $sex
##
## F M
## 208 187
##
## $address
##
## R U
## 88 307
##
## $famsize
##
## GT3 LE3
## 281 114
##
## $Pstatus
##
## A T
## 41 354
##
## $Mjob
##
## at_home health other services teacher
## 59 34 141 103 58
##
## $Fjob
##
## at_home health other services teacher
## 20 18 217 111 29
##
## $reason
##
## course home other reputation
## 145 109 36 105
##
## $guardian
##
## father mother other
## 90 273 32
##
## $schoolsup
##
## no yes
## 344 51
##
## $famsup
##
## no yes
## 153 242
##
## $paid
##
## no yes
## 214 181
##
## $activities
##
## no yes
## 194 201
##
## $nursery
##
## no yes
## 81 314
##
## $higher
##
## no yes
## 20 375
##
## $internet
##
## no yes
## 66 329
##
## $romantic
##
## no yes
## 263 132
Just to be double sure we have used the “trimws” function to trim the white spaces present in the character column values.
# Strip leading/trailing whitespace from every character column.
# Each column is processed on its own with lapply(), so the replacement is a
# list of vectors (one per column) rather than the character matrix that
# sapply() would build, and the result shape does not depend on how many rows
# or character columns the data has.
chr_cols <- vapply(student_mat, is.character, logical(1))
student_mat[chr_cols] <- lapply(student_mat[chr_cols], trimws)
We modified four columns in original dataset.
G3: The target variable G3 contains numeric values from 0 to 20, but it is based on a 5-level classification system, viz. “Fail”, “Sufficient”, “Satisfactory”, “Good”, “Very Good”. So the new column Grade was created to assign these levels to the corresponding values in the G3 column (final grade / target variable).
guardian: There were three levels to this viz. “mother”, “father”, “other”. We modified it to two level viz. parents/others and stored the values into new column named guardian1.
We combined two columns Dalc (workday alcohol consumption) and Walc (weekend alcohol consumption) by averaging the value in both columns to get alcohol consumption during whole week. The averaged value was stored in column named alc. Further the average value was categorized based on 5 level classification system viz.“Very Low”, “Low”, “Nominal”, “High”, “Very High” and stored in a new column alc1
absences: This column had numeric data within the range 0 to 93. We changed it into the intervals [0, 10], (10, 30], (30, 50], (50, 93], named them “Very Low”, “Low”, “Medium”, “High” and stored them in the new column absences1.
# Bin the final grade G3 (0-20) into the five-level scale stored in `Grade`.
labels <- c("Fail", "Sufficient", "Satisfactory", "Good", "Very Good")
# G3 is passed to cut() exactly once, and include.lowest = TRUE is spelled out
# so the first interval is closed at 0: a final grade of 0 is classed as
# "Fail" instead of becoming NA, regardless of the values in the data.
student_mat$Grade <- cut(
  student_mat$G3,
  breaks = c(0, 9, 11, 13, 15, 20),
  labels = labels,
  include.lowest = TRUE,
  right = TRUE
)
#student_mat$G11 <- student_mat$G1 %>% cut(student_mat$G1, breaks = c(0,9,11,13,15,20),labels = labels, right = TRUE)
#student_mat$G21 <- student_mat$G2 %>% cut(student_mat$G2, breaks = c(0,9,11,13,15,20),labels = labels, right = TRUE)
# Collapse the guardian column into two groups, stored in guardian1:
# father/mother become "Parents", anything containing "other" becomes "other",
# and any remaining value is carried over unchanged.
student_mat <- mutate(
  student_mat,
  guardian1 = case_when(
    guardian %in% c("father", "mother") ~ "Parents",
    grepl("other", guardian) ~ "other",
    TRUE ~ guardian
  )
)
# Display the updated tibble.
print(student_mat)
## # A tibble: 395 x 35
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob
## <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 GP F 18 U GT3 A 4 4 at_h~ teac~
## 2 GP F 17 U GT3 T 1 1 at_h~ other
## 3 GP F 15 U LE3 T 1 1 at_h~ other
## 4 GP F 15 U GT3 T 4 2 heal~ serv~
## 5 GP F 16 U GT3 T 3 3 other other
## 6 GP M 16 U LE3 T 4 3 serv~ other
## 7 GP M 16 U LE3 T 2 2 other other
## 8 GP F 17 U GT3 A 4 4 other teac~
## 9 GP M 15 U LE3 A 3 2 serv~ other
## 10 GP M 15 U GT3 T 3 4 other other
## # ... with 385 more rows, and 25 more variables: reason <chr>,
## # guardian <chr>, traveltime <dbl>, studytime <dbl>, failures <dbl>,
## # schoolsup <chr>, famsup <chr>, paid <chr>, activities <chr>,
## # nursery <chr>, higher <chr>, internet <chr>, romantic <chr>,
## # famrel <dbl>, freetime <dbl>, goout <dbl>, Dalc <dbl>, Walc <dbl>,
## # health <dbl>, absences <dbl>, G1 <dbl>, G2 <dbl>, G3 <dbl>,
## # Grade <fct>, guardian1 <chr>
# Weekly alcohol consumption: mean of workday (Dalc) and weekend (Walc) scores.
student_mat$alc <- (student_mat$Dalc + student_mat$Walc) / 2

# Bin the averaged score into five ordered categories (alc1).
# include.lowest = TRUE is explicit so the lowest break is always part of the
# first bin, independent of the data values.
labels <- c("Very Low", "Low", "Nominal", "High", "Very High")
student_mat$alc1 <- cut(
  student_mat$alc,
  breaks = c(0, 1, 2, 3, 4, 5),
  labels = labels,
  include.lowest = TRUE,
  right = TRUE
)

# Bin the absence counts into four categories (absences1).
# include.lowest = TRUE keeps students with 0 absences in "Very Low"
# instead of turning them into NA.
labels <- c("Very Low", "Low", "Medium", "High")
student_mat$absences1 <- cut(
  student_mat$absences,
  breaks = c(0, 10, 30, 50, 93),
  labels = labels,
  include.lowest = TRUE,
  right = TRUE
)
# Ordinal-coded numeric columns that should be treated as categorical.
cols <- c("Medu", "Fedu", "famrel", "freetime", "goout", "Dalc", "Walc", "health", "traveltime", "studytime")
# Convert column by column. lapply() hands each column to as.factor() as-is;
# apply() would first coerce the data to a matrix and flatten the factors back
# to character, leaving the final type to data.frame()'s stringsAsFactors
# default (FALSE since R 4.0, i.e. character columns instead of factors).
student_mat[cols] <- lapply(student_mat[cols], as.factor)
# Re-run the per-column summary on the processed data.
summarizeColumns(student_mat) %>% kable(caption='Summary Statistics after data pre-processing')
name | type | na | mean | disp | median | mad | min | max | nlevs |
---|---|---|---|---|---|---|---|---|---|
school | character | 0 | NA | 0.1164557 | NA | NA | 46 | 349 | 2 |
sex | character | 0 | NA | 0.4734177 | NA | NA | 187 | 208 | 2 |
age | numeric | 0 | 16.6962025 | 1.2760427 | 17.0 | 1.4826 | 15 | 22 | 0 |
address | character | 0 | NA | 0.2227848 | NA | NA | 88 | 307 | 2 |
famsize | character | 0 | NA | 0.2886076 | NA | NA | 114 | 281 | 2 |
Pstatus | character | 0 | NA | 0.1037975 | NA | NA | 41 | 354 | 2 |
Medu | factor | 0 | NA | 0.6683544 | NA | NA | 3 | 131 | 5 |
Fedu | factor | 0 | NA | 0.7088608 | NA | NA | 2 | 115 | 5 |
Mjob | character | 0 | NA | 0.6430380 | NA | NA | 34 | 141 | 5 |
Fjob | character | 0 | NA | 0.4506329 | NA | NA | 18 | 217 | 5 |
reason | character | 0 | NA | 0.6329114 | NA | NA | 36 | 145 | 4 |
guardian | character | 0 | NA | 0.3088608 | NA | NA | 32 | 273 | 3 |
traveltime | factor | 0 | NA | 0.3493671 | NA | NA | 8 | 257 | 4 |
studytime | factor | 0 | NA | 0.4987342 | NA | NA | 27 | 198 | 4 |
failures | numeric | 0 | 0.3341772 | 0.7436510 | 0.0 | 0.0000 | 0 | 3 | 0 |
schoolsup | character | 0 | NA | 0.1291139 | NA | NA | 51 | 344 | 2 |
famsup | character | 0 | NA | 0.3873418 | NA | NA | 153 | 242 | 2 |
paid | character | 0 | NA | 0.4582278 | NA | NA | 181 | 214 | 2 |
activities | character | 0 | NA | 0.4911392 | NA | NA | 194 | 201 | 2 |
nursery | character | 0 | NA | 0.2050633 | NA | NA | 81 | 314 | 2 |
higher | character | 0 | NA | 0.0506329 | NA | NA | 20 | 375 | 2 |
internet | character | 0 | NA | 0.1670886 | NA | NA | 66 | 329 | 2 |
romantic | character | 0 | NA | 0.3341772 | NA | NA | 132 | 263 | 2 |
famrel | factor | 0 | NA | 0.5063291 | NA | NA | 8 | 195 | 5 |
freetime | factor | 0 | NA | 0.6025316 | NA | NA | 19 | 157 | 5 |
goout | factor | 0 | NA | 0.6708861 | NA | NA | 23 | 130 | 5 |
Dalc | factor | 0 | NA | 0.3012658 | NA | NA | 9 | 276 | 5 |
Walc | factor | 0 | NA | 0.6177215 | NA | NA | 28 | 151 | 5 |
health | factor | 0 | NA | 0.6303797 | NA | NA | 45 | 146 | 5 |
absences | numeric | 0 | 5.7088608 | 8.0030957 | 4.0 | 5.9304 | 0 | 75 | 0 |
G1 | numeric | 0 | 10.9088608 | 3.3191947 | 11.0 | 4.4478 | 3 | 19 | 0 |
G2 | numeric | 0 | 10.7139241 | 3.7615047 | 11.0 | 2.9652 | 0 | 19 | 0 |
G3 | numeric | 0 | 10.4151899 | 4.5814426 | 11.0 | 4.4478 | 0 | 20 | 0 |
Grade | factor | 0 | NA | 0.6708861 | NA | NA | 40 | 130 | 5 |
guardian1 | character | 0 | NA | 0.0810127 | NA | NA | 32 | 363 | 2 |
alc | numeric | 0 | 1.8860759 | 0.9921947 | 1.5 | 0.7413 | 1 | 5 | 0 |
alc1 | factor | 0 | NA | 0.6202532 | NA | NA | 13 | 150 | 5 |
absences1 | factor | 0 | NA | 0.1670886 | NA | NA | 2 | 329 | 4 |
#student_mat %>% select(student_G3) %>% summary() %>% kable(caption='Summary Statistics after data pre- processing')
Each feature was explored individually and was split by defined classes of target feature. Further we did multivariate visualization.
**4.1.1 Numerical Features**
4.1.1.1 Age: The age of students in the high school varies between 15 to 22. With maximum students of age 16. We see that the “Very Good” performance is shown by the students of age 16. 100% failure occurs in the age group 21 and 22. Maximum number of Best performances (“Good” and “Very Good”) is given by the students of age group 20. As a general trend the number of failures increases with increase in age.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Row-wise percentages: for each age (row), the share of students in each
# Grade level (column), rounded to one decimal place.
table <- round(prop.table(table(student_mat$age, student_mat$Grade), margin = 1), 3) * 100
library(RColorBrewer)
# First version: one bar per age within each grade, default palette colours 1-8.
barplot(
  table,
  main = "Percentage comparison of Grades by age",
  ylab = "Percentage by age",
  legend.text = rownames(table),
  beside = TRUE,
  xlab = "Grade",
  col = 1:8,
  ylim = c(0, 100)
)
# c("#7fc6bc","#083642","#b1df01","#cdef9c","#466b5d")
# Second version: same plot with a hand-picked pastel palette.
barplot(
  table,
  main = "Percentage comparison of Grades by age",
  ylab = "Percentage by age",
  legend.text = rownames(table),
  beside = TRUE,
  xlab = "Grade",
  ylim = c(0, 100),
  col = c("#99ffcc", "#ccff99", "#ff9966", "#d27979", "#ffcccc", "#e0e0d1", "#ffff99", "#c2c2d6")
)
4.1.2 Categorical Features
4.1.2.1 School The data has been collected from two schools viz. GP and MS. Most of the students study in GP. The performance of students studying in GP is much better than those studying in MS. This justifies why the strength of students is much more in GP than in MS.
4.1.2.2 Address Type (Urban/ Rural) Majority of students stay in Urban areas. And the performance of urban students is better than those of rural area. Rural students have higher number of failures (below satisfactory performance).
4.1.2.3 Family size Most of the students belong to families with size greater than 3. There is not very clear implication from this relationship as though below average performance is more common in students with family size greater than 3 but many students from such families have given “Good” performances also.
# Count of students per family size. The column is referenced by name inside
# aes() so it is looked up in `data`, rather than pulled from the global
# object with `$` (which ggplot2 discourages).
ggplot(data = student_mat, aes(x = famsize)) +
  geom_bar(fill = "blue") +
  xlab("Family size") +
  ggtitle("Number of students from different family size") +
  scale_x_discrete(labels = c("LE3" = "<= 3", "GT3" = ">3"))
# Percentage of each Grade level within each family-size group (rows sum to 100).
table3 <- table(student_mat$famsize, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table3,
  main = "Percentage comparison of Grades by family size",
  ylab = "Percentage by family size",
  legend.text = c("Greater than 3", "Less than equal to 3"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)
4.1.2.4 Parents’ cohabitation status Most of the students have parents staying together. But this doesn’t seem to indicate their performance. No clear insight can be drawn from it.
# Count of students by parents' cohabitation status (A = apart, T = together).
ggplot(data = student_mat, aes(x = Pstatus)) +
  geom_bar(fill = "blue") +
  xlab("Parent Status") +
  ggtitle("Number of students from different parent cohabitation status") +
  scale_x_discrete(labels = c("A" = "Apart", "T" = "Together"))
# Percentage of each Grade level within each cohabitation group.
table4 <- table(student_mat$Pstatus, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
# The y label names the variable actually plotted, and the legend uses the
# same "Apart"/"Together" wording as the count plot above.
barplot(
  table4,
  main = "Percentage comparison of Grades by Parents' cohabitation status",
  ylab = "Percentage by parents' cohabitation status",
  legend.text = c("Apart", "Together"),
  beside = TRUE,
  xlab = "Grade",
  col = c("blue", "green"),
  ylim = c(0, 100)
)
4.1.2.5 Mother’s and Father’s education
# Count of students per level of mother's education (codes 0-4 relabelled).
# The column is named directly in aes() instead of using `student_mat$...`.
ggplot(data = student_mat, aes(x = Medu)) +
  geom_bar(fill = "blue") +
  xlab("Mother's Education") +
  ggtitle("Count of students with different mother's education") +
  scale_x_discrete(labels = c("0" = "Uneducated", "1" = "up to 4th grade", "2" = "5th to 9th grade", "3" = "Secondary education", "4" = "Higher education"))
4.1.2.6 Father’s education Fathers of majority of students have received at least middle level education (5th to 9th grade). Only few have education level below this level. With regards to higher education mothers’ number is more in comparison to that of fathers’.
# Count of students per level of father's education (codes 0-4 relabelled).
# The column is named directly in aes() instead of using `student_mat$...`.
ggplot(data = student_mat, aes(x = Fedu)) +
  geom_bar(fill = "blue") +
  xlab("Father's Education") +
  ggtitle("Count of students with different father's education") +
  scale_x_discrete(labels = c("0" = "Uneducated", "1" = "up to 4th grade", "2" = "5th to 9th grade", "3" = "Secondary education", "4" = "Higher education"))
# Percentage of each Grade level within each mother's-job category.
table7 <- table(student_mat$Mjob, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table7,
  main = "Percentage comparison of grades by mother's job",
  ylab = "Percentage by Mother's job",
  legend.text = c("Homemaker", "Healthcare related", "Others", "Civil Services", "Teacher"),
  beside = TRUE,
  xlab = "Grade",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)
# +scale_color_manual(labels = c("Rural", "Urban"))
# topo.colors(12)
# Percentage of each Grade level within each father's-job category.
table8 <- table(student_mat$Fjob, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table8,
  main = "Percentage comparison of grades by Father's job",
  ylab = "Percentage by Father's job",
  legend.text = c("At home", "Healthcare related", "Others", "Civil Services", "Teacher"),
  beside = TRUE,
  xlab = "Grade",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)
# +scale_color_manual(labels = c("Rural", "Urban"))
# topo.colors(12)
# Stack the mother/father plots in a 2 x 1 layout. The previous graphics
# settings are saved and restored afterwards so that later base-graphics plots
# are not squeezed into the same two-row layout.
old_par <- par(mfrow = c(2, 1))
barplot(
  table7,
  main = "Percentage comparison of grades by Mother and Father Job",
  ylab = "% by Mother's job",
  beside = TRUE,
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)
barplot(
  table8,
  ylab = "% by Father's job",
  legend.text = c("Home", "Health", "Other", "CS", "Teacher"),
  args.legend = list(x = 'bottom', bty = 'n', inset = c(-0.08, 0.25)),
  beside = TRUE,
  xlab = "Grade",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)
par(old_par)
# Count of students by guardian group (guardian1: "Parents" vs "other").
ggplot(data = student_mat, aes(x = guardian1)) +
  geom_bar(fill = "blue") +
  xlab("Guardian") +
  ggtitle("Count of students based on guardian status")
# Percentage of each Grade level within each guardian group.
table9 <- table(student_mat$guardian1, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
# The legend is taken from the table's own row names: the row order of
# "other"/"Parents" comes from a locale-dependent sort, so a hard-coded
# c("Parents", "other") can attach the labels to the wrong bars.
barplot(
  table9,
  main = "Percentage comparison of grades by guardian type",
  ylab = "Percentage by guardian's type",
  beside = TRUE,
  xlab = "Grade",
  col = c("blue", "green"),
  legend.text = rownames(table9),
  ylim = c(0, 100)
)
# +scale_color_manual(labels = c("Rural", "Urban"))
# topo.colors(12)
# Count of students per home-to-school travel time category (codes 1-4).
# The column is named directly in aes() instead of using `student_mat$...`.
ggplot(data = student_mat, aes(x = traveltime)) +
  geom_bar(fill = "blue") +
  xlab("Travel Time") +
  ggtitle("Count of students based on travel time") +
  scale_x_discrete(labels = c("1" = "< 15 min", "2" = "15-30 min", "3" = "30 min- 1 hr", "4" = "> 1 hr"))
# Percentage of each Grade level within each travel-time category.
table10 <- table(student_mat$traveltime, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table10,
  main = "Percentage comparison of Grades by travel time",
  ylab = "Percentage by travel time",
  legend.text = c("< 15 min", "15-30 min", "30 min- 1 hr", "> 1 hr"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue", "red", "black"),
  ylim = c(0, 100)
)
# +scale_color_manual(labels = c("Rural", "Urban"))
# Count of students per weekly study time category (codes 1-4).
ggplot(data = student_mat, aes(x = studytime)) +
  geom_bar(fill = "blue") +
  xlab("Study Time") +
  ggtitle("Count of students based on study time") +
  scale_x_discrete(labels = c("1" = "< 2 hrs", "2" = "2-5 hrs", "3" = "5-10 hrs", "4" = "> 10 hrs"))
# Percentage of each Grade level within each study-time category.
# The table is built from `studytime` so that it matches the plot's title,
# axis label and legend (not from `traveltime`).
table11 <- table(student_mat$studytime, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table11,
  main = "Percentage comparison of Grades by study time",
  ylab = "Percentage by study time",
  legend.text = c("< 2 hrs", "2-5 hrs", "5-10 hrs", "> 10 hrs"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue", "red", "black"),
  ylim = c(0, 100)
)
# +scale_color_manual(labels = c("Rural", "Urban"))
# Split of the sexes within each study-time category (each study-time row sums to 100).
table12 <- table(student_mat$studytime, student_mat$sex) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table12,
  main = "Percentage comparison of no. of study hours with sex",
  ylab = "% of male/females with different study times",
  legend.text = c("< 2 hrs", "2-5 hrs", "5-10 hrs", "> 10 hrs"),
  beside = TRUE,
  xlab = "Gender",
  col = c("green", "blue", "red", "black"),
  ylim = c(0, 100)
)
# +scale_color_manual(labels = c("Rural", "Urban"))
# Count of students by number of past class failures.
# The column is named directly in aes() instead of using `student_mat$...`.
ggplot(data = student_mat, aes(x = failures)) +
  geom_bar(fill = "blue") +
  xlab("failures") +
  ggtitle("Count of students based on past class failures")
# Percentage of each Grade level within each past-failure count.
table13 <- table(student_mat$failures, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table13,
  main = "Percentage comparison of Grades by no. of past failures",
  ylab = "Percentage by past failures",
  legend.text = c("0", "1", "2", "3"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue", "red", "black"),
  ylim = c(0, 100)
)
# +scale_color_manual(labels = c("Rural", "Urban"))
# Count of students with/without extra educational support from school.
# The column is named directly in aes() instead of using `student_mat$...`.
ggplot(data = student_mat, aes(x = schoolsup)) +
  geom_bar(fill = "blue") +
  xlab("extra educational support") +
  ggtitle("Count of students based on extra educational support")
#table14 %>% barplot(main="Percentage comparison of Grades by extra educational support", ylab=" % by educational suport", legend= c("No", "Yes"), beside=TRUE, xlab="Grade", col=c("green", "blue"), ylim=c(0,100))
# +scale_color_manual(labels = c("Rural", "Urban"))
# Percentage of each Grade level for students with/without family support.
table15 <- table(student_mat$famsup, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table15,
  main = "Percentage comparison of Grades by educational support from family",
  ylab = " % by educational suport",
  legend.text = c("No", "Yes"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)
# +scale_color_manual(labels = c("Rural", "Urban"))
# Count of students with/without paid extra classes.
# The column is named directly in aes() instead of using `student_mat$...`.
ggplot(data = student_mat, aes(x = paid)) +
  geom_bar(fill = "blue") +
  xlab("paid educational support") +
  ggtitle("Count of students based on paid educational support")
# Percentage of each Grade level for students with/without paid support.
table16 <- table(student_mat$paid, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table16,
  main = "Percentage comparison of Grades by paid educational support",
  ylab = " % by educational suport",
  legend.text = c("No", "Yes"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)
# +scale_color_manual(labels = c("Rural", "Urban"))
#par(mfrow=c(3,1))
#table14 %>% barplot(main="Percentage comparison of Grades by extra educational support", ylab=" % by educational suport", legend= c("No", "Yes"), beside=TRUE, xlab="Grade", col=c("green", "blue"), ylim=c(0,100))
# The family-support and paid-support plots are drawn again, this time without legends.
barplot(
  table15,
  main = "Percentage comparison of Grades by educational support from family",
  ylab = " % by educational suport",
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)
barplot(
  table16,
  main = "Percentage comparison of Grades by paid educational support",
  ylab = " % by educational suport",
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)
# Count of students who do / do not want to pursue higher education.
# The column is named directly in aes() instead of using `student_mat$...`.
ggplot(data = student_mat, aes(x = higher)) +
  geom_bar(fill = "blue") +
  xlab("Interest in higher education") +
  ggtitle("Count of students based on interest in higher education")
# Percentage of each Grade level within each group.
table17 <- table(student_mat$higher, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table17,
  main = "Percentage comparison of Grades by interest in higher education",
  ylab = " % by interest in higher education",
  legend.text = c("No", "Yes"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)
# +scale_color_manual(labels = c("Rural", "Urban"))
# Count of students who did / did not attend nursery school.
# The column is named directly in aes() instead of using `student_mat$...`.
ggplot(data = student_mat, aes(x = nursery)) +
  geom_bar(fill = "blue") +
  xlab("Attended nursery school") +
  ggtitle("Count of students based on whether done nursery or not")
# Percentage of each Grade level within each group.
table18 <- table(student_mat$nursery, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table18,
  main = "Percentage comparison of Grades by whether \n attended nursery school or not",
  ylab = " % by nursery",
  legend.text = c("No", "Yes"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)
# +scale_color_manual(labels = c("Rural", "Urban"))
# Count of students with / without an internet connection.
# The column is named directly in aes() instead of using `student_mat$...`.
ggplot(data = student_mat, aes(x = internet)) +
  geom_bar(fill = "blue") +
  xlab("Internet connection") +
  ggtitle("Count of students based on whether having internet connection or not")
# Percentage of each Grade level within each group.
table19 <- table(student_mat$internet, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table19,
  main = "Percentage comparison of Grades by whether \n have internet connection or not",
  ylab = " % by internet connection",
  legend.text = c("No", "Yes"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)
# +scale_color_manual(labels = c("Rural", "Urban"))
# Count of students with / without a romantic relationship.
# The column is named directly in aes() instead of using `student_mat$...`.
ggplot(data = student_mat, aes(x = romantic)) +
  geom_bar(fill = "blue") +
  xlab("Romantic Relationship") +
  ggtitle("Count of students based on whether having romantic relationship or not")
# Count of students per family-relationship quality score (1-5).
ggplot(data = student_mat, aes(x = famrel)) +
  geom_bar(fill = "blue") +
  xlab("family relationship quality") +
  ggtitle("Count of students based on type of family relationships")
# Percentage of each Grade level within each family-relationship score.
table21 <- table(student_mat$famrel, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
# The y label refers to family relationships, which is what the rows hold.
barplot(
  table21,
  main = "Percentage comparison of Grades by quality of family relations",
  ylab = " % by quality of family relationships",
  legend.text = c("Very Bad", "Bad", "Fair", "Good", "Excellent"),
  beside = TRUE,
  xlab = "Grade",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)
# Split of parents' cohabitation status within each family-relationship score.
table22 <- table(student_mat$famrel, student_mat$Pstatus) %>% prop.table(margin = 1) %>% round(3) * 100
# The columns of table22 are the Pstatus levels, so the x axis is labelled
# with the cohabitation status rather than "Grade".
barplot(
  table22,
  main = "Relationship of Parents cohabitation status \n and quality of family relations",
  ylab = " % by Parents",
  legend.text = c("Very Bad", "Bad", "Fair", "Good", "Excellent"),
  args.legend = list(x = 'topleft', bty = 'n', inset = c(0, 0.1)),
  beside = TRUE,
  xlab = "Parents' cohabitation status",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)
# Count of students per free-time level (codes 1-5 relabelled).
# The column is named directly in aes() instead of using `student_mat$...`.
ggplot(data = student_mat, aes(x = freetime)) +
  geom_bar(fill = "blue") +
  xlab("freetime") +
  ggtitle("Count of students based on their free time") +
  scale_x_discrete(labels = c("1" = "Very Low", "2" = "Low", "3" = "Nominal", "4" = "High", "5" = "Very High"))
# Percentage of each Grade level within each free-time level.
table23 <- table(student_mat$freetime, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table23,
  main = "Effect of freetime on grade of students",
  ylab = " % by freetime",
  legend.text = c("Very Low", "Low", "Nominal", "High", "Very High"),
  args.legend = list(x = 'topleft', bty = 'n', inset = c(0, 0.1)),
  beside = TRUE,
  xlab = "Grade",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)
# Count of students who do / do not take part in extra-curricular activities.
# The x label describes the plotted `activities` column.
ggplot(data = student_mat, aes(x = activities)) +
  geom_bar(fill = "blue") +
  xlab("Extra-curricular activities") +
  ggtitle("Count of students based on participation in extra curricular activity")
# Percentage of each Grade level within each group.
table24 <- table(student_mat$activities, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table24,
  main = "Percentage comparison of Grades by whether \n participating in extra curricular activities",
  ylab = " % by participation in activities",
  legend.text = c("No", "Yes"),
  beside = TRUE,
  xlab = "Grade",
  col = c("green", "blue"),
  ylim = c(0, 100)
)
# +scale_color_manual(labels = c("Rural", "Urban"))
# Count of students per going-out frequency level (codes 1-5 relabelled).
# The axis label and title describe the plotted `goout` column.
ggplot(data = student_mat, aes(x = goout)) +
  geom_bar(fill = "blue") +
  xlab("Going out with friends") +
  ggtitle("Count of students based on how often they go out") +
  scale_x_discrete(labels = c("1" = "Very Low", "2" = "Low", "3" = "Nominal", "4" = "High", "5" = "Very High"))
# Percentage of each Grade level within each going-out level.
table25 <- table(student_mat$goout, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table25,
  main = "Effect of hanging out on grade of students",
  ylab = " % by hanging out frequency",
  legend.text = c("Very Low", "Low", "Nominal", "High", "Very High"),
  args.legend = list(x = 'topleft', bty = 'n', inset = c(0, 0.1)),
  beside = TRUE,
  xlab = "Grade",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)
# Count of students per health status level (codes 1-5 relabelled).
# The column is named directly in aes() instead of using `student_mat$...`.
ggplot(data = student_mat, aes(x = health)) +
  geom_bar(fill = "blue") +
  xlab("health") +
  ggtitle("Count of students based on their health") +
  scale_x_discrete(labels = c("1" = "Very Bad", "2" = "Bad", "3" = "Fine", "4" = "Good", "5" = "Very Good"))
# Percentage of each Grade level within each health level.
table26 <- table(student_mat$health, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
barplot(
  table26,
  main = "Effect of health on grade of students",
  ylab = " % by health",
  legend.text = c("Very Bad", "Bad", "Fine", "Good", "Very Good"),
  args.legend = list(x = 'topleft', bty = 'n', inset = c(0, 0.1)),
  beside = TRUE,
  xlab = "Grade",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)
# Histogram of the raw absence counts.
# bins = 30 is the value stat_bin() falls back to by default; stating it
# explicitly keeps the same plot while avoiding the "Pick better value"
# message.
ggplot(data = student_mat, aes(x = absences)) +
  geom_histogram(fill = "blue", bins = 30) +
  xlab("Absenteeism") +
  ggtitle("Count of students based on their absenteeism")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Number of students per Grade level, with one dodged bar per absence
# category (absences1).
# Grade is on the x axis and absences1 drives the fill, so the x axis is
# labelled "Grade" and the fill legend "Absenteeism".
ggplot(data = student_mat, aes(x = Grade, fill = absences1)) +
  scale_fill_manual(values = c("#7fc6bc", "#083642", "#b1df01", "#cdef9c")) +
  geom_bar(stat = "count", position = "dodge") +
  guides(fill = guide_legend(title = "Absenteeism", title.position = "top")) +
  ggtitle("Grades by absences") +
  xlab("Grade")
# Count of students per weekly alcohol-consumption category (alc1).
# The column is named directly in aes() instead of using `student_mat$...`.
ggplot(data = student_mat, aes(x = alc1)) +
  geom_bar(fill = "blue") +
  xlab("Alcohol Consumption") +
  ggtitle("Count of students based on their alcohol consumption")
# Percentage of each Grade level within each alcohol-consumption category.
table27 <- table(student_mat$alc1, student_mat$Grade) %>% prop.table(margin = 1) %>% round(3) * 100
# The legend is read from the table's row names (the alc1 levels) so it cannot
# drift from the category names used when alc1 was created.
barplot(
  table27,
  main = "Effect of alcohol consumption on grade of students",
  ylab = " % by alcohol consumption",
  legend.text = rownames(table27),
  args.legend = list(x = 'topleft', bty = 'n', inset = c(0, 0.1)),
  beside = TRUE,
  xlab = "Grade",
  col = c("black", "red", "blue", "green", "dark blue"),
  ylim = c(0, 100)
)
#table28 <- table(student_mat$G11,student_mat$Grade) %>% prop.table(margin=1) %>% round(3)*100
#table28 <- table(student_mat$G11,student_mat$Grade) %>% prop.table(margin=1) %>% round(3)*100
# Scatter plot of first-term grade (G1) against final grade (G3).
p1 <- ggplot(data = student_mat, aes(x = G1, y = G3)) +
  geom_point() +
  xlab("First Term Grade") +
  ylab("Final grade") +
  ggtitle("Relation between First term grade and final grade")
# Scatter plot of second-term grade (G2) against final grade (G3).
# Every layer, including ggtitle(), is chained with `+` so the title becomes
# part of p2 instead of being evaluated as a separate statement.
p2 <- ggplot(data = student_mat, aes(x = G2, y = G3)) +
  geom_point() +
  xlab("Second Term Grade") +
  ylab("Final Term Grade") +
  ggtitle("Relation between Second term grade and Final grade")
## $title
## [1] "Relation between Second term grade and Final grade"
##
## attr(,"class")
## [1] "labels"
library(grid)
# Return a grid viewport that addresses one cell of the enclosing layout:
# `x` is the layout row and `y` the layout column.
vplayout <- function(x, y) {
  viewport(layout.pos.col = y, layout.pos.row = x)
}
# Start a fresh grid page and draw the two scatter plots one above the other.
grid.newpage()
# Two-row, one-column layout; both rows get the same relative ("null") height.
pushViewport(viewport(layout = grid.layout(2,1, heights = unit(c(3.5,3.5), "null"))))
# p1 is printed into the top cell (row 1), p2 into the bottom cell (row 2).
print(p1, vp = vplayout(1, 1))
print(p2, vp = vplayout(2, 1))