Data Description

The purpose of the assignment is to analyse student performance data (read from student-mat.csv and student-por.csv) and to explore which student attributes are associated with passing or failing the final grade.

Data Preprocessing

# Show the directory that the relative file paths below are resolved against.
getwd()
## [1] "/Users/mohammadrazzak/Documents/University/RMIT/Machine Learning/Assignment phase 1/phase 2"
# The working directory is deliberately not changed with setwd(): the
# hard-coded absolute path exists on one machine only, and the output above
# shows the session already runs in the data directory. Run this report from
# the folder that contains the CSV files.

# Read the first data file: fields are separated by semicolons and
# surrounding whitespace is trimmed from every field.
student_mat <- read_delim(
  "student-mat.csv",
  delim = ";",
  escape_double = FALSE,
  trim_ws = TRUE
)
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   age = col_integer(),
##   Medu = col_integer(),
##   Fedu = col_integer(),
##   traveltime = col_integer(),
##   studytime = col_integer(),
##   failures = col_integer(),
##   famrel = col_integer(),
##   freetime = col_integer(),
##   goout = col_integer(),
##   Dalc = col_integer(),
##   Walc = col_integer(),
##   health = col_integer(),
##   absences = col_integer(),
##   G1 = col_integer(),
##   G2 = col_integer(),
##   G3 = col_integer()
## )
## See spec(...) for full column specifications.
# Read the second data file with the same settings as the first:
# semicolon-delimited, with surrounding whitespace trimmed from every field.
student_por <- read_delim(
  "student-por.csv",
  delim = ";",
  escape_double = FALSE,
  trim_ws = TRUE
)
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   age = col_integer(),
##   Medu = col_integer(),
##   Fedu = col_integer(),
##   traveltime = col_integer(),
##   studytime = col_integer(),
##   failures = col_integer(),
##   famrel = col_integer(),
##   freetime = col_integer(),
##   goout = col_integer(),
##   Dalc = col_integer(),
##   Walc = col_integer(),
##   health = col_integer(),
##   absences = col_integer(),
##   G1 = col_integer(),
##   G2 = col_integer(),
##   G3 = col_integer()
## )
## See spec(...) for full column specifications.
# Stack the two tables row-wise into a single data set.
total <- rbind(student_mat, student_por)


# Replace the 33 column names with descriptive ones. The assignment is
# positional, so the order below must match the column order of the files;
# the trailing comment on each line is the column name as read from the CSV.
names(total) <- c(
  "school",                      # school
  "sex",                         # sex
  "age",                         # age
  "address",                     # address
  "Family_Size",                 # famsize
  "Parental_Status",             # Pstatus
  "Mothers_Education",           # Medu
  "Fathers_Education",           # Fedu
  "Mothers_Job",                 # Mjob
  "Fathers_Job",                 # Fjob
  "Enrollment_Reason",           # reason
  "Guardian",                    # guardian
  "Travel_Time",                 # traveltime
  "Study_Time",                  # studytime
  "Class_Repeat",                # failures
  "Extra_Edu_Support",           # schoolsup
  "Family_Edu_Support",          # famsup
  "Extra_paid_classes",          # paid
  "Extra_Curricular_Activities", # activities
  "Nursery_Schooling",           # nursery
  "Higher_Education",            # higher
  "Home_Internet",               # internet
  "Romantic_Relationship",       # romantic
  "Family_Relationship",         # famrel
  "Free_Time_Post_School",       # freetime
  "Socialising",                 # goout
  "Daily_Alcohol_Consumption",   # Dalc
  "Weekend_Alcohol_Consumption", # Walc
  "Health_Status",               # health
  "School_Absences",             # absences
  "Sem1_Grade",                  # G1
  "Sem2_Grade",                  # G2
  "Final_Grade"                  # G3
)

# Preview the first rows to confirm the new names.
head(total)
## # A tibble: 6 x 33
##   school sex     age address Family_Size Parental_Status Mothers_Education
##   <chr>  <chr> <int> <chr>   <chr>       <chr>                       <int>
## 1 GP     F        18 U       GT3         A                               4
## 2 GP     F        17 U       GT3         T                               1
## 3 GP     F        15 U       LE3         T                               1
## 4 GP     F        15 U       GT3         T                               4
## 5 GP     F        16 U       GT3         T                               3
## 6 GP     M        16 U       LE3         T                               4
## # ... with 26 more variables: Fathers_Education <int>, Mothers_Job <chr>,
## #   Fathers_Job <chr>, Enrollment_Reason <chr>, Guardian <chr>,
## #   Travel_Time <int>, Study_Time <int>, Class_Repeat <int>,
## #   Extra_Edu_Support <chr>, Family_Edu_Support <chr>,
## #   Extra_paid_classes <chr>, Extra_Curricular_Activities <chr>,
## #   Nursery_Schooling <chr>, Higher_Education <chr>, Home_Internet <chr>,
## #   Romantic_Relationship <chr>, Family_Relationship <int>,
## #   Free_Time_Post_School <int>, Socialising <int>,
## #   Daily_Alcohol_Consumption <int>, Weekend_Alcohol_Consumption <int>,
## #   Health_Status <int>, School_Absences <int>, Sem1_Grade <int>,
## #   Sem2_Grade <int>, Final_Grade <int>

Inspect and Understand

# Derive the binary target: a final grade of 10 or more is a "Pass",
# anything lower is a "Fail".
total$Pass_Fail <- ifelse(total$Final_Grade >= 10, "Pass", "Fail")
# View() opens a data viewer window, which is only meaningful in an
# interactive session; skip it when the report is rendered or run in batch.
if (interactive()) {
  View(total)
}
head(total)
## # A tibble: 6 x 34
##   school sex     age address Family_Size Parental_Status Mothers_Education
##   <chr>  <chr> <int> <chr>   <chr>       <chr>                       <int>
## 1 GP     F        18 U       GT3         A                               4
## 2 GP     F        17 U       GT3         T                               1
## 3 GP     F        15 U       LE3         T                               1
## 4 GP     F        15 U       GT3         T                               4
## 5 GP     F        16 U       GT3         T                               3
## 6 GP     M        16 U       LE3         T                               4
## # ... with 27 more variables: Fathers_Education <int>, Mothers_Job <chr>,
## #   Fathers_Job <chr>, Enrollment_Reason <chr>, Guardian <chr>,
## #   Travel_Time <int>, Study_Time <int>, Class_Repeat <int>,
## #   Extra_Edu_Support <chr>, Family_Edu_Support <chr>,
## #   Extra_paid_classes <chr>, Extra_Curricular_Activities <chr>,
## #   Nursery_Schooling <chr>, Higher_Education <chr>, Home_Internet <chr>,
## #   Romantic_Relationship <chr>, Family_Relationship <int>,
## #   Free_Time_Post_School <int>, Socialising <int>,
## #   Daily_Alcohol_Consumption <int>, Weekend_Alcohol_Consumption <int>,
## #   Health_Status <int>, School_Absences <int>, Sem1_Grade <int>,
## #   Sem2_Grade <int>, Final_Grade <int>, Pass_Fail <chr>
# Number of rows and columns in the combined table.
dim(total)
## [1] 1044   34
# Column-by-column structure; the output also lists the "spec" attribute
# (the per-column collectors) that is attached to the table.
str(total)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1044 obs. of  34 variables:
##  $ school                     : chr  "GP" "GP" "GP" "GP" ...
##  $ sex                        : chr  "F" "F" "F" "F" ...
##  $ age                        : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ address                    : chr  "U" "U" "U" "U" ...
##  $ Family_Size                : chr  "GT3" "GT3" "LE3" "GT3" ...
##  $ Parental_Status            : chr  "A" "T" "T" "T" ...
##  $ Mothers_Education          : int  4 1 1 4 3 4 2 4 3 3 ...
##  $ Fathers_Education          : int  4 1 1 2 3 3 2 4 2 4 ...
##  $ Mothers_Job                : chr  "at_home" "at_home" "at_home" "health" ...
##  $ Fathers_Job                : chr  "teacher" "other" "other" "services" ...
##  $ Enrollment_Reason          : chr  "course" "course" "other" "home" ...
##  $ Guardian                   : chr  "mother" "father" "mother" "mother" ...
##  $ Travel_Time                : int  2 1 1 1 1 1 1 2 1 1 ...
##  $ Study_Time                 : int  2 2 2 3 2 2 2 2 2 2 ...
##  $ Class_Repeat               : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ Extra_Edu_Support          : chr  "yes" "no" "yes" "no" ...
##  $ Family_Edu_Support         : chr  "no" "yes" "no" "yes" ...
##  $ Extra_paid_classes         : chr  "no" "no" "yes" "yes" ...
##  $ Extra_Curricular_Activities: chr  "no" "no" "no" "yes" ...
##  $ Nursery_Schooling          : chr  "yes" "no" "yes" "yes" ...
##  $ Higher_Education           : chr  "yes" "yes" "yes" "yes" ...
##  $ Home_Internet              : chr  "no" "yes" "yes" "yes" ...
##  $ Romantic_Relationship      : chr  "no" "no" "no" "yes" ...
##  $ Family_Relationship        : int  4 5 4 3 4 5 4 4 4 5 ...
##  $ Free_Time_Post_School      : int  3 3 3 2 3 4 4 1 2 5 ...
##  $ Socialising                : int  4 3 2 2 2 2 4 4 2 1 ...
##  $ Daily_Alcohol_Consumption  : int  1 1 2 1 1 1 1 1 1 1 ...
##  $ Weekend_Alcohol_Consumption: int  1 1 3 1 2 2 1 1 1 1 ...
##  $ Health_Status              : int  3 3 3 5 5 5 3 1 1 5 ...
##  $ School_Absences            : int  6 4 10 2 4 10 0 6 0 0 ...
##  $ Sem1_Grade                 : int  5 5 7 15 6 15 12 6 16 14 ...
##  $ Sem2_Grade                 : int  6 5 8 14 10 15 12 5 18 15 ...
##  $ Final_Grade                : int  6 6 10 15 10 15 11 6 19 15 ...
##  $ Pass_Fail                  : chr  "Fail" "Fail" "Pass" "Pass" ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 33
##   .. ..$ school    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ sex       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ age       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ address   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ famsize   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Pstatus   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Medu      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Fedu      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Mjob      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Fjob      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ reason    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ guardian  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ traveltime: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ studytime : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ failures  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ schoolsup : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ famsup    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ paid      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ activities: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ nursery   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ higher    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ internet  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ romantic  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ famrel    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ freetime  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ goout     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Dalc      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Walc      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ health    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ absences  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ G1        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ G2        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ G3        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

Feature Summary before Data Preprocessing

# Build a per-column summary of the table and render it as a captioned table.
# NOTE(review): summarizeColumns() is not defined in this file - presumably it
# comes from the mlr package; confirm it is attached before this line.
knitr::kable(
  summarizeColumns(total),
  caption = "Feature Summary before Data Preprocessing"
)
Feature Summary before Data Preprocessing
name type na mean disp median mad min max nlevs
school character 0 NA 0.2605364 NA NA 272 772 2
sex character 0 NA 0.4339080 NA NA 453 591 2
age integer 0 16.7260536 1.2399747 17 1.4826 15 22 0
address character 0 NA 0.2729885 NA NA 285 759 2
Family_Size character 0 NA 0.2931034 NA NA 306 738 2
Parental_Status character 0 NA 0.1159004 NA NA 121 923 2
Mothers_Education integer 0 2.6034483 1.1249066 3 1.4826 0 4 0
Fathers_Education integer 0 2.3879310 1.0999381 2 1.4826 0 4 0
Mothers_Job character 0 NA 0.6178161 NA NA 82 399 5
Fathers_Job character 0 NA 0.4406130 NA NA 41 584 5
Enrollment_Reason character 0 NA 0.5881226 NA NA 108 430 4
Guardian character 0 NA 0.3026820 NA NA 73 728 3
Travel_Time integer 0 1.5229885 0.7317274 1 0.0000 1 4 0
Study_Time integer 0 1.9703065 0.8343532 2 1.4826 1 4 0
Class_Repeat integer 0 0.2643678 0.6561418 0 0.0000 0 3 0
Extra_Edu_Support character 0 NA 0.1139847 NA NA 119 925 2
Family_Edu_Support character 0 NA 0.3869732 NA NA 404 640 2
Extra_paid_classes character 0 NA 0.2107280 NA NA 220 824 2
Extra_Curricular_Activities character 0 NA 0.4942529 NA NA 516 528 2
Nursery_Schooling character 0 NA 0.2001916 NA NA 209 835 2
Higher_Education character 0 NA 0.0852490 NA NA 89 955 2
Home_Internet character 0 NA 0.2078544 NA NA 217 827 2
Romantic_Relationship character 0 NA 0.3553640 NA NA 371 673 2
Family_Relationship integer 0 3.9358238 0.9334008 4 1.4826 1 5 0
Free_Time_Post_School integer 0 3.2011494 1.0315068 3 1.4826 1 5 0
Socialising integer 0 3.1561303 1.1525747 3 1.4826 1 5 0
Daily_Alcohol_Consumption integer 0 1.4942529 0.9117143 1 0.0000 1 5 0
Weekend_Alcohol_Consumption integer 0 2.2844828 1.2851048 2 1.4826 1 5 0
Health_Status integer 0 3.5431034 1.4247034 4 1.4826 1 5 0
School_Absences integer 0 4.4348659 6.2100166 2 2.9652 0 75 0
Sem1_Grade integer 0 11.2136015 2.9833939 11 2.9652 0 19 0
Sem2_Grade integer 0 11.2461686 3.2850711 11 2.9652 0 19 0
Final_Grade integer 0 11.3419540 3.8647958 11 2.9652 0 20 0
Pass_Fail character 0 NA 0.2203065 NA NA 230 814 2

Excess whitespace characters removed

# Strip leading and trailing whitespace from every character column.
# The column mask is computed once with vapply(), which always returns a
# logical vector. lapply() keeps the result as a list of columns, so each
# column is assigned back as its own character vector; sapply() would instead
# collapse them into a single character matrix (or a plain vector when the
# table has one row).
is_chr_col <- vapply(total, is.character, logical(1))
total[is_chr_col] <- lapply(total[is_chr_col], trimws)

Number of grades and their frequencies

Data Exploration

# Bar chart of the number of students per school. Built with ggplot() and
# geom_bar() rather than the deprecated qplot() shortcut; the chart is the same.
q01 <- ggplot(total, aes(x = school)) +
  geom_bar()

# Bar chart of Pass/Fail counts for all students.
p19 <- ggplot(total, aes(x = Pass_Fail)) +
  geom_bar(fill = "lavender") +
  labs(x = "Final Grade")

# The same Pass/Fail chart split into one panel per school.
p18 <- p19 +
  facet_grid(~school) +
  ggtitle("Barplot of School by \n Final Grade")

# Show both charts together with automatic panel labels.
plot_grid(q01, p18, labels = "AUTO")

Study Time: Studying 2-5 hours per week seems to be enough for a student to pass the final exam.

Family Educational Support: Most students had educational support from their family. We can see that students with family support tend to do well. Extra Paid Classes: Not many students opted for extra paid classes. The majority of students seem to have done well without extra paid classes. Extra Curricular Activities: It seems that taking part in extra-curricular activities didn't affect student grades. Almost half of the students did not show interest in getting involved in after-school activities. Romantic Relationship: We can see that students who were romantically involved had a low pass rate.

## $title
## [1] "Barplot of \n Socialising by \n Final Grade"
## 
## $subtitle
## NULL
## 
## attr(,"class")
## [1] "labels"

Socialising: Most students who socialised passed their exams; socialising did not affect the grades of most of them. However, students who socialised a lot also tended to fail their final exam.

# Remove the three grade columns, leaving Pass_Fail as the only outcome
# column. Dropping by name rather than by position (31:33) stays correct if
# the column order changes and is safe to run more than once.
grade_cols <- c("Sem1_Grade", "Sem2_Grade", "Final_Grade")
total <- total[setdiff(names(total), grade_cols)]

# Keep a copy taken before the factor conversion below.
data <- total

# Convert every character column to a factor.
total <- total %>% mutate_if(is.character, as.factor)

Summary

The attributes were examined against the Pass/Fail target feature to understand the reasons why students pass or fail. The data exploration showed that school, age, sex, travel time, address, etc. had an impact on students' grades.