Project to Analyse Student Performance Data Set from UC Irvine’s Machine Learning Repository!
We will explore the following steps:
- Get our Data
- Exploratory Data Analysis (EDA)
- Clean our Data
- Review of Model Form
- Train and Test Groups
- Linear Regression Model
ENVIRONMENT SETUP
# Load the required packages (if packages are not available, install them first)
for (package in c('caret','ggplot2','ggthemes','dplyr','corrplot','caTools')) {
if (!require(package, character.only=T, quietly=T)) {
install.packages(package)
library(package,character.only=T)
}
}
##Get the Data
# Read the csv file and save in an object called "linear_data"
student_data <- read.csv("C:/Users/nkhan/Desktop/R-pushtogit/student-mat.csv",sep=";")
#Peek into the data
head(student_data)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob
## 1 GP F 18 U GT3 A 4 4 at_home teacher
## 2 GP F 17 U GT3 T 1 1 at_home other
## 3 GP F 15 U LE3 T 1 1 at_home other
## 4 GP F 15 U GT3 T 4 2 health services
## 5 GP F 16 U GT3 T 3 3 other other
## 6 GP M 16 U LE3 T 4 3 services other
## reason guardian traveltime studytime failures schoolsup famsup paid
## 1 course mother 2 2 0 yes no no
## 2 course father 1 2 0 no yes no
## 3 other mother 1 2 3 yes no yes
## 4 home mother 1 3 0 no yes yes
## 5 home father 1 2 0 no yes yes
## 6 reputation mother 1 2 0 no yes yes
## activities nursery higher internet romantic famrel freetime goout Dalc
## 1 no yes yes no no 4 3 4 1
## 2 no no yes yes no 5 3 3 1
## 3 no yes yes yes no 4 3 2 2
## 4 yes yes yes yes yes 3 2 2 1
## 5 no yes yes no no 4 3 2 1
## 6 yes yes yes yes no 5 4 2 1
## Walc health absences G1 G2 G3
## 1 1 3 6 5 6 6
## 2 1 3 4 5 5 6
## 3 3 3 10 7 8 10
## 4 1 5 2 15 14 15
## 5 2 5 4 6 10 10
## 6 2 5 10 15 15 15
# # List types of each attributeCheck for the dimension of the data
dim(student_data)
## [1] 395 33
# Summarize the data
summary(student_data)
## school sex age address famsize Pstatus Medu
## GP:349 F:208 Min. :15.0 R: 88 GT3:281 A: 41 Min. :0.000
## MS: 46 M:187 1st Qu.:16.0 U:307 LE3:114 T:354 1st Qu.:2.000
## Median :17.0 Median :3.000
## Mean :16.7 Mean :2.749
## 3rd Qu.:18.0 3rd Qu.:4.000
## Max. :22.0 Max. :4.000
## Fedu Mjob Fjob reason
## Min. :0.000 at_home : 59 at_home : 20 course :145
## 1st Qu.:2.000 health : 34 health : 18 home :109
## Median :2.000 other :141 other :217 other : 36
## Mean :2.522 services:103 services:111 reputation:105
## 3rd Qu.:3.000 teacher : 58 teacher : 29
## Max. :4.000
## guardian traveltime studytime failures schoolsup
## father: 90 Min. :1.000 Min. :1.000 Min. :0.0000 no :344
## mother:273 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000 yes: 51
## other : 32 Median :1.000 Median :2.000 Median :0.0000
## Mean :1.448 Mean :2.035 Mean :0.3342
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :4.000 Max. :4.000 Max. :3.0000
## famsup paid activities nursery higher internet romantic
## no :153 no :214 no :194 no : 81 no : 20 no : 66 no :263
## yes:242 yes:181 yes:201 yes:314 yes:375 yes:329 yes:132
##
##
##
##
## famrel freetime goout Dalc
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.000
## Median :4.000 Median :3.000 Median :3.000 Median :1.000
## Mean :3.944 Mean :3.235 Mean :3.109 Mean :1.481
## 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Walc health absences G1
## Min. :1.000 Min. :1.000 Min. : 0.000 Min. : 3.00
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.: 0.000 1st Qu.: 8.00
## Median :2.000 Median :4.000 Median : 4.000 Median :11.00
## Mean :2.291 Mean :3.554 Mean : 5.709 Mean :10.91
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.: 8.000 3rd Qu.:13.00
## Max. :5.000 Max. :5.000 Max. :75.000 Max. :19.00
## G2 G3
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 9.00 1st Qu.: 8.00
## Median :11.00 Median :11.00
## Mean :10.71 Mean :10.42
## 3rd Qu.:13.00 3rd Qu.:14.00
## Max. :19.00 Max. :20.00
CLEANING THE STUDENT_DATA
#Check for any NA values
any(is.na(student_data))
## [1] FALSE
# We do not see any NA values in the Student_data dataframe...
# Check for the type of attributes in the dataframe
sapply(student_data, class)
## school sex age address famsize Pstatus
## "factor" "factor" "integer" "factor" "factor" "factor"
## Medu Fedu Mjob Fjob reason guardian
## "integer" "integer" "factor" "factor" "factor" "factor"
## traveltime studytime failures schoolsup famsup paid
## "integer" "integer" "integer" "factor" "factor" "factor"
## activities nursery higher internet romantic famrel
## "factor" "factor" "factor" "factor" "factor" "integer"
## freetime goout Dalc Walc health absences
## "integer" "integer" "integer" "integer" "integer" "integer"
## G1 G2 G3
## "integer" "integer" "integer"
# we can see here at the output we have a bunch of "Integer's" and "Factors" in the dataframe
# Look into the structure of the data
str(student_data)
## 'data.frame': 395 obs. of 33 variables:
## $ school : Factor w/ 2 levels "GP","MS": 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ address : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 2 2 ...
## $ famsize : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
## $ Pstatus : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
## $ Medu : int 4 1 1 4 3 4 2 4 3 3 ...
## $ Fedu : int 4 1 1 2 3 3 2 4 2 4 ...
## $ Mjob : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
## $ Fjob : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
## $ reason : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
## $ guardian : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
## $ traveltime: int 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : int 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : int 0 0 3 0 0 0 0 0 0 0 ...
## $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
## $ famsup : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
## $ paid : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
## $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
## $ nursery : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
## $ higher : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ internet : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
## $ romantic : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ famrel : int 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : int 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : int 4 3 2 2 2 2 4 4 2 1 ...
## $ Dalc : int 1 1 2 1 1 1 1 1 1 1 ...
## $ Walc : int 1 1 3 1 2 2 1 1 1 1 ...
## $ health : int 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 6 4 10 2 4 10 0 6 0 0 ...
## $ G1 : int 5 5 7 15 6 15 12 6 16 14 ...
## $ G2 : int 6 5 8 14 10 15 12 5 18 15 ...
## $ G3 : int 6 6 10 15 10 15 11 6 19 15 ...
# Check for the columns and see if the colummn values nees to be factor or integer
# Here we will keep the Data as is since it has been cleaned for using already
DATA VISUALIZATION
#Let us use ggplot2 to visualize the data and get more understanding.
library(ggplot2)
library(ggthemes)
library(dplyr)
# Create a plot to understnd corelation between features in the dataframe.
# Grab only the numeric columns
num.cols <- sapply(student_data, is.numeric)
# filter numeric columns for corelation
# Use cor function to grab the corerealtion between each pf numeric variable
cor.data <- cor(student_data[,num.cols])
print(cor.data)
## age Medu Fedu traveltime
## age 1.000000000 -0.163658419 -0.163438069 0.070640721
## Medu -0.163658419 1.000000000 0.623455112 -0.171639305
## Fedu -0.163438069 0.623455112 1.000000000 -0.158194054
## traveltime 0.070640721 -0.171639305 -0.158194054 1.000000000
## studytime -0.004140037 0.064944137 -0.009174639 -0.100909119
## failures 0.243665377 -0.236679963 -0.250408444 0.092238746
## famrel 0.053940096 -0.003914458 -0.001369727 -0.016807986
## freetime 0.016434389 0.030890867 -0.012845528 -0.017024944
## goout 0.126963880 0.064094438 0.043104668 0.028539674
## Dalc 0.131124605 0.019834099 0.002386429 0.138325309
## Walc 0.117276052 -0.047123460 -0.012631018 0.134115752
## health -0.062187369 -0.046877829 0.014741537 0.007500606
## absences 0.175230079 0.100284818 0.024472887 -0.012943775
## G1 -0.064081497 0.205340997 0.190269936 -0.093039992
## G2 -0.143474049 0.215527168 0.164893393 -0.153197963
## G3 -0.161579438 0.217147496 0.152456939 -0.117142053
## studytime failures famrel freetime goout
## age -0.004140037 0.24366538 0.053940096 0.01643439 0.126963880
## Medu 0.064944137 -0.23667996 -0.003914458 0.03089087 0.064094438
## Fedu -0.009174639 -0.25040844 -0.001369727 -0.01284553 0.043104668
## traveltime -0.100909119 0.09223875 -0.016807986 -0.01702494 0.028539674
## studytime 1.000000000 -0.17356303 0.039730704 -0.14319841 -0.063903675
## failures -0.173563031 1.00000000 -0.044336626 0.09198747 0.124560922
## famrel 0.039730704 -0.04433663 1.000000000 0.15070144 0.064568411
## freetime -0.143198407 0.09198747 0.150701444 1.00000000 0.285018715
## goout -0.063903675 0.12456092 0.064568411 0.28501871 1.000000000
## Dalc -0.196019263 0.13604693 -0.077594357 0.20900085 0.266993848
## Walc -0.253784731 0.14196203 -0.113397308 0.14782181 0.420385745
## health -0.075615863 0.06582728 0.094055728 0.07573336 -0.009577254
## absences -0.062700175 0.06372583 -0.044354095 -0.05807792 0.044302220
## G1 0.160611915 -0.35471761 0.022168316 0.01261293 -0.149103967
## G2 0.135879999 -0.35589563 -0.018281347 -0.01377714 -0.162250034
## G3 0.097819690 -0.36041494 0.051363429 0.01130724 -0.132791474
## Dalc Walc health absences G1
## age 0.131124605 0.11727605 -0.062187369 0.17523008 -0.06408150
## Medu 0.019834099 -0.04712346 -0.046877829 0.10028482 0.20534100
## Fedu 0.002386429 -0.01263102 0.014741537 0.02447289 0.19026994
## traveltime 0.138325309 0.13411575 0.007500606 -0.01294378 -0.09303999
## studytime -0.196019263 -0.25378473 -0.075615863 -0.06270018 0.16061192
## failures 0.136046931 0.14196203 0.065827282 0.06372583 -0.35471761
## famrel -0.077594357 -0.11339731 0.094055728 -0.04435409 0.02216832
## freetime 0.209000848 0.14782181 0.075733357 -0.05807792 0.01261293
## goout 0.266993848 0.42038575 -0.009577254 0.04430222 -0.14910397
## Dalc 1.000000000 0.64754423 0.077179582 0.11190803 -0.09415879
## Walc 0.647544230 1.00000000 0.092476317 0.13629110 -0.12617921
## health 0.077179582 0.09247632 1.000000000 -0.02993671 -0.07317207
## absences 0.111908026 0.13629110 -0.029936711 1.00000000 -0.03100290
## G1 -0.094158792 -0.12617921 -0.073172073 -0.03100290 1.00000000
## G2 -0.064120183 -0.08492735 -0.097719866 -0.03177670 0.85211807
## G3 -0.054660041 -0.05193932 -0.061334605 0.03424732 0.80146793
## G2 G3
## age -0.14347405 -0.16157944
## Medu 0.21552717 0.21714750
## Fedu 0.16489339 0.15245694
## traveltime -0.15319796 -0.11714205
## studytime 0.13588000 0.09781969
## failures -0.35589563 -0.36041494
## famrel -0.01828135 0.05136343
## freetime -0.01377714 0.01130724
## goout -0.16225003 -0.13279147
## Dalc -0.06412018 -0.05466004
## Walc -0.08492735 -0.05193932
## health -0.09771987 -0.06133460
## absences -0.03177670 0.03424732
## G1 0.85211807 0.80146793
## G2 1.00000000 0.90486799
## G3 0.90486799 1.00000000
Lets make a ggplot
library(ggplot2)
ggplot(student_data,aes(x=G3)) + geom_histogram(bins = 20, alpha = 0.5,fill = 'blue')

# We can see many value at mean value of 10. We need to see. why is this so?
Split Data into Train and Test set
library(caTools)
# caTools have sample.split function
# Set a Seed
set.seed(101)
# Split up sample
sample <- sample.split(student_data$G3,SplitRatio = 0.7)
# 70% of data is used as train
train <- subset(student_data,sample=TRUE)
# 30% of data is used as test
test <- subset(student_data,sample == FALSE)
BUILD THE LINEAR REGRESSION MODEL
# model <- lm(y~.,data) ---- ~. calls all the features of the datafarame.
# model <- lm(y ~ x1 + x2,data)---- ~x1+x2 calls just x1 and x2 features to be tested.
Student_model <- lm(G3~.,data =train)
# Run the Model
# Interprit the model
print(summary(Student_model))
##
## Call:
## lm(formula = G3 ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9339 -0.5532 0.2680 0.9689 4.6461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.115488 2.116958 -0.527 0.598573
## schoolMS 0.480742 0.366512 1.312 0.190485
## sexM 0.174396 0.233588 0.747 0.455805
## age -0.173302 0.100780 -1.720 0.086380 .
## addressU 0.104455 0.270791 0.386 0.699922
## famsizeLE3 0.036512 0.226680 0.161 0.872128
## PstatusT -0.127673 0.335626 -0.380 0.703875
## Medu 0.129685 0.149999 0.865 0.387859
## Fedu -0.133940 0.128768 -1.040 0.298974
## Mjobhealth -0.146426 0.518491 -0.282 0.777796
## Mjobother 0.074088 0.332044 0.223 0.823565
## Mjobservices 0.046956 0.369587 0.127 0.898973
## Mjobteacher -0.026276 0.481632 -0.055 0.956522
## Fjobhealth 0.330948 0.666601 0.496 0.619871
## Fjobother -0.083582 0.476796 -0.175 0.860945
## Fjobservices -0.322142 0.493265 -0.653 0.514130
## Fjobteacher -0.112364 0.601448 -0.187 0.851907
## reasonhome -0.209183 0.256392 -0.816 0.415123
## reasonother 0.307554 0.380214 0.809 0.419120
## reasonreputation 0.129106 0.267254 0.483 0.629335
## guardianmother 0.195741 0.252672 0.775 0.439046
## guardianother 0.006565 0.463650 0.014 0.988710
## traveltime 0.096994 0.157800 0.615 0.539170
## studytime -0.104754 0.134814 -0.777 0.437667
## failures -0.160539 0.161006 -0.997 0.319399
## schoolsupyes 0.456448 0.319538 1.428 0.154043
## famsupyes 0.176870 0.224204 0.789 0.430710
## paidyes 0.075764 0.222100 0.341 0.733211
## activitiesyes -0.346047 0.205938 -1.680 0.093774 .
## nurseryyes -0.222716 0.254184 -0.876 0.381518
## higheryes 0.225921 0.500398 0.451 0.651919
## internetyes -0.144462 0.287528 -0.502 0.615679
## romanticyes -0.272008 0.219732 -1.238 0.216572
## famrel 0.356876 0.114124 3.127 0.001912 **
## freetime 0.047002 0.110209 0.426 0.670021
## goout 0.012007 0.105230 0.114 0.909224
## Dalc -0.185019 0.153124 -1.208 0.227741
## Walc 0.176772 0.114943 1.538 0.124966
## health 0.062995 0.074800 0.842 0.400259
## absences 0.045879 0.013412 3.421 0.000698 ***
## G1 0.188847 0.062373 3.028 0.002645 **
## G2 0.957330 0.053460 17.907 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.901 on 353 degrees of freedom
## Multiple R-squared: 0.8458, Adjusted R-squared: 0.8279
## F-statistic: 47.21 on 41 and 353 DF, p-value: < 2.2e-16
# Look for the pvalue of the variable
# MORE STARS MEANS HIGHER SIGNIFICANE OF THE FEATURE
# LESS STARS MEANS LOWER SIGNIFICANCE OF THE FEATURE notbeing relevant.
#Plot the residules value of the model
res <- residuals(Student_model)
#Check for its class
class(res)
## [1] "numeric"
# convert it as data frame
res <- as.data.frame(res)
head(res)
## res
## 1 0.8592152
## 2 1.9703191
## 3 2.1082992
## 4 1.8708782
## 5 1.0337345
## 6 -1.4886707
# Plot the residules(We want the residules to look like Normal Distributions)
# Normally Distributed residule means Mean of actual and predicted value is close to zero
ggplot(res,aes(res)) + geom_histogram(fill = 'blue', alpha = 0.5)

PREDICT THE STUDENT_DATA MODEL
# Make an object called G3.predictions to store the prections of model and test data.
G3.predictions <- predict(Student_model,test)
# Bind the columns to the test data
results <- cbind(G3.predictions,test$G3)
# Assign colnames to the 'results' table
colnames(results) <- c('predcited','actual')
# convert the 'results' table into dataframe
results <- as.data.frame(results)
Take Care of negative prediction values.
to_zero <- function(x){
if (x < 0){
return(0)
} else {
return(x)
}
}
# Apply zero function
results$predcited <- sapply(results$predcited,to_zero)
# Mean Squared Error
mse <- mean((results$actual - results$predcited)^2)
print('MSE')
## [1] "MSE"
print(mse)
## [1] 2.583407
# RMSE
print('Square root of MSE')
## [1] "Square root of MSE"
print(mse^0.5)
## [1] 1.607298
# Sum of the sqaured error
SSE <- sum((results$predcited - results$actual)^2)
SST <- sum((mean(student_data$G3) - results$actual)^2)
R2 <- 1 - SSE/SST
print('R2')
## [1] "R2"
print(R2)
## [1] 0.8734388