This analysis focuses on understanding how different attributes of a student affect their school attendance. The data set includes 30 attributes of 649 students from Portugal in 2008, but the experiment focuses on gender, involvement in extracurricular activities, alcohol consumption, and travel time to school. The effect of these four independent variables on number of absences is analyzed in this study. Levels of factors as used for analysis are shown as follows:
The raw data are organized into rows, with each one representing a student. There are five columns: one for each of the variables indicated above. For the purpose of this experiment, only the effects of the four factors on number of absences will be analyzed, even though the complete data set includes additional attributes. The original data set also included students from a Mathematics course and a Portuguese language course and then combined the two by students that were in both courses. However, only data from the Portuguese language course was used for this experiment because it had more data points.
First, we load the rsm package to be used, and then read in the data.
#Load rsm package
library(rsm)
## Warning: package 'rsm' was built under R version 3.2.5
#Read in data- students in Portuguese language course
#The file is semicolon-separated with a header row.
#stringsAsFactors = TRUE keeps text columns as factors on every R version
#(R >= 4.0 no longer converts them by default); the summary() and str()
#output shown in this report lists those columns as factors.
student0 <- read.table("student-por.csv", sep = ";", header = TRUE, stringsAsFactors = TRUE)
The first 10 data points are shown below.
#Print the first ten rows of the raw data
head(x = student0, n = 10L)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob
## 1 GP F 18 U GT3 A 4 4 at_home teacher
## 2 GP F 17 U GT3 T 1 1 at_home other
## 3 GP F 15 U LE3 T 1 1 at_home other
## 4 GP F 15 U GT3 T 4 2 health services
## 5 GP F 16 U GT3 T 3 3 other other
## 6 GP M 16 U LE3 T 4 3 services other
## 7 GP M 16 U LE3 T 2 2 other other
## 8 GP F 17 U GT3 A 4 4 other teacher
## 9 GP M 15 U LE3 A 3 2 services other
## 10 GP M 15 U GT3 T 3 4 other other
## reason guardian traveltime studytime failures schoolsup famsup paid
## 1 course mother 2 2 0 yes no no
## 2 course father 1 2 0 no yes no
## 3 other mother 1 2 0 yes no no
## 4 home mother 1 3 0 no yes no
## 5 home father 1 2 0 no yes no
## 6 reputation mother 1 2 0 no yes no
## 7 home mother 1 2 0 no no no
## 8 home mother 2 2 0 yes yes no
## 9 home mother 1 2 0 no yes no
## 10 home mother 1 2 0 no yes no
## activities nursery higher internet romantic famrel freetime goout Dalc
## 1 no yes yes no no 4 3 4 1
## 2 no no yes yes no 5 3 3 1
## 3 no yes yes yes no 4 3 2 2
## 4 yes yes yes yes yes 3 2 2 1
## 5 no yes yes no no 4 3 2 1
## 6 yes yes yes yes no 5 4 2 1
## 7 no yes yes yes no 4 4 4 1
## 8 no yes yes no no 4 1 4 1
## 9 no yes yes yes no 4 2 2 1
## 10 yes yes yes yes no 5 5 1 1
## Walc health absences G1 G2 G3
## 1 1 3 4 0 11 11
## 2 1 3 2 9 11 11
## 3 3 3 6 12 13 12
## 4 1 5 0 14 14 14
## 5 2 5 0 11 13 13
## 6 2 5 6 12 12 13
## 7 1 3 0 13 12 13
## 8 1 1 2 10 13 13
## 9 1 1 0 15 16 17
## 10 1 5 0 12 12 13
The following code shows the structure as well as a summary of the raw data.
#Per-column summary of the raw data (its structure is printed afterwards)
summary(object = student0)
## school sex age address famsize Pstatus
## GP:423 F:383 Min. :15.00 R:197 GT3:457 A: 80
## MS:226 M:266 1st Qu.:16.00 U:452 LE3:192 T:569
## Median :17.00
## Mean :16.74
## 3rd Qu.:18.00
## Max. :22.00
## Medu Fedu Mjob Fjob
## Min. :0.000 Min. :0.000 at_home :135 at_home : 42
## 1st Qu.:2.000 1st Qu.:1.000 health : 48 health : 23
## Median :2.000 Median :2.000 other :258 other :367
## Mean :2.515 Mean :2.307 services:136 services:181
## 3rd Qu.:4.000 3rd Qu.:3.000 teacher : 72 teacher : 36
## Max. :4.000 Max. :4.000
## reason guardian traveltime studytime
## course :285 father:153 Min. :1.000 Min. :1.000
## home :149 mother:455 1st Qu.:1.000 1st Qu.:1.000
## other : 72 other : 41 Median :1.000 Median :2.000
## reputation:143 Mean :1.569 Mean :1.931
## 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :4.000 Max. :4.000
## failures schoolsup famsup paid activities nursery
## Min. :0.0000 no :581 no :251 no :610 no :334 no :128
## 1st Qu.:0.0000 yes: 68 yes:398 yes: 39 yes:315 yes:521
## Median :0.0000
## Mean :0.2219
## 3rd Qu.:0.0000
## Max. :3.0000
## higher internet romantic famrel freetime
## no : 69 no :151 no :410 Min. :1.000 Min. :1.00
## yes:580 yes:498 yes:239 1st Qu.:4.000 1st Qu.:3.00
## Median :4.000 Median :3.00
## Mean :3.931 Mean :3.18
## 3rd Qu.:5.000 3rd Qu.:4.00
## Max. :5.000 Max. :5.00
## goout Dalc Walc health
## Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:2.000
## Median :3.000 Median :1.000 Median :2.00 Median :4.000
## Mean :3.185 Mean :1.502 Mean :2.28 Mean :3.536
## 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:3.00 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000
## absences G1 G2 G3
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.:10.0 1st Qu.:10.00 1st Qu.:10.00
## Median : 2.000 Median :11.0 Median :11.00 Median :12.00
## Mean : 3.659 Mean :11.4 Mean :11.57 Mean :11.91
## 3rd Qu.: 6.000 3rd Qu.:13.0 3rd Qu.:13.00 3rd Qu.:14.00
## Max. :32.000 Max. :19.0 Max. :19.00 Max. :19.00
#Compact structure (column types and leading values) of the raw data
str(object = student0)
## 'data.frame': 649 obs. of 33 variables:
## $ school : Factor w/ 2 levels "GP","MS": 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ address : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 2 2 ...
## $ famsize : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
## $ Pstatus : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
## $ Medu : int 4 1 1 4 3 4 2 4 3 3 ...
## $ Fedu : int 4 1 1 2 3 3 2 4 2 4 ...
## $ Mjob : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
## $ Fjob : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
## $ reason : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
## $ guardian : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
## $ traveltime: int 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : int 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : int 0 0 0 0 0 0 0 0 0 0 ...
## $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
## $ famsup : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
## $ paid : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
## $ nursery : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
## $ higher : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ internet : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
## $ romantic : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ famrel : int 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : int 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : int 4 3 2 2 2 2 4 4 2 1 ...
## $ Dalc : int 1 1 2 1 1 1 1 1 1 1 ...
## $ Walc : int 1 1 3 1 2 2 1 1 1 1 ...
## $ health : int 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 4 2 6 0 0 6 0 2 0 0 ...
## $ G1 : int 0 9 12 14 11 12 13 10 15 12 ...
## $ G2 : int 11 11 13 14 13 12 12 13 16 12 ...
## $ G3 : int 11 11 12 14 13 13 13 13 17 13 ...
A histogram of the response variable is shown below. This histogram suggests that the data are heavily right skewed.
#Show a histogram of the response variable (number of absences per student)
hist(student0$absences, main = "Student Absences")
This experimental design will use response surface methodology (RSM) to analyze the effects of gender, involvement in extracurriculars, travel time to school, and alcohol consumption on school attendance. The two 3-level factors will be converted to 2-level factors in order to implement this experimental design. This design is used because it is suspected that the independent variables have an effect on the response of number of days absent. Using RSM, it is possible to analyze at what factor levels this response variable hits a maximum and minimum value.
The null hypothesis for this experiment is that gender, involvement in extracurriculars, travel time to school, and alcohol consumption have no effect on school attendance.
There are some repeated measures in the raw data because some of the students have the same attributes. While no information was provided indicating how the data were collected, the data set does not seem to show any specific sampling. For this reason, we will assume the data were randomly collected for the purpose of this experiment.
First, we convert the 2-level factors into binary variables. Additionally, alcohol consumption is converted from a 5-level factor to a 3-level factor and travel time is converted from a 4-level factor to a 3-level factor.
#Recode the four independent variables as numeric codes on student0.
#Each recode is a single vectorized ifelse() over the whole column instead of
#an element-by-element loop, so it also works on an empty data frame and does
#not grow the new column one cell at a time.

#Convert Walc (weekend alcohol consumption) to a 3-level factor:
#1 -> 0, 2 -> 1, anything else -> 2
student0$alcohol <- ifelse(student0$Walc == 1, 0, ifelse(student0$Walc == 2, 1, 2))

#Convert traveltime (travel time to school) to a 3-level factor:
#1 -> 0, 2 -> 1, anything else -> 2
student0$travel <- ifelse(student0$traveltime == 1, 0, ifelse(student0$traveltime == 2, 1, 2))

#Convert gender to a 2-level factor: "M" -> 0, otherwise -> 1
student0$gender <- ifelse(student0$sex == "M", 0, 1)

#Convert extracurricular involvement to a 2-level factor: "no" -> 0, otherwise -> 1
student0$extra <- ifelse(student0$activities == "no", 0, 1)
Next, we remove the unimportant variables from the data.
#Remove unimportant variables from data
#Keep the response and the four recoded factors by name rather than dropping
#columns by position, so the selection does not silently change if the input
#file gains, loses or reorders columns.
student <- student0[c("absences", "alcohol", "travel", "gender", "extra")]
A summary and the structure of the new data is shown. These are the variables that are to be used for the analysis.
#Per-column summary of the analysis data (its structure is printed afterwards)
summary(object = student)
## absences alcohol travel gender
## Min. : 0.000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 2.000 Median :1.000 Median :0.0000 Median :1.0000
## Mean : 3.659 Mean :1.008 Mean :0.5439 Mean :0.5901
## 3rd Qu.: 6.000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :32.000 Max. :2.000 Max. :2.0000 Max. :1.0000
## extra
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4854
## 3rd Qu.:1.0000
## Max. :1.0000
#Compact structure (column types and leading values) of the analysis data
str(object = student)
## 'data.frame': 649 obs. of 5 variables:
## $ absences: int 4 2 6 0 0 6 0 2 0 0 ...
## $ alcohol : num 0 0 2 0 1 1 0 0 0 0 ...
## $ travel : num 1 0 0 0 0 0 0 1 0 0 ...
## $ gender : num 1 1 1 1 1 0 0 1 0 0 ...
## $ extra : num 0 0 0 1 0 1 0 0 0 1 ...
A boxplot for all of the IVs and absences is shown below.
#Boxplot of absences for every combination of the four factors
boxplot(
  absences ~ gender + extra + alcohol + travel,
  data = student,
  main = "Boxplot for 4 Factors and Absences"
)
Individual boxplots are shown below for each of the independent variables. From the following plots, it doesn’t seem like there is a significant difference in most of the means for each level of each factor. However, for alcohol consumption, it appears like there could be a slightly higher number of absences for more alcohol consumption.
#Draw one boxplot of absences per independent variable.
#Names are the columns of student; values are the plot titles.
panel_titles <- c(
  gender = "Gender",
  extra = "Extracurricular Activities",
  alcohol = "Alcohol Consumption",
  travel = "Travel Time"
)
for (iv in names(panel_titles)) {
  #reformulate() builds the formula absences ~ <iv> for the current factor
  boxplot(reformulate(iv, response = "absences"), data = student, main = panel_titles[[iv]])
}
The two 3-level factors are each converted into two 2-level factors.
#Represent each 3-level factor as two 0/1 columns (A, B):
#  level 0 -> A = 0, B = 0
#  level 1 -> A = 1, B = 0
#  otherwise -> A = 1, B = 1
#Vectorized over the columns of student itself; the previous loops took their
#bounds from student0 while indexing student.

#Represent 3-level factor as 2 2-level factors (alcohol)
student$alcoholA <- ifelse(student$alcohol == 0, 0, 1)
student$alcoholB <- ifelse(student$alcohol == 0 | student$alcohol == 1, 0, 1)

#Represent 3-level factor as 2 2-level factors (travel)
student$travelA <- ifelse(student$travel == 0, 0, 1)
student$travelB <- ifelse(student$travel == 0 | student$travel == 1, 0, 1)
First, we create the RSM model.
#RSM model
#Fits absences against a full second-order (SO) surface in the six 0/1 columns:
#first-order (FO), two-way interaction (TWI) and pure quadratic (PQ) terms.
#NOTE: every predictor here is coded 0/1, so a squared term is identical to
#its linear term; the PQ coefficients (and the alcoholA:alcoholB and
#travelA:travelB interactions) come back as NA and rsm() warns that it is
#returning a plain 'lm' object.
student.rsm <- rsm(absences~SO(gender,extra,alcoholA, alcoholB, travelA, travelB), data=student)
## Warning in rsm(absences ~ SO(gender, extra, alcoholA, alcoholB, travelA, : Some coefficients are aliased - cannot use 'rsm' methods.
## Returning an 'lm' object.
#Coefficient table and overall fit statistics for the fitted model
summary(object = student.rsm)
##
## Call:
## rsm(formula = absences ~ SO(gender, extra, alcoholA, alcoholB,
## travelA, travelB), data = student)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.626 -3.354 -1.354 1.759 28.349
##
## Coefficients: (8 not defined because of singularities)
## Estimate
## (Intercept) 2.46619
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)gender 0.88762
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)extra -0.32966
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA 1.09746
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB 1.16222
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA 0.10847
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB -0.18590
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:extra -0.71076
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholA -0.51487
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholB -0.10120
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelA 0.19084
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelB -1.30751
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholA 1.42365
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholB -1.73013
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelA 0.69196
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelB 1.29552
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:alcoholB NA
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelA -1.31223
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelB 0.20509
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelA 0.66339
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelB 0.06966
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA:travelB NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)gender^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)extra^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB^2 NA
## Std. Error
## (Intercept) 0.84784
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)gender 0.87294
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)extra 0.88489
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA 1.05288
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB 0.99031
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA 1.01406
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB 1.68013
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:extra 0.78676
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholA 1.04820
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholB 1.01358
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelA 0.85943
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelB 1.50768
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholA 0.98191
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholB 1.00079
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelA 0.81384
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelB 1.33402
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:alcoholB NA
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelA 1.06932
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelB 1.73406
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelA 1.09553
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelB 1.86183
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA:travelB NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)gender^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)extra^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB^2 NA
## t value
## (Intercept) 2.909
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)gender 1.017
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)extra -0.373
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA 1.042
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB 1.174
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA 0.107
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB -0.111
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:extra -0.903
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholA -0.491
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholB -0.100
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelA 0.222
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelB -0.867
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholA 1.450
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholB -1.729
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelA 0.850
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelB 0.971
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:alcoholB NA
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelA -1.227
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelB 0.118
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelA 0.606
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelB 0.037
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA:travelB NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)gender^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)extra^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB^2 NA
## Pr(>|t|)
## (Intercept) 0.00376
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)gender 0.30963
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)extra 0.70962
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA 0.29765
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB 0.24100
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA 0.91485
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB 0.91193
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:extra 0.36666
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholA 0.62346
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholB 0.92050
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelA 0.82434
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelB 0.38614
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholA 0.14759
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholB 0.08434
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelA 0.39552
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelB 0.33185
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:alcoholB NA
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelA 0.22022
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelB 0.90589
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelA 0.54504
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelB 0.97017
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA:travelB NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)gender^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)extra^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA^2 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB^2 NA
##
## (Intercept) **
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)gender
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)extra
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:extra
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholA
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholB
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelA
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelB
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholA
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholB .
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelA
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelB
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:alcoholB
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelA
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelB
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelA
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelB
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA:travelB
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)gender^2
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)extra^2
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA^2
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB^2
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA^2
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB^2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.622 on 629 degrees of freedom
## Multiple R-squared: 0.03719, Adjusted R-squared: 0.00811
## F-statistic: 1.279 on 19 and 629 DF, p-value: 0.1901
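From the output, none of the first-order or two-way interaction terms are highly significant. The purely quadratic terms, along with the alcoholA:alcoholB and travelA:travelB interactions, could not be estimated at all (their coefficients are reported as NA) because they are aliased with other terms in the model. The only coefficient that may be statistically significant is the interaction between extracurricular involvement and alcohol consumption (extra:alcoholB, p = 0.084), which is significant at the 0.1 level.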
Next, the contour plots for this model are shown.
#Show contour plots
#One panel per pair of the six predictors, laid out on a 2 x 3 grid.
par(mfrow=c(2,3))
#Slice the surface at the stationary point from the canonical analysis.
#The parenthesis belongs after the model: summary(student.rsm)$canonical$xs.
#When rsm() has fallen back to an 'lm' object there is no canonical analysis
#(the value is NULL), so 'at' is left out and the default slices are used.
stationary_point <- summary(student.rsm)$canonical$xs
if (is.null(stationary_point)) {
  contour(student.rsm, ~gender + extra + alcoholA + alcoholB + travelA + travelB, image=TRUE)
} else {
  contour(student.rsm, ~gender + extra + alcoholA + alcoholB + travelA + travelB, image=TRUE, at=stationary_point)
}
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
When looking at the contour plot for extracurriculars and alcoholB, it appears that high alcohol consumption and low extracurricular involvement result in more days absent. This could make sense since low extracurricular involvement and high alcohol consumption would intuitively result in lower school attendance. However, the fact that this significance shows up only in the interaction between the variables and not for the main effect of either is interesting. This suggests that alcohol consumption and extracurriculars alone have no effect on absences, but combined may influence school attendance.
The perspective plot for the alcoholB and extra 2fi could also allow insight into the effect on number of absences.
#Perspective plot for 2fi- extracurriculars and alcohol
par(mfrow=c(1,1))
#'image' is not an argument of persp(); passing it only produced
#'"image" is not a graphical parameter' warnings, so it is dropped.
#The Block="B2" entry is dropped as well: the model has no Block variable.
#Slice at the stationary point when a canonical analysis exists; for an 'lm'
#fallback the value is NULL and the default slices are used instead.
stationary_point <- summary(student.rsm)$canonical$xs
if (is.null(stationary_point)) {
  persp(student.rsm, ~ extra + alcoholB, contour="colors", zlab="Days Absent", theta=30)
} else {
  persp(student.rsm, ~ extra + alcoholB, at=stationary_point, contour="colors", zlab="Days Absent", theta=30)
}
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in persp.default(dat$x, dat$y, dat$z, zlim = dat$zlim, theta =
## theta, : "image" is not a graphical parameter
## Warning in persp.default(dat$x, dat$y, dat$z, xlab = dat$labs[1], ylab =
## dat$labs[2], : "image" is not a graphical parameter
## Warning in title(sub = dat$labs[5], ...): "image" is not a graphical
## parameter
This plot confirms what is seen in the contour plot. However, it is interesting to note that low alcohol consumption is expected to result in fewer absences than high alcohol consumption with high extracurricular involvement.
A Shapiro-Wilk test for Normality allows insight into whether the response variable population is normally distributed.
#Shapiro-Wilk test of normality, applied to the raw absences column
#NOTE(review): this tests the response itself rather than the model
#residuals - confirm that is the intended check
shapiro.test(student$absences)
##
## Shapiro-Wilk normality test
##
## data: student$absences
## W = 0.77174, p-value < 2.2e-16
Because of the statistically significant p-value, we reject the null hypothesis that the sample comes from a population that is normally distributed.
QQ plots also help to understand the strength of the model.
#Normal Q-Q plot of the model residuals with a reference line
model_resid <- residuals(student.rsm)
qqnorm(model_resid, main = "Normal Q-Q Plot")
qqline(model_resid)
These plots suggest that the residuals are not normally distributed, and therefore the model may not be accurate.
This analysis provides no strong evidence that alcohol consumption, extracurricular involvement, gender, or travel time to class have an effect on school attendance. While it was suggested that the interaction of low extracurricular involvement with high alcohol consumption may result in more absences, that interaction is significant only at the 0.1 level, the overall model is not significant (F-test p-value = 0.19), and the residuals were not found to be normally distributed, so the model itself may not be reliable. Therefore, we fail to reject the null hypothesis that the independent variables have no effect on number of absences.
P. Cortez and A. Silva. Using Data Mining to Predict Secondary School Student Performance. In A. Brito and J. Teixeira Eds., Proceedings of 5th FUture BUsiness TEChnology Conference (FUBUTEC 2008) pp. 5-12, Porto, Portugal, April, 2008, EUROSIS, ISBN 978-9077381-39-7.
Complete R code
#Setup
#The workspace is no longer wiped with rm(list = ls()): every object this
#script uses is (re)created below, and clearing would delete unrelated objects
#of whoever sources the script.
#Load rsm package
library(rsm)
#Read in data- students in Portuguese language course
#The file name is relative to the working directory, as in the report body,
#instead of an absolute path that only exists on one machine.
#stringsAsFactors = TRUE keeps text columns as factors on every R version
#(R >= 4.0 no longer converts them by default).
student0 <- read.table("student-por.csv", sep = ";", header = TRUE, stringsAsFactors = TRUE)
#Recode the four independent variables as numeric codes on student0.
#Each recode is a single vectorized ifelse() over the whole column instead of
#an element-by-element loop, so it also works on an empty data frame and does
#not grow the new column one cell at a time.

#Convert Walc (weekend alcohol consumption) to a 3-level factor:
#1 -> 0, 2 -> 1, anything else -> 2
student0$alcohol <- ifelse(student0$Walc == 1, 0, ifelse(student0$Walc == 2, 1, 2))

#Convert traveltime (travel time to school) to a 3-level factor:
#1 -> 0, 2 -> 1, anything else -> 2
student0$travel <- ifelse(student0$traveltime == 1, 0, ifelse(student0$traveltime == 2, 1, 2))

#Convert gender to a 2-level factor: "M" -> 0, otherwise -> 1
student0$gender <- ifelse(student0$sex == "M", 0, 1)

#Convert extracurricular involvement to a 2-level factor: "no" -> 0, otherwise -> 1
student0$extra <- ifelse(student0$activities == "no", 0, 1)
#Remove unimportant variables from data
#Keep the response and the four recoded factors by name rather than dropping
#columns by position, so the selection does not silently change if the input
#file gains, loses or reorders columns.
student <- student0[c("absences", "alcohol", "travel", "gender", "extra")]
#Represent each 3-level factor as two 0/1 columns (A, B):
#  level 0 -> A = 0, B = 0
#  level 1 -> A = 1, B = 0
#  otherwise -> A = 1, B = 1
#Vectorized over the columns of student itself; the previous loops took their
#bounds from student0 while indexing student.

#Represent 3-level factor as 2 2-level factors (alcohol)
student$alcoholA <- ifelse(student$alcohol == 0, 0, 1)
student$alcoholB <- ifelse(student$alcohol == 0 | student$alcohol == 1, 0, 1)

#Represent 3-level factor as 2 2-level factors (travel)
student$travelA <- ifelse(student$travel == 0, 0, 1)
student$travelB <- ifelse(student$travel == 0 | student$travel == 1, 0, 1)
################################################################################################################
################################################################################################################
#Project 4
#Show the first 10 rows
head(student, n=10)
#Show a summary and structure of the data
summary(student)
str(student)
#Show a histogram of the response variable
hist(student$absences, main = "Student Absences")
#RSM model
#Full second-order (SO) surface in the six 0/1 columns. Because the
#predictors are 0/1, each squared term equals its linear term, so rsm() may
#report aliased coefficients and return a plain 'lm' object.
student.rsm <- rsm(absences~SO(gender,extra,alcoholA, alcoholB, travelA, travelB), data=student)
summary(student.rsm)
#Stationary point from the canonical analysis, used to slice the plots below.
#The parenthesis belongs after the model: summary(student.rsm)$canonical$xs.
#For an 'lm' fallback there is no canonical analysis and this is NULL, in
#which case 'at' is left out and the default slices are used.
stationary_point <- summary(student.rsm)$canonical$xs
#Show contour plots
par(mfrow=c(2,3))
if (is.null(stationary_point)) {
  contour(student.rsm, ~gender + extra + alcoholA + alcoholB + travelA + travelB, image=TRUE)
} else {
  contour(student.rsm, ~gender + extra + alcoholA + alcoholB + travelA + travelB, image=TRUE, at=stationary_point)
}
#Perspective plot for 2fi- extracurriculars and alcohol
#'image' is not an argument of persp() (it only triggered graphical-parameter
#warnings) and the model has no Block variable, so both are dropped.
par(mfrow=c(1,1))
if (is.null(stationary_point)) {
  persp(student.rsm, ~ extra + alcoholB, contour="colors", zlab="Days Absent", theta=30)
} else {
  persp(student.rsm, ~ extra + alcoholB, at=stationary_point, contour="colors", zlab="Days Absent", theta=30)
}
#Shapiro-Wilk test
shapiro.test(student$absences)
#Normal Q-Q plot of the model residuals with a reference line
qqnorm(residuals(student.rsm), main = "Normal Q-Q Plot")
qqline(residuals(student.rsm))