1. Setting

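This analysis focuses on understanding how different attributes of a student affect their school attendance. The data set includes 30 attributes of 649 students from Portugal in 2008, but the experiment focuses on gender, involvement in extracurricular activities, alcohol consumption, and travel time to school. The effect of these four independent variables on the number of absences is analyzed in this study. The factor levels used for the analysis are as follows: gender (0 = male, 1 = female), extracurricular activities (0 = no, 1 = yes), weekend alcohol consumption (0, 1, 2, recoded from the original 1-5 scale as 1, 2, and 3-5), and travel time to school (0, 1, 2, recoded from the original 1-4 scale as 1, 2, and 3-4).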

The raw data are organized into rows, with each one representing a student. Five columns are used in this experiment: one for each of the variables indicated above. Only the effects of the four factors on number of absences will be analyzed, even though the complete data set includes additional attributes. The original data set also included students from a Mathematics course in addition to the Portuguese language course, and combined the two by matching students who were in both courses. However, only data from the Portuguese language course was used for this experiment because it had more data points.

First, we load the rsm package to be used, and then read in the data.

#Load rsm package
library(rsm)
## Warning: package 'rsm' was built under R version 3.2.5
#Read in data- students in Portuguese language course
#stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0, where
#the default changed; summary() and str() of the raw data report factor levels
student0 <- read.table("student-por.csv", sep = ";", header = TRUE,
                       stringsAsFactors = TRUE)

The first 10 data points are shown below.

head(student0, n=10)
##    school sex age address famsize Pstatus Medu Fedu     Mjob     Fjob
## 1      GP   F  18       U     GT3       A    4    4  at_home  teacher
## 2      GP   F  17       U     GT3       T    1    1  at_home    other
## 3      GP   F  15       U     LE3       T    1    1  at_home    other
## 4      GP   F  15       U     GT3       T    4    2   health services
## 5      GP   F  16       U     GT3       T    3    3    other    other
## 6      GP   M  16       U     LE3       T    4    3 services    other
## 7      GP   M  16       U     LE3       T    2    2    other    other
## 8      GP   F  17       U     GT3       A    4    4    other  teacher
## 9      GP   M  15       U     LE3       A    3    2 services    other
## 10     GP   M  15       U     GT3       T    3    4    other    other
##        reason guardian traveltime studytime failures schoolsup famsup paid
## 1      course   mother          2         2        0       yes     no   no
## 2      course   father          1         2        0        no    yes   no
## 3       other   mother          1         2        0       yes     no   no
## 4        home   mother          1         3        0        no    yes   no
## 5        home   father          1         2        0        no    yes   no
## 6  reputation   mother          1         2        0        no    yes   no
## 7        home   mother          1         2        0        no     no   no
## 8        home   mother          2         2        0       yes    yes   no
## 9        home   mother          1         2        0        no    yes   no
## 10       home   mother          1         2        0        no    yes   no
##    activities nursery higher internet romantic famrel freetime goout Dalc
## 1          no     yes    yes       no       no      4        3     4    1
## 2          no      no    yes      yes       no      5        3     3    1
## 3          no     yes    yes      yes       no      4        3     2    2
## 4         yes     yes    yes      yes      yes      3        2     2    1
## 5          no     yes    yes       no       no      4        3     2    1
## 6         yes     yes    yes      yes       no      5        4     2    1
## 7          no     yes    yes      yes       no      4        4     4    1
## 8          no     yes    yes       no       no      4        1     4    1
## 9          no     yes    yes      yes       no      4        2     2    1
## 10        yes     yes    yes      yes       no      5        5     1    1
##    Walc health absences G1 G2 G3
## 1     1      3        4  0 11 11
## 2     1      3        2  9 11 11
## 3     3      3        6 12 13 12
## 4     1      5        0 14 14 14
## 5     2      5        0 11 13 13
## 6     2      5        6 12 12 13
## 7     1      3        0 13 12 13
## 8     1      1        2 10 13 13
## 9     1      1        0 15 16 17
## 10    1      5        0 12 12 13

The following code shows the structure as well as a summary of the raw data.

#Show a summary and structure of the raw data
summary(student0)
##  school   sex          age        address famsize   Pstatus
##  GP:423   F:383   Min.   :15.00   R:197   GT3:457   A: 80  
##  MS:226   M:266   1st Qu.:16.00   U:452   LE3:192   T:569  
##                   Median :17.00                            
##                   Mean   :16.74                            
##                   3rd Qu.:18.00                            
##                   Max.   :22.00                            
##       Medu            Fedu             Mjob           Fjob    
##  Min.   :0.000   Min.   :0.000   at_home :135   at_home : 42  
##  1st Qu.:2.000   1st Qu.:1.000   health  : 48   health  : 23  
##  Median :2.000   Median :2.000   other   :258   other   :367  
##  Mean   :2.515   Mean   :2.307   services:136   services:181  
##  3rd Qu.:4.000   3rd Qu.:3.000   teacher : 72   teacher : 36  
##  Max.   :4.000   Max.   :4.000                                
##         reason      guardian     traveltime      studytime    
##  course    :285   father:153   Min.   :1.000   Min.   :1.000  
##  home      :149   mother:455   1st Qu.:1.000   1st Qu.:1.000  
##  other     : 72   other : 41   Median :1.000   Median :2.000  
##  reputation:143                Mean   :1.569   Mean   :1.931  
##                                3rd Qu.:2.000   3rd Qu.:2.000  
##                                Max.   :4.000   Max.   :4.000  
##     failures      schoolsup famsup     paid     activities nursery  
##  Min.   :0.0000   no :581   no :251   no :610   no :334    no :128  
##  1st Qu.:0.0000   yes: 68   yes:398   yes: 39   yes:315    yes:521  
##  Median :0.0000                                                     
##  Mean   :0.2219                                                     
##  3rd Qu.:0.0000                                                     
##  Max.   :3.0000                                                     
##  higher    internet  romantic      famrel         freetime   
##  no : 69   no :151   no :410   Min.   :1.000   Min.   :1.00  
##  yes:580   yes:498   yes:239   1st Qu.:4.000   1st Qu.:3.00  
##                                Median :4.000   Median :3.00  
##                                Mean   :3.931   Mean   :3.18  
##                                3rd Qu.:5.000   3rd Qu.:4.00  
##                                Max.   :5.000   Max.   :5.00  
##      goout            Dalc            Walc          health     
##  Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.00   1st Qu.:2.000  
##  Median :3.000   Median :1.000   Median :2.00   Median :4.000  
##  Mean   :3.185   Mean   :1.502   Mean   :2.28   Mean   :3.536  
##  3rd Qu.:4.000   3rd Qu.:2.000   3rd Qu.:3.00   3rd Qu.:5.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.00   Max.   :5.000  
##     absences            G1             G2              G3       
##  Min.   : 0.000   Min.   : 0.0   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 0.000   1st Qu.:10.0   1st Qu.:10.00   1st Qu.:10.00  
##  Median : 2.000   Median :11.0   Median :11.00   Median :12.00  
##  Mean   : 3.659   Mean   :11.4   Mean   :11.57   Mean   :11.91  
##  3rd Qu.: 6.000   3rd Qu.:13.0   3rd Qu.:13.00   3rd Qu.:14.00  
##  Max.   :32.000   Max.   :19.0   Max.   :19.00   Max.   :19.00
str(student0)
## 'data.frame':    649 obs. of  33 variables:
##  $ school    : Factor w/ 2 levels "GP","MS": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex       : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
##  $ age       : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ address   : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 2 2 ...
##  $ famsize   : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
##  $ Pstatus   : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
##  $ Medu      : int  4 1 1 4 3 4 2 4 3 3 ...
##  $ Fedu      : int  4 1 1 2 3 3 2 4 2 4 ...
##  $ Mjob      : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
##  $ Fjob      : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
##  $ reason    : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
##  $ guardian  : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
##  $ traveltime: int  2 1 1 1 1 1 1 2 1 1 ...
##  $ studytime : int  2 2 2 3 2 2 2 2 2 2 ...
##  $ failures  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
##  $ famsup    : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
##  $ paid      : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
##  $ nursery   : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
##  $ higher    : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ internet  : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
##  $ romantic  : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ famrel    : int  4 5 4 3 4 5 4 4 4 5 ...
##  $ freetime  : int  3 3 3 2 3 4 4 1 2 5 ...
##  $ goout     : int  4 3 2 2 2 2 4 4 2 1 ...
##  $ Dalc      : int  1 1 2 1 1 1 1 1 1 1 ...
##  $ Walc      : int  1 1 3 1 2 2 1 1 1 1 ...
##  $ health    : int  3 3 3 5 5 5 3 1 1 5 ...
##  $ absences  : int  4 2 6 0 0 6 0 2 0 0 ...
##  $ G1        : int  0 9 12 14 11 12 13 10 15 12 ...
##  $ G2        : int  11 11 13 14 13 12 12 13 16 12 ...
##  $ G3        : int  11 11 12 14 13 13 13 13 17 13 ...

A histogram of the response variable is shown below. This histogram suggests that the data are heavily right skewed.

#Show a histogram of the response variable (number of absences)
hist(student0$absences, main = "Student Absences")

2. Experimental Design

This experimental design will use response surface methodology (RSM) to analyze the effects of gender, involvement in extracurriculars, travel time to school, and alcohol consumption on school attendance. The two 3-level factors will each be converted into two 2-level factors in order to implement this experimental design. This design is used because it is suspected that the independent variables have an effect on the response of number of days absent. Using RSM, it is possible to analyze at what factor levels this response variable hits a maximum and minimum value.

The null hypothesis for this experiment is that gender, involvement in extracurriculars, travel time to school, and alcohol consumption have no effect on school attendance.

There are some repeated measures in the raw data because some of the students have the same attributes. While no information was provided indicating how the data were collected, the data set does not seem to show any specific sampling. For this reason, we will assume the data were randomly collected for the purpose of this experiment.

3. Statistical Analysis

First, we convert the 2-level factors into binary variables. Additionally, alcohol consumption is converted from a 5-level factor to a 3-level factor and travel time is converted from a 4-level factor to a 3-level factor.

#Convert Walc (weekend alcohol consumption) to a 3-level factor:
#  Walc 1 -> 0, Walc 2 -> 1, any other value -> 2
#Vectorised so the whole column is recoded in one step. Unlike an index loop
#over 1:length(x), this is safe for a zero-row data frame and propagates NA
#instead of stopping with an error inside if().
student0$alcohol <- ifelse(student0$Walc == 1, 0,
                           ifelse(student0$Walc == 2, 1, 2))

#Convert traveltime (travel time to school) to a 3-level factor:
#  traveltime 1 -> 0, traveltime 2 -> 1, any other value -> 2
#Vectorised so the whole column is recoded in one step. Unlike an index loop
#over 1:length(x), this is safe for a zero-row data frame and propagates NA
#instead of stopping with an error inside if().
student0$travel <- ifelse(student0$traveltime == 1, 0,
                          ifelse(student0$traveltime == 2, 1, 2))

#Convert gender to a 2-level numeric factor: "M" -> 0, anything else -> 1
#Vectorised comparison; works whether sex is stored as a factor or as
#character, and does not rely on iterating over 1:length(x).
student0$gender <- ifelse(student0$sex == "M", 0, 1)

#Convert extracurricular involvement to a 2-level numeric factor:
#  activities "no" -> 0, anything else -> 1
#Vectorised comparison; works whether activities is stored as a factor or as
#character, and does not rely on iterating over 1:length(x).
student0$extra <- ifelse(student0$activities == "no", 0, 1)

Next, we remove the unimportant variables from the data.

#Remove unimportant variables from data
#Keep the response and the four recoded factors, selected by name. This gives
#the same columns, in the same order, as dropping positions 1:29 and 31:33,
#but does not silently pick the wrong columns if the column layout changes.
student <- student0[c("absences", "alcohol", "travel", "gender", "extra")]

A summary and the structure of the new data is shown. These are the variables that are to be used for the analysis.

#Show a summary and structure of the relevant data
summary(student)
##     absences         alcohol          travel           gender      
##  Min.   : 0.000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 0.000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 2.000   Median :1.000   Median :0.0000   Median :1.0000  
##  Mean   : 3.659   Mean   :1.008   Mean   :0.5439   Mean   :0.5901  
##  3rd Qu.: 6.000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :32.000   Max.   :2.000   Max.   :2.0000   Max.   :1.0000  
##      extra       
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.4854  
##  3rd Qu.:1.0000  
##  Max.   :1.0000
str(student)
## 'data.frame':    649 obs. of  5 variables:
##  $ absences: int  4 2 6 0 0 6 0 2 0 0 ...
##  $ alcohol : num  0 0 2 0 1 1 0 0 0 0 ...
##  $ travel  : num  1 0 0 0 0 0 0 1 0 0 ...
##  $ gender  : num  1 1 1 1 1 0 0 1 0 0 ...
##  $ extra   : num  0 0 0 1 0 1 0 0 0 1 ...

A boxplot for all of the IVs and absences is shown below.

#Boxplot of absences for every combination of the four factor levels
boxplot(
  absences ~ gender + extra + alcohol + travel,
  data = student,
  main = "Boxplot for 4 Factors and Absences"
)

Individual boxplots are shown below for each of the independent variables. From the following plots, there does not seem to be a noticeable difference in most of the medians across the levels of each factor. However, for alcohol consumption, it appears that there could be a slightly higher number of absences with more alcohol consumption.

#Draw one boxplot of absences per independent variable, in the order
#gender, extra, alcohol, travel; local() keeps the helper names out of the
#workspace
local({
  panel_titles <- c(
    gender = "Gender",
    extra = "Extracurricular Activities",
    alcohol = "Alcohol Consumption",
    travel = "Travel Time"
  )
  for (iv in names(panel_titles)) {
    boxplot(reformulate(iv, response = "absences"), data = student,
            main = panel_titles[[iv]])
  }
})

The two 3-level factors are each converted into two 2-level factors.

#Represent 3-level factor as 2 2-level factors (alcohol):
#  alcohol 0        -> (alcoholA, alcoholB) = (0, 0)
#  alcohol 1        -> (1, 0)
#  any other value  -> (1, 1)
#Computed entirely from `student`, so it does not depend on student0 having
#the same number of rows, and vectorised instead of looping over 1:length(x).
student$alcoholA <- ifelse(student$alcohol == 0, 0, 1)
student$alcoholB <- ifelse(student$alcohol == 0 | student$alcohol == 1, 0, 1)

#Represent 3-level factor as 2 2-level factors (travel):
#  travel 0         -> (travelA, travelB) = (0, 0)
#  travel 1         -> (1, 0)
#  any other value  -> (1, 1)
#Computed entirely from `student`, so it does not depend on student0 having
#the same number of rows, and vectorised instead of looping over 1:length(x).
student$travelA <- ifelse(student$travel == 0, 0, 1)
student$travelB <- ifelse(student$travel == 0 | student$travel == 1, 0, 1)

First, we create the RSM model.

#RSM model
#Second-order model: SO() expands to first-order (FO), two-way interaction
#(TWI) and pure quadratic (PQ) terms, as listed in the coefficient table.
#NOTE(review): every predictor is coded 0/1, so each squared term equals the
#variable itself; and alcoholB/travelB are 1 only where alcoholA/travelA are
#1, so alcoholA:alcoholB and travelA:travelB equal alcoholB and travelB.
#Those 8 terms are aliased, which is why rsm() warns and returns a plain
#'lm' object rather than an 'rsm' object.
student.rsm <- rsm(absences~SO(gender,extra,alcoholA, alcoholB, travelA, travelB), data=student)
## Warning in rsm(absences ~ SO(gender, extra, alcoholA, alcoholB, travelA, : Some coefficients are aliased - cannot use 'rsm' methods.
##   Returning an 'lm' object.
summary(student.rsm)
## 
## Call:
## rsm(formula = absences ~ SO(gender, extra, alcoholA, alcoholB, 
##     travelA, travelB), data = student)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.626 -3.354 -1.354  1.759 28.349 
## 
## Coefficients: (8 not defined because of singularities)
##                                                                           Estimate
## (Intercept)                                                                2.46619
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)gender              0.88762
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)extra              -0.32966
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA            1.09746
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB            1.16222
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA             0.10847
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB            -0.18590
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:extra      -0.71076
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholA   -0.51487
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholB   -0.10120
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelA     0.19084
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelB    -1.30751
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholA     1.42365
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholB    -1.73013
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelA      0.69196
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelB      1.29552
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:alcoholB       NA
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelA  -1.31223
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelB   0.20509
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelA   0.66339
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelB   0.06966
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA:travelB         NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)gender^2                 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)extra^2                  NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA^2               NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB^2               NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA^2                NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB^2                NA
##                                                                           Std. Error
## (Intercept)                                                                  0.84784
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)gender                0.87294
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)extra                 0.88489
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA              1.05288
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB              0.99031
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA               1.01406
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB               1.68013
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:extra         0.78676
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholA      1.04820
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholB      1.01358
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelA       0.85943
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelB       1.50768
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholA       0.98191
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholB       1.00079
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelA        0.81384
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelB        1.33402
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:alcoholB         NA
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelA     1.06932
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelB     1.73406
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelA     1.09553
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelB     1.86183
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA:travelB           NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)gender^2                   NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)extra^2                    NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA^2                 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB^2                 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA^2                  NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB^2                  NA
##                                                                           t value
## (Intercept)                                                                 2.909
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)gender               1.017
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)extra               -0.373
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA             1.042
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB             1.174
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA              0.107
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB             -0.111
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:extra       -0.903
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholA    -0.491
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholB    -0.100
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelA      0.222
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelB     -0.867
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholA      1.450
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholB     -1.729
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelA       0.850
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelB       0.971
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:alcoholB      NA
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelA   -1.227
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelB    0.118
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelA    0.606
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelB    0.037
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA:travelB        NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)gender^2                NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)extra^2                 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA^2              NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB^2              NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA^2               NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB^2               NA
##                                                                           Pr(>|t|)
## (Intercept)                                                                0.00376
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)gender              0.30963
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)extra               0.70962
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA            0.29765
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB            0.24100
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA             0.91485
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB             0.91193
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:extra       0.36666
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholA    0.62346
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholB    0.92050
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelA     0.82434
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelB     0.38614
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholA     0.14759
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholB     0.08434
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelA      0.39552
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelB      0.33185
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:alcoholB       NA
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelA   0.22022
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelB   0.90589
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelA   0.54504
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelB   0.97017
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA:travelB         NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)gender^2                 NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)extra^2                  NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA^2               NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB^2               NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA^2                NA
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB^2                NA
##                                                                             
## (Intercept)                                                               **
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)gender               
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)extra                
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA             
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB             
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA              
## FO(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB              
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:extra        
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholA     
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:alcoholB     
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelA      
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)gender:travelB      
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholA      
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:alcoholB    . 
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelA       
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)extra:travelB       
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:alcoholB   
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelA    
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA:travelB    
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelA    
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB:travelB    
## TWI(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA:travelB     
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)gender^2             
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)extra^2              
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholA^2           
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)alcoholB^2           
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelA^2            
## PQ(gender, extra, alcoholA, alcoholB, travelA, travelB)travelB^2            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.622 on 629 degrees of freedom
## Multiple R-squared:  0.03719,    Adjusted R-squared:  0.00811 
## F-statistic: 1.279 on 19 and 629 DF,  p-value: 0.1901

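From the output, none of the estimable first-order or two-way interaction terms is significant at the 0.05 level; the purely quadratic terms (and the alcoholA:alcoholB and travelA:travelB interactions) could not be estimated because of singularities and are reported as NA. The only coefficient that may be statistically significant is the interaction between extracurricular involvement and alcohol consumption (extra:alcoholB, p = 0.084), which is significant at the 0.1 level.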

Next, the contour plots for this model are shown.

#Show contour plots for each pair of predictors, six panels per page
par(mfrow = c(2, 3))
#`at` is taken from the model summary: summary(student.rsm)$canonical$xs.
#The closing parenthesis belongs right after student.rsm (as in the persp()
#call); summary(student.rsm$canonical$xs) summarises the wrong object.
#NOTE(review): when rsm() has fallen back to an 'lm' object there is no
#canonical analysis, so this is presumably NULL and the default slice
#values are used - confirm.
contour(
  student.rsm,
  ~ gender + extra + alcoholA + alcoholB + travelA + travelB,
  image = TRUE,
  at = summary(student.rsm)$canonical$xs
)
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

When looking at the contour plot for extracurriculars and alcoholB, it appears that high alcohol consumption and low extracurricular involvement are associated with more days absent. This could make sense since low extracurricular involvement and high alcohol consumption would intuitively result in lower school attendance. However, the fact that this significance shows up only in the interaction between the variables and not for the main effect of either is interesting. This suggests that alcohol consumption and extracurriculars alone have no effect on absences, but combined may influence school attendance.

The perspective plot for the alcoholB and extra 2fi could also allow insight into the effect on number of absences.

#Perspective plot for 2fi- extracurriculars and alcohol
par(mfrow = c(1, 1))
#`image` is not an argument of this persp() method; it was forwarded to the
#graphics functions and produced "not a graphical parameter" warnings, so it
#is not passed. The model formula has no Block term, so `at` carries only the
#values from the model summary.
persp(
  student.rsm,
  ~ extra + alcoholB,
  at = summary(student.rsm)$canonical$xs,
  contour = "colors",
  zlab = "Days Absent",
  theta = 30
)
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in persp.default(dat$x, dat$y, dat$z, zlim = dat$zlim, theta =
## theta, : "image" is not a graphical parameter
## Warning in persp.default(dat$x, dat$y, dat$z, xlab = dat$labs[1], ylab =
## dat$labs[2], : "image" is not a graphical parameter
## Warning in title(sub = dat$labs[5], ...): "image" is not a graphical
## parameter

This plot confirms what is seen in the contour plot. However, it is interesting to note that low alcohol consumption is expected to result in fewer absences than high alcohol consumption with high extracurricular involvement.

A Shapiro-Wilk test for Normality allows insight into whether the response variable population is normally distributed.

#Shapiro-Wilk test
#Applied to the raw response column (student$absences), not to the model residuals
shapiro.test(student$absences)
## 
##  Shapiro-Wilk normality test
## 
## data:  student$absences
## W = 0.77174, p-value < 2.2e-16

Because of the statistically significant p-value, we reject the null hypothesis that the sample comes from a population that is normally distributed.

A normal Q-Q plot of the model residuals also helps to assess the adequacy of the model.

#Normal Q-Q plot of the fitted model's residuals, with a reference line
#local() keeps the temporary residual vector out of the workspace
local({
  model_resid <- residuals(student.rsm)
  qqnorm(model_resid, main = "Normal Q-Q Plot")
  qqline(model_resid)
})

This plot suggests that the residuals are not normally distributed, and therefore the model may not be accurate.

4. Conclusion

It is unlikely that alcohol consumption, extracurricular involvement, gender, or travel time to class have an effect on school attendance. While the analysis suggested that the interaction of low extracurricular involvement with high alcohol consumption may result in more absences, the Shapiro-Wilk test and the Q-Q plot indicate that the data are not normally distributed, so the results of the model cannot be relied upon. Therefore, we cannot reject the null hypothesis that the independent variables have no effect on number of absences.

5. References

P. Cortez and A. Silva. Using Data Mining to Predict Secondary School Student Performance. In A. Brito and J. Teixeira Eds., Proceedings of 5th FUture BUsiness TEChnology Conference (FUBUTEC 2008) pp. 5-12, Porto, Portugal, April, 2008, EUROSIS, ISBN 978-9077381-39-7.

6. Appendices

Complete R code

#Load the rsm package (it must already be installed; library() does not download it)
#The workspace is deliberately not wiped with rm(list = ls()); run this script in a fresh R session instead
library(rsm)

#Read in data- students in Portuguese language course
#The file is semicolon-delimited with a header row
#The path is relative to the working directory so the script is not tied to one machine
student0 <- read.table("student-por.csv", sep = ";", header = TRUE)

#Convert Walc (weekend alcohol consumption) to a 3-level factor
#Walc 1 -> 0, Walc 2 -> 1, any other value -> 2 (stored as numeric codes)
#Vectorised over the whole column instead of looping over 1:length()
student0$alcohol <- ifelse(student0$Walc == 1, 0,
                           ifelse(student0$Walc == 2, 1, 2))

#Convert traveltime (travel time to school) to a 3-level factor
#traveltime 1 -> 0, traveltime 2 -> 1, any other value -> 2 (stored as numeric codes)
#Vectorised over the whole column instead of looping over 1:length()
student0$travel <- ifelse(student0$traveltime == 1, 0,
                          ifelse(student0$traveltime == 2, 1, 2))


#Convert gender to a 2-level factor
#sex "M" -> 0, any other value -> 1 (stored as numeric codes)
#Vectorised over the whole column instead of looping over 1:length()
student0$gender <- ifelse(student0$sex == "M", 0, 1)

#Convert extracurricular involvement to a 2-level factor
#activities "no" -> 0, any other value -> 1 (stored as numeric codes)
#Vectorised over the whole column instead of looping over 1:length()
student0$extra <- ifelse(student0$activities == "no", 0, 1)

#Remove unimportant variables from data
#Keep the response (absences) and the four recoded factors, selected by name
#so the result does not depend on the column positions of the raw file
student <- student0[c("absences", "alcohol", "travel", "gender", "extra")]

#Represent 3-level factor as 2 2-level factors (alcohol)
#alcohol 0 -> (alcoholA, alcoholB) = (0, 0); alcohol 1 -> (1, 0); any other value -> (1, 1)
#Vectorised over the student data frame itself (the loop took its bound from student0)
student$alcoholA <- ifelse(student$alcohol == 0, 0, 1)
student$alcoholB <- ifelse(student$alcohol == 0 | student$alcohol == 1, 0, 1)

#Represent 3-level factor as 2 2-level factors (travel)
#travel 0 -> (travelA, travelB) = (0, 0); travel 1 -> (1, 0); any other value -> (1, 1)
#Vectorised over the student data frame itself (the loop took its bound from student0)
student$travelA <- ifelse(student$travel == 0, 0, 1)
student$travelB <- ifelse(student$travel == 0 | student$travel == 1, 0, 1)
################################################################################################################
################################################################################################################

#Project 4

#Preview the first 10 rows of the analysis data
head(student, 10)

#Print per-column summary statistics, then the structure of the data frame
summary(student)
str(student)

#Histogram of the response variable (number of absences)
hist(
  x = student$absences,
  main = "Student Absences"
)

#Fit the response-surface model: absences against a second-order (SO) term
#in the six 0/1 coded predictors, then print the model summary
student.rsm <- rsm(
  absences ~ SO(gender, extra, alcoholA, alcoholB, travelA, travelB),
  data = student
)
summary(student.rsm)

#Show contour plots
#Arrange the plots in a 2 x 3 grid of panels
par(mfrow = c(2, 3))
#Hold the predictors not on the axes at the values in canonical$xs.
#canonical is a component of summary(student.rsm), so summary() has to wrap
#the model only -- the same form the perspective plot below uses
contour(student.rsm, ~ gender + extra + alcoholA + alcoholB + travelA + travelB,
        image = TRUE, at = summary(student.rsm)$canonical$xs)

#Perspective plot for 2fi- extracurriculars and alcohol
#Switch the plotting layout back to a single panel
par(mfrow = c(1, 1))
#Fitted surface over extra and alcoholB; the remaining predictors are held at
#the values in summary(student.rsm)$canonical$xs.
#image=TRUE was dropped: persp does not accept it and only emitted
#'"image" is not a graphical parameter' warnings
#NOTE(review): Block="B2" names no term in the model formula; presumably
#carried over from an rsm example -- confirm it can be dropped
persp(student.rsm, ~ extra + alcoholB,
      at = c(summary(student.rsm)$canonical$xs, Block = "B2"),
      contour = "colors", zlab = "Days Absent", theta = 30)

#Shapiro-Wilk normality test on the raw response column
shapiro.test(x = student$absences)

#Normal Q-Q plot of the fitted model's residuals, with a reference line
#local() keeps the temporary residual vector out of the workspace
local({
  model_resid <- residuals(student.rsm)
  qqnorm(model_resid, main = "Normal Q-Q Plot")
  qqline(model_resid)
})