This analysis focuses on understanding how different attributes of a student affect their school attendance. The data set includes 30 attributes of 649 students from Portugal in 2008, but the experiment focuses on gender, involvement in extracurricular activities, alcohol consumption, and travel time to school. The effect of these four independent variables on number of absences is analyzed in this study. Levels of factors as used for analysis are shown as follows:
The raw data are organized into rows, with each one representing a student. There are five columns of interest, one for each of the variables indicated above. For the purpose of this experiment, only the effects of the four factors on number of absences will be analyzed, even though the complete data set includes additional attributes. The original data set also included students from a Mathematics course and a Portuguese language course and then combined the two by students that were in both courses. However, only data from the Portuguese language course was used for this experiment because it had more data points.
First, we load the FrF2 package to be used, and then read in the data.
#Load FrF2 package
library(FrF2)
## Warning: package 'FrF2' was built under R version 3.2.5
## Loading required package: DoE.base
## Warning: package 'DoE.base' was built under R version 3.2.5
## Loading required package: grid
## Loading required package: conf.design
##
## Attaching package: 'DoE.base'
## The following objects are masked from 'package:stats':
##
## aov, lm
## The following object is masked from 'package:graphics':
##
## plot.design
## The following object is masked from 'package:base':
##
## lengths
#Read in data- students in Portuguese language course
# The file is semicolon-delimited with a header row. stringsAsFactors = TRUE is
# set explicitly so that text columns are read as factors on every R version
# (the default became FALSE in R 4.0.0); the summary() and str() output shown
# for this data relies on factor columns.
student0 <- read.table(
  "student-por.csv",
  sep = ";",
  header = TRUE,
  stringsAsFactors = TRUE
)
The first 10 data points are shown below.
# Preview the first ten student records
head(x = student0, n = 10L)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob
## 1 GP F 18 U GT3 A 4 4 at_home teacher
## 2 GP F 17 U GT3 T 1 1 at_home other
## 3 GP F 15 U LE3 T 1 1 at_home other
## 4 GP F 15 U GT3 T 4 2 health services
## 5 GP F 16 U GT3 T 3 3 other other
## 6 GP M 16 U LE3 T 4 3 services other
## 7 GP M 16 U LE3 T 2 2 other other
## 8 GP F 17 U GT3 A 4 4 other teacher
## 9 GP M 15 U LE3 A 3 2 services other
## 10 GP M 15 U GT3 T 3 4 other other
## reason guardian traveltime studytime failures schoolsup famsup paid
## 1 course mother 2 2 0 yes no no
## 2 course father 1 2 0 no yes no
## 3 other mother 1 2 0 yes no no
## 4 home mother 1 3 0 no yes no
## 5 home father 1 2 0 no yes no
## 6 reputation mother 1 2 0 no yes no
## 7 home mother 1 2 0 no no no
## 8 home mother 2 2 0 yes yes no
## 9 home mother 1 2 0 no yes no
## 10 home mother 1 2 0 no yes no
## activities nursery higher internet romantic famrel freetime goout Dalc
## 1 no yes yes no no 4 3 4 1
## 2 no no yes yes no 5 3 3 1
## 3 no yes yes yes no 4 3 2 2
## 4 yes yes yes yes yes 3 2 2 1
## 5 no yes yes no no 4 3 2 1
## 6 yes yes yes yes no 5 4 2 1
## 7 no yes yes yes no 4 4 4 1
## 8 no yes yes no no 4 1 4 1
## 9 no yes yes yes no 4 2 2 1
## 10 yes yes yes yes no 5 5 1 1
## Walc health absences G1 G2 G3
## 1 1 3 4 0 11 11
## 2 1 3 2 9 11 11
## 3 3 3 6 12 13 12
## 4 1 5 0 14 14 14
## 5 2 5 0 11 13 13
## 6 2 5 6 12 12 13
## 7 1 3 0 13 12 13
## 8 1 1 2 10 13 13
## 9 1 1 0 15 16 17
## 10 1 5 0 12 12 13
The following code shows the structure as well as a summary of the raw data.
# Per-column summary statistics of the raw data
summary(object = student0)
## school sex age address famsize Pstatus
## GP:423 F:383 Min. :15.00 R:197 GT3:457 A: 80
## MS:226 M:266 1st Qu.:16.00 U:452 LE3:192 T:569
## Median :17.00
## Mean :16.74
## 3rd Qu.:18.00
## Max. :22.00
## Medu Fedu Mjob Fjob
## Min. :0.000 Min. :0.000 at_home :135 at_home : 42
## 1st Qu.:2.000 1st Qu.:1.000 health : 48 health : 23
## Median :2.000 Median :2.000 other :258 other :367
## Mean :2.515 Mean :2.307 services:136 services:181
## 3rd Qu.:4.000 3rd Qu.:3.000 teacher : 72 teacher : 36
## Max. :4.000 Max. :4.000
## reason guardian traveltime studytime
## course :285 father:153 Min. :1.000 Min. :1.000
## home :149 mother:455 1st Qu.:1.000 1st Qu.:1.000
## other : 72 other : 41 Median :1.000 Median :2.000
## reputation:143 Mean :1.569 Mean :1.931
## 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :4.000 Max. :4.000
## failures schoolsup famsup paid activities nursery
## Min. :0.0000 no :581 no :251 no :610 no :334 no :128
## 1st Qu.:0.0000 yes: 68 yes:398 yes: 39 yes:315 yes:521
## Median :0.0000
## Mean :0.2219
## 3rd Qu.:0.0000
## Max. :3.0000
## higher internet romantic famrel freetime
## no : 69 no :151 no :410 Min. :1.000 Min. :1.00
## yes:580 yes:498 yes:239 1st Qu.:4.000 1st Qu.:3.00
## Median :4.000 Median :3.00
## Mean :3.931 Mean :3.18
## 3rd Qu.:5.000 3rd Qu.:4.00
## Max. :5.000 Max. :5.00
## goout Dalc Walc health
## Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:2.000
## Median :3.000 Median :1.000 Median :2.00 Median :4.000
## Mean :3.185 Mean :1.502 Mean :2.28 Mean :3.536
## 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:3.00 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000
## absences G1 G2 G3
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.:10.0 1st Qu.:10.00 1st Qu.:10.00
## Median : 2.000 Median :11.0 Median :11.00 Median :12.00
## Mean : 3.659 Mean :11.4 Mean :11.57 Mean :11.91
## 3rd Qu.: 6.000 3rd Qu.:13.0 3rd Qu.:13.00 3rd Qu.:14.00
## Max. :32.000 Max. :19.0 Max. :19.00 Max. :19.00
# Column types and leading values of the raw data
str(object = student0)
## 'data.frame': 649 obs. of 33 variables:
## $ school : Factor w/ 2 levels "GP","MS": 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ address : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 2 2 ...
## $ famsize : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
## $ Pstatus : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
## $ Medu : int 4 1 1 4 3 4 2 4 3 3 ...
## $ Fedu : int 4 1 1 2 3 3 2 4 2 4 ...
## $ Mjob : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
## $ Fjob : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
## $ reason : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
## $ guardian : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
## $ traveltime: int 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : int 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : int 0 0 0 0 0 0 0 0 0 0 ...
## $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
## $ famsup : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
## $ paid : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
## $ nursery : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
## $ higher : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ internet : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
## $ romantic : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ famrel : int 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : int 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : int 4 3 2 2 2 2 4 4 2 1 ...
## $ Dalc : int 1 1 2 1 1 1 1 1 1 1 ...
## $ Walc : int 1 1 3 1 2 2 1 1 1 1 ...
## $ health : int 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 4 2 6 0 0 6 0 2 0 0 ...
## $ G1 : int 0 9 12 14 11 12 13 10 15 12 ...
## $ G2 : int 11 11 13 14 13 12 12 13 16 12 ...
## $ G3 : int 11 11 12 14 13 13 13 13 17 13 ...
A histogram of the response variable is shown below. This histogram suggests that the data are heavily right skewed.
# Histogram of the response variable (number of absences per student)
hist(
  x = student0$absences,
  main = "Student Absences"
)
This experimental design will use Analysis of Variance (ANOVA) on a fractional factorial design to analyze the effects of gender, involvement in extracurriculars, travel time to school, and alcohol consumption on school attendance. Specifically, the design is a $2^{m-3}$ fractional factorial. The two 3-level factors will each be converted into two 2-level factors in order to implement this experimental design.
The null hypothesis for this experiment is that gender, involvement in extracurriculars, travel time to school, and alcohol consumption have no effect on school attendance.
There are some repeated measures in the raw data because some of the students have the same attributes. While no information was provided indicating how the data were collected, the data set does not seem to show any specific sampling. For this reason, we will assume the data were randomly collected for the purpose of this experiment.
First, we convert the 2-level factors into binary variables. Additionally, alcohol consumption is converted from a 5-level factor to a 3-level factor and travel time is converted from a 4-level factor to a 3-level factor.
# Recode the raw columns into the numeric codes used in the analysis.
# Each recode is one vectorised ifelse() call rather than a row-by-row loop:
# the codes are the same, an empty data frame is handled (1:length(x) would
# iterate over c(1, 0)), and an NA input yields NA instead of an error in if().

#Convert Walc (weekend alcohol consumption) to a 3-level factor
# 1 -> 0, 2 -> 1, any other value (3-5 in this data) -> 2
student0$alcohol <- ifelse(
  student0$Walc == 1, 0,
  ifelse(student0$Walc == 2, 1, 2)
)

#Convert traveltime (travel time to school) to a 3-level factor
# 1 -> 0, 2 -> 1, any other value (3-4 in this data) -> 2
student0$travel <- ifelse(
  student0$traveltime == 1, 0,
  ifelse(student0$traveltime == 2, 1, 2)
)

#Convert gender to a 2-level factor
# "M" -> 0, anything else -> 1
student0$gender <- ifelse(student0$sex == "M", 0, 1)

#Convert extracurricular involvement to a 2-level factor
# "no" -> 0, anything else -> 1
student0$extra <- ifelse(student0$activities == "no", 0, 1)
Next, we remove the unimportant variables from the data.
#Remove unimportant variables from data
# Keep the response and the four recoded factors, selected by name rather than
# by column position so the subset stays correct if columns of student0 are
# added or reordered. The column order matches the positional selection.
student <- student0[c("absences", "alcohol", "travel", "gender", "extra")]
A summary and the structure of the new data is shown. These are the variables that are to be used for the analysis.
# Per-column summary statistics of the analysis data
summary(object = student)
## absences alcohol travel gender
## Min. : 0.000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 2.000 Median :1.000 Median :0.0000 Median :1.0000
## Mean : 3.659 Mean :1.008 Mean :0.5439 Mean :0.5901
## 3rd Qu.: 6.000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :32.000 Max. :2.000 Max. :2.0000 Max. :1.0000
## extra
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4854
## 3rd Qu.:1.0000
## Max. :1.0000
# Column types and leading values of the analysis data
str(object = student)
## 'data.frame': 649 obs. of 5 variables:
## $ absences: int 4 2 6 0 0 6 0 2 0 0 ...
## $ alcohol : num 0 0 2 0 1 1 0 0 0 0 ...
## $ travel : num 1 0 0 0 0 0 0 1 0 0 ...
## $ gender : num 1 1 1 1 1 0 0 1 0 0 ...
## $ extra : num 0 0 0 1 0 1 0 0 0 1 ...
A boxplot for all of the IVs and absences is shown below.
# Boxplot of absences grouped by every combination of the four factors
# (gender, extra, alcohol, travel)
boxplot(
  absences ~ gender + extra + alcohol + travel,
  data = student,
  main = "Boxplot for 4 Factors and Absences"
)
Individual boxplots are shown below for each of the independent variables. From the following plots, it doesn’t seem like there is a significant difference in most of the means for each level of each factor. However, for alcohol consumption, it appears like there could be a slightly higher number of absences for more alcohol consumption.
# Draw one boxplot of absences per independent variable. The first vector
# holds the column names of `student`, the second the matching plot titles;
# the formula absences ~ <column> is built for each pair in turn.
invisible(Map(
  function(iv, title) {
    boxplot(reformulate(iv, response = "absences"), data = student, main = title)
  },
  c("gender", "extra", "alcohol", "travel"),
  c("Gender", "Extracurricular Activities", "Alcohol Consumption", "Travel Time")
))
The two 3-level factors are each converted into two 2-level factors.
# Represent each 3-level factor as two cumulative 2-level indicators:
#   level 0 -> (A = 0, B = 0); level 1 -> (A = 1, B = 0); otherwise (A = 1, B = 1)
# Vectorised over the columns of `student`, so the loop bound no longer comes
# from a different data frame (student0) than the one being written, and
# empty or NA input is handled without error.

#Represent 3-level factor as 2 2-level factors (alcohol)
student$alcoholA <- ifelse(student$alcohol == 0, 0, 1)
student$alcoholB <- ifelse(student$alcohol == 0 | student$alcohol == 1, 0, 1)

#Represent 3-level factor as 2 2-level factors (travel)
student$travelA <- ifelse(student$travel == 0, 0, 1)
student$travelB <- ifelse(student$travel == 0 | student$travel == 1, 0, 1)
Because the full factorial design is a 2^6 experiment, there would be 64 experimental runs. The following shows the treatment levels for each of these 64 runs for the full factorial design using the variables created for this experiment.
# Print all 64 runs of the 2^6 full factorial over the six two-level factors
FrF2(
  64, 6,
  factor.names = c("gender", "extra", "alcoholA", "alcoholB", "travelA", "travelB")
)
## creating full factorial with 64 runs ...
## gender extra alcoholA alcoholB travelA travelB
## 1 -1 1 -1 -1 -1 1
## 2 -1 -1 1 1 -1 1
## 3 1 1 -1 -1 1 -1
## 4 1 -1 1 -1 1 -1
## 5 -1 1 1 -1 -1 1
## 6 -1 1 -1 -1 -1 -1
## 7 1 -1 -1 -1 1 1
## 8 -1 -1 -1 1 1 -1
## 9 -1 -1 -1 -1 1 -1
## 10 1 1 1 1 -1 1
## 11 -1 -1 -1 -1 -1 -1
## 12 1 1 -1 1 -1 -1
## 13 -1 1 -1 1 -1 1
## 14 -1 1 -1 1 1 -1
## 15 1 1 1 1 1 1
## 16 1 -1 1 1 1 -1
## 17 1 -1 -1 1 -1 1
## 18 -1 1 -1 -1 1 1
## 19 1 -1 -1 -1 -1 -1
## 20 -1 -1 -1 -1 -1 1
## 21 -1 -1 1 -1 -1 1
## 22 1 -1 1 -1 1 1
## 23 1 -1 -1 1 -1 -1
## 24 -1 1 -1 1 -1 -1
## 25 -1 -1 1 1 -1 -1
## 26 1 1 1 -1 1 1
## 27 1 -1 1 -1 -1 1
## 28 -1 -1 -1 1 1 1
## 29 -1 1 1 1 1 1
## 30 -1 1 1 -1 1 -1
## 31 1 -1 1 1 1 1
## 32 1 1 -1 -1 1 1
## 33 -1 1 1 -1 1 1
## 34 1 1 -1 1 1 1
## 35 1 1 1 1 1 -1
## 36 -1 -1 1 -1 1 -1
## 37 1 -1 -1 -1 1 -1
## 38 -1 -1 -1 1 -1 -1
## 39 -1 1 -1 1 1 1
## 40 -1 1 1 1 1 -1
## 41 1 -1 1 1 -1 -1
## 42 -1 -1 1 1 1 -1
## 43 1 1 1 -1 -1 1
## 44 1 -1 -1 1 1 -1
## 45 -1 -1 -1 1 -1 1
## 46 -1 -1 1 -1 -1 -1
## 47 -1 -1 1 -1 1 1
## 48 1 -1 1 -1 -1 -1
## 49 1 -1 -1 1 1 1
## 50 -1 1 1 1 -1 1
## 51 1 -1 -1 -1 -1 1
## 52 1 1 -1 1 -1 1
## 53 1 1 1 -1 1 -1
## 54 -1 1 1 1 -1 -1
## 55 -1 1 -1 -1 1 -1
## 56 1 1 -1 1 1 -1
## 57 -1 -1 1 1 1 1
## 58 -1 1 1 -1 -1 -1
## 59 -1 -1 -1 -1 1 1
## 60 1 1 -1 -1 -1 -1
## 61 1 1 -1 -1 -1 1
## 62 1 -1 1 1 -1 1
## 63 1 1 1 -1 -1 -1
## 64 1 1 1 1 -1 -1
## class=design, type= full factorial
A $2^{6-3}$ design is used for this experiment, which only uses 1/8 of the full factorial design. This consists of 8 experimental runs. The following code shows the treatments for these experimental runs.
# Build the 8-run fraction of the six-factor design and print its summary
# (generators, alias structure, and the runs themselves).
# NOTE: this design object is named `absences`, the same name as the response
# column of `student`; they are separate objects.
absences <- FrF2(
  8, 6,
  factor.names = c("gender", "extra", "alcoholA", "alcoholB", "travelA", "travelB")
)
summary(absences)
## Call:
## FrF2(8, 6, factor.names = c("gender", "extra", "alcoholA", "alcoholB",
## "travelA", "travelB"))
##
## Experimental design of type FrF2
## 8 runs
##
## Factor settings (scale ends):
## gender extra alcoholA alcoholB travelA travelB
## 1 -1 -1 -1 -1 -1 -1
## 2 1 1 1 1 1 1
##
## Design generating information:
## $legend
## [1] A=gender B=extra C=alcoholA D=alcoholB E=travelA F=travelB
##
## $generators
## [1] D=AB E=AC F=BC
##
##
## Alias structure:
## $main
## [1] A=BD=CE B=AD=CF C=AE=BF D=AB=EF E=AC=DF F=BC=DE
##
## $fi2
## [1] AF=BE=CD
##
##
## The design itself:
## gender extra alcoholA alcoholB travelA travelB
## 1 -1 -1 1 1 -1 -1
## 2 1 1 1 1 1 1
## 3 1 -1 -1 -1 -1 1
## 4 -1 -1 -1 1 1 1
## 5 -1 1 -1 -1 1 -1
## 6 1 -1 1 -1 1 -1
## 7 1 1 -1 1 -1 -1
## 8 -1 1 1 -1 -1 1
## class=design, type= FrF2
The highest resolution for this design is III. The following code shows the aliasing structure for this fractional factorial design. This suggests that the main effects are aliased with some of the 2fi, which are aliased with one another.
# Print the alias (confounding) structure stored with the fractional design
design.info(absences)[["aliased"]]
## $legend
## [1] "A=gender" "B=extra" "C=alcoholA" "D=alcoholB" "E=travelA"
## [6] "F=travelB"
##
## $main
## [1] "A=BD=CE" "B=AD=CF" "C=AE=BF" "D=AB=EF" "E=AC=DF" "F=BC=DE"
##
## $fi2
## [1] "AF=BE=CD"
A linear model and ANOVA are used to estimate main effects and interaction effects. Main effects can be seen below. From the ANOVA output, it is suggested that alcoholA is the only statistically significant factor, which is significant at the 0.01 level (p ≈ 0.0012). This suggests that the main effect for alcohol is significant, and that alcohol consumption does have an effect on number of absences. However, it is important to note that this is aliased with the 2fi for gender and travel time, and extracurricular involvement and travel time. For these reasons, this could be confounded.
# Main-effects model: regress absences on the six two-level indicators using
# the rows of `student`, then print its ANOVA table.
mainmodel <- lm(
  formula = absences ~ gender + extra + alcoholA + alcoholB + travelA + travelB,
  data = student
)
anova(object = mainmodel)
## Analysis of Variance Table
##
## Response: absences
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 6.4 6.353 0.2985 0.58501
## extra 1 4.5 4.479 0.2105 0.64655
## alcoholA 1 226.6 226.608 10.6480 0.00116 **
## alcoholB 1 51.3 51.308 2.4109 0.12098
## travelA 1 0.0 0.003 0.0001 0.99052
## travelB 1 4.1 4.107 0.1930 0.66061
## Residuals 642 13662.9 21.282
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The following code estimates the interaction effects. This output suggests that the interaction effect of involvement in extracurriculars and alcohol consumption is significant at the 0.1 significance level.
# Fit the model with the six main effects plus their two-factor interactions:
# `^2` on the parenthesised sum expands to all pairwise interaction terms.
# Variables are written as student$... (no `data =` argument), which is why the
# ANOVA rows carry the `student$` prefix. alcoholA:alcoholB and travelA:travelB
# do not appear in the table: with the cumulative coding (B = 1 only when
# A = 1) each product equals the B indicator itself, so it is not estimable.
interactmodel <- lm((student$absences ~ (student$gender + student$extra + student$alcoholA + student$alcoholB + student$travelA + student$travelB)^2))
# ANOVA table for the interaction model
anova(interactmodel)
## Analysis of Variance Table
##
## Response: student$absences
## Df Sum Sq Mean Sq F value Pr(>F)
## student$gender 1 6.4 6.353 0.2974 0.585715
## student$extra 1 4.5 4.479 0.2097 0.647168
## student$alcoholA 1 226.6 226.608 10.6080 0.001187 **
## student$alcoholB 1 51.3 51.308 2.4019 0.121694
## student$travelA 1 0.0 0.003 0.0001 0.990536
## student$travelB 1 4.1 4.107 0.1922 0.661211
## student$gender:student$extra 1 13.1 13.053 0.6110 0.434691
## student$gender:student$alcoholA 1 15.8 15.813 0.7402 0.389917
## student$gender:student$alcoholB 1 0.0 0.001 0.0000 0.995883
## student$gender:student$travelA 1 0.1 0.119 0.0056 0.940599
## student$gender:student$travelB 1 23.2 23.222 1.0871 0.297521
## student$extra:student$alcoholA 1 5.4 5.433 0.2543 0.614209
## student$extra:student$alcoholB 1 61.5 61.523 2.8800 0.090180 .
## student$extra:student$travelA 1 46.5 46.498 2.1767 0.140617
## student$extra:student$travelB 1 23.8 23.763 1.1124 0.291962
## student$alcoholA:student$travelA 1 26.8 26.765 1.2529 0.263417
## student$alcoholA:student$travelB 1 0.4 0.369 0.0173 0.895458
## student$alcoholB:student$travelA 1 9.6 9.607 0.4497 0.502702
## student$alcoholB:student$travelB 1 0.0 0.030 0.0014 0.970168
## Residuals 629 13436.7 21.362
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the results of these two models, we can reject the null that the IVs have no effect on school attendance. However, it initially appears that the only significant factor is alcohol consumption, and potentially the interaction of alcohol consumption and extracurricular involvement. It is important to note that aliasing could result in confounding between some of the 2fi and the main effect of alcohol consumption.
The following code estimates the coefficients for the main effects of the factors.
# Estimated intercept and main-effect coefficients of the main-effects model
coef(object = mainmodel)
## (Intercept) gender extra alcoholA alcoholB travelA
## 2.86787372 0.12179417 -0.13028331 0.79680683 0.75043185 0.06467938
## travelB
## -0.27982001
The following plot helps to check the accuracy of the model. From this plot, it is evident that the linear model is most likely not the best fit for this data. Because of this, we should use caution when rejecting the null and find a better model, even though some of the results appeared to be statistically significant.
# Normal Q-Q plot of the main-effects model residuals, with a reference line
qqnorm(
  y = residuals(mainmodel),
  main = "Normal Q-Q Plot"
)
qqline(y = residuals(mainmodel))
The residuals plot also suggests that the linear model may not be the best fit because the points are not randomly scattered and show some linearity.
# Scatter the residuals against the fitted values of the main-effects model
plot(
  x = fitted(mainmodel),
  y = residuals(mainmodel)
)
P. Cortez and A. Silva. Using Data Mining to Predict Secondary School Student Performance. In A. Brito and J. Teixeira Eds., Proceedings of 5th FUture BUsiness TEChnology Conference (FUBUTEC 2008) pp. 5-12, Porto, Portugal, April, 2008, EUROSIS, ISBN 978-9077381-39-7.
Complete R code
# The workspace is deliberately not cleared with rm(list = ls()): doing so
# deletes the caller's objects but does not reset loaded packages or options,
# so it gives no real isolation. Run this script in a fresh R session instead.

#Load FrF2 package
library(FrF2)

#Read in data- students in Portuguese language course
# The file is semicolon-delimited with a header row. stringsAsFactors = TRUE is
# set explicitly so that text columns are read as factors on every R version
# (the default became FALSE in R 4.0.0), which summary() needs in order to
# report level counts.
student0 <- read.table(
  "student-por.csv",
  sep = ";",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Preview the first ten student records
head(x = student0, n = 10L)

# Per-column summary statistics, then column types, of the raw data
summary(object = student0)
str(object = student0)

# Histogram of the response variable (number of absences per student)
hist(
  x = student0$absences,
  main = "Student Absences"
)
# Recode the raw columns into the numeric codes used in the analysis.
# Each recode is one vectorised ifelse() call rather than a row-by-row loop:
# the codes are the same, an empty data frame is handled (1:length(x) would
# iterate over c(1, 0)), and an NA input yields NA instead of an error in if().

#Convert Walc (weekend alcohol consumption) to a 3-level factor
# 1 -> 0, 2 -> 1, any other value -> 2
student0$alcohol <- ifelse(
  student0$Walc == 1, 0,
  ifelse(student0$Walc == 2, 1, 2)
)

#Convert traveltime (travel time to school) to a 3-level factor
# 1 -> 0, 2 -> 1, any other value -> 2
student0$travel <- ifelse(
  student0$traveltime == 1, 0,
  ifelse(student0$traveltime == 2, 1, 2)
)

#Convert gender to a 2-level factor
# "M" -> 0, anything else -> 1
student0$gender <- ifelse(student0$sex == "M", 0, 1)

#Convert extracurricular involvement to a 2-level factor
# "no" -> 0, anything else -> 1
student0$extra <- ifelse(student0$activities == "no", 0, 1)
#Remove unimportant variables from data
# Keep the response and the four recoded factors, selected by name rather than
# by column position so the subset stays correct if columns of student0 are
# added or reordered. The column order matches the positional selection.
student <- student0[c("absences", "alcohol", "travel", "gender", "extra")]
# Per-column summary statistics, then column types, of the analysis data
summary(object = student)
str(object = student)

# Boxplot of absences grouped by every combination of the four factors
boxplot(
  absences ~ gender + extra + alcohol + travel,
  data = student,
  main = "Boxplot for 4 Factors and Absences"
)

# One boxplot of absences per independent variable. The first vector holds the
# column names of `student`, the second the matching plot titles; the formula
# absences ~ <column> is built for each pair in turn.
invisible(Map(
  function(iv, title) {
    boxplot(reformulate(iv, response = "absences"), data = student, main = title)
  },
  c("gender", "extra", "alcohol", "travel"),
  c("Gender", "Extracurricular Activities", "Alcohol Consumption", "Travel Time")
))
# Represent each 3-level factor as two cumulative 2-level indicators:
#   level 0 -> (A = 0, B = 0); level 1 -> (A = 1, B = 0); otherwise (A = 1, B = 1)
# Vectorised over the columns of `student`, so the loop bound no longer comes
# from a different data frame (student0) than the one being written, and
# empty or NA input is handled without error.

#Represent 3-level factor as 2 2-level factors (alcohol)
student$alcoholA <- ifelse(student$alcohol == 0, 0, 1)
student$alcoholB <- ifelse(student$alcohol == 0 | student$alcohol == 1, 0, 1)

#Represent 3-level factor as 2 2-level factors (travel)
student$travelA <- ifelse(student$travel == 0, 0, 1)
student$travelB <- ifelse(student$travel == 0 | student$travel == 1, 0, 1)
# Summary statistics, then column types, of the analysis data now that the
# two-level indicator columns have been added
summary(object = student)
str(object = student)
# Print all 64 runs of the 2^6 full factorial over the six two-level factors
FrF2(
  64, 6,
  factor.names = c("gender", "extra", "alcoholA", "alcoholB", "travelA", "travelB")
)

# Build the 8-run fraction of the six-factor design and print its summary.
# NOTE: this design object is named `absences`, the same name as the response
# column of `student`; they are separate objects.
absences <- FrF2(
  8, 6,
  factor.names = c("gender", "extra", "alcoholA", "alcoholB", "travelA", "travelB")
)
summary(absences)

# Print the alias (confounding) structure stored with the fractional design
design.info(absences)[["aliased"]]
#Create linear model and use ANOVA to estimate main effects and interaction effects
# Main-effects model on the six two-level indicators.
mainmodel <- lm(
  absences ~ gender + extra + alcoholA + alcoholB + travelA + travelB,
  data = student
)
anova(mainmodel)

# Main effects plus two-factor interactions (`^2` expands the sum to all
# pairwise terms). Variables are resolved through `data = student` instead of
# being written as student$..., so the fit is consistent with mainmodel and
# the ANOVA rows get short labels such as gender:extra. Because `data` is
# searched first, `absences` here is the column of `student`, not the FrF2
# design object of the same name.
interactmodel <- lm(
  absences ~ (gender + extra + alcoholA + alcoholB + travelA + travelB)^2,
  data = student
)
anova(interactmodel)
# Estimated intercept and main-effect coefficients of the main-effects model
coef(object = mainmodel)

# Normal Q-Q plot of the main-effects model residuals, with a reference line
qqnorm(
  y = residuals(mainmodel),
  main = "Normal Q-Q Plot"
)
qqline(y = residuals(mainmodel))

# Scatter the residuals against the fitted values
plot(
  x = fitted(mainmodel),
  y = residuals(mainmodel)
)