Dataset containing test scores for reading and mathmatics.
library(Ecfun) # Package needed for Ecdat
library(Ecdat)
library(FrF2)
data("Star") # Load dataset
Below are the first 6 observations
head(Star)
## tmathssk treadssk classk totexpk sex freelunk race schidkn
## 2 473 447 small.class 7 girl no white 63
## 3 536 450 small.class 21 girl no black 20
## 5 463 439 regular.with.aide 0 boy yes black 19
## 11 559 448 regular 16 boy no white 69
## 12 489 447 small.class 5 boy yes white 79
## 13 454 431 regular 8 boy yes white 5
The dataset contains the following 4 factors.
levels(Star$sex)
## [1] "girl" "boy"
levels(Star$freelunk)
## [1] "no" "yes"
levels(Star$race)
## [1] "white" "black" "other"
levels(Star$classk)
## [1] "regular" "small.class" "regular.with.aide"
‘totexpk’ and ‘schidkn’ are continuous variables. Years of total teaching experience and school indicator variable, respectively.
The response variables are the two test scores, ‘tmathssk’ and ‘treadssk’,
There is a reference in the Ecdat package manual supposedly with information about the dataset. Unfortunately, the link is for an outdated website with no information about how the experiment was conducted or how the data was collected.
However, in this assignment a \(2^k\) fractional factorial design is used.
The data set contains 4 factors of which two have 3 levels. These two factor will be decomposed to two 2-level factors each by the following process.
x = Star[1,]
race = c(0,0)
L = levels(Star[,'race'])
for(i in seq(2,dim(Star)[1])){
if(Star[i,'race']==L[1]){
x = rbind(x,Star[i,])
race = rbind(race,c(0,0))
}
else if(Star[i,'race']==L[2]){
x = rbind(x,Star[i,])
x = rbind(x,Star[i,])
race = rbind(race,c(1,0))
race = rbind(race,c(0,1))
}
else{
x = rbind(x,Star[i,])
race = rbind(race,c(1,1))
}
}
x = cbind(x,race)
x = x[,c(1,2,3,4,5,6,8,9,10)]
colnames(x) = c(colnames(Star[1,c(1,2,3,4,5,6,8)]),'raceA','raceB')
Lc = levels(Star[,'classk'])
classk = c(1,0)
classk = rbind(classk,c(0,1))
dat = x[1,]
dat = rbind(dat,dat)
for(i in seq(2,dim(x)[1])){
if(x[i,'classk']==Lc[1]){
dat = rbind(dat,x[i,])
classk = rbind(classk,c(0,0))
}
else if(x[i,'classk']==Lc[2]){
dat = rbind(dat,x[i,])
dat = rbind(dat,x[i,])
classk = rbind(classk,c(1,0))
classk = rbind(classk,c(0,1))
}
else{
dat = rbind(dat,x[i,])
classk = rbind(classk,c(1,1))
}
}
dat = cbind(dat,classk)
colnames(dat) = c(colnames(x),'classkA','classkB')
dat = dat[,c(1,2,4,5,6,7,8,9,10,11)]
Thus, six 2-level factors remain and will be used in the design.
The fractional factorial design is \(2_{III}^{6-3}\), which yields 8 experimental runs. The R package FrF2 provides functions for creating such a design.
# Fractional Factorial Design
plan = FrF2(8,6,factor.names = c('classkA','classkB','sex','freelunk','raceA','raceB'))
plan
## classkA classkB sex freelunk raceA raceB
## 1 1 1 1 1 1 1
## 2 -1 1 1 -1 -1 1
## 3 -1 1 -1 -1 1 -1
## 4 -1 -1 -1 1 1 1
## 5 1 -1 1 -1 1 -1
## 6 -1 -1 1 1 -1 -1
## 7 1 1 -1 1 -1 -1
## 8 1 -1 -1 -1 -1 1
## class=design, type= FrF2
This design has resolution III, thus the main effects and 2-factor interactions are confounded with 2-factor interactions. The table below shows which factors are confounded.
# view aliasing
design.info(plan)$aliased
## $legend
## [1] "A=classkA" "B=classkB" "C=sex" "D=freelunk" "E=raceA"
## [6] "F=raceB"
##
## $main
## [1] "A=BD=CE" "B=AD=CF" "C=AE=BF" "D=AB=EF" "E=AC=DF" "F=BC=DE"
##
## $fi2
## [1] "AF=BE=CD"
The main effect “classkA” is confounded with the 2-factor interactions (classkB,freelunk) and (sex,raceA).
In this section, some of the factors effects on the test scores are shown in boxplots. The following boxplot shows the effect of class-type on math test scores.
boxplot(tmathssk~classk,data=Star,ylab="Math test score",xlab="type of class")
The medians of the math test scores are very similar, although the median for “small.class” is slightly higher.
par(mfrow=c(1,2))
boxplot(tmathssk~sex,data=Star,ylab="Math test score",xlab="sex")
boxplot(treadssk~sex,data=Star,ylab="Reading test score",xlab="sex")
Median test scores a slightly different for girls and boys, but not by much.
Using the model from Project 3,
m2 = lm(tmathssk+treadssk~sex+freelunk+raceA+raceB+classkA+classkB,data=dat)
anova(m2)
## Analysis of Variance Table
##
## Response: tmathssk + treadssk
## Df Sum Sq Mean Sq F value Pr(>F)
## sex 1 449201 449201 88.0830 < 2.2e-16 ***
## freelunk 1 3826342 3826342 750.2999 < 2.2e-16 ***
## raceA 1 104898 104898 20.5692 5.819e-06 ***
## raceB 1 249943 249943 49.0109 2.711e-12 ***
## classkA 1 885 885 0.1735 0.6770
## classkB 1 755 755 0.1481 0.7003
## Residuals 9866 50314135 5100
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
to compute a response surface using rsm from the rsm-package.
library(rsm)
# Model from Project 3
rm1 = rsm(tmathssk+treadssk~FO(sex,freelunk,raceA,raceB,classkA,classkB),data=dat)
summary(rm1)
##
## Call:
## rsm(formula = tmathssk + treadssk ~ FO(sex, freelunk, raceA,
## raceB, classkA, classkB), data = dat)
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 999.40229 3.37615 296.0186 < 2.2e-16 ***
## sex -14.18196 1.43825 -9.8606 < 2.2e-16 ***
## freelunk -33.44840 1.62806 -20.5449 < 2.2e-16 ***
## raceA -13.29794 1.89896 -7.0028 2.674e-12 ***
## raceB -13.29794 1.89896 -7.0028 2.674e-12 ***
## classkA 0.55503 1.44208 0.3849 0.7003
## classkB 0.55503 1.44208 0.3849 0.7003
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Multiple R-squared: 0.0843, Adjusted R-squared: 0.08374
## F-statistic: 151.4 on 6 and 9866 DF, p-value: < 2.2e-16
##
## Analysis of Variance Table
##
## Response: tmathssk + treadssk
## Df Sum Sq Mean Sq
## FO(sex, freelunk, raceA, raceB, classkA, classkB) 6 4632024 772004
## Residuals 9866 50314135 5100
## Lack of fit 53 746897 14092
## Pure error 9813 49567237 5051
## F value Pr(>F)
## FO(sex, freelunk, raceA, raceB, classkA, classkB) 151.3808 < 2.2e-16
## Residuals
## Lack of fit 2.7899 8.582e-11
## Pure error
##
## Direction of steepest ascent (at radius 1):
## sex freelunk raceA raceB classkA classkB
## -0.34660224 -0.81746729 -0.32499702 -0.32499702 0.01356467 0.01356467
##
## Corresponding increment in original units:
## sex freelunk raceA raceB classkA classkB
## -0.34660224 -0.81746729 -0.32499702 -0.32499702 0.01356467 0.01356467
The continuous variables in the data set are included in the next higher order model.
rm2 = rsm(tmathssk+treadssk~(sex+freelunk+raceA+raceB+classkA+classkB)+SO(totexpk,schidkn),data=dat)
summary(rm2)
##
## Call:
## rsm(formula = tmathssk + treadssk ~ (sex + freelunk + raceA +
## raceB + classkA + classkB) + SO(totexpk, schidkn), data = dat)
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 953.4935738 3.9529223 241.2123 < 2.2e-16 ***
## sexboy -13.5537165 1.4335641 -9.4546 < 2.2e-16 ***
## freelunkyes -33.5698487 1.6274635 -20.6271 < 2.2e-16 ***
## raceA -16.5943772 2.0661321 -8.0316 1.072e-15 ***
## raceB -16.5943772 2.0661321 -8.0316 1.072e-15 ***
## classkA 0.1934373 1.4376069 0.1346 0.892966
## classkB 0.1934373 1.4376069 0.1346 0.892966
## totexpk -1.3655254 0.4257667 -3.2072 0.001345 **
## schidkn 0.2209517 0.1546253 1.4289 0.153050
## totexpk:schidkn 0.0043207 0.0062836 0.6876 0.491710
## totexpk^2 0.0959564 0.0172895 5.5500 2.931e-08 ***
## schidkn^2 -0.0046156 0.0017852 -2.5855 0.009737 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Multiple R-squared: 0.09278, Adjusted R-squared: 0.09177
## F-statistic: 91.68 on 11 and 9861 DF, p-value: < 2.2e-16
##
## Analysis of Variance Table
##
## Response: tmathssk + treadssk
## Df Sum Sq Mean Sq F value Pr(>F)
## sex 1 449201 449201 88.8610 < 2.2e-16
## freelunk 1 3826342 3826342 756.9270 < 2.2e-16
## raceA 1 104898 104898 20.7508 5.294e-06
## raceB 1 249943 249943 49.4438 2.177e-12
## classkA 1 885 885 0.1750 0.6757
## classkB 1 755 755 0.1494 0.6991
## FO(totexpk, schidkn) 2 246436 123218 24.3750 2.755e-11
## TWI(totexpk, schidkn) 1 9165 9165 1.8130 0.1782
## PQ(totexpk, schidkn) 2 210188 105094 20.7897 9.775e-10
## Residuals 9861 49848345 5055
## Lack of fit 287 17635196 61447 18.2624 < 2.2e-16
## Pure error 9574 32213149 3365
##
## Stationary point of response surface:
## totexpk schidkn
## 6.507884 26.981345
##
## Eigenanalysis:
## $values
## [1] 0.096002763 -0.004661994
##
## $vectors
## [,1] [,2]
## totexpk -0.99976958 0.02146601
## schidkn -0.02146601 -0.99976958
The contours for the response surface are shown in the following plot.
contour(rm2, ~totexpk+schidkn, image=TRUE,at=summary(rm2$canoncial$xs))
The stationary point on the surface is [6.50;26.98].
Physical interpretation:
One important thing to notice is that schidkn, the school indicator, has very little influence on the test scores. When increasing totexpk, the teaching experience, the test score increases. Thus we can observe the high test scores in the right part of the contour plot.
The parameters for the second model are shown in the following table.
coef(rm2)
## (Intercept) sexboy
## 953.493573825 -13.553716545
## freelunkyes raceA
## -33.569848673 -16.594377198
## raceB classkA
## -16.594377198 0.193437253
## classkB FO(totexpk, schidkn)totexpk
## 0.193437253 -1.365525446
## FO(totexpk, schidkn)schidkn TWI(totexpk, schidkn)
## 0.220951746 0.004320746
## PQ(totexpk, schidkn)totexpk^2 PQ(totexpk, schidkn)schidkn^2
## 0.095956377 -0.004615609
Starting with a Normal QQ-plot. The model in rsm assumes the response is normally distributed.
plot(rm2,2)
The points should not differ much from the straight dotted line. Thus the assumption, that the response is normal, may not be satisfied.
Next,the residuals are are plotted vs the fitted values. The residuals should not increase with the fitted values.
plot(rm2,1)
The plot shows that the residuals do not increase substantially with the fitted values.
R Package Ecdat: https://cran.r-project.org/web/packages/Ecdat/index.html