Q2
#
dta2 <- women
str(dta2)
## 'data.frame': 15 obs. of 2 variables:
## $ height: num 58 59 60 61 62 63 64 65 66 67 ...
## $ weight: num 115 117 120 123 126 129 132 135 139 142 ...
#Compute mean height
mean(dta2$height)
## [1] 65
#Make a box plot of women's weights
boxplot(dta2$weight,xlab = "Women's Height")

#Draw a scatter diagram of height by weight
plot(dta2$weight,dta2$height,xlab = "Women's Weight",ylab = "Women's Height")
#Add a line to indicate the mean height on the scatter plot
mean_height <- mean(dta2$height)
abline(h = mean(dta2$height),col = "red")

Q3
#
dta3 <- read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/mathAttainment.txt",header = T)
str(dta3)
## 'data.frame': 39 obs. of 3 variables:
## $ math2: int 28 56 51 13 39 41 30 13 17 32 ...
## $ math1: int 18 22 44 8 20 12 16 5 9 18 ...
## $ cc : num 328 406 387 167 328 ...
#mean
apply(dta3,2,mean)
## math2 math1 cc
## 28.76923 15.35897 188.83667
#sd
apply(dta3,2,sd)
## math2 math1 cc
## 10.720029 7.744224 84.842513
#correlation
cor(dta3)
## math2 math1 cc
## math2 1.0000000 0.7443604 0.6570098
## math1 0.7443604 1.0000000 0.5956771
## cc 0.6570098 0.5956771 1.0000000
#plot
plot(dta3$math1,dta3$math2,xlab = "Math Score at Year 1",ylab = "Math Score at Year 2",
main = "Mathmatics Attainment",xlim = c(0,60),ylim = c(0,60))
lm.dta3 <- lm(math2~math1, data = dta3)
grid()
abline(lm.dta3, lty=2)

#general linear regression
summary(lm.dta3)
##
## Call:
## lm(formula = math2 ~ math1, data = dta3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.430 -5.521 -0.369 4.253 20.388
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.944 2.607 4.965 1.57e-05 ***
## math1 1.030 0.152 6.780 5.57e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.255 on 37 degrees of freedom
## Multiple R-squared: 0.5541, Adjusted R-squared: 0.542
## F-statistic: 45.97 on 1 and 37 DF, p-value: 5.571e-08
#Anova
anova(lm.dta3)
## Analysis of Variance Table
##
## Response: math2
## Df Sum Sq Mean Sq F value Pr(>F)
## math1 1 2419.6 2419.59 45.973 5.571e-08 ***
## Residuals 37 1947.3 52.63
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## diagnostics
# specify maximum plot region
par(pty="m")
#
plot(scale(resid(lm.dta3)) ~ fitted(lm.dta3),
ylim=c(-3.5, 3.5), type="n",
xlab="Fitted values", ylab="Standardized residuals")
#
text(fitted(lm.dta3), scale(resid(lm.dta3)), labels=rownames(dta3), cex=0.5)
#
grid()
# add a horizontal red dash line
abline(h=0, lty=2, col="red")

## Q-Q plot
qqnorm(y = scale(resid(lm.dta3)), main = "Normal Q-Q Plot",
xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")
qqline(y = scale(resid(lm.dta3)))
#
grid()

Q4
#
dta4 <- read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/eduSex.txt",header = T)
str(dta4)
## 'data.frame': 6 obs. of 2 variables:
## $ Gender : int 1 1 1 1 0 0
## $ Education: int 0 1 1 1 0 0
#
dta4 <- within(dta4, {
Gender <- ifelse(Gender == 0, "Male", "Female")
Education <- ifelse(Education == 0, "No", "Yes")
} )
head(dta4)
## Gender Education
## 1 Female No
## 2 Female Yes
## 3 Female Yes
## 4 Female Yes
## 5 Male No
## 6 Male No
print(dtat <- with(dta4, table(Gender, Education)))
## Education
## Gender No Yes
## Female 1 3
## Male 2 0
#
dtat <- data.frame(dtat)
dtat
## Gender Education Freq
## 1 Female No 1
## 2 Male No 2
## 3 Female Yes 3
## 4 Male Yes 0
#
print(dta <- untable(dtat[, 1:2], num=dtat[,"Freq"]))
## Gender Education
## 1 Female No
## 2 Male No
## 2.1 Male No
## 3 Female Yes
## 3.1 Female Yes
## 3.2 Female Yes
Q5
#
dta5 <- read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/verbalIQ.txt",header = T)
str(dta5)
## 'data.frame': 2287 obs. of 6 variables:
## $ school : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pupil : int 17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
## $ viq : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
## $ language: int 46 45 33 46 20 30 30 57 36 36 ...
## $ csize : int 29 29 29 29 29 29 29 29 29 29 ...
## $ ses : int 23 10 15 23 10 10 23 10 13 15 ...
dta5<- dta5 %>% mutate(school = as.factor(school),
pupil = as.factor(pupil),
id = paste("S",101:2387,sep = ""))
dta5$id <- as.factor(dta5$id)
#mean verbal IQ, language test score, class size, ses by all pupils
colMeans(dta5[,3:6])
## viq language csize ses
## 11.83406 40.93485 23.10057 27.81198
#mean verbal IQ, language test score, ses by school
head(tapply(dta5$viq, dta5$school, mean))
## 1 2 10 12 15 16
## 10.320000 9.000000 10.500000 9.433333 11.500000 11.687500
head(tapply(dta5$language, dta5$school, mean))
## 1 2 10 12 15 16
## 36.40000 23.71429 30.40000 30.86667 30.87500 41.50000
head(tapply(dta5$ses, dta5$school, mean))
## 1 2 10 12 15 16
## 13.84000 15.42857 19.60000 29.46667 28.12500 29.37500
#plot
plot(dta5$viq,dta5$language, xlab = "Verbal IQ",ylab = "Language test score")
lm.dta5 <- lm(language ~ viq + ses, data = dta5)
abline(lm.dta5,col = "red")
#
grid()

#residual
# specify maximum plot region
par(pty="m")
#
plot(scale(resid(lm.dta5)) ~ fitted(lm.dta5),
ylim=c(-3.5, 3.5), type="n",
xlab="Fitted values", ylab="Standardized residuals")
#
points(fitted(lm.dta5), scale(resid(lm.dta5)), lwd = 1)
#
grid()
# add a horizontal red dash line
abline(h=0, lty=2, lwd = 2, col="red")

# lmer
summary(result5 <- lmer(language ~ viq + ses + (1|id)+(1|school:id),data = dta5,control=lmerControl(check.nobs.vs.nlev = "ignore",
check.nobs.vs.rankZ = "ignore",
check.nobs.vs.nRE="ignore")))
## Linear mixed model fit by REML ['lmerMod']
## Formula: language ~ viq + ses + (1 | id) + (1 | school:id)
## Data: dta5
## Control:
## lmerControl(check.nobs.vs.nlev = "ignore", check.nobs.vs.rankZ = "ignore",
## check.nobs.vs.nRE = "ignore")
##
## REML criterion at convergence: 15380.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.37817 -0.38787 0.03846 0.42034 2.18933
##
## Random effects:
## Groups Name Variance Std.Dev.
## id (Intercept) 11.76 3.429
## school:id (Intercept) 19.58 4.425
## Residual 17.25 4.154
## Number of obs: 2287, groups: id, 2287; school:id, 2287
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 8.33128 0.85416 9.75
## viq 2.40542 0.07430 32.38
## ses 0.14877 0.01409 10.56
##
## Correlation of Fixed Effects:
## (Intr) viq
## viq -0.884
## ses -0.133 -0.317
#
plot(scale(resid(result5)) ~ fitted(result5),
ylim=c(-3.5, 3.5), type="n",
xlab="Fitted values", ylab="Standardized residuals")
#
points(fitted(result5), scale(resid(result5)), lwd = 1)
# add a horizontal red dash line
abline(h=0, lty=2, lwd = 2, col="red")

Q6
#
dta6 <- minn38
str(dta6)
## 'data.frame': 168 obs. of 5 variables:
## $ hs : Factor w/ 3 levels "L","M","U": 1 1 1 1 1 1 1 1 1 1 ...
## $ phs: Factor w/ 4 levels "C","E","N","O": 1 1 1 1 1 1 1 3 3 3 ...
## $ fol: Factor w/ 7 levels "F1","F2","F3",..: 1 2 3 4 5 6 7 1 2 3 ...
## $ sex: Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ f : int 87 72 52 88 32 14 20 3 6 17 ...
##number of female high school graduates
sum(dta6$sex == "F")
## [1] 84
##number of female high school graduates enrolled in college
sum(dta6$sex == "F" & dta6$phs == "C")
## [1] 21
##The distributions of high school graduates (frequecies or counts) in 1938 by father's occupational status
dta6_fol <- as.data.frame(matrix(0,7,2))
for (i in 1:7){
dta6_fol[i,2] = as.numeric(sum(dta6$fol == levels(dta6$fol)[i]))
}
dta6_fol[,1] <- paste("F",1:7,sep = "")
dta6_fol$V1 <- as.factor(dta6_fol$V1)
plot(dta6_fol$V1,dta6_fol$V2,ylab = "Frequency")

##The distributions of male high school graduates (frequecies or counts) in 1938 by post high school status.
dta6_phs <- as.data.frame(matrix(0,4,2))
for (i in 1:4){
dta6_phs[i,2] = as.numeric(sum(dta6$phs == levels(dta6$phs)[i]))
}
dta6_phs[,1] <- c("C","E","N","O")
dta6_phs$V1 <- as.factor(dta6_phs$V1)
plot(dta6_phs$V1,dta6_phs$V2,ylab = "Frequency")

Q7
#
dta7 <- sleep
str(dta7)
## 'data.frame': 20 obs. of 3 variables:
## $ extra: num 0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0 2 ...
## $ group: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ ID : Factor w/ 10 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## "?sleep" give us the detail of sleep data
#split the grpha into 2 by panels
par(mfrow=c(1, 2))
#
dta7_1 <- dta7[dta7$group == "1",]
dta7_2 <- dta7[dta7$group == "2",]
#individual's extra hours of sleep using drug 1
boxplot(dta7_1$extra,ylim = c(-2,7),main = "Drug 1")
#individual's extra hours of sleep using drug 2
boxplot(dta7_2$extra,ylim = c(-2,7),main = "Drug 2")

par(mfrow=c(1, 1))
#contrast individual's extra hours of sleep using drug 1 & 2
plot(dta7_1$extra,dta7_2$extra,type = "p",xlim = c(-6,6),ylim = c(-6,6))
abline(a = 0, b = 1, col = "red")

#paired t test(statistically different)
t.test(dta7_1$extra,dta7_2$extra,paired = T)
##
## Paired t-test
##
## data: dta7_1$extra and dta7_2$extra
## t = -4.0621, df = 9, p-value = 0.002833
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.4598858 -0.7001142
## sample estimates:
## mean of the differences
## -1.58
#subjects whose weight is greater or equaviewl to 5
dta7[dta7$extra > 5,]
## extra group ID
## 17 5.5 2 7
#all subjects whose extra sleep time is more than 1 hour
dta7[dta7$extra > 1,]
## extra group ID
## 6 3.4 1 6
## 7 3.7 1 7
## 10 2.0 1 10
## 11 1.9 2 1
## 13 1.1 2 3
## 16 4.4 2 6
## 17 5.5 2 7
## 18 1.6 2 8
## 19 4.6 2 9
## 20 3.4 2 10
#all Group 1 subjects whose extra sleep time is reduced by 1 hour or more
dta7_1[dta7_1$extra < -1,]
## extra group ID
## 2 -1.6 1 2
## 4 -1.2 1 4
Q8
heatmap(Harman74.cor$cov)

A correlation matrix of 24 psychological tests given to 145 seventh and eight-grade children in a Chicago suburb by Holzinger and Swineford.
The white area represent perfect positive correlation, the color looks redder representing smaller correlation.
Q9
#
dta9 <- read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/readingtimes.txt",header = T)
str(dta9)
## 'data.frame': 7 obs. of 14 variables:
## $ Snt : int 1 2 3 4 5 6 7
## $ Sp : int 1 2 3 4 5 6 7
## $ Wrds: int 13 16 9 9 10 18 6
## $ New : int 1 3 2 2 3 4 1
## $ S01 : num 3.43 6.48 1.71 3.68 4 ...
## $ S02 : num 2.79 5.41 2.34 3.71 2.9 ...
## $ S03 : num 4.16 4.49 3.02 2.87 2.99 ...
## $ S04 : num 3.07 5.06 2.46 2.73 2.67 ...
## $ S05 : num 3.62 9.29 6.04 4.21 3.88 ...
## $ S06 : num 3.16 5.64 2.46 6.24 3.22 ...
## $ S07 : num 3.23 8.36 4.92 3.72 3.14 ...
## $ S08 : num 7.16 4.31 3.37 6.33 6.14 ...
## $ S09 : num 1.54 2.95 1.38 1.15 2.76 ...
## $ S10 : num 4.06 6.65 2.18 3.66 3.33 ...
dta9L <- gather(dta9,key = "ID",value = "Time",5:14)
str(dta9L)
## 'data.frame': 70 obs. of 6 variables:
## $ Snt : int 1 2 3 4 5 6 7 1 2 3 ...
## $ Sp : int 1 2 3 4 5 6 7 1 2 3 ...
## $ Wrds: int 13 16 9 9 10 18 6 13 16 9 ...
## $ New : int 1 3 2 2 3 4 1 1 3 2 ...
## $ ID : chr "S01" "S01" "S01" "S01" ...
## $ Time: num 3.43 6.48 1.71 3.68 4 ...
dta9L$ID <- as.factor(dta9L$ID)
#plot mean time of each subject
mean_dta9L <- as.data.frame(matrix(0,10,2))
for (i in 1:10){
mean_dta9L[i,2] = mean(dta9L[dta9L$ID == levels(dta9L$ID)[i],]$Time)
}
mean_dta9L[,1] <- levels(dta9L$ID)
mean_dta9L$V1 <- as.factor(mean_dta9L$V1)
plot(V2~V1,data = mean_dta9L,ylab = "Subject's mean time")
abline(h = mean(dta9L$Time),col = "red") #grand mean

#the time taken to read a word (calculate by 7 condition)
wrd_time <- as.data.frame(matrix(0,7,5))
wrd_time[1:7,1:4] <- dta9L[1:7,1:4]
colnames(wrd_time) <- c("Snt","Sp","Wrds","New","Time")
for (i in 1:7){
wrd_time[i,5] <- mean(dta9L[dta9L$Snt == i,]$Time)/wrd_time[i,3]
}
wrd_time
## Snt Sp Wrds New Time
## 1 1 1 13 1 0.2787231
## 2 2 2 16 3 0.3665812
## 3 3 3 9 2 0.3319444
## 4 4 4 9 2 0.4255889
## 5 5 5 10 3 0.3504500
## 6 6 6 18 4 0.4680056
## 7 7 7 6 1 0.4270833
#boxplot
boxplot(wrd_time$Time)

#plot conditions by time
plot(Time~Snt,data = wrd_time)

#lmer
summary(result9 <- lmer(Time ~ Wrds + New + Sp + (1|ID),data = dta9L))
## Linear mixed model fit by REML ['lmerMod']
## Formula: Time ~ Wrds + New + Sp + (1 | ID)
## Data: dta9L
##
## REML criterion at convergence: 259.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.8986 -0.6825 -0.0814 0.4622 3.0954
##
## Random effects:
## Groups Name Variance Std.Dev.
## ID (Intercept) 0.5524 0.7432
## Residual 1.8955 1.3768
## Number of obs: 70, groups: ID, 10
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -2.58695 0.80471 -3.215
## Wrds 0.45859 0.07349 6.240
## New 0.15163 0.27644 0.549
## Sp 0.33337 0.10685 3.120
##
## Correlation of Fixed Effects:
## (Intr) Wrds New
## Wrds -0.751
## New 0.383 -0.807
## Sp -0.716 0.617 -0.594
#
plot(scale(resid(result9)) ~ fitted(result9),
ylim=c(-3.5, 3.5), type="n",
xlab="Fitted values", ylab="Standardized residuals")
#
points(fitted(result9), scale(resid(result9)), lwd = 1)
# add a horizontal red dash line
abline(h=0, lty=2, lwd = 2, col="red")
