dataM_week1

Q2

#
dta2 <- women
str(dta2)

## 'data.frame':    15 obs. of  2 variables:
##  $ height: num  58 59 60 61 62 63 64 65 66 67 ...
##  $ weight: num  115 117 120 123 126 129 132 135 139 142 ...

#Compute mean height

mean(dta2$height)

## [1] 65

#Make a box plot of women's weights

boxplot(dta2$weight,xlab = "Women's Height")

#Draw a scatter diagram of height by weight

plot(dta2$weight,dta2$height,xlab = "Women's Weight",ylab = "Women's Height")

#Add a line to indicate the mean height on the scatter plot

mean_height <- mean(dta2$height)
abline(h = mean(dta2$height),col = "red")

Q3

#
dta3 <- read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/mathAttainment.txt",header = T)
str(dta3)

## 'data.frame':    39 obs. of  3 variables:
##  $ math2: int  28 56 51 13 39 41 30 13 17 32 ...
##  $ math1: int  18 22 44 8 20 12 16 5 9 18 ...
##  $ cc   : num  328 406 387 167 328 ...

#mean
apply(dta3,2,mean)

##     math2     math1        cc 
##  28.76923  15.35897 188.83667

#sd
apply(dta3,2,sd)

##     math2     math1        cc 
## 10.720029  7.744224 84.842513

#correlation
cor(dta3)

##           math2     math1        cc
## math2 1.0000000 0.7443604 0.6570098
## math1 0.7443604 1.0000000 0.5956771
## cc    0.6570098 0.5956771 1.0000000

#plot
plot(dta3$math1,dta3$math2,xlab = "Math Score at Year 1",ylab = "Math Score at Year 2",
     main = "Mathmatics Attainment",xlim = c(0,60),ylim = c(0,60))

lm.dta3 <- lm(math2~math1, data = dta3)

grid()

abline(lm.dta3, lty=2)

#general linear regression

summary(lm.dta3)

## 
## Call:
## lm(formula = math2 ~ math1, data = dta3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.430  -5.521  -0.369   4.253  20.388 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   12.944      2.607   4.965 1.57e-05 ***
## math1          1.030      0.152   6.780 5.57e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.255 on 37 degrees of freedom
## Multiple R-squared:  0.5541, Adjusted R-squared:  0.542 
## F-statistic: 45.97 on 1 and 37 DF,  p-value: 5.571e-08

#Anova

anova(lm.dta3)

## Analysis of Variance Table
## 
## Response: math2
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## math1      1 2419.6 2419.59  45.973 5.571e-08 ***
## Residuals 37 1947.3   52.63                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## diagnostics

# specify maximum plot region
par(pty="m")

#
plot(scale(resid(lm.dta3)) ~ fitted(lm.dta3), 
     ylim=c(-3.5, 3.5), type="n",
     xlab="Fitted values", ylab="Standardized residuals") 

#
text(fitted(lm.dta3), scale(resid(lm.dta3)), labels=rownames(dta3), cex=0.5) 

#
grid()

# add a horizontal red dash line
abline(h=0, lty=2, col="red")

## Q-Q plot
qqnorm(y = scale(resid(lm.dta3)), main = "Normal Q-Q Plot",
       xlab = "Theoretical Quantiles", ylab = "Sample Quantiles")

qqline(y = scale(resid(lm.dta3)))

#
grid()

Q4

#
dta4 <- read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/eduSex.txt",header = T)
str(dta4)

## 'data.frame':    6 obs. of  2 variables:
##  $ Gender   : int  1 1 1 1 0 0
##  $ Education: int  0 1 1 1 0 0

#
dta4 <- within(dta4, {
Gender <- ifelse(Gender == 0, "Male", "Female")
Education <- ifelse(Education == 0, "No", "Yes")
} )
head(dta4)

##   Gender Education
## 1 Female        No
## 2 Female       Yes
## 3 Female       Yes
## 4 Female       Yes
## 5   Male        No
## 6   Male        No

print(dtat <- with(dta4, table(Gender, Education)))

##         Education
## Gender   No Yes
##   Female  1   3
##   Male    2   0

#
dtat <- data.frame(dtat)
dtat

##   Gender Education Freq
## 1 Female        No    1
## 2   Male        No    2
## 3 Female       Yes    3
## 4   Male       Yes    0

#
print(dta <- untable(dtat[, 1:2], num=dtat[,"Freq"]))

##     Gender Education
## 1   Female        No
## 2     Male        No
## 2.1   Male        No
## 3   Female       Yes
## 3.1 Female       Yes
## 3.2 Female       Yes

Q5

#
dta5 <- read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/verbalIQ.txt",header = T)
str(dta5)

## 'data.frame':    2287 obs. of  6 variables:
##  $ school  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pupil   : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
##  $ viq     : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ language: int  46 45 33 46 20 30 30 57 36 36 ...
##  $ csize   : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ ses     : int  23 10 15 23 10 10 23 10 13 15 ...

dta5<- dta5 %>% mutate(school = as.factor(school),
                pupil = as.factor(pupil),
                id = paste("S",101:2387,sep = ""))
dta5$id <- as.factor(dta5$id)

#mean verbal IQ, language test score, class size, ses by all pupils
colMeans(dta5[,3:6])

##      viq language    csize      ses 
## 11.83406 40.93485 23.10057 27.81198

#mean verbal IQ, language test score, ses by school
head(tapply(dta5$viq, dta5$school, mean))

##         1         2        10        12        15        16 
## 10.320000  9.000000 10.500000  9.433333 11.500000 11.687500

head(tapply(dta5$language, dta5$school, mean))

##        1        2       10       12       15       16 
## 36.40000 23.71429 30.40000 30.86667 30.87500 41.50000

head(tapply(dta5$ses, dta5$school, mean))

##        1        2       10       12       15       16 
## 13.84000 15.42857 19.60000 29.46667 28.12500 29.37500

#plot
plot(dta5$viq,dta5$language, xlab = "Verbal IQ",ylab = "Language test score")
lm.dta5 <- lm(language ~ viq + ses, data = dta5)
abline(lm.dta5,col = "red")

#
grid()

#residual

# specify maximum plot region
par(pty="m")

#
plot(scale(resid(lm.dta5)) ~ fitted(lm.dta5), 
     ylim=c(-3.5, 3.5), type="n",
     xlab="Fitted values", ylab="Standardized residuals") 

# 
points(fitted(lm.dta5), scale(resid(lm.dta5)), lwd = 1)

#
grid()

# add a horizontal red dash line
abline(h=0, lty=2, lwd = 2, col="red")

# lmer
summary(result5 <- lmer(language ~ viq + ses + (1|id)+(1|school:id),data = dta5,control=lmerControl(check.nobs.vs.nlev = "ignore",
                                                                                                   check.nobs.vs.rankZ = "ignore",
                                                                                                   check.nobs.vs.nRE="ignore")))

## Linear mixed model fit by REML ['lmerMod']
## Formula: language ~ viq + ses + (1 | id) + (1 | school:id)
##    Data: dta5
## Control: 
## lmerControl(check.nobs.vs.nlev = "ignore", check.nobs.vs.rankZ = "ignore",  
##     check.nobs.vs.nRE = "ignore")
## 
## REML criterion at convergence: 15380.9
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.37817 -0.38787  0.03846  0.42034  2.18933 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  id        (Intercept) 11.76    3.429   
##  school:id (Intercept) 19.58    4.425   
##  Residual              17.25    4.154   
## Number of obs: 2287, groups:  id, 2287; school:id, 2287
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  8.33128    0.85416    9.75
## viq          2.40542    0.07430   32.38
## ses          0.14877    0.01409   10.56
## 
## Correlation of Fixed Effects:
##     (Intr) viq   
## viq -0.884       
## ses -0.133 -0.317

#
plot(scale(resid(result5)) ~ fitted(result5), 
     ylim=c(-3.5, 3.5), type="n",
     xlab="Fitted values", ylab="Standardized residuals") 

# 
points(fitted(result5), scale(resid(result5)), lwd = 1)

# add a horizontal red dash line
abline(h=0, lty=2, lwd = 2, col="red")

Q6

#
dta6 <- minn38
str(dta6)

## 'data.frame':    168 obs. of  5 variables:
##  $ hs : Factor w/ 3 levels "L","M","U": 1 1 1 1 1 1 1 1 1 1 ...
##  $ phs: Factor w/ 4 levels "C","E","N","O": 1 1 1 1 1 1 1 3 3 3 ...
##  $ fol: Factor w/ 7 levels "F1","F2","F3",..: 1 2 3 4 5 6 7 1 2 3 ...
##  $ sex: Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ f  : int  87 72 52 88 32 14 20 3 6 17 ...

##number of female high school graduates

sum(dta6$sex == "F")

## [1] 84

##number of female high school graduates enrolled in college

sum(dta6$sex == "F" & dta6$phs == "C")

## [1] 21

##The distributions of high school graduates (frequecies or counts) in 1938 by father's occupational status

dta6_fol <- as.data.frame(matrix(0,7,2))


for (i in 1:7){
  dta6_fol[i,2] = as.numeric(sum(dta6$fol == levels(dta6$fol)[i]))
}

dta6_fol[,1] <- paste("F",1:7,sep = "")

dta6_fol$V1 <- as.factor(dta6_fol$V1)

plot(dta6_fol$V1,dta6_fol$V2,ylab = "Frequency")

##The distributions of male high school graduates (frequecies or counts) in 1938 by post high school status.

dta6_phs <- as.data.frame(matrix(0,4,2))

for (i in 1:4){
  dta6_phs[i,2] = as.numeric(sum(dta6$phs == levels(dta6$phs)[i]))
}

dta6_phs[,1] <- c("C","E","N","O")

dta6_phs$V1 <- as.factor(dta6_phs$V1)

plot(dta6_phs$V1,dta6_phs$V2,ylab = "Frequency")

Q7

#
dta7 <- sleep
str(dta7)

## 'data.frame':    20 obs. of  3 variables:
##  $ extra: num  0.7 -1.6 -0.2 -1.2 -0.1 3.4 3.7 0.8 0 2 ...
##  $ group: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ID   : Factor w/ 10 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...

## "?sleep" give us the detail of sleep data

#split the grpha into 2 by panels
par(mfrow=c(1, 2))

#
dta7_1 <- dta7[dta7$group == "1",]

dta7_2 <- dta7[dta7$group == "2",]

#individual's extra hours of sleep using drug 1
boxplot(dta7_1$extra,ylim = c(-2,7),main = "Drug 1")

#individual's extra hours of sleep using drug 2
boxplot(dta7_2$extra,ylim = c(-2,7),main = "Drug 2")

par(mfrow=c(1, 1))

#contrast individual's extra hours of sleep using drug 1 & 2

plot(dta7_1$extra,dta7_2$extra,type = "p",xlim = c(-6,6),ylim = c(-6,6))
abline(a = 0, b = 1, col = "red")

#paired t test(statistically different)

t.test(dta7_1$extra,dta7_2$extra,paired = T)

## 
##  Paired t-test
## 
## data:  dta7_1$extra and dta7_2$extra
## t = -4.0621, df = 9, p-value = 0.002833
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.4598858 -0.7001142
## sample estimates:
## mean of the differences 
##                   -1.58

#subjects whose weight is greater or equaviewl to 5

dta7[dta7$extra > 5,]

##    extra group ID
## 17   5.5     2  7

#all subjects whose extra sleep time is more than 1 hour

dta7[dta7$extra > 1,]

##    extra group ID
## 6    3.4     1  6
## 7    3.7     1  7
## 10   2.0     1 10
## 11   1.9     2  1
## 13   1.1     2  3
## 16   4.4     2  6
## 17   5.5     2  7
## 18   1.6     2  8
## 19   4.6     2  9
## 20   3.4     2 10

#all Group 1 subjects whose extra sleep time is reduced by 1 hour or more

dta7_1[dta7_1$extra < -1,]

##   extra group ID
## 2  -1.6     1  2
## 4  -1.2     1  4

Q8

heatmap(Harman74.cor$cov)

A correlation matrix of 24 psychological tests given to 145 seventh and eight-grade children in a Chicago suburb by Holzinger and Swineford.

The white area represent perfect positive correlation, the color looks redder representing smaller correlation.

Q9

#
dta9 <- read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/readingtimes.txt",header = T)
str(dta9)

## 'data.frame':    7 obs. of  14 variables:
##  $ Snt : int  1 2 3 4 5 6 7
##  $ Sp  : int  1 2 3 4 5 6 7
##  $ Wrds: int  13 16 9 9 10 18 6
##  $ New : int  1 3 2 2 3 4 1
##  $ S01 : num  3.43 6.48 1.71 3.68 4 ...
##  $ S02 : num  2.79 5.41 2.34 3.71 2.9 ...
##  $ S03 : num  4.16 4.49 3.02 2.87 2.99 ...
##  $ S04 : num  3.07 5.06 2.46 2.73 2.67 ...
##  $ S05 : num  3.62 9.29 6.04 4.21 3.88 ...
##  $ S06 : num  3.16 5.64 2.46 6.24 3.22 ...
##  $ S07 : num  3.23 8.36 4.92 3.72 3.14 ...
##  $ S08 : num  7.16 4.31 3.37 6.33 6.14 ...
##  $ S09 : num  1.54 2.95 1.38 1.15 2.76 ...
##  $ S10 : num  4.06 6.65 2.18 3.66 3.33 ...

dta9L <- gather(dta9,key = "ID",value = "Time",5:14)
str(dta9L)

## 'data.frame':    70 obs. of  6 variables:
##  $ Snt : int  1 2 3 4 5 6 7 1 2 3 ...
##  $ Sp  : int  1 2 3 4 5 6 7 1 2 3 ...
##  $ Wrds: int  13 16 9 9 10 18 6 13 16 9 ...
##  $ New : int  1 3 2 2 3 4 1 1 3 2 ...
##  $ ID  : chr  "S01" "S01" "S01" "S01" ...
##  $ Time: num  3.43 6.48 1.71 3.68 4 ...

dta9L$ID <- as.factor(dta9L$ID)

#plot mean time of each subject

mean_dta9L <- as.data.frame(matrix(0,10,2))
for (i in 1:10){
  mean_dta9L[i,2] = mean(dta9L[dta9L$ID == levels(dta9L$ID)[i],]$Time)
}
mean_dta9L[,1] <- levels(dta9L$ID)
mean_dta9L$V1 <- as.factor(mean_dta9L$V1)
plot(V2~V1,data = mean_dta9L,ylab = "Subject's mean time")
abline(h = mean(dta9L$Time),col = "red") #grand mean

#the time taken to read a word (calculate by 7 condition)

wrd_time <- as.data.frame(matrix(0,7,5))
wrd_time[1:7,1:4] <- dta9L[1:7,1:4]
colnames(wrd_time) <- c("Snt","Sp","Wrds","New","Time")
for (i in 1:7){
  wrd_time[i,5] <- mean(dta9L[dta9L$Snt == i,]$Time)/wrd_time[i,3]
}

wrd_time

##   Snt Sp Wrds New      Time
## 1   1  1   13   1 0.2787231
## 2   2  2   16   3 0.3665812
## 3   3  3    9   2 0.3319444
## 4   4  4    9   2 0.4255889
## 5   5  5   10   3 0.3504500
## 6   6  6   18   4 0.4680056
## 7   7  7    6   1 0.4270833

#boxplot

boxplot(wrd_time$Time)

#plot conditions by time

plot(Time~Snt,data = wrd_time)

#lmer

summary(result9 <- lmer(Time ~  Wrds + New  + Sp + (1|ID),data = dta9L))

## Linear mixed model fit by REML ['lmerMod']
## Formula: Time ~ Wrds + New + Sp + (1 | ID)
##    Data: dta9L
## 
## REML criterion at convergence: 259.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.8986 -0.6825 -0.0814  0.4622  3.0954 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  ID       (Intercept) 0.5524   0.7432  
##  Residual             1.8955   1.3768  
## Number of obs: 70, groups:  ID, 10
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) -2.58695    0.80471  -3.215
## Wrds         0.45859    0.07349   6.240
## New          0.15163    0.27644   0.549
## Sp           0.33337    0.10685   3.120
## 
## Correlation of Fixed Effects:
##      (Intr) Wrds   New   
## Wrds -0.751              
## New   0.383 -0.807       
## Sp   -0.716  0.617 -0.594

#
plot(scale(resid(result9)) ~ fitted(result9), 
     ylim=c(-3.5, 3.5), type="n",
     xlab="Fitted values", ylab="Standardized residuals") 

# 
points(fitted(result9), scale(resid(result9)), lwd = 1)

# add a horizontal red dash line
abline(h=0, lty=2, lwd = 2, col="red")

dataM_week1

Wen_Sung Cheng

2017年3月3日

Q2

Q3

Q4

Q5

Q6

Q7

Q8

A correlation matrix of 24 psychological tests given to 145 seventh and eight-grade children in a Chicago suburb by Holzinger and Swineford.

The white area represent perfect positive correlation, the color looks redder representing smaller correlation.

Q9