1 Homework 1

Concerning the anorexia{MASS}, explain the difference between

  • anorexia[,2]
  • anorexia[“Prewt”]

and

  • lm(Postwt ~ Treat, data=anorexia)
  • lm(Postwt ~ as.numeric(Treat), data=anorexia)
library(MASS)
# str(anorexia)

# 擷取anorexia資料中的第2欄位,擷取出來的是vector
head(anorexia[,2])
## [1] 80.7 89.4 91.8 74.0 78.1 88.3
# is.vector(anorexia[,2])
# TRUE

# 擷取anorexia資料中的Prewt欄位,擷取出來的是data frame
head(anorexia["Prewt"])
##   Prewt
## 1  80.7
## 2  89.4
## 3  91.8
## 4  74.0
## 5  78.1
## 6  88.3
# is.data.frame(anorexia["Prewt"])
# TRUE
# Although the Treat is integer type
typeof(anorexia$Treat)
## [1] "integer"
# but the structure show Treat is a factor with three level: "CBT","Cont","FT"
str(anorexia)
## 'data.frame':    72 obs. of  3 variables:
##  $ Treat : Factor w/ 3 levels "CBT","Cont","FT": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Prewt : num  80.7 89.4 91.8 74 78.1 88.3 87.3 75.1 80.6 78.4 ...
##  $ Postwt: num  80.2 80.1 86.4 86.3 76.1 78.1 75.1 86.7 73.5 84.6 ...
# Treat as a factor type
lm(Postwt ~ Treat, data=anorexia)  
## 
## Call:
## lm(formula = Postwt ~ Treat, data = anorexia)
## 
## Coefficients:
## (Intercept)    TreatCont      TreatFT  
##      85.697       -4.589        4.798
# Treat as a numeric type
# this is definitely not a proper data analysis for the context of the 'anorexia' 
lm(Postwt ~ as.numeric(Treat), data=anorexia)
## 
## Call:
## lm(formula = Postwt ~ as.numeric(Treat), data = anorexia)
## 
## Coefficients:
##       (Intercept)  as.numeric(Treat)  
##            82.036              1.711

2 Homework 2

Find all possible sums from rolling three dice using R. If possible, construct a histogram for the sum of three dice. Is this a probability histogram or an empirical histogram?

2.1 recorrect

# All possible dice rolls:
dice_sums <- outer(outer(1:6, 1:6, FUN = '+'), 1:6, FUN = '+')
dice_sums
## , , 1
## 
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    3    4    5    6    7    8
## [2,]    4    5    6    7    8    9
## [3,]    5    6    7    8    9   10
## [4,]    6    7    8    9   10   11
## [5,]    7    8    9   10   11   12
## [6,]    8    9   10   11   12   13
## 
## , , 2
## 
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    4    5    6    7    8    9
## [2,]    5    6    7    8    9   10
## [3,]    6    7    8    9   10   11
## [4,]    7    8    9   10   11   12
## [5,]    8    9   10   11   12   13
## [6,]    9   10   11   12   13   14
## 
## , , 3
## 
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    5    6    7    8    9   10
## [2,]    6    7    8    9   10   11
## [3,]    7    8    9   10   11   12
## [4,]    8    9   10   11   12   13
## [5,]    9   10   11   12   13   14
## [6,]   10   11   12   13   14   15
## 
## , , 4
## 
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    6    7    8    9   10   11
## [2,]    7    8    9   10   11   12
## [3,]    8    9   10   11   12   13
## [4,]    9   10   11   12   13   14
## [5,]   10   11   12   13   14   15
## [6,]   11   12   13   14   15   16
## 
## , , 5
## 
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    7    8    9   10   11   12
## [2,]    8    9   10   11   12   13
## [3,]    9   10   11   12   13   14
## [4,]   10   11   12   13   14   15
## [5,]   11   12   13   14   15   16
## [6,]   12   13   14   15   16   17
## 
## , , 6
## 
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    8    9   10   11   12   13
## [2,]    9   10   11   12   13   14
## [3,]   10   11   12   13   14   15
## [4,]   11   12   13   14   15   16
## [5,]   12   13   14   15   16   17
## [6,]   13   14   15   16   17   18
head(expand.grid(1:6, 1:6, 1:6), 10)
##    Var1 Var2 Var3
## 1     1    1    1
## 2     2    1    1
## 3     3    1    1
## 4     4    1    1
## 5     5    1    1
## 6     6    1    1
## 7     1    2    1
## 8     2    2    1
## 9     3    2    1
## 10    4    2    1
# Probability of sum of three diced = 7: 15/216
prob <- mean(dice_sums== 7)
prob
## [1] 0.06944444
all <- c(as.matrix(dice_sums)) |> 
  hist(prob = T, main="", xlab ="sums from rolling three dice", 
       breaks=seq(min(dice_sums)-0.5, max(dice_sums)+0.5, by=1))

       axis(1, at=seq(3,18,by=1), labels=seq(3, 18,by=1))

plot(prop.table(table(dice_sums)),ylab = 'Probability', main = 'Probability of sum of rolling three Dice')

2.2 method from Google(wrong)

set.seed(1234)
# create a function for dice sampling 
dice.sum <- function(n.dice)
  {
  dice <- sample(1:6, size = n.dice, replace = TRUE)
  return(sum(dice))
  }
# all possible sums from rolling three dice based on 10000 rolls
dice <- replicate(10000, dice.sum(3))
ftable(dice)
## dice    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18
##                                                                                     
##        36  131  286  465  723  968 1148 1229 1257 1138 1002  688  453  289  144   43
# histogram plot
dice |> hist(prob=T, main="")

2.3 method from class(wrong)

sample(3:18, size=10000, replace=T, 
       prob=c(1/216, 3/216, 6/216, 10/216, 15/216, 21/216, 25/216, 27/216, 
              27/216, 25/216, 21/216, 15/216, 10/216, 6/216, 3/216, 1/216)) |>
  hist(prob=T, main="", xlab ="")

# Probability histogram represents "chance".

3 Homework 3

The IQ and behavior problem page has a dataset and a script of R code chunks. Generate a markdown file from the script to push the output in HTML for posting to course Moodle site. Explain what the code chunks do with your comments in the markdown file.

3.1 Input data

# An R script for IQ_Beh data set
dta <- read.table("IQ_Beh.txt", header = T, row.names = 1)

# header = T
## IQ_Beh.txt的第一列變成 dta 的 column name

# row.names = 1
## row.names: Set Row Names for Data Frames
## IQ_Beh.txt的第一欄變成 dta 的 Row Names

3.2 Checking data

# structure of the dta: 94 obs. of 3 variables
# Dep: D = Depressed, N = Non-Deparessed
# IQ = Child's IQ
# BP = Rating of child's behavioral problem
str(dta)
## 'data.frame':    94 obs. of  3 variables:
##  $ Dep: chr  "N" "N" "N" "N" ...
##  $ IQ : int  103 124 124 104 96 92 124 99 92 116 ...
##  $ BP : int  4 12 9 3 3 3 6 4 3 9 ...
# the top 6 rows of the dta data
head(dta)
##   Dep  IQ BP
## 1   N 103  4
## 2   N 124 12
## 3   N 124  9
## 4   N 104  3
## 5   D  96  3
## 6   N  92  3
# the class of the dta
class(dta)
## [1] "data.frame"
# Dimensions of dta  (94 rows * 3 columns)
dim(dta)
## [1] 94  3
# Show the column name of dta
names(dta)
## [1] "Dep" "IQ"  "BP"
# check the BP of dta is vector or not 
is.vector(dta$BP)
## [1] TRUE
# show the first row of dta
dta[1, ]
##   Dep  IQ BP
## 1   N 103  4
# show the 1 to 3 row of the IQ columns in dta
dta[1:3, "IQ"]
## [1] 103 124 124
# 整個資料的先以BP的由小排序到大,然後只顯示最後6個值
# 換言之是看BP值的最高6名是誰
tail(dta[order(dta$BP), ])
##    Dep  IQ BP
## 16   N  89 11
## 58   N 117 11
## 66   N 126 11
## 2    N 124 12
## 73   D  99 13
## 12   D  22 17
# 整個資料的先以BP的由大排序到小,然後只顯示最後4個值
# 換言之是看BP值的最低4名是誰
tail(dta[order(-dta$BP), ], 4)
##    Dep  IQ BP
## 77   N 124  1
## 80   N 121  1
## 24   N 106  0
## 75   N 122  0

3.3 Plot

3.3.1 Histogram

# histogram of IQ (大部分坐落在100-120)
with(dta, hist(IQ, xlab = "IQ", main = ""))

3.3.2 Boxplot

# boxplot 
# compare the behavior problem score in 2 group(depression versus non-depression)

boxplot(BP ~ Dep, data = dta, 
        xlab = "Depression", 
        ylab = "Behavior problem score")

3.3.3 Scatter plot

# 原本的Dep是character type,需先變成factor type,col才可以判別
dta$Dep<-as.factor(dta$Dep)

# Scatter plot 
# pch = 點的種類
# col = 顏色分類
# xlab = x軸的標題
# ylab = y軸的標題
plot(IQ ~ BP, data = dta, pch = 20, col = dta$Dep, xlab = "Behavior problem score", ylab = "IQ")
grid()

3.3.4 Scatter plot with regression line

# type = "n" : does not produce any points or lines.
plot(BP ~ IQ, data = dta, type = "n",
     ylab = "Behavior problem score", xlab = "IQ")
# text 把點換成文字呈現,cex字體大小
text(dta$IQ, dta$BP, labels = dta$Dep, cex = 0.5)
# Depression group 的迴歸線
abline(lm(BP ~ IQ, data = dta, subset = Dep == "D"))
# non-depression group 的迴歸線 (lty line types)
abline(lm(BP ~ IQ, data = dta, subset = Dep == "N"), lty = 2)

4 Homework 4

The usBirths2015.txt is a dataset of monthly births in the US in 2015. Summarize the number of births by season.

# library(dplyr)
dta4 <- read.table("usBirths2015.txt", header = T)
# dta4 <- dta4 |> mutate(season = ifelse(month %in% c("December", "January", "February"), "Winter", ifelse(month %in% c("March", "April", "May"), "Spring", ifelse(month %in% c("June", "July", "August"), "Summer","Fall"))))
dta4$season <- c("Winter", "Winter", "Spring", "Spring", "Spring", "Summer", 
                 "Summer", "Summer", "Fall", "Fall", "Fall", "Winter")
dta4$season <- as.factor(dta4$season)

aggregate(dta4$birth ~ dta4$season, FUN=sum)
##   dta4$season dta4$birth
## 1        Fall    1005343
## 2      Spring     977672
## 3      Summer    1035747
## 4      Winter     959735

5 Homework 5

Ten subjects read a paragraph consisting of seven sentences. The reading time (in seconds) for each sentence was the outcome measure. The predictors are the serial position of the sentence (Sp), the number of words in the sentences (Wrds), and the number of new arguments in the sentence (New).

5.1 Input data

dta5 <- read.table("readingtimes.txt", header = T)

5.2 Rank

  1. Rank subjects by their reading speed
rank(colMeans(dta5[,5:14]))
## S01 S02 S03 S04 S05 S06 S07 S08 S09 S10 
##   5   4   2   3  10   9   8   7   1   6

5.3 Reading speed

  1. Estimate, on average, how long does it take to read a word.
# how long does it take to read a word for each students
colMeans(dta5[,5:14])/mean(dta5$Wrds)
##       S01       S02       S03       S04       S05       S06       S07       S08 
## 0.3569259 0.3324568 0.3261728 0.3266049 0.4856790 0.4641852 0.4518395 0.4330864 
##       S09       S10 
## 0.2368765 0.3883457
# how long does it take to read a word for all of students
mean(colMeans(dta5[,5:14])/mean(dta5$Wrds))
## [1] 0.3802173

5.4 Reference

Source: Lorch, R.F., & Myers, J.L. (1990). Regression analyses of repeated measures data in cognitive research. Journal of Experimental Psychology: Learning, Memory, and Cognition. 16, 149-157.