Concerning the anorexia{MASS}, explain the difference between
and
library(MASS)
# str(anorexia)
# 擷取anorexia資料中的第2欄位,擷取出來的是vector
head(anorexia[,2])
## [1] 80.7 89.4 91.8 74.0 78.1 88.3
# is.vector(anorexia[,2])
# TRUE
# 擷取anorexia資料中的Prewt欄位,擷取出來的是data frame
head(anorexia["Prewt"])
## Prewt
## 1 80.7
## 2 89.4
## 3 91.8
## 4 74.0
## 5 78.1
## 6 88.3
# is.data.frame(anorexia["Prewt"])
# TRUE
# Although the Treat is integer type
typeof(anorexia$Treat)
## [1] "integer"
# but the structure show Treat is a factor with three level: "CBT","Cont","FT"
str(anorexia)
## 'data.frame': 72 obs. of 3 variables:
## $ Treat : Factor w/ 3 levels "CBT","Cont","FT": 2 2 2 2 2 2 2 2 2 2 ...
## $ Prewt : num 80.7 89.4 91.8 74 78.1 88.3 87.3 75.1 80.6 78.4 ...
## $ Postwt: num 80.2 80.1 86.4 86.3 76.1 78.1 75.1 86.7 73.5 84.6 ...
# Treat as a factor type
lm(Postwt ~ Treat, data=anorexia)
##
## Call:
## lm(formula = Postwt ~ Treat, data = anorexia)
##
## Coefficients:
## (Intercept) TreatCont TreatFT
## 85.697 -4.589 4.798
# Treat as a numeric type
# this is definitely not a proper data analysis for the context of the 'anorexia'
lm(Postwt ~ as.numeric(Treat), data=anorexia)
##
## Call:
## lm(formula = Postwt ~ as.numeric(Treat), data = anorexia)
##
## Coefficients:
## (Intercept) as.numeric(Treat)
## 82.036 1.711
Find all possible sums from rolling three dice using R. If possible, construct a histogram for the sum of three dice. Is this a probability histogram or an empirical histogram?
# All possible dice rolls:
<- outer(outer(1:6, 1:6, FUN = '+'), 1:6, FUN = '+')
dice_sums dice_sums
## , , 1
##
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 3 4 5 6 7 8
## [2,] 4 5 6 7 8 9
## [3,] 5 6 7 8 9 10
## [4,] 6 7 8 9 10 11
## [5,] 7 8 9 10 11 12
## [6,] 8 9 10 11 12 13
##
## , , 2
##
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 4 5 6 7 8 9
## [2,] 5 6 7 8 9 10
## [3,] 6 7 8 9 10 11
## [4,] 7 8 9 10 11 12
## [5,] 8 9 10 11 12 13
## [6,] 9 10 11 12 13 14
##
## , , 3
##
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 5 6 7 8 9 10
## [2,] 6 7 8 9 10 11
## [3,] 7 8 9 10 11 12
## [4,] 8 9 10 11 12 13
## [5,] 9 10 11 12 13 14
## [6,] 10 11 12 13 14 15
##
## , , 4
##
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 6 7 8 9 10 11
## [2,] 7 8 9 10 11 12
## [3,] 8 9 10 11 12 13
## [4,] 9 10 11 12 13 14
## [5,] 10 11 12 13 14 15
## [6,] 11 12 13 14 15 16
##
## , , 5
##
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 7 8 9 10 11 12
## [2,] 8 9 10 11 12 13
## [3,] 9 10 11 12 13 14
## [4,] 10 11 12 13 14 15
## [5,] 11 12 13 14 15 16
## [6,] 12 13 14 15 16 17
##
## , , 6
##
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 8 9 10 11 12 13
## [2,] 9 10 11 12 13 14
## [3,] 10 11 12 13 14 15
## [4,] 11 12 13 14 15 16
## [5,] 12 13 14 15 16 17
## [6,] 13 14 15 16 17 18
head(expand.grid(1:6, 1:6, 1:6), 10)
## Var1 Var2 Var3
## 1 1 1 1
## 2 2 1 1
## 3 3 1 1
## 4 4 1 1
## 5 5 1 1
## 6 6 1 1
## 7 1 2 1
## 8 2 2 1
## 9 3 2 1
## 10 4 2 1
# Probability of sum of three diced = 7: 15/216
<- mean(dice_sums== 7)
prob prob
## [1] 0.06944444
<- c(as.matrix(dice_sums)) |>
all hist(prob = T, main="", xlab ="sums from rolling three dice",
breaks=seq(min(dice_sums)-0.5, max(dice_sums)+0.5, by=1))
axis(1, at=seq(3,18,by=1), labels=seq(3, 18,by=1))
plot(prop.table(table(dice_sums)),ylab = 'Probability', main = 'Probability of sum of rolling three Dice')
set.seed(1234)
# create a function for dice sampling
<- function(n.dice)
dice.sum
{<- sample(1:6, size = n.dice, replace = TRUE)
dice return(sum(dice))
}# all possible sums from rolling three dice based on 10000 rolls
<- replicate(10000, dice.sum(3))
dice ftable(dice)
## dice 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
##
## 36 131 286 465 723 968 1148 1229 1257 1138 1002 688 453 289 144 43
# histogram plot
|> hist(prob=T, main="") dice
sample(3:18, size=10000, replace=T,
prob=c(1/216, 3/216, 6/216, 10/216, 15/216, 21/216, 25/216, 27/216,
27/216, 25/216, 21/216, 15/216, 10/216, 6/216, 3/216, 1/216)) |>
hist(prob=T, main="", xlab ="")
# Probability histogram represents "chance".
The IQ and behavior problem page has a dataset and a script of R code chunks. Generate a markdown file from the script to push the output in HTML for posting to course Moodle site. Explain what the code chunks do with your comments in the markdown file.
# An R script for IQ_Beh data set
<- read.table("IQ_Beh.txt", header = T, row.names = 1)
dta
# header = T
## IQ_Beh.txt的第一列變成 dta 的 column name
# row.names = 1
## row.names: Set Row Names for Data Frames
## IQ_Beh.txt的第一欄變成 dta 的 Row Names
# structure of the dta: 94 obs. of 3 variables
# Dep: D = Depressed, N = Non-Deparessed
# IQ = Child's IQ
# BP = Rating of child's behavioral problem
str(dta)
## 'data.frame': 94 obs. of 3 variables:
## $ Dep: chr "N" "N" "N" "N" ...
## $ IQ : int 103 124 124 104 96 92 124 99 92 116 ...
## $ BP : int 4 12 9 3 3 3 6 4 3 9 ...
# the top 6 rows of the dta data
head(dta)
## Dep IQ BP
## 1 N 103 4
## 2 N 124 12
## 3 N 124 9
## 4 N 104 3
## 5 D 96 3
## 6 N 92 3
# the class of the dta
class(dta)
## [1] "data.frame"
# Dimensions of dta (94 rows * 3 columns)
dim(dta)
## [1] 94 3
# Show the column name of dta
names(dta)
## [1] "Dep" "IQ" "BP"
# check the BP of dta is vector or not
is.vector(dta$BP)
## [1] TRUE
# show the first row of dta
1, ] dta[
## Dep IQ BP
## 1 N 103 4
# show the 1 to 3 row of the IQ columns in dta
1:3, "IQ"] dta[
## [1] 103 124 124
# 整個資料的先以BP的由小排序到大,然後只顯示最後6個值
# 換言之是看BP值的最高6名是誰
tail(dta[order(dta$BP), ])
## Dep IQ BP
## 16 N 89 11
## 58 N 117 11
## 66 N 126 11
## 2 N 124 12
## 73 D 99 13
## 12 D 22 17
# 整個資料的先以BP的由大排序到小,然後只顯示最後4個值
# 換言之是看BP值的最低4名是誰
tail(dta[order(-dta$BP), ], 4)
## Dep IQ BP
## 77 N 124 1
## 80 N 121 1
## 24 N 106 0
## 75 N 122 0
# histogram of IQ (大部分坐落在100-120)
with(dta, hist(IQ, xlab = "IQ", main = ""))
# boxplot
# compare the behavior problem score in 2 group(depression versus non-depression)
boxplot(BP ~ Dep, data = dta,
xlab = "Depression",
ylab = "Behavior problem score")
# 原本的Dep是character type,需先變成factor type,col才可以判別
$Dep<-as.factor(dta$Dep)
dta
# Scatter plot
# pch = 點的種類
# col = 顏色分類
# xlab = x軸的標題
# ylab = y軸的標題
plot(IQ ~ BP, data = dta, pch = 20, col = dta$Dep, xlab = "Behavior problem score", ylab = "IQ")
grid()
# type = "n" : does not produce any points or lines.
plot(BP ~ IQ, data = dta, type = "n",
ylab = "Behavior problem score", xlab = "IQ")
# text 把點換成文字呈現,cex字體大小
text(dta$IQ, dta$BP, labels = dta$Dep, cex = 0.5)
# Depression group 的迴歸線
abline(lm(BP ~ IQ, data = dta, subset = Dep == "D"))
# non-depression group 的迴歸線 (lty line types)
abline(lm(BP ~ IQ, data = dta, subset = Dep == "N"), lty = 2)
The usBirths2015.txt is a dataset of monthly births in the US in 2015. Summarize the number of births by season.
# library(dplyr)
<- read.table("usBirths2015.txt", header = T)
dta4 # dta4 <- dta4 |> mutate(season = ifelse(month %in% c("December", "January", "February"), "Winter", ifelse(month %in% c("March", "April", "May"), "Spring", ifelse(month %in% c("June", "July", "August"), "Summer","Fall"))))
$season <- c("Winter", "Winter", "Spring", "Spring", "Spring", "Summer",
dta4"Summer", "Summer", "Fall", "Fall", "Fall", "Winter")
$season <- as.factor(dta4$season)
dta4
aggregate(dta4$birth ~ dta4$season, FUN=sum)
## dta4$season dta4$birth
## 1 Fall 1005343
## 2 Spring 977672
## 3 Summer 1035747
## 4 Winter 959735
Ten subjects read a paragraph consisting of seven sentences. The reading time (in seconds) for each sentence was the outcome measure. The predictors are the serial position of the sentence (Sp), the number of words in the sentences (Wrds), and the number of new arguments in the sentence (New).
<- read.table("readingtimes.txt", header = T) dta5
rank(colMeans(dta5[,5:14]))
## S01 S02 S03 S04 S05 S06 S07 S08 S09 S10
## 5 4 2 3 10 9 8 7 1 6
# how long does it take to read a word for each students
colMeans(dta5[,5:14])/mean(dta5$Wrds)
## S01 S02 S03 S04 S05 S06 S07 S08
## 0.3569259 0.3324568 0.3261728 0.3266049 0.4856790 0.4641852 0.4518395 0.4330864
## S09 S10
## 0.2368765 0.3883457
# how long does it take to read a word for all of students
mean(colMeans(dta5[,5:14])/mean(dta5$Wrds))
## [1] 0.3802173
Source: Lorch, R.F., & Myers, J.L. (1990). Regression analyses of repeated measures data in cognitive research. Journal of Experimental Psychology: Learning, Memory, and Cognition. 16, 149-157.