hw1

The Law School Admission Test (LSAT) consists of a number of dichotomous items, which can be answered correctly or incorrecty. These items are constructed so that a person who scores high on the test is more likely to do well in a law school. The data set LSAT{ltm} contains answers by 1,000 persons to 5 test items. Generate a plot similar to the one below:

pacman::p_load(dplyr, ltm, tidyverse)
# input data
dta1 <- ltm::LSAT

# 使用colMeans計算每個item正確率
colMeans(dta1)

Item 1 Item 2 Item 3 Item 4 Item 5 
 0.924  0.709  0.553  0.763  0.870

# 根據正確率進行排序
dta1<-dta1[,order(colMeans(dta1))]

# 利用apply針對each row計算正確題數
dta1$total <- apply(dta1, 1, sum) #1=rowsum

# 創造一個矩陣存，後續要計算的"不同總分各題答對率"
pm <- data.frame(matrix(data=NA, 6, 6))

# 不同總分時各題的答對率
# 總分0:5, subset總分為0,1,2..5, 利用sapply計算每一column的 mean
for(i in 0:5){
  pm[(i+1),] <- sapply(subset(dta1, total==i), mean)
}
# 將原變項名複製到pm
names(pm) <- names(dta1)

# matplot 可使用矩陣繪多組觀測值
matplot(pm[-6], pch=1:5, col=1:5 , type='b')
legend("topleft", c("Item1","Item2","Item3","Item4","Item5"), pch=1:5, col=1:5, bty="n", lty=1)

hw2

Complete the R script to create the following plot

generate data

# height, weight and bmi

# generate weights
wt <- seq(40, 160, length.out=600) #40到60間產生600個等距數值

# generate heights
ht <- seq(1.40, 2.00, length.out=600)

# cross wt by ht
wtht <- expand.grid(wt=wt, ht=ht) #產生600*600種組合

# function to compute bmi from wt and ht
bmi_wh <- function(w, h) {w/(h*h)} 

# generate data matrix (datavalues, nrows, ncolums)
bmiwtht <- matrix(bmi_wh(wtht$wt, wtht$ht), length(ht), length(wt))

.filled.contour approach

#
colorbar <- c(rgb(1,1,1,0), rgb(1,1,1,0), rgb(.94,.50,.50,.5))
contour(wt, ht, bmiwtht, levels = c(18.5, 24.9, 30), drawlabels=F,
        ylab="Height (m)",
        xlab="Weight (kg)",
        main="BMI categories by height and weight")

# add grid lines
grid()

# annotate the bmi categories
text(105, 1.8, "Obese", cex=1, srt=45) # srt= rotate the text
text(92, 1.8, "Overweight", cex=1, srt=45)
text(75, 1.8, "Normal", cex=1, srt=45)
text(55, 1.8, "Underweight", cex=1, srt=55)

# 一定要放在最後，才能有色塊蓋字的效果
.filled.contour(wt, ht, bmiwtht,levels = c(18.5, 24.9, 30, 100), col= colorbar)

image approach

colorbar <- c(rgb(1,1,1,0), rgb(1,1,1,0), rgb(.94,.50,.50,0.3))

# draw the contour
contour(wt, ht, bmiwtht, bty='n', 
        levels = c(18.5, 24.9, 30), # 依BMI劃出三條等值線
        drawlabels=F,
        ylab="Height (m)",
        xlab="Weight (kg)",
        main="BMI categories by height and weight")

# add grid lines
grid()

# annotate the bmi categories
text(105, 1.8, "Obese", cex=1, srt=45) # srt= rotate the text
text(92, 1.8, "Overweight", cex=1, srt=45)
text(75, 1.8, "Normal", cex=1, srt=45)
text(55, 1.8, "Underweight", cex=1, srt=55)

#
image(wt, ht, bmiwtht,col= colorbar, breaks= c(18.5, 24.9, 30, 100), bty="n",
      ylab="Height (m)",
      xlab="Weight (kg)",
      main="BMI categories by height and weight",
      axes=T,
      xlim = c(40,160),
      ylim = c(1.4,2.0), add=T)

hw3

Use the free recall data to replicate the figure

reported in Murdock, B. B. (1962). The serial position effect of free recall. Journal of Experimental Psychology, 64, 482-488.

讀入單一檔案

# files in folder
list.files("C:/Users/user/Desktop/p_desktop/data management/1108 R base graphics/Murd62", pattern = "fr")

[1] "fr10-2.txt" "fr15-2.txt" "fr20-1.txt" "fr20-2.txt" "fr30-1.txt"
[6] "fr40-1.txt"

# 測試讀一個檔進來看看
## read.table會發現讀進來欄位數有問題
fr102 <- read.table("C:/Users/user/Desktop/p_desktop/data management/1108 R base graphics/Murd62/fr10-2.txt", header=F, fill = TRUE)
## 一個檔案的處理方式
pacman::p_load(naniar, stringr, janitor)
fr102s<-scan("C:/Users/user/Desktop/p_desktop/data management/1108 R base graphics/Murd62/fr10-2.txt", what="numeric", sep="\t")
fr102ss<-str_split_fixed(sapply(fr102s, strsplit, "_and_") , " ", 12)
fr102ssr <- fr102ss[,!apply(fr102ss == "", 2, all)]

用read.table讀進來會有部分資料被截斷

改用scan的方式讀入，並計算最大字串長度，藉此猜測應該要讀入幾個欄位

fr10-2第1201欄只有一個0，於是我手動刪除了

讀入多個檔案

# list.files
fls <-list.files(path = "C:/Users/user/Desktop/p_desktop/data management/1108 R base graphics/Murd62", pattern = "fr")

# give it path
fL <- paste0("C:/Users/user/Desktop/p_desktop/data management/1108 R base graphics/Murd62/", fls)

# 
pacman::p_load(stringr, dplyr,purrr)


# 計算每個檔案最大欄為長度
## 由於不確定每個檔案的column數，透過str_length計算欄位長度
lapply(fL, function(x){scan(x, what="numeric", sep="\t") |> str_length() |> max()}) #dtaleghth=c(24,39,39,42,45,44)

[[1]]
[1] 24

[[2]]
[1] 39

[[3]]
[1] 39

[[4]]
[1] 42

[[5]]
[1] 45

[[6]]
[1] 44

# input data by scan function
ff<-lapply(fL, function(x){scan(x, what="numeric", sep="\t")})

# 用strsplit分割字符串成column
## 根據估計的欄位長度，進行切割(最大欄位長度45，若以全部為"88"估計，最多23欄)
## 針對column內全是空的值刪除
ffs<-lapply(ff, function(x){
  y <- str_split_fixed(sapply(x, strsplit, "_and_") , " ", 23)
  y <-y[,!apply(y == "", 2, all)] 
  as.data.frame(y)
})

# 計算每個item的頻率
## 由於level順序有誤(1,10,2...)因此從新order資料順序
dta3a <-lapply(ffs, function(x) {
  y <- x |> tidyr::pivot_longer(cols = starts_with("V"),names_to = "Serial", values_to = "Item") # 轉long form
  y <- as.data.frame(table(y$Item)/1200) #計算item平均頻率
  y$Var1<-factor(as.numeric(as.character(y$Var1))) 
  y <-y[order(y$Var1),]
})

# 處理NA與不合理值
groupname<-c("10-2","15-2","20-1","20-2","30-1","40-1")
for (i in 1:6){
dta3a[[i]]$group <-groupname[i]
dta3a[[i]][dta3a[[i]]=="88" | dta3a[[i]]==""]<-NA # 將"88"與blank轉為NA
dta3a[[i]]$maxlist <- substr(dta3a[[i]]$group,start = 1, stop = 2)
# 透過找出每一data最大serial position，超出就是na
a<-as.numeric(unlist(dta3a[[i]]$Var1)) 
b<-as.numeric(dta3a[[i]]$maxlist[1])
dta3a[[i]]$Var1<-ifelse(a>b,NA,a)
}

plot

# 設置畫圖區域
## xlim 設大一點，最後一個點就不會超出了
plot(0,0, bty="n", xlim = c(0,42), ylim=c(0,1),
     xlab = "SERIA POSITION",
     ylab = "PROBABILITY OF RECALL",
     xaxs = "i", yaxs ="i") # i, 軸線相連；r,軸線有縫隙

# 用for loop一次畫六個資料的點與線
pchlist<-c(19,1,19,1,1,19) #針對每一資料圖形設定
for (i in 1:6){
points(dta3a[[i]]$Var1, 
       dta3a[[i]]$Freq,
       pch = pchlist[i])
lines(dta3a[[i]]$Var1, 
       dta3a[[i]]$Freq)}

# text label
text(2, 0.9, "10-2") #text(x,y,lable)
lines(c(2, 6), c(0.85, 0.6)) #lines(c(x1,x2),c(y1,y2))

text(14, 0.65, "15-2")
lines(c(12, 13), c(0.61, 0.63))

text(13, 0.48, "20-2")
lines(c(12, 13), c(0.45, 0.3))

text(18.5, 0.44, "20-1")
lines(c(16, 17), c(0.35, 0.4))

text(23, 0.6, "30-1")
lines(c(21.5, 21, 26), c(0.55, 0.52, 0.39))

text(35, 0.8, "40-1")
lines(c(33.5, 33, 37.5), c(0.77, 0.75, 0.59))

# axis (圖上的軸在這邊設定)
axis(1,seq(0, 40, 10), labels = FALSE)
axis(2,seq(0, 1, 0.1), labels = FALSE)

hw4

(Optional) A dataset, raz2005{rmcorr}, contains two repeated measures, on two occasions (Time), of age (in years) and adjusted volume of cerebellar hemispheres from 72 participants. Use it to replicate the following figure in which participants older than 65 years of age at Time One are plotted on a shifted panel and those whose cerebellar volume increased from Time One to Time Two are shown in blue color.

pacman::p_load(rmcorr)

# input data
dta4 <- rmcorr::raz2005
# 根據兩個時間切割資料
dta4list <- split(dta4, dta4$Time)

# plot
plot(0,0, bty="n", xlim = c(20,90), ylim=c(100,170),
     xlab = "Age (year)",
     ylab = "Cerebellar volume",
     xaxs = "r", yaxs ="r")

# 用for loop 畫「所有人」兩個時間的點與箭頭
pchlist<-c(1,20)
for (i in 1:2){
points(dta4list[[i]]$Age, 
       dta4list[[i]]$Volume,
       pch = pchlist[i])}
arrows(dta4list[[1]]$Age,dta4list[[1]]$Volume,
       dta4list[[2]]$Age,dta4list[[2]]$Volume,
       length=0.1, col = "grey")

# 針對Volume增加的人畫時間與箭頭
dta4list[[3]]<-dta4list[[1]]$Volume < dta4list[[2]]$Volume
which(dta4list[[3]]==TRUE)

[1] 18 22 29 36 44 62 63

plus <-subset(dta4, Participant %in% c(18,22,29,36,44,62,63))
pluslist <- split(plus, plus$Time)
arrows(pluslist[[1]]$Age,pluslist[[1]]$Volume,
       pluslist[[2]]$Age,pluslist[[2]]$Volume,
       length=0.1, col = "skyblue", lwd=2) # 這邊length是箭頭的長度

1108hw

Jenny Hsu

2021/11/20

hw1

hw2