The Law School Admission Test (LSAT) consists of a number of dichotomous items, which can be answered correctly or incorrecty. These items are constructed so that a person who scores high on the test is more likely to do well in a law school. The data set LSAT{ltm} contains answers by 1,000 persons to 5 test items. Generate a plot similar to the one below:
::p_load(dplyr, ltm, tidyverse)
pacman# input data
<- ltm::LSAT dta1
# 使用colMeans計算每個item正確率
colMeans(dta1)
Item 1 Item 2 Item 3 Item 4 Item 5
0.924 0.709 0.553 0.763 0.870
# 根據正確率進行排序
<-dta1[,order(colMeans(dta1))]
dta1
# 利用apply針對each row計算正確題數
$total <- apply(dta1, 1, sum) #1=rowsum
dta1
# 創造一個矩陣存,後續要計算的"不同總分各題答對率"
<- data.frame(matrix(data=NA, 6, 6))
pm
# 不同總分時各題的答對率
# 總分0:5, subset總分為0,1,2..5, 利用sapply計算每一column的 mean
for(i in 0:5){
+1),] <- sapply(subset(dta1, total==i), mean)
pm[(i
}# 將原變項名複製到pm
names(pm) <- names(dta1)
# matplot 可使用矩陣繪多組觀測值
matplot(pm[-6], pch=1:5, col=1:5 , type='b')
legend("topleft", c("Item1","Item2","Item3","Item4","Item5"), pch=1:5, col=1:5, bty="n", lty=1)
Complete the R script to create the following plot
# height, weight and bmi
# generate weights
<- seq(40, 160, length.out=600) #40到60間產生600個等距數值
wt
# generate heights
<- seq(1.40, 2.00, length.out=600)
ht
# cross wt by ht
<- expand.grid(wt=wt, ht=ht) #產生600*600種組合
wtht
# function to compute bmi from wt and ht
<- function(w, h) {w/(h*h)}
bmi_wh
# generate data matrix (datavalues, nrows, ncolums)
<- matrix(bmi_wh(wtht$wt, wtht$ht), length(ht), length(wt)) bmiwtht
#
<- c(rgb(1,1,1,0), rgb(1,1,1,0), rgb(.94,.50,.50,.5))
colorbar contour(wt, ht, bmiwtht, levels = c(18.5, 24.9, 30), drawlabels=F,
ylab="Height (m)",
xlab="Weight (kg)",
main="BMI categories by height and weight")
# add grid lines
grid()
# annotate the bmi categories
text(105, 1.8, "Obese", cex=1, srt=45) # srt= rotate the text
text(92, 1.8, "Overweight", cex=1, srt=45)
text(75, 1.8, "Normal", cex=1, srt=45)
text(55, 1.8, "Underweight", cex=1, srt=55)
# 一定要放在最後,才能有色塊蓋字的效果
.filled.contour(wt, ht, bmiwtht,levels = c(18.5, 24.9, 30, 100), col= colorbar)
<- c(rgb(1,1,1,0), rgb(1,1,1,0), rgb(.94,.50,.50,0.3))
colorbar
# draw the contour
contour(wt, ht, bmiwtht, bty='n',
levels = c(18.5, 24.9, 30), # 依BMI劃出三條等值線
drawlabels=F,
ylab="Height (m)",
xlab="Weight (kg)",
main="BMI categories by height and weight")
# add grid lines
grid()
# annotate the bmi categories
text(105, 1.8, "Obese", cex=1, srt=45) # srt= rotate the text
text(92, 1.8, "Overweight", cex=1, srt=45)
text(75, 1.8, "Normal", cex=1, srt=45)
text(55, 1.8, "Underweight", cex=1, srt=55)
#
image(wt, ht, bmiwtht,col= colorbar, breaks= c(18.5, 24.9, 30, 100), bty="n",
ylab="Height (m)",
xlab="Weight (kg)",
main="BMI categories by height and weight",
axes=T,
xlim = c(40,160),
ylim = c(1.4,2.0), add=T)
Use the free recall data to replicate the figure
reported in Murdock, B. B. (1962). The serial position effect of free recall. Journal of Experimental Psychology, 64, 482-488.
# files in folder
list.files("C:/Users/user/Desktop/p_desktop/data management/1108 R base graphics/Murd62", pattern = "fr")
[1] "fr10-2.txt" "fr15-2.txt" "fr20-1.txt" "fr20-2.txt" "fr30-1.txt"
[6] "fr40-1.txt"
# 測試讀一個檔進來看看
## read.table會發現讀進來欄位數有問題
<- read.table("C:/Users/user/Desktop/p_desktop/data management/1108 R base graphics/Murd62/fr10-2.txt", header=F, fill = TRUE)
fr102 ## 一個檔案的處理方式
::p_load(naniar, stringr, janitor)
pacman<-scan("C:/Users/user/Desktop/p_desktop/data management/1108 R base graphics/Murd62/fr10-2.txt", what="numeric", sep="\t")
fr102s<-str_split_fixed(sapply(fr102s, strsplit, "_and_") , " ", 12)
fr102ss<- fr102ss[,!apply(fr102ss == "", 2, all)] fr102ssr
用read.table讀進來會有部分資料被截斷
改用scan的方式讀入,並計算最大字串長度,藉此猜測應該要讀入幾個欄位
fr10-2第1201欄只有一個0,於是我手動刪除了
# list.files
<-list.files(path = "C:/Users/user/Desktop/p_desktop/data management/1108 R base graphics/Murd62", pattern = "fr")
fls
# give it path
<- paste0("C:/Users/user/Desktop/p_desktop/data management/1108 R base graphics/Murd62/", fls)
fL
#
::p_load(stringr, dplyr,purrr)
pacman
# 計算每個檔案最大欄為長度
## 由於不確定每個檔案的column數,透過str_length計算欄位長度
lapply(fL, function(x){scan(x, what="numeric", sep="\t") |> str_length() |> max()}) #dtaleghth=c(24,39,39,42,45,44)
[[1]]
[1] 24
[[2]]
[1] 39
[[3]]
[1] 39
[[4]]
[1] 42
[[5]]
[1] 45
[[6]]
[1] 44
# input data by scan function
<-lapply(fL, function(x){scan(x, what="numeric", sep="\t")})
ff
# 用strsplit分割字符串成column
## 根據估計的欄位長度,進行切割(最大欄位長度45,若以全部為"88"估計,最多23欄)
## 針對column內全是空的值刪除
<-lapply(ff, function(x){
ffs<- str_split_fixed(sapply(x, strsplit, "_and_") , " ", 23)
y <-y[,!apply(y == "", 2, all)]
y as.data.frame(y)
})
# 計算每個item的頻率
## 由於level順序有誤(1,10,2...)因此從新order資料順序
<-lapply(ffs, function(x) {
dta3a <- x |> tidyr::pivot_longer(cols = starts_with("V"),names_to = "Serial", values_to = "Item") # 轉long form
y <- as.data.frame(table(y$Item)/1200) #計算item平均頻率
y $Var1<-factor(as.numeric(as.character(y$Var1)))
y<-y[order(y$Var1),]
y
})
# 處理NA與不合理值
<-c("10-2","15-2","20-1","20-2","30-1","40-1")
groupnamefor (i in 1:6){
$group <-groupname[i]
dta3a[[i]]=="88" | dta3a[[i]]==""]<-NA # 將"88"與blank轉為NA
dta3a[[i]][dta3a[[i]]$maxlist <- substr(dta3a[[i]]$group,start = 1, stop = 2)
dta3a[[i]]# 透過找出每一data最大serial position,超出就是na
<-as.numeric(unlist(dta3a[[i]]$Var1))
a<-as.numeric(dta3a[[i]]$maxlist[1])
b$Var1<-ifelse(a>b,NA,a)
dta3a[[i]] }
# 設置畫圖區域
## xlim 設大一點,最後一個點就不會超出了
plot(0,0, bty="n", xlim = c(0,42), ylim=c(0,1),
xlab = "SERIA POSITION",
ylab = "PROBABILITY OF RECALL",
xaxs = "i", yaxs ="i") # i, 軸線相連;r,軸線有縫隙
# 用for loop一次畫六個資料的點與線
<-c(19,1,19,1,1,19) #針對每一資料圖形設定
pchlistfor (i in 1:6){
points(dta3a[[i]]$Var1,
$Freq,
dta3a[[i]]pch = pchlist[i])
lines(dta3a[[i]]$Var1,
$Freq)}
dta3a[[i]]
# text label
text(2, 0.9, "10-2") #text(x,y,lable)
lines(c(2, 6), c(0.85, 0.6)) #lines(c(x1,x2),c(y1,y2))
text(14, 0.65, "15-2")
lines(c(12, 13), c(0.61, 0.63))
text(13, 0.48, "20-2")
lines(c(12, 13), c(0.45, 0.3))
text(18.5, 0.44, "20-1")
lines(c(16, 17), c(0.35, 0.4))
text(23, 0.6, "30-1")
lines(c(21.5, 21, 26), c(0.55, 0.52, 0.39))
text(35, 0.8, "40-1")
lines(c(33.5, 33, 37.5), c(0.77, 0.75, 0.59))
# axis (圖上的軸在這邊設定)
axis(1,seq(0, 40, 10), labels = FALSE)
axis(2,seq(0, 1, 0.1), labels = FALSE)
(Optional) A dataset, raz2005{rmcorr}, contains two repeated measures, on two occasions (Time), of age (in years) and adjusted volume of cerebellar hemispheres from 72 participants. Use it to replicate the following figure in which participants older than 65 years of age at Time One are plotted on a shifted panel and those whose cerebellar volume increased from Time One to Time Two are shown in blue color.
::p_load(rmcorr)
pacman
# input data
<- rmcorr::raz2005
dta4 # 根據兩個時間切割資料
<- split(dta4, dta4$Time) dta4list
# plot
plot(0,0, bty="n", xlim = c(20,90), ylim=c(100,170),
xlab = "Age (year)",
ylab = "Cerebellar volume",
xaxs = "r", yaxs ="r")
# 用for loop 畫「所有人」兩個時間的點與箭頭
<-c(1,20)
pchlistfor (i in 1:2){
points(dta4list[[i]]$Age,
$Volume,
dta4list[[i]]pch = pchlist[i])}
arrows(dta4list[[1]]$Age,dta4list[[1]]$Volume,
2]]$Age,dta4list[[2]]$Volume,
dta4list[[length=0.1, col = "grey")
# 針對Volume增加的人畫時間與箭頭
3]]<-dta4list[[1]]$Volume < dta4list[[2]]$Volume
dta4list[[which(dta4list[[3]]==TRUE)
[1] 18 22 29 36 44 62 63
<-subset(dta4, Participant %in% c(18,22,29,36,44,62,63))
plus <- split(plus, plus$Time)
pluslist arrows(pluslist[[1]]$Age,pluslist[[1]]$Volume,
2]]$Age,pluslist[[2]]$Volume,
pluslist[[length=0.1, col = "skyblue", lwd=2) # 這邊length是箭頭的長度