0413 base graphic HW 3,5

HW 3

loading data and check data structure

library(MASS)
dta<-nlschools
head(dta)

##   lang   IQ class GS SES COMB
## 1   46 15.0   180 29  23    0
## 2   45 14.5   180 29  10    0
## 3   33  9.5   180 29  15    0
## 4   46 11.0   180 29  23    0
## 5   20  8.0   180 29  10    0
## 6   30  9.5   180 29  10    0

str(dta)

## 'data.frame':    2287 obs. of  6 variables:
##  $ lang : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ IQ   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ class: Factor w/ 133 levels "180","280","1082",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ GS   : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ SES  : int  23 10 15 23 10 10 23 10 13 15 ...
##  $ COMB : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

data manipulation

library(dplyr)

dta.1<-dta%>%dplyr::group_by(class)%>%
  summarize(count=n())%>%filter(count>30) # find out class with over 30 pupils 

dta.over30<- dta[dta$class%in%dta.1$class,] # extract those class with over 30 pupils from the full dataset

dta2<-split(dta.over30, as.character(dta.over30$class)) # to split the data to list by class

plot

par(mfrow=c(3, 2), mar=c(2, 2, 2, 2), oma=c(5, 5, 5, 5))
lapply(dta2, function(x) { hist(x$IQ, 
                                xlab="IQ",
                                main=" ",
                                xlim=c(6, 20), ylim=c(0, 20)) # make x/y axis consistent over 5 panel 
  
  legend('topleft', paste("Class", x$class[1], sep=":"), bty='n')})
mtext("Histogram of IQ in class with over 30 pupils", side=3, line=2, 
      col="Black", outer=TRUE, cex=1.5,font=2) # add common main title in outer margin

HW 5

download the zip file online and unzip it

fl<-"http://memory.psych.upenn.edu/files/pubs/Murd62.data.tgz"
download.file(fl,destfile="tmp.tar.gz")
untar("tmp.tar.gz",files="Murd62")
temp<-list.files("Murd62", pattern="*.txt", full.names=TRUE)

library(plyr)
library(dplyr)

data manipulation

names1<-paste("V", 1:15)
temp.1<-lapply(temp, read.table, sep="", col.names=names1, fill=T)

temp.2<-lapply(temp.1, function(x) {x <- x[-1201, ]}) # remove 0 
name<-c("10-2", "15-2", "20-1", "20-2", "30-1", "40-1")
dat.name<-Map(cbind, temp.2, group=name)

dta.prob <- lapply(temp.2, function(x){ x <- stack(x)
table(x$values) / 1200})

dta.prob2 <- lapply(dta.prob, as.data.frame) # data.frame to use rbind.fill

dta.prob3<-do.call("rbind.fill", dta.prob2)
dta.prob3$site<-(1:nrow(dta.prob3))
site.row<-dta.prob3%>%filter(Var1==88) # find the last variable of every list to find how many frequency in each list
dta.prob3$group<-c(rep("10-2",11), rep("15-2",17), rep( "20-1",21), rep("20-2", 21), rep("30-1", 32), rep("40-1", 43))

dta.prob.m<-dta.prob3[-site.row$site , -c(3)]

# remove outlier
dta.prob.m <- dta.prob.m[!(dta.prob.m$Var1 == 16 & dta.prob.m$group == "15-2"), ]
dta.prob.m <- dta.prob.m[!(dta.prob.m$Var1 == 31 & dta.prob.m$group == "30-1"), ]
dta.prob.m <- dta.prob.m[!(dta.prob.m$Var1 == 41 & dta.prob.m$group == "40-1"), ]
dta.prob.m <- dta.prob.m[!(dta.prob.m$Var1 == 50 & dta.prob.m$group == "40-1"), ]

plot

xtick<-seq(0, 40, by=5)
ytick<-seq(.00, 1.00, by=.2)
plot.new()
par(mar=c(2,2,2,3), oma=c(5,5,5,5))
plot(0,type='n', xaxs="i", yaxs="i",
     xlim = c(0, 40),
     ylim = c(0, 1),
     axes = F)
# to revise the ticks label
axis(1, at=seq(0, 40), labels = F) 
text(x=xtick,  par("usr")[3], 
     labels = xtick, pos = 1, xpd = TRUE)
axis(2, seq(0, 1, 0.05), labels=F)
text(par("usr")[1], ytick,  
     labels = ytick, pos = 2, xpd = TRUE)

# I thinks there must be a better soultion instead of depict each line and points
lines(dta.prob.m$Var1[dta.prob.m$group == "10-2"], 
      dta.prob.m$Freq[dta.prob.m$group == "10-2"], type = "l", pch = 1) 
lines(dta.prob.m$Var1[dta.prob.m$group == "15-2"], 
      dta.prob.m$Freq[dta.prob.m$group == "15-2"], type = "l", pch = 1) 
lines(dta.prob.m$Var1[dta.prob.m$group == "20-1"], 
      dta.prob.m$Freq[dta.prob.m$group == "20-1"], type = "l", pch = 1)
lines(dta.prob.m$Var1[dta.prob.m$group == "20-2"], 
      dta.prob.m$Freq[dta.prob.m$group == "20-2"], type = "l", pch = 1)
lines(dta.prob.m$Var1[dta.prob.m$group == "30-1"], 
      dta.prob.m$Freq[dta.prob.m$group == "30-1"], type = "l", pch = 1)
lines(dta.prob.m$Var1[dta.prob.m$group == "40-1"], 
      dta.prob.m$Freq[dta.prob.m$group == "40-1"], type = "l", pch = 1)
points(dta.prob.m$Var1[dta.prob.m$group == "10-2"], 
      dta.prob.m$Freq[dta.prob.m$group == "10-2"], pch = 19) 
points(dta.prob.m$Var1[dta.prob.m$group == "15-2"], 
       dta.prob.m$Freq[dta.prob.m$group == "15-2"], pch = 1) 
points(dta.prob.m$Var1[dta.prob.m$group == "20-1"], 
       dta.prob.m$Freq[dta.prob.m$group == "20-1"], pch = 1) 
points(dta.prob.m$Var1[dta.prob.m$group == "20-2"], 
       dta.prob.m$Freq[dta.prob.m$group == "20-2"], pch = 19)
points(dta.prob.m$Var1[dta.prob.m$group == "30-1"], 
       dta.prob.m$Freq[dta.prob.m$group == "30-1"], pch = 1) 
points(dta.prob.m$Var1[dta.prob.m$group == "40-1"], 
       dta.prob.m$Freq[dta.prob.m$group == "40-1"], pch = 19)

# not sure if there is any function to do this faster
text(5, 0.85, "10-2")
     lines(c(8.2, 6), c(0.7, 0.85))
text(14, 0.5, "15-2")
     lines(c(12, 13), c(0.56, 0.5))
text(11, 0.4, "20-2")
      lines(c(14, 12), c(0.3, 0.4))
text(18, 0.25, "20-1")
      lines(c(16, 17), c(0.3, 0.25))
text(25, 0.65, "30-1")
      lines(c(28, 26), c(0.55, 0.65))
text(36, 0.65, "40-1")
      lines(c(38.5, 37), c(0.6, 0.65))   
      
# label
mtext("PROBABILITY OF RECALL", side=2, outer=TRUE, line=0 )
mtext("SERIAL POSITION", side=1, outer=TRUE, line=0 )