Homework 1: Split the plot region to include histograms on the margins of a scatter diagram using the Galton{HistData} data set.
## Load file
Create a 2*2 matrix and use layout function to set the widths and heights.
plot the histogram of parent and child, respectively.
calculate the sample size (maximum) of parent and child.
set up the plot margin first, then plot the sunflowerplot of children by parents.
plot the histogram of parent (x axis) and child (y axis).
add names of x axis and y axis
par(mar=c(0, 3, 1, 1))
barplot(xh$counts, axes=FALSE, ylim=c(0, ub), space=0)
par(mar=c(3, 0, 1, 1))
barplot(yh$counts, axes=FALSE, xlim=c(0, ub), space=0, horiz=TRUE)
par(oma=c(3, 3, 0, 0))
mtext("Average height of parents (in inch)", side=1, line=2,
outer=TRUE, adj=0,
at=.4 * (mean(dta$parent) - min(dta$parent))/(diff(range(dta$parent))))
mtext("Height of child (in inch)", side=2, line=2,
outer=TRUE, adj=0,
at=.4 * (mean(dta$child) - min(dta$child))/(diff(range(dta$child))))Homework 2: Age and Suicide by Country
Load data file
## A25.34 A35.44 A45.54 A55.64 A65.74
## Canada 22 27 31 34 24
## Israel 9 19 10 14 27
## Japan 22 19 21 31 49
## Austria 29 40 52 53 69
## France 16 25 36 47 56
## Germany 28 35 41 49 52
Transform the data frame from wide to long format
## ─ Attaching packages ────────────────────────── tidyverse 1.3.0 ─
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.4
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ─ Conflicts ─────────────────────────── tidyverse_conflicts() ─
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
dtaL <- dta %>%
gather(key=Age, value=Suicide, contains("A")) %>%
arrange(Country)
head(dtaL)## Country Age Suicide
## 1 Austria A25.34 29
## 2 Austria A35.44 40
## 3 Austria A45.54 52
## 4 Austria A55.64 53
## 5 Austria A65.74 69
## 6 Canada A25.34 22
Change names of age
Note. Actually, I did not really know how this work, the code chunk was I refered from my classmate (Zhe Sun). Thanks him!
## Country Age Suicide
## 1 Austria 25 to 34 29
## 2 Austria 35 to 44 40
## 3 Austria 45 to 54 52
## 4 Austria 55 to 64 53
## 5 Austria 65 to 74 69
## 6 Canada 25 to 34 22
Homework 3: Histogram of IQ for each of 5 Classes with over 30 Pupil
Load data file
## 'data.frame': 2287 obs. of 6 variables:
## $ lang : int 46 45 33 46 20 30 30 57 36 36 ...
## $ IQ : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
## $ class: Factor w/ 133 levels "180","280","1082",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ GS : int 29 29 29 29 29 29 29 29 29 29 ...
## $ SES : int 23 10 15 23 10 10 23 10 13 15 ...
## $ COMB : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
find the class which over 30 pupils
dta_f <- dta %>%
group_by(class) %>%
dplyr::summarize(count=n()) %>%
dplyr::filter(count>30)
head(dta_f)## # A tibble: 5 x 2
## class count
## <fct> <int>
## 1 5480 31
## 2 15580 33
## 3 15980 31
## 4 16180 31
## 5 18380 31
select the class which over 30 pupils and spilt to 5 list for plot preparation.
## lang IQ class GS SES
## Min. :17.00 Min. : 6.00 15580 :33 Min. :32.00 Min. :10
## 1st Qu.:40.00 1st Qu.:11.00 5480 :31 1st Qu.:32.00 1st Qu.:23
## Median :45.00 Median :12.00 15980 :31 Median :33.00 Median :33
## Mean :43.89 Mean :12.16 16180 :31 Mean :33.41 Mean :32
## 3rd Qu.:50.00 3rd Qu.:13.00 18380 :31 3rd Qu.:34.00 3rd Qu.:40
## Max. :58.00 Max. :18.00 180 : 0 Max. :36.00 Max. :50
## (Other): 0
## COMB
## 0:157
## 1: 0
##
##
##
##
##
Plot the histogram of IQ by classes
par(mfrow=c(3, 2), mar = c(2, 1, 1, 1))
lapply(dta_class, function(x) {
hist(x$IQ,
xlab="IQ",
breaks = seq(6,18,2)
)
legend('topleft',
paste("Class", x$class[1], sep=":"),
bty='n')}
)Homework 4: SAT and GPA scores in colleges
## College SAT_No GPA_No SAT_Yes GPA_Yes
## 1 Barnard 1210 3.08 1317 3.30
## 2 Northwestern 1243 3.10 1333 3.24
## 3 Bowdoin 1200 2.85 1312 3.12
## 4 Colby 1220 2.90 1280 3.04
## 5 Carnegie Mellon 1237 2.70 1308 2.94
## 6 Georgia Tech 1233 2.62 1287 2.80
Plot a scatter plot
matplot(dta[,c(4,2)], dta[,c(5,3)],
pch=c(1, 19),
col=c("Black", "Black"),
cex=2,
xlim=c(1150,1400), ylim=c(2.6,3.4),
xlab="SAT (V+M)",
ylab="First Year GPA")
legend("topleft",
c("Submitted SAT Scores", "Did NOT Submit SAT Scores"),
pch=c(19, 1),
col=c("Black", "Black"),
bty = "n")
with(dta, segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes, lty=1, lwd=1,col="black"))
with(dta[dta$College != "Bowdoin", ], text(GPA_Yes ~ SAT_Yes, labels=College, adj=c(-0.3, 0.3), cex=1))
with(dta[dta$College=="Bowdoin", ], text(GPA_Yes ~ SAT_Yes, labels=College, adj=c(-0.3, 0.3), cex=1, font=2))Homework 5: Free recall
Load data file
setwd("/Users/haolunfu/Documents/資料管理/week6/")
dta_10_2 <- read.table("Murd62/fr10-2.txt", sep = "", fill=T, col.names = paste("V", 1:15), na.strings = 88, nrow=1200)
dta_15_2 <- read.table("Murd62/fr15-2.txt", sep = "", fill=T, col.names = paste("V", 1:15), na.strings = 88)
dta_20_1 <- read.table("Murd62/fr20-1.txt", sep = "", fill=T, col.names = paste("V", 1:15), na.strings = 88)
dta_20_2 <- read.table("Murd62/fr20-2.txt", sep = "", fill=T, col.names = paste("V", 1:15), na.strings = 88)
dta_30_1 <- read.table("Murd62/fr30-1.txt", sep = "", fill=T, col.names = paste("V", 1:15), na.strings = 88)
dta_40_1 <- read.table("Murd62/fr40-1.txt", sep = "", fill=T, col.names = paste("V", 1:15), na.strings = 88)
dta <- list(dta_10_2, dta_15_2, dta_20_1, dta_20_2, dta_30_1, dta_40_1)Calculate the probability of measurements by induvidual
dta_prob <- lapply(dta, function(x)
{x <- stack(x)
table(x$values) / 1200})
dta_n <- data.frame(index = c(), probability = c())
name <- c("10-2", "15-2", "20-1", "20-2", "30-1", "40-1")
for (i in 1:length(dta_prob)) {
n <- unlist(dta_prob[i])
dta_n <- rbind(data.frame(Index = names(n), Probability = n, Group = name[i], stringsAsFactors = F), dta_n)
}Outlier exclusion
Solution 1:
Plot
plot(dta_n$Index, dta_n$Probability, type="o",
xlim = c(0, 40), ylim = c(0, 1), axes = F,
xlab = "SERIAL POSITION",
ylab = "PROBABILITY OF RECALL")
axis(1, 0:40)
axis(2, seq(0, 1, 0.05))
# add label names
text(2, 0.9, "10-2")
lines(c(2, 6), c(0.85, 0.6))
text(16, 0.7, "15-2")
lines(c(14, 12.5), c(0.7, 0.7))
text(11, 0.41, "20-2")
lines(c(11, 13), c(0.39, 0.3))
text(19, 0.25, "20-1")
lines(c(19, 16), c(0.27, 0.35))
text(21, 0.55, "30-1")
lines(c(21, 26), c(0.52, 0.39))
text(33, 0.8, "40-1")
lines(c(33, 37.5), c(0.75, 0.59))Solution 2:
plot(dta_n$Index, dta_n$Probability, type="p",
xlim = c(0, 40), ylim = c(0, 1), axes = F,
xlab = "SERIAL POSITION",
ylab = "PROBABILITY OF RECALL")
axis(1, 0:40)
axis(2, seq(0, 1, 0.05))
for(i in name){
with(dta_n[dta_n$Group==i,],
lines(dta_n$Index, dta_n$Probability), pch=1)}