Exercise-1

# Galton's data on the heights of parents and their children

# read data
dta <- HistData::Galton
# create a 2 by 2 matrix
zones <- matrix(c(2, 0, 1, 3), ncol=2, byrow=TRUE)
# 
layout(zones, widths=c(4/5, 1/5), heights = c(1/5, 4/5))

# histogram of parents
xh <- with(dta, hist(parent, plot=FALSE))
# histogram of chidren
yh <- with(dta, hist(child, plot=FALSE))

# compute mode
ub <- max(c(xh$counts, yh$counts))

# margin setting
par(mar=c(3, 3, 1, 1))

# sunflowerplot 
with(dta, sunflowerplot(parent, child))

# margin setting
par(mar=c(0, 3, 1, 1))
# bar plot
# no axes display
# y axis from 0 to ub
barplot(xh$counts, axes=FALSE, ylim=c(0, ub), space=0)

# margin setting
par(mar=c(3, 0, 1, 1))
# barplot
# no axes display
# x axis from 0 to ub
barplot(yh$counts, axes=FALSE, xlim=c(0, ub), space=0, horiz=TRUE)
# margin setting
par(oma=c(3, 3, 0, 0))

# create labels for horizontal axis
# compute cordinate scale
mtext("Average height of parents (in inch)", side=1, line=2, 
      outer=TRUE, adj=0, 
      at=.4 * (mean(dta$parent) - min(dta$parent))/(diff(range(dta$parent))))

# create labels for vertical axis
# compute cordinate scale
mtext("Height of child (in inch)", side=2, line=2, 
      outer=TRUE, adj=0,
      at=.4 * (mean(dta$child) - min(dta$child))/(diff(range(dta$child))))

Exercise-2

# read data 
dta_suicides <- HSAUR3::suicides2

# create id column
dta_suicides$country <- row.names(dta_suicides)
# arrange the position of columns
dta_suicides <- dta_suicides[, c(6, 1:5)]
# transform to long form 
dta_suicides_long <- reshape(dta_suicides, 
                            varying = names(dta_suicides)[-1],
                            idvar = "country",
                            timevar = "age",
                            times = names(dta_suicides)[-1],
                            v.names = "suicides",
                            new.row.names = NULL,
                            direction = "long")

# change names of age group by pattern
dta_suicides_long$age <- sprintf("%s to %s", 
                                 substr(dta_suicides_long$age, 2, 3),
                                 substr(dta_suicides_long$age, 5, 6))

# remove row names
row.names(dta_suicides_long) <- NULL

# examine data
head(dta_suicides_long)
##   country      age suicides
## 1  Canada 25 to 34       22
## 2  Israel 25 to 34        9
## 3   Japan 25 to 34       22
## 4 Austria 25 to 34       29
## 5  France 25 to 34       16
## 6 Germany 25 to 34       28
plot.new()
boxplot(dta_suicides_long$suicides ~ dta_suicides_long$age,
        xlab = "age groups", ylab = "number of suicides",
        main = "number of suicides in different countries by age groups")

Exercise-3

# load package 
library(MASS)
library(dplyr)
# read data
dta_nlschools <- nlschools

# find out the classes that have more than 30 puoils
over30 <-  dta_nlschools %>% 
                  group_by(class) %>% 
                  summarise(freq = n()) %>% 
                  filter(freq > 30)

# store the data in a new object
dta_nlschools_over30 <- dta_nlschools[dta_nlschools$class %in% over30$class, ]

# split data by class
# store each class in a list element
dta_nlschools_over30 <- split(dta_nlschools_over30,
                              as.character(dta_nlschools_over30$class))

# plot
plot.new()
# panel and margin setting
par(mfrow=c(4, 2), mar = c(2, 1, 2, 1))

# apply histogram plotting function to each element(each class) in the list
lapply(dta_nlschools_over30, function(x) {hist(x$IQ, 
                                               xlab = "IQ",
                                               main = "histogram of IQ",
                                               axes = F,
                                               breaks = seq(6, 18, by =
                                                              2))
                                              axis(1,
                                                   at = seq(6, 18, 2))
                                               legend("topright",
                                                 paste("class",
                                                       x$class[1],
                                                       sep = ": "),
                                                 bty="n")})

Exercise-4

# read data
dta <- read.table("sat_gpa.txt", header = T)
# examine data  
dta
##           College SAT_No GPA_No SAT_Yes GPA_Yes
## 1         Barnard   1210   3.08    1317    3.30
## 2    Northwestern   1243   3.10    1333    3.24
## 3         Bowdoin   1200   2.85    1312    3.12
## 4           Colby   1220   2.90    1280    3.04
## 5 Carnegie Mellon   1237   2.70    1308    2.94
## 6    Georgia Tech   1233   2.62    1287    2.80
# transform data
dta_new <- data.frame(college = dta$College,
                      SAT = c(dta$SAT_No, dta$SAT_Yes),
                      GPA = c(dta$GPA_No, dta$GPA_Yes),
                      submit = c(rep("no", 6),
                                     rep("yes", 6)))
dta_new
##            college  SAT  GPA submit
## 1          Barnard 1210 3.08     no
## 2     Northwestern 1243 3.10     no
## 3          Bowdoin 1200 2.85     no
## 4            Colby 1220 2.90     no
## 5  Carnegie Mellon 1237 2.70     no
## 6     Georgia Tech 1233 2.62     no
## 7          Barnard 1317 3.30    yes
## 8     Northwestern 1333 3.24    yes
## 9          Bowdoin 1312 3.12    yes
## 10           Colby 1280 3.04    yes
## 11 Carnegie Mellon 1308 2.94    yes
## 12    Georgia Tech 1287 2.80    yes
plot.new()
par(las=1)
plot(dta_new$SAT, dta_new$GPA, type = "n", xlim = c(1150, 1400),
     ylim = c(2.6, 3.4), xlab = "SAT(V+M)", ylab = "First Year GPA")
axis(2, at = c(2.6, 2.8, 3.2))
axis(1, seq(1150, 1400, 50))
plot.window(xlim = c(1150, 1400), ylim = c(2.6, 3.4))
for (i in as.character(unique(dta_new$college))) {
  lines(dta_new$SAT[dta_new$college == i],
      dta_new$GPA[dta_new$college == i])
  
}

points(dta$SAT_No, dta$GPA_No, pch = 19, cex = 2.5)
points(dta$SAT_Yes, dta$GPA_Yes, pch = 1, cex = 2.5)

legend("topleft", c("Submitted SAT Scores", "Did NOT Submit SAT Scores"), pch = c(1, 19), bty = "n")

text(1330, 3.35, "Barnard")
text(1350, 3.27, "Northwestern")
text(1320, 3.15, "Bowdoin")
text(1280, 3.00, "Colby")
text(1338, 2.90, "Carnegie Mellon")
text(1315, 2.77, "Georgia Tech")

Exercise-5

# store names of files to be read 
files <- paste("Murd62/", list.files("Murd62/", pattern = "*.txt"),
               sep ="")

# because the width of each row is not the same
# count the maximun width of each files
lapply(files, function(x){max(count.fields(x))})
## [[1]]
## [1] 10
## 
## [[2]]
## [1] 15
## 
## [[3]]
## [1] 15
## 
## [[4]]
## [1] 15
## 
## [[5]]
## [1] 15
## 
## [[6]]
## [1] 15
# column names for file has 10 columns
names1 <- paste("V", 1:10)
# column names for files have 15 columns
names2 <- paste("V", 1:15)

# read data
dta10.2 <- read.table("Murd62/fr10-2.txt", sep = "", col.names = names1, fill = T, nrows = 1200)
dta15.2 <- read.table("Murd62/fr15-2.txt", sep = "", col.names = names2, fill = T)
dta20.1 <- read.table("Murd62/fr20-1.txt", sep = "", col.names = names2, fill = T)
dta20.2 <- read.table("Murd62/fr20-2.txt", sep = "", col.names = names2, fill = T)
dta30.1 <- read.table("Murd62/fr30-1.txt", sep = "", col.names = names2, fill = T)
dta40.1 <- read.table("Murd62/fr40-1.txt", sep = "", col.names = names2, fill = T)
# store 5 data set in a list
dta_recall <- list(dta10.2, dta15.2, dta20.1, dta20.2, dta30.1, dta40.1)

# compute probability
dta_recall_prob <- lapply(dta_recall,
                             function(x){
                                         x <- stack(x)
                                         table(x$values) / 1200})
# create an empty data frame
dta <- data.frame(index = c(), probability = c())

# store group names in a vector
group_names <- c("10.2", "15.2", "20.1", "20.2", "30.1", "40.1")

# transform list into dataframe
for (i in 1:6) {
  v <- unlist(dta_recall_prob[i])
  dta <- rbind(data.frame(index = names(v), probability = v, group = group_names[i], stringsAsFactors = F), dta)
}

# remove rows have index == 88
dta <- dta[!dta$index == "88", ]

first try

# first try 
plot.new()
plot(dta$index, dta$probability, type = "n", xlim = c(0, 40),
     ylim = c(0, 1))

for (i in group_names) {
  lines(dta$index[dta$group == i], 
        dta$probability[dta$group == i])
}

There exists data points that have been removed from the original figure

Correct the plot based on first try

# remove outliers
dta <- dta[!(dta$index == 16 & dta$group == "15.2"), ]
dta <- dta[!(dta$index == 31 & dta$group == "30.1"), ]
dta <- dta[!(dta$index == 41 & dta$group == "40.1"), ]
dta <- dta[!(dta$index == 50 & dta$group == "40.1"), ]
# 
plot.new()
plot(dta$index, dta$probability, type = "n", 
     xlim = c(0, 40),
     ylim = c(0, 1),
     axes = F,
     xlab = "SERIAL POSITION",
     ylab = "PROBABILITY OF RECALL")
axis(1, 0.0:40.0)
axis(2, seq(0, 1, 0.05))

# lines plot
for (i in group_names) {
  lines(dta$index[dta$group == i], 
        dta$probability[dta$group == i], type = "l", pch = 1)
  points(dta$index[dta$group == i], 
        dta$probability[dta$group == i])
}

# text label
text(2, 0.9, "10-2")
lines(c(2, 6), c(0.85, 0.6))
# text label
text(16, 0.7, "15-2")
lines(c(14, 12.5), c(0.7, 0.7))
# text label
text(11, 0.41, "20-2")
lines(c(11, 13), c(0.39, 0.3))
# text label
text(19, 0.25, "20-1")
lines(c(19, 16), c(0.27, 0.35))
# text label 
text(21, 0.55, "30-1")
lines(c(21, 26), c(0.52, 0.39))
# text label
text(33, 0.8, "40-1")
lines(c(33, 37.5), c(0.75, 0.59))