Homework 1: Split the plot region to include histograms on the margins of a scatter diagram using the Galton{HistData} data set.

## Load file

library(HistData)
dta <- HistData::Galton

Create a 2*2 matrix and use layout function to set the widths and heights.

zones <- matrix(c(2, 0, 1, 3), ncol=2, byrow=TRUE)
layout(zones, widths=c(4/5, 1/5), heights = c(1/5, 4/5))

plot the histogram of parent and child, respectively.

calculate the sample size (maximum) of parent and child.

xh <- with(dta, hist(parent, plot=FALSE))
yh <- with(dta, hist(child, plot=FALSE))
ub <- max(c(xh$counts, yh$counts))

set up the plot margin first, then plot the sunflowerplot of children by parents.

par(mar=c(3, 3, 1, 1))
with(dta, sunflowerplot(parent, child))

plot the histogram of parent (x axis) and child (y axis).

add names of x axis and y axis

par(mar=c(0, 3, 1, 1))
barplot(xh$counts, axes=FALSE, ylim=c(0, ub), space=0)

par(mar=c(3, 0, 1, 1))
barplot(yh$counts, axes=FALSE, xlim=c(0, ub), space=0, horiz=TRUE)

par(oma=c(3, 3, 0, 0))
mtext("Average height of parents (in inch)", side=1, line=2, 
      outer=TRUE, adj=0, 
      at=.4 * (mean(dta$parent) - min(dta$parent))/(diff(range(dta$parent))))
mtext("Height of child (in inch)", side=2, line=2, 
      outer=TRUE, adj=0,
      at=.4 * (mean(dta$child) - min(dta$child))/(diff(range(dta$child))))

Homework 2: Age and Suicide by Country

Load data file

dta <- HSAUR3::suicides2
head(dta)

##         A25.34 A35.44 A45.54 A55.64 A65.74
## Canada      22     27     31     34     24
## Israel       9     19     10     14     27
## Japan       22     19     21     31     49
## Austria     29     40     52     53     69
## France      16     25     36     47     56
## Germany     28     35     41     49     52

dta$Country <- row.names(dta)

Transform the data frame from wide to long format

library(tidyverse)

## ─ Attaching packages ────────────────────────── tidyverse 1.3.0 ─

## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.4
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0

## ─ Conflicts ─────────────────────────── tidyverse_conflicts() ─
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
dtaL <- dta %>% 
  gather(key=Age, value=Suicide, contains("A")) %>%
  arrange(Country)
head(dtaL)

##   Country    Age Suicide
## 1 Austria A25.34      29
## 2 Austria A35.44      40
## 3 Austria A45.54      52
## 4 Austria A55.64      53
## 5 Austria A65.74      69
## 6  Canada A25.34      22

Change names of age

Note. Actually, I did not really know how this work, the code chunk was I refered from my classmate (Zhe Sun). Thanks him!

dtaL$Age <- sprintf("%s to %s", 
                                 substr(dtaL$Age, 2, 3),
                                 substr(dtaL$Age, 5, 6))
head(dtaL)

##   Country      Age Suicide
## 1 Austria 25 to 34      29
## 2 Austria 35 to 44      40
## 3 Austria 45 to 54      52
## 4 Austria 55 to 64      53
## 5 Austria 65 to 74      69
## 6  Canada 25 to 34      22

Plot the boxplot of suicide by age.

boxplot(Suicide~ Age, data = dtaL)

Homework 3: Histogram of IQ for each of 5 Classes with over 30 Pupil

Load data file

dta <- MASS::nlschools
str(dta)

## 'data.frame':    2287 obs. of  6 variables:
##  $ lang : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ IQ   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ class: Factor w/ 133 levels "180","280","1082",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ GS   : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ SES  : int  23 10 15 23 10 10 23 10 13 15 ...
##  $ COMB : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

find the class which over 30 pupils

dta_f <- dta %>%
  group_by(class) %>%
  dplyr::summarize(count=n()) %>%
  dplyr::filter(count>30)

head(dta_f)

## # A tibble: 5 x 2
##   class count
##   <fct> <int>
## 1 5480     31
## 2 15580    33
## 3 15980    31
## 4 16180    31
## 5 18380    31

select the class which over 30 pupils and spilt to 5 list for plot preparation.

dta_n<- dta[dta$class %in% dta_f$class,]
summary(dta_n)

##       lang             IQ            class          GS             SES    
##  Min.   :17.00   Min.   : 6.00   15580  :33   Min.   :32.00   Min.   :10  
##  1st Qu.:40.00   1st Qu.:11.00   5480   :31   1st Qu.:32.00   1st Qu.:23  
##  Median :45.00   Median :12.00   15980  :31   Median :33.00   Median :33  
##  Mean   :43.89   Mean   :12.16   16180  :31   Mean   :33.41   Mean   :32  
##  3rd Qu.:50.00   3rd Qu.:13.00   18380  :31   3rd Qu.:34.00   3rd Qu.:40  
##  Max.   :58.00   Max.   :18.00   180    : 0   Max.   :36.00   Max.   :50  
##                                  (Other): 0                               
##  COMB   
##  0:157  
##  1:  0  
##         
##         
##         
##         
##

dta_class<-split(dta_n, as.character(dta_n$class))

Plot the histogram of IQ by classes

par(mfrow=c(3, 2), mar = c(2, 1, 1, 1))

lapply(dta_class, function(x) {
  hist(x$IQ, 
       xlab="IQ", 
       breaks = seq(6,18,2)
  )
  legend('topleft', 
         paste("Class", x$class[1], sep=":"), 
         bty='n')}
)

Homework 4: SAT and GPA scores in colleges

dta <- read.table("/Users/haolunfu/Documents/資料管理/week6/sat_gpa.txt", header = TRUE)
head(dta)

##           College SAT_No GPA_No SAT_Yes GPA_Yes
## 1         Barnard   1210   3.08    1317    3.30
## 2    Northwestern   1243   3.10    1333    3.24
## 3         Bowdoin   1200   2.85    1312    3.12
## 4           Colby   1220   2.90    1280    3.04
## 5 Carnegie Mellon   1237   2.70    1308    2.94
## 6    Georgia Tech   1233   2.62    1287    2.80

Plot a scatter plot

matplot(dta[,c(4,2)], dta[,c(5,3)], 
        pch=c(1, 19), 
        col=c("Black", "Black"),
        cex=2,
        xlim=c(1150,1400), ylim=c(2.6,3.4),
        xlab="SAT (V+M)",
        ylab="First Year GPA")
legend("topleft", 
       c("Submitted SAT Scores", "Did NOT Submit SAT Scores"), 
       pch=c(19, 1), 
       col=c("Black", "Black"),
       bty = "n")

with(dta, segments(SAT_No, GPA_No, SAT_Yes, GPA_Yes, lty=1,  lwd=1,col="black"))
with(dta[dta$College != "Bowdoin", ], text(GPA_Yes ~ SAT_Yes, labels=College, adj=c(-0.3, 0.3), cex=1))
with(dta[dta$College=="Bowdoin", ], text(GPA_Yes ~ SAT_Yes, labels=College, adj=c(-0.3, 0.3), cex=1, font=2))

Homework 5: Free recall

Load data file

setwd("/Users/haolunfu/Documents/資料管理/week6/")
dta_10_2 <- read.table("Murd62/fr10-2.txt", sep = "", fill=T, col.names = paste("V", 1:15), na.strings = 88, nrow=1200)
dta_15_2 <- read.table("Murd62/fr15-2.txt", sep = "", fill=T, col.names = paste("V", 1:15), na.strings = 88)
dta_20_1 <- read.table("Murd62/fr20-1.txt", sep = "", fill=T, col.names = paste("V", 1:15), na.strings = 88)
dta_20_2 <- read.table("Murd62/fr20-2.txt", sep = "", fill=T, col.names = paste("V", 1:15), na.strings = 88)
dta_30_1 <- read.table("Murd62/fr30-1.txt", sep = "", fill=T, col.names = paste("V", 1:15), na.strings = 88)
dta_40_1 <- read.table("Murd62/fr40-1.txt", sep = "", fill=T, col.names = paste("V", 1:15), na.strings = 88)

dta <- list(dta_10_2, dta_15_2, dta_20_1, dta_20_2, dta_30_1, dta_40_1)

Calculate the probability of measurements by induvidual

dta_prob <- lapply(dta, function(x)
  {x <- stack(x)
  table(x$values) / 1200})

dta_n <- data.frame(index = c(), probability = c())

name <- c("10-2", "15-2", "20-1", "20-2", "30-1", "40-1")

for (i in 1:length(dta_prob)) {
  n <- unlist(dta_prob[i])
  dta_n <- rbind(data.frame(Index = names(n), Probability = n, Group = name[i], stringsAsFactors = F), dta_n)
}

Plot

plot(dta_n$Index, dta_n$Probability, type = "o", xlim = c(0, 40),
     ylim = c(0, 1))

Outlier exclusion

dta_n <- dta_n[!(dta_n$Index == 16 & dta_n$Group == "15-2"), ]
dta_n <- dta_n[!(dta_n$Index == 31 & dta_n$Group == "30-1"), ]
dta_n <- dta_n[!(dta_n$Index == 41 & dta_n$Group == "40-1"), ]
dta_n <- dta_n[!(dta_n$Index == 50 & dta_n$Group == "40-1"), ]

Solution 1:

Plot

plot(dta_n$Index, dta_n$Probability, type="o",
     xlim = c(0, 40), ylim = c(0, 1), axes = F, 
     xlab = "SERIAL POSITION",
     ylab = "PROBABILITY OF RECALL")
axis(1, 0:40)
axis(2, seq(0, 1, 0.05))

# add label names
text(2, 0.9, "10-2")
lines(c(2, 6), c(0.85, 0.6))
text(16, 0.7, "15-2")
lines(c(14, 12.5), c(0.7, 0.7))
text(11, 0.41, "20-2")
lines(c(11, 13), c(0.39, 0.3))
text(19, 0.25, "20-1")
lines(c(19, 16), c(0.27, 0.35))
text(21, 0.55, "30-1")
lines(c(21, 26), c(0.52, 0.39))
text(33, 0.8, "40-1")
lines(c(33, 37.5), c(0.75, 0.59))

Solution 2:

plot(dta_n$Index, dta_n$Probability, type="p",
     xlim = c(0, 40), ylim = c(0, 1), axes = F, 
     xlab = "SERIAL POSITION",
     ylab = "PROBABILITY OF RECALL")

axis(1, 0:40)
axis(2, seq(0, 1, 0.05))

for(i in name){
  with(dta_n[dta_n$Group==i,], 
       lines(dta_n$Index, dta_n$Probability), pch=1)}

Week 6 Homework (R base)

Hao-Lun Fu

2020-04-20

Homework 1: Split the plot region to include histograms on the margins of a scatter diagram using the Galton{HistData} data set.

Create a 2*2 matrix and use layout function to set the widths and heights.

plot the histogram of parent and child, respectively.

calculate the sample size (maximum) of parent and child.

set up the plot margin first, then plot the sunflowerplot of children by parents.

plot the histogram of parent (x axis) and child (y axis).

add names of x axis and y axis

Homework 2: Age and Suicide by Country

Load data file

Transform the data frame from wide to long format

Change names of age

Note. Actually, I did not really know how this work, the code chunk was I refered from my classmate (Zhe Sun). Thanks him!

Plot the boxplot of suicide by age.

Homework 3: Histogram of IQ for each of 5 Classes with over 30 Pupil

Load data file

find the class which over 30 pupils

select the class which over 30 pupils and spilt to 5 list for plot preparation.

Plot the histogram of IQ by classes

Homework 4: SAT and GPA scores in colleges

Plot a scatter plot

Homework 5: Free recall

Load data file

Calculate the probability of measurements by induvidual

Plot

Outlier exclusion

Solution 1:

Plot

Solution 2:

I don’t know why the lines or segments extend to wrong positions (e.g., from 10-2 to 15-2), even though I tried another classmates’ code.