Exercise-1

example("ToothGrowth")

## 
## TthGrw> require(graphics)
## 
## TthGrw> coplot(len ~ dose | supp, data = ToothGrowth, panel = panel.smooth,
## TthGrw+        xlab = "ToothGrowth data: length vs dose, given type of supplement")

Reading data

# import data into the global environment
data("ToothGrowth")

# format the name of data object for consistency
dta_toothGrowth <- ToothGrowth
remove(ToothGrowth)

Data dimensions and variable names

# 60 rows * 3 cols data frame
dim(dta_toothGrowth)

## [1] 60  3

# variable names
names(dta_toothGrowth)

## [1] "len"  "supp" "dose"

# first 6 rows of the data frame
head(dta_toothGrowth)

##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5
## 5  6.4   VC  0.5
## 6 10.0   VC  0.5

Data manipulation

# ignore variable "dose"
dta_toothGrowth <- dta_toothGrowth[,-3] 
# first 6 rows of the data frame
head(dta_toothGrowth)

##    len supp
## 1  4.2   VC
## 2 11.5   VC
## 3  7.3   VC
## 4  5.8   VC
## 5  6.4   VC
## 6 10.0   VC

Descriptive statistics

# mean
aggregate(len ~ supp, mean, data=dta_toothGrowth)

##   supp      len
## 1   OJ 20.66333
## 2   VC 16.96333

# sd
aggregate(len ~ supp, sd, data=dta_toothGrowth)

##   supp      len
## 1   OJ 6.605561
## 2   VC 8.266029

# number of cases
aggregate(len ~ supp, length, data=dta_toothGrowth)

##   supp len
## 1   OJ  30
## 2   VC  30

Exploratory data analysis (EDA)

lattice::bwplot(len ~ supp, data = dta_toothGrowth, 
                main="Distribution of length of tooth growth by group",
                xlab="type of vitamin supplement", ylab="length of tooth growth")

Comparing two means

t.test(len ~ supp, data = dta_toothGrowth)

## 
##  Welch Two Sample t-test
## 
## data:  len by supp
## t = 1.9153, df = 55.309, p-value = 0.06063
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1710156  7.5710156
## sample estimates:
## mean in group OJ mean in group VC 
##         20.66333         16.96333

Results

multicon::diffPlot(len ~ supp, 
                   data = dta_toothGrowth, 
                   grp.names=c("OJ", "VC"),
                   xlab="type of vitamin supplement", 
                   ylab="mean length of tooth growth")

Diagnostics

Checking Normality

with(dta_toothGrowth, 
     lattice::histogram(~ len | supp, type="density", 
                        xlab="length of tooth growth"))

Exercise-2

all possible sums

# probabilistic method
set.seed(314)
die1 <- sample(1:6, size = 10000, replace = T)
die2 <- sample(1:6, size = 10000, replace = T)
die3 <- sample(1:6, size = 10000, replace = T)
sum <- die1 + die2 + die3

# compute all possible sums by directly searching

# # create three dice objects
# die1 <- 1:6
# die2 <- 1:6
# die3 <- 1:6
# 
# # create an empty vector for storing each computation
# sum <- c()
# 
# # strating index of vector sum
# n = 1
# 
# # compute all possible results using nested for loop
# for (a in die1) {
#   for (b in die2) {
#      for (c in die3) {
#        if (n > 216) {
#          break
#        } else {
#         sum[n] <- die1[a] + die2[b] + die3[c]
#         n <- n + 1
#        }
#       }
#     }
#   }

# get the numbers of unique results of sum
length(unique(sum))

## [1] 16

There are 16 possible sums from rolling three dice.

histogram

hist(sum)

empirical histogram

The first program generated pseudo random process to compute all possible sums. therefore, it is a empirical histogram.

probability histogram

The second program which is commented above will generate a probability histogram because neither have the computer rolled 3 dice nor there was a pseudo random process.

Exercise-3

#
# An R script for IQ_Beh data set
#

# Read file "IQ_Beh.txt" into the global environment as a data object called "dta". The paramenter "header = T" tells R that the first row is the name of each column
dta <- read.table("IQ_Beh.txt", header = T, row.names = 1)

# 1. display the internal structure of an R object, in this case "dta"
# 2. display the class of "dta"
# 3. display the dimention of  "dta", which is 94 * 3
# 4. display the type of each column and 10 example entries for each column
str(dta)

## 'data.frame':    94 obs. of  3 variables:
##  $ Dep: Factor w/ 2 levels "D","N": 2 2 2 2 1 2 2 2 2 2 ...
##  $ IQ : int  103 124 124 104 96 92 124 99 92 116 ...
##  $ BP : int  4 12 9 3 3 3 6 4 3 9 ...

# display the first 6 rows of "dta" and the names of each column
head(dta)

##   Dep  IQ BP
## 1   N 103  4
## 2   N 124 12
## 3   N 124  9
## 4   N 104  3
## 5   D  96  3
## 6   N  92  3

# display the class of "dta" which is data.frame
class(dta)

## [1] "data.frame"

# display the dimention of "dta" which is 94 * 3
dim(dta)

## [1] 94  3

# display column names of each column from "dta"
names(dta)

## [1] "Dep" "IQ"  "BP"

# ask R whether the coulumn BP of dta a vector
is.vector(dta$BP)

## [1] TRUE

# display the first row of dta and the names of each column
dta[1, ]

##   Dep  IQ BP
## 1   N 103  4

# display the first three elements in vector IQ, which is a column in dta
dta[1:3, "IQ"]

## [1] 103 124 124

# display 6 rows which have 6 largest numbers of BP in an increasing order
tail(dta[order(dta$BP), ])

##    Dep  IQ BP
## 16   N  89 11
## 58   N 117 11
## 66   N 126 11
## 2    N 124 12
## 73   D  99 13
## 12   D  22 17

# display 4 rows which have 4 smallest numbers of BP in an descending order
tail(dta[order(-dta$BP), ], 4)

##    Dep  IQ BP
## 77   N 124  1
## 80   N 121  1
## 24   N 106  0
## 75   N 122  0

# draw a histogram of column "IQ"
with(dta, hist(IQ, xlab = "IQ", main = ""))

# draw a grouped boxplot (BP ~ Dep)
boxplot(BP ~ Dep, data = dta, 
        xlab = "Depression", 
        ylab = "Behavior problem score")

# draw a scatter plot (IQ by BP)
# specified dep by different colors
# choose solid point to display a data point
# set background to grid 
plot(IQ ~ BP, data = dta, pch = 20, col = dta$Dep, 
     xlab = "Behavior problem score", ylab = "IQ")
grid()

# draw a scatter plot (BP by IQ) specified different groups by letters "D" and "N"
# draw a regression line of group "D" 
# draw a regression line of group "N" and specified it by dashed line
plot(BP ~ IQ, data = dta, type = "n",
     ylab = "Behavior problem score", xlab = "IQ")
text(dta$IQ, dta$BP, labels = dta$Dep, cex = 0.5)
abline(lm(BP ~ IQ, data = dta, subset = Dep == "D"))
abline(lm(BP ~ IQ, data = dta, subset = Dep == "N"), lty = 2)

## end

Exercise-4

# read data
dta_usBirths2015 <- read.table("usBirths2015.txt", header = T)

dta_usBirths2015

##     birth     month
## 1  325955   January
## 2  298058  February
## 3  328923     March
## 4  320832     April
## 5  327917       May
## 6  330541      June
## 7  353415      July
## 8  351791    August
## 9  347516 September
## 10 339007   October
## 11 318820  November
## 12 335722  December

# create a new column named spring whose data is tansformed by column month
dta_usBirths2015$season[1:3] <- "spring"
dta_usBirths2015$season[4:6] <- "summer"
dta_usBirths2015$season[7:9] <- "fall"
dta_usBirths2015$season[10:12] <- "winter"

aggregate(data=dta_usBirths2015, birth ~ season, FUN = "sum")

##   season   birth
## 1   fall 1052722
## 2 spring  952936
## 3 summer  979290
## 4 winter  993549

# tidyverse version
# library(tidyverse)
# dta_usBirths2015 %>% 
#   group_by(season) %>% 
#   summarise(birth_by_season = sum(birth)) %>% 
#   arrange(birth_by_season)

This table shows the number of birth in descending order by seasons.

Exercise-5

dta_readingTime <- read.table("readingtimes.txt", header = T)
head(dta_readingTime)

##   Snt Sp Wrds New   S01   S02   S03   S04   S05    S06    S07   S08   S09   S10
## 1   1  1   13   1 3.429 2.795 4.161 3.071 3.625  3.161  3.232 7.161 1.536 4.063
## 2   2  2   16   3 6.482 5.411 4.491 5.063 9.295  5.643  8.357 4.313 2.946 6.652
## 3   3  3    9   2 1.714 2.339 3.018 2.464 6.045  2.455  4.920 3.366 1.375 2.179
## 4   4  4    9   2 3.679 3.714 2.866 2.732 4.205  6.241  3.723 6.330 1.152 3.661
## 5   5  5   10   3 4.000 2.902 2.991 2.670 3.884  3.223  3.143 6.143 2.759 3.330
## 6   6  6   18   4 6.973 8.018 6.625 7.571 8.795 13.188 11.170 6.071 7.964 7.866

str(dta_readingTime)

## 'data.frame':    7 obs. of  14 variables:
##  $ Snt : int  1 2 3 4 5 6 7
##  $ Sp  : int  1 2 3 4 5 6 7
##  $ Wrds: int  13 16 9 9 10 18 6
##  $ New : int  1 3 2 2 3 4 1
##  $ S01 : num  3.43 6.48 1.71 3.68 4 ...
##  $ S02 : num  2.79 5.41 2.34 3.71 2.9 ...
##  $ S03 : num  4.16 4.49 3.02 2.87 2.99 ...
##  $ S04 : num  3.07 5.06 2.46 2.73 2.67 ...
##  $ S05 : num  3.62 9.29 6.04 4.21 3.88 ...
##  $ S06 : num  3.16 5.64 2.46 6.24 3.22 ...
##  $ S07 : num  3.23 8.36 4.92 3.72 3.14 ...
##  $ S08 : num  7.16 4.31 3.37 6.33 6.14 ...
##  $ S09 : num  1.54 2.95 1.38 1.15 2.76 ...
##  $ S10 : num  4.06 6.65 2.18 3.66 3.33 ...

Rank subjects by their reading speeed

# get a vector that contains the total time each subject has used
time <- apply(dta_readingTime[,-c(1:4)], 2, sum)
# sort the result in an increasing order
sort(rank(time))

## S09 S03 S04 S02 S01 S10 S08 S07 S06 S05 
##   1   2   3   4   5   6   7   8   9  10

# tidyverse version
# dta_readingTime_long <- dta_readingTime %>% 
#   pivot_longer(c(`S01`, `S02`, `S03`, `S04`, 
#                  `S05`, `S06`, `S07`, `S08`,
#                  `S09`, `S10`), 
#                names_to = "subject",
#                values_to = "time")
# head(dta_readingTime_long)
# sum(dta_readingTime$S01)

#   averageSpeed_by_subject <- dta_readingTime_long %>% 
#     # compute reading speed (using formular speed = time / words) for each sentence
#     mutate(speed = time / Wrds) %>% 
#     group_by(subject) %>% 
#     # compute averageSpeed for 7 sentences
#     summarise(averageSpeed = mean(speed)) %>% 
#     # sort the results in an increasing order
#     arrange(averageSpeed) %>% 
#     mutate(rank = rank(averageSpeed))
# 
# averageSpeed_by_subject

This vector gives the information of the average speed of each subject in an increasing order.

Estimate, on average, how long does it take to read a word

# compute how many words are there
words <- sum(dta_readingTime$Wrds)
speed <- time / words
round(speed, 2)

##  S01  S02  S03  S04  S05  S06  S07  S08  S09  S10 
## 0.36 0.33 0.33 0.33 0.49 0.46 0.45 0.43 0.24 0.39

This vector displays that the reading speed of each subject.

# compute the average speed
round(mean(speed), 2)

## [1] 0.38

The average speed of these 10 subjects is 0.38.

Exercises-316

Zhe Sun

3/23/2020