example("ToothGrowth")
##
## TthGrw> require(graphics)
##
## TthGrw> coplot(len ~ dose | supp, data = ToothGrowth, panel = panel.smooth,
## TthGrw+ xlab = "ToothGrowth data: length vs dose, given type of supplement")
# import data into the global environment
data("ToothGrowth")
# format the name of data object for consistency
dta_toothGrowth <- ToothGrowth
remove(ToothGrowth)
# 60 rows * 3 cols data frame
dim(dta_toothGrowth)
## [1] 60 3
# variable names
names(dta_toothGrowth)
## [1] "len" "supp" "dose"
# first 6 rows of the data frame
head(dta_toothGrowth)
## len supp dose
## 1 4.2 VC 0.5
## 2 11.5 VC 0.5
## 3 7.3 VC 0.5
## 4 5.8 VC 0.5
## 5 6.4 VC 0.5
## 6 10.0 VC 0.5
# ignore variable "dose"
dta_toothGrowth <- dta_toothGrowth[,-3]
# first 6 rows of the data frame
head(dta_toothGrowth)
## len supp
## 1 4.2 VC
## 2 11.5 VC
## 3 7.3 VC
## 4 5.8 VC
## 5 6.4 VC
## 6 10.0 VC
# mean
aggregate(len ~ supp, mean, data=dta_toothGrowth)
## supp len
## 1 OJ 20.66333
## 2 VC 16.96333
# sd
aggregate(len ~ supp, sd, data=dta_toothGrowth)
## supp len
## 1 OJ 6.605561
## 2 VC 8.266029
# number of cases
aggregate(len ~ supp, length, data=dta_toothGrowth)
## supp len
## 1 OJ 30
## 2 VC 30
lattice::bwplot(len ~ supp, data = dta_toothGrowth,
main="Distribution of length of tooth growth by group",
xlab="type of vitamin supplement", ylab="length of tooth growth")
t.test(len ~ supp, data = dta_toothGrowth)
##
## Welch Two Sample t-test
##
## data: len by supp
## t = 1.9153, df = 55.309, p-value = 0.06063
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1710156 7.5710156
## sample estimates:
## mean in group OJ mean in group VC
## 20.66333 16.96333
multicon::diffPlot(len ~ supp,
data = dta_toothGrowth,
grp.names=c("OJ", "VC"),
xlab="type of vitamin supplement",
ylab="mean length of tooth growth")
with(dta_toothGrowth,
lattice::histogram(~ len | supp, type="density",
xlab="length of tooth growth"))
# probabilistic method
set.seed(314)
die1 <- sample(1:6, size = 10000, replace = T)
die2 <- sample(1:6, size = 10000, replace = T)
die3 <- sample(1:6, size = 10000, replace = T)
sum <- die1 + die2 + die3
# compute all possible sums by directly searching
# # create three dice objects
# die1 <- 1:6
# die2 <- 1:6
# die3 <- 1:6
#
# # create an empty vector for storing each computation
# sum <- c()
#
# # strating index of vector sum
# n = 1
#
# # compute all possible results using nested for loop
# for (a in die1) {
# for (b in die2) {
# for (c in die3) {
# if (n > 216) {
# break
# } else {
# sum[n] <- die1[a] + die2[b] + die3[c]
# n <- n + 1
# }
# }
# }
# }
# get the numbers of unique results of sum
length(unique(sum))
## [1] 16
There are 16 possible sums from rolling three dice.
hist(sum)
The first program generated pseudo random process to compute all possible sums. therefore, it is a empirical histogram.
The second program which is commented above will generate a probability histogram because neither have the computer rolled 3 dice nor there was a pseudo random process.
#
# An R script for IQ_Beh data set
#
# Read file "IQ_Beh.txt" into the global environment as a data object called "dta". The paramenter "header = T" tells R that the first row is the name of each column
dta <- read.table("IQ_Beh.txt", header = T, row.names = 1)
# 1. display the internal structure of an R object, in this case "dta"
# 2. display the class of "dta"
# 3. display the dimention of "dta", which is 94 * 3
# 4. display the type of each column and 10 example entries for each column
str(dta)
## 'data.frame': 94 obs. of 3 variables:
## $ Dep: Factor w/ 2 levels "D","N": 2 2 2 2 1 2 2 2 2 2 ...
## $ IQ : int 103 124 124 104 96 92 124 99 92 116 ...
## $ BP : int 4 12 9 3 3 3 6 4 3 9 ...
# display the first 6 rows of "dta" and the names of each column
head(dta)
## Dep IQ BP
## 1 N 103 4
## 2 N 124 12
## 3 N 124 9
## 4 N 104 3
## 5 D 96 3
## 6 N 92 3
# display the class of "dta" which is data.frame
class(dta)
## [1] "data.frame"
# display the dimention of "dta" which is 94 * 3
dim(dta)
## [1] 94 3
# display column names of each column from "dta"
names(dta)
## [1] "Dep" "IQ" "BP"
# ask R whether the coulumn BP of dta a vector
is.vector(dta$BP)
## [1] TRUE
# display the first row of dta and the names of each column
dta[1, ]
## Dep IQ BP
## 1 N 103 4
# display the first three elements in vector IQ, which is a column in dta
dta[1:3, "IQ"]
## [1] 103 124 124
# display 6 rows which have 6 largest numbers of BP in an increasing order
tail(dta[order(dta$BP), ])
## Dep IQ BP
## 16 N 89 11
## 58 N 117 11
## 66 N 126 11
## 2 N 124 12
## 73 D 99 13
## 12 D 22 17
# display 4 rows which have 4 smallest numbers of BP in an descending order
tail(dta[order(-dta$BP), ], 4)
## Dep IQ BP
## 77 N 124 1
## 80 N 121 1
## 24 N 106 0
## 75 N 122 0
# draw a histogram of column "IQ"
with(dta, hist(IQ, xlab = "IQ", main = ""))
# draw a grouped boxplot (BP ~ Dep)
boxplot(BP ~ Dep, data = dta,
xlab = "Depression",
ylab = "Behavior problem score")
# draw a scatter plot (IQ by BP)
# specified dep by different colors
# choose solid point to display a data point
# set background to grid
plot(IQ ~ BP, data = dta, pch = 20, col = dta$Dep,
xlab = "Behavior problem score", ylab = "IQ")
grid()
# draw a scatter plot (BP by IQ) specified different groups by letters "D" and "N"
# draw a regression line of group "D"
# draw a regression line of group "N" and specified it by dashed line
plot(BP ~ IQ, data = dta, type = "n",
ylab = "Behavior problem score", xlab = "IQ")
text(dta$IQ, dta$BP, labels = dta$Dep, cex = 0.5)
abline(lm(BP ~ IQ, data = dta, subset = Dep == "D"))
abline(lm(BP ~ IQ, data = dta, subset = Dep == "N"), lty = 2)
## end
# read data
dta_usBirths2015 <- read.table("usBirths2015.txt", header = T)
dta_usBirths2015
## birth month
## 1 325955 January
## 2 298058 February
## 3 328923 March
## 4 320832 April
## 5 327917 May
## 6 330541 June
## 7 353415 July
## 8 351791 August
## 9 347516 September
## 10 339007 October
## 11 318820 November
## 12 335722 December
# create a new column named spring whose data is tansformed by column month
dta_usBirths2015$season[1:3] <- "spring"
dta_usBirths2015$season[4:6] <- "summer"
dta_usBirths2015$season[7:9] <- "fall"
dta_usBirths2015$season[10:12] <- "winter"
aggregate(data=dta_usBirths2015, birth ~ season, FUN = "sum")
## season birth
## 1 fall 1052722
## 2 spring 952936
## 3 summer 979290
## 4 winter 993549
# tidyverse version
# library(tidyverse)
# dta_usBirths2015 %>%
# group_by(season) %>%
# summarise(birth_by_season = sum(birth)) %>%
# arrange(birth_by_season)
This table shows the number of birth in descending order by seasons.
dta_readingTime <- read.table("readingtimes.txt", header = T)
head(dta_readingTime)
## Snt Sp Wrds New S01 S02 S03 S04 S05 S06 S07 S08 S09 S10
## 1 1 1 13 1 3.429 2.795 4.161 3.071 3.625 3.161 3.232 7.161 1.536 4.063
## 2 2 2 16 3 6.482 5.411 4.491 5.063 9.295 5.643 8.357 4.313 2.946 6.652
## 3 3 3 9 2 1.714 2.339 3.018 2.464 6.045 2.455 4.920 3.366 1.375 2.179
## 4 4 4 9 2 3.679 3.714 2.866 2.732 4.205 6.241 3.723 6.330 1.152 3.661
## 5 5 5 10 3 4.000 2.902 2.991 2.670 3.884 3.223 3.143 6.143 2.759 3.330
## 6 6 6 18 4 6.973 8.018 6.625 7.571 8.795 13.188 11.170 6.071 7.964 7.866
str(dta_readingTime)
## 'data.frame': 7 obs. of 14 variables:
## $ Snt : int 1 2 3 4 5 6 7
## $ Sp : int 1 2 3 4 5 6 7
## $ Wrds: int 13 16 9 9 10 18 6
## $ New : int 1 3 2 2 3 4 1
## $ S01 : num 3.43 6.48 1.71 3.68 4 ...
## $ S02 : num 2.79 5.41 2.34 3.71 2.9 ...
## $ S03 : num 4.16 4.49 3.02 2.87 2.99 ...
## $ S04 : num 3.07 5.06 2.46 2.73 2.67 ...
## $ S05 : num 3.62 9.29 6.04 4.21 3.88 ...
## $ S06 : num 3.16 5.64 2.46 6.24 3.22 ...
## $ S07 : num 3.23 8.36 4.92 3.72 3.14 ...
## $ S08 : num 7.16 4.31 3.37 6.33 6.14 ...
## $ S09 : num 1.54 2.95 1.38 1.15 2.76 ...
## $ S10 : num 4.06 6.65 2.18 3.66 3.33 ...
# get a vector that contains the total time each subject has used
time <- apply(dta_readingTime[,-c(1:4)], 2, sum)
# sort the result in an increasing order
sort(rank(time))
## S09 S03 S04 S02 S01 S10 S08 S07 S06 S05
## 1 2 3 4 5 6 7 8 9 10
# tidyverse version
# dta_readingTime_long <- dta_readingTime %>%
# pivot_longer(c(`S01`, `S02`, `S03`, `S04`,
# `S05`, `S06`, `S07`, `S08`,
# `S09`, `S10`),
# names_to = "subject",
# values_to = "time")
# head(dta_readingTime_long)
# sum(dta_readingTime$S01)
# averageSpeed_by_subject <- dta_readingTime_long %>%
# # compute reading speed (using formular speed = time / words) for each sentence
# mutate(speed = time / Wrds) %>%
# group_by(subject) %>%
# # compute averageSpeed for 7 sentences
# summarise(averageSpeed = mean(speed)) %>%
# # sort the results in an increasing order
# arrange(averageSpeed) %>%
# mutate(rank = rank(averageSpeed))
#
# averageSpeed_by_subject
This vector gives the information of the average speed of each subject in an increasing order.
# compute how many words are there
words <- sum(dta_readingTime$Wrds)
speed <- time / words
round(speed, 2)
## S01 S02 S03 S04 S05 S06 S07 S08 S09 S10
## 0.36 0.33 0.33 0.33 0.49 0.46 0.45 0.43 0.24 0.39
This vector displays that the reading speed of each subject.
# compute the average speed
round(mean(speed), 2)
## [1] 0.38
The average speed of these 10 subjects is 0.38.