1. Đọc dữ liệu, vẽ các biểu đồ

Dữ liệu hoa Iris và Caschool

library(datasets)
str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

Sepal: đài hoa
Petal: cánh hoa

Biểu đồ cho biến định tính

attach(iris)
table(Species)

## Species
##     setosa versicolor  virginica 
##         50         50         50

par(mfrow=c(1,2))
barplot(table(Species))
pie(table(Species))

Vẽ biểu đồ histogram

names(iris)

## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"

hist(Sepal.Length)

hist(Sepal.Length[Species=='setosa'])

hist(Sepal.Length[Species=='versicolor'])

hist(Sepal.Length[Species=='virginica'])

#Vẽ biểu đồ hộp

boxplot(Sepal.Length)

boxplot(Sepal.Length~Species)

Biểu đồ tán xạ

plot(Sepal.Length)

plot(Sepal.Length,Sepal.Width)

plot(Sepal.Length[Species=='setosa'],Sepal.Width[Species=='setosa'])

Dữ liệu đặc biệt kiểu .table

library(datasets)
Titanic

## , , Age = Child, Survived = No
## 
##       Sex
## Class  Male Female
##   1st     0      0
##   2nd     0      0
##   3rd    35     17
##   Crew    0      0
## 
## , , Age = Adult, Survived = No
## 
##       Sex
## Class  Male Female
##   1st   118      4
##   2nd   154     13
##   3rd   387     89
##   Crew  670      3
## 
## , , Age = Child, Survived = Yes
## 
##       Sex
## Class  Male Female
##   1st     5      1
##   2nd    11     13
##   3rd    13     14
##   Crew    0      0
## 
## , , Age = Adult, Survived = Yes
## 
##       Sex
## Class  Male Female
##   1st    57    140
##   2nd    14     80
##   3rd    75     76
##   Crew  192     20

Titanic.df=as.data.frame(Titanic)
Titanic.df

##    Class    Sex   Age Survived Freq
## 1    1st   Male Child       No    0
## 2    2nd   Male Child       No    0
## 3    3rd   Male Child       No   35
## 4   Crew   Male Child       No    0
## 5    1st Female Child       No    0
## 6    2nd Female Child       No    0
## 7    3rd Female Child       No   17
## 8   Crew Female Child       No    0
## 9    1st   Male Adult       No  118
## 10   2nd   Male Adult       No  154
## 11   3rd   Male Adult       No  387
## 12  Crew   Male Adult       No  670
## 13   1st Female Adult       No    4
## 14   2nd Female Adult       No   13
## 15   3rd Female Adult       No   89
## 16  Crew Female Adult       No    3
## 17   1st   Male Child      Yes    5
## 18   2nd   Male Child      Yes   11
## 19   3rd   Male Child      Yes   13
## 20  Crew   Male Child      Yes    0
## 21   1st Female Child      Yes    1
## 22   2nd Female Child      Yes   13
## 23   3rd Female Child      Yes   14
## 24  Crew Female Child      Yes    0
## 25   1st   Male Adult      Yes   57
## 26   2nd   Male Adult      Yes   14
## 27   3rd   Male Adult      Yes   75
## 28  Crew   Male Adult      Yes  192
## 29   1st Female Adult      Yes  140
## 30   2nd Female Adult      Yes   80
## 31   3rd Female Adult      Yes   76
## 32  Crew Female Adult      Yes   20

2. Mô phỏng dữ liệu

Phân phối nhị thức

set.seed(19)
x<- rbinom(100, 20, 0.5)
hist(x,xlim=c(0,20))

# Phân phối Poisson

x <- rpois(100, lambda=15)
hist(x)

# Phân phối mũ

x<- rexp(150, 0.1)
par(mfrow=c(1,2))
hist(x)
curve(dexp(x,10))

#Phân phối Chi bình phương

curve(dchisq(x, 1), xlim=c(0,10), ylim=c(0,0.6), col="red", lwd=3)
curve(dchisq(x, 2), add=T, col="green", lwd=3)
curve(dchisq(x, 3), add=T, col="blue", lwd=3)
curve(dchisq(x, 5), add=T, col="orange", lwd=3)
abline(h=0, lty=3)
legend(par("usr")[2], par("usr")[4],
xjust=1,
c("df=1", "df=2", "df=3", "df=5"), lwd=3, lty=1,
col=c("red", "green", "blue", "orange"))

# Phân phối Student

curve(dt(x, 1), xlim=c(-3,3), ylim=c(0,0.4), col="red", lwd=3)
curve(dt(x, 2), add=T, col="blue", lwd=3)
curve(dt(x, 5), add=T, col="green", lwd=3)
curve(dt(x, 10), add=T, col="orange", lwd=3)
curve(dnorm(x), add=T, lwd=4, lty=3)
title(main="Student T distributions")
legend(par("usr")[2], par("usr")[4],xjust=0.9,
c("df=1", "df=2", "df=5", "df=10", "Std.norm."),
lwd=c(2,2,2,2,2),
lty=c(1,1,1,1,3),
col=c("red", "blue", "green", "orange", par("fg")))

#Phân phối Fisher

curve(df(x,1,1), xlim=c(0,2), ylim=c(0,0.8), lwd=3)
curve(df(x,3,1), add=T)
curve(df(x,6,1), add=T, lwd=3)
curve(df(x,3,3), add=T, col="red")
curve(df(x,6,3), add=T, col="red", lwd=3)
curve(df(x,3,6), add=T, col="blue")
curve(df(x,6,6), add=T, col="blue", lwd=3)
title(main="Fisher F distributions")
legend(par("usr")[2], par("usr")[4],
xjust=1,
c("df=1,1", "df=3,1", "df=6,1", "df=3,3", "df=6,3",
"df=3,6", "df=6,6"),
lwd=c(1,1,3,1,3,1,3),
lty=c(2,1,1,1,1,1,1),
col=c(par("fg"), par("fg"), par("fg"), "red", "blue", "blue"))

#Dùng biểu đồ Q-Q plot so sánh sự tương đồng giữa các phân phối

#hai phân phối chuẩn
x=rnorm(1000,12,1)
y=rnorm(1000,12,1)
qqplot(x,y)

#PP chuẩn, PP đều

x=runif(1000,12,111)
y=rnorm(1000,12,1)
qqplot(x,y)

qqnorm(x)

qqnorm(y)

Một số biểu đồ thông dụng

Ngô Thị Thanh Nga -TLU