Biểu đồ phân bố với his()

Đọc dữ liệu PISA Data Vietnam 2015.csv và gọi là đối tượng pisa. Thử nghiệm với hist()

pisa=read.csv("/Users/admin/Desktop/Textbook/TDTU Datasets for 2020 Workshop/PISA Data Vietnam 2015.csv")
head(pisa)

##     School SchoolSize ClassSize STratio SchoolType  Area Region   Age Gender
## 1 70400001        883        18  22.075          3 URBAN  SOUTH 15.58   Boys
## 2 70400001        883        18  22.075          3 URBAN  SOUTH 15.92   Boys
## 3 70400001        883        18  22.075          3 URBAN  SOUTH 15.42  Girls
## 4 70400001        883        18  22.075          3 URBAN  SOUTH 15.58  Girls
## 5 70400001        883        18  22.075          3 URBAN  SOUTH 15.92  Girls
## 6 70400001        883        18  22.075          3 URBAN  SOUTH 16.25  Girls
##   PARED HISCED  WEALTH INSTSCIE JOYSCIE  ICTRES    Math    Read Science
## 1     9      2 -2.0697   0.9798  2.1635 -1.5244 439.923 412.290 475.612
## 2    12      4 -1.7903   1.7359  2.1635 -1.9305 406.251 409.598 450.320
## 3     9      2 -2.1942  -0.2063 -0.1808 -1.6093 414.369 384.307 405.787
## 4     5      1 -2.0301  -0.3115 -0.4318 -1.6250 468.801 459.104 462.968
## 5     9      2 -1.0522   0.7648  1.3031 -0.5305 355.432 402.435 453.736
## 6     5      1 -3.0570   0.3708  0.5094 -2.5873 458.955 483.885 529.866

hist(pisa$Math, col="green")

## Đổi tên hình và trục x

hist(pisa$Math, col="green", main="Distribution of Math Scores", xlab="Math Scores")

hist(pisa$Math, col="green", main="Phân bố điểm toán", xlab="Điểm khoa học", ylab="Tần số")

## Thêm đường biểu diễn (mật độ xác suất)

hist(pisa$Science, col="blue", border="white", prob=T)
lines(density(pisa$Science), col="black")

## Vẽ 2 phân bố cho nam và nữ

p1 = hist(pisa$Science[pisa$Gender=="Boys"], plot=F)
p2 = hist(pisa$Science[pisa$Gender=="Girls"], plot=F)
plot(p1, col="skyblue", border="white")
plot(p2, add=T, col=scales::alpha("green", 0.5), border="white")

### Ghi chú: plot=F –> không cho ra hình, add=T –> chồng hình lên nhau # Task 2: Biểu đồ phân bố với lattice() ## Vẽ biểu đồ mật độ xác suất theo Gender và Area

library(lattice)
densityplot(~pisa$Science, groups=Gender, data=pisa)

densityplot(~pisa$Science, groups=Area, data=pisa)

densityplot(~pisa$Science, groups=Area,data=pisa,auto.key=list(space="right"))

## Chia 3 cửa sổ (dùng packages lattice và gridExtra)

library("lattice")
p1 = densityplot(~Science, groups=Area, data=pisa, auto.key=list(space="top"))
p2 = densityplot(~Math, groups=Area, data=pisa)
p3 = densityplot(~Read, groups=Area, data=pisa)
library(gridExtra)
grid.arrange(p1, p2, p3, ncol=3)

# Task 3: Biểu đồ hộp với boxplot() ## Chia màn hình ra 3 cửa sổ

par(mfrow=c(1,3))
boxplot(pisa$Science, col="purple")
boxplot(pisa$Science ~ pisa$Gender, col=c("blue", "red")) 
boxplot(pisa$Science ~ pisa$Region, col=c("blue", "red", "purple"))

## Biểu đồ tương quan với plot() ### Biểu đồ tương quan giữa điểm toán và khoa học

plot(pisa$Science~ pisa$Math, col="blue")
plot(pisa$Science ~ pisa$Math, col="blue")
abline(lm(pisa$Science ~ pisa$Math), col="red")

# Phân tích biểu đồ chất lượng cao (ggplot2) # Task 5: Biểu đồ tương quan ## Đọc dữ liệu “obesity data” vào R và gọi đối tượng là “ob” ### > t=file.choose() ### >t ### [1] “/Users/admin/Desktop/Textbook/TDTU Datasets for 2020 Workshop/Obesity data.csv”

ob=read.csv("/Users/admin/Desktop/Textbook/TDTU Datasets for 2020 Workshop/Obesity data.csv")

Tạo ra biến mới gọi là “OB” dựa vào biến bmi: nếu bmi < 18.5 thì OB = “Underweight”; bmi từ 18.5 đến 24.9 thì OB=“Normal”; bmi từ 25.0 đến 29.9 thì OB=“Overweight”; bmi từ 30.0 trở lên, OB=“Obese”

ob$OB [ob$bmi < 18.5] = "Underweight"
ob$OB [ob$bmi >= 18.5 & ob$bmi < 24.9] = "Normal" 
ob$OB [ob$bmi >= 25.0 & ob$bmi < 29.9] = "Overweight" 
ob$OB [ob$bmi >= 30] = "Obese"
ob$OB = factor(ob$OB, levels=c("Underweight", "Normal", "Overweight", "Obese"))

Tìm hiểu phân bố của OB

library(ggplot2)
p = ggplot(data=ob, aes(OB, fill=OB)) + geom_bar()
p = p + xlab("Obesity group") + ylab("Frequency")
p + theme(legend.position="none")

# Task 6: Vẽ biểu đồ tương quan giữa weight và pcfat dùng ggplot2. ## Biểu đồ đơn giản

library(ggplot2); library(ggthemes); library(gridExtra)

p = ggplot(data=ob, aes(x=weight, y=pcfat))
p = p + geom_point() + geom_smooth()
p = p + xlab("Weight") + ylab("Percent body fat") + ggtitle("Weight and Percent Body Fat")
p = p + theme(plot.title=element_text(hjust=0.5)) 
p

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## Biểu đồ theo nhóm nam nữ

p = ggplot(data=ob, aes(x=weight, y=pcfat, fill=gender, col=gender))
p = p + geom_point() + geom_smooth()
p = p + xlab("Weight") + ylab("Percent body fat") + ggtitle("Weight and Percent Body Fat for men and women separately")
p = p + theme(plot.title=element_text(hjust=0.5))
p + theme_economist()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

### Có thể thử theme_tufte(), theme_few(), theme_wsj(), theme_clean(), theme_hc()

Biểu đồ tương quan + biểu đồ phân bố dùng package ggExtra

library(ggExtra)
p = ggplot(data=ob, aes(x=bmi, y=pcfat, fill=gender, col=gender))
p = p + geom_point() + geom_smooth()
ggMarginal(p, type="histogram", groupColour=T, groupFill=T)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

###Có thể thử nghiệm thêm với type (“density”, “violin”, “boxplot”)

Biểu đồ tương quan đa biến (package GGally)

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

dat = ob[, c("gender", "age", "bmi", "weight", "height", "pcfat")]
ggpairs(dat)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggpairs(data=dat, mapping = aes(color = gender))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggpairs(data=ob, mapping = aes(color = gender), columns = c("age", "weight", "bmi", "pcfat"))

# Task 7: Vẽ biểu đồ phân bố (histogram) ## Đọc dữ liệu “PISA Data Vietnam 2015.csv” và gọi là đối tượng “pisa”

pisa=read.csv("/Users/admin/Desktop/Textbook/TDTU Datasets for 2020 Workshop/PISA Data Vietnam 2015.csv")

Vẽ biểu đồ phân bố điểm môn khoa học

library(gridExtra)
p = ggplot(data=pisa, aes(x=Science))
p1 = p + geom_histogram(color="white", fill="blue")
p = ggplot(data=pisa, aes(x=Science))
p = p + geom_histogram(aes(y=..density..), color="white", fill="blue")
p2 = p + geom_density(col="red")
grid.arrange(p1, p2, ncol=2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Phân bố điểm môn khoa học theo vùng (Area)

p = ggplot(data=pisa, aes(x=Science, fill=Area))
p1 = p + geom_histogram(position="dodge")
p2 = ggplot(data=pisa, aes(x=Science, fill=Area, color=Area)) + geom_density(alpha = 0.1)
grid.arrange(p1, p2, nrow=2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Histogram và xác suất tích lũy

p = ggplot(data=pisa, aes(HISCED))
p = p + stat_ecdf(color="red", lwd=1)
p = p + geom_bar(aes(y = (..count..)/sum(..count..)), fill="blue", colour='yellow')
p

## Warning: Removed 14 rows containing non-finite values (stat_ecdf).

## Warning: Removed 14 rows containing non-finite values (stat_count).

# Task 8: Vẽ biểu đồ hộp (box plot) ## Biểu đồ hộp theo vùng

p = ggplot(data=pisa, aes(x=Area, y=Science, col=Area, fill=Area))
p1 = p + geom_boxplot(col="black")
p2 = p + geom_boxplot(col="black") + geom_jitter(alpha=0.05)
grid.arrange(p1, p2, ncol=2)

## Biểu đồ hộp theo kinh tế

p = ggplot(data=pisa, aes(x=PARED, y=Science, fill=PARED))
p1 = p + geom_boxplot(col="black") + geom_jitter(alpha=0.02)

p = ggplot(data=na.omit(pisa), aes(x=factor(PARED), y=PARED, fill=factor(PARED), color=factor(PARED)))
p2 = p + geom_boxplot(col="black") + geom_jitter(alpha=0.05)
grid.arrange(p1, p2, ncol=2)

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

## Warning: Removed 14 rows containing missing values (stat_boxplot).

## Warning: Removed 14 rows containing missing values (geom_point).

day 2

Le Ngoc Diep

January 6, 2020