1. R Markdown là gì?

Bài tập này thực hành trên file R Markdown: Đây là định dạng trên R để viết các báo cáo (xuất file dưới dạng Word, .pdf (nếu máy tình cài Latex), html file và các định dạng khác). Tham khảo cheasheat về R Markdown tại link https://rstudio.com/wp-content/uploads/2015/02/rmarkdown-cheatsheet.pdf .

Một số lựa chọn về hiện R code và kết quả trong báo cáo:

2.Ước lượng khoảng

Bước 0: cài đặt WD và load dữ liệu

Cài đặt working directory để chỉ folder làm việc cho R và load dữ liệu

rm(list=ls())
load("~/Dropbox/Research/R-Viasm/data/DienThoaiDiDong.rda")
str(DuLieu)
## 'data.frame':    2000 obs. of  10 variables:
##  $ GioiTinh        : Factor w/ 2 levels "Nam","Nu": 1 1 2 1 2 2 2 2 2 1 ...
##  $ TinhTrangHonNhan: Factor w/ 3 levels "Chua lap gia dinh",..: 3 3 1 2 2 3 1 3 3 3 ...
##  $ NgheNghiep      : Factor w/ 9 levels "Cong nhan","DKLD dac biet",..: 3 8 1 7 7 8 3 1 3 5 ...
##  $ KhuVuc          : Factor w/ 3 levels "Bac","Nam","Trung": 1 1 1 1 3 2 1 2 1 2 ...
##  $ HinhThuc        : Factor w/ 2 levels "Tra sau","Tra truoc": 2 2 2 2 2 2 1 2 1 2 ...
##  $ Mang            : Factor w/ 7 levels "Beeline","EVN Telecom",..: 6 1 1 7 7 1 7 7 7 6 ...
##  $ Jan             : num  67 136 4 80 196 251 108 73 117 69 ...
##  $ Feb             : num  180 21 0 230 155 79 199 173 237 205 ...
##  $ Mar             : num  142 157 302 182 234 80 292 247 265 147 ...
##  $ April           : num  150 31 0 122 209 42 204 172 138 203 ...
dim(DuLieu)
## [1] 2000   10
names(DuLieu)
##  [1] "GioiTinh"         "TinhTrangHonNhan" "NgheNghiep"       "KhuVuc"          
##  [5] "HinhThuc"         "Mang"             "Jan"              "Feb"             
##  [9] "Mar"              "April"
attach(DuLieu)

Xem 5 dòng đầu tiên và 10 cột đầu của số liệu DuLieu

head(DuLieu)
##   GioiTinh  TinhTrangHonNhan NgheNghiep KhuVuc  HinhThuc      Mang Jan Feb Mar
## 1      Nam  Dang co gia dinh Hanh chinh    Bac Tra truoc   Viettel  67 180 142
## 2      Nam  Dang co gia dinh       TXTN    Bac Tra truoc   Beeline 136  21 157
## 3       Nu Chua lap gia dinh  Cong nhan    Bac Tra truoc   Beeline   4   0 302
## 4      Nam         Da ly hon       TXCN    Bac Tra truoc VinaPhone  80 230 182
## 5       Nu         Da ly hon       TXCN  Trung Tra truoc VinaPhone 196 155 234
## 6       Nu  Dang co gia dinh       TXTN    Nam Tra truoc   Beeline 251  79  80
##   April
## 1   150
## 2    31
## 3     0
## 4   122
## 5   209
## 6    42

Có thể đặt lệnh để các dòng kí tự và kết quả trên không hiện ra trong báo cáo!!!

Tương tự, xem thông tin 5 dòng đầu và một số cột của DuLieu

head(DuLieu[, c(1:3, 5:7)])
##   GioiTinh  TinhTrangHonNhan NgheNghiep  HinhThuc      Mang Jan
## 1      Nam  Dang co gia dinh Hanh chinh Tra truoc   Viettel  67
## 2      Nam  Dang co gia dinh       TXTN Tra truoc   Beeline 136
## 3       Nu Chua lap gia dinh  Cong nhan Tra truoc   Beeline   4
## 4      Nam         Da ly hon       TXCN Tra truoc VinaPhone  80
## 5       Nu         Da ly hon       TXCN Tra truoc VinaPhone 196
## 6       Nu  Dang co gia dinh       TXTN Tra truoc   Beeline 251

Ước lượng khoảng cho trung bình

t.test(Jan,conf.level=0.9)
## 
##  One Sample t-test
## 
## data:  Jan
## t = 103.98, df = 1999, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 90 percent confidence interval:
##  120.6193 124.4987
## sample estimates:
## mean of x 
##   122.559

Kết luận: Khoảng ước lượng độ tin cậy 90% cho Jan là (120.6193; 124.4987)

Uoc luong khoang cho tdt trung binh thang 2

Ket qua: (157.8417; 162.8573)

Ước lượng khoảng cho tỉ lệ

Ước lượng khoảng với độ tin cậy 95% cho tỉ lệ người dùng hơn 150 nghìn trong tháng 1

sum(Jan>150)
## [1] 603
length(Jan)
## [1] 2000
prop.test(n=2000,x=603,correct=F,conf.level=0.95)
## 
##  1-sample proportions test without continuity correction
## 
## data:  603 out of 2000, null probability 0.5
## X-squared = 315.22, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.2817840 0.3219771
## sample estimates:
##      p 
## 0.3015

Kết quả: (0.2817840; 0.3219771)

3.Kiểm định trung bình

3.1 Kiểm đinh một trung bình (phân phối chuẩn, hoặc cỡ mẫu đủ lớn)

t.test(Jan,mu=130,alternative = 'less')
## 
##  One Sample t-test
## 
## data:  Jan
## t = -6.313, df = 1999, p-value = 1.681e-10
## alternative hypothesis: true mean is less than 130
## 95 percent confidence interval:
##      -Inf 124.4987
## sample estimates:
## mean of x 
##   122.559

Kết luận: p-value = 1.681e-10

3.2 Kiểm đinh một trung bình (phân phối chuẩn, phương sai đã biết)

library(BSDA)
## Loading required package: lattice
## 
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
## 
##     Orange
z.test(Jan,sigma.x=11,mu=130,alternative = 'less')
## 
##  One-sample z-Test
## 
## data:  Jan
## z = -30.252, p-value < 2.2e-16
## alternative hypothesis: true mean is less than 130
## 95 percent confidence interval:
##        NA 122.9636
## sample estimates:
## mean of x 
##   122.559

Kết luận: p-value < 2.2e-16

3.3 Kiểm định trung bình, mẫu thứ cấp

tb=mean(Jan)
tb
## [1] 122.559
dlc=sd(Jan)
dlc
## [1] 52.71223
n=length(Jan)
n
## [1] 2000
library(BSDA)
zsum.test(n.x=n,mean.x=tb,sigma.x=11,mu=130,alternative = 'less')
## 
##  One-sample z-Test
## 
## data:  Summarized x
## z = -30.252, p-value < 2.2e-16
## alternative hypothesis: true mean is less than 130
## 95 percent confidence interval:
##        NA 122.9636
## sample estimates:
## mean of x 
##   122.559
tsum.test(mean.x = tb, s.x=dlc, n.x=n,mu=130,alt='l' )
## Warning in tsum.test(mean.x = tb, s.x = dlc, n.x = n, mu = 130, alt = "l"):
## argument 'var.equal' ignored for one-sample test.
## 
##  One-sample t-Test
## 
## data:  Summarized x
## t = -6.313, df = 1999, p-value = 1.681e-10
## alternative hypothesis: true mean is less than 130
## 95 percent confidence interval:
##        NA 124.4987
## sample estimates:
## mean of x 
##   122.559

3.4 Kiem dinh trung binh hai tong the (lay mau theo doi)

Kiem dinh xem tb thang 1 co nho hon thang 2 ko (Ho: M1-M2=0, H1: M1-M2<0)

t.test(Jan,Feb, mu=0, alt='l', paired = T)
## 
##  Paired t-test
## 
## data:  Jan and Feb
## t = -21.964, df = 1999, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -34.95905
## sample estimates:
## mean of the differences 
##                -37.7905
t.test(Feb~GioiTinh,alt='g',var.equal=T)
## 
##  Two Sample t-test
## 
## data:  Feb by GioiTinh
## t = 0.66796, df = 1998, p-value = 0.2521
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -2.520812       Inf
## sample estimates:
## mean in group Nam  mean in group Nu 
##          161.1021          159.3799
Feb.nam=Feb[GioiTinh=='Nam']
Feb.nu=Feb[GioiTinh=='Nu']
t.test(Feb.nam,Feb.nu,mu=0,alt='g',var.equal = T)
## 
##  Two Sample t-test
## 
## data:  Feb.nam and Feb.nu
## t = 0.66796, df = 1998, p-value = 0.2521
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -2.520812       Inf
## sample estimates:
## mean of x mean of y 
##  161.1021  159.3799

4. Phan tich phuong sai

anova(lm(Feb~KhuVuc))
## Analysis of Variance Table
## 
## Response: Feb
##             Df  Sum Sq Mean Sq F value Pr(>F)
## KhuVuc       2   14160  7080.2  2.1674 0.1147
## Residuals 1997 6523426  3266.6

So sanh nhieu trung vi

kruskal.test(Feb~KhuVuc)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Feb by KhuVuc
## Kruskal-Wallis chi-squared = 5.4038, df = 2, p-value = 0.06708

Phan tich sau

anova(lm(Feb~Mang))
## Analysis of Variance Table
## 
## Response: Feb
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## Mang         6 1375351  229225  88.498 < 2.2e-16 ***
## Residuals 1993 5162235    2590                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(aov(lm(Feb~Mang)))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = lm(Feb ~ Mang))
## 
## $Mang
##                                 diff         lwr       upr     p adj
## EVN Telecom-Beeline        54.817308   15.848480  93.78614 0.0006811
## Mobiphone-Beeline         116.852592   85.552142 148.15304 0.0000000
## S-Phone-Beeline            57.125000   16.285552  97.96445 0.0007520
## Vietnamobile-Beeline        8.149390  -30.455660  46.75444 0.9960902
## Viettel-Beeline            95.463782   64.261365 126.66620 0.0000000
## VinaPhone-Beeline         131.572833  100.327332 162.81833 0.0000000
## Mobiphone-EVN Telecom      62.035285   37.172581  86.89799 0.0000000
## S-Phone-EVN Telecom         2.307692  -33.835030  38.45041 0.9999963
## Vietnamobile-EVN Telecom  -46.667917  -80.265324 -13.07051 0.0008486
## Viettel-EVN Telecom        40.646474   15.907302  65.38565 0.0000275
## VinaPhone-EVN Telecom      76.755525   51.962036 101.54902 0.0000000
## S-Phone-Mobiphone         -59.727592  -87.430390 -32.02479 0.0000000
## Vietnamobile-Mobiphone   -108.703202 -132.991766 -84.41464 0.0000000
## Viettel-Mobiphone         -21.388811  -29.943064 -12.83456 0.0000000
## VinaPhone-Mobiphone        14.720241    6.010144  23.43034 0.0000137
## Vietnamobile-S-Phone      -48.975610  -84.725809 -13.22541 0.0010728
## Viettel-S-Phone            38.338782   10.746797  65.93077 0.0008438
## VinaPhone-S-Phone          74.447833   46.807136 102.08853 0.0000000
## Viettel-Vietnamobile       87.314391   63.152294 111.47649 0.0000000
## VinaPhone-Vietnamobile    123.423443   99.205733 147.64115 0.0000000
## VinaPhone-Viettel          36.109051   27.758100  44.46000 0.0000000

So sanh hai phuong sai

var.test(Feb.nam,Feb.nu)
## 
##  F test to compare two variances
## 
## data:  Feb.nam and Feb.nu
## F = 1.0367, num df = 1125, denom df = 873, p-value = 0.5743
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.914343 1.174258
## sample estimates:
## ratio of variances 
##           1.036701

So sanh nhieu phuong sai

bartlett.test(Feb~KhuVuc)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  Feb by KhuVuc
## Bartlett's K-squared = 0.43232, df = 2, p-value = 0.8056
bartlett.test(Feb~Mang)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  Feb by Mang
## Bartlett's K-squared = 7.6407, df = 6, p-value = 0.2656

5. Kiem dinh tinh doc lap giua hai bien dinh tinh

table(GioiTinh,NgheNghiep)
##         NgheNghiep
## GioiTinh Cong nhan DKLD dac biet Hanh chinh HSSV Ky thuat NCKH TXCN TXTN VHNT
##      Nam       171            38        186   50      179   38  183  176  105
##      Nu        148            17        143   35      138   21  148  140   84
chisq.test(table(GioiTinh,NgheNghiep))
## 
##  Pearson's Chi-squared test
## 
## data:  table(GioiTinh, NgheNghiep)
## X-squared = 6.6336, df = 8, p-value = 0.5766
chisq.test(table(Mang,NgheNghiep))
## Warning in chisq.test(table(Mang, NgheNghiep)): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(Mang, NgheNghiep)
## X-squared = 47.384, df = 48, p-value = 0.498

6. Kiểm định về phân phối tổng thể

kiem dinh Bowman-Shelton (Kiểm định phân phối chuẩn)

library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
x=rnorm(1000,12,1)
jarque.bera.test(x)
## 
##  Jarque Bera Test
## 
## data:  x
## X-squared = 0.15832, df = 2, p-value = 0.9239
?jarque.bera.test

Kiem dinh Kolmogorov-Smirnov

library(stats)
x=rnorm(10000,12,1)
ks.test(x, "pnorm", 12, 1) 
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  x
## D = 0.0070148, p-value = 0.7088
## alternative hypothesis: two-sided
ks.test(x,"pexp",12)
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  x
## D = 1, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(x, "pnorm", 1, 1) 
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  x
## D = 1, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(x, "pnorm",mean(x),sd(x)) 
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  x
## D = 0.0060458, p-value = 0.8582
## alternative hypothesis: two-sided
jarque.bera.test(x)
## 
##  Jarque Bera Test
## 
## data:  x
## X-squared = 3.0498, df = 2, p-value = 0.2176
#shapiro.test(x)
?ks.test

#Goi fBasics

library(fBasics)
## Loading required package: timeDate
## Loading required package: timeSeries
?ksnormTest
x=rnorm(10000,12,1)
ks.test(x, "pnorm",mean(x),sd(x)) 
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  x
## D = 0.0053112, p-value = 0.9405
## alternative hypothesis: two-sided
ksnormTest(x)
## 
## Title:
##  One-sample Kolmogorov-Smirnov test
## 
## Test Results:
##   STATISTIC:
##     D: 1
##   P VALUE:
##     Alternative Two-Sided: < 2.2e-16 
##     Alternative      Less: < 2.2e-16 
##     Alternative   Greater: 1 
## 
## Description:
##  Wed Sep 15 11:56:18 2021 by user:

Chu y kiem dinh ks.test chi nen dung cho truong hop phan phoi lien tuc

x=rpois(1000,12)
ks.test(x, "ppois", 12) 
## Warning in ks.test(x, "ppois", 12): ties should not be present for the
## Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  x
## D = 0.1096, p-value = 7.377e-11
## alternative hypothesis: two-sided

Kiem dinh ks.test cho hai tong the

x=rnorm(1000,12,1)
y=rnorm(1000,12,1)
ks.test(x, y) 
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.031, p-value = 0.7226
## alternative hypothesis: two-sided

#so sanh 3 loai kiem dinh khi ket qua

x=runif(1000,min=10,max=30)
jarque.bera.test(x)
## 
##  Jarque Bera Test
## 
## data:  x
## X-squared = 56.076, df = 2, p-value = 6.657e-13
ks.test(x,'pnorm',mean(x),sd(x))
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  x
## D = 0.06127, p-value = 0.001097
## alternative hypothesis: two-sided
shapiro.test(x)
## 
##  Shapiro-Wilk normality test
## 
## data:  x
## W = 0.95811, p-value = 2.578e-16

Su dung Q-Q plot so sanh su tuong dong giua hai phan phoi

x=rnorm(1000,12,1)
y=rnorm(1000,12,1)
par(mfrow=c(2,2))
qqplot(x,y)

x=runif(1000,12,111)
y=rnorm(1000,12,1)
qqplot(x,y)

x=rnorm(1000,112,1)
y=rnorm(1000,112,1)
qqplot(x,y)

x=rnorm(1000,112,50)
qqnorm((x-mean(x))/sd(x))

ks.test(x,y)
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.487, p-value < 2.2e-16
## alternative hypothesis: two-sided
y=rnorm(1000,1,1)
qqplot(x,y)

ks.test(x,y)
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.991, p-value < 2.2e-16
## alternative hypothesis: two-sided

Su dung Q-Q plot so sanh phan phoi chuan;

qqplot
## function (x, y, plot.it = TRUE, xlab = deparse1(substitute(x)), 
##     ylab = deparse1(substitute(y)), ...) 
## {
##     sx <- sort(x)
##     sy <- sort(y)
##     lenx <- length(sx)
##     leny <- length(sy)
##     if (leny < lenx) 
##         sx <- approx(1L:lenx, sx, n = leny)$y
##     if (leny > lenx) 
##         sy <- approx(1L:leny, sy, n = lenx)$y
##     if (plot.it) 
##         plot(sx, sy, xlab = xlab, ylab = ylab, ...)
##     invisible(list(x = sx, y = sy))
## }
## <bytecode: 0x7fa733bd6cd0>
## <environment: namespace:stats>
par(mfrow=c(1,3))
qqnorm
## function (y, ...) 
## UseMethod("qqnorm")
## <bytecode: 0x7fa72f13c6b0>
## <environment: namespace:stats>
x=rnorm(1000,0,1)
qqnorm(x)

x=runif(1000,12,111)
qqnorm(x,main = 'phan phoi deu')

x=rpois(1000,11)
qqnorm(x,main = 'Phan phoi Poisson')

x=rnorm(1000,12,2)
qqnorm(x,main = 'phan phoi chuan')