Bài tập này thực hành trên file R Markdown: Đây là định dạng trên R để viết các báo cáo (xuất file dưới dạng Word, .pdf (nếu máy tình cài Latex), html file và các định dạng khác). Tham khảo cheasheat về R Markdown tại link https://rstudio.com/wp-content/uploads/2015/02/rmarkdown-cheatsheet.pdf .
Một số lựa chọn về hiện R code và kết quả trong báo cáo:
Cài đặt working directory để chỉ folder làm việc cho R và load dữ liệu
rm(list=ls())
load("~/Dropbox/Research/R-Viasm/data/DienThoaiDiDong.rda")
str(DuLieu)
## 'data.frame': 2000 obs. of 10 variables:
## $ GioiTinh : Factor w/ 2 levels "Nam","Nu": 1 1 2 1 2 2 2 2 2 1 ...
## $ TinhTrangHonNhan: Factor w/ 3 levels "Chua lap gia dinh",..: 3 3 1 2 2 3 1 3 3 3 ...
## $ NgheNghiep : Factor w/ 9 levels "Cong nhan","DKLD dac biet",..: 3 8 1 7 7 8 3 1 3 5 ...
## $ KhuVuc : Factor w/ 3 levels "Bac","Nam","Trung": 1 1 1 1 3 2 1 2 1 2 ...
## $ HinhThuc : Factor w/ 2 levels "Tra sau","Tra truoc": 2 2 2 2 2 2 1 2 1 2 ...
## $ Mang : Factor w/ 7 levels "Beeline","EVN Telecom",..: 6 1 1 7 7 1 7 7 7 6 ...
## $ Jan : num 67 136 4 80 196 251 108 73 117 69 ...
## $ Feb : num 180 21 0 230 155 79 199 173 237 205 ...
## $ Mar : num 142 157 302 182 234 80 292 247 265 147 ...
## $ April : num 150 31 0 122 209 42 204 172 138 203 ...
dim(DuLieu)
## [1] 2000 10
names(DuLieu)
## [1] "GioiTinh" "TinhTrangHonNhan" "NgheNghiep" "KhuVuc"
## [5] "HinhThuc" "Mang" "Jan" "Feb"
## [9] "Mar" "April"
attach(DuLieu)
Xem 5 dòng đầu tiên và 10 cột đầu của số liệu DuLieu
head(DuLieu)
## GioiTinh TinhTrangHonNhan NgheNghiep KhuVuc HinhThuc Mang Jan Feb Mar
## 1 Nam Dang co gia dinh Hanh chinh Bac Tra truoc Viettel 67 180 142
## 2 Nam Dang co gia dinh TXTN Bac Tra truoc Beeline 136 21 157
## 3 Nu Chua lap gia dinh Cong nhan Bac Tra truoc Beeline 4 0 302
## 4 Nam Da ly hon TXCN Bac Tra truoc VinaPhone 80 230 182
## 5 Nu Da ly hon TXCN Trung Tra truoc VinaPhone 196 155 234
## 6 Nu Dang co gia dinh TXTN Nam Tra truoc Beeline 251 79 80
## April
## 1 150
## 2 31
## 3 0
## 4 122
## 5 209
## 6 42
Có thể đặt lệnh để các dòng kí tự và kết quả trên không hiện ra trong báo cáo!!!
Tương tự, xem thông tin 5 dòng đầu và một số cột của DuLieu
head(DuLieu[, c(1:3, 5:7)])
## GioiTinh TinhTrangHonNhan NgheNghiep HinhThuc Mang Jan
## 1 Nam Dang co gia dinh Hanh chinh Tra truoc Viettel 67
## 2 Nam Dang co gia dinh TXTN Tra truoc Beeline 136
## 3 Nu Chua lap gia dinh Cong nhan Tra truoc Beeline 4
## 4 Nam Da ly hon TXCN Tra truoc VinaPhone 80
## 5 Nu Da ly hon TXCN Tra truoc VinaPhone 196
## 6 Nu Dang co gia dinh TXTN Tra truoc Beeline 251
t.test(Jan,conf.level=0.9)
##
## One Sample t-test
##
## data: Jan
## t = 103.98, df = 1999, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 90 percent confidence interval:
## 120.6193 124.4987
## sample estimates:
## mean of x
## 122.559
Kết luận: Khoảng ước lượng độ tin cậy 90% cho Jan là (120.6193; 124.4987)
Uoc luong khoang cho tdt trung binh thang 2
Ket qua: (157.8417; 162.8573)
Ước lượng khoảng với độ tin cậy 95% cho tỉ lệ người dùng hơn 150 nghìn trong tháng 1
sum(Jan>150)
## [1] 603
length(Jan)
## [1] 2000
prop.test(n=2000,x=603,correct=F,conf.level=0.95)
##
## 1-sample proportions test without continuity correction
##
## data: 603 out of 2000, null probability 0.5
## X-squared = 315.22, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.2817840 0.3219771
## sample estimates:
## p
## 0.3015
Kết quả: (0.2817840; 0.3219771)
t.test(Jan,mu=130,alternative = 'less')
##
## One Sample t-test
##
## data: Jan
## t = -6.313, df = 1999, p-value = 1.681e-10
## alternative hypothesis: true mean is less than 130
## 95 percent confidence interval:
## -Inf 124.4987
## sample estimates:
## mean of x
## 122.559
Kết luận: p-value = 1.681e-10
library(BSDA)
## Loading required package: lattice
##
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
##
## Orange
z.test(Jan,sigma.x=11,mu=130,alternative = 'less')
##
## One-sample z-Test
##
## data: Jan
## z = -30.252, p-value < 2.2e-16
## alternative hypothesis: true mean is less than 130
## 95 percent confidence interval:
## NA 122.9636
## sample estimates:
## mean of x
## 122.559
Kết luận: p-value < 2.2e-16
tb=mean(Jan)
tb
## [1] 122.559
dlc=sd(Jan)
dlc
## [1] 52.71223
n=length(Jan)
n
## [1] 2000
library(BSDA)
zsum.test(n.x=n,mean.x=tb,sigma.x=11,mu=130,alternative = 'less')
##
## One-sample z-Test
##
## data: Summarized x
## z = -30.252, p-value < 2.2e-16
## alternative hypothesis: true mean is less than 130
## 95 percent confidence interval:
## NA 122.9636
## sample estimates:
## mean of x
## 122.559
tsum.test(mean.x = tb, s.x=dlc, n.x=n,mu=130,alt='l' )
## Warning in tsum.test(mean.x = tb, s.x = dlc, n.x = n, mu = 130, alt = "l"):
## argument 'var.equal' ignored for one-sample test.
##
## One-sample t-Test
##
## data: Summarized x
## t = -6.313, df = 1999, p-value = 1.681e-10
## alternative hypothesis: true mean is less than 130
## 95 percent confidence interval:
## NA 124.4987
## sample estimates:
## mean of x
## 122.559
Kiem dinh xem tb thang 1 co nho hon thang 2 ko (Ho: M1-M2=0, H1: M1-M2<0)
t.test(Jan,Feb, mu=0, alt='l', paired = T)
##
## Paired t-test
##
## data: Jan and Feb
## t = -21.964, df = 1999, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -34.95905
## sample estimates:
## mean of the differences
## -37.7905
t.test(Feb~GioiTinh,alt='g',var.equal=T)
##
## Two Sample t-test
##
## data: Feb by GioiTinh
## t = 0.66796, df = 1998, p-value = 0.2521
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -2.520812 Inf
## sample estimates:
## mean in group Nam mean in group Nu
## 161.1021 159.3799
Feb.nam=Feb[GioiTinh=='Nam']
Feb.nu=Feb[GioiTinh=='Nu']
t.test(Feb.nam,Feb.nu,mu=0,alt='g',var.equal = T)
##
## Two Sample t-test
##
## data: Feb.nam and Feb.nu
## t = 0.66796, df = 1998, p-value = 0.2521
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -2.520812 Inf
## sample estimates:
## mean of x mean of y
## 161.1021 159.3799
anova(lm(Feb~KhuVuc))
## Analysis of Variance Table
##
## Response: Feb
## Df Sum Sq Mean Sq F value Pr(>F)
## KhuVuc 2 14160 7080.2 2.1674 0.1147
## Residuals 1997 6523426 3266.6
kruskal.test(Feb~KhuVuc)
##
## Kruskal-Wallis rank sum test
##
## data: Feb by KhuVuc
## Kruskal-Wallis chi-squared = 5.4038, df = 2, p-value = 0.06708
anova(lm(Feb~Mang))
## Analysis of Variance Table
##
## Response: Feb
## Df Sum Sq Mean Sq F value Pr(>F)
## Mang 6 1375351 229225 88.498 < 2.2e-16 ***
## Residuals 1993 5162235 2590
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(aov(lm(Feb~Mang)))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = lm(Feb ~ Mang))
##
## $Mang
## diff lwr upr p adj
## EVN Telecom-Beeline 54.817308 15.848480 93.78614 0.0006811
## Mobiphone-Beeline 116.852592 85.552142 148.15304 0.0000000
## S-Phone-Beeline 57.125000 16.285552 97.96445 0.0007520
## Vietnamobile-Beeline 8.149390 -30.455660 46.75444 0.9960902
## Viettel-Beeline 95.463782 64.261365 126.66620 0.0000000
## VinaPhone-Beeline 131.572833 100.327332 162.81833 0.0000000
## Mobiphone-EVN Telecom 62.035285 37.172581 86.89799 0.0000000
## S-Phone-EVN Telecom 2.307692 -33.835030 38.45041 0.9999963
## Vietnamobile-EVN Telecom -46.667917 -80.265324 -13.07051 0.0008486
## Viettel-EVN Telecom 40.646474 15.907302 65.38565 0.0000275
## VinaPhone-EVN Telecom 76.755525 51.962036 101.54902 0.0000000
## S-Phone-Mobiphone -59.727592 -87.430390 -32.02479 0.0000000
## Vietnamobile-Mobiphone -108.703202 -132.991766 -84.41464 0.0000000
## Viettel-Mobiphone -21.388811 -29.943064 -12.83456 0.0000000
## VinaPhone-Mobiphone 14.720241 6.010144 23.43034 0.0000137
## Vietnamobile-S-Phone -48.975610 -84.725809 -13.22541 0.0010728
## Viettel-S-Phone 38.338782 10.746797 65.93077 0.0008438
## VinaPhone-S-Phone 74.447833 46.807136 102.08853 0.0000000
## Viettel-Vietnamobile 87.314391 63.152294 111.47649 0.0000000
## VinaPhone-Vietnamobile 123.423443 99.205733 147.64115 0.0000000
## VinaPhone-Viettel 36.109051 27.758100 44.46000 0.0000000
var.test(Feb.nam,Feb.nu)
##
## F test to compare two variances
##
## data: Feb.nam and Feb.nu
## F = 1.0367, num df = 1125, denom df = 873, p-value = 0.5743
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.914343 1.174258
## sample estimates:
## ratio of variances
## 1.036701
bartlett.test(Feb~KhuVuc)
##
## Bartlett test of homogeneity of variances
##
## data: Feb by KhuVuc
## Bartlett's K-squared = 0.43232, df = 2, p-value = 0.8056
bartlett.test(Feb~Mang)
##
## Bartlett test of homogeneity of variances
##
## data: Feb by Mang
## Bartlett's K-squared = 7.6407, df = 6, p-value = 0.2656
table(GioiTinh,NgheNghiep)
## NgheNghiep
## GioiTinh Cong nhan DKLD dac biet Hanh chinh HSSV Ky thuat NCKH TXCN TXTN VHNT
## Nam 171 38 186 50 179 38 183 176 105
## Nu 148 17 143 35 138 21 148 140 84
chisq.test(table(GioiTinh,NgheNghiep))
##
## Pearson's Chi-squared test
##
## data: table(GioiTinh, NgheNghiep)
## X-squared = 6.6336, df = 8, p-value = 0.5766
chisq.test(table(Mang,NgheNghiep))
## Warning in chisq.test(table(Mang, NgheNghiep)): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: table(Mang, NgheNghiep)
## X-squared = 47.384, df = 48, p-value = 0.498
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
x=rnorm(1000,12,1)
jarque.bera.test(x)
##
## Jarque Bera Test
##
## data: x
## X-squared = 0.15832, df = 2, p-value = 0.9239
?jarque.bera.test
library(stats)
x=rnorm(10000,12,1)
ks.test(x, "pnorm", 12, 1)
##
## One-sample Kolmogorov-Smirnov test
##
## data: x
## D = 0.0070148, p-value = 0.7088
## alternative hypothesis: two-sided
ks.test(x,"pexp",12)
##
## One-sample Kolmogorov-Smirnov test
##
## data: x
## D = 1, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(x, "pnorm", 1, 1)
##
## One-sample Kolmogorov-Smirnov test
##
## data: x
## D = 1, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(x, "pnorm",mean(x),sd(x))
##
## One-sample Kolmogorov-Smirnov test
##
## data: x
## D = 0.0060458, p-value = 0.8582
## alternative hypothesis: two-sided
jarque.bera.test(x)
##
## Jarque Bera Test
##
## data: x
## X-squared = 3.0498, df = 2, p-value = 0.2176
#shapiro.test(x)
?ks.test
#Goi fBasics
library(fBasics)
## Loading required package: timeDate
## Loading required package: timeSeries
?ksnormTest
x=rnorm(10000,12,1)
ks.test(x, "pnorm",mean(x),sd(x))
##
## One-sample Kolmogorov-Smirnov test
##
## data: x
## D = 0.0053112, p-value = 0.9405
## alternative hypothesis: two-sided
ksnormTest(x)
##
## Title:
## One-sample Kolmogorov-Smirnov test
##
## Test Results:
## STATISTIC:
## D: 1
## P VALUE:
## Alternative Two-Sided: < 2.2e-16
## Alternative Less: < 2.2e-16
## Alternative Greater: 1
##
## Description:
## Wed Sep 15 11:56:18 2021 by user:
x=rpois(1000,12)
ks.test(x, "ppois", 12)
## Warning in ks.test(x, "ppois", 12): ties should not be present for the
## Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: x
## D = 0.1096, p-value = 7.377e-11
## alternative hypothesis: two-sided
x=rnorm(1000,12,1)
y=rnorm(1000,12,1)
ks.test(x, y)
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.031, p-value = 0.7226
## alternative hypothesis: two-sided
#so sanh 3 loai kiem dinh khi ket qua
x=runif(1000,min=10,max=30)
jarque.bera.test(x)
##
## Jarque Bera Test
##
## data: x
## X-squared = 56.076, df = 2, p-value = 6.657e-13
ks.test(x,'pnorm',mean(x),sd(x))
##
## One-sample Kolmogorov-Smirnov test
##
## data: x
## D = 0.06127, p-value = 0.001097
## alternative hypothesis: two-sided
shapiro.test(x)
##
## Shapiro-Wilk normality test
##
## data: x
## W = 0.95811, p-value = 2.578e-16
x=rnorm(1000,12,1)
y=rnorm(1000,12,1)
par(mfrow=c(2,2))
qqplot(x,y)
x=runif(1000,12,111)
y=rnorm(1000,12,1)
qqplot(x,y)
x=rnorm(1000,112,1)
y=rnorm(1000,112,1)
qqplot(x,y)
x=rnorm(1000,112,50)
qqnorm((x-mean(x))/sd(x))
ks.test(x,y)
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.487, p-value < 2.2e-16
## alternative hypothesis: two-sided
y=rnorm(1000,1,1)
qqplot(x,y)
ks.test(x,y)
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and y
## D = 0.991, p-value < 2.2e-16
## alternative hypothesis: two-sided
qqplot
## function (x, y, plot.it = TRUE, xlab = deparse1(substitute(x)),
## ylab = deparse1(substitute(y)), ...)
## {
## sx <- sort(x)
## sy <- sort(y)
## lenx <- length(sx)
## leny <- length(sy)
## if (leny < lenx)
## sx <- approx(1L:lenx, sx, n = leny)$y
## if (leny > lenx)
## sy <- approx(1L:leny, sy, n = lenx)$y
## if (plot.it)
## plot(sx, sy, xlab = xlab, ylab = ylab, ...)
## invisible(list(x = sx, y = sy))
## }
## <bytecode: 0x7fa733bd6cd0>
## <environment: namespace:stats>
par(mfrow=c(1,3))
qqnorm
## function (y, ...)
## UseMethod("qqnorm")
## <bytecode: 0x7fa72f13c6b0>
## <environment: namespace:stats>
x=rnorm(1000,0,1)
qqnorm(x)
x=runif(1000,12,111)
qqnorm(x,main = 'phan phoi deu')
x=rpois(1000,11)
qqnorm(x,main = 'Phan phoi Poisson')
x=rnorm(1000,12,2)
qqnorm(x,main = 'phan phoi chuan')