#Loading Packages

#load gapminder to get the dataset 'gapminder'
library(gapminder)
library(ggplot2)
library(ggthemes)
library(gridExtra)
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(compareGroups)
## Loading required package: SNPassoc
## Loading required package: haplo.stats
## Loading required package: survival
## Loading required package: mvtnorm
## Loading required package: parallel
## Registered S3 method overwritten by 'SNPassoc':
##   method            from       
##   summary.haplo.glm haplo.stats

Reading the PISA data

# Absolute path to the PISA 2015 Vietnam CSV extract.
# NOTE(review): `t` shares its name with base R's transpose function t();
# the name is kept in case other code refers to it.
t <- "C:\\Users\\trieu\\OneDrive\\HOC TAP VA TAP HUAN\\THONG KE R 122019\\Dataset thuc hanh\\PISA Data Vietnam 2015.csv"

# Load the file into a data frame and preview its first six rows.
pisa <- read.csv(file = t)
head(pisa, n = 6L)
##     School SchoolSize ClassSize STratio SchoolType  Area Region   Age Gender
## 1 70400001        883        18  22.075          3 URBAN  SOUTH 15.58   Boys
## 2 70400001        883        18  22.075          3 URBAN  SOUTH 15.92   Boys
## 3 70400001        883        18  22.075          3 URBAN  SOUTH 15.42  Girls
## 4 70400001        883        18  22.075          3 URBAN  SOUTH 15.58  Girls
## 5 70400001        883        18  22.075          3 URBAN  SOUTH 15.92  Girls
## 6 70400001        883        18  22.075          3 URBAN  SOUTH 16.25  Girls
##   PARED HISCED  WEALTH INSTSCIE JOYSCIE  ICTRES    Math    Read Science
## 1     9      2 -2.0697   0.9798  2.1635 -1.5244 439.923 412.290 475.612
## 2    12      4 -1.7903   1.7359  2.1635 -1.9305 406.251 409.598 450.320
## 3     9      2 -2.1942  -0.2063 -0.1808 -1.6093 414.369 384.307 405.787
## 4     5      1 -2.0301  -0.3115 -0.4318 -1.6250 468.801 459.104 462.968
## 5     9      2 -1.0522   0.7648  1.3031 -0.5305 355.432 402.435 453.736
## 6     5      1 -3.0570   0.3708  0.5094 -2.5873 458.955 483.885 529.866
# Report the number of rows and columns of the loaded data set.
dim(pisa)
## [1] 5826   18

Descriptive Analysis with table1 and compareGroups

# Descriptive table of region, area, school type and the three test scores,
# with one column per Area plus an overall column.
# NOTE(review): SchoolType is summarised as a numeric variable (mean/SD) in
# the rendered table; if it is a category code, factor(SchoolType) would give
# counts instead - confirm. Area is also both a row variable and the
# stratifying variable, so its rows can only show 100% / 0%.
table1(
  ~ Region + Area + SchoolType + Math + Read + Science | Area,
  data = pisa
)
REMOTE
(n=410)
RURAL
(n=2368)
URBAN
(n=3048)
Overall
(n=5826)
Region
CENTRAL 198 (48.3%) 857 (36.2%) 951 (31.2%) 2006 (34.4%)
NORTH 148 (36.1%) 764 (32.3%) 1046 (34.3%) 1958 (33.6%)
SOUTH 64 (15.6%) 747 (31.5%) 1051 (34.5%) 1862 (32.0%)
Area
REMOTE 410 (100%) 0 (0%) 0 (0%) 410 (7.0%)
RURAL 0 (0%) 2368 (100%) 0 (0%) 2368 (40.6%)
URBAN 0 (0%) 0 (0%) 3048 (100%) 3048 (52.3%)
SchoolType
Mean (SD) 3.00 (0.00) 2.89 (0.464) 2.80 (0.600) 2.85 (0.528)
Median [Min, Max] 3.00 [3.00, 3.00] 3.00 [1.00, 3.00] 3.00 [1.00, 3.00] 3.00 [1.00, 3.00]
Missing 0 (0%) 0 (0%) 35 (1.1%) 35 (0.6%)
Math
Mean (SD) 450 (82.0) 500 (81.9) 499 (79.3) 496 (81.5)
Median [Min, Max] 446 [216, 696] 498 [273, 818] 497 [202, 820] 493 [202, 820]
Read
Mean (SD) 440 (76.0) 491 (67.6) 496 (69.6) 490 (70.6)
Median [Min, Max] 439 [233, 643] 490 [292, 744] 495 [107, 718] 489 [107, 744]
Science
Mean (SD) 482 (74.4) 529 (75.5) 527 (72.8) 525 (75.0)
Median [Min, Max] 475 [307, 698] 529 [335, 807] 525 [293, 799] 524 [293, 807]
# Compare PARED, WEALTH and the three test scores across the Area groups,
# then print the resulting summary table (includes an overall p-value).
temp <- compareGroups(
  Area ~ PARED + WEALTH + Math + Read + Science,
  data = pisa
)
createTable(temp)
## 
## --------Summary descriptives table by 'Area'---------
## 
## ________________________________________________________ 
##            REMOTE       RURAL        URBAN     p.overall 
##            N=410        N=2368       N=3048              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## PARED   7.90 (3.69)  9.38 (3.47)  9.56 (3.48)   <0.001   
## WEALTH  -3.00 (1.25) -2.22 (1.08) -2.12 (1.16)  <0.001   
## Math     450 (82.0)   500 (81.9)   499 (79.3)   <0.001   
## Read     440 (76.0)   491 (67.6)   496 (69.6)   <0.001   
## Science  482 (74.4)   529 (75.5)   527 (72.8)   <0.001   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Histogram

# Simple histogram of Math scores using ggplot2.
# The axis labels are stored in p1 itself (not only added when printing) so
# that the combined grid.arrange() figure further down shows them as well.
p1 <- ggplot(data = pisa, aes(x = Math)) +
  geom_histogram(fill = "blue", color = "white") +
  labs(x = "Math Score", y = "Number of Students")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of Math scores on the density scale with a kernel density curve.
# `..density..` is the older spelling of after_stat(density); it is kept so
# the code still runs on ggplot2 versions that predate after_stat().
# The axis labels are stored in p2 itself so that the combined
# grid.arrange() figure further down shows them as well.
p2 <- ggplot(data = pisa, aes(x = Math)) +
  geom_histogram(aes(y = ..density..), fill = "blue", color = "white") +
  geom_density(col = "red") +
  labs(x = "Math Score", y = "Probability")
p2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of Math scores split by Gender, bars placed side by side.
# The axis labels are stored in p3 itself so that the combined
# grid.arrange() figure further down shows them as well.
p3 <- ggplot(data = pisa, aes(x = Math, fill = Gender)) +
  geom_histogram(position = "dodge") +
  labs(x = "Math Score", y = "Number of Students")
p3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Overlaid density curves of Math scores, one per Area (lightly filled).
# The axis labels are stored in p4 itself so that the combined
# grid.arrange() figure further down shows them as well.
p4 <- ggplot(data = pisa, aes(x = Math, fill = Area, color = Area)) +
  geom_density(alpha = 0.1) +
  labs(x = "Math Score", y = "Density")
p4

# Show the four plots together in a two-column layout.
grid.arrange(grobs = list(p1, p2, p3, p4), ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Barplot

# Bar chart: number of students in each Area, one colour per Area.
p <- ggplot(pisa, aes(x = Area, fill = Area, colour = Area))
p + geom_bar(position = "dodge")

# Bar chart: number of students by Area, Gender bars placed side by side.
p <- ggplot(pisa, aes(x = Area, fill = Gender, colour = Gender))
p + geom_bar(position = "dodge")

# Stacked bar chart: number of students by Area, Gender stacked in each bar.
p <- ggplot(pisa, aes(x = Area, fill = Gender, colour = Gender))
p + geom_bar(position = "stack")

# Stacked bar chart by Area and Gender, one panel (column) per Region.
p <- ggplot(pisa, aes(x = Area, fill = Gender, colour = Gender))
p + geom_bar(position = "stack") + facet_grid(. ~ Region)

#Boxplot

# Box plot of Math scores for each Gender.
p <- ggplot(pisa, aes(x = Gender, y = Math, fill = Gender))
p + geom_boxplot()

# Box plot of Math scores for each Gender, one panel (column) per Area.
p <- ggplot(pisa, aes(x = Gender, y = Math, fill = Gender))
p + geom_boxplot() + facet_grid(. ~ Area)

# Box plot of Math scores by Gender with the individual observations overlaid
# as faint jittered points; box outlines are drawn in black so they stay
# visible against the coloured points. One panel (column) per Area.
p <- ggplot(pisa, aes(x = Gender, y = Math, fill = Gender, colour = Gender)) +
  geom_boxplot(colour = "black") +
  geom_jitter(alpha = 0.1)
p + facet_grid(. ~ Area)

#Scatter Plot

# Scatter plot of Science score against WEALTH.
p <- ggplot(pisa, aes(x = WEALTH, y = Science))
p + geom_point()
## Warning: Removed 15 rows containing missing values (geom_point).

# Scatter plot of Science score against WEALTH, points coloured by Area.
p <- ggplot(pisa, aes(x = WEALTH, y = Science, colour = Area))
p + geom_point()
## Warning: Removed 15 rows containing missing values (geom_point).

# Scatter plot of Science score against WEALTH, coloured by Area, with a
# separate linear-regression line per Area and no confidence band.
# `FALSE` is spelled out because the shorthand `F` is an ordinary variable
# that can be reassigned.
p <- ggplot(data = pisa, aes(x = WEALTH, y = Science, col = Area))
p + geom_point() + geom_smooth(method = "lm", se = FALSE)
## Warning: Removed 15 rows containing non-finite values (stat_smooth).

## Warning: Removed 15 rows containing missing values (geom_point).

#Multivariable Correlation

#Correlation between Math, Science, Read by Area
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Keep only the grouping column and the three score columns, then draw the
# pairwise plot matrix with every panel coloured by Area.
dat <- pisa[c("Area", "Math", "Science", "Read")]
ggpairs(data = dat, mapping = aes(colour = Area))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.