Loading packages

# Load packages
library(gapminder)

## Warning: package 'gapminder' was built under R version 3.6.2

library(ggplot2)
library(ggthemes)

## Warning: package 'ggthemes' was built under R version 3.6.2

library(gridExtra)

## Warning: package 'gridExtra' was built under R version 3.6.2

library(table1)

## Warning: package 'table1' was built under R version 3.6.2

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

library(compareGroups)

## Warning: package 'compareGroups' was built under R version 3.6.2

## Loading required package: SNPassoc

## Loading required package: haplo.stats

## Loading required package: survival

## Loading required package: mvtnorm

## Loading required package: parallel

## Registered S3 method overwritten by 'SNPassoc':
##   method            from       
##   summary.haplo.glm haplo.stats

Reading data PISA

t="D:\\R\\TAI LIEU TAP HUAN R\\Dataset thuc hanh\\PISA Data Vietnam 2015.csv"
pisa=read.csv(t)
head(pisa)

##     School SchoolSize ClassSize STratio SchoolType  Area Region   Age Gender
## 1 70400001        883        18  22.075          3 URBAN  SOUTH 15.58   Boys
## 2 70400001        883        18  22.075          3 URBAN  SOUTH 15.92   Boys
## 3 70400001        883        18  22.075          3 URBAN  SOUTH 15.42  Girls
## 4 70400001        883        18  22.075          3 URBAN  SOUTH 15.58  Girls
## 5 70400001        883        18  22.075          3 URBAN  SOUTH 15.92  Girls
## 6 70400001        883        18  22.075          3 URBAN  SOUTH 16.25  Girls
##   PARED HISCED  WEALTH INSTSCIE JOYSCIE  ICTRES    Math    Read Science
## 1     9      2 -2.0697   0.9798  2.1635 -1.5244 439.923 412.290 475.612
## 2    12      4 -1.7903   1.7359  2.1635 -1.9305 406.251 409.598 450.320
## 3     9      2 -2.1942  -0.2063 -0.1808 -1.6093 414.369 384.307 405.787
## 4     5      1 -2.0301  -0.3115 -0.4318 -1.6250 468.801 459.104 462.968
## 5     9      2 -1.0522   0.7648  1.3031 -0.5305 355.432 402.435 453.736
## 6     5      1 -3.0570   0.3708  0.5094 -2.5873 458.955 483.885 529.866

dim(pisa)

## [1] 5826   18

Descriptive analysis with table1 and compareGroups

table1(~Region+Area+SchoolType+Math+Read+Science|Area,data=pisa)

	REMOTE (n=410)	RURAL (n=2368)	URBAN (n=3048)	Overall (n=5826)
Region
CENTRAL	198 (48.3%)	857 (36.2%)	951 (31.2%)	2006 (34.4%)
NORTH	148 (36.1%)	764 (32.3%)	1046 (34.3%)	1958 (33.6%)
SOUTH	64 (15.6%)	747 (31.5%)	1051 (34.5%)	1862 (32.0%)
Area
REMOTE	410 (100%)	0 (0%)	0 (0%)	410 (7.0%)
RURAL	0 (0%)	2368 (100%)	0 (0%)	2368 (40.6%)
URBAN	0 (0%)	0 (0%)	3048 (100%)	3048 (52.3%)
SchoolType
Mean (SD)	3.00 (0.00)	2.89 (0.464)	2.80 (0.600)	2.85 (0.528)
Median [Min, Max]	3.00 [3.00, 3.00]	3.00 [1.00, 3.00]	3.00 [1.00, 3.00]	3.00 [1.00, 3.00]
Missing	0 (0%)	0 (0%)	35 (1.1%)	35 (0.6%)
Math
Mean (SD)	450 (82.0)	500 (81.9)	499 (79.3)	496 (81.5)
Median [Min, Max]	446 [216, 696]	498 [273, 818]	497 [202, 820]	493 [202, 820]
Read
Mean (SD)	440 (76.0)	491 (67.6)	496 (69.6)	490 (70.6)
Median [Min, Max]	439 [233, 643]	490 [292, 744]	495 [107, 718]	489 [107, 744]
Science
Mean (SD)	482 (74.4)	529 (75.5)	527 (72.8)	525 (75.0)
Median [Min, Max]	475 [307, 698]	529 [335, 807]	525 [293, 799]	524 [293, 807]

temp=compareGroups(Area~PARED+WEALTH+Math+Read+Science,data=pisa)
createTable(temp)

## 
## --------Summary descriptives table by 'Area'---------
## 
## ________________________________________________________ 
##            REMOTE       RURAL        URBAN     p.overall 
##            N=410        N=2368       N=3048              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## PARED   7.90 (3.69)  9.38 (3.47)  9.56 (3.48)   <0.001   
## WEALTH  -3.00 (1.25) -2.22 (1.08) -2.12 (1.16)  <0.001   
## Math     450 (82.0)   500 (81.9)   499 (79.3)   <0.001   
## Read     440 (76.0)   491 (67.6)   496 (69.6)   <0.001   
## Science  482 (74.4)   529 (75.5)   527 (72.8)   <0.001   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Histogram

# Simple histogram of Math using ggplot2
p1=ggplot(data=pisa,aes(x=Math))
p1=p1+geom_histogram(fill="blue",color="white")
p1+labs(x="Math Score",y="Number of Student")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Histogram

# Simple histogram of Math with probability and density line
p2=ggplot(data=pisa,aes(x=Math))
p2=p2+geom_histogram(aes(y=..density..),color="white",fill="blue")
p2=p2+geom_density(col="red")
p2+labs(x="Math Score",y="Probability")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of Math by Gender
p3=ggplot(data=pisa,aes(x=Math,fill=Gender))
p3=p3+geom_histogram(position="dodge")
p3+labs(x="Math Score",y="Number of Students")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p4=ggplot(data=pisa,aes(x=Math,fill=Area,color=Area))
p4=p4+geom_density(alpha=0.1)
p4+labs(x="Math Score",y="Density")

grid.arrange(p1,p2,p3,p4,ncol=2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Barplot

# Number of Students by Area
p=ggplot(data=pisa,aes(x=Area,fill=Area,col=Area))
p+geom_bar(position="dodge")

# Number of Students by Area and Gender - dodge
p=ggplot(data=pisa,aes(x=Area,fill=Gender,col=Gender))
p+geom_bar(position="dodge")

# Stack bar
p=ggplot(data=pisa,aes(x=Area,fill=Gender,col=Gender))
p+geom_bar(position="stack")

# Stack bar by Region, Area and Gender
p=ggplot(data=pisa,aes(x=Area,fill=Gender,col=Gender))
p+geom_bar(position="stack")+facet_grid(~Region)

# Boxplot

# Simple boxplot for Math of Gender
p=ggplot(data=pisa,aes(x=Gender,y=Math,fill=Gender))
p+geom_boxplot()

# Simple boxplot for Math by Area and Gender
p=ggplot(data=pisa,aes(x=Gender,y=Math,fill=Gender))
p+geom_boxplot()+facet_grid(~Area)

# Simple boxplot for Math by Area and Gender with jitter
p=ggplot(data=pisa,aes(x=Gender,y=Math,fill=Gender,col=Gender))
p=p+geom_boxplot(col="black")+geom_jitter(alpha=0.1)
p+facet_grid(~Area)

Scatter plot

# Simple correlation between WEALTH and Science
p=ggplot(data=pisa,aes(x=WEALTH,y=Science))
p+geom_point()

## Warning: Removed 15 rows containing missing values (geom_point).

# Simple correlation between WEALTH and Science by Area
p=ggplot(data=pisa,aes(x=WEALTH,y=Science,col=Area))
p+geom_point()

## Warning: Removed 15 rows containing missing values (geom_point).

# Simple correlation between WEALTH and Science by Area and smooth
p=ggplot(data=pisa,aes(x=WEALTH,y=Science,col=Area))
p+geom_point()+geom_smooth(method="lm",se=F)+theme_economist()

## Warning: Removed 15 rows containing non-finite values (stat_smooth).

## Warning: Removed 15 rows containing missing values (geom_point).

Multivariable correlation

# Correlation between Math, Science, Read by Area
library(GGally)

## Warning: package 'GGally' was built under R version 3.6.2

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

dat=pisa[,c("Area","Math","Science","Read")]
ggpairs(data=dat,mapping=aes(color=Area))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Day 2 R

Duc Chien Ngo

12/24/2019

Loading packages

Reading data PISA

Descriptive analysis with table1 and compareGroups

Histogram

Histogram

Barplot

Scatter plot

Multivariable correlation