Install some new packages

# Install only the packages that are not already present in the library, so
# re-running the script does not download and reinstall everything each time.
# Wrapped in local() so the helper variables do not leak into the workspace.
local({
  needed <- c("table1", "compareGroups", "GGally", "gridExtra", "gapminder",
              "ggthemes", "ggplot2")
  absent <- setdiff(needed, rownames(installed.packages()))
  if (length(absent) > 0) {
    install.packages(absent)
  }
})

Load packages

library(SNPassoc)

## Loading required package: haplo.stats

## Loading required package: survival

## Loading required package: mvtnorm

## Loading required package: parallel

## Registered S3 method overwritten by 'SNPassoc':
##   method            from       
##   summary.haplo.glm haplo.stats

library(haplo.stats)
library(survival)
library(mvtnorm)
library(parallel)
library(gapminder)
library(ggplot2)
library(ggthemes)
library(gridExtra)
library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

library(compareGroups)

Read PISA data

# Absolute Windows path to the PISA Vietnam 2015 CSV file on the author's machine.
t <- "C:\\Users\\Thuan Nguyen\\OneDrive\\R in BUH\\Dataset thuc hanh\\PISA Data Vietnam 2015.csv"
# Load the CSV into a data frame, then print its dimensions (rows, columns).
pisa <- read.csv(file = t)
dim(pisa)

## [1] 5826   18

Descriptive analysis with table1 and compareGroups

# Descriptive table of the listed variables, stratified by Area.
# NOTE(review): the rendered table reports SchoolType as mean/SD, i.e. it is
# being summarised as a numeric column - confirm that this is intended.
table1(
  ~ Region + Area + SchoolType + Math + Read + Science | Area,
  data = pisa
)

	REMOTE (n=410)	RURAL (n=2368)	URBAN (n=3048)	Overall (n=5826)
Region
CENTRAL	198 (48.3%)	857 (36.2%)	951 (31.2%)	2006 (34.4%)
NORTH	148 (36.1%)	764 (32.3%)	1046 (34.3%)	1958 (33.6%)
SOUTH	64 (15.6%)	747 (31.5%)	1051 (34.5%)	1862 (32.0%)
Area
REMOTE	410 (100%)	0 (0%)	0 (0%)	410 (7.0%)
RURAL	0 (0%)	2368 (100%)	0 (0%)	2368 (40.6%)
URBAN	0 (0%)	0 (0%)	3048 (100%)	3048 (52.3%)
SchoolType
Mean (SD)	3.00 (0.00)	2.89 (0.464)	2.80 (0.600)	2.85 (0.528)
Median [Min, Max]	3.00 [3.00, 3.00]	3.00 [1.00, 3.00]	3.00 [1.00, 3.00]	3.00 [1.00, 3.00]
Missing	0 (0%)	0 (0%)	35 (1.1%)	35 (0.6%)
Math
Mean (SD)	450 (82.0)	500 (81.9)	499 (79.3)	496 (81.5)
Median [Min, Max]	446 [216, 696]	498 [273, 818]	497 [202, 820]	493 [202, 820]
Read
Mean (SD)	440 (76.0)	491 (67.6)	496 (69.6)	490 (70.6)
Median [Min, Max]	439 [233, 643]	490 [292, 744]	495 [107, 718]	489 [107, 744]
Science
Mean (SD)	482 (74.4)	529 (75.5)	527 (72.8)	525 (75.0)
Median [Min, Max]	475 [307, 698]	529 [335, 807]	525 [293, 799]	524 [293, 807]

# Compare PARED, WEALTH and the three test scores across the Area groups,
# then print the resulting summary table.
temp <- compareGroups(
  Area ~ PARED + WEALTH + Math + Read + Science,
  data = pisa
)
createTable(temp)

## 
## --------Summary descriptives table by 'Area'---------
## 
## ________________________________________________________ 
##            REMOTE       RURAL        URBAN     p.overall 
##            N=410        N=2368       N=3048              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## PARED   7.90 (3.69)  9.38 (3.47)  9.56 (3.48)   <0.001   
## WEALTH  -3.00 (1.25) -2.22 (1.08) -2.12 (1.16)  <0.001   
## Math     450 (82.0)   500 (81.9)   499 (79.3)   <0.001   
## Read     440 (76.0)   491 (67.6)   496 (69.6)   <0.001   
## Science  482 (74.4)   529 (75.5)   527 (72.8)   <0.001   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Histogram

# Simple histogram of Math using ggplot2.
# The axis labels are stored in p1 (not only added while printing) so that the
# later grid.arrange(p1, ...) call draws the labelled version as well.
p1 <- ggplot(data = pisa, aes(x = Math)) +
  geom_histogram(fill = "blue", color = "white")
p1 <- p1 + labs(x = "Math Score", y = "Number of students")
p1

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of Math drawn on the density scale, with a density curve on top.
# NOTE(review): recent ggplot2 releases deprecate the `..density..` notation in
# favour of after_stat(density); kept as-is here so older installs still work.
p2 <- ggplot(pisa, aes(x = Math)) +
  geom_histogram(aes(y = ..density..), color = "white", fill = "blue") +
  geom_density(col = "red")
# Store the labels in p2 so the later grid.arrange(..., p2, ...) call shows
# them too, instead of only the standalone print.
p2 <- p2 + labs(x = "Math score", y = "Probability")
p2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of Math with one set of side-by-side bars per Gender.
p3 <- ggplot(pisa, aes(x = Math, fill = Gender)) +
  geom_histogram(position = "dodge")
# Store the labels in p3 so the later grid.arrange(..., p3, ...) call shows
# them too, instead of only the standalone print.
p3 <- p3 + labs(x = "Math Score", y = "Number of students")
p3

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density curves of Math, one per Area (fill and outline both mapped to Area).
p4 <- ggplot(pisa, aes(x = Math, fill = Area, color = Area)) +
  geom_density(alpha = 0.1)
# Store the labels in p4 so the combined figure below shows them as well.
p4 <- p4 + labs(x = "Math Score", y = "Density")
p4

# Arrange the four Math distribution plots in a two-column grid.
grid.arrange(p1, p2, p3, p4, ncol = 2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Barplot

# p5: number of students in each Area.
p5 <- ggplot(pisa, aes(x = Area, fill = Area, colour = Area)) +
  geom_bar(position = "dodge")
p5

# p6: number of students by Area, with Gender bars placed side by side.
p6 <- ggplot(pisa, aes(x = Area, fill = Gender, colour = Gender)) +
  geom_bar(position = "dodge")
p6

# p7: same counts as p6, with Gender stacked inside each Area bar.
p7 <- ggplot(pisa, aes(x = Area, fill = Gender, colour = Gender)) +
  geom_bar(position = "stack")
p7

# p8: the stacked chart split into one panel per Region.
p8 <- p7 + facet_grid(~Region)
p8

# Arrange the four bar charts in a two-column grid.
grid.arrange(p5, p6, p7, p8, ncol = 2)

Boxplot

# p9: box plot of Math for each Gender.
p9 <- ggplot(pisa, aes(x = Gender, y = Math, fill = Gender)) +
  geom_boxplot()
p9

# p10: the same box plot split into one panel per Area.
p10 <- p9 + facet_grid(~Area)
p10

# p11: box plots by Gender and Area with the individual points jittered on
# top; the box outline is fixed to black while the points keep the Gender colour.
p11 <- ggplot(pisa, aes(x = Gender, y = Math, fill = Gender, colour = Gender)) +
  geom_boxplot(colour = "black") +
  geom_jitter(alpha = 0.1) +
  facet_grid(~Area)
p11

# Show the three box plots next to each other.
grid.arrange(p9, p10, p11, ncol = 3)

Scatter plot

# Scatter plot of Science against WEALTH.
p <- ggplot(data = pisa, mapping = aes(x = WEALTH, y = Science))
p + geom_point()

## Warning: Removed 15 rows containing missing values (geom_point).

# Scatter plot of Science against WEALTH, with points coloured by Area.
p <- ggplot(data = pisa, mapping = aes(x = WEALTH, y = Science, colour = Area))
p + geom_point()

## Warning: Removed 15 rows containing missing values (geom_point).

# Scatter plot of Science against WEALTH coloured by Area, with one fitted
# linear-model line per Area and no confidence band.
p <- ggplot(pisa, aes(x = WEALTH, y = Science, col = Area))
# Use FALSE rather than F: F is an ordinary variable that can be reassigned.
p + geom_point() + geom_smooth(method = "lm", se = FALSE)

## Warning: Removed 15 rows containing non-finite values (stat_smooth).

## Warning: Removed 15 rows containing missing values (geom_point).

Multivariable correlation by Area

#Correlation between Math, Science, Read by Area
library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

# Keep Area plus the three test scores, then draw the pairwise plot matrix
# with every panel coloured by Area.
dat <- pisa[c("Area", "Math", "Science", "Read")]
p <- ggpairs(dat, mapping = aes(colour = Area))
p

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Multivariable correlation by Gender

#Correlation between Math, Science, Read by Gender
library(GGally)
# Keep Gender plus the three test scores, then draw the pairwise plot matrix
# with every panel coloured by Gender.
dat <- pisa[c("Gender", "Math", "Science", "Read")]
p <- ggpairs(dat, mapping = aes(colour = Gender))
p

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Data visualization with ggplot2

Thuan Nguyen

12/24/2019

Install some new packages

Load packages

Read PISA data

Descriptive analysis with table1 and compareGroups

Histogram

Barplot

Boxplot

Scatter plot

Multivariable correlation by Area

Multivariable correlation by Gender