# Load packages
library(gapminder)
## Warning: package 'gapminder' was built under R version 3.6.2
library(ggplot2)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.6.2
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.6.2
library(table1)
## Warning: package 'table1' was built under R version 3.6.2
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
library(compareGroups)
## Warning: package 'compareGroups' was built under R version 3.6.2
## Loading required package: SNPassoc
## Loading required package: haplo.stats
## Loading required package: survival
## Loading required package: mvtnorm
## Loading required package: parallel
## Registered S3 method overwritten by 'SNPassoc':
## method from
## summary.haplo.glm haplo.stats
t="D:\\R\\TAI LIEU TAP HUAN R\\Dataset thuc hanh\\PISA Data Vietnam 2015.csv"
pisa=read.csv(t)
head(pisa)
## School SchoolSize ClassSize STratio SchoolType Area Region Age Gender
## 1 70400001 883 18 22.075 3 URBAN SOUTH 15.58 Boys
## 2 70400001 883 18 22.075 3 URBAN SOUTH 15.92 Boys
## 3 70400001 883 18 22.075 3 URBAN SOUTH 15.42 Girls
## 4 70400001 883 18 22.075 3 URBAN SOUTH 15.58 Girls
## 5 70400001 883 18 22.075 3 URBAN SOUTH 15.92 Girls
## 6 70400001 883 18 22.075 3 URBAN SOUTH 16.25 Girls
## PARED HISCED WEALTH INSTSCIE JOYSCIE ICTRES Math Read Science
## 1 9 2 -2.0697 0.9798 2.1635 -1.5244 439.923 412.290 475.612
## 2 12 4 -1.7903 1.7359 2.1635 -1.9305 406.251 409.598 450.320
## 3 9 2 -2.1942 -0.2063 -0.1808 -1.6093 414.369 384.307 405.787
## 4 5 1 -2.0301 -0.3115 -0.4318 -1.6250 468.801 459.104 462.968
## 5 9 2 -1.0522 0.7648 1.3031 -0.5305 355.432 402.435 453.736
## 6 5 1 -3.0570 0.3708 0.5094 -2.5873 458.955 483.885 529.866
dim(pisa)
## [1] 5826 18
table1(~Region+Area+SchoolType+Math+Read+Science|Area,data=pisa)
| REMOTE (n=410) |
RURAL (n=2368) |
URBAN (n=3048) |
Overall (n=5826) |
|
|---|---|---|---|---|
| Region | ||||
| CENTRAL | 198 (48.3%) | 857 (36.2%) | 951 (31.2%) | 2006 (34.4%) |
| NORTH | 148 (36.1%) | 764 (32.3%) | 1046 (34.3%) | 1958 (33.6%) |
| SOUTH | 64 (15.6%) | 747 (31.5%) | 1051 (34.5%) | 1862 (32.0%) |
| Area | ||||
| REMOTE | 410 (100%) | 0 (0%) | 0 (0%) | 410 (7.0%) |
| RURAL | 0 (0%) | 2368 (100%) | 0 (0%) | 2368 (40.6%) |
| URBAN | 0 (0%) | 0 (0%) | 3048 (100%) | 3048 (52.3%) |
| SchoolType | ||||
| Mean (SD) | 3.00 (0.00) | 2.89 (0.464) | 2.80 (0.600) | 2.85 (0.528) |
| Median [Min, Max] | 3.00 [3.00, 3.00] | 3.00 [1.00, 3.00] | 3.00 [1.00, 3.00] | 3.00 [1.00, 3.00] |
| Missing | 0 (0%) | 0 (0%) | 35 (1.1%) | 35 (0.6%) |
| Math | ||||
| Mean (SD) | 450 (82.0) | 500 (81.9) | 499 (79.3) | 496 (81.5) |
| Median [Min, Max] | 446 [216, 696] | 498 [273, 818] | 497 [202, 820] | 493 [202, 820] |
| Read | ||||
| Mean (SD) | 440 (76.0) | 491 (67.6) | 496 (69.6) | 490 (70.6) |
| Median [Min, Max] | 439 [233, 643] | 490 [292, 744] | 495 [107, 718] | 489 [107, 744] |
| Science | ||||
| Mean (SD) | 482 (74.4) | 529 (75.5) | 527 (72.8) | 525 (75.0) |
| Median [Min, Max] | 475 [307, 698] | 529 [335, 807] | 525 [293, 799] | 524 [293, 807] |
temp=compareGroups(Area~PARED+WEALTH+Math+Read+Science,data=pisa)
createTable(temp)
##
## --------Summary descriptives table by 'Area'---------
##
## ________________________________________________________
## REMOTE RURAL URBAN p.overall
## N=410 N=2368 N=3048
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## PARED 7.90 (3.69) 9.38 (3.47) 9.56 (3.48) <0.001
## WEALTH -3.00 (1.25) -2.22 (1.08) -2.12 (1.16) <0.001
## Math 450 (82.0) 500 (81.9) 499 (79.3) <0.001
## Read 440 (76.0) 491 (67.6) 496 (69.6) <0.001
## Science 482 (74.4) 529 (75.5) 527 (72.8) <0.001
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Simple histogram of Math using ggplot2
p1=ggplot(data=pisa,aes(x=Math))
p1=p1+geom_histogram(fill="blue",color="white")
p1+labs(x="Math Score",y="Number of Student")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Simple histogram of Math with probability and density line
p2=ggplot(data=pisa,aes(x=Math))
p2=p2+geom_histogram(aes(y=..density..),color="white",fill="blue")
p2=p2+geom_density(col="red")
p2+labs(x="Math Score",y="Probability")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of Math by Gender
p3=ggplot(data=pisa,aes(x=Math,fill=Gender))
p3=p3+geom_histogram(position="dodge")
p3+labs(x="Math Score",y="Number of Students")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p4=ggplot(data=pisa,aes(x=Math,fill=Area,color=Area))
p4=p4+geom_density(alpha=0.1)
p4+labs(x="Math Score",y="Density")
grid.arrange(p1,p2,p3,p4,ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Number of Students by Area
p=ggplot(data=pisa,aes(x=Area,fill=Area,col=Area))
p+geom_bar(position="dodge")
# Number of Students by Area and Gender - dodge
p=ggplot(data=pisa,aes(x=Area,fill=Gender,col=Gender))
p+geom_bar(position="dodge")
# Stack bar
p=ggplot(data=pisa,aes(x=Area,fill=Gender,col=Gender))
p+geom_bar(position="stack")
# Stack bar by Region, Area and Gender
p=ggplot(data=pisa,aes(x=Area,fill=Gender,col=Gender))
p+geom_bar(position="stack")+facet_grid(~Region)
# Boxplot
# Simple boxplot for Math of Gender
p=ggplot(data=pisa,aes(x=Gender,y=Math,fill=Gender))
p+geom_boxplot()
# Simple boxplot for Math by Area and Gender
p=ggplot(data=pisa,aes(x=Gender,y=Math,fill=Gender))
p+geom_boxplot()+facet_grid(~Area)
# Simple boxplot for Math by Area and Gender with jitter
p=ggplot(data=pisa,aes(x=Gender,y=Math,fill=Gender,col=Gender))
p=p+geom_boxplot(col="black")+geom_jitter(alpha=0.1)
p+facet_grid(~Area)
# Simple correlation between WEALTH and Science
p=ggplot(data=pisa,aes(x=WEALTH,y=Science))
p+geom_point()
## Warning: Removed 15 rows containing missing values (geom_point).
# Simple correlation between WEALTH and Science by Area
p=ggplot(data=pisa,aes(x=WEALTH,y=Science,col=Area))
p+geom_point()
## Warning: Removed 15 rows containing missing values (geom_point).
# Simple correlation between WEALTH and Science by Area and smooth
p=ggplot(data=pisa,aes(x=WEALTH,y=Science,col=Area))
p+geom_point()+geom_smooth(method="lm",se=F)+theme_economist()
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
# Correlation between Math, Science, Read by Area
library(GGally)
## Warning: package 'GGally' was built under R version 3.6.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
dat=pisa[,c("Area","Math","Science","Read")]
ggpairs(data=dat,mapping=aes(color=Area))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.