Loading libraries

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(magrittr))

suppressPackageStartupMessages(library(readxl))
suppressPackageStartupMessages(library(labelled))
suppressPackageStartupMessages(library(table1))
suppressPackageStartupMessages(library(gapminder))

suppressPackageStartupMessages(library(gapminder))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggthemes))
suppressPackageStartupMessages(library(gridExtra))

Reading data

# Obesity data
ob = read.csv("~/Dropbox/_Conferences and Workshops/TDTU 2018/Datasets/obesity data.csv")
head(ob)

##   id gender height weight  bmi age WBBMC wbbmd   fat  lean pcfat
## 1  1      F    150     49 21.8  53  1312  0.88 17802 28600  37.3
## 2  2      M    165     52 19.1  65  1309  0.84  8381 40229  16.8
## 3  3      F    157     57 23.1  64  1230  0.84 19221 36057  34.0
## 4  4      F    156     53 21.8  56  1171  0.80 17472 33094  33.8
## 5  5      M    160     51 19.9  54  1681  0.98  7336 40621  14.8
## 6  6      F    153     47 20.1  52  1358  0.91 14904 30068  32.2

# Example dat
ex = read.csv("~/Dropbox/_Books and websites/Statistics.vn data for book (Data Analysis with R)/Datasets/ExampleData.csv", header=T)
head(ex)

##   id e2b1 g1b1 i11     pcs1     mcs1 cesd1 indtot1 drugrisk1 sexrisk1 pcrec1
## 1  1   NA    0  NA 54.22583 52.23480     7       5         0        1      1
## 2  2   NA    0   8 59.56066 41.72696    11      12         0        0      0
## 3  3   NA    0  NA 58.45777 56.77131    14      99        13        4      0
## 4  4   NA    0  NA 46.60988 14.65925    44      97         0        4      0
## 5  5   NA    0  64 31.41642 40.67421    26      55         0        4      0
## 6  6   NA    0   1 43.20495 50.05917    23      12         0        4      0
##   e2b2 g1b2 i12     pcs2     mcs2 cesd2 indtot2 drugrisk2 sexrisk2 pcrec2 e2b3
## 1   NA   NA  NA       NA       NA    NA      NA        NA       NA     NA   NA
## 2   NA   NA  NA       NA       NA    NA      NA        NA       NA     NA   NA
## 3   NA   NA  NA       NA       NA    NA      NA        NA       NA     NA   NA
## 4    1   NA  NA 57.56092 23.91316    NA      NA        NA       NA      0   NA
## 5   NA    0  NA 44.83340 42.44469    27      34         0        2      0    1
## 6   NA   NA  NA       NA       NA    NA      NA        NA       NA     NA   NA
##   g1b3 i13     pcs3     mcs3 cesd3 indtot3 drugrisk3 sexrisk3 pcrec3 e2b4 g1b4
## 1    0  NA 52.06106 56.06010     8       0         0        1      2   NA    0
## 2   NA  NA       NA       NA    NA      NA        NA       NA     NA   NA   NA
## 3   NA  NA       NA       NA    NA      NA        NA       NA     NA    2    0
## 4   NA  NA       NA       NA    NA      NA        NA       NA     NA   NA    0
## 5    0  13 25.02625 50.93477    15      37         0        4      0   NA    0
## 6   NA  NA       NA       NA    NA      NA        NA       NA     NA   NA   NA
##   i14     pcs4     mcs4 cesd4 indtot4 drugrisk4 sexrisk4 pcrec4 a15a a15b d1
## 1   8 52.27281 58.04143     5      34         0        3      2    0    0  3
## 2  NA       NA       NA    NA      NA        NA       NA     NA    2    3 22
## 3   3 66.24387 13.55065    49      94        19        4      0    0    0  0
## 4  NA 57.05444 53.45058    20       5         0        4      0    0    0  2
## 5   6 44.65918 32.72392    28      58         0        2      0   15    0 12
## 6  NA       NA       NA    NA      NA        NA       NA     NA    0    0  1
##   e2b f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j f1k f1l f1m f1n f1o f1p f1q f1r
## 1  NA   3   2   3   0   2   3   3   0   2   3   3   0   1   2   2   2   2   3
## 2  NA   3   2   0   3   3   2   0   0   3   0   3   0   0   3   0   0   0   2
## 3  NA   3   2   3   0   2   2   1   3   2   3   1   0   1   3   2   0   0   3
## 4   1   0   0   1   3   2   2   1   3   0   0   1   2   2   2   0  NA   2   0
## 5   1   3   0   3   3   3   3   1   3   3   2   3   2   2   3   0   3   3   3
## 6  NA   1   0   1   3   0   0   0   3   0   1   1   3   1   0   1   3   0   0
##   f1s f1t g1b i1 i2 age treat homeless      pcs       mcs cesd indtot pss_fr
## 1   3   2   1 13 26  37     1        0 58.41369 25.111990   49     39      0
## 2   0   0   1 56 62  37     1        1 36.03694 26.670307   30     43      1
## 3   2   0   0  0  0  26     0        0 74.80633  6.762923   39     41     13
## 4   0   1   0  5  5  39     0        0 61.93168 43.967880   15     28     11
## 5   3   3   0 10 13  32     0        1 37.34558 21.675755   39     38     10
## 6   0   0   0  4  4  47     1        0 46.47521 55.508991    6     29      5
##   drugrisk sexrisk satreat drinkstatus daysdrink anysubstatus daysanysub
## 1        0       4       0           1       177            1        177
## 2        0       7       0           1         2            1          2
## 3       20       2       0           1         3            1          3
## 4        0       4       1           0       196            1        189
## 5        0       6       0           1         2            1          2
## 6        0       5       0           1        31            1         31
##   linkstatus dayslink female substance racegrp
## 1          1      225      0   cocaine   black
## 2         NA       NA      0   alcohol   white
## 3          0      365      0    heroin   black
## 4          0      343      1    heroin   white
## 5          1       57      0   cocaine   black
## 6          0      365      1   cocaine   black

# Example data 
dat = read.table("~/Dropbox/_Conferences and Workshops/TDTU 2018/Datasets/adult.data.txt", sep = ',', fill = F, strip.white = T)
head(dat)

##   V1               V2     V3        V4 V5                 V6                V7
## 1 39        State-gov  77516 Bachelors 13      Never-married      Adm-clerical
## 2 50 Self-emp-not-inc  83311 Bachelors 13 Married-civ-spouse   Exec-managerial
## 3 38          Private 215646   HS-grad  9           Divorced Handlers-cleaners
## 4 53          Private 234721      11th  7 Married-civ-spouse Handlers-cleaners
## 5 28          Private 338409 Bachelors 13 Married-civ-spouse    Prof-specialty
## 6 37          Private 284582   Masters 14 Married-civ-spouse   Exec-managerial
##              V8    V9    V10  V11 V12 V13           V14   V15
## 1 Not-in-family White   Male 2174   0  40 United-States <=50K
## 2       Husband White   Male    0   0  13 United-States <=50K
## 3 Not-in-family White   Male    0   0  40 United-States <=50K
## 4       Husband Black   Male    0   0  40 United-States <=50K
## 5          Wife Black Female    0   0  40          Cuba <=50K
## 6          Wife White Female    0   0  40 United-States <=50K

Histogram

p = ggplot(data=ob, aes(x=pcfat)) 
p + geom_histogram(aes(y=..density..), color="white", fill="blue") + geom_density(col="red")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p1 = ggplot(data=ob, aes(x=pcfat, fill=gender)) + geom_histogram(position="dodge") 

p2 = ggplot(data=ob, aes(x=pcfat, fill=gender, color=gender)) + geom_density(alpha = 0.1)

grid.arrange(p1, p2, nrow=2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Boxplot

ggplot(data=ob, aes(x=gender, y=pcfat)) + geom_boxplot(aes(fill=gender, alpha = 0.5)) + geom_jitter(aes(color = gender), alpha=0.2) + theme(legend.position="none") + labs(x="Gender", y="Percent body fat")

Barplot from summary data

Country = c("Vietnam", "Thailand", "Malaysia", "Indonesia", "Singapore")
Pubs = c(17178, 42914, 71229, 19585, 69883)
HiCi = c(171, 338, 415, 127, 1467)
df = data.frame(Country, Pubs, HiCi)

p = ggplot(data=df, aes(x=Country, y=Pubs, fill=Country, label=Pubs))
p + geom_bar(stat="identity") + geom_text(size=3, color="white", position=position_stack(vjust=0.5)) + theme_economist() + theme(legend.position="none")

# With order 
p = ggplot(data=df, aes(x=reorder(Country, Pubs), y=Pubs, fill=Country, label=Pubs)) + geom_bar(stat="identity") 
p + geom_text(size=3, color="white", position=position_stack(vjust=0.5)) + theme_economist() + theme(legend.position="none")

Barplot from raw data

colnames(dat) = c("Age", "Workclass", "fnlwgt", "Education", "Education_years", "Marital_status", "Occupation", "Relationship", "Race", "Sex", "Capital.gain","Capital.loss", "Hours.per.week", "Native.country", "Income")

p = ggplot(data=dat, aes(Workclass, fill=Workclass))
p = p + geom_bar() + theme(legend.position="none")
p + theme(axis.title.x = element_text(face="bold", colour="Black", size=15), axis.text.x  = element_text(angle=45, vjust=0.5, size=15))

temp = dat %>% dplyr::count(Workclass) %>% mutate(prop=prop.table(n))
p = ggplot(data=temp, aes(x = reorder(Workclass, n), y=n, fill=Workclass))
p = p + geom_bar(stat="identity") + theme(legend.position="none")
p + xlab("Work class")

# With two variables
p = ggplot(data=dat, aes(x=Education, fill=Income))
p + geom_bar() + theme(axis.text.x = element_text(angle=45, vjust=0.5, size=15))

# Percentage 
cc1 = dplyr::count(dat, Education, Income)
cc2 = left_join(cc1, dplyr::count(cc1, Education, wt = n))

## Joining, by = c("Education", "n")

temp = dat %>% dplyr::group_by(Education, Income) %>% dplyr::summarise(count=dplyr::n()) %>% dplyr::mutate(perc=count/sum(count))

p = ggplot(data=temp, aes(x=Education, y=perc*100, fill=Income)) + geom_bar(stat="identity")
p + theme(axis.text.x = element_text(angle=90, size=12))

# Barplot and percent 
os = read.csv("~/Dropbox/_Conferences and Workshops/Da Nang 2017/Datasets/osteo-data.csv", na.string="NA", header=T)
head(os)

##      sex age weight height fx hipfx fnbmd smoking
## 1   Male  73     98    175  0     0  1.08       1
## 2 Female  68     72    166  0     0  0.97       0
## 3   Male  68     87    184  0     0  1.01       0
## 4 Female  62     72    173  0     0  0.84       1
## 5   Male  61     72    173  0     0  0.81       1
## 6 Female  76     57    156  0     0  0.74       0

p = ggplot(os %>% dplyr::count(fx, sex) %>% dplyr::mutate(pct=n/sum(n)), aes(factor(fx), n, fill=sex))
p = p + geom_bar(stat="identity")
p = p + geom_text(aes(label=paste0(sprintf("%1.1f", pct*100),"%")), position=position_stack(vjust=0.5))
p + xlab("Fracture") + ylab("Percent")

Barplot of means

means = aggregate(dat$Education_years, by=list(dat$Occupation), FUN=mean) 

colnames(means) = c("Occupation", "Mean")
 
p = ggplot(data=means, aes(x=reorder(Occupation, Mean), y=Mean, fill=Occupation, label=Mean))
p + geom_bar(stat="identity", width=1, color="white", position=position_dodge()) + theme(legend.position="none") + xlab("Occupation") + ylab("Years") + coord_flip()

Scatter plot

p = ggplot(ob, aes(x=bmi, y=pcfat, fill=gender)) 

p + geom_point(aes(col=gender)) + geom_smooth(method="lm", formula=y~x+I(x^2)) + theme_bw() +  xlab("Body mass index") + ylab("Percent body fat") + ggtitle("Distribution of PBF") + theme(legend.position="bottom")

Spaghetti plot

t1 = read.csv("~/Dropbox/Temp Files/Matthys Scheltema/Data I.csv", header=T, na.strings="")
t1$Urinary=(t1$Urinary.Summary-mean(t1$Urinary.Summary, na.rm=T))/sd(t1$Urinary.Summary, na.rm=T)
t1$Bowel = (t1$Bowel.Summary-mean(t1$Bowel.Summary, na.rm=T))/sd(t1$Bowel.Summary, na.rm=T)
t1$Sexual = (t1$Sexual.Summary-mean(t1$Sexual.Summary, na.rm=T))/sd(t1$Sexual.Summary, na.rm=T)
t1$SF12.PCS = (t1$SF12.PCS-mean(t1$SF12.PCS, na.rm=T))/sd(t1$SF12.PCS, na.rm=T)
t1$SF12.MCS = (t1$SF12.MCS-mean(t1$SF12.MCS, na.rm=T))/sd(t1$SF12.MCS, na.rm=T)
t1$AUA = (t1$AUA-mean(t1$AUA, na.rm=T))/sd(t1$AUA, na.rm=T)
t1$id = as.factor(t1$ID)
t1$Treat[t1$Group==0] <- "IRE"
t1$Treat[t1$Group==1] <- "RARP"
t1$Treat = as.factor(t1$Treat)
head(t1)

##      ID Group Time Urinary.Summary Bowel.Summary Sexual.Summary  SF12.PCS
## 1  2105     0    1        91.66667      96.42857       45.83333 0.4586150
## 2  2105     0    2        80.58333      91.07143       43.61538 0.4586150
## 3  2105     0    3        85.41667      25.00000       15.38462 0.4586150
## 4  2105     0    4        87.50000      89.28571       41.69231 0.5142755
## 5 13268     0    1        91.66667     100.00000       92.30769 0.4586150
## 6 13268     0    2        81.81818     100.00000       57.69231 0.5142755
##    SF12.MCS        AUA     Urinary      Bowel      Sexual    id Treat
## 1 0.4982233  0.6234905  0.48417023  0.3382291 -0.09250541  2105   IRE
## 2 0.4982233  0.9867277 -0.29292156 -0.2776478 -0.17641298  2105   IRE
## 3 0.4982233  0.4418720  0.04596058 -7.8734640 -1.24441569  2105   IRE
## 4 0.2941898  1.7132019  0.19203046 -0.4829402 -0.24916521  2105   IRE
## 5 0.4982233 -1.0110765  0.48417023  0.7488138  1.66567344 13268   IRE
## 6 0.2941898 -0.6478394 -0.20634195  0.7488138  0.35613333 13268   IRE

p = ggplot(data=t1, aes(x=Time, y=Urinary.Summary, group=ID, col=Treat))
p + geom_line() + facet_grid(. ~Treat) + stat_smooth(aes(group = 1))

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Warning: Removed 4 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 row(s) containing missing values (geom_path).

Labels and axes

p = ggplot(data=ob, aes(x=bmi, y=pcfat, col=gender, fill=gender))

p = p + geom_point() + xlab("Body mass index") + ylab("Percent body fat") + theme(axis.text.x=element_text(face="bold", size=14, color="red", angle=45), axis.text.y=element_text(face="bold", size=12))

p + theme(axis.title.x = element_text(face="bold", colour="#990000", size=15), axis.title.y = element_text(face="bold", colour="blue", size=15)) + ylim(0, 50) + xlim(5, 40) + theme(legend.position="top")

Common graphs using R

Loading libraries

Reading data

Histogram

Boxplot

Barplot from summary data

Barplot from raw data

Barplot of means

Scatter plot

Spaghetti plot

Labels and axes