Loading libraries
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(readxl))
suppressPackageStartupMessages(library(labelled))
suppressPackageStartupMessages(library(table1))
suppressPackageStartupMessages(library(gapminder))
suppressPackageStartupMessages(library(gapminder))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggthemes))
suppressPackageStartupMessages(library(gridExtra))
Reading data
# Obesity data
ob = read.csv("~/Dropbox/_Conferences and Workshops/TDTU 2018/Datasets/obesity data.csv")
head(ob)
## id gender height weight bmi age WBBMC wbbmd fat lean pcfat
## 1 1 F 150 49 21.8 53 1312 0.88 17802 28600 37.3
## 2 2 M 165 52 19.1 65 1309 0.84 8381 40229 16.8
## 3 3 F 157 57 23.1 64 1230 0.84 19221 36057 34.0
## 4 4 F 156 53 21.8 56 1171 0.80 17472 33094 33.8
## 5 5 M 160 51 19.9 54 1681 0.98 7336 40621 14.8
## 6 6 F 153 47 20.1 52 1358 0.91 14904 30068 32.2
# Example dat
ex = read.csv("~/Dropbox/_Books and websites/Statistics.vn data for book (Data Analysis with R)/Datasets/ExampleData.csv", header=T)
head(ex)
## id e2b1 g1b1 i11 pcs1 mcs1 cesd1 indtot1 drugrisk1 sexrisk1 pcrec1
## 1 1 NA 0 NA 54.22583 52.23480 7 5 0 1 1
## 2 2 NA 0 8 59.56066 41.72696 11 12 0 0 0
## 3 3 NA 0 NA 58.45777 56.77131 14 99 13 4 0
## 4 4 NA 0 NA 46.60988 14.65925 44 97 0 4 0
## 5 5 NA 0 64 31.41642 40.67421 26 55 0 4 0
## 6 6 NA 0 1 43.20495 50.05917 23 12 0 4 0
## e2b2 g1b2 i12 pcs2 mcs2 cesd2 indtot2 drugrisk2 sexrisk2 pcrec2 e2b3
## 1 NA NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA NA
## 4 1 NA NA 57.56092 23.91316 NA NA NA NA 0 NA
## 5 NA 0 NA 44.83340 42.44469 27 34 0 2 0 1
## 6 NA NA NA NA NA NA NA NA NA NA NA
## g1b3 i13 pcs3 mcs3 cesd3 indtot3 drugrisk3 sexrisk3 pcrec3 e2b4 g1b4
## 1 0 NA 52.06106 56.06010 8 0 0 1 2 NA 0
## 2 NA NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA 2 0
## 4 NA NA NA NA NA NA NA NA NA NA 0
## 5 0 13 25.02625 50.93477 15 37 0 4 0 NA 0
## 6 NA NA NA NA NA NA NA NA NA NA NA
## i14 pcs4 mcs4 cesd4 indtot4 drugrisk4 sexrisk4 pcrec4 a15a a15b d1
## 1 8 52.27281 58.04143 5 34 0 3 2 0 0 3
## 2 NA NA NA NA NA NA NA NA 2 3 22
## 3 3 66.24387 13.55065 49 94 19 4 0 0 0 0
## 4 NA 57.05444 53.45058 20 5 0 4 0 0 0 2
## 5 6 44.65918 32.72392 28 58 0 2 0 15 0 12
## 6 NA NA NA NA NA NA NA NA 0 0 1
## e2b f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j f1k f1l f1m f1n f1o f1p f1q f1r
## 1 NA 3 2 3 0 2 3 3 0 2 3 3 0 1 2 2 2 2 3
## 2 NA 3 2 0 3 3 2 0 0 3 0 3 0 0 3 0 0 0 2
## 3 NA 3 2 3 0 2 2 1 3 2 3 1 0 1 3 2 0 0 3
## 4 1 0 0 1 3 2 2 1 3 0 0 1 2 2 2 0 NA 2 0
## 5 1 3 0 3 3 3 3 1 3 3 2 3 2 2 3 0 3 3 3
## 6 NA 1 0 1 3 0 0 0 3 0 1 1 3 1 0 1 3 0 0
## f1s f1t g1b i1 i2 age treat homeless pcs mcs cesd indtot pss_fr
## 1 3 2 1 13 26 37 1 0 58.41369 25.111990 49 39 0
## 2 0 0 1 56 62 37 1 1 36.03694 26.670307 30 43 1
## 3 2 0 0 0 0 26 0 0 74.80633 6.762923 39 41 13
## 4 0 1 0 5 5 39 0 0 61.93168 43.967880 15 28 11
## 5 3 3 0 10 13 32 0 1 37.34558 21.675755 39 38 10
## 6 0 0 0 4 4 47 1 0 46.47521 55.508991 6 29 5
## drugrisk sexrisk satreat drinkstatus daysdrink anysubstatus daysanysub
## 1 0 4 0 1 177 1 177
## 2 0 7 0 1 2 1 2
## 3 20 2 0 1 3 1 3
## 4 0 4 1 0 196 1 189
## 5 0 6 0 1 2 1 2
## 6 0 5 0 1 31 1 31
## linkstatus dayslink female substance racegrp
## 1 1 225 0 cocaine black
## 2 NA NA 0 alcohol white
## 3 0 365 0 heroin black
## 4 0 343 1 heroin white
## 5 1 57 0 cocaine black
## 6 0 365 1 cocaine black
# Example data
dat = read.table("~/Dropbox/_Conferences and Workshops/TDTU 2018/Datasets/adult.data.txt", sep = ',', fill = F, strip.white = T)
head(dat)
## V1 V2 V3 V4 V5 V6 V7
## 1 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical
## 2 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial
## 3 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners
## 4 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners
## 5 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty
## 6 37 Private 284582 Masters 14 Married-civ-spouse Exec-managerial
## V8 V9 V10 V11 V12 V13 V14 V15
## 1 Not-in-family White Male 2174 0 40 United-States <=50K
## 2 Husband White Male 0 0 13 United-States <=50K
## 3 Not-in-family White Male 0 0 40 United-States <=50K
## 4 Husband Black Male 0 0 40 United-States <=50K
## 5 Wife Black Female 0 0 40 Cuba <=50K
## 6 Wife White Female 0 0 40 United-States <=50K
Histogram
p = ggplot(data=ob, aes(x=pcfat))
p + geom_histogram(aes(y=..density..), color="white", fill="blue") + geom_density(col="red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p1 = ggplot(data=ob, aes(x=pcfat, fill=gender)) + geom_histogram(position="dodge")
p2 = ggplot(data=ob, aes(x=pcfat, fill=gender, color=gender)) + geom_density(alpha = 0.1)
grid.arrange(p1, p2, nrow=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Boxplot
ggplot(data=ob, aes(x=gender, y=pcfat)) + geom_boxplot(aes(fill=gender, alpha = 0.5)) + geom_jitter(aes(color = gender), alpha=0.2) + theme(legend.position="none") + labs(x="Gender", y="Percent body fat")

Barplot from summary data
Country = c("Vietnam", "Thailand", "Malaysia", "Indonesia", "Singapore")
Pubs = c(17178, 42914, 71229, 19585, 69883)
HiCi = c(171, 338, 415, 127, 1467)
df = data.frame(Country, Pubs, HiCi)
p = ggplot(data=df, aes(x=Country, y=Pubs, fill=Country, label=Pubs))
p + geom_bar(stat="identity") + geom_text(size=3, color="white", position=position_stack(vjust=0.5)) + theme_economist() + theme(legend.position="none")

# With order
p = ggplot(data=df, aes(x=reorder(Country, Pubs), y=Pubs, fill=Country, label=Pubs)) + geom_bar(stat="identity")
p + geom_text(size=3, color="white", position=position_stack(vjust=0.5)) + theme_economist() + theme(legend.position="none")

Barplot from raw data
colnames(dat) = c("Age", "Workclass", "fnlwgt", "Education", "Education_years", "Marital_status", "Occupation", "Relationship", "Race", "Sex", "Capital.gain","Capital.loss", "Hours.per.week", "Native.country", "Income")
p = ggplot(data=dat, aes(Workclass, fill=Workclass))
p = p + geom_bar() + theme(legend.position="none")
p + theme(axis.title.x = element_text(face="bold", colour="Black", size=15), axis.text.x = element_text(angle=45, vjust=0.5, size=15))

temp = dat %>% dplyr::count(Workclass) %>% mutate(prop=prop.table(n))
p = ggplot(data=temp, aes(x = reorder(Workclass, n), y=n, fill=Workclass))
p = p + geom_bar(stat="identity") + theme(legend.position="none")
p + xlab("Work class")

# With two variables
p = ggplot(data=dat, aes(x=Education, fill=Income))
p + geom_bar() + theme(axis.text.x = element_text(angle=45, vjust=0.5, size=15))

# Percentage
cc1 = dplyr::count(dat, Education, Income)
cc2 = left_join(cc1, dplyr::count(cc1, Education, wt = n))
## Joining, by = c("Education", "n")
temp = dat %>% dplyr::group_by(Education, Income) %>% dplyr::summarise(count=dplyr::n()) %>% dplyr::mutate(perc=count/sum(count))
p = ggplot(data=temp, aes(x=Education, y=perc*100, fill=Income)) + geom_bar(stat="identity")
p + theme(axis.text.x = element_text(angle=90, size=12))

# Barplot and percent
os = read.csv("~/Dropbox/_Conferences and Workshops/Da Nang 2017/Datasets/osteo-data.csv", na.string="NA", header=T)
head(os)
## sex age weight height fx hipfx fnbmd smoking
## 1 Male 73 98 175 0 0 1.08 1
## 2 Female 68 72 166 0 0 0.97 0
## 3 Male 68 87 184 0 0 1.01 0
## 4 Female 62 72 173 0 0 0.84 1
## 5 Male 61 72 173 0 0 0.81 1
## 6 Female 76 57 156 0 0 0.74 0
p = ggplot(os %>% dplyr::count(fx, sex) %>% dplyr::mutate(pct=n/sum(n)), aes(factor(fx), n, fill=sex))
p = p + geom_bar(stat="identity")
p = p + geom_text(aes(label=paste0(sprintf("%1.1f", pct*100),"%")), position=position_stack(vjust=0.5))
p + xlab("Fracture") + ylab("Percent")

Barplot of means
means = aggregate(dat$Education_years, by=list(dat$Occupation), FUN=mean)
colnames(means) = c("Occupation", "Mean")
p = ggplot(data=means, aes(x=reorder(Occupation, Mean), y=Mean, fill=Occupation, label=Mean))
p + geom_bar(stat="identity", width=1, color="white", position=position_dodge()) + theme(legend.position="none") + xlab("Occupation") + ylab("Years") + coord_flip()

Scatter plot
p = ggplot(ob, aes(x=bmi, y=pcfat, fill=gender))
p + geom_point(aes(col=gender)) + geom_smooth(method="lm", formula=y~x+I(x^2)) + theme_bw() + xlab("Body mass index") + ylab("Percent body fat") + ggtitle("Distribution of PBF") + theme(legend.position="bottom")

Spaghetti plot
t1 = read.csv("~/Dropbox/Temp Files/Matthys Scheltema/Data I.csv", header=T, na.strings="")
t1$Urinary=(t1$Urinary.Summary-mean(t1$Urinary.Summary, na.rm=T))/sd(t1$Urinary.Summary, na.rm=T)
t1$Bowel = (t1$Bowel.Summary-mean(t1$Bowel.Summary, na.rm=T))/sd(t1$Bowel.Summary, na.rm=T)
t1$Sexual = (t1$Sexual.Summary-mean(t1$Sexual.Summary, na.rm=T))/sd(t1$Sexual.Summary, na.rm=T)
t1$SF12.PCS = (t1$SF12.PCS-mean(t1$SF12.PCS, na.rm=T))/sd(t1$SF12.PCS, na.rm=T)
t1$SF12.MCS = (t1$SF12.MCS-mean(t1$SF12.MCS, na.rm=T))/sd(t1$SF12.MCS, na.rm=T)
t1$AUA = (t1$AUA-mean(t1$AUA, na.rm=T))/sd(t1$AUA, na.rm=T)
t1$id = as.factor(t1$ID)
t1$Treat[t1$Group==0] <- "IRE"
t1$Treat[t1$Group==1] <- "RARP"
t1$Treat = as.factor(t1$Treat)
head(t1)
## ID Group Time Urinary.Summary Bowel.Summary Sexual.Summary SF12.PCS
## 1 2105 0 1 91.66667 96.42857 45.83333 0.4586150
## 2 2105 0 2 80.58333 91.07143 43.61538 0.4586150
## 3 2105 0 3 85.41667 25.00000 15.38462 0.4586150
## 4 2105 0 4 87.50000 89.28571 41.69231 0.5142755
## 5 13268 0 1 91.66667 100.00000 92.30769 0.4586150
## 6 13268 0 2 81.81818 100.00000 57.69231 0.5142755
## SF12.MCS AUA Urinary Bowel Sexual id Treat
## 1 0.4982233 0.6234905 0.48417023 0.3382291 -0.09250541 2105 IRE
## 2 0.4982233 0.9867277 -0.29292156 -0.2776478 -0.17641298 2105 IRE
## 3 0.4982233 0.4418720 0.04596058 -7.8734640 -1.24441569 2105 IRE
## 4 0.2941898 1.7132019 0.19203046 -0.4829402 -0.24916521 2105 IRE
## 5 0.4982233 -1.0110765 0.48417023 0.7488138 1.66567344 13268 IRE
## 6 0.2941898 -0.6478394 -0.20634195 0.7488138 0.35613333 13268 IRE
p = ggplot(data=t1, aes(x=Time, y=Urinary.Summary, group=ID, col=Treat))
p + geom_line() + facet_grid(. ~Treat) + stat_smooth(aes(group = 1))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 row(s) containing missing values (geom_path).

Labels and axes
p = ggplot(data=ob, aes(x=bmi, y=pcfat, col=gender, fill=gender))
p = p + geom_point() + xlab("Body mass index") + ylab("Percent body fat") + theme(axis.text.x=element_text(face="bold", size=14, color="red", angle=45), axis.text.y=element_text(face="bold", size=12))
p + theme(axis.title.x = element_text(face="bold", colour="#990000", size=15), axis.title.y = element_text(face="bold", colour="blue", size=15)) + ylim(0, 50) + xlim(5, 40) + theme(legend.position="top")
