Base R approach

fL <- "http://www.amstat.org/publications/jse/datasets/sat.dat.txt"
dta <- read.table(fL, row.names=1)
names(dta) <- c("Spending", "PTR", "Salary", "PE", "Verbal", "Math", "SAT")
str(dta)

## 'data.frame':    50 obs. of  7 variables:
##  $ Spending: num  4.41 8.96 4.78 4.46 4.99 ...
##  $ PTR     : num  17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
##  $ Salary  : num  31.1 48 32.2 28.9 41.1 ...
##  $ PE      : int  8 47 27 6 45 29 81 68 48 65 ...
##  $ Verbal  : int  491 445 448 482 417 462 431 429 420 406 ...
##  $ Math    : int  538 489 496 523 485 518 477 468 469 448 ...
##  $ SAT     : int  1029 934 944 1005 902 980 908 897 889 854 ...

dta

with(plot)

with(dta, plot(Salary, SAT, type='n', main='SAT and Teacher\'s Salary')) 
#type='n' no plotting.

# add regression line
abline(lm(SAT ~ Salary, data=dta))
# label states
with(dta, text(Salary, SAT, labels=row.names(dta), cex=.5))

## row.names

row.names(dta[which(nchar(row.names(dta)) <= 5),])

## [1] "Idaho" "Iowa"  "Maine" "Ohio"  "Texas" "Utah"

row.names(dta[nchar(row.names(dta)) <= 5,]) #equal

## [1] "Idaho" "Iowa"  "Maine" "Ohio"  "Texas" "Utah"

#Augment data with the built-in state.region{datasets}
dta$Region <- state.region
with(dta, table(Region))

## Region
##     Northeast         South North Central          West 
##             9            16            12            13

name abbreviation

dta$SID <- state.abb #name abbreviation
subset(dta, Region == 'Northeast')["SID"]

split

split(dta, "Region")$Region[,7]

##  [1] 1029  934  944 1005  902  980  908  897  889  854  889  979 1048  882
## [15] 1099 1060  999 1021  896  909  907 1033 1085 1036 1045 1009 1050  917
## [29]  935  898 1015  892  865 1107  975 1027  947  880  888  844 1068 1040
## [43]  893 1076  901  896  937  932 1073 1001

# = dta[,7]?

unstack

unstack(dta[c("SAT", "Region")])

## $Northeast
## [1] 908 896 907 935 898 892 880 888 901
## 
## $South
##  [1] 1029 1005  897  889  854  999 1021  909 1036  865 1027  844 1040  893
## [15]  896  932
## 
## $`North Central`
##  [1] 1048  882 1099 1060 1033 1085 1045 1050 1107  975 1068 1073
## 
## $West
##  [1]  934  944  902  980  889  979 1009  917 1015  947 1076  937 1001

with(boxplot)

with(dta, boxplot(SAT ~ Region, horizontal=F, col="bisque", varwidth=T))

# varwidth 根據sample size比例盒子大小

aggregate

mss <- aggregate(cbind(SAT, Salary) ~ Region, data=dta, FUN=mean)
mss

mss[order(mss["SAT"]),]

xyplot

library(lattice)
xyplot(SAT ~ Salary, groups=Region, data=dta, type=c("g","p","r"), auto.key=list(columns=4)) # type g grid r regression

# auto.key=list(columns=4) label columns排列

lapply

lapply(split(dta, dta$Region), function(x) coef(lm(x$SAT ~ x$Salary)))

## $Northeast
## (Intercept)    x$Salary 
## 933.3285652  -0.7931598 
## 
## $South
## (Intercept)    x$Salary 
## 1224.689015   -8.758555 
## 
## $`North Central`
## (Intercept)    x$Salary 
## 1212.192020   -4.939625 
## 
## $West
## (Intercept)    x$Salary 
## 1188.215929   -6.463652

# lapply 將前面的東西一個一個餵給後面的function

reshape > longFormat

dtaL <- reshape(dta, varying=list(5:6), idvar="SID", v.names="Score", timevar="Test",times=c("Verbal", "Math"), direction="long")
# idvar.timevar的times 最前面
set.seed(1215)
subset(dtaL, SID %in% sample(state.abb, 3))

xyplot

xyplot(Score ~ Salary | Region, groups=Test, data=dtaL, type=c("p", "g", "r"),auto.key=list(columns=2))

## aggregate

mspp <- aggregate(cbind(Spending, PTR, PE) ~ Region, data=dta, FUN=mean)
merge(mspp, mss, by='Region')[c(2,3,4,1),]

quantile

quantile(dta$PE, probs=seq(from=0, to=1, by=.1))

##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##  4.0  5.9  9.0 10.7 14.2 28.0 47.4 58.0 65.6 70.0 81.0

cut > numToFactor

dta$PEf <- with(dta, cut(PE, ordered=T, breaks=c(0, 11, 51, 100), 
                         labels=c("Low", "Middle", "High")))
# ordered=T => Levels: Low < Middle < High
with(dta, table(PEf))

## PEf
##    Low Middle   High 
##     18     15     17

mss_p <- aggregate(cbind(SAT,Salary) ~ PEf, data = dta, FUN=mean)
mss_p

Plot!

with(dta,plot(SAT~Salary,type='n'))#col=as.numeric(PEf)+1,
abline(lm(SAT~Salary,data=dta),col='purple')
lapply(split(dta,dta$PEf), function(x) abline(lm(SAT~Salary,data=x),col=as.numeric(x$PEf)+1,lty=2))

## $Low
## NULL
## 
## $Middle
## NULL
## 
## $High
## NULL

with(dta,text(SAT~Salary,labels=row.names(dta),col=as.numeric(PEf)+1,cex=.5))
with(mss_p,points(SAT~Salary,col=as.numeric(PEf)+1,pch=20))
with(mss_p,abline(lm(SAT~Salary),lty=3,col='purple'))

## boxplot & arrows & points

rb <- boxplot(SAT ~ PEf, data = dta, col = 'bisque', xlab="Percent eligible")
# tapply 根據第二個的類別做第一個的funciton
mn.t <- with(dta, tapply(SAT, PEf, mean))
se.t <- with(dta, tapply(SAT, PEf, sd))/sqrt(rb$n)
xi <- 0.3 + seq(rb$n)
arrows(xi, mn.t-1.96*se.t, xi, mn.t+1.96*se.t, code=3, col="orange", angle=90, length=.05, lwd=2)
points(xi, mn.t, col="darkorange", pch=16)

## xyplot

xyplot(SAT ~ Salary | PEf, data=dta, type = c("r","g","p"), auto.key = list(columns = 4))

Tidy approach

data(Hsb82, package="mlmRev")
knitr::kable(head(Hsb82))

school	minrty	sx	ses	mAch	meanses	sector	cses
1224	No	Female	-1.528	5.876	-0.434383	Public	-1.093617
1224	No	Female	-0.588	19.708	-0.434383	Public	-0.153617
1224	No	Male	-0.528	20.349	-0.434383	Public	-0.093617
1224	No	Male	-0.668	8.781	-0.434383	Public	-0.233617
1224	No	Male	-0.158	17.898	-0.434383	Public	0.276383
1224	No	Male	0.022	4.583	-0.434383	Public	0.456383

hist(
  unlist(
    lapply(
      split(Hsb82[,"mAch"], Hsb82$school), mean, na.rm=T)), 
            main="Distribution of school averages", xlab="Mean math scores", prob=T)

library(tidyverse)

## ─ Attaching packages ────────────────────────────────── tidyverse 1.2.1 ─

## ✔ ggplot2 3.3.0     ✔ purrr   0.3.3
## ✔ tibble  2.1.1     ✔ dplyr   0.8.5
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0

## ─ Conflicts ──────────────────────────────────── tidyverse_conflicts() ─
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

attributes(women[1, 1])

## NULL

attributes(as_tibble(women)[1, 1])

## $names
## [1] "height"
## 
## $row.names
## [1] 1
## 
## $class
## [1] "tbl_df"     "tbl"        "data.frame"

%<>% & rename

%<>% – pipe to right & assign to left

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

library(dplyr)
dta <- as_tibble(mlmRev::Hsb82)
names(dta)

## [1] "school"  "minrty"  "sx"      "ses"     "mAch"    "meanses" "sector" 
## [8] "cses"

dta %<>% rename(SID=school, Minority=minrty, Gender=sx, SES=ses, Math=mAch,MSES=meanses, Sector=sector, CSES=cses)
names(dta)

## [1] "SID"      "Minority" "Gender"   "SES"      "Math"     "MSES"    
## [7] "Sector"   "CSES"

filter

dta %>% filter(SID == 1224, Gender == "Female") %>% head

# Not run
library(conflicted)
filter <- dplyr::filter

slice

dta %>% slice(-5:-n())

# head(dta, 4)

arrange

dta %>% arrange(desc(Math), SID) %>% head(., 5)

# head(dta[order(dta$Math, decreasing=T),], 5)

select & contains

dta %>% select(contains("SE")) %>% head

# conflict_prefer("select", "dplyr")

mutate & densityplot

dta %>% 
 mutate(SES_f = cut(SES, breaks=quantile(SES, probs=c(0, .33, .67, 1)), 
                   label=c("Low", "Middle", "High"), ordered=T)) %>%
 densityplot(~ Math, group=SES_f, data=., xlab="Socio-economic Status",
             plot.points=FALSE, ref=TRUE, auto.key=list(columns=3))

case_when

dta %>% mutate(Grade = case_when(
  Math < 7.28 ~ "Q1",
  Math >= 7.28 & Math < 12.13 ~ "Q2",
  Math >= 12.13 & Math < 18.32 ~ "Q3",
  Math >= 18.32 ~ "Q4")) %>%
  select(Grade, Math) %>%
  head

%T>% & transmute

%T% – tee，分流做兩件事

dta %>% transmute(SES, Math) %T>%
 plot(., xlab="Socio-economic Status", ylab="Math score", pch='.') %>% colMeans

##          SES         Math 
## 1.433542e-04 1.274785e+01

%$%

%$% – exposition pipe, like with()

dta %$% cor(Math, SES)

## [1] 0.3607556

group_by & summarize

dta %>%
  group_by(SID) %>%
  summarize(r=cor(Math, SES)) %>%
  select(r) %>% 
  summary()

##        r          
##  Min.   :-0.2406  
##  1st Qu.: 0.1220  
##  Median : 0.2366  
##  Mean   : 0.2299  
##  3rd Qu.: 0.3330  
##  Max.   : 0.6338

# summary(unlist(lapply(split(dta, dta$SID), function(x) cor(x$Math, x$SES))))

segplot

library(latticeExtra)

## Loading required package: RColorBrewer

## 
## Attaching package: 'latticeExtra'

## The following object is masked from 'package:ggplot2':
## 
##     layer

dta %>% group_by(Minority, Gender) %>%
 summarize(Math_m=mean(Math, na.rm=T), Math_se=sd(Math, na.rm=T)/sqrt(n()), Math_lb=Math_m-1.96*Math_se, Math_ub=Math_m+1.96*Math_se) %>%
 latticeExtra::segplot(Gender ~ Math_lb + Math_ub | Minority, data=., center=Math_m, draw.bands=FALSE, segments.fun=panel.arrows, ends='both', angle=90, length=1, unit='mm')

spread & ungroup & transmute

Summarize a transformed variable

dta %>%
 mutate(U90 = Math > quantile(Math, prob = 0.9)) %>%
 group_by(U90, Gender) %>%
 summarize(tot=n()) %>%
 tidyr::spread(key=Gender, value=tot) %>%
 ungroup() %>%
 transmute(Prop_F = Female / (Female+Male), U90=U90)

distinct & sample_n & pull

Select at random by a variable

set.seed(1234)
s5 <- dta %>% distinct(SID) %>% sample_n(5) %>% pull()
dta %>% filter(SID %in% s5) %>%
 xyplot(Math ~ CSES | SID, groups=Gender, type=c("p","g","r"), data=., xlab = "Socio-economic Status score", ylab="Math score", auto.key=list(columns=2))

## sample_frac & hist(table)

by_school <- dta %>% group_by(SID)
set.seed(1234)
hsb10 <- sample_frac(by_school, 0.1)
hist(table(hsb10$SID)/table(Hsb82$school), prob=T, xlab="Proportion", main="Distribution of sample proportion")

pacman::p_load(HSAUR3)
data("BtheB", package="HSAUR3")
# example(BtheB)

kable & kable_styling

library(knitr)
library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

head(BtheB) %>% kable("html") %>%
 kable_styling(bootstrap_options="striped", full_width=T)

drug	length	treatment	bdi.pre	bdi.2m	bdi.3m	bdi.5m	bdi.8m
No	>6m	TAU	29	2	2	NA	NA
Yes	>6m	BtheB	32	16	24	17	20
Yes	<6m	TAU	25	20	NA	NA	NA
No	>6m	BtheB	21	17	16	10	9
Yes	>6m	BtheB	26	23	NA	NA	NA
Yes	<6m	BtheB	7	0	0	0	0

dta <- as_tibble(BtheB) %>% 
 mutate(Subject=paste0("S", 101:200)) %>%
 rename(bdi.0m=bdi.pre)
dta %>% as.data.frame() %>% head

gather

to long format

dtaL <- dta %>% 
 rename(Drug=drug, Treatment=treatment, Length=length) %>%
 gather(key=Time, value=BDI, contains("bdi")) %>%
 arrange(Subject)
dtaL %>% as.data.frame() %>% head(., 11)

separate & parse_number

dtaL %<>% 
 separate(Time, c("prefix", "month")) %>%
 mutate(Month=parse_number(month)) %>%
 select(-c(prefix, month))
dtaL %>% as.data.frame() %>% head(., 9)

unite

Bind columns

dtaL %>% unite(TDL, Treatment, Drug, Length) %>%
 xyplot(BDI ~ Month | TDL, groups=Subject, data=., type=c("b","g"), xlab="Time (in months)", ylab="BDI score")

separate

long to wide

dtaW <- dtaL %>%
 mutate(pre=rep("m", dim(dtaL)[1])) %>%
 unite(Time, pre, Month) %>%
 spread(Time, BDI) %>%
 arrange(Subject)
dtaW %>% as.data.frame() %>% head(., 5)

aggregate(cbind(m_0, m_2, m_3, m_5, m_8) ~ Treatment, data=dtaW, FUN=mean)

Individual regression

dtaL %>% select(Subject, BDI, Month) %>%
 split(list(.$Subject)) %>%
 purrr::map(~ coef(lm(BDI ~ Month, data=.))) %>%
 unlist %>% as.numeric %>%
 matrix(ncol=2, byrow=T) %>%
 plot(xlab="Intercept estimates", ylab="Slope estimates")

cal_betas <- function(df) { lm(BDI ~ Month, data = df)$coefficients }
betas <- t(simplify2array(by(dtaL, dtaL$Subject, cal_betas)))
plot(betas, xlab="Intercept estimates", ylab="Slope estimates", main="Scatter diagram of regression estimates")

m0 <- lm(BDI ~ Subject + Month + Subject:Month - 1, data=dtaL)
n <- dim(dta)[1]
df <- data.frame(b0=coef(m0)[1:n], b1=c(coef(m0)[n+1], coef(m0)[n+1] + coef(m0)[(n+2):(2*n)]))
plot(df)

merge

nbl_c <- read.table("nobel_countries.txt", h = T)
nbl_w <- read.table("nobel_winners.txt", h = T)
head(nbl_c, 3)

tail(nbl_w, 3)

merge(nbl_c, nbl_w)

merge(nbl_c, nbl_w, all = TRUE)

inner_join

交集

inner_join(nbl_w, nbl_c)

## Joining, by = "Year"

semi-join

只保留x的column

semi_join(nbl_w, nbl_c)

## Joining, by = "Year"

left_join

保留x的key

left_join(nbl_w, nbl_c)

## Joining, by = "Year"

anti_join

x中沒有對應到y的，只列出x的行

anti_join(nbl_w, nbl_c)

## Joining, by = "Year"

full_join

聯集

full_join(nbl_w, nbl_c)

## Joining, by = "Year"

c5slideExercise

Data Wrangling

0410

Base R approach

with(plot)

name abbreviation

split

unstack

with(boxplot)

aggregate

xyplot

lapply

reshape > longFormat

xyplot

quantile

cut > numToFactor

Plot!

Tidy approach

%<>% & rename

filter

slice

arrange

select & contains

mutate & densityplot

case_when

%T>% & transmute

%$%

group_by & summarize

segplot

spread & ungroup & transmute

distinct & sample_n & pull

kable & kable_styling

gather

separate & parse_number

unite

separate

Individual regression

merge

inner_join

semi-join

left_join

anti_join

full_join