fL <- "http://www.amstat.org/publications/jse/datasets/sat.dat.txt"
dta <- read.table(fL, row.names=1)
names(dta) <- c("Spending", "PTR", "Salary", "PE", "Verbal", "Math", "SAT")
str(dta)## 'data.frame': 50 obs. of 7 variables:
## $ Spending: num 4.41 8.96 4.78 4.46 4.99 ...
## $ PTR : num 17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
## $ Salary : num 31.1 48 32.2 28.9 41.1 ...
## $ PE : int 8 47 27 6 45 29 81 68 48 65 ...
## $ Verbal : int 491 445 448 482 417 462 431 429 420 406 ...
## $ Math : int 538 489 496 523 485 518 477 468 469 448 ...
## $ SAT : int 1029 934 944 1005 902 980 908 897 889 854 ...
with(dta, plot(Salary, SAT, type='n', main='SAT and Teacher\'s Salary'))
#type='n' no plotting.
# add regression line
abline(lm(SAT ~ Salary, data=dta))
# label states
with(dta, text(Salary, SAT, labels=row.names(dta), cex=.5)) ## row.names
## [1] "Idaho" "Iowa" "Maine" "Ohio" "Texas" "Utah"
## [1] "Idaho" "Iowa" "Maine" "Ohio" "Texas" "Utah"
#Augment data with the built-in state.region{datasets}
dta$Region <- state.region
with(dta, table(Region))## Region
## Northeast South North Central West
## 9 16 12 13
## [1] 1029 934 944 1005 902 980 908 897 889 854 889 979 1048 882
## [15] 1099 1060 999 1021 896 909 907 1033 1085 1036 1045 1009 1050 917
## [29] 935 898 1015 892 865 1107 975 1027 947 880 888 844 1068 1040
## [43] 893 1076 901 896 937 932 1073 1001
## $Northeast
## [1] 908 896 907 935 898 892 880 888 901
##
## $South
## [1] 1029 1005 897 889 854 999 1021 909 1036 865 1027 844 1040 893
## [15] 896 932
##
## $`North Central`
## [1] 1048 882 1099 1060 1033 1085 1045 1050 1107 975 1068 1073
##
## $West
## [1] 934 944 902 980 889 979 1009 917 1015 947 1076 937 1001
library(lattice)
xyplot(SAT ~ Salary, groups=Region, data=dta, type=c("g","p","r"), auto.key=list(columns=4)) # type g grid r regression## $Northeast
## (Intercept) x$Salary
## 933.3285652 -0.7931598
##
## $South
## (Intercept) x$Salary
## 1224.689015 -8.758555
##
## $`North Central`
## (Intercept) x$Salary
## 1212.192020 -4.939625
##
## $West
## (Intercept) x$Salary
## 1188.215929 -6.463652
dtaL <- reshape(dta, varying=list(5:6), idvar="SID", v.names="Score", timevar="Test",times=c("Verbal", "Math"), direction="long")
# idvar.timevar的times 最前面
set.seed(1215)
subset(dtaL, SID %in% sample(state.abb, 3))xyplot(Score ~ Salary | Region, groups=Test, data=dtaL, type=c("p", "g", "r"),auto.key=list(columns=2)) ## aggregate
mspp <- aggregate(cbind(Spending, PTR, PE) ~ Region, data=dta, FUN=mean)
merge(mspp, mss, by='Region')[c(2,3,4,1),]## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 4.0 5.9 9.0 10.7 14.2 28.0 47.4 58.0 65.6 70.0 81.0
dta$PEf <- with(dta, cut(PE, ordered=T, breaks=c(0, 11, 51, 100),
labels=c("Low", "Middle", "High")))
# ordered=T => Levels: Low < Middle < High
with(dta, table(PEf))## PEf
## Low Middle High
## 18 15 17
with(dta,plot(SAT~Salary,type='n'))#col=as.numeric(PEf)+1,
abline(lm(SAT~Salary,data=dta),col='purple')
lapply(split(dta,dta$PEf), function(x) abline(lm(SAT~Salary,data=x),col=as.numeric(x$PEf)+1,lty=2))## $Low
## NULL
##
## $Middle
## NULL
##
## $High
## NULL
with(dta,text(SAT~Salary,labels=row.names(dta),col=as.numeric(PEf)+1,cex=.5))
with(mss_p,points(SAT~Salary,col=as.numeric(PEf)+1,pch=20))
with(mss_p,abline(lm(SAT~Salary),lty=3,col='purple')) ## boxplot & arrows & points
rb <- boxplot(SAT ~ PEf, data = dta, col = 'bisque', xlab="Percent eligible")
# tapply 根據第二個的類別做第一個的funciton
mn.t <- with(dta, tapply(SAT, PEf, mean))
se.t <- with(dta, tapply(SAT, PEf, sd))/sqrt(rb$n)
xi <- 0.3 + seq(rb$n)
arrows(xi, mn.t-1.96*se.t, xi, mn.t+1.96*se.t, code=3, col="orange", angle=90, length=.05, lwd=2)
points(xi, mn.t, col="darkorange", pch=16) ## xyplot
| school | minrty | sx | ses | mAch | meanses | sector | cses |
|---|---|---|---|---|---|---|---|
| 1224 | No | Female | -1.528 | 5.876 | -0.434383 | Public | -1.093617 |
| 1224 | No | Female | -0.588 | 19.708 | -0.434383 | Public | -0.153617 |
| 1224 | No | Male | -0.528 | 20.349 | -0.434383 | Public | -0.093617 |
| 1224 | No | Male | -0.668 | 8.781 | -0.434383 | Public | -0.233617 |
| 1224 | No | Male | -0.158 | 17.898 | -0.434383 | Public | 0.276383 |
| 1224 | No | Male | 0.022 | 4.583 | -0.434383 | Public | 0.456383 |
hist(
unlist(
lapply(
split(Hsb82[,"mAch"], Hsb82$school), mean, na.rm=T)),
main="Distribution of school averages", xlab="Mean math scores", prob=T)## ─ Attaching packages ────────────────────────────────── tidyverse 1.2.1 ─
## ✔ ggplot2 3.3.0 ✔ purrr 0.3.3
## ✔ tibble 2.1.1 ✔ dplyr 0.8.5
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ─ Conflicts ──────────────────────────────────── tidyverse_conflicts() ─
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## NULL
## $names
## [1] "height"
##
## $row.names
## [1] 1
##
## $class
## [1] "tbl_df" "tbl" "data.frame"
%<>% – pipe to right & assign to left
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
## [1] "school" "minrty" "sx" "ses" "mAch" "meanses" "sector"
## [8] "cses"
dta %<>% rename(SID=school, Minority=minrty, Gender=sx, SES=ses, Math=mAch,MSES=meanses, Sector=sector, CSES=cses)
names(dta)## [1] "SID" "Minority" "Gender" "SES" "Math" "MSES"
## [7] "Sector" "CSES"
dta %>%
mutate(SES_f = cut(SES, breaks=quantile(SES, probs=c(0, .33, .67, 1)),
label=c("Low", "Middle", "High"), ordered=T)) %>%
densityplot(~ Math, group=SES_f, data=., xlab="Socio-economic Status",
plot.points=FALSE, ref=TRUE, auto.key=list(columns=3))dta %>% mutate(Grade = case_when(
Math < 7.28 ~ "Q1",
Math >= 7.28 & Math < 12.13 ~ "Q2",
Math >= 12.13 & Math < 18.32 ~ "Q3",
Math >= 18.32 ~ "Q4")) %>%
select(Grade, Math) %>%
head%T% – tee,分流做兩件事
dta %>% transmute(SES, Math) %T>%
plot(., xlab="Socio-economic Status", ylab="Math score", pch='.') %>% colMeans## SES Math
## 1.433542e-04 1.274785e+01
## r
## Min. :-0.2406
## 1st Qu.: 0.1220
## Median : 0.2366
## Mean : 0.2299
## 3rd Qu.: 0.3330
## Max. : 0.6338
## Loading required package: RColorBrewer
##
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
##
## layer
dta %>% group_by(Minority, Gender) %>%
summarize(Math_m=mean(Math, na.rm=T), Math_se=sd(Math, na.rm=T)/sqrt(n()), Math_lb=Math_m-1.96*Math_se, Math_ub=Math_m+1.96*Math_se) %>%
latticeExtra::segplot(Gender ~ Math_lb + Math_ub | Minority, data=., center=Math_m, draw.bands=FALSE, segments.fun=panel.arrows, ends='both', angle=90, length=1, unit='mm')Summarize a transformed variable
dta %>%
mutate(U90 = Math > quantile(Math, prob = 0.9)) %>%
group_by(U90, Gender) %>%
summarize(tot=n()) %>%
tidyr::spread(key=Gender, value=tot) %>%
ungroup() %>%
transmute(Prop_F = Female / (Female+Male), U90=U90)Select at random by a variable
set.seed(1234)
s5 <- dta %>% distinct(SID) %>% sample_n(5) %>% pull()
dta %>% filter(SID %in% s5) %>%
xyplot(Math ~ CSES | SID, groups=Gender, type=c("p","g","r"), data=., xlab = "Socio-economic Status score", ylab="Math score", auto.key=list(columns=2)) ## sample_frac & hist(table)
by_school <- dta %>% group_by(SID)
set.seed(1234)
hsb10 <- sample_frac(by_school, 0.1)
hist(table(hsb10$SID)/table(Hsb82$school), prob=T, xlab="Proportion", main="Distribution of sample proportion")##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
| drug | length | treatment | bdi.pre | bdi.2m | bdi.3m | bdi.5m | bdi.8m |
|---|---|---|---|---|---|---|---|
| No | >6m | TAU | 29 | 2 | 2 | NA | NA |
| Yes | >6m | BtheB | 32 | 16 | 24 | 17 | 20 |
| Yes | <6m | TAU | 25 | 20 | NA | NA | NA |
| No | >6m | BtheB | 21 | 17 | 16 | 10 | 9 |
| Yes | >6m | BtheB | 26 | 23 | NA | NA | NA |
| Yes | <6m | BtheB | 7 | 0 | 0 | 0 | 0 |
dta <- as_tibble(BtheB) %>%
mutate(Subject=paste0("S", 101:200)) %>%
rename(bdi.0m=bdi.pre)
dta %>% as.data.frame() %>% headto long format
dtaL <- dta %>%
rename(Drug=drug, Treatment=treatment, Length=length) %>%
gather(key=Time, value=BDI, contains("bdi")) %>%
arrange(Subject)
dtaL %>% as.data.frame() %>% head(., 11)dtaL %<>%
separate(Time, c("prefix", "month")) %>%
mutate(Month=parse_number(month)) %>%
select(-c(prefix, month))
dtaL %>% as.data.frame() %>% head(., 9)Bind columns
dtaL %>% unite(TDL, Treatment, Drug, Length) %>%
xyplot(BDI ~ Month | TDL, groups=Subject, data=., type=c("b","g"), xlab="Time (in months)", ylab="BDI score")long to wide
dtaW <- dtaL %>%
mutate(pre=rep("m", dim(dtaL)[1])) %>%
unite(Time, pre, Month) %>%
spread(Time, BDI) %>%
arrange(Subject)
dtaW %>% as.data.frame() %>% head(., 5)dtaL %>% select(Subject, BDI, Month) %>%
split(list(.$Subject)) %>%
purrr::map(~ coef(lm(BDI ~ Month, data=.))) %>%
unlist %>% as.numeric %>%
matrix(ncol=2, byrow=T) %>%
plot(xlab="Intercept estimates", ylab="Slope estimates")cal_betas <- function(df) { lm(BDI ~ Month, data = df)$coefficients }
betas <- t(simplify2array(by(dtaL, dtaL$Subject, cal_betas)))
plot(betas, xlab="Intercept estimates", ylab="Slope estimates", main="Scatter diagram of regression estimates")m0 <- lm(BDI ~ Subject + Month + Subject:Month - 1, data=dtaL)
n <- dim(dta)[1]
df <- data.frame(b0=coef(m0)[1:n], b1=c(coef(m0)[n+1], coef(m0)[n+1] + coef(m0)[(n+2):(2*n)]))
plot(df)