pacman::p_load(Ecdat,mosaic,memisc,mlmRev,HSAUR3,knitr,kableExtra,readr,dplyr,ggplot2,tidyr,car,magrittr,tibble,data.table,stringr)
# read dataset
dta <- Ecdat::Caschool
head(dta)
## distcod county district grspan enrltot teachers
## 1 75119 Alameda Sunol Glen Unified KK-08 195 10.90
## 2 61499 Butte Manzanita Elementary KK-08 240 11.15
## 3 61549 Butte Thermalito Union Elementary KK-08 1550 82.90
## 4 61457 Butte Golden Feather Union Elementary KK-08 243 14.00
## 5 61523 Butte Palermo Union Elementary KK-08 1335 71.50
## 6 62042 Fresno Burrel Union Elementary KK-08 137 6.40
## calwpct mealpct computer testscr compstu expnstu str avginc
## 1 0.5102 2.0408 67 690.80 0.3435898 6384.911 17.88991 22.690001
## 2 15.4167 47.9167 101 661.20 0.4208333 5099.381 21.52466 9.824000
## 3 55.0323 76.3226 169 643.60 0.1090323 5501.955 18.69723 8.978000
## 4 36.4754 77.0492 85 647.70 0.3497942 7101.831 17.35714 8.978000
## 5 33.1086 78.4270 171 640.85 0.1280899 5235.988 18.67133 9.080333
## 6 12.3188 86.9565 25 605.55 0.1824818 5580.147 21.40625 10.415000
## elpct readscr mathscr
## 1 0.000000 691.6 690.0
## 2 4.583333 660.5 661.9
## 3 30.000002 636.3 650.9
## 4 0.000000 651.9 643.5
## 5 13.857677 641.8 639.9
## 6 12.408759 605.7 605.4
#Select random
set.seed(1070326)
dta2 <- group_by(dta,county) %>% sample_n(1)
#draw a scatter diagram
plot(dta2$mathscr,dta2$readscr,xlab = "Average math score", ylab = "Average reading score")
options(continue=" “)
options(digits=3)
options(width=72)
ds = read.csv(“http://www.amherst.edu/~nhorton/r2/datasets/help.csv”) library(dplyr)
newds = select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)
names(newds)
str(newds[,1:10])
summary(newds[,1:10])
head(newds, n=3)
comment(newds) = “HELP baseline dataset” comment(newds)
save(ds, file=“savedfile”)
write.csv(ds, file=“ds.csv”)
library(foreign) ### 寫成SAS可讀格式 write.foreign(newds, “file.dat”, “file.sas”, package=“SAS”)
with(newds, cesd[1:10]) with(newds, head(cesd, 10)) with(newds, cesd[cesd > 56])
library(dplyr) ###先選擇newds中cesd大於56的觀察值,再選擇其id和cesd兩欄資料檢視。 filter(newds, cesd > 56) %>% select(id, cesd) #
with(newds, sort(cesd)[1:4])
with(newds, which.min(cesd))
library(mosaic) ###檢查f1g變項的NA值,輸出成tbl tally(~ is.na(f1g), data=newds)
###對f1g變項做一些敘述統計,輸出成tbl favstats(~ f1g, data=newds)
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g, (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p), f1q, f1r, f1s, f1t)) ### 計算NA數量 nmisscesd = apply(is.na(cesditems), 1, sum) ncesditems = cesditems ncesditems[is.na(cesditems)] = 0
newcesd = apply(ncesditems, 1, sum)
imputemeancesd = 20/(20-nmisscesd)*newcesd
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]
library(dplyr) # mutate來自dplyr library(memisc) # cases來自memisc newds = mutate(newds, drinkstat= cases(“abstinent” = i1==0, “moderate” = (i1>0 & i1<=1 & i2<=3 & female==1) | (i1>0 & i1<=2 & i2<=4 & female==0), “highrisk” = ((i1>1 | i2>3) & female==1) | ((i1>2 | i2>4) & female==0)))
library(mosaic); detach(package:memisc); detach(package:MASS)
library(dplyr) ###選擇i1, i2, female, drinkstat四個變項 tmpds = select(newds, i1, i2, female, drinkstat)
tmpds[365:370,]
filter(tmpds, drinkstat==“moderate” & female==1)
library(gmodels) # CrossTable出處 with(tmpds, CrossTable(drinkstat)) with(tmpds, CrossTable(drinkstat, female, prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))
newds = transform(newds, gender=factor(female, c(0,1), c(“Male”,“Female”))) tally(~ female + gender, margin=FALSE, data=newds)
library(dplyr) ### 依序使用cesd和id排序ds資料框 newds = arrange(ds, cesd, i1) newds[1:5, c(“cesd”, “i1”, “id”)] # 選出ds中的女性並計算其平均 females = filter(ds, female==1) with(females, mean(cesd)) mean(ds\(cesd[ds\)female==1])
with(ds, tapply(cesd, female, mean))
library(mosaic) mean(cesd ~ female, data=ds)
#read dataset
dta4<-HSAUR3::backpain
head(dta4)
## ID status driver suburban
## 1 1 case yes yes
## 2 1 control yes no
## 3 2 case yes yes
## 4 2 control yes yes
## 5 3 case yes no
## 6 3 control yes yes
with(dta4, table(driver, suburban,status))
## , , status = case
##
## suburban
## driver no yes
## no 26 6
## yes 64 121
##
## , , status = control
##
## suburban
## driver no yes
## no 47 7
## yes 63 100
group_by(dta4,driver, suburban,status) %>%
tally() %>% spread(status, n) %>% mutate(total = case + control)
## # A tibble: 4 x 5
## # Groups: driver, suburban [4]
## driver suburban case control total
## <fct> <fct> <int> <int> <int>
## 1 no no 26 47 73
## 2 no yes 6 7 13
## 3 yes no 64 63 127
## 4 yes yes 121 100 221
library(datasets)
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 3.4.4
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
dta5_1<-state.x77
dta5_2<-USArrests
# merge
dta5<-merge(dta5_1,dta5_2)
# plot
ggpairs(dta5[, -1])
##Highly correlated(0.7~0.99):Life_exp & Illiteracy
library(MASS)
dta6<-merge(Animals,mammals,"row.names")
head(dta6)
## Row.names body.x brain.x body.y brain.y
## 1 African elephant 6654.00 5712.0 6654.00 5712.0
## 2 Asian elephant 2547.00 4603.0 2547.00 4603.0
## 3 Cat 3.30 25.6 3.30 25.6
## 4 Chimpanzee 52.16 440.0 52.16 440.0
## 5 Cow 465.00 423.0 465.00 423.0
## 6 Donkey 187.10 419.0 187.10 419.0
duplicated(dta6)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE
dim(dta6)
## [1] 23 5
# read dataset
dta7 <- Ecdat::Caschool
dta7$ratio <- dta7$enrltot / dta7$teachers
dta7 <- dta7 %>% filter(grspan == "KK-08") %>%
mutate(level_read = cut(readscr, breaks = quantile(readscr, probs = c(0, .33, .67, 1)), label = c("L", "M", "H"), ordered = T))
# plot
library(lattice)
xyplot(readscr ~ ratio | level_read, data = dta7,
type = c("p", "g", "r"), layout = c(3, 1),
xlab = "Student-Teacher Ratio",
ylab = "Reading Score")
#8-1:Find the median prestige score...
dta8_1 <- car::Prestige
group_by(na.omit(dta8_1),type) %>% summarise(median(prestige))
## # A tibble: 3 x 2
## type `median(prestige)`
## <fct> <dbl>
## 1 bc 35.9
## 2 prof 68.4
## 3 wc 41.5
#8-2:define two levels of prestige...
dta8_2 <- car::Prestige %>%
na.omit %>%
group_by(type) %>%
mutate(m = median(prestige),
PrestigeLevel = memisc::cases("High" = prestige >= m, "Low" = prestige < m))
# plot
xyplot(income ~ education |type,
group = PrestigeLevel,
data = dta8_2,
type = c("p", "g", "r"),
layout = c(3, 1),
auto.key=list(space="right", row=2),
xlab = "Average education of occupational incumbents",
ylab = "Average income of incumbents")
library(ggplot2)
dta_9 <- mlmRev::Hsb82 %>% group_by(sector, school) %>% summarise(m = mean(mAch, na.rm = TRUE),s = sd(mAch, na.rm = TRUE),n = n(),se = s/sqrt(n)) %>% mutate(lower.ci = m-2*se, upper.ci = m+2*se)
# plot
ggplot(dta_9, aes(x = school, y = m)) + geom_point() + geom_errorbar(aes(ymax = upper.ci, ymin = lower.ci))+ coord_flip()+ facet_wrap(~sector)+ labs(x = "School",y = "Math Mean Score by School")