dta <- Caschool
head(dta, 3)
distcod county district grspan enrltot teachers
1 75119 Alameda Sunol Glen Unified KK-08 195 10.90
2 61499 Butte Manzanita Elementary KK-08 240 11.15
3 61549 Butte Thermalito Union Elementary KK-08 1550 82.90
calwpct mealpct computer testscr compstu expnstu str avginc elpct
1 0.5102 2.041 67 690.8 0.3436 6385 17.89 22.690 0.000
2 15.4167 47.917 101 661.2 0.4208 5099 21.52 9.824 4.583
3 55.0323 76.323 169 643.6 0.1090 5502 18.70 8.978 30.000
readscr mathscr
1 691.6 690.0
2 660.5 661.9
3 636.3 650.9
#sampling
set.seed(33333)
new_dta <- dta %>% group_by(county) %>%
sample_n(1)
#plot
plot(new_dta$mathscr,new_dta$readscr,
xlab = "Average math score", ylab = "Average reading score")
###
options(continue=" ")
###基本設定
options(digits=3) #設小數點位數
options(width=72) # narrow output
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv") #讀入資料
library(dplyr) #讀入package
newds = select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e,
f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t) #選取需要的變項
###檢視資料
names(newds) #列出變項名稱
str(newds[,1:10]) #檢視前十筆變項的結構
summary(newds[,1:10]) # 檢視前十筆變項的摘要
head(newds, n=3) #檢視前三筆資料
##寫註解、儲存
comment(newds) = "HELP baseline dataset" #在object下註解
comment(newds) #檢視註解
save(ds, file="savedfile") #儲存
### 輸出檔案
write.csv(ds, file="ds.csv") #輸出csv檔
## 輸出檔案
library(foreign) #讀入package
write.foreign(newds, "file.dat", "file.sas", package="SAS") #分別輸出data、code的檔,使用package : SAS
### 檢視資料
with(newds, cesd[1:10]) #在newds這個object中,看變項csed前十列
with(newds, head(cesd, 10)) #在newds這個object中,看變項csed前十列
with(newds, cesd[cesd > 56]) #在newds這個object中,看變項csed數值超過56的數值
### 篩選資料
library(dplyr) #讀入package
filter(newds, cesd > 56) %>% select(id, cesd) #選出csed大於56的列,並指顯示id、cesd兩個變項
### 檢視
with(newds, sort(cesd)[1:4]) #將csed由小排到大後,看前四筆資料的數值
with(newds, which.min(cesd)) #看csed最小的數值位於第幾列
### 檢視資料
library(mosaic) #讀入資料
tally(~ is.na(f1g), data=newds) #判別變項flg裡的na,用table列出
favstats(~ f1g, data=newds) #對變項flg做摘要,內容含min,Q1,median,Q3,max,mean,sd,n,missing
### 整理資料
# 反向題重新編碼reverse code f1d, f1h, f1l and f1p
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g,
(3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p),
f1q, f1r, f1s, f1t))
nmisscesd = apply(is.na(cesditems), 1, sum) #計算na個數
ncesditems = cesditems #建一個object
ncesditems[is.na(cesditems)] = 0 #把為na者用0代替
newcesd = apply(ncesditems, 1, sum) #計算na個數
imputemeancesd = 20/(20-nmisscesd)*newcesd #用平均值取代na
### 建立資料框架
#建立資料框架,列出nmisscesd大於0的列
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]
### 改變資料
library(dplyr) #讀入package
library(memisc) #讀入package
newds = mutate(newds, drinkstat=
cases(
"abstinent" = i1==0,
"moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
(i1>0 & i1<=2 & i2<=4 & female==0),
"highrisk" = ((i1>1 | i2>3) & female==1) |
((i1>2 | i2>4) & female==0))) #把變項改成類別資料
### 整理package
library(mosaic) #讀入package
detach(package:memisc) #卸載package
detach(package:MASS) #卸載package
### 整理資料
library(dplyr) #讀入package
tmpds = select(newds, i1, i2, female, drinkstat) #選擇變項
tmpds[365:370,] #列出第365到第370列
library(dplyr) #讀入package
filter(tmpds, drinkstat=="moderate" & female==1) #篩選擁有指定變項的值的列
### 製作表
library(gmodels) #讀入package
with(tmpds, CrossTable(drinkstat)) #次數與機率表
#列為飲酒、行為性別,不顯示行、列的比率,不顯示chi-square
with(tmpds, CrossTable(drinkstat, female,
prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))
### 整理、處理資料
newds = transform(newds,
gender=factor(female, c(0,1), c("Male","Female"))) #依據female設一個新的變項
tally(~ female + gender, margin=FALSE, data=newds) #製成table
library(dplyr) #讀入package
newds = arrange(ds, cesd, i1) #依序排列的資料框
newds[1:5, c("cesd", "i1", "id")] #列出前五筆,只顯示上述變項
library(dplyr) #讀入package
females = filter(ds, female==1) #列出變項female為1的列
with(females, mean(cesd)) #算某變項的平均數
mean(ds$cesd[ds$female==1]) #an alternative approach
with(ds, tapply(cesd, female, mean)) #算性別csed的平均
library(mosaic) #讀入package
mean(cesd ~ female, data=ds) #算性別csed的平均
library(HSAUR3)
dta4 <- backpain
head(dta4)
ID status driver suburban
1 1 case yes yes
2 1 control yes no
3 2 case yes yes
4 2 control yes yes
5 3 case yes no
6 3 control yes yes
with(dta4, table(driver, suburban,status))
, , status = case
suburban
driver no yes
no 26 6
yes 64 121
, , status = control
suburban
driver no yes
no 47 7
yes 63 100
#table
dta4 %>%
group_by(driver, suburban,status) %>%
tally() %>%
spread(status, n) %>%
mutate(total = case + control)
# A tibble: 4 x 5
# Groups: driver, suburban [4]
driver suburban case control total
<fct> <fct> <int> <int> <int>
1 no no 26 47 73
2 no yes 6 7 13
3 yes no 64 63 127
4 yes yes 121 100 221
Apply table() using dplyr : source
library(datasets)
dta5 <- merge(state.x77, USArrests, by = "row.names")
P <- cor(dta5[,-1]); P
Population Income Illiteracy Life Exp Murder.x HS Grad
Population 1.00000 0.20823 0.10762 -0.06805 0.34364 -0.09849
Income 0.20823 1.00000 -0.43708 0.34026 -0.23008 0.61993
Illiteracy 0.10762 -0.43708 1.00000 -0.58848 0.70298 -0.65719
Life Exp -0.06805 0.34026 -0.58848 1.00000 -0.78085 0.58222
Murder.x 0.34364 -0.23008 0.70298 -0.78085 1.00000 -0.48797
HS Grad -0.09849 0.61993 -0.65719 0.58222 -0.48797 1.00000
Frost -0.33215 0.22628 -0.67195 0.26207 -0.53888 0.36678
Area 0.02254 0.36332 0.07726 -0.10733 0.22839 0.33354
Murder.y 0.32024 -0.21521 0.70678 -0.77850 0.93369 -0.52159
Assault 0.31702 0.04093 0.51101 -0.62666 0.73976 -0.23031
UrbanPop 0.51260 0.48053 -0.06220 0.27147 0.01638 0.35868
Rape 0.30523 0.35739 0.15460 -0.26957 0.57996 0.27073
Frost Area Murder.y Assault UrbanPop Rape
Population -0.33215 0.02254 0.32024 0.31702 0.51260 0.3052
Income 0.22628 0.36332 -0.21521 0.04093 0.48053 0.3574
Illiteracy -0.67195 0.07726 0.70678 0.51101 -0.06220 0.1546
Life Exp 0.26207 -0.10733 -0.77850 -0.62666 0.27147 -0.2696
Murder.x -0.53888 0.22839 0.93369 0.73976 0.01638 0.5800
HS Grad 0.36678 0.33354 -0.52159 -0.23031 0.35868 0.2707
Frost 1.00000 0.05923 -0.54140 -0.46824 -0.24619 -0.2792
Area 0.05923 1.00000 0.14809 0.23121 -0.06155 0.5250
Murder.y -0.54140 0.14809 1.00000 0.80187 0.06957 0.5636
Assault -0.46824 0.23121 0.80187 1.00000 0.25887 0.6652
UrbanPop -0.24619 -0.06155 0.06957 0.25887 1.00000 0.4113
Rape -0.27921 0.52496 0.56358 0.66524 0.41134 1.0000
corrplot(P, method = "circle")
兩邊資料的謀殺個數有高相關(分別為1976與1973的數據)
library(MASS)
dta6 <- rbind(rownames_to_column(Animals,"type"), rownames_to_column(mammals,"type"))
dta6 <- dta6[!duplicated(dta6), ]
duplicated(dta6)
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[67] FALSE
dim(dta6)
[1] 67 3
library(Ecdat)
dta7 <- Caschool
dta7$ratio <- dta7$enrltot / dta7$teachers
dta7 <- dta7 %>% filter(grspan == "KK-08") %>%
mutate(level_read = cut(readscr, breaks = quantile(readscr, probs = c(0, .33, .67, 1)), label = c("L", "M", "H"), ordered = T))
# plot
library(lattice)
xyplot(readscr ~ ratio | level_read, data = dta7,
type = c("p", "g", "r"), layout = c(3, 1),
xlab = "Student-Teacher Ratio",
ylab = "Reading Score")
library(car)
Attaching package: 'car'
The following object is masked from 'package:memisc':
recode
The following object is masked from 'package:purrr':
some
The following object is masked from 'package:Ecdat':
Mroz
The following object is masked from 'package:dplyr':
recode
dta8 <- Prestige
dta8 <- dta8 %>% na.omit() %>%
group_by(type) %>%
summarize(m = median(prestige, na.rm = TRUE)); dta8
# A tibble: 3 x 2
type m
<fct> <dbl>
1 bc 35.9
2 prof 68.4
3 wc 41.5
library(car)
dta8_n <- Prestige
dta8_n <- dta8_n %>% na.omit() %>%
group_by(type) %>%
mutate(M = median(prestige), level = cases("High" = prestige > M,
"Low" = prestige <= M))
# plot
ggplot(dta8_n, aes(x = education, y = income, group = level)) +
geom_point(aes(color = level)) +
stat_smooth(method = "lm", se = F, aes(color = level)) +
facet_wrap(~ factor(type)) +
labs(x= "Average education of each occupation", y = "Average income of occupation") +
theme_bw()
library(mlmRev)
dta9 <- Hsb82
head(dta9, 3)
school minrty sx ses mAch meanses sector cses
1 1224 No Female -1.528 5.876 -0.4344 Public -1.09362
2 1224 No Female -0.588 19.708 -0.4344 Public -0.15362
3 1224 No Male -0.528 20.349 -0.4344 Public -0.09362
dta9 <- dta9 %>% group_by(school, sector) %>%
summarize(math_m = mean(mAch, na.rm = T),
math_se = sd(mAch, na.rm = T)/sqrt(n()))
# plot
ggplot(data = dta9, aes(x = school, y = math_m)) +
geom_point() +
facet_wrap( ~ sector) +
geom_errorbar(aes(ymin = math_m - 2*math_se, ymax = math_m + 2*math_se), width = .1) +
labs(x = "school", y = "Average Math Achievement score") +
coord_flip() +
theme_bw()