2
#讀進資料&看資料
dt2<-Ecdat::Caschool
knitr::kable(head(dt2))| distcod | county | district | grspan | enrltot | teachers | calwpct | mealpct | computer | testscr | compstu | expnstu | str | avginc | elpct | readscr | mathscr |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 75119 | Alameda | Sunol Glen Unified | KK-08 | 195 | 10.90 | 0.5102 | 2.0408 | 67 | 690.80 | 0.3435898 | 6384.911 | 17.88991 | 22.690001 | 0.000000 | 691.6 | 690.0 |
| 61499 | Butte | Manzanita Elementary | KK-08 | 240 | 11.15 | 15.4167 | 47.9167 | 101 | 661.20 | 0.4208333 | 5099.381 | 21.52466 | 9.824000 | 4.583334 | 660.5 | 661.9 |
| 61549 | Butte | Thermalito Union Elementary | KK-08 | 1550 | 82.90 | 55.0323 | 76.3226 | 169 | 643.60 | 0.1090323 | 5501.955 | 18.69723 | 8.978000 | 30.000002 | 636.3 | 650.9 |
| 61457 | Butte | Golden Feather Union Elementary | KK-08 | 243 | 14.00 | 36.4754 | 77.0492 | 85 | 647.70 | 0.3497942 | 7101.831 | 17.35714 | 8.978000 | 0.000000 | 651.9 | 643.5 |
| 61523 | Butte | Palermo Union Elementary | KK-08 | 1335 | 71.50 | 33.1086 | 78.4270 | 171 | 640.85 | 0.1280899 | 5235.988 | 18.67133 | 9.080333 | 13.857677 | 641.8 | 639.9 |
| 62042 | Fresno | Burrel Union Elementary | KK-08 | 137 | 6.40 | 12.3188 | 86.9565 | 25 | 605.55 | 0.1824818 | 5580.147 | 21.40625 | 10.415000 | 12.408759 | 605.7 | 605.4 |
#設亂數種子,每個國家抽一個,然後畫散布圖
set.seed(1234567)
dt2.1 <- dt2 %>% group_by(county) %>% sample_n(1)
ggplot(dt2.1, aes(x=mathscr,y=readscr)) + geom_point()3
## ----echo=FALSE,eval=TRUE------------------------------------------------
options(continue=" ")
## ------------------------------------------------------------------------
options(digits=3) #設定小數位數為3
options(width=72) # narrow output
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv") #讀進資料
library(dplyr) #載入dplyr packages
#選擇要載入的變項
newds = select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e,
f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)
## ------------------------------------------------------------------------
names(newds) #看變項的名稱## [1] "cesd" "female" "i1" "i2" "id" "treat" "f1a"
## [8] "f1b" "f1c" "f1d" "f1e" "f1f" "f1g" "f1h"
## [15] "f1i" "f1j" "f1k" "f1l" "f1m" "f1n" "f1o"
## [22] "f1p" "f1q" "f1r" "f1s" "f1t"
str(newds[,1:10]) # structure of the first 10 variables## 'data.frame': 453 obs. of 10 variables:
## $ cesd : int 49 30 39 15 39 6 52 32 50 46 ...
## $ female: int 0 0 0 1 0 1 1 0 1 0 ...
## $ i1 : int 13 56 0 5 10 4 13 12 71 20 ...
## $ i2 : int 26 62 0 5 13 4 20 24 129 27 ...
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ treat : int 1 1 0 0 0 1 0 1 0 1 ...
## $ f1a : int 3 3 3 0 3 1 3 1 3 2 ...
## $ f1b : int 2 2 2 0 0 0 1 1 2 3 ...
## $ f1c : int 3 0 3 1 3 1 3 2 3 3 ...
## $ f1d : int 0 3 0 3 3 3 1 3 1 0 ...
## ------------------------------------------------------------------------
summary(newds[,1:10]) # summary of the first 10 variables## cesd female i1 i2
## Min. : 1.0 Min. :0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:25.0 1st Qu.:0.000 1st Qu.: 3.0 1st Qu.: 3.0
## Median :34.0 Median :0.000 Median : 13.0 Median : 15.0
## Mean :32.8 Mean :0.236 Mean : 17.9 Mean : 22.6
## 3rd Qu.:41.0 3rd Qu.:0.000 3rd Qu.: 26.0 3rd Qu.: 32.0
## Max. :60.0 Max. :1.000 Max. :142.0 Max. :184.0
## id treat f1a f1b
## Min. : 1 Min. :0.000 Min. :0.00 Min. :0.00
## 1st Qu.:119 1st Qu.:0.000 1st Qu.:1.00 1st Qu.:0.00
## Median :233 Median :0.000 Median :2.00 Median :1.00
## Mean :233 Mean :0.497 Mean :1.63 Mean :1.39
## 3rd Qu.:348 3rd Qu.:1.000 3rd Qu.:3.00 3rd Qu.:2.00
## Max. :470 Max. :1.000 Max. :3.00 Max. :3.00
## f1c f1d
## Min. :0.00 Min. :0.00
## 1st Qu.:1.00 1st Qu.:0.00
## Median :2.00 Median :1.00
## Mean :1.92 Mean :1.56
## 3rd Qu.:3.00 3rd Qu.:3.00
## Max. :3.00 Max. :3.00
## ------------------------------------------------------------------------
head(newds, n=3) #看前三筆資料## cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1 49 0 13 26 1 1 3 2 3 0 2 3 3 0 2 3
## 2 30 0 56 62 2 1 3 2 0 3 3 2 0 0 3 0
## 3 39 0 0 0 3 0 3 2 3 0 2 2 1 3 2 3
## f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1 3 0 1 2 2 2 2 3 3 2
## 2 3 0 0 3 0 0 0 2 0 0
## 3 1 0 1 3 2 0 0 3 2 0
## ------------------------------------------------------------------------
comment(newds) = "HELP baseline dataset"
comment(newds) #看設置的comment## [1] "HELP baseline dataset"
save(ds, file="savedfile") #把檔案存檔
## ------------------------------------------------------------------------
write.csv(ds, file="ds.csv") #存成.csv檔
## ------------------------------------------------------------------------
library(foreign) #載入package
write.foreign(newds, "file.dat", "file.sas", package="SAS") #把檔案存成SAS
## ------------------------------------------------------------------------
with(newds, cesd[1:10]) #看cesd的前十個數值是多少## [1] 49 30 39 15 39 6 52 32 50 46
with(newds, head(cesd, 10)) #看cesd的前十個數值是多少## [1] 49 30 39 15 39 6 52 32 50 46
## ------------------------------------------------------------------------
with(newds, cesd[cesd > 56]) #找出cesd大於56的數值有多少## [1] 57 58 57 60 58 58 57
## ------------------------------------------------------------------------
library(dplyr) #載入package
filter(newds, cesd > 56) %>% select(id, cesd) #挑出cesd>56的id編號## id cesd
## 1 71 57
## 2 127 58
## 3 200 57
## 4 228 60
## 5 273 58
## 6 351 58
## 7 13 57
## ------------------------------------------------------------------------
with(newds, sort(cesd)[1:4]) #把cesd由小排到大之後看前四筆## [1] 1 3 3 4
with(newds, which.min(cesd)) #找到cesd的最小值排在第幾列## [1] 199
## ------------------------------------------------------------------------
library(mosaic) #載入package## Loading required package: lattice
## Loading required package: ggformula
##
## New to ggformula? Try the tutorials:
## learnr::run_tutorial("introduction", package = "ggformula")
## learnr::run_tutorial("refining", package = "ggformula")
## Loading required package: mosaicData
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median,
## prop.test, quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
tally(~ is.na(f1g), data=newds) #檢查flg的遺漏值,轉換成tbl## is.na(f1g)
## TRUE FALSE
## 1 452
favstats(~ f1g, data=newds) #對flg做敘述統計報告## min Q1 median Q3 max mean sd n missing
## 0 1 2 3 3 1.73 1.1 452 1
## ------------------------------------------------------------------------
# reverse code f1d, f1h, f1l and f1p
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g,
(3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p),
f1q, f1r, f1s, f1t)) #將反向題挑出來重新編碼
nmisscesd = apply(is.na(cesditems), 1, sum) #計算NA的數量
ncesditems = cesditems #重新存成另一個資料
ncesditems[is.na(cesditems)] = 0 #讓NA變成0
newcesd = apply(ncesditems, 1, sum) #計算沒有遺漏值之後的平均
imputemeancesd = 20/(20-nmisscesd)*newcesd #用平均來代替NA
##把這些東西列成一個表格
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,] ## newcesd newds.cesd nmisscesd imputemeancesd
## 4 15 15 1 15.8
## 17 19 19 1 20.0
## 87 44 44 1 46.3
## 101 17 17 1 17.9
## 154 29 29 1 30.5
## 177 44 44 1 46.3
## 229 39 39 1 41.1
## ----createdrink,message=FALSE-------------------------------------------
library(dplyr) #載入package
library(memisc) #載入package## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'memisc'
## The following object is masked from 'package:Matrix':
##
## as.array
## The following objects are masked from 'package:dplyr':
##
## collect, recode, rename
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
##
## as.array
newds = mutate(newds, drinkstat=
cases(
"abstinent" = i1==0,
"moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
(i1>0 & i1<=2 & i2<=4 & female==0),
"highrisk" = ((i1>1 | i2>3) & female==1) |
((i1>2 | i2>4) & female==0)))
#把不同數值的資料給他一個類別的標籤
## ----echo=FALSE----------------------------------------------------------
library(mosaic) #載入package
##解除這兩個package
detach(package:memisc)
detach(package:MASS)
## ------------------------------------------------------------------------
library(dplyr) #載入package
tmpds = select(newds, i1, i2, female, drinkstat) #挑出需要的變項
tmpds[365:370,] #看365到370筆資料## i1 i2 female drinkstat
## 365 6 24 0 highrisk
## 366 6 6 0 highrisk
## 367 0 0 0 abstinent
## 368 0 0 1 abstinent
## 369 8 8 0 highrisk
## 370 32 32 0 highrisk
filter(tmpds, drinkstat=="moderate" & female==1) #找出喝酒程度是moderate的女性## i1 i2 female drinkstat
## 1 1 1 1 moderate
## 2 1 3 1 moderate
## 3 1 2 1 moderate
## 4 1 1 1 moderate
## 5 1 1 1 moderate
## 6 1 1 1 moderate
## 7 1 1 1 moderate
library(gmodels) #載入package
with(tmpds, CrossTable(drinkstat)) #看次數分配表##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | abstinent | moderate | highrisk |
## |-----------|-----------|-----------|
## | 68 | 28 | 357 |
## | 0.150 | 0.062 | 0.788 |
## |-----------|-----------|-----------|
##
##
##
##
with(tmpds, CrossTable(drinkstat, female,
prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE)) #橫列是喝酒程度、直欄是性別##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | female
## drinkstat | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## abstinent | 42 | 26 | 68 |
## | 0.618 | 0.382 | 0.150 |
## -------------|-----------|-----------|-----------|
## moderate | 21 | 7 | 28 |
## | 0.750 | 0.250 | 0.062 |
## -------------|-----------|-----------|-----------|
## highrisk | 283 | 74 | 357 |
## | 0.793 | 0.207 | 0.788 |
## -------------|-----------|-----------|-----------|
## Column Total | 346 | 107 | 453 |
## -------------|-----------|-----------|-----------|
##
##
##把性別的0,1對應male與female的標籤,創一個新變項
newds = transform(newds,
gender=factor(female, c(0,1), c("Male","Female")))
tally(~ female + gender, margin=FALSE, data=newds)## gender
## female Male Female
## 0 346 0
## 1 0 107
newds = arrange(ds, cesd, i1) #排序
newds[1:5, c("cesd", "i1", "id")] #顯示1-5筆## cesd i1 id
## 1 1 3 233
## 2 3 1 139
## 3 3 13 418
## 4 4 4 251
## 5 4 9 95
females = filter(ds, female==1) #選出ds中的女性
with(females, mean(cesd)) #計算女性的平均cesd## [1] 36.9
# an alternative approach
mean(ds$cesd[ds$female==1]) #計算女性的平均cesd## [1] 36.9
## ------------------------------------------------------------------------
with(ds, tapply(cesd, female, mean)) #計算平均值## 0 1
## 31.6 36.9
library(mosaic) #載入package
mean(cesd ~ female, data=ds) #用~右邊的東西來界定根據甚麼計算平均## 0 1
## 31.6 36.9
4
read in the data and see
dt4<-HSAUR3::backpain
head(dt4) %>% knitr::kable()| ID | status | driver | suburban |
|---|---|---|---|
| 1 | case | yes | yes |
| 1 | control | yes | no |
| 2 | case | yes | yes |
| 2 | control | yes | yes |
| 3 | case | yes | no |
| 3 | control | yes | yes |
先把資料的列連表弄出來,然後把它們變成data.frame後拆開,再新增total一欄
with(dt4, ftable(driver, suburban,status)) %>% as.data.frame() %>% tidyr::spread(status,Freq) %>% mutate(total=case+control) ## driver suburban case control total
## 1 no no 26 47 73
## 2 no yes 6 7 13
## 3 yes no 64 63 127
## 4 yes yes 121 100 221
5
讀進資料後合併
dt51<-state.x77
dt52<-USArrests
dt5<-merge(dt51,dt52)
ggpairs(dt5[, -1])不過我沒有看出有什麼有趣的事情.. # 6
讀進資料後合併起來,看看有沒有重複,發現沒有
dt61<-MASS::Animals
dt62<-MASS::mammals
dt6<-merge(dt61,dt62,"row.names")
head(dt6)## Row.names body.x brain.x body.y brain.y
## 1 African elephant 6654.0 5712.0 6654.0 5712.0
## 2 Asian elephant 2547.0 4603.0 2547.0 4603.0
## 3 Cat 3.3 25.6 3.3 25.6
## 4 Chimpanzee 52.2 440.0 52.2 440.0
## 5 Cow 465.0 423.0 465.0 423.0
## 6 Donkey 187.1 419.0 187.1 419.0
duplicated(dt6)## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE
7
讀進資料後新增新的變項
dt7<-Ecdat::Caschool
dt7<- dt7 %>% mutate(ratio=round(enrltot/teachers,2),
reading_cut=cut(readscr, breaks = quantile(readscr, probs = c(0,.33,.67,1)),
label = c("L","M","H")))
#看一下資料的樣子
head(dt7)## distcod county district grspan enrltot
## 1 75119 Alameda Sunol Glen Unified KK-08 195
## 2 61499 Butte Manzanita Elementary KK-08 240
## 3 61549 Butte Thermalito Union Elementary KK-08 1550
## 4 61457 Butte Golden Feather Union Elementary KK-08 243
## 5 61523 Butte Palermo Union Elementary KK-08 1335
## 6 62042 Fresno Burrel Union Elementary KK-08 137
## teachers calwpct mealpct computer testscr compstu expnstu str avginc
## 1 10.9 0.51 2.04 67 691 0.344 6385 17.9 22.69
## 2 11.1 15.42 47.92 101 661 0.421 5099 21.5 9.82
## 3 82.9 55.03 76.32 169 644 0.109 5502 18.7 8.98
## 4 14.0 36.48 77.05 85 648 0.350 7102 17.4 8.98
## 5 71.5 33.11 78.43 171 641 0.128 5236 18.7 9.08
## 6 6.4 12.32 86.96 25 606 0.182 5580 21.4 10.41
## elpct readscr mathscr ratio reading_cut
## 1 0.00 692 690 17.9 H
## 2 4.58 660 662 21.5 M
## 3 30.00 636 651 18.7 L
## 4 0.00 652 644 17.4 M
## 5 13.86 642 640 18.7 L
## 6 12.41 606 605 21.4 L
畫圖
lattice::xyplot(readscr ~ ratio | reading_cut, data = dt7,
type = c("p", "g", "r"), layout = c(3, 1),
xlab = "Student-Teacher Ratio",
ylab = "Reading Score")8
read in the data and see
dt8<-car::Prestige
knitr::kable(head(dt8))| education | income | women | prestige | census | type | |
|---|---|---|---|---|---|---|
| gov.administrators | 13.1 | 12351 | 11.16 | 68.8 | 1113 | prof |
| general.managers | 12.3 | 25879 | 4.02 | 69.1 | 1130 | prof |
| accountants | 12.8 | 9271 | 15.70 | 63.4 | 1171 | prof |
| purchasing.officers | 11.4 | 8865 | 9.11 | 56.8 | 1175 | prof |
| chemists | 14.6 | 8403 | 11.68 | 73.5 | 2111 | prof |
| physicists | 15.6 | 11030 | 5.13 | 77.6 | 2113 | prof |
Find the median prestige score for each of the three types of occupation, respectively.
dt8 %>% na.omit() %>% group_by(type) %>% summarise(median(prestige))## # A tibble: 3 x 2
## type `median(prestige)`
## <fct> <dbl>
## 1 bc 35.9
## 2 prof 68.4
## 3 wc 41.5
The relationship between income and education for each category
dt8 %>% filter(type=="bc") %>%
mutate(type_bc = cut(prestige, breaks = c(0,35.9,100),
label = c("L", "H"),ordered = T)) %>%
ggplot(data = ., aes(x = education, y = income,color=type_bc)) +
geom_point() + geom_line(aes(group = type_bc)) 在type=bc的組別中,prestige比較高的income也比較高、education也比較高
dt8 %>% filter(type=="prof") %>%
mutate(type_pr = cut(prestige, breaks = c(0,68.4,100),
label = c("L", "H"),ordered = T)) %>%
ggplot(data = ., aes(x = education, y = income,color=type_pr)) +
geom_point() + geom_line(aes(group = type_pr))在type=prof的組別中,prestige比較高的組別income的變化比較大、education比較高。
dt8 %>% filter(type=="wc") %>%
mutate(type_wc = cut(prestige, breaks = c(0,41.5,100),
label = c("L", "H"),ordered = T)) %>%
ggplot(data = ., aes(x = education, y = income,color=type_wc)) +
geom_point() + geom_line(aes(group = type_wc))在type=wc的組別中,prestige比較高的組別education也比較高,但income看起來無太大差異
9
read the data and see
dt9<-mlmRev::Hsb82
dt9$school <- factor(dt9$school)
knitr::kable(head(dt9))| school | minrty | sx | ses | mAch | meanses | sector | cses |
|---|---|---|---|---|---|---|---|
| 1224 | No | Female | -1.528 | 5.88 | -0.434 | Public | -1.094 |
| 1224 | No | Female | -0.588 | 19.71 | -0.434 | Public | -0.154 |
| 1224 | No | Male | -0.528 | 20.35 | -0.434 | Public | -0.094 |
| 1224 | No | Male | -0.668 | 8.78 | -0.434 | Public | -0.234 |
| 1224 | No | Male | -0.158 | 17.90 | -0.434 | Public | 0.276 |
| 1224 | No | Male | 0.022 | 4.58 | -0.434 | Public | 0.456 |
dt9 %>% group_by(school) %>% summarise(mathsum=mean(mAch))## # A tibble: 160 x 2
## school mathsum
## <ord> <dbl>
## 1 8367 4.55
## 2 8854 4.24
## 3 4458 5.81
## 4 5762 4.32
## 5 6990 5.98
## 6 5815 7.27
## 7 7172 8.07
## 8 4868 12.3
## 9 7341 9.79
## 10 1358 11.2
## # ... with 150 more rows
plot the confidence interval
dt9 %>%
group_by(school) %>%
summarize(math_m=mean(mAch, na.rm = T),
math_se=sd(mAch, na.rm = T)/sqrt(n())) %>%
ggplot(data = ., aes(x = school, y = math_m)) +
geom_point() +
geom_line(aes(group = school)) +
geom_errorbar(aes(ymin = math_m - 2*math_se, ymax = math_m + 2*math_se), width = .1) +
labs(x = "school", y = "Average Math score") +
theme_bw() ## geom_path: Each group consists of only one observation. Do you need
## to adjust the group aesthetic?
calculate the confidence interval
dt9 %>%
group_by(school) %>%
summarize(math_m=mean(mAch, na.rm = T),
math_se=sd(mAch, na.rm = T)/sqrt(n())) %>%
mutate(math_ci_upper=math_m - 2*math_se,
math_ci_lower=math_m + 2*math_se) ## # A tibble: 160 x 5
## school math_m math_se math_ci_upper math_ci_lower
## <ord> <dbl> <dbl> <dbl> <dbl>
## 1 8367 4.55 1.18 2.19 6.92
## 2 8854 4.24 0.956 2.33 6.15
## 3 4458 5.81 0.646 4.52 7.10
## 4 5762 4.32 0.821 2.68 5.97
## 5 6990 5.98 0.729 4.52 7.44
## 6 5815 7.27 1.12 5.04 9.51
## 7 7172 8.07 0.846 6.38 9.76
## 8 4868 12.3 0.932 10.4 14.2
## 9 7341 9.79 0.757 8.28 11.3
## 10 1358 11.2 1.07 9.06 13.4
## # ... with 150 more rows