Select at random one school per county in the data set Caschool{Ecdat} and draw a scatter diagram of average math score mathscr against average reading score readscr for the sampled data set. Make sure your results are reproducible (e.g., the same random sample will be drawn each time).
library(Ecfun)
library(Ecdat)
data(Caschool, package="Ecdat")
dta_exer1 <- Caschool
str(dta_exer1) #420 obs. of 17 variables'data.frame': 420 obs. of 17 variables:
$ distcod : int 75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
$ county : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
$ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 214 367 132 270 53 152 383 263 94 ...
$ grspan : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
$ enrltot : int 195 240 1550 243 1335 137 195 888 379 2247 ...
$ teachers: num 10.9 11.1 82.9 14 71.5 ...
$ calwpct : num 0.51 15.42 55.03 36.48 33.11 ...
$ mealpct : num 2.04 47.92 76.32 77.05 78.43 ...
$ computer: int 67 101 169 85 171 25 28 66 35 0 ...
$ testscr : num 691 661 644 648 641 ...
$ compstu : num 0.344 0.421 0.109 0.35 0.128 ...
$ expnstu : num 6385 5099 5502 7102 5236 ...
$ str : num 17.9 21.5 18.7 17.4 18.7 ...
$ avginc : num 22.69 9.82 8.98 8.98 9.08 ...
$ elpct : num 0 4.58 30 0 13.86 ...
$ readscr : num 692 660 636 652 642 ...
$ mathscr : num 690 662 651 644 640 ...
typeof(dta_exer1) #list[1] "list"
head(dta_exer1) distcod county district grspan enrltot teachers
1 75119 Alameda Sunol Glen Unified KK-08 195 10.90
2 61499 Butte Manzanita Elementary KK-08 240 11.15
3 61549 Butte Thermalito Union Elementary KK-08 1550 82.90
4 61457 Butte Golden Feather Union Elementary KK-08 243 14.00
5 61523 Butte Palermo Union Elementary KK-08 1335 71.50
6 62042 Fresno Burrel Union Elementary KK-08 137 6.40
calwpct mealpct computer testscr compstu expnstu str avginc
1 0.5102 2.0408 67 690.80 0.3435898 6384.911 17.88991 22.690001
2 15.4167 47.9167 101 661.20 0.4208333 5099.381 21.52466 9.824000
3 55.0323 76.3226 169 643.60 0.1090323 5501.955 18.69723 8.978000
4 36.4754 77.0492 85 647.70 0.3497942 7101.831 17.35714 8.978000
5 33.1086 78.4270 171 640.85 0.1280899 5235.988 18.67133 9.080333
6 12.3188 86.9565 25 605.55 0.1824818 5580.147 21.40625 10.415000
elpct readscr mathscr
1 0.000000 691.6 690.0
2 4.583333 660.5 661.9
3 30.000002 636.3 650.9
4 0.000000 651.9 643.5
5 13.857677 641.8 639.9
6 12.408759 605.7 605.4
dta_exer1|> head() |> knitr::kable()| distcod | county | district | grspan | enrltot | teachers | calwpct | mealpct | computer | testscr | compstu | expnstu | str | avginc | elpct | readscr | mathscr |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 75119 | Alameda | Sunol Glen Unified | KK-08 | 195 | 10.90 | 0.5102 | 2.0408 | 67 | 690.80 | 0.3435898 | 6384.911 | 17.88991 | 22.690001 | 0.000000 | 691.6 | 690.0 |
| 61499 | Butte | Manzanita Elementary | KK-08 | 240 | 11.15 | 15.4167 | 47.9167 | 101 | 661.20 | 0.4208333 | 5099.381 | 21.52466 | 9.824000 | 4.583334 | 660.5 | 661.9 |
| 61549 | Butte | Thermalito Union Elementary | KK-08 | 1550 | 82.90 | 55.0323 | 76.3226 | 169 | 643.60 | 0.1090323 | 5501.955 | 18.69723 | 8.978000 | 30.000002 | 636.3 | 650.9 |
| 61457 | Butte | Golden Feather Union Elementary | KK-08 | 243 | 14.00 | 36.4754 | 77.0492 | 85 | 647.70 | 0.3497942 | 7101.831 | 17.35714 | 8.978000 | 0.000000 | 651.9 | 643.5 |
| 61523 | Butte | Palermo Union Elementary | KK-08 | 1335 | 71.50 | 33.1086 | 78.4270 | 171 | 640.85 | 0.1280899 | 5235.988 | 18.67133 | 9.080333 | 13.857677 | 641.8 | 639.9 |
| 62042 | Fresno | Burrel Union Elementary | KK-08 | 137 | 6.40 | 12.3188 | 86.9565 | 25 | 605.55 | 0.1824818 | 5580.147 | 21.40625 | 10.415000 | 12.408759 | 605.7 | 605.4 |
table(dta_exer1$county)
Alameda Butte Calaveras Contra Costa El Dorado
1 6 1 7 10
Fresno Glenn Humboldt Imperial Inyo
12 3 17 6 1
Kern Kings Lake Lassen Los Angeles
27 9 2 5 27
Madera Marin Mendocino Merced Monterey
5 8 1 11 7
Nevada Orange Placer Riverside Sacramento
9 11 11 4 7
San Benito San Bernardino San Diego San Joaquin San Luis Obispo
3 10 21 6 2
San Mateo Santa Barbara Santa Clara Santa Cruz Shasta
17 11 20 7 13
Siskiyou Sonoma Stanislaus Sutter Tehama
9 29 7 6 8
Trinity Tulare Tuolumne Ventura Yuba
2 24 6 9 2
library(tidyverse)
library(dbplyr)
set.seed(01) #建立一個隨機選樣的seed number(每次選出來都一樣)
dta_exer1s <- group_by(dta_exer1, county) %>%
sample_n(1) #每個county 選1個sample
str(dta_exer1s) #選出45個資料grouped_df [45 x 17] (S3: grouped_df/tbl_df/tbl/data.frame)
$ distcod : int [1:45] 75119 61523 61572 61705 61945 62323 62554 62794 63073 63255 ...
$ county : Factor w/ 45 levels "Alameda","Butte",..: 1 2 3 4 5 6 7 8 9 10 ...
$ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 270 217 176 278 230 60 116 42 37 ...
$ grspan : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 2 ...
$ enrltot : int [1:45] 195 1335 777 353 586 205 144 129 3760 1510 ...
$ teachers: num [1:45] 10.9 71.5 36.8 20 26 ...
$ calwpct : num [1:45] 0.51 33.11 12.99 8.67 7.51 ...
$ mealpct : num [1:45] 2.04 78.43 39.85 33.06 37.88 ...
$ computer: int [1:45] 67 171 148 8 78 24 14 17 717 141 ...
$ testscr : num [1:45] 691 641 657 635 656 ...
$ compstu : num [1:45] 0.3436 0.1281 0.1905 0.0227 0.1331 ...
$ expnstu : num [1:45] 6385 5236 5483 5238 5724 ...
$ str : num [1:45] 17.9 18.7 21.1 17.7 22.5 ...
$ avginc : num [1:45] 22.69 9.08 13.24 14.65 12.39 ...
$ elpct : num [1:45] 0 13.858 1.158 5.382 0.853 ...
$ readscr : num [1:45] 692 642 663 637 662 ...
$ mathscr : num [1:45] 690 640 650 634 650 ...
- attr(*, "groups")= tibble [45 x 2] (S3: tbl_df/tbl/data.frame)
..$ county: Factor w/ 45 levels "Alameda","Butte",..: 1 2 3 4 5 6 7 8 9 10 ...
..$ .rows : list<int> [1:45]
.. ..$ : int 1
.. ..$ : int 2
.. ..$ : int 3
.. ..$ : int 4
.. ..$ : int 5
.. ..$ : int 6
.. ..$ : int 7
.. ..$ : int 8
.. ..$ : int 9
.. ..$ : int 10
.. ..$ : int 11
.. ..$ : int 12
.. ..$ : int 13
.. ..$ : int 14
.. ..$ : int 15
.. ..$ : int 16
.. ..$ : int 17
.. ..$ : int 18
.. ..$ : int 19
.. ..$ : int 20
.. ..$ : int 21
.. ..$ : int 22
.. ..$ : int 23
.. ..$ : int 24
.. ..$ : int 25
.. ..$ : int 26
.. ..$ : int 27
.. ..$ : int 28
.. ..$ : int 29
.. ..$ : int 30
.. ..$ : int 31
.. ..$ : int 32
.. ..$ : int 33
.. ..$ : int 34
.. ..$ : int 35
.. ..$ : int 36
.. ..$ : int 37
.. ..$ : int 38
.. ..$ : int 39
.. ..$ : int 40
.. ..$ : int 41
.. ..$ : int 42
.. ..$ : int 43
.. ..$ : int 44
.. ..$ : int 45
.. ..@ ptype: int(0)
..- attr(*, ".drop")= logi TRUE
plot(x=dta_exer1s$mathscr, # X軸的值
y=dta_exer1s$readscr, # Y軸的值
main="math and read score from each school", # 圖片名稱
xlab="math score", # X軸名稱
ylab="read score", # Y軸名稱
pch=16, #點的形狀
cex=0.5, #點的大小
col="#228B22") #點的圖形
lm.model <- lm(mathscr~readscr, dta_exer1s) #迴歸線
abline(lm.model,
lwd=1)
grid() #參考線library(lattice)
xyplot(x=mathscr~readscr, # Wind放在Y軸,Temp放在X軸
data=dta_exer1s,
group = county,
main="math and read score from each school", # 圖片名稱
xlab="math score", # X軸名稱
ylab="read score",
auto.key=list(space="top", # 位置在上方
columns=5, # 1x5的方式呈現標籤
title="county labels", # 標籤名稱
cex.title=0.1) # 標籤字體大小
)Find 133 class-level 95%-confidence intervals for language test score means of the nlschools{MASS} data set by using the tidy approach. The tail end of the data object should looks as follows: classID language_mean language_lb language_ub 131 11.273 … … 132 10.550 … … 133 10.643 … …
library(tidyverse)
data(nlschools, package="MASS")
dta_exer2 <- nlschools
str(dta_exer2) #2287 obs. of 6 variables'data.frame': 2287 obs. of 6 variables:
$ lang : int 46 45 33 46 20 30 30 57 36 36 ...
$ IQ : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
$ class: Factor w/ 133 levels "180","280","1082",..: 1 1 1 1 1 1 1 1 1 1 ...
$ GS : int 29 29 29 29 29 29 29 29 29 29 ...
$ SES : int 23 10 15 23 10 10 23 10 13 15 ...
$ COMB : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
typeof(dta_exer2) #list[1] "list"
head(dta_exer2) lang IQ class GS SES COMB
1 46 15.0 180 29 23 0
2 45 14.5 180 29 10 0
3 33 9.5 180 29 15 0
4 46 11.0 180 29 23 0
5 20 8.0 180 29 10 0
6 30 9.5 180 29 10 0
dta_exer2|> head() |> knitr::kable() | lang | IQ | class | GS | SES | COMB |
|---|---|---|---|---|---|
| 46 | 15.0 | 180 | 29 | 23 | 0 |
| 45 | 14.5 | 180 | 29 | 10 | 0 |
| 33 | 9.5 | 180 | 29 | 15 | 0 |
| 46 | 11.0 | 180 | 29 | 23 | 0 |
| 20 | 8.0 | 180 | 29 | 10 | 0 |
| 30 | 9.5 | 180 | 29 | 10 | 0 |
dta_exer2<-as_tibble(dta_exer2) # Save a copy of data as tibbleUsage:nlschools Format:This data frame contains 2287 rows and the following columns:
variables:
lang/language test score.
IQ/verbal IQ.
class/class ID.
GS/class size: number of eighth-grade pupils recorded in the class (there may be others: see COMB, and some may have been omitted with missing values).
SES/social-economic status of pupil’s family.
COMB/were the pupils taught in a multi-grade class (0/1)? Classes which contained pupils from grades 7 and 8 are coded 1, but only eighth-graders were tested.
language_lu = language_mean-1.96*sd/sqrt(count)
options(digits=3)
language_lu = language_mean-1.96*sd/sqrt(count))
language_lb = language_mean+1.96*sd/sqrt(count))
dta_exer2 %>%
group_by(class) %>% # group by class
summarize(language_mean = mean(lang), #計算mean
language_sd = sd(lang), #計算sd
count = n(), #計算n
language_lb = language_mean - 1.96*language_sd/sqrt(count),
language_ub = language_mean + 1.96*language_sd/sqrt(count)) %>%
# 計算95% CI=mean+-1.96*sd/sqrt(n)
arrange(language_mean) %>%
# 依照language_mean排序(由小到大),即為順排
# arrange(desc(language_mean))為由大到小
mutate(classID = paste0("13",1:n())) %>%
#新增一個classID,從"13"+1~n開始
dplyr::select(., c(7,2,5,6)) %>%
#按順序選column 7,2,5,6
head() |> knitr::kable (digits = 1) #做表格,取小數點第一位| classID | language_mean | language_lb | language_ub |
|---|---|---|---|
| 131 | 20.0 | 13.5 | 26.5 |
| 132 | 23.5 | 20.2 | 26.8 |
| 133 | 23.7 | 19.0 | 28.4 |
| 134 | 28.4 | 23.3 | 33.6 |
| 135 | 29.3 | 21.1 | 37.5 |
| 136 | 30.2 | 23.0 | 37.3 |
Convert the wide form of the O’Brien and Kaiser’s repeated-measures data in OBrienKaiser{carData} to the long form in OBrienKaiserLong{carData}.
library(carData)
data(OBrienKaiser, package="carData")
dta_exer3 <- OBrienKaiser
str(dta_exer3) #16 obs. of 17 variables'data.frame': 16 obs. of 17 variables:
$ treatment: Factor w/ 3 levels "control","A",..: 1 1 1 1 1 2 2 2 2 3 ...
..- attr(*, "contrasts")= num [1:3, 1:2] -2 1 1 0 -1 1
.. ..- attr(*, "dimnames")=List of 2
.. .. ..$ : chr [1:3] "control" "A" "B"
.. .. ..$ : NULL
$ gender : Factor w/ 2 levels "F","M": 2 2 2 1 1 2 2 1 1 2 ...
..- attr(*, "contrasts")= chr "contr.sum"
$ pre.1 : num 1 4 5 5 3 7 5 2 3 4 ...
$ pre.2 : num 2 4 6 4 4 8 5 3 3 4 ...
$ pre.3 : num 4 5 5 7 6 7 6 5 4 5 ...
$ pre.4 : num 2 3 7 5 4 9 4 3 6 3 ...
$ pre.5 : num 1 4 7 4 3 9 5 2 4 4 ...
$ post.1 : num 3 2 4 2 6 9 7 2 4 6 ...
$ post.2 : num 2 2 5 2 7 9 7 4 5 7 ...
$ post.3 : num 5 3 7 3 8 10 8 8 6 6 ...
$ post.4 : num 3 5 5 5 6 8 10 6 4 8 ...
$ post.5 : num 2 3 4 3 3 9 8 5 1 8 ...
$ fup.1 : num 2 4 7 4 4 9 8 6 5 8 ...
$ fup.2 : num 3 5 6 4 3 10 9 6 4 8 ...
$ fup.3 : num 2 6 9 5 6 11 11 7 7 9 ...
$ fup.4 : num 4 4 7 3 4 9 9 5 5 7 ...
$ fup.5 : num 4 1 6 4 3 6 8 6 4 8 ...
head(dta_exer3) treatment gender pre.1 pre.2 pre.3 pre.4 pre.5 post.1 post.2 post.3 post.4
1 control M 1 2 4 2 1 3 2 5 3
2 control M 4 4 5 3 4 2 2 3 5
3 control M 5 6 5 7 7 4 5 7 5
4 control F 5 4 7 5 4 2 2 3 5
5 control F 3 4 6 4 3 6 7 8 6
6 A M 7 8 7 9 9 9 9 10 8
post.5 fup.1 fup.2 fup.3 fup.4 fup.5
1 2 2 3 2 4 4
2 3 4 5 6 4 1
3 4 7 6 9 7 6
4 3 4 4 5 3 4
5 3 4 3 6 4 3
6 9 9 10 11 9 6
dta_exer3|> head() |> knitr::kable()| treatment | gender | pre.1 | pre.2 | pre.3 | pre.4 | pre.5 | post.1 | post.2 | post.3 | post.4 | post.5 | fup.1 | fup.2 | fup.3 | fup.4 | fup.5 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| control | M | 1 | 2 | 4 | 2 | 1 | 3 | 2 | 5 | 3 | 2 | 2 | 3 | 2 | 4 | 4 |
| control | M | 4 | 4 | 5 | 3 | 4 | 2 | 2 | 3 | 5 | 3 | 4 | 5 | 6 | 4 | 1 |
| control | M | 5 | 6 | 5 | 7 | 7 | 4 | 5 | 7 | 5 | 4 | 7 | 6 | 9 | 7 | 6 |
| control | F | 5 | 4 | 7 | 5 | 4 | 2 | 2 | 3 | 5 | 3 | 4 | 4 | 5 | 3 | 4 |
| control | F | 3 | 4 | 6 | 4 | 3 | 6 | 7 | 8 | 6 | 3 | 4 | 3 | 6 | 4 | 3 |
| A | M | 7 | 8 | 7 | 9 | 9 | 9 | 9 | 10 | 8 | 9 | 9 | 10 | 11 | 9 | 6 |
data(OBrienKaiserLong, package="carData")
dta_exer3L <- OBrienKaiserLong
str(dta_exer3L)'data.frame': 240 obs. of 6 variables:
$ treatment: Factor w/ 3 levels "control","A",..: 1 1 1 1 1 1 1 1 1 1 ...
$ gender : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
$ score : num 1 2 4 2 1 3 2 5 3 2 ...
$ id : int 1 1 1 1 1 1 1 1 1 1 ...
$ phase : Factor w/ 3 levels "pre","post","fup": 1 1 1 1 1 2 2 2 2 2 ...
$ hour : Factor w/ 5 levels "1","2","3","4",..: 1 2 3 4 5 1 2 3 4 5 ...
- attr(*, "reshapeLong")=List of 4
..$ varying:List of 1
.. ..$ score: chr [1:15] "pre.1" "pre.2" "pre.3" "pre.4" ...
.. ..- attr(*, "v.names")= chr "score"
.. ..- attr(*, "times")= int [1:15] 1 2 3 4 5 6 7 8 9 10 ...
..$ v.names: chr "score"
..$ idvar : chr "id"
..$ timevar: chr "phase.hour"
head(dta_exer3L) treatment gender score id phase hour
1.1 control M 1 1 pre 1
1.2 control M 2 1 pre 2
1.3 control M 4 1 pre 3
1.4 control M 2 1 pre 4
1.5 control M 1 1 pre 5
1.6 control M 3 1 post 1
dta_exer3L|> head() |> knitr::kable()| treatment | gender | score | id | phase | hour | |
|---|---|---|---|---|---|---|
| 1.1 | control | M | 1 | 1 | pre | 1 |
| 1.2 | control | M | 2 | 1 | pre | 2 |
| 1.3 | control | M | 4 | 1 | pre | 3 |
| 1.4 | control | M | 2 | 1 | pre | 4 |
| 1.5 | control | M | 1 | 1 | pre | 5 |
| 1.6 | control | M | 3 | 1 | post | 1 |
OBrienKaiser同一個資料呈現方式不同 為個案重複施測的資料
dta_exer3Ln <- as_tibble(dta_exer3) %>%
mutate(id = paste0(1:n())) %>% #先給id
pivot_longer(cols=starts_with(c("pre", "post", "fup")),
names_to="phase",
values_to="score")
#選column裡面有pre, post, fup的進入column "phase"
#對應的值,進入column"score"
dta_exer3Ln %>%
separate(phase, c("phase", "hour")) %>%
#phase目前有文字也有數值
#將文字分開存入column"phase",入值存入column"hour"
dplyr::select(., c(1,2,6,3,4,5)) # A tibble: 240 x 6
treatment gender score id phase hour
<fct> <fct> <dbl> <chr> <chr> <chr>
1 control M 1 1 pre 1
2 control M 2 1 pre 2
3 control M 4 1 pre 3
4 control M 2 1 pre 4
5 control M 1 1 pre 5
6 control M 3 1 post 1
7 control M 2 1 post 2
8 control M 5 1 post 3
9 control M 3 1 post 4
10 control M 2 1 post 5
# ... with 230 more rows
#按照所選的column排列順序
dta_exer3Ln |> as.data.frame()|> head() treatment gender id phase score
1 control M 1 pre.1 1
2 control M 1 pre.2 2
3 control M 1 pre.3 4
4 control M 1 pre.4 2
5 control M 1 pre.5 1
6 control M 1 post.1 3
#將資料改為data.frame,列出前6筆Information about student majors, activities in intramural sports, and survey responses about their satisfaction with the on-campus cafeteria (for the variety and quality of the food served, and hours of operation) are recorded in three separate plain text files by three different administrative units: majors sports cafeteria
Input these files into an R session, merge them and output the results as in the comma-separate values file .
#先把.txt檔帶入,有標題 h=T
majors <- read.table("D:/User/Desktop/datamanagement/20211101/Exercise_4/student_major.txt", h = T)
sports <- read.table("D:/User/Desktop/datamanagement/20211101/Exercise_4/student_sport.txt", h = T)
cafeteria <- read.table("D:/User/Desktop/datamanagement/20211101/Exercise_4/student_cafe.txt", h = T)str(majors) #'data.frame': 14 obs. of 2 variables'data.frame': 14 obs. of 2 variables:
$ id : int 123 345 624 789 851 555 992 484 717 839 ...
$ major: chr "Mathematics" "Computer Science" "History" "Biology" ...
head(majors) id major
1 123 Mathematics
2 345 Computer Science
3 624 History
4 789 Biology
5 851 Computer Science
6 555 History
names(majors) #"id" "major"[1] "id" "major"
str(sports) #'data.frame': 13 obs. of 2 variables:'data.frame': 13 obs. of 2 variables:
$ id : int 123 123 222 385 385 481 555 717 789 851 ...
$ sport: chr "basketball" "badminton" "badminton" "basketball" ...
head(sports) id sport
1 123 basketball
2 123 badminton
3 222 badminton
4 385 basketball
5 385 volleyball
6 481 basketball
names(sports) #"id" "sport"[1] "id" "sport"
str(cafeteria) #'data.frame': 7 obs. of 4 variables:'data.frame': 7 obs. of 4 variables:
$ id : int 123 345 385 481 624 851 992
$ variety: int 3 4 1 2 2 5 4
$ quality: int 4 4 1 3 3 4 5
$ hours : int 2 2 2 3 4 1 3
head(cafeteria) id variety quality hours
1 123 3 4 2
2 345 4 4 2
3 385 1 1 2
4 481 2 3 3
5 624 2 3 4
6 851 5 4 1
names(cafeteria) #"id" "variety" "quality" "hours" [1] "id" "variety" "quality" "hours"
#以上資料共通的column是iddta_ms <- merge(majors, sports, by="id", all = TRUE)
dta_exer4 <- merge(dta_ms, cafeteria, by="id", all = TRUE)
str(dta_exer4) #'data.frame': 18 obs. of 6 variables'data.frame': 18 obs. of 6 variables:
$ id : int 123 123 222 345 385 385 481 484 555 624 ...
$ major : chr "Mathematics" "Mathematics" "Computer Science" "Computer Science" ...
$ sport : chr "badminton" "basketball" "badminton" NA ...
$ variety: int 3 3 NA 4 1 1 2 NA NA 2 ...
$ quality: int 4 4 NA 4 1 1 3 NA NA 3 ...
$ hours : int 2 2 NA 2 2 2 3 NA NA 4 ...
knitr::include_graphics("exercise_4.png")dta_exer4 id major sport variety quality hours
1 123 Mathematics badminton 3 4 2
2 123 Mathematics basketball 3 4 2
3 222 Computer Science badminton NA NA NA
4 345 Computer Science <NA> 4 4 2
5 385 Mathematics volleyball 1 1 2
6 385 Mathematics basketball 1 1 2
7 481 Music basketball 2 3 3
8 484 Music <NA> NA NA NA
9 555 History volleyball NA NA NA
10 624 History <NA> 2 3 4
11 717 English basketball NA NA NA
12 789 Biology basketball NA NA NA
13 839 Biology <NA> NA NA NA
14 851 Computer Science badminton 5 4 1
15 851 Computer Science volleyball 5 4 1
16 989 English <NA> NA NA NA
17 992 Biology badminton 4 5 3
18 992 Biology volleyball 4 5 3
#資料排序與圖檔不太一樣,但看不出圖檔的邏輯是甚麼#output成csv
save(dta_exer4, file="savedfile")
write.csv(dta_exer4, file="dta_exer4.csv")
knitr::include_graphics("dta_exer4.png")The HELP (Health Evaluation and Linkage to Primary Care) study was a clinical trial for adult inpatients recruited from a detoxification unit. Patients with no primary care physician were randomized to receive a multidisciplinary assessment and a brief motivational intervention or usual care, with the goal of linking them to primary medical care. Eligible subjects were adults, who spoke Spanish or English, reported alcohol, heroin or cocaine as their first or second drug of choice, resided in proximity to the primary care clinic to which they would be referred or were homeless. Subjects were interviewed at baseline during their detoxification stay and follow-up interviews were undertaken every 6 months for 2 years. A variety of continuous, count, discrete, and survival time predictors and outcomes were collected at each of these five occasions. The following R script is used to manage the data file at the initial stage of investigation. Provide comments on what each line of the script is meant to achieve.
ds <- read.csv("https://nhorton.people.amherst.edu/sasr2/datasets/help.csv")
library(dplyr)
#從網路讀取csv檔
newds = dplyr::select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)
#選ds, cesd,...,flt這些變項,存到newds檔中names(newds) #列出newds所有column名稱 [1] "cesd" "female" "i1" "i2" "id" "treat" "f1a" "f1b"
[9] "f1c" "f1d" "f1e" "f1f" "f1g" "f1h" "f1i" "f1j"
[17] "f1k" "f1l" "f1m" "f1n" "f1o" "f1p" "f1q" "f1r"
[25] "f1s" "f1t"
str(newds[,1:10]) # 列出newds第1~10個column的結構:453 obs. of 10 variables'data.frame': 453 obs. of 10 variables:
$ cesd : int 49 30 39 15 39 6 52 32 50 46 ...
$ female: int 0 0 0 1 0 1 1 0 1 0 ...
$ i1 : int 13 56 0 5 10 4 13 12 71 20 ...
$ i2 : int 26 62 0 5 13 4 20 24 129 27 ...
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ treat : int 1 1 0 0 0 1 0 1 0 1 ...
$ f1a : int 3 3 3 0 3 1 3 1 3 2 ...
$ f1b : int 2 2 2 0 0 0 1 1 2 3 ...
$ f1c : int 3 0 3 1 3 1 3 2 3 3 ...
$ f1d : int 0 3 0 3 3 3 1 3 1 0 ...
# 列出newds第1~10個column的統計摘要
# summary:給出每個欄位的「最大值」、「最小值」、「平均值」、「中位數」「第一四分位數」…等等
summary(newds[,1:10]) cesd female i1 i2
Min. : 1.00 Min. :0.0000 Min. : 0.00 Min. : 0.00
1st Qu.:25.00 1st Qu.:0.0000 1st Qu.: 3.00 1st Qu.: 3.00
Median :34.00 Median :0.0000 Median : 13.00 Median : 15.00
Mean :32.85 Mean :0.2362 Mean : 17.91 Mean : 22.65
3rd Qu.:41.00 3rd Qu.:0.0000 3rd Qu.: 26.00 3rd Qu.: 32.00
Max. :60.00 Max. :1.0000 Max. :142.00 Max. :184.00
id treat f1a f1b
Min. : 1.0 Min. :0.0000 Min. :0.000 Min. :0.000
1st Qu.:119.0 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.000
Median :233.0 Median :0.0000 Median :2.000 Median :1.000
Mean :233.4 Mean :0.4967 Mean :1.634 Mean :1.391
3rd Qu.:348.0 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:2.000
Max. :470.0 Max. :1.0000 Max. :3.000 Max. :3.000
f1c f1d
Min. :0.000 Min. :0.000
1st Qu.:1.000 1st Qu.:0.000
Median :2.000 Median :1.000
Mean :1.923 Mean :1.565
3rd Qu.:3.000 3rd Qu.:3.000
Max. :3.000 Max. :3.000
#列出newds前3個row
head(newds, n=3) cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j f1k f1l
1 49 0 13 26 1 1 3 2 3 0 2 3 3 0 2 3 3 0
2 30 0 56 62 2 1 3 2 0 3 3 2 0 0 3 0 3 0
3 39 0 0 0 3 0 3 2 3 0 2 2 1 3 2 3 1 0
f1m f1n f1o f1p f1q f1r f1s f1t
1 1 2 2 2 2 3 3 2
2 0 3 0 0 0 2 0 0
3 1 3 2 0 0 3 2 0
comment(newds) = "HELP baseline dataset"
#把"HELP baseline dataset"寫入comment
comment(newds)[1] "HELP baseline dataset"
#comment(newds)是甚麼save(ds, file="savedfile")
write.csv(ds, file="ds.csv")
#把資料output變成ds.csv檔
#不知道資料如何指定資料夾,但剛剛有在自己的電腦搜尋到這個檔案
library(foreign)
#foreign這個package可以把資料讀入其他類型檔案或轉出存成其他檔案
write.foreign(newds, "file.dat", "file.sas", package="SAS")
#這邊是把資料存成"file.dat", "file.sas"#列出newds的column "cesd"第1~10筆資料
with(newds, cesd[1:10]) [1] 49 30 39 15 39 6 52 32 50 46
#跟上面的語法功能是一樣的(從頭數來的10筆)
#但當不知道最後面的資料有幾筆時,這個可以改成tail(cesd, 10)
with(newds, head(cesd, 10)) [1] 49 30 39 15 39 6 52 32 50 46
#列出newds的column "cesd" 筆56大的值
#會逐一檢查列出,因此會重複
with(newds, cesd[cesd > 56])[1] 57 58 57 60 58 58 57
library(dplyr)
#篩選newds的column "cesd" 筆56大的值,傳入下一個步驟select
filter(newds, cesd > 56) %>%
#選出id和cesd(cesd>56)
select(id, cesd) id cesd
1 71 57
2 127 58
3 200 57
4 228 60
5 273 58
6 351 58
7 13 57
#sort()函數是對向量進行從小到大的排序
#列出newds的column "cesd"從小到大的前4筆(因此會有重複)
with(newds, sort(cesd)[1:4])[1] 1 3 3 4
#which.min(cesd)是問cesd最小值是第幾個(row)
with(newds, which.min(cesd))[1] 199
#which.max(cesd)是問cesd最大值是第幾個(row)
with(newds, which.max(cesd))[1] 194
library(mosaic)
#tally is a convenient wrapper for summarise that will either call n or sum(n) depending on whether you're tallying for the first time, or re-tallying.
#問newds資料中,column(f1g)有沒有missing data
#結果is.na(f1g)答案為TRUE=1 FALSE=452 (有一筆NA資料,452筆都不是NA)
tally(~ is.na(f1g), data=newds)is.na(f1g)
TRUE FALSE
1 452
#Computes mean, standard deviation, quartiles, sample size and number of missing values for a data vector.
#計算nweds資料中,column(f1g)的mean, standard deviation, quartiles, sample size and number of missing values
favstats(~ f1g, data=newds) min Q1 median Q3 max mean sd n missing
0 1 2 3 3 1.730088 1.095314 452 1
# reverse code f1d, f1h, f1l and f1p
# f1d, f1h, f1l and f1p是反向題計分
# 這些題目最高都是3分,所以才會寫(3-f1d)
# cbind()透過 column 合併
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g,
(3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p),
f1q, f1r, f1s, f1t))
#cesd有missing data的資料,帶入1
nmisscesd = apply(is.na(cesditems), 1, sum)
ncesditems = cesditems
#若ncesditems是NA的話,帶入0
ncesditems[is.na(cesditems)] = 0
#ncesditems水平加總為newcesd
newcesd = apply(ncesditems, 1, sum)
#imputemeancesd=20/(20-NA總數)*newcesd? 用途不清
imputemeancesd = 20/(20-nmisscesd)*newcesddata.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,] newcesd newds.cesd nmisscesd imputemeancesd
4 15 15 1 15.78947
17 19 19 1 20.00000
87 44 44 1 46.31579
101 17 17 1 17.89474
154 29 29 1 30.52632
177 44 44 1 46.31579
229 39 39 1 41.05263
library(dplyr)
library(memisc)
#在newds中新增一個drinkstat變項
#根據column"i1"、"i2"、"female"條件,決定存入值為abstinent、moderate、highrisk
#例如:i1==0時,column"drinkstat"存入"abstinent"
newds = mutate(newds, drinkstat=
cases("abstinent" = i1==0,
"moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
(i1>0 & i1<=2 & i2<=4 & female==0),
"highrisk" = ((i1>1 | i2>3) & female==1) |
((i1>2 | i2>4) & female==0)))library(mosaic)
detach(package:memisc)
detach(package:MASS)library(dplyr)
#選取newds資料中column "i1, i2, female, drinkstat"
#形成一個新object,命為tmpds
tmpds = select(newds, i1, i2, female, drinkstat)
#列出365~370的資料
str(tmpds) #453 obs. of 4 variables'data.frame': 453 obs. of 4 variables:
$ i1 : int 13 56 0 5 10 4 13 12 71 20 ...
$ i2 : int 26 62 0 5 13 4 20 24 129 27 ...
$ female : int 0 0 0 1 0 1 1 0 1 0 ...
$ drinkstat: Factor w/ 3 levels "abstinent","moderate",..: 3 3 1 3 3 3 3 3 3 3 ...
- attr(*, "comment")= chr "HELP baseline dataset"
tmpds[365:370,] i1 i2 female drinkstat
365 6 24 0 highrisk
366 6 6 0 highrisk
367 0 0 0 abstinent
368 0 0 1 abstinent
369 8 8 0 highrisk
370 32 32 0 highrisk
library(dplyr)
#篩選tmpds資料,drinkstat=="moderate" & female==1的,共有7筆
filter(tmpds, drinkstat=="moderate" & female==1) i1 i2 female drinkstat
1 1 1 1 moderate
2 1 3 1 moderate
3 1 2 1 moderate
4 1 1 1 moderate
5 1 1 1 moderate
6 1 1 1 moderate
7 1 1 1 moderate
library(gmodels)
#以tmpds資料,drinkstat做一個交叉表,包括個數和佔比
with(tmpds, CrossTable(drinkstat))
Cell Contents
|-------------------------|
| N |
| N / Table Total |
|-------------------------|
Total Observations in Table: 453
| abstinent | moderate | highrisk |
|-----------|-----------|-----------|
| 68 | 28 | 357 |
| 0.150 | 0.062 | 0.788 |
|-----------|-----------|-----------|
#針對CrossTable()可進行兩兩類別變數的交叉表分析及卡方檢定
with(tmpds, CrossTable(drinkstat, female,
prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))
Cell Contents
|-------------------------|
| N |
| N / Row Total |
|-------------------------|
Total Observations in Table: 453
| female
drinkstat | 0 | 1 | Row Total |
-------------|-----------|-----------|-----------|
abstinent | 42 | 26 | 68 |
| 0.618 | 0.382 | 0.150 |
-------------|-----------|-----------|-----------|
moderate | 21 | 7 | 28 |
| 0.750 | 0.250 | 0.062 |
-------------|-----------|-----------|-----------|
highrisk | 283 | 74 | 357 |
| 0.793 | 0.207 | 0.788 |
-------------|-----------|-----------|-----------|
Column Total | 346 | 107 | 453 |
-------------|-----------|-----------|-----------|
#資料轉換:新增 gender 變項,將female中的0 或 1 轉換成 factor
#female==0者,存成male,female==1者存成female
newds = transform(newds,
gender=factor(female, c(0,1), c("Male","Female")))
#做成male和female的交叉表,計算個數
tally(~ female + gender, margin=FALSE, data=newds) gender
female Male Female
0 346 0
1 0 107
library(dplyr)
#將ds 資料重新由小到大排列
#第一層依據為 cesd,第二層依據為i1
newds = arrange(ds, cesd, i1)
#列出排序後的newds前1~5比資料
newds[1:5, c("cesd", "i1", "id")] cesd i1 id
1 1 3 233
2 3 1 139
3 3 13 418
4 4 4 251
5 4 9 95
library(dplyr)
#篩選ds資料中,female==1者,命名為females
females = filter(ds, female==1)
#計算females的cesd變項的平均
with(females, mean(cesd))[1] 36.88785
# an alternative approach,計算females的cesd變項的平均
mean(ds$cesd[ds$female==1])[1] 36.88785
#透過tapply針對female 欄位計算 cesd的平均分數
#結果 female==0者,cesd分數為31.6
# female==1者,cesd分數為36.9
with(ds, tapply(cesd, female, mean)) 0 1
31.59827 36.88785
library(mosaic)
#an alternative approach
mean(cesd ~ female, data=ds) 0 1
31.59827 36.88785