exercise_20211101

Exercise_1

Select at random one school per county in the data set Caschool{Ecdat} and draw a scatter diagram of average math score mathscr against average reading score readscr for the sampled data set. Make sure your results are reproducible (e.g., the same random sample will be drawn each time).

library(Ecfun)
library(Ecdat)
data(Caschool, package="Ecdat")
dta_exer1 <- Caschool
str(dta_exer1)  #420 obs. of  17 variables

'data.frame':   420 obs. of  17 variables:
 $ distcod : int  75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
 $ county  : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
 $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 214 367 132 270 53 152 383 263 94 ...
 $ grspan  : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
 $ enrltot : int  195 240 1550 243 1335 137 195 888 379 2247 ...
 $ teachers: num  10.9 11.1 82.9 14 71.5 ...
 $ calwpct : num  0.51 15.42 55.03 36.48 33.11 ...
 $ mealpct : num  2.04 47.92 76.32 77.05 78.43 ...
 $ computer: int  67 101 169 85 171 25 28 66 35 0 ...
 $ testscr : num  691 661 644 648 641 ...
 $ compstu : num  0.344 0.421 0.109 0.35 0.128 ...
 $ expnstu : num  6385 5099 5502 7102 5236 ...
 $ str     : num  17.9 21.5 18.7 17.4 18.7 ...
 $ avginc  : num  22.69 9.82 8.98 8.98 9.08 ...
 $ elpct   : num  0 4.58 30 0 13.86 ...
 $ readscr : num  692 660 636 652 642 ...
 $ mathscr : num  690 662 651 644 640 ...

typeof(dta_exer1) #list

[1] "list"

head(dta_exer1)

  distcod  county                        district grspan enrltot teachers
1   75119 Alameda              Sunol Glen Unified  KK-08     195    10.90
2   61499   Butte            Manzanita Elementary  KK-08     240    11.15
3   61549   Butte     Thermalito Union Elementary  KK-08    1550    82.90
4   61457   Butte Golden Feather Union Elementary  KK-08     243    14.00
5   61523   Butte        Palermo Union Elementary  KK-08    1335    71.50
6   62042  Fresno         Burrel Union Elementary  KK-08     137     6.40
  calwpct mealpct computer testscr   compstu  expnstu      str    avginc
1  0.5102  2.0408       67  690.80 0.3435898 6384.911 17.88991 22.690001
2 15.4167 47.9167      101  661.20 0.4208333 5099.381 21.52466  9.824000
3 55.0323 76.3226      169  643.60 0.1090323 5501.955 18.69723  8.978000
4 36.4754 77.0492       85  647.70 0.3497942 7101.831 17.35714  8.978000
5 33.1086 78.4270      171  640.85 0.1280899 5235.988 18.67133  9.080333
6 12.3188 86.9565       25  605.55 0.1824818 5580.147 21.40625 10.415000
      elpct readscr mathscr
1  0.000000   691.6   690.0
2  4.583333   660.5   661.9
3 30.000002   636.3   650.9
4  0.000000   651.9   643.5
5 13.857677   641.8   639.9
6 12.408759   605.7   605.4

dta_exer1|> head() |> knitr::kable()

distcod	county	district	grspan	enrltot	teachers	calwpct	mealpct	computer	testscr	compstu	expnstu	str	avginc	elpct	readscr	mathscr
75119	Alameda	Sunol Glen Unified	KK-08	195	10.90	0.5102	2.0408	67	690.80	0.3435898	6384.911	17.88991	22.690001	0.000000	691.6	690.0
61499	Butte	Manzanita Elementary	KK-08	240	11.15	15.4167	47.9167	101	661.20	0.4208333	5099.381	21.52466	9.824000	4.583334	660.5	661.9
61549	Butte	Thermalito Union Elementary	KK-08	1550	82.90	55.0323	76.3226	169	643.60	0.1090323	5501.955	18.69723	8.978000	30.000002	636.3	650.9
61457	Butte	Golden Feather Union Elementary	KK-08	243	14.00	36.4754	77.0492	85	647.70	0.3497942	7101.831	17.35714	8.978000	0.000000	651.9	643.5
61523	Butte	Palermo Union Elementary	KK-08	1335	71.50	33.1086	78.4270	171	640.85	0.1280899	5235.988	18.67133	9.080333	13.857677	641.8	639.9
62042	Fresno	Burrel Union Elementary	KK-08	137	6.40	12.3188	86.9565	25	605.55	0.1824818	5580.147	21.40625	10.415000	12.408759	605.7	605.4

table(dta_exer1$county)


        Alameda           Butte       Calaveras    Contra Costa       El Dorado 
              1               6               1               7              10 
         Fresno           Glenn        Humboldt        Imperial            Inyo 
             12               3              17               6               1 
           Kern           Kings            Lake          Lassen     Los Angeles 
             27               9               2               5              27 
         Madera           Marin       Mendocino          Merced        Monterey 
              5               8               1              11               7 
         Nevada          Orange          Placer       Riverside      Sacramento 
              9              11              11               4               7 
     San Benito  San Bernardino       San Diego     San Joaquin San Luis Obispo 
              3              10              21               6               2 
      San Mateo   Santa Barbara     Santa Clara      Santa Cruz          Shasta 
             17              11              20               7              13 
       Siskiyou          Sonoma      Stanislaus          Sutter          Tehama 
              9              29               7               6               8 
        Trinity          Tulare        Tuolumne         Ventura            Yuba 
              2              24               6               9               2

library(tidyverse)
library(dbplyr)
set.seed(01)  #建立一個隨機選樣的seed number(每次選出來都一樣)
dta_exer1s <- group_by(dta_exer1, county) %>%   
   sample_n(1)  #每個county 選1個sample
str(dta_exer1s)  #選出45個資料

grouped_df [45 x 17] (S3: grouped_df/tbl_df/tbl/data.frame)
 $ distcod : int [1:45] 75119 61523 61572 61705 61945 62323 62554 62794 63073 63255 ...
 $ county  : Factor w/ 45 levels "Alameda","Butte",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 270 217 176 278 230 60 116 42 37 ...
 $ grspan  : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 2 ...
 $ enrltot : int [1:45] 195 1335 777 353 586 205 144 129 3760 1510 ...
 $ teachers: num [1:45] 10.9 71.5 36.8 20 26 ...
 $ calwpct : num [1:45] 0.51 33.11 12.99 8.67 7.51 ...
 $ mealpct : num [1:45] 2.04 78.43 39.85 33.06 37.88 ...
 $ computer: int [1:45] 67 171 148 8 78 24 14 17 717 141 ...
 $ testscr : num [1:45] 691 641 657 635 656 ...
 $ compstu : num [1:45] 0.3436 0.1281 0.1905 0.0227 0.1331 ...
 $ expnstu : num [1:45] 6385 5236 5483 5238 5724 ...
 $ str     : num [1:45] 17.9 18.7 21.1 17.7 22.5 ...
 $ avginc  : num [1:45] 22.69 9.08 13.24 14.65 12.39 ...
 $ elpct   : num [1:45] 0 13.858 1.158 5.382 0.853 ...
 $ readscr : num [1:45] 692 642 663 637 662 ...
 $ mathscr : num [1:45] 690 640 650 634 650 ...
 - attr(*, "groups")= tibble [45 x 2] (S3: tbl_df/tbl/data.frame)
  ..$ county: Factor w/ 45 levels "Alameda","Butte",..: 1 2 3 4 5 6 7 8 9 10 ...
  ..$ .rows : list<int> [1:45] 
  .. ..$ : int 1
  .. ..$ : int 2
  .. ..$ : int 3
  .. ..$ : int 4
  .. ..$ : int 5
  .. ..$ : int 6
  .. ..$ : int 7
  .. ..$ : int 8
  .. ..$ : int 9
  .. ..$ : int 10
  .. ..$ : int 11
  .. ..$ : int 12
  .. ..$ : int 13
  .. ..$ : int 14
  .. ..$ : int 15
  .. ..$ : int 16
  .. ..$ : int 17
  .. ..$ : int 18
  .. ..$ : int 19
  .. ..$ : int 20
  .. ..$ : int 21
  .. ..$ : int 22
  .. ..$ : int 23
  .. ..$ : int 24
  .. ..$ : int 25
  .. ..$ : int 26
  .. ..$ : int 27
  .. ..$ : int 28
  .. ..$ : int 29
  .. ..$ : int 30
  .. ..$ : int 31
  .. ..$ : int 32
  .. ..$ : int 33
  .. ..$ : int 34
  .. ..$ : int 35
  .. ..$ : int 36
  .. ..$ : int 37
  .. ..$ : int 38
  .. ..$ : int 39
  .. ..$ : int 40
  .. ..$ : int 41
  .. ..$ : int 42
  .. ..$ : int 43
  .. ..$ : int 44
  .. ..$ : int 45
  .. ..@ ptype: int(0) 
  ..- attr(*, ".drop")= logi TRUE

plot(x=dta_exer1s$mathscr,   # X軸的值
     y=dta_exer1s$readscr,   # Y軸的值
     main="math and read score from each school",   # 圖片名稱
     xlab="math score",      # X軸名稱
     ylab="read score",      # Y軸名稱 
     pch=16,                 #點的形狀
     cex=0.5,                #點的大小
     col="#228B22")          #點的圖形

lm.model <- lm(mathscr~readscr, dta_exer1s)   #迴歸線
abline(lm.model,                          
       lwd=1) 
grid()  #參考線

library(lattice)
xyplot(x=mathscr~readscr,         # Wind放在Y軸，Temp放在X軸
       data=dta_exer1s,     
       group = county, 
       main="math and read score from each school",   # 圖片名稱
       xlab="math score",      # X軸名稱
       ylab="read score", 
       
       auto.key=list(space="top",          # 位置在上方 
                     columns=5,            # 1x5的方式呈現標籤
                     title="county labels", # 標籤名稱
                     cex.title=0.1)          # 標籤字體大小
      )

Exercise_2

Find 133 class-level 95%-confidence intervals for language test score means of the nlschools{MASS} data set by using the tidy approach. The tail end of the data object should looks as follows: classID language_mean language_lb language_ub 131 11.273 … … 132 10.550 … … 133 10.643 … …

library(tidyverse)
data(nlschools, package="MASS")
dta_exer2 <- nlschools
str(dta_exer2)  #2287 obs. of  6 variables

'data.frame':   2287 obs. of  6 variables:
 $ lang : int  46 45 33 46 20 30 30 57 36 36 ...
 $ IQ   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
 $ class: Factor w/ 133 levels "180","280","1082",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ GS   : int  29 29 29 29 29 29 29 29 29 29 ...
 $ SES  : int  23 10 15 23 10 10 23 10 13 15 ...
 $ COMB : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

typeof(dta_exer2) #list

[1] "list"

head(dta_exer2)

  lang   IQ class GS SES COMB
1   46 15.0   180 29  23    0
2   45 14.5   180 29  10    0
3   33  9.5   180 29  15    0
4   46 11.0   180 29  23    0
5   20  8.0   180 29  10    0
6   30  9.5   180 29  10    0

dta_exer2|> head() |> knitr::kable()

lang	IQ	class	GS	SES
46	15.0	180	29	23
45	14.5	180	29	10
33	9.5	180	29	15
46	11.0	180	29	23
20	8.0	180	29	10
30	9.5	180	29	10

dta_exer2<-as_tibble(dta_exer2)  # Save a copy of data as tibble

Usage:nlschools Format:This data frame contains 2287 rows and the following columns:

variables:

lang/language test score.

IQ/verbal IQ.

class/class ID.

GS/class size: number of eighth-grade pupils recorded in the class (there may be others: see COMB, and some may have been omitted with missing values).

SES/social-economic status of pupil’s family.

COMB/were the pupils taught in a multi-grade class (0/1)? Classes which contained pupils from grades 7 and 8 are coded 1, but only eighth-graders were tested.

        language_lu = language_mean-1.96*sd/sqrt(count)

options(digits=3)

        language_lu = language_mean-1.96*sd/sqrt(count))
                    language_lb = language_mean+1.96*sd/sqrt(count))

dta_exer2 %>% 
  group_by(class) %>% # group by class
   summarize(language_mean = mean(lang),  #計算mean
             language_sd = sd(lang), #計算sd
             count = n(), #計算n
             language_lb = language_mean - 1.96*language_sd/sqrt(count), 
             language_ub = language_mean + 1.96*language_sd/sqrt(count)) %>% 
  # 計算95% CI=mean+-1.96*sd/sqrt(n)
  arrange(language_mean) %>%
  # 依照language_mean排序(由小到大)，即為順排
  # arrange(desc(language_mean))為由大到小
  mutate(classID = paste0("13",1:n())) %>%   
  #新增一個classID，從"13"+1~n開始
  dplyr::select(., c(7,2,5,6)) %>%
  #按順序選column 7,2,5,6
  head() |> knitr::kable (digits = 1)  #做表格，取小數點第一位

classID	language_mean	language_lb	language_ub
131	20.0	13.5	26.5
132	23.5	20.2	26.8
133	23.7	19.0	28.4
134	28.4	23.3	33.6
135	29.3	21.1	37.5
136	30.2	23.0	37.3

Exercise_3

Convert the wide form of the O’Brien and Kaiser’s repeated-measures data in OBrienKaiser{carData} to the long form in OBrienKaiserLong{carData}.

library(carData)
data(OBrienKaiser, package="carData")
dta_exer3 <- OBrienKaiser
str(dta_exer3)  #16 obs. of  17 variables

'data.frame':   16 obs. of  17 variables:
 $ treatment: Factor w/ 3 levels "control","A",..: 1 1 1 1 1 2 2 2 2 3 ...
  ..- attr(*, "contrasts")= num [1:3, 1:2] -2 1 1 0 -1 1
  .. ..- attr(*, "dimnames")=List of 2
  .. .. ..$ : chr [1:3] "control" "A" "B"
  .. .. ..$ : NULL
 $ gender   : Factor w/ 2 levels "F","M": 2 2 2 1 1 2 2 1 1 2 ...
  ..- attr(*, "contrasts")= chr "contr.sum"
 $ pre.1    : num  1 4 5 5 3 7 5 2 3 4 ...
 $ pre.2    : num  2 4 6 4 4 8 5 3 3 4 ...
 $ pre.3    : num  4 5 5 7 6 7 6 5 4 5 ...
 $ pre.4    : num  2 3 7 5 4 9 4 3 6 3 ...
 $ pre.5    : num  1 4 7 4 3 9 5 2 4 4 ...
 $ post.1   : num  3 2 4 2 6 9 7 2 4 6 ...
 $ post.2   : num  2 2 5 2 7 9 7 4 5 7 ...
 $ post.3   : num  5 3 7 3 8 10 8 8 6 6 ...
 $ post.4   : num  3 5 5 5 6 8 10 6 4 8 ...
 $ post.5   : num  2 3 4 3 3 9 8 5 1 8 ...
 $ fup.1    : num  2 4 7 4 4 9 8 6 5 8 ...
 $ fup.2    : num  3 5 6 4 3 10 9 6 4 8 ...
 $ fup.3    : num  2 6 9 5 6 11 11 7 7 9 ...
 $ fup.4    : num  4 4 7 3 4 9 9 5 5 7 ...
 $ fup.5    : num  4 1 6 4 3 6 8 6 4 8 ...

head(dta_exer3)

  treatment gender pre.1 pre.2 pre.3 pre.4 pre.5 post.1 post.2 post.3 post.4
1   control      M     1     2     4     2     1      3      2      5      3
2   control      M     4     4     5     3     4      2      2      3      5
3   control      M     5     6     5     7     7      4      5      7      5
4   control      F     5     4     7     5     4      2      2      3      5
5   control      F     3     4     6     4     3      6      7      8      6
6         A      M     7     8     7     9     9      9      9     10      8
  post.5 fup.1 fup.2 fup.3 fup.4 fup.5
1      2     2     3     2     4     4
2      3     4     5     6     4     1
3      4     7     6     9     7     6
4      3     4     4     5     3     4
5      3     4     3     6     4     3
6      9     9    10    11     9     6

dta_exer3|> head() |> knitr::kable()

treatment	gender	pre.1	pre.2	pre.3	pre.4	pre.5	post.1	post.2	post.3	post.4	post.5	fup.1	fup.2	fup.3	fup.4	fup.5
control	M	1	2	4	2	1	3	2	5	3	2	2	3	2	4	4
control	M	4	4	5	3	4	2	2	3	5	3	4	5	6	4	1
control	M	5	6	5	7	7	4	5	7	5	4	7	6	9	7	6
control	F	5	4	7	5	4	2	2	3	5	3	4	4	5	3	4
control	F	3	4	6	4	3	6	7	8	6	3	4	3	6	4	3
A	M	7	8	7	9	9	9	9	10	8	9	9	10	11	9	6

data(OBrienKaiserLong, package="carData")
dta_exer3L <- OBrienKaiserLong
str(dta_exer3L)

'data.frame':   240 obs. of  6 variables:
 $ treatment: Factor w/ 3 levels "control","A",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ gender   : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
 $ score    : num  1 2 4 2 1 3 2 5 3 2 ...
 $ id       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ phase    : Factor w/ 3 levels "pre","post","fup": 1 1 1 1 1 2 2 2 2 2 ...
 $ hour     : Factor w/ 5 levels "1","2","3","4",..: 1 2 3 4 5 1 2 3 4 5 ...
 - attr(*, "reshapeLong")=List of 4
  ..$ varying:List of 1
  .. ..$ score: chr [1:15] "pre.1" "pre.2" "pre.3" "pre.4" ...
  .. ..- attr(*, "v.names")= chr "score"
  .. ..- attr(*, "times")= int [1:15] 1 2 3 4 5 6 7 8 9 10 ...
  ..$ v.names: chr "score"
  ..$ idvar  : chr "id"
  ..$ timevar: chr "phase.hour"

head(dta_exer3L)

    treatment gender score id phase hour
1.1   control      M     1  1   pre    1
1.2   control      M     2  1   pre    2
1.3   control      M     4  1   pre    3
1.4   control      M     2  1   pre    4
1.5   control      M     1  1   pre    5
1.6   control      M     3  1  post    1

dta_exer3L|> head() |> knitr::kable()

	treatment	gender	score	id	phase	hour
1.1	control	M	1	1	pre	1
1.2	control	M	2	1	pre	2
1.3	control	M	4	1	pre	3
1.4	control	M	2	1	pre	4
1.5	control	M	1	1	pre	5
1.6	control	M	3	1	post	1

OBrienKaiser同一個資料呈現方式不同為個案重複施測的資料

dta_exer3Ln <- as_tibble(dta_exer3) %>% 
    mutate(id = paste0(1:n())) %>% #先給id
    pivot_longer(cols=starts_with(c("pre", "post", "fup")),  
                 names_to="phase",
                 values_to="score")
#選column裡面有pre, post, fup的進入column "phase"
#對應的值，進入column"score"
dta_exer3Ln %>%
  separate(phase, c("phase", "hour")) %>%
#phase目前有文字也有數值
#將文字分開存入column"phase"，入值存入column"hour"
  dplyr::select(., c(1,2,6,3,4,5))

# A tibble: 240 x 6
   treatment gender score id    phase hour 
   <fct>     <fct>  <dbl> <chr> <chr> <chr>
 1 control   M          1 1     pre   1    
 2 control   M          2 1     pre   2    
 3 control   M          4 1     pre   3    
 4 control   M          2 1     pre   4    
 5 control   M          1 1     pre   5    
 6 control   M          3 1     post  1    
 7 control   M          2 1     post  2    
 8 control   M          5 1     post  3    
 9 control   M          3 1     post  4    
10 control   M          2 1     post  5    
# ... with 230 more rows

#按照所選的column排列順序

dta_exer3Ln |> as.data.frame()|> head()

  treatment gender id  phase score
1   control      M  1  pre.1     1
2   control      M  1  pre.2     2
3   control      M  1  pre.3     4
4   control      M  1  pre.4     2
5   control      M  1  pre.5     1
6   control      M  1 post.1     3

#將資料改為data.frame，列出前6筆

Exercise_4

Information about student majors, activities in intramural sports, and survey responses about their satisfaction with the on-campus cafeteria (for the variety and quality of the food served, and hours of operation) are recorded in three separate plain text files by three different administrative units: majors sports cafeteria

Input these files into an R session, merge them and output the results as in the comma-separate values file .

#先把.txt檔帶入，有標題 h=T
majors <- read.table("D:/User/Desktop/datamanagement/20211101/Exercise_4/student_major.txt", h = T)
sports <- read.table("D:/User/Desktop/datamanagement/20211101/Exercise_4/student_sport.txt", h = T)
cafeteria <- read.table("D:/User/Desktop/datamanagement/20211101/Exercise_4/student_cafe.txt", h = T)

str(majors)      #'data.frame': 14 obs. of  2 variables

'data.frame':   14 obs. of  2 variables:
 $ id   : int  123 345 624 789 851 555 992 484 717 839 ...
 $ major: chr  "Mathematics" "Computer Science" "History" "Biology" ...

head(majors)

   id            major
1 123      Mathematics
2 345 Computer Science
3 624          History
4 789          Biology
5 851 Computer Science
6 555         History

names(majors)    #"id" "major"

[1] "id"    "major"

str(sports)      #'data.frame': 13 obs. of  2 variables:

'data.frame':   13 obs. of  2 variables:
 $ id   : int  123 123 222 385 385 481 555 717 789 851 ...
 $ sport: chr  "basketball" "badminton" "badminton" "basketball" ...

head(sports)

   id      sport
1 123 basketball
2 123  badminton
3 222  badminton
4 385 basketball
5 385 volleyball
6 481 basketball

names(sports)    #"id" "sport"

[1] "id"    "sport"

str(cafeteria)   #'data.frame': 7 obs. of  4 variables:

'data.frame':   7 obs. of  4 variables:
 $ id     : int  123 345 385 481 624 851 992
 $ variety: int  3 4 1 2 2 5 4
 $ quality: int  4 4 1 3 3 4 5
 $ hours  : int  2 2 2 3 4 1 3

head(cafeteria)

   id variety quality hours
1 123       3       4     2
2 345       4       4     2
3 385       1       1     2
4 481       2       3     3
5 624       2       3     4
6 851       5       4     1

names(cafeteria) #"id" "variety" "quality" "hours"

[1] "id"      "variety" "quality" "hours"

#以上資料共通的column是id

dta_ms <- merge(majors, sports, by="id", all = TRUE) 
dta_exer4 <- merge(dta_ms, cafeteria, by="id", all = TRUE)
str(dta_exer4)  #'data.frame':  18 obs. of  6 variables

'data.frame':   18 obs. of  6 variables:
 $ id     : int  123 123 222 345 385 385 481 484 555 624 ...
 $ major  : chr  "Mathematics" "Mathematics" "Computer Science" "Computer Science" ...
 $ sport  : chr  "badminton" "basketball" "badminton" NA ...
 $ variety: int  3 3 NA 4 1 1 2 NA NA 2 ...
 $ quality: int  4 4 NA 4 1 1 3 NA NA 3 ...
 $ hours  : int  2 2 NA 2 2 2 3 NA NA 4 ...

knitr::include_graphics("exercise_4.png")

dta_exer4

    id            major      sport variety quality hours
1  123      Mathematics  badminton       3       4     2
2  123      Mathematics basketball       3       4     2
3  222 Computer Science  badminton      NA      NA    NA
4  345 Computer Science       <NA>       4       4     2
5  385      Mathematics volleyball       1       1     2
6  385      Mathematics basketball       1       1     2
7  481            Music basketball       2       3     3
8  484            Music       <NA>      NA      NA    NA
9  555         History  volleyball      NA      NA    NA
10 624          History       <NA>       2       3     4
11 717          English basketball      NA      NA    NA
12 789          Biology basketball      NA      NA    NA
13 839          Biology       <NA>      NA      NA    NA
14 851 Computer Science  badminton       5       4     1
15 851 Computer Science volleyball       5       4     1
16 989          English       <NA>      NA      NA    NA
17 992          Biology  badminton       4       5     3
18 992          Biology volleyball       4       5     3

#資料排序與圖檔不太一樣，但看不出圖檔的邏輯是甚麼

#output成csv
save(dta_exer4, file="savedfile")
write.csv(dta_exer4, file="dta_exer4.csv")
knitr::include_graphics("dta_exer4.png")

Exercise_5

The HELP (Health Evaluation and Linkage to Primary Care) study was a clinical trial for adult inpatients recruited from a detoxification unit. Patients with no primary care physician were randomized to receive a multidisciplinary assessment and a brief motivational intervention or usual care, with the goal of linking them to primary medical care. Eligible subjects were adults, who spoke Spanish or English, reported alcohol, heroin or cocaine as their first or second drug of choice, resided in proximity to the primary care clinic to which they would be referred or were homeless. Subjects were interviewed at baseline during their detoxification stay and follow-up interviews were undertaken every 6 months for 2 years. A variety of continuous, count, discrete, and survival time predictors and outcomes were collected at each of these five occasions. The following R script is used to manage the data file at the initial stage of investigation. Provide comments on what each line of the script is meant to achieve.

ds <- read.csv("https://nhorton.people.amherst.edu/sasr2/datasets/help.csv")
library(dplyr)
#從網路讀取csv檔
newds = dplyr::select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)
#選ds, cesd,...,flt這些變項，存到newds檔中

names(newds) #列出newds所有column名稱

 [1] "cesd"   "female" "i1"     "i2"     "id"     "treat"  "f1a"    "f1b"   
 [9] "f1c"    "f1d"    "f1e"    "f1f"    "f1g"    "f1h"    "f1i"    "f1j"   
[17] "f1k"    "f1l"    "f1m"    "f1n"    "f1o"    "f1p"    "f1q"    "f1r"   
[25] "f1s"    "f1t"

str(newds[,1:10]) # 列出newds第1~10個column的結構：453 obs. of  10 variables

'data.frame':   453 obs. of  10 variables:
 $ cesd  : int  49 30 39 15 39 6 52 32 50 46 ...
 $ female: int  0 0 0 1 0 1 1 0 1 0 ...
 $ i1    : int  13 56 0 5 10 4 13 12 71 20 ...
 $ i2    : int  26 62 0 5 13 4 20 24 129 27 ...
 $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
 $ treat : int  1 1 0 0 0 1 0 1 0 1 ...
 $ f1a   : int  3 3 3 0 3 1 3 1 3 2 ...
 $ f1b   : int  2 2 2 0 0 0 1 1 2 3 ...
 $ f1c   : int  3 0 3 1 3 1 3 2 3 3 ...
 $ f1d   : int  0 3 0 3 3 3 1 3 1 0 ...

# 列出newds第1~10個column的統計摘要
# summary：給出每個欄位的「最大值」、「最小值」、「平均值」、「中位數」「第一四分位數」…等等
summary(newds[,1:10])

      cesd           female             i1               i2        
 Min.   : 1.00   Min.   :0.0000   Min.   :  0.00   Min.   :  0.00  
 1st Qu.:25.00   1st Qu.:0.0000   1st Qu.:  3.00   1st Qu.:  3.00  
 Median :34.00   Median :0.0000   Median : 13.00   Median : 15.00  
 Mean   :32.85   Mean   :0.2362   Mean   : 17.91   Mean   : 22.65  
 3rd Qu.:41.00   3rd Qu.:0.0000   3rd Qu.: 26.00   3rd Qu.: 32.00  
 Max.   :60.00   Max.   :1.0000   Max.   :142.00   Max.   :184.00  
       id            treat             f1a             f1b       
 Min.   :  1.0   Min.   :0.0000   Min.   :0.000   Min.   :0.000  
 1st Qu.:119.0   1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.000  
 Median :233.0   Median :0.0000   Median :2.000   Median :1.000  
 Mean   :233.4   Mean   :0.4967   Mean   :1.634   Mean   :1.391  
 3rd Qu.:348.0   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:2.000  
 Max.   :470.0   Max.   :1.0000   Max.   :3.000   Max.   :3.000  
      f1c             f1d       
 Min.   :0.000   Min.   :0.000  
 1st Qu.:1.000   1st Qu.:0.000  
 Median :2.000   Median :1.000  
 Mean   :1.923   Mean   :1.565  
 3rd Qu.:3.000   3rd Qu.:3.000  
 Max.   :3.000   Max.   :3.000

#列出newds前3個row
head(newds, n=3)

  cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j f1k f1l
1   49      0 13 26  1     1   3   2   3   0   2   3   3   0   2   3   3   0
2   30      0 56 62  2     1   3   2   0   3   3   2   0   0   3   0   3   0
3   39      0  0  0  3     0   3   2   3   0   2   2   1   3   2   3   1   0
  f1m f1n f1o f1p f1q f1r f1s f1t
1   1   2   2   2   2   3   3   2
2   0   3   0   0   0   2   0   0
3   1   3   2   0   0   3   2   0

comment(newds) = "HELP baseline dataset" 
#把"HELP baseline dataset"寫入comment
comment(newds)

[1] "HELP baseline dataset"

#comment(newds)是甚麼

save(ds, file="savedfile")
write.csv(ds, file="ds.csv")
#把資料output變成ds.csv檔
#不知道資料如何指定資料夾，但剛剛有在自己的電腦搜尋到這個檔案
library(foreign)
#foreign這個package可以把資料讀入其他類型檔案或轉出存成其他檔案
write.foreign(newds, "file.dat", "file.sas", package="SAS")
#這邊是把資料存成"file.dat", "file.sas"

#列出newds的column "cesd"第1~10筆資料
with(newds, cesd[1:10])

 [1] 49 30 39 15 39  6 52 32 50 46

#跟上面的語法功能是一樣的(從頭數來的10筆)
#但當不知道最後面的資料有幾筆時，這個可以改成tail(cesd, 10)
with(newds, head(cesd, 10))

 [1] 49 30 39 15 39  6 52 32 50 46

#列出newds的column "cesd" 筆56大的值
#會逐一檢查列出，因此會重複
with(newds, cesd[cesd > 56])

[1] 57 58 57 60 58 58 57

library(dplyr)
#篩選newds的column "cesd" 筆56大的值，傳入下一個步驟select
filter(newds, cesd > 56) %>% 
#選出id和cesd(cesd>56)
select(id, cesd)

#sort()函數是對向量進行從小到大的排序
#列出newds的column "cesd"從小到大的前4筆(因此會有重複)
with(newds, sort(cesd)[1:4])

[1] 1 3 3 4

#which.min(cesd)是問cesd最小值是第幾個(row)
with(newds, which.min(cesd))

[1] 199

#which.max(cesd)是問cesd最大值是第幾個(row)
with(newds, which.max(cesd))

[1] 194

library(mosaic)
#tally is a convenient wrapper for summarise that will either call n or sum(n) depending on whether you're tallying for the first time, or re-tallying.
#問newds資料中，column(f1g)有沒有missing data
#結果is.na(f1g)答案為TRUE=1   FALSE=452   (有一筆NA資料，452筆都不是NA)
tally(~ is.na(f1g), data=newds)

is.na(f1g)
 TRUE FALSE 
    1   452

#Computes mean, standard deviation, quartiles, sample size and number of missing values for a data vector.
#計算nweds資料中，column(f1g)的mean, standard deviation, quartiles, sample size and number of missing values
favstats(~ f1g, data=newds)

 min Q1 median Q3 max     mean       sd   n missing
   0  1      2  3   3 1.730088 1.095314 452       1

# reverse code f1d, f1h, f1l and f1p
# f1d, f1h, f1l and f1p是反向題計分
# 這些題目最高都是3分，所以才會寫(3-f1d)
# cbind()透過 column 合併
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g, 
   (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p), 
   f1q, f1r, f1s, f1t))
#cesd有missing data的資料，帶入1
nmisscesd = apply(is.na(cesditems), 1, sum)
ncesditems = cesditems
#若ncesditems是NA的話，帶入0
ncesditems[is.na(cesditems)] = 0
#ncesditems水平加總為newcesd
newcesd = apply(ncesditems, 1, sum)
#imputemeancesd=20/(20-NA總數)*newcesd?  用途不清
imputemeancesd = 20/(20-nmisscesd)*newcesd

data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]

    newcesd newds.cesd nmisscesd imputemeancesd
4        15         15         1       15.78947
17       19         19         1       20.00000
87       44         44         1       46.31579
101      17         17         1       17.89474
154      29         29         1       30.52632
177      44         44         1       46.31579
229      39         39         1       41.05263

library(dplyr)
library(memisc)
#在newds中新增一個drinkstat變項
#根據column"i1"、"i2"、"female"條件，決定存入值為abstinent、moderate、highrisk
#例如：i1==0時，column"drinkstat"存入"abstinent"
newds = mutate(newds, drinkstat= 
  cases("abstinent" = i1==0,
        "moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
          (i1>0 & i1<=2 & i2<=4 & female==0),
        "highrisk" = ((i1>1 | i2>3) & female==1) |
          ((i1>2 | i2>4) & female==0)))

library(mosaic)
detach(package:memisc)
detach(package:MASS)

library(dplyr)
#選取newds資料中column "i1, i2, female, drinkstat"
#形成一個新object，命為tmpds
tmpds = select(newds, i1, i2, female, drinkstat)
#列出365~370的資料
str(tmpds) #453 obs. of  4 variables

'data.frame':   453 obs. of  4 variables:
 $ i1       : int  13 56 0 5 10 4 13 12 71 20 ...
 $ i2       : int  26 62 0 5 13 4 20 24 129 27 ...
 $ female   : int  0 0 0 1 0 1 1 0 1 0 ...
 $ drinkstat: Factor w/ 3 levels "abstinent","moderate",..: 3 3 1 3 3 3 3 3 3 3 ...
 - attr(*, "comment")= chr "HELP baseline dataset"

tmpds[365:370,]

    i1 i2 female drinkstat
365  6 24      0  highrisk
366  6  6      0  highrisk
367  0  0      0 abstinent
368  0  0      1 abstinent
369  8  8      0  highrisk
370 32 32      0  highrisk

library(dplyr)
#篩選tmpds資料，drinkstat=="moderate" & female==1的，共有7筆
filter(tmpds, drinkstat=="moderate" & female==1)

  i1 i2 female drinkstat
1  1  1      1  moderate
2  1  3      1  moderate
3  1  2      1  moderate
4  1  1      1  moderate
5  1  1      1  moderate
6  1  1      1  moderate
7  1  1      1  moderate

library(gmodels)
#以tmpds資料，drinkstat做一個交叉表，包括個數和佔比
with(tmpds, CrossTable(drinkstat))


 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  453 

 
          | abstinent |  moderate |  highrisk | 
          |-----------|-----------|-----------|
          |        68 |        28 |       357 | 
          |     0.150 |     0.062 |     0.788 | 
          |-----------|-----------|-----------|

#針對CrossTable()可進行兩兩類別變數的交叉表分析及卡方檢定
with(tmpds, CrossTable(drinkstat, female, 
  prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))


 
   Cell Contents
|-------------------------|
|                       N |
|           N / Row Total |
|-------------------------|

 
Total Observations in Table:  453 

 
             | female 
   drinkstat |         0 |         1 | Row Total | 
-------------|-----------|-----------|-----------|
   abstinent |        42 |        26 |        68 | 
             |     0.618 |     0.382 |     0.150 | 
-------------|-----------|-----------|-----------|
    moderate |        21 |         7 |        28 | 
             |     0.750 |     0.250 |     0.062 | 
-------------|-----------|-----------|-----------|
    highrisk |       283 |        74 |       357 | 
             |     0.793 |     0.207 |     0.788 | 
-------------|-----------|-----------|-----------|
Column Total |       346 |       107 |       453 | 
-------------|-----------|-----------|-----------|

#資料轉換：新增 gender 變項，將female中的0 或 1 轉換成 factor
#female==0者，存成male，female==1者存成female
newds = transform(newds, 
  gender=factor(female, c(0,1), c("Male","Female")))
#做成male和female的交叉表，計算個數
tally(~ female + gender, margin=FALSE, data=newds)

      gender
female Male Female
     0  346      0
     1    0    107

library(dplyr)
#將ds 資料重新由小到大排列
#第一層依據為 cesd，第二層依據為i1
newds = arrange(ds, cesd, i1)
#列出排序後的newds前1~5比資料
newds[1:5, c("cesd", "i1", "id")]

  cesd i1  id
1    1  3 233
2    3  1 139
3    3 13 418
4    4  4 251
5    4  9  95

library(dplyr)
#篩選ds資料中，female==1者，命名為females
females = filter(ds, female==1)
#計算females的cesd變項的平均
with(females, mean(cesd))

[1] 36.88785

# an alternative approach，計算females的cesd變項的平均
mean(ds$cesd[ds$female==1])

[1] 36.88785

#透過tapply針對female 欄位計算 cesd的平均分數
#結果 female==0者，cesd分數為31.6
# female==1者，cesd分數為36.9
with(ds, tapply(cesd, female, mean))

       0        1 
31.59827 36.88785

library(mosaic)
#an alternative approach
mean(cesd ~ female, data=ds)

       0        1 
31.59827 36.88785