hw1

Select at random one school per county in the data set Caschool{Ecdat} and draw a scatter diagram of average math score (mathscr) against average reading score (readscr) for the sampled data set. Make sure your results are reproducible (e.g., the same random sample will be drawn each time).

# input data
pacman::p_load(Ecdat)
data(Caschool, package="Ecdat")
# learn what this data set contains
?Caschool
# show the structure of Caschool
str(Caschool)
'data.frame':   420 obs. of  17 variables:
 $ distcod : int  75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
 $ county  : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
 $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 214 367 132 270 53 152 383 263 94 ...
 $ grspan  : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
 $ enrltot : int  195 240 1550 243 1335 137 195 888 379 2247 ...
 $ teachers: num  10.9 11.1 82.9 14 71.5 ...
 $ calwpct : num  0.51 15.42 55.03 36.48 33.11 ...
 $ mealpct : num  2.04 47.92 76.32 77.05 78.43 ...
 $ computer: int  67 101 169 85 171 25 28 66 35 0 ...
 $ testscr : num  691 661 644 648 641 ...
 $ compstu : num  0.344 0.421 0.109 0.35 0.128 ...
 $ expnstu : num  6385 5099 5502 7102 5236 ...
 $ str     : num  17.9 21.5 18.7 17.4 18.7 ...
 $ avginc  : num  22.69 9.82 8.98 8.98 9.08 ...
 $ elpct   : num  0 4.58 30 0 13.86 ...
 $ readscr : num  692 660 636 652 642 ...
 $ mathscr : num  690 662 651 644 640 ...
pacman::p_load(Momocs, dplyr) # distinct() and sample_n() come from dplyr
set.seed(1234)
dta1 <- Caschool |>
  distinct(county, .keep_all = TRUE) |> # distinct keeps one row per unique county; .keep_all retains all other columns
  sample_n(45) # randomly sample 45 rows (here: all 45 remaining rows)
head(dta1)
    distcod      county                    district grspan enrltot teachers
104   65557   Mendocino      Arena Union Elementary  KK-08     337    16.55
45    67199   Riverside           Perris Elementary  KK-06    4258   221.00
70    70904      Sonoma         Roseland Elementary  KK-06    1154    62.10
188   64105      Lassen Janesville Union Elementary  KK-08     516    26.00
19    64709 Los Angeles           Lennox Elementary  KK-08    6880   303.03
8     63834        Kern         Vineland Elementary  KK-08     888    42.50
    calwpct  mealpct computer testscr    compstu  expnstu      str  avginc
104 21.2625  60.1329       26  639.30 0.07715134 6089.911 20.36254 10.0350
45  24.6595  92.7431      324  628.75 0.07609206 5092.917 19.26697 10.0505
70  22.9277  84.9206      129  634.15 0.11178510 6056.497 18.58293 11.6650
188  9.2593  24.8148       65  651.90 0.12596899 4974.281 19.84615 13.4670
19  21.2824  94.9712      960  619.80 0.13953489 5064.616 22.70402  7.0220
8   18.8063 100.0000       66  609.00 0.07432432 4565.746 20.89412  8.1740
        elpct readscr mathscr
104 15.430267   642.7   635.9
45  36.801315   626.5   631.0
70  55.892544   636.8   631.5
188  1.937984   656.3   647.5
19  77.005814   619.1   620.5
8   46.959461   605.5   612.5
class(dta1)
[1] "data.frame"

I was not sure whether distinct() keeps a random row per county. It actually keeps the first row it encounters for each county, so the selection above is not random, and sample_n(45) only shuffles the 45 remaining rows; see the sketch below.
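A sketch of a version that really draws one school at random within each county (assuming dplyr >= 1.0, where slice_sample() supersedes sample_n()):

set.seed(1234)
dta1 <- Caschool |>
  group_by(county) |>    # one group per county
  slice_sample(n = 1) |> # draw one school at random within each county
  ungroup()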

pacman::p_load("basicPlotteR")
# plot
with(dta1, plot(mathscr, readscr, xlab="mathscr", ylab="readscr", pch=20, bty='n', col="lightblue", axes = FALSE)) # bty='n' drops the box around the plot
abline(lm(readscr ~ mathscr, data = dta1)) # fitted line for readscr (y) on mathscr (x), matching the plot's axes
addTextLabels(dta1$mathscr, dta1$readscr,labels=dta1$county, cex.label = 0.5,
  col.label = "black",  col.line = "grey")
axis(1, at = seq(600,700, by=10))
axis(2, at = seq(600,700, by=10))
grid()

addTextLabels() is similar to text(), but it attempts to relocate labels that would otherwise overlap; see the basicPlotteR documentation for more about addTextLabels.

hw2

Find 133 class-level 95%-confidence intervals for language test score means of the nlschools{MASS} data set by using the tidy approach. The tail end of the data object should look as follows:

# input data
pacman::p_load(MASS)
data(nlschools, package="MASS")
# learn what this data set contains
?nlschools
# show the structure of nlschools
str(nlschools)
'data.frame':   2287 obs. of  6 variables:
 $ lang : int  46 45 33 46 20 30 30 57 36 36 ...
 $ IQ   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
 $ class: Factor w/ 133 levels "180","280","1082",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ GS   : int  29 29 29 29 29 29 29 29 29 29 ...
 $ SES  : int  23 10 15 23 10 10 23 10 13 15 ...
 $ COMB : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
pacman::p_load(dplyr, magrittr) # %<>% comes from magrittr
dta2 <- nlschools

# group by class, then compute the mean and SD of lang for each class
dta2 %<>% group_by(class) %>% dplyr::summarize(Mean = mean(lang), SD = sd(lang)) # the compound pipe %<>% assigns the summary back into dta2; with the native |> an explicit dta2 <- dta2 |> ... would be needed

# use transmute to add the new columns and keep only these columns
tbl <- dta2 |> transmute(classID = class,
                         language_mean = Mean,
                         language_lb = Mean-1.96*SD/sqrt(n()),
                         language_lu = Mean+1.96*SD/sqrt(n()))

# order by classID and show the first six rows
tbl[order(tbl$classID),] |> head() |> knitr::kable()
classID    language_mean   language_lb   language_lu
--------  --------------  ------------  ------------
180             36.40000      34.91109      37.88891
280             23.71429      22.64069      24.78789
1082            30.40000      28.67265      32.12735
1280            30.86667      29.68244      32.05090
1580            30.87500      28.61403      33.13597
1680            41.50000      40.42897      42.57103
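A caveat about the intervals above: after summarize() the data are no longer grouped, so n() inside transmute() is the number of classes (133), not the number of pupils in the class, which makes the intervals too narrow (every class has far fewer than 133 pupils). A sketch that carries the per-class sample size through instead:

dta2b <- nlschools %>%
  group_by(class) %>%
  dplyr::summarize(Mean = mean(lang), SD = sd(lang), n = n()) # keep the class size

tbl2 <- dta2b |> transmute(classID = class,
                           language_mean = Mean,
                           language_lb = Mean - 1.96 * SD / sqrt(n),
                           language_lu = Mean + 1.96 * SD / sqrt(n))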

hw3

Convert the wide form of the O’Brien and Kaiser’s repeated-measures data in OBrienKaiser{carData} to the long form in OBrienKaiserLong{carData}.

input OBrienKaiser data

pacman::p_load(carData)
data(OBrienKaiser, package="carData")
# learn what this data set contains
?OBrienKaiser
# show the structure of OBrienKaiser
str(OBrienKaiser)
'data.frame':   16 obs. of  17 variables:
 $ treatment: Factor w/ 3 levels "control","A",..: 1 1 1 1 1 2 2 2 2 3 ...
  ..- attr(*, "contrasts")= num [1:3, 1:2] -2 1 1 0 -1 1
  .. ..- attr(*, "dimnames")=List of 2
  .. .. ..$ : chr [1:3] "control" "A" "B"
  .. .. ..$ : NULL
 $ gender   : Factor w/ 2 levels "F","M": 2 2 2 1 1 2 2 1 1 2 ...
  ..- attr(*, "contrasts")= chr "contr.sum"
 $ pre.1    : num  1 4 5 5 3 7 5 2 3 4 ...
 $ pre.2    : num  2 4 6 4 4 8 5 3 3 4 ...
 $ pre.3    : num  4 5 5 7 6 7 6 5 4 5 ...
 $ pre.4    : num  2 3 7 5 4 9 4 3 6 3 ...
 $ pre.5    : num  1 4 7 4 3 9 5 2 4 4 ...
 $ post.1   : num  3 2 4 2 6 9 7 2 4 6 ...
 $ post.2   : num  2 2 5 2 7 9 7 4 5 7 ...
 $ post.3   : num  5 3 7 3 8 10 8 8 6 6 ...
 $ post.4   : num  3 5 5 5 6 8 10 6 4 8 ...
 $ post.5   : num  2 3 4 3 3 9 8 5 1 8 ...
 $ fup.1    : num  2 4 7 4 4 9 8 6 5 8 ...
 $ fup.2    : num  3 5 6 4 3 10 9 6 4 8 ...
 $ fup.3    : num  2 6 9 5 6 11 11 7 7 9 ...
 $ fup.4    : num  4 4 7 3 4 9 9 5 5 7 ...
 $ fup.5    : num  4 1 6 4 3 6 8 6 4 8 ...

input OBrienKaiserLong data

pacman::p_load(carData)
data(OBrienKaiserLong, package="carData")
# learn what this data set contains
?OBrienKaiserLong
# show the structure of OBrienKaiserLong
str(OBrienKaiserLong)
'data.frame':   240 obs. of  6 variables:
 $ treatment: Factor w/ 3 levels "control","A",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ gender   : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
 $ score    : num  1 2 4 2 1 3 2 5 3 2 ...
 $ id       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ phase    : Factor w/ 3 levels "pre","post","fup": 1 1 1 1 1 2 2 2 2 2 ...
 $ hour     : Factor w/ 5 levels "1","2","3","4",..: 1 2 3 4 5 1 2 3 4 5 ...
 - attr(*, "reshapeLong")=List of 4
  ..$ varying:List of 1
  .. ..$ score: chr [1:15] "pre.1" "pre.2" "pre.3" "pre.4" ...
  .. ..- attr(*, "v.names")= chr "score"
  .. ..- attr(*, "times")= int [1:15] 1 2 3 4 5 6 7 8 9 10 ...
  ..$ v.names: chr "score"
  ..$ idvar  : chr "id"
  ..$ timevar: chr "phase.hour"

Convert wide form to long form

library(tidyverse)
# store OBrienKaiser in dta3
dta3 <- OBrienKaiser

# first add a subject id, then reshape to long form
dtaL <- dta3 |>
  mutate(id = 1:n()) |>
  pivot_longer(
    cols = starts_with(c("pre", "post", "fup")), # columns whose names start with pre, post, or fup
    names_to = "Time",   # the old column names go into Time
    values_to = "score") # the cell values go into score

dtaL %<>%
  separate(Time, c("phase", "hour")) |> # split Time at the "." into phase and hour
  mutate(hour = parse_number(hour))     # parse_number() keeps only the numeric part of hour
# caution: mixing %<>% with |> here does not do what it appears to; see the note after the comparison with OBrienKaiserLong below
# A tibble: 240 x 6
   treatment gender    id phase  hour score
   <fct>     <fct>  <int> <chr> <dbl> <dbl>
 1 control   M          1 pre       1     1
 2 control   M          1 pre       2     2
 3 control   M          1 pre       3     4
 4 control   M          1 pre       4     2
 5 control   M          1 pre       5     1
 6 control   M          1 post      1     3
 7 control   M          1 post      2     2
 8 control   M          1 post      3     5
 9 control   M          1 post      4     3
10 control   M          1 post      5     2
# ... with 230 more rows
# change column order
dtaL2 <- dtaL[,c(1,2,6,3,4,5)]

# check that our result matches OBrienKaiserLong
head(dtaL2,n=16L)
# A tibble: 16 x 6
   treatment gender score    id phase hour 
   <fct>     <fct>  <dbl> <int> <chr> <chr>
 1 control   M          1     1 pre   1    
 2 control   M          2     1 pre   2    
 3 control   M          4     1 pre   3    
 4 control   M          2     1 pre   4    
 5 control   M          1     1 pre   5    
 6 control   M          3     1 post  1    
 7 control   M          2     1 post  2    
 8 control   M          5     1 post  3    
 9 control   M          3     1 post  4    
10 control   M          2     1 post  5    
11 control   M          2     1 fup   1    
12 control   M          3     1 fup   2    
13 control   M          2     1 fup   3    
14 control   M          4     1 fup   4    
15 control   M          4     1 fup   5    
16 control   M          4     2 pre   1    
head(OBrienKaiserLong,n=16L)
     treatment gender score id phase hour
1.1    control      M     1  1   pre    1
1.2    control      M     2  1   pre    2
1.3    control      M     4  1   pre    3
1.4    control      M     2  1   pre    4
1.5    control      M     1  1   pre    5
1.6    control      M     3  1  post    1
1.7    control      M     2  1  post    2
1.8    control      M     5  1  post    3
1.9    control      M     3  1  post    4
1.10   control      M     2  1  post    5
1.11   control      M     2  1   fup    1
1.12   control      M     3  1   fup    2
1.13   control      M     2  1   fup    3
1.14   control      M     4  1   fup    4
1.15   control      M     4  1   fup    5
2.1    control      M     4  2   pre    1
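Two remarks on the conversion above. First, because the native |> is resolved at parse time, dtaL %<>% separate(...) |> mutate(...) is parsed as mutate(dtaL %<>% separate(...), ...): only the separate() step is written back into dtaL, while the mutate() result is printed and then discarded. That is why hour shows up as <dbl> in the printed tibble but as <chr> in dtaL2. Second, OBrienKaiserLong stores phase and hour as factors. A one-step sketch of my own (not the original homework code) that splits the column names inside pivot_longer() and sets matching types:

dtaL3 <- OBrienKaiser |>
  mutate(id = 1:n()) |>
  pivot_longer(
    cols = starts_with(c("pre", "post", "fup")),
    names_to = c("phase", "hour"), # split the old column names directly
    names_sep = "\\.",             # at the "." (interpreted as a regular expression)
    values_to = "score") |>
  mutate(phase = factor(phase, levels = c("pre", "post", "fup")),
         hour  = factor(hour, levels = as.character(1:5)))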

hw4

Information about student majors, participation in intramural sports, and survey responses about satisfaction with the on-campus cafeteria (the variety and quality of the food served, and the hours of operation) is recorded in three separate plain-text files by three different administrative units: majors, sports, cafeteria.

Read these files into an R session, merge them, and output the result as a comma-separated values file.

# input data
majors <- read.table("student_major.txt", header = TRUE)
sports <-read.table("student_sport.txt", header = TRUE)
cafeteria <-read.table("student_cafe.txt", header = TRUE)

# store the three data frames in a list to make the later merge easier
listdta3 <- list(majors, sports, cafeteria)
dplyr::glimpse(listdta3)
List of 3
 $ :'data.frame':   14 obs. of  2 variables:
  ..$ id   : int [1:14] 123 345 624 789 851 555 992 484 717 839 ...
  ..$ major: chr [1:14] "Mathematics" "Computer Science" "History" "Biology" ...
 $ :'data.frame':   13 obs. of  2 variables:
  ..$ id   : int [1:13] 123 123 222 385 385 481 555 717 789 851 ...
  ..$ sport: chr [1:13] "basketball" "badminton" "badminton" "basketball" ...
 $ :'data.frame':   7 obs. of  4 variables:
  ..$ id     : int [1:7] 123 345 385 481 624 851 992
  ..$ variety: int [1:7] 3 4 1 2 2 5 4
  ..$ quality: int [1:7] 4 4 1 3 3 4 5
  ..$ hours  : int [1:7] 2 2 2 3 4 1 3

Reading the files with readr::read_table produced blank columns and left the quotes on the character fields (e.g., major, sport), so base read.table was less trouble here.

On the difference between read.table and read_table: the two handle whitespace and quoting differently; see ?read.table and ?readr::read_table.

Tidy approach

library(tidyverse)
tidydta3 <- listdta3 %>% reduce(full_join, by='id') # full_join() returns all rows and columns from both data sets
#
write.csv(tidydta3, "tidydta3.csv", sep = "," ,row.names = F)

Warning in write.csv(tidydta3, "tidydta3.csv", sep = ",") : attempt to set 'sep' ignored

write.csv() already uses "," as the separator by default, which is why R says it ignored 'sep'.

write.csv() uses "." for the decimal point and a comma (",") for the separator.

write.csv2() uses a comma (",") for the decimal point and a semicolon (";") for the separator; that is the difference between write.csv and write.csv2.
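For example, the same merged data could be written as a semicolon-separated file (the file name below is just for illustration):

write.csv2(tidydta3, "tidydta3_semicolon.csv", row.names = FALSE) # ";" as separator, "," as decimal mark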

Merge approach

# Roll our own merging function
mrg <- function(f1, f2){                                
  merge(f1, f2, by="id", all=T)
}

# as.data.frame makes the result easier to read in RStudio
mrgdta3<-Reduce(mrg, listdta3) %>% as.data.frame()

#
write.csv(mrgdta3, "mrgdta3.csv", row.names = F) # sep can be omitted here
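As a quick sanity check (my addition, not part of the assignment), the tidy and base results should agree once both are sorted by id; small differences in row order are otherwise possible:

# compare the two merge results after sorting both by id
all.equal(dplyr::arrange(tidydta3, id),
          dplyr::arrange(mrgdta3, id))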

hw5

The HELP (Health Evaluation and Linkage to Primary Care) study was a clinical trial for adult inpatients recruited from a detoxification unit. Patients with no primary care physician were randomized to receive a multidisciplinary assessment and a brief motivational intervention or usual care, with the goal of linking them to primary medical care. Eligible subjects were adults, who spoke Spanish or English, reported alcohol, heroin or cocaine as their first or second drug of choice, resided in proximity to the primary care clinic to which they would be referred or were homeless. Subjects were interviewed at baseline during their detoxification stay and follow-up interviews were undertaken every 6 months for 2 years. A variety of continuous, count, discrete, and survival time predictors and outcomes were collected at each of these five occasions. The following R script is used to manage the data file at the initial stage of investigation. Provide comments on what each line of the script is meant to achieve.

Source: Kleinman, K., & Horton, N.J. (2015). Using R for Data Management, Statistical Analysis, and Graphics.

options(continue="  ")  # use spaces instead of "+ " as the continuation prompt
options(digits=3)       # print numbers to 3 significant digits
options(width=72)       # narrow output to 72 characters per line
ds = read.csv("https://nhorton.people.amherst.edu/sasr2/datasets/help.csv") # read the HELP baseline data from the web
library(dplyr)
# select the variables needed for this analysis
newds <- ds |> dplyr::select(cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)
names(newds) # show all column names
 [1] "cesd"   "female" "i1"     "i2"     "id"     "treat"  "f1a"   
 [8] "f1b"    "f1c"    "f1d"    "f1e"    "f1f"    "f1g"    "f1h"   
[15] "f1i"    "f1j"    "f1k"    "f1l"    "f1m"    "f1n"    "f1o"   
[22] "f1p"    "f1q"    "f1r"    "f1s"    "f1t"   
str(newds[,1:10]) # structure of the first 10 variables
'data.frame':   453 obs. of  10 variables:
 $ cesd  : int  49 30 39 15 39 6 52 32 50 46 ...
 $ female: int  0 0 0 1 0 1 1 0 1 0 ...
 $ i1    : int  13 56 0 5 10 4 13 12 71 20 ...
 $ i2    : int  26 62 0 5 13 4 20 24 129 27 ...
 $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
 $ treat : int  1 1 0 0 0 1 0 1 0 1 ...
 $ f1a   : int  3 3 3 0 3 1 3 1 3 2 ...
 $ f1b   : int  2 2 2 0 0 0 1 1 2 3 ...
 $ f1c   : int  3 0 3 1 3 1 3 2 3 3 ...
 $ f1d   : int  0 3 0 3 3 3 1 3 1 0 ...
summary(newds[,1:10]) # summary of the first 10 variables
      cesd          female            i1              i2       
 Min.   : 1.0   Min.   :0.000   Min.   :  0.0   Min.   :  0.0  
 1st Qu.:25.0   1st Qu.:0.000   1st Qu.:  3.0   1st Qu.:  3.0  
 Median :34.0   Median :0.000   Median : 13.0   Median : 15.0  
 Mean   :32.8   Mean   :0.236   Mean   : 17.9   Mean   : 22.6  
 3rd Qu.:41.0   3rd Qu.:0.000   3rd Qu.: 26.0   3rd Qu.: 32.0  
 Max.   :60.0   Max.   :1.000   Max.   :142.0   Max.   :184.0  
       id          treat            f1a            f1b      
 Min.   :  1   Min.   :0.000   Min.   :0.00   Min.   :0.00  
 1st Qu.:119   1st Qu.:0.000   1st Qu.:1.00   1st Qu.:0.00  
 Median :233   Median :0.000   Median :2.00   Median :1.00  
 Mean   :233   Mean   :0.497   Mean   :1.63   Mean   :1.39  
 3rd Qu.:348   3rd Qu.:1.000   3rd Qu.:3.00   3rd Qu.:2.00  
 Max.   :470   Max.   :1.000   Max.   :3.00   Max.   :3.00  
      f1c            f1d      
 Min.   :0.00   Min.   :0.00  
 1st Qu.:1.00   1st Qu.:0.00  
 Median :2.00   Median :1.00  
 Mean   :1.92   Mean   :1.56  
 3rd Qu.:3.00   3rd Qu.:3.00  
 Max.   :3.00   Max.   :3.00  
head(newds, n=3) # show the first three observations
  cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
1   49      0 13 26  1     1   3   2   3   0   2   3   3   0   2   3
2   30      0 56 62  2     1   3   2   0   3   3   2   0   0   3   0
3   39      0  0  0  3     0   3   2   3   0   2   2   1   3   2   3
  f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
1   3   0   1   2   2   2   2   3   3   2
2   3   0   0   3   0   0   0   2   0   0
3   1   0   1   3   2   0   0   3   2   0

input and output data

# attach a descriptive comment (metadata) to the data frame and read it back
comment(newds) = "HELP baseline dataset"
comment(newds)
[1] "HELP baseline dataset"
# save the R object in R's binary format
save(ds, file="savedfile") # saved objects can be read back with load()
load("savedfile")
# output csv
write.csv(ds, file="ds.csv")
# write a data file plus SAS code to read it (but file.dat could not be opened directly in SAS)
library(foreign)
write.foreign(newds, "file.dat", "file.sas", package="SAS")
file.show("file.dat")

write.foreign() signature: write.foreign(df, datafile, codefile, package = c("SPSS", "Stata", "SAS"), ...)

head and column[]

# use with() to attach the data and list the first 10 values of cesd
with(newds, cesd[1:10])
 [1] 49 30 39 15 39  6 52 32 50 46
# same as above, but using head() to list the first 10 values
with(newds, head(cesd, 10)) 
 [1] 49 30 39 15 39  6 52 32 50 46

filter and column[]

# extract the values of cesd that are greater than 56
with(newds, cesd[cesd > 56]) 
[1] 57 58 57 60 58 58 57
# do the same thing with filter()
library(dplyr)
filter(newds, cesd > 56) %>% dplyr::select(id, cesd)
   id cesd
1  71   57
2 127   58
3 200   57
4 228   60
5 273   58
6 351   58
7  13   57

sort, which.min

with(newds, sort(cesd)[1:4])
[1] 1 3 3 4
with(newds, which.min(cesd)) # position of the (first) minimum in the vector
[1] 199

tally, favstats

library(mosaic)
tally(~ is.na(f1g), data=newds) 
is.na(f1g)
 TRUE FALSE 
    1   452 
favstats(~ f1g, data=newds)
 min Q1 median Q3 max mean  sd   n missing
   0  1      2  3   3 1.73 1.1 452       1

tally() counts observations by group.

favstats() is short for "favorite statistics": it reports some of the most common summary statistics for a numerical variable.

how to impute NA in the CESD scale

# reverse code f1d, f1h, f1l and f1p
# cbind the 20 items, reverse-coding the four items above as 3 - item
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g, 
   (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p), 
   f1q, f1r, f1s, f1t)) 

# count the number of NA items in each row of cesditems
nmisscesd = apply(is.na(cesditems), 1, sum) 
# copy cesditems into ncesditems
ncesditems = cesditems
# replace the NA entries of ncesditems with 0
ncesditems[is.na(cesditems)] = 0
# row totals of ncesditems
newcesd = apply(ncesditems, 1, sum)
# prorate the total back up to 20 items to impute for the missing ones
imputemeancesd = 20/(20-nmisscesd)*newcesd
# compare the different CESD totals, keeping only the rows with missing items (nmisscesd > 0)
dta5<-data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]

newds$cesd = newcesd

The cesd variable in the original data was computed without imputing missing values; it matches the total we get when NA items are counted as 0.

nmisscesd is the number of missing items for each person; imputemeancesd is the CESD score after prorating for those missing items.
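The same proration can be written more compactly with rowMeans(): the mean of the observed items times 20 equals 20/(20 - nmisscesd) times the NA-as-0 total. A sketch of this equivalent computation (not part of the original script):

imputemeancesd2 <- 20 * rowMeans(cesditems, na.rm = TRUE) # prorated CESD via the mean of the non-missing items
all.equal(imputemeancesd2, imputemeancesd) # should be TRUE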

cases

pacman::p_load(dplyr,memisc)
newds = mutate(newds, drinkstat= 
  cases(
    "abstinent" = i1==0,
    "moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
               (i1>0 & i1<=2 & i2<=4 & female==0),
    "highrisk" = ((i1>1 | i2>3) & female==1) |
               ((i1>2 | i2>4) & female==0)))

A new drinkstat column is created, classifying everyone as abstinent, moderate, or highrisk according to the conditions above. cases() (from memisc) plays the same role as dplyr::case_when() when several conditions have to be checked.
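For comparison, a sketch of the same recode with dplyr::case_when(), which avoids the memisc dependency (my own variant; it writes a separate drinkstat2 column so nothing above is overwritten):

newds <- newds %>%
  mutate(drinkstat2 = case_when(
    i1 == 0 ~ "abstinent",
    (i1 > 0 & i1 <= 1 & i2 <= 3 & female == 1) |
      (i1 > 0 & i1 <= 2 & i2 <= 4 & female == 0) ~ "moderate",
    ((i1 > 1 | i2 > 3) & female == 1) |
      ((i1 > 2 | i2 > 4) & female == 0) ~ "highrisk"))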

library(mosaic)
detach(package:memisc) # detach memisc and MASS so their functions do not mask dplyr's (e.g., MASS::select masks dplyr::select)
detach(package:MASS)

select, filter

library(dplyr)
tmpds = select(newds, i1, i2, female, drinkstat) # keep four variables from newds in a new object, tmpds
tmpds[365:370,] # show rows 365 to 370 of tmpds
    i1 i2 female drinkstat
365  6 24      0  highrisk
366  6  6      0  highrisk
367  0  0      0 abstinent
368  0  0      1 abstinent
369  8  8      0  highrisk
370 32 32      0  highrisk
library(dplyr)
filter(tmpds, drinkstat=="moderate" & female==1) # keep the rows of tmpds where drinkstat is moderate and female is 1
  i1 i2 female drinkstat
1  1  1      1  moderate
2  1  3      1  moderate
3  1  2      1  moderate
4  1  1      1  moderate
5  1  1      1  moderate
6  1  1      1  moderate
7  1  1      1  moderate

create table

# CrossTable
library(gmodels)
with(tmpds, CrossTable(drinkstat)) # show the count and proportion of each drinkstat group

 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  453 

 
          | abstinent |  moderate |  highrisk | 
          |-----------|-----------|-----------|
          |        68 |        28 |       357 | 
          |     0.150 |     0.062 |     0.788 | 
          |-----------|-----------|-----------|



 
with(tmpds, CrossTable(drinkstat, female, 
  prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE)) # contingency table of drinkstat by female, showing counts and row proportions only

 
   Cell Contents
|-------------------------|
|                       N |
|           N / Row Total |
|-------------------------|

 
Total Observations in Table:  453 

 
             | female 
   drinkstat |         0 |         1 | Row Total | 
-------------|-----------|-----------|-----------|
   abstinent |        42 |        26 |        68 | 
             |     0.618 |     0.382 |     0.150 | 
-------------|-----------|-----------|-----------|
    moderate |        21 |         7 |        28 | 
             |     0.750 |     0.250 |     0.062 | 
-------------|-----------|-----------|-----------|
    highrisk |       283 |        74 |       357 | 
             |     0.793 |     0.207 |     0.788 | 
-------------|-----------|-----------|-----------|
Column Total |       346 |       107 |       453 | 
-------------|-----------|-----------|-----------|

 
# add a gender factor that recodes female: 0 becomes Male, 1 becomes Female
newds = transform(newds, 
  gender=factor(female, c(0,1), c("Male","Female"))) 
# cross-tabulate female against gender with tally to confirm the recoding
tally(~ female + gender, margin=FALSE, data=newds)
      gender
female Male Female
     0  346      0
     1    0    107

arrange

# re-sort the data
library(dplyr)
newds = arrange(ds, cesd, i1) # sort ds in ascending order, first by cesd and then by i1
newds[1:5, c("cesd", "i1", "id")] # show the first five rows of cesd, i1, and id in newds
  cesd i1  id
1    1  3 233
2    3  1 139
3    3 13 418
4    4  4 251
5    4  9  95

different approaches to calculate the mean of cesd by female

#
library(dplyr)
females = filter(ds, female==1) # keep the rows of ds with female == 1 in a new object, females
with(females, mean(cesd)) # mean of cesd in the females object
[1] 36.9
# an alternative approach
mean(ds$cesd[ds$female==1]) # index ds$cesd by ds$female == 1 to pull out the women's cesd values, then take the mean
[1] 36.9
with(ds, tapply(cesd, female, mean)) # tapply computes the mean of cesd within each level of female
   0    1 
31.6 36.9 
library(mosaic)
mean(cesd ~ female, data=ds) # mosaic's formula interface: mean of cesd for each value of female
   0    1 
31.6 36.9
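One more equivalent way (my addition), using dplyr's group_by() and summarize():

ds %>%
  group_by(female) %>%
  summarize(mean_cesd = mean(cesd))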