library(dplyr)
library(foreign)
library(mosaic)
library(gmodels)
# output 小數點到第幾位(3=小數點後兩位)v
options(digits=3) 
# narrow output 
options(width=72) 
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv")
# 選擇這些變項
newds = select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, 
  f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)
# 顯示名字
names(newds)
##  [1] "cesd"   "female" "i1"     "i2"     "id"     "treat"  "f1a"   
##  [8] "f1b"    "f1c"    "f1d"    "f1e"    "f1f"    "f1g"    "f1h"   
## [15] "f1i"    "f1j"    "f1k"    "f1l"    "f1m"    "f1n"    "f1o"   
## [22] "f1p"    "f1q"    "f1r"    "f1s"    "f1t"
# structure of the first 10 variables
str(newds[,1:10]) 
## 'data.frame':    453 obs. of  10 variables:
##  $ cesd  : int  49 30 39 15 39 6 52 32 50 46 ...
##  $ female: int  0 0 0 1 0 1 1 0 1 0 ...
##  $ i1    : int  13 56 0 5 10 4 13 12 71 20 ...
##  $ i2    : int  26 62 0 5 13 4 20 24 129 27 ...
##  $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ treat : int  1 1 0 0 0 1 0 1 0 1 ...
##  $ f1a   : int  3 3 3 0 3 1 3 1 3 2 ...
##  $ f1b   : int  2 2 2 0 0 0 1 1 2 3 ...
##  $ f1c   : int  3 0 3 1 3 1 3 2 3 3 ...
##  $ f1d   : int  0 3 0 3 3 3 1 3 1 0 ...
# summary of the first 10 variables
summary(newds[,1:10]) 
##       cesd          female            i1              i2       
##  Min.   : 1.0   Min.   :0.000   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:25.0   1st Qu.:0.000   1st Qu.:  3.0   1st Qu.:  3.0  
##  Median :34.0   Median :0.000   Median : 13.0   Median : 15.0  
##  Mean   :32.8   Mean   :0.236   Mean   : 17.9   Mean   : 22.6  
##  3rd Qu.:41.0   3rd Qu.:0.000   3rd Qu.: 26.0   3rd Qu.: 32.0  
##  Max.   :60.0   Max.   :1.000   Max.   :142.0   Max.   :184.0  
##        id          treat            f1a            f1b      
##  Min.   :  1   Min.   :0.000   Min.   :0.00   Min.   :0.00  
##  1st Qu.:119   1st Qu.:0.000   1st Qu.:1.00   1st Qu.:0.00  
##  Median :233   Median :0.000   Median :2.00   Median :1.00  
##  Mean   :233   Mean   :0.497   Mean   :1.63   Mean   :1.39  
##  3rd Qu.:348   3rd Qu.:1.000   3rd Qu.:3.00   3rd Qu.:2.00  
##  Max.   :470   Max.   :1.000   Max.   :3.00   Max.   :3.00  
##       f1c            f1d      
##  Min.   :0.00   Min.   :0.00  
##  1st Qu.:1.00   1st Qu.:0.00  
##  Median :2.00   Median :1.00  
##  Mean   :1.92   Mean   :1.56  
##  3rd Qu.:3.00   3rd Qu.:3.00  
##  Max.   :3.00   Max.   :3.00
# output the first 3 rows
head(newds, n=3)
# set comment
comment(newds) = "HELP baseline dataset"
comment(newds)
## [1] "HELP baseline dataset"
# save file
save(ds, file="savedfile")
# write file as csv
write.csv(ds, file="ds.csv")
# write file as .dat & .sas
write.foreign(newds, "file.dat", "file.sas", package="SAS")
# output the first 10 newds$cesd data
with(newds, cesd[1:10])
##  [1] 49 30 39 15 39  6 52 32 50 46
with(newds, head(cesd, 10))
##  [1] 49 30 39 15 39  6 52 32 50 46
# output the cesd data which newds$cesd > 56
with(newds, cesd[cesd > 56])
## [1] 57 58 57 60 58 58 57
# 篩選cesd > 56的資料,只取id, cesd做輸出
filter(newds, cesd > 56) %>% select(id, cesd)
# 那出前四筆cesd資料且排序
with(newds, sort(cesd)[1:4])
## [1] 1 3 3 4
# 第幾行的資料有cesd最小值
with(newds, which.min(cesd))
## [1] 199
# count f1g是否遺失
tally(~ is.na(f1g), data=newds)
## is.na(f1g)
##  TRUE FALSE 
##     1   452
# output flg 的統計值
favstats(~ f1g, data=newds)
# reverse code f1d, f1h, f1l and f1p
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g, 
   (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p), 
   f1q, f1r, f1s, f1t))
# 每一row去加總有NA的cesditems
nmisscesd = apply(is.na(cesditems), 1, sum)
ncesditems = cesditems
# 把NA設成0
ncesditems[is.na(cesditems)] = 0
# 每一row去加總所有columns
newcesd = apply(ncesditems, 1, sum)
imputemeancesd = 20/(20-nmisscesd)*newcesd
# 選出cesditems不是NA的列轉換成dataframe
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]
library(dplyr)
library(memisc)
# 分類
newds = mutate(newds, drinkstat= 
  cases(
    "abstinent" = i1==0,
    "moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
               (i1>0 & i1<=2 & i2<=4 & female==0),
    "highrisk" = ((i1>1 | i2>3) & female==1) |
               ((i1>2 | i2>4) & female==0)))
# 選取某些變項,輸出 365~370 rows的資料
tmpds = select(newds, i1, i2, female, drinkstat)
tmpds[365:370,]
# 篩選
filter(tmpds, drinkstat=="moderate" & female==1)
# print出總數以及比例
with(tmpds, CrossTable(drinkstat))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##           | abstinent |  moderate |  highrisk | 
##           |-----------|-----------|-----------|
##           |        68 |        28 |       357 | 
##           |     0.150 |     0.062 |     0.788 | 
##           |-----------|-----------|-----------|
## 
## 
## 
## 
with(tmpds, CrossTable(drinkstat, female, 
  prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##              | female 
##    drinkstat |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##    abstinent |        42 |        26 |        68 | 
##              |     0.618 |     0.382 |     0.150 | 
## -------------|-----------|-----------|-----------|
##     moderate |        21 |         7 |        28 | 
##              |     0.750 |     0.250 |     0.062 | 
## -------------|-----------|-----------|-----------|
##     highrisk |       283 |        74 |       357 | 
##              |     0.793 |     0.207 |     0.788 | 
## -------------|-----------|-----------|-----------|
## Column Total |       346 |       107 |       453 | 
## -------------|-----------|-----------|-----------|
## 
## 
# 
newds = transform(newds, 
  gender=factor(female, c(0,1), c("Male","Female")))
tally(~ female + gender, margin=FALSE, data=newds)
##       gender
## female Male Female
##      0  346      0
##      1    0    107
# 以cesd, i1兩變項排序,選前五筆資料的c("cesd", "i1", "id")來看
newds = arrange(ds, cesd, i1)
newds[1:5, c("cesd", "i1", "id")]
# 算female==1時的cesd平均
females = filter(ds, female==1)
with(females, mean(cesd))
## [1] 36.9
# an alternative approach
mean(ds$cesd[ds$female==1])
## [1] 36.9
# 以female分組去看cesd的mean
with(ds, tapply(cesd, female, mean))
##    0    1 
## 31.6 36.9
# an alternative approach
mean(cesd ~ female, data=ds)
##    0    1 
## 31.6 36.9