library(dplyr)
library(foreign)
library(mosaic)
library(gmodels)
# output 小數點到第幾位(3=小數點後兩位)v
options(digits=3)
# narrow output
options(width=72)
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv")
# 選擇這些變項
newds = select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e,
f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)
# 顯示名字
names(newds)
## [1] "cesd" "female" "i1" "i2" "id" "treat" "f1a"
## [8] "f1b" "f1c" "f1d" "f1e" "f1f" "f1g" "f1h"
## [15] "f1i" "f1j" "f1k" "f1l" "f1m" "f1n" "f1o"
## [22] "f1p" "f1q" "f1r" "f1s" "f1t"
# structure of the first 10 variables
str(newds[,1:10])
## 'data.frame': 453 obs. of 10 variables:
## $ cesd : int 49 30 39 15 39 6 52 32 50 46 ...
## $ female: int 0 0 0 1 0 1 1 0 1 0 ...
## $ i1 : int 13 56 0 5 10 4 13 12 71 20 ...
## $ i2 : int 26 62 0 5 13 4 20 24 129 27 ...
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ treat : int 1 1 0 0 0 1 0 1 0 1 ...
## $ f1a : int 3 3 3 0 3 1 3 1 3 2 ...
## $ f1b : int 2 2 2 0 0 0 1 1 2 3 ...
## $ f1c : int 3 0 3 1 3 1 3 2 3 3 ...
## $ f1d : int 0 3 0 3 3 3 1 3 1 0 ...
# summary of the first 10 variables
summary(newds[,1:10])
## cesd female i1 i2
## Min. : 1.0 Min. :0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:25.0 1st Qu.:0.000 1st Qu.: 3.0 1st Qu.: 3.0
## Median :34.0 Median :0.000 Median : 13.0 Median : 15.0
## Mean :32.8 Mean :0.236 Mean : 17.9 Mean : 22.6
## 3rd Qu.:41.0 3rd Qu.:0.000 3rd Qu.: 26.0 3rd Qu.: 32.0
## Max. :60.0 Max. :1.000 Max. :142.0 Max. :184.0
## id treat f1a f1b
## Min. : 1 Min. :0.000 Min. :0.00 Min. :0.00
## 1st Qu.:119 1st Qu.:0.000 1st Qu.:1.00 1st Qu.:0.00
## Median :233 Median :0.000 Median :2.00 Median :1.00
## Mean :233 Mean :0.497 Mean :1.63 Mean :1.39
## 3rd Qu.:348 3rd Qu.:1.000 3rd Qu.:3.00 3rd Qu.:2.00
## Max. :470 Max. :1.000 Max. :3.00 Max. :3.00
## f1c f1d
## Min. :0.00 Min. :0.00
## 1st Qu.:1.00 1st Qu.:0.00
## Median :2.00 Median :1.00
## Mean :1.92 Mean :1.56
## 3rd Qu.:3.00 3rd Qu.:3.00
## Max. :3.00 Max. :3.00
# output the first 3 rows
head(newds, n=3)
# set comment
comment(newds) = "HELP baseline dataset"
comment(newds)
## [1] "HELP baseline dataset"
# save file
save(ds, file="savedfile")
# write file as csv
write.csv(ds, file="ds.csv")
# write file as .dat & .sas
write.foreign(newds, "file.dat", "file.sas", package="SAS")
# output the first 10 newds$cesd data
with(newds, cesd[1:10])
## [1] 49 30 39 15 39 6 52 32 50 46
with(newds, head(cesd, 10))
## [1] 49 30 39 15 39 6 52 32 50 46
# output the cesd data which newds$cesd > 56
with(newds, cesd[cesd > 56])
## [1] 57 58 57 60 58 58 57
# 篩選cesd > 56的資料,只取id, cesd做輸出
filter(newds, cesd > 56) %>% select(id, cesd)
# 那出前四筆cesd資料且排序
with(newds, sort(cesd)[1:4])
## [1] 1 3 3 4
# 第幾行的資料有cesd最小值
with(newds, which.min(cesd))
## [1] 199
# count f1g是否遺失
tally(~ is.na(f1g), data=newds)
## is.na(f1g)
## TRUE FALSE
## 1 452
# output flg 的統計值
favstats(~ f1g, data=newds)
# reverse code f1d, f1h, f1l and f1p
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g,
(3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p),
f1q, f1r, f1s, f1t))
# 每一row去加總有NA的cesditems
nmisscesd = apply(is.na(cesditems), 1, sum)
ncesditems = cesditems
# 把NA設成0
ncesditems[is.na(cesditems)] = 0
# 每一row去加總所有columns
newcesd = apply(ncesditems, 1, sum)
imputemeancesd = 20/(20-nmisscesd)*newcesd
# 選出cesditems不是NA的列轉換成dataframe
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]
library(dplyr)
library(memisc)
# 分類
newds = mutate(newds, drinkstat=
cases(
"abstinent" = i1==0,
"moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
(i1>0 & i1<=2 & i2<=4 & female==0),
"highrisk" = ((i1>1 | i2>3) & female==1) |
((i1>2 | i2>4) & female==0)))
# 選取某些變項,輸出 365~370 rows的資料
tmpds = select(newds, i1, i2, female, drinkstat)
tmpds[365:370,]
# 篩選
filter(tmpds, drinkstat=="moderate" & female==1)
# print出總數以及比例
with(tmpds, CrossTable(drinkstat))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | abstinent | moderate | highrisk |
## |-----------|-----------|-----------|
## | 68 | 28 | 357 |
## | 0.150 | 0.062 | 0.788 |
## |-----------|-----------|-----------|
##
##
##
##
with(tmpds, CrossTable(drinkstat, female,
prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | female
## drinkstat | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## abstinent | 42 | 26 | 68 |
## | 0.618 | 0.382 | 0.150 |
## -------------|-----------|-----------|-----------|
## moderate | 21 | 7 | 28 |
## | 0.750 | 0.250 | 0.062 |
## -------------|-----------|-----------|-----------|
## highrisk | 283 | 74 | 357 |
## | 0.793 | 0.207 | 0.788 |
## -------------|-----------|-----------|-----------|
## Column Total | 346 | 107 | 453 |
## -------------|-----------|-----------|-----------|
##
##
#
newds = transform(newds,
gender=factor(female, c(0,1), c("Male","Female")))
tally(~ female + gender, margin=FALSE, data=newds)
## gender
## female Male Female
## 0 346 0
## 1 0 107
# 以cesd, i1兩變項排序,選前五筆資料的c("cesd", "i1", "id")來看
newds = arrange(ds, cesd, i1)
newds[1:5, c("cesd", "i1", "id")]
# 算female==1時的cesd平均
females = filter(ds, female==1)
with(females, mean(cesd))
## [1] 36.9
# an alternative approach
mean(ds$cesd[ds$female==1])
## [1] 36.9
# 以female分組去看cesd的mean
with(ds, tapply(cesd, female, mean))
## 0 1
## 31.6 36.9
# an alternative approach
mean(cesd ~ female, data=ds)
## 0 1
## 31.6 36.9