# Exercise 5

# Attach dplyr for the data-manipulation verbs used in this script.
library(dplyr)
# Read the HELP data from the local CSV file.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
ds <- read.csv(file = "C:\\Users\\shabby\\Desktop\\DataManagement\\week6\\help.csv")
## 
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep the six analysis variables plus the 20 items f1a ... f1t.
# The item names are generated rather than typed out one by one.
cesd_item_names <- paste0("f1", letters[1:20])
newds <- dplyr::select(
  ds,
  cesd, female, i1, i2, id, treat,
  dplyr::all_of(cesd_item_names)
)

## ------------------------------------------------------------------------
# List the variable (column) names of newds.
colnames(newds)
##  [1] "cesd"   "female" "i1"     "i2"     "id"     "treat"  "f1a"    "f1b"   
##  [9] "f1c"    "f1d"    "f1e"    "f1f"    "f1g"    "f1h"    "f1i"    "f1j"   
## [17] "f1k"    "f1l"    "f1m"    "f1n"    "f1o"    "f1p"    "f1q"    "f1r"   
## [25] "f1s"    "f1t"
# Show the structure (type and leading values) of the first 10 variables.
str(newds[, seq_len(10)])
## 'data.frame':    453 obs. of  10 variables:
##  $ cesd  : int  49 30 39 15 39 6 52 32 50 46 ...
##  $ female: int  0 0 0 1 0 1 1 0 1 0 ...
##  $ i1    : int  13 56 0 5 10 4 13 12 71 20 ...
##  $ i2    : int  26 62 0 5 13 4 20 24 129 27 ...
##  $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ treat : int  1 1 0 0 0 1 0 1 0 1 ...
##  $ f1a   : int  3 3 3 0 3 1 3 1 3 2 ...
##  $ f1b   : int  2 2 2 0 0 0 1 1 2 3 ...
##  $ f1c   : int  3 0 3 1 3 1 3 2 3 3 ...
##  $ f1d   : int  0 3 0 3 3 3 1 3 1 0 ...
## ------------------------------------------------------------------------
# Summary statistics for the first 10 variables (columns, not records).
summary(newds[seq_len(10)])
##       cesd           female             i1               i2        
##  Min.   : 1.00   Min.   :0.0000   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:25.00   1st Qu.:0.0000   1st Qu.:  3.00   1st Qu.:  3.00  
##  Median :34.00   Median :0.0000   Median : 13.00   Median : 15.00  
##  Mean   :32.85   Mean   :0.2362   Mean   : 17.91   Mean   : 22.65  
##  3rd Qu.:41.00   3rd Qu.:0.0000   3rd Qu.: 26.00   3rd Qu.: 32.00  
##  Max.   :60.00   Max.   :1.0000   Max.   :142.00   Max.   :184.00  
##        id            treat             f1a             f1b       
##  Min.   :  1.0   Min.   :0.0000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:119.0   1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.000  
##  Median :233.0   Median :0.0000   Median :2.000   Median :1.000  
##  Mean   :233.4   Mean   :0.4967   Mean   :1.634   Mean   :1.391  
##  3rd Qu.:348.0   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:2.000  
##  Max.   :470.0   Max.   :1.0000   Max.   :3.000   Max.   :3.000  
##       f1c             f1d       
##  Min.   :0.000   Min.   :0.000  
##  1st Qu.:1.000   1st Qu.:0.000  
##  Median :2.000   Median :1.000  
##  Mean   :1.923   Mean   :1.565  
##  3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :3.000   Max.   :3.000
## ------------------------------------------------------------------------
# Display the first three records of newds.
head(x = newds, n = 3L)
##   cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j f1k f1l
## 1   49      0 13 26  1     1   3   2   3   0   2   3   3   0   2   3   3   0
## 2   30      0 56 62  2     1   3   2   0   3   3   2   0   0   3   0   3   0
## 3   39      0  0  0  3     0   3   2   3   0   2   2   1   3   2   3   1   0
##   f1m f1n f1o f1p f1q f1r f1s f1t
## 1   1   2   2   2   2   3   3   2
## 2   0   3   0   0   0   2   0   0
## 3   1   3   2   0   0   3   2   0
## ------------------------------------------------------------------------
# Attach a descriptive comment attribute to newds, then read it back.
comment(newds) <- "HELP baseline dataset"
comment(newds)
## [1] "HELP baseline dataset"
# Store the object ds in the R data file "savedfile".
# NOTE(review): the comment above was set on newds, but ds is what gets saved.
save(list = "ds", file = "savedfile")

## ------------------------------------------------------------------------
# Export ds as a CSV file in the working directory.
write.csv(x = ds, file = "ds.csv")

## ------------------------------------------------------------------------
library(foreign)
# Export newds in a form SAS can read: a data file ("file.dat") plus a
# SAS code file ("file.sas").
write.foreign(
  df = newds,
  datafile = "file.dat",
  codefile = "file.sas",
  package = "SAS"
)
## ------------------------------------------------------------------------
# First 10 values of cesd, picked by position inside with().
with(newds, cesd[seq_len(10)])
##  [1] 49 30 39 15 39  6 52 32 50 46
# The same first 10 values, this time obtained with head().
with(newds, head(cesd, n = 10))
##  [1] 49 30 39 15 39  6 52 32 50 46
## ------------------------------------------------------------------------
# Values of cesd that are greater than 56.
newds$cesd[newds$cesd > 56]
## [1] 57 58 57 60 58 58 57
## ------------------------------------------------------------------------
library(dplyr)
# Rows with cesd > 56, showing id next to cesd.
# The verbs are namespace-qualified because select() and filter() are also
# exported by other packages (MASS::select once memisc is loaded later in
# this script, stats::filter), so an unqualified call depends on load order.
newds %>%
  dplyr::filter(cesd > 56) %>%
  dplyr::select(id, cesd)
##    id cesd
## 1  71   57
## 2 127   58
## 3 200   57
## 4 228   60
## 5 273   58
## 6 351   58
## 7  13   57
## ------------------------------------------------------------------------
# sort() orders cesd from low to high; the first four are the smallest values.
with(newds, sort(cesd)[seq_len(4)])
## [1] 1 3 3 4
# Position (row index) of the smallest cesd value.
which.min(newds$cesd)
## [1] 199
## ------------------------------------------------------------------------
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## 载入程辑包:'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
# Count how many f1g values are missing (TRUE) versus observed (FALSE).
tally(~ is.na(f1g), data = newds)
## is.na(f1g)
##  TRUE FALSE 
##     1   452
# Summary statistics for f1g (min, quartiles, max, mean, sd, n and the
# number of missing values) -- not a sum.
favstats(~ f1g, data = newds)
##  min Q1 median Q3 max     mean       sd   n missing
##    0  1      2  3   3 1.730088 1.095314 452       1
## ------------------------------------------------------------------------
# Build a matrix with one column per CES-D item (f1a ... f1t).
# Items f1d, f1h, f1l and f1p are reverse coded: the items appear to be scored
# 0-3 (see the summary output above), so `3 - x` flips the scale
# (0 <-> 3, 1 <-> 2) without adding any columns.
cesditems <- with(newds, cbind(
  f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g,
  (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p),
  f1q, f1r, f1s, f1t
))

# Number of items in the scale, taken from the matrix instead of hard-coding 20.
n_items <- ncol(cesditems)

# Per-subject (row-wise) count of missing items.
nmisscesd <- rowSums(is.na(cesditems))

# Copy of the item matrix in which missing items are scored as 0.
ncesditems <- cesditems
ncesditems[is.na(cesditems)] <- 0

# Per-subject total score over the zero-filled items.
newcesd <- rowSums(ncesditems)

# Mean-imputed total: scale the observed total up by
# (number of items) / (number of answered items), i.e. each missing item is
# replaced by the subject's mean over the answered items.
# A subject with every item missing yields NaN (0 / 0).
imputemeancesd <- n_items / (n_items - nmisscesd) * newcesd

## ------------------------------------------------------------------------
# For subjects with at least one missing item, show the recomputed total,
# the original cesd, the number of missing items and the imputed total.
data.frame(
  newcesd,
  newds$cesd,
  nmisscesd,
  imputemeancesd
)[nmisscesd > 0, ]
##     newcesd newds.cesd nmisscesd imputemeancesd
## 4        15         15         1       15.78947
## 17       19         19         1       20.00000
## 87       44         44         1       46.31579
## 101      17         17         1       17.89474
## 154      29         29         1       30.52632
## 177      44         44         1       46.31579
## 229      39         39         1       41.05263
#比較cesd分數,不關注缺失值

## ----createdrink,message=FALSE-------------------------------------------
library(dplyr)
library(memisc)
## 载入需要的程辑包:MASS
## 
## 载入程辑包:'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 载入程辑包:'memisc'
## The following object is masked from 'package:Matrix':
## 
##     as.array
## The following object is masked from 'package:ggplot2':
## 
##     syms
## The following objects are masked from 'package:dplyr':
## 
##     collect, recode, rename, syms
## The following objects are masked from 'package:stats':
## 
##     contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
## 
##     as.array
# Add drinkstat, a three-level classification built with memisc's cases():
#   abstinent: i1 is 0
#   moderate : i1 above 0 and within the sex-specific limits
#              (female == 1: i1 <= 1 and i2 <= 3; female == 0: i1 <= 2 and i2 <= 4)
#   highrisk : either sex-specific limit is exceeded
newds <- mutate(
  newds,
  drinkstat = cases(
    "abstinent" = i1 == 0,
    "moderate" = (i1 > 0 & i1 <= 1 & i2 <= 3 & female == 1) |
      (i1 > 0 & i1 <= 2 & i2 <= 4 & female == 0),
    "highrisk" = ((i1 > 1 | i2 > 3) & female == 1) |
      ((i1 > 2 | i2 > 4) & female == 0)
  )
)

## ----echo=FALSE----------------------------------------------------------
library(mosaic)

## ----echo=FALSE----------------------------------------------------------
# Detach memisc and MASS again; while attached they mask several dplyr
# functions (see the masking messages printed when they were loaded).
detach("package:memisc", character.only = TRUE)
detach("package:MASS", character.only = TRUE)

## ------------------------------------------------------------------------
library(dplyr)
# Working copy holding only i1, i2, female and drinkstat from newds.
tmpds <- newds %>%
  dplyr::select(i1, i2, female, drinkstat)
# Show rows 365 to 370.
tmpds[365:370, ]
##     i1 i2 female drinkstat
## 365  6 24      0  highrisk
## 366  6  6      0  highrisk
## 367  0  0      0 abstinent
## 368  0  0      1 abstinent
## 369  8  8      0  highrisk
## 370 32 32      0  highrisk
## ------------------------------------------------------------------------
library(dplyr)
# Rows of tmpds where drinkstat is "moderate" and female is 1
# (comma-separated conditions in filter() are combined with AND).
filter(tmpds, drinkstat == "moderate", female == 1)
##   i1 i2 female drinkstat
## 1  1  1      1  moderate
## 2  1  3      1  moderate
## 3  1  2      1  moderate
## 4  1  1      1  moderate
## 5  1  1      1  moderate
## 6  1  1      1  moderate
## 7  1  1      1  moderate
#從tmpds 中選出drinkstat=moderate , female=1的列
## ----message=FALSE-------------------------------------------------------
library(gmodels)
# One-way table of drinkstat: count and proportion of the total per category.
with(
  tmpds,
  CrossTable(drinkstat)
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##           | abstinent |  moderate |  highrisk | 
##           |-----------|-----------|-----------|
##           |        68 |        28 |       357 | 
##           |     0.150 |     0.062 |     0.788 | 
##           |-----------|-----------|-----------|
## 
## 
## 
## 
#顯示 drinkstat 三組的數量與比重
## ------------------------------------------------------------------------
# Two-way table of drinkstat by female with the table, column and
# chi-square proportions switched off.
with(
  tmpds,
  CrossTable(
    drinkstat, female,
    prop.t = FALSE,
    prop.c = FALSE,
    prop.chisq = FALSE
  )
)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##              | female 
##    drinkstat |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##    abstinent |        42 |        26 |        68 | 
##              |     0.618 |     0.382 |     0.150 | 
## -------------|-----------|-----------|-----------|
##     moderate |        21 |         7 |        28 | 
##              |     0.750 |     0.250 |     0.062 | 
## -------------|-----------|-----------|-----------|
##     highrisk |       283 |        74 |       357 | 
##              |     0.793 |     0.207 |     0.788 | 
## -------------|-----------|-----------|-----------|
## Column Total |       346 |       107 |       453 | 
## -------------|-----------|-----------|-----------|
## 
## 
#顯示 drinkstat 和 female 變項的列連表
## ------------------------------------------------------------------------
# Derive a labelled factor gender from the 0/1 indicator female
# (0 -> "Male", 1 -> "Female").
newds <- transform(
  newds,
  gender = factor(female, levels = c(0, 1), labels = c("Male", "Female"))
)
# Cross-tabulate the original and derived variables to check the recoding.
# The argument is spelled out as `margins`; the previous `margin` only worked
# through partial argument matching.
tally(~ female + gender, margins = FALSE, data = newds)
##       gender
## female Male Female
##      0  346      0
##      1    0    107
#生成gender變項,以原有的變項進行轉化

## ------------------------------------------------------------------------
library(dplyr)
# Sort ds in ascending order by cesd, breaking ties by i1.
# NOTE(review): this replaces newds with the sorted ds, so the columns added
# to newds earlier (drinkstat, gender) are no longer in it.
newds <- ds %>%
  arrange(cesd, i1)
# First five rows of the sorted data, showing cesd, i1 and id.
newds[seq_len(5), c("cesd", "i1", "id")]
##   cesd i1  id
## 1    1  3 233
## 2    3  1 139
## 3    3 13 418
## 4    4  4 251
## 5    4  9  95
#顯示 newds 中,變項 cesd、i1 和 id 的前五筆資料
## ------------------------------------------------------------------------
library(dplyr)
# Keep only the rows of ds with female equal to 1.
females <- dplyr::filter(ds, female == 1)
# Mean cesd within that subset.
with(females, mean(cesd))
## [1] 36.88785
# An alternative approach: subset cesd directly with a logical condition.
mean(ds$cesd[ds$female == 1])
## [1] 36.88785
## ------------------------------------------------------------------------
# Mean cesd for each value of female, using base R tapply().
tapply(ds$cesd, ds$female, mean)
##        0        1 
## 31.59827 36.88785
library(mosaic)
# The same group means through mosaic's formula interface to mean().
mean(
  cesd ~ female,
  data = ds
)
##        0        1 
## 31.59827 36.88785
#兩者都是按性別計算cesd的平均數