Data wrangling: Homework 6

讀取資料，選取其中 10 項欄位製成新資料

options(digits=3)
options(width=72) # narrow output 
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv")
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
newds = dplyr::select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)

讀取這 10 項變量名稱及檢查其資料結構

names(newds)
#>  [1] "cesd"   "female" "i1"     "i2"     "id"     "treat"  "f1a"   
#>  [8] "f1b"    "f1c"    "f1d"    "f1e"    "f1f"    "f1g"    "f1h"   
#> [15] "f1i"    "f1j"    "f1k"    "f1l"    "f1m"    "f1n"    "f1o"   
#> [22] "f1p"    "f1q"    "f1r"    "f1s"    "f1t"
str(newds[,1:10]) # structure of the first 10 variables
#> 'data.frame':    453 obs. of  10 variables:
#>  $ cesd  : int  49 30 39 15 39 6 52 32 50 46 ...
#>  $ female: int  0 0 0 1 0 1 1 0 1 0 ...
#>  $ i1    : int  13 56 0 5 10 4 13 12 71 20 ...
#>  $ i2    : int  26 62 0 5 13 4 20 24 129 27 ...
#>  $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
#>  $ treat : int  1 1 0 0 0 1 0 1 0 1 ...
#>  $ f1a   : int  3 3 3 0 3 1 3 1 3 2 ...
#>  $ f1b   : int  2 2 2 0 0 0 1 1 2 3 ...
#>  $ f1c   : int  3 0 3 1 3 1 3 2 3 3 ...
#>  $ f1d   : int  0 3 0 3 3 3 1 3 1 0 ...

基礎統計這 10 項變量

summary(newds[,1:10]) # summary of the first 10 variables
#>       cesd          female            i1              i2       
#>  Min.   : 1.0   Min.   :0.000   Min.   :  0.0   Min.   :  0.0  
#>  1st Qu.:25.0   1st Qu.:0.000   1st Qu.:  3.0   1st Qu.:  3.0  
#>  Median :34.0   Median :0.000   Median : 13.0   Median : 15.0  
#>  Mean   :32.8   Mean   :0.236   Mean   : 17.9   Mean   : 22.6  
#>  3rd Qu.:41.0   3rd Qu.:0.000   3rd Qu.: 26.0   3rd Qu.: 32.0  
#>  Max.   :60.0   Max.   :1.000   Max.   :142.0   Max.   :184.0  
#>        id          treat            f1a            f1b      
#>  Min.   :  1   Min.   :0.000   Min.   :0.00   Min.   :0.00  
#>  1st Qu.:119   1st Qu.:0.000   1st Qu.:1.00   1st Qu.:0.00  
#>  Median :233   Median :0.000   Median :2.00   Median :1.00  
#>  Mean   :233   Mean   :0.497   Mean   :1.63   Mean   :1.39  
#>  3rd Qu.:348   3rd Qu.:1.000   3rd Qu.:3.00   3rd Qu.:2.00  
#>  Max.   :470   Max.   :1.000   Max.   :3.00   Max.   :3.00  
#>       f1c            f1d      
#>  Min.   :0.00   Min.   :0.00  
#>  1st Qu.:1.00   1st Qu.:0.00  
#>  Median :2.00   Median :1.00  
#>  Mean   :1.92   Mean   :1.56  
#>  3rd Qu.:3.00   3rd Qu.:3.00  
#>  Max.   :3.00   Max.   :3.00

瀏覽這組資料的前三筆觀察值

head(newds, n=3)
#>   cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
#> 1   49      0 13 26  1     1   3   2   3   0   2   3   3   0   2   3
#> 2   30      0 56 62  2     1   3   2   0   3   3   2   0   0   3   0
#> 3   39      0  0  0  3     0   3   2   3   0   2   2   1   3   2   3
#>   f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
#> 1   3   0   1   2   2   2   2   3   3   2
#> 2   3   0   0   3   0   0   0   2   0   0
#> 3   1   0   1   3   2   0   0   3   2   0

替 newds 這筆資料加上註解，並將原先的 ds 資料另外儲存

comment(newds) = "HELP baseline dataset"
comment(newds)
#> [1] "HELP baseline dataset"
save(ds, file="savedfile")

將 ds 資料轉寫成 .csv 檔

write.csv(ds, file="ds.csv")

將 ds 檔案轉寫成 SAS 軟體可讀的檔案類型

library(foreign)
write.foreign(newds, "file.dat", "file.sas", package="SAS")

篩選出 newds 資料中 cesd 欄位的 1 到 10 筆資料

with(newds, cesd[1:10])
#>  [1] 49 30 39 15 39  6 52 32 50 46
with(newds, head(cesd, 10))
#>  [1] 49 30 39 15 39  6 52 32 50 46

篩選出 newsds 資料中 cesd 欄位數值大於 56 的資料

with(newds, cesd[cesd > 56])
#> [1] 57 58 57 60 58 58 57

使用 Tidy 做一樣的篩選，並指定呈現 id 和 cesd 兩個欄位的資料

library(dplyr)
filter(newds, cesd > 56) %>% select(id, cesd)
#>    id cesd
#> 1  71   57
#> 2 127   58
#> 3 200   57
#> 4 228   60
#> 5 273   58
#> 6 351   58
#> 7  13   57

排序 cesd 資料，由小到大，列出最小的前 4 個，另外尋找最小值在真正順序的第 199 個

with(newds, sort(cesd)[1:4])
#> [1] 1 3 3 4
with(newds, which.min(cesd))
#> [1] 199

使用 Tidy 尋找 newds 中 flg 欄位是否有遺漏值，以及針對 flg 進行 summary

library(mosaic)
#> Loading required package: lattice
#> Loading required package: ggformula
#> Loading required package: ggplot2
#> Loading required package: ggstance
#> 
#> Attaching package: 'ggstance'
#> The following objects are masked from 'package:ggplot2':
#> 
#>     geom_errorbarh, GeomErrorbarh
#> 
#> New to ggformula?  Try the tutorials: 
#>  learnr::run_tutorial("introduction", package = "ggformula")
#>  learnr::run_tutorial("refining", package = "ggformula")
#> Loading required package: mosaicData
#> Loading required package: Matrix
#> Registered S3 method overwritten by 'mosaic':
#>   method                           from   
#>   fortify.SpatialPolygonsDataFrame ggplot2
#> 
#> The 'mosaic' package masks several functions from core packages in order to add 
#> additional features.  The original behavior of these functions should not be affected by this.
#> 
#> Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
#> 
#> Have you tried the ggformula package for your plots?
#> 
#> Attaching package: 'mosaic'
#> The following object is masked from 'package:Matrix':
#> 
#>     mean
#> The following object is masked from 'package:ggplot2':
#> 
#>     stat
#> The following objects are masked from 'package:dplyr':
#> 
#>     count, do, tally
#> The following objects are masked from 'package:stats':
#> 
#>     binom.test, cor, cor.test, cov, fivenum, IQR, median,
#>     prop.test, quantile, sd, t.test, var
#> The following objects are masked from 'package:base':
#> 
#>     max, mean, min, prod, range, sample, sum
tally(~ is.na(f1g), data=newds)
#> is.na(f1g)
#>  TRUE FALSE 
#>     1   452
favstats(~ f1g, data=newds)
#>  min Q1 median Q3 max mean  sd   n missing
#>    0  1      2  3   3 1.73 1.1 452       1

# reverse code f1d, f1h, f1l and f1p

cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g, 
   (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p), 
   f1q, f1r, f1s, f1t))
nmisscesd = apply(is.na(cesditems), 1, sum)
ncesditems = cesditems
ncesditems[is.na(cesditems)] = 0
newcesd = apply(ncesditems, 1, sum)
imputemeancesd = 20/(20-nmisscesd)*newcesd

data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]
#>     newcesd newds.cesd nmisscesd imputemeancesd
#> 4        15         15         1           15.8
#> 17       19         19         1           20.0
#> 87       44         44         1           46.3
#> 101      17         17         1           17.9
#> 154      29         29         1           30.5
#> 177      44         44         1           46.3
#> 229      39         39         1           41.1

新增飲酒情況，包含 3 個類別，參考 newds 裡既有的 3 個變量產生

library(dplyr)
library(memisc)
#> Loading required package: MASS
#> 
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#> 
#>     select
#> 
#> Attaching package: 'memisc'
#> The following object is masked from 'package:Matrix':
#> 
#>     as.array
#> The following object is masked from 'package:ggplot2':
#> 
#>     syms
#> The following objects are masked from 'package:dplyr':
#> 
#>     collect, recode, rename, syms
#> The following objects are masked from 'package:stats':
#> 
#>     contr.sum, contr.treatment, contrasts
#> The following object is masked from 'package:base':
#> 
#>     as.array
newds = dplyr::mutate(newds, drinkstat= 
  cases(
    "abstinent" = i1==0,
    "moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
               (i1>0 & i1<=2 & i2<=4 & female==0),
    "highrisk" = ((i1>1 | i2>3) & female==1) |
               ((i1>2 | i2>4) & female==0)))

library(mosaic)

detach(package:memisc)
detach(package:MASS)

以 tmpds 呈現 newds, i1, i2, 是否為女性以及飲酒狀況等資料的第 365 至 370 筆資料

library(dplyr)
tmpds = select(newds, i1, i2, female, drinkstat)
tmpds[365:370,]
#>     i1 i2 female drinkstat
#> 365  6 24      0  highrisk
#> 366  6  6      0  highrisk
#> 367  0  0      0 abstinent
#> 368  0  0      1 abstinent
#> 369  8  8      0  highrisk
#> 370 32 32      0  highrisk

篩選出飲酒情況為中度的女性

library(dplyr)
filter(tmpds, drinkstat=="moderate" & female==1)
#>   i1 i2 female drinkstat
#> 1  1  1      1  moderate
#> 2  1  3      1  moderate
#> 3  1  2      1  moderate
#> 4  1  1      1  moderate
#> 5  1  1      1  moderate
#> 6  1  1      1  moderate
#> 7  1  1      1  moderate

使用 crosstable 計算類別變量的個數和比率

library(gmodels)
with(tmpds, CrossTable(drinkstat))
#> 
#>  
#>    Cell Contents
#> |-------------------------|
#> |                       N |
#> |         N / Table Total |
#> |-------------------------|
#> 
#>  
#> Total Observations in Table:  453 
#> 
#>  
#>           | abstinent |  moderate |  highrisk | 
#>           |-----------|-----------|-----------|
#>           |        68 |        28 |       357 | 
#>           |     0.150 |     0.062 |     0.788 | 
#>           |-----------|-----------|-----------|
#> 
#> 
#> 
#>

將飲酒狀況及是否為女性做成列連表

with(tmpds, CrossTable(drinkstat, female, 
  prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))
#> 
#>  
#>    Cell Contents
#> |-------------------------|
#> |                       N |
#> |           N / Row Total |
#> |-------------------------|
#> 
#>  
#> Total Observations in Table:  453 
#> 
#>  
#>              | female 
#>    drinkstat |         0 |         1 | Row Total | 
#> -------------|-----------|-----------|-----------|
#>    abstinent |        42 |        26 |        68 | 
#>              |     0.618 |     0.382 |     0.150 | 
#> -------------|-----------|-----------|-----------|
#>     moderate |        21 |         7 |        28 | 
#>              |     0.750 |     0.250 |     0.062 | 
#> -------------|-----------|-----------|-----------|
#>     highrisk |       283 |        74 |       357 | 
#>              |     0.793 |     0.207 |     0.788 | 
#> -------------|-----------|-----------|-----------|
#> Column Total |       346 |       107 |       453 | 
#> -------------|-----------|-----------|-----------|
#> 
#>

將 newsds 的性別相關資料以男性、女性表示，並統計各自數量

newds = transform(newds, 
  gender=factor(female, c(0,1), c("Male","Female")))
tally(~ female + gender, margin=FALSE, data=newds)
#>       gender
#> female Male Female
#>      0  346      0
#>      1    0    107

以 cesd 和 i1 由小到大的順序來排定 ds 資料，並選出 1 至 5 筆資料

library(dplyr)
newds = arrange(ds, cesd, i1)
newds[1:5, c("cesd", "i1", "id")]
#>   cesd i1  id
#> 1    1  3 233
#> 2    3  1 139
#> 3    3 13 418
#> 4    4  4 251
#> 5    4  9  95

計算女性對象觀察值的 cesd 平均數

library(dplyr)
females = filter(ds, female==1)
with(females, mean(cesd))
#> [1] 36.9
# an alternative approach
mean(ds$cesd[ds$female==1])
#> [1] 36.9

計算以是否為女性為分類的 cesd 平均數

with(ds, tapply(cesd, female, mean))
#>    0    1 
#> 31.6 36.9
library(mosaic)
mean(cesd ~ female, data=ds)
#>    0    1 
#> 31.6 36.9

Data wrangling: Homework 6

2020-Spring [Data Management] Instructor: SHEU, Ching-Fan

CHIU, Ming-Tzu

2020-04-12

讀取資料，選取其中 10 項欄位製成新資料

讀取這 10 項變量名稱及檢查其資料結構

基礎統計這 10 項變量

瀏覽這組資料的前三筆觀察值

替 newds 這筆資料加上註解，並將原先的 ds 資料另外儲存

將 ds 資料轉寫成 .csv 檔

將 ds 檔案轉寫成 SAS 軟體可讀的檔案類型

篩選出 newds 資料中 cesd 欄位的 1 到 10 筆資料

篩選出 newsds 資料中 cesd 欄位數值大於 56 的資料

使用 Tidy 做一樣的篩選，並指定呈現 id 和 cesd 兩個欄位的資料

排序 cesd 資料，由小到大，列出最小的前 4 個，另外尋找最小值在真正順序的第 199 個

使用 Tidy 尋找 newds 中 flg 欄位是否有遺漏值，以及針對 flg 進行 summary

新增飲酒情況，包含 3 個類別，參考 newds 裡既有的 3 個變量產生

以 tmpds 呈現 newds, i1, i2, 是否為女性以及飲酒狀況等資料的第 365 至 370 筆資料

篩選出飲酒情況為中度的女性

使用 crosstable 計算類別變量的個數和比率

將飲酒狀況及是否為女性做成列連表

將 newsds 的性別相關資料以男性、女性表示，並統計各自數量

以 cesd 和 i1 由小到大的順序來排定 ds 資料，並選出 1 至 5 筆資料

計算女性對象觀察值的 cesd 平均數

計算以是否為女性為分類的 cesd 平均數