library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(foreign)
library(mosaic)

## Loading required package: lattice

## Loading required package: ggformula

## Loading required package: ggplot2

## Loading required package: ggstance

## 
## Attaching package: 'ggstance'

## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh

## 
## New to ggformula?  Try the tutorials: 
##  learnr::run_tutorial("introduction", package = "ggformula")
##  learnr::run_tutorial("refining", package = "ggformula")

## Loading required package: mosaicData

## Loading required package: Matrix

## This version of Shiny is designed to work with 'htmlwidgets' >= 1.5.
##     Please upgrade via install.packages('htmlwidgets').

## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2

## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
## 
## Have you tried the ggformula package for your plots?

## 
## Attaching package: 'mosaic'

## The following object is masked from 'package:Matrix':
## 
##     mean

## The following object is masked from 'package:ggplot2':
## 
##     stat

## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally

## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var

## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum

library(gmodels)

Limit the number of digits displayed

Show variable names

# List the column (variable) names of the data frame.
colnames(newds)

##  [1] "cesd"   "female" "i1"     "i2"     "id"     "treat"  "f1a"   
##  [8] "f1b"    "f1c"    "f1d"    "f1e"    "f1f"    "f1g"    "f1h"   
## [15] "f1i"    "f1j"    "f1k"    "f1l"    "f1m"    "f1n"    "f1o"   
## [22] "f1p"    "f1q"    "f1r"    "f1s"    "f1t"

structure of the first 10 variables

# Compact structure display for the first ten columns.
str(newds[, seq_len(10)])

## 'data.frame':    453 obs. of  10 variables:
##  $ cesd  : int  49 30 39 15 39 6 52 32 50 46 ...
##  $ female: int  0 0 0 1 0 1 1 0 1 0 ...
##  $ i1    : int  13 56 0 5 10 4 13 12 71 20 ...
##  $ i2    : int  26 62 0 5 13 4 20 24 129 27 ...
##  $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ treat : int  1 1 0 0 0 1 0 1 0 1 ...
##  $ f1a   : int  3 3 3 0 3 1 3 1 3 2 ...
##  $ f1b   : int  2 2 2 0 0 0 1 1 2 3 ...
##  $ f1c   : int  3 0 3 1 3 1 3 2 3 3 ...
##  $ f1d   : int  0 3 0 3 3 3 1 3 1 0 ...

make a summary table

# Per-column summary of the first ten columns.
summary(newds[, seq_len(10)])

##       cesd          female            i1              i2       
##  Min.   : 1.0   Min.   :0.000   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:25.0   1st Qu.:0.000   1st Qu.:  3.0   1st Qu.:  3.0  
##  Median :34.0   Median :0.000   Median : 13.0   Median : 15.0  
##  Mean   :32.8   Mean   :0.236   Mean   : 17.9   Mean   : 22.6  
##  3rd Qu.:41.0   3rd Qu.:0.000   3rd Qu.: 26.0   3rd Qu.: 32.0  
##  Max.   :60.0   Max.   :1.000   Max.   :142.0   Max.   :184.0  
##        id          treat            f1a            f1b      
##  Min.   :  1   Min.   :0.000   Min.   :0.00   Min.   :0.00  
##  1st Qu.:119   1st Qu.:0.000   1st Qu.:1.00   1st Qu.:0.00  
##  Median :233   Median :0.000   Median :2.00   Median :1.00  
##  Mean   :233   Mean   :0.497   Mean   :1.63   Mean   :1.39  
##  3rd Qu.:348   3rd Qu.:1.000   3rd Qu.:3.00   3rd Qu.:2.00  
##  Max.   :470   Max.   :1.000   Max.   :3.00   Max.   :3.00  
##       f1c            f1d      
##  Min.   :0.00   Min.   :0.00  
##  1st Qu.:1.00   1st Qu.:0.00  
##  Median :2.00   Median :1.00  
##  Mean   :1.92   Mean   :1.56  
##  3rd Qu.:3.00   3rd Qu.:3.00  
##  Max.   :3.00   Max.   :3.00

output the first 3 rows

# Print the first three rows.
head(newds, 3L)

##   cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1   49      0 13 26  1     1   3   2   3   0   2   3   3   0   2   3
## 2   30      0 56 62  2     1   3   2   0   3   3   2   0   0   3   0
## 3   39      0  0  0  3     0   3   2   3   0   2   2   1   3   2   3
##   f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1   3   0   1   2   2   2   2   3   3   2
## 2   3   0   0   3   0   0   0   2   0   0
## 3   1   0   1   3   2   0   0   3   2   0

set comment

# Attach a descriptive comment attribute to the data frame, then read it back.
comment(newds) <- "HELP baseline dataset"
comment(newds)

## [1] "HELP baseline dataset"

save file

# Serialize the object `ds` to the file "savedfile".
save(list = "ds", file = "savedfile")

write file as csv

# Export `ds` as a comma-separated file.
write.csv(x = ds, file = "ds.csv")

write file as .dat & .sas

# Export `newds` as a data file plus a SAS program that reads it.
write.foreign(
  newds,
  datafile = "file.dat",
  codefile = "file.sas",
  package = "SAS"
)

Output the first 10 values of newds$cesd

# First ten cesd values, by index.
newds$cesd[1:10]

##  [1] 49 30 39 15 39  6 52 32 50 46

# First ten cesd values, via head().
head(newds$cesd, n = 10)

##  [1] 49 30 39 15 39  6 52 32 50 46

Output the cesd values that are greater than 56

# cesd values strictly greater than 56.
newds$cesd[newds$cesd > 56]

## [1] 57 58 57 60 58 58 57

Filter the data by cesd > 56 and select id and cesd for output

# Rows with cesd > 56, restricted to the id and cesd columns.
newds %>%
  filter(cesd > 56) %>%
  select(id, cesd)

##    id cesd
## 1  71   57
## 2 127   58
## 3 200   57
## 4 228   60
## 5 273   58
## 6 351   58
## 7  13   57

List the four smallest cesd values

# Four smallest cesd values (sorted ascending).
sort(newds$cesd)[1:4]

## [1] 1 3 3 4

Show the row index of the minimum cesd value

# Position of the (first) minimum cesd value.
which.min(newds$cesd)

## [1] 199

Count how many values of f1g are missing

# Tabulate is.na(f1g) through the formula interface of tally(): the result
# counts missing (TRUE) versus observed (FALSE) values of f1g in newds.
tally(~ is.na(f1g), data=newds)

## is.na(f1g)
##  TRUE FALSE 
##     1   452

Show summary statistics for f1g

# Summary statistics for f1g via favstats() (presumably mosaic's, loaded
# above); the printed result includes quartiles, mean, sd, n and the number
# of missing values.
favstats(~ f1g, data=newds)

##  min Q1 median Q3 max mean  sd   n missing
##    0  1      2  3   3 1.73 1.1 452       1

reverse code f1d, f1h, f1l and f1p

# Bind the 20 item columns f1a..f1t into one matrix; items f1d, f1h, f1l and
# f1p are reverse-coded as (3 - item).
cesditems <- with(
  newds,
  cbind(
    f1a, f1b, f1c, 3 - f1d,
    f1e, f1f, f1g, 3 - f1h,
    f1i, f1j, f1k, 3 - f1l,
    f1m, f1n, f1o, 3 - f1p,
    f1q, f1r, f1s, f1t
  )
)

Count the number of missing (NA) items in each row of cesditems

# Per-row count of missing items, then a working copy of the item matrix.
nmisscesd <- apply(is.na(cesditems), MARGIN = 1, FUN = sum)
ncesditems <- cesditems

set NA = 0

# Replace missing item scores by 0 in the working copy so they add nothing
# to the row totals.
ncesditems[is.na(cesditems)] <- 0

Sum the item scores across columns for each row

# Row totals of the zero-filled item matrix.
newcesd <- rowSums(ncesditems)
# Rescale each total by 20 / (number of non-missing items), i.e. missing
# items are imputed with the mean of the observed ones.
imputemeancesd <- 20 / (20 - nmisscesd) * newcesd

Combine the results into a data frame and show only the rows that have at least one missing item in cesditems

# Side-by-side comparison of the recomputed total, the stored cesd, the
# missing-item count and the imputed total, for rows with missing items.
subset(
  data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd),
  nmisscesd > 0
)

##     newcesd newds.cesd nmisscesd imputemeancesd
## 4        15         15         1           15.8
## 17       19         19         1           20.0
## 87       44         44         1           46.3
## 101      17         17         1           17.9
## 154      29         29         1           30.5
## 177      44         44         1           46.3
## 229      39         39         1           41.1

library(dplyr)
library(memisc)

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## 
## Attaching package: 'memisc'

## The following object is masked from 'package:Matrix':
## 
##     as.array

## The following object is masked from 'package:ggplot2':
## 
##     syms

## The following objects are masked from 'package:dplyr':
## 
##     collect, recode, rename, syms

## The following objects are masked from 'package:stats':
## 
##     contr.sum, contr.treatment, contrasts

## The following object is masked from 'package:base':
## 
##     as.array

# Add drinkstat to newds: cases() labels each row with the name of the
# condition it satisfies ("abstinent", "moderate" or "highrisk"), using
# sex-specific thresholds on i1 and i2.
newds <- mutate(
  newds,
  drinkstat = cases(
    "abstinent" = i1 == 0,
    "moderate" =
      (i1 > 0 & i1 <= 1 & i2 <= 3 & female == 1) |
      (i1 > 0 & i1 <= 2 & i2 <= 4 & female == 0),
    "highrisk" =
      ((i1 > 1 | i2 > 3) & female == 1) |
      ((i1 > 2 | i2 > 4) & female == 0)
  )
)

library(dplyr)
# Keep only i1, i2, female and drinkstat, then show rows 365 to 370.
# select() is called with an explicit dplyr:: prefix: attaching MASS (loaded
# as a dependency of memisc) masks dplyr::select, and calling library(dplyr)
# again on an already attached package does not move it back to the front of
# the search path.
tmpds <- dplyr::select(newds, i1, i2, female, drinkstat)
tmpds[365:370, ]

##     i1 i2 female drinkstat
## 365  6 24      0  highrisk
## 366  6  6      0  highrisk
## 367  0  0      0 abstinent
## 368  0  0      1 abstinent
## 369  8  8      0  highrisk
## 370 32 32      0  highrisk

filter the data by drinkstat==“moderate” & female==1

# Rows for moderate drinkers with female == 1.
tmpds %>%
  filter(drinkstat == "moderate", female == 1)

##   i1 i2 female drinkstat
## 1  1  1      1  moderate
## 2  1  3      1  moderate
## 3  1  2      1  moderate
## 4  1  1      1  moderate
## 5  1  1      1  moderate
## 6  1  1      1  moderate
## 7  1  1      1  moderate

Show the counts and proportions

# One-way table of drinkstat via CrossTable() (presumably from gmodels,
# loaded above); the printed result reports N and N / table total for each
# category.
with(tmpds, CrossTable(drinkstat))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##           | abstinent |  moderate |  highrisk | 
##           |-----------|-----------|-----------|
##           |        68 |        28 |       357 | 
##           |     0.150 |     0.062 |     0.788 | 
##           |-----------|-----------|-----------|
## 
## 
## 
##

# Derive a labelled factor `gender` from the 0/1 indicator `female`, then
# cross-tabulate the two to verify the recoding.
newds <- transform(
  newds,
  gender = factor(female, levels = c(0, 1), labels = c("Male", "Female"))
)
tally(~ female + gender, margin = FALSE, data = newds)

##       gender
## female Male Female
##      0  346      0
##      1    0    107

order by cesd, i1 then show the first 5 rows of cesd, i1, id

# Sort ds by cesd, breaking ties by i1, and show three columns of the first
# five sorted rows.
newds <- ds %>%
  arrange(cesd, i1)
newds[1:5, c("cesd", "i1", "id")]

##   cesd i1  id
## 1    1  3 233
## 2    3  1 139
## 3    3 13 418
## 4    4  4 251
## 5    4  9  95

Calculate the mean of cesd when female == 1

# Subset to rows with female == 1, then average their cesd scores.
females <- ds %>%
  filter(female == 1)
mean(females$cesd)

## [1] 36.9

an alternative approach

# Same mean, using logical indexing instead of an intermediate data frame.
with(ds, mean(cesd[female == 1]))

## [1] 36.9

# Mean cesd for each value of female.
tapply(X = ds$cesd, INDEX = ds$female, FUN = mean)

##    0    1 
## 31.6 36.9

# Another alternative: the formula interface of mean() (presumably mosaic's
# masked version, per the masking notice printed when mosaic was attached)
# returns the mean of cesd for each value of female.
mean(cesd ~ female, data=ds)

##    0    1 
## 31.6 36.9

Week 5 Homework 6

Hao-Lun Fu

2020-04-13

cut-off the digital point

Show variable names

structure of the first 10 variables

make a summary table

output the first 3 rows

set comment

save file

write file as csv

write file as .dat & .sas

output the first 10 newds$cesd data

output the cesd data which newds$cesd > 56

Filter the data by cesd > 56 and select id and cesd for output

List cesd data

Show the row which is minimum of cesd

Whether count f1g have missing values?

Show summary statistics for f1g

reverse code f1d, f1h, f1l and f1p

Sum up the data who have NA for cesditems

set NA = 0

Sum up columns by each rows

Combine the results into a data frame and show only the rows that have at least one missing item in cesditems

filter the data by drinkstat==“moderate” & female==1

Show the counts and proportions

order by cesd, i1 then show the first 5 rows of cesd, i1, id

Calculate the mean of cesd when female == 1

an alternative approach