library(car)      # Vocab data (vocabulary/education scores from the GSS)
library(dplyr)    # %>%, group_by(), summarise()
library(ggplot2)  # plotting
dta1 <- Vocab
str(dta1)
## 'data.frame': 21638 obs. of 4 variables:
## $ year : int 2004 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 2 1 2 2 1 ...
## $ education : int 9 14 14 17 14 14 12 10 11 9 ...
## $ vocabulary: int 3 6 9 8 1 7 6 6 5 1 ...
# the relationship between education and vocabulary over the years, by gender
dta1 %>% group_by(year, sex) %>% summarise(cor = cor(education, vocabulary))
## Source: local data frame [32 x 3]
## Groups: year [?]
##
## year sex cor
## <int> <fctr> <dbl>
## 1 1974 Female 0.4900975
## 2 1974 Male 0.5886080
## 3 1976 Female 0.5023103
## 4 1976 Male 0.5455221
## 5 1978 Female 0.5063789
## 6 1978 Male 0.5842083
## 7 1982 Female 0.5093188
## 8 1982 Male 0.5185160
## 9 1984 Female 0.5001001
## 10 1984 Male 0.5066840
## # ... with 22 more rows
dta1$year <- as.factor(dta1$year)
# plot education against vocabulary, colored by year, faceted by sex
ggplot(data = dta1, aes(x = education, y = vocabulary)) +
  geom_point(aes(col = year)) +
  scale_color_grey(start = 0.95, end = 0.1) +
  facet_wrap(~ sex) +
  stat_smooth(method = "lm", se = FALSE) +
  theme_bw()
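# A complementary view (a minimal sketch, not part of the original analysis):
# plot the yearly correlations computed above directly, one line per gender.
dta1 %>%
  group_by(year, sex) %>%
  summarise(cor = cor(education, vocabulary)) %>%
  ggplot(aes(x = year, y = cor, color = sex, group = sex)) +
  geom_line() +
  theme_bw()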
##
options(continue=" ") # use a space (instead of "+") as the prompt for continuation lines
##
options(digits=3) # controls the number of digits to print when printing numeric values
options(width=72) # narrow output
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv")# read .csv data
library(dplyr)# load dplyr package
newds = dplyr::select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e,
  f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t) # select columns
## ------------------------------------------------------------------------
names(newds) # show column names
## [1] "cesd" "female" "i1" "i2" "id" "treat" "f1a"
## [8] "f1b" "f1c" "f1d" "f1e" "f1f" "f1g" "f1h"
## [15] "f1i" "f1j" "f1k" "f1l" "f1m" "f1n" "f1o"
## [22] "f1p" "f1q" "f1r" "f1s" "f1t"
str(newds[,1:10]) # structure of the first 10 variables
## 'data.frame': 453 obs. of 10 variables:
## $ cesd : int 49 30 39 15 39 6 52 32 50 46 ...
## $ female: int 0 0 0 1 0 1 1 0 1 0 ...
## $ i1 : int 13 56 0 5 10 4 13 12 71 20 ...
## $ i2 : int 26 62 0 5 13 4 20 24 129 27 ...
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ treat : int 1 1 0 0 0 1 0 1 0 1 ...
## $ f1a : int 3 3 3 0 3 1 3 1 3 2 ...
## $ f1b : int 2 2 2 0 0 0 1 1 2 3 ...
## $ f1c : int 3 0 3 1 3 1 3 2 3 3 ...
## $ f1d : int 0 3 0 3 3 3 1 3 1 0 ...
## ------------------------------------------------------------------------
summary(newds[,1:10]) # summary of the first 10 variables
## cesd female i1 i2
## Min. : 1.0 Min. :0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:25.0 1st Qu.:0.000 1st Qu.: 3.0 1st Qu.: 3.0
## Median :34.0 Median :0.000 Median : 13.0 Median : 15.0
## Mean :32.8 Mean :0.236 Mean : 17.9 Mean : 22.6
## 3rd Qu.:41.0 3rd Qu.:0.000 3rd Qu.: 26.0 3rd Qu.: 32.0
## Max. :60.0 Max. :1.000 Max. :142.0 Max. :184.0
## id treat f1a f1b
## Min. : 1 Min. :0.000 Min. :0.00 Min. :0.00
## 1st Qu.:119 1st Qu.:0.000 1st Qu.:1.00 1st Qu.:0.00
## Median :233 Median :0.000 Median :2.00 Median :1.00
## Mean :233 Mean :0.497 Mean :1.63 Mean :1.39
## 3rd Qu.:348 3rd Qu.:1.000 3rd Qu.:3.00 3rd Qu.:2.00
## Max. :470 Max. :1.000 Max. :3.00 Max. :3.00
## f1c f1d
## Min. :0.00 Min. :0.00
## 1st Qu.:1.00 1st Qu.:0.00
## Median :2.00 Median :1.00
## Mean :1.92 Mean :1.56
## 3rd Qu.:3.00 3rd Qu.:3.00
## Max. :3.00 Max. :3.00
## ------------------------------------------------------------------------
head(newds, n=3) # show the first three rows
## cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1 49 0 13 26 1 1 3 2 3 0 2 3 3 0 2 3
## 2 30 0 56 62 2 1 3 2 0 3 3 2 0 0 3 0
## 3 39 0 0 0 3 0 3 2 3 0 2 2 1 3 2 3
## f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1 3 0 1 2 2 2 2 3 3 2
## 2 3 0 0 3 0 0 0 2 0 0
## 3 1 0 1 3 2 0 0 3 2 0
## ------------------------------------------------------------------------
comment(newds) = "HELP baseline dataset" # attach a comment to the data frame
comment(newds) # query a comment
## [1] "HELP baseline dataset"
save(ds, file="savedfile") # save the data frame to an external file (R binary format)
## ------------------------------------------------------------------------
write.csv(ds, file="ds.csv") # write the data to a CSV file
## ------------------------------------------------------------------------
library(foreign) #load foreign package
write.foreign(newds, "file.dat", "file.sas", package="SAS") # export the data along with SAS code to read it
## ------------------------------------------------------------------------
with(newds, cesd[1:10]) # cesd values for the first 10 subjects
## [1] 49 30 39 15 39 6 52 32 50 46
with(newds, head(cesd, 10)) # the same, using head()
## [1] 49 30 39 15 39 6 52 32 50 46
## ------------------------------------------------------------------------
with(newds, cesd[cesd > 56]) # show cesd values greater than 56
## [1] 57 58 57 60 58 58 57
## ------------------------------------------------------------------------
library(dplyr)
filter(newds, cesd > 56) %>% dplyr::select(id, cesd) # subjects with cesd > 56; report id & cesd
## id cesd
## 1 71 57
## 2 127 58
## 3 200 57
## 4 228 60
## 5 273 58
## 6 351 58
## 7 13 57
## ------------------------------------------------------------------------
with(newds, sort(cesd)[1:4]) # the four smallest cesd values
## [1] 1 3 3 4
with(newds, which.min(cesd)) # row position of the minimum cesd value
## [1] 199
## ------------------------------------------------------------------------
library(mosaic) # load mosaic package
tally(~ is.na(f1g), data=newds) # count missing values in f1g
## is.na(f1g)
## TRUE FALSE
## 1 452
favstats(~ f1g, data=newds) # summary statistics for f1g
## min Q1 median Q3 max mean sd n missing
## 0 1 2 3 3 1.73 1.1 452 1
## ------------------------------------------------------------------------
# reverse code f1d, f1h, f1l and f1p
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g,
  (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p),
  f1q, f1r, f1s, f1t))
nmisscesd = apply(is.na(cesditems), 1, sum) # number of missing items per subject
ncesditems = cesditems
ncesditems[is.na(cesditems)] = 0 # replace NA by 0
newcesd = apply(ncesditems, 1, sum) # total score per subject (missing items counted as 0)
imputemeancesd = 20/(20-nmisscesd)*newcesd # rescale the total to 20 items (mean imputation for missing items)
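# The rescaling is mean imputation in disguise: each missing item is treated
# as if it equalled the subject's mean over the items actually answered.
# A minimal equivalence check (altcesd is an illustrative name):
altcesd = 20 * rowMeans(cesditems, na.rm = TRUE) # identical to imputemeancesd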
## ------------------------------------------------------------------------
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,] # show subjects with at least one missing item
## newcesd newds.cesd nmisscesd imputemeancesd
## 4 15 15 1 15.8
## 17 19 19 1 20.0
## 87 44 44 1 46.3
## 101 17 17 1 17.9
## 154 29 29 1 30.5
## 177 44 44 1 46.3
## 229 39 39 1 41.1
## ----createdrink,message=FALSE-------------------------------------------
library(dplyr)
library(memisc) # load memisc package
newds = mutate(newds, drinkstat=
cases(
"abstinent" = i1==0,
"moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
(i1>0 & i1<=2 & i2<=4 & female==0),
"highrisk" = ((i1>1 | i2>3) & female==1) |
((i1>2 | i2>4) & female==0))) # categorize subjects by drinking status
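# The same classification could be written without memisc, using
# dplyr::case_when(); a minimal sketch (drinkstat2 is an illustrative name):
newds = mutate(newds, drinkstat2 = factor(case_when(
  i1 == 0 ~ "abstinent",
  (i1 <= 1 & i2 <= 3 & female == 1) |
    (i1 <= 2 & i2 <= 4 & female == 0) ~ "moderate",
  TRUE ~ "highrisk"),
  levels = c("abstinent", "moderate", "highrisk")))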
## ----echo=FALSE----------------------------------------------------------
library(mosaic)
## ----echo=FALSE----------------------------------------------------------
detach(package:memisc) # detach memisc package
detach(package:MASS) # detach MASS package
## ------------------------------------------------------------------------
library(dplyr)
tmpds = dplyr::select(newds, i1, i2, female, drinkstat) # select columns
tmpds[365:370,] # show rows 365-370
## i1 i2 female drinkstat
## 365 6 24 0 highrisk
## 366 6 6 0 highrisk
## 367 0 0 0 abstinent
## 368 0 0 1 abstinent
## 369 8 8 0 highrisk
## 370 32 32 0 highrisk
## ------------------------------------------------------------------------
library(dplyr)
filter(tmpds, drinkstat=="moderate" & female==1) # female subjects classified as moderate drinkers
## i1 i2 female drinkstat
## 1 1 1 1 moderate
## 2 1 3 1 moderate
## 3 1 2 1 moderate
## 4 1 1 1 moderate
## 5 1 1 1 moderate
## 6 1 1 1 moderate
## 7 1 1 1 moderate
## ----message=FALSE-------------------------------------------------------
library(gmodels) # load gmodels package
with(tmpds, CrossTable(drinkstat)) # one-way table of drinkstat: counts and overall proportions
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | abstinent | moderate | highrisk |
## |-----------|-----------|-----------|
## | 68 | 28 | 357 |
## | 0.150 | 0.062 | 0.788 |
## |-----------|-----------|-----------|
##
##
##
##
## ------------------------------------------------------------------------
with(tmpds, CrossTable(drinkstat, female,
  prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE)) # two-way table of drinkstat by female, with row proportions only
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | female
## drinkstat | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## abstinent | 42 | 26 | 68 |
## | 0.618 | 0.382 | 0.150 |
## -------------|-----------|-----------|-----------|
## moderate | 21 | 7 | 28 |
## | 0.750 | 0.250 | 0.062 |
## -------------|-----------|-----------|-----------|
## highrisk | 283 | 74 | 357 |
## | 0.793 | 0.207 | 0.788 |
## -------------|-----------|-----------|-----------|
## Column Total | 346 | 107 | 453 |
## -------------|-----------|-----------|-----------|
##
##
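# A more compact cross-tabulation with mosaic::tally (a sketch; adding
# format = "proportion" would give proportions instead of counts):
tally(drinkstat ~ female, data = tmpds)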
## ------------------------------------------------------------------------
newds = transform(newds,
  gender=factor(female, c(0,1), c("Male","Female"))) # create a labelled factor from female
tally(~ female + gender, margin=FALSE, data=newds) # cross-tabulate female and gender to check the recoding
## gender
## female Male Female
## 0 346 0
## 1 0 107
## ------------------------------------------------------------------------
library(dplyr)
newds = arrange(ds, cesd, i1) # sort rows by cesd, then i1
newds[1:5, c("cesd", "i1", "id")] # show cesd, i1, and id for the first five rows
## cesd i1 id
## 1 1 3 233
## 2 3 1 139
## 3 3 13 418
## 4 4 4 251
## 5 4 9 95
## ------------------------------------------------------------------------
library(dplyr)
females = filter(ds, female==1) # keep female subjects only
with(females, mean(cesd)) # mean cesd for female subjects
## [1] 36.9
# an alternative approach
mean(ds$cesd[ds$female==1])
## [1] 36.9
## ------------------------------------------------------------------------
with(ds, tapply(cesd, female, mean)) # mean cesd for male (0) and female (1) subjects separately
## 0 1
## 31.6 36.9
library(mosaic)
mean(cesd ~ female, data=ds) # the same, using mosaic's formula interface
## 0 1
## 31.6 36.9
dta3 <- read.csv("http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/nlsy86long.csv")
str(dta3)
## 'data.frame': 664 obs. of 9 variables:
## $ id : int 2390 2560 3740 4020 6350 7030 7200 7610 7680 7700 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 2 1 2 ...
## $ race : Factor w/ 2 levels "Majority","Minority": 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 1 1 1 1 1 1 1 1 1 1 ...
## $ grade: int 0 0 0 0 1 0 0 0 0 0 ...
## $ year : int 6 6 6 5 7 5 6 7 6 6 ...
## $ month: int 67 66 67 60 78 62 66 79 76 67 ...
## $ math : num 14.29 20.24 17.86 7.14 29.76 ...
## $ read : num 19.05 21.43 21.43 7.14 30.95 ...
# reshape the math and read scores into long format
library(tidyr) # gather(), spread(), unite()
dta3L <- gather(dta3, key = "test_var", value = "test_score", 8:9)
str(dta3L)
## 'data.frame': 1328 obs. of 9 variables:
## $ id : int 2390 2560 3740 4020 6350 7030 7200 7610 7680 7700 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 2 1 2 ...
## $ race : Factor w/ 2 levels "Majority","Minority": 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 1 1 1 1 1 1 1 1 1 1 ...
## $ grade : int 0 0 0 0 1 0 0 0 0 0 ...
## $ year : int 6 6 6 5 7 5 6 7 6 6 ...
## $ month : int 67 66 67 60 78 62 66 79 76 67 ...
## $ test_var : chr "math" "math" "math" "math" ...
## $ test_score: num 14.29 20.24 17.86 7.14 29.76 ...
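# In current tidyr essentially the same long reshape can be written with
# pivot_longer(); a sketch (dta3L2 is an illustrative name):
dta3L2 <- pivot_longer(dta3, cols = c(math, read),
  names_to = "test_var", values_to = "test_score")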
dta4 <- backpain # matched case-control data on driving and back pain
str(backpain)
## 'data.frame': 434 obs. of 4 variables:
## $ ID : Factor w/ 217 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
## $ status : Factor w/ 2 levels "case","control": 1 2 1 2 1 2 1 2 1 2 ...
## $ driver : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 1 1 2 2 ...
## $ suburban: Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...
dta4t <- data.frame(with(dta4, table(status, driver, suburban))) # cell counts as a data frame (long format)
# reshape from long to wide format: one column per status
dta4tL <- spread(dta4t, key = "status", value = Freq)
# add a new column named Total
dta4tL <- dta4tL%>%mutate(Total = case + control)
dta4tL
## driver suburban case control Total
## 1 no no 26 47 73
## 2 no yes 6 7 13
## 3 yes no 64 63 127
## 4 yes yes 121 100 221
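# The newer tidyr interface for the same wide reshape (a sketch; dta4tW is an
# illustrative name):
dta4tW <- pivot_wider(dta4t, names_from = status, values_from = Freq) %>%
  mutate(Total = case + control)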
dta5 <- cbind(state.x77, USArrests) # combine two built-in state-level data sets (rows are the 50 states in the same order)
str(dta5)
## 'data.frame': 50 obs. of 12 variables:
## $ Population: num 3615 365 2212 2110 21198 ...
## $ Income : num 3624 6315 4530 3378 5114 ...
## $ Illiteracy: num 2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
## $ Life Exp : num 69 69.3 70.5 70.7 71.7 ...
## $ Murder : num 15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
## $ HS Grad : num 41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
## $ Frost : num 20 152 15 65 20 166 139 103 11 60 ...
## $ Area : num 50708 566432 113417 51945 156361 ...
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop : int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
cor(dta5)
## Population Income Illiteracy Life Exp Murder HS Grad
## Population 1.0000 0.2082 0.1076 -0.0681 0.3436 -0.0985
## Income 0.2082 1.0000 -0.4371 0.3403 -0.2301 0.6199
## Illiteracy 0.1076 -0.4371 1.0000 -0.5885 0.7030 -0.6572
## Life Exp -0.0681 0.3403 -0.5885 1.0000 -0.7808 0.5822
## Murder 0.3436 -0.2301 0.7030 -0.7808 1.0000 -0.4880
## HS Grad -0.0985 0.6199 -0.6572 0.5822 -0.4880 1.0000
## Frost -0.3322 0.2263 -0.6719 0.2621 -0.5389 0.3668
## Area 0.0225 0.3633 0.0773 -0.1073 0.2284 0.3335
## Murder 0.3202 -0.2152 0.7068 -0.7785 0.9337 -0.5216
## Assault 0.3170 0.0409 0.5110 -0.6267 0.7398 -0.2303
## UrbanPop 0.5126 0.4805 -0.0622 0.2715 0.0164 0.3587
## Rape 0.3052 0.3574 0.1546 -0.2696 0.5800 0.2707
## Frost Area Murder Assault UrbanPop Rape
## Population -0.3322 0.0225 0.3202 0.3170 0.5126 0.305
## Income 0.2263 0.3633 -0.2152 0.0409 0.4805 0.357
## Illiteracy -0.6719 0.0773 0.7068 0.5110 -0.0622 0.155
## Life Exp 0.2621 -0.1073 -0.7785 -0.6267 0.2715 -0.270
## Murder -0.5389 0.2284 0.9337 0.7398 0.0164 0.580
## HS Grad 0.3668 0.3335 -0.5216 -0.2303 0.3587 0.271
## Frost 1.0000 0.0592 -0.5414 -0.4682 -0.2462 -0.279
## Area 0.0592 1.0000 0.1481 0.2312 -0.0615 0.525
## Murder -0.5414 0.1481 1.0000 0.8019 0.0696 0.564
## Assault -0.4682 0.2312 0.8019 1.0000 0.2589 0.665
## UrbanPop -0.2462 -0.0615 0.0696 0.2589 1.0000 0.411
## Rape -0.2792 0.5250 0.5636 0.6652 0.4113 1.000
heatmap(cor(dta5)) # visualize the correlation matrix
# States with higher income tend to have a higher percentage of high-school
# graduates, longer life expectancy, and lower murder rates.
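# A quick way to check that reading against the matrix above: sort every
# variable's correlation with Income (a minimal sketch).
sort(cor(dta5)[, "Income"], decreasing = TRUE)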
dta6 <- read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/probeL.txt",header = T)
str(dta6)
## 'data.frame': 55 obs. of 3 variables:
## $ ID : Factor w/ 11 levels "S01","S02","S03",..: 1 1 1 1 1 2 2 2 2 2 ...
## $ Response_Time: int 51 36 50 35 42 27 20 26 17 27 ...
## $ Position : int 1 2 3 4 5 1 2 3 4 5 ...
# reshape to wide format: one "Pos_<position>" column per serial position
dtaW <- mutate(dta6, pre = rep("Pos", dim(dta6)[1])) %>%
  unite(time, pre, Position) %>%
  spread(time, Response_Time) %>%
  arrange(ID)
dtaW
## ID Pos_1 Pos_2 Pos_3 Pos_4 Pos_5
## 1 S01 51 36 50 35 42
## 2 S02 27 20 26 17 27
## 3 S03 37 22 41 37 30
## 4 S04 42 36 32 34 27
## 5 S05 27 18 33 14 29
## 6 S06 43 32 43 35 40
## 7 S07 41 22 36 25 38
## 8 S08 38 21 31 20 16
## 9 S09 36 23 27 25 28
## 10 S10 26 31 31 32 36
## 11 S11 29 20 25 26 25
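# The same wide reshape in one step with pivot_wider() (a sketch; dtaW2 is an
# illustrative name):
dtaW2 <- pivot_wider(dta6, names_from = Position, values_from = Response_Time,
  names_prefix = "Pos_") %>% arrange(ID)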
dta7_1 <- MASS::Animals # brain and body weights for various species (MASS)
dta7_2 <- MASS::mammals # brain and body weights for land mammals (MASS)
dta7_1$Type = rownames(dta7_1) # keep the species names as a column
dta7_2$Type = rownames(dta7_2)
dta7 <- rbind(dta7_1, dta7_2) # stack the two data sets
dta7$type <- as.factor(dta7$Type)
sum(duplicated(dta7$type)) # number of duplicated species names
## [1] 23
dta7 = dta7[!duplicated(dta7$Type),]
sum(duplicated(dta7$Type))
## [1] 0
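# The deduplication above could equivalently use dplyr::distinct(); a sketch
# (dta7_unique is an illustrative name):
dta7_unique <- distinct(dta7, Type, .keep_all = TRUE) # keep the first row per species name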
dta8 <- Prestige # Canadian occupational prestige data (car package)
str(dta8)
## 'data.frame': 102 obs. of 6 variables:
## $ education: num 13.1 12.3 12.8 11.4 14.6 ...
## $ income : int 12351 25879 9271 8865 8403 11030 8258 14163 11377 11023 ...
## $ women : num 11.16 4.02 15.7 9.11 11.68 ...
## $ prestige : num 68.8 69.1 63.4 56.8 73.5 77.6 72.6 78.1 73.1 68.8 ...
## $ census : int 1113 1130 1171 1175 2111 2113 2133 2141 2143 2153 ...
## $ type : Factor w/ 3 levels "bc","prof","wc": 2 2 2 2 2 2 2 2 2 2 ...
# median prestige score for each of the three types of occupation
aggregate(prestige~type, data = dta8, median)
## type prestige
## 1 bc 35.9
## 2 prof 68.4
## 3 wc 41.5
# label each occupation as above ("high") or at/below ("low") the median
# prestige within its type
dta8 <- dta8 %>% group_by(type) %>% mutate(label = ifelse(prestige > median(prestige),"high","low"))
dta8 <- na.omit(dta8) # drop occupations with missing values (e.g., missing type)
dta8$label <- as.factor(dta8$label)
dta8L <- dta8 %>% group_by(type,label) %>% do(tmp = data.frame(.)) # one nested data frame per group
# plot income against education, colored by prestige level, faceted by type
ggplot(dta8, aes(x = education, y = income, color = label)) +
  geom_point() +
  facet_grid(~type) +
  scale_color_discrete("Level of prestige", labels = c("High", "Low")) +
  theme_bw() +
  theme(legend.position = c(.15, .8), legend.background = element_rect(color = "black", fill = "white"))
# correlation between income and education within each type-by-label group
for(i in 1:6){
  dta8L$cor[i] <- cor(dta8L$tmp[[i]]$income, dta8L$tmp[[i]]$education)
}
dta8L
## Source: local data frame [6 x 4]
## Groups: <by row>
##
## # A tibble: 6 × 4
## type label tmp cor
## * <fctr> <fctr> <list> <dbl>
## 1 bc high <data.frame [20 × 7]> 4.50e-01
## 2 bc low <data.frame [24 × 7]> 5.74e-02
## 3 prof high <data.frame [15 × 7]> 1.66e-06
## 4 prof low <data.frame [16 × 7]> 3.73e-01
## 5 wc high <data.frame [11 × 7]> 2.77e-01
## 6 wc low <data.frame [12 × 7]> -8.06e-02
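# The same six correlations can be computed without do() or an explicit loop;
# a minimal sketch using group_by() + summarise():
dta8 %>% group_by(type, label) %>% summarise(cor = cor(income, education))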