initial set

setwd("~/")
options(digits = 4, show.signif.stars = FALSE)
pacman::p_load(tidyverse, ggplot2, car, HSAUR3, Ecdat, GGally, lattice, mlmRev, memisc)

2

dta<- Caschool %>% 
  group_by(county) %>% 
  sample_n(1)

with(dta, plot(mathscr ~ readscr, 
                xlab = "average math score", 
                ylab = "average reading score"))

3

##  [1] "cesd"   "female" "i1"     "i2"     "id"     "treat"  "f1a"   
##  [8] "f1b"    "f1c"    "f1d"    "f1e"    "f1f"    "f1g"    "f1h"   
## [15] "f1i"    "f1j"    "f1k"    "f1l"    "f1m"    "f1n"    "f1o"   
## [22] "f1p"    "f1q"    "f1r"    "f1s"    "f1t"
## 'data.frame':    453 obs. of  10 variables:
##  $ cesd  : int  49 30 39 15 39 6 52 32 50 46 ...
##  $ female: int  0 0 0 1 0 1 1 0 1 0 ...
##  $ i1    : int  13 56 0 5 10 4 13 12 71 20 ...
##  $ i2    : int  26 62 0 5 13 4 20 24 129 27 ...
##  $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ treat : int  1 1 0 0 0 1 0 1 0 1 ...
##  $ f1a   : int  3 3 3 0 3 1 3 1 3 2 ...
##  $ f1b   : int  2 2 2 0 0 0 1 1 2 3 ...
##  $ f1c   : int  3 0 3 1 3 1 3 2 3 3 ...
##  $ f1d   : int  0 3 0 3 3 3 1 3 1 0 ...
##       cesd          female            i1              i2       
##  Min.   : 1.0   Min.   :0.000   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:25.0   1st Qu.:0.000   1st Qu.:  3.0   1st Qu.:  3.0  
##  Median :34.0   Median :0.000   Median : 13.0   Median : 15.0  
##  Mean   :32.8   Mean   :0.236   Mean   : 17.9   Mean   : 22.6  
##  3rd Qu.:41.0   3rd Qu.:0.000   3rd Qu.: 26.0   3rd Qu.: 32.0  
##  Max.   :60.0   Max.   :1.000   Max.   :142.0   Max.   :184.0  
##        id          treat            f1a            f1b      
##  Min.   :  1   Min.   :0.000   Min.   :0.00   Min.   :0.00  
##  1st Qu.:119   1st Qu.:0.000   1st Qu.:1.00   1st Qu.:0.00  
##  Median :233   Median :0.000   Median :2.00   Median :1.00  
##  Mean   :233   Mean   :0.497   Mean   :1.63   Mean   :1.39  
##  3rd Qu.:348   3rd Qu.:1.000   3rd Qu.:3.00   3rd Qu.:2.00  
##  Max.   :470   Max.   :1.000   Max.   :3.00   Max.   :3.00  
##       f1c            f1d      
##  Min.   :0.00   Min.   :0.00  
##  1st Qu.:1.00   1st Qu.:0.00  
##  Median :2.00   Median :1.00  
##  Mean   :1.92   Mean   :1.56  
##  3rd Qu.:3.00   3rd Qu.:3.00  
##  Max.   :3.00   Max.   :3.00
##   cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1   49      0 13 26  1     1   3   2   3   0   2   3   3   0   2   3
## 2   30      0 56 62  2     1   3   2   0   3   3   2   0   0   3   0
## 3   39      0  0  0  3     0   3   2   3   0   2   2   1   3   2   3
##   f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1   3   0   1   2   2   2   2   3   3   2
## 2   3   0   0   3   0   0   0   2   0   0
## 3   1   0   1   3   2   0   0   3   2   0
## [1] "HELP baseline dataset"
##  [1] 49 30 39 15 39  6 52 32 50 46
##  [1] 49 30 39 15 39  6 52 32 50 46
## [1] 57 58 57 60 58 58 57
##    id cesd
## 1  71   57
## 2 127   58
## 3 200   57
## 4 228   60
## 5 273   58
## 6 351   58
## 7  13   57
## [1] 1 3 3 4
## [1] 199
## Warning: package 'mosaic' was built under R version 3.4.4
## is.na(f1g)
##  TRUE FALSE 
##     1   452
##  min Q1 median Q3 max mean  sd   n missing
##    0  1      2  3   3 1.73 1.1 452       1
##     newcesd newds.cesd nmisscesd imputemeancesd
## 4        15         15         1           15.8
## 17       19         19         1           20.0
## 87       44         44         1           46.3
## 101      17         17         1           17.9
## 154      29         29         1           30.5
## 177      44         44         1           46.3
## 229      39         39         1           41.1
## Warning: package 'memisc' was built under R version 3.4.4
## Warning: package 'MASS' was built under R version 3.4.4
##     i1 i2 female drinkstat
## 365  6 24      0  highrisk
## 366  6  6      0  highrisk
## 367  0  0      0 abstinent
## 368  0  0      1 abstinent
## 369  8  8      0  highrisk
## 370 32 32      0  highrisk
##   i1 i2 female drinkstat
## 1  1  1      1  moderate
## 2  1  3      1  moderate
## 3  1  2      1  moderate
## 4  1  1      1  moderate
## 5  1  1      1  moderate
## 6  1  1      1  moderate
## 7  1  1      1  moderate
## Warning: package 'gmodels' was built under R version 3.4.4
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##           | abstinent |  moderate |  highrisk | 
##           |-----------|-----------|-----------|
##           |        68 |        28 |       357 | 
##           |     0.150 |     0.062 |     0.788 | 
##           |-----------|-----------|-----------|
## 
## 
## 
## 
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##              | female 
##    drinkstat |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##    abstinent |        42 |        26 |        68 | 
##              |     0.618 |     0.382 |     0.150 | 
## -------------|-----------|-----------|-----------|
##     moderate |        21 |         7 |        28 | 
##              |     0.750 |     0.250 |     0.062 | 
## -------------|-----------|-----------|-----------|
##     highrisk |       283 |        74 |       357 | 
##              |     0.793 |     0.207 |     0.788 | 
## -------------|-----------|-----------|-----------|
## Column Total |       346 |       107 |       453 | 
## -------------|-----------|-----------|-----------|
## 
## 
##       gender
## female Male Female
##      0  346      0
##      1    0    107
##   cesd i1  id
## 1    1  3 233
## 2    3  1 139
## 3    3 13 418
## 4    4  4 251
## 5    4  9  95
## [1] 36.9
## [1] 36.9
##    0    1 
## 31.6 36.9
##    0    1 
## 31.6 36.9

4

head(dta4 <- backpain)
##   ID  status driver suburban
## 1  1    case    yes      yes
## 2  1 control    yes       no
## 3  2    case    yes      yes
## 4  2 control    yes      yes
## 5  3    case    yes       no
## 6  3 control    yes      yes
dta <- backpain %>% 
  group_by(status, driver, suburban) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  spread(status, n) %>% 
  mutate(total = case + control); dta
## # A tibble: 4 x 5
##   driver suburban  case control total
##   <fct>  <fct>    <int>   <int> <int>
## 1 no     no          26      47    73
## 2 no     yes          6       7    13
## 3 yes    no          64      63   127
## 4 yes    yes        121     100   221

5

library(datasets)
dta <- merge(state.x77, USArrests, "row.names")
cor(dta[, -1])
##            Population  Income Illiteracy Life Exp Murder.x HS Grad
## Population     1.0000  0.2082     0.1076  -0.0681   0.3436 -0.0985
## Income         0.2082  1.0000    -0.4371   0.3403  -0.2301  0.6199
## Illiteracy     0.1076 -0.4371     1.0000  -0.5885   0.7030 -0.6572
## Life Exp      -0.0681  0.3403    -0.5885   1.0000  -0.7808  0.5822
## Murder.x       0.3436 -0.2301     0.7030  -0.7808   1.0000 -0.4880
## HS Grad       -0.0985  0.6199    -0.6572   0.5822  -0.4880  1.0000
## Frost         -0.3322  0.2263    -0.6719   0.2621  -0.5389  0.3668
## Area           0.0225  0.3633     0.0773  -0.1073   0.2284  0.3335
## Murder.y       0.3202 -0.2152     0.7068  -0.7785   0.9337 -0.5216
## Assault        0.3170  0.0409     0.5110  -0.6267   0.7398 -0.2303
## UrbanPop       0.5126  0.4805    -0.0622   0.2715   0.0164  0.3587
## Rape           0.3052  0.3574     0.1546  -0.2696   0.5800  0.2707
##              Frost    Area Murder.y Assault UrbanPop   Rape
## Population -0.3322  0.0225   0.3202  0.3170   0.5126  0.305
## Income      0.2263  0.3633  -0.2152  0.0409   0.4805  0.357
## Illiteracy -0.6719  0.0773   0.7068  0.5110  -0.0622  0.155
## Life Exp    0.2621 -0.1073  -0.7785 -0.6267   0.2715 -0.270
## Murder.x   -0.5389  0.2284   0.9337  0.7398   0.0164  0.580
## HS Grad     0.3668  0.3335  -0.5216 -0.2303   0.3587  0.271
## Frost       1.0000  0.0592  -0.5414 -0.4682  -0.2462 -0.279
## Area        0.0592  1.0000   0.1481  0.2312  -0.0615  0.525
## Murder.y   -0.5414  0.1481   1.0000  0.8019   0.0696  0.564
## Assault    -0.4682  0.2312   0.8019  1.0000   0.2589  0.665
## UrbanPop   -0.2462 -0.0615   0.0696  0.2589   1.0000  0.411
## Rape       -0.2792  0.5250   0.5636  0.6652   0.4113  1.000
ggpairs(dta[, -1])

NO interesting

6

library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:Ecdat':
## 
##     SP500
## The following object is masked from 'package:dplyr':
## 
##     select
dta <- merge(rownames_to_column(mammals), rownames_to_column(Animals), all = TRUE)
duplicated(dta)
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE
dim(dta)
## [1] 67  3

7

dta7 <- Caschool %>% 
  mutate(ratio = enrltot/teachers, 
         Reading = cut(readscr, breaks = quantile(readscr, probs = c(0, .33, .67, 1)), 
                   label = c("L", "M", "H"), ordered = T))

library(lattice)
xyplot(readscr ~ ratio | Reading, data = dta7,
                type = c("p", "g", "r"), layout = c(3, 1),
                xlab = "Student-Teacher Ratio",
                ylab = "Reading Score")

8.1

dta <- Prestige %>% 
  group_by(type) %>% 
  summarize(m = median(prestige, na.rm = TRUE)); dta
## # A tibble: 4 x 2
##   type      m
##   <fct> <dbl>
## 1 bc     35.9
## 2 prof   68.4
## 3 wc     41.5
## 4 <NA>   35.0

8.2

dta8.2 <- Prestige %>% 
  na.omit %>% 
  group_by(type) %>% 
  mutate(m = median(prestige),
         PrestigeLevel = memisc::cases("High" = prestige >= m,
                                       "Low" = prestige < m))
xyplot(income ~ education |type, 
       group = PrestigeLevel, 
       data = dta8.2, 
       type = c("p", "g", "r"),
       layout = c(3, 1),
       auto.key=list(space="right", row=2),
       xlab = "Average education of occupational incumbents in 1971",
       ylab = "Average income of incumbents in 1971")

三種職業的聲望高低,對於教育程度與收入之間關係的效果不同。

9

dta9 <- Hsb82 %>%
  group_by(sector, school) %>%
  summarise(m = mean(mAch, na.rm = TRUE),
            s = sd(mAch, na.rm = TRUE),
            n = n(),
            se = s/sqrt(n)) %>%
  mutate(lower.ci = m-2*se, upper.ci = m+2*se)

ggplot(dta9, aes(x = school, y = m)) +
  geom_point() +
  geom_errorbar(aes(ymax = upper.ci, ymin = lower.ci))+
  coord_flip()+
  facet_wrap(~sector)+
  labs(x = "School",y = "Math Mean Score by School")