Homework 0326

Yifang

2018-03-29

2

#讀進資料&看資料
dt2<-Ecdat::Caschool
knitr::kable(head(dt2))
distcod county district grspan enrltot teachers calwpct mealpct computer testscr compstu expnstu str avginc elpct readscr mathscr
75119 Alameda Sunol Glen Unified KK-08 195 10.90 0.5102 2.0408 67 690.80 0.3435898 6384.911 17.88991 22.690001 0.000000 691.6 690.0
61499 Butte Manzanita Elementary KK-08 240 11.15 15.4167 47.9167 101 661.20 0.4208333 5099.381 21.52466 9.824000 4.583334 660.5 661.9
61549 Butte Thermalito Union Elementary KK-08 1550 82.90 55.0323 76.3226 169 643.60 0.1090323 5501.955 18.69723 8.978000 30.000002 636.3 650.9
61457 Butte Golden Feather Union Elementary KK-08 243 14.00 36.4754 77.0492 85 647.70 0.3497942 7101.831 17.35714 8.978000 0.000000 651.9 643.5
61523 Butte Palermo Union Elementary KK-08 1335 71.50 33.1086 78.4270 171 640.85 0.1280899 5235.988 18.67133 9.080333 13.857677 641.8 639.9
62042 Fresno Burrel Union Elementary KK-08 137 6.40 12.3188 86.9565 25 605.55 0.1824818 5580.147 21.40625 10.415000 12.408759 605.7 605.4
#設亂數種子,每個國家抽一個,然後畫散布圖
set.seed(1234567)
dt2.1 <- dt2 %>% group_by(county) %>% sample_n(1) 
ggplot(dt2.1, aes(x=mathscr,y=readscr)) + geom_point()

3

## ----echo=FALSE,eval=TRUE------------------------------------------------
options(continue="  ")

## ------------------------------------------------------------------------
options(digits=3) #設定小數位數為3
options(width=72) # narrow output 
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv") #讀進資料
library(dplyr) #載入dplyr packages
#選擇要載入的變項
newds = select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, 
  f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t) 
## ------------------------------------------------------------------------
names(newds) #看變項的名稱
##  [1] "cesd"   "female" "i1"     "i2"     "id"     "treat"  "f1a"   
##  [8] "f1b"    "f1c"    "f1d"    "f1e"    "f1f"    "f1g"    "f1h"   
## [15] "f1i"    "f1j"    "f1k"    "f1l"    "f1m"    "f1n"    "f1o"   
## [22] "f1p"    "f1q"    "f1r"    "f1s"    "f1t"
str(newds[,1:10]) # structure of the first 10 variables
## 'data.frame':    453 obs. of  10 variables:
##  $ cesd  : int  49 30 39 15 39 6 52 32 50 46 ...
##  $ female: int  0 0 0 1 0 1 1 0 1 0 ...
##  $ i1    : int  13 56 0 5 10 4 13 12 71 20 ...
##  $ i2    : int  26 62 0 5 13 4 20 24 129 27 ...
##  $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ treat : int  1 1 0 0 0 1 0 1 0 1 ...
##  $ f1a   : int  3 3 3 0 3 1 3 1 3 2 ...
##  $ f1b   : int  2 2 2 0 0 0 1 1 2 3 ...
##  $ f1c   : int  3 0 3 1 3 1 3 2 3 3 ...
##  $ f1d   : int  0 3 0 3 3 3 1 3 1 0 ...
## ------------------------------------------------------------------------
summary(newds[,1:10]) # summary of the first 10 variables
##       cesd          female            i1              i2       
##  Min.   : 1.0   Min.   :0.000   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:25.0   1st Qu.:0.000   1st Qu.:  3.0   1st Qu.:  3.0  
##  Median :34.0   Median :0.000   Median : 13.0   Median : 15.0  
##  Mean   :32.8   Mean   :0.236   Mean   : 17.9   Mean   : 22.6  
##  3rd Qu.:41.0   3rd Qu.:0.000   3rd Qu.: 26.0   3rd Qu.: 32.0  
##  Max.   :60.0   Max.   :1.000   Max.   :142.0   Max.   :184.0  
##        id          treat            f1a            f1b      
##  Min.   :  1   Min.   :0.000   Min.   :0.00   Min.   :0.00  
##  1st Qu.:119   1st Qu.:0.000   1st Qu.:1.00   1st Qu.:0.00  
##  Median :233   Median :0.000   Median :2.00   Median :1.00  
##  Mean   :233   Mean   :0.497   Mean   :1.63   Mean   :1.39  
##  3rd Qu.:348   3rd Qu.:1.000   3rd Qu.:3.00   3rd Qu.:2.00  
##  Max.   :470   Max.   :1.000   Max.   :3.00   Max.   :3.00  
##       f1c            f1d      
##  Min.   :0.00   Min.   :0.00  
##  1st Qu.:1.00   1st Qu.:0.00  
##  Median :2.00   Median :1.00  
##  Mean   :1.92   Mean   :1.56  
##  3rd Qu.:3.00   3rd Qu.:3.00  
##  Max.   :3.00   Max.   :3.00
## ------------------------------------------------------------------------
head(newds, n=3) #看前三筆資料
##   cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1   49      0 13 26  1     1   3   2   3   0   2   3   3   0   2   3
## 2   30      0 56 62  2     1   3   2   0   3   3   2   0   0   3   0
## 3   39      0  0  0  3     0   3   2   3   0   2   2   1   3   2   3
##   f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1   3   0   1   2   2   2   2   3   3   2
## 2   3   0   0   3   0   0   0   2   0   0
## 3   1   0   1   3   2   0   0   3   2   0
## ------------------------------------------------------------------------
comment(newds) = "HELP baseline dataset"
comment(newds) #看設置的comment
## [1] "HELP baseline dataset"
save(ds, file="savedfile") #把檔案存檔

## ------------------------------------------------------------------------
write.csv(ds, file="ds.csv") #存成.csv檔

## ------------------------------------------------------------------------
library(foreign) #載入package
write.foreign(newds, "file.dat", "file.sas", package="SAS") #把檔案存成SAS

## ------------------------------------------------------------------------
with(newds, cesd[1:10]) #看cesd的前十個數值是多少
##  [1] 49 30 39 15 39  6 52 32 50 46
with(newds, head(cesd, 10)) #看cesd的前十個數值是多少
##  [1] 49 30 39 15 39  6 52 32 50 46
## ------------------------------------------------------------------------
with(newds, cesd[cesd > 56]) #找出cesd大於56的數值有多少
## [1] 57 58 57 60 58 58 57
## ------------------------------------------------------------------------
library(dplyr) #載入package
filter(newds, cesd > 56) %>% select(id, cesd) #挑出cesd>56的id編號
##    id cesd
## 1  71   57
## 2 127   58
## 3 200   57
## 4 228   60
## 5 273   58
## 6 351   58
## 7  13   57
## ------------------------------------------------------------------------
with(newds, sort(cesd)[1:4]) #把cesd由小排到大之後看前四筆
## [1] 1 3 3 4
with(newds, which.min(cesd)) #找到cesd的最小值排在第幾列
## [1] 199
## ------------------------------------------------------------------------
library(mosaic) #載入package
## Loading required package: lattice
## Loading required package: ggformula
## 
## New to ggformula?  Try the tutorials: 
##  learnr::run_tutorial("introduction", package = "ggformula")
##  learnr::run_tutorial("refining", package = "ggformula")
## Loading required package: mosaicData
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median,
##     prop.test, quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
tally(~ is.na(f1g), data=newds) #檢查flg的遺漏值,轉換成tbl
## is.na(f1g)
##  TRUE FALSE 
##     1   452
favstats(~ f1g, data=newds) #對flg做敘述統計報告
##  min Q1 median Q3 max mean  sd   n missing
##    0  1      2  3   3 1.73 1.1 452       1
## ------------------------------------------------------------------------
# reverse code f1d, f1h, f1l and f1p
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g, 
   (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p), 
   f1q, f1r, f1s, f1t)) #將反向題挑出來重新編碼
nmisscesd = apply(is.na(cesditems), 1, sum) #計算NA的數量
ncesditems = cesditems #重新存成另一個資料
ncesditems[is.na(cesditems)] = 0 #讓NA變成0
newcesd = apply(ncesditems, 1, sum) #計算沒有遺漏值之後的平均
imputemeancesd = 20/(20-nmisscesd)*newcesd #用平均來代替NA

##把這些東西列成一個表格
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,] 
##     newcesd newds.cesd nmisscesd imputemeancesd
## 4        15         15         1           15.8
## 17       19         19         1           20.0
## 87       44         44         1           46.3
## 101      17         17         1           17.9
## 154      29         29         1           30.5
## 177      44         44         1           46.3
## 229      39         39         1           41.1
## ----createdrink,message=FALSE-------------------------------------------
library(dplyr) #載入package
library(memisc) #載入package
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Attaching package: 'memisc'
## The following object is masked from 'package:Matrix':
## 
##     as.array
## The following objects are masked from 'package:dplyr':
## 
##     collect, recode, rename
## The following objects are masked from 'package:stats':
## 
##     contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
## 
##     as.array
newds = mutate(newds, drinkstat= 
  cases(
    "abstinent" = i1==0,
    "moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
               (i1>0 & i1<=2 & i2<=4 & female==0),
    "highrisk" = ((i1>1 | i2>3) & female==1) |
               ((i1>2 | i2>4) & female==0)))
#把不同數值的資料給他一個類別的標籤
## ----echo=FALSE----------------------------------------------------------
library(mosaic) #載入package

##解除這兩個package
detach(package:memisc)
detach(package:MASS)

## ------------------------------------------------------------------------
library(dplyr) #載入package
tmpds = select(newds, i1, i2, female, drinkstat) #挑出需要的變項
tmpds[365:370,] #看365到370筆資料
##     i1 i2 female drinkstat
## 365  6 24      0  highrisk
## 366  6  6      0  highrisk
## 367  0  0      0 abstinent
## 368  0  0      1 abstinent
## 369  8  8      0  highrisk
## 370 32 32      0  highrisk
filter(tmpds, drinkstat=="moderate" & female==1) #找出喝酒程度是moderate的女性
##   i1 i2 female drinkstat
## 1  1  1      1  moderate
## 2  1  3      1  moderate
## 3  1  2      1  moderate
## 4  1  1      1  moderate
## 5  1  1      1  moderate
## 6  1  1      1  moderate
## 7  1  1      1  moderate
library(gmodels) #載入package
with(tmpds, CrossTable(drinkstat)) #看次數分配表
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##           | abstinent |  moderate |  highrisk | 
##           |-----------|-----------|-----------|
##           |        68 |        28 |       357 | 
##           |     0.150 |     0.062 |     0.788 | 
##           |-----------|-----------|-----------|
## 
## 
## 
## 
with(tmpds, CrossTable(drinkstat, female, 
  prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE)) #橫列是喝酒程度、直欄是性別
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##              | female 
##    drinkstat |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##    abstinent |        42 |        26 |        68 | 
##              |     0.618 |     0.382 |     0.150 | 
## -------------|-----------|-----------|-----------|
##     moderate |        21 |         7 |        28 | 
##              |     0.750 |     0.250 |     0.062 | 
## -------------|-----------|-----------|-----------|
##     highrisk |       283 |        74 |       357 | 
##              |     0.793 |     0.207 |     0.788 | 
## -------------|-----------|-----------|-----------|
## Column Total |       346 |       107 |       453 | 
## -------------|-----------|-----------|-----------|
## 
## 
##把性別的0,1對應male與female的標籤,創一個新變項
newds = transform(newds, 
  gender=factor(female, c(0,1), c("Male","Female")))
tally(~ female + gender, margin=FALSE, data=newds)
##       gender
## female Male Female
##      0  346      0
##      1    0    107
newds = arrange(ds, cesd, i1) #排序
newds[1:5, c("cesd", "i1", "id")] #顯示1-5筆
##   cesd i1  id
## 1    1  3 233
## 2    3  1 139
## 3    3 13 418
## 4    4  4 251
## 5    4  9  95
females = filter(ds, female==1) #選出ds中的女性
with(females, mean(cesd)) #計算女性的平均cesd
## [1] 36.9
# an alternative approach
mean(ds$cesd[ds$female==1])  #計算女性的平均cesd
## [1] 36.9
## ------------------------------------------------------------------------
with(ds, tapply(cesd, female, mean)) #計算平均值
##    0    1 
## 31.6 36.9
library(mosaic) #載入package
mean(cesd ~ female, data=ds) #用~右邊的東西來界定根據甚麼計算平均
##    0    1 
## 31.6 36.9

4

read in the data and see

dt4<-HSAUR3::backpain
head(dt4) %>% knitr::kable()
ID status driver suburban
1 case yes yes
1 control yes no
2 case yes yes
2 control yes yes
3 case yes no
3 control yes yes

先把資料的列連表弄出來,然後把它們變成data.frame後拆開,再新增total一欄

with(dt4, ftable(driver, suburban,status)) %>% as.data.frame() %>% tidyr::spread(status,Freq) %>% mutate(total=case+control) 
##   driver suburban case control total
## 1     no       no   26      47    73
## 2     no      yes    6       7    13
## 3    yes       no   64      63   127
## 4    yes      yes  121     100   221

5

讀進資料後合併

dt51<-state.x77
dt52<-USArrests
dt5<-merge(dt51,dt52)
ggpairs(dt5[, -1])

不過我沒有看出有什麼有趣的事情.. # 6

讀進資料後合併起來,看看有沒有重複,發現沒有

dt61<-MASS::Animals
dt62<-MASS::mammals
dt6<-merge(dt61,dt62,"row.names")
head(dt6)
##          Row.names body.x brain.x body.y brain.y
## 1 African elephant 6654.0  5712.0 6654.0  5712.0
## 2   Asian elephant 2547.0  4603.0 2547.0  4603.0
## 3              Cat    3.3    25.6    3.3    25.6
## 4       Chimpanzee   52.2   440.0   52.2   440.0
## 5              Cow  465.0   423.0  465.0   423.0
## 6           Donkey  187.1   419.0  187.1   419.0
duplicated(dt6)
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE

7

讀進資料後新增新的變項

dt7<-Ecdat::Caschool
dt7<- dt7 %>% mutate(ratio=round(enrltot/teachers,2),
                     reading_cut=cut(readscr, breaks = quantile(readscr, probs = c(0,.33,.67,1)),
                                     label = c("L","M","H")))
#看一下資料的樣子
head(dt7)
##   distcod  county                        district grspan enrltot
## 1   75119 Alameda              Sunol Glen Unified  KK-08     195
## 2   61499   Butte            Manzanita Elementary  KK-08     240
## 3   61549   Butte     Thermalito Union Elementary  KK-08    1550
## 4   61457   Butte Golden Feather Union Elementary  KK-08     243
## 5   61523   Butte        Palermo Union Elementary  KK-08    1335
## 6   62042  Fresno         Burrel Union Elementary  KK-08     137
##   teachers calwpct mealpct computer testscr compstu expnstu  str avginc
## 1     10.9    0.51    2.04       67     691   0.344    6385 17.9  22.69
## 2     11.1   15.42   47.92      101     661   0.421    5099 21.5   9.82
## 3     82.9   55.03   76.32      169     644   0.109    5502 18.7   8.98
## 4     14.0   36.48   77.05       85     648   0.350    7102 17.4   8.98
## 5     71.5   33.11   78.43      171     641   0.128    5236 18.7   9.08
## 6      6.4   12.32   86.96       25     606   0.182    5580 21.4  10.41
##   elpct readscr mathscr ratio reading_cut
## 1  0.00     692     690  17.9           H
## 2  4.58     660     662  21.5           M
## 3 30.00     636     651  18.7           L
## 4  0.00     652     644  17.4           M
## 5 13.86     642     640  18.7           L
## 6 12.41     606     605  21.4           L

畫圖

lattice::xyplot(readscr ~ ratio | reading_cut, data = dt7,
       type = c("p", "g", "r"), layout = c(3, 1),
       xlab = "Student-Teacher Ratio",
       ylab = "Reading Score")

8

read in the data and see

dt8<-car::Prestige
knitr::kable(head(dt8))
education income women prestige census type
gov.administrators 13.1 12351 11.16 68.8 1113 prof
general.managers 12.3 25879 4.02 69.1 1130 prof
accountants 12.8 9271 15.70 63.4 1171 prof
purchasing.officers 11.4 8865 9.11 56.8 1175 prof
chemists 14.6 8403 11.68 73.5 2111 prof
physicists 15.6 11030 5.13 77.6 2113 prof

Find the median prestige score for each of the three types of occupation, respectively.

dt8 %>% na.omit() %>% group_by(type) %>% summarise(median(prestige))
## # A tibble: 3 x 2
##   type  `median(prestige)`
##   <fct>              <dbl>
## 1 bc                  35.9
## 2 prof                68.4
## 3 wc                  41.5

The relationship between income and education for each category

dt8 %>% filter(type=="bc") %>% 
  mutate(type_bc = cut(prestige, breaks = c(0,35.9,100),
                       label = c("L", "H"),ordered = T)) %>% 
  ggplot(data = ., aes(x = education, y = income,color=type_bc)) +
  geom_point() +  geom_line(aes(group = type_bc)) 

在type=bc的組別中,prestige比較高的income也比較高、education也比較高

dt8 %>% filter(type=="prof") %>% 
  mutate(type_pr = cut(prestige, breaks = c(0,68.4,100),
                       label = c("L", "H"),ordered = T)) %>% 
  ggplot(data = ., aes(x = education, y = income,color=type_pr)) +
  geom_point() +  geom_line(aes(group = type_pr))

在type=prof的組別中,prestige比較高的組別income的變化比較大、education比較高。

dt8 %>% filter(type=="wc") %>% 
  mutate(type_wc = cut(prestige, breaks = c(0,41.5,100),
                       label = c("L", "H"),ordered = T)) %>% 
  ggplot(data = ., aes(x = education, y = income,color=type_wc)) +
  geom_point() +  geom_line(aes(group = type_wc))

在type=wc的組別中,prestige比較高的組別education也比較高,但income看起來無太大差異

9

read the data and see

dt9<-mlmRev::Hsb82 
dt9$school <- factor(dt9$school) 
knitr::kable(head(dt9))
school minrty sx ses mAch meanses sector cses
1224 No Female -1.528 5.88 -0.434 Public -1.094
1224 No Female -0.588 19.71 -0.434 Public -0.154
1224 No Male -0.528 20.35 -0.434 Public -0.094
1224 No Male -0.668 8.78 -0.434 Public -0.234
1224 No Male -0.158 17.90 -0.434 Public 0.276
1224 No Male 0.022 4.58 -0.434 Public 0.456
dt9 %>% group_by(school) %>% summarise(mathsum=mean(mAch))
## # A tibble: 160 x 2
##    school mathsum
##    <ord>    <dbl>
##  1 8367      4.55
##  2 8854      4.24
##  3 4458      5.81
##  4 5762      4.32
##  5 6990      5.98
##  6 5815      7.27
##  7 7172      8.07
##  8 4868     12.3 
##  9 7341      9.79
## 10 1358     11.2 
## # ... with 150 more rows

plot the confidence interval

dt9 %>%
  group_by(school) %>%
  summarize(math_m=mean(mAch, na.rm = T),
            math_se=sd(mAch, na.rm = T)/sqrt(n())) %>%
  ggplot(data = ., aes(x = school, y = math_m)) +
  geom_point() +
  geom_line(aes(group = school)) +
  geom_errorbar(aes(ymin = math_m - 2*math_se, ymax = math_m + 2*math_se), width = .1) + 
  labs(x = "school", y = "Average Math score") +
  theme_bw() 
## geom_path: Each group consists of only one observation. Do you need
## to adjust the group aesthetic?

calculate the confidence interval

dt9 %>%
  group_by(school) %>%
  summarize(math_m=mean(mAch, na.rm = T),
            math_se=sd(mAch, na.rm = T)/sqrt(n())) %>%
  mutate(math_ci_upper=math_m - 2*math_se,
         math_ci_lower=math_m + 2*math_se) 
## # A tibble: 160 x 5
##    school math_m math_se math_ci_upper math_ci_lower
##    <ord>   <dbl>   <dbl>         <dbl>         <dbl>
##  1 8367     4.55   1.18           2.19          6.92
##  2 8854     4.24   0.956          2.33          6.15
##  3 4458     5.81   0.646          4.52          7.10
##  4 5762     4.32   0.821          2.68          5.97
##  5 6990     5.98   0.729          4.52          7.44
##  6 5815     7.27   1.12           5.04          9.51
##  7 7172     8.07   0.846          6.38          9.76
##  8 4868    12.3    0.932         10.4          14.2 
##  9 7341     9.79   0.757          8.28         11.3 
## 10 1358    11.2    1.07           9.06         13.4 
## # ... with 150 more rows