2

#讀進資料&看資料
dt2<-Ecdat::Caschool
knitr::kable(head(dt2))

distcod	county	district	grspan	enrltot	teachers	calwpct	mealpct	computer	testscr	compstu	expnstu	str	avginc	elpct	readscr	mathscr
75119	Alameda	Sunol Glen Unified	KK-08	195	10.90	0.5102	2.0408	67	690.80	0.3435898	6384.911	17.88991	22.690001	0.000000	691.6	690.0
61499	Butte	Manzanita Elementary	KK-08	240	11.15	15.4167	47.9167	101	661.20	0.4208333	5099.381	21.52466	9.824000	4.583334	660.5	661.9
61549	Butte	Thermalito Union Elementary	KK-08	1550	82.90	55.0323	76.3226	169	643.60	0.1090323	5501.955	18.69723	8.978000	30.000002	636.3	650.9
61457	Butte	Golden Feather Union Elementary	KK-08	243	14.00	36.4754	77.0492	85	647.70	0.3497942	7101.831	17.35714	8.978000	0.000000	651.9	643.5
61523	Butte	Palermo Union Elementary	KK-08	1335	71.50	33.1086	78.4270	171	640.85	0.1280899	5235.988	18.67133	9.080333	13.857677	641.8	639.9
62042	Fresno	Burrel Union Elementary	KK-08	137	6.40	12.3188	86.9565	25	605.55	0.1824818	5580.147	21.40625	10.415000	12.408759	605.7	605.4

#設亂數種子，每個國家抽一個，然後畫散布圖
set.seed(1234567)
dt2.1 <- dt2 %>% group_by(county) %>% sample_n(1) 
ggplot(dt2.1, aes(x=mathscr,y=readscr)) + geom_point()

3

## ----echo=FALSE,eval=TRUE------------------------------------------------
options(continue="  ")

## ------------------------------------------------------------------------
options(digits=3) #設定小數位數為3
options(width=72) # narrow output 
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv") #讀進資料
library(dplyr) #載入dplyr packages
#選擇要載入的變項
newds = select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, 
  f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t) 
## ------------------------------------------------------------------------
names(newds) #看變項的名稱

##  [1] "cesd"   "female" "i1"     "i2"     "id"     "treat"  "f1a"   
##  [8] "f1b"    "f1c"    "f1d"    "f1e"    "f1f"    "f1g"    "f1h"   
## [15] "f1i"    "f1j"    "f1k"    "f1l"    "f1m"    "f1n"    "f1o"   
## [22] "f1p"    "f1q"    "f1r"    "f1s"    "f1t"

str(newds[,1:10]) # structure of the first 10 variables

## 'data.frame':    453 obs. of  10 variables:
##  $ cesd  : int  49 30 39 15 39 6 52 32 50 46 ...
##  $ female: int  0 0 0 1 0 1 1 0 1 0 ...
##  $ i1    : int  13 56 0 5 10 4 13 12 71 20 ...
##  $ i2    : int  26 62 0 5 13 4 20 24 129 27 ...
##  $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ treat : int  1 1 0 0 0 1 0 1 0 1 ...
##  $ f1a   : int  3 3 3 0 3 1 3 1 3 2 ...
##  $ f1b   : int  2 2 2 0 0 0 1 1 2 3 ...
##  $ f1c   : int  3 0 3 1 3 1 3 2 3 3 ...
##  $ f1d   : int  0 3 0 3 3 3 1 3 1 0 ...

## ------------------------------------------------------------------------
summary(newds[,1:10]) # summary of the first 10 variables

##       cesd          female            i1              i2       
##  Min.   : 1.0   Min.   :0.000   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:25.0   1st Qu.:0.000   1st Qu.:  3.0   1st Qu.:  3.0  
##  Median :34.0   Median :0.000   Median : 13.0   Median : 15.0  
##  Mean   :32.8   Mean   :0.236   Mean   : 17.9   Mean   : 22.6  
##  3rd Qu.:41.0   3rd Qu.:0.000   3rd Qu.: 26.0   3rd Qu.: 32.0  
##  Max.   :60.0   Max.   :1.000   Max.   :142.0   Max.   :184.0  
##        id          treat            f1a            f1b      
##  Min.   :  1   Min.   :0.000   Min.   :0.00   Min.   :0.00  
##  1st Qu.:119   1st Qu.:0.000   1st Qu.:1.00   1st Qu.:0.00  
##  Median :233   Median :0.000   Median :2.00   Median :1.00  
##  Mean   :233   Mean   :0.497   Mean   :1.63   Mean   :1.39  
##  3rd Qu.:348   3rd Qu.:1.000   3rd Qu.:3.00   3rd Qu.:2.00  
##  Max.   :470   Max.   :1.000   Max.   :3.00   Max.   :3.00  
##       f1c            f1d      
##  Min.   :0.00   Min.   :0.00  
##  1st Qu.:1.00   1st Qu.:0.00  
##  Median :2.00   Median :1.00  
##  Mean   :1.92   Mean   :1.56  
##  3rd Qu.:3.00   3rd Qu.:3.00  
##  Max.   :3.00   Max.   :3.00

## ------------------------------------------------------------------------
head(newds, n=3) #看前三筆資料

##   cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1   49      0 13 26  1     1   3   2   3   0   2   3   3   0   2   3
## 2   30      0 56 62  2     1   3   2   0   3   3   2   0   0   3   0
## 3   39      0  0  0  3     0   3   2   3   0   2   2   1   3   2   3
##   f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1   3   0   1   2   2   2   2   3   3   2
## 2   3   0   0   3   0   0   0   2   0   0
## 3   1   0   1   3   2   0   0   3   2   0

## ------------------------------------------------------------------------
comment(newds) = "HELP baseline dataset"
comment(newds) #看設置的comment

## [1] "HELP baseline dataset"

save(ds, file="savedfile") #把檔案存檔

## ------------------------------------------------------------------------
write.csv(ds, file="ds.csv") #存成.csv檔

## ------------------------------------------------------------------------
library(foreign) #載入package
write.foreign(newds, "file.dat", "file.sas", package="SAS") #把檔案存成SAS

## ------------------------------------------------------------------------
with(newds, cesd[1:10]) #看cesd的前十個數值是多少

##  [1] 49 30 39 15 39  6 52 32 50 46

with(newds, head(cesd, 10)) #看cesd的前十個數值是多少

##  [1] 49 30 39 15 39  6 52 32 50 46

## ------------------------------------------------------------------------
with(newds, cesd[cesd > 56]) #找出cesd大於56的數值有多少

## [1] 57 58 57 60 58 58 57

## ------------------------------------------------------------------------
library(dplyr) #載入package
filter(newds, cesd > 56) %>% select(id, cesd) #挑出cesd>56的id編號

##    id cesd
## 1  71   57
## 2 127   58
## 3 200   57
## 4 228   60
## 5 273   58
## 6 351   58
## 7  13   57

## ------------------------------------------------------------------------
with(newds, sort(cesd)[1:4]) #把cesd由小排到大之後看前四筆

## [1] 1 3 3 4

with(newds, which.min(cesd)) #找到cesd的最小值排在第幾列

## [1] 199

## ------------------------------------------------------------------------
library(mosaic) #載入package

## Loading required package: lattice

## Loading required package: ggformula

## 
## New to ggformula?  Try the tutorials: 
##  learnr::run_tutorial("introduction", package = "ggformula")
##  learnr::run_tutorial("refining", package = "ggformula")

## Loading required package: mosaicData

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following object is masked from 'package:tidyr':
## 
##     expand

## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.

## 
## Attaching package: 'mosaic'

## The following object is masked from 'package:Matrix':
## 
##     mean

## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally

## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median,
##     prop.test, quantile, sd, t.test, var

## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum

tally(~ is.na(f1g), data=newds) #檢查flg的遺漏值，轉換成tbl

## is.na(f1g)
##  TRUE FALSE 
##     1   452

favstats(~ f1g, data=newds) #對flg做敘述統計報告

##  min Q1 median Q3 max mean  sd   n missing
##    0  1      2  3   3 1.73 1.1 452       1

## ------------------------------------------------------------------------
# reverse code f1d, f1h, f1l and f1p
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g, 
   (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p), 
   f1q, f1r, f1s, f1t)) #將反向題挑出來重新編碼
nmisscesd = apply(is.na(cesditems), 1, sum) #計算NA的數量
ncesditems = cesditems #重新存成另一個資料
ncesditems[is.na(cesditems)] = 0 #讓NA變成0
newcesd = apply(ncesditems, 1, sum) #計算沒有遺漏值之後的平均
imputemeancesd = 20/(20-nmisscesd)*newcesd #用平均來代替NA

##把這些東西列成一個表格
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]

##     newcesd newds.cesd nmisscesd imputemeancesd
## 4        15         15         1           15.8
## 17       19         19         1           20.0
## 87       44         44         1           46.3
## 101      17         17         1           17.9
## 154      29         29         1           30.5
## 177      44         44         1           46.3
## 229      39         39         1           41.1

## ----createdrink,message=FALSE-------------------------------------------
library(dplyr) #載入package
library(memisc) #載入package

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## 
## Attaching package: 'memisc'

## The following object is masked from 'package:Matrix':
## 
##     as.array

## The following objects are masked from 'package:dplyr':
## 
##     collect, recode, rename

## The following objects are masked from 'package:stats':
## 
##     contr.sum, contr.treatment, contrasts

## The following object is masked from 'package:base':
## 
##     as.array

newds = mutate(newds, drinkstat= 
  cases(
    "abstinent" = i1==0,
    "moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
               (i1>0 & i1<=2 & i2<=4 & female==0),
    "highrisk" = ((i1>1 | i2>3) & female==1) |
               ((i1>2 | i2>4) & female==0)))
#把不同數值的資料給他一個類別的標籤
## ----echo=FALSE----------------------------------------------------------
library(mosaic) #載入package

##解除這兩個package
detach(package:memisc)
detach(package:MASS)

## ------------------------------------------------------------------------
library(dplyr) #載入package
tmpds = select(newds, i1, i2, female, drinkstat) #挑出需要的變項
tmpds[365:370,] #看365到370筆資料

##     i1 i2 female drinkstat
## 365  6 24      0  highrisk
## 366  6  6      0  highrisk
## 367  0  0      0 abstinent
## 368  0  0      1 abstinent
## 369  8  8      0  highrisk
## 370 32 32      0  highrisk

filter(tmpds, drinkstat=="moderate" & female==1) #找出喝酒程度是moderate的女性

##   i1 i2 female drinkstat
## 1  1  1      1  moderate
## 2  1  3      1  moderate
## 3  1  2      1  moderate
## 4  1  1      1  moderate
## 5  1  1      1  moderate
## 6  1  1      1  moderate
## 7  1  1      1  moderate

library(gmodels) #載入package
with(tmpds, CrossTable(drinkstat)) #看次數分配表

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##           | abstinent |  moderate |  highrisk | 
##           |-----------|-----------|-----------|
##           |        68 |        28 |       357 | 
##           |     0.150 |     0.062 |     0.788 | 
##           |-----------|-----------|-----------|
## 
## 
## 
##

with(tmpds, CrossTable(drinkstat, female, 
  prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE)) #橫列是喝酒程度、直欄是性別

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##              | female 
##    drinkstat |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##    abstinent |        42 |        26 |        68 | 
##              |     0.618 |     0.382 |     0.150 | 
## -------------|-----------|-----------|-----------|
##     moderate |        21 |         7 |        28 | 
##              |     0.750 |     0.250 |     0.062 | 
## -------------|-----------|-----------|-----------|
##     highrisk |       283 |        74 |       357 | 
##              |     0.793 |     0.207 |     0.788 | 
## -------------|-----------|-----------|-----------|
## Column Total |       346 |       107 |       453 | 
## -------------|-----------|-----------|-----------|
## 
##

##把性別的0,1對應male與female的標籤，創一個新變項
newds = transform(newds, 
  gender=factor(female, c(0,1), c("Male","Female")))
tally(~ female + gender, margin=FALSE, data=newds)

##       gender
## female Male Female
##      0  346      0
##      1    0    107

newds = arrange(ds, cesd, i1) #排序
newds[1:5, c("cesd", "i1", "id")] #顯示1-5筆

##   cesd i1  id
## 1    1  3 233
## 2    3  1 139
## 3    3 13 418
## 4    4  4 251
## 5    4  9  95

females = filter(ds, female==1) #選出ds中的女性
with(females, mean(cesd)) #計算女性的平均cesd

## [1] 36.9

# an alternative approach
mean(ds$cesd[ds$female==1])  #計算女性的平均cesd

## [1] 36.9

## ------------------------------------------------------------------------
with(ds, tapply(cesd, female, mean)) #計算平均值

##    0    1 
## 31.6 36.9

library(mosaic) #載入package
mean(cesd ~ female, data=ds) #用~右邊的東西來界定根據甚麼計算平均

##    0    1 
## 31.6 36.9

4

read in the data and see

dt4<-HSAUR3::backpain
head(dt4) %>% knitr::kable()

ID	status	driver	suburban
1	case	yes	yes
1	control	yes	no
2	case	yes	yes
2	control	yes	yes
3	case	yes	no
3	control	yes	yes

先把資料的列連表弄出來，然後把它們變成data.frame後拆開，再新增total一欄

with(dt4, ftable(driver, suburban,status)) %>% as.data.frame() %>% tidyr::spread(status,Freq) %>% mutate(total=case+control)

##   driver suburban case control total
## 1     no       no   26      47    73
## 2     no      yes    6       7    13
## 3    yes       no   64      63   127
## 4    yes      yes  121     100   221

5

讀進資料後合併

dt51<-state.x77
dt52<-USArrests
dt5<-merge(dt51,dt52)
ggpairs(dt5[, -1])

不過我沒有看出有什麼有趣的事情.. # 6

讀進資料後合併起來，看看有沒有重複，發現沒有

dt61<-MASS::Animals
dt62<-MASS::mammals
dt6<-merge(dt61,dt62,"row.names")
head(dt6)

##          Row.names body.x brain.x body.y brain.y
## 1 African elephant 6654.0  5712.0 6654.0  5712.0
## 2   Asian elephant 2547.0  4603.0 2547.0  4603.0
## 3              Cat    3.3    25.6    3.3    25.6
## 4       Chimpanzee   52.2   440.0   52.2   440.0
## 5              Cow  465.0   423.0  465.0   423.0
## 6           Donkey  187.1   419.0  187.1   419.0

duplicated(dt6)

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE

7

讀進資料後新增新的變項

dt7<-Ecdat::Caschool
dt7<- dt7 %>% mutate(ratio=round(enrltot/teachers,2),
                     reading_cut=cut(readscr, breaks = quantile(readscr, probs = c(0,.33,.67,1)),
                                     label = c("L","M","H")))
#看一下資料的樣子
head(dt7)

##   distcod  county                        district grspan enrltot
## 1   75119 Alameda              Sunol Glen Unified  KK-08     195
## 2   61499   Butte            Manzanita Elementary  KK-08     240
## 3   61549   Butte     Thermalito Union Elementary  KK-08    1550
## 4   61457   Butte Golden Feather Union Elementary  KK-08     243
## 5   61523   Butte        Palermo Union Elementary  KK-08    1335
## 6   62042  Fresno         Burrel Union Elementary  KK-08     137
##   teachers calwpct mealpct computer testscr compstu expnstu  str avginc
## 1     10.9    0.51    2.04       67     691   0.344    6385 17.9  22.69
## 2     11.1   15.42   47.92      101     661   0.421    5099 21.5   9.82
## 3     82.9   55.03   76.32      169     644   0.109    5502 18.7   8.98
## 4     14.0   36.48   77.05       85     648   0.350    7102 17.4   8.98
## 5     71.5   33.11   78.43      171     641   0.128    5236 18.7   9.08
## 6      6.4   12.32   86.96       25     606   0.182    5580 21.4  10.41
##   elpct readscr mathscr ratio reading_cut
## 1  0.00     692     690  17.9           H
## 2  4.58     660     662  21.5           M
## 3 30.00     636     651  18.7           L
## 4  0.00     652     644  17.4           M
## 5 13.86     642     640  18.7           L
## 6 12.41     606     605  21.4           L

畫圖

lattice::xyplot(readscr ~ ratio | reading_cut, data = dt7,
       type = c("p", "g", "r"), layout = c(3, 1),
       xlab = "Student-Teacher Ratio",
       ylab = "Reading Score")

8

read in the data and see

dt8<-car::Prestige
knitr::kable(head(dt8))

	education	income	women	prestige	census	type
gov.administrators	13.1	12351	11.16	68.8	1113	prof
general.managers	12.3	25879	4.02	69.1	1130	prof
accountants	12.8	9271	15.70	63.4	1171	prof
purchasing.officers	11.4	8865	9.11	56.8	1175	prof
chemists	14.6	8403	11.68	73.5	2111	prof
physicists	15.6	11030	5.13	77.6	2113	prof

Find the median prestige score for each of the three types of occupation, respectively.

dt8 %>% na.omit() %>% group_by(type) %>% summarise(median(prestige))

## # A tibble: 3 x 2
##   type  `median(prestige)`
##   <fct>              <dbl>
## 1 bc                  35.9
## 2 prof                68.4
## 3 wc                  41.5

The relationship between income and education for each category

dt8 %>% filter(type=="bc") %>% 
  mutate(type_bc = cut(prestige, breaks = c(0,35.9,100),
                       label = c("L", "H"),ordered = T)) %>% 
  ggplot(data = ., aes(x = education, y = income,color=type_bc)) +
  geom_point() +  geom_line(aes(group = type_bc))

在type=bc的組別中，prestige比較高的income也比較高、education也比較高

dt8 %>% filter(type=="prof") %>% 
  mutate(type_pr = cut(prestige, breaks = c(0,68.4,100),
                       label = c("L", "H"),ordered = T)) %>% 
  ggplot(data = ., aes(x = education, y = income,color=type_pr)) +
  geom_point() +  geom_line(aes(group = type_pr))

在type=prof的組別中，prestige比較高的組別income的變化比較大、education比較高。

dt8 %>% filter(type=="wc") %>% 
  mutate(type_wc = cut(prestige, breaks = c(0,41.5,100),
                       label = c("L", "H"),ordered = T)) %>% 
  ggplot(data = ., aes(x = education, y = income,color=type_wc)) +
  geom_point() +  geom_line(aes(group = type_wc))

在type=wc的組別中，prestige比較高的組別education也比較高，但income看起來無太大差異

9

read the data and see

dt9<-mlmRev::Hsb82 
dt9$school <- factor(dt9$school) 
knitr::kable(head(dt9))

school	minrty	sx	ses	mAch	meanses	sector	cses
1224	No	Female	-1.528	5.88	-0.434	Public	-1.094
1224	No	Female	-0.588	19.71	-0.434	Public	-0.154
1224	No	Male	-0.528	20.35	-0.434	Public	-0.094
1224	No	Male	-0.668	8.78	-0.434	Public	-0.234
1224	No	Male	-0.158	17.90	-0.434	Public	0.276
1224	No	Male	0.022	4.58	-0.434	Public	0.456

dt9 %>% group_by(school) %>% summarise(mathsum=mean(mAch))

## # A tibble: 160 x 2
##    school mathsum
##    <ord>    <dbl>
##  1 8367      4.55
##  2 8854      4.24
##  3 4458      5.81
##  4 5762      4.32
##  5 6990      5.98
##  6 5815      7.27
##  7 7172      8.07
##  8 4868     12.3 
##  9 7341      9.79
## 10 1358     11.2 
## # ... with 150 more rows

plot the confidence interval

dt9 %>%
  group_by(school) %>%
  summarize(math_m=mean(mAch, na.rm = T),
            math_se=sd(mAch, na.rm = T)/sqrt(n())) %>%
  ggplot(data = ., aes(x = school, y = math_m)) +
  geom_point() +
  geom_line(aes(group = school)) +
  geom_errorbar(aes(ymin = math_m - 2*math_se, ymax = math_m + 2*math_se), width = .1) + 
  labs(x = "school", y = "Average Math score") +
  theme_bw()

## geom_path: Each group consists of only one observation. Do you need
## to adjust the group aesthetic?

calculate the confidence interval

dt9 %>%
  group_by(school) %>%
  summarize(math_m=mean(mAch, na.rm = T),
            math_se=sd(mAch, na.rm = T)/sqrt(n())) %>%
  mutate(math_ci_upper=math_m - 2*math_se,
         math_ci_lower=math_m + 2*math_se)

## # A tibble: 160 x 5
##    school math_m math_se math_ci_upper math_ci_lower
##    <ord>   <dbl>   <dbl>         <dbl>         <dbl>
##  1 8367     4.55   1.18           2.19          6.92
##  2 8854     4.24   0.956          2.33          6.15
##  3 4458     5.81   0.646          4.52          7.10
##  4 5762     4.32   0.821          2.68          5.97
##  5 6990     5.98   0.729          4.52          7.44
##  6 5815     7.27   1.12           5.04          9.51
##  7 7172     8.07   0.846          6.38          9.76
##  8 4868    12.3    0.932         10.4          14.2 
##  9 7341     9.79   0.757          8.28         11.3 
## 10 1358    11.2    1.07           9.06         13.4 
## # ... with 150 more rows

Homework 0326

Yifang

2018-03-29

2

3

4

read in the data and see

5

讀進資料後合併

讀進資料後合併起來，看看有沒有重複，發現沒有

7

讀進資料後新增新的變項

畫圖

8

read in the data and see

Find the median prestige score for each of the three types of occupation, respectively.

The relationship between income and education for each category

9

read the data and see

plot the confidence interval

calculate the confidence interval