Q2 Caschool

dta <- Caschool
head(dta, 3)

  distcod  county                    district grspan enrltot teachers
1   75119 Alameda          Sunol Glen Unified  KK-08     195    10.90
2   61499   Butte        Manzanita Elementary  KK-08     240    11.15
3   61549   Butte Thermalito Union Elementary  KK-08    1550    82.90
  calwpct mealpct computer testscr compstu expnstu   str avginc  elpct
1  0.5102   2.041       67   690.8  0.3436    6385 17.89 22.690  0.000
2 15.4167  47.917      101   661.2  0.4208    5099 21.52  9.824  4.583
3 55.0323  76.323      169   643.6  0.1090    5502 18.70  8.978 30.000
  readscr mathscr
1   691.6   690.0
2   660.5   661.9
3   636.3   650.9

#sampling
set.seed(33333)
new_dta <- dta %>% group_by(county) %>% 
            sample_n(1)

#plot
plot(new_dta$mathscr,new_dta$readscr,
     xlab = "Average math score", ylab = "Average reading score")

Q3 Commenting

###
options(continue="  ") 

###基本設定
options(digits=3) #設小數點位數 
options(width=72) # narrow output 
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv") #讀入資料
library(dplyr) #讀入package
newds = select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, 
               f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t) #選取需要的變項

###檢視資料
names(newds) #列出變項名稱
str(newds[,1:10]) #檢視前十筆變項的結構
summary(newds[,1:10]) # 檢視前十筆變項的摘要
head(newds, n=3) #檢視前三筆資料

##寫註解、儲存
comment(newds) = "HELP baseline dataset" #在object下註解
comment(newds) #檢視註解
save(ds, file="savedfile") #儲存

### 輸出檔案
write.csv(ds, file="ds.csv") #輸出csv檔

## 輸出檔案
library(foreign) #讀入package
write.foreign(newds, "file.dat", "file.sas", package="SAS") #分別輸出data、code的檔，使用package : SAS

### 檢視資料
with(newds, cesd[1:10]) #在newds這個object中，看變項csed前十列
with(newds, head(cesd, 10)) #在newds這個object中，看變項csed前十列
with(newds, cesd[cesd > 56]) #在newds這個object中，看變項csed數值超過56的數值

### 篩選資料
library(dplyr) #讀入package
filter(newds, cesd > 56) %>% select(id, cesd) #選出csed大於56的列，並指顯示id、cesd兩個變項

### 檢視
with(newds, sort(cesd)[1:4]) #將csed由小排到大後，看前四筆資料的數值
with(newds, which.min(cesd)) #看csed最小的數值位於第幾列

### 檢視資料  
library(mosaic) #讀入資料
tally(~ is.na(f1g), data=newds) #判別變項flg裡的na，用table列出
favstats(~ f1g, data=newds) #對變項flg做摘要，內容含min,Q1,median,Q3,max,mean,sd,n,missing

### 整理資料
# 反向題重新編碼reverse code f1d, f1h, f1l and f1p
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g, 
                              (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p), 
                              f1q, f1r, f1s, f1t))
nmisscesd = apply(is.na(cesditems), 1, sum) #計算na個數
ncesditems = cesditems #建一個object
ncesditems[is.na(cesditems)] = 0 #把為na者用0代替
newcesd = apply(ncesditems, 1, sum) #計算na個數
imputemeancesd = 20/(20-nmisscesd)*newcesd #用平均值取代na

### 建立資料框架
#建立資料框架，列出nmisscesd大於0的列
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,] 



### 改變資料
library(dplyr) #讀入package
library(memisc) #讀入package
newds = mutate(newds, drinkstat= 
                 cases(
                   "abstinent" = i1==0,
                   "moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
                     (i1>0 & i1<=2 & i2<=4 & female==0),
                   "highrisk" = ((i1>1 | i2>3) & female==1) |
                     ((i1>2 | i2>4) & female==0))) #把變項改成類別資料     

### 整理package
library(mosaic) #讀入package
detach(package:memisc) #卸載package
detach(package:MASS) #卸載package

### 整理資料
library(dplyr) #讀入package
tmpds = select(newds, i1, i2, female, drinkstat) #選擇變項
tmpds[365:370,] #列出第365到第370列
library(dplyr) #讀入package
filter(tmpds, drinkstat=="moderate" & female==1) #篩選擁有指定變項的值的列

### 製作表
library(gmodels) #讀入package
with(tmpds, CrossTable(drinkstat)) #次數與機率表
#列為飲酒、行為性別，不顯示行、列的比率，不顯示chi-square
with(tmpds, CrossTable(drinkstat, female, 
                       prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE)) 

### 整理、處理資料
newds = transform(newds, 
                  gender=factor(female, c(0,1), c("Male","Female"))) #依據female設一個新的變項
tally(~ female + gender, margin=FALSE, data=newds) #製成table

library(dplyr) #讀入package
newds = arrange(ds, cesd, i1) #依序排列的資料框
newds[1:5, c("cesd", "i1", "id")] #列出前五筆，只顯示上述變項

library(dplyr) #讀入package
females = filter(ds, female==1) #列出變項female為1的列
with(females, mean(cesd)) #算某變項的平均數

mean(ds$cesd[ds$female==1]) #an alternative approach

with(ds, tapply(cesd, female, mean)) #算性別csed的平均
library(mosaic) #讀入package
mean(cesd ~ female, data=ds) #算性別csed的平均

Q4 Backpain

library(HSAUR3)
dta4 <- backpain
head(dta4)

  ID  status driver suburban
1  1    case    yes      yes
2  1 control    yes       no
3  2    case    yes      yes
4  2 control    yes      yes
5  3    case    yes       no
6  3 control    yes      yes

with(dta4, table(driver, suburban,status))

, , status = case

      suburban
driver  no yes
   no   26   6
   yes  64 121

, , status = control

      suburban
driver  no yes
   no   47   7
   yes  63 100

#table
dta4 %>% 
  group_by(driver, suburban,status) %>% 
  tally() %>%
  spread(status, n) %>%
  mutate(total = case + control)

# A tibble: 4 x 5
# Groups:   driver, suburban [4]
  driver suburban  case control total
  <fct>  <fct>    <int>   <int> <int>
1 no     no          26      47    73
2 no     yes          6       7    13
3 yes    no          64      63   127
4 yes    yes        121     100   221

Apply table() using dplyr : source

Q5 USArrests

library(datasets)
dta5 <- merge(state.x77, USArrests, by = "row.names")
P <- cor(dta5[,-1]); P

           Population   Income Illiteracy Life Exp Murder.x  HS Grad
Population    1.00000  0.20823    0.10762 -0.06805  0.34364 -0.09849
Income        0.20823  1.00000   -0.43708  0.34026 -0.23008  0.61993
Illiteracy    0.10762 -0.43708    1.00000 -0.58848  0.70298 -0.65719
Life Exp     -0.06805  0.34026   -0.58848  1.00000 -0.78085  0.58222
Murder.x      0.34364 -0.23008    0.70298 -0.78085  1.00000 -0.48797
HS Grad      -0.09849  0.61993   -0.65719  0.58222 -0.48797  1.00000
Frost        -0.33215  0.22628   -0.67195  0.26207 -0.53888  0.36678
Area          0.02254  0.36332    0.07726 -0.10733  0.22839  0.33354
Murder.y      0.32024 -0.21521    0.70678 -0.77850  0.93369 -0.52159
Assault       0.31702  0.04093    0.51101 -0.62666  0.73976 -0.23031
UrbanPop      0.51260  0.48053   -0.06220  0.27147  0.01638  0.35868
Rape          0.30523  0.35739    0.15460 -0.26957  0.57996  0.27073
              Frost     Area Murder.y  Assault UrbanPop    Rape
Population -0.33215  0.02254  0.32024  0.31702  0.51260  0.3052
Income      0.22628  0.36332 -0.21521  0.04093  0.48053  0.3574
Illiteracy -0.67195  0.07726  0.70678  0.51101 -0.06220  0.1546
Life Exp    0.26207 -0.10733 -0.77850 -0.62666  0.27147 -0.2696
Murder.x   -0.53888  0.22839  0.93369  0.73976  0.01638  0.5800
HS Grad     0.36678  0.33354 -0.52159 -0.23031  0.35868  0.2707
Frost       1.00000  0.05923 -0.54140 -0.46824 -0.24619 -0.2792
Area        0.05923  1.00000  0.14809  0.23121 -0.06155  0.5250
Murder.y   -0.54140  0.14809  1.00000  0.80187  0.06957  0.5636
Assault    -0.46824  0.23121  0.80187  1.00000  0.25887  0.6652
UrbanPop   -0.24619 -0.06155  0.06957  0.25887  1.00000  0.4113
Rape       -0.27921  0.52496  0.56358  0.66524  0.41134  1.0000

corrplot(P, method = "circle")

兩邊資料的謀殺個數有高相關(分別為1976與1973的數據)

Q6 Animals & Mammals

library(MASS)
dta6 <- rbind(rownames_to_column(Animals,"type"), rownames_to_column(mammals,"type"))
dta6 <- dta6[!duplicated(dta6), ]

duplicated(dta6)

 [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[67] FALSE

dim(dta6)

[1] 67  3

Q7 Caschool,ratio

library(Ecdat)
dta7 <- Caschool
dta7$ratio <- dta7$enrltot / dta7$teachers

dta7 <- dta7 %>% filter(grspan == "KK-08") %>%
          mutate(level_read = cut(readscr, breaks = quantile(readscr, probs = c(0, .33, .67, 1)), label = c("L", "M", "H"), ordered = T))

# plot
library(lattice)
xyplot(readscr ~ ratio | level_read, data = dta7,
       type = c("p", "g", "r"), layout = c(3, 1),
       xlab = "Student-Teacher Ratio",
       ylab = "Reading Score")

Q8 Prestige

(1)median score

library(car)


Attaching package: 'car'

The following object is masked from 'package:memisc':

    recode

The following object is masked from 'package:purrr':

    some

The following object is masked from 'package:Ecdat':

    Mroz

The following object is masked from 'package:dplyr':

    recode

dta8 <- Prestige
dta8 <- dta8 %>%  na.omit() %>%
          group_by(type) %>% 
          summarize(m = median(prestige, na.rm = TRUE)); dta8

# A tibble: 3 x 2
  type      m
  <fct> <dbl>
1 bc     35.9
2 prof   68.4
3 wc     41.5

(2)income & edu

library(car)
dta8_n <- Prestige
dta8_n <- dta8_n %>% na.omit() %>%
            group_by(type) %>%
            mutate(M = median(prestige), level = cases("High" = prestige > M,
                                                       "Low" = prestige <= M))


# plot
ggplot(dta8_n, aes(x = education, y = income, group = level)) +
  geom_point(aes(color = level)) +
  stat_smooth(method = "lm", se = F, aes(color = level)) +
  facet_wrap(~ factor(type)) +
  labs(x= "Average education of each occupation", y = "Average income of occupation") +
  theme_bw()

Q9 Hsb82, math score

library(mlmRev)
dta9 <- Hsb82  
head(dta9, 3)

  school minrty     sx    ses   mAch meanses sector     cses
1   1224     No Female -1.528  5.876 -0.4344 Public -1.09362
2   1224     No Female -0.588 19.708 -0.4344 Public -0.15362
3   1224     No   Male -0.528 20.349 -0.4344 Public -0.09362

dta9 <- dta9 %>%  group_by(school, sector) %>%
          summarize(math_m = mean(mAch, na.rm = T),
                    math_se = sd(mAch, na.rm = T)/sqrt(n()))

# plot
ggplot(data = dta9, aes(x = school, y = math_m)) +
  geom_point() +
  facet_wrap( ~ sector) +
  geom_errorbar(aes(ymin = math_m - 2*math_se, ymax = math_m + 2*math_se), width = .1) + 
  labs(x = "school", y = "Average Math Achievement score") +
  coord_flip() +
  theme_bw()

w5 Data Management Q2 - Q9

Pei Jun

2018-03-30

Q2 Caschool

Q3 Commenting

Q4 Backpain

Q5 USArrests

Q6 Animals & Mammals

Q7 Caschool,ratio

Q8 Prestige

(1)median score

(2)income & edu

Q9 Hsb82, math score