HW2

pacman::p_load(Ecdat,mosaic,memisc,mlmRev,HSAUR3,knitr,kableExtra,readr,dplyr,ggplot2,tidyr,car,magrittr,tibble,data.table,stringr)
# read dataset
dta <- Ecdat::Caschool
head(dta)
##   distcod  county                        district grspan enrltot teachers
## 1   75119 Alameda              Sunol Glen Unified  KK-08     195    10.90
## 2   61499   Butte            Manzanita Elementary  KK-08     240    11.15
## 3   61549   Butte     Thermalito Union Elementary  KK-08    1550    82.90
## 4   61457   Butte Golden Feather Union Elementary  KK-08     243    14.00
## 5   61523   Butte        Palermo Union Elementary  KK-08    1335    71.50
## 6   62042  Fresno         Burrel Union Elementary  KK-08     137     6.40
##   calwpct mealpct computer testscr   compstu  expnstu      str    avginc
## 1  0.5102  2.0408       67  690.80 0.3435898 6384.911 17.88991 22.690001
## 2 15.4167 47.9167      101  661.20 0.4208333 5099.381 21.52466  9.824000
## 3 55.0323 76.3226      169  643.60 0.1090323 5501.955 18.69723  8.978000
## 4 36.4754 77.0492       85  647.70 0.3497942 7101.831 17.35714  8.978000
## 5 33.1086 78.4270      171  640.85 0.1280899 5235.988 18.67133  9.080333
## 6 12.3188 86.9565       25  605.55 0.1824818 5580.147 21.40625 10.415000
##       elpct readscr mathscr
## 1  0.000000   691.6   690.0
## 2  4.583333   660.5   661.9
## 3 30.000002   636.3   650.9
## 4  0.000000   651.9   643.5
## 5 13.857677   641.8   639.9
## 6 12.408759   605.7   605.4
#Select random
set.seed(1070326)
dta2 <- group_by(dta,county) %>% sample_n(1)
#draw a scatter diagram
plot(dta2$mathscr,dta2$readscr,xlab = "Average math score", ylab = "Average reading score")

HW3

基本設置與載入資料

options(continue=" “)

設置小數點顯示位數

options(digits=3)

narrow output

options(width=72)

從網路上讀到ds

ds = read.csv(“http://www.amherst.edu/~nhorton/r2/datasets/help.csv”) library(dplyr)

選擇變項

newds = select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)

顯示變項名稱

names(newds)
str(newds[,1:10])

檢視前10筆資料摘要

summary(newds[,1:10])

檢視前三筆資料

head(newds, n=3)

添加註腳

comment(newds) = “HELP baseline dataset” comment(newds)

存檔

save(ds, file=“savedfile”)

輸出資料為csv格式

write.csv(ds, file=“ds.csv”)

library(foreign) ### 寫成SAS可讀格式 write.foreign(newds, “file.dat”, “file.sas”, package=“SAS”)

檢視資料

with(newds, cesd[1:10]) with(newds, head(cesd, 10)) with(newds, cesd[cesd > 56])

整理資料

library(dplyr) ###先選擇newds中cesd大於56的觀察值,再選擇其id和cesd兩欄資料檢視。 filter(newds, cesd > 56) %>% select(id, cesd) #

檢視vector

with(newds, sort(cesd)[1:4])
with(newds, which.min(cesd))

載入mosaic

library(mosaic) ###檢查f1g變項的NA值,輸出成tbl tally(~ is.na(f1g), data=newds)
###對f1g變項做一些敘述統計,輸出成tbl favstats(~ f1g, data=newds)

將反向題重新編碼

cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g, (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p), f1q, f1r, f1s, f1t)) ### 計算NA數量 nmisscesd = apply(is.na(cesditems), 1, sum) ncesditems = cesditems ncesditems[is.na(cesditems)] = 0

計算扣除NA後的cesd

newcesd = apply(ncesditems, 1, sum)

使用平均值來取代NA

imputemeancesd = 20/(20-nmisscesd)*newcesd

列出剛才將NA改成其他數值的表格

data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]

將連續資料定義成類別資料

library(dplyr) # mutate來自dplyr library(memisc) # cases來自memisc newds = mutate(newds, drinkstat= cases(“abstinent” = i1==0, “moderate” = (i1>0 & i1<=1 & i2<=3 & female==1) | (i1>0 & i1<=2 & i2<=4 & female==0), “highrisk” = ((i1>1 | i2>3) & female==1) | ((i1>2 | i2>4) & female==0)))

讀取mosaic, 解除memisc與MASS的物件

library(mosaic); detach(package:memisc); detach(package:MASS)

library(dplyr) ###選擇i1, i2, female, drinkstat四個變項 tmpds = select(newds, i1, i2, female, drinkstat)

檢視第365到370筆資料

tmpds[365:370,]

檢視tmpds內中度飲酒的女性

filter(tmpds, drinkstat==“moderate” & female==1)

製作交叉表

library(gmodels) # CrossTable出處 with(tmpds, CrossTable(drinkstat)) with(tmpds, CrossTable(drinkstat, female, prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))

新建一個性別(“Male”,“Female”)變項

newds = transform(newds, gender=factor(female, c(0,1), c(“Male”,“Female”))) tally(~ female + gender, margin=FALSE, data=newds)

library(dplyr) ### 依序使用cesd和id排序ds資料框 newds = arrange(ds, cesd, i1) newds[1:5, c(“cesd”, “i1”, “id”)] # 選出ds中的女性並計算其平均 females = filter(ds, female==1) with(females, mean(cesd)) mean(ds\(cesd[ds\)female==1])

計算男女cesd的平均值

with(ds, tapply(cesd, female, mean))
library(mosaic) mean(cesd ~ female, data=ds)

HW4

#read dataset
dta4<-HSAUR3::backpain
head(dta4)
##   ID  status driver suburban
## 1  1    case    yes      yes
## 2  1 control    yes       no
## 3  2    case    yes      yes
## 4  2 control    yes      yes
## 5  3    case    yes       no
## 6  3 control    yes      yes
with(dta4, table(driver, suburban,status)) 
## , , status = case
## 
##       suburban
## driver  no yes
##    no   26   6
##    yes  64 121
## 
## , , status = control
## 
##       suburban
## driver  no yes
##    no   47   7
##    yes  63 100
group_by(dta4,driver, suburban,status) %>% 
tally() %>%  spread(status, n) %>%  mutate(total = case + control)
## # A tibble: 4 x 5
## # Groups:   driver, suburban [4]
##   driver suburban  case control total
##   <fct>  <fct>    <int>   <int> <int>
## 1 no     no          26      47    73
## 2 no     yes          6       7    13
## 3 yes    no          64      63   127
## 4 yes    yes        121     100   221

HW5

library(datasets)
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 3.4.4
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
dta5_1<-state.x77
dta5_2<-USArrests
# merge
dta5<-merge(dta5_1,dta5_2)
# plot
ggpairs(dta5[, -1])

##Highly correlated(0.7~0.99):Life_exp & Illiteracy

HW6

library(MASS)
dta6<-merge(Animals,mammals,"row.names")
head(dta6)
##          Row.names  body.x brain.x  body.y brain.y
## 1 African elephant 6654.00  5712.0 6654.00  5712.0
## 2   Asian elephant 2547.00  4603.0 2547.00  4603.0
## 3              Cat    3.30    25.6    3.30    25.6
## 4       Chimpanzee   52.16   440.0   52.16   440.0
## 5              Cow  465.00   423.0  465.00   423.0
## 6           Donkey  187.10   419.0  187.10   419.0
duplicated(dta6)
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE
dim(dta6)
## [1] 23  5

HW7

# read dataset
dta7 <- Ecdat::Caschool
dta7$ratio <- dta7$enrltot / dta7$teachers
dta7 <- dta7 %>% filter(grspan == "KK-08") %>%
          mutate(level_read = cut(readscr, breaks = quantile(readscr, probs = c(0, .33, .67, 1)), label = c("L", "M", "H"), ordered = T))
# plot
library(lattice)
xyplot(readscr ~ ratio | level_read, data = dta7,
       type = c("p", "g", "r"), layout = c(3, 1),
       xlab = "Student-Teacher Ratio",
       ylab = "Reading Score")

HW8

#8-1:Find the median prestige score...
dta8_1 <- car::Prestige
group_by(na.omit(dta8_1),type) %>% summarise(median(prestige))
## # A tibble: 3 x 2
##   type  `median(prestige)`
##   <fct>              <dbl>
## 1 bc                  35.9
## 2 prof                68.4
## 3 wc                  41.5
#8-2:define two levels of prestige...
dta8_2 <- car::Prestige %>% 
  na.omit %>% 
  group_by(type) %>% 
  mutate(m = median(prestige),
         PrestigeLevel = memisc::cases("High" = prestige >= m, "Low" = prestige < m))
# plot
xyplot(income ~ education |type, 
       group = PrestigeLevel, 
       data = dta8_2, 
       type = c("p", "g", "r"),
       layout = c(3, 1),
       auto.key=list(space="right", row=2),
       xlab = "Average education of occupational incumbents",
       ylab = "Average income of incumbents")

HW9

library(ggplot2)
dta_9 <- mlmRev::Hsb82 %>%  group_by(sector, school) %>%  summarise(m = mean(mAch, na.rm = TRUE),s = sd(mAch, na.rm = TRUE),n = n(),se = s/sqrt(n)) %>% mutate(lower.ci = m-2*se, upper.ci = m+2*se)
# plot
ggplot(dta_9, aes(x = school, y = m)) +  geom_point() +  geom_errorbar(aes(ymax = upper.ci, ymin = lower.ci))+  coord_flip()+  facet_wrap(~sector)+  labs(x = "School",y = "Math Mean Score by School")

END