Data wrangling

Ex2

#讀取資料
dta<-Ecdat::Caschool
#查看前六筆資料
head(dta)

##   distcod  county                        district grspan enrltot teachers
## 1   75119 Alameda              Sunol Glen Unified  KK-08     195    10.90
## 2   61499   Butte            Manzanita Elementary  KK-08     240    11.15
## 3   61549   Butte     Thermalito Union Elementary  KK-08    1550    82.90
## 4   61457   Butte Golden Feather Union Elementary  KK-08     243    14.00
## 5   61523   Butte        Palermo Union Elementary  KK-08    1335    71.50
## 6   62042  Fresno         Burrel Union Elementary  KK-08     137     6.40
##   calwpct mealpct computer testscr   compstu  expnstu      str    avginc
## 1  0.5102  2.0408       67  690.80 0.3435898 6384.911 17.88991 22.690001
## 2 15.4167 47.9167      101  661.20 0.4208333 5099.381 21.52466  9.824000
## 3 55.0323 76.3226      169  643.60 0.1090323 5501.955 18.69723  8.978000
## 4 36.4754 77.0492       85  647.70 0.3497942 7101.831 17.35714  8.978000
## 5 33.1086 78.4270      171  640.85 0.1280899 5235.988 18.67133  9.080333
## 6 12.3188 86.9565       25  605.55 0.1824818 5580.147 21.40625 10.415000
##       elpct readscr mathscr
## 1  0.000000   691.6   690.0
## 2  4.583333   660.5   661.9
## 3 30.000002   636.3   650.9
## 4  0.000000   651.9   643.5
## 5 13.857677   641.8   639.9
## 6 12.408759   605.7   605.4

#查看資料格式
str(dta)

## 'data.frame':    420 obs. of  17 variables:
##  $ distcod : int  75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
##  $ county  : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
##  $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 214 367 132 270 53 152 383 263 94 ...
##  $ grspan  : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
##  $ enrltot : int  195 240 1550 243 1335 137 195 888 379 2247 ...
##  $ teachers: num  10.9 11.1 82.9 14 71.5 ...
##  $ calwpct : num  0.51 15.42 55.03 36.48 33.11 ...
##  $ mealpct : num  2.04 47.92 76.32 77.05 78.43 ...
##  $ computer: int  67 101 169 85 171 25 28 66 35 0 ...
##  $ testscr : num  691 661 644 648 641 ...
##  $ compstu : num  0.344 0.421 0.109 0.35 0.128 ...
##  $ expnstu : num  6385 5099 5502 7102 5236 ...
##  $ str     : num  17.9 21.5 18.7 17.4 18.7 ...
##  $ avginc  : num  22.69 9.82 8.98 8.98 9.08 ...
##  $ elpct   : num  0 4.58 30 0 13.86 ...
##  $ readscr : num  692 660 636 652 642 ...
##  $ mathscr : num  690 662 651 644 640 ...

#抽樣並畫散佈圖
set.seed(13579)
dta2_1<-dta%>%group_by(county)%>%sample_n(size = 1)
ggplot(dta2_1,aes(x=readscr,y=mathscr))+geom_point()

Ex3

## ----echo=FALSE,eval=TRUE------------------------------------------------
options(continue="  ")#設定換行符號

## ------------------------------------------------------------------------
options(digits=3)#設定小數點位數
options(width=72) # narrow output 
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv")#讀資料
library(dplyr)#載入Package

newds = select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, 
               f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)#選擇載入變項

## ------------------------------------------------------------------------
names(newds)#查看變項名稱

##  [1] "cesd"   "female" "i1"     "i2"     "id"     "treat"  "f1a"   
##  [8] "f1b"    "f1c"    "f1d"    "f1e"    "f1f"    "f1g"    "f1h"   
## [15] "f1i"    "f1j"    "f1k"    "f1l"    "f1m"    "f1n"    "f1o"   
## [22] "f1p"    "f1q"    "f1r"    "f1s"    "f1t"

str(newds[,1:10]) # structure of the first 10 variables

## 'data.frame':    453 obs. of  10 variables:
##  $ cesd  : int  49 30 39 15 39 6 52 32 50 46 ...
##  $ female: int  0 0 0 1 0 1 1 0 1 0 ...
##  $ i1    : int  13 56 0 5 10 4 13 12 71 20 ...
##  $ i2    : int  26 62 0 5 13 4 20 24 129 27 ...
##  $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ treat : int  1 1 0 0 0 1 0 1 0 1 ...
##  $ f1a   : int  3 3 3 0 3 1 3 1 3 2 ...
##  $ f1b   : int  2 2 2 0 0 0 1 1 2 3 ...
##  $ f1c   : int  3 0 3 1 3 1 3 2 3 3 ...
##  $ f1d   : int  0 3 0 3 3 3 1 3 1 0 ...

## ------------------------------------------------------------------------
summary(newds[,1:10]) # summary of the first 10 variables

##       cesd          female            i1              i2       
##  Min.   : 1.0   Min.   :0.000   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:25.0   1st Qu.:0.000   1st Qu.:  3.0   1st Qu.:  3.0  
##  Median :34.0   Median :0.000   Median : 13.0   Median : 15.0  
##  Mean   :32.8   Mean   :0.236   Mean   : 17.9   Mean   : 22.6  
##  3rd Qu.:41.0   3rd Qu.:0.000   3rd Qu.: 26.0   3rd Qu.: 32.0  
##  Max.   :60.0   Max.   :1.000   Max.   :142.0   Max.   :184.0  
##        id          treat            f1a            f1b      
##  Min.   :  1   Min.   :0.000   Min.   :0.00   Min.   :0.00  
##  1st Qu.:119   1st Qu.:0.000   1st Qu.:1.00   1st Qu.:0.00  
##  Median :233   Median :0.000   Median :2.00   Median :1.00  
##  Mean   :233   Mean   :0.497   Mean   :1.63   Mean   :1.39  
##  3rd Qu.:348   3rd Qu.:1.000   3rd Qu.:3.00   3rd Qu.:2.00  
##  Max.   :470   Max.   :1.000   Max.   :3.00   Max.   :3.00  
##       f1c            f1d      
##  Min.   :0.00   Min.   :0.00  
##  1st Qu.:1.00   1st Qu.:0.00  
##  Median :2.00   Median :1.00  
##  Mean   :1.92   Mean   :1.56  
##  3rd Qu.:3.00   3rd Qu.:3.00  
##  Max.   :3.00   Max.   :3.00

## ------------------------------------------------------------------------
head(newds, n=3)#檢視前三筆資料

##   cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1   49      0 13 26  1     1   3   2   3   0   2   3   3   0   2   3
## 2   30      0 56 62  2     1   3   2   0   3   3   2   0   0   3   0
## 3   39      0  0  0  3     0   3   2   3   0   2   2   1   3   2   3
##   f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1   3   0   1   2   2   2   2   3   3   2
## 2   3   0   0   3   0   0   0   2   0   0
## 3   1   0   1   3   2   0   0   3   2   0

## ------------------------------------------------------------------------
comment(newds) = "HELP baseline dataset" #添加文字註腳
comment(newds)#查看註腳

## [1] "HELP baseline dataset"

save(ds, file="savedfile")#存擋

## ------------------------------------------------------------------------
write.csv(ds, file="ds.csv")#把ds匯出成.csv格式

## ------------------------------------------------------------------------
library(foreign)#載入package
write.foreign(newds, "file.dat", "file.sas", package="SAS")#會出成SAS可讀的模式

## ------------------------------------------------------------------------
with(newds, cesd[1:10])#查看cesd前10項的數值

##  [1] 49 30 39 15 39  6 52 32 50 46

with(newds, head(cesd, 10))#查看cesd前10項的數值

##  [1] 49 30 39 15 39  6 52 32 50 46

## ------------------------------------------------------------------------
with(newds, cesd[cesd > 56])#查看cesd大於56的數值

## [1] 57 58 57 60 58 58 57

## ------------------------------------------------------------------------
library(dplyr)#載入package
filter(newds, cesd > 56) %>% select(id, cesd)#選出cesd大於56的id

##    id cesd
## 1  71   57
## 2 127   58
## 3 200   57
## 4 228   60
## 5 273   58
## 6 351   58
## 7  13   57

## ------------------------------------------------------------------------
with(newds, sort(cesd)[1:4])#將cesd從小排到大並查看前四筆

## [1] 1 3 3 4

with(newds, which.min(cesd))#查看cesd的最小值在第幾列

## [1] 199

## ------------------------------------------------------------------------
library(mosaic)#載入package
tally(~ is.na(f1g), data=newds)#查看flg的遺漏值並轉成tbl的樣式

## is.na(f1g)
##  TRUE FALSE 
##     1   452

favstats(~ f1g, data=newds)#描述統計

##  min Q1 median Q3 max mean  sd   n missing
##    0  1      2  3   3 1.73 1.1 452       1

## ------------------------------------------------------------------------
# reverse code f1d, f1h, f1l and f1p
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g, 
                              (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p), 
                              f1q, f1r, f1s, f1t))#選出反向提重新編碼
nmisscesd = apply(is.na(cesditems), 1, sum)#計算NA數量
ncesditems = cesditems#重新指定成另一筆資料
ncesditems[is.na(cesditems)] = 0#讓遺漏值變為0
newcesd = apply(ncesditems, 1, sum)#計算列的平均
imputemeancesd = 20/(20-nmisscesd)*newcesd#以平均值帶入遺漏值

## ------------------------------------------------------------------------
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]

##     newcesd newds.cesd nmisscesd imputemeancesd
## 4        15         15         1           15.8
## 17       19         19         1           20.0
## 87       44         44         1           46.3
## 101      17         17         1           17.9
## 154      29         29         1           30.5
## 177      44         44         1           46.3
## 229      39         39         1           41.1

## ----createdrink,message=FALSE-------------------------------------------
library(dplyr)#載入package
library(memisc)#載入package

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## 
## Attaching package: 'memisc'

## The following object is masked from 'package:Matrix':
## 
##     as.array

## The following objects are masked from 'package:dplyr':
## 
##     collect, recode, rename

## The following objects are masked from 'package:stats':
## 
##     contr.sum, contr.treatment, contrasts

## The following object is masked from 'package:base':
## 
##     as.array

newds = mutate(newds, drinkstat= 
                 cases(
                   "abstinent" = i1==0,
                   "moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
                     (i1>0 & i1<=2 & i2<=4 & female==0),
                   "highrisk" = ((i1>1 | i2>3) & female==1) |
                     ((i1>2 | i2>4) & female==0)))#重新設定類別標籤

## ----echo=FALSE----------------------------------------------------------
library(mosaic)#載入package

## ----echo=FALSE----------------------------------------------------------
detach(package:memisc)#解除package
detach(package:MASS)#解除package

## ------------------------------------------------------------------------
library(dplyr)#載入package
tmpds = select(newds, i1, i2, female, drinkstat)#挑選變項
tmpds[365:370,]#查看第365到370項

##     i1 i2 female drinkstat
## 365  6 24      0  highrisk
## 366  6  6      0  highrisk
## 367  0  0      0 abstinent
## 368  0  0      1 abstinent
## 369  8  8      0  highrisk
## 370 32 32      0  highrisk

## ------------------------------------------------------------------------
library(dplyr)#載入package
filter(tmpds, drinkstat=="moderate" & female==1)#選出drinkstat是moderate的女性

##   i1 i2 female drinkstat
## 1  1  1      1  moderate
## 2  1  3      1  moderate
## 3  1  2      1  moderate
## 4  1  1      1  moderate
## 5  1  1      1  moderate
## 6  1  1      1  moderate
## 7  1  1      1  moderate

## ----message=FALSE-------------------------------------------------------
library(gmodels)#載入package
with(tmpds, CrossTable(drinkstat))#查看次數表

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##           | abstinent |  moderate |  highrisk | 
##           |-----------|-----------|-----------|
##           |        68 |        28 |       357 | 
##           |     0.150 |     0.062 |     0.788 | 
##           |-----------|-----------|-----------|
## 
## 
## 
##

## ------------------------------------------------------------------------
with(tmpds, CrossTable(drinkstat, female, 
                       prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))##橫列是喝酒程度、直欄是性別

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##              | female 
##    drinkstat |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##    abstinent |        42 |        26 |        68 | 
##              |     0.618 |     0.382 |     0.150 | 
## -------------|-----------|-----------|-----------|
##     moderate |        21 |         7 |        28 | 
##              |     0.750 |     0.250 |     0.062 | 
## -------------|-----------|-----------|-----------|
##     highrisk |       283 |        74 |       357 | 
##              |     0.793 |     0.207 |     0.788 | 
## -------------|-----------|-----------|-----------|
## Column Total |       346 |       107 |       453 | 
## -------------|-----------|-----------|-----------|
## 
##

## ------------------------------------------------------------------------
newds = transform(newds, 
                  gender=factor(female, c(0,1), c("Male","Female")))#將性別變項中的0,1對應到male和female
tally(~ female + gender, margin=FALSE, data=newds)

##       gender
## female Male Female
##      0  346      0
##      1    0    107

## ------------------------------------------------------------------------
library(dplyr)#載入package
newds = arrange(ds, cesd, i1)#排序
newds[1:5, c("cesd", "i1", "id")]#查看1到5筆資料

##   cesd i1  id
## 1    1  3 233
## 2    3  1 139
## 3    3 13 418
## 4    4  4 251
## 5    4  9  95

## ------------------------------------------------------------------------
library(dplyr)#載入package
females = filter(ds, female==1)#選出ds中的女性
with(females, mean(cesd))#計算女性的平均cesd

## [1] 36.9

# an alternative approach
mean(ds$cesd[ds$female==1])#計算女性的平均cesd

## [1] 36.9

## ------------------------------------------------------------------------
with(ds, tapply(cesd, female, mean))#計算平均

##    0    1 
## 31.6 36.9

library(mosaic)#載入package
mean(cesd ~ female, data=ds)#以female來界定根據什麼的平均

##    0    1 
## 31.6 36.9

Ex4

#查看資料前六筆以及資料結構
dta4<-HSAUR3::backpain
head(dta4)

##   ID  status driver suburban
## 1  1    case    yes      yes
## 2  1 control    yes       no
## 3  2    case    yes      yes
## 4  2 control    yes      yes
## 5  3    case    yes       no
## 6  3 control    yes      yes

str(dta4)

## 'data.frame':    434 obs. of  4 variables:
##  $ ID      : Factor w/ 217 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
##  $ status  : Factor w/ 2 levels "case","control": 1 2 1 2 1 2 1 2 1 2 ...
##  $ driver  : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 1 1 2 2 ...
##  $ suburban: Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...

dta4<-dta4 %>% 
  group_by(status,driver,suburban) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  tidyr::spread(status,n) %>% 
  mutate(total=case+control); dta4

## # A tibble: 4 x 5
##   driver suburban  case control total
##   <fct>  <fct>    <int>   <int> <int>
## 1 no     no          26      47    73
## 2 no     yes          6       7    13
## 3 yes    no          64      63   127
## 4 yes    yes        121     100   221

Ex5

#載入package
library(datasets)

#合併資料
dta5<-merge(state.x77, USArrests,"row.names")

#計算相關
cor(dta5[,-1])

##            Population  Income Illiteracy Life Exp Murder.x HS Grad
## Population     1.0000  0.2082     0.1076  -0.0681   0.3436 -0.0985
## Income         0.2082  1.0000    -0.4371   0.3403  -0.2301  0.6199
## Illiteracy     0.1076 -0.4371     1.0000  -0.5885   0.7030 -0.6572
## Life Exp      -0.0681  0.3403    -0.5885   1.0000  -0.7808  0.5822
## Murder.x       0.3436 -0.2301     0.7030  -0.7808   1.0000 -0.4880
## HS Grad       -0.0985  0.6199    -0.6572   0.5822  -0.4880  1.0000
## Frost         -0.3322  0.2263    -0.6719   0.2621  -0.5389  0.3668
## Area           0.0225  0.3633     0.0773  -0.1073   0.2284  0.3335
## Murder.y       0.3202 -0.2152     0.7068  -0.7785   0.9337 -0.5216
## Assault        0.3170  0.0409     0.5110  -0.6267   0.7398 -0.2303
## UrbanPop       0.5126  0.4805    -0.0622   0.2715   0.0164  0.3587
## Rape           0.3052  0.3574     0.1546  -0.2696   0.5800  0.2707
##              Frost    Area Murder.y Assault UrbanPop   Rape
## Population -0.3322  0.0225   0.3202  0.3170   0.5126  0.305
## Income      0.2263  0.3633  -0.2152  0.0409   0.4805  0.357
## Illiteracy -0.6719  0.0773   0.7068  0.5110  -0.0622  0.155
## Life Exp    0.2621 -0.1073  -0.7785 -0.6267   0.2715 -0.270
## Murder.x   -0.5389  0.2284   0.9337  0.7398   0.0164  0.580
## HS Grad     0.3668  0.3335  -0.5216 -0.2303   0.3587  0.271
## Frost       1.0000  0.0592  -0.5414 -0.4682  -0.2462 -0.279
## Area        0.0592  1.0000   0.1481  0.2312  -0.0615  0.525
## Murder.y   -0.5414  0.1481   1.0000  0.8019   0.0696  0.564
## Assault    -0.4682  0.2312   0.8019  1.0000   0.2589  0.665
## UrbanPop   -0.2462 -0.0615   0.0696  0.2589   1.0000  0.411
## Rape       -0.2792  0.5250   0.5636  0.6652   0.4113  1.000

#圖表
GGally::ggpairs(dta5[,-1])

####從圖中看不太出結論

Ex6

#載入資料庫
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(tibble)
#整理資料
dta6<-merge(rownames_to_column(mammals),rownames_to_column(Animals), all = TRUE)

#查看是否有重複
duplicated(dta6)

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE

Ex7

#讀資料
dta7<-Ecdat::Caschool

#新增變項並查看新資料樣式
dta7<- dta7%>%
  mutate(ratio = enrltot/teachers, 
       Reading = cut(readscr, breaks = quantile(readscr, probs = c(0, .33, .67, 1)), 
       label = c("L", "M", "H"), ordered = T))
head(dta7)

##   distcod  county                        district grspan enrltot
## 1   75119 Alameda              Sunol Glen Unified  KK-08     195
## 2   61499   Butte            Manzanita Elementary  KK-08     240
## 3   61549   Butte     Thermalito Union Elementary  KK-08    1550
## 4   61457   Butte Golden Feather Union Elementary  KK-08     243
## 5   61523   Butte        Palermo Union Elementary  KK-08    1335
## 6   62042  Fresno         Burrel Union Elementary  KK-08     137
##   teachers calwpct mealpct computer testscr compstu expnstu  str avginc
## 1     10.9    0.51    2.04       67     691   0.344    6385 17.9  22.69
## 2     11.1   15.42   47.92      101     661   0.421    5099 21.5   9.82
## 3     82.9   55.03   76.32      169     644   0.109    5502 18.7   8.98
## 4     14.0   36.48   77.05       85     648   0.350    7102 17.4   8.98
## 5     71.5   33.11   78.43      171     641   0.128    5236 18.7   9.08
## 6      6.4   12.32   86.96       25     606   0.182    5580 21.4  10.41
##   elpct readscr mathscr ratio Reading
## 1  0.00     692     690  17.9       H
## 2  4.58     660     662  21.5       M
## 3 30.00     636     651  18.7       L
## 4  0.00     652     644  17.4       M
## 5 13.86     642     640  18.7       L
## 6 12.41     606     605  21.4       L

#繪圖
library(lattice)
xyplot(readscr~ratio|Reading,data=dta7,
                type=c("p","g","r"),layout=c(3,1),
                xlab="Student-Teacher Ratio",
                ylab="Reading Score")

Ex8

#讀資料
dta8<-car::Prestige
head(dta8)

##                     education income women prestige census type
## gov.administrators       13.1  12351 11.16     68.8   1113 prof
## general.managers         12.3  25879  4.02     69.1   1130 prof
## accountants              12.8   9271 15.70     63.4   1171 prof
## purchasing.officers      11.4   8865  9.11     56.8   1175 prof
## chemists                 14.6   8403 11.68     73.5   2111 prof
## physicists               15.6  11030  5.13     77.6   2113 prof

str(dta8)

## 'data.frame':    102 obs. of  6 variables:
##  $ education: num  13.1 12.3 12.8 11.4 14.6 ...
##  $ income   : int  12351 25879 9271 8865 8403 11030 8258 14163 11377 11023 ...
##  $ women    : num  11.16 4.02 15.7 9.11 11.68 ...
##  $ prestige : num  68.8 69.1 63.4 56.8 73.5 77.6 72.6 78.1 73.1 68.8 ...
##  $ census   : int  1113 1130 1171 1175 2111 2113 2133 2141 2143 2153 ...
##  $ type     : Factor w/ 3 levels "bc","prof","wc": 2 2 2 2 2 2 2 2 2 2 ...

8_1

#Find the median prestige score for each of the three types of occupation, respectively.
dta8_1<-dta8%>% 
  group_by(type) %>% 
  summarize(m=median(prestige,na.rm=TRUE));dta8_1

## # A tibble: 4 x 2
##   type      m
##   <fct> <dbl>
## 1 bc     35.9
## 2 prof   68.4
## 3 wc     41.5
## 4 <NA>   35.0

8_2

#整理資料
dta8_2<-dta8 %>% 
  na.omit%>% 
  group_by(type) %>% 
  mutate(m=median(prestige),
  PrestigeLV=memisc::cases("H"=prestige>=m,"Low"=prestige < m))

#繪圖
xyplot(income~education|type, 
       group=PrestigeLV, 
       data=dta8_2, 
       type=c("p","g","r"),
       layout=c(3, 1),
       auto.key=list(space="right", row=2),
       xlab="Average education of occupational incumbents in 1971",
       ylab="Average income of incumbents in 1971")

####三種職業的聲望對於教育和收入之間的效果不一致

Ex9

#讀資料並查看資料格式
dta9<-mlmRev::Hsb82
dta9$school<-factor(dta9$school)
head(dta9)

##   school minrty     sx    ses  mAch meanses sector    cses
## 1   1224     No Female -1.528  5.88  -0.434 Public -1.0936
## 2   1224     No Female -0.588 19.71  -0.434 Public -0.1536
## 3   1224     No   Male -0.528 20.35  -0.434 Public -0.0936
## 4   1224     No   Male -0.668  8.78  -0.434 Public -0.2336
## 5   1224     No   Male -0.158 17.90  -0.434 Public  0.2764
## 6   1224     No   Male  0.022  4.58  -0.434 Public  0.4564

str(dta9)

## 'data.frame':    7185 obs. of  8 variables:
##  $ school : Ord.factor w/ 160 levels "8367"<"8854"<..: 59 59 59 59 59 59 59 59 59 59 ...
##  $ minrty : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sx     : Factor w/ 2 levels "Male","Female": 2 2 1 1 1 1 2 1 2 1 ...
##  $ ses    : num  -1.528 -0.588 -0.528 -0.668 -0.158 ...
##  $ mAch   : num  5.88 19.71 20.35 8.78 17.9 ...
##  $ meanses: num  -0.434 -0.434 -0.434 -0.434 -0.434 ...
##  $ sector : Factor w/ 2 levels "Public","Catholic": 1 1 1 1 1 1 1 1 1 1 ...
##  $ cses   : num  -1.0936 -0.1536 -0.0936 -0.2336 0.2764 ...

#計算95CI
dta9_1 <- dta9 %>%
  group_by(sector, school) %>%
  summarise(m = mean(mAch, na.rm = TRUE),
            s = sd(mAch, na.rm = TRUE),
            n = n(),
            se = s/sqrt(n)) %>%
  mutate(lower.ci = m-2*se, upper.ci = m+2*se)
head(dta9_1)

## # A tibble: 6 x 8
## # Groups:   sector [1]
##   sector school     m     s     n    se lower.ci upper.ci
##   <fct>  <ord>  <dbl> <dbl> <int> <dbl>    <dbl>    <dbl>
## 1 Public 8367    4.55  4.43    14 1.18      2.19     6.92
## 2 Public 8854    4.24  5.41    32 0.956     2.33     6.15
## 3 Public 4458    5.81  4.48    48 0.646     4.52     7.10
## 4 Public 5762    4.32  4.99    37 0.821     2.68     5.97
## 5 Public 6990    5.98  5.31    53 0.729     4.52     7.44
## 6 Public 5815    7.27  5.59    25 1.12      5.04     9.51

#繪圖 
ggplot(dta9_1, aes(x = school, y = m)) +
  geom_point() +
  geom_errorbar(aes(ymax = upper.ci, ymin = lower.ci))+
  coord_flip()+
  facet_wrap(~sector)+
  labs(x = "School",y = "Math Mean Score by School")

Data_wrangling_Homework

Li-Ping Yen

2018.3.26

Data wrangling

Ex2

Ex3

Ex4

Ex5

Ex6

Ex7

Ex8

8_1

8_2

Ex9

The End