setwd("~/")
options(digits = 4, show.signif.stars = FALSE)
pacman::p_load(tidyverse, ggplot2, car, HSAUR3, Ecdat, GGally, lattice, mlmRev, memisc)
dta<- Caschool %>%
group_by(county) %>%
sample_n(1)
with(dta, plot(mathscr ~ readscr,
xlab = "average math score",
ylab = "average reading score"))
## 基本設置與載入資料
options(continue=" ") # 設置換行符號
options(digits=3) # 設置小數點顯示位數
options(width=72) # narrow output ## 設置輸出資訊的寬度
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv") # 從網址讀取資料並存為物件
library(dplyr) # 載入整理資料的package
newds = select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e,
f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)
# 選擇變項
## 檢視資料框
names(newds) # 檢查變項名稱
## [1] "cesd" "female" "i1" "i2" "id" "treat" "f1a"
## [8] "f1b" "f1c" "f1d" "f1e" "f1f" "f1g" "f1h"
## [15] "f1i" "f1j" "f1k" "f1l" "f1m" "f1n" "f1o"
## [22] "f1p" "f1q" "f1r" "f1s" "f1t"
str(newds[,1:10]) # structure of the first 10 variables # 檢視前10筆資料結構
## 'data.frame': 453 obs. of 10 variables:
## $ cesd : int 49 30 39 15 39 6 52 32 50 46 ...
## $ female: int 0 0 0 1 0 1 1 0 1 0 ...
## $ i1 : int 13 56 0 5 10 4 13 12 71 20 ...
## $ i2 : int 26 62 0 5 13 4 20 24 129 27 ...
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ treat : int 1 1 0 0 0 1 0 1 0 1 ...
## $ f1a : int 3 3 3 0 3 1 3 1 3 2 ...
## $ f1b : int 2 2 2 0 0 0 1 1 2 3 ...
## $ f1c : int 3 0 3 1 3 1 3 2 3 3 ...
## $ f1d : int 0 3 0 3 3 3 1 3 1 0 ...
summary(newds[,1:10]) # summary of the first 10 variables # 檢視前10筆資料摘要
## cesd female i1 i2
## Min. : 1.0 Min. :0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:25.0 1st Qu.:0.000 1st Qu.: 3.0 1st Qu.: 3.0
## Median :34.0 Median :0.000 Median : 13.0 Median : 15.0
## Mean :32.8 Mean :0.236 Mean : 17.9 Mean : 22.6
## 3rd Qu.:41.0 3rd Qu.:0.000 3rd Qu.: 26.0 3rd Qu.: 32.0
## Max. :60.0 Max. :1.000 Max. :142.0 Max. :184.0
## id treat f1a f1b
## Min. : 1 Min. :0.000 Min. :0.00 Min. :0.00
## 1st Qu.:119 1st Qu.:0.000 1st Qu.:1.00 1st Qu.:0.00
## Median :233 Median :0.000 Median :2.00 Median :1.00
## Mean :233 Mean :0.497 Mean :1.63 Mean :1.39
## 3rd Qu.:348 3rd Qu.:1.000 3rd Qu.:3.00 3rd Qu.:2.00
## Max. :470 Max. :1.000 Max. :3.00 Max. :3.00
## f1c f1d
## Min. :0.00 Min. :0.00
## 1st Qu.:1.00 1st Qu.:0.00
## Median :2.00 Median :1.00
## Mean :1.92 Mean :1.56
## 3rd Qu.:3.00 3rd Qu.:3.00
## Max. :3.00 Max. :3.00
head(newds, n=3) # 檢視前三筆資料
## cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1 49 0 13 26 1 1 3 2 3 0 2 3 3 0 2 3
## 2 30 0 56 62 2 1 3 2 0 3 3 2 0 0 3 0
## 3 39 0 0 0 3 0 3 2 3 0 2 2 1 3 2 3
## f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1 3 0 1 2 2 2 2 3 3 2
## 2 3 0 0 3 0 0 0 2 0 0
## 3 1 0 1 3 2 0 0 3 2 0
## 添加註腳
comment(newds) = "HELP baseline dataset" # 在物件newds上添加文字註腳
comment(newds) # 看看物件newds的文字註腳
## [1] "HELP baseline dataset"
save(ds, file="savedfile") # 存檔
## 輸出資料
write.csv(ds, file="ds.csv") # 把ds資料框寫出成為csv格式
## 輸出資料供不同軟體使用
library(foreign) # 載入foreign以便寫出成其他統計軟體使用的格式
write.foreign(newds, "file.dat", "file.sas", package="SAS") # 寫成SAS可讀格式
## 檢視資料框
with(newds, cesd[1:10]) # newds物件中,變項cesd的1到10筆資料
## [1] 49 30 39 15 39 6 52 32 50 46
with(newds, head(cesd, 10)) # newds物件中,變項cesd的前10筆資料
## [1] 49 30 39 15 39 6 52 32 50 46
with(newds, cesd[cesd > 56]) # newds物件中,變項cesd中數值大於56的所有資料
## [1] 57 58 57 60 58 58 57
## 整理資料
library(dplyr) # filter來自dplyr
filter(newds, cesd > 56) %>% select(id, cesd) # 先選擇newds中cesd大於56的觀察值,再選擇其id和cesd兩欄資料檢視。結果只有7筆。
## id cesd
## 1 71 57
## 2 127 58
## 3 200 57
## 4 228 60
## 5 273 58
## 6 351 58
## 7 13 57
## 檢視vector
with(newds, sort(cesd)[1:4]) # 將cesd由小到大排序後,第1到4筆的數值
## [1] 1 3 3 4
with(newds, which.min(cesd)) # 查詢cesd中最小值在第幾列
## [1] 199
## 檢查資料
library(mosaic) # 載入mosaic
## Warning: package 'mosaic' was built under R version 3.4.4
## Loading required package: ggformula
##
## New to ggformula? Try the tutorials:
## learnr::run_tutorial("introduction", package = "ggformula")
## learnr::run_tutorial("refining", package = "ggformula")
## Loading required package: mosaicData
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:lme4':
##
## factorize
## The following object is masked from 'package:Matrix':
##
## mean
## The following objects are masked from 'package:car':
##
## deltaMethod, logit
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:purrr':
##
## cross
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median,
## prop.test, quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
tally(~ is.na(f1g), data=newds) # 檢查f1g變項的NA值,輸出成tbl
## is.na(f1g)
## TRUE FALSE
## 1 452
favstats(~ f1g, data=newds) # 對f1g變項做一些敘述統計,輸出成tbl
## min Q1 median Q3 max mean sd n missing
## 0 1 2 3 3 1.73 1.1 452 1
## 重新編碼、計算NA次數
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g,
(3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p),
f1q, f1r, f1s, f1t)) # 將反向題重新編碼(f1d, f1h, f1l and f1p)
nmisscesd = apply(is.na(cesditems), 1, sum) # 計算NA數量
ncesditems = cesditems # 備份一個cesditems
ncesditems[is.na(cesditems)] = 0 # 把ncesditems中的NA指派為0
newcesd = apply(ncesditems, 1, sum) # 計算扣除NA後的cesd
imputemeancesd = 20/(20-nmisscesd)*newcesd # 使用平均值來取代NA
## 列出剛才將NA改成其他數值的表格
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]
## newcesd newds.cesd nmisscesd imputemeancesd
## 4 15 15 1 15.8
## 17 19 19 1 20.0
## 87 44 44 1 46.3
## 101 17 17 1 17.9
## 154 29 29 1 30.5
## 177 44 44 1 46.3
## 229 39 39 1 41.1
## 把連續資料定義成類別資料(abstinent, moderate & highrisk)
library(dplyr) # mutate來自dplyr
library(memisc) # cases來自memisc
## Warning: package 'memisc' was built under R version 3.4.4
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.4.4
##
## Attaching package: 'MASS'
## The following object is masked from 'package:Ecdat':
##
## SP500
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'memisc'
## The following object is masked from 'package:Matrix':
##
## as.array
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:dplyr':
##
## collect, recode, rename
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
## The following object is masked from 'package:base':
##
## as.array
newds = mutate(newds, drinkstat=
cases("abstinent" = i1==0,
"moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) | (i1>0 & i1<=2 & i2<=4 & female==0),
"highrisk" = ((i1>1 | i2>3) & female==1) | ((i1>2 | i2>4) & female==0)))
# 讀取mosaic, 解除memisc與MASS的物件
library(mosaic); detach(package:memisc); detach(package:MASS)
## 檢視資料
library(dplyr) # select, filter出處
tmpds = select(newds, i1, i2, female, drinkstat) # 選擇i1, i2, female, drinkstat四個變項
tmpds[365:370,] # 檢視第365到370筆資料
## i1 i2 female drinkstat
## 365 6 24 0 highrisk
## 366 6 6 0 highrisk
## 367 0 0 0 abstinent
## 368 0 0 1 abstinent
## 369 8 8 0 highrisk
## 370 32 32 0 highrisk
filter(tmpds, drinkstat=="moderate" & female==1) # 檢視tmpds內中度飲酒的女性
## i1 i2 female drinkstat
## 1 1 1 1 moderate
## 2 1 3 1 moderate
## 3 1 2 1 moderate
## 4 1 1 1 moderate
## 5 1 1 1 moderate
## 6 1 1 1 moderate
## 7 1 1 1 moderate
## 製作交叉表
library(gmodels) # CrossTable出處
## Warning: package 'gmodels' was built under R version 3.4.4
with(tmpds, CrossTable(drinkstat)) # 次數與機率表
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | abstinent | moderate | highrisk |
## |-----------|-----------|-----------|
## | 68 | 28 | 357 |
## | 0.150 | 0.062 | 0.788 |
## |-----------|-----------|-----------|
##
##
##
##
with(tmpds, CrossTable(drinkstat, female,
prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE)) # 列為飲酒、欄為性別,根據性別標明百分比(預設prop.r ?)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | female
## drinkstat | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## abstinent | 42 | 26 | 68 |
## | 0.618 | 0.382 | 0.150 |
## -------------|-----------|-----------|-----------|
## moderate | 21 | 7 | 28 |
## | 0.750 | 0.250 | 0.062 |
## -------------|-----------|-----------|-----------|
## highrisk | 283 | 74 | 357 |
## | 0.793 | 0.207 | 0.788 |
## -------------|-----------|-----------|-----------|
## Column Total | 346 | 107 | 453 |
## -------------|-----------|-----------|-----------|
##
##
## 根據女性(0, 1)變項創一個性別("Male","Female")變項
newds = transform(newds,
gender=factor(female, c(0,1), c("Male","Female")))
tally(~ female + gender, margin=FALSE, data=newds)
## gender
## female Male Female
## 0 346 0
## 1 0 107
## dplyr的arrange與filter功能
library(dplyr) # arrange, filter出處
newds = arrange(ds, cesd, i1) # 依序使用cesd和id排序ds資料框
newds[1:5, c("cesd", "i1", "id")] # 展示1到5筆
## cesd i1 id
## 1 1 3 233
## 2 3 1 139
## 3 3 13 418
## 4 4 4 251
## 5 4 9 95
females = filter(ds, female==1) # 選出ds中的女性
with(females, mean(cesd)) # 計算女性平均cesd
## [1] 36.9
mean(ds$cesd[ds$female==1]) # 結果同上
## [1] 36.9
## 平均值
with(ds, tapply(cesd, female, mean)) # 計算男女cesd的平均值
## 0 1
## 31.6 36.9
library(mosaic) # mosaic版本的平均值
mean(cesd ~ female, data=ds) # 用~來界定根據什麼變項算平均值
## 0 1
## 31.6 36.9
head(dta4 <- backpain)
## ID status driver suburban
## 1 1 case yes yes
## 2 1 control yes no
## 3 2 case yes yes
## 4 2 control yes yes
## 5 3 case yes no
## 6 3 control yes yes
dta <- backpain %>%
group_by(status, driver, suburban) %>%
summarise(n = n()) %>%
ungroup() %>%
spread(status, n) %>%
mutate(total = case + control); dta
## # A tibble: 4 x 5
## driver suburban case control total
## <fct> <fct> <int> <int> <int>
## 1 no no 26 47 73
## 2 no yes 6 7 13
## 3 yes no 64 63 127
## 4 yes yes 121 100 221
library(datasets)
dta <- merge(state.x77, USArrests, "row.names")
cor(dta[, -1])
## Population Income Illiteracy Life Exp Murder.x HS Grad
## Population 1.0000 0.2082 0.1076 -0.0681 0.3436 -0.0985
## Income 0.2082 1.0000 -0.4371 0.3403 -0.2301 0.6199
## Illiteracy 0.1076 -0.4371 1.0000 -0.5885 0.7030 -0.6572
## Life Exp -0.0681 0.3403 -0.5885 1.0000 -0.7808 0.5822
## Murder.x 0.3436 -0.2301 0.7030 -0.7808 1.0000 -0.4880
## HS Grad -0.0985 0.6199 -0.6572 0.5822 -0.4880 1.0000
## Frost -0.3322 0.2263 -0.6719 0.2621 -0.5389 0.3668
## Area 0.0225 0.3633 0.0773 -0.1073 0.2284 0.3335
## Murder.y 0.3202 -0.2152 0.7068 -0.7785 0.9337 -0.5216
## Assault 0.3170 0.0409 0.5110 -0.6267 0.7398 -0.2303
## UrbanPop 0.5126 0.4805 -0.0622 0.2715 0.0164 0.3587
## Rape 0.3052 0.3574 0.1546 -0.2696 0.5800 0.2707
## Frost Area Murder.y Assault UrbanPop Rape
## Population -0.3322 0.0225 0.3202 0.3170 0.5126 0.305
## Income 0.2263 0.3633 -0.2152 0.0409 0.4805 0.357
## Illiteracy -0.6719 0.0773 0.7068 0.5110 -0.0622 0.155
## Life Exp 0.2621 -0.1073 -0.7785 -0.6267 0.2715 -0.270
## Murder.x -0.5389 0.2284 0.9337 0.7398 0.0164 0.580
## HS Grad 0.3668 0.3335 -0.5216 -0.2303 0.3587 0.271
## Frost 1.0000 0.0592 -0.5414 -0.4682 -0.2462 -0.279
## Area 0.0592 1.0000 0.1481 0.2312 -0.0615 0.525
## Murder.y -0.5414 0.1481 1.0000 0.8019 0.0696 0.564
## Assault -0.4682 0.2312 0.8019 1.0000 0.2589 0.665
## UrbanPop -0.2462 -0.0615 0.0696 0.2589 1.0000 0.411
## Rape -0.2792 0.5250 0.5636 0.6652 0.4113 1.000
ggpairs(dta[, -1])
NO interesting
library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
##
## Attaching package: 'MASS'
## The following object is masked from 'package:Ecdat':
##
## SP500
## The following object is masked from 'package:dplyr':
##
## select
dta <- merge(rownames_to_column(mammals), rownames_to_column(Animals), all = TRUE)
duplicated(dta)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE
dim(dta)
## [1] 67 3
dta7 <- Caschool %>%
mutate(ratio = enrltot/teachers,
Reading = cut(readscr, breaks = quantile(readscr, probs = c(0, .33, .67, 1)),
label = c("L", "M", "H"), ordered = T))
library(lattice)
xyplot(readscr ~ ratio | Reading, data = dta7,
type = c("p", "g", "r"), layout = c(3, 1),
xlab = "Student-Teacher Ratio",
ylab = "Reading Score")
dta <- Prestige %>%
group_by(type) %>%
summarize(m = median(prestige, na.rm = TRUE)); dta
## # A tibble: 4 x 2
## type m
## <fct> <dbl>
## 1 bc 35.9
## 2 prof 68.4
## 3 wc 41.5
## 4 <NA> 35.0
dta8.2 <- Prestige %>%
na.omit %>%
group_by(type) %>%
mutate(m = median(prestige),
PrestigeLevel = memisc::cases("High" = prestige >= m,
"Low" = prestige < m))
xyplot(income ~ education |type,
group = PrestigeLevel,
data = dta8.2,
type = c("p", "g", "r"),
layout = c(3, 1),
auto.key=list(space="right", row=2),
xlab = "Average education of occupational incumbents in 1971",
ylab = "Average income of incumbents in 1971")
三種職業的聲望高低,對於教育程度與收入之間關係的效果不同。
dta9 <- Hsb82 %>%
group_by(sector, school) %>%
summarise(m = mean(mAch, na.rm = TRUE),
s = sd(mAch, na.rm = TRUE),
n = n(),
se = s/sqrt(n)) %>%
mutate(lower.ci = m-2*se, upper.ci = m+2*se)
ggplot(dta9, aes(x = school, y = m)) +
geom_point() +
geom_errorbar(aes(ymax = upper.ci, ymin = lower.ci))+
coord_flip()+
facet_wrap(~sector)+
labs(x = "School",y = "Math Mean Score by School")