Select at random one school per county in the data set Caschool{Ecdat} and draw a scatter diagram of average math score (mathscr) against average reading score (readscr) for the sampled data set. Make sure your results are reproducible (e.g., the same random sample will be drawn each time).
# input data
::p_load(Ecdat)
pacmandata(Caschool, package="Ecdat")
#了解這個資料中的訊息
?Caschool# 顯示Caschool的資料結構
str(Caschool)
'data.frame': 420 obs. of 17 variables:
$ distcod : int 75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
$ county : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
$ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 214 367 132 270 53 152 383 263 94 ...
$ grspan : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
$ enrltot : int 195 240 1550 243 1335 137 195 888 379 2247 ...
$ teachers: num 10.9 11.1 82.9 14 71.5 ...
$ calwpct : num 0.51 15.42 55.03 36.48 33.11 ...
$ mealpct : num 2.04 47.92 76.32 77.05 78.43 ...
$ computer: int 67 101 169 85 171 25 28 66 35 0 ...
$ testscr : num 691 661 644 648 641 ...
$ compstu : num 0.344 0.421 0.109 0.35 0.128 ...
$ expnstu : num 6385 5099 5502 7102 5236 ...
$ str : num 17.9 21.5 18.7 17.4 18.7 ...
$ avginc : num 22.69 9.82 8.98 8.98 9.08 ...
$ elpct : num 0 4.58 30 0 13.86 ...
$ readscr : num 692 660 636 652 642 ...
$ mathscr : num 690 662 651 644 640 ...
::p_load(Momocs,dplyr)
pacmanset.seed(1234)
<-Caschool |>
dta1 distinct(county, .keep_all = TRUE)|> #distinct 只保留唯一county列,並用.keep_all保留所有欄位資料
sample_n(45) #隨機抽樣本數量
head(dta1)
distcod county district grspan enrltot teachers
104 65557 Mendocino Arena Union Elementary KK-08 337 16.55
45 67199 Riverside Perris Elementary KK-06 4258 221.00
70 70904 Sonoma Roseland Elementary KK-06 1154 62.10
188 64105 Lassen Janesville Union Elementary KK-08 516 26.00
19 64709 Los Angeles Lennox Elementary KK-08 6880 303.03
8 63834 Kern Vineland Elementary KK-08 888 42.50
calwpct mealpct computer testscr compstu expnstu str avginc
104 21.2625 60.1329 26 639.30 0.07715134 6089.911 20.36254 10.0350
45 24.6595 92.7431 324 628.75 0.07609206 5092.917 19.26697 10.0505
70 22.9277 84.9206 129 634.15 0.11178510 6056.497 18.58293 11.6650
188 9.2593 24.8148 65 651.90 0.12596899 4974.281 19.84615 13.4670
19 21.2824 94.9712 960 619.80 0.13953489 5064.616 22.70402 7.0220
8 18.8063 100.0000 66 609.00 0.07432432 4565.746 20.89412 8.1740
elpct readscr mathscr
104 15.430267 642.7 635.9
45 36.801315 626.5 631.0
70 55.892544 636.8 631.5
188 1.937984 656.3 647.5
19 77.005814 619.1 620.5
8 46.959461 605.5 612.5
class(dta1)
[1] "data.frame"
有點不確定,distinct過程中就是隨機只只保留唯一county列了嗎? 如果是的話,應該就不用再寫sample_n(45)
::p_load("basicPlotteR")
pacman# plot
with(dta1, plot(mathscr,readscr, xlab="mathscr", ylab="readscr", pch=20, bty='n', col="lightblue", axes = FALSE)) #bty外框限
abline(lm(mathscr~readscr, data= dta1))
addTextLabels(dta1$mathscr, dta1$readscr,labels=dta1$county, cex.label = 0.5,
col.label = "black", col.line = "grey")
axis(1, at = seq(600,700, by=10))
axis(2, at = seq(600,700, by=10))
grid()
addTextLabels This function is similar to the text() function but it will attempt to re-locate labels that will overlap
Find 133 class-level 95%-confidence intervals for language test score means of the nlschools{MASS} data set by using the tidy approach. The tail end of the data object should looks as follows:
# input data
::p_load(MASS)
pacmandata(nlschools, package="MASS")
#了解這個資料中的訊息
?nlschools# 顯示nlschools的資料結構
str(nlschools)
'data.frame': 2287 obs. of 6 variables:
$ lang : int 46 45 33 46 20 30 30 57 36 36 ...
$ IQ : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
$ class: Factor w/ 133 levels "180","280","1082",..: 1 1 1 1 1 1 1 1 1 1 ...
$ GS : int 29 29 29 29 29 29 29 29 29 29 ...
$ SES : int 23 10 15 23 10 10 23 10 13 15 ...
$ COMB : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
<- nlschools
dta2
# 先利用group_by class,計算每個班級的mean, sd
%<>% group_by(class) %>% dplyr::summarize(Mean = mean(lang), SD =sd(lang)) # |> 用這個無法將summarize資料存到dta2,但用%>%可以
dta2
# 用transmute 新增欄位,並只保留這些欄位
<- dta2 |> transmute(classID = class,
tbl language_mean = Mean,
language_lb = Mean-1.96*SD/sqrt(n()),
language_lu = Mean+1.96*SD/sqrt(n()))
# older ID 並看前六筆
order(tbl$classID),] |> head() |>knitr::kable() tbl[
classID | language_mean | language_lb | language_lu |
---|---|---|---|
180 | 36.40000 | 34.91109 | 37.88891 |
280 | 23.71429 | 22.64069 | 24.78789 |
1082 | 30.40000 | 28.67265 | 32.12735 |
1280 | 30.86667 | 29.68244 | 32.05090 |
1580 | 30.87500 | 28.61403 | 33.13597 |
1680 | 41.50000 | 40.42897 | 42.57103 |
Convert the wide form of the O’Brien and Kaiser’s repeated-measures data in OBrienKaiser{carData} to the long form in OBrienKaiserLong{carData}.
::p_load(carData)
pacmandata(OBrienKaiser, package="carData")
# 了解這個資料中的訊息
?OBrienKaiser# 顯示OBrienKaiser的資料結構
str(OBrienKaiser)
'data.frame': 16 obs. of 17 variables:
$ treatment: Factor w/ 3 levels "control","A",..: 1 1 1 1 1 2 2 2 2 3 ...
..- attr(*, "contrasts")= num [1:3, 1:2] -2 1 1 0 -1 1
.. ..- attr(*, "dimnames")=List of 2
.. .. ..$ : chr [1:3] "control" "A" "B"
.. .. ..$ : NULL
$ gender : Factor w/ 2 levels "F","M": 2 2 2 1 1 2 2 1 1 2 ...
..- attr(*, "contrasts")= chr "contr.sum"
$ pre.1 : num 1 4 5 5 3 7 5 2 3 4 ...
$ pre.2 : num 2 4 6 4 4 8 5 3 3 4 ...
$ pre.3 : num 4 5 5 7 6 7 6 5 4 5 ...
$ pre.4 : num 2 3 7 5 4 9 4 3 6 3 ...
$ pre.5 : num 1 4 7 4 3 9 5 2 4 4 ...
$ post.1 : num 3 2 4 2 6 9 7 2 4 6 ...
$ post.2 : num 2 2 5 2 7 9 7 4 5 7 ...
$ post.3 : num 5 3 7 3 8 10 8 8 6 6 ...
$ post.4 : num 3 5 5 5 6 8 10 6 4 8 ...
$ post.5 : num 2 3 4 3 3 9 8 5 1 8 ...
$ fup.1 : num 2 4 7 4 4 9 8 6 5 8 ...
$ fup.2 : num 3 5 6 4 3 10 9 6 4 8 ...
$ fup.3 : num 2 6 9 5 6 11 11 7 7 9 ...
$ fup.4 : num 4 4 7 3 4 9 9 5 5 7 ...
$ fup.5 : num 4 1 6 4 3 6 8 6 4 8 ...
::p_load(carData)
pacmandata(OBrienKaiserLong, package="carData")
# 了解這個資料中的訊息
?OBrienKaiserLong# 顯示OBrienKaiserLong的資料結構
str(OBrienKaiserLong)
'data.frame': 240 obs. of 6 variables:
$ treatment: Factor w/ 3 levels "control","A",..: 1 1 1 1 1 1 1 1 1 1 ...
$ gender : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
$ score : num 1 2 4 2 1 3 2 5 3 2 ...
$ id : int 1 1 1 1 1 1 1 1 1 1 ...
$ phase : Factor w/ 3 levels "pre","post","fup": 1 1 1 1 1 2 2 2 2 2 ...
$ hour : Factor w/ 5 levels "1","2","3","4",..: 1 2 3 4 5 1 2 3 4 5 ...
- attr(*, "reshapeLong")=List of 4
..$ varying:List of 1
.. ..$ score: chr [1:15] "pre.1" "pre.2" "pre.3" "pre.4" ...
.. ..- attr(*, "v.names")= chr "score"
.. ..- attr(*, "times")= int [1:15] 1 2 3 4 5 6 7 8 9 10 ...
..$ v.names: chr "score"
..$ idvar : chr "id"
..$ timevar: chr "phase.hour"
library(tidyverse)
# 將OBrienKaiser存入dta3中
<-OBrienKaiser
dta3
# 先給他id,再轉為long form
<- dta3 |>
dtaL mutate(id = rep(1:n()))|>
pivot_longer(
cols = starts_with(c("pre","post","fup")), #辨識以pre, post, fup開頭的欄位
names_to = "Time", #欄位名稱為Time
values_to = "score") #值會放入score
%<>%
dtaL separate(Time, c("phase","hour")) |> #將Time拆解為phase與hour兩欄
mutate(hour=parse_number(hour)) # hour透過parse_number擷取"pre.x"後的x(數字)
# A tibble: 240 x 6
treatment gender id phase hour score
<fct> <fct> <int> <chr> <dbl> <dbl>
1 control M 1 pre 1 1
2 control M 1 pre 2 2
3 control M 1 pre 3 4
4 control M 1 pre 4 2
5 control M 1 pre 5 1
6 control M 1 post 1 3
7 control M 1 post 2 2
8 control M 1 post 3 5
9 control M 1 post 4 3
10 control M 1 post 5 2
# ... with 230 more rows
# change column order
<- dtaL[,c(1,2,6,3,4,5)]
dtaL2
# check our results is correct
head(dtaL2,n=16L)
# A tibble: 16 x 6
treatment gender score id phase hour
<fct> <fct> <dbl> <int> <chr> <chr>
1 control M 1 1 pre 1
2 control M 2 1 pre 2
3 control M 4 1 pre 3
4 control M 2 1 pre 4
5 control M 1 1 pre 5
6 control M 3 1 post 1
7 control M 2 1 post 2
8 control M 5 1 post 3
9 control M 3 1 post 4
10 control M 2 1 post 5
11 control M 2 1 fup 1
12 control M 3 1 fup 2
13 control M 2 1 fup 3
14 control M 4 1 fup 4
15 control M 4 1 fup 5
16 control M 4 2 pre 1
head(OBrienKaiserLong,n=16L)
treatment gender score id phase hour
1.1 control M 1 1 pre 1
1.2 control M 2 1 pre 2
1.3 control M 4 1 pre 3
1.4 control M 2 1 pre 4
1.5 control M 1 1 pre 5
1.6 control M 3 1 post 1
1.7 control M 2 1 post 2
1.8 control M 5 1 post 3
1.9 control M 3 1 post 4
1.10 control M 2 1 post 5
1.11 control M 2 1 fup 1
1.12 control M 3 1 fup 2
1.13 control M 2 1 fup 3
1.14 control M 4 1 fup 4
1.15 control M 4 1 fup 5
2.1 control M 4 2 pre 1
Information about student majors, activities in intramural sports, and survey responses about their satisfaction with the on-campus cafeteria (for the variety and quality of the food served, and hours of operation) are recorded in three separate plain text files by three different administrative units: majors, sports, cafeteria
Input these files into an R session, merge them and output the results as in the comma-separate values file .
# input data
<- read.table("student_major.txt", header = TRUE)
majors <-read.table("student_sport.txt", header = TRUE)
sports <-read.table("student_cafe.txt", header = TRUE)
cafeteria
# 存入list比較方便後續merge動作
<-list(majors,sports,cafeteria)
listdta3::glimpse(listdta3) dplyr
List of 3
$ :'data.frame': 14 obs. of 2 variables:
..$ id : int [1:14] 123 345 624 789 851 555 992 484 717 839 ...
..$ major: chr [1:14] "Mathematics" "Computer Science" "History" "Biology" ...
$ :'data.frame': 13 obs. of 2 variables:
..$ id : int [1:13] 123 123 222 385 385 481 555 717 789 851 ...
..$ sport: chr [1:13] "basketball" "badminton" "badminton" "basketball" ...
$ :'data.frame': 7 obs. of 4 variables:
..$ id : int [1:7] 123 345 385 481 624 851 992
..$ variety: int [1:7] 3 4 1 2 2 5 4
..$ quality: int [1:7] 4 4 1 3 3 4 5
..$ hours : int [1:7] 2 2 2 3 4 1 3
read_table 讀進來會出現空白欄位與quote on chr (ex:major,sport),使用read.table比較省事
differ read.table and read_table
library(tidyverse)
<- listdta3 %>% reduce(full_join, by='id') # full_join return all rows and columns from all datasets
tidydta3#
write.csv(tidydta3, "tidydta3.csv", sep = "," ,row.names = F)
Warning in write.csv(mrgdta3, “mrgdta3.csv”, sep = “,”) : attempt to set ‘sep’ ignored
write.csv()預設就是以”,“為分隔號,所以R說他忽略’sep’
write.csv() uses “.” for the decimal point and a comma (“,”) for the separator.
write.csv2() uses a comma (“,”) for the decimal point and a semicolon (“;”) for the separator. differ write.csv and write.csv2
# Roll our own merging function
<- function(f1, f2){
mrg merge(f1, f2, by="id", all=T)
}
# as.data.frame is easier to read in Rstudio
<-Reduce(mrg, listdta3) %>% as.data.frame()
mrgdta3
#
write.csv(mrgdta3, "mrgdta3.csv" ,row.names = F) #可以省略sep
The HELP (Health Evaluation and Linkage to Primary Care) study was a clinical trial for adult inpatients recruited from a detoxification unit. Patients with no primary care physician were randomized to receive a multidisciplinary assessment and a brief motivational intervention or usual care, with the goal of linking them to primary medical care. Eligible subjects were adults, who spoke Spanish or English, reported alcohol, heroin or cocaine as their first or second drug of choice, resided in proximity to the primary care clinic to which they would be referred or were homeless. Subjects were interviewed at baseline during their detoxification stay and follow-up interviews were undertaken every 6 months for 2 years. A variety of continuous, count, discrete, and survival time predictors and outcomes were collected at each of these five occasions. The following R script is used to manage the data file at the initial stage of investigation. Provide comments on what each line of the script is meant to achieve.
Source: Kleinman, K., & Horton, N.J. (2015). Using R for Data Management, Statistical Analysis, and Graphics.
options(continue=" ")
options(digits=3)
options(width=72) # narrow output
= read.csv("https://nhorton.people.amherst.edu/sasr2/datasets/help.csv")
ds library(dplyr)
# 選擇變項
<- ds |> dplyr::select(cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t) newds
names(newds) # show all column name
[1] "cesd" "female" "i1" "i2" "id" "treat" "f1a"
[8] "f1b" "f1c" "f1d" "f1e" "f1f" "f1g" "f1h"
[15] "f1i" "f1j" "f1k" "f1l" "f1m" "f1n" "f1o"
[22] "f1p" "f1q" "f1r" "f1s" "f1t"
str(newds[,1:10]) # structure of the first 10 variables
'data.frame': 453 obs. of 10 variables:
$ cesd : int 49 30 39 15 39 6 52 32 50 46 ...
$ female: int 0 0 0 1 0 1 1 0 1 0 ...
$ i1 : int 13 56 0 5 10 4 13 12 71 20 ...
$ i2 : int 26 62 0 5 13 4 20 24 129 27 ...
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ treat : int 1 1 0 0 0 1 0 1 0 1 ...
$ f1a : int 3 3 3 0 3 1 3 1 3 2 ...
$ f1b : int 2 2 2 0 0 0 1 1 2 3 ...
$ f1c : int 3 0 3 1 3 1 3 2 3 3 ...
$ f1d : int 0 3 0 3 3 3 1 3 1 0 ...
summary(newds[,1:10]) # summary of the first 10 variables
cesd female i1 i2
Min. : 1.0 Min. :0.000 Min. : 0.0 Min. : 0.0
1st Qu.:25.0 1st Qu.:0.000 1st Qu.: 3.0 1st Qu.: 3.0
Median :34.0 Median :0.000 Median : 13.0 Median : 15.0
Mean :32.8 Mean :0.236 Mean : 17.9 Mean : 22.6
3rd Qu.:41.0 3rd Qu.:0.000 3rd Qu.: 26.0 3rd Qu.: 32.0
Max. :60.0 Max. :1.000 Max. :142.0 Max. :184.0
id treat f1a f1b
Min. : 1 Min. :0.000 Min. :0.00 Min. :0.00
1st Qu.:119 1st Qu.:0.000 1st Qu.:1.00 1st Qu.:0.00
Median :233 Median :0.000 Median :2.00 Median :1.00
Mean :233 Mean :0.497 Mean :1.63 Mean :1.39
3rd Qu.:348 3rd Qu.:1.000 3rd Qu.:3.00 3rd Qu.:2.00
Max. :470 Max. :1.000 Max. :3.00 Max. :3.00
f1c f1d
Min. :0.00 Min. :0.00
1st Qu.:1.00 1st Qu.:0.00
Median :2.00 Median :1.00
Mean :1.92 Mean :1.56
3rd Qu.:3.00 3rd Qu.:3.00
Max. :3.00 Max. :3.00
head(newds, n=3)
cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
1 49 0 13 26 1 1 3 2 3 0 2 3 3 0 2 3
2 30 0 56 62 2 1 3 2 0 3 3 2 0 0 3 0
3 39 0 0 0 3 0 3 2 3 0 2 2 1 3 2 3
f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
1 3 0 1 2 2 2 2 3 3 2
2 3 0 0 3 0 0 0 2 0 0
3 1 0 1 3 2 0 0 3 2 0
# comment
comment(newds) = "HELP baseline dataset"
comment(newds)
[1] "HELP baseline dataset"
# output R object
save(ds, file="savedfile") #Save R Objects can use load to read back
load("savedfile")
# output csv
write.csv(ds, file="ds.csv")
# 存成SAS可讀的檔案和code(but檔案file.dat在SAS無法開啟)
library(foreign)
write.foreign(newds, "file.dat", "file.sas", package="SAS")
file.show("file.dat")
write.foreign write.foreign(df, datafile, codefile, package = c(“SPSS”, “Stata”, “SAS”), …)
#用with attach data,並列出cesd前10的數值
with(newds, cesd[1:10])
[1] 49 30 39 15 39 6 52 32 50 46
#同上,改用head列出前10數值
with(newds, head(cesd, 10))
[1] 49 30 39 15 39 6 52 32 50 46
#在cesd欄取出cesd > 56
with(newds, cesd[cesd > 56])
[1] 57 58 57 60 58 58 57
#改用filter做一樣的事情
library(dplyr)
filter(newds, cesd > 56) %>% dplyr::select(id, cesd)
id cesd
1 71 57
2 127 58
3 200 57
4 228 60
5 273 58
6 351 58
7 13 57
with(newds, sort(cesd)[1:4])
[1] 1 3 3 4
with(newds, which.min(cesd)) #location of the (first) minimum vector
[1] 199
library(mosaic)
tally(~ is.na(f1g), data=newds)
is.na(f1g)
TRUE FALSE
1 452
favstats(~ f1g, data=newds)
min Q1 median Q3 max mean sd n missing
0 1 2 3 3 1.73 1.1 452 1
Counts/tally observations by group.
“favstats” is short for “favorite statistics”: it will give you the some of the most popular summary statistics for numerical variables.
# reverse code f1d, f1h, f1l and f1p
#cbind多個欄位,並直接透過3-VARIABLE 達到 reverse code
= with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g,
cesditems 3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p),
(
f1q, f1r, f1s, f1t))
# 計算每一列在cesditems 的NA總數
= apply(is.na(cesditems), 1, sum)
nmisscesd # 將cesditems 放入 ncesditems中
= cesditems
ncesditems # 如果ncesditems是NA的話,把它改為0
is.na(cesditems)] = 0
ncesditems[# 計算每一列在ncesditems 的總分
= apply(ncesditems, 1, sum)
newcesd # 填補遺漏值後的cesd總分?
= 20/(20-nmisscesd)*newcesd
imputemeancesd # 比較三個不同計算方式的cesd總分,並關注有na的列(nmisscesd>0)
<-data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,] dta5
newds$cesd = newcesd
原始資料中是沒有進行遺漏值填補的值,其與我們將NA視為0計算之總分相同。
nmisscesd呈現的是遺漏多少個題目,imputemeancesd是經由填補後的CESD分數
::p_load(dplyr,memisc)
pacman= mutate(newds, drinkstat=
newds cases(
"abstinent" = i1==0,
"moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
>0 & i1<=2 & i2<=4 & female==0),
(i1"highrisk" = ((i1>1 | i2>3) & female==1) |
>2 | i2>4) & female==0))) ((i1
建立drinkstat新欄位,並透過條件設定將之分為abstinent、moderate、highrisk三組 cases 與 case_when()功能相同,可用於多重判斷時
library(mosaic)
detach(package:memisc)
detach(package:MASS)
library(dplyr)
= select(newds, i1, i2, female, drinkstat)#擷取newds中的四個變項,形成一個新物件tmpds
tmpds 365:370,] #列出 tmpds 中第 365 到 370 筆資料 tmpds[
i1 i2 female drinkstat
365 6 24 0 highrisk
366 6 6 0 highrisk
367 0 0 0 abstinent
368 0 0 1 abstinent
369 8 8 0 highrisk
370 32 32 0 highrisk
library(dplyr)
filter(tmpds, drinkstat=="moderate" & female==1)#篩選出 tmpds 中,drinkstat 為 moderate 且 female為1的列
i1 i2 female drinkstat
1 1 1 1 moderate
2 1 3 1 moderate
3 1 2 1 moderate
4 1 1 1 moderate
5 1 1 1 moderate
6 1 1 1 moderate
7 1 1 1 moderate
# CrossTable
library(gmodels)
with(tmpds, CrossTable(drinkstat)) # 顯示 drinkstat 三組的n與%
Cell Contents
|-------------------------|
| N |
| N / Table Total |
|-------------------------|
Total Observations in Table: 453
| abstinent | moderate | highrisk |
|-----------|-----------|-----------|
| 68 | 28 | 357 |
| 0.150 | 0.062 | 0.788 |
|-----------|-----------|-----------|
with(tmpds, CrossTable(drinkstat, female,
prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE)) #顯示 drinkstat 和 female 變項的列連表
Cell Contents
|-------------------------|
| N |
| N / Row Total |
|-------------------------|
Total Observations in Table: 453
| female
drinkstat | 0 | 1 | Row Total |
-------------|-----------|-----------|-----------|
abstinent | 42 | 26 | 68 |
| 0.618 | 0.382 | 0.150 |
-------------|-----------|-----------|-----------|
moderate | 21 | 7 | 28 |
| 0.750 | 0.250 | 0.062 |
-------------|-----------|-----------|-----------|
highrisk | 283 | 74 | 357 |
| 0.793 | 0.207 | 0.788 |
-------------|-----------|-----------|-----------|
Column Total | 346 | 107 | 453 |
-------------|-----------|-----------|-----------|
# 新增gender欄位並將原本female欄位的0 改變(transform)為Male, 1為Female
= transform(newds,
newds gender=factor(female, c(0,1), c("Male","Female")))
# 用tally產生列連表
tally(~ female + gender, margin=FALSE, data=newds)
gender
female Male Female
0 346 0
1 0 107
# 資料重新排列
library(dplyr)
= arrange(ds, cesd, i1) # 將ds 資料重新由小到大排列,第一層依據為 cesd,第二層依據為i1
newds 1:5, c("cesd", "i1", "id")] #顯示 newds 中,變項 cesd、i1 和 id 的前五筆資料 newds[
cesd i1 id
1 1 3 233
2 3 1 139
3 3 13 418
4 4 4 251
5 4 9 95
#
library(dplyr)
= filter(ds, female==1) # 篩選 ds 的資料,篩選條件為 female 值為 1 的資料,存為一個新物件 females
females with(females, mean(cesd)) # 計算females物件中cesd的平均數
[1] 36.9
# an alternative approach
mean(ds$cesd[ds$female==1]) # 先抓ds$female為1,再透過ds$cesd[]列出cesd中女性的值並做平均
[1] 36.9
with(ds, tapply(cesd, female, mean)) # 透過tapply針對female 欄位計算 cesd的平均分數
0 1
31.6 36.9
library(mosaic)
mean(cesd ~ female, data=ds) # cesd分數按性別分別計算其平均
0 1
31.6 36.9