Select at random one school per county in the data set Caschool{Ecdat} and draw a scatter diagram of average math score mathscr against average reading score readscr for the sampled data set.
Make sure your results are reproducible (e.g., the same random sample will be drawn each time).
#install package and download data
::p_load(Ecdat)
pacmandata(Caschool, package="Ecdat")
#show first 6 lines data
|> head() |> knitr::kable() Caschool
distcod | county | district | grspan | enrltot | teachers | calwpct | mealpct | computer | testscr | compstu | expnstu | str | avginc | elpct | readscr | mathscr |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
75119 | Alameda | Sunol Glen Unified | KK-08 | 195 | 10.90 | 0.5102 | 2.0408 | 67 | 690.80 | 0.3435898 | 6384.911 | 17.88991 | 22.690001 | 0.000000 | 691.6 | 690.0 |
61499 | Butte | Manzanita Elementary | KK-08 | 240 | 11.15 | 15.4167 | 47.9167 | 101 | 661.20 | 0.4208333 | 5099.381 | 21.52466 | 9.824000 | 4.583334 | 660.5 | 661.9 |
61549 | Butte | Thermalito Union Elementary | KK-08 | 1550 | 82.90 | 55.0323 | 76.3226 | 169 | 643.60 | 0.1090323 | 5501.955 | 18.69723 | 8.978000 | 30.000002 | 636.3 | 650.9 |
61457 | Butte | Golden Feather Union Elementary | KK-08 | 243 | 14.00 | 36.4754 | 77.0492 | 85 | 647.70 | 0.3497942 | 7101.831 | 17.35714 | 8.978000 | 0.000000 | 651.9 | 643.5 |
61523 | Butte | Palermo Union Elementary | KK-08 | 1335 | 71.50 | 33.1086 | 78.4270 | 171 | 640.85 | 0.1280899 | 5235.988 | 18.67133 | 9.080333 | 13.857677 | 641.8 | 639.9 |
62042 | Fresno | Burrel Union Elementary | KK-08 | 137 | 6.40 | 12.3188 | 86.9565 | 25 | 605.55 | 0.1824818 | 5580.147 | 21.40625 | 10.415000 | 12.408759 | 605.7 | 605.4 |
#了解資料
?Caschool
## starting httpd help server ... done
#
str(Caschool)
## 'data.frame': 420 obs. of 17 variables:
## $ distcod : int 75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
## $ county : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
## $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 214 367 132 270 53 152 383 263 94 ...
## $ grspan : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
## $ enrltot : int 195 240 1550 243 1335 137 195 888 379 2247 ...
## $ teachers: num 10.9 11.1 82.9 14 71.5 ...
## $ calwpct : num 0.51 15.42 55.03 36.48 33.11 ...
## $ mealpct : num 2.04 47.92 76.32 77.05 78.43 ...
## $ computer: int 67 101 169 85 171 25 28 66 35 0 ...
## $ testscr : num 691 661 644 648 641 ...
## $ compstu : num 0.344 0.421 0.109 0.35 0.128 ...
## $ expnstu : num 6385 5099 5502 7102 5236 ...
## $ str : num 17.9 21.5 18.7 17.4 18.7 ...
## $ avginc : num 22.69 9.82 8.98 8.98 9.08 ...
## $ elpct : num 0 4.58 30 0 13.86 ...
## $ readscr : num 692 660 636 652 642 ...
## $ mathscr : num 690 662 651 644 640 ...
::p_load(tidyverse,dplyr,sampling)
pacman#
<-Caschool |> #copy data
q1dtamutate(SID=paste0("S", 101:520))|> #add SID
as.data.frame()
# #set initial value to make sure the results are reproducible
set.seed(1111)
<-q1dta |>
q1_samplegroup_by(county)|> #grouping by country
sample_n(1) #randomly select 1 school per county
summary(q1_sample[,16:17])
## readscr mathscr
## Min. :604.5 Min. :605.4
## 1st Qu.:642.7 1st Qu.:639.6
## Median :656.1 Median :650.2
## Mean :655.6 Mean :651.9
## 3rd Qu.:667.6 3rd Qu.:662.3
## Max. :693.7 Max. :690.1
#plot
with(q1_sample, plot(mathscr,readscr,
xlab = "Average Math Score",
ylab = "Average Reading Score",
main = "Relationship of Average Score between Math and Reading",
xlim = c(600,700),
ylim = c(600,700),
type='n',
bty='n', #移除外框
pch=20))
grid() #增加輔助線
abline(lm(readscr ~ mathscr, data=q1_sample))
with(q1_sample, text(mathscr,readscr,
labels=county, #標註county
cex=.5))
Find 133 class-level 95%-confidence intervals for language test score means of the nlschools{MASS} data set by using the tidy approach.
The tail end of the data object should looks as follows:
# input data
::p_load(MASS,dplyr)
pacmandata(nlschools, package="MASS")
#show first 6 lines data
|> head() |> knitr::kable() nlschools
lang | IQ | class | GS | SES | COMB |
---|---|---|---|---|---|
46 | 15.0 | 180 | 29 | 23 | 0 |
45 | 14.5 | 180 | 29 | 10 | 0 |
33 | 9.5 | 180 | 29 | 15 | 0 |
46 | 11.0 | 180 | 29 | 23 | 0 |
20 | 8.0 | 180 | 29 | 10 | 0 |
30 | 9.5 | 180 | 29 | 10 | 0 |
#
?nlschools
lang=language test score.
IQ=verbal IQ
class=class ID
GS=class size:number of eighth-grade pupils recorded in the class (there may be others: see COMB, and some may have been omitted with missing values).
SES=social-economic status of pupil’s family.
COMB=were the pupils taught in a multi-grade class (0/1)? Classes which contained pupils from grades 7 and 8 are coded 1, but only eighth-graders were tested.
#copy data
<-nlschools
q2dta#exam data structure
str(q2dta)
## 'data.frame': 2287 obs. of 6 variables:
## $ lang : int 46 45 33 46 20 30 30 57 36 36 ...
## $ IQ : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
## $ class: Factor w/ 133 levels "180","280","1082",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ GS : int 29 29 29 29 29 29 29 29 29 29 ...
## $ SES : int 23 10 15 23 10 10 23 10 13 15 ...
## $ COMB : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
class(q2dta)
## [1] "data.frame"
typeof(q2dta)
## [1] "list"
summary(q2dta)
## lang IQ class GS SES
## Min. : 9.00 Min. : 4.00 15580 : 33 Min. :10.00 Min. :10.00
## 1st Qu.:35.00 1st Qu.:10.50 5480 : 31 1st Qu.:23.00 1st Qu.:20.00
## Median :42.00 Median :12.00 15980 : 31 Median :27.00 Median :27.00
## Mean :40.93 Mean :11.83 16180 : 31 Mean :26.51 Mean :27.81
## 3rd Qu.:48.00 3rd Qu.:13.00 18380 : 31 3rd Qu.:31.00 3rd Qu.:35.00
## Max. :58.00 Max. :18.00 5580 : 30 Max. :39.00 Max. :50.00
## (Other):2100
## COMB
## 0:1658
## 1: 629
##
##
##
##
##
# 徒手計算95% CI
<-q2dta |>
q2answerfilter(!is.na(lang),
!is.na(class)) |> #排除遺漏值
group_by(class)|> #grouping by class
summarize(language_mean = mean(lang),language_sd =sd(lang),N=length(lang))|> #calculate the mean,sd,se
mutate(language_se=language_sd/sqrt(N),tscore = qt(p=0.05/2, df=(N-1),lower.tail=F))|>
mutate(
language_lb = language_mean-(tscore*language_sd/sqrt(N)),
language_ub = language_mean+(tscore*language_sd/sqrt(N)))|>
arrange(desc(language_mean))
|>
q2answer::select(class,language_mean,language_lb,language_ub)|>
dplyrarrange(desc(language_mean))|>
tail()
## # A tibble: 6 x 4
## class language_mean language_lb language_ub
## <fct> <dbl> <dbl> <dbl>
## 1 17980 30.2 20.8 39.5
## 2 25680 29.3 19.8 38.8
## 3 25880 28.4 22.0 34.9
## 4 280 23.7 17.9 29.6
## 5 4780 23.5 19.5 27.5
## 6 10380 20 9.45 30.6
#check answer again
summary(q2answer)
## class language_mean language_sd N language_se
## 180 : 1 Min. :20.00 Min. : 4.272 Min. : 4.0 Min. :1.075
## 280 : 1 1st Qu.:37.05 1st Qu.: 6.623 1st Qu.:11.0 1st Qu.:1.511
## 1082 : 1 Median :40.72 Median : 7.821 Median :17.0 Median :1.820
## 1280 : 1 Mean :40.14 Mean : 7.956 Mean :17.2 Mean :2.089
## 1580 : 1 3rd Qu.:43.82 3rd Qu.: 8.931 3rd Qu.:23.0 3rd Qu.:2.464
## 1680 : 1 Max. :49.19 Max. :14.072 Max. :33.0 Max. :4.703
## (Other):127
## tscore language_lb language_ub
## Min. :2.037 Min. : 9.445 Min. :27.50
## 1st Qu.:2.074 1st Qu.:31.598 1st Qu.:43.02
## Median :2.120 Median :36.916 Median :45.28
## Mean :2.181 Mean :35.494 Mean :44.78
## 3rd Qu.:2.228 3rd Qu.:39.804 3rd Qu.:47.86
## Max. :3.182 Max. :46.948 Max. :52.35
##
再次檢查:language_mean的範圍與題目給的答案不符….>“<
Convert the wide form of the O’Brien and Kaiser’s repeated-measures data in OBrienKaiser{carData} to the long form in OBrienKaiserLong{carData}.
# install tool packages
::p_load(carData,tidyr,tidyverse,magrittr)
pacman# load and input data
data(OBrienKaiser, package = "carData")
#show first 6 lines data
|> head() |> knitr::kable() OBrienKaiser
treatment | gender | pre.1 | pre.2 | pre.3 | pre.4 | pre.5 | post.1 | post.2 | post.3 | post.4 | post.5 | fup.1 | fup.2 | fup.3 | fup.4 | fup.5 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
control | M | 1 | 2 | 4 | 2 | 1 | 3 | 2 | 5 | 3 | 2 | 2 | 3 | 2 | 4 | 4 |
control | M | 4 | 4 | 5 | 3 | 4 | 2 | 2 | 3 | 5 | 3 | 4 | 5 | 6 | 4 | 1 |
control | M | 5 | 6 | 5 | 7 | 7 | 4 | 5 | 7 | 5 | 4 | 7 | 6 | 9 | 7 | 6 |
control | F | 5 | 4 | 7 | 5 | 4 | 2 | 2 | 3 | 5 | 3 | 4 | 4 | 5 | 3 | 4 |
control | F | 3 | 4 | 6 | 4 | 3 | 6 | 7 | 8 | 6 | 3 | 4 | 3 | 6 | 4 | 3 |
A | M | 7 | 8 | 7 | 9 | 9 | 9 | 9 | 10 | 8 | 9 | 9 | 10 | 11 | 9 | 6 |
?OBrienKaiser
#Tibble
<- as_tibble(OBrienKaiser) %>%
q3dta mutate(ID=paste0("S", 101:116)) #16 observations
|> as.data.frame() |> head() q3dta
## treatment gender pre.1 pre.2 pre.3 pre.4 pre.5 post.1 post.2 post.3 post.4
## 1 control M 1 2 4 2 1 3 2 5 3
## 2 control M 4 4 5 3 4 2 2 3 5
## 3 control M 5 6 5 7 7 4 5 7 5
## 4 control F 5 4 7 5 4 2 2 3 5
## 5 control F 3 4 6 4 3 6 7 8 6
## 6 A M 7 8 7 9 9 9 9 10 8
## post.5 fup.1 fup.2 fup.3 fup.4 fup.5 ID
## 1 2 2 3 2 4 4 S101
## 2 3 4 5 6 4 1 S102
## 3 4 7 6 9 7 6 S103
## 4 3 4 4 5 3 4 S104
## 5 3 4 3 6 4 3 S105
## 6 9 9 10 11 9 6 S106
#Pivot data from wide to long
<- q3dta |>
dtaL pivot_longer(cols=starts_with(c("pre","post","fup")), #三種不同階段
names_to="Stage",
values_to="Hours") |>
arrange(ID)
|> as.data.frame() |> head(20) dtaL
## treatment gender ID Stage Hours
## 1 control M S101 pre.1 1
## 2 control M S101 pre.2 2
## 3 control M S101 pre.3 4
## 4 control M S101 pre.4 2
## 5 control M S101 pre.5 1
## 6 control M S101 post.1 3
## 7 control M S101 post.2 2
## 8 control M S101 post.3 5
## 9 control M S101 post.4 3
## 10 control M S101 post.5 2
## 11 control M S101 fup.1 2
## 12 control M S101 fup.2 3
## 13 control M S101 fup.3 2
## 14 control M S101 fup.4 4
## 15 control M S101 fup.5 4
## 16 control M S102 pre.1 4
## 17 control M S102 pre.2 4
## 18 control M S102 pre.3 5
## 19 control M S102 pre.4 3
## 20 control M S102 pre.5 4
#Pull apart a variable - separate
%<>%
dtaL separate(Stage, c("stage_name", "Times")) |> #將stage拆成"階段名"和"次數"
as.data.frame() |> head(20)
## treatment gender ID stage_name Times Hours
## 1 control M S101 pre 1 1
## 2 control M S101 pre 2 2
## 3 control M S101 pre 3 4
## 4 control M S101 pre 4 2
## 5 control M S101 pre 5 1
## 6 control M S101 post 1 3
## 7 control M S101 post 2 2
## 8 control M S101 post 3 5
## 9 control M S101 post 4 3
## 10 control M S101 post 5 2
## 11 control M S101 fup 1 2
## 12 control M S101 fup 2 3
## 13 control M S101 fup 3 2
## 14 control M S101 fup 4 4
## 15 control M S101 fup 5 4
## 16 control M S102 pre 1 4
## 17 control M S102 pre 2 4
## 18 control M S102 pre 3 5
## 19 control M S102 pre 4 3
## 20 control M S102 pre 5 4
Information about student majors, activities in intramural sports, and survey responses about their satisfaction with the on-campus cafeteria (for the variety and quality of the food served, and hours of operation) are recorded in three separate plain text files by three different administrative units:majors、sports、cafeteria
Input these files into an R session, merge them and output the results as in the comma-separate values file.
# Import plain text files
#fL1 <- "http://140.116.183.121/~sheu/dataM/05_rdw/data/student_major.txt"
#majors <- read.table(fL1,header = T) #無法直接從網路讀取資料
<- read.table("C:/Users/Ching-Fang Wu/Documents/dataM/student_major.txt", header = TRUE)
majors
<-read.table("C:/Users/Ching-Fang Wu/Documents/dataM/student_sport.txt", header = TRUE)
sports
<-read.table("C:/Users/Ching-Fang Wu/Documents/dataM/student_cafe.txt", header = TRUE) cafeteria
#
head(majors)
## id major
## 1 123 Mathematics
## 2 345 Computer Science
## 3 624 History
## 4 789 Biology
## 5 851 Computer Science
## 6 555 History
str(majors)
## 'data.frame': 14 obs. of 2 variables:
## $ id : int 123 345 624 789 851 555 992 484 717 839 ...
## $ major: chr "Mathematics" "Computer Science" "History" "Biology" ...
#
head(sports)
## id sport
## 1 123 basketball
## 2 123 badminton
## 3 222 badminton
## 4 385 basketball
## 5 385 volleyball
## 6 481 basketball
str(sports)
## 'data.frame': 13 obs. of 2 variables:
## $ id : int 123 123 222 385 385 481 555 717 789 851 ...
## $ sport: chr "basketball" "badminton" "badminton" "basketball" ...
#
head(cafeteria)
## id variety quality hours
## 1 123 3 4 2
## 2 345 4 4 2
## 3 385 1 1 2
## 4 481 2 3 3
## 5 624 2 3 4
## 6 851 5 4 1
str(cafeteria)
## 'data.frame': 7 obs. of 4 variables:
## $ id : int 123 345 385 481 624 851 992
## $ variety: int 3 4 1 2 2 5 4
## $ quality: int 4 4 1 3 3 4 5
## $ hours : int 2 2 2 3 4 1 3
#三個檔案都有ID,依據ID merge data
<- merge(majors, sports,by="id", all = TRUE) #all if TRUE, then extra rows/column will be added to the output
q4dta1
<- merge(q4dta1, cafeteria,by="id", all = TRUE)
q4dta2
head(q4dta2)
## id major sport variety quality hours
## 1 123 Mathematics badminton 3 4 2
## 2 123 Mathematics basketball 3 4 2
## 3 222 Computer Science badminton NA NA NA
## 4 345 Computer Science <NA> 4 4 2
## 5 385 Mathematics volleyball 1 1 2
## 6 385 Mathematics basketball 1 1 2
#Saving to comma-separated-value file (L4講義第29頁)
write.csv(q4dta2, file="./tmp_data/q4dta2.csv", quote=F, row.names=F)
head(read.csv("./tmp_data/q4dta2.csv"), 3)
## id major sport variety quality hours
## 1 123 Mathematics badminton 3 4 2
## 2 123 Mathematics basketball 3 4 2
## 3 222 Computer Science badminton NA NA NA
The HELP (Health Evaluation and Linkage to Primary Care) study was a clinical trial for adult inpatients recruited from a detoxification unit.
Patients with no primary care physician were randomized to receive a multidisciplinary assessment and a brief motivational intervention or usual care, with the goal of linking them to primary medical care.
Eligible subjects were adults, who spoke Spanish or English, reported alcohol, heroin or cocaine as their first or second drug of choice, resided in proximity to the primary care clinic to which they would be referred or were homeless.
Subjects were interviewed at baseline during their detoxification stay and follow-up interviews were undertaken every 6 months for 2 years.
A variety of continuous, count, discrete, and survival time predictors and outcomes were collected at each of these five occasions.
The following R script is used to manage the data file at the initial stage of investigation. Provide comments on what each line of the script is meant to achieve.
Source: Kleinman, K., & Horton, N.J. (2015). Using R for Data Management, Statistical Analysis, and Graphics.
#input data
<- read.csv("https://nhorton.people.amherst.edu/sasr2/datasets/help.csv")
ds
library(dplyr)
# 選取變數
= dplyr::select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e,
newds f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)
names(newds) # 欄位(變數)名稱
## [1] "cesd" "female" "i1" "i2" "id" "treat" "f1a"
## [8] "f1b" "f1c" "f1d" "f1e" "f1f" "f1g" "f1h"
## [15] "f1i" "f1j" "f1k" "f1l" "f1m" "f1n" "f1o"
## [22] "f1p" "f1q" "f1r" "f1s" "f1t"
str(newds[,1:10]) # 資料結構,只看第1-10個變數
## 'data.frame': 453 obs. of 10 variables:
## $ cesd : int 49 30 39 15 39 6 52 32 50 46 ...
## $ female: int 0 0 0 1 0 1 1 0 1 0 ...
## $ i1 : int 13 56 0 5 10 4 13 12 71 20 ...
## $ i2 : int 26 62 0 5 13 4 20 24 129 27 ...
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ treat : int 1 1 0 0 0 1 0 1 0 1 ...
## $ f1a : int 3 3 3 0 3 1 3 1 3 2 ...
## $ f1b : int 2 2 2 0 0 0 1 1 2 3 ...
## $ f1c : int 3 0 3 1 3 1 3 2 3 3 ...
## $ f1d : int 0 3 0 3 3 3 1 3 1 0 ...
summary(newds[,1:10]) # 第1-10個變數摘要
## cesd female i1 i2
## Min. : 1.0 Min. :0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:25.0 1st Qu.:0.000 1st Qu.: 3.0 1st Qu.: 3.0
## Median :34.0 Median :0.000 Median : 13.0 Median : 15.0
## Mean :32.8 Mean :0.236 Mean : 17.9 Mean : 22.6
## 3rd Qu.:41.0 3rd Qu.:0.000 3rd Qu.: 26.0 3rd Qu.: 32.0
## Max. :60.0 Max. :1.000 Max. :142.0 Max. :184.0
## id treat f1a f1b
## Min. : 1 Min. :0.000 Min. :0.00 Min. :0.00
## 1st Qu.:119 1st Qu.:0.000 1st Qu.:1.00 1st Qu.:0.00
## Median :233 Median :0.000 Median :2.00 Median :1.00
## Mean :233 Mean :0.497 Mean :1.63 Mean :1.39
## 3rd Qu.:348 3rd Qu.:1.000 3rd Qu.:3.00 3rd Qu.:2.00
## Max. :470 Max. :1.000 Max. :3.00 Max. :3.00
## f1c f1d
## Min. :0.00 Min. :0.00
## 1st Qu.:1.00 1st Qu.:0.00
## Median :2.00 Median :1.00
## Mean :1.92 Mean :1.56
## 3rd Qu.:3.00 3rd Qu.:3.00
## Max. :3.00 Max. :3.00
head(newds, n=3) #看一下資料前三行
## cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1 49 0 13 26 1 1 3 2 3 0 2 3 3 0 2 3
## 2 30 0 56 62 2 1 3 2 0 3 3 2 0 0 3 0
## 3 39 0 0 0 3 0 3 2 3 0 2 2 1 3 2 3
## f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1 3 0 1 2 2 2 2 3 3 2
## 2 3 0 0 3 0 0 0 2 0 0
## 3 1 0 1 3 2 0 0 3 2 0
comment(newds) = "HELP baseline dataset" #寫入comment
comment(newds)
## [1] "HELP baseline dataset"
save(ds, file="savedfile") #save()將變數ds儲存,儲存格式file="savedfile"
#Use write.csv to save your data as a .csv file
write.csv(ds, file="ds.csv")
library(foreign) #foreign可讀取SAS、SPSS、Stata、Systat、minitab軟體的檔案
write.foreign(newds, "file.dat", "file.sas", package="SAS")
with(newds, cesd[1:10]) #列出newds資料中,變數cesd的前10筆數據
## [1] 49 30 39 15 39 6 52 32 50 46
with(newds, head(cesd, 10)) #第二種方法與上式結果一樣
## [1] 49 30 39 15 39 6 52 32 50 46
with(newds, cesd[cesd > 56]) #選出cesd > 56
## [1] 57 58 57 60 58 58 57
#filter: pick rows with matching criteria
#select: pick column by name
library(dplyr)
filter(newds, cesd > 56) %>% dplyr::select(id, cesd) #先用filter選出cesd的觀察值>56,然後(%>%)找出相對應的id(column)
## id cesd
## 1 71 57
## 2 127 58
## 3 200 57
## 4 228 60
## 5 273 58
## 6 351 58
## 7 13 57
with(newds, sort(cesd)[1:4]) #sort(x)將變數x由高到低排序,[1:4]是列出cesd最小值4個
## [1] 1 3 3 4
with(newds, which.min(cesd)) #which.min(cesd)是問cesd最小值是第幾個(row)
## [1] 199
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## 載入套件:'mosaic'
## 下列物件被遮斷自 'package:Matrix':
##
## mean
## 下列物件被遮斷自 'package:dplyr':
##
## count, do, tally
## 下列物件被遮斷自 'package:purrr':
##
## cross
## 下列物件被遮斷自 'package:ggplot2':
##
## stat
## 下列物件被遮斷自 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median,
## prop.test, quantile, sd, t.test, var
## 下列物件被遮斷自 'package:base':
##
## max, mean, min, prod, range, sample, sum
#tally() is short-hand for summarise()
#is.na(f1g)是問fla變數有沒有missing valus
tally(~ is.na(f1g), data=newds)
## is.na(f1g)
## TRUE FALSE
## 1 452
#“favstats” is short for “favorite statistics”
favstats(~ f1g, data=newds) #功能類似summary()
## min Q1 median Q3 max mean sd n missing
## 0 1 2 3 3 1.73 1.1 452 1
# reverse code f1d, f1h, f1l and f1p
#cbind()objects by Rows or Columns
= with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g,
cesditems 3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p),
(
f1q, f1r, f1s, f1t))
#is.na(cesditems)檢查是否有missing value
#apply( ,sum)計算數量
= apply(is.na(cesditems), 1, sum) #計算遺漏值總數
nmisscesd
= cesditems #copy data
ncesditems
#is.na(cesditems)是NA的話,命為0
is.na(cesditems)] = 0
ncesditems[
#ncesditems水平加總為newcesd
= apply(ncesditems, 1, sum)
newcesd
#20/(20-NA總數):這是加權權重(?)
= 20/(20-nmisscesd)*newcesd imputemeancesd
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]
## newcesd newds.cesd nmisscesd imputemeancesd
## 4 15 15 1 15.8
## 17 19 19 1 20.0
## 87 44 44 1 46.3
## 101 17 17 1 17.9
## 154 29 29 1 30.5
## 177 44 44 1 46.3
## 229 39 39 1 41.1
#library(dplyr)
#library(memisc)
::p_load(memisc,tidyr)
pacman
#在newds中新增一個drinkstat變項,並根據給訂條件決定其值為 abstinent、moderate、highrisk
= mutate(newds, drinkstat=
newds cases(
"abstinent" = i1==0,
"moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
>0 & i1<=2 & i2<=4 & female==0),
(i1"highrisk" = ((i1>1 | i2>3) & female==1) |
>2 | i2>4) & female==0))) ((i1
library(mosaic) #讀入package
detach(package:memisc) #將memisc移出搜尋路徑
detach(package:MASS) #將MASS移出搜尋路徑
library(dplyr) #讀入dplyr
#選取newds中的四個變數i1, i2, female, drinkstat,形成一個新object,命為tmpds
= select(newds, i1, i2, female, drinkstat)
tmpds
365:370,] #列出 tmpds 中第 365 到 370 筆資料 tmpds[
## i1 i2 female drinkstat
## 365 6 24 0 highrisk
## 366 6 6 0 highrisk
## 367 0 0 0 abstinent
## 368 0 0 1 abstinent
## 369 8 8 0 highrisk
## 370 32 32 0 highrisk
library(dplyr) #讀入dplyr
#在tmpds中,篩選drinkstat的觀察值為moderate且female為1的列
filter(tmpds, drinkstat=="moderate" & female==1)
## i1 i2 female drinkstat
## 1 1 1 1 moderate
## 2 1 3 1 moderate
## 3 1 2 1 moderate
## 4 1 1 1 moderate
## 5 1 1 1 moderate
## 6 1 1 1 moderate
## 7 1 1 1 moderate
library(gmodels) #讀入gmodels
with(tmpds, CrossTable(drinkstat)) # 顯示 drinkstat 三個值的數目和所占比例
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | abstinent | moderate | highrisk |
## |-----------|-----------|-----------|
## | 68 | 28 | 357 |
## | 0.150 | 0.062 | 0.788 |
## |-----------|-----------|-----------|
##
##
##
##
#CrossTable()可進行兩個類別變數的交叉表分析及獨立性檢定
with(tmpds, CrossTable(drinkstat, female,
prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | female
## drinkstat | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## abstinent | 42 | 26 | 68 |
## | 0.618 | 0.382 | 0.150 |
## -------------|-----------|-----------|-----------|
## moderate | 21 | 7 | 28 |
## | 0.750 | 0.250 | 0.062 |
## -------------|-----------|-----------|-----------|
## highrisk | 283 | 74 | 357 |
## | 0.793 | 0.207 | 0.788 |
## -------------|-----------|-----------|-----------|
## Column Total | 346 | 107 | 453 |
## -------------|-----------|-----------|-----------|
##
##
#chisq:If TRUE, the results of a chi-square test will be included
= transform(newds,
newds gender=factor(female, c(0,1), c("Male","Female"))) #新增一個 gender 變項,將female中的0 或 1 轉換成 factor,並顯示為 male或female
## 產生一個列聯表,其中包含 famale 和 gender 變項
tally(~ female + gender, margin=FALSE, data=newds) #margin=FALSE不計算邊際總數
## gender
## female Male Female
## 0 346 0
## 1 0 107
library(dplyr)
#在ds資料中,篩選條件female值為1的資料,存為一個新物件females
= filter(ds, female==1)
females
#計算females的cesd變項的平均
with(females, mean(cesd))
## [1] 36.9
# an alternative approach
mean(ds$cesd[ds$female==1])
## [1] 36.9
#依據female分組,計算各組平均
with(ds, tapply(cesd, female, mean))
## 0 1
## 31.6 36.9
# an alternative approach
library(mosaic)
mean(cesd ~ female, data=ds)
## 0 1
## 31.6 36.9