Select at random one school per county in the data set Caschool{Ecdat} and draw a scatter diagram of average math score mathscr against average reading score readscr for the sampled data set. Make sure your results are reproducible (e.g., the same random sample will be drawn each time).
::p_load(Ecdat)
pacmandata(Caschool, package="Ecdat")
#顯示資料類型和結構
str(Caschool)
## 'data.frame': 420 obs. of 17 variables:
## $ distcod : int 75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
## $ county : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
## $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 214 367 132 270 53 152 383 263 94 ...
## $ grspan : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
## $ enrltot : int 195 240 1550 243 1335 137 195 888 379 2247 ...
## $ teachers: num 10.9 11.1 82.9 14 71.5 ...
## $ calwpct : num 0.51 15.42 55.03 36.48 33.11 ...
## $ mealpct : num 2.04 47.92 76.32 77.05 78.43 ...
## $ computer: int 67 101 169 85 171 25 28 66 35 0 ...
## $ testscr : num 691 661 644 648 641 ...
## $ compstu : num 0.344 0.421 0.109 0.35 0.128 ...
## $ expnstu : num 6385 5099 5502 7102 5236 ...
## $ str : num 17.9 21.5 18.7 17.4 18.7 ...
## $ avginc : num 22.69 9.82 8.98 8.98 9.08 ...
## $ elpct : num 0 4.58 30 0 13.86 ...
## $ readscr : num 692 660 636 652 642 ...
## $ mathscr : num 690 662 651 644 640 ...
#呼叫dplyr ,要使用sample_n指令
::p_load(dplyr) pacman
#固定seed確保每次抽樣都一樣,每個county抽一個樣本
set.seed(1234)
<- group_by(Caschool, county) %>%
Caschool_1 sample_n(1)
head(Caschool_1 )
## # A tibble: 6 x 17
## # Groups: county [6]
## distcod county district grspan enrltot teachers calwpct mealpct computer
## <int> <fct> <fct> <fct> <int> <dbl> <dbl> <dbl> <int>
## 1 75119 Alameda Sunol G~ KK-08 195 10.9 0.510 2.04 67
## 2 61549 Butte Thermal~ KK-08 1550 82.9 55.0 76.3 169
## 3 61572 Calaveras Mark Tw~ KK-08 777 36.8 13.0 39.8 148
## 4 61713 Contra Costa Lafayet~ KK-08 3469 172. 0 0.173 496
## 5 61978 El Dorado Rescue ~ KK-08 2987 154. 2.04 11.2 290
## 6 62539 Fresno West Pa~ KK-08 314 19.2 29.9 93.6 8
## # ... with 8 more variables: testscr <dbl>, compstu <dbl>, expnstu <dbl>,
## # str <dbl>, avginc <dbl>, elpct <dbl>, readscr <dbl>, mathscr <dbl>
#陽春版
plot(mathscr ~ readscr,Caschool_1,
col = 51, pch =17)
#更新新版的R才能安裝basicPlotteR
::p_load(basicPlotteR) pacman
## 將程式套件安載入 'C:/Users/USER/Documents/R/win-library/4.1'
## (因為 'lib' 沒有被指定)
## Warning: package 'basicPlotteR' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.1:
## 無法開啟網址 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.1/PACKAGES'
## Warning in p_install(package, character.only = TRUE, ...):
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : 不存在叫 'basicPlotteR' 這個名稱的套件
## Warning in pacman::p_load(basicPlotteR): Failed to install/load:
## basicPlotteR
with(Caschool_1, plot(mathscr, readscr,
xlab="mathscr", ylab="readscr",
pch =18, bty="n", col=51,
axes = FALSE))
abline(lm(mathscr~readscr, Caschool_1))#加趨勢線
axis(1, at = seq(600,800, by=10)) #設定間隔及寬度
axis(2, at = seq(600,800, by=10))
grid()
#addTextLabels 無法加入標籤 addTextLabels(Caschool_1\(mathscr, Caschool_1\)readscr, Caschool_1$county, cex.label=.5, col.label=116, lty=2, col.line=rgb(0, 0, 0, 0.5))
#用plotly做,plotly 線上分析與視覺化的工具
::p_load(plotly)
pacman<- plot_ly(Caschool_1, x = ~mathscr, y = ~readscr, type = 'scatter', mode = 'markers',
Caschool_2 marker = list(size = 10))
<- Caschool_2 %>% add_annotations(
Caschool_2text = ~county,
xlab="mathscr", ylab="readscr",
showarrow = TRUE,
arrowhead = 4,
arrowsize = .5,
color = ~county,
ax = 20,
ay = -40)
Caschool_2
Find 133 class-level 95%-confidence intervals for language test score means of the nlschools{MASS} data set by using the tidy approach. The tail end of the data object should looks as follows: classID language_mean language_lb language_ub 131 20.0 … … 132 23.5 … … 133 23.7 … …
::p_load(MASS)
pacmandata(nlschools, package="MASS")
head(nlschools)
## lang IQ class GS SES COMB
## 1 46 15.0 180 29 23 0
## 2 45 14.5 180 29 10 0
## 3 33 9.5 180 29 15 0
## 4 46 11.0 180 29 23 0
## 5 20 8.0 180 29 10 0
## 6 30 9.5 180 29 10 0
str(nlschools)
## 'data.frame': 2287 obs. of 6 variables:
## $ lang : int 46 45 33 46 20 30 30 57 36 36 ...
## $ IQ : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
## $ class: Factor w/ 133 levels "180","280","1082",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ GS : int 29 29 29 29 29 29 29 29 29 29 ...
## $ SES : int 23 10 15 23 10 10 23 10 13 15 ...
## $ COMB : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#呼叫tidyverse
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.5 v purrr 0.3.4
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
## x MASS::select() masks plotly::select(), dplyr::select()
#95%CI計算 tail(nlschools)
language_lb language_ub
%>%
nlschools group_by(class) %>%
summarize(language_mean = mean(lang),
language_lb = language_mean - 1.96*sd(lang),
language_ub = language_mean + 1.96*sd(lang)) %>%
mutate(classID = paste0(1:n())) %>% #給序號
tail() %>% #看最後6筆
::select(classID, language_mean,language_lb,language_ub) %>%
dplyr::kable (digits = 3) knitr
classID | language_mean | language_lb | language_ub |
---|---|---|---|
128 | 45.500 | 30.124 | 60.876 |
129 | 41.500 | 29.445 | 53.555 |
130 | 40.267 | 20.744 | 59.789 |
131 | 38.091 | 26.953 | 49.229 |
132 | 29.300 | 3.264 | 55.336 |
133 | 28.429 | 14.762 | 42.095 |
Convert the wide form of the O’Brien and Kaiser’s repeated-measures data in OBrienKaiser{carData} to the long form in OBrienKaiserLong{carData}.
::p_load(carData)
pacmandata(OBrienKaiser, package="carData")
<-OBrienKaiser
dta3head(dta3)
## treatment gender pre.1 pre.2 pre.3 pre.4 pre.5 post.1 post.2 post.3 post.4
## 1 control M 1 2 4 2 1 3 2 5 3
## 2 control M 4 4 5 3 4 2 2 3 5
## 3 control M 5 6 5 7 7 4 5 7 5
## 4 control F 5 4 7 5 4 2 2 3 5
## 5 control F 3 4 6 4 3 6 7 8 6
## 6 A M 7 8 7 9 9 9 9 10 8
## post.5 fup.1 fup.2 fup.3 fup.4 fup.5
## 1 2 2 3 2 4 4
## 2 3 4 5 6 4 1
## 3 4 7 6 9 7 6
## 4 3 4 4 5 3 4
## 5 3 4 3 6 4 3
## 6 9 9 10 11 9 6
#gather 把橫的轉成直的,跟spread相反 為long form資料, 依genus變項轉成直的,填入mean_weight值, 不要留plot_id,因轉成直的會自動生程序號,不要序號
<- dta3 %>%
dta3_1 mutate(id = rep(1:n()))%>%
pivot_longer(
cols = starts_with(c("pre","post","fup")),#這3個要轉
names_to = "Time",
values_to = "score")|>
head() |> knitr::kable()
<- dta3 %>%
dta3_1 mutate(id = rep(1:n()))%>%
pivot_longer(
cols = starts_with(c("pre","post","fup")),#這3個要轉
names_to = "Time",
values_to = "score")|>
arrange(score)
|>as.data.frame()|> head() dta3_1
## treatment gender id Time score
## 1 control M 1 pre.1 1
## 2 control M 1 pre.5 1
## 3 control M 2 fup.5 1
## 4 A F 9 post.5 1
## 5 B F 14 pre.4 1
## 6 control M 1 pre.2 2
::p_load(tidyr) pacman
<- dta3_1 %>%
dta3_2 separate(Time, c("Time","hour")) %>% #將Time分為兩個欄位
mutate(hour=parse_number(hour))%>% #新變數hour透過parse_number擷取"pre.1"後的1 數字
as.data.frame()
head(dta3_2[,c(3,4,5,6,1,2)])#排欄位順序
## id Time hour score treatment gender
## 1 1 pre 1 1 control M
## 2 1 pre 5 1 control M
## 3 2 fup 5 1 control M
## 4 9 post 5 1 A F
## 5 14 pre 4 1 B F
## 6 1 pre 2 2 control M
Information about student majors, activities in intramural sports, and survey responses about their satisfaction with the on-campus cafeteria (for the variety and quality of the food served, and hours of operation) are recorded in three separate plain text files by three different administrative units: majors sports cafeteria
<- read.table("student_major.txt", h = T)
majors <- read.table("student_sport.txt", h = T)
sports <- read.table("student_cafe.txt", h = T) cafeteria
head(majors)
## id major
## 1 123 Mathematics
## 2 345 Computer Science
## 3 624 History
## 4 789 Biology
## 5 851 Computer Science
## 6 555 History
head(sports)
## id sport
## 1 123 basketball
## 2 123 badminton
## 3 222 badminton
## 4 385 basketball
## 5 385 volleyball
## 6 481 basketball
head(cafeteria)
## id variety quality hours
## 1 123 3 4 2
## 2 345 4 4 2
## 3 385 1 1 2
## 4 481 2 3 3
## 5 624 2 3 4
## 6 851 5 4 1
#All join,這種寫法會把三個檔案中所有欄位有NA的人 直接剔除 ,N=15
<- merge(merge(majors,sports),cafeteria,all = TRUE)
All head(All)
## id major sport variety quality hours
## 1 123 Mathematics badminton 3 4 2
## 2 123 Mathematics basketball 3 4 2
## 3 222 Computer Science badminton NA NA NA
## 4 345 <NA> <NA> 4 4 2
## 5 385 Mathematics basketball 1 1 2
## 6 385 Mathematics volleyball 1 1 2
str(All)
## 'data.frame': 15 obs. of 6 variables:
## $ id : int 123 123 222 345 385 385 481 555 624 717 ...
## $ major : chr "Mathematics" "Mathematics" "Computer Science" NA ...
## $ sport : chr "badminton" "basketball" "badminton" NA ...
## $ variety: int 3 3 NA 4 1 1 2 NA 2 NA ...
## $ quality: int 4 4 NA 4 1 1 3 NA 3 NA ...
## $ hours : int 2 2 NA 2 2 2 3 NA 4 NA ...
#inner_join 自動by id合併 同時存在在3個檔案中的一起合併
<- inner_join(inner_join(majors,sports),cafeteria,all = TRUE) All_1
## Joining, by = "id"
## Joining, by = "id"
head(All_1)
## id major sport variety quality hours
## 1 123 Mathematics basketball 3 4 2
## 2 123 Mathematics badminton 3 4 2
## 3 851 Computer Science volleyball 5 4 1
## 4 851 Computer Science badminton 5 4 1
## 5 992 Biology badminton 4 5 3
## 6 992 Biology volleyball 4 5 3
#inner_join中有9個觀察值
str(All_1)
## 'data.frame': 9 obs. of 6 variables:
## $ id : int 123 123 851 851 992 992 481 385 385
## $ major : chr "Mathematics" "Mathematics" "Computer Science" "Computer Science" ...
## $ sport : chr "basketball" "badminton" "volleyball" "badminton" ...
## $ variety: int 3 3 5 5 4 4 2 1 1
## $ quality: int 4 4 4 4 5 5 3 1 1
## $ hours : int 2 2 1 1 3 3 3 2 2
#left_join 自動by id合併 左邊檔案為主體(以左邊id為主要key),將右邊的檔案合併進來
<- left_join(left_join(majors,sports),cafeteria,all = TRUE) All_2
## Joining, by = "id"
## Joining, by = "id"
head(All_2)
## id major sport variety quality hours
## 1 123 Mathematics basketball 3 4 2
## 2 123 Mathematics badminton 3 4 2
## 3 345 Computer Science <NA> 4 4 2
## 4 624 History <NA> 2 3 4
## 5 789 Biology basketball NA NA NA
## 6 851 Computer Science volleyball 5 4 1
#left_join中有18個觀察值
str(All_2)
## 'data.frame': 18 obs. of 6 variables:
## $ id : int 123 123 345 624 789 851 851 555 992 992 ...
## $ major : chr "Mathematics" "Mathematics" "Computer Science" "History" ...
## $ sport : chr "basketball" "badminton" NA NA ...
## $ variety: int 3 3 4 2 NA 5 5 NA 4 4 ...
## $ quality: int 4 4 4 3 NA 4 4 NA 5 5 ...
## $ hours : int 2 2 2 4 NA 1 1 NA 3 3 ...
#可以省略sep,自動存入放三個檔案的資料夾中
#write.csv(All, "All.csv" ,row.names = F)
The HELP (Health Evaluation and Linkage to Primary Care) study was a clinical trial for adult inpatients recruited from a detoxification unit. Patients with no primary care physician were randomized to receive a multidisciplinary assessment and a brief motivational intervention or usual care, with the goal of linking them to primary medical care. Eligible subjects were adults, who spoke Spanish or English, reported alcohol, heroin or cocaine as their first or second drug of choice, resided in proximity to the primary care clinic to which they would be referred or were homeless. Subjects were interviewed at baseline during their detoxification stay and follow-up interviews were undertaken every 6 months for 2 years. A variety of continuous, count, discrete, and survival time predictors and outcomes were collected at each of these five occasions. The following R script is used to manage the data file at the initial stage of investigation. Provide comments on what each line of the script is meant to achieve.
Source: Kleinman, K., & Horton, N.J. (2015). Using R for Data Management, Statistical Analysis, and Graphics.
options(continue=" ")
=read.csv("https://nhorton.people.amherst.edu/sasr2/datasets/help.csv")
dsoptions(digits=3) # narrow output
options(width=72)
library(dplyr)
#需使用dplyr::select函數抓取所需欄位,原本沒有抓id這裡補抓
<- ds |> dplyr::select( cesd, female,id, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)
newds head(newds)
## cesd female id i1 i2 treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1 49 0 1 13 26 1 3 2 3 0 2 3 3 0 2 3
## 2 30 0 2 56 62 1 3 2 0 3 3 2 0 0 3 0
## 3 39 0 3 0 0 0 3 2 3 0 2 2 1 3 2 3
## 4 15 1 4 5 5 0 0 0 1 3 2 2 1 3 0 0
## 5 39 0 5 10 13 0 3 0 3 3 3 3 1 3 3 2
## 6 6 1 6 4 4 1 1 0 1 3 0 0 0 3 0 1
## f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1 3 0 1 2 2 2 2 3 3 2
## 2 3 0 0 3 0 0 0 2 0 0
## 3 1 0 1 3 2 0 0 3 2 0
## 4 1 2 2 2 0 NA 2 0 0 1
## 5 3 2 2 3 0 3 3 3 3 3
## 6 1 3 1 0 1 3 0 0 0 0
names(newds) #所有欄位名稱
## [1] "cesd" "female" "id" "i1" "i2" "treat" "f1a"
## [8] "f1b" "f1c" "f1d" "f1e" "f1f" "f1g" "f1h"
## [15] "f1i" "f1j" "f1k" "f1l" "f1m" "f1n" "f1o"
## [22] "f1p" "f1q" "f1r" "f1s" "f1t"
str(newds[,1:10]) # structure of the first 10 variables
## 'data.frame': 453 obs. of 10 variables:
## $ cesd : int 49 30 39 15 39 6 52 32 50 46 ...
## $ female: int 0 0 0 1 0 1 1 0 1 0 ...
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ i1 : int 13 56 0 5 10 4 13 12 71 20 ...
## $ i2 : int 26 62 0 5 13 4 20 24 129 27 ...
## $ treat : int 1 1 0 0 0 1 0 1 0 1 ...
## $ f1a : int 3 3 3 0 3 1 3 1 3 2 ...
## $ f1b : int 2 2 2 0 0 0 1 1 2 3 ...
## $ f1c : int 3 0 3 1 3 1 3 2 3 3 ...
## $ f1d : int 0 3 0 3 3 3 1 3 1 0 ...
#前10個變項的基本描述統計
summary(newds[,1:10]) # summary of the first 10 variables
## cesd female id i1
## Min. : 1.0 Min. :0.000 Min. : 1 Min. : 0.0
## 1st Qu.:25.0 1st Qu.:0.000 1st Qu.:119 1st Qu.: 3.0
## Median :34.0 Median :0.000 Median :233 Median : 13.0
## Mean :32.8 Mean :0.236 Mean :233 Mean : 17.9
## 3rd Qu.:41.0 3rd Qu.:0.000 3rd Qu.:348 3rd Qu.: 26.0
## Max. :60.0 Max. :1.000 Max. :470 Max. :142.0
## i2 treat f1a f1b
## Min. : 0.0 Min. :0.000 Min. :0.00 Min. :0.00
## 1st Qu.: 3.0 1st Qu.:0.000 1st Qu.:1.00 1st Qu.:0.00
## Median : 15.0 Median :0.000 Median :2.00 Median :1.00
## Mean : 22.6 Mean :0.497 Mean :1.63 Mean :1.39
## 3rd Qu.: 32.0 3rd Qu.:1.000 3rd Qu.:3.00 3rd Qu.:2.00
## Max. :184.0 Max. :1.000 Max. :3.00 Max. :3.00
## f1c f1d
## Min. :0.00 Min. :0.00
## 1st Qu.:1.00 1st Qu.:0.00
## Median :2.00 Median :1.00
## Mean :1.92 Mean :1.56
## 3rd Qu.:3.00 3rd Qu.:3.00
## Max. :3.00 Max. :3.00
#只看前3筆資料
head(newds, n=3)
## cesd female id i1 i2 treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1 49 0 1 13 26 1 3 2 3 0 2 3 3 0 2 3
## 2 30 0 2 56 62 1 3 2 0 3 3 2 0 0 3 0
## 3 39 0 3 0 0 0 3 2 3 0 2 2 1 3 2 3
## f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1 3 0 1 2 2 2 2 3 3 2
## 2 3 0 0 3 0 0 0 2 0 0
## 3 1 0 1 3 2 0 0 3 2 0
comment(newds) = "HELP baseline dataset" #comment
comment(newds)
## [1] "HELP baseline dataset"
#將資料存入savedfile,存成ds
save(ds, file="savedfile")
#存入CSV檔中
write.csv(ds, file="ds.csv")
#呼叫foreign
#將newds存成SAS.dat和.sas檔案
library(foreign)
write.foreign(newds, "file.dat", "file.sas", package="SAS")
#讀取'newds'中的cesd變項前10筆資料
#兩種方式皆可
with(newds, cesd[1:10])
## [1] 49 30 39 15 39 6 52 32 50 46
with(newds, head(cesd, 10))
## [1] 49 30 39 15 39 6 52 32 50 46
#讀取'newds'中的cesd變項>56的資料
with(newds, cesd[cesd > 56])
## [1] 57 58 57 60 58 58 57
#呼叫dplyr,使用filter確認資料中的cesd > 56,show id & cesd
#記得要使用dplyr::select去讀取,不然無法上傳至網頁 只能RStudio自己看
library(dplyr)
filter(newds, cesd > 56) %>% dplyr::select(id, cesd)
## id cesd
## 1 71 57
## 2 127 58
## 3 200 57
## 4 228 60
## 5 273 58
## 6 351 58
## 7 13 57
#shor()向量進行從小到大的排序cesd前四筆
#which.min()返回數值向量中第一個最小值的位置
with(newds, sort(cesd)[1:4])
## [1] 1 3 3 4
with(newds, which.min(cesd))
## [1] 199
#packages("mosaic")可做簡單微積分計算
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## 載入套件:'mosaic'
## 下列物件被遮斷自 'package:Matrix':
##
## mean
## 下列物件被遮斷自 'package:purrr':
##
## cross
## 下列物件被遮斷自 'package:plotly':
##
## do
## 下列物件被遮斷自 'package:ggplot2':
##
## stat
## 下列物件被遮斷自 'package:dplyr':
##
## count, do, tally
## 下列物件被遮斷自 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median,
## prop.test, quantile, sd, t.test, var
## 下列物件被遮斷自 'package:base':
##
## max, mean, min, prod, range, sample, sum
tally(~ is.na(f1g), data=newds) #tally count計算f1g變項總計 452
## is.na(f1g)
## TRUE FALSE
## 1 452
#favstats來自mosaic包, 描述統計值 max, mean, min,Q1,Q3, range, sd, missing
favstats(~ f1g, data=newds)
## min Q1 median Q3 max mean sd n missing
## 0 1 2 3 3 1.73 1.1 452 1
f1d, f1h, f1l and f1p
#將變項items 列出,並recode, (3 - f1l)表示反向題
= with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g,
cesditems 3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p),
( f1q, f1r, f1s, f1t))
#計算missing,NA總數,回傳資料存至nmisscesd
= apply(is.na(cesditems), 1, sum)
nmisscesd
#列出整理後的變項得分數,放入ncesditems
= cesditems
ncesditems
#將NA改0
is.na(cesditems)] = 0
ncesditems[
#計算ncesditems總分,apply 列出每列來
= apply(ncesditems, 1, sum)
newcesd
#針對加權,該列有幾個NA,其加總便乘以20/(20-NA數目)
= 20/(20-nmisscesd)*newcesd imputemeancesd
#list 結果確認
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]
## newcesd newds.cesd nmisscesd imputemeancesd
## 4 15 15 1 15.8
## 17 19 19 1 20.0
## 87 44 44 1 46.3
## 101 17 17 1 17.9
## 154 29 29 1 30.5
## 177 44 44 1 46.3
## 229 39 39 1 41.1
#createdrink,message=FALSE
#memisc為空間函數
#case分三組abstinent moderate highrisk
::p_load(dplyr,memisc)
pacman= mutate(newds, drinkstat=
newds cases(
"abstinent" = i1==0,
"moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
>0 & i1<=2 & i2<=4 & female==0),
(i1"highrisk" = ((i1>1 | i2>3) & female==1) |
>2 | i2>4) & female==0))) ((i1
#----echo=FALSE
library(mosaic)
#----echo=FALSE
detach(package:memisc)
detach(package:MASS)
#select data to tmpds
#取出4個變項i1, i2, female, drinkstat, 看365~370資料
library(dplyr)
= select(newds, i1, i2, female, drinkstat)
tmpds 365:370,] tmpds[
## i1 i2 female drinkstat
## 365 6 24 0 highrisk
## 366 6 6 0 highrisk
## 367 0 0 0 abstinent
## 368 0 0 1 abstinent
## 369 8 8 0 highrisk
## 370 32 32 0 highrisk
#filter篩選變項"moderate" & female=1的人
library(dplyr)
filter(tmpds, drinkstat=="moderate" & female==1)
## i1 i2 female drinkstat
## 1 1 1 1 moderate
## 2 1 3 1 moderate
## 3 1 2 1 moderate
## 4 1 1 1 moderate
## 5 1 1 1 moderate
## 6 1 1 1 moderate
## 7 1 1 1 moderate
##message=FALSE
#gmodels::CrossTable 針對drinkstat做列聯表
::p_load(gmodels)
pacmanwith(tmpds, CrossTable(drinkstat))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | abstinent | moderate | highrisk |
## |-----------|-----------|-----------|
## | 68 | 28 | 357 |
## | 0.150 | 0.062 | 0.788 |
## |-----------|-----------|-----------|
##
##
##
##
#drinkstat, female做列聯表
with(tmpds, CrossTable(drinkstat, female,
prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))#不做統計檢定
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | female
## drinkstat | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## abstinent | 42 | 26 | 68 |
## | 0.618 | 0.382 | 0.150 |
## -------------|-----------|-----------|-----------|
## moderate | 21 | 7 | 28 |
## | 0.750 | 0.250 | 0.062 |
## -------------|-----------|-----------|-----------|
## highrisk | 283 | 74 | 357 |
## | 0.793 | 0.207 | 0.788 |
## -------------|-----------|-----------|-----------|
## Column Total | 346 | 107 | 453 |
## -------------|-----------|-----------|-----------|
##
##
#做gender列聯表,計算tally
= transform(newds,
newds gender=factor(female, c(0,1), c("Male","Female")))
tally(~ female + gender, margin=FALSE, data=newds)
## gender
## female Male Female
## 0 346 0
## 1 0 107
#arrange資料,預設為遞增排序
library(dplyr)
= arrange(ds, cesd, i1)
newds 1:5, c("cesd", "i1", "id")]#看前5筆 newds[
## cesd i1 id
## 1 1 3 233
## 2 3 1 139
## 3 3 13 418
## 4 4 4 251
## 5 4 9 95
library(dplyr)
= filter(ds, female==1) #選取female=1放入females檔案中
females with(females, mean(cesd)) #計算cesd的mean
## [1] 36.9
# an alternative approach 2種都可
mean(ds$cesd[ds$female==1])#計算cesd的mean
## [1] 36.9
#tapply()把原始數據分割為若干組, 計算female的cesd平均數
#tapply(vector創建分數向量, factor針對的變項是什麼, fun應用的功能)
with(ds, tapply(cesd, female, mean))
## 0 1
## 31.6 36.9
library(mosaic)
mean(cesd ~ female, data=ds) #也可用此方法
## 0 1
## 31.6 36.9
~END~