Q1

Select at random one school per county in the data set Caschool{Ecdat} and draw a scatter diagram of average math score mathscr against average reading score readscr for the sampled data set.

Make sure your results are reproducible (e.g., the same random sample will be drawn each time).

#install package and download data
pacman::p_load(Ecdat)
data(Caschool, package="Ecdat")

#show first 6 lines data
Caschool |> head() |> knitr::kable()

distcod	county	district	grspan	enrltot	teachers	calwpct	mealpct	computer	testscr	compstu	expnstu	str	avginc	elpct	readscr	mathscr
75119	Alameda	Sunol Glen Unified	KK-08	195	10.90	0.5102	2.0408	67	690.80	0.3435898	6384.911	17.88991	22.690001	0.000000	691.6	690.0
61499	Butte	Manzanita Elementary	KK-08	240	11.15	15.4167	47.9167	101	661.20	0.4208333	5099.381	21.52466	9.824000	4.583334	660.5	661.9
61549	Butte	Thermalito Union Elementary	KK-08	1550	82.90	55.0323	76.3226	169	643.60	0.1090323	5501.955	18.69723	8.978000	30.000002	636.3	650.9
61457	Butte	Golden Feather Union Elementary	KK-08	243	14.00	36.4754	77.0492	85	647.70	0.3497942	7101.831	17.35714	8.978000	0.000000	651.9	643.5
61523	Butte	Palermo Union Elementary	KK-08	1335	71.50	33.1086	78.4270	171	640.85	0.1280899	5235.988	18.67133	9.080333	13.857677	641.8	639.9
62042	Fresno	Burrel Union Elementary	KK-08	137	6.40	12.3188	86.9565	25	605.55	0.1824818	5580.147	21.40625	10.415000	12.408759	605.7	605.4

#了解資料
?Caschool

## starting httpd help server ... done

#
str(Caschool)

## 'data.frame':    420 obs. of  17 variables:
##  $ distcod : int  75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
##  $ county  : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
##  $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 214 367 132 270 53 152 383 263 94 ...
##  $ grspan  : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
##  $ enrltot : int  195 240 1550 243 1335 137 195 888 379 2247 ...
##  $ teachers: num  10.9 11.1 82.9 14 71.5 ...
##  $ calwpct : num  0.51 15.42 55.03 36.48 33.11 ...
##  $ mealpct : num  2.04 47.92 76.32 77.05 78.43 ...
##  $ computer: int  67 101 169 85 171 25 28 66 35 0 ...
##  $ testscr : num  691 661 644 648 641 ...
##  $ compstu : num  0.344 0.421 0.109 0.35 0.128 ...
##  $ expnstu : num  6385 5099 5502 7102 5236 ...
##  $ str     : num  17.9 21.5 18.7 17.4 18.7 ...
##  $ avginc  : num  22.69 9.82 8.98 8.98 9.08 ...
##  $ elpct   : num  0 4.58 30 0 13.86 ...
##  $ readscr : num  692 660 636 652 642 ...
##  $ mathscr : num  690 662 651 644 640 ...

pacman::p_load(tidyverse,dplyr,sampling)
# 
q1dta<-Caschool |> #copy data
 mutate(SID=paste0("S", 101:520))|> #add SID
 as.data.frame()

# #set initial value to make sure the results are reproducible
set.seed(1111) 

q1_sample<-q1dta |>
 group_by(county)|> #grouping by country
    sample_n(1) #randomly select 1 school per county

summary(q1_sample[,16:17])

##     readscr         mathscr     
##  Min.   :604.5   Min.   :605.4  
##  1st Qu.:642.7   1st Qu.:639.6  
##  Median :656.1   Median :650.2  
##  Mean   :655.6   Mean   :651.9  
##  3rd Qu.:667.6   3rd Qu.:662.3  
##  Max.   :693.7   Max.   :690.1

#plot 
with(q1_sample, plot(mathscr,readscr, 
        xlab = "Average Math Score",
        ylab = "Average Reading Score",
        main = "Relationship of Average Score between Math and Reading",
        xlim = c(600,700),
        ylim = c(600,700),
        type='n',
        bty='n', #移除外框
        pch=20))
grid() #增加輔助線
abline(lm(readscr ~ mathscr, data=q1_sample))
with(q1_sample, text(mathscr,readscr, 
               labels=county, #標註county
               cex=.5))

Q2

Find 133 class-level 95%-confidence intervals for language test score means of the nlschools{MASS} data set by using the tidy approach.

The tail end of the data object should looks as follows:

# input data
pacman::p_load(MASS,dplyr)
data(nlschools, package="MASS")
#show first 6 lines data
nlschools |> head() |> knitr::kable()

lang	IQ	class	GS	SES
46	15.0	180	29	23
45	14.5	180	29	10
33	9.5	180	29	15
46	11.0	180	29	23
20	8.0	180	29	10
30	9.5	180	29	10

#
?nlschools

lang=language test score.

IQ=verbal IQ

class=class ID

GS=class size：number of eighth-grade pupils recorded in the class (there may be others: see COMB, and some may have been omitted with missing values).

SES=social-economic status of pupil’s family.

COMB=were the pupils taught in a multi-grade class (0/1)? Classes which contained pupils from grades 7 and 8 are coded 1, but only eighth-graders were tested.

#copy data 
q2dta<-nlschools 
#exam data structure
str(q2dta)

## 'data.frame':    2287 obs. of  6 variables:
##  $ lang : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ IQ   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ class: Factor w/ 133 levels "180","280","1082",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ GS   : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ SES  : int  23 10 15 23 10 10 23 10 13 15 ...
##  $ COMB : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

class(q2dta)

## [1] "data.frame"

typeof(q2dta)

## [1] "list"

summary(q2dta)

##       lang             IQ            class            GS             SES       
##  Min.   : 9.00   Min.   : 4.00   15580  :  33   Min.   :10.00   Min.   :10.00  
##  1st Qu.:35.00   1st Qu.:10.50   5480   :  31   1st Qu.:23.00   1st Qu.:20.00  
##  Median :42.00   Median :12.00   15980  :  31   Median :27.00   Median :27.00  
##  Mean   :40.93   Mean   :11.83   16180  :  31   Mean   :26.51   Mean   :27.81  
##  3rd Qu.:48.00   3rd Qu.:13.00   18380  :  31   3rd Qu.:31.00   3rd Qu.:35.00  
##  Max.   :58.00   Max.   :18.00   5580   :  30   Max.   :39.00   Max.   :50.00  
##                                  (Other):2100                                  
##  COMB    
##  0:1658  
##  1: 629  
##          
##          
##          
##          
##

# 徒手計算95% CI
q2answer<-q2dta |>
 filter(!is.na(lang),
        !is.na(class)) |> #排除遺漏值
 group_by(class)|> #grouping by class
 summarize(language_mean = mean(lang),language_sd =sd(lang),N=length(lang))|> #calculate the mean,sd,se
 mutate(language_se=language_sd/sqrt(N),tscore = qt(p=0.05/2, df=(N-1),lower.tail=F))|>
 mutate(
     language_lb = language_mean-(tscore*language_sd/sqrt(N)),
     language_ub = language_mean+(tscore*language_sd/sqrt(N)))|>
    arrange(desc(language_mean))

q2answer|>
dplyr::select(class,language_mean,language_lb,language_ub)|>
arrange(desc(language_mean))|>
tail()

## # A tibble: 6 x 4
##   class language_mean language_lb language_ub
##   <fct>         <dbl>       <dbl>       <dbl>
## 1 17980          30.2       20.8         39.5
## 2 25680          29.3       19.8         38.8
## 3 25880          28.4       22.0         34.9
## 4 280            23.7       17.9         29.6
## 5 4780           23.5       19.5         27.5
## 6 10380          20          9.45        30.6

#check answer again
summary(q2answer)

##      class     language_mean    language_sd           N         language_se   
##  180    :  1   Min.   :20.00   Min.   : 4.272   Min.   : 4.0   Min.   :1.075  
##  280    :  1   1st Qu.:37.05   1st Qu.: 6.623   1st Qu.:11.0   1st Qu.:1.511  
##  1082   :  1   Median :40.72   Median : 7.821   Median :17.0   Median :1.820  
##  1280   :  1   Mean   :40.14   Mean   : 7.956   Mean   :17.2   Mean   :2.089  
##  1580   :  1   3rd Qu.:43.82   3rd Qu.: 8.931   3rd Qu.:23.0   3rd Qu.:2.464  
##  1680   :  1   Max.   :49.19   Max.   :14.072   Max.   :33.0   Max.   :4.703  
##  (Other):127                                                                  
##      tscore       language_lb      language_ub   
##  Min.   :2.037   Min.   : 9.445   Min.   :27.50  
##  1st Qu.:2.074   1st Qu.:31.598   1st Qu.:43.02  
##  Median :2.120   Median :36.916   Median :45.28  
##  Mean   :2.181   Mean   :35.494   Mean   :44.78  
##  3rd Qu.:2.228   3rd Qu.:39.804   3rd Qu.:47.86  
##  Max.   :3.182   Max.   :46.948   Max.   :52.35  
##

再次檢查：language_mean的範圍與題目給的答案不符….>“<

Q3

Convert the wide form of the O’Brien and Kaiser’s repeated-measures data in OBrienKaiser{carData} to the long form in OBrienKaiserLong{carData}.

# install tool packages 
pacman::p_load(carData,tidyr,tidyverse,magrittr)
# load and input data
data(OBrienKaiser, package = "carData")
#show first 6 lines data
OBrienKaiser |> head() |> knitr::kable()

treatment	gender	pre.1	pre.2	pre.3	pre.4	pre.5	post.1	post.2	post.3	post.4	post.5	fup.1	fup.2	fup.3	fup.4	fup.5
control	M	1	2	4	2	1	3	2	5	3	2	2	3	2	4	4
control	M	4	4	5	3	4	2	2	3	5	3	4	5	6	4	1
control	M	5	6	5	7	7	4	5	7	5	4	7	6	9	7	6
control	F	5	4	7	5	4	2	2	3	5	3	4	4	5	3	4
control	F	3	4	6	4	3	6	7	8	6	3	4	3	6	4	3
A	M	7	8	7	9	9	9	9	10	8	9	9	10	11	9	6

?OBrienKaiser

#Tibble
q3dta <- as_tibble(OBrienKaiser) %>% 
 mutate(ID=paste0("S", 101:116)) #16 observations
q3dta  |> as.data.frame() |> head()

##   treatment gender pre.1 pre.2 pre.3 pre.4 pre.5 post.1 post.2 post.3 post.4
## 1   control      M     1     2     4     2     1      3      2      5      3
## 2   control      M     4     4     5     3     4      2      2      3      5
## 3   control      M     5     6     5     7     7      4      5      7      5
## 4   control      F     5     4     7     5     4      2      2      3      5
## 5   control      F     3     4     6     4     3      6      7      8      6
## 6         A      M     7     8     7     9     9      9      9     10      8
##   post.5 fup.1 fup.2 fup.3 fup.4 fup.5   ID
## 1      2     2     3     2     4     4 S101
## 2      3     4     5     6     4     1 S102
## 3      4     7     6     9     7     6 S103
## 4      3     4     4     5     3     4 S104
## 5      3     4     3     6     4     3 S105
## 6      9     9    10    11     9     6 S106

#Pivot data from wide to long
dtaL <- q3dta |> 
 pivot_longer(cols=starts_with(c("pre","post","fup")), #三種不同階段
              names_to="Stage", 
              values_to="Hours") |>
 arrange(ID)
dtaL |> as.data.frame() |> head(20)

##    treatment gender   ID  Stage Hours
## 1    control      M S101  pre.1     1
## 2    control      M S101  pre.2     2
## 3    control      M S101  pre.3     4
## 4    control      M S101  pre.4     2
## 5    control      M S101  pre.5     1
## 6    control      M S101 post.1     3
## 7    control      M S101 post.2     2
## 8    control      M S101 post.3     5
## 9    control      M S101 post.4     3
## 10   control      M S101 post.5     2
## 11   control      M S101  fup.1     2
## 12   control      M S101  fup.2     3
## 13   control      M S101  fup.3     2
## 14   control      M S101  fup.4     4
## 15   control      M S101  fup.5     4
## 16   control      M S102  pre.1     4
## 17   control      M S102  pre.2     4
## 18   control      M S102  pre.3     5
## 19   control      M S102  pre.4     3
## 20   control      M S102  pre.5     4

#Pull apart a variable - separate
dtaL %<>% 
 separate(Stage, c("stage_name", "Times")) |> #將stage拆成"階段名"和"次數"
 as.data.frame() |> head(20)

##    treatment gender   ID stage_name Times Hours
## 1    control      M S101        pre     1     1
## 2    control      M S101        pre     2     2
## 3    control      M S101        pre     3     4
## 4    control      M S101        pre     4     2
## 5    control      M S101        pre     5     1
## 6    control      M S101       post     1     3
## 7    control      M S101       post     2     2
## 8    control      M S101       post     3     5
## 9    control      M S101       post     4     3
## 10   control      M S101       post     5     2
## 11   control      M S101        fup     1     2
## 12   control      M S101        fup     2     3
## 13   control      M S101        fup     3     2
## 14   control      M S101        fup     4     4
## 15   control      M S101        fup     5     4
## 16   control      M S102        pre     1     4
## 17   control      M S102        pre     2     4
## 18   control      M S102        pre     3     5
## 19   control      M S102        pre     4     3
## 20   control      M S102        pre     5     4

Q4

Information about student majors, activities in intramural sports, and survey responses about their satisfaction with the on-campus cafeteria (for the variety and quality of the food served, and hours of operation) are recorded in three separate plain text files by three different administrative units:majors、sports、cafeteria

Input these files into an R session, merge them and output the results as in the comma-separate values file.

# Import plain text files
#fL1 <- "http://140.116.183.121/~sheu/dataM/05_rdw/data/student_major.txt"
#majors <- read.table(fL1,header = T) #無法直接從網路讀取資料

majors <- read.table("C:/Users/Ching-Fang Wu/Documents/dataM/student_major.txt", header = TRUE)

sports <-read.table("C:/Users/Ching-Fang Wu/Documents/dataM/student_sport.txt", header = TRUE)

cafeteria <-read.table("C:/Users/Ching-Fang Wu/Documents/dataM/student_cafe.txt", header = TRUE)

#
head(majors)

##    id            major
## 1 123      Mathematics
## 2 345 Computer Science
## 3 624          History
## 4 789          Biology
## 5 851 Computer Science
## 6 555         History

str(majors)

## 'data.frame':    14 obs. of  2 variables:
##  $ id   : int  123 345 624 789 851 555 992 484 717 839 ...
##  $ major: chr  "Mathematics" "Computer Science" "History" "Biology" ...

#
head(sports)

##    id      sport
## 1 123 basketball
## 2 123  badminton
## 3 222  badminton
## 4 385 basketball
## 5 385 volleyball
## 6 481 basketball

str(sports)

## 'data.frame':    13 obs. of  2 variables:
##  $ id   : int  123 123 222 385 385 481 555 717 789 851 ...
##  $ sport: chr  "basketball" "badminton" "badminton" "basketball" ...

#
head(cafeteria)

##    id variety quality hours
## 1 123       3       4     2
## 2 345       4       4     2
## 3 385       1       1     2
## 4 481       2       3     3
## 5 624       2       3     4
## 6 851       5       4     1

str(cafeteria)

## 'data.frame':    7 obs. of  4 variables:
##  $ id     : int  123 345 385 481 624 851 992
##  $ variety: int  3 4 1 2 2 5 4
##  $ quality: int  4 4 1 3 3 4 5
##  $ hours  : int  2 2 2 3 4 1 3

#三個檔案都有ID，依據ID merge data
q4dta1 <- merge(majors, sports,by="id", all = TRUE) #all if TRUE, then extra rows/column will be added to the output

q4dta2 <- merge(q4dta1, cafeteria,by="id", all = TRUE)

head(q4dta2)

##    id            major      sport variety quality hours
## 1 123      Mathematics  badminton       3       4     2
## 2 123      Mathematics basketball       3       4     2
## 3 222 Computer Science  badminton      NA      NA    NA
## 4 345 Computer Science       <NA>       4       4     2
## 5 385      Mathematics volleyball       1       1     2
## 6 385      Mathematics basketball       1       1     2

#Saving to comma-separated-value file (L4講義第29頁)
write.csv(q4dta2, file="./tmp_data/q4dta2.csv", quote=F, row.names=F)
head(read.csv("./tmp_data/q4dta2.csv"), 3)

##    id            major      sport variety quality hours
## 1 123      Mathematics  badminton       3       4     2
## 2 123      Mathematics basketball       3       4     2
## 3 222 Computer Science  badminton      NA      NA    NA

Q5

The HELP (Health Evaluation and Linkage to Primary Care) study was a clinical trial for adult inpatients recruited from a detoxification unit.

Patients with no primary care physician were randomized to receive a multidisciplinary assessment and a brief motivational intervention or usual care, with the goal of linking them to primary medical care.

Eligible subjects were adults, who spoke Spanish or English, reported alcohol, heroin or cocaine as their first or second drug of choice, resided in proximity to the primary care clinic to which they would be referred or were homeless.

Subjects were interviewed at baseline during their detoxification stay and follow-up interviews were undertaken every 6 months for 2 years.

A variety of continuous, count, discrete, and survival time predictors and outcomes were collected at each of these five occasions.

The following R script is used to manage the data file at the initial stage of investigation. Provide comments on what each line of the script is meant to achieve.

Source: Kleinman, K., & Horton, N.J. (2015). Using R for Data Management, Statistical Analysis, and Graphics.

#input data
ds <- read.csv("https://nhorton.people.amherst.edu/sasr2/datasets/help.csv")

library(dplyr)
# 選取變數
newds = dplyr::select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, 
  f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t)

names(newds) # 欄位(變數)名稱

##  [1] "cesd"   "female" "i1"     "i2"     "id"     "treat"  "f1a"   
##  [8] "f1b"    "f1c"    "f1d"    "f1e"    "f1f"    "f1g"    "f1h"   
## [15] "f1i"    "f1j"    "f1k"    "f1l"    "f1m"    "f1n"    "f1o"   
## [22] "f1p"    "f1q"    "f1r"    "f1s"    "f1t"

str(newds[,1:10]) # 資料結構，只看第1-10個變數

## 'data.frame':    453 obs. of  10 variables:
##  $ cesd  : int  49 30 39 15 39 6 52 32 50 46 ...
##  $ female: int  0 0 0 1 0 1 1 0 1 0 ...
##  $ i1    : int  13 56 0 5 10 4 13 12 71 20 ...
##  $ i2    : int  26 62 0 5 13 4 20 24 129 27 ...
##  $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ treat : int  1 1 0 0 0 1 0 1 0 1 ...
##  $ f1a   : int  3 3 3 0 3 1 3 1 3 2 ...
##  $ f1b   : int  2 2 2 0 0 0 1 1 2 3 ...
##  $ f1c   : int  3 0 3 1 3 1 3 2 3 3 ...
##  $ f1d   : int  0 3 0 3 3 3 1 3 1 0 ...

summary(newds[,1:10]) # 第1-10個變數摘要

##       cesd          female            i1              i2       
##  Min.   : 1.0   Min.   :0.000   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:25.0   1st Qu.:0.000   1st Qu.:  3.0   1st Qu.:  3.0  
##  Median :34.0   Median :0.000   Median : 13.0   Median : 15.0  
##  Mean   :32.8   Mean   :0.236   Mean   : 17.9   Mean   : 22.6  
##  3rd Qu.:41.0   3rd Qu.:0.000   3rd Qu.: 26.0   3rd Qu.: 32.0  
##  Max.   :60.0   Max.   :1.000   Max.   :142.0   Max.   :184.0  
##        id          treat            f1a            f1b      
##  Min.   :  1   Min.   :0.000   Min.   :0.00   Min.   :0.00  
##  1st Qu.:119   1st Qu.:0.000   1st Qu.:1.00   1st Qu.:0.00  
##  Median :233   Median :0.000   Median :2.00   Median :1.00  
##  Mean   :233   Mean   :0.497   Mean   :1.63   Mean   :1.39  
##  3rd Qu.:348   3rd Qu.:1.000   3rd Qu.:3.00   3rd Qu.:2.00  
##  Max.   :470   Max.   :1.000   Max.   :3.00   Max.   :3.00  
##       f1c            f1d      
##  Min.   :0.00   Min.   :0.00  
##  1st Qu.:1.00   1st Qu.:0.00  
##  Median :2.00   Median :1.00  
##  Mean   :1.92   Mean   :1.56  
##  3rd Qu.:3.00   3rd Qu.:3.00  
##  Max.   :3.00   Max.   :3.00

head(newds, n=3) #看一下資料前三行

##   cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1   49      0 13 26  1     1   3   2   3   0   2   3   3   0   2   3
## 2   30      0 56 62  2     1   3   2   0   3   3   2   0   0   3   0
## 3   39      0  0  0  3     0   3   2   3   0   2   2   1   3   2   3
##   f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1   3   0   1   2   2   2   2   3   3   2
## 2   3   0   0   3   0   0   0   2   0   0
## 3   1   0   1   3   2   0   0   3   2   0

comment(newds) = "HELP baseline dataset" #寫入comment
comment(newds)

## [1] "HELP baseline dataset"

save(ds, file="savedfile") #save()將變數ds儲存，儲存格式file="savedfile"

#Use write.csv to save your data as a .csv file
write.csv(ds, file="ds.csv")

library(foreign) #foreign可讀取SAS、SPSS、Stata、Systat、minitab軟體的檔案
write.foreign(newds, "file.dat", "file.sas", package="SAS")

with(newds, cesd[1:10]) #列出newds資料中，變數cesd的前10筆數據

##  [1] 49 30 39 15 39  6 52 32 50 46

with(newds, head(cesd, 10)) #第二種方法與上式結果一樣

##  [1] 49 30 39 15 39  6 52 32 50 46

with(newds, cesd[cesd > 56]) #選出cesd > 56

## [1] 57 58 57 60 58 58 57

#filter: pick rows with matching criteria
#select: pick column by name

library(dplyr)
filter(newds, cesd > 56) %>% dplyr::select(id, cesd) #先用filter選出cesd的觀察值>56，然後(%>%)找出相對應的id(column)

##    id cesd
## 1  71   57
## 2 127   58
## 3 200   57
## 4 228   60
## 5 273   58
## 6 351   58
## 7  13   57

with(newds, sort(cesd)[1:4]) #sort(x)將變數x由高到低排序，[1:4]是列出cesd最小值4個

## [1] 1 3 3 4

with(newds, which.min(cesd)) #which.min(cesd)是問cesd最小值是第幾個(row)

## [1] 199

library(mosaic)

## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2

## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.

## 
## 載入套件：'mosaic'

## 下列物件被遮斷自 'package:Matrix':
## 
##     mean

## 下列物件被遮斷自 'package:dplyr':
## 
##     count, do, tally

## 下列物件被遮斷自 'package:purrr':
## 
##     cross

## 下列物件被遮斷自 'package:ggplot2':
## 
##     stat

## 下列物件被遮斷自 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median,
##     prop.test, quantile, sd, t.test, var

## 下列物件被遮斷自 'package:base':
## 
##     max, mean, min, prod, range, sample, sum

#tally() is short-hand for summarise()
#is.na(f1g)是問fla變數有沒有missing valus
tally(~ is.na(f1g), data=newds)

## is.na(f1g)
##  TRUE FALSE 
##     1   452

#“favstats” is short for “favorite statistics”
favstats(~ f1g, data=newds) #功能類似summary()

##  min Q1 median Q3 max mean  sd   n missing
##    0  1      2  3   3 1.73 1.1 452       1

# reverse code f1d, f1h, f1l and f1p

#cbind()objects by Rows or Columns
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g, 
   (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p), 
   f1q, f1r, f1s, f1t))

#is.na(cesditems)檢查是否有missing value
#apply( ,sum)計算數量
nmisscesd = apply(is.na(cesditems), 1, sum) #計算遺漏值總數

ncesditems = cesditems #copy data

#is.na(cesditems)是NA的話，命為0
ncesditems[is.na(cesditems)] = 0

#ncesditems水平加總為newcesd
newcesd = apply(ncesditems, 1, sum) 

#20/(20-NA總數)：這是加權權重(?)
imputemeancesd = 20/(20-nmisscesd)*newcesd

data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]

##     newcesd newds.cesd nmisscesd imputemeancesd
## 4        15         15         1           15.8
## 17       19         19         1           20.0
## 87       44         44         1           46.3
## 101      17         17         1           17.9
## 154      29         29         1           30.5
## 177      44         44         1           46.3
## 229      39         39         1           41.1

#library(dplyr)
#library(memisc)
pacman::p_load(memisc,tidyr)

#在newds中新增一個drinkstat變項，並根據給訂條件決定其值為 abstinent、moderate、highrisk
newds = mutate(newds, drinkstat= 
  cases(
    "abstinent" = i1==0,
    "moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
               (i1>0 & i1<=2 & i2<=4 & female==0),
    "highrisk" = ((i1>1 | i2>3) & female==1) |
               ((i1>2 | i2>4) & female==0)))

library(mosaic) #讀入package

detach(package:memisc) #將memisc移出搜尋路徑
detach(package:MASS)   #將MASS移出搜尋路徑

library(dplyr) #讀入dplyr

#選取newds中的四個變數i1, i2, female, drinkstat，形成一個新object，命為tmpds
tmpds = select(newds, i1, i2, female, drinkstat) 

tmpds[365:370,] #列出 tmpds 中第 365 到 370 筆資料

##     i1 i2 female drinkstat
## 365  6 24      0  highrisk
## 366  6  6      0  highrisk
## 367  0  0      0 abstinent
## 368  0  0      1 abstinent
## 369  8  8      0  highrisk
## 370 32 32      0  highrisk

library(dplyr) #讀入dplyr

#在tmpds中，篩選drinkstat的觀察值為moderate且female為1的列
filter(tmpds, drinkstat=="moderate" & female==1)

##   i1 i2 female drinkstat
## 1  1  1      1  moderate
## 2  1  3      1  moderate
## 3  1  2      1  moderate
## 4  1  1      1  moderate
## 5  1  1      1  moderate
## 6  1  1      1  moderate
## 7  1  1      1  moderate

library(gmodels) #讀入gmodels


with(tmpds, CrossTable(drinkstat)) # 顯示 drinkstat 三個值的數目和所占比例

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##           | abstinent |  moderate |  highrisk | 
##           |-----------|-----------|-----------|
##           |        68 |        28 |       357 | 
##           |     0.150 |     0.062 |     0.788 | 
##           |-----------|-----------|-----------|
## 
## 
## 
##

#CrossTable()可進行兩個類別變數的交叉表分析及獨立性檢定
with(tmpds, CrossTable(drinkstat, female, 
  prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##              | female 
##    drinkstat |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##    abstinent |        42 |        26 |        68 | 
##              |     0.618 |     0.382 |     0.150 | 
## -------------|-----------|-----------|-----------|
##     moderate |        21 |         7 |        28 | 
##              |     0.750 |     0.250 |     0.062 | 
## -------------|-----------|-----------|-----------|
##     highrisk |       283 |        74 |       357 | 
##              |     0.793 |     0.207 |     0.788 | 
## -------------|-----------|-----------|-----------|
## Column Total |       346 |       107 |       453 | 
## -------------|-----------|-----------|-----------|
## 
##

#chisq：If TRUE, the results of a chi-square test will be included

newds = transform(newds, 
  gender=factor(female, c(0,1), c("Male","Female"))) #新增一個 gender 變項，將female中的0 或 1 轉換成 factor，並顯示為 male或female

## 產生一個列聯表，其中包含 famale 和 gender 變項
tally(~ female + gender, margin=FALSE, data=newds) #margin=FALSE不計算邊際總數

##       gender
## female Male Female
##      0  346      0
##      1    0    107

library(dplyr) 

#在ds資料中，篩選條件female值為1的資料，存為一個新物件females
females = filter(ds, female==1)

#計算females的cesd變項的平均
with(females, mean(cesd))

## [1] 36.9

# an alternative approach
mean(ds$cesd[ds$female==1])

## [1] 36.9

#依據female分組，計算各組平均
with(ds, tapply(cesd, female, mean))

##    0    1 
## 31.6 36.9

# an alternative approach
library(mosaic)
mean(cesd ~ female, data=ds)

##    0    1 
## 31.6 36.9

參考資料：https://rstudio-education.github.io/hopr/

DataM L5 Exercises

Ching-Fang Wu

Fri Nov 05 13:58:41 2021

Q1

Q2

Q3

Q4

Q5