Homework 1

Select at random one school per county in the data set Caschool{Ecdat} and draw a scatter diagram of average math score mathscr against average reading score readscr for the sampled data set. Make sure your results are reproducible (e.g., the same random sample will be drawn each time).

pacman::p_load(Ecdat)
data(Caschool, package="Ecdat")

#顯示資料類型和結構
str(Caschool)

## 'data.frame':    420 obs. of  17 variables:
##  $ distcod : int  75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
##  $ county  : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
##  $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 214 367 132 270 53 152 383 263 94 ...
##  $ grspan  : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
##  $ enrltot : int  195 240 1550 243 1335 137 195 888 379 2247 ...
##  $ teachers: num  10.9 11.1 82.9 14 71.5 ...
##  $ calwpct : num  0.51 15.42 55.03 36.48 33.11 ...
##  $ mealpct : num  2.04 47.92 76.32 77.05 78.43 ...
##  $ computer: int  67 101 169 85 171 25 28 66 35 0 ...
##  $ testscr : num  691 661 644 648 641 ...
##  $ compstu : num  0.344 0.421 0.109 0.35 0.128 ...
##  $ expnstu : num  6385 5099 5502 7102 5236 ...
##  $ str     : num  17.9 21.5 18.7 17.4 18.7 ...
##  $ avginc  : num  22.69 9.82 8.98 8.98 9.08 ...
##  $ elpct   : num  0 4.58 30 0 13.86 ...
##  $ readscr : num  692 660 636 652 642 ...
##  $ mathscr : num  690 662 651 644 640 ...

#呼叫dplyr ,要使用sample_n指令
pacman::p_load(dplyr)

#固定seed確保每次抽樣都一樣，每個county抽一個樣本
set.seed(1234)
Caschool_1 <- group_by(Caschool, county) %>% 
  sample_n(1)

head(Caschool_1 )

## # A tibble: 6 x 17
## # Groups:   county [6]
##   distcod county       district grspan enrltot teachers calwpct mealpct computer
##     <int> <fct>        <fct>    <fct>    <int>    <dbl>   <dbl>   <dbl>    <int>
## 1   75119 Alameda      Sunol G~ KK-08      195     10.9   0.510   2.04        67
## 2   61549 Butte        Thermal~ KK-08     1550     82.9  55.0    76.3        169
## 3   61572 Calaveras    Mark Tw~ KK-08      777     36.8  13.0    39.8        148
## 4   61713 Contra Costa Lafayet~ KK-08     3469    172.    0       0.173      496
## 5   61978 El Dorado    Rescue ~ KK-08     2987    154.    2.04   11.2        290
## 6   62539 Fresno       West Pa~ KK-08      314     19.2  29.9    93.6          8
## # ... with 8 more variables: testscr <dbl>, compstu <dbl>, expnstu <dbl>,
## #   str <dbl>, avginc <dbl>, elpct <dbl>, readscr <dbl>, mathscr <dbl>

#陽春版
plot(mathscr ~ readscr,Caschool_1,
  col = 51, pch =17)

#更新新版的R才能安裝basicPlotteR
pacman::p_load(basicPlotteR)

## 將程式套件安載入 'C:/Users/USER/Documents/R/win-library/4.1'
## (因為 'lib' 沒有被指定)

## Warning: package 'basicPlotteR' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages

## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.1:
##   無法開啟網址 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.1/PACKAGES'

## Warning in p_install(package, character.only = TRUE, ...):

## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : 不存在叫 'basicPlotteR' 這個名稱的套件

## Warning in pacman::p_load(basicPlotteR): Failed to install/load:
## basicPlotteR

with(Caschool_1, plot(mathscr, readscr,
                       xlab="mathscr", ylab="readscr",
                      pch =18, bty="n", col=51,
                      axes = FALSE))
abline(lm(mathscr~readscr, Caschool_1))#加趨勢線

axis(1, at = seq(600,800, by=10)) #設定間隔及寬度
axis(2, at = seq(600,800, by=10))
grid()

#addTextLabels 無法加入標籤 addTextLabels(Caschool_1$mathscr, Caschool_1$readscr, Caschool_1$county, cex.label=.5, col.label=116, lty=2, col.line=rgb(0, 0, 0, 0.5))

#用plotly做,plotly 線上分析與視覺化的工具
pacman::p_load(plotly)
Caschool_2 <- plot_ly(Caschool_1, x = ~mathscr, y = ~readscr, type = 'scatter', mode = 'markers',
        marker = list(size = 10))
Caschool_2<- Caschool_2 %>% add_annotations(
                  text = ~county,
                  xlab="mathscr", ylab="readscr",
                  showarrow = TRUE,
                  arrowhead = 4,
                  arrowsize = .5,
                  color = ~county,
                  ax = 20,
                  ay = -40)

Caschool_2

Homework 2

Find 133 class-level 95%-confidence intervals for language test score means of the nlschools{MASS} data set by using the tidy approach. The tail end of the data object should looks as follows: classID language_mean language_lb language_ub 131 20.0 … … 132 23.5 … … 133 23.7 … …

pacman::p_load(MASS)
data(nlschools, package="MASS")
head(nlschools)

##   lang   IQ class GS SES COMB
## 1   46 15.0   180 29  23    0
## 2   45 14.5   180 29  10    0
## 3   33  9.5   180 29  15    0
## 4   46 11.0   180 29  23    0
## 5   20  8.0   180 29  10    0
## 6   30  9.5   180 29  10    0

str(nlschools)

## 'data.frame':    2287 obs. of  6 variables:
##  $ lang : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ IQ   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ class: Factor w/ 133 levels "180","280","1082",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ GS   : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ SES  : int  23 10 15 23 10 10 23 10 13 15 ...
##  $ COMB : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

#呼叫tidyverse
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v tibble  3.1.5     v purrr   0.3.4
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x dplyr::lag()     masks stats::lag()
## x MASS::select()   masks plotly::select(), dplyr::select()

#95%CI計算 tail(nlschools)
language_lb language_ub

nlschools %>% 
  group_by(class) %>% 
  summarize(language_mean = mean(lang), 
            language_lb = language_mean - 1.96*sd(lang), 
            language_ub = language_mean + 1.96*sd(lang)) %>% 
  mutate(classID = paste0(1:n())) %>% #給序號
  tail() %>% #看最後6筆
  dplyr::select(classID, language_mean,language_lb,language_ub) %>% 
  knitr::kable (digits = 3)

classID	language_mean	language_lb	language_ub
128	45.500	30.124	60.876
129	41.500	29.445	53.555
130	40.267	20.744	59.789
131	38.091	26.953	49.229
132	29.300	3.264	55.336
133	28.429	14.762	42.095

Homework 3

Convert the wide form of the O’Brien and Kaiser’s repeated-measures data in OBrienKaiser{carData} to the long form in OBrienKaiserLong{carData}.

pacman::p_load(carData)
data(OBrienKaiser, package="carData")
dta3<-OBrienKaiser
head(dta3)

##   treatment gender pre.1 pre.2 pre.3 pre.4 pre.5 post.1 post.2 post.3 post.4
## 1   control      M     1     2     4     2     1      3      2      5      3
## 2   control      M     4     4     5     3     4      2      2      3      5
## 3   control      M     5     6     5     7     7      4      5      7      5
## 4   control      F     5     4     7     5     4      2      2      3      5
## 5   control      F     3     4     6     4     3      6      7      8      6
## 6         A      M     7     8     7     9     9      9      9     10      8
##   post.5 fup.1 fup.2 fup.3 fup.4 fup.5
## 1      2     2     3     2     4     4
## 2      3     4     5     6     4     1
## 3      4     7     6     9     7     6
## 4      3     4     4     5     3     4
## 5      3     4     3     6     4     3
## 6      9     9    10    11     9     6

#gather 把橫的轉成直的,跟spread相反為long form資料，依genus變項轉成直的，填入mean_weight值，不要留plot_id，因轉成直的會自動生程序號，不要序號

dta3_1 <- dta3 %>%
  mutate(id = rep(1:n()))%>%
 pivot_longer(
     cols = starts_with(c("pre","post","fup")),#這3個要轉 
               names_to = "Time",
               values_to = "score")|>
  head() |> knitr::kable()

dta3_1 <- dta3 %>%
  mutate(id = rep(1:n()))%>%
 pivot_longer(
     cols = starts_with(c("pre","post","fup")),#這3個要轉 
               names_to = "Time",
               values_to = "score")|>
  arrange(score)

dta3_1 |>as.data.frame()|> head()

##   treatment gender id   Time score
## 1   control      M  1  pre.1     1
## 2   control      M  1  pre.5     1
## 3   control      M  2  fup.5     1
## 4         A      F  9 post.5     1
## 5         B      F 14  pre.4     1
## 6   control      M  1  pre.2     2

pacman::p_load(tidyr)

dta3_2 <- dta3_1 %>% 
  separate(Time, c("Time","hour")) %>% #將Time分為兩個欄位
  mutate(hour=parse_number(hour))%>%  #新變數hour透過parse_number擷取"pre.1"後的1 數字
as.data.frame()
head(dta3_2[,c(3,4,5,6,1,2)])#排欄位順序

##   id Time hour score treatment gender
## 1  1  pre    1     1   control      M
## 2  1  pre    5     1   control      M
## 3  2  fup    5     1   control      M
## 4  9 post    5     1         A      F
## 5 14  pre    4     1         B      F
## 6  1  pre    2     2   control      M

Homework 4

Information about student majors, activities in intramural sports, and survey responses about their satisfaction with the on-campus cafeteria (for the variety and quality of the food served, and hours of operation) are recorded in three separate plain text files by three different administrative units: majors sports cafeteria

Input data

majors <- read.table("student_major.txt", h = T)
sports <- read.table("student_sport.txt", h = T)
cafeteria <- read.table("student_cafe.txt", h = T)

head(majors)

##    id            major
## 1 123      Mathematics
## 2 345 Computer Science
## 3 624          History
## 4 789          Biology
## 5 851 Computer Science
## 6 555         History

head(sports)

##    id      sport
## 1 123 basketball
## 2 123  badminton
## 3 222  badminton
## 4 385 basketball
## 5 385 volleyball
## 6 481 basketball

head(cafeteria)

##    id variety quality hours
## 1 123       3       4     2
## 2 345       4       4     2
## 3 385       1       1     2
## 4 481       2       3     3
## 5 624       2       3     4
## 6 851       5       4     1

Merge data

#All join，這種寫法會把三個檔案中所有欄位有NA的人 直接剔除 ，N＝15
All <- merge(merge(majors,sports),cafeteria,all = TRUE)
head(All)

##    id            major      sport variety quality hours
## 1 123      Mathematics  badminton       3       4     2
## 2 123      Mathematics basketball       3       4     2
## 3 222 Computer Science  badminton      NA      NA    NA
## 4 345             <NA>       <NA>       4       4     2
## 5 385      Mathematics basketball       1       1     2
## 6 385      Mathematics volleyball       1       1     2

str(All)

## 'data.frame':    15 obs. of  6 variables:
##  $ id     : int  123 123 222 345 385 385 481 555 624 717 ...
##  $ major  : chr  "Mathematics" "Mathematics" "Computer Science" NA ...
##  $ sport  : chr  "badminton" "basketball" "badminton" NA ...
##  $ variety: int  3 3 NA 4 1 1 2 NA 2 NA ...
##  $ quality: int  4 4 NA 4 1 1 3 NA 3 NA ...
##  $ hours  : int  2 2 NA 2 2 2 3 NA 4 NA ...

#inner_join 自動by id合併  同時存在在3個檔案中的一起合併
All_1 <- inner_join(inner_join(majors,sports),cafeteria,all = TRUE)

## Joining, by = "id"
## Joining, by = "id"

head(All_1)

##    id            major      sport variety quality hours
## 1 123      Mathematics basketball       3       4     2
## 2 123      Mathematics  badminton       3       4     2
## 3 851 Computer Science volleyball       5       4     1
## 4 851 Computer Science  badminton       5       4     1
## 5 992          Biology  badminton       4       5     3
## 6 992          Biology volleyball       4       5     3

#inner_join中有9個觀察值
str(All_1)

## 'data.frame':    9 obs. of  6 variables:
##  $ id     : int  123 123 851 851 992 992 481 385 385
##  $ major  : chr  "Mathematics" "Mathematics" "Computer Science" "Computer Science" ...
##  $ sport  : chr  "basketball" "badminton" "volleyball" "badminton" ...
##  $ variety: int  3 3 5 5 4 4 2 1 1
##  $ quality: int  4 4 4 4 5 5 3 1 1
##  $ hours  : int  2 2 1 1 3 3 3 2 2

#left_join 自動by id合併  左邊檔案為主體（以左邊id為主要key)，將右邊的檔案合併進來
All_2 <- left_join(left_join(majors,sports),cafeteria,all = TRUE)

## Joining, by = "id"
## Joining, by = "id"

head(All_2)

##    id            major      sport variety quality hours
## 1 123      Mathematics basketball       3       4     2
## 2 123      Mathematics  badminton       3       4     2
## 3 345 Computer Science       <NA>       4       4     2
## 4 624          History       <NA>       2       3     4
## 5 789          Biology basketball      NA      NA    NA
## 6 851 Computer Science volleyball       5       4     1

#left_join中有18個觀察值
str(All_2)

## 'data.frame':    18 obs. of  6 variables:
##  $ id     : int  123 123 345 624 789 851 851 555 992 992 ...
##  $ major  : chr  "Mathematics" "Mathematics" "Computer Science" "History" ...
##  $ sport  : chr  "basketball" "badminton" NA NA ...
##  $ variety: int  3 3 4 2 NA 5 5 NA 4 4 ...
##  $ quality: int  4 4 4 3 NA 4 4 NA 5 5 ...
##  $ hours  : int  2 2 2 4 NA 1 1 NA 3 3 ...

Output these files

#可以省略sep，自動存入放三個檔案的資料夾中
#write.csv(All, "All.csv" ,row.names = F)

Homework 5

The HELP (Health Evaluation and Linkage to Primary Care) study was a clinical trial for adult inpatients recruited from a detoxification unit. Patients with no primary care physician were randomized to receive a multidisciplinary assessment and a brief motivational intervention or usual care, with the goal of linking them to primary medical care. Eligible subjects were adults, who spoke Spanish or English, reported alcohol, heroin or cocaine as their first or second drug of choice, resided in proximity to the primary care clinic to which they would be referred or were homeless. Subjects were interviewed at baseline during their detoxification stay and follow-up interviews were undertaken every 6 months for 2 years. A variety of continuous, count, discrete, and survival time predictors and outcomes were collected at each of these five occasions. The following R script is used to manage the data file at the initial stage of investigation. Provide comments on what each line of the script is meant to achieve.

Source: Kleinman, K., & Horton, N.J. (2015). Using R for Data Management, Statistical Analysis, and Graphics.

options(continue="  ")

Input data

ds=read.csv("https://nhorton.people.amherst.edu/sasr2/datasets/help.csv")
options(digits=3) # narrow output 
options(width=72)
library(dplyr)

#需使用dplyr::select函數抓取所需欄位，原本沒有抓id這裡補抓
newds <- ds |> dplyr::select( cesd, female,id, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e, f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t) 
head(newds)

##   cesd female id i1 i2 treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1   49      0  1 13 26     1   3   2   3   0   2   3   3   0   2   3
## 2   30      0  2 56 62     1   3   2   0   3   3   2   0   0   3   0
## 3   39      0  3  0  0     0   3   2   3   0   2   2   1   3   2   3
## 4   15      1  4  5  5     0   0   0   1   3   2   2   1   3   0   0
## 5   39      0  5 10 13     0   3   0   3   3   3   3   1   3   3   2
## 6    6      1  6  4  4     1   1   0   1   3   0   0   0   3   0   1
##   f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1   3   0   1   2   2   2   2   3   3   2
## 2   3   0   0   3   0   0   0   2   0   0
## 3   1   0   1   3   2   0   0   3   2   0
## 4   1   2   2   2   0  NA   2   0   0   1
## 5   3   2   2   3   0   3   3   3   3   3
## 6   1   3   1   0   1   3   0   0   0   0

資料整理與確認

names(newds) #所有欄位名稱

##  [1] "cesd"   "female" "id"     "i1"     "i2"     "treat"  "f1a"   
##  [8] "f1b"    "f1c"    "f1d"    "f1e"    "f1f"    "f1g"    "f1h"   
## [15] "f1i"    "f1j"    "f1k"    "f1l"    "f1m"    "f1n"    "f1o"   
## [22] "f1p"    "f1q"    "f1r"    "f1s"    "f1t"

str(newds[,1:10]) # structure of the first 10 variables

## 'data.frame':    453 obs. of  10 variables:
##  $ cesd  : int  49 30 39 15 39 6 52 32 50 46 ...
##  $ female: int  0 0 0 1 0 1 1 0 1 0 ...
##  $ id    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ i1    : int  13 56 0 5 10 4 13 12 71 20 ...
##  $ i2    : int  26 62 0 5 13 4 20 24 129 27 ...
##  $ treat : int  1 1 0 0 0 1 0 1 0 1 ...
##  $ f1a   : int  3 3 3 0 3 1 3 1 3 2 ...
##  $ f1b   : int  2 2 2 0 0 0 1 1 2 3 ...
##  $ f1c   : int  3 0 3 1 3 1 3 2 3 3 ...
##  $ f1d   : int  0 3 0 3 3 3 1 3 1 0 ...

#前10個變項的基本描述統計
summary(newds[,1:10]) # summary of the first 10 variables

##       cesd          female            id            i1       
##  Min.   : 1.0   Min.   :0.000   Min.   :  1   Min.   :  0.0  
##  1st Qu.:25.0   1st Qu.:0.000   1st Qu.:119   1st Qu.:  3.0  
##  Median :34.0   Median :0.000   Median :233   Median : 13.0  
##  Mean   :32.8   Mean   :0.236   Mean   :233   Mean   : 17.9  
##  3rd Qu.:41.0   3rd Qu.:0.000   3rd Qu.:348   3rd Qu.: 26.0  
##  Max.   :60.0   Max.   :1.000   Max.   :470   Max.   :142.0  
##        i2            treat            f1a            f1b      
##  Min.   :  0.0   Min.   :0.000   Min.   :0.00   Min.   :0.00  
##  1st Qu.:  3.0   1st Qu.:0.000   1st Qu.:1.00   1st Qu.:0.00  
##  Median : 15.0   Median :0.000   Median :2.00   Median :1.00  
##  Mean   : 22.6   Mean   :0.497   Mean   :1.63   Mean   :1.39  
##  3rd Qu.: 32.0   3rd Qu.:1.000   3rd Qu.:3.00   3rd Qu.:2.00  
##  Max.   :184.0   Max.   :1.000   Max.   :3.00   Max.   :3.00  
##       f1c            f1d      
##  Min.   :0.00   Min.   :0.00  
##  1st Qu.:1.00   1st Qu.:0.00  
##  Median :2.00   Median :1.00  
##  Mean   :1.92   Mean   :1.56  
##  3rd Qu.:3.00   3rd Qu.:3.00  
##  Max.   :3.00   Max.   :3.00

#只看前3筆資料
head(newds, n=3)

##   cesd female id i1 i2 treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1   49      0  1 13 26     1   3   2   3   0   2   3   3   0   2   3
## 2   30      0  2 56 62     1   3   2   0   3   3   2   0   0   3   0
## 3   39      0  3  0  0     0   3   2   3   0   2   2   1   3   2   3
##   f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1   3   0   1   2   2   2   2   3   3   2
## 2   3   0   0   3   0   0   0   2   0   0
## 3   1   0   1   3   2   0   0   3   2   0

comment(newds) = "HELP baseline dataset"  #comment
comment(newds)

## [1] "HELP baseline dataset"

#將資料存入savedfile，存成ds
save(ds, file="savedfile")

#存入CSV檔中
write.csv(ds, file="ds.csv")

#呼叫foreign
#將newds存成SAS.dat和.sas檔案
library(foreign) 
write.foreign(newds, "file.dat", "file.sas", package="SAS")

#讀取'newds'中的cesd變項前10筆資料
#兩種方式皆可
with(newds, cesd[1:10])

##  [1] 49 30 39 15 39  6 52 32 50 46

with(newds, head(cesd, 10))

##  [1] 49 30 39 15 39  6 52 32 50 46

#讀取'newds'中的cesd變項>56的資料
with(newds, cesd[cesd > 56])

## [1] 57 58 57 60 58 58 57

#呼叫dplyr,使用filter確認資料中的cesd > 56，show id & cesd
#記得要使用dplyr::select去讀取，不然無法上傳至網頁 只能RStudio自己看
library(dplyr)
filter(newds, cesd > 56) %>% dplyr::select(id, cesd)

##    id cesd
## 1  71   57
## 2 127   58
## 3 200   57
## 4 228   60
## 5 273   58
## 6 351   58
## 7  13   57

#shor()向量進行從小到大的排序cesd前四筆
#which.min()返回數值向量中第一個最小值的位置
with(newds, sort(cesd)[1:4])

## [1] 1 3 3 4

with(newds, which.min(cesd))

## [1] 199

#packages("mosaic")可做簡單微積分計算
library(mosaic)

## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2

## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.

## 
## 載入套件：'mosaic'

## 下列物件被遮斷自 'package:Matrix':
## 
##     mean

## 下列物件被遮斷自 'package:purrr':
## 
##     cross

## 下列物件被遮斷自 'package:plotly':
## 
##     do

## 下列物件被遮斷自 'package:ggplot2':
## 
##     stat

## 下列物件被遮斷自 'package:dplyr':
## 
##     count, do, tally

## 下列物件被遮斷自 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median,
##     prop.test, quantile, sd, t.test, var

## 下列物件被遮斷自 'package:base':
## 
##     max, mean, min, prod, range, sample, sum

tally(~ is.na(f1g), data=newds) #tally count計算f1g變項總計 452

## is.na(f1g)
##  TRUE FALSE 
##     1   452

#favstats來自mosaic包, 描述統計值 max, mean, min,Q1,Q3,   range, sd, missing
favstats(~ f1g, data=newds)

##  min Q1 median Q3 max mean  sd   n missing
##    0  1      2  3   3 1.73 1.1 452       1

reverse code

f1d, f1h, f1l and f1p

#將變項items 列出，並recode,  (3 - f1l)表示反向題
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g, 
   (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p), 
   f1q, f1r, f1s, f1t))

遺漏值處理

#計算missing,NA總數,回傳資料存至nmisscesd 
nmisscesd = apply(is.na(cesditems), 1, sum)

#列出整理後的變項得分數,放入ncesditems
ncesditems = cesditems

#將NA改0
ncesditems[is.na(cesditems)] = 0

#計算ncesditems總分,apply 列出每列來
newcesd = apply(ncesditems, 1, sum)

#針對加權，該列有幾個NA，其加總便乘以20/(20-NA數目)
imputemeancesd = 20/(20-nmisscesd)*newcesd

#list 結果確認
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,]

##     newcesd newds.cesd nmisscesd imputemeancesd
## 4        15         15         1           15.8
## 17       19         19         1           20.0
## 87       44         44         1           46.3
## 101      17         17         1           17.9
## 154      29         29         1           30.5
## 177      44         44         1           46.3
## 229      39         39         1           41.1

常用包 memisc

#createdrink,message=FALSE

#memisc為空間函數
#case分三組abstinent moderate highrisk
pacman::p_load(dplyr,memisc)
newds = mutate(newds, drinkstat= 
  cases(
    "abstinent" = i1==0,
    "moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
               (i1>0 & i1<=2 & i2<=4 & female==0),
    "highrisk" = ((i1>1 | i2>3) & female==1) |
               ((i1>2 | i2>4) & female==0)))

#----echo=FALSE
library(mosaic)

#----echo=FALSE
detach(package:memisc)
detach(package:MASS)

#select data to tmpds
#取出4個變項i1, i2, female, drinkstat, 看365～370資料
library(dplyr)
tmpds = select(newds, i1, i2, female, drinkstat)
tmpds[365:370,]

##     i1 i2 female drinkstat
## 365  6 24      0  highrisk
## 366  6  6      0  highrisk
## 367  0  0      0 abstinent
## 368  0  0      1 abstinent
## 369  8  8      0  highrisk
## 370 32 32      0  highrisk

#filter篩選變項"moderate" & female=1的人
library(dplyr)
filter(tmpds, drinkstat=="moderate" & female==1)

##   i1 i2 female drinkstat
## 1  1  1      1  moderate
## 2  1  3      1  moderate
## 3  1  2      1  moderate
## 4  1  1      1  moderate
## 5  1  1      1  moderate
## 6  1  1      1  moderate
## 7  1  1      1  moderate

常用包 CrossTable gmodels

##message=FALSE
#gmodels::CrossTable 針對drinkstat做列聯表
pacman::p_load(gmodels)
with(tmpds, CrossTable(drinkstat))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##           | abstinent |  moderate |  highrisk | 
##           |-----------|-----------|-----------|
##           |        68 |        28 |       357 | 
##           |     0.150 |     0.062 |     0.788 | 
##           |-----------|-----------|-----------|
## 
## 
## 
##

#drinkstat, female做列聯表
with(tmpds, CrossTable(drinkstat, female, 
  prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE))#不做統計檢定

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  453 
## 
##  
##              | female 
##    drinkstat |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##    abstinent |        42 |        26 |        68 | 
##              |     0.618 |     0.382 |     0.150 | 
## -------------|-----------|-----------|-----------|
##     moderate |        21 |         7 |        28 | 
##              |     0.750 |     0.250 |     0.062 | 
## -------------|-----------|-----------|-----------|
##     highrisk |       283 |        74 |       357 | 
##              |     0.793 |     0.207 |     0.788 | 
## -------------|-----------|-----------|-----------|
## Column Total |       346 |       107 |       453 | 
## -------------|-----------|-----------|-----------|
## 
##

#做gender列聯表，計算tally
newds = transform(newds, 
  gender=factor(female, c(0,1), c("Male","Female")))
tally(~ female + gender, margin=FALSE, data=newds)

##       gender
## female Male Female
##      0  346      0
##      1    0    107

#arrange資料,預設為遞增排序

library(dplyr)
newds = arrange(ds, cesd, i1)
newds[1:5, c("cesd", "i1", "id")]#看前5筆

##   cesd i1  id
## 1    1  3 233
## 2    3  1 139
## 3    3 13 418
## 4    4  4 251
## 5    4  9  95

library(dplyr)
females = filter(ds, female==1) #選取female=1放入females檔案中
with(females, mean(cesd)) #計算cesd的mean

## [1] 36.9

# an alternative approach 2種都可
mean(ds$cesd[ds$female==1])#計算cesd的mean

## [1] 36.9

#tapply()把原始數據分割為若干組, 計算female的cesd平均數
#tapply(vector創建分數向量, factor針對的變項是什麼, fun應用的功能) 
with(ds, tapply(cesd, female, mean))

##    0    1 
## 31.6 36.9

library(mosaic)
mean(cesd ~ female, data=ds) #也可用此方法

##    0    1 
## 31.6 36.9

～END～

Homework-Data wrangling

SCL

2021-11-08

Homework 1

Homework 2

Homework 3

Homework 4

Input data

Merge data

Output these files

Homework 5

Input data

資料整理與確認

reverse code

遺漏值處理

常用包 memisc

常用包 CrossTable gmodels