library(car)      # Vocab data (vocabulary/education scores from the GSS)
library(dplyr)    # %>%, group_by(), summarise()
library(ggplot2)  # plotting
dta1 <- Vocab
str(dta1)
## 'data.frame': 21638 obs. of 4 variables:
## $ year : int 2004 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 2 1 2 2 1 ...
## $ education : int 9 14 14 17 14 14 12 10 11 9 ...
## $ vocabulary: int 3 6 9 8 1 7 6 6 5 1 ...
# the relationship between education and vocabulary over the years, by gender
dta1 %>% group_by(year, sex) %>% summarise(cor = cor(education, vocabulary))
## Source: local data frame [32 x 3]
## Groups: year [?]
##
## year sex cor
## <int> <fctr> <dbl>
## 1 1974 Female 0.4900975
## 2 1974 Male 0.5886080
## 3 1976 Female 0.5023103
## 4 1976 Male 0.5455221
## 5 1978 Female 0.5063789
## 6 1978 Male 0.5842083
## 7 1982 Female 0.5093188
## 8 1982 Male 0.5185160
## 9 1984 Female 0.5001001
## 10 1984 Male 0.5066840
## # ... with 22 more rows
dta1$year <- as.factor(dta1$year)
# plot education against vocabulary, colored by year, faceted by sex
ggplot(data = dta1, aes(x = education, y = vocabulary)) +
  geom_point(aes(col = year)) +
  scale_color_grey(start = 0.95, end = 0.1) +
  facet_wrap(~ sex) +
  stat_smooth(method = "lm", se = FALSE) +
  theme_bw()
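# A complementary view (a minimal sketch, not part of the original analysis):
# plot the yearly correlations computed above directly, one line per gender.
dta1 %>%
  group_by(year, sex) %>%
  summarise(cor = cor(education, vocabulary)) %>%
  ggplot(aes(x = year, y = cor, color = sex, group = sex)) +
  geom_line() +
  theme_bw()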
##
options(continue=" ") # use a space (instead of "+") as the prompt for continuation lines
##
options(digits=3) # controls the number of digits to print when printing numeric values
options(width=72) # narrow output
ds = read.csv("http://www.amherst.edu/~nhorton/r2/datasets/help.csv")# read .csv data
library(dplyr)# load dplyr package
newds = dplyr::select(ds, cesd, female, i1, i2, id, treat, f1a, f1b, f1c, f1d, f1e,
  f1f, f1g, f1h, f1i, f1j, f1k, f1l, f1m, f1n, f1o, f1p, f1q, f1r, f1s, f1t) # select columns
## ------------------------------------------------------------------------
names(newds) # show column names
## [1] "cesd" "female" "i1" "i2" "id" "treat" "f1a"
## [8] "f1b" "f1c" "f1d" "f1e" "f1f" "f1g" "f1h"
## [15] "f1i" "f1j" "f1k" "f1l" "f1m" "f1n" "f1o"
## [22] "f1p" "f1q" "f1r" "f1s" "f1t"
str(newds[,1:10]) # structure of the first 10 variables
## 'data.frame': 453 obs. of 10 variables:
## $ cesd : int 49 30 39 15 39 6 52 32 50 46 ...
## $ female: int 0 0 0 1 0 1 1 0 1 0 ...
## $ i1 : int 13 56 0 5 10 4 13 12 71 20 ...
## $ i2 : int 26 62 0 5 13 4 20 24 129 27 ...
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ treat : int 1 1 0 0 0 1 0 1 0 1 ...
## $ f1a : int 3 3 3 0 3 1 3 1 3 2 ...
## $ f1b : int 2 2 2 0 0 0 1 1 2 3 ...
## $ f1c : int 3 0 3 1 3 1 3 2 3 3 ...
## $ f1d : int 0 3 0 3 3 3 1 3 1 0 ...
## ------------------------------------------------------------------------
summary(newds[,1:10]) # summary of the first 10 variables
## cesd female i1 i2
## Min. : 1.0 Min. :0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:25.0 1st Qu.:0.000 1st Qu.: 3.0 1st Qu.: 3.0
## Median :34.0 Median :0.000 Median : 13.0 Median : 15.0
## Mean :32.8 Mean :0.236 Mean : 17.9 Mean : 22.6
## 3rd Qu.:41.0 3rd Qu.:0.000 3rd Qu.: 26.0 3rd Qu.: 32.0
## Max. :60.0 Max. :1.000 Max. :142.0 Max. :184.0
## id treat f1a f1b
## Min. : 1 Min. :0.000 Min. :0.00 Min. :0.00
## 1st Qu.:119 1st Qu.:0.000 1st Qu.:1.00 1st Qu.:0.00
## Median :233 Median :0.000 Median :2.00 Median :1.00
## Mean :233 Mean :0.497 Mean :1.63 Mean :1.39
## 3rd Qu.:348 3rd Qu.:1.000 3rd Qu.:3.00 3rd Qu.:2.00
## Max. :470 Max. :1.000 Max. :3.00 Max. :3.00
## f1c f1d
## Min. :0.00 Min. :0.00
## 1st Qu.:1.00 1st Qu.:0.00
## Median :2.00 Median :1.00
## Mean :1.92 Mean :1.56
## 3rd Qu.:3.00 3rd Qu.:3.00
## Max. :3.00 Max. :3.00
## ------------------------------------------------------------------------
head(newds, n=3) # show the first three rows
## cesd female i1 i2 id treat f1a f1b f1c f1d f1e f1f f1g f1h f1i f1j
## 1 49 0 13 26 1 1 3 2 3 0 2 3 3 0 2 3
## 2 30 0 56 62 2 1 3 2 0 3 3 2 0 0 3 0
## 3 39 0 0 0 3 0 3 2 3 0 2 2 1 3 2 3
## f1k f1l f1m f1n f1o f1p f1q f1r f1s f1t
## 1 3 0 1 2 2 2 2 3 3 2
## 2 3 0 0 3 0 0 0 2 0 0
## 3 1 0 1 3 2 0 0 3 2 0
## ------------------------------------------------------------------------
comment(newds) = "HELP baseline dataset" # attach a comment to the data frame
comment(newds) # query a comment
## [1] "HELP baseline dataset"
save(ds, file="savedfile") # save the data frame to an external file (R binary format)
## ------------------------------------------------------------------------
write.csv(ds, file="ds.csv") # write the data to a CSV file
## ------------------------------------------------------------------------
library(foreign) #load foreign package
write.foreign(newds, "file.dat", "file.sas", package="SAS") # export the data along with SAS code to read it
## ------------------------------------------------------------------------
with(newds, cesd[1:10]) # cesd values for the first 10 subjects
## [1] 49 30 39 15 39 6 52 32 50 46
with(newds, head(cesd, 10)) # the same, using head()
## [1] 49 30 39 15 39 6 52 32 50 46
## ------------------------------------------------------------------------
with(newds, cesd[cesd > 56]) # show cesd values greater than 56
## [1] 57 58 57 60 58 58 57
## ------------------------------------------------------------------------
library(dplyr)
filter(newds, cesd > 56) %>% dplyr::select(id, cesd) # subjects with cesd > 56; report id & cesd
## id cesd
## 1 71 57
## 2 127 58
## 3 200 57
## 4 228 60
## 5 273 58
## 6 351 58
## 7 13 57
## ------------------------------------------------------------------------
with(newds, sort(cesd)[1:4]) # the four smallest cesd values
## [1] 1 3 3 4
with(newds, which.min(cesd)) # row position of the minimum cesd value
## [1] 199
## ------------------------------------------------------------------------
library(mosaic) # load mosaic package
tally(~ is.na(f1g), data=newds) # count missing values in f1g
## is.na(f1g)
## TRUE FALSE
## 1 452
favstats(~ f1g, data=newds) # summary statistics for f1g
## min Q1 median Q3 max mean sd n missing
## 0 1 2 3 3 1.73 1.1 452 1
## ------------------------------------------------------------------------
# reverse code f1d, f1h, f1l and f1p
cesditems = with(newds, cbind(f1a, f1b, f1c, (3 - f1d), f1e, f1f, f1g,
  (3 - f1h), f1i, f1j, f1k, (3 - f1l), f1m, f1n, f1o, (3 - f1p),
  f1q, f1r, f1s, f1t))
nmisscesd = apply(is.na(cesditems), 1, sum) # number of missing items per subject
ncesditems = cesditems
ncesditems[is.na(cesditems)] = 0 # replace NA by 0
newcesd = apply(ncesditems, 1, sum) # total score per subject (missing items counted as 0)
imputemeancesd = 20/(20-nmisscesd)*newcesd # rescale the total to 20 items (mean imputation for missing items)
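# The rescaling is mean imputation in disguise: each missing item is treated
# as if it equalled the subject's mean over the items actually answered.
# A minimal equivalence check (altcesd is an illustrative name):
altcesd = 20 * rowMeans(cesditems, na.rm = TRUE) # identical to imputemeancesd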
## ------------------------------------------------------------------------
data.frame(newcesd, newds$cesd, nmisscesd, imputemeancesd)[nmisscesd>0,] # show subjects with at least one missing item
## newcesd newds.cesd nmisscesd imputemeancesd
## 4 15 15 1 15.8
## 17 19 19 1 20.0
## 87 44 44 1 46.3
## 101 17 17 1 17.9
## 154 29 29 1 30.5
## 177 44 44 1 46.3
## 229 39 39 1 41.1
## ----createdrink,message=FALSE-------------------------------------------
library(dplyr)
library(memisc) # load memisc package
newds = mutate(newds, drinkstat=
cases(
"abstinent" = i1==0,
"moderate" = (i1>0 & i1<=1 & i2<=3 & female==1) |
(i1>0 & i1<=2 & i2<=4 & female==0),
"highrisk" = ((i1>1 | i2>3) & female==1) |
((i1>2 | i2>4) & female==0))) # categorize subjects by drinking status
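# The same classification could be written without memisc, using
# dplyr::case_when(); a minimal sketch (drinkstat2 is an illustrative name):
newds = mutate(newds, drinkstat2 = factor(case_when(
  i1 == 0 ~ "abstinent",
  (i1 <= 1 & i2 <= 3 & female == 1) |
    (i1 <= 2 & i2 <= 4 & female == 0) ~ "moderate",
  TRUE ~ "highrisk"),
  levels = c("abstinent", "moderate", "highrisk")))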
## ----echo=FALSE----------------------------------------------------------
library(mosaic)
## ----echo=FALSE----------------------------------------------------------
detach(package:memisc) # detach memisc package
detach(package:MASS) # detach MASS package
## ------------------------------------------------------------------------
library(dplyr)
tmpds = dplyr::select(newds, i1, i2, female, drinkstat) # select columns
tmpds[365:370,] # show rows 365-370
## i1 i2 female drinkstat
## 365 6 24 0 highrisk
## 366 6 6 0 highrisk
## 367 0 0 0 abstinent
## 368 0 0 1 abstinent
## 369 8 8 0 highrisk
## 370 32 32 0 highrisk
## ------------------------------------------------------------------------
library(dplyr)
filter(tmpds, drinkstat=="moderate" & female==1) # female subjects classified as moderate drinkers
## i1 i2 female drinkstat
## 1 1 1 1 moderate
## 2 1 3 1 moderate
## 3 1 2 1 moderate
## 4 1 1 1 moderate
## 5 1 1 1 moderate
## 6 1 1 1 moderate
## 7 1 1 1 moderate
## ----message=FALSE-------------------------------------------------------
library(gmodels) # load gmodels package
with(tmpds, CrossTable(drinkstat)) # one-way table of drinkstat: counts and overall proportions
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | abstinent | moderate | highrisk |
## |-----------|-----------|-----------|
## | 68 | 28 | 357 |
## | 0.150 | 0.062 | 0.788 |
## |-----------|-----------|-----------|
##
##
##
##
## ------------------------------------------------------------------------
with(tmpds, CrossTable(drinkstat, female,
  prop.t=FALSE, prop.c=FALSE, prop.chisq=FALSE)) # two-way table of drinkstat by female, with row proportions only
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 453
##
##
## | female
## drinkstat | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## abstinent | 42 | 26 | 68 |
## | 0.618 | 0.382 | 0.150 |
## -------------|-----------|-----------|-----------|
## moderate | 21 | 7 | 28 |
## | 0.750 | 0.250 | 0.062 |
## -------------|-----------|-----------|-----------|
## highrisk | 283 | 74 | 357 |
## | 0.793 | 0.207 | 0.788 |
## -------------|-----------|-----------|-----------|
## Column Total | 346 | 107 | 453 |
## -------------|-----------|-----------|-----------|
##
##
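# A more compact cross-tabulation with mosaic::tally (a sketch; adding
# format = "proportion" would give proportions instead of counts):
tally(drinkstat ~ female, data = tmpds)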
## ------------------------------------------------------------------------
newds = transform(newds,
  gender=factor(female, c(0,1), c("Male","Female"))) # create a labelled factor from female
tally(~ female + gender, margin=FALSE, data=newds) # cross-tabulate female and gender to check the recoding
## gender
## female Male Female
## 0 346 0
## 1 0 107
## ------------------------------------------------------------------------
library(dplyr)
newds = arrange(ds, cesd, i1) # sort rows by cesd, then i1
newds[1:5, c("cesd", "i1", "id")] # show cesd, i1, and id for the first five rows
## cesd i1 id
## 1 1 3 233
## 2 3 1 139
## 3 3 13 418
## 4 4 4 251
## 5 4 9 95
## ------------------------------------------------------------------------
library(dplyr)
females = filter(ds, female==1) # keep female subjects only
with(females, mean(cesd)) # mean cesd for female subjects
## [1] 36.9
# an alternative approach
mean(ds$cesd[ds$female==1])
## [1] 36.9
## ------------------------------------------------------------------------
with(ds, tapply(cesd, female, mean)) # mean cesd for male (0) and female (1) subjects separately
## 0 1
## 31.6 36.9
library(mosaic)
mean(cesd ~ female, data=ds) # the same, using mosaic's formula interface
## 0 1
## 31.6 36.9
dta3 <- read.csv("http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/nlsy86long.csv")
str(dta3)
## 'data.frame': 664 obs. of 9 variables:
## $ id : int 2390 2560 3740 4020 6350 7030 7200 7610 7680 7700 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 2 1 2 ...
## $ race : Factor w/ 2 levels "Majority","Minority": 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 1 1 1 1 1 1 1 1 1 1 ...
## $ grade: int 0 0 0 0 1 0 0 0 0 0 ...
## $ year : int 6 6 6 5 7 5 6 7 6 6 ...
## $ month: int 67 66 67 60 78 62 66 79 76 67 ...
## $ math : num 14.29 20.24 17.86 7.14 29.76 ...
## $ read : num 19.05 21.43 21.43 7.14 30.95 ...
# reshape the math and read scores into long format
library(tidyr) # gather(), spread(), unite()
dta3L <- gather(dta3, key = "test_var", value = "test_score", 8:9)
str(dta3L)
## 'data.frame': 1328 obs. of 9 variables:
## $ id : int 2390 2560 3740 4020 6350 7030 7200 7610 7680 7700 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 2 1 2 ...
## $ race : Factor w/ 2 levels "Majority","Minority": 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 1 1 1 1 1 1 1 1 1 1 ...
## $ grade : int 0 0 0 0 1 0 0 0 0 0 ...
## $ year : int 6 6 6 5 7 5 6 7 6 6 ...
## $ month : int 67 66 67 60 78 62 66 79 76 67 ...
## $ test_var : chr "math" "math" "math" "math" ...
## $ test_score: num 14.29 20.24 17.86 7.14 29.76 ...
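# In current tidyr essentially the same long reshape can be written with
# pivot_longer(); a sketch (dta3L2 is an illustrative name):
dta3L2 <- pivot_longer(dta3, cols = c(math, read),
  names_to = "test_var", values_to = "test_score")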
dta4 <- backpain # matched case-control data on driving and back pain
str(backpain)
## 'data.frame': 434 obs. of 4 variables:
## $ ID : Factor w/ 217 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
## $ status : Factor w/ 2 levels "case","control": 1 2 1 2 1 2 1 2 1 2 ...
## $ driver : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 1 1 2 2 ...
## $ suburban: Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...
dta4t <- data.frame(with(dta4, table(status, driver, suburban))) # cell counts as a data frame (long format)
# reshape from long to wide format: one column per status
dta4tL <- spread(dta4t, key = "status", value = Freq)
# add a new column named Total
dta4tL <- dta4tL%>%mutate(Total = case + control)
dta4tL
## driver suburban case control Total
## 1 no no 26 47 73
## 2 no yes 6 7 13
## 3 yes no 64 63 127
## 4 yes yes 121 100 221
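# The newer tidyr interface for the same wide reshape (a sketch; dta4tW is an
# illustrative name):
dta4tW <- pivot_wider(dta4t, names_from = status, values_from = Freq) %>%
  mutate(Total = case + control)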
dta5 <- cbind(state.x77, USArrests) # combine two built-in state-level data sets (rows are the 50 states in the same order)
str(dta5)
## 'data.frame': 50 obs. of 12 variables:
## $ Population: num 3615 365 2212 2110 21198 ...
## $ Income : num 3624 6315 4530 3378 5114 ...
## $ Illiteracy: num 2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
## $ Life Exp : num 69 69.3 70.5 70.7 71.7 ...
## $ Murder : num 15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
## $ HS Grad : num 41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
## $ Frost : num 20 152 15 65 20 166 139 103 11 60 ...
## $ Area : num 50708 566432 113417 51945 156361 ...
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop : int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
cor(dta5)
## Population Income Illiteracy Life Exp Murder HS Grad
## Population 1.0000 0.2082 0.1076 -0.0681 0.3436 -0.0985
## Income 0.2082 1.0000 -0.4371 0.3403 -0.2301 0.6199
## Illiteracy 0.1076 -0.4371 1.0000 -0.5885 0.7030 -0.6572
## Life Exp -0.0681 0.3403 -0.5885 1.0000 -0.7808 0.5822
## Murder 0.3436 -0.2301 0.7030 -0.7808 1.0000 -0.4880
## HS Grad -0.0985 0.6199 -0.6572 0.5822 -0.4880 1.0000
## Frost -0.3322 0.2263 -0.6719 0.2621 -0.5389 0.3668
## Area 0.0225 0.3633 0.0773 -0.1073 0.2284 0.3335
## Murder 0.3202 -0.2152 0.7068 -0.7785 0.9337 -0.5216
## Assault 0.3170 0.0409 0.5110 -0.6267 0.7398 -0.2303
## UrbanPop 0.5126 0.4805 -0.0622 0.2715 0.0164 0.3587
## Rape 0.3052 0.3574 0.1546 -0.2696 0.5800 0.2707
## Frost Area Murder Assault UrbanPop Rape
## Population -0.3322 0.0225 0.3202 0.3170 0.5126 0.305
## Income 0.2263 0.3633 -0.2152 0.0409 0.4805 0.357
## Illiteracy -0.6719 0.0773 0.7068 0.5110 -0.0622 0.155
## Life Exp 0.2621 -0.1073 -0.7785 -0.6267 0.2715 -0.270
## Murder -0.5389 0.2284 0.9337 0.7398 0.0164 0.580
## HS Grad 0.3668 0.3335 -0.5216 -0.2303 0.3587 0.271
## Frost 1.0000 0.0592 -0.5414 -0.4682 -0.2462 -0.279
## Area 0.0592 1.0000 0.1481 0.2312 -0.0615 0.525
## Murder -0.5414 0.1481 1.0000 0.8019 0.0696 0.564
## Assault -0.4682 0.2312 0.8019 1.0000 0.2589 0.665
## UrbanPop -0.2462 -0.0615 0.0696 0.2589 1.0000 0.411
## Rape -0.2792 0.5250 0.5636 0.6652 0.4113 1.000
heatmap(cor(dta5)) # visualize the correlation matrix
# States with higher income tend to have a higher percentage of high-school
# graduates, longer life expectancy, and lower murder rates.
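# A quick way to check that reading against the matrix above: sort every
# variable's correlation with Income (a minimal sketch).
sort(cor(dta5)[, "Income"], decreasing = TRUE)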
dta6 <- read.table("http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/probeL.txt",header = T)
str(dta6)
## 'data.frame': 55 obs. of 3 variables:
## $ ID : Factor w/ 11 levels "S01","S02","S03",..: 1 1 1 1 1 2 2 2 2 2 ...
## $ Response_Time: int 51 36 50 35 42 27 20 26 17 27 ...
## $ Position : int 1 2 3 4 5 1 2 3 4 5 ...
# reshape to wide format: one "Pos_<position>" column per serial position
dtaW <- mutate(dta6, pre = rep("Pos", dim(dta6)[1])) %>%
  unite(time, pre, Position) %>%
  spread(time, Response_Time) %>%
  arrange(ID)
dtaW
## ID Pos_1 Pos_2 Pos_3 Pos_4 Pos_5
## 1 S01 51 36 50 35 42
## 2 S02 27 20 26 17 27
## 3 S03 37 22 41 37 30
## 4 S04 42 36 32 34 27
## 5 S05 27 18 33 14 29
## 6 S06 43 32 43 35 40
## 7 S07 41 22 36 25 38
## 8 S08 38 21 31 20 16
## 9 S09 36 23 27 25 28
## 10 S10 26 31 31 32 36
## 11 S11 29 20 25 26 25
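# The same wide reshape in one step with pivot_wider() (a sketch; dtaW2 is an
# illustrative name):
dtaW2 <- pivot_wider(dta6, names_from = Position, values_from = Response_Time,
  names_prefix = "Pos_") %>% arrange(ID)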
dta7_1 <- MASS::Animals # brain and body weights for various species (MASS)
dta7_2 <- MASS::mammals # brain and body weights for land mammals (MASS)
dta7_1$Type = rownames(dta7_1) # keep the species names as a column
dta7_2$Type = rownames(dta7_2)
dta7 <- rbind(dta7_1, dta7_2) # stack the two data sets
dta7$type <- as.factor(dta7$Type)
sum(duplicated(dta7$type)) # number of duplicated species names
## [1] 23
dta7 = dta7[!duplicated(dta7$Type),]
sum(duplicated(dta7$Type))
## [1] 0
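# The deduplication above could equivalently use dplyr::distinct(); a sketch
# (dta7_unique is an illustrative name):
dta7_unique <- distinct(dta7, Type, .keep_all = TRUE) # keep the first row per species name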
dta8 <- Prestige # Canadian occupational prestige data (car package)
str(dta8)
## 'data.frame': 102 obs. of 6 variables:
## $ education: num 13.1 12.3 12.8 11.4 14.6 ...
## $ income : int 12351 25879 9271 8865 8403 11030 8258 14163 11377 11023 ...
## $ women : num 11.16 4.02 15.7 9.11 11.68 ...
## $ prestige : num 68.8 69.1 63.4 56.8 73.5 77.6 72.6 78.1 73.1 68.8 ...
## $ census : int 1113 1130 1171 1175 2111 2113 2133 2141 2143 2153 ...
## $ type : Factor w/ 3 levels "bc","prof","wc": 2 2 2 2 2 2 2 2 2 2 ...
# median prestige score for each of the three types of occupation
aggregate(prestige~type, data = dta8, median)
## type prestige
## 1 bc 35.9
## 2 prof 68.4
## 3 wc 41.5
# label each occupation as above ("high") or at/below ("low") the median
# prestige within its type
dta8 <- dta8 %>% group_by(type) %>% mutate(label = ifelse(prestige > median(prestige),"high","low"))
dta8 <- na.omit(dta8) # drop occupations with missing values (e.g., missing type)
dta8$label <- as.factor(dta8$label)
dta8L <- dta8 %>% group_by(type,label) %>% do(tmp = data.frame(.)) # one nested data frame per group
# plot income against education, colored by prestige level, faceted by type
ggplot(dta8, aes(x = education, y = income, color = label)) +
  geom_point() +
  facet_grid(~type) +
  scale_color_discrete("Level of prestige", labels = c("High", "Low")) +
  theme_bw() +
  theme(legend.position = c(.15, .8), legend.background = element_rect(color = "black", fill = "white"))
# correlation between income and education within each type-by-label group
for(i in 1:6){
  dta8L$cor[i] <- cor(dta8L$tmp[[i]]$income, dta8L$tmp[[i]]$education)
}
dta8L
## Source: local data frame [6 x 4]
## Groups: <by row>
##
## # A tibble: 6 × 4
## type label tmp cor
## * <fctr> <fctr> <list> <dbl>
## 1 bc high <data.frame [20 × 7]> 4.50e-01
## 2 bc low <data.frame [24 × 7]> 5.74e-02
## 3 prof high <data.frame [15 × 7]> 1.66e-06
## 4 prof low <data.frame [16 × 7]> 3.73e-01
## 5 wc high <data.frame [11 × 7]> 2.77e-01
## 6 wc low <data.frame [12 × 7]> -8.06e-02
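# The same six correlations can be computed without do() or an explicit loop;
# a minimal sketch using group_by() + summarise():
dta8 %>% group_by(type, label) %>% summarise(cor = cor(income, education))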