2. A subset of data from the National Longitudinal Survey of Youth is presented in a long format.
1. Convert the dataset to a wide format.
Recode grade 0, 1 and 2 to “kindergarten” “first” and “second”.
# I am not sure whether my solution is good or not
# Read the long-format NLSY data from a comma-separated file; the first line
# is treated as the column names. Arguments are spelled out in full
# (`header = TRUE`) rather than relying on partial matching and on `T`,
# which is an ordinary variable that can be reassigned.
long <- read.table("data/nlsy86long.csv", header = TRUE, sep = ",")
# head(long[order(long$id), ], 20)
library(reshape);library(reshape2)
Attaching package: 'reshape2'
The following objects are masked from 'package:reshape':
colsplit, melt, recast
# Recode grades 0, 1 and 2 to their names via a lookup; every other grade
# keeps its number, stored as text because the column becomes character.
grade_names <- c("kindergarten", "first", "second")
name_idx <- match(long$grade, c(0, 1, 2))
has_name <- !is.na(name_idx)
grade_chr <- as.character(long$grade)
grade_chr[has_name] <- grade_names[name_idx[has_name]]
long$grade <- grade_chr

# Long -> wide: one row per id/sex/race combination, with the remaining
# columns spread into <variable>.<time> columns.
subject_vars <- c("id", "sex", "race")
wide <- reshape(long, idvar = subject_vars, timevar = "time", direction = "wide")

# Show the first rows, sorted by id.
wide_by_id <- wide[order(wide$id), ]
head(wide_by_id)
id sex race grade.1 year.1 month.1 math.1 read.1
155 1003 Male Minority kindergarten 5 60 11.90476 10.71429
156 1012 Male Minority kindergarten 6 75 27.38095 21.42857
157 1013 Female Minority kindergarten 6 69 14.28571 21.42857
158 1019 Female Minority first 7 79 26.19048 29.76190
159 1034 Male Minority kindergarten 6 72 23.80952 20.23810
160 1044 Female Minority kindergarten 5 64 16.66667 13.09524
grade.2 year.2 month.2 math.2 read.2 grade.3 year.3 month.3
155 second 8 91 33.33333 36.90476 3 10 116
156 second 9 103 58.33333 53.57143 4 11 127
157 3 8 99 29.76190 47.61905 5 10 124
158 second 9 107 29.76190 58.33333 5 11 134
159 second 9 102 35.71429 30.95238 4 11 126
160 second 8 92 34.52381 35.71429 3 10 116
math.3 read.3 grade.4 year.4 month.4 math.4 read.4
155 27.38095 36.90476 5 12 138 39.28571 45.23810
156 66.66667 66.66667 6 13 151 61.90476 69.04762
157 42.85714 55.95238 6 12 146 53.57143 69.04762
158 50.00000 53.57143 6 13 156 52.38095 75.00000
159 47.61905 36.90476 6 13 150 51.19048 41.66667
160 54.76190 61.90476 5 12 139 72.61905 66.66667
3. Given a vector of numerical values, find the most common element (mode) for it. Report only the element and the number of its occurrences.
#For example, use
# Generate (with replacement) 100 random integers from 1 to 100.
num <- sample(100, replace = TRUE)

# Tabulate each value once, then keep only the entry (or entries, if several
# values tie) with the highest count. Printing it reports just the mode and
# its number of occurrences, as the exercise asks, rather than the top five.
num_counts <- table(num)
num_mode <- num_counts[num_counts == max(num_counts)]
num_mode
num
79 19 24 43 51
4 3 3 3 3
5. An instructor has 12 students in his class:
Amy, Brad, Chad, Daisy, Ed, Fey, George, Hillary, Ike, Jane, Kim, Luke.
For class assignment, he needs to permute these names at random. Help him accomplish it with R.
# The class roster, in alphabetical order.
name <- c(
  "Amy", "Brad", "Chad", "Daisy", "Ed", "Fey",
  "George", "Hillary", "Ike", "Jane", "Kim", "Luke"
)

# Fix the seed so the random ordering is reproducible, then print the roster
# indexed by a random permutation of its positions.
set.seed(2016)
name[sample.int(length(name))]
[1] "Chad" "Brad" "Ike" "Kim" "Daisy" "Amy" "Hillary"
[8] "Ed" "George" "Fey" "Luke" "Jane"
7. The ‘cabbages’ data set in the ‘MASS’ package has 4 variables: Cult, Date, HeadWt, VitC. Ignore the last variable and make ‘Date’ a numeric variable. Plot 95% CIs for the variable ‘HeadWt’ by ‘Cult’ and ‘Date’, jointly.
library(MASS);library(ggplot2);head(cabbages)
Cult Date HeadWt VitC
1 c39 d16 2.5 51
2 c39 d16 2.2 55
3 c39 d16 3.1 45
4 c39 d16 4.3 42
5 c39 d16 2.5 53
6 c39 d16 4.3 50
# Make Date numeric by stripping the leading "d" from labels such as "d16",
# so the variable carries the day number itself rather than an arbitrary
# 1/2/3 code. Re-running this line on an already numeric Date is harmless.
cabbages$Date <- as.numeric(sub("^d", "", as.character(cabbages$Date)))

# Mean of x together with the limits of a t-based 95% confidence interval
# for that mean (mean +/- t_{0.975, n-1} * sd / sqrt(n)).
mean_ci95 <- function(x) {
  n <- length(x)
  half_width <- qt(0.975, df = n - 1) * sd(x) / sqrt(n)
  c(Mean = mean(x), Lower = mean(x) - half_width, Upper = mean(x) + half_width)
}

# One row per Date x Cult cell. aggregate() returns the three statistics as a
# single matrix column, so unpack it into ordinary columns Mean, Lower, Upper.
cell_stats <- aggregate(HeadWt ~ Date + Cult, data = cabbages, FUN = mean_ci95)
cabbages_SE <- cbind(
  cell_stats[c("Date", "Cult")],
  as.data.frame(cell_stats$HeadWt)
)

# Plot the cell means with their 95% CIs; the two cultivars are dodged
# side by side at each date so the intervals do not overlap.
dodge <- position_dodge(width = 0.6)
ggplot(cabbages_SE, aes(x = Date, y = Mean, colour = Cult)) +
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.4, position = dodge) +
  geom_point(position = dodge) +
  scale_x_continuous(breaks = sort(unique(cabbages_SE$Date))) +
  labs(x = "Date", y = "HeadWt (mean with 95% CI)")
8. The ‘HairEyeColor’ data set is a built-in table array. Use it to explore the ‘melt’ and ‘cast’ functionalities provided by the ‘reshape2’ package.
library(reshape2);library(reshape)
#head(HairEyeColor)
# Melt the 3-way table into long format: one row per Hair x Eye x Sex cell,
# with the count in `value`. The call is namespace-qualified because both
# reshape and reshape2 are attached and each provides its own melt().
dta1 <- reshape2::melt(HairEyeColor)

# Cast back to wide with reshape2's own casting function, dcast(), instead of
# reshape::cast(). Each cell is the mean count over the dimension left out of
# the formula (Sex in the first table, Eye in the second).
reshape2::dcast(dta1, Hair ~ Eye, fun.aggregate = mean, value.var = "value")
reshape2::dcast(dta1, Hair ~ Sex, fun.aggregate = mean, value.var = "value")
Hair Blue Brown Green Hazel
1 Black 10.0 34.0 2.5 7.5
2 Blond 47.0 3.5 8.0 5.0
3 Brown 42.0 59.5 14.5 27.0
4 Red 8.5 13.0 7.0 7.0
Hair Female Male
1 Black 13.00 14.00
2 Blond 20.25 11.50
3 Brown 35.75 35.75
4 Red 9.25 8.50
9. The ‘MASS’ library has these two data sets: ‘Animals’ and ‘mammals’. Merge the two files and remove duplicated observations using ‘duplicated’.
library(MASS)
# Stack the two MASS data sets on top of each other.
dta <- rbind(Animals, mammals)

# duplicated() flags every row whose values already occurred in an earlier
# row; keep only the rows that are not flagged.
is_repeat <- duplicated(dta)
dta_nondu <- dta[!is_repeat, ]

# Print alphabetically by row name so the result is easy to check by eye.
row_order <- order(rownames(dta_nondu))
dta_nondu[row_order, ] # for proving
body brain
African elephant 6654.000 5712.00
African giant pouched rat 1.000 6.60
Arctic fox 3.385 44.50
Arctic ground squirrel 0.920 5.70
Asian elephant 2547.000 4603.00
Baboon 10.550 179.50
Big brown bat 0.023 0.30
Brachiosaurus 87000.000 154.50
Brazilian tapir 160.000 169.00
Cat 3.300 25.60
Chimpanzee 52.160 440.00
Chinchilla 0.425 6.40
Cow 465.000 423.00
Desert hedgehog 0.550 2.40
Dipliodocus 11700.000 50.00
Donkey 187.100 419.00
E. American mole 0.075 1.20
Echidna 3.000 25.00
European hedgehog 0.785 3.50
Galago 0.200 5.00
Genet 1.410 17.50
Giant armadillo 60.000 81.00
Giraffe 529.000 680.00
Goat 27.660 115.00
Golden hamster 0.120 1.00
Gorilla 207.000 406.00
Grey seal 85.000 325.00
Grey wolf 36.330 119.50
Ground squirrel 0.101 4.00
Guinea pig 1.040 5.50
Horse 521.000 655.00
Human 62.000 1320.00
Jaguar 100.000 157.00
Kangaroo 35.000 56.00
Lesser short-tailed shrew 0.005 0.14
Little brown bat 0.010 0.25
Mole 0.122 3.00
Mountain beaver 1.350 8.10
Mouse 0.023 0.40
Musk shrew 0.048 0.33
N.A. opossum 1.700 6.30
Nine-banded armadillo 3.500 10.80
Okapi 250.000 490.00
Owl monkey 0.480 15.50
Phalanger 1.620 11.40
Pig 192.000 180.00
Potar monkey 10.000 115.00
Rabbit 2.500 12.10
Raccoon 4.288 39.20
Rat 0.280 1.90
Red fox 4.235 50.40
Rhesus monkey 6.800 179.00
Rock hyrax-a 0.750 12.30
Rock hyrax-b 3.600 21.00
Roe deer 14.830 98.20
Sheep 55.500 175.00
Slow loris 1.400 12.50
Star-nosed mole 0.060 1.00
Tenrec 0.900 2.60
Tree hyrax 2.000 12.30
Tree shrew 0.104 2.50
Triceratops 9400.000 70.00
Verbet 4.190 58.00
Water opossum 3.500 3.90
Yellow-bellied marmot 4.050 17.00
The data set lq2002{multilevel} contains responses from a sample of 2,042 soldiers to a survey of 19 items. Items 1-11 measure leadership climate, items 12-14 measure task significance, and items 15-19 measure the hostility felt by the soldier. The soldiers came from 49 companies. For this problem, use only the company with the largest number of soldiers and only the hostility items.
(a) Convert the scores greater than or equal to 4 to 1 and 0 otherwise for all 5 items.
library(multilevel)
Loading required package: nlme
# Load the survey data shipped with the multilevel package.
data(lq2002) # head(lq2002)

# Number of soldiers per company, printed from smallest to largest, so the
# biggest company appears last.
company_sizes <- table(lq2002$COMPID)
sort(company_sizes)
14 17 37 55 6 9 19 48 57 47 7 49 56 31 54 58 26 16 50 2 22 10 30 38 45
10 10 10 11 12 13 13 13 13 14 15 18 18 19 19 20 21 23 23 24 25 29 29 30 30
3 24 52 25 4 21 23 42 5 35 44 20 28 32 27 43 33 34 41 29 15 46 18 13
37 37 39 41 45 53 54 54 58 58 63 68 68 68 73 74 77 78 78 85 89 90 94 99
# Find the company with the most soldiers from the data itself instead of
# hard-coding its ID (which.max() returns the first one if several tie).
company_sizes <- table(lq2002$COMPID)
largest_company <- names(company_sizes)[which.max(company_sizes)]
dta <- lq2002[which(as.character(lq2002$COMPID) == largest_company), ]

# Address the five hostility items by name (HOSTIL01 ... HOSTIL05) rather
# than by column position, so the code does not depend on column order.
hostil_items <- sprintf("HOSTIL%02d", 1:5)
# head(dta[hostil_items])

# Dichotomise each item: scores of 4 or more become 1, everything else 0.
# lapply() works column by column and avoids apply()'s detour through a matrix.
dta[hostil_items] <- lapply(dta[hostil_items], function(x) ifelse(x >= 4, 1, 0))
head(dta[hostil_items])
HOSTIL01 HOSTIL02 HOSTIL03 HOSTIL04 HOSTIL05
234 0 0 1 0 0
235 0 0 0 0 0
236 1 1 1 1 0
237 1 1 0 0 0
238 0 0 1 0 0
239 0 0 0 0 0
(b) Compute the proportions of 1’s for the five items and present them in descending order.
# Proportion of 1's per hostility item (the mean of a 0/1 column), sorted from
# largest to smallest as the exercise asks. The column means are computed once.
sort(colMeans(dta[sprintf("HOSTIL%02d", 1:5)]), decreasing = TRUE)
HOSTIL03 HOSTIL01 HOSTIL02 HOSTIL04 HOSTIL05
0.13131313 0.11111111 0.06060606 0.04040404 0.02020202