Exercise: Data manipulation

2. A subset of data from the National Longitudinal Survey of Youth is presented in a long format.
1. Convert the dataset to a wide format.
Recode grades 0, 1 and 2 as “kindergarten”, “first” and “second”.

# I am not sure whether my solution is good or not 

# Read the long-format NLSY data from a comma-separated file with a header row.
# Arguments are written out in full: `h = T` depended on partial argument
# matching and on `T`, which is an ordinary variable that can be reassigned.
long <- read.table("data/nlsy86long.csv", header = TRUE, sep = ",")
#head(long[order(long$id),],20)
library(reshape);library(reshape2)


Attaching package: 'reshape2'

The following objects are masked from 'package:reshape':

    colsplit, melt, recast

# Map grades 0/1/2 to their names through a lookup vector; any other grade
# keeps its original value (stored as text once labels share the column).
grade_names <- c("kindergarten", "first", "second")
grade_label <- grade_names[match(long$grade, 0:2)]
long$grade <- ifelse(is.na(grade_label), long$grade, grade_label)

# Long -> wide: one row per id/sex/race combination, with the remaining
# columns repeated once per value of `time` (named <column>.<time>).
wide <- reshape(
  long,
  direction = "wide",
  idvar = c("id", "sex", "race"),
  timevar = "time"
)
head(wide[order(wide[["id"]]), ])

      id    sex     race      grade.1 year.1 month.1   math.1   read.1
155 1003   Male Minority kindergarten      5      60 11.90476 10.71429
156 1012   Male Minority kindergarten      6      75 27.38095 21.42857
157 1013 Female Minority kindergarten      6      69 14.28571 21.42857
158 1019 Female Minority        first      7      79 26.19048 29.76190
159 1034   Male Minority kindergarten      6      72 23.80952 20.23810
160 1044 Female Minority kindergarten      5      64 16.66667 13.09524
    grade.2 year.2 month.2   math.2   read.2 grade.3 year.3 month.3
155  second      8      91 33.33333 36.90476       3     10     116
156  second      9     103 58.33333 53.57143       4     11     127
157       3      8      99 29.76190 47.61905       5     10     124
158  second      9     107 29.76190 58.33333       5     11     134
159  second      9     102 35.71429 30.95238       4     11     126
160  second      8      92 34.52381 35.71429       3     10     116
      math.3   read.3 grade.4 year.4 month.4   math.4   read.4
155 27.38095 36.90476       5     12     138 39.28571 45.23810
156 66.66667 66.66667       6     13     151 61.90476 69.04762
157 42.85714 55.95238       6     12     146 53.57143 69.04762
158 50.00000 53.57143       6     13     156 52.38095 75.00000
159 47.61905 36.90476       6     13     150 51.19048 41.66667
160 54.76190 61.90476       5     12     139 72.61905 66.66667

3. Given a vector of numerical values, find its most common element (the mode). Report only that element and the number of times it occurs.

#For example, use
num <- sample(100, replace = TRUE)  # to generate (with replacement) a list of random integers from 1 to 100.
# Count every distinct value, then keep only the value(s) that reach the
# highest count: the mode and its number of occurrences. If several values
# tie for the highest count, all of them are reported.
num_counts <- table(num)
num_mode <- num_counts[num_counts == max(num_counts)]
num_mode

num
79 
 4 

5. An instructor has 12 students in his class:
Amy, Brad, Chad, Daisy, Ed, Fey, George, Hillary, Ike, Jane, Kim, Luke.
For class assignment, he needs to permute these names at random. Help him accomplish it with R.

# The twelve students in the class.
name <- c(
  "Amy", "Brad", "Chad", "Daisy", "Ed", "Fey",
  "George", "Hillary", "Ike", "Jane", "Kim", "Luke"
)
# Fix the seed so the shuffle is reproducible, then draw every name exactly
# once in random order.
set.seed(2016)
sample(name, size = length(name), replace = FALSE)

 [1] "Chad"    "Brad"    "Ike"     "Kim"     "Daisy"   "Amy"     "Hillary"
 [8] "Ed"      "George"  "Fey"     "Luke"    "Jane"

7. The ‘cabbages’ data set in the ‘MASS’ package has 4 variables: Cult, Date, HeadWt, VitC. Ignore the last variable and make ‘Date’ a numeric variable. Plot 95% CIs for the variable ‘HeadWt’ by ‘Cult’ and ‘Date’, jointly.

library(MASS);library(ggplot2);head(cabbages)

  Cult Date HeadWt VitC
1  c39  d16    2.5   51
2  c39  d16    2.2   55
3  c39  d16    3.1   45
4  c39  d16    4.3   42
5  c39  d16    2.5   53
6  c39  d16    4.3   50

# Ignore VitC and make Date numeric by stripping the leading "d"
# ("d16" -> 16, ...). Unlike a chain of ifelse() recodes, this can be re-run
# on an already converted column without changing it.
cabbages <- cabbages[, c("Cult", "Date", "HeadWt")]
cabbages$Date <- as.numeric(sub("^d", "", as.character(cabbages$Date)))

# Mean and t-based 95% confidence interval of the mean for one group.
# (The 5th/95th percentiles of the raw values would describe a 90% range of
# the data, not a 95% confidence interval.)
ci_95 <- function(x) {
  n <- length(x)
  half_width <- qt(0.975, df = n - 1) * sd(x) / sqrt(n)
  c(mean = mean(x), lower = mean(x) - half_width, upper = mean(x) + half_width)
}

# One row per Cult x Date cell. aggregate() returns the three statistics as a
# single matrix column, so flatten it into ordinary columns before renaming.
cabbages_ci <- aggregate(HeadWt ~ Date + Cult, data = cabbages, FUN = ci_95)
cabbages_ci <- do.call(data.frame, cabbages_ci)
colnames(cabbages_ci) <- c("Date", "Cult", "mean", "lower", "upper")

# Point = cell mean, vertical bar = 95% CI; the two cultivars are dodged so
# their intervals at the same date do not overlap.
ggplot(cabbages_ci, aes(x = Date, y = mean, colour = Cult)) +
  geom_line(position = position_dodge(width = 0.5)) +
  geom_pointrange(aes(ymin = lower, ymax = upper),
                  position = position_dodge(width = 0.5)) +
  labs(x = "Date", y = "HeadWt (mean with 95% CI)")

8. The ‘HairEyeColor’ data set is a built-in table array. Use it to explore the ‘melt’ and ‘cast’ functionalities provided by the ‘reshape2’ package.

library(reshape2);library(reshape)
#head(HairEyeColor)
# Melt the 3-way table into long format: one row per cell of the table.
dta1 <- melt(HairEyeColor)
# Cast back to wide, averaging over whatever is left out of each formula.
# NOTE(review): cast() comes from the older 'reshape' package; reshape2's own
# counterpart is dcast().
cast(dta1, Hair ~ Eye, fun.aggregate = mean)
cast(dta1, Hair ~ Sex, fun.aggregate = mean)

   Hair Blue Brown Green Hazel
1 Black 10.0  34.0   2.5   7.5
2 Blond 47.0   3.5   8.0   5.0
3 Brown 42.0  59.5  14.5  27.0
4   Red  8.5  13.0   7.0   7.0

   Hair Female  Male
1 Black  13.00 14.00
2 Blond  20.25 11.50
3 Brown  35.75 35.75
4   Red   9.25  8.50

9. The ‘MASS’ library has these two data sets: ‘Animals’ and ‘mammals’. Merge the two files and remove duplicated observations using ‘duplicated’.

library(MASS)
# Stack both data sets, flag every row whose values repeat an earlier row,
# and keep only the first occurrence of each.
dta <- rbind(Animals, mammals)
is_repeat <- duplicated(dta)
dta_nondu <- dta[!is_repeat, ]
# Print alphabetically by row name so the result is easy to check by eye.
dta_nondu[sort(rownames(dta_nondu)), ]

                               body   brain
African elephant           6654.000 5712.00
African giant pouched rat     1.000    6.60
Arctic fox                    3.385   44.50
Arctic ground squirrel        0.920    5.70
Asian elephant             2547.000 4603.00
Baboon                       10.550  179.50
Big brown bat                 0.023    0.30
Brachiosaurus             87000.000  154.50
Brazilian tapir             160.000  169.00
Cat                           3.300   25.60
Chimpanzee                   52.160  440.00
Chinchilla                    0.425    6.40
Cow                         465.000  423.00
Desert hedgehog               0.550    2.40
Dipliodocus               11700.000   50.00
Donkey                      187.100  419.00
E. American mole              0.075    1.20
Echidna                       3.000   25.00
European hedgehog             0.785    3.50
Galago                        0.200    5.00
Genet                         1.410   17.50
Giant armadillo              60.000   81.00
Giraffe                     529.000  680.00
Goat                         27.660  115.00
Golden hamster                0.120    1.00
Gorilla                     207.000  406.00
Grey seal                    85.000  325.00
Grey wolf                    36.330  119.50
Ground squirrel               0.101    4.00
Guinea pig                    1.040    5.50
Horse                       521.000  655.00
Human                        62.000 1320.00
Jaguar                      100.000  157.00
Kangaroo                     35.000   56.00
Lesser short-tailed shrew     0.005    0.14
Little brown bat              0.010    0.25
Mole                          0.122    3.00
Mountain beaver               1.350    8.10
Mouse                         0.023    0.40
Musk shrew                    0.048    0.33
N.A. opossum                  1.700    6.30
Nine-banded armadillo         3.500   10.80
Okapi                       250.000  490.00
Owl monkey                    0.480   15.50
Phalanger                     1.620   11.40
Pig                         192.000  180.00
Potar monkey                 10.000  115.00
Rabbit                        2.500   12.10
Raccoon                       4.288   39.20
Rat                           0.280    1.90
Red fox                       4.235   50.40
Rhesus monkey                 6.800  179.00
Rock hyrax-a                  0.750   12.30
Rock hyrax-b                  3.600   21.00
Roe deer                     14.830   98.20
Sheep                        55.500  175.00
Slow loris                    1.400   12.50
Star-nosed mole               0.060    1.00
Tenrec                        0.900    2.60
Tree hyrax                    2.000   12.30
Tree shrew                    0.104    2.50
Triceratops                9400.000   70.00
Verbet                        4.190   58.00
Water opossum                 3.500    3.90
Yellow-bellied marmot         4.050   17.00

The data set lq2002{multilevel} contains responses from a sample of 2,042 soldiers to a survey of 19 items. Items 1-11 measure leadership climate, items 12-14 measure task significance, and items 15-19 measure the hostility felt by the soldier. The soldiers came from 49 companies. For this problem, use only the company with the largest number of soldiers and only the hostility items.
(a) Convert the scores greater than or equal to 4 to 1 and 0 otherwise for all 5 items.

library(multilevel)

Loading required package: nlme

data(lq2002)  # head(lq2002)
# Number of soldiers per company, smallest company first.
company_sizes <- table(lq2002$COMPID)
sort(company_sizes)


14 17 37 55  6  9 19 48 57 47  7 49 56 31 54 58 26 16 50  2 22 10 30 38 45 
10 10 10 11 12 13 13 13 13 14 15 18 18 19 19 20 21 23 23 24 25 29 29 30 30 
 3 24 52 25  4 21 23 42  5 35 44 20 28 32 27 43 33 34 41 29 15 46 18 13 
37 37 39 41 45 53 54 54 58 58 63 68 68 68 73 74 77 78 78 85 89 90 94 99

# Take the company with the most soldiers from the data itself instead of
# hard-coding the id read off the printed table.
company_sizes <- table(lq2002$COMPID)
largest_company <- names(company_sizes)[which.max(company_sizes)]
dta <- subset(lq2002, as.character(COMPID) == largest_company)
#head(dta[,17:21])
# Find the hostility items by name (HOSTIL01 ... HOSTIL05) rather than by
# column position.
hostility_items <- grep("^HOSTIL[0-9]+$", names(dta), value = TRUE)
# Dichotomise each item: 1 for scores >= 4, 0 otherwise. lapply() works column
# by column and avoids the data.frame -> matrix coercion done by apply().
dta[hostility_items] <- lapply(
  dta[hostility_items],
  function(x) ifelse(x >= 4, 1, 0)
)
head(dta[, hostility_items])

    HOSTIL01 HOSTIL02 HOSTIL03 HOSTIL04 HOSTIL05
234        0        0        1        0        0
235        0        0        0        0        0
236        1        1        1        1        0
237        1        1        0        0        0
238        0        0        1        0        0
239        0        0        0        0        0

(b) Compute the proportions of 1’s for the five items and present them in descending order.

# Proportion of 1's for each hostility item (the mean of a 0/1 column),
# presented in descending order as the exercise asks; order() on its own
# sorts ascending.
item_props <- vapply(dta[, 17:21], mean, numeric(1))
sort(item_props, decreasing = TRUE)

  HOSTIL03   HOSTIL01   HOSTIL02   HOSTIL04   HOSTIL05 
0.13131313 0.11111111 0.06060606 0.04040404 0.02020202

Exercise:Data manipulation_2,3,5,7,8,9,

Chi-Lin Yu