Exercise_Grammar of graphics

1.Fifty male and fifty female students fill out the same questionnaire in weekly intervals starting five weeks before an important examination to measure state anxiety. The research interests are: 1. whether there are gender difference in state anxiety 2. individual differences in state anxiety

# read data
anxiety <- read.table("data/stateAnxiety.txt",h=T)
head(anxiety)

  f1 f2 f3 f4 f5 m1 m2 m3 m4 m5
1 13 17 18 20 24  6 14 22 20 24
2 26 31 33 38 42  4 11 14 12 23
3 13 17 24 29 32 17 25 26 29 38
4 22 24 26 27 29 19 22 26 30 34
5 18 19 19 22 30 12 21 21 23 24
6 32 31 30 31 32 11 16 20 19 22

str(anxiety)

'data.frame':   50 obs. of  10 variables:
 $ f1: int  13 26 13 22 18 32 16 18 14 20 ...
 $ f2: int  17 31 17 24 19 31 16 22 17 19 ...
 $ f3: int  18 33 24 26 19 30 21 25 23 23 ...
 $ f4: int  20 38 29 27 22 31 27 29 21 25 ...
 $ f5: int  24 42 32 29 30 32 30 35 25 28 ...
 $ m1: int  6 4 17 19 12 11 14 9 12 11 ...
 $ m2: int  14 11 25 22 21 16 23 18 16 13 ...
 $ m3: int  22 14 26 26 21 20 26 20 23 17 ...
 $ m4: int  20 12 29 30 23 19 29 20 26 14 ...
 $ m5: int  24 23 38 34 24 22 33 24 32 20 ...

# reshape
require(reshape2)

Loading required package: reshape2

anx <- melt(anxiety, variable.name="value", value.name="score")

No id variables; using all as measure variables

anx$sex <- factor(rep(c("F","M"),each=250,len=500))
anx$week <- rep(-5:-1,each=50,len=500)
anx$id <- c(rep(1:50,5,len=250),rep(51:100,5,250))
head(anx)

  value score sex week id
1    f1    13   F   -5  1
2    f1    26   F   -5  2
3    f1    13   F   -5  3
4    f1    22   F   -5  4
5    f1    18   F   -5  5
6    f1    32   F   -5  6

str(anx)

'data.frame':   500 obs. of  5 variables:
 $ value: Factor w/ 10 levels "f1","f2","f3",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ score: int  13 26 13 22 18 32 16 18 14 20 ...
 $ sex  : Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
 $ week : int  -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 ...
 $ id   : int  1 2 3 4 5 6 7 8 9 10 ...

library(ggplot2)
# first plot
ggplot(data = anx, aes(x = week, y = score, group = id, color = sex)) +
    geom_line() +
    labs(x = "week", y = "anxiety")

# second plot
ggplot(anx, aes(x = week, y = score, color=sex)) + 
    geom_point() +
    labs(x = "week", y = "anxiety") +
    stat_smooth(method="lm")

2.The data set is concerned with grade 8 pupils (age about 11 years) in elementary schools in the Netherlands. After deleting pupils with missing values, the number of pupils is 2,287 and the number of schools is 131. Class size ranges from 4 to 35. The response variables are score on a language test and that on an arithmetic test. The research intest is on how the two test scores depend on the pupil’s intelligence (verbal IQ) and on the number of pupils in a school class.

langmath <- read.table("data/langMathDutch.txt",h=T)
head(langmath)

  school pupil  IQV size lang arith
1      1 17001 15.0   29   46    24
2      1 17002 14.5   29   45    19
3      1 17003  9.5   29   33    24
4      1 17004 11.0   29   46    26
5      1 17005  8.0   29   20     9
6      1 17006  9.5   29   30    13

str(langmath)

'data.frame':   2287 obs. of  6 variables:
 $ school: int  1 1 1 1 1 1 1 1 1 1 ...
 $ pupil : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
 $ IQV   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
 $ size  : int  29 29 29 29 29 29 29 29 29 29 ...
 $ lang  : int  46 45 33 46 20 30 30 57 36 36 ...
 $ arith : int  24 19 24 26 9 13 13 30 23 22 ...

# The class size is categorized into small, medium, and large 
langmath$classsize <- cut(langmath$size, 3, labels=c("Small", "Medium", "Large"))
# The verbal IQ is categorized into low, middle and high
langmath$VIQ <- cut(langmath$IQV, 3, labels=c("Low", "Middle", "High"))
# combine
langmath$sizIQ <- paste(langmath$classsize, langmath$VIQ, sep=",")
langmath$sizIQ <- ordered(langmath$sizIQ,levels = c("Small,Low", "Small,Middle", "Small,High","Medium,Low", "Medium,Middle", "Medium,High","Large,Low", "Large,Middle", "Large,High"))
# draw
ggplot(data = langmath, aes(x = lang, y = arith)) +
    stat_smooth(method = "lm", formula = y ~ x) +
    geom_point(shape = 18) +
    facet_wrap(~sizIQ) +
    labs(x = "Language score", y = "Arithmetic score")

3.A sample of pupils whose mathematics attainment is tested at the end of Year 2 (seven years old). The coverage of mathematics curriculum each pupil received during Year 2 is measured; so is the mathematics attainment at the beginning of the year. There are 39 pupils in the study.Draw a graph of the mean test scores of year one and year two for the data set. Include the 95%-confidence intervals for the means.

math <- read.table("data/mathAttainment.txt",h=T)
mat <- math[,-3]
require(reshape2)
mat <- melt(mat, variable.name="Mathyear", value.name="Score")

No id variables; using all as measure variables

mat$Mathyear <- ordered(mat$Mathyear,levels = c("math1","math2"))
head(mat)

  Mathyear Score
1    math2    28
2    math2    56
3    math2    51
4    math2    13
5    math2    39
6    math2    41

str(mat)

'data.frame':   78 obs. of  2 variables:
 $ Mathyear: Ord.factor w/ 2 levels "math1"<"math2": 2 2 2 2 2 2 2 2 2 2 ...
 $ Score   : int  28 56 51 13 39 41 30 13 17 32 ...

s_se <- function(x) {
return(c(mean(x, na.rm = T), sd(x, na.rm = T)/sqrt(length(x)))) }
s0 <- aggregate(Score ~ Mathyear, data = mat, FUN = s_se)
s0 <- data.frame(s0[, 1], s0[, 2])
names(s0) <- c("year", "mean", "se")
ggplot(data = s0, aes(x = year, y = mean)) +
    geom_point(size = 4) +
    geom_errorbar(aes(ymin = mean - 2*se, ymax = mean + 2*se),width = .2) +
    labs(x = "Mean test scores of year one and year two", y = "Mean math score") +
    theme_bw()

5.Use the built-in data set USPersonalExpenditure in R for this problem. This data set consists of United States personal expenditures (in billions of dollars) in the categories; food and tobacco, household operation, medical and health, personal care, and private education for the years 1940, 1945, 1950, 1955 and 1960. Plot the US personal expenditure data in the style of the third plot in the “Time Use” case study. You might want to transform the dollar amounts to log base 10 unit first.

data(USPersonalExpenditure)
USP <- melt(USPersonalExpenditure, value.name="score")
names(USP) <- c("Categories","Year","Dollars")
head(USP)

           Categories Year Dollars
1    Food and Tobacco 1940  22.200
2 Household Operation 1940  10.500
3  Medical and Health 1940   3.530
4       Personal Care 1940   1.040
5   Private Education 1940   0.341
6    Food and Tobacco 1945  44.500

str(USP)

'data.frame':   25 obs. of  3 variables:
 $ Categories: Factor w/ 5 levels "Food and Tobacco",..: 1 2 3 4 5 1 2 3 4 5 ...
 $ Year      : int  1940 1940 1940 1940 1940 1945 1945 1945 1945 1945 ...
 $ Dollars   : num  22.2 10.5 3.53 1.04 0.341 44.5 15.5 5.76 1.98 0.974 ...

USP$Dollars <- log10(USP$Dollars)
dollarmean <- mean(USP$Dollars)
qplot(Dollars, Year, data = USP) +
  geom_segment(aes(xend = dollarmean, yend = Year)) +
  geom_vline(xintercept = dollarmean, colour = "grey") +
  facet_wrap(~ Categories, nrow = 1)

6.The data below give a cross-classification of 205 married persons by height.Plot the data table.

table1 <- array(c(18,20,12,28,51,25,14,28,9),dim=c(3,3),
                dimnames = list(Husband=c("Tall","Medium","Short"),
                                Wife=c("Tall","Medium","Short")) )
table1

        Wife
Husband  Tall Medium Short
  Tall     18     28    14
  Medium   20     51    28
  Short    12     25     9

likert(table1, main="",as.percent=T)

8.Use the math score of the high school students data set to reproduce the following graph with ggplot.

hs <- read.table("data/hs0.txt",h=T)
head(hs)

   id female  race    ses schtyp     prog read write math science socst
1  70   male white    low public  general   57    52   41      47    57
2 121 female white middle public vocation   68    59   53      63    61
3  86   male white   high public  general   44    33   54      58    31
4 141   male white   high public vocation   63    44   47      53    56
5 172   male white middle public academic   47    52   57      53    61
6 113   male white middle public academic   44    52   51      63    61

ggplot(hs) +
    geom_tile(aes(x=1, y=math, fill = math)) +
    scale_x_continuous(limits=c(0,2),breaks=1)+
    scale_fill_gradient2(low = 'navy', mid = 'white', high = 'green', midpoint = 50) +
    labs(x = " ",y = "Math score") +
    theme(axis.text.x = element_blank(),panel.background = element_blank(),
          panel.grid.major.y =  element_line(colour = "gray"))

9.In a national study of 15- and 16-year-old adolescents. The event of interest is ever having sexual intercourse.Turn the data table into the following plot.

# create data table
ta1 <- data.frame(Intercourse=c(rep("Yes",120),rep("No",342)))
ta1$Gender <- c(rep("Male",72),rep("Female",48),rep("Male",157),rep("Female",185))
ta1$Race <- c(rep("White",43),rep("Black",29),rep("White",26),rep("Black",22),
              rep("White",134),rep("Black",23),rep("White",149),rep("Black",36))
ta1$Gender <- as.factor(ta1$Gender)
ta1$Race <- as.factor(ta1$Race)
table1 <- with(ta1,table(Gender,Intercourse,Race))
# draw bar chart
ta2 <- data.frame(table1)
ta2

  Gender Intercourse  Race Freq
1 Female          No Black   36
2   Male          No Black   23
3 Female         Yes Black   22
4   Male         Yes Black   29
5 Female          No White  149
6   Male          No White  134
7 Female         Yes White   26
8   Male         Yes White   43

ta2$percent <- (ta2$Freq/sum(ta2$Freq))*100
ggplot(ta2, aes(x = Race, y = percent, fill = Gender)) +
    geom_bar(stat = "identity", position = "dodge") +
    scale_fill_manual(values = c("black","gray")) +
    coord_flip() +
    theme_bw()