homework0423

library(ggplot2)
library(tidyr)
library(dplyr)
library(tidyverse)
library(broom)
library(ggExtra)
library(ggrepel)
library(ggthemes)

Ex2

dta2 <- read.table("langMathDutch.txt", header = TRUE) %>% 
  mutate(Size = cut(size, quantile(size, probs = c(0, .33, .67, 1)),
                    c("Small", "Midium", "Large"), 
                    include.lowest = TRUE, ordered = TRUE),
         IQ = cut(IQV, quantile(IQV, probs = c(0, .33, .67, 1)), 
                  c("Low", "Middle", "High"), 
                  include.lowest = TRUE, ordered = TRUE),
         Size.IQ = factor(paste(Size, IQ, sep = "."), 
                          levels = c("Small.Low", "Small.Middle", "Small.High",
                                     "Midium.Low", "Midium.Middle", "Midium.High",
                                     "Large.Low", "Large.Middle", "Large.High")))

ggplot(dta2, aes(lang, arith))+
  geom_point()+
  stat_smooth(method = "lm")+
  facet_wrap(~ Size.IQ)+
  scale_y_continuous(breaks = seq(5, 35, 5))+
  labs(x = "Language Socre", y = "Arithmetic Score")

Ex3

dta3 <- read.table("stateAnxiety.txt", header = TRUE) %>%
  gather(key = "key", value = "Anxiety") %>%
  mutate(Gender = c(rep("Female", 250), rep("Male", 250)),
         ID = rep(1:50, 10)) %>%
  mutate(Week = rep(c(rep(1, 50), rep(2, 50), rep(3, 50), 
                      rep(4, 50), rep(5, 50)), 2))

Q1 Whether there are gender difference in state anxiety

pd <- position_dodge(.2)
ggplot(dta3, aes(Week, Anxiety, group = Gender, color = Gender)) +
  geom_point(position = pd) +
  stat_smooth(aes(color = Gender), method = "lm") +
  labs(x = "Weeks", y = "Anxiety score") +
  theme(legend.direction = "horizontal") +
  theme(legend.position = "top")

考前焦慮程度有性別差異，隨著時間增加，女性平均焦慮指數高於男性

Q2 Individual differences in state anxiety

ggplot(dta3, aes(Week, Anxiety, group = ID)) +
  geom_point() +
  stat_smooth(method = "lm", se = F) +
  labs(x = "Weeks", y = "Anxiety score")

焦慮程度並無顯著個別差異

Ex4

dta4 <- read.table("math_attainment.txt", header = T)

ggplot(dta4, aes(math1, math2, color = cc))+
  geom_point(size = rel(2))+
  stat_smooth(method = "lm", color = "steelblue")+
  theme(legend.position = c(.8, .2))+
  labs(x = "Math score at Year 1",
       y = "Math score at Year 2", 
       color = "Curriculum coverage")

m <- lm(math2 ~ -1 + math1 + cc, dta4)


dta4 %>% mutate(s.resi = scale(residuals(m)),pred = fitted.values(m)) %>% 
  ggplot(., aes(pred, s.resi))+
  geom_point()+ geom_hline(yintercept = 0, linetype = "dotted")+
  labs(x = "Fitted values", y = "Standardized residuals")

ggqqplot <- function(data) 
{
  y <- quantile(data[!is.na(data)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y)/diff(x)
  int <- y[1L] - slope * x[1L]
  d <- data.frame(resids = data)
  ggplot(d, aes(sample = resids))+stat_qq()+
    geom_abline(slope = slope, intercept = int)

}

ggqqplot(scale(resid(m)))

dta4 %>% mutate(pred = fitted.values(m)) %>% 
  ggplot(., aes(math1, math2, color = cc))+
  geom_point(alpha = .7, pch = 21, size = rel(2))+
  geom_point(aes(math1, pred, color = cc), size = rel(2))+
  stat_smooth(aes(math1, pred, color = cc), method = "lm", color = "steelblue")+
  theme(legend.position = c(.8, .2))+
  labs(x = "Math score at Year 1",
       y = "Fitted Values Math score at Year 2", 
       color = "Curriculum coverage")

tidy(m, conf.int = TRUE) %>% 
  ggplot(., aes(term, estimate))+
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high))+
  geom_hline(yintercept = 0, linetype = "dotted")+
  scale_x_discrete(labels = c("Curriculum coverage", "Math score at Year 1"))+
  labs(x = "term", y = "parameter coefficients")

Ex5

dta5 <- read.table("hs0.txt", h = T)

cor <- cov(dta5$write, dta5$read)
v <- matrix(c(1,cor,cor,1), 2, 2)
slope <- eigen(v)$vector[1, 1]
intercept <- mean(dta5$write)-slope*mean(dta5$read)

# plot
p <- ggplot(dta5, aes(read, write)) +
  geom_point(shape = 21) +
  stat_ellipse() +
  geom_abline(intercept = 0, slope = 1, color = "gray") +
  geom_abline(intercept = intercept, slope = slope, color = "tomato", size = rel(2)) +
  geom_vline(xintercept = mean(dta5$read), color = "gray") +
  geom_hline(yintercept = mean(dta5$write), color = "gray") +
  stat_smooth(method = "lm", size = rel(1.1)) +
  xlim(25, 80) +
  ylim(25, 80) +
  labs(x = "Reading score", y = "Writing score")

library(ggExtra)
ggMarginal(p, type = "histogram",
           xparams = list(binwidth = (IQR(dta5$read)*2)/(200)^(1/3),
                          fill = "gray"),
           yparams = list(binwidth = (IQR(dta5$write)*2)/(200)^(1/3),
                          fill = "gray"))

Ex6

Ex7

dta7 <- as.data.frame(USPersonalExpenditure) %>%
  mutate(Item = rownames(USPersonalExpenditure)) %>%
  gather(key = "Year", value = "Expenditure", 1:5)  

qplot(log(Expenditure), Item, data = dta7) +
  geom_segment(aes(xend = 0, yend = Item)) +
  geom_vline(xintercept = 0, colour = "grey50") +
  facet_wrap(~ Year, nrow = 1)

Ex8

dta8 <- read.table("hs0.txt", h = T)

ggplot(dta8, aes(x = write, y = read, fill = female)) +
  geom_point(pch = 21, alpha = .5) +
  geom_density_2d(aes(color = ..level..)) +
  facet_wrap(~ female) +
  labs(x = "Writing score", y = "Reading score") +
  theme(legend.position = c(.05, .8))

Ex9

dta09 <- data.frame(Race = c("White", "White", "Black", "Black"),
                  Gender = c("Male", "Female", "Male", "Female"),
                  Yes = c(43, 26, 29, 22),  No = c(134, 149, 23, 36)) %>% 
  mutate(P_yes = Yes/(Yes+No), P_no = No/(Yes+No))

new.dta <- dta09 %>% select(1:4) %>% 
  gather(Intercourse, Freq, 3:4) %>%
  mutate(P = c(dta09$P_yes, dta09$P_no), SE = sqrt(P*(1-P)/Freq))


ggplot(new.dta, aes(Intercourse, P, fill = Gender))+
  geom_bar(position = "dodge", stat = "identity")+
  geom_errorbar(aes(ymin = P - SE, ymax = P + SE), position = "dodge")+
  facet_wrap(~ Race)+ labs(y = "Proportion")

Ex10

Ex11

library(MASS)
dta11 <- Cushings %>%
          mutate(Type = factor(Type, levels = c("u", "b", "c", "a"), 
                            labels = c("Unknown", "Bilateral Hyperplasia", "Carcinoma", "Adenoma")))


ggplot(dta11, aes(Tetrahydrocortisone, Pregnanetriol, fill = Type)) +
  geom_point(shape = 21, size = rel(2)) +
  theme_hc() +
  labs(x = "Tetrahydrocortisone (mg/24 hours)", y = "Pregnanetriol (mg/24 hours)",
       title = "Cushings's syndrome") +
  theme(plot.title = element_text(hjust = 1)) +
  geom_text_repel(data = dta11[c(1,16,17,26),], aes(label = Type, color = Type))

Ex12

library(car)
dta12 <- Vocab
boundaries <- c(0, 6, 9, 12, 16, 20)

#plot
with(dta12, hist(education, breaks= boundaries, xlab = "Education level(years)", axes = FALSE, main = NULL, col = "gray"))
axis(1, boundaries) ; axis(2)
abline(h = seq(.01, .14, .01), col = "lightgray", lty = "dotted")

homework0423

BHWang

20180507

Ex2

Ex3

Ex4

Ex5

Ex6

Ex7

Ex8

Ex9

Ex10

Ex11

Ex12