4/23 homework

set up

pacman::p_load(tidyverse, broom, ggExtra, datasets, ggthemes, ggrepel)

ex2

## 切割資料
library(dplyr)
dta <- read.table("C:/Users/she22_000/Documents/langMathDutch.txt", header = TRUE) %>% 
  mutate(Size = cut(size, quantile(size, probs = c(0, .33, .67, 1)),
                          c("Small", "Midium", "Large"), 
                          include.lowest = TRUE, ordered = TRUE),
               IQ = cut(IQV, quantile(IQV, probs = c(0, .33, .67, 1)), 
                        c("Low", "Middle", "High"), 
                        include.lowest = TRUE, ordered = TRUE),
               Size.IQ = factor(paste(Size, IQ, sep = "."), 
                                levels = c("Small.Low", "Small.Middle", "Small.High",
                                           "Midium.Low", "Midium.Middle", "Midium.High",
                                           "Large.Low", "Large.Middle", "Large.High")))

## 繪圖
ggplot(dta, aes(lang, arith))+
  geom_point()+
  stat_smooth(method = "lm")+
  facet_wrap(~ Size.IQ)+
  scale_y_continuous(breaks = seq(5, 35, 5))+
  labs(x = "Language Socre", y = "Arithmetic Score")

ex3

dta03 <- read.table("C:/Users/she22_000/Documents/stateAnxiety.txt", header = TRUE)
dta03 <- dta03 %>%
  gather(key = "key", value = "Score") %>%
  mutate(Gender = c(rep("Female", 250), rep("Male", 250)),
         ID = rep(1:50, 10)) %>%
  mutate(Time = rep(c(rep(-5, 50), rep(-4, 50), rep(-3, 50), 
                      rep(-2, 50), rep(-1, 50)), 2)) 

pd <- position_dodge(.2)
ggplot(dta03, aes(Time, Score, group = Gender, color = Gender)) +
  geom_point(position = pd) +
  stat_smooth(aes(color = Gender), method = "lm") +
  labs(x = "Weeks before exam", y = "Anxiety score") +
  theme(legend.direction = "horizontal") +
  theme(legend.position = "top")

可看出性別對考試焦慮存在著差異，女性的考試焦慮分數較高，而男性的考試焦慮隨著距離考試時間接近上升速度變快。

ggplot(dta03, aes(Time, Score, group = ID)) +
  geom_point() +
  stat_smooth(method = "lm", se = F) +
  labs(x = "Weeks before exam", y = "Anxiety score")

不同參與者的焦慮存在著焦慮，但大致上皆隨著考試的接近焦慮隨之升高。

ex4

dta04 <- read.table("C:/Users/she22_000/Documents/math_attainment.txt",header = T)
ggplot(dta04, aes(math1, math2, color = cc))+
  geom_point(size = rel(2))+
  stat_smooth(method = "lm", color = "steelblue")+
  theme(legend.position = c(.8, .2))+
  labs(x = "Math score at Year 1",
       y = "Math score at Year 2", 
       color = "Curriculum coverage")

modle <- lm(math2 ~ -1 + math1 + cc, dta04)
dta04 %>% mutate(s.resi = scale(residuals(modle)),
               pred = fitted.values(modle)) %>% 
  ggplot(., aes(pred, s.resi))+
  geom_point()+
  geom_hline(yintercept = 0, linetype = "dotted")+
  labs(x = "Fitted values", y = "Standardized residuals")

ggqqplot <- function(data) 
{
  y <- quantile(data[!is.na(data)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y)/diff(x)
  int <- y[1L] - slope * x[1L]
  d <- data.frame(resids = data)
  ggplot(d, aes(sample = resids))+
    stat_qq()+
    geom_abline(slope = slope, intercept = int)

}

ggqqplot(scale(resid(modle)))

dta04 %>% mutate(pred = fitted.values(modle)) %>% 
  ggplot(., aes(math1, math2, color = cc))+
  geom_point(alpha = .7, pch = 21, size = rel(2))+
  geom_point(aes(math1, pred, color = cc), size = rel(2))+
  stat_smooth(aes(math1, pred, color = cc), method = "lm", color = "steelblue")+
  theme(legend.position = c(.8, .2))+
  labs(x = "Math score at Year 1",
       y = "Fitted Values Math score at Year 2", 
       color = "Curriculum coverage")

tidy(modle, conf.int = TRUE) %>% 
  ggplot(., aes(term, estimate))+
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high))+
  geom_hline(yintercept = 0, linetype = "dotted")+
  scale_x_discrete(labels = c("Curriculum coverage", "Math score at Year 1"))+
  labs(x = "term", y = "parameter coefficients")

ex5

dta05 <- read.table("C:/Users/she22_000/Documents/hs0.txt", header = T)

cor <- cov(dta05$write, dta05$read)
v <- matrix(c(1,cor,cor,1), 2, 2)
slope <- eigen(v)$vector[1, 1]
intercept <- mean(dta05$write)-slope*mean(dta05$read)

p <- ggplot(dta05, aes(read, write)) +
  geom_point(shape = 21) +
  stat_ellipse() +
  geom_abline(intercept = 0, slope = 1, color = "gray") +
  geom_abline(intercept = intercept, slope = slope, color = "tomato", size = rel(2)) +
  geom_vline(xintercept = mean(dta05$read), color = "gray") +
  geom_hline(yintercept = mean(dta05$write), color = "gray") +
  stat_smooth(method = "lm", size = rel(1.1)) +
  xlim(25, 80) +
  ylim(25, 80) +
  labs(x = "Reading score", y = "Writing score")

ggMarginal(p, type = "histogram",
           xparams = list(binwidth = (IQR(dta05$read)*2)/(200)^(1/3),
                          fill = "gray"),
           yparams = list(binwidth = (IQR(dta05$write)*2)/(200)^(1/3),
                          fill = "gray"))

ex6

dta06 <- read.csv("C:/Users/she22_000/Documents/imf_data.csv", header = T) %>% 
  gather(Year, PPP, -Country) %>% 
  mutate(Year = parse_number(Year))

ggplot(dta06, aes(Year, PPP/1000))+
  geom_line(aes(color = Country), size = rel(2), alpha = .8)+
  scale_color_hue(l = 30, c = 50, guide = FALSE)+
  geom_text_repel(aes(label = ifelse(Year == 2017, as.character(Country),"")), force = 5, size = rel(5))+
  ggthemes::theme_economist()+
  scale_y_continuous(position = "right", breaks = seq(0, 120, 20))+
  scale_x_continuous(breaks = c(seq(1980, 2010, 10), seq(2012, 2020, 2)))+
  labs(x = "",y = "", title = "Overtaking the leader", subtitle = "GDP per person at purchasing-power parity 2018, prices, $'000")

ex7

library(datasets)
dta07 <- as.data.frame(USPersonalExpenditure) %>%
 mutate(index = 1:5, category = c("Food&Tobacco", "Household&Operation", "Medical&Health", "Personal&Care", "Private&Education")) %>% 
  gather(year, exp, 1:5) %>% 
  group_by(category) %>% 
  mutate(exp_c = log(exp) - mean(log(exp)))

qplot(category, exp_c, data = dta07) +
  geom_hline(yintercept = 0, colour = "grey50") +
  geom_line(aes(group = 1)) +
  facet_wrap(~ year) + 
  labs(x = "categories", y = "Centered personal expenditures")

qplot(exp_c, category, data = dta07) +
  geom_segment(aes(xend = 0, yend = category)) +
  geom_vline(xintercept = 0, colour = "grey50") +
  facet_wrap(~ year, nrow = 1) +
  labs(x = "Centered personal expenditures (log)", y = "categories")

ex8

dta08 <- read.table("C:/Users/she22_000/Documents/hs0.txt", header = T)
ggplot(dta08, aes(write, read, fill = female))+
  stat_density2d(aes(color = ..level..))+
  theme(legend.position = c(.05, .75))+
  geom_point(pch = 21, alpha = .5)+
  facet_wrap(~female)+
  labs(x = "Writing score", y = "Reading score")

ex9

dta09 <- data.frame(Race = c("White", "White", "Black", "Black"),
                  Gender = c("Male", "Female", "Male", "Female"),
                  Yes = c(43, 26, 29, 22),
                  No = c(134, 149, 23, 36)) %>% 
  mutate(P_yes = Yes/(Yes+No), P_no = No/(Yes+No))

new.dta09 <- dta09 %>% select(1:4) %>% 
  gather(Intercourse, Freq, 3:4) %>%
  mutate(P = c(dta09$P_yes, dta09$P_no),
         SE = sqrt(P*(1-P)/Freq))

ggplot(new.dta09, aes(Intercourse, P, fill = Gender))+
  geom_bar(position = "dodge", stat = "identity")+
  geom_errorbar(aes(ymin = P - SE, ymax = P + SE), position = "dodge")+
  facet_wrap(~ Race)+
  labs(y = "Proportion")

ex10

連結失效

ex11

dta11 <- MASS::Cushings %>% 
  mutate(Type = factor(Type, levels = c("u", "b", "c", "a"), 
                       labels = c("Unknown", "Bilateral Hyperplasia", "Carcinoma", "Adenoma")),
         Label = "")

dta11$Label[c(1, 13, 21, 27)] = c("Adenoma", "Bilateral Hyperplasia", "Carcinoma", "Unknown")

ggplot(dta11, aes(Tetrahydrocortisone, Pregnanetriol, fill = Type))+
  geom_point(pch = 21, size = rel(2))+
  geom_text_repel(aes(label = Label, color = Type))+
  scale_fill_discrete(guide = FALSE)+
  scale_color_discrete(guide = FALSE)+
  theme_hc()+
  labs(x = "Tetrahydrocortisone (mg/24 hours)", y = "Pregnanetriol (mg/24 hours)",
       title = "Cushings's syndrome")+
  theme(plot.title = element_text(hjust = 1))

ex12

## 讀取資料
library(car)

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

dta12 <- Vocab
str(dta12)

## 'data.frame':    21638 obs. of  4 variables:
##  $ year      : int  2004 2004 2004 2004 2004 2004 2004 2004 2004 2004 ...
##  $ sex       : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 2 1 2 2 1 ...
##  $ education : int  9 14 14 17 14 14 12 10 11 9 ...
##  $ vocabulary: int  3 6 9 8 1 7 6 6 5 1 ...

## 繪圖
ggplot(dta12, aes(education))+
  geom_histogram(aes(y = ..density..), breaks = c(0, 6, 9, 12, 16, 20),
                 fill = "gray", color = "black")+
  scale_y_continuous(breaks = seq(0, 0.15, 0.05), limits = c(0, 0.15), 
                     minor_breaks = seq(0, 0.15, 0.01)) +
  scale_x_continuous(breaks = c(0, 6, 9, 12, 16, 20))+
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.y = element_line(linetype = "dotted", color = "gray50"),
        panel.grid.minor.y = element_line(linetype = "dotted", color = "gray50"))+
  labs(x = "Education level (years)", y = "Density")

4/23 homework

jiru shih

2018年5月6日

set up

ex2

ex3

ex4

ex5

ex6

ex7

ex8

ex9

ex10

ex11

ex12