# HW2 ----
# Load packages
pacman::p_load(tidyverse, broom, ggExtra, datasets, ggthemes, ggrepel)
# Read the data and bin the `size` and `IQV` columns into three ordered
# groups each, using the 0th/33rd/67th/100th percentiles as cut points.
# Size.IQ crosses the two groupings and is used to facet the plot below.
dta02 <- read.table("langMathDutch.txt", header = TRUE) %>%
  mutate(
    Size = cut(size,
               breaks = quantile(size, probs = c(0, .33, .67, 1)),
               labels = c("Small", "Medium", "Large"),
               include.lowest = TRUE, ordered_result = TRUE),
    IQ = cut(IQV,
             breaks = quantile(IQV, probs = c(0, .33, .67, 1)),
             labels = c("Low", "Middle", "High"),
             include.lowest = TRUE, ordered_result = TRUE),
    # Build the nine facet levels from the two label sets so the spelling
    # cannot drift: Small.Low, Small.Middle, Small.High, Medium.Low, ...
    Size.IQ = factor(paste(Size, IQ, sep = "."),
                     levels = paste(rep(levels(Size), each = 3),
                                    levels(IQ), sep = "."))
  )
# Plot: arithmetic against language score with a linear fit, one panel per
# Size x IQ combination.
ggplot(dta02, aes(lang, arith)) +
  geom_point() +
  stat_smooth(method = "lm") +
  facet_wrap(~ Size.IQ) +
  scale_y_continuous(breaks = seq(5, 35, 5)) +
  labs(x = "Language Score", y = "Arithmetic Score")

# HW3: Anxiety M/F ----
# Reshape the state-anxiety data from wide (10 columns x 50 rows) to long
# (500 rows). gather() stacks the columns one after another, so the first
# 250 long rows come from columns 1-5 and the last 250 from columns 6-10.
# Assumes columns 1-5 hold the female group and 6-10 the male group, as the
# Gender assignment implies -- TODO confirm against the data file.
dta03 <- read.table("stateAnxiety.txt", header = TRUE) %>%
  mutate(index = 1:50) %>%
  gather("Week", "Anxiety", 1:10) %>%
  mutate(
    Gender = factor(rep(c("Female", "Male"), each = 250)),
    Week = paste0("Week", parse_number(Week)),
    # One ID per (row, gender): Sub101-Sub150 for the first group and
    # Sub151-Sub200 for the second. Recycling Sub101..Sub200 over the stacked
    # 50-row blocks would give the same ID to rows of both genders.
    Subject = factor(paste0("Sub", 100 + index + 50 * (Gender == "Male")))
  ) %>%
  group_by(Week) %>%
  mutate(mean_g = mean(Anxiety)) %>%
  ungroup()
# Use the black-and-white theme for all subsequent plots.
theme_set(theme_bw())
# Anxiety by week: dotted gray trajectory per subject, an overall linear
# trend, a mean line per gender, and bootstrap mean/CI point ranges.
ggplot(data = dta03, mapping = aes(x = Week, y = Anxiety, color = Gender)) +
  geom_line(
    mapping = aes(group = Subject),
    linetype = "dotted",
    color = "gray50",
    alpha = .8
  ) +
  stat_smooth(mapping = aes(group = 1), method = "lm", alpha = .5) +
  stat_summary(mapping = aes(group = Gender), geom = "line", fun.y = mean) +
  stat_summary(geom = "pointrange", fun.data = mean_cl_boot)

# Per-subject mean anxiety with bootstrap confidence limits; subjects are
# ordered by their mean anxiety and shown on the vertical axis.
ggplot(dta03, aes(reorder(Subject, Anxiety, mean), Anxiety)) +
  stat_summary(fun.data = mean_cl_boot, geom = "pointrange") +
  coord_flip() +
  labs(x = "Subjects reordered by average anxiety score")

# HW4 ----
# Load the math attainment data and plot math2 against math1, colouring the
# points by cc and overlaying a single linear fit.
dta04 <- read.table(file = "math_attainment.txt", header = TRUE)
ggplot(data = dta04, mapping = aes(x = math1, y = math2, color = cc)) +
  geom_point(size = rel(2)) +
  stat_smooth(method = "lm", color = "steelblue") +
  theme(legend.position = c(.8, .2)) +
  labs(
    x = "Math score at Year 1",
    y = "Math score at Year 2",
    color = "Curriculum coverage"
  )

# Linear model of math2 on math1 and cc with the intercept removed (-1).
m <- lm(math2 ~ -1 + math1 + cc, data = dta04)
# Residual plot: residuals are z-scored with scale() (mean 0, sd 1), not
# leverage-adjusted as rstandard() would be, and plotted against the fitted
# values with a dotted reference line at zero.
dta04 %>%
  mutate(
    s.resi = scale(residuals(m)),
    pred = fitted.values(m)
  ) %>%
  ggplot(aes(x = pred, y = s.resi)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dotted") +
  labs(x = "Fitted values", y = "Standardized residuals")

# Normal Q-Q plot drawn with ggplot2.
#
# data: numeric values to compare against the standard normal distribution.
# Returns a ggplot object: stat_qq() points plus a reference line through
# the first and third quartiles of the sample and of the normal distribution.
ggqqplot <- function(data) {
  quartile_probs <- c(0.25, 0.75)
  # Missing values are dropped for the quartiles only; the plotted data
  # frame keeps `data` as given.
  sample_q <- quantile(data[!is.na(data)], quartile_probs)
  normal_q <- qnorm(quartile_probs)
  slope <- diff(sample_q) / diff(normal_q)
  int <- sample_q[1L] - slope * normal_q[1L]
  ggplot(data.frame(resids = data), aes(sample = resids)) +
    stat_qq() +
    geom_abline(slope = slope, intercept = int)
}
# Normal Q-Q plot of the z-scored residuals of model m.
ggqqplot(scale(resid(m)))

# Fit: observed math2 (pch 21, semi-transparent) and fitted values (default
# point shape) against math1, with a linear smooth through the fitted values.
dta04 %>%
  mutate(pred = fitted.values(m)) %>%
  ggplot(aes(x = math1, y = math2, color = cc)) +
  geom_point(alpha = .7, pch = 21, size = rel(2)) +
  geom_point(aes(x = math1, y = pred, color = cc), size = rel(2)) +
  stat_smooth(
    aes(x = math1, y = pred, color = cc),
    method = "lm",
    color = "steelblue"
  ) +
  theme(legend.position = c(.8, .2)) +
  labs(
    x = "Math score at Year 1",
    y = "Fitted Values Math score at Year 2",
    color = "Curriculum coverage"
  )

# Coefficient plot: point estimate and confidence interval for each term of
# model m, with a dotted reference line at zero. The axis labels are matched
# to the terms by position.
tidy(m, conf.int = TRUE) %>%
  ggplot(aes(x = term, y = estimate)) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high)) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  scale_x_discrete(
    labels = c("Curriculum coverage", "Math score at Year 1")
  ) +
  labs(x = "term", y = "parameter coefficients")

# HW5 ----
# Reading vs. writing scores with a data ellipse, its major axis, the
# least-squares line and marginal histograms.
dta05 <- read.table("hs0.txt", header = TRUE)
# 2 x 2 covariance matrix of (read, write). Its first eigenvector points
# along the major axis of the point cloud, so the slope of that axis is the
# eigenvector's write component divided by its read component. A single
# eigenvector entry is not a slope, and for a unit-diagonal 2 x 2 matrix that
# entry is +/- 0.707 whatever the data are.
v <- cov(dta05[, c("read", "write")])
major_axis <- eigen(v)$vectors[, 1]
slope <- major_axis[2] / major_axis[1]
# The major axis passes through the point of means.
intercept <- mean(dta05$write) - slope * mean(dta05$read)
# Number of observations, used for the histogram bin widths below.
n_obs <- nrow(dta05)
# plot
p <- ggplot(dta05, aes(read, write)) +
  geom_point(shape = 21) +
  stat_ellipse() +
  # Identity line (write == read).
  geom_abline(intercept = 0, slope = 1, color = "gray") +
  # Major axis.
  geom_abline(intercept = intercept, slope = slope, color = "tomato",
              size = rel(2)) +
  # Cross-hairs at the means.
  geom_vline(xintercept = mean(dta05$read), color = "gray") +
  geom_hline(yintercept = mean(dta05$write), color = "gray") +
  stat_smooth(method = "lm", size = rel(1.1)) +
  xlim(25, 80) +
  ylim(25, 80) +
  labs(x = "Reading score", y = "Writing score")
library(ggExtra)
# Marginal histograms with bin width 2 * IQR / n^(1/3).
ggMarginal(p, type = "histogram",
           xparams = list(binwidth = 2 * IQR(dta05$read) / n_obs^(1 / 3),
                          fill = "gray"),
           yparams = list(binwidth = 2 * IQR(dta05$write) / n_obs^(1 / 3),
                          fill = "gray"))

# HW6 ----
# Read the IMF table (one row per country, one column per year) and reshape
# it to long form with a numeric Year column.
dta06 <- read.csv(file = "imf_data.csv", header = TRUE) %>%
  gather(key = Year, value = PPP, -Country) %>%
  mutate(Year = parse_number(Year))
# Line chart of PPP / 1000 over time, one line per country, with each line
# labelled at Year == 2017 and the Economist theme applied.
ggplot(data = dta06, mapping = aes(x = Year, y = PPP / 1000)) +
  geom_line(aes(color = Country), size = rel(2), alpha = .8) +
  scale_color_hue(l = 30, c = 50, guide = FALSE) +
  geom_text_repel(
    aes(label = ifelse(Year == 2017, as.character(Country), "")),
    force = 5,
    size = rel(5)
  ) +
  ggthemes::theme_economist() +
  scale_y_continuous(position = "right", breaks = seq(0, 120, 20)) +
  scale_x_continuous(breaks = c(seq(1980, 2010, 10), seq(2012, 2020, 2))) +
  labs(
    x = "",
    y = "",
    title = "Overtaking the leader",
    subtitle = "GDP per person at purchasing-power parity 2018, prices, $'000"
  )

# HW7 ----
# Convert the USPersonalExpenditure matrix to long form: the row names
# become an Item column and the five year columns are stacked into
# Year / Expenditure.
dta07 <- as.data.frame(USPersonalExpenditure) %>%
  mutate(Item = rownames(USPersonalExpenditure)) %>%
  gather(key = "Year", value = "Expenditure", 1:5)
# Dot plot of log expenditure by item, one panel per year. Each point is
# joined to the vertical reference line at 0 by a horizontal segment.
# ggplot() + geom_point() replaces the deprecated qplot() shortcut.
ggplot(dta07, aes(x = log(Expenditure), y = Item)) +
  geom_point() +
  geom_segment(aes(xend = 0, yend = Item)) +
  geom_vline(xintercept = 0, colour = "grey50") +
  facet_wrap(~ Year, nrow = 1)

# HW8 ----
# Reading against writing score, one panel per value of `female`:
# semi-transparent filled points with 2-D density contours coloured by
# contour level.
dta08 <- read.table(file = "hs0.txt", header = TRUE)
ggplot(data = dta08, mapping = aes(x = write, y = read, fill = female)) +
  geom_point(pch = 21, alpha = .5) +
  geom_density_2d(mapping = aes(color = ..level..)) +
  facet_wrap(~ female) +
  labs(
    x = "Writing score",
    y = "Reading score"
  ) +
  theme(legend.position = c(.05, .8))

# HW9 ----
# Enter the 2 x 2 x 2 table: Yes/No counts for each Race x Gender group,
# plus the within-group proportions of each answer.
dta09 <- data.frame(Race = c("White", "White", "Black", "Black"),
                    Gender = c("Male", "Female", "Male", "Female"),
                    Yes = c(43, 26, 29, 22), No = c(134, 149, 23, 36)) %>%
  mutate(P_yes = Yes / (Yes + No), P_no = No / (Yes + No))
# Long form: one row per Race x Gender x answer. gather() stacks the four
# Yes rows above the four No rows, which is the order P and N are built in.
new.dta <- dta09 %>%
  select(1:4) %>%
  gather(Intercourse, Freq, 3:4) %>%
  mutate(P = c(dta09$P_yes, dta09$P_no),
         # Group size (Yes + No), repeated for the Yes and the No rows.
         N = rep(dta09$Yes + dta09$No, times = 2),
         # Standard error of a proportion is sqrt(p * (1 - p) / n), where n
         # is the group total, not the count in the cell.
         SE = sqrt(P * (1 - P) / N))
# Plot: dodged bars of the proportions with +/- 1 SE error bars, by race.
ggplot(new.dta, aes(Intercourse, P, fill = Gender)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_errorbar(aes(ymin = P - SE, ymax = P + SE), position = "dodge") +
  facet_wrap(~ Race) +
  labs(y = "Proportion")

# HW10 ----
# HW11 ----
# Recode the Cushings Type factor with descriptive labels and add an empty
# Label column used for annotating the plot.
dta11 <- MASS::Cushings %>%
  mutate(Type = factor(Type, levels = c("u", "b", "c", "a"),
                       labels = c("Unknown", "Bilateral Hyperplasia",
                                  "Carcinoma", "Adenoma")),
         Label = "")
# Annotate one row per type; every other row keeps an empty label.
dta11$Label[c(1, 13, 21, 27)] <- c("Adenoma", "Bilateral Hyperplasia",
                                   "Carcinoma", "Unknown")
# Scatter plot coloured by type. The in-plot text labels replace the
# legends, which are switched off for both fill and colour.
ggplot(dta11, aes(Tetrahydrocortisone, Pregnanetriol, fill = Type)) +
  geom_point(pch = 21, size = rel(2)) +
  geom_text_repel(aes(label = Label, color = Type)) +
  scale_fill_discrete(guide = FALSE) +
  scale_color_discrete(guide = FALSE) +
  theme_hc() +
  labs(x = "Tetrahydrocortisone (mg/24 hours)",
       y = "Pregnanetriol (mg/24 hours)",
       title = "Cushing's syndrome") +
  theme(plot.title = element_text(hjust = 1))

# HW12 ----
# Density histogram of years of education in the Vocab data, using unequal
# bins with edges at 0, 6, 9, 12, 16 and 20.
library(car)
# Console output recorded when this was run: attaching car loaded carData
# and masked dplyr::recode and purrr::some.
dta12 <- Vocab
str(dta12)
# Recorded str() output: 30351 obs. of 4 variables --
#   year (num), sex (Factor: "Female", "Male"), education (num),
#   vocabulary (num).
ggplot(data = dta12, mapping = aes(x = education)) +
  geom_histogram(
    mapping = aes(y = ..density..),
    breaks = c(0, 6, 9, 12, 16, 20),
    fill = "gray",
    color = "black"
  ) +
  scale_y_continuous(
    breaks = seq(0, 0.15, 0.05),
    limits = c(0, 0.15),
    minor_breaks = seq(0, 0.15, 0.01)
  ) +
  scale_x_continuous(breaks = c(0, 6, 9, 12, 16, 20)) +
  # Horizontal dotted grid lines only.
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_line(linetype = "dotted", color = "gray50"),
    panel.grid.minor.y = element_line(linetype = "dotted", color = "gray50")
  ) +
  labs(x = "Education level (years)", y = "Density")
