Q2 Pupil

dta <- read.table("langMathDutch.txt", h = T)

#divide group
dta$group_s <- as.factor(as.numeric( cut2(dta$size,g = 3)))
dta$group_IQ <- as.factor(as.numeric( cut2(dta$IQV,g = 3)))

#set to levels
dta <- dta %>%
  mutate(group_s = factor(group_s, levels(group_s), 
                         labels = c("Small", "Medium", "Large")),
         group_IQ = factor(group_IQ, levels(group_IQ), 
                         labels = c("Low", "Middle", "High")))
#plot
ggplot(dta, aes(lang, arith)) +
  geom_point() +
  stat_smooth(method = "lm", se = T) +
  facet_wrap(~ group_s + group_IQ, ncol =3) +
  labs(x= "Language score", y = "Arithmetic score")

Q3 Anxiety M/F

dta3 <- read.table("stateAnxiety.txt", h = T)

dta3 <- dta3 %>%
  gather("Week", "Score") %>%
  mutate(Gender = factor(c(rep("Female", 250), rep("Male", 250))),
         ID = c(rep(paste0("S", 101:150), 5), rep(paste0("S", 151:200), 5)),
  Week = paste0("w", parse_number(Week)))

#plot
ggplot(dta3, aes(Week, Score, color = Gender)) +
  stat_summary(fun.data = mean_se, geom = "pointrange") +
  stat_summary(aes(group = Gender), fun.y = mean, geom = "line") +
  geom_line(aes(group = ID), color = "gray50", alpha = .8, linetype = "dotted") +
  stat_smooth(aes(group = 1), method = "lm", se = T)

ggplot(dta3, aes(reorder(ID, Score, mean), Score))+
  stat_summary(fun.data = mean_se, geom = "pointrange")+
  coord_flip()+
  labs(x = "Subject reorder by averge anxiety socre")

Q4 Math1&2

Model diagnostics source 亦有參考趙同學。

Visualize data

library(car)


Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

dta4 <- read.table("math_attainment.txt", h = T)
ggplot(dta4, aes(math1, math2, color = cc))+
  geom_point()+
  stat_smooth(method = "lm", color = "steelblue") +
  theme_bw()

Model diagnostics

m1 <- lm(math2 ~ -1 + math1 + cc, data = dta4)

#qqplot
qqPlot(m1, main="QQ Plot")

# Pearson residual
car::residualPlot(m1)

# standardized residual plot
dta4 %>% mutate(s.resi = scale(residuals(m1)),
               pred = fitted.values(m1)) %>% 
  ggplot(., aes(pred, s.resi))+
  geom_point()+
  geom_hline(yintercept = 0, linetype = "dotted")+
  labs(x = "Fitted values", y = "Standardized residuals")

Model fitting

dta4 %>% mutate(pred = fitted.values(m1)) %>% 
  ggplot(., aes(math1, math2, color = cc))+
  geom_point(alpha = .7, pch = 21, size = rel(2))+
  geom_point(aes(math1, pred, color = cc), size = rel(2))+
  stat_smooth(aes(math1, pred, color = cc), method = "lm", color = "steelblue")+
  theme(legend.position = c(.8, .2))+
  labs(x = "Math score at Year 1",
       y = "Fitted Values Math score at Year 2", 
       color = "Curriculum coverage")

Parameter coefficients

require(coefplot)
coefplot(m1, xlab = "estimates", ylab = "parameters")

Q5 SD line

dta5 <- read.table("hs0.txt", h = T)

#calculate slope(a) and intercept(b)
a <- sd(dta5$write)/sd(dta5$read)
b <- mean(dta5$write)-a*mean(dta5$read)

#plot
p <- ggplot(dta5, aes(read, write)) +
  geom_point(shape = 21) +
  stat_ellipse() +
  geom_abline(intercept = 0, slope = 1, color = "gray") +
  geom_abline(intercept = b, slope = a, color = "red", size = rel(1.2)) +    #sd line
  geom_vline(xintercept = mean(dta5$read), color = "gray") +
  geom_hline(yintercept = mean(dta5$write), color = "gray") +
  stat_smooth(method = "lm", size = rel(1.1)) +
  xlim(25, 80) +
  ylim(25, 80) +
  labs(x = "Reading score", y = "Writing score")

ggMarginal(p, type = "histogram",
           xparams = list(binwidth = (IQR(dta5$read)*2)/(200)^(1/3),
                          fill = "gray"),
           yparams = list(binwidth = (IQR(dta5$write)*2)/(200)^(1/3),
                          fill = "gray"))

Q6 GDP graph

Data source

dta6 <- read.csv("imf_data.csv", h = T)

dta6 <- dta6[c(74,85,90,154,169),] %>% 
  gather(key = Year, value = GDP, 2:45) %>%
  mutate(Year = parse_number(Year),
         GDP = as.numeric(GDP))

#plot
ggplot(dta6, aes(Year, GDP/1000))+
  geom_line(aes(color = Country), size = rel(2))+
  ggthemes::theme_economist()+
  scale_y_continuous(position = "right", breaks = seq(0, 120, 20))+
  scale_x_continuous(breaks = c(seq(1980, 2010, 10), seq(2016, 2022, 2)))+
  labs(title = "Overtaking the leader", subtitle = "GDP per person at purchasing-power parity 2018, prices, $'000") +
  geom_text_repel(data = subset(dta6, Year == max(Year)), aes(label = Country))

Q7 USExpenditure

library(datasets)
dta7 <- as.data.frame(USPersonalExpenditure)

dta7 <- dta7 %>% mutate(Category = c("food&tobacco", "household operation", "medical&health", "personal care", "private education")) %>%
          gather(Year, Expend,1:5) %>%
          group_by(Category) %>% 
          mutate(excess = log(Expend) - mean(log(Expend)))

#plot
qplot(Category, excess, data = dta7) +
  geom_hline(yintercept = 0, colour = "grey50") +
  geom_line(aes(group = 1)) +
  facet_wrap(~ Year)

qplot(excess, Category, data = dta7) +
  geom_segment(aes(xend = 0, yend = Category)) +
  geom_vline(xintercept = 0, colour = "grey50") +
  facet_wrap(~ Year, nrow = 1)

Q8 High school score

dta8 <- read.table("hs0.txt", h = T)

ggplot(dta8, aes(x = write, y = read, fill = female)) +
  geom_point(pch = 21, alpha = .5) +
  geom_density_2d(aes(color = ..level..)) +
  facet_wrap(~ female) +
  labs(x = "Writing score", y = "Reading score") +
  theme(legend.position = c(.05, .8))

Q9 Intercourse

dta9 <- data.frame(Race = c("White", "White", "Black", "Black"),
                   Gender = c("Male", " Female"),
                   Yes = c(43, 26, 29, 22),
                   No = c(134, 149, 23,36)) %>% 
        mutate(P_yes = Yes/(Yes+No), P_no = No/(Yes+No))

# calculate SE
dta9_n <- dta9 %>% select(1:4) %>% 
  gather(Intercourse, Num, 3:4) %>%
  mutate(P = c(dta9$P_yes, dta9$P_no),
         SE = sqrt(P*(1-P)/Num))

#plot
ggplot(dta9_n, aes(Intercourse, P, fill = Gender))+
  geom_bar(stat = "identity", position = "dodge")+
  geom_errorbar(aes(ymin = P - SE, ymax = P + SE), position = position_dodge(.9), width = .2)+
  facet_wrap(~ Race)+
  labs(y = "Proportion")

Q10 Article rate

Q11 Cushings

library(MASS)


Attaching package: 'MASS'

The following object is masked from 'package:dplyr':

    select

dta11 <- Cushings %>%
          mutate(Type = factor(Type, levels = c("u", "b", "c", "a"), 
                            labels = c("Unknown", "Bilateral Hyperplasia", "Carcinoma", "Adenoma")))


ggplot(dta11, aes(Tetrahydrocortisone, Pregnanetriol, fill = Type)) +
  geom_point(shape = 21, size = rel(2)) +
  theme_hc() +
  labs(x = "Tetrahydrocortisone (mg/24 hours)", y = "Pregnanetriol (mg/24 hours)",
       title = "Cushings's syndrome") +
  theme(plot.title = element_text(hjust = 1)) +
  geom_text_repel(data = dta11[c(1,16,17,26),], aes(label = Type, color = Type))

Q12 Vocab

library(car)
dta12 <- Vocab
boundaries <- c(0, 6, 9, 12, 16, 20)

#plot
with(dta12, hist(education, breaks= boundaries, xlab = "Education level(years)", axes = FALSE, main = NULL, col = "gray"))
axis(1, boundaries) ; axis(2)
abline(h = seq(.01, .14, .01), col = "lightgray", lty = "dotted")

w9 Data Management Q2 - 12

Pei Jun

2018-05-07

Q2 Pupil

Q3 Anxiety M/F

Q4 Math1&2

Q5 SD line

Q6 GDP graph

Q7 USExpenditure

Q8 High school score

Q9 Intercourse

Q10 Article rate

Q11 Cushings

Q12 Vocab