library(dplyr)
library(ggplot2)
library(tidyr)

2

#read in data
dt2 <- read.table("C:/Users/user/Dropbox/1062-Data_manage/0423/langMathDutch.txt", header=T)
#把數值分組
dt2_plot<-dt2 %>% mutate(size=cut(size, 3,labels=c("small", "medium","large")),
                         IQV=cut(IQV, 3, labels=c("low","middle","high")))
#看一下資料長怎樣
knitr::kable(head(dt2_plot))

school	pupil	IQV	size	lang	arith
1	17001	high	large	46	24
1	17002	high	large	45	19
1	17003	middle	large	33	24
1	17004	middle	large	46	26
1	17005	low	large	20	9
1	17006	middle	large	30	13

#畫畫
ggplot(dt2_plot,aes(lang, arith))+
  geom_point()+  geom_smooth()+
  labs(x="Language score",y="Arithmetic score")+
  facet_wrap(~size+IQV,ncol=3,nrow=3)

3

3.1: whether there are gender difference in state anxiety 由下圖可知男女生在焦慮程度上是有差異的，女生的焦慮程度比男生高。在每周的焦慮程度上性別內也有顯著差異，焦慮程度隨著周次增加而增加。

#read in data
dt3 <- read.table("C:/Users/user/Dropbox/1062-Data_manage/0423/anxiety.txt", header=T)
dt3$ID <- seq.int(nrow(dt3))
#wide to long
dt3l<-tidyr::gather(dt3,week,anxiety,f1:m5,factor_key=T)
#把資料修改成適合分析的格式
dt3l[,4] <- gsub("m.", "male", as.matrix(dt3l[,2])) 
dt3l[,4] <- gsub("f.", "female", as.matrix(dt3l[,4]))
dt3l[,2] <- as.numeric(gsub("[fm]", "", as.matrix(dt3l[,2])))
dt3l<-dt3l %>% rename(gender=V4) 
dt3l[,4]<-as.factor(dt3l$gender)
dt3l[,2]<-as.factor(dt3l$week)
#畫圖
m1<-lm(anxiety~gender+week,data=dt3l)
dtam1<-ggeffects::ggpredict(m1,terms=c("gender","week"))
plot(dtam1)

3.2: individual differences in state anxiety 如下圖，個體在不同階段的焦慮程度有些人是有差異的。

dt3l[,1]<-as.factor(dt3l$ID)
m2<-lm(anxiety~gender+week+ID,data=dt3l)
dtam2<-ggeffects::ggpredict(m2,terms=c("ID","week","gender"))
plot(dtam2)

4

#read the data
dt4 <- read.table("C:/Users/user/Dropbox/1062-Data_manage/0423/math_attainment.txt", header=T)

#visualize data
ggplot(dt4, aes(math1, math2, color = cc))+
  geom_point(size = rel(2))+
  stat_smooth(method = "lm", color = "steelblue")+
  labs(x = "Math score at Year 1",
       y = "Math score at Year 2", 
       color = "Curriculum coverage")

#model diagnostics
m <- lm(math2 ~ -1 + math1 + cc, dt4)


plot(fitted.values(m),residuals(m),xlab = "Predicted Values", 
     ylab= "Residuals",main = "Predicted Values versus Residuals")

# fitted value
dt4 %>% mutate(pred = fitted.values(m)) %>% 
  ggplot(., aes(math1, math2, color = cc))+
  geom_point(pch = 21)+
  geom_point(aes(math1, pred, color = cc))+
  stat_smooth(aes(math1,pred,color=cc),method="lm",color="steelblue")+
  labs(x = "Math score at Year 1",
       y = "Fitted Values Math score at Year 2", 
       color = "Curriculum coverage")

# coefficients and ci
broom::tidy(m, conf.int = TRUE) %>% 
  ggplot(., aes(term, estimate))+
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high))+
  geom_hline(yintercept = 0, linetype = "dotted")+
  scale_x_discrete(labels = c("Curriculum coverage", "Math score at Year 1"))+
  labs(x = "term", y = "parameter coefficients")

5

dta <- read.table("C:/Users/user/Dropbox/1062-Data_manage/0409/inclass/hs0.txt", header=T)

# eigen vector
cor <- cov(dta$write, dta$read)
v <- matrix(c(1,cor,cor,1), 2, 2)
slope <- eigen(v)$vector[1, 1]
intercept <- mean(dta$write)-slope*mean(dta$read)

# plot
p <- ggplot(dta, aes(read, write)) +
  geom_point(shape = 21) +
  stat_ellipse() +
  geom_abline(intercept = 0, slope = 1, color = "gray") +
  geom_abline(intercept = intercept, slope = slope, color = "tomato", size = rel(2)) +
  geom_vline(xintercept = mean(dta$read), color = "gray") +
  geom_hline(yintercept = mean(dta$write), color = "gray") +
  stat_smooth(method = "lm", size = rel(1.1)) +
  xlim(25, 80) +
  ylim(25, 80) +
  labs(x = "Reading score", y = "Writing score")

ggExtra::ggMarginal(p, type = "histogram",
           xparams = list(binwidth = (IQR(dta$read)*2)/(200)^(1/3),
                          fill = "gray"),
           yparams = list(binwidth = (IQR(dta$write)*2)/(200)^(1/3),
                          fill = "gray"))

6

dt6 <- read.table("C:/Users/user/Dropbox/1062-Data_manage/0423/imf_data.csv", header=T,sep=",")
dt6_plot <- dt6 %>%  gather(Year, PPP, -Country) %>% 
  mutate(Year =  readr::parse_number(Year)) %>%
  filter(Year<2018)

dt6_plot$Country <- as.character(dt6_plot$Country) 

dt6_plot$Country[dt6_plot$Country=="TW"] <-"Taiwan"
dt6_plot$Country[dt6_plot$Country=="SG"] <-"Singapore"
dt6_plot$Country[dt6_plot$Country=="JP"] <-"Japan"
dt6_plot$Country[dt6_plot$Country=="HK"] <-"Hong Kong"
dt6_plot$Country[dt6_plot$Country=="KR"] <-"South Korea"

# plot
ggplot(dt6_plot, aes(Year, PPP/1000))+
  geom_line(aes(color = Country), size = rel(2), alpha = .8)+
  scale_color_hue(l = 30, c = 50, guide = FALSE)+
  ggrepel::geom_text_repel(aes(label = ifelse(Year == 2017, as.character(Country),"")), force = 5, size = rel(5))+
  ggthemes::theme_economist()+
  scale_y_continuous(position = "right", breaks = seq(0, 120, 20))+
  scale_x_continuous(breaks = c(seq(1980, 2010, 10), seq(2012, 2020, 2)))+
  labs(x = "",y = "", title = "Overtaking the leader", subtitle = "GDP per person at purchasing-power parity 2018, prices, $'000")

7

# data
dt7 <- USPersonalExpenditure %>% as.data.frame %>% 
  mutate(index = 1:5, category = c("Food\nTobacco", "Household\nOperation", "Medical\nHealth", "Personal\nCare", "Private\nEducation")) %>% 
  gather(year, exp, 1:5) %>% 
  group_by(category) %>% 
  mutate(exp_c = log(exp) - mean(log(exp)))

# plot 1
qplot(category, exp_c, data = dt7) +
  geom_hline(yintercept = 0, colour = "grey50") +
  geom_line(aes(group = 1)) +
  facet_wrap(~ year) + 
  labs(x = "categories", y = "Centered personal expenditures")+
  theme_bw()

8

dta <- read.table("C:/Users/user/Dropbox/1062-Data_manage/0409/inclass/hs0.txt", header=T)

# plot density_2d
ggplot(dta, aes(write, read, fill = female))+
  stat_density2d(aes(color = ..level..))+
  theme_bw()+
  geom_point(pch = 21, alpha = .5)+
  facet_wrap(~female)+
  labs(x = "Writing score", y = "Reading score")

9

# data
dt9 <- data.frame(Race = c("White", "White", "Black", "Black"),
                  Gender = c("Male", "Female", "Male", "Female"),
                  Yes = c(43, 26, 29, 22),
                  No = c(134, 149, 23, 36)) %>% 
  mutate(P_yes = Yes/(Yes+No), P_no = No/(Yes+No))

# computate P and SE
dt9 <- dt9 %>% select(1:4) %>% 
  gather(Intercourse, Freq, 3:4) %>%
  mutate(P = c(dt9$P_yes, dt9$P_no),
         SE = sqrt(P*(1-P)/Freq))

# plot
ggplot(dt9, aes(Intercourse, P, fill = Gender))+
  geom_bar(position = "dodge", stat = "identity")+
  geom_errorbar(aes(ymin = P - SE, ymax = P + SE), position = "dodge")+
  facet_wrap(~ Race)+
  labs(y = "Proportion")+
  theme_bw()

11

# data
dt11 <- MASS::Cushings %>% 
  mutate(Type = factor(Type, levels = c("u", "b", "c", "a"), 
                       labels = c("Unknown", "Bilateral Hyperplasia", "Carcinoma", "Adenoma")),
         Label = "")

# label
dt11$Label[c(1, 13, 21, 27)] = c("Adenoma", "Bilateral Hyperplasia", "Carcinoma", "Unknown")

# plot and label
ggplot(dt11, aes(Tetrahydrocortisone, Pregnanetriol, fill = Type))+
  geom_point(pch = 21, size = rel(2))+
  ggrepel::geom_text_repel(aes(label = Label, color = Type))+
  scale_fill_discrete(guide = FALSE)+
  scale_color_discrete(guide = FALSE)+
  theme_bw()+
  labs(x = "Tetrahydrocortisone (mg/24 hours)", y = "Pregnanetriol (mg/24 hours)",
       title = "Cushings's syndrome")+
  theme(plot.title = element_text(hjust = 1))

12

dt12<-carData::Vocab$education

hist(dt12, 
     col = "gray", border = "black", 
     breaks = c(0, 6, 9, 12, 16, 20), ylim = c(0, .15),
     axes = FALSE, main = NULL, xlab = "Education level (years)")

Homework 0423

Yifang Yu

2018-05-07

2

3

4

5

6

7

8

9

11

12