library(dplyr)
library(ggplot2)
library(tidyr)2
#read in data
dt2 <- read.table("C:/Users/user/Dropbox/1062-Data_manage/0423/langMathDutch.txt", header=T)
#把數值分組
dt2_plot<-dt2 %>% mutate(size=cut(size, 3,labels=c("small", "medium","large")),
IQV=cut(IQV, 3, labels=c("low","middle","high")))
#看一下資料長怎樣
knitr::kable(head(dt2_plot))| school | pupil | IQV | size | lang | arith |
|---|---|---|---|---|---|
| 1 | 17001 | high | large | 46 | 24 |
| 1 | 17002 | high | large | 45 | 19 |
| 1 | 17003 | middle | large | 33 | 24 |
| 1 | 17004 | middle | large | 46 | 26 |
| 1 | 17005 | low | large | 20 | 9 |
| 1 | 17006 | middle | large | 30 | 13 |
#畫畫
ggplot(dt2_plot,aes(lang, arith))+
geom_point()+ geom_smooth()+
labs(x="Language score",y="Arithmetic score")+
facet_wrap(~size+IQV,ncol=3,nrow=3)3
3.1: whether there are gender difference in state anxiety 由下圖可知男女生在焦慮程度上是有差異的,女生的焦慮程度比男生高。在每周的焦慮程度上性別內也有顯著差異,焦慮程度隨著周次增加而增加。
#read in data
dt3 <- read.table("C:/Users/user/Dropbox/1062-Data_manage/0423/anxiety.txt", header=T)
dt3$ID <- seq.int(nrow(dt3))
#wide to long
dt3l<-tidyr::gather(dt3,week,anxiety,f1:m5,factor_key=T)
#把資料修改成適合分析的格式
dt3l[,4] <- gsub("m.", "male", as.matrix(dt3l[,2]))
dt3l[,4] <- gsub("f.", "female", as.matrix(dt3l[,4]))
dt3l[,2] <- as.numeric(gsub("[fm]", "", as.matrix(dt3l[,2])))
dt3l<-dt3l %>% rename(gender=V4)
dt3l[,4]<-as.factor(dt3l$gender)
dt3l[,2]<-as.factor(dt3l$week)
#畫圖
m1<-lm(anxiety~gender+week,data=dt3l)
dtam1<-ggeffects::ggpredict(m1,terms=c("gender","week"))
plot(dtam1) 3.2: individual differences in state anxiety 如下圖,個體在不同階段的焦慮程度有些人是有差異的。
dt3l[,1]<-as.factor(dt3l$ID)
m2<-lm(anxiety~gender+week+ID,data=dt3l)
dtam2<-ggeffects::ggpredict(m2,terms=c("ID","week","gender"))
plot(dtam2) 4
#read the data
dt4 <- read.table("C:/Users/user/Dropbox/1062-Data_manage/0423/math_attainment.txt", header=T)
#visualize data
ggplot(dt4, aes(math1, math2, color = cc))+
geom_point(size = rel(2))+
stat_smooth(method = "lm", color = "steelblue")+
labs(x = "Math score at Year 1",
y = "Math score at Year 2",
color = "Curriculum coverage")#model diagnostics
m <- lm(math2 ~ -1 + math1 + cc, dt4)
plot(fitted.values(m),residuals(m),xlab = "Predicted Values",
ylab= "Residuals",main = "Predicted Values versus Residuals")# fitted value
dt4 %>% mutate(pred = fitted.values(m)) %>%
ggplot(., aes(math1, math2, color = cc))+
geom_point(pch = 21)+
geom_point(aes(math1, pred, color = cc))+
stat_smooth(aes(math1,pred,color=cc),method="lm",color="steelblue")+
labs(x = "Math score at Year 1",
y = "Fitted Values Math score at Year 2",
color = "Curriculum coverage")# coefficients and ci
broom::tidy(m, conf.int = TRUE) %>%
ggplot(., aes(term, estimate))+
geom_pointrange(aes(ymin = conf.low, ymax = conf.high))+
geom_hline(yintercept = 0, linetype = "dotted")+
scale_x_discrete(labels = c("Curriculum coverage", "Math score at Year 1"))+
labs(x = "term", y = "parameter coefficients")5
dta <- read.table("C:/Users/user/Dropbox/1062-Data_manage/0409/inclass/hs0.txt", header=T)
# eigen vector
cor <- cov(dta$write, dta$read)
v <- matrix(c(1,cor,cor,1), 2, 2)
slope <- eigen(v)$vector[1, 1]
intercept <- mean(dta$write)-slope*mean(dta$read)
# plot
p <- ggplot(dta, aes(read, write)) +
geom_point(shape = 21) +
stat_ellipse() +
geom_abline(intercept = 0, slope = 1, color = "gray") +
geom_abline(intercept = intercept, slope = slope, color = "tomato", size = rel(2)) +
geom_vline(xintercept = mean(dta$read), color = "gray") +
geom_hline(yintercept = mean(dta$write), color = "gray") +
stat_smooth(method = "lm", size = rel(1.1)) +
xlim(25, 80) +
ylim(25, 80) +
labs(x = "Reading score", y = "Writing score")
ggExtra::ggMarginal(p, type = "histogram",
xparams = list(binwidth = (IQR(dta$read)*2)/(200)^(1/3),
fill = "gray"),
yparams = list(binwidth = (IQR(dta$write)*2)/(200)^(1/3),
fill = "gray"))6
dt6 <- read.table("C:/Users/user/Dropbox/1062-Data_manage/0423/imf_data.csv", header=T,sep=",")
dt6_plot <- dt6 %>% gather(Year, PPP, -Country) %>%
mutate(Year = readr::parse_number(Year)) %>%
filter(Year<2018)
dt6_plot$Country <- as.character(dt6_plot$Country)
dt6_plot$Country[dt6_plot$Country=="TW"] <-"Taiwan"
dt6_plot$Country[dt6_plot$Country=="SG"] <-"Singapore"
dt6_plot$Country[dt6_plot$Country=="JP"] <-"Japan"
dt6_plot$Country[dt6_plot$Country=="HK"] <-"Hong Kong"
dt6_plot$Country[dt6_plot$Country=="KR"] <-"South Korea"
# plot
ggplot(dt6_plot, aes(Year, PPP/1000))+
geom_line(aes(color = Country), size = rel(2), alpha = .8)+
scale_color_hue(l = 30, c = 50, guide = FALSE)+
ggrepel::geom_text_repel(aes(label = ifelse(Year == 2017, as.character(Country),"")), force = 5, size = rel(5))+
ggthemes::theme_economist()+
scale_y_continuous(position = "right", breaks = seq(0, 120, 20))+
scale_x_continuous(breaks = c(seq(1980, 2010, 10), seq(2012, 2020, 2)))+
labs(x = "",y = "", title = "Overtaking the leader", subtitle = "GDP per person at purchasing-power parity 2018, prices, $'000")7
# data
dt7 <- USPersonalExpenditure %>% as.data.frame %>%
mutate(index = 1:5, category = c("Food\nTobacco", "Household\nOperation", "Medical\nHealth", "Personal\nCare", "Private\nEducation")) %>%
gather(year, exp, 1:5) %>%
group_by(category) %>%
mutate(exp_c = log(exp) - mean(log(exp)))
# plot 1
qplot(category, exp_c, data = dt7) +
geom_hline(yintercept = 0, colour = "grey50") +
geom_line(aes(group = 1)) +
facet_wrap(~ year) +
labs(x = "categories", y = "Centered personal expenditures")+
theme_bw()8
dta <- read.table("C:/Users/user/Dropbox/1062-Data_manage/0409/inclass/hs0.txt", header=T)
# plot density_2d
ggplot(dta, aes(write, read, fill = female))+
stat_density2d(aes(color = ..level..))+
theme_bw()+
geom_point(pch = 21, alpha = .5)+
facet_wrap(~female)+
labs(x = "Writing score", y = "Reading score") 9
# data
dt9 <- data.frame(Race = c("White", "White", "Black", "Black"),
Gender = c("Male", "Female", "Male", "Female"),
Yes = c(43, 26, 29, 22),
No = c(134, 149, 23, 36)) %>%
mutate(P_yes = Yes/(Yes+No), P_no = No/(Yes+No))
# computate P and SE
dt9 <- dt9 %>% select(1:4) %>%
gather(Intercourse, Freq, 3:4) %>%
mutate(P = c(dt9$P_yes, dt9$P_no),
SE = sqrt(P*(1-P)/Freq))
# plot
ggplot(dt9, aes(Intercourse, P, fill = Gender))+
geom_bar(position = "dodge", stat = "identity")+
geom_errorbar(aes(ymin = P - SE, ymax = P + SE), position = "dodge")+
facet_wrap(~ Race)+
labs(y = "Proportion")+
theme_bw()11
# data
dt11 <- MASS::Cushings %>%
mutate(Type = factor(Type, levels = c("u", "b", "c", "a"),
labels = c("Unknown", "Bilateral Hyperplasia", "Carcinoma", "Adenoma")),
Label = "")
# label
dt11$Label[c(1, 13, 21, 27)] = c("Adenoma", "Bilateral Hyperplasia", "Carcinoma", "Unknown")
# plot and label
ggplot(dt11, aes(Tetrahydrocortisone, Pregnanetriol, fill = Type))+
geom_point(pch = 21, size = rel(2))+
ggrepel::geom_text_repel(aes(label = Label, color = Type))+
scale_fill_discrete(guide = FALSE)+
scale_color_discrete(guide = FALSE)+
theme_bw()+
labs(x = "Tetrahydrocortisone (mg/24 hours)", y = "Pregnanetriol (mg/24 hours)",
title = "Cushings's syndrome")+
theme(plot.title = element_text(hjust = 1))12
dt12<-carData::Vocab$education
hist(dt12,
col = "gray", border = "black",
breaks = c(0, 6, 9, 12, 16, 20), ylim = c(0, .15),
axes = FALSE, main = NULL, xlab = "Education level (years)")