setwd("/Users/tayloryen/Desktop/大學/成大課業/大四下/資料管理/0423/HW")
Question 2
library(Hmisc)
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.4.4
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
dta2<-read.table("dta2.txt",header=T)
#分組
dta2$group_s<-as.factor(as.numeric(cut2(dta2$size,g=3)))
dta2$group_IQ<-as.factor(as.numeric(cut2(dta2$IQV,g=3)))
#設定因子
dta2 <- dta2 %>%
mutate(group_s = factor(group_s, levels(group_s),
labels = c("Small", "Medium", "Large")),
group_IQ = factor(group_IQ, levels(group_IQ),
labels = c("Low", "Middle", "High")))
## Warning: package 'bindrcpp' was built under R version 3.4.4
#plot
ggplot(dta2, aes(lang, arith)) +
geom_point() +
stat_smooth(method = "lm", se = T) +
facet_wrap(~ group_s + group_IQ, ncol =3) +
labs(x= "Language score", y = "Arithmetic score")

Question 3
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:Matrix':
##
## expand
library(readr)
dta3<-read.table("dta3.txt",header=T)
dta3 <- dta3 %>%
gather("Week", "Score") %>%
mutate(Gender = factor(c(rep("Female", 250), rep("Male", 250))),
ID = c(rep(paste0("S", 101:150), 5), rep(paste0("S", 151:200), 5)),
Week = paste0("w", parse_number(Week)))
#繪圖一
ggplot(dta3, aes(Week, Score, color = Gender)) +
stat_summary(fun.data = mean_se, geom = "pointrange") +
stat_summary(aes(group = Gender), fun.y = mean, geom = "line") +
geom_line(aes(group = ID), color = "gray50", alpha = .8, linetype = "dotted") +
stat_smooth(aes(group = 1), method = "lm", se = T)

#繪圖二
ggplot(dta3, aes(reorder(ID, Score, mean), Score))+
stat_summary(fun.data = mean_se, geom = "pointrange")+
coord_flip()+
labs(x = "Subject reorder by averge anxiety socre")

Question 4
dta4<-read.table("dta4.txt",header=T)
#scatter plot
ggplot(dta4, aes(math1, math2, color = cc))+
geom_point()+
stat_smooth(method = "lm", color = "steelblue") +
theme_bw()

m1<-lm(math2 ~ -1 + math1 + cc, data = dta4)
#QQplot
library(car)
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.4.4
##
## Attaching package: 'car'
## The following objects are masked from 'package:mosaic':
##
## deltaMethod, logit
## The following object is masked from 'package:dplyr':
##
## recode
qqPlot(m1, main="QQ Plot")

## [1] 3 20
#Pearson residual
car::residualPlot(m1)

# standardized residual plot
dta4 %>% mutate(s.resi = scale(residuals(m1)),
pred = fitted.values(m1)) %>%
ggplot(., aes(pred, s.resi))+
geom_point()+
geom_hline(yintercept = 0, linetype = "dotted")+
labs(x = "Fitted values", y = "Standardized residuals")

#model fit
dta4 %>% mutate(pred = fitted.values(m1)) %>%
ggplot(., aes(math1, math2, color = cc))+
geom_point(alpha = .7, pch = 21, size = rel(2))+
geom_point(aes(math1, pred, color = cc), size = rel(2))+
stat_smooth(aes(math1, pred, color = cc), method = "lm", color = "steelblue")+
theme(legend.position = c(.8, .2))+
labs(x = "Math score at Year 1",
y = "Fitted Values Math score at Year 2",
color = "Curriculum coverage")

#Parameter coefficients
require(coefplot)
## Loading required package: coefplot
coefplot(m1, xlab = "estimates", ylab = "parameters")

Question 5
library(ggExtra)
## Warning: package 'ggExtra' was built under R version 3.4.4
dta5<- read.table("dta5.txt", header=T)
a <- sd(dta5$write)/sd(dta5$read)
b <- mean(dta5$write)-a*mean(dta5$read)
p <- ggplot(dta5, aes(read, write)) +
geom_point(shape = 21) +
stat_ellipse() +
geom_abline(intercept = 0, slope = 1, color = "gray") +
geom_abline(intercept = b, slope = a, color = "red", size = rel(1.2)) + #sd line
geom_vline(xintercept = mean(dta5$read), color = "gray") +
geom_hline(yintercept = mean(dta5$write), color = "gray") +
stat_smooth(method = "lm", size = rel(1.1)) +
xlim(25, 80) +
ylim(25, 80) +
labs(x = "Reading score", y = "Writing score")
ggMarginal(p, type = "histogram",
xparams = list(binwidth = (IQR(dta5$read)*2)/(200)^(1/3),
fill = "gray"),
yparams = list(binwidth = (IQR(dta5$write)*2)/(200)^(1/3),
fill = "gray"))

Question 6
library(gdata)
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following object is masked from 'package:mosaic':
##
## resample
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
library(ggrepel)
dta6<-read.xls("dta6.xls",sheet=1)
dta6 <- dta6[c(74,85,90,154,169),] %>%
gather(key = Year, value = GDP, 2:45) %>%
mutate(Year = parse_number(Year),
GDP = as.numeric(GDP))
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Warning in evalq(as.numeric(GDP), <environment>): 強制變更過程中產生了 NA
#plot
ggplot(dta6, aes(Year, GDP/1000))+
geom_line(aes(color = Country), size = rel(2))+
ggthemes::theme_economist()+
scale_y_continuous(position = "right", breaks = seq(0, 120, 20))+
scale_x_continuous(breaks = c(seq(1980, 2010, 10), seq(2016, 2022, 2)))+
labs(title = "Overtaking the leader", subtitle = "GDP per person at purchasing-power parity 2018, prices, $'000") +
geom_text_repel(data = subset(dta6, Year == max(Year)), aes(label = Country))
## Warning: Removed 45 rows containing missing values (geom_path).

Question 7
library(datasets)
library(datasets)
dta7 <- as.data.frame(USPersonalExpenditure)
dta7 <- dta7 %>% mutate(Category = c("food&tobacco", "household operation", "medical&health", "personal care", "private education")) %>%
gather(Year, Expend,1:5) %>%
group_by(Category) %>%
mutate(excess = log(Expend) - mean(log(Expend)))
#繪圖1
qplot(Category, excess, data = dta7) +
geom_hline(yintercept = 0, colour = "grey50") +
geom_line(aes(group = 1)) +
facet_wrap(~ Year)

#繪圖2
qplot(excess, Category, data = dta7) +
geom_segment(aes(xend = 0, yend = Category)) +
geom_vline(xintercept = 0, colour = "grey50") +
facet_wrap(~ Year, nrow = 1)

Question 8
dta8 <- read.table("dta8.txt", header = T)
#繪圖
ggplot(dta8, aes(write, read, fill = female))+
stat_density2d(aes(color = ..level..))+
theme(legend.position = c(.05, .75))+
geom_point(pch = 21, alpha = .5)+
facet_wrap(~female)+
labs(x = "Writing score", y = "Reading score")

Question 9
#產生資料
dta9 <- data.frame(Race = c("White", "White", "Black", "Black"),
Gender = c("Male", "Female", "Male", "Female"),
Yes = c(43, 26, 29, 22),
No = c(134, 149, 23, 36)) %>%
mutate(P_yes = Yes/(Yes+No), P_no = No/(Yes+No))
dta9_1 <- dta9 %>% select(1:4) %>%
gather(Intercourse, Freq, 3:4) %>%
mutate(P = c(dta9$P_yes, dta9$P_no),
SE = sqrt(P*(1-P)/Freq))
#繪圖
ggplot(dta9_1, aes(Intercourse, P, fill = Gender))+
geom_bar(position = "dodge", stat = "identity")+
geom_errorbar(aes(ymin = P - SE, ymax = P + SE), position = "dodge")+
facet_wrap(~ Race)+
labs(y = "Proportion")

Question 10
Question 11
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(ggrepel)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.4.4
##
## Attaching package: 'ggthemes'
## The following object is masked from 'package:mosaic':
##
## theme_map
dta11 <- Cushings %>%
mutate(Type = factor(Type, levels = c("u", "b", "c", "a"),
labels = c("Unknown", "Bilateral Hyperplasia", "Carcinoma", "Adenoma")))
ggplot(dta11, aes(Tetrahydrocortisone, Pregnanetriol, fill = Type)) +
geom_point(shape = 21, size = rel(2)) +
theme_hc() +
labs(x = "Tetrahydrocortisone (mg/24 hours)", y = "Pregnanetriol (mg/24 hours)",
title = "Cushings's syndrome") +
theme(plot.title = element_text(hjust = 1)) +
geom_text_repel(data = dta11[c(1,16,17,26),], aes(label = Type, color = Type))

Question 12
library(car)
dta12 <- Vocab
boundaries <- c(0, 6, 9, 12, 16, 20)
#繪圖
with(dta12, hist(education, breaks= boundaries, xlab = "Education level(years)", axes = FALSE, main = NULL, col = "gray"))
axis(1, boundaries) ; axis(2)
abline(h = seq(.01, .14, .01), col = "lightgray", lty = "dotted")
