# install.package("mlmRev")
library(mlmRev)
# load the data from the package
data(Gcsemv, package="mlmRev")
# invoke help document
?Gcsemv
# view first 6 lines
head(Gcsemv)
school student gender written course
1 20920 16 M 23 NA
2 20920 25 F NA 71.2
3 20920 27 F 39 76.8
4 20920 31 F 36 87.9
5 20920 42 M 16 44.4
6 20920 62 F 36 NA
with(Gcsemv, cor(written, course, use="pairwise"))
[1] 0.47417
# compute the means by school
course_schavg <- with(Gcsemv, tapply(course, school, mean, na.rm=T))
written_schavg <- with(Gcsemv, tapply(written, school , mean, na.rm=T))
cor(course_schavg, written_schavg)
[1] 0.39568
library(tidyverse)
dta <- Gcsemv %>%
group_by(school) %>%
mutate(r_sch = cor(course, written, use="pairwise"))
dtar <- dta[!duplicated(dta$school),"r_sch"]
# superimpose two plots
ggplot(data=Gcsemv, aes(x=course, y=written)) +
geom_point(color="lightskyblue1") +
geom_point(data=dtar, aes(x=course_schavg, y=written_schavg), color="steelblue") +labs(x="course score",
y="written score") +
theme_bw()
## The end
dta <- read.table("http://www.amstat.org/publications/jse/datasets/sat.dat.txt")
##assign variable names
names(dta) <- c("State", "Expend", "Ratio", "Salary", "Frac", "Verbal", "Math",
"Sat")
str(dta)
'data.frame': 50 obs. of 8 variables:
$ State : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
$ Expend: num 4.41 8.96 4.78 4.46 4.99 ...
$ Ratio : num 17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
$ Salary: num 31.1 48 32.2 28.9 41.1 ...
$ Frac : int 8 47 27 6 45 29 81 68 48 65 ...
$ Verbal: int 491 445 448 482 417 462 431 429 420 406 ...
$ Math : int 538 489 496 523 485 518 477 468 469 448 ...
$ Sat : int 1029 934 944 1005 902 980 908 897 889 854 ...
head(dta)
State Expend Ratio Salary Frac Verbal Math Sat
1 Alabama 4.405 17.2 31.144 8 491 538 1029
2 Alaska 8.963 17.6 47.951 47 445 489 934
3 Arizona 4.778 19.3 32.175 27 448 496 944
4 Arkansas 4.459 17.1 28.934 6 482 523 1005
5 California 4.992 24.0 41.078 45 417 485 902
6 Colorado 5.443 18.4 34.571 29 462 518 980
library(tidyverse)
dta <- mutate(dta, Fracf = cut(Frac, breaks = c(0, 22, 49, 81),
labels = c("Low", "Medium", "High")))
ggplot(data=dta, aes(x=Salary, y=Sat, label=State)) +
stat_smooth(method="lm",
formula= y ~ x,
se=F,
color="black",
linetype=3,
size=rel(.5)) +
stat_smooth(aes(group = Fracf),
method="lm",
formula= y ~ x,
se=F,
color="gray",
linetype=2,
size=rel(.5))+
geom_text(aes(color=Fracf),
check_overlap=TRUE,
show.legend=FALSE,
size=rel(2)) +
labs(x="Salary ($1000)",
y="SAT Score") +
theme_bw()
##“2020-09-20”
dta <- read.csv("C:\\Users\\88696\\Desktop\\20200914\\homework3\\nlsy86wide.csv")
str(dta)
'data.frame': 166 obs. of 23 variables:
$ id : int 23901 25601 37401 40201 63501 70301 72001 76101 76801 77001 ...
$ sex : chr "Female" "Female" "Female" "Male" ...
$ race : chr "Majority" "Majority" "Majority" "Majority" ...
$ grade86 : int 0 0 0 0 1 0 0 0 0 0 ...
$ grade88 : int 2 1 2 1 3 2 1 3 2 2 ...
$ grade90 : int 3 3 5 2 4 3 3 4 5 4 ...
$ grade92 : int 5 6 6 5 6 5 5 6 6 5 ...
$ age86year : int 6 6 6 5 7 5 6 7 6 6 ...
$ age88year : int 8 8 8 8 9 8 8 9 9 8 ...
$ age90year : int 10 10 10 9 11 10 10 11 11 10 ...
$ age92year : int 12 12 12 12 13 12 12 13 13 12 ...
$ age86month: int 67 66 67 60 78 62 66 79 76 67 ...
$ age88month: int 96 95 95 91 108 93 94 109 104 94 ...
$ age90month: int 119 119 122 112 132 117 118 131 128 117 ...
$ age92month: int 142 143 144 139 155 139 140 154 151 139 ...
$ math86 : num 14.29 20.24 17.86 7.14 29.76 ...
$ math88 : num 15.5 36.9 22.6 21.4 50 ...
$ math90 : num 38.1 52.4 53.6 53.6 47.6 ...
$ math92 : num 41.7 58.3 58.3 51.2 71.4 ...
$ read86 : num 19.05 21.43 21.43 7.14 30.95 ...
$ read88 : num 29.8 32.1 45.2 21.4 50 ...
$ read90 : num 28.6 45.2 69 50 63.1 ...
$ read92 : num 45.2 57.1 78.6 59.5 82.1 ...
head(dta)
id sex race grade86 grade88 grade90 grade92 age86year age88year
1 23901 Female Majority 0 2 3 5 6 8
2 25601 Female Majority 0 1 3 6 6 8
3 37401 Female Majority 0 2 5 6 6 8
4 40201 Male Majority 0 1 2 5 5 8
5 63501 Male Majority 1 3 4 6 7 9
6 70301 Male Majority 0 2 3 5 5 8
age90year age92year age86month age88month age90month age92month math86
1 10 12 67 96 119 142 14.2857
2 10 12 66 95 119 143 20.2381
3 10 12 67 95 122 144 17.8571
4 9 12 60 91 112 139 7.1429
5 11 13 78 108 132 155 29.7619
6 10 12 62 93 117 139 14.2857
math88 math90 math92 read86 read88 read90 read92
1 15.476 38.095 41.667 19.0476 29.762 28.571 45.238
2 36.905 52.381 58.333 21.4286 32.143 45.238 57.143
3 22.619 53.571 58.333 21.4286 45.238 69.048 78.571
4 21.429 53.571 51.190 7.1429 21.429 50.000 59.524
5 50.000 47.619 71.429 30.9524 50.000 63.095 82.143
6 36.905 55.952 63.095 17.8571 46.429 64.286 96.429
library(dplyr)
long_grade <- dta %>% gather (key = grade_y, value = grade, grade86:grade92)
longgrade <- long_grade[,c("id","sex","race","grade_y","grade")]
long_ageyear <- dta %>% gather (key = age_y, value = ageyear, age86year:age92year)
longageyear <- long_ageyear[,c("id","sex","race","age_y","ageyear")]
long_agemonth <- dta %>% gather (key = age_m, value = agemonth, age86month:age92month)
longagemonth <- long_agemonth[,c("id","sex","race","age_m","agemonth")]
long_math <- dta %>% gather (key = math_y, value = math, math86:math92)
longmath <- long_math[,c("id","sex","race","math_y","math")]
long_read <- dta %>% gather (key = read_y, value = read, read86:read92)
longread <- long_read[,c("id","sex","race","read_y","read")]
longdta <- cbind ((longgrade[,c("id","sex","race","grade_y","grade")]),
(longageyear[,c("id","age_y","ageyear")]),
(longagemonth[,c("id","age_m","agemonth")]),
(longmath[,c("id","math_y","math")]),
(longread[,c("id","read_y","read")]),by= "id")
longdtafinal <- longdta[,c("id","sex","race","ageyear","agemonth","math","read")]
head(longdtafinal)
id sex race ageyear agemonth math read
1 23901 Female Majority 6 67 14.2857 19.0476
2 25601 Female Majority 6 66 20.2381 21.4286
3 37401 Female Majority 6 67 17.8571 21.4286
4 40201 Male Majority 5 60 7.1429 7.1429
5 63501 Male Majority 7 78 29.7619 30.9524
6 70301 Male Majority 5 62 14.2857 17.8571
library(tidyverse)
ggplot(data=longdtafinal, aes(x=agemonth, y=read, group=id)) +
geom_point(size=rel(.5)) +
stat_smooth(mapping = NULL,
data = NULL,
geom = "smooth",
position = "identity",
method ="lm",
formula= y ~ x,
se=F,
fullrange = FALSE,
level = 0.95,
color="skyblue2",
linetype=1,
size=rel(.1)) +
facet_grid(rows = vars(race),
cols = vars(sex),
scales = "free",
space = "free",
shrink = T,
labeller = "label_value",
as.table = T,
switch = NULL,
drop = T,
margins = T, #F就變成原本的四格圖
facets = NULL) +
labs(x="Month", y="Reading score") +
theme_bw()
ggplot(data=longdtafinal, aes(x=agemonth, y=read, group=id)) +
geom_point(size=rel(.5)) +
stat_smooth(mapping = NULL,
data = NULL,
geom = "smooth",
position = "identity",
method ="lm",
formula= y ~ x,
se=F,
fullrange = FALSE,
level = 0.95,
color="skyblue2",
linetype=1,
size=rel(.1)) +
facet_grid(rows = vars(race),
cols = vars(sex),
scales = "free",
space = "free",
shrink = T,
labeller = "label_value",
as.table = T,
switch = NULL,
drop = T,
margins = F,
facets = NULL) +
labs(x="Month", y="Reading score") +
theme_bw()
author: “Chang-Chun Chen” date: “2020-09-20”
dta <- read.table("C:\\Users\\88696\\Desktop\\20200914\\homework4\\verbalIQ.txt",header=T)
head (dta)
school pupil viq language csize ses
1 1 17001 15.0 46 29 23
2 1 17002 14.5 45 29 10
3 1 17003 9.5 33 29 15
4 1 17004 11.0 46 29 23
5 1 17005 8.0 20 29 10
6 1 17006 9.5 30 29 10
library(tidyverse)
ggplot(data=dta, aes(x=viq, y=language, group = school)) +
geom_point() +
geom_smooth(method="lm",
formula= y ~ x,
se=F,
color="gray",
linetype=2,
size=rel(.5)) +
labs(x="Verbal IQ",
y="language") +
theme_bw()
mlang <- with(dta, tapply(language, school, mean))
mviq <- with(dta, tapply(viq, school, mean))
model=lm(formula = mlang ~ mviq, data = dta)
summary(model)
Call:
lm(formula = mlang ~ mviq, data = dta)
Residuals:
Min 1Q Median 3Q Max
-8.416 -1.751 0.046 2.218 6.889
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -7.269 3.431 -2.12 0.036
mviq 4.049 0.292 13.87 <2e-16
Residual standard error: 3.35 on 129 degrees of freedom
Multiple R-squared: 0.598, Adjusted R-squared: 0.595
F-statistic: 192 on 1 and 129 DF, p-value: <2e-16
with(dta, cor(viq, language, method = "pearson"))
[1] 0.60982
library(tidyverse)
library(ggplot2)
library(nlme)
library(dplyr)
library(gapminder)
ggplot(data=dta, aes(x=viq, y=language)) +
geom_point() +
geom_smooth(method="lm",
formula= y ~ x,
se=F,
level = 0.95,
color="red",
linetype=2,
size=rel(.5)) +
labs(x="Verbal IQ", y="language") +
theme_bw()
##The end