# Read langMathDutch.txt (whitespace-delimited, first row holds the column
# names), then inspect the first rows and the column types.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dta2 <- read.table("langMathDutch.txt", header = TRUE)
head(dta2)
## school pupil IQV size lang arith
## 1 1 17001 15.0 29 46 24
## 2 1 17002 14.5 29 45 19
## 3 1 17003 9.5 29 33 24
## 4 1 17004 11.0 29 46 26
## 5 1 17005 8.0 29 20 9
## 6 1 17006 9.5 29 30 13
str(dta2)
## 'data.frame': 2287 obs. of 6 variables:
## $ school: int 1 1 1 1 1 1 1 1 1 1 ...
## $ pupil : int 17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
## $ IQV : num 15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
## $ size : int 29 29 29 29 29 29 29 29 29 29 ...
## $ lang : int 46 45 33 46 20 30 30 57 36 36 ...
## $ arith : int 24 19 24 26 9 13 13 30 23 22 ...
# Generate new variables (classlevel, IQlevel): cut size and IQV into three
# bins at their 33% and 67% quantiles and store them as ordered factors.
# include.lowest = TRUE keeps the minimum value inside the first bin.
# The argument is spelled ordered_result in full: a misspelt name is silently
# swallowed by cut()'s `...` and leaves the factor unordered.
dta2$classlevel <- with(dta2, cut(size,
  breaks = quantile(size, probs = c(0, .33, .67, 1)),
  labels = c("Small", "Medium", "Large"),
  ordered_result = TRUE, include.lowest = TRUE))
dta2$IQlevel <- with(dta2, cut(IQV,
  breaks = quantile(IQV, probs = c(0, .33, .67, 1)),
  labels = c("Low", "Middle", "High"),
  ordered_result = TRUE, include.lowest = TRUE))
head(dta2)
## school pupil IQV size lang arith classlevel IQlevel
## 1 1 17001 15.0 29 46 24 Large High
## 2 1 17002 14.5 29 45 19 Large High
## 3 1 17003 9.5 29 33 24 Large Low
## 4 1 17004 11.0 29 46 26 Large Low
## 5 1 17005 8.0 29 20 9 Large Low
## 6 1 17006 9.5 29 30 13 Large Low
library(ggplot2)
# Arithmetic score against language score, one panel per combination of
# class-size level and IQ level, each with a fitted linear regression line.
ggplot(data = dta2, mapping = aes(x = lang, y = arith)) +
  geom_point(fill = "black", shape = 23) +
  stat_smooth(method = "lm") +
  facet_wrap(. ~ classlevel:IQlevel) +
  xlab("Language score") +
  ylab("Arithmetic score")
## `geom_smooth()` using formula 'y ~ x'
# Start from the built-in USPersonalExpenditure matrix
# (rows are expenditure categories, columns are years).
dta3 <- datasets::USPersonalExpenditure
head(dta3)
## 1940 1945 1950 1955 1960
## Food and Tobacco 22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health 3.530 5.760 9.71 14.0 21.10
## Personal Care 1.040 1.980 2.45 3.4 5.40
## Private Education 0.341 0.974 1.80 2.6 3.64
library(reshape2)
# Melt the matrix into long form (one row per category/year pair) and give
# the three resulting columns descriptive names in a single step.
dta3 <- setNames(melt(dta3), c("category", "year", "expenditure"))
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
# Add two columns: expenditurelog, the natural log of expenditure (the
# original expenditure column is kept), and excess, the deviation of each
# log value from the overall mean of the log values.
dta3 <- dta3 %>%
  mutate(
    expenditurelog = log(expenditure),
    excess = expenditurelog - mean(expenditurelog)
  )
head(dta3)
## category year expenditure expenditurelog excess
## 1 Food and Tobacco 1940 22.200 3.10009229 0.9854839
## 2 Household Operation 1940 10.500 2.35137526 0.2367669
## 3 Medical and Health 1940 3.530 1.26129787 -0.8533105
## 4 Personal Care 1940 1.040 0.03922071 -2.0753877
## 5 Private Education 1940 0.341 -1.07587280 -3.1904812
## 6 Food and Tobacco 1945 44.500 3.79548919 1.6808808
# Autism spectrum disorder ----
# Load the autism data shipped with the WWGbook package and inspect it.
dta4 <- WWGbook::autism
head(dta4)
## age vsae sicdegp childid
## 1 2 6 3 1
## 2 3 7 3 1
## 3 5 18 3 1
## 4 9 25 3 1
## 5 13 27 3 1
## 6 2 17 3 3
# Generate new variables (group, centered_age).
# group: sicdegp cut on the breaks 0, 1, 2, 3 and labelled Low/Medium/High,
# stored as an ordered factor. The argument is spelled ordered_result in
# full: a misspelt name is silently swallowed by cut()'s `...`.
dta4$group <- with(dta4, cut(sicdegp,
  breaks = seq(0, 3, 1),
  labels = c("Low", "Medium", "High"),
  ordered_result = TRUE))
# centered_age: age minus the mean age over all rows.
dta4 <- dta4 %>% mutate(centered_age = age - mean(age))
head(dta4)
## age vsae sicdegp childid group centered_age
## 1 2 6 3 1 High -3.7712418
## 2 3 7 3 1 High -2.7712418
## 3 5 18 3 1 High -0.7712418
## 4 9 25 3 1 High 3.2287582
## 5 13 27 3 1 High 7.2287582
## 6 2 17 3 3 High -3.7712418
# VSAE score against centered age, one panel per group: raw points, a faint
# grey trajectory per child, and a fitted linear regression line.
ggplot(data = dta4, mapping = aes(x = centered_age, y = vsae)) +
  geom_point() +
  geom_line(mapping = aes(group = childid), alpha = .3, color = "grey50") +
  stat_smooth(method = "lm") +
  facet_grid(. ~ group) +
  xlab("Age (in years, centered)") +
  ylab("VSAE score")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
# Diabetes dataset ----
# Read the comma-separated diabetes file and inspect it.
# stringsAsFactors = TRUE is explicit so that gender, race, diabetes and BMI
# are read as factors (as in the recorded str() output) on every R version;
# since R 4.0 read.csv() would otherwise leave them as character vectors.
dta5 <- read.csv("diabetes_mell.csv", header = TRUE, sep = ",",
                 stringsAsFactors = TRUE)
head(dta5)
## SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI gender race diabetes BMI
## 1 51624 1 3 2 32.22 Males White No Overweight
## 2 51626 1 4 2 22.00 Males Black No Normal weight
## 3 51627 1 4 2 18.22 Males Black No Normal weight
## 4 51628 2 4 1 42.39 Females Black Yes Overweight
## 5 51629 1 1 2 32.61 Males Hispanic No Overweight
## 6 51630 2 3 2 30.57 Females White No Overweight
str(dta5)
## 'data.frame': 8706 obs. of 9 variables:
## $ SEQN : int 51624 51626 51627 51628 51629 51630 51632 51633 51634 51635 ...
## $ RIAGENDR: int 1 1 1 2 1 2 1 1 1 1 ...
## $ RIDRETH1: int 3 4 4 4 1 3 2 3 1 3 ...
## $ DIQ010 : int 2 2 2 1 2 2 2 2 2 1 ...
## $ BMXBMI : num 32.2 22 18.2 42.4 32.6 ...
## $ gender : Factor w/ 2 levels "Females","Males": 2 2 2 1 2 1 2 2 2 2 ...
## $ race : Factor w/ 3 levels "Black","Hispanic",..: 3 1 1 1 2 3 2 3 2 3 ...
## $ diabetes: Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 2 ...
## $ BMI : Factor w/ 2 levels "Normal weight",..: 2 1 1 2 2 2 1 2 1 2 ...
# Keep only the four categorical columns used below.
dta5 <- select(dta5, race, gender, diabetes, BMI)
# Cross-tabulate race, gender, diabetes and BMI with xtabs() and turn the
# table into a data frame holding one row per combination plus its count (Freq).
dta51 <- data.frame(xtabs(~ race + gender + diabetes + BMI, data = dta5))
head(dta51)
## race gender diabetes BMI Freq
## 1 Black Females No Normal weight 347
## 2 Hispanic Females No Normal weight 712
## 3 White Females No Normal weight 998
## 4 Black Males No Normal weight 429
## 5 Hispanic Males No Normal weight 706
## 6 White Males No Normal weight 873
library(ggalluvial)
# Alluvial diagram of the frequency table: three axes (race, gender,
# diabetes), flow width given by Freq, flows coloured by BMI category.
ggplot(dta51,
       aes(axis1 = race,
           axis2 = gender,
           axis3 = diabetes,
           y = Freq)) +
  # Name the three axes on the x scale.
  scale_x_discrete(limits = c("race",
                              "gender",
                              "diabetes"),
                   expand = c(.1, .05)) +
  labs(x = "",
       y = "No. individuals") +
  geom_alluvium(aes(fill = BMI)) +
  geom_stratum() +
  # Label each stratum with its value. This replaces the deprecated
  # infer.label = TRUE argument of ggalluvial.
  geom_text(stat = "stratum",
            aes(label = after_stat(stratum))) +
  scale_fill_manual(values = c("gray", "darkorange")) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  ggtitle("Diabetes in overall population in US 2009-2010",
          subtitle = "stratified by race, gender and diabetes mellitus")
# Explain what each code chunk does (explanations are marked with "##") ----
## Open the R documentation page of the ggplot2 package
library(ggplot2)
help("ggplot2")
## Attach the gapminder package so its data can be used (it must already be installed)
library(gapminder)
## Load the gapminder data set and take a quick look at its structure
data("gapminder")
str(gapminder)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num 779 821 853 836 740 ...
## Store the data under the shorter name gap
gap <- gapminder
## Map lifeExp to the x axis; with no geom layer this draws only an empty canvas
ggplot(gap, aes(lifeExp))
## Draw a histogram of lifeExp
ggplot(gap, aes(lifeExp)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Set the histogram's fill and outline colours, the number of bins,
## and the title and x/y axis labels
ggplot(gap, aes(lifeExp)) +
  geom_histogram(bins = 10, color = "black", fill = "blue") +
  labs(title = "Life expectancy for the gap set",
       x = "Life expectancy(year)",
       y = "Frequency") +
  theme_classic()
## Draw box plots of lifeExp, one box per continent, each filled with its own colour
ggplot(gap, aes(continent, lifeExp, fill = continent)) +
  geom_boxplot() +
  labs(x = "Continent", y = "Life expectancy (years)") +
  theme_minimal() #+
#guides(fill = FALSE) # adding this line makes the legend disappear
## Scatter plot of gdpPercap against lifeExp; both colour and point shape
## encode the continent
ggplot(gap, aes(lifeExp, gdpPercap, color = continent, shape = continent)) +
  geom_point(alpha = .5, size = 5) +
  theme_classic() +
  labs(title = "Scatterplot of life expectancy by gdpPercap",
       x = "Life expectancy (year)",
       y = "gdpPercap (USD)") +
  # This second theme() call must come after theme_classic(): it moves the
  # legend to the top, sets the title and legend text sizes, and its last
  # line rotates the x-axis text by 45 degrees.
  theme(legend.position = "top",
        plot.title = element_text(hjust = .05, size = 20),
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 5),
        axis.text.x = element_text(angle = 45, hjust = 1))
# Anxiety (female vs. male) ----
# Read stateAnxiety.txt (whitespace-delimited, first row holds the column
# names) and inspect the first rows.
dtae1 <- read.table("stateAnxiety.txt", header = TRUE)
head(dtae1)
## f1 f2 f3 f4 f5 m1 m2 m3 m4 m5
## 1 13 17 18 20 24 6 14 22 20 24
## 2 26 31 33 38 42 4 11 14 12 23
## 3 13 17 24 29 32 17 25 26 29 38
## 4 22 24 26 27 29 19 22 26 30 34
## 5 18 19 19 22 30 12 21 21 23 24
## 6 32 31 30 31 32 11 16 20 19 22
library(stringr)
# Split the columns by name prefix rather than by position: f1..f5 hold the
# female scores and m1..m5 the male scores. drop = FALSE keeps a data frame
# even if only one column matches.
dtaf <- dtae1[, startsWith(names(dtae1), "f"), drop = FALSE]
dtam <- dtae1[, startsWith(names(dtae1), "m"), drop = FALSE]
# Wide form to long form (female)
dtaf_long <- dtaf %>%
  melt() %>%
  mutate(
    # gender: the 1st character of variable ("f" from "f1")
    gender = str_sub(variable, 1, 1),
    # weeks: the 2nd character of variable ("1" from "f1")
    weeks = str_sub(variable, 2, 2)
  )
## No id variables; using all as measure variables
# melt() stacks the columns one below another, so the subject ids
# 1..nrow(dtaf) repeat once per column. They are derived from the data
# dimensions instead of being hard-coded as 50 subjects by 5 columns.
dtaf_long$id <- rep(seq_len(nrow(dtaf)), times = ncol(dtaf))
head(dtaf_long)
## variable value gender weeks id
## 1 f1 13 f 1 1
## 2 f1 26 f 1 2
## 3 f1 13 f 1 3
## 4 f1 22 f 1 4
## 5 f1 18 f 1 5
## 6 f1 32 f 1 6
# Wide form to long form (male): same steps as for the female data
dtam_long <- dtam %>%
  melt() %>%
  mutate(
    # gender: the 1st character of variable ("m" from "m1")
    gender = str_sub(variable, 1, 1),
    # weeks: the 2nd character of variable ("1" from "m1")
    weeks = str_sub(variable, 2, 2)
  )
## No id variables; using all as measure variables
# Male ids continue after the female ids (51..100 for 50 rows each) and
# repeat once per melted column. They are derived from the data dimensions
# instead of being hard-coded.
dtam_long$id <- rep(nrow(dtaf) + seq_len(nrow(dtam)), times = ncol(dtam))
head(dtam_long)
## variable value gender weeks id
## 1 m1 6 m 1 51
## 2 m1 4 m 1 52
## 3 m1 17 m 1 53
## 4 m1 19 m 1 54
## 5 m1 12 m 1 55
## 6 m1 11 m 1 56
# Stack the male and female long tables and drop the first column (variable),
# then give the four remaining columns their final names.
dtat <- rbind(dtam_long, dtaf_long)[, -1]
names(dtat) <- c("scores", "gender", "weeks", "id")
head(dtat)
## scores gender weeks id
## 1 6 m 1 51
## 2 4 m 1 52
## 3 17 m 1 53
## 4 19 m 1 54
## 5 12 m 1 55
## 6 11 m 1 56
# Mean anxiety score per gender and week, with +/- 1 standard error bars.
# One dodge object is shared by every layer so that the lines, points and
# error bars of a given gender are shifted by the same amount and stay aligned.
pd <- position_dodge(.1)
dtat %>%
  group_by(gender, weeks) %>%
  summarise(mean = mean(scores),
            se = sd(scores) / sqrt(n())) %>%
  ggplot() +
  aes(x = weeks, y = mean, group = gender, shape = gender) +
  # geom_line() has no width parameter; passing one only raised an
  # "Ignoring unknown parameters: width" warning, so it is omitted.
  geom_line(aes(linetype = gender), size = .3, position = pd) +
  geom_point(aes(shape = gender, color = gender), size = 3, position = pd) +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se),
                width = .1, size = .2, position = pd) +
  labs(x = "weeks before exams", y = "mean anxiety score")