In-class exercise

In-class exercise 2

# Read the Dutch language/arithmetic data (first line holds the column
# names) and preview the first rows.
dta2 <- read.table(file = "langMathDutch.txt", header = TRUE)
head(dta2)

##   school pupil  IQV size lang arith
## 1      1 17001 15.0   29   46    24
## 2      1 17002 14.5   29   45    19
## 3      1 17003  9.5   29   33    24
## 4      1 17004 11.0   29   46    26
## 5      1 17005  8.0   29   20     9
## 6      1 17006  9.5   29   30    13

# Inspect the structure of dta2 (number of rows and the type of each column).
str(dta2)

## 'data.frame':    2287 obs. of  6 variables:
##  $ school: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pupil : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
##  $ IQV   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ size  : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ lang  : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ arith : int  24 19 24 26 9 13 13 30 23 22 ...

# Generate new variables (classlevel, IQlevel): cut class size and verbal IQ
# into three bands at the 33rd and 67th percentiles.
# cut() returns an ordered factor only when `ordered_result = TRUE`; any
# misspelled argument is silently swallowed by `...`, so the name is written
# out in full for both variables. `include.lowest = TRUE` keeps the minimum
# value inside the first band instead of turning it into NA.
dta2$classlevel <- cut(dta2$size,
                       breaks = quantile(dta2$size, probs = c(0, .33, .67, 1)),
                       labels = c("Small", "Medium", "Large"),
                       ordered_result = TRUE, include.lowest = TRUE)

dta2$IQlevel <- cut(dta2$IQV,
                    breaks = quantile(dta2$IQV, probs = c(0, .33, .67, 1)),
                    labels = c("Low", "Middle", "High"),
                    ordered_result = TRUE, include.lowest = TRUE)

head(dta2)

##   school pupil  IQV size lang arith classlevel IQlevel
## 1      1 17001 15.0   29   46    24      Large    High
## 2      1 17002 14.5   29   45    19      Large    High
## 3      1 17003  9.5   29   33    24      Large     Low
## 4      1 17004 11.0   29   46    26      Large     Low
## 5      1 17005  8.0   29   20     9      Large     Low
## 6      1 17006  9.5   29   30    13      Large     Low

library(ggplot2)
# Arithmetic score against language score with a fitted linear trend,
# drawn in a separate panel for every classlevel x IQlevel combination.
ggplot(data = dta2, mapping = aes(x = lang, y = arith)) +
  geom_point(shape = 23, fill = "black") +
  stat_smooth(method = "lm") +
  facet_wrap(. ~ classlevel:IQlevel) +
  labs(x = "Language score", y = "Arithmetic score")

## `geom_smooth()` using formula 'y ~ x'

In-class exercise 3

# Take the USPersonalExpenditure data from the datasets package and preview it.
dta3 <- datasets::USPersonalExpenditure
head(dta3)

##                       1940   1945  1950 1955  1960
## Food and Tobacco    22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health   3.530  5.760  9.71 14.0 21.10
## Personal Care        1.040  1.980  2.45  3.4  5.40
## Private Education    0.341  0.974  1.80  2.6  3.64

library(reshape2)
# Reshape to long form (one row per category/year pair) and give the three
# resulting columns descriptive names.
dta3 <- melt(dta3)
names(dta3) <- c("category", "year", "expenditure")

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(magrittr)
# Store the natural log of expenditure in a new column (the original
# `expenditure` values are kept), then centre it on its overall mean.
dta3 <- dta3 %>%
  mutate(expenditurelog = log(expenditure),
         excess = expenditurelog - mean(expenditurelog))
head(dta3)

##              category year expenditure expenditurelog     excess
## 1    Food and Tobacco 1940      22.200     3.10009229  0.9854839
## 2 Household Operation 1940      10.500     2.35137526  0.2367669
## 3  Medical and Health 1940       3.530     1.26129787 -0.8533105
## 4       Personal Care 1940       1.040     0.03922071 -2.0753877
## 5   Private Education 1940       0.341    -1.07587280 -3.1904812
## 6    Food and Tobacco 1945      44.500     3.79548919  1.6808808

In-class exercise 4

Autism spectrum disorder

# Take the autism data set from the WWGbook package and preview it.
dta4 <- WWGbook::autism
head(dta4)

##   age vsae sicdegp childid
## 1   2    6       3       1
## 2   3    7       3       1
## 3   5   18       3       1
## 4   9   25       3       1
## 5  13   27       3       1
## 6   2   17       3       3

# Generate new variables (group, centered_age).
# group: sicdegp (1, 2, 3) cut into the intervals (0,1], (1,2], (2,3] and
# labelled Low / Medium / High. cut() produces an ordered factor only through
# `ordered_result = TRUE`; a misspelled argument name is silently ignored.
dta4$group <- cut(dta4$sicdegp,
                  breaks = seq(0, 3, 1),
                  labels = c("Low", "Medium", "High"),
                  ordered_result = TRUE)
# centered_age: age minus the overall mean age.
dta4 <- dta4 %>% mutate(centered_age = age - mean(age))
head(dta4)

##   age vsae sicdegp childid group centered_age
## 1   2    6       3       1  High   -3.7712418
## 2   3    7       3       1  High   -2.7712418
## 3   5   18       3       1  High   -0.7712418
## 4   9   25       3       1  High    3.2287582
## 5  13   27       3       1  High    7.2287582
## 6   2   17       3       3  High   -3.7712418

# VSAE score against centred age: raw points, one faint grey trajectory per
# child, and an overall linear fit, in one panel per group.
ggplot(data = dta4, mapping = aes(x = centered_age, y = vsae)) +
  geom_point() +
  geom_line(mapping = aes(group = childid), colour = "grey50", alpha = 0.3) +
  stat_smooth(method = "lm") +
  facet_grid(. ~ group) +
  labs(x = "Age (in years, centered)", y = "VSAE score")

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).

## Warning: Removed 1 row(s) containing missing values (geom_path).

In-class exercise 5

diabetes dataset

# Read the comma-separated diabetes data (header row present) and preview it.
dta5 <- read.csv(file = "diabetes_mell.csv", header = TRUE, sep = ",")
head(dta5)

##    SEQN RIAGENDR RIDRETH1 DIQ010 BMXBMI  gender     race diabetes           BMI
## 1 51624        1        3      2  32.22   Males    White       No    Overweight
## 2 51626        1        4      2  22.00   Males    Black       No Normal weight
## 3 51627        1        4      2  18.22   Males    Black       No Normal weight
## 4 51628        2        4      1  42.39 Females    Black      Yes    Overweight
## 5 51629        1        1      2  32.61   Males Hispanic       No    Overweight
## 6 51630        2        3      2  30.57 Females    White       No    Overweight

# Inspect the structure of dta5 (number of rows and the type of each column).
str(dta5)

## 'data.frame':    8706 obs. of  9 variables:
##  $ SEQN    : int  51624 51626 51627 51628 51629 51630 51632 51633 51634 51635 ...
##  $ RIAGENDR: int  1 1 1 2 1 2 1 1 1 1 ...
##  $ RIDRETH1: int  3 4 4 4 1 3 2 3 1 3 ...
##  $ DIQ010  : int  2 2 2 1 2 2 2 2 2 1 ...
##  $ BMXBMI  : num  32.2 22 18.2 42.4 32.6 ...
##  $ gender  : Factor w/ 2 levels "Females","Males": 2 2 2 1 2 1 2 2 2 2 ...
##  $ race    : Factor w/ 3 levels "Black","Hispanic",..: 3 1 1 1 2 3 2 3 2 3 ...
##  $ diabetes: Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 2 ...
##  $ BMI     : Factor w/ 2 levels "Normal weight",..: 2 1 1 2 2 2 1 2 1 2 ...

# Keep the four categorical variables, then use xtabs() to count every
# race x gender x diabetes x BMI combination; wrapping the table in
# data.frame() yields one row per combination with its count in `Freq`.
dta5 <- select(dta5, race, gender, diabetes, BMI)
dta51 <- data.frame(xtabs(~ race + gender + diabetes + BMI, data = dta5))
head(dta51)

##       race  gender diabetes           BMI Freq
## 1    Black Females       No Normal weight  347
## 2 Hispanic Females       No Normal weight  712
## 3    White Females       No Normal weight  998
## 4    Black   Males       No Normal weight  429
## 5 Hispanic   Males       No Normal weight  706
## 6    White   Males       No Normal weight  873

library(ggalluvial)
# Alluvial diagram of the frequency table: the three axes are race, gender
# and diabetes status, flow width is the count (Freq), and flows are
# coloured by BMI class.
ggplot(dta51,
       aes(axis1 = race,
           axis2 = gender,
           axis3 = diabetes,
           y = Freq)) +
  scale_x_discrete(limits = c("race", "gender", "diabetes"),
                   expand = c(.1, .05)) +
  labs(x = "",
       y = "No. individuals") +
  geom_alluvium(aes(fill = BMI)) +
  geom_stratum() +
  # Label each stratum with its level name. `infer.label = TRUE` is
  # deprecated in ggalluvial; the computed `stratum` variable is mapped
  # explicitly with after_stat() instead.
  geom_text(stat = "stratum",
            aes(label = after_stat(stratum))) +
  scale_fill_manual(values = c("gray", "darkorange")) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  ggtitle("Diabetes in overall population in US 2009-2010",
          subtitle = "stratified by race, gender and diabetes mellitus")

In-class exercise 6

Find out what each code chunk (indicated by “##”) does

## Look up the R documentation for the ggplot2 package
library(ggplot2)
?ggplot2

## Load the gapminder package so it is ready to use (it must already be installed)

library(gapminder)

## Load the data set and take a quick look at its structure
data(gapminder)
str(gapminder)

## Classes 'tbl_df', 'tbl' and 'data.frame':    1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ pop      : int  8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num  779 821 853 836 740 ...

## Keep a copy of the data under the name `gap`
gap <- gapminder

## Map lifeExp to the x axis; with no geom layer this draws a blank canvas
ggplot(gap, aes(lifeExp))

## Draw a histogram
ggplot(gap, aes(lifeExp)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Set the histogram fill colour, plus the title and x/y axis labels
ggplot(gap, aes(lifeExp)) +
  geom_histogram(bins = 10, colour = "black", fill = "blue") +
  labs(title = "Life expectancy for the gap set",
       x = "Life expectancy(year)",
       y = "Frequency") +
  theme_classic()

## Draw box plots of life expectancy, one box and fill colour per continent
ggplot(gap, aes(continent, lifeExp, fill = continent)) +
  geom_boxplot() +
  labs(x = "Continent", y = "Life expectancy (years)") +
  theme_minimal() # +

  # guides(fill = FALSE) # adding this line makes the legend disappear

## Scatter plot of GDP per capita against life expectancy; continent sets
## both the colour and the shape of the points
ggplot(gap, aes(lifeExp, gdpPercap, colour = continent, shape = continent)) +
  geom_point(size = 5, alpha = 0.5) +
  labs(title = "Scatterplot of life expectancy by gdpPercap",
       x = "Life expectancy (year)",
       y = "gdpPercap (USD)") +
  theme_classic() +
  theme(
    legend.position = "top",
    plot.title = element_text(hjust = .05, size = 20),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 5),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# The second theme() call adjusts the title and the legend text sizes;
# its last line rotates the x-axis text.

Homework exercise

Exercise1

Anxiety (female vs. male)

# Read the state-anxiety scores (first line holds the column names) and
# preview the first rows.
dtae1 <- read.table(file = "stateAnxiety.txt", header = TRUE)
head(dtae1)

##   f1 f2 f3 f4 f5 m1 m2 m3 m4 m5
## 1 13 17 18 20 24  6 14 22 20 24
## 2 26 31 33 38 42  4 11 14 12 23
## 3 13 17 24 29 32 17 25 26 29 38
## 4 22 24 26 27 29 19 22 26 30 34
## 5 18 19 19 22 30 12 21 21 23 24
## 6 32 31 30 31 32 11 16 20 19 22

library(stringr)
# Split the wide table: columns 1-5 are the female scores, 6-10 the male ones.
dtaf <- dtae1[, 1:5]
dtam <- dtae1[, 6:10]

# Wide form to long form (female)
dtaf_long <- melt(dtaf) %>%
  mutate(
    # first character of `variable` becomes the new "gender" variable
    gender = str_sub(variable, 1, 1),
    # second character of `variable` becomes the new "weeks" variable
    weeks = str_sub(variable, 2, 2)
  )

## No id variables; using all as measure variables

# Give each female participant an id (1, 2, ...), repeated once per week
# column. melt() stacks the columns one after another, so the row order
# within each column is the participant order. The counts come from the
# data rather than being hard-coded.
dtaf_long$id <- rep(seq_len(nrow(dtaf)), times = ncol(dtaf))
head(dtaf_long)

##   variable value gender weeks id
## 1       f1    13      f     1  1
## 2       f1    26      f     1  2
## 3       f1    13      f     1  3
## 4       f1    22      f     1  4
## 5       f1    18      f     1  5
## 6       f1    32      f     1  6

# Do the same for the male data: wide to long, then split the column name
# into its gender letter and week digit.
dtam_long <- melt(dtam) %>%
  mutate(
    gender = str_sub(variable, 1, 1),
    weeks = str_sub(variable, 2, 2)
  )

## No id variables; using all as measure variables

# Male ids continue after the last female id so that ids stay unique once
# the two tables are combined; they are repeated once per week column.
# The counts come from the data rather than being hard-coded.
dtam_long$id <- rep(nrow(dtaf) + seq_len(nrow(dtam)), times = ncol(dtam))
head(dtam_long)

##   variable value gender weeks id
## 1       m1     6      m     1 51
## 2       m1     4      m     1 52
## 3       m1    17      m     1 53
## 4       m1    19      m     1 54
## 5       m1    12      m     1 55
## 6       m1    11      m     1 56

# Combine the male and female long tables, drop the first column
# (`variable`), and rename the remaining columns.
dtat <- rbind(dtam_long, dtaf_long)[, -1]
names(dtat) <- c("scores", "gender", "weeks", "id")
head(dtat)

##   scores gender weeks id
## 1      6      m     1 51
## 2      4      m     1 52
## 3     17      m     1 53
## 4     19      m     1 54
## 5     12      m     1 55
## 6     11      m     1 56

# Mean anxiety score with +/- 1 standard error bars for each gender and week.
# Lines, points and error bars all use the same dodge so the two genders
# stay aligned with each other in every layer. geom_line() has no `width`
# parameter (it only triggered an "unknown parameters" warning), so it is
# not passed.
pd <- position_dodge(.1)
dtat %>%
  group_by(gender, weeks) %>%
  summarise(mean = mean(scores),
            se = sd(scores) / sqrt(n())) %>%
  ggplot() +
    aes(x = weeks, y = mean, group = gender) +
    geom_line(aes(linetype = gender), size = .3, position = pd) +
    geom_point(aes(shape = gender, color = gender), size = 3, position = pd) +
    geom_errorbar(aes(ymin = mean - se, ymax = mean + se),
                  width = .1, size = .2, position = pd) +
    labs(x = "weeks before exams", y = "mean anxiety score")

## Warning: Ignoring unknown parameters: width

0420_Grammar of Graphics exercise

yuwenchen

4/23/2020

In-class exercise

In-class exercise 2

In-class exercise 3

In-class exercise 4

In-class exercise 5

In-class exercise 6

Homework exercise

Exercise1