In-class exercises1

plot(women, type='n') 
points(women[1,]) #顯示women第一列資料

##以lattice畫出Y軸為Weight x軸為height 點出所切割處理women第一列資料
lattice::xyplot(weight ~ height, 
  data=women,
  subset=row.names(women)==1, type='p')

###以ggplot 點出women中Y軸為Weight x軸為height women第一列資料
library(ggplot2)
ggplot(data=women[1,], aes(height, weight))+  geom_point()

In-class exercises2

#資料匯入、檢視資料

dta <- read.table("/Users/User/Desktop/DM_R/hk0420/langMathDutch.txt", header=T , stringsAsFactor=F, fill=T )
dim(dta)

## [1] 2287    6

str(dta)

## 'data.frame':    2287 obs. of  6 variables:
##  $ school: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pupil : int  17001 17002 17003 17004 17005 17006 17007 17008 17009 17010 ...
##  $ IQV   : num  15 14.5 9.5 11 8 9.5 9.5 13 9.5 11 ...
##  $ size  : int  29 29 29 29 29 29 29 29 29 29 ...
##  $ lang  : int  46 45 33 46 20 30 30 57 36 36 ...
##  $ arith : int  24 19 24 26 9 13 13 30 23 22 ...

head(dta)

##   school pupil  IQV size lang arith
## 1      1 17001 15.0   29   46    24
## 2      1 17002 14.5   29   45    19
## 3      1 17003  9.5   29   33    24
## 4      1 17004 11.0   29   46    26
## 5      1 17005  8.0   29   20     9
## 6      1 17006  9.5   29   30    13

##classified IQ
quantile(dta$IQV,seq(0, 1, 1/3))

##        0% 33.33333% 66.66667%      100% 
##       4.0      11.0      12.5      18.0

classIQ <- with(dta, cut(IQV, ordered=T, breaks=c(0, 11, 12.5, 18), labels=c("Low", "Middle","High")))
newdta <- cbind(dta,classIQ)
head(newdta)

##   school pupil  IQV size lang arith classIQ
## 1      1 17001 15.0   29   46    24    High
## 2      1 17002 14.5   29   45    19    High
## 3      1 17003  9.5   29   33    24     Low
## 4      1 17004 11.0   29   46    26     Low
## 5      1 17005  8.0   29   20     9     Low
## 6      1 17006  9.5   29   30    13     Low

##classified size
quantile(dta$size,seq(0, 1, 1/3))

##        0% 33.33333% 66.66667%      100% 
##         5        20        27        37

classsize <- with(dta, cut(size, ordered=T, breaks=c(0, 20, 27, 37), labels=c("Small", "Medium", "Large")))

ndta <- cbind(newdta,classsize)
head(ndta)

##   school pupil  IQV size lang arith classIQ classsize
## 1      1 17001 15.0   29   46    24    High     Large
## 2      1 17002 14.5   29   45    19    High     Large
## 3      1 17003  9.5   29   33    24     Low     Large
## 4      1 17004 11.0   29   46    26     Low     Large
## 5      1 17005  8.0   29   20     9     Low     Large
## 6      1 17006  9.5   29   30    13     Low     Large

##cbind classsize&classIQ  (use , split)
IQsize <- with(ndta,paste(classsize,classIQ,sep = ","))
head(IQsize)

## [1] "Large,High" "Large,High" "Large,Low"  "Large,Low"  "Large,Low" 
## [6] "Large,Low"

resultdata <- cbind(ndta,IQsize)
head(resultdata)

##   school pupil  IQV size lang arith classIQ classsize     IQsize
## 1      1 17001 15.0   29   46    24    High     Large Large,High
## 2      1 17002 14.5   29   45    19    High     Large Large,High
## 3      1 17003  9.5   29   33    24     Low     Large  Large,Low
## 4      1 17004 11.0   29   46    26     Low     Large  Large,Low
## 5      1 17005  8.0   29   20     9     Low     Large  Large,Low
## 6      1 17006  9.5   29   30    13     Low     Large  Large,Low

#geom_point#點#透明度,method=“lm”#線性,se=T##陰影部分,facet_wrap(~ IQsize)依照IQsize排列

p1 <-  ggplot(resultdata, 
             aes(lang, arith)) +
  geom_point(size=rel(1),
             alpha=1) +
  stat_smooth(method="lm", 
              formula=y ~ x,
              se=T,     
              col='blue') +
  facet_wrap(~ IQsize) + 
  labs(x="Language score", y="Arithmetic score") +
  theme_bw() 
p1

In-class exercises3

#資料匯入、檢視資料

dta <- datasets::USPersonalExpenditure
dim(dta)

## [1] 5 5

str(dta)

##  num [1:5, 1:5] 22.2 10.5 3.53 1.04 0.341 44.5 15.5 5.76 1.98 0.974 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:5] "Food and Tobacco" "Household Operation" "Medical and Health" "Personal Care" ...
##   ..$ : chr [1:5] "1940" "1945" "1950" "1955" ...

head(dta)

##                       1940   1945  1950 1955  1960
## Food and Tobacco    22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health   3.530  5.760  9.71 14.0 21.10
## Personal Care        1.040  1.980  2.45  3.4  5.40
## Private Education    0.341  0.974  1.80  2.6  3.64

# transform to long form
library(reshape2)
mdta <- melt(dta)  
head(mdta)

##                  Var1 Var2  value
## 1    Food and Tobacco 1940 22.200
## 2 Household Operation 1940 10.500
## 3  Medical and Health 1940  3.530
## 4       Personal Care 1940  1.040
## 5   Private Education 1940  0.341
## 6    Food and Tobacco 1945 44.500

#Def. colname
colnames(mdta) <- c("category", "year", "expenditure")
head(mdta)

##              category year expenditure
## 1    Food and Tobacco 1940      22.200
## 2 Household Operation 1940      10.500
## 3  Medical and Health 1940       3.530
## 4       Personal Care 1940       1.040
## 5   Private Education 1940       0.341
## 6    Food and Tobacco 1945      44.500

##transform the dollar amounts to log base 10  
logexp <- round(log10(mdta$expenditure),3)
head(logexp)

## [1]  1.346  1.021  0.548  0.017 -0.467  1.648

#參考李唐榮同學

resultdata <- cbind(mdta,logexp)
##draw ggplot
library(ggplot2)
p0 <- ggplot(resultdata) +
              aes(x=logexp, y=category,color = logexp>mean(logexp))  +
              geom_point(size=rel(3)) +
              facet_grid(. ~ year) +
              geom_segment(aes(xend=logexp, yend=category, x=mean(logexp))) + 
              geom_vline(aes(xintercept=mean(logexp)))+
              theme_minimal()
       
p0

library(ggplot2)
p0 <- ggplot(mdta) +
             aes(x=logexp, y=category) +
             labs(x="log of Expenditures", y="Category")
p0

p1 <- p0 +
  geom_point()
p1

p2 <- p1 +
  facet_grid(.~ year)
p2

p3 <- p2 +
  geom_segment(aes(xend = mean(logexp), 
                   yend = category)) +
  geom_vline(aes(xintercept=mean(logexp))) +
  theme_minimal()
p3

In-class exercises4

#資料匯入、檢視資料

library(WWGbook)
dta <- na.omit(autism) #畫圖顯示有missing data，回頭移除NA
dim(dta)

## [1] 610   4

str(dta)

## 'data.frame':    610 obs. of  4 variables:
##  $ age    : int  2 3 5 9 13 2 3 5 9 13 ...
##  $ vsae   : int  6 7 18 25 27 17 18 12 18 24 ...
##  $ sicdegp: int  3 3 3 3 3 3 3 3 3 3 ...
##  $ childid: int  1 1 1 1 1 3 3 3 3 3 ...
##  - attr(*, "na.action")= 'omit' Named int  507 553
##   ..- attr(*, "names")= chr  "507" "553"

head(dta)

##   age vsae sicdegp childid
## 1   2    6       3       1
## 2   3    7       3       1
## 3   5   18       3       1
## 4   9   25       3       1
## 5  13   27       3       1
## 6   2   17       3       3

#Convert 123 to LMH.
SICLMH <- with(dta, cut(sicdegp, ordered=T, breaks=c(0, 1, 2, 3), labels=c("L", "M", "H")))
#cbind
newdta <- cbind(dta, SICLMH)
head(newdta)

##   age vsae sicdegp childid SICLMH
## 1   2    6       3       1      H
## 2   3    7       3       1      H
## 3   5   18       3       1      H
## 4   9   25       3       1      H
## 5  13   27       3       1      H
## 6   2   17       3       3      H

#Def. centered age
centerage <- round((newdta$age-mean(newdta$age)),3)
ndta <- cbind(newdta, centerage)
head(ndta)

##   age vsae sicdegp childid SICLMH centerage
## 1   2    6       3       1      H     -3.77
## 2   3    7       3       1      H     -2.77
## 3   5   18       3       1      H     -0.77
## 4   9   25       3       1      H      3.23
## 5  13   27       3       1      H      7.23
## 6   2   17       3       3      H     -3.77

library(ggplot2)
p0 <- ggplot(ndta) +
  aes(x=centerage, y=vsae) + 
  geom_point() +
  labs(x="Age (in years, centered)", y="VSAE score")
p0

p1 <- p0 +
  geom_line(aes(group=childid), color="gray", alpha=.5)
p1

p2 <- p1 +
  stat_smooth(data=ndta,
              formula = y ~ x,
              method = "lm",
              se=TRUE)
p2

p3 <- p2 +
  facet_wrap(.~ SICLMH)
p3

p4 <- p3 +
  scale_x_continuous(breaks=seq(-2.5, 7.5, 2.5))+
  scale_y_continuous(breaks=seq(0, 200, 50))  #調整xyscale
p4

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

nNAdta <- na.omit(ndta)
newdata<-nNAdta %>% mutate(age2 = age - 2) %>% 
  # group data
  group_by(SICLMH, age2) %>%
  # compute mean and standard error for vase
  summarize(vsaemean = mean(vsae), 
            vsaese = sd(vsae) / sqrt(n()))

head(newdata)

## # A tibble: 6 x 4
## # Groups:   SICLMH [2]
##   SICLMH  age2 vsaemean vsaese
##   <ord>  <dbl>    <dbl>  <dbl>
## 1 L          0     7     0.387
## 2 L          1    12.0   0.914
## 3 L          3    15.0   1.47 
## 4 L          7    25.6   4.74 
## 5 L         11    37.1   6.72 
## 6 M          0     8.67  0.435

##draw ggplot
gg3 <- ggplot(data=newdata) +
  aes(age2, vsaemean, group=SICLMH, shape=SICLMH) +
  geom_errorbar(aes(ymin=vsaemean - vsaese,
                    ymax=vsaemean + vsaese),
                width=.2, size=.3, 
                position=position_dodge(.5)) +
  geom_line(position=position_dodge(.5), 
            aes(linetype=SICLMH))+
  geom_point(position=position_dodge(.5), 
             size=rel(2))+
  scale_shape_manual(values = c(1, 2, 19))+
  labs(x="Age (in year -2)", y="VSAE score")+
  theme(legend.position= c(0.07,0.85),)
 
  
  
  
gg3

In-class exercises5

#資料匯入、檢視資料

dta <-read.csv("/Users/User/Desktop/DM_R/hk0420/diabetes_mell.csv", header = T)
dim(dta)

## [1] 8706    9

str(dta)

## 'data.frame':    8706 obs. of  9 variables:
##  $ SEQN    : int  51624 51626 51627 51628 51629 51630 51632 51633 51634 51635 ...
##  $ RIAGENDR: int  1 1 1 2 1 2 1 1 1 1 ...
##  $ RIDRETH1: int  3 4 4 4 1 3 2 3 1 3 ...
##  $ DIQ010  : int  2 2 2 1 2 2 2 2 2 1 ...
##  $ BMXBMI  : num  32.2 22 18.2 42.4 32.6 ...
##  $ gender  : Factor w/ 2 levels "Females","Males": 2 2 2 1 2 1 2 2 2 2 ...
##  $ race    : Factor w/ 3 levels "Black","Hispanic",..: 3 1 1 1 2 3 2 3 2 3 ...
##  $ diabetes: Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 2 ...
##  $ BMI     : Factor w/ 2 levels "Normal weight",..: 2 1 1 2 2 2 1 2 1 2 ...

dta <- data.frame(with(dta[,c("race" ,"gender", "diabetes",  "BMI")], xtabs(~ race + gender + diabetes + BMI)))
#用xtabs將 race + gender + diabetes + BMI合為data frame資料類型，算frequency
head(dta)

##       race  gender diabetes           BMI Freq
## 1    Black Females       No Normal weight  347
## 2 Hispanic Females       No Normal weight  712
## 3    White Females       No Normal weight  998
## 4    Black   Males       No Normal weight  429
## 5 Hispanic   Males       No Normal weight  706
## 6    White   Males       No Normal weight  873

library(ggplot2)
library(ggalluvial)
p0 <- ggplot(dta) +
      aes(axis1=race, axis2=gender, axis3=diabetes,
          y=Freq) + 
  scale_x_discrete(limits=c("race", "gender", "diabetes"), expand=c(.1, .05)) +
  labs(x="", y="NO. individuals") +
  geom_alluvium(aes(fill=BMI)) +
  geom_stratum() +  #變項框
  geom_text(stat="stratum", infer.label=TRUE) +
  scale_fill_manual(values=c("gray","orange")) +
  theme_minimal()
p0

###尚未轉換方向

In-class exercises6

install.packages(“gapminder”)

library(ggplot2)
library(gapminder)
data(gapminder) #下載gapminder
str(gapminder) #查看gapminder資料

## Classes 'tbl_df', 'tbl' and 'data.frame':    1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ pop      : int  8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num  779 821 853 836 740 ...

gap <- gapminder #指定資料為gap
ggplot(data = gap, aes(x = lifeExp)) #使用ggplot畫圖，資料為gap，並以lifeEXP資料為x

ggplot(data = gap, aes(x = lifeExp)) + 
    geom_histogram() #添上直方圖

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = gap, aes(x = lifeExp)) + 
  geom_histogram(fill = "blue", color = "black", bins = 10) + 
  ggtitle("Life expectancy for the gap dataset") + 
  xlab("Life expectancy (years)") + 
  ylab("Frequency") + 
  theme_classic()

#每長條寬度為10

ggplot(data = gap, aes(x = continent, y = lifeExp, fill = continent)) + 
  geom_boxplot() + 
  ggtitle("Boxplots for lifeExp by continent") + 
  xlab("Continent") + 
  ylab("Life expectancy (years)") +
  theme_minimal() +  #繪製以continebt為x lifeExp為y的盒狀圖增加title及xy標題
  guides(fill = FALSE) #圖例不見

#以x為lifeExp y為 gdpPercap 將contenent以不同顏色及型態呈現
ggplot(data = gap, aes(x = lifeExp, y = gdpPercap, color = continent, shape = continent)) + 
    geom_point(size = 5, alpha = 0.5) +  #加上scatter plot (點的大小及透明度)
    theme_classic() + #套用主題
    ggtitle("Scatterplot of life expectancy by gdpPercap") + 
    xlab("Life expectancy (years)") + 
    ylab("gdpPercap (USD)") + 
    theme(legend.position = "top",
          plot.title = element_text(hjust = 0.5, size = 20),
          legend.title = element_text(size = 10),
          legend.text = element_text(size = 5),
          axis.text.x = element_text(angle = 45, hjust = 1))

hk0420_Grammar of graphics inclass&Exercises

Tung-Han Wu

2020/4/27

In-class exercises1

In-class exercises2

In-class exercises3

In-class exercises4

In-class exercises5

In-class exercises6