in_class exercise2

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Exercise 2: arithmetic score against language score, one panel per
# combination of class-size band and IQV band.
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this file exists; consider a project-relative path.
dta <- read.table("C:/Users/user/Desktop/langMathDutch.txt", header = TRUE)
head(dta)

# Cut `size` and `IQV` at their 33rd and 67th percentiles into three ordered
# bands each. include.lowest = TRUE keeps the minimum value in the first band
# instead of turning it into NA.
dta1 <- dta %>%
  mutate(
    class_size = cut(
      size,
      breaks = quantile(size, probs = c(0, .33, .67, 1)),
      labels = c("Small", "Medium", "Large"),
      include.lowest = TRUE,
      ordered_result = TRUE
    ),
    IQV_f = cut(
      IQV,
      breaks = quantile(IQV, probs = c(0, .33, .67, 1)),
      labels = c("Low", "Middle", "High"),
      include.lowest = TRUE,
      ordered_result = TRUE
    )
  )

# Cell counts for the 3 x 3 grid of panels. The earlier call tabulated
# `dta1$groups`, a column that is never created, and so printed an empty table.
table(dta1$class_size, dta1$IQV_f)
library(ggplot2)
p0 <- ggplot(data = dta1, mapping = aes(x = lang, y = arith)) +
  geom_point(shape = 23, fill = "black") +
  # Straight-line (lm) fit per panel, without a confidence band.
  stat_smooth(formula = y ~ x, method = "lm", se = FALSE) +
  labs(x = "Language score", y = "Arithmetic score") +
  # .multi_line = FALSE puts both facet variables on a single strip line.
  facet_wrap(. ~ class_size + IQV_f,
             labeller = labeller(.multi_line = FALSE))
p0

in_class exercise3

# Exercise 3: log10 expenditure per category and year, drawn as the deviation
# from the overall mean (one panel per year).
library(reshape2)
library(dplyr)
library(ggplot2)

dta <- datasets::USPersonalExpenditure
head(dta)
##                       1940   1945  1950 1955  1960
## Food and Tobacco    22.200 44.500 59.60 73.2 86.80
## Household Operation 10.500 15.500 29.00 36.5 46.20
## Medical and Health   3.530  5.760  9.71 14.0 21.10
## Personal Care        1.040  1.980  2.45  3.4  5.40
## Private Education    0.341  0.974  1.80  2.6  3.64

# Reshape the category-by-year matrix to long form: one row per cell.
dta1 <- melt(dta)
names(dta1) <- c("category", "year", "expenditure")

# Work on the log10 scale; `excess` is the distance from the mean of all
# log10 expenditures (pooled over categories and years). mutate() evaluates
# its arguments in order, so `excess` sees the log-transformed column.
dta1 <- dta1 %>%
  mutate(
    expenditure = log10(expenditure),
    excess = expenditure - mean(expenditure)
  )

# ggplot() + geom_point() replaces the deprecated qplot() shortcut and draws
# the same scatter layer.
ggplot(dta1, aes(x = excess, y = category)) +
  geom_point() +
  # Horizontal stem from each point back to the zero line.
  geom_segment(aes(xend = 0, yend = category)) +
  geom_vline(xintercept = 0, colour = "grey50") +
  facet_wrap(~ year, nrow = 1)

in_class exercise4

# Exercise 4: VSAE score against age, by `sicdegp` group.
#   p1 -- individual trajectories plus a linear fit, one panel per group
#   p2 -- group means with +/- 1 standard-error bars
library(dplyr)
library(ggplot2)

dta <- WWGbook::autism
head(dta)

# Drop every row that has a missing value in any column.
dta <- na.omit(dta)

# `group`: sicdegp binned as (0,1] -> "L", (1,2] -> "M", (2,3] -> "H".
# `age_centered`: age minus the mean age of the remaining rows.
dta <- dta %>%
  mutate(
    group = cut(sicdegp, breaks = c(0, 1, 2, 3), labels = c("L", "M", "H")),
    age_centered = age - mean(age)
  )

p1 <- ggplot(dta, aes(age_centered, vsae)) +
  facet_wrap(. ~ group) +
  geom_point(alpha = 0.40) +
  # The formula is passed as a formula object rather than a string.
  geom_smooth(method = "lm", formula = y ~ x) +
  # One path per child; geom_path() joins the points in data-row order.
  geom_path(aes(group = childid), alpha = 0.4) +
  theme_bw() +
  scale_x_continuous(limits = c(-4, 7.5),
                     breaks = c(-2.5, 0.0, 2.5, 5.0)) +
  labs(x = "Age (in years, centered)",
       y = "VSAE score")

# Small horizontal offset so the three groups' bars and points do not overlap.
pd <- position_dodge(.3)
p2 <- dta %>%
  mutate(age_2 = age - 2) %>%
  group_by(group, age_2) %>%
  summarize(vsae_mean = mean(vsae),
            vsae_se = sd(vsae) / sqrt(n())) %>%
  # summarize() only peels off the last grouping level; drop the rest so no
  # dplyr grouping is carried into the plot data.
  ungroup() %>%
  ggplot() +
  aes(age_2, vsae_mean,
      group = group,
      shape = group) +
  geom_errorbar(aes(ymin = vsae_mean - vsae_se,
                    ymax = vsae_mean + vsae_se),
                width = .2, size = .3,
                position = pd) +
  geom_line(position = pd,
            show.legend = TRUE,
            aes(linetype = group)) +
  geom_point(position = pd,
             size = rel(3),
             show.legend = TRUE) +
  scale_shape_manual(values = c(1, 2, 16)) +
  labs(x = "Age (in year -2)", y = "VSAE score") +
  theme_bw() +
  # Legend is placed inside the panel, near the top-left corner.
  theme(legend.position = c(.08, .8),
        legend.box.background = element_rect(colour = "black"),
        legend.key = element_rect(color = "black"),
        legend.key.size = unit(1, "cm"))
p1

p2

in_class exercise5

# Exercise 5: alluvial plot of race -> gender -> diabetes status, with the
# flows coloured by BMI category.
library(ggalluvial)

# NOTE(review): absolute, machine-specific path -- the script only runs where
# this file exists; consider a project-relative path.
dta <- read.table("C:/Users/user/Desktop/diabetes_mell.csv",
                  sep = ",", header = TRUE)
head(dta)

# Count individuals in every race x gender x diabetes x BMI cell and turn the
# contingency table into a long data frame (factor columns plus `Freq`).
dta_v3 <- as.data.frame(xtabs(~ race + gender + diabetes + BMI, data = dta))

# Reorder factor levels by position to control the stacking order of the
# strata. NOTE(review): the indices assume 3 race levels and 2 levels each for
# gender and diabetes -- confirm against the data.
dta_v3$race <- factor(dta_v3$race,
                      levels = levels(dta_v3$race)[c(2, 3, 1)])
dta_v3$gender <- factor(dta_v3$gender,
                        levels = levels(dta_v3$gender)[c(2, 1)])
dta_v3$diabetes <- factor(dta_v3$diabetes,
                          levels = levels(dta_v3$diabetes)[c(2, 1)])

ggplot(dta_v3,
       aes(axis1 = race,
           axis2 = gender,
           axis3 = diabetes,
           y = Freq)) +
  scale_x_discrete(limits = c("race",
                              "gender",
                              "diabetes"),
                   expand = c(.1, .05)) +
  labs(x = "",
       y = "No. individuals") +
  geom_alluvium(aes(fill = BMI)) +
  geom_stratum() +
  # Label each stratum with its level name; after_stat(stratum) replaces the
  # deprecated `infer.label = TRUE` argument.
  geom_text(stat = "stratum",
            aes(label = after_stat(stratum))) +
  scale_fill_manual(values = c("gray40", "tan1")) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  ggtitle("Diabetes in overall population in US 2009-2010",
          subtitle = "stratified by race, gender and diabetes mellitus")

The plot shows the frequency distribution of the population across race, gender, and diabetes status. The height of each bar (stratum) represents the number of individuals in that category. Each flow represents the group of observations that share the values of the variables it connects, and its colour (gray or tan) indicates the BMI category of that group.

We can see that most people with diabetes are overweight. The number of people with diabetes is similar across races and between genders.