Q2

隨著時間變化，性別在不同種族的數學成績沒有差異。

# Load nlsy86long.csv (first line holds the column names) and inspect the
# structure of the resulting data frame.
dta2 <- read.csv("nlsy86long.csv", header = TRUE)
str(dta2)

'data.frame':   664 obs. of  9 variables:
 $ id   : int  2390 2560 3740 4020 6350 7030 7200 7610 7680 7700 ...
 $ sex  : Factor w/ 2 levels "Female","Male": 1 1 1 2 2 2 2 2 1 2 ...
 $ race : Factor w/ 2 levels "Majority","Minority": 1 1 1 1 1 1 1 1 1 1 ...
 $ time : int  1 1 1 1 1 1 1 1 1 1 ...
 $ grade: int  0 0 0 0 1 0 0 0 0 0 ...
 $ year : int  6 6 6 5 7 5 6 7 6 6 ...
 $ month: int  67 66 67 60 78 62 66 79 76 67 ...
 $ math : num  14.29 20.24 17.86 7.14 29.76 ...
 $ read : num  19.05 21.43 21.43 7.14 30.95 ...

# Treat the measurement occasion as a factor so it maps to a discrete x axis.
dta2 <- dta2 %>% mutate(time = factor(time))

# Math score by occasion, one panel per race, coloured by sex:
# group means with standard-error ranges, a line joining the means of each
# sex, and a dotted gray trajectory for every individual id.
ggplot(dta2, aes(time, math, color = sex)) +
  facet_wrap(~race) +
  stat_summary(fun.data = mean_se, geom = "pointrange") +
  # `fun` is the current name of the argument formerly called `fun.y`.
  stat_summary(aes(group = sex), fun = mean, geom = "line") +
  geom_line(aes(group = id), color = "gray50", alpha = .8, linetype = "dotted") +
  theme_bw()

隨著時間變化，性別在不同種族的閱讀成績沒有差異。

# Reading score by occasion, one panel per race, coloured by sex:
# group means with standard-error ranges, a line joining the means of each
# sex, and a dotted gray trajectory for every individual id.
ggplot(dta2, aes(time, read, color = sex)) +
  facet_wrap(~race) +
  stat_summary(fun.data = mean_se, geom = "pointrange") +
  # `fun` is the current name of the argument formerly called `fun.y`.
  stat_summary(aes(group = sex), fun = mean, geom = "line") +
  geom_line(aes(group = id), color = "gray50", alpha = .8, linetype = "dotted") +
  theme_bw()

Q3

圖一是將人分為兩組，觀察組裡隨年齡增加和死亡率的關係。

# Load alcohol_age.csv and drop every row that contains a missing value.
dta3 <- read.csv("alcohol_age.csv", header = TRUE) %>% na.omit()
# Label the first 24 remaining rows "No" and the following 24 rows "Yes".
# NOTE(review): this positional split requires exactly 48 rows in the intended
# order; presumably the groups are below / at-or-above age 21 -- confirm.
dta3 <- dta3 %>% mutate(Legal = factor(rep(c("No", "Yes"), each = 24)))

# Plot 1: Alcohol against Age, coloured by Legal group, with a separate
# least-squares line (no confidence band) fitted within each group.
ggplot(dta3, aes(Age, Alcohol, color = Legal)) +
  geom_point() +
  stat_smooth(aes(group = Legal), method = "lm", se = FALSE) +
  theme_bw()

圖二是看組裡飲酒和死亡率的關係

# Mean of Alcohol within each Legal group.
aggregate(formula = Alcohol ~ Legal, data = dta3, FUN = mean)

  Legal  Alcohol
1    No 1.032118
2   Yes 1.482557

# Mean of Alcohol per Legal group, computed from the data rather than typed
# in as rounded constants; used as the height of the two horizontal segments.
legal_mean <- tapply(dta3$Alcohol, dta3$Legal, mean)

# Plot 2: points coloured by Legal group plus one horizontal segment per
# group at that group's mean.
ggplot(dta3, aes(Age, Alcohol)) +
  geom_point(aes(color = Legal), na.rm = TRUE) +
  # annotate() draws each segment once; geom_segment() with constants inside
  # aes() would redraw it for every row of the data.
  annotate("segment", x = 19, xend = 21,
           y = legal_mean[["No"]], yend = legal_mean[["No"]],
           color = "tomato") +
  annotate("segment", x = 21, xend = 23,
           y = legal_mean[["Yes"]], yend = legal_mean[["Yes"]],
           color = "turquoise") +
  labs(x = "Age (year)", y = "Mortality rate from alcohol abuse (per 100,000)") +
  # theme_bw() is a complete theme and resets earlier theme() settings, so the
  # legend tweak has to come after it to take effect.
  theme_bw() +
  theme(legend.position = "none")

Q4

隨著年齡增加，男性自殺率有增加的趨勢。

# Load suicide.txt (no header line) and name the six columns: the country
# followed by five age bands.
dta4 <- read.table("suicide.txt", header = FALSE)
colnames(dta4) <- c("Country", "25-34", "35-44", "45-54", "55-64", "65-74")
# Stack the five age-band columns (2 to 6) into long form: Age holds the band
# name and Rate holds the value.
dta4 <- dta4 %>% gather(key = Age, value = Rate, 2:6)

# One box per age band showing the distribution of Rate.
ggplot(data = dta4, mapping = aes(x = Age, y = Rate)) +
  geom_boxplot() +
  theme_bw() +
  labs(x = "Age", y = "Deaths per 100,000 from male suicides")

Q5

情緒與情境的關係，在大部分的情境中，annoy的分數均高。

# Load coping.txt; its first line holds the column names.
dta5 <- read.table("coping.txt", header = TRUE)

# Stack columns 1-4 and 8 into long form (emotion = column name,
# e_score = value), then plot the mean and standard error of the score for
# each situation, dodging the emotions so the ranges do not overlap.
dta5 %>% gather(emotion, e_score, c(1:4, 8)) %>%
  ggplot(., aes(situation, e_score, color = emotion)) +
  stat_summary(fun.data = mean_se, position = position_dodge(0.3)) +
  theme_bw() +
  labs(x = "Situation", y = "Score")

採取策略與情境的關係。

# Stack columns 5-7 into long form (coping = column name, c_score = value) and
# plot the mean and standard error of the score for each situation, dodging
# the three series so the ranges do not overlap.
ggplot(
  gather(dta5, coping, c_score, 5:7),
  aes(x = situation, y = c_score, color = coping)
) +
  stat_summary(fun.data = mean_se, position = position_dodge(0.3)) +
  labs(x = "Situation", y = "Score") +
  theme_bw()

Q6

不知道怎麼用的簡單點。

# Read one space-separated recall file whose lines may hold different numbers
# of fields. read.table() sizes the table from the first five lines only, so
# with fill = TRUE a longer later line could wrap onto an extra row and
# inflate nrow(). Fixing the column count from the widest line keeps exactly
# one row per input line; short lines are padded with NA.
read_recall <- function(path) {
  n_col <- max(count.fields(path, sep = " "), na.rm = TRUE)
  read.table(path, header = FALSE, sep = " ", fill = TRUE,
             col.names = paste0("V", seq_len(n_col)))
}

dta6_1 <- read_recall("Murd62/fr10-2.txt")
dta6_2 <- read_recall("Murd62/fr15-2.txt")
dta6_3 <- read_recall("Murd62/fr20-1.txt")
#dta6_4 <- read_recall("Murd62/fr20-2.txt")
dta6_5 <- read_recall("Murd62/fr30-1.txt")
dta6_6 <- read_recall("Murd62/fr40-1.txt")

# Count how often each value 1..x occurs anywhere in `data`.
#
# x:    number of values to tally (1, 2, ..., x).
# data: data frame or matrix of values; NA cells are ignored.
# Returns a numeric vector of length x whose i-th element is the number of
# cells of `data` equal to i (an empty vector when x is 0).
Count <- function(x, data) {
  # seq_len() gives an empty sequence for x = 0, where 1:x would give c(1, 0).
  vapply(
    seq_len(x),
    function(pos) as.numeric(sum(data == pos, na.rm = TRUE)),
    numeric(1)
  )
}

# Build the table for one data set: one row per item 1..x.
#
# x:    number of items.
# data: table passed on to Count(); its row count is the denominator.
# Returns a data frame with columns item (1..x), prob (count of each item
# divided by the number of rows of `data`) and grp (x repeated on every row).
Frame <- function(x, data) {
  positions <- 1:x
  recall_rate <- Count(x, data) / nrow(data)
  as.data.frame(cbind(item = positions, prob = recall_rate, grp = x))
}

# Build one table per data set with Frame(), pairing each data set with its
# item count, and stack the tables row-wise.
Recall <- do.call(
  rbind,
  Map(
    Frame,
    c(10, 15, 20, 30, 40),
    list(dta6_1, dta6_2, dta6_3, dta6_5, dta6_6)
  )
)

# prob against item, with one connected line per grp.
ggplot(data = Recall, mapping = aes(x = item, y = prob)) +
  geom_point() +
  geom_line(aes(group = grp)) +
  theme_bw() +
  labs(x = "Serial position", y = "Probability of recall")

Q7

# Load beautyCourseEval.txt; its first line holds the column names.
dta7 <- read.table("beautyCourseEval.txt", header = TRUE)
# Turn sex into a labelled factor. The labels are applied to the sorted
# distinct values of sex.
# NOTE(review): assumes the smaller code means Female -- confirm against the
# data description.
dta7 <- dta7 %>% mutate(sex = factor(sex, labels = c("Female", "Male")))

# eval against beauty, one panel per courseID, with a least-squares line
# (no confidence band) for each sex.
ggplot(dta7, aes(beauty, eval, color = sex)) +
  geom_point(shape = 1) +
  facet_wrap(~ courseID) +
  stat_smooth(method = "lm", se = FALSE) +
  labs(x = "Beauty judgment score", y = "Average course evaluation score") +
  theme_bw()

Q8

library(sas7bdat)
# Import the SAS data set sales.sas7bdat.
dta8 <- read.sas7bdat("sales.sas7bdat", debug = FALSE)

# Recode the numeric codes of region, district, quarter and month as labelled
# factors.
dta8 <- mutate(
  dta8,
  region = factor(
    region,
    levels = 1:4,
    labels = c("Northern", "Southern", "Eastern", "Western")
  ),
  district = factor(
    district,
    levels = 1:5,
    labels = c("North East", "South East", "South West", "North West",
               "Central West")
  ),
  quarter = factor(
    quarter,
    levels = 1:4,
    labels = c("1st", "2nd", "3rd", "4th")
  ),
  month = factor(
    month,
    levels = 1:12,
    labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
               "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
  )
)

# Set every negative sales value to zero.
dta8$sales[dta8$sales < 0] <- 0

以月來看各產品的營銷。

# Sales by month, coloured by product, with one connected line per product and
# one panel per year.
ggplot(data = dta8, mapping = aes(x = month, y = sales, color = product)) +
  geom_point() +
  geom_line(mapping = aes(group = product)) +
  facet_wrap(~ year) +
  theme_bw() +
  labs(x = "Month", y = "sales")

以季來看各產品的營銷。

# Mean and standard error of sales per quarter for each product, dodged so the
# products do not overlap, with one panel per year.
ggplot(data = dta8, mapping = aes(x = quarter, y = sales, color = product)) +
  stat_summary(fun.data = mean_se, position = position_dodge(0.3)) +
  facet_wrap(~ year) +
  labs(x = "quarter", y = "sales") +
  theme_bw()

北區外的店只有一年中特定月開，沒有月的變化，不討論地區比較。

Q9

長條圖不用使用數字、過於使用顏色、標籤中英混雜、標籤標示不清…

(資料還載不下來)

w11 Data Management Q2 - 9

Pei Jun

2018-05-14

Q2

Q3

Q4

Q5

Q6

Q7

Q8

Q9