Inclass exercise_Statistical graphics

library(ggplot2)
library(lattice)
library(tidyverse)
library(dplyr)

Exercises1:

The distribution of personal disposable income in Taiwan in 2015 has a story to tell. Revise the following plot to enhance that message.

dta<-read.csv("D://income_tw.csv", head=T)  
#看資料
summary(dta)

    Income              Count       
 Length:41          Min.   : 59820  
 Class :character   1st Qu.:241508  
 Mode  :character   Median :313992  
                    Mean   :352494  
                    3rd Qu.:461836  
                    Max.   :807160

str(dta)

'data.frame':   41 obs. of  2 variables:
 $ Income: chr  "160,000 and under" "160,000 to 179,999" "180,000 to 199,999" "200,000 to 219,999" ...
 $ Count : int  807160 301650 313992 329290 369583 452671 495387 517779 557786 584497 ...

dta1<-dta |>
  mutate(Count1 = Count / 10000) |>
#增加Count1=Count/10000這個column
  mutate(order=seq(1:n()))
#增加order這個column，從1開始編到n
  name=c("Income","Count", "Count1","order")
#給column新的命名
str(dta1)

'data.frame':   41 obs. of  4 variables:
 $ Income: chr  "160,000 and under" "160,000 to 179,999" "180,000 to 199,999" "200,000 to 219,999" ...
 $ Count : int  807160 301650 313992 329290 369583 452671 495387 517779 557786 584497 ...
 $ Count1: num  80.7 30.2 31.4 32.9 37 ...
 $ order : int  1 2 3 4 5 6 7 8 9 10 ...

p1<-ggplot(data=dta1,
           aes(x=Count1,
           y=reorder(Income,-order)))#y軸排序依據order倒著排
p1

p2<-p1+
  geom_point()
p2

p3<-p2+
  labs(x='Number of persons', 
       y='', 
       title="Distribution of personal disposable income in Taiwan (2015)",
       )
p3

p1<-ggplot(data=dta1,
           aes(x=Count1,
           y=reorder(Income,Count1)))+   #y軸排據count1排序
    geom_point()+
    labs(x='Number of persons', 
         y='', 
         title="Distribution of personal disposable income in Taiwan (2015)",
         )
p1

#這樣其實也只能看出來最多人的是<160000元
#最少人的是2500000元

Exercise 2

Comment on how the graphs presented in this link violate the principles for effective graphics and how would you revise them.

data <- read.csv("D://test.csv",fileEncoding="big5", head=T)
head(data)

                         law month            c_type count X_id
1 12條第1項第1.3.4.5.6.7.8款     1              汽車     0    1
2 12條第1項第1.3.4.5.6.7.8款     1              汽車   149    2
3 12條第1項第1.3.4.5.6.7.8款     1 250cc以上重型機車     0    3
4 12條第1項第1.3.4.5.6.7.8款     1 250cc以上重型機車     0    4
5 12條第1項第1.3.4.5.6.7.8款     1     250cc以下機車     0    5
6 12條第1項第1.3.4.5.6.7.8款     1     250cc以下機車    40    6
                  type X.year
1 未領用或未懸掛牌照等    104
2 未領用或未懸掛牌照等    104
3 未領用或未懸掛牌照等    104
4 未領用或未懸掛牌照等    104
5 未領用或未懸掛牌照等    104
6 未領用或未懸掛牌照等    104

names(data)<-c("year", "month", "type", "law", "vehicle_type", "method", "count")
str(data)

'data.frame':   100 obs. of  7 variables:
 $ year        : chr  "12條第1項第1.3.4.5.6.7.8款" "12條第1項第1.3.4.5.6.7.8款" "12條第1項第1.3.4.5.6.7.8款" "12條第1項第1.3.4.5.6.7.8款" ...
 $ month       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ type        : chr  "汽車" "汽車" "250cc以上重型機車" "250cc以上重型機車" ...
 $ law         : int  0 149 0 0 0 40 0 0 0 0 ...
 $ vehicle_type: int  1 2 3 4 5 6 7 8 9 10 ...
 $ method      : chr  "未領用或未懸掛牌照等" "未領用或未懸掛牌照等" "未領用或未懸掛牌照等" "未領用或未懸掛牌照等" ...
 $ count       : int  104 104 104 104 104 104 104 104 104 104 ...

type <- data %>%
  filter(year == 104) %>%
  group_by(type) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

vtype <- data %>%
  filter(year == 104) %>%
  group_by(vehicle_type) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

method <- data %>%
  filter(year == 104) %>%
  group_by(method) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

type_month <- data %>%
  filter(year == 104) %>%
  group_by(month,type) %>%
  summarise(count=sum(count)) %>%
  arrange(desc(count))

vtype_month <- data %>%
  filter(year == 104) %>%
  group_by(month,vehicle_type) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

method_month <- data %>%
  filter(year == 104) %>%
  group_by(month,method) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count))

# add font
windowsFonts(A=windowsFont("Microsoft JhengHei UI"))

Exercise 3

Use the free recall data to improve on the figure reported in Murdock, B. B. (1962). The serial position effect of free recall. Journal of Experimental Psychology, 64, 482-488.

list.files("D:/Murd62", pattern = "fr")

[1] "fr10-2.txt" "fr15-2.txt" "fr20-1.txt" "fr20-2.txt" "fr30-1.txt"
[6] "fr40-1.txt"

# 載入資料(有6個txt檔)
fL<-"D:/Murd62/fr10-2.txt"
# read.table(fL, sep="",header=FALSE)會出現錯誤碼如下，後來發現txt檔有空值
# Error in scan(file = file...: line 3 did not have 7 elements)
# 加上fill=T，帶入空值(NA)
dta10_2 <- read.table(fL, sep="",header=FALSE, fill=T)
#'data.frame':  1607 obs. of  7 variables
str(dta10_2)

'data.frame':   1607 obs. of  7 variables:
 $ V1: int  6 10 10 10 10 6 8 9 8 10 ...
 $ V2: int  1 9 7 88 3 8 9 10 9 8 ...
 $ V3: int  4 6 9 7 7 10 10 4 10 9 ...
 $ V4: int  7 8 8 2 2 5 7 6 7 2 ...
 $ V5: int  10 2 1 1 1 3 88 88 2 3 ...
 $ V6: int  2 1 NA 5 9 88 4 8 NA NA ...
 $ V7: int  8 88 NA NA NA NA NA 1 NA NA ...

#改column name
colnames(dta10_2)=c(1:7)
#新增group
dta10_2$Group<-as.factor("10-2")
str(dta10_2)

'data.frame':   1607 obs. of  8 variables:
 $ 1    : int  6 10 10 10 10 6 8 9 8 10 ...
 $ 2    : int  1 9 7 88 3 8 9 10 9 8 ...
 $ 3    : int  4 6 9 7 7 10 10 4 10 9 ...
 $ 4    : int  7 8 8 2 2 5 7 6 7 2 ...
 $ 5    : int  10 2 1 1 1 3 88 88 2 3 ...
 $ 6    : int  2 1 NA 5 9 88 4 8 NA NA ...
 $ 7    : int  8 88 NA NA NA NA NA 1 NA NA ...
 $ Group: Factor w/ 1 level "10-2": 1 1 1 1 1 1 1 1 1 1 ...

#匯入其他資料fr15-2、fr20-1、fr20-2、fr30-1、fr40-1
fL<-"D:/Murd62/fr15-2.txt"
dta15_2 <- read.table(fL, sep="",header=FALSE, fill=T)
str(dta15_2)

'data.frame':   2017 obs. of  7 variables:
 $ V1: int  15 15 14 15 15 3 13 14 14 9 ...
 $ V2: int  12 14 15 13 14 13 15 15 15 88 ...
 $ V3: int  14 3 12 12 12 14 10 12 5 NA ...
 $ V4: int  10 4 10 2 13 12 1 10 13 NA ...
 $ V5: int  11 2 5 10 1 88 2 3 2 NA ...
 $ V6: int  NA 1 NA 6 88 2 3 NA 1 NA ...
 $ V7: int  NA NA NA 8 NA 5 4 NA 4 NA ...

colnames(dta15_2)=c(1:7)
dta15_2$Group<-as.factor("15-2")

fL<-"D:/Murd62/fr20-1.txt"
dta20_1 <- read.table(fL, sep="",header=FALSE, fill=T)
str(dta20_1)

'data.frame':   1293 obs. of  10 variables:
 $ V1 : int  20 20 17 18 18 18 19 20 19 20 ...
 $ V2 : int  19 18 18 19 19 19 20 19 20 18 ...
 $ V3 : int  13 16 8 20 20 20 18 17 88 12 ...
 $ V4 : int  18 17 20 15 17 17 11 11 5 19 ...
 $ V5 : int  1 3 15 16 12 1 17 15 17 NA ...
 $ V6 : int  9 88 14 8 11 10 1 10 1 NA ...
 $ V7 : int  2 14 1 9 13 7 2 1 NA NA ...
 $ V8 : int  17 NA 2 1 88 NA NA NA NA NA ...
 $ V9 : int  16 NA 19 2 NA NA NA NA NA NA ...
 $ V10: int  88 NA NA NA NA NA NA NA NA NA ...

colnames(dta20_1)=c(1:10)
dta20_1$Group<-as.factor("20-1")

fL<-"D:/Murd62/fr20-2.txt"
dta20_2 <- read.table(fL, sep="",header=FALSE, fill=T)
colnames(dta20_2)=c(1:8)
dta20_2$Group<-as.factor("20-2")

fL<-"D:/Murd62/fr30-1.txt"
dta30_1 <- read.table(fL, sep="",header=FALSE, fill=T)
colnames(dta30_1)=c(1:9)
dta30_1$Group<-as.factor("30-1")

fL<-"D:/Murd62/fr40-1.txt"
dta40_1 <- read.table(fL, sep="",header=FALSE, fill=T)
colnames(dta40_1)=c(1:10)
dta40_1$Group<-as.factor("40-1")

Exercise 4

Sarah Leo at the Economist magazine published a data set to accompany the story about how scientific publishing is dominated by men. The plot on the left panel below is the orignal graph that appeared in the article. Help her find a better plot.

dta<-read.csv("D://Economist_women-research.csv", head=T, skip= 1)  
#跳開第一列
#看資料
summary(dta)

   Country          Health.sciences    Physical.sciences  Engineering    
 Length:18          Length:18          Min.   :0.1100    Min.   :0.1100  
 Class :character   Class :character   1st Qu.:0.2100    1st Qu.:0.2200  
 Mode  :character   Mode  :character   Median :0.2300    Median :0.2400  
                                       Mean   :0.2375    Mean   :0.2425  
                                       3rd Qu.:0.2500    3rd Qu.:0.2525  
                                       Max.   :0.3700    Max.   :0.3600  
                                       NA's   :6         NA's   :6       
 Computer.science..maths X..of.women.inventores
 Min.   :0.1100          Min.   :0.0800        
 1st Qu.:0.2025          1st Qu.:0.1200        
 Median :0.2200          Median :0.1350        
 Mean   :0.2092          Mean   :0.1525        
 3rd Qu.:0.2250          3rd Qu.:0.1825        
 Max.   :0.2700          Max.   :0.2600        
 NA's   :6               NA's   :6

str(dta)

'data.frame':   18 obs. of  6 variables:
 $ Country                : chr  "Japan" "Chile" "United Kingdom" "United States" ...
 $ Health.sciences        : chr  "0.24" "0.43" "0.45" "0.46" ...
 $ Physical.sciences      : num  0.11 0.23 0.21 0.2 0.25 0.22 0.25 0.24 0.21 0.23 ...
 $ Engineering            : num  0.11 0.22 0.22 0.22 0.26 0.23 0.25 0.25 0.22 0.25 ...
 $ Computer.science..maths: num  0.11 0.16 0.21 0.22 0.22 0.18 0.22 0.22 0.22 0.24 ...
 $ X..of.women.inventores : num  0.08 0.19 0.12 0.14 0.18 0.13 0.12 0.17 0.13 0.12 ...

dta <- dta[complete.cases(dta), ]
#移除遺漏值(excel後面的備註)
names(dta)<-c("Country", 
            "Health", 
            "Physical", 
            "Engineering", 
            "Computer", 
            "Inventors")
#column重新命名
tail(dta)

     Country Health Physical Engineering Computer Inventors
7       EU28   0.48     0.25        0.25     0.22      0.12
8     France   0.48     0.24        0.25     0.22      0.17
9     Canada   0.49     0.21        0.22     0.22      0.13
10 Australia    0.5     0.23        0.25     0.24      0.12
11    Brazil   0.57     0.33        0.32     0.24      0.19
12  Portugal   0.57     0.37        0.36     0.27      0.26

#檢查最後幾項，看起來沒問題

Inclass exercise_Statistical graphics

Ruei-Ning Su

2021-12-06

Exercises1:

Exercise 2

Exercise 3

Exercise 4