Statistical graphics Exercise

Exercise 1

The distribution of personal disposable income in Taiwan in 2015 has a story to tell. Revise the following plot to enhance that message.

Load the data and check its structure

# Read the Taiwan income table (income bracket, head count) and inspect it.
# NOTE(review): this absolute Windows path only resolves on the author's
# machine; adjust it before running elsewhere.
dta <- read.csv(
  "C:/Users/USER/Desktop/R_data management/0420/data/income_tw.csv",
  header = TRUE  # spelled out: `T` is an ordinary variable and can be reassigned
)
str(dta)

## 'data.frame':    41 obs. of  2 variables:
##  $ Income: Factor w/ 41 levels "1,000,000 to 1,069,999",..: 7 8 9 11 12 13 14 15 16 17 ...
##  $ Count : int  807160 301650 313992 329290 369583 452671 495387 517779 557786 584497 ...

# Preview the first six rows (the default window of head()).
head(dta, n = 6L)

##               Income  Count
## 1  160,000 and under 807160
## 2 160,000 to 179,999 301650
## 3 180,000 to 199,999 313992
## 4 200,000 to 219,999 329290
## 5 220,000 to 239,999 369583
## 6 240,000 to 259,999 452671

data manipulation

library(dplyr)

# Re-level Income in the order the brackets appear in the data, so that plots
# follow the row order instead of the alphabetical order of the labels.
dta[["Income"]] <- factor(dta[["Income"]], levels = dta[["Income"]])

# Tag each bracket: "M" when its count is at or above the mean count across
# brackets, "L" otherwise.
dta2 <- mutate(
  dta,
  group = ifelse(Count >= mean(Count), "M", "L")
)

plot

library(ggplot2)

# Lollipop chart: one point per income bracket, joined by a segment to the
# mean bracket count, coloured by the above/below-mean group.
# Counts are shown in units of 10,000 persons.
ggplot(dta2, aes(x = Count / 10000, y = Income, colour = group)) +
  geom_point() +
  geom_segment(aes(xend = mean(Count) / 10000, yend = Income)) +
  labs(x = "Number of person (x 10,000)", y = " ") +
  # "none" is the supported value for hiding the legend; the previous " " is
  # not a valid legend.position and is rejected by current ggplot2 releases.
  theme(legend.position = "none")

Exercise 3

Sarah Leo at the Economist magazine published a data set to accompany the story about how scientific publishing is dominated by men. The plot on the left panel below is the original graph that appeared in the article. Help her find a better plot.

Load the data and check its structure

# Read the Economist women-in-research table, give the six columns readable
# names, and inspect the result.
# NOTE(review): this absolute Windows path only resolves on the author's
# machine; adjust it before running elsewhere.
dta <- read.csv(
  "C:/Users/USER/Desktop/R_data management/0420/data/Economist_women-research.csv",
  header = TRUE  # spelled out: `T` is an ordinary variable and can be reassigned
)
names(dta) <- c(
  "Country", "Health sciences", "Physical science", "Engineering",
  "Computer science, maths", "women inventores"
)
str(dta)

## 'data.frame':    19 obs. of  6 variables:
##  $ Country                : Factor w/ 18 levels "","Australia",..: 6 12 5 17 18 13 7 8 10 4 ...
##  $ Health sciences        : Factor w/ 13 levels "","0.24","0.43",..: 11 2 3 4 5 5 6 7 7 8 ...
##  $ Physical science       : Factor w/ 11 levels "","0.11","0.2",..: 11 2 6 4 3 8 5 8 7 4 ...
##  $ Engineering            : Factor w/ 9 levels "","0.11","0.22",..: 9 2 3 3 3 6 4 5 5 3 ...
##  $ Computer science, maths: Factor w/ 9 levels "","0.11","0.16",..: 9 2 3 5 6 6 4 6 6 6 ...
##  $ women inventores       : Factor w/ 10 levels "","% of women inventores",..: 2 3 9 4 6 8 5 4 7 5 ...

# Preview the first six rows (the default window of head()).
head(dta, n = 6L)

##          Country Health sciences  Physical science Engineering
## 1        Country Health sciences Physical sciences Engineering
## 2          Japan            0.24              0.11        0.11
## 3          Chile            0.43              0.23        0.22
## 4 United Kingdom            0.45              0.21        0.22
## 5  United States            0.46               0.2        0.22
## 6         Mexico            0.46              0.25        0.26
##   Computer science, maths      women inventores
## 1 Computer science, maths % of women inventores
## 2                    0.11                  0.08
## 3                    0.16                  0.19
## 4                    0.21                  0.12
## 5                    0.22                  0.14
## 6                    0.22                  0.18

# Drop the first row: it holds the column labels read in as data
# (see the head() output), not an observation.
dta <- dta[-1L, ]

data manipulation

library(reshape)

# Reshape to long form, keeping Country as the identifier; the remaining
# columns are stacked into `variable` (column name) and `value`.
# `id.vars` is written out in full instead of relying on partial matching
# of the abbreviated `id`.
dta.melt <- melt(dta, id.vars = "Country")

# Check how the stacked values are stored before converting them to numbers.
class(dta.melt$value)

## [1] "factor"

# Convert the stacked values to numbers. Going through as.character() gives
# the same result as as.numeric(levels(x))[x] for a factor, and also works
# when `value` is a character column (the read.csv() default since R 4.0),
# where the levels-based idiom would silently return only NA.
m <- as.numeric(as.character(dta.melt$value))
dta.melt$newv <- m

# Complement of the value (presumably the male share, given the "F-M" axis
# label used in the plots) and the signed gap between the two.
dta.melt$menv <- 1 - dta.melt$newv
dta.melt$diff <- dta.melt$newv - dta.melt$menv

# Drop rows that contain NA (e.g. values that could not be parsed as numbers).
dta.melt <- na.omit(dta.melt)

ggplot

I think the original plot tries to demonstrate how scientific publishing is dominated by men. Showing the difference (gap) between female and male researchers therefore makes the point easy to catch, and facet_wrap is used to avoid putting too much information in the same plot.

library(ggplot2)

# One panel per measure: for every country, a point at the gap `diff` joined
# by a segment to the zero line (zero = no gap).
ggplot(dta.melt, aes(x = Country, y = diff)) +
  geom_point() +
  facet_wrap(~ variable, ncol = 5) +
  geom_segment(aes(xend = Country, yend = 0)) +
  # The reference line is a constant, so it is set directly rather than
  # mapped through aes().
  geom_hline(yintercept = 0, colour = "gray") +
  # "none" is the supported value for hiding the legend; the previous " " is
  # not a valid legend.position and is rejected by current ggplot2 releases.
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, vjust = 0.5)
  ) +
  labs(x = "Country", y = "Difference in paper published (%, F-M)")

# One panel per country: for every measure, a point at the gap `diff` joined
# by a segment to the zero line (zero = no gap).
ggplot(dta.melt, aes(x = variable, y = diff)) +
  geom_point() +
  facet_wrap(~ Country, ncol = 3) +
  geom_segment(aes(xend = variable, yend = 0)) +
  # The reference line is a constant, so it is set directly rather than
  # mapped through aes().
  geom_hline(yintercept = 0, colour = "gray") +
  # "none" is the supported value for hiding the legend; the previous " " is
  # not a valid legend.position and is rejected by current ggplot2 releases.
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, vjust = 0.5)
  ) +
  # The x axis shows `variable` (the measured field); countries are the
  # facets, so the axis is no longer labelled "Country".
  labs(x = "Research field", y = "Difference in paper published (%, F-M)")