0. Data pre-processing

將2013~2017年資料整理乾淨，並進行編碼整合後，存成Rdata檔供後續使用。
由於重新編碼過程頗為繁複，且受到不同年度資料的「乾淨程度」有不同的處理，但這部分是「黑手」的工作，即使不使用R也可以完成，故過程就不呈現出來，主要原則就是一直透過table()檢視遺漏值，以及各年度編碼一致性，並利用library(car)中的Recode()進行重新編碼
以下是編碼簿

Variables	Description	Value
ovalue	請問，您對政府網站中提供給民眾的資訊滿不滿意?	1=非常不滿意; 2=不滿意; 3=滿意; 4=非常滿意
dvalue	請問，您常不常透過網路社群將您覺得重要的公共問題傳給其他人?	1=從來沒有; 2=很少; 3=有時; 4=經常
svalue	如果沒有網路，您的生活會變得比較快樂、還是比較不快樂?	1=比以前不快樂很多; 2=比以前不快樂一些; 3=比以前快樂一些; 4=比以前快樂很多
inter	網路使用	1=有;0=無
gender	性別	1=男性; 2=女性
edu	教育程度	1=小學及以下; 2=國初中; 3=高中職; 4=大專及以上
age	年齡	1=15-19歲; 2=20-29歲; 3=30-39歲; 4=40-49歲; 5=50-59歲; 6=60歲以上
w	權值	NULL

整理好的Rdata檔與當天使用的pdf檔在github裡面，有興趣可以抓來做做別的事：https://github.com/ZXian0518/RLadies-Taipei-20190128
電子治理研究中心有歷年數位國情的完整調查結果，可參見 https://www.teg.org.tw/electronDB/Digit

library(foreign)
library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(ggplot2)
library(ggstatsplot)
library(magrittr)
library(car)

## Loading required package: carData

library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:magrittr':
## 
##     extract

## The following object is masked from 'package:Matrix':
## 
##     expand

library(tibble)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(knitr)

1. 2013~2017 Public Value of Digital Governance (數位治理公共價值指標)

rm(list=ls())
options(digits = 3)
load("/Users/huangzongxian/Desktop/RLadies/Data/datlis.RData") # 讀取整理好的Rdata檔

#你也可以將 list 拆分成5個data.frame來執行喔，只是會稍微麻煩一點
#for (i in 1:5L){
#assign(paste0("data", i),datlis[[i]])}#


#Using weight
dwlist<- lapply(datlis,function(x){
  svydesign(ids = ~1, data = x, weights = x$w)
})



#2013~2017 操作性價值
#各年度結果
lapply(dwlist,function(x){svytable(~ovalue, design=x) %>%
  prop.table()})  %>%
  do.call(rbind,.) %>% 
  cbind(seq(2013,2017),.)    %>%
  set_colnames(c("year",names(svytable(~ovalue, design=dwlist[[1]]))))

##      year 不滿意 非常不滿意 非常滿意  滿意
## [1,] 2013  0.182     0.0210   0.0382 0.759
## [2,] 2014  0.224     0.0339   0.0676 0.674
## [3,] 2015  0.171     0.0347   0.0447 0.749
## [4,] 2016  0.178     0.0320   0.0433 0.747
## [5,] 2017  0.255     0.0335   0.0355 0.676

#透過tibble與ggplot2的合作來plot
lapply(dwlist,function(x){svytable(~ovalue, design=x) %>%
  prop.table()})  %>%
  do.call(rbind,.) %>% 
  cbind(seq(2013,2017),.)    %>%
  set_colnames(c("year",names(svytable(~ovalue, design=dwlist[[1]]))))    %>% 
  as_tibble() %>% 
  gather(terms, # this will be the new column for the  key columns
         value, # this will contain the  values
         c("非常不滿意","不滿意","滿意","非常滿意"), 
         na.rm = TRUE 
  ) %>%
  ggplot(data=.,aes(x=year,y=value,group=factor(terms),col=terms,linetype=terms))+
         geom_line() + 
         geom_point() +  
  labs(x = "時間", y = "percent", title = "請問，您對政府網站中提供給民眾的資訊滿不滿意?") + 
  theme(text=element_text(family="黑體-繁 中黑", size=13),axis.ticks = element_blank())

#2013~2017 政治性價值
#
lapply(dwlist,function(x){svytable(~dvalue, design=x) %>%
  prop.table()})  %>%
  do.call(rbind,.) %>% 
  cbind(seq(2013,2017),.)    %>%
  set_colnames(c("year",names(svytable(~dvalue, design=dwlist[[1]]))))

##      year 從來沒有  很少   經常  有時
## [1,] 2013    0.534 0.228 0.0866 0.151
## [2,] 2014    0.465 0.246 0.0872 0.202
## [3,] 2015    0.383 0.255 0.1167 0.246
## [4,] 2016    0.375 0.194 0.1430 0.288
## [5,] 2017    0.483 0.258 0.0917 0.167

#
lapply(dwlist,function(x){svytable(~dvalue, design=x) %>%
  prop.table()})  %>%
  do.call(rbind,.) %>% 
  cbind(seq(2013,2017),.)    %>%
  set_colnames(c("year",names(svytable(~dvalue, design=dwlist[[1]]))))    %>% 
  as_tibble() %>% 
  gather(terms, # this will be the new column for the  key columns
         value, # this will contain the  values
         c("從來沒有","很少","有時","經常"), # this is the range of columns we want gathered
         na.rm = TRUE # handles missing
  ) %>%
  ggplot(data=.,aes(x=year,y=value,group=factor(terms),col=terms,linetype=terms))+
  geom_line() + geom_point() +  
  labs(x = "時間", y = "percent", 
       title = "請問，您常不常透過網路社群將您覺得重要的公共問題傳給其他人?") + 
  theme(text=element_text(family="黑體-繁 中黑", size=13),axis.ticks = element_blank())

#2013~2017 社會性價值
#
lapply(dwlist,function(x){svytable(~svalue, design=x) %>%
  prop.table()})  %>%
  do.call(rbind,.) %>% 
  cbind(seq(2013,2017),.)    %>%
  set_colnames(c("year",names(svytable(~svalue, design=dwlist[[1]]))))

##      year 比以前不快樂很多 比以前不快樂一些 比以前快樂很多 比以前快樂一些
## [1,] 2013            0.222            0.443         0.0817          0.253
## [2,] 2014            0.248            0.442         0.0894          0.221
## [3,] 2015            0.210            0.402         0.0947          0.293
## [4,] 2016            0.265            0.422         0.0826          0.230
## [5,] 2017            0.275            0.430         0.1099          0.185

#
lapply(dwlist,function(x){svytable(~svalue, design=x) %>%
  prop.table()})  %>%
  do.call(rbind,.) %>% 
  cbind(seq(2013,2017),.)    %>%
  set_colnames(c("year",names(svytable(~svalue, design=dwlist[[1]]))))    %>% 
  as_tibble() %>% 
  gather(terms, # this will be the new column for the  key columns
         value, # this will contain the  values
         c("比以前不快樂很多","比以前不快樂一些","比以前快樂很多","比以前快樂一些"), # this is the range of columns we want gathered
         na.rm = TRUE # handles missing
  ) %>%
  ggplot(data=.,aes(x=year,y=value,group=factor(terms),col=terms,linetype=terms))+
  geom_line() + geom_point() +  
  labs(x = "時間", y = "percent", 
       title = "如果沒有網路，您的生活會變得比較快樂、還是比較不快樂?") + 
  theme(text=element_text(family="黑體-繁 中黑", size=13),axis.ticks = element_blank())

2. Digital Divide

女性在網路使用上有沒有比較不利呢？

# 沒有任何一年的p-value < 0.05，無法推翻不同性別使用網路者比率相同的虛無假設
lapply(dwlist,function(x){svychisq(~gender+inter, design=x, statistic="Chisq")})

## [[1]]
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~gender + inter, design = x, statistic = "Chisq")
## X-squared = 0.3, df = 1, p-value = 0.6
## 
## 
## [[2]]
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~gender + inter, design = x, statistic = "Chisq")
## X-squared = 1, df = 1, p-value = 0.4
## 
## 
## [[3]]
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~gender + inter, design = x, statistic = "Chisq")
## X-squared = 0.2, df = 1, p-value = 0.7
## 
## 
## [[4]]
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~gender + inter, design = x, statistic = "Chisq")
## X-squared = 0.01, df = 1, p-value = 0.9
## 
## 
## [[5]]
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~gender + inter, design = x, statistic = "Chisq")
## X-squared = 0.07, df = 1, p-value = 0.8

# 各年度比例
lapply(dwlist,function(x){prop.table(svytable(~gender+inter, design=x),margin = 1)})

## [[1]]
##       inter
## gender     0     1
##      1 0.300 0.700
##      2 0.316 0.684
## 
## [[2]]
##       inter
## gender     0     1
##      1 0.309 0.691
##      2 0.338 0.662
## 
## [[3]]
##       inter
## gender     0     1
##      1 0.306 0.694
##      2 0.319 0.681
## 
## [[4]]
##       inter
## gender     0     1
##      1 0.318 0.682
##      2 0.320 0.680
## 
## [[5]]
##       inter
## gender     0     1
##      1 0.186 0.814
##      2 0.189 0.811

3. Weight vs. Unweight

說明：加權的權值即是參考該年度內政部人口統計資料計算而出，換言之加權資料的人口分佈即反應當時的內政部人口統計資料。

#Age
data <- c()
for (i in 1L:5L){
  term <- names(prop.table(table(datlis[[1]]$age))*100)
  date <- rep((i+2012),6)
  start<- lapply(datlis,function(x){prop.table(table(x$age))*100})[[i]] %>% as.character() %>% as.numeric()
  end<- lapply(dwlist,function(x){prop.table(svytable(~age,design=x))*100}) [[i]] %>%    as.character() %>% as.numeric()
  amount <- end-start %>% as.character() %>% as.numeric()
  type <- ifelse(amount > 0, "Postive","Negative")
  dt <- cbind(term,date,start,end,type,amount)
  data <- rbind(data,dt)
}

data<- as.data.frame(data)
data$date<- data$date %>% as.character() %>% as.numeric() 
data$amount<- data$amount %>% as.character() %>% as.numeric() 

#plot
data %>%
  ggplot(aes(x = date
           , y = amount
           , fill = term)) +
  geom_col(position = "dodge") +
  scale_fill_brewer(palette = "Dark2") + 
  labs(x = "時間", y = "差比(%)", title = "populations - survey results (AGE)") +
  theme(text=element_text(family="黑體-繁 中黑", size=13),axis.ticks = element_blank())

#Education
data <- c()
for (i in 1L:5L){
  term <- names(prop.table(table(datlis[[1]]$edu))*100)
  date <- rep((i+2012),4)
  start<- lapply(datlis,function(x){prop.table(table(x$edu))*100})[[i]] %>% as.character() %>% as.numeric()
  end<- lapply(dwlist,function(x){prop.table(svytable(~edu,design=x))*100}) [[i]] %>%    as.character() %>% as.numeric()
  amount <- end-start %>% as.character() %>% as.numeric()
  type <- ifelse(amount > 0, "Postive","Negative")
  dt <- cbind(term,date,start,end,type,amount)
  data <- rbind(data,dt)
}

data<- as.data.frame(data)
data$date<- data$date %>% as.character() %>% as.numeric() 
data$amount<- data$amount %>% as.character() %>% as.numeric() 

#plot
data %>%
  ggplot(aes(x = date
           , y = amount
           , fill = term)) +
  geom_col(position = "dodge") +
  scale_fill_brewer(palette = "Dark2") + 
  labs(x = "時間", y = "差比(%)", title = "populations - survey results (Education)") +
  theme(text=element_text(family="黑體-繁 中黑", size=13),axis.ticks = element_blank())

#這張圖的結果非常有趣，因為2016年的調查公司恰好因為招標的原因跟其他四年都不一樣(剛好因為我是助理所以才知道，不然報告書上沒有寫XD)，看來機構效應可能一定程度造成教育程度代表性的差異...!?

4. 當天沒有處理到的一個提問

年齡越低的人會不會越常透過網路上分享訊息？
(偷懶用2017年資料做個迴歸模型看看)
因為嚴格來說依變項是順序尺度(ordinal scale)，所以盡量不要直接用最小平方法的迴歸lm()，用ordinal套件中的clm會比較合理
喜愛ggplot2的大神們同時創造了可以變出漂亮統計圖表的ggstatsplot 詳參 https://github.com/IndrajeetPatil/ggstatsplot
年齡越大對於網路上的意見分享有負向影響(也就是年輕人比老年人更會透過網路分享公共資料)；教育程度越高、女性，則對於網路上的意見分享則有正向影響

data<- datlis[[5]] # 2017 data
data$edu<- Recode(data$edu,"'小學及以下'=1;'國初中'=2;'高中職'=3;'大專及以上'=4") %>% as.numeric()
data$age<- Recode(data$age,"'15-19歲'=1;'20-29歲'=2;'30-39歲'=3;'40-49歲'=4;'50-59歲'=5;'60歲以上'=6") %>% as.numeric()
data$dvalue<- Recode(data$dvalue,"'從來沒有'=1;'很少'=2;'有時'=3;'經常'=4")

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(ordinal)

## 
## Attaching package: 'ordinal'

## The following object is masked from 'package:dplyr':
## 
##     slice

clm1<- clm(dvalue~edu+age+gender,data=data)
ggstatsplot::ggcoefstats(clm1,caption.summary=F,title = "Ordinal Regression Model")

RLadies Taipei 20190128

Zong-Xian Huang

2019/2/10

0. Data pre-processing

1. 2013~2017 Public Value of Digital Governance (數位治理公共價值指標)

2. Digital Divide

3. Weight vs. Unweight

4. 當天沒有處理到的一個提問