Load libraries/data

library(haven)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.1.0     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Read the TEDS_2016 Stata file from the working directory.
TEDS_2016 <- haven::read_dta(file = "TEDS_2016.dta")

Run an exploratory data analysis with R using the TEDS2016 dataset

# Count the rows recorded under each District code.
table(TEDS_2016[["District"]])
## 
##  201  401  502  701  703  704  901 1002 1303 1401 1501 1701 1801 6301 6302 6303 
##   55   55   49   37   37   39   47   48   49   21   22   60   48   54   57   54 
## 6403 6405 6406 6407 6408 6503 6505 6507 6508 6509 6603 6604 6605 6606 6608 6701 
##   42   42   42   44   49   54   55   54   55   59   38   38   36   36   36   44 
## 6702 6703 6704 6802 6806 
##   42   42   48   48   54
# Treat District as categorical, then chart the row count per district.
TEDS_2016 <- TEDS_2016 %>%
  mutate(District = as.factor(District))

TEDS_2016 %>%
  ggplot(mapping = aes(x = District)) +
  geom_bar(fill = "pink") +
  labs(x = "District", y = "Count of responses") +
  theme_bw() +
  # Rotate the tick labels so the district codes do not overlap.
  theme(axis.text.x = element_text(angle = 90))

# Convert Age to a factor so base plot() draws one bar per category.
TEDS_2016 <- mutate(TEDS_2016, Age = as.factor(Age))
plot(
  TEDS_2016[["Age"]],
  xlab = "Age Category",
  ylab = "Count of Responses"
)

# Convert Sex to a factor so base plot() draws one bar per category.
TEDS_2016 <- mutate(TEDS_2016, Sex = as.factor(Sex))
plot(
  TEDS_2016[["Sex"]],
  xlab = "Sex",
  ylab = "Count of Responses"
)

Generate a frequency table and bar chart of the Tondu variable. The Tondu variable indicates the respondent's position on unification and independence.

# Recode Tondu as a factor with readable labels. No `levels` are given, so the
# labels are matched, in order, to the sorted distinct values of the column;
# the column must therefore contain exactly seven distinct codes.
TEDS_2016 <- mutate(
  TEDS_2016,
  Tondu = factor(
    Tondu,
    labels = c(
      "Unification now",
      "Status quo, unif. in future",
      "Status quo, decide later",
      "Status quo forever",
      "Status quo, indep. in future",
      "Independence now",
      "No response"
    )
  )
)

# Frequency of each Tondu position.
table(TEDS_2016[["Tondu"]])
## 
##              Unification now  Status quo, unif. in future 
##                           27                          180 
##     Status quo, decide later           Status quo forever 
##                          546                          328 
## Status quo, indep. in future             Independence now 
##                          380                          108 
##                  No response 
##                          121
# Bar chart of the row count per Tondu position; the x-axis title is dropped
# because the category labels are self-explanatory.
TEDS_2016 %>%
  ggplot(mapping = aes(x = Tondu)) +
  geom_bar(fill = "pink") +
  labs(x = NULL, y = "Count of responses") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))

What problems do you encounter when dealing with the dataset? There are many missing values for a variety of variables in the dataset. Furthermore, many variables that should be continuous or categorical are coded with numbers which I was unable to find labels for (the site linked in the book has been taken down). This makes it almost impossible to find meaning in most types of EDA. Additionally, most of the variables are not in the correct form when they are first loaded (especially those that are factor variables).

How to deal with missing values? I would leave them as missing values, after making sure that they are correctly coded as NA, since most R functions can then exclude them from the analysis (for example with `na.rm = TRUE`). Since no observations have missing values across all variables, there could be valuable insights to be had in the data that is not missing.

Explore the relationship between Tondu and other variables including female, DPP, age, income, edu, Taiwanese and Econ_worse. What methods would you use?

# female: stacked counts of each Tondu position, split by the female indicator
TEDS_2016 <- mutate(TEDS_2016, female = as.factor(female))
TEDS_2016 %>%
  ggplot(mapping = aes(x = Tondu, fill = female)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90))

# DPP: stacked counts of each Tondu position, split by the DPP indicator
TEDS_2016 <- mutate(TEDS_2016, DPP = as.factor(DPP))
TEDS_2016 %>%
  ggplot(mapping = aes(x = Tondu, fill = DPP)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90))

# age: one box plot of numeric age per Tondu position (grouped by fill colour)
TEDS_2016 <- mutate(TEDS_2016, age = as.numeric(age))
TEDS_2016 %>%
  ggplot(mapping = aes(x = age, fill = Tondu)) +
  geom_boxplot()

# income: stacked counts per income category, split by Tondu position
# (here income is on the x-axis and Tondu is the fill)
TEDS_2016 <- mutate(TEDS_2016, income = as.factor(income))
TEDS_2016 %>%
  ggplot(mapping = aes(x = income, fill = Tondu)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90))

# education: stacked counts of each Tondu position, split by edu category
TEDS_2016 <- mutate(TEDS_2016, edu = as.factor(edu))
TEDS_2016 %>%
  ggplot(mapping = aes(x = Tondu, fill = edu)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90))

# Taiwanese: stacked counts of each Tondu position, split by Taiwanese
TEDS_2016 <- mutate(TEDS_2016, Taiwanese = as.factor(Taiwanese))
TEDS_2016 %>%
  ggplot(mapping = aes(x = Tondu, fill = Taiwanese)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90))

# Econ_worse: stacked counts of each Tondu position, split by Econ_worse
TEDS_2016 <- mutate(TEDS_2016, Econ_worse = as.factor(Econ_worse))
TEDS_2016 %>%
  ggplot(mapping = aes(x = Tondu, fill = Econ_worse)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90))

How about the votetsai variable (vote for DPP candidate Tsai Ing-wen)?

# Make votetsai categorical and tabulate how many rows fall under each value.
TEDS_2016 <- mutate(TEDS_2016, votetsai = as.factor(votetsai))
table(TEDS_2016[["votetsai"]])
## 
##   0   1 
## 471 790
# Cross-tabulate Tondu (rows) against votetsai (columns).
table(TEDS_2016[["Tondu"]], TEDS_2016[["votetsai"]])
##                               
##                                  0   1
##   Unification now               14   6
##   Status quo, unif. in future   97  48
##   Status quo, decide later     173 229
##   Status quo forever           132 112
##   Status quo, indep. in future  36 274
##   Independence now               8  79
##   No response                   11  42
# Stacked counts of each Tondu position, split by votetsai.
TEDS_2016 %>%
  ggplot(mapping = aes(x = Tondu, fill = votetsai)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90))