Load libraries/data
library(haven)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Import the survey data from the Stata file; the relative path is resolved
# against the current working directory.
TEDS_2016 <- read_dta(file = "TEDS_2016.dta")
Run an exploratory data analysis with R using the TEDS2016 dataset
# Number of responses recorded for each district code.
table(TEDS_2016[["District"]])
##
## 201 401 502 701 703 704 901 1002 1303 1401 1501 1701 1801 6301 6302 6303
## 55 55 49 37 37 39 47 48 49 21 22 60 48 54 57 54
## 6403 6405 6406 6407 6408 6503 6505 6507 6508 6509 6603 6604 6605 6606 6608 6701
## 42 42 42 44 49 54 55 54 55 59 38 38 36 36 36 44
## 6702 6703 6704 6802 6806
## 42 42 48 48 54
# District codes are identifiers, not quantities, so store them as a factor.
TEDS_2016 <- TEDS_2016 %>%
  mutate(District = as.factor(District))

# Bar chart of response counts per district. The x-axis labels are rotated
# 90 degrees so the district codes do not overlap.
TEDS_2016 %>%
  ggplot(aes(x = District)) +
  geom_bar(fill = "pink") +
  labs(x = "District", y = "Count of responses") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))
# Convert Age and Sex to factors, then draw one base-graphics chart per
# variable (plot() on a factor produces a bar chart of level counts).
TEDS_2016 <- TEDS_2016 %>%
  mutate(
    Age = as.factor(Age),
    Sex = as.factor(Sex)
  )
plot(TEDS_2016[["Age"]], xlab = "Age Category", ylab = "Count of Responses")
plot(TEDS_2016[["Sex"]], xlab = "Sex", ylab = "Count of Responses")
Generate a frequency table and a bar chart of the Tondu variable, which indicates the respondent's position on unification versus independence.
# Replace the Tondu codes with descriptive labels. factor() pairs the labels,
# in order, with the sorted distinct values of the column, so the column must
# contain exactly seven distinct codes.
TEDS_2016 <- TEDS_2016 %>%
  mutate(
    Tondu = factor(
      Tondu,
      labels = c(
        "Unification now",
        "Status quo, unif. in future",
        "Status quo, decide later",
        "Status quo forever",
        "Status quo, indep. in future",
        "Independence now",
        "No response"
      )
    )
  )

# Frequency of each labelled position.
table(TEDS_2016[["Tondu"]])
##
## Unification now Status quo, unif. in future
## 27 180
## Status quo, decide later Status quo forever
## 546 328
## Status quo, indep. in future Independence now
## 380 108
## No response
## 121
# Bar chart of response counts per Tondu position. The x-axis title is
# dropped and the category labels are rotated 90 degrees.
TEDS_2016 %>%
  ggplot(aes(x = Tondu)) +
  geom_bar(fill = "pink") +
  labs(x = NULL, y = "Count of responses") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))
What problems do you encounter when dealing with the dataset? Many variables in the dataset have missing values. Furthermore, many variables that should be continuous or categorical are stored as numeric codes for which I was unable to find labels (the site linked in the book has been taken down). This makes it almost impossible to draw meaning from most types of EDA. Additionally, most of the variables are not of the correct type when they are first loaded (especially those that should be factors), so they have to be converted before use.
How to deal with missing values? I would leave them as missing values, after making sure that they are correctly coded as missing, since R will then exclude them from analysis. Because no observation is missing on every variable, dropping whole observations would discard valuable information; the values that are present can still yield insights.
Explore the relationship between Tondu and other variables including female, DPP, age, income, edu, Taiwanese and Econ_worse. What methods would you use?
# Convert every coded predictor to a factor in a single step. age is kept
# numeric because it is summarised with boxplots rather than counted.
TEDS_2016 <- TEDS_2016 %>%
  mutate(
    across(c(female, DPP, income, edu, Taiwanese, Econ_worse), as.factor),
    age = as.numeric(age)
  )

# Stacked bar chart of Tondu counts, with each bar split by one variable.
#   fill_var: unquoted name of the TEDS_2016 column mapped to the fill colour.
# Returns the ggplot object; calling the helper at top level prints the plot.
plot_tondu_by <- function(fill_var) {
  ggplot(TEDS_2016, aes(Tondu, fill = {{ fill_var }})) +
    geom_bar() +
    theme(axis.text.x = element_text(angle = 90))
}

# female
plot_tondu_by(female)

# DPP
plot_tondu_by(DPP)

# age
# Tondu is mapped to the y axis so that every box is labelled with its
# category; mapping it to fill alone leaves an unlabelled numeric axis.
ggplot(TEDS_2016, aes(x = age, y = Tondu)) +
  geom_boxplot()

# income
# Here income is on the x axis and the bars are split by Tondu.
ggplot(TEDS_2016, aes(income, fill = Tondu)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90))

# education
plot_tondu_by(edu)

# Taiwanese
plot_tondu_by(Taiwanese)

# Econ_worse
plot_tondu_by(Econ_worse)
How about the votetsai variable (vote for DPP candidate Tsai Ing-wen)?
# Store the vote indicator as a factor, then count each of its values.
TEDS_2016 <- TEDS_2016 %>%
  mutate(votetsai = as.factor(votetsai))
table(TEDS_2016[["votetsai"]])
##
## 0 1
## 471 790
# Cross-tabulation: Tondu positions in rows, votetsai values in columns.
table(TEDS_2016[["Tondu"]], TEDS_2016[["votetsai"]])
##
## 0 1
## Unification now 14 6
## Status quo, unif. in future 97 48
## Status quo, decide later 173 229
## Status quo forever 132 112
## Status quo, indep. in future 36 274
## Independence now 8 79
## No response 11 42
# Stacked bar chart of Tondu counts, with each bar split by votetsai.
TEDS_2016 %>%
  ggplot(aes(x = Tondu, fill = votetsai)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90))