Your analysis should be done using R and your answers should be given in R. An example of a question and an answer is given below.
Your solution should be
x<-rnorm(100,0,1)
hist(x)
You do not need to explain your R code. For example, you do not need to write: “the function hist() was used to produce the histogram.” Your answers to the questions should be the R code that you used to produce the output.
You need to submit the following materials:
You do not need to interpret the results!!! For example, if the question is to conduct a two-sample t-test, you do not need to interpret the results. This means, for example, that you do not need to write “the p-value is 0.007 indicating a significant difference between the group means.”
Solutions on the questions should be introduced in an R markdown file (see above) and saved in the folder in which the exam questions are available.
In this section of the exam, we focus on titanic data. It contains data of survival status of passengers on the Titanic, together with their names (Name), age (Age), sex (Sex) and passenger class (PClass). This dataset is part of package lgrdata and will be available after installing the package. More information about the data can be found in https://www.rdocumentation.org/packages/titanic/versions/0.1.0. To access the data you need to install the lgrdata package.
#install.packages("lgrdata")
library(lgrdata)
data(titanic)
names(titanic)
## [1] "Name" "PClass" "Age" "Sex" "Survived"
head(titanic)
## Name PClass Age Sex Survived
## 1 Allen, Miss Elisabeth Walton 1st 29.00 female 1
## 2 Allison, Miss Helen Loraine 1st 2.00 female 0
## 3 Allison, Mr Hudson Joshua Creighton 1st 30.00 male 0
## 4 Allison, Mrs Hudson JC (Bessie Waldo Daniels) 1st 25.00 female 0
## 5 Allison, Master Hudson Trevor 1st 0.92 male 1
## 6 Anderson, Mr Harry 1st 47.00 male 1
sum(is.na(titanic$Age))
## [1] 557
sum(is.na(titanic$PClass))
## [1] 0
sum(is.na(titanic$Sex))
## [1] 0
sum(is.na(titanic$Survived))
## [1] 0
titanic <- na.omit(titanic)
titanic$Survived <- as.factor(titanic$Survived)
dim(titanic)
## [1] 756 5
There are 756 obsevations after dropping missing data
library(dplyr)
library(gt)
titanic_anonym <- titanic[,-1]
tab_vect <- sapply(titanic_anonym, function(x) n_distinct(x) )
table_1 <- data.frame(PClass=tab_vect[1], Age=tab_vect[2], Sex=tab_vect[3], Survived=tab_vect[4])
table_1 %>%
gt() %>%
tab_header(title = md("Table 1: Number of unique values per variable"))
| Table 1: Number of unique values per variable | |||
| PClass | Age | Sex | Survived |
|---|---|---|---|
| 3 | 75 | 2 | 2 |
titanic %>%
group_by(PClass, Sex) %>%
count(Survived)
## # A tibble: 12 × 4
## # Groups: PClass, Sex [6]
## PClass Sex Survived n
## <fct> <fct> <fct> <int>
## 1 1st female 0 5
## 2 1st female 1 96
## 3 1st male 0 82
## 4 1st male 1 43
## 5 2nd female 0 10
## 6 2nd female 1 75
## 7 2nd male 0 106
## 8 2nd male 1 21
## 9 3rd female 0 56
## 10 3rd female 1 46
## 11 3rd male 0 184
## 12 3rd male 1 32
sum(duplicated(titanic$Name))
## [1] 3
titanic[duplicated(titanic$Name),]
## Name PClass Age Sex Survived
## 708 Carlsson, Mr Frans Olof 3rd 33 male 0
## 730 Connolly, Miss Kate 3rd 22 female 1
## 923 Kelly, Mr James 3rd 42 male 0
For the analysis of this question, observations with missing values in the titanic dataset should be excluded.
titanic_surv <- titanic %>%
group_by(Survived, Sex, PClass) %>%
count(Survived) %>%
group_by(Survived,Sex) %>%
mutate(percent=n/sum(n)*100) %>%
mutate(pos = cumsum(percent) - (percent / 2)) %>%
print()
## # A tibble: 12 × 6
## # Groups: Survived, Sex [4]
## Survived Sex PClass n percent pos
## <fct> <fct> <fct> <int> <dbl> <dbl>
## 1 0 female 1st 5 7.04 3.52
## 2 0 female 2nd 10 14.1 14.1
## 3 0 female 3rd 56 78.9 60.6
## 4 0 male 1st 82 22.0 11.0
## 5 0 male 2nd 106 28.5 36.3
## 6 0 male 3rd 184 49.5 75.3
## 7 1 female 1st 96 44.2 22.1
## 8 1 female 2nd 75 34.6 61.5
## 9 1 female 3rd 46 21.2 89.4
## 10 1 male 1st 43 44.8 22.4
## 11 1 male 2nd 21 21.9 55.7
## 12 1 male 3rd 32 33.3 83.3
library(ggplot2)
ggplot(titanic_surv, mapping = aes(x="", y=percent, fill=PClass))+
geom_bar(stat="identity", width = 1)+
coord_polar("y",start=0)+
facet_grid(Survived~Sex)+
labs(x="Survived")+
theme(axis.title = element_blank())
ggplot(titanic_surv, mapping = aes(x="", y=percent, fill=PClass))+
geom_bar(stat="identity", width = 1)+
coord_polar("y",start=0)+
geom_label(aes(label=sprintf("%.2f%%", percent)), position = position_stack(vjust = 0.5), size=2)+
labs(x="Survived")+
theme(axis.title = element_blank())+
facet_grid(Survived~Sex)
titanic_3rd <- titanic %>%
filter(PClass=="3rd")
ggplot(titanic_3rd, aes(x = Survived, y = Age, fill = Survived)) +
geom_boxplot() +
geom_point(position = position_jitter(width = 0.2), size = 0.5) +
labs(x = "Survived") +
scale_fill_discrete(labels = c("0 (No)", "1 (Yes)"))+
theme(strip.text = element_text(face = "italic", size = 12))+
facet_wrap(~Sex)
titanic_female <- titanic%>%
filter(Sex=="female")
t.test(Age~Survived,data = titanic_female)
##
## Welch Two Sample t-test
##
## data: Age by Survived
## t = -3.2177, df = 135.67, p-value = 0.001617
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -9.632324 -2.299144
## sample estimates:
## mean in group 0 mean in group 1
## 24.90141 30.86714
titanic_clean <- na.omit(titanic)
dim(titanic_clean)
## [1] 756 5
There are 756 obsevations after dropping missing data
table(titanic_clean$Sex, titanic_clean$Survived)
##
## 0 1
## female 71 217
## male 372 96
sex_surv <- table(titanic_clean$Sex, titanic_clean$Survived)
sex_surv <- as.data.frame.matrix(sex_surv)
sex_surv <- sex_surv %>%
rename(Died = "0", Survived="1") %>%
mutate(percentDied =Died/(Died+Survived)*100) %>%
rename("%Died" = percentDied) %>%
print()
## Died Survived %Died
## female 71 217 24.65278
## male 372 96 79.48718
ggplot(titanic_clean, aes(x=Survived, y=Age, fill=Survived))+
geom_violin()+
scale_fill_discrete(labels = c("0 (No)", "1 (Yes)"))+
facet_wrap(~PClass)
ggplot(titanic_clean, aes(x=Survived, y=Age, fill=Survived))+
geom_violin()+
scale_fill_discrete(labels = c("0 (No)", "1 (Yes)"))+
facet_wrap(PClass~Sex)
This part is focused on the gss_abortion data. the data is a part of the R library stevedata. This is a toy data set derived from the General Social Survey and it contains information about abortion opinions in the general social survey. More information about the dataset and variables names can be found here: http://svmiller.com/stevedata/reference/gss_abortion.html. The gss_wages dataset contains 64814 rows and 18 columns. Make sure that you install the stevedata package. You can use the code below to install and access the data and to see the variables names.
#install.packages("stevedata")
library(stevedata)
data(gss_abortion)
names(gss_abortion)
## [1] "id" "year" "age" "race" "sex"
## [6] "hispaniccat" "educ" "partyid" "relactiv" "abany"
## [11] "abdefect" "abnomore" "abhlth" "abpoor" "abrape"
## [16] "absingle" "pid" "hispanic"
head(gss_abortion)
## # A tibble: 6 × 18
## id year age race sex hispaniccat educ partyid relactiv abany
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1 1972 23 White Female NA 16 Ind,Near Dem NA NA
## 2 2 1972 70 White Male NA 10 Not Str Democ… NA NA
## 3 3 1972 48 White Female NA 12 Independent NA NA
## 4 4 1972 27 White Female NA 17 Not Str Democ… NA NA
## 5 5 1972 61 White Female NA 12 Strong Democr… NA NA
## 6 6 1972 26 White Male NA 14 Ind,Near Dem NA NA
## # ℹ 8 more variables: abdefect <dbl>, abnomore <dbl>, abhlth <dbl>,
## # abpoor <dbl>, abrape <dbl>, absingle <dbl>, pid <dbl>, hispanic <dbl>
dim(gss_abortion)
## [1] 64814 18
For the analysis in this question, use the complete cases (i.e., exclude the observation swith missing values).
gss_clean <- na.omit(gss_abortion)
dim(gss_clean)
## [1] 9258 18
table_2 <- gss_clean %>%
group_by(race) %>%
summarise(
mean = round(mean(age),1),
median = median(age),
sd = round(sd(age),1))
print(table_2)
## # A tibble: 3 × 4
## race mean median sd
## <chr> <dbl> <dbl> <dbl>
## 1 Black 45 44 16.5
## 2 Other 40.9 38 15.5
## 3 White 49.3 49 17.3
There are 9358 obsevations after dropping missing data
gss_age <- gss_clean %>%
filter(age>33&age<47)
dim(gss_age)
## [1] 2202 18
table_3 <- gss_age %>%
group_by(year) %>%
summarise(mean=mean(educ)) %>%
arrange(desc(mean)) %>%
print()
## # A tibble: 7 × 2
## year mean
## <dbl> <dbl>
## 1 2012 14.2
## 2 2018 14.0
## 3 2016 14.0
## 4 2014 14.0
## 5 2008 13.9
## 6 2010 13.8
## 7 2006 13.8