#Lab 1
#Setting up R for Lab 1
#Set working directory
#NOTE(review): this absolute path is specific to one machine; consider opening the
#lab folder as a project and using relative paths so the script runs elsewhere.
setwd("/Users/timliu/Desktop/SOC252/Lab 1")
#Loading UN data into R and titling it UN_Data
#The file path lives in its own variable so that UN_Data only ever holds the data
#frame. If read.csv() fails, UN_Data is not left behind as a character string that
#would produce confusing errors in the later analysis steps.
un_data_path <- "/Users/timliu/Desktop/SOC252/Lab 1/SOC252_UN.csv"
UN_Data <- read.csv(un_data_path)
#Seeing the first few rows of UN data to see if it worked properly
head(UN_Data)
## country region group fertility ppgdp lifeExpF pctUrban
## 1 Afghanistan Asia other 5.968 499.0 49.49 23
## 2 Albania Europe other 1.525 3677.2 80.40 53
## 3 Algeria Africa africa 2.142 4473.0 75.00 67
## 4 American Samoa <NA> <NA> NA NA NA NA
## 5 Angola Africa africa 5.135 4321.9 53.17 59
## 6 Anguilla Caribbean other 2.000 13750.1 81.10 100
## infantMortality
## 1 124.53500
## 2 16.56100
## 3 21.45800
## 4 11.29389
## 5 96.19100
## 6 NA
#Loading Titanic data into R and titling it Titanic_Data
#The file path lives in its own variable so that Titanic_Data only ever holds the
#data frame. If read.csv() fails, Titanic_Data is not left behind as a character
#string that would produce confusing errors in the later analysis steps.
titanic_data_path <- "/Users/timliu/Desktop/SOC252/Lab 1/titanic_data.csv"
Titanic_Data <- read.csv(titanic_data_path)
#Seeing the first few rows of Titanic data to see if it worked properly
head(Titanic_Data)
## passenger survived sex age passengerClass
## 1 Allen, Miss. Elisabeth Walton yes female 29.0000 1st
## 2 Allison, Master. Hudson Trevor yes male 0.9167 1st
## 3 Allison, Miss. Helen Loraine no female 2.0000 1st
## 4 Allison, Mr. Hudson Joshua Crei no male 30.0000 1st
## 5 Allison, Mrs. Hudson J C (Bessi no female 25.0000 1st
## 6 Anderson, Mr. Harry yes male 48.0000 1st
#Load libraries that we will use
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(ggplot2)
#Question 1 A1
#Mean, median, standard deviation, and interquartile range of female life
#expectancy, total fertility rate, infant deaths, and per capita gross domestic
#product, printed as one row per variable. Missing values are ignored.
un_stat_funs <- list(
  mean = function(x) mean(x, na.rm = TRUE),
  median = function(x) median(x, na.rm = TRUE),
  sd = function(x) sd(x, na.rm = TRUE),
  iqr = function(x) IQR(x, na.rm = TRUE)
)
#One wide row whose columns are named <variable>_<statistic>
un_stats_wide <- summarise(
  UN_Data,
  across(c(lifeExpF, fertility, infantMortality, ppgdp), un_stat_funs)
)
#Long format: split each column name on "_" into the variable and the statistic
un_stats_long <- pivot_longer(
  un_stats_wide,
  cols = everything(),
  names_to = c("var", "measure"),
  names_sep = "_",
  values_to = "value"
)
#Back to wide format: one row per variable, one column per statistic
pivot_wider(
  un_stats_long,
  id_cols = c(var),
  names_from = "measure",
  values_from = "value",
  names_repair = "unique"
)
## # A tibble: 4 × 5
## var mean median sd iqr
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 lifeExpF 72.3 75.9 10.1 13.9
## 2 fertility 2.76 2.26 1.34 1.79
## 3 infantMortality 29.4 19.0 28.7 37.5
## 4 ppgdp 13012. 4684. 18412. 14238.
#Across countries, the average female life expectancy is 72.3 years and the median is 75.9 years,
#meaning half of the countries have a female life expectancy below 75.9 years and the other half
#above it. The standard deviation (SD) is 10.1 years, meaning a typical country deviates from the
#mean by about a decade. The interquartile range (IQR) is 13.9 years, meaning that the middle 50%
#of countries have a female life expectancy between 65.7 and 79.6 years.
#The average fertility rate across countries is 2.76 children per woman; half of the countries have
#a rate below the median of 2.26 and the other half have a rate above it. The SD of fertility is 1.34,
#showing variation in fertility rates, and the IQR is 1.79, suggesting that the middle 50% of
#countries have fertility rates between 1.75 and 3.54 children per woman.
#The average number of infant deaths by age 1 for every 1000 live births is 29.4 and the median is 19
#for every 1000, meaning half the countries have a rate below 19 and the other half above 19. The SD
#is 28.7, showing large variability in infant mortality between countries. The IQR is 37.5 deaths,
#showing that the middle 50% of countries have infant mortality rates between 7.02 and 44.5 deaths
#per 1000 live births.
#The average per capita GDP in USD is $13,012 and the median is $4,684, showing that some countries
#are extremely wealthy, pulling the mean up significantly. The SD is $18,412, also indicating large
#variability, and the IQR is $14,238, meaning the middle 50% of countries have a per capita GDP
#between $1,283 and $15,520.
#Question 1 A2
#The 3 regions that will be used are Asia, Europe and Africa.
#Keep only the rows whose region is one of those three.
regions_of_interest <- c("Asia", "Europe", "Africa")
UN_regions <- filter(UN_Data, region %in% regions_of_interest)
#Visual for Life Expectancy for Females by Region
#One box per region, filled by region, showing the spread of lifeExpF.
life_exp_plot <- ggplot(
  data = UN_regions,
  mapping = aes(x = region, y = lifeExpF, fill = region)
)
life_exp_plot <- life_exp_plot +
  geom_boxplot() +
  labs(
    title = "Life Expectancy for Females by Region",
    x = "Region",
    y = "Female Life Expectancy (in years)"
  ) +
  theme_minimal()
print(life_exp_plot)

#Africa has by far the lowest female life expectancy: its interquartile box lies below the bottom 25% of
#Asia, the region closest to it. This means that female life expectancy in Africa is substantially lower
#than in Asia and Europe. Europe also has the smallest IQR, meaning less variability between countries and
#that most European countries have similar female life expectancies, which can also signify better
#healthcare. Africa has a median life expectancy of about 58 years, Asia around 75 and Europe around 82.
#Visual for Fertility Rate by Region
#One box per region, filled by region, showing the spread of fertility.
fertility_plot <- ggplot(
  data = UN_regions,
  mapping = aes(x = region, y = fertility, fill = region)
)
fertility_plot <- fertility_plot +
  geom_boxplot() +
  labs(
    title = "Fertility Rate by Region",
    x = "Region",
    y = "Fertility Rate"
  ) +
  theme_minimal()
print(fertility_plot)

#The fertility rate is the highest in Africa with a median of around 4.5 compared to Asia's 2.25 and Europe's
#1. Africa also has the greatest variability of the three regions, as seen by its middle 50% spanning around
#3.1 to 5 compared to Asia's 1.75 to 2.5 and Europe's 1 to 1.5. Asia also has a few notable outliers between 4
#and 6, which indicates that some Asian countries have considerably higher fertility rates than is typical
#for the region.
#Visual for Infant Mortality
#One box per region, filled by region, showing the spread of infantMortality.
#Rows with a missing infantMortality value are dropped by ggplot2 with a warning.
#The labels read "Infant Deaths" (previously the ungrammatical "Infants Deaths").
ggplot(UN_regions, aes(x = region, y = infantMortality, fill = region)) +
  geom_boxplot() +
  labs(
    title = "Infant Deaths per 1000 Live Births by Region",
    x = "Region",
    y = "Infant Deaths (by Age 1) per 1000 Live Births"
  ) +
  theme_minimal()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

#Africa has by far the highest median infant death rate, around 65, compared to Asia's rate of around 22
#and Europe's rate of around 5, which indicates drastic differences in access to healthcare and technology
#across these 3 regions. Asia also has an outlier of around 125, which indicates that some parts of Asia have
#extremely high infant death rates. Europe also has outliers; although they are very near the upper whisker,
#they still indicate that differences in infant death rates exist among European countries. Finally, Africa
#also has the largest IQR, which indicates a lot of variability between African countries.
#Question 1 B1
#I chose per capita GDP (ppgdp) and infant deaths (infantMortality) because I want to see:
#How does GDP affect infant death rates? I believe countries with higher GDP also have
#lower infant death rates as they have more technology to decrease the rate, and vice versa.
#Question 1 B2
#Scatter plot of infant mortality (y) against per capita GDP (x), one point per row
#of UN_Data. Rows missing either value are dropped by ggplot2 with a warning.
UN_Data %>%
  ggplot(aes(x = ppgdp, y = infantMortality)) +
  geom_point(color = "orange") + #adds points to the scatter
  labs(
    #The labels say "Per Capita" (previously misspelled "Per Capital"). The title's
    #line break is an explicit "\n" rather than a raw newline inside the string,
    #so source indentation can never leak into the rendered title.
    title = paste0(
      "Relationship between Per Capita GDP and Infant Death Rate\n",
      "(By Age 1) per 1000 Live Births"
    ),
    x = "Per Capita GDP",
    y = "Infant Death Rate (By Age 1) per 1000 Live Births"
  ) +
  theme_minimal()
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_point()`).

#Question 1 B3
#The scatter plot is able to show clearly the relationship between two continuous variables, whereas the
#histogram shows the distribution of only one continuous variable and the box plot is for comparing the
#distribution of a variable across groups.
#Question 1 B4
#As I predicted, the pattern shows that infant mortality rates (deaths by age 1 per 1000 live births) are
#higher in countries with lower per capita GDP, and as GDP increases the infant death rates decrease.
#Question 2
#A
#Number of passengers in each survival outcome and each outcome's share of the total.
overall_counts <- count(Titanic_Data, survived)
mutate(overall_counts, prop = n / sum(n))
## survived n prop
## 1 no 809 0.618029
## 2 yes 500 0.381971
#The probability of survival is 38.197%
#B
#Survival counts and proportions among male passengers only.
male_passengers <- filter(Titanic_Data, sex == "male")
male_counts <- count(male_passengers, survived)
mutate(male_counts, prop = n / sum(n))
## survived n prop
## 1 no 682 0.8090154
## 2 yes 161 0.1909846
#Survival counts and proportions among female passengers only.
female_passengers <- filter(Titanic_Data, sex == "female")
female_counts <- count(female_passengers, survived)
mutate(female_counts, prop = n / sum(n))
## survived n prop
## 1 no 127 0.2725322
## 2 yes 339 0.7274678
#The probability of survival for a male was 19.098% and 72.747% for a female.
#C
#Share of survivors among passengers younger than 18 or older than 45.
#Proportions are computed over both outcomes first; only the "yes" row is then kept.
#Any passengers with a missing age are excluded, because filter() drops rows whose
#condition evaluates to NA.
young_or_old <- filter(Titanic_Data, age < 18 | age > 45)
young_or_old_counts <- mutate(count(young_or_old, survived), prop = n / sum(n))
filter(young_or_old_counts, survived == "yes")
## survived n prop
## 1 yes 143 0.4627832
#The combined probability of survival for ages below 18 and ages above 45 is 46.278%.
#D
#Survival counts and proportions among first class passengers only.
first_class <- filter(Titanic_Data, passengerClass == "1st")
first_class_counts <- count(first_class, survived)
mutate(first_class_counts, prop = n / sum(n))
## survived n prop
## 1 no 123 0.380805
## 2 yes 200 0.619195
#Survival counts and proportions among third class passengers only.
third_class <- filter(Titanic_Data, passengerClass == "3rd")
third_class_counts <- count(third_class, survived)
mutate(third_class_counts, prop = n / sum(n))
## survived n prop
## 1 no 528 0.7447109
## 2 yes 181 0.2552891
#61.920% of first class passengers survived vs the 25.529% of third class passengers that survived.