#Lab 1

#Setting up R for Lab 1

#Point the R session at the Lab 1 folder
setwd("/Users/timliu/Desktop/SOC252/Lab 1")

#Read the UN csv file straight into a data frame called UN_Data
UN_Data <- read.csv(file = "/Users/timliu/Desktop/SOC252/Lab 1/SOC252_UN.csv")

#Preview the top six rows of UN_Data as a check that the import worked
head(UN_Data, n = 6L)
##          country    region  group fertility   ppgdp lifeExpF pctUrban
## 1    Afghanistan      Asia  other     5.968   499.0    49.49       23
## 2        Albania    Europe  other     1.525  3677.2    80.40       53
## 3        Algeria    Africa africa     2.142  4473.0    75.00       67
## 4 American Samoa      <NA>   <NA>        NA      NA       NA       NA
## 5         Angola    Africa africa     5.135  4321.9    53.17       59
## 6       Anguilla Caribbean  other     2.000 13750.1    81.10      100
##   infantMortality
## 1       124.53500
## 2        16.56100
## 3        21.45800
## 4        11.29389
## 5        96.19100
## 6              NA
#Read the Titanic csv file straight into a data frame called Titanic_Data
Titanic_Data <- read.csv(file = "/Users/timliu/Desktop/SOC252/Lab 1/titanic_data.csv")

#Preview the top six rows of Titanic_Data as a check that the import worked
head(Titanic_Data, n = 6L)
##                         passenger survived    sex     age passengerClass
## 1   Allen, Miss. Elisabeth Walton      yes female 29.0000            1st
## 2  Allison, Master. Hudson Trevor      yes   male  0.9167            1st
## 3    Allison, Miss. Helen Loraine       no female  2.0000            1st
## 4 Allison, Mr. Hudson Joshua Crei       no   male 30.0000            1st
## 5 Allison, Mrs. Hudson J C (Bessi       no female 25.0000            1st
## 6             Anderson, Mr. Harry      yes   male 48.0000            1st
#Load libraries that we will use 
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(ggplot2)

#Question 1 A1

#Mean, median, standard deviation, and interquartile range of female life
#expectancy, total fertility rate, infant deaths, and per capita gross domestic
#product. Missing values are ignored in every statistic (na.rm = TRUE).
UN_Data %>%
  summarise(across(
    c(lifeExpF, fertility, infantMortality, ppgdp),
    list(
      mean = function(x) mean(x, na.rm = TRUE),
      median = function(x) median(x, na.rm = TRUE),
      sd = function(x) sd(x, na.rm = TRUE),
      iqr = function(x) IQR(x, na.rm = TRUE)
    )
  )) %>%
  #across() names each result "<variable>_<statistic>". Splitting on "_" and
  #routing the statistic part to ".value" gives one row per variable and one
  #column per statistic in a single reshape, so no long-then-wide round trip
  #is needed. This relies on none of the four variable names containing "_".
  pivot_longer(
    cols = everything(),
    names_to = c("var", ".value"),
    names_sep = "_"
  )
## # A tibble: 4 × 5
##   var                 mean  median       sd      iqr
##   <chr>              <dbl>   <dbl>    <dbl>    <dbl>
## 1 lifeExpF           72.3    75.9     10.1     13.9 
## 2 fertility           2.76    2.26     1.34     1.79
## 3 infantMortality    29.4    19.0     28.7     37.5 
## 4 ppgdp           13012.   4684.   18412.   14238.
#The average female life expectancy across countries is 72.3 years and the median is 75.9 years:
#half of the countries have a female life expectancy below 75.9 years and the other half above it.
#The standard deviation (SD) is 10.1 years, meaning values typically deviate about a decade from
#the mean. The interquartile range (IQR) is 13.9 years, meaning that the middle 50% of countries
#have a female life expectancy between 65.7 and 79.6 years.

#The average fertility rate is 2.76 children per woman; half of the countries have a rate below
#2.26 and the other half have a rate above 2.26. The SD of fertility is 1.34, showing variation in
#fertility rates, and the IQR is 1.79, meaning that the middle 50% of countries have between 1.75
#and 3.54 children per woman.

#The average infant deaths by age 1 for every 1000 live births is 29.4 and the median is 19 for every
#1000 meaning half the countries have a rate below 19 and the other half above 19. The SD is 28.7
#showing a large variability in infant mortality between countries. The IQR is 37.5 deaths, showing 
#that the middle 50% have infant mortality rates between 7.02 and 44.5 deaths per 1000 live births.

#The average per capita GDP in USD is $13,012 and the median is $4,684, showing that some countries
#are extremely wealthy, pulling the mean up significantly. The SD is $18,412, also indicating large
#variability, and the IQR is $14,238, meaning the middle 50% of countries have a per capita GDP
#between $1,283 and $15,520.

#Question 1 A2

#Keep only the rows for the 3 regions being compared: Asia, Europe and Africa.
UN_regions <- filter(UN_Data, region %in% c("Asia", "Europe", "Africa"))

#Boxplots of female life expectancy, one box per region
UN_regions %>%
  ggplot(mapping = aes(x = region, y = lifeExpF, fill = region)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Life Expectancy for Females by Region",
    x = "Region",
    y = "Female Life Expectancy (in years)"
  )

#For female life expectancy, Africa is by far the lowest: its IQR lies below even the bottom 25% of
#Asia, the region closest to it. This means that life expectancy in Africa is considerably lower
#than in Asia and Europe. Europe has the smallest IQR, meaning less variability: in most European
#countries females live to about the same age, which may also signify better healthcare. Africa has
#a median life expectancy of about 58 years, Asia around 75 and Europe around 82.

#Boxplots of the fertility rate, one box per region
UN_regions %>%
  ggplot(mapping = aes(x = region, y = fertility, fill = region)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Fertility Rate by Region",
    x = "Region",
    y = "Fertility Rate"
  )

#The fertility rate is the highest in Africa, with a median of around 4.5 compared to Asia's 2.25 and
#Europe's 1. Africa also has the greatest variability of the three regions, as seen by its IQR of
#around 3.1 to 5 compared to Asia's 1.75 to 2.5 and Europe's 1 to 1.5. Asia also has a few notable
#outliers between 4 and 6, which indicates that some countries in Asia have much higher fertility
#rates than is typical for the region.

#Boxplots of infant mortality, one box per region
UN_regions %>%
  ggplot(mapping = aes(x = region, y = infantMortality, fill = region)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Infants Deaths per 1000 Live Births by Region",
    x = "Region",
    y = "Infants Deaths (by Age 1) per 1000 Live Births"
  )
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

#Africa has by far the highest median infant death rate, around 65, compared to Asia's rate of around
#22 and Europe's rate of around 5, which indicates drastic differences in access to healthcare and
#technology across these 3 regions. Asia also has an outlier of around 125, which indicates that some
#parts of Asia have extremely high infant death rates. Europe also has outliers; although they are
#very near the upper whisker, they still indicate that differences in infant death rates exist
#among European countries. Finally, Africa also has the largest IQR, which indicates a lot of
#variability between African countries.

#Question 1 B1

#I chose per capita GDP (ppgdp) and infant deaths (infantMortality) because I want to see:
#How does GDP affect infant death rates? I believe countries with a higher GDP also have
#lower infant death rates, as they have more technology to decrease the rate, and vice versa.

#Question 1 B2
#Scatter plot of infant mortality (y) against per capita GDP (x), one point per
#row of UN_Data. Rows missing either variable are dropped by ggplot2 with a
#warning.
UN_Data %>%
  ggplot(aes(x = ppgdp, y = infantMortality)) +
  geom_point(color = "orange") + #adds points to the scatter
  labs(
    #An explicit "\n" wraps the title. A raw line break inside the string would
    #carry the source indentation into the rendered second line.
    title = paste0(
      "Relationship between Per Capita GDP and Infant Death Rate\n",
      "(By Age 1) per 1000 Live Births"
    ),
    x = "Per Capita GDP",
    y = "Infant Death Rate (By Age 1) per 1000 Live Births"
  ) +
  theme_minimal()
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_point()`).

#Question 1 B3
#The scatter plot clearly shows the relationship between two continuous variables, whereas the
#histogram shows only one continuous variable and the box plot compares the distribution of a
#variable across groups.

#Question 1 B4
#As I predicted, the pattern shows that infant mortality rates by age 1 per 1000 live births are higher
#in countries with lower per capita GDP, and as GDP increases the infant death rates decrease.

#Question 2

#A
#Tally all passengers by survival outcome and convert the counts to shares
Titanic_Data %>%
  count(survived) %>%
  mutate(prop = prop.table(n))
##   survived   n     prop
## 1       no 809 0.618029
## 2      yes 500 0.381971
#The probability of survival is 38.197%

#B
#Survival counts and shares among male passengers only
Titanic_Data %>%
  filter(sex == "male") %>%
  count(survived) %>%
  mutate(prop = prop.table(n))
##   survived   n      prop
## 1       no 682 0.8090154
## 2      yes 161 0.1909846
#Survival counts and shares among female passengers only
Titanic_Data %>%
  filter(sex == "female") %>%
  count(survived) %>%
  mutate(prop = prop.table(n))
##   survived   n      prop
## 1       no 127 0.2725322
## 2      yes 339 0.7274678
#The probability of survival for a male was 19.098% and 72.747% for a female. 

#C
#Keep passengers whose age falls outside 18 to 45 (rows with a missing age are
#dropped by filter), compute the survival shares, then show only the survivors
Titanic_Data %>%
  filter(!between(age, 18, 45)) %>%
  count(survived) %>%
  mutate(prop = prop.table(n)) %>%
  filter(survived == "yes")
##   survived   n      prop
## 1      yes 143 0.4627832
#The combined probability of survival for ages below 18 and ages above 45 is 46.278%. 

#D 
#Survival counts and shares among first class passengers only
Titanic_Data %>%
  filter(passengerClass == "1st") %>%
  count(survived) %>%
  mutate(prop = prop.table(n))
##   survived   n     prop
## 1       no 123 0.380805
## 2      yes 200 0.619195
#Survival counts and shares among third class passengers only
Titanic_Data %>%
  filter(passengerClass == "3rd") %>%
  count(survived) %>%
  mutate(prop = prop.table(n))
##   survived   n      prop
## 1       no 528 0.7447109
## 2      yes 181 0.2552891
# 61.920% of first class passengers survived vs the 25.529% of third class that survived.