library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ dplyr   1.0.7
## ✓ tibble  3.1.4     ✓ stringr 1.4.0
## ✓ tidyr   1.1.3     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggplot2)

Question 1

a

# Load the lung-capacity data set from disk into a tibble
df <- read_csv(file = "LungCapData (1).csv")
## Rows: 725 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Smoke, Gender, Caesarean
## dbl (3): LungCap, Age, Height
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Plot a histogram to inspect how the LungCap values are distributed
hist(df$LungCap)

The distribution of lung capacity appears to be approximately normal, since most of the observations fall around the mean with fewer near the tails.

b

# Split the observations by gender so the two groups can be compared.
# Female respondents
df_fm <- df %>%
  filter(Gender == "female")

# Male respondents
df_m <- df %>%
  filter(Gender == "male")
# Side-by-side boxplots of lung capacity, males first. The boxes are labelled
# explicitly: without `names`, base R labels them only "1" and "2", so the
# plot itself does not say which group is which.
boxplot(
  df_m$LungCap, df_fm$LungCap,
  names = c("Male", "Female"),
  ylab = "LungCap"
)

The distribution of LungCap for males sits slightly higher than that for females, though both are centered around 8.

c

# Subset of respondents who smoke
df_smoke <- filter(df, Smoke == "yes")

# Subset of respondents who do not smoke
df_non_smoke <- filter(df, Smoke == "no")
# Average lung capacity within each subset (smokers first, then non-smokers)
mean(df_smoke[["LungCap"]])
## [1] 8.645455
mean(df_non_smoke[["LungCap"]])
## [1] 7.770188

The mean lung capacity of smokers (8.645) is larger than the mean lung capacity of non-smokers (7.770), which is surprising given that cigarettes are known to damage lung function over time.

d

# Split smokers into four age bands: 13 and under, 14-15, 16-17, 18 and up.
#
# Age is a numeric column, so it is compared against numeric literals.
# Quoting the bounds (Age <= '13', Age >= '18') makes R coerce Age to text
# and compare as strings, where e.g. "9" sorts after both "13" and "18", so
# any single-digit ages end up in the wrong band. Likewise,
# `Age == '14':'15'` recycles c(14, 15) down the column (the source of the
# former "longer object length" warning) and so tests alternating rows
# against 14 and 15; `%in%` tests every row against both values.
#
# NOTE(review): the means recorded before this fix (7.201852, 8.909375,
# 9.602083, 10.51333) were produced by the faulty filters. Re-run this chunk
# to obtain the corrected values.
df_age_1 <- df_smoke %>%
  filter(Age <= 13)

df_age_2 <- df_smoke %>%
  filter(Age %in% c(14, 15))

df_age_3 <- df_smoke %>%
  filter(Age %in% c(16, 17))

df_age_4 <- df_smoke %>%
  filter(Age >= 18)
# Mean lung capacity of smokers aged 13 and under
mean(df_age_1$LungCap)
# Mean lung capacity of smokers aged 14-15
mean(df_age_2$LungCap)
# Mean lung capacity of smokers aged 16-17
mean(df_age_3$LungCap)
# Mean lung capacity of smokers aged 18 and up
mean(df_age_4$LungCap)

Based on the mean lung capacity within each age group, older smokers (18+) have a higher mean lung capacity (10.513) than smokers in the younger age groups. The youngest age group (13 and under) understandably has the lowest mean lung capacity (7.202). The mean lung capacity of smokers increases at a fairly steady rate from each age group to the next.

e

# Split non-smokers into four age bands: 13 and under, 14-15, 16-17,
# 18 and up.
#
# Age is a numeric column, so it is compared against numeric literals.
# Quoting the bounds (Age <= '13', Age >= '18') makes R coerce Age to text
# and compare as strings, where e.g. "9" sorts after both "13" and "18", so
# any single-digit ages end up in the wrong band. Likewise,
# `Age == '14':'15'` recycles c(14, 15) down the column and so tests
# alternating rows against 14 and 15; `%in%` tests every row against both
# values.
#
# NOTE(review): the means recorded before this fix (7.571804, 8.8435,
# 10.39438, 6.5227) were produced by the faulty filters. Re-run this chunk
# to obtain the corrected values.
df_age_I <- df_non_smoke %>%
  filter(Age <= 13)

df_age_II <- df_non_smoke %>%
  filter(Age %in% c(14, 15))

df_age_III <- df_non_smoke %>%
  filter(Age %in% c(16, 17))

df_age_IIII <- df_non_smoke %>%
  filter(Age >= 18)
# Mean lung capacity of non-smokers aged 13 and under
mean(df_age_I$LungCap)
# Mean lung capacity of non-smokers aged 14-15
mean(df_age_II$LungCap)
# Mean lung capacity of non-smokers aged 16-17
mean(df_age_III$LungCap)
# Mean lung capacity of non-smokers aged 18 and up
mean(df_age_IIII$LungCap)

The mean lung capacities of non-smokers deviate slightly from the mean lung capacities of smokers across the age groups. Whereas the mean lung capacity of 18+ smokers was greater than that of the other groups, the mean lung capacity of 18+ non-smokers (6.523) is lower than that of the other groups. This could be due to the fact that there are far more non-smokers than smokers within that age group: 247 non-smokers compared to just 15 smokers. Given the greater volume of data, the mean lung capacity is prone to deviation as it accounts for more outliers.

f

# Pearson correlation coefficient between lung capacity and age
with(df, cor(LungCap, Age))
## [1] 0.8196749

With a correlation coefficient of 0.82, the relationship between lung capacity and age is strong. This signifies that as age increases, so does lung capacity, in a fairly linear way.

# Sample covariance between lung capacity and age
with(df, cov(LungCap, Age))
## [1] 8.738289

A covariance of 8.738 indicates a positive linear relationship between lung capacity and age.

Question 2

a

# Probability that a randomly selected inmate has exactly 2 prior convictions
160 / 810
## [1] 0.1975309

b

# Probability that a randomly selected inmate has fewer than 2 prior
# convictions. The numerator 562 is written as 128 + 434, the counts for
# 0 and 1 priors implied by the probabilities 0.158 and 0.536 of 810.
(128 + 434) / 810
## [1] 0.6938272

c

# Probability that a randomly selected inmate has 2 or fewer prior
# convictions. The numerator 722 is written as 128 + 434 + 160, i.e. the
# "fewer than 2" count (562) plus the 160 inmates with exactly 2.
(128 + 434 + 160) / 810
## [1] 0.891358

d

# Probability that a randomly selected inmate has more than 2 prior
# convictions. The numerator 88 is written as 64 + 24, the counts for 3 and
# 4 priors implied by the probabilities 0.079 and 0.03 of 810.
(64 + 24) / 810
## [1] 0.108642

e

# Possible numbers of prior convictions
convs <- c(0, 1, 2, 3, 4)
# Probability attached to each count, in the same order as `convs`
probs <- c(0.158, 0.536, 0.198, 0.079, 0.03)
# Expected value: weight each outcome by its probability, then add them up
sum(probs * convs)
## [1] 1.289

f

# Variance of the number of prior convictions: the probability-weighted mean
# of the squared deviations from the expected value. (`var(probs)` would
# instead return the sample variance of the five probability values
# themselves, which says nothing about the spread of the conviction counts.)
conv_mean <- sum(convs * probs)
conv_var <- sum((convs - conv_mean)^2 * probs)
conv_var
## [1] 0.8591405
# Standard deviation of prior convictions: square root of that variance
sqrt(conv_var)
## [1] 0.9268983