library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(here)
## here() starts at C:/Users/user/Documents
# Load the body-dimensions data set. The path is built with here::here() so
# the script does not depend on one machine's absolute directory layout;
# Body.csv is expected in the project root that here() reports on load.
bdims <- read.csv(here::here("Body.csv"))
# Preview the first four rows to check that the columns were read as expected.
head(bdims, 4)
## BiaDi DiiDi BitDi CheDe CheDi ElbDi WriDi KneDi AnkDi ShoGi CheGi WaiGi NavGi
## 1 42.9 26.0 31.5 17.7 28.0 13.1 10.4 18.8 14.1 106.2 89.5 71.5 74.5
## 2 43.7 28.5 33.5 16.9 30.8 14.0 11.8 20.6 15.1 110.5 97.0 79.0 86.5
## 3 40.1 28.2 33.3 20.9 31.7 13.9 10.9 19.7 14.1 115.1 97.5 83.2 82.9
## 4 44.3 29.9 34.0 18.4 28.2 13.9 11.2 20.9 15.0 104.5 97.0 77.8 78.8
## HipGi ThiGi BicGi ForGi KneGi CalGi AnkGi WriGi Age Wgt Hgt Sex
## 1 93.5 51.5 32.5 26.0 34.5 36.5 23.5 16.5 21 65.6 174.0 1
## 2 94.8 51.5 34.4 28.0 36.5 37.5 24.5 17.0 23 71.8 175.3 1
## 3 95.0 57.3 33.4 28.8 37.0 37.3 21.9 16.9 28 80.7 193.5 1
## 4 94.0 53.0 31.0 26.2 37.0 34.8 23.0 16.6 23 72.6 186.5 1
# Split the measurements on the Sex indicator column: rows coded 1 go to
# mdims and rows coded 0 go to fdims.
mdims <- filter(bdims, Sex == 1)
fdims <- filter(bdims, Sex == 0)
###3. Make a plot (or plots) to visualize the distributions of men’s and women’s heights. How do their centers, shapes, and spreads compare?
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Side-by-side histograms of Hgt for mdims and fdims. Each plot is assembled
# from a base ggplot object, a histogram layer and a shared theme object.
gginit.male <- ggplot(mdims, aes(x = Hgt))
plottype.male <- geom_histogram(
  binwidth = 5, color = "blue", fill = "yellow", alpha = 1
)
gginit.female <- ggplot(fdims, aes(x = Hgt))
plottype.female <- geom_histogram(
  binwidth = 3, color = "maroon", fill = "white", alpha = 1
)
# NOTE(review): this object shares its name with ggplot2's theme() function.
# Calls to theme() still resolve to the function, but a distinct name would
# be clearer.
theme <- theme_gray()
# Both panels use the same x-axis label and the same 100-200 axis limits so
# the two distributions can be compared directly.
male.plot <- gginit.male + plottype.male + theme +
  xlab("Height") + ggtitle("Distribution of Male") + xlim(100, 200)
female.plot <- gginit.female + plottype.female + theme +
  xlab("Height") + ggtitle("Distribution of Female") + xlim(100, 200)
# There are two plots, so lay them out in two columns. The previous ncol = 3
# reserved an empty third column and squeezed both histograms into two thirds
# of the figure width.
grid.arrange(male.plot, female.plot, ncol = 2)
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 2 rows containing missing values (geom_bar).
# Mean and standard deviation of fdims$Hgt; these parameterise the normal
# curve drawn over the histogram below and are reused by later calculations.
fhgtmean <- mean(fdims$Hgt)
fhgtsd <- sd(fdims$Hgt)
# Density-scaled histogram of fdims$Hgt with the fitted normal density drawn
# on top. after_stat(density) replaces the legacy ..density.. notation, and
# the dnorm() arguments are passed as a list, as stat_function() documents.
ggplot(fdims, aes(x = Hgt)) +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "black",
    fill = "cyan3",
    binwidth = 3
  ) +
  stat_function(
    fun = dnorm,
    args = list(mean = fhgtmean, sd = fhgtsd),
    colour = "tomato",
    size = 1.5
  )
### 4. Based on this plot, does it appear that the data follow a nearly normal distribution?
###Yes. As we can see, the female heights are distributed roughly symmetrically around the mean, so the data appear to follow a nearly normal distribution.
# Normal Q-Q plot of fdims$Hgt. geom_qq_line() adds the reference line, so
# departures from normality can be judged against it rather than by eye.
ggplot(data = fdims, mapping = aes(sample = Hgt)) +
  geom_qq() +
  geom_qq_line()
# Simulate one normal sample with the same number of observations, mean and
# standard deviation as fdims$Hgt, for comparison with the real data.
sim_norm <- rnorm(n = nrow(fdims), mean = fhgtmean, sd = fhgtsd)
###5. Make a normal probability plot of sim_norm. Do all of the points fall on the line? How does this plot compare to the probability plot for the real data? (Since sim_norm is not a dataframe, it can be put directly into the sample argument and the data argument can be dropped.)
# Normal Q-Q plot of the simulated sample. sim_norm is a plain vector, so it
# is mapped directly and no data argument is needed. geom_qq_line() draws the
# reference line that the question asks the points to be compared against.
ggplot(mapping = aes(sample = sim_norm)) +
  geom_qq() +
  geom_qq_line() +
  labs(
    x = "theory",
    y = "sample",
    title = "sim_norm Normal Probability"
  )
### From the plot we can see that the points do not all fall exactly on a line, but most of them unite to form a nearly straight line, much like the probability plot for the real data.
# Draw a 3 x 3 grid of normal Q-Q plots: the first panel shows `dat`, the
# remaining eight show samples simulated from a normal distribution with the
# same length, mean and standard deviation as `dat`.
#
# Args:
#   dat: numeric vector to compare against simulated normal samples.
#
# Returns:
#   `dat`, invisibly; the function is called for its plotting side effect.
qqnormsim <- function(dat) {
  # par() returns the previous settings. Restore them on exit (even if a
  # plotting call fails) instead of forcing the layout back to 1 x 1, so the
  # caller's own multi-panel layout is not clobbered.
  old_par <- par(mfrow = c(3, 3))
  on.exit(par(old_par), add = TRUE)

  qqnorm(dat, main = "Normal QQ Plot (Data)")
  qqline(dat)

  # The simulation parameters are the same for every panel; compute them once.
  n_obs <- length(dat)
  dat_mean <- mean(dat)
  dat_sd <- sd(dat)
  for (i in seq_len(8)) {
    simnorm <- rnorm(n = n_obs, mean = dat_mean, sd = dat_sd)
    qqnorm(simnorm, main = "Normal QQ Plot (Sim)")
    qqline(simnorm)
  }

  invisible(dat)
}
# Compare the Q-Q plot of the heights in fdims with eight simulated normal
# samples of matching size, mean and standard deviation.
qqnormsim(fdims[["Hgt"]])
###6. Does the normal probability plot for female heights look similar to the plots created for the simulated data? That is, do the plots provide evidence that the female heights are nearly normal?(Mark - 0.5)
###Yes, the normal probability plot for female heights looks similar to the plots created for the simulated data. Evidence: the plot of the real data produced by the qqnormsim() function is similar to the simulated normal probability plots.
###7. Using the same technique, determine whether or not female weights appear to come from a normal distribution.(Mark - 0.5)
# Normal Q-Q plot of fdims$Wgt, with geom_qq_line() as the reference line
# against which the points are judged.
ggplot(data = fdims, mapping = aes(sample = Wgt)) +
  geom_qq() +
  geom_qq_line() +
  labs(
    x = "theory",
    y = "sample",
    title = " Normal Probability"
  )
# Same comparison as for heights: the real data next to eight simulated
# normal samples with matching size, mean and standard deviation.
qqnormsim(fdims$Wgt)
### The female weights are likely to follow a normal distribution. Evidence: using the same technique, the Q-Q plot of the actual values is comparable to the Q-Q plots of the simulated (theoretical) values.
###8. Plot the distribution of female weights using a histogram. (Mark - 0.5)
# Mean and standard deviation of fdims$Wgt, used to parameterise the normal
# curve drawn over the histogram.
fwgtmean <- mean(fdims$Wgt)
fwgtsd <- sd(fdims$Wgt)
# Density-scaled histogram of fdims$Wgt with the fitted normal density on
# top. after_stat(density) replaces the legacy ..density.. notation, and the
# dnorm() arguments are passed as a list, as stat_function() documents.
ggplot(fdims, aes(x = Wgt)) +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "Blue",
    fill = "Light blue",
    binwidth = 5
  ) +
  stat_function(
    fun = dnorm,
    args = list(mean = fwgtmean, sd = fwgtsd),
    colour = "Gray",
    size = 3
  ) +
  labs(
    title = "Histogram for density distribution of Female Weights",
    x = "Female Weights",
    y = "Density"
  )
# Theoretical probability that a height exceeds 182 under a normal model with
# the mean and standard deviation of fdims$Hgt.
1 - pnorm(182, mean = fhgtmean, sd = fhgtsd)
## [1] 0.004434387
# Empirical counterpart: the share of rows in fdims with Hgt above 182.
summarise(
  filter(fdims, Hgt > 182),
  percent = n() / nrow(fdims)
)
## percent
## 1 0.003846154
###9. Write out two probability questions that you would like to answer; one regarding female heights and one regarding male weights. Calculate those probabilities using both the theoretical normal distribution as well as the empirical distribution (four probabilities in all). Which variable, height or weight, had a closer agreement between the two methods?(Mark - 0.5)
### 1. What is the probability that a female's height is less than 175 cm?
##theoretical
# Theoretical probability that a height is below 175 under a normal model
# with the mean and standard deviation of fdims$Hgt.
less <- pnorm(175, mean = fhgtmean, sd = fhgtsd)
print(less)
## [1] 0.9391272
###Empirical distribution
# Empirical share of rows in fdims whose Hgt is below 175.
summarise(
  filter(fdims, Hgt < 175),
  percent = n() / nrow(fdims)
)
## percent
## 1 0.9153846
###2. What is the probability that a male's weight is between 80 kg and 85 kg?
##theoretical
# Theoretical P(80 < Wgt < 85) under a normal model fitted to mdims$Wgt.
# The mean must be the scalar mean(mdims$Wgt). Passing the whole Wgt column
# as `mean` made pnorm() evaluate once per observation and return one value
# per row instead of a single probability; the long vector that used to be
# printed here was an artefact of that bug.
mwgtmean <- mean(mdims$Wgt)
mwgtsd <- sd(mdims$Wgt)
pnorm(q = 85, mean = mwgtmean, sd = mwgtsd) -
  pnorm(q = 80, mean = mwgtmean, sd = mwgtsd)
# Empirical share of rows in mdims with Wgt strictly between 80 and 85.
with(mdims, sum(Wgt > 80 & Wgt < 85) / length(Wgt))
## [1] 0.1740891
###We can see that the height data are closer to a normal distribution than the weight data. Evidence: the empirical probability for height is close to the theoretical normal probability, while the empirical and theoretical probabilities for weight are not close.