Rashbir Singh Kohli (s3810585)
For this assignment two libraries are used:
data.table - It is an extension of R's data.frame. It reads large datasets quickly (via fread), manages the data efficiently, identifies the separator automatically and does not import strings as factors by default. This makes the overall running of the code faster.
ggpubr - a visualisation library based on ggplot2.
library('data.table')
library("ggpubr")
# Read the measurements file and keep only the two columns analysed here:
# bicep girth (bic.gi) and sex, with sex stored as a factor.
data <- fread("bdims.csv")
df <- data[, c("bic.gi", "sex")]
df <- transform(df, sex = factor(sex))

# Split the girth column by sex code: rows coded 0 go to df_female and rows
# coded 1 go to df_male. Each result keeps a single bic.gi column.
df_female <- df[sex == 0, "bic.gi"]
df_male <- df[sex == 1, "bic.gi"]
For the purpose of calculating the summary statistics (i.e., mean, median, standard deviation, first and third quartile, interquartile range, minimum and maximum values) for the assignment, I used the following code:
# Build a two-column summary table for a numeric vector.
#
# The statistics are computed directly from the data. The previous approach
# parsed them out of the text printed by summary(), which is formatted to
# four significant digits, so the stored mean and quartiles were rounded.
#
# x: numeric vector of measurements.
# Returns a data.frame with a numeric column `Value` and a character column
# `metric`, with rows in the order Min, 1Q, Median, Mean, 3Q, Max, Std, IQR.
build_info_df <- function(x) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), names = FALSE)
  data.frame(
    Value = c(
      min(x), quartiles[1], median(x), mean(x), quartiles[2], max(x),
      sd(x), IQR(x)
    ),
    metric = c("Min", "1Q", "Median", "Mean", "3Q", "Max", "Std", "IQR"),
    stringsAsFactors = FALSE
  )
}

# Summary tables for the female and male bicep girth samples.
infoDfFe <- build_info_df(df_female$bic.gi)
infoDfMale <- build_info_df(df_male$bic.gi)
# Print each summary statistic of the female sample on its own line.
local({
  # Look up the value stored in infoDfFe for the given metric label.
  stat <- function(label) infoDfFe$Value[infoDfFe$metric == label]
  cat(
    " Mean value for female: ", stat("Mean"), "\n",
    "Median value for female: ", stat("Median"), "\n",
    "Standard Deviation value for female: ", stat("Std"), "\n",
    "First Quartile value for female: ", stat("1Q"), "\n",
    "Third Quartile value for female: ", stat("3Q"), "\n",
    "Interquartile Range value for female: ", stat("IQR"), "\n",
    "Minimum value for female: ", stat("Min"), "\n",
    "Maximum value for female: ", stat("Max")
  )
})
## Mean value for female: 28.1
## Median value for female: 27.8
## Standard Deviation value for female: 2.709477
## First Quartile value for female: 26.4
## Third Quartile value for female: 29.8
## Interquartile Range value for female: 3.4
## Minimum value for female: 22.4
## Maximum value for female: 40.3
# Print each summary statistic of the male sample on its own line.
local({
  # Look up the value stored in infoDfMale for the given metric label.
  stat <- function(label) infoDfMale$Value[infoDfMale$metric == label]
  cat(
    " Mean value for male: ", stat("Mean"), "\n",
    "Median value for male: ", stat("Median"), "\n",
    "Standard Deviation value for male: ", stat("Std"), "\n",
    "First Quartile value for male: ", stat("1Q"), "\n",
    "Third Quartile value for male: ", stat("3Q"), "\n",
    "Interquartile Range value for male: ", stat("IQR"), "\n",
    "Minimum value for male: ", stat("Min"), "\n",
    "Maximum value for male: ", stat("Max")
  )
})
## Mean value for male: 34.4
## Median value for male: 34.4
## Standard Deviation value for male: 2.982037
## First Quartile value for male: 32.5
## Third Quartile value for male: 36.4
## Interquartile Range value for male: 3.9
## Minimum value for male: 25.6
## Maximum value for male: 42.4
# Histogram of female bicep girth on the density scale, overlaid with the
# sample's kernel density estimate (solid blue) and a normal curve that uses
# the sample mean and standard deviation (dotted dark green).
hist(
  df_female$bic.gi,
  breaks = 20,
  probability = TRUE,
  xlab = "Bicep girth",
  # probability = TRUE draws densities rather than counts, so the axis is
  # labelled accordingly.
  ylab = "Density",
  main = "Histogram for Female Bicep Girth"
)
lines(density(df_female$bic.gi), col = "Blue", lwd = 2)
curve(
  dnorm(x, mean = mean(df_female$bic.gi), sd = sd(df_female$bic.gi)),
  add = TRUE, lty = "dotted", col = "darkgreen", lwd = 4
)

# Shrink the text only while the legend is drawn, then restore the previous
# graphics settings so later plots are not affected.
op <- par(cex = 0.7)
legend(
  "topright",
  legend = c("Density Curve for Female Sample", "Normal Curve"),
  bty = "n", text.col = "black", horiz = FALSE,
  # Legend keys use the same line types and widths as the curves they label.
  lty = c("solid", "dotted"), lwd = c(2, 4),
  col = c("Blue", "darkgreen")
)
par(op)
# Histogram of male bicep girth on the density scale, overlaid with the
# sample's kernel density estimate (solid blue) and a normal curve that uses
# the sample mean and standard deviation (dotted dark green).
hist(
  df_male$bic.gi,
  breaks = 20,
  probability = TRUE,
  xlab = "Bicep girth",
  # probability = TRUE draws densities rather than counts, so the axis is
  # labelled accordingly.
  ylab = "Density",
  main = "Histogram for Male Bicep Girth"
)
lines(density(df_male$bic.gi), col = "Blue", lwd = 2)
curve(
  dnorm(x, mean = mean(df_male$bic.gi), sd = sd(df_male$bic.gi)),
  add = TRUE, lty = "dotted", col = "darkgreen", lwd = 4
)

# Shrink the text only while the legend is drawn, then restore the previous
# graphics settings so later plots are not affected.
op <- par(cex = 0.7)
legend(
  "topright",
  legend = c("Density Curve for Male Sample", "Normal Curve"),
  bty = "n", text.col = "black", horiz = FALSE,
  # Legend keys use the same line types and widths as the curves they label.
  lty = c("solid", "dotted"), lwd = c(2, 4),
  col = c("Blue", "darkgreen")
)
par(op)
# Empirical check of the 68-95-99.7 rule for the female sample: percentage of
# girths lying strictly within 1, 2 and 3 standard deviations of the mean.
# The mean and standard deviation are computed from the data instead of being
# typed in as rounded constants, so the check follows the data if it changes.
print("68-95-99.7 rule")
## [1] "68-95-99.7 rule"
female_girth <- df_female$bic.gi
female_mean <- mean(female_girth)
female_sd <- sd(female_girth)
# The mean of a logical vector is the proportion of TRUE values.
mean(female_girth > female_mean - female_sd &
  female_girth < female_mean + female_sd) * 100
## [1] 69.61538
mean(female_girth > female_mean - 2 * female_sd &
  female_girth < female_mean + 2 * female_sd) * 100
## [1] 95.38462
mean(female_girth > female_mean - 3 * female_sd &
  female_girth < female_mean + 3 * female_sd) * 100
## [1] 99.23077
# Empirical check of the 68-95-99.7 rule for the male sample: percentage of
# girths lying strictly within 1, 2 and 3 standard deviations of the mean.
# The mean and standard deviation are computed from the data instead of being
# typed in as rounded constants, so the check follows the data if it changes.
print("68-95-99.7 rule")
## [1] "68-95-99.7 rule"
male_girth <- df_male$bic.gi
male_mean <- mean(male_girth)
male_sd <- sd(male_girth)
# The mean of a logical vector is the proportion of TRUE values.
mean(male_girth > male_mean - male_sd &
  male_girth < male_mean + male_sd) * 100
## [1] 68.01619
mean(male_girth > male_mean - 2 * male_sd &
  male_girth < male_mean + 2 * male_sd) * 100
## [1] 96.35628
mean(male_girth > male_mean - 3 * male_sd &
  male_girth < male_mean + 3 * male_sd) * 100
## [1] 100
# Q-Q plots of bicep girth for the female sample and then the male sample,
# drawn with ggpubr's ggqqplot().
ggqqplot(
  data = df_female[["bic.gi"]],
  main = " QQ Plot for Female Bicep Girth"
)
ggqqplot(
  data = df_male[["bic.gi"]],
  main = " QQ Plot for Male Bicep Girth"
)