R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2) 
library(rmarkdown) 

library(readxl) 
bdims <- read_csv("C:/Users/thyagu/rmit/applied analytics/assign3/bdims.csv")
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.
class(bdims$wgt)
## [1] "numeric"
class(bdims$hgt)
## [1] "numeric"
sum(is.na(bdims$hgt))
## [1] 0
sum(is.na(bdims$wgt))  
## [1] 0
#boxplot
boxplot(bdims$che.di,main = "Chest diameter")

#boxplot
boxplot(bdims$hgt,main = "Height")

par(mfrow=c(2,2))
bdims$hgt %>% hist(main = "height")
log(bdims$hgt) %>% hist(main = "log(height)")
bdims$che.di %>% hist(main = "Chest diameter")
log(bdims$che.di) %>% hist(main = "log(Chest diameter)")

plot(log(che.di) ~ log(hgt), data = bdims, xlab = "log(Height)", ylab = "log(Chest diameter)")

bdims_summary <- bdims %>%
summarize(N = n(), r = cor(che.di, hgt), mean_hgt = mean(hgt),
sd_hgt = sd(hgt),Max_hgt=max(hgt),IQR_hgt = IQR(hgt),mean_chestdi = mean(che.di), sd_chestdi = sd(che.di),Max_chestdi=max(che.di),IQR_chestdi = IQR(che.di))
bdims_summary
## # A tibble: 1 x 10
##       N     r mean_hgt sd_hgt Max_hgt IQR_hgt mean_chestdi sd_chestdi
##   <int> <dbl>    <dbl>  <dbl>   <dbl>   <dbl>        <dbl>      <dbl>
## 1   507 0.627     171.   9.41    198.      14         28.0       2.74
## # ... with 2 more variables: Max_chestdi <dbl>, IQR_chestdi <dbl>
bdims_logsummary <- bdims %>%
summarize(N = n(), r = cor(log(che.di), log(hgt)), mean_hgt = mean(log(hgt)),
sd_hgt = sd(log(hgt)),Max_hgt=max(log(hgt)),IQR_hgt =IQR(log(hgt)),mean_chestdi = mean(log(che.di)),sd_chestdi=sd(log(che.di)),Max_chestdi=max(log(che.di)),IQR_chestdi = IQR(log(che.di)))
bdims_logsummary
## # A tibble: 1 x 10
##       N     r mean_hgt sd_hgt Max_hgt IQR_hgt mean_chestdi sd_chestdi
##   <int> <dbl>    <dbl>  <dbl>   <dbl>   <dbl>        <dbl>      <dbl>
## 1   507 0.630     5.14 0.0549    5.29  0.0820         3.33     0.0976
## # ... with 2 more variables: Max_chestdi <dbl>, IQR_chestdi <dbl>
# bdims_summary <- bdims %>%
#   summarize(N = n(), r = cor(che.di, hgt), mean_hgt = mean(hgt),
#             sd_hgt = sd(hgt), mean_chestdi = mean(che.di), sd_chestdi = #sd(che.di))


 
# benchmark_chest <- quantile(bdims$che.di,probs=.75,na.rm=TRUE) + 1.5*IQR(bdims$che.di)
# benchmark_chest
# 
#  
# 
# #
# benchmark_hgt <- quantile(bdims$hgt,probs=.75,na.rm=TRUE) + 1.5*IQR(bdims$hgt)
# benchmark_hgt

# bdims_summary %>%
#   mutate(slope = r*sd_chestdi/sd_hgt, 
#          intercept = mean_chestdi - slope*mean_hgt) 

# #Formulas
# sum_x <- sum(bdims$hgt)
# sum_y <- sum(bdims$che.di) 
# sum_x_sq <- sum(bdims$hgt^2)
# sum_y_sq <- sum(bdims$che.di^2)
# sum_xy <- sum(bdims$hgt*bdims$che.di)
# n <- length(bdims$che.di) #Sample size
# Lxx <- sum_x_sq-((sum_x^2)/n)
# Lyy <- sum_y_sq-((sum_y^2)/n)
# Lxy = sum_xy - (((sum_x)*(sum_y))/n)
# b = Lxy/Lxx 
# a = mean(bdims$che.di - b*mean(bdims$hgt))
# print(a)
# print(b)
# 
# plot(che.di ~ hgt, data = bdims, xlab = "Height", ylab = "Chest diameter")
# abline(a = a, b = b, col= "red")


#lm model
heightchestmodel <- lm(log(che.di) ~ log(hgt), data = bdims)
heightchestmodel %>% summary()
## 
## Call:
## lm(formula = log(che.di) ~ log(hgt), data = bdims)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.22123 -0.05121  0.00044  0.05057  0.22962 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.42566    0.31592  -7.678 8.43e-14 ***
## log(hgt)     1.11888    0.06145  18.208  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0759 on 505 degrees of freedom
## Multiple R-squared:  0.3963, Adjusted R-squared:  0.3951 
## F-statistic: 331.5 on 1 and 505 DF,  p-value: < 2.2e-16
#p-value of the observed F statistic 
pf(q = 331.5,1,505,lower.tail = FALSE)
## [1] 2.559747e-57
#RegSS and ResSS values 
heightchestmodel %>% anova()
## Analysis of Variance Table
## 
## Response: log(che.di)
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## log(hgt)    1 1.9098 1.90984  331.55 < 2.2e-16 ***
## Residuals 505 2.9090 0.00576                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#coefficient table to report sample estimates for constant a and slope b 

heightchestmodel %>% summary() %>% coef()
##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -2.425662 0.31592406 -7.677989 8.431925e-14
## log(hgt)     1.118881 0.06144838 18.208469 2.522546e-57
#confint() function to capture a and b intervals 
heightchestmodel %>% confint()
##                  2.5 %    97.5 %
## (Intercept) -3.0463489 -1.804974
## log(hgt)     0.9981551  1.239607
#for B slope
2*pt(q = 18.21,df = 505, lower.tail=FALSE)
## [1] 2.480335e-57
#plot to summarise the relationship
plot(log(che.di) ~ log(hgt), data = bdims, xlab = "height", ylab = "chest diameter")
abline(heightchestmodel, col = "red")

#Assumptions plot

#plot(heightchestmodel) 

#residuals vs fitted
heightchestmodel %>% plot(which=1)

#Normal Q-Q
heightchestmodel %>% plot(which=2)

#SCALE-LOCATION
heightchestmodel %>% plot(which=3)

#residuals vs LEVERAGE
heightchestmodel %>% plot(which=5)

#Correlation
r <- cor(log(bdims$che.di),log(bdims$hgt),use = "complete.obs")
r
## [1] 0.6295466
#install.packages("Hmisc")
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
#full correlation analysis between hgt and Che_di in R by installing the Hmisc library and using the rcorr() function.
#bivariate<-as.matrix(dplyr::select(bdims, log(bdims$che.di),log(bdims$hgt))) #Create a matrix of the variables to be correlated
#rcorr(bivariate, type = "pearson") 

#install.packages("psychometric")
library(psychometric)
## Loading required package: multilevel
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Attaching package: 'psychometric'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
CIr(r, n = 507, level = .95)
## [1] 0.5739283 0.6793837
detach("package:psychometric",unload = TRUE)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.