This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(rmarkdown)
library(readxl)
bdims <- read_csv("C:/Users/thyagu/rmit/applied analytics/assign3/bdims.csv")
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
class(bdims$wgt)
## [1] "numeric"
class(bdims$hgt)
## [1] "numeric"
sum(is.na(bdims$hgt))
## [1] 0
sum(is.na(bdims$wgt))
## [1] 0
#boxplot
boxplot(bdims$che.di,main = "Chest diameter")
#boxplot
boxplot(bdims$hgt,main = "Height")
bdims_summary <- bdims %>%
summarize(N = n(), r = cor(che.di, hgt), mean_hgt = mean(hgt),
sd_hgt = sd(hgt),Max_hgt=max(hgt),IQR_hgt = IQR(hgt),mean_chestdi = mean(che.di), sd_chestdi = sd(che.di),Max_chestdi=max(che.di),IQR_chestdi = IQR(che.di))
bdims_summary
## # A tibble: 1 x 10
## N r mean_hgt sd_hgt Max_hgt IQR_hgt mean_chestdi sd_chestdi
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 507 0.627 171. 9.41 198. 14 28.0 2.74
## # ... with 2 more variables: Max_chestdi <dbl>, IQR_chestdi <dbl>
#
benchmark_chest <- quantile(bdims$che.di,probs=.75,na.rm=TRUE) + 1.5*IQR(bdims$che.di)
benchmark_chest
## 75%
## 36.4
#
benchmark_hgt <- quantile(bdims$hgt,probs=.75,na.rm=TRUE) + 1.5*IQR(bdims$hgt)
benchmark_hgt
## 75%
## 198.8
# bdims_summary %>%
# mutate(slope = r*sd_chestdi/sd_hgt,
# intercept = mean_chestdi - slope*mean_hgt)
#Formulas
sum_x <- sum(bdims$hgt)
sum_y <- sum(bdims$che.di)
sum_x_sq <- sum(bdims$hgt^2)
sum_y_sq <- sum(bdims$che.di^2)
sum_xy <- sum(bdims$hgt*bdims$che.di)
n <- length(bdims$che.di) #Sample size
Lxx <- sum_x_sq-((sum_x^2)/n)
Lyy <- sum_y_sq-((sum_y^2)/n)
Lxy = sum_xy - (((sum_x)*(sum_y))/n)
b = Lxy/Lxx
a = mean(bdims$che.di - b*mean(bdims$hgt))
print(a)
## [1] -3.294657
print(b)
## [1] 0.1827027
plot(che.di ~ hgt, data = bdims, xlab = "Height", ylab = "Chest diameter")
abline(a = a, b = b, col= "red")
#lm model
heightchestmodel <- lm(che.di ~ hgt, data = bdims)
heightchestmodel %>% summary()
##
## Call:
## lm(formula = che.di ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3102 -1.4326 -0.0696 1.4168 6.8929
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.2947 1.7319 -1.902 0.0577 .
## hgt 0.1827 0.0101 18.082 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.138 on 505 degrees of freedom
## Multiple R-squared: 0.393, Adjusted R-squared: 0.3918
## F-statistic: 327 on 1 and 505 DF, p-value: < 2.2e-16
#p-value of the observed F statistic
pf(q = 327,1,505,lower.tail = FALSE)
## [1] 1.00343e-56
#RegSS and ResSS values
heightchestmodel %>% anova()
## Analysis of Variance Table
##
## Response: che.di
## Df Sum Sq Mean Sq F value Pr(>F)
## hgt 1 1494.7 1494.73 326.95 < 2.2e-16 ***
## Residuals 505 2308.7 4.57
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#coefficient table to report sample estimates for constant a and slope b
heightchestmodel %>% summary() %>% coef()
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.2946566 1.7318756 -1.902363 5.769227e-02
## hgt 0.1827027 0.0101042 18.081859 1.017694e-56
#confint() function to capture a and b intervals
heightchestmodel %>% confint()
## 2.5 % 97.5 %
## (Intercept) -6.6972252 0.1079121
## hgt 0.1628512 0.2025541
#for B slope
2*pt(q = 18.08,df = 505, lower.tail=FALSE)
## [1] 1.038739e-56
#plot to summarise the relationship
plot(che.di ~ hgt, data = bdims, xlab = "height", ylab = "chest diameter")
abline(heightchestmodel, col = "red")
#Assumptions plot
plot(heightchestmodel)
#Correlation
cor(bdims$che.di,bdims$hgt)
## [1] 0.6268931
#install.packages("Hmisc")
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
#full correlation analysis between hgt and Che_di in R by installing the Hmisc library and using the rcorr() function.
bivariate<-as.matrix(dplyr::select(bdims, che.di,hgt)) #Create a matrix of the variables to be correlated
rcorr(bivariate, type = "pearson")
## che.di hgt
## che.di 1.00 0.63
## hgt 0.63 1.00
##
## n= 507
##
##
## P
## che.di hgt
## che.di 0
## hgt 0
#install.packages("psychometric")
library(psychometric)
## Loading required package: multilevel
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'psychometric'
## The following object is masked from 'package:ggplot2':
##
## alpha
r=cor(bdims$che.di,bdims$hgt)
CIr(r = r, n = 507, level = .95)
## [1] 0.5709813 0.6770164
```
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.