This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(rmarkdown)
library(readxl)
bdims <- read_csv("C:/Users/thyagu/rmit/applied analytics/assign3/bdims.csv")
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
class(bdims$wgt)
## [1] "numeric"
class(bdims$hgt)
## [1] "numeric"
sum(is.na(bdims$hgt))
## [1] 0
sum(is.na(bdims$wgt))
## [1] 0
#boxplot
boxplot(bdims$che.di,main = "Chest diameter")
#boxplot
boxplot(bdims$hgt,main = "Height")
par(mfrow=c(2,2))
bdims$hgt %>% hist(main = "height")
log(bdims$hgt) %>% hist(main = "log(height)")
bdims$che.di %>% hist(main = "Chest diameter")
log(bdims$che.di) %>% hist(main = "log(Chest diameter)")
plot(log(che.di) ~ log(hgt), data = bdims, xlab = "log(Height)", ylab = "log(Chest diameter)")
bdims_summary <- bdims %>%
summarize(N = n(), r = cor(che.di, hgt), mean_hgt = mean(hgt),
sd_hgt = sd(hgt),Max_hgt=max(hgt),IQR_hgt = IQR(hgt),mean_chestdi = mean(che.di), sd_chestdi = sd(che.di),Max_chestdi=max(che.di),IQR_chestdi = IQR(che.di))
bdims_summary
## # A tibble: 1 x 10
## N r mean_hgt sd_hgt Max_hgt IQR_hgt mean_chestdi sd_chestdi
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 507 0.627 171. 9.41 198. 14 28.0 2.74
## # ... with 2 more variables: Max_chestdi <dbl>, IQR_chestdi <dbl>
bdims_logsummary <- bdims %>%
summarize(N = n(), r = cor(log(che.di), log(hgt)), mean_hgt = mean(log(hgt)),
sd_hgt = sd(log(hgt)),Max_hgt=max(log(hgt)),IQR_hgt =IQR(log(hgt)),mean_chestdi = mean(log(che.di)),sd_chestdi=sd(log(che.di)),Max_chestdi=max(log(che.di)),IQR_chestdi = IQR(log(che.di)))
bdims_logsummary
## # A tibble: 1 x 10
## N r mean_hgt sd_hgt Max_hgt IQR_hgt mean_chestdi sd_chestdi
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 507 0.630 5.14 0.0549 5.29 0.0820 3.33 0.0976
## # ... with 2 more variables: Max_chestdi <dbl>, IQR_chestdi <dbl>
# bdims_summary <- bdims %>%
# summarize(N = n(), r = cor(che.di, hgt), mean_hgt = mean(hgt),
# sd_hgt = sd(hgt), mean_chestdi = mean(che.di), sd_chestdi = #sd(che.di))
# benchmark_chest <- quantile(bdims$che.di,probs=.75,na.rm=TRUE) + 1.5*IQR(bdims$che.di)
# benchmark_chest
#
#
#
# #
# benchmark_hgt <- quantile(bdims$hgt,probs=.75,na.rm=TRUE) + 1.5*IQR(bdims$hgt)
# benchmark_hgt
# bdims_summary %>%
# mutate(slope = r*sd_chestdi/sd_hgt,
# intercept = mean_chestdi - slope*mean_hgt)
# #Formulas
# sum_x <- sum(bdims$hgt)
# sum_y <- sum(bdims$che.di)
# sum_x_sq <- sum(bdims$hgt^2)
# sum_y_sq <- sum(bdims$che.di^2)
# sum_xy <- sum(bdims$hgt*bdims$che.di)
# n <- length(bdims$che.di) #Sample size
# Lxx <- sum_x_sq-((sum_x^2)/n)
# Lyy <- sum_y_sq-((sum_y^2)/n)
# Lxy = sum_xy - (((sum_x)*(sum_y))/n)
# b = Lxy/Lxx
# a = mean(bdims$che.di - b*mean(bdims$hgt))
# print(a)
# print(b)
#
# plot(che.di ~ hgt, data = bdims, xlab = "Height", ylab = "Chest diameter")
# abline(a = a, b = b, col= "red")
#lm model
heightchestmodel <- lm(log(che.di) ~ log(hgt), data = bdims)
heightchestmodel %>% summary()
##
## Call:
## lm(formula = log(che.di) ~ log(hgt), data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.22123 -0.05121 0.00044 0.05057 0.22962
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.42566 0.31592 -7.678 8.43e-14 ***
## log(hgt) 1.11888 0.06145 18.208 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0759 on 505 degrees of freedom
## Multiple R-squared: 0.3963, Adjusted R-squared: 0.3951
## F-statistic: 331.5 on 1 and 505 DF, p-value: < 2.2e-16
#p-value of the observed F statistic
pf(q = 331.5,1,505,lower.tail = FALSE)
## [1] 2.559747e-57
#RegSS and ResSS values
heightchestmodel %>% anova()
## Analysis of Variance Table
##
## Response: log(che.di)
## Df Sum Sq Mean Sq F value Pr(>F)
## log(hgt) 1 1.9098 1.90984 331.55 < 2.2e-16 ***
## Residuals 505 2.9090 0.00576
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#coefficient table to report sample estimates for constant a and slope b
heightchestmodel %>% summary() %>% coef()
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.425662 0.31592406 -7.677989 8.431925e-14
## log(hgt) 1.118881 0.06144838 18.208469 2.522546e-57
#confint() function to capture a and b intervals
heightchestmodel %>% confint()
## 2.5 % 97.5 %
## (Intercept) -3.0463489 -1.804974
## log(hgt) 0.9981551 1.239607
#for B slope
2*pt(q = 18.21,df = 505, lower.tail=FALSE)
## [1] 2.480335e-57
#plot to summarise the relationship
plot(log(che.di) ~ log(hgt), data = bdims, xlab = "height", ylab = "chest diameter")
abline(heightchestmodel, col = "red")
#Assumptions plot
#plot(heightchestmodel)
#residuals vs fitted
heightchestmodel %>% plot(which=1)
#Normal Q-Q
heightchestmodel %>% plot(which=2)
#SCALE-LOCATION
heightchestmodel %>% plot(which=3)
#residuals vs LEVERAGE
heightchestmodel %>% plot(which=5)
#Correlation
r <- cor(log(bdims$che.di),log(bdims$hgt),use = "complete.obs")
r
## [1] 0.6295466
#install.packages("Hmisc")
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
#full correlation analysis between hgt and Che_di in R by installing the Hmisc library and using the rcorr() function.
#bivariate<-as.matrix(dplyr::select(bdims, log(bdims$che.di),log(bdims$hgt))) #Create a matrix of the variables to be correlated
#rcorr(bivariate, type = "pearson")
#install.packages("psychometric")
library(psychometric)
## Loading required package: multilevel
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'psychometric'
## The following object is masked from 'package:ggplot2':
##
## alpha
CIr(r, n = 507, level = .95)
## [1] 0.5739283 0.6793837
detach("package:psychometric",unload = TRUE)
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.