note: My computer had trouble installing the rgl and rpanel packages… why would this be?
library(mvnormtest)
set.seed(15) #set the seed so that you can reproduce the random sampling
x<-rnorm(100) #randomly sample from the normal distribution
z<-rnorm(100)
b<-function(ro){sqrt(1/ro^2-1)}
ro<-0.8
y<-x+b(ro)*z #linear transformation?
cor(x,y)#sample correlation coefficient between x and y
## [1] 0.7792761
mshapiro.test(rbind(x,y)) #does not reject normality, the multivariate shapiro comes from ('mvnormtest')
##
## Shapiro-Wilk normality test
##
## data: Z
## W = 0.98394, p-value = 0.2657
plot(x,y, pch="*", main = 'Plotting two normal distributions against each other')
We now want to produce a two-dimensional smoothed density curve from the data using the ‘sm’, ‘rgl’,‘rpanel’ packages
library('sm')
## Package 'sm', version 2.2-5.4: type help(sm) for summary information
sm.density(cbind(x,y))
## Loading required package: rgl
## Loading required package: rpanel
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'rpanel'
sm.density(cbind(x,y),hmult=0.5)
## Loading required package: rgl
## Loading required package: rpanel
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'rpanel'
sm.density(cbind(x,y),hmult=1)
## Loading required package: rgl
## Loading required package: rpanel
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'rpanel'
sm.density(cbind(x,y),hmult=2)
## Loading required package: rgl
## Loading required package: rpanel
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'rpanel'
sm.density(cbind(x,y),hmult=3)
## Loading required package: rgl
## Loading required package: rpanel
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'rpanel'
sm.density(x, main = 'Examining a marginal density')
sm.density(7)
You can change the smoothness of the density by changing the parameter ‘hmult’.
Suppose that the normal distribution with mean 100 and standard deviation 16 is a good model for the IQ scores of American adults. Answer the following questions.
The proportion of adults with an IQ between 100 and 132 if we are assuming a normal distribution with a standard deviation of 16 is 47%. You can use the pnorm function by noticing that 132 is 2 standard deviations above the mean of 100.
pnorm(2)-pnorm(0)
## [1] 0.4772499
The IQ score of John Smith is 113, you can use the qnorm function to find the iq that can be evaluated as being at the 80th quartile.
qnorm(.8,mean = 100, sd =16)
## [1] 113.4659
Hmm, i’m not really sure how to approach this problem with R, so I will run a monte carlo simulation…
set.seed(15)
runs <- 10000
meanlist<-numeric(runs) #initialize the vector for the loop
for(i in 1:runs){meanlist[i]<-mean(rnorm(4,mean = 100,sd = 16))}
#sample the distribution 10000 times
hist(meanlist)
countlist<-numeric(runs)
for(i in 1:runs) {
if(meanlist[i] >= 108) {
countlist[i]<-0}
else {
countlist[i]<-1}
}
table(countlist)
## countlist
## 0 1
## 1576 8424
Chancebelow108<-sum(countlist)/length(countlist)
Chancebelow108
## [1] 0.8424
There is an 84% percent chance that the average score will be below 108
Let’s run another Monte Carlo
set.seed(15)
runs <- 10000
samp_matrix<-matrix(nrow = 4, ncol = 10000) #initialize a matrix for the loop, this time we need to evaluate all of the members in the sample.
for(i in 1:runs){samp_matrix[,i]<-(rnorm(4,mean = 100,sd = 16))}
#sample the distribution 10000 times
countAlllist<-numeric(runs)
for(i in 1:runs) {
for(j in 1:4){
if(samp_matrix[j,i] >= 108) {
countAlllist[i]<-0
break}
else {
countAlllist[i]<-1}
}}
table(countAlllist)
## countAlllist
## 0 1
## 7770 2230
ChanceAllbelow108<-sum(countAlllist)/length(countAlllist)
ChanceAllbelow108
## [1] 0.223
There is an 23% percent chance that all of the scores will be below 108
set.seed(15)
runs <- 10000
samp_matrix1<-matrix(nrow = 4, ncol = 10000) #initialize a matrix for the loop, this time we need to evaluate all of the members in the sample.
for(i in 1:runs){samp_matrix1[,i]<-(rnorm(4,mean = 100,sd = 16))}
#sample the distribution 10000 times
countAlllist1<-numeric(runs)
for(i in 1:runs) {
for(j in 1:4){
if(samp_matrix1[j,i] <= 108) {
countAlllist1[i]<-1
break}
else {
countAlllist1[i]<-0}
}}
table(countAlllist1)
## countAlllist1
## 0 1
## 93 9907
Chanceonebelow108<-sum(countAlllist1)/length(countAlllist1)
Chanceonebelow108
## [1] 0.9907
There is a 99% chance that at least one of the randomly sampled adults will have an IQ below 108
For the exploration component of this lab, I will be using a built in data set called Pima.te. This data is from a population of women who were at least 21 years old, of Pima Indian heritage and living near Phoenix, Arizona, and were tested for diabetes according to World Health Organization criteria. The data were collected by the US National Institute of Diabetes and Digestive and Kidney Diseases. Today we will compare blood pressure and body mass index.
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:sm':
##
## muscle
head(Pima.te)
## npreg glu bp skin bmi ped age type
## 1 6 148 72 35 33.6 0.627 50 Yes
## 2 1 85 66 29 26.6 0.351 31 No
## 3 1 89 66 23 28.1 0.167 21 No
## 4 3 78 50 32 31.0 0.248 26 Yes
## 5 2 197 70 45 30.5 0.158 53 Yes
## 6 5 166 72 19 25.8 0.587 51 Yes
#let's do five number summary's, boxplots, and histograms first
BP<-Pima.te$bp
BMI<-Pima.te$bmi
fivenum(BP)
## [1] 24 64 72 80 110
fivenum(BMI)
## [1] 19.40 28.15 32.90 37.20 67.10
boxplot(BP,main = 'Blood Pressure of Pima Indian Women', ylab = 'Blood pressure (mmHG)')
boxplot(BMI,main = 'Body Mass Index of Pima Indian Women', ylab = 'BMI')
hist(BP,main = 'Blood Pressure of Pima Indian Women',xlab = 'Blood Pressure (mmHG)')
hist(BMI, main = 'Body Mass Index of Pima Indian Women', xlab = 'BMI')
One thing to note about these simple univariate summaries is that there is a tiny bit of a rightward skew in the BMI data. This shouldn’t be too much of a problem because the rest of the data looks like it is normally distributed.
Now, let’s do some tests for normality,and superimpose an appropriate normal distribution onto the histogram
bpm<-mean(BP)
bps<-sd(BP)
bmim<-mean(BMI)
bmis<-sd(BMI)
hist(BP,main = 'Blood Pressure of Pima Indian Women',xlab = 'Blood Pressure (mmHG)',prob = T)
curve(dnorm(x, mean=bpm, sd=bps),add=T, col="red", lty=2)
shapiro.test(BP)
##
## Shapiro-Wilk normality test
##
## data: BP
## W = 0.98683, p-value = 0.004076
qqnorm(BP, main = 'Normal QQ plot of blood pressure')
qqline(BP)
hist(BMI, main = 'Body Mass Index of Pima Indian Women', xlab = 'BMI', prob=T)
curve(dnorm(x, mean=bmim, sd=bmis),add=T, col="red", lty=2)
shapiro.test(BMI)
##
## Shapiro-Wilk normality test
##
## data: BMI
## W = 0.96679, p-value = 6.917e-07
qqnorm(BMI, main = 'Normal QQ plot of body mass index')
qqline(BMI)
Interesting! The data appears to be normal, but our Shapiro-Wilk tests suggest that we should reject the null-hypothesis which is that the data are normal! Perhaps this is because of the skew evident at the endpoints of the quantile plots?
plot(BP,BMI, main = 'Blood pressure plotted against BMI in Pima Indian Women', xlab = 'Blood Pressure (mmHG)', ylab = 'BMI',pch = '*')
cor(BP,BMI) #not much of a correlation there!
## [1] 0.3381926
sm.density(cbind(BP,BMI))
## Loading required package: rgl
## Loading required package: rpanel
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'rpanel'
There does not appear to be much of a correlation between blood pressure and body mass index in Pima Indian women.