library(ggplot2)
library(boot)
library(gtools)
##
## Attaching package: 'gtools'
## The following objects are masked from 'package:boot':
##
## inv.logit, logit
setwd("/Users/archanabalan/Documents/Spring Semester/DS")
Chapter 4: Problem 10
Create bivariate plots
# load shhs data
shhs = read.delim("shhs.txt")
# Vector of labels to create the plots
col_lab = c('bmi_s1','age_s1','time_bed','rdi4p','waist')
# Output all the means
output_mean = list()
for (i in 1:length(col_lab)) {
  output_mean[i] = mean(shhs[[col_lab[i]]], na.rm = TRUE)
  print(paste(col_lab[i], output_mean[i]))
}
## [1] "bmi_s1 28.1618949055459"
## [1] "age_s1 63.1340454858718"
## [1] "time_bed 435.320899379738"
## [1] "rdi4p 8.6555171205491"
## [1] "waist 97.0293715393134"
# Plot bivariate scatter plots
# Choose all possible pairs
# Mark the mean in the plots
pairs = combn(col_lab,2)
for (i in 1:dim(pairs)[2]) {
  current_pair = pairs[, i]
  p1 = ggplot(shhs, aes(x = shhs[[current_pair[1]]], y = shhs[[current_pair[2]]])) +
    geom_point() +
    geom_point(aes(x = mean(shhs[[current_pair[1]]], na.rm = TRUE),
                   y = mean(shhs[[current_pair[2]]], na.rm = TRUE)), colour = 'red') +
    xlab(current_pair[1]) + ylab(current_pair[2])
  print(p1)
}
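For a quick overview of all pairwise relationships, base R's scatterplot-matrix function offers a compact alternative to the loop above (graphics:: is spelled out because the name pairs is reused above for the combn matrix):
# Scatterplot matrix of all five variables in a single call
graphics::pairs(shhs[, col_lab])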
Chapter 4: Problem 11
QQ-Plots
for (i in 1:length(col_lab)) {
  hist(shhs[[col_lab[i]]], main = col_lab[i])
  qqnorm(shhs[[col_lab[i]]], pch = 1, frame = FALSE, main = col_lab[i])
}
Explanation of plots: The histogram shows the distribution of the data. We can read the skewness from where the mean falls relative to the peak: to the right of the peak indicates positive (right) skew, on the peak indicates a roughly symmetric distribution, and to the left of the peak indicates negative (left) skew.
QQ-plots: The closer the points lie to the straight reference line, the more similar the distribution is to a normal distribution.
Distributions that might best fit each variable: waist and age_s1: normal distribution (the QQ-plot is more or less aligned with the normal reference line).
bmi_s1, rdi4p: exponential distribution (the QQ-plot shows a positive skew).
time_bed: a skewed binomial-like distribution.
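The interpretation above refers to a normal reference line, which qqnorm alone does not draw; a small sketch adding qqline to each plot makes that comparison explicit:
# Same QQ-plots with an explicit normal reference line
for (i in 1:length(col_lab)) {
  qqnorm(shhs[[col_lab[i]]], pch = 1, frame = FALSE, main = col_lab[i])
  qqline(shhs[[col_lab[i]]], col = "red")
}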
Problems 12 & 13: Bootstrap
Generate 3 bootstrap samples.
# Bootstrap
# Helper function to calculate the column median
cal_median = function(data) {
  median(data, na.rm = TRUE)
}
# Function to evaluate the bootstrap statistics (column means and medians)
cal_mean_med = function(data, indices) {
  dat = data[indices, ]
  dat_mean = colMeans(dat, na.rm = TRUE)
  dat_median = apply(dat, 2, cal_median)
  out = c(dat_mean, dat_median)
  return(out)
}
# Create 3 bootstrap samples
shhs.boot = boot(shhs, cal_mean_med,R=3)
# Index values of each of the bootstrap samples
boot_index= boot.array(shhs.boot , indices = TRUE)
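boot.array with indices = TRUE returns one row of resampled row indices per bootstrap sample, which is what the plots below rely on; a quick check of its shape:
# R rows (one per bootstrap sample) by n columns (one per observation in shhs)
dim(boot_index)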
Problem 12: Create qq-norm and qq-plot of bootstrap samples for rdi4p
QQ-norm Plots
# Plot histogram and QQ Plots of each of the bootstrap samples for the column: rdi4p
bootstrap_labels = c("Bootstrap_Sample 1", "Bootstrap_Sample 2", "Bootstrap_Sample 3")
for (i in 1:3) {
  hist(shhs[boot_index[i, ], ]$rdi4p, main = bootstrap_labels[i])
  qqnorm(shhs[boot_index[i, ], ]$rdi4p, pch = 1, frame = FALSE, main = bootstrap_labels[i])
}
Interpretation: All the histograms are right-skewed, and the QQ-plots do not follow the normal reference line. Thus the samples appear to come from a right-skewed distribution (such as an exponential distribution).
QQ-Plots
# Plot QQ-plot for one sample vs the other
# Sample 1 vs Sample 2
qqplot(shhs[boot_index[1,],]$rdi4p,shhs[boot_index[2,],]$rdi4p,pch = 1, frame = FALSE)
# Sample 1 vs Sample 3
qqplot(shhs[boot_index[1,],]$rdi4p,shhs[boot_index[3,],]$rdi4p,pch = 1, frame = FALSE)
# Sample 2 vs Sample 3
qqplot(shhs[boot_index[2,],]$rdi4p,shhs[boot_index[3,],]$rdi4p,pch = 1, frame = FALSE)
Interpretation: The lower quantiles of all samples agree more closely than the upper quantiles, which is consistent with the samples coming from a right-skewed distribution. Samples 1 and 2 appear more similar to each other than either is to sample 3.
Problem 13: Bootstrap Confidence Intervals
col_ind = which(colnames(shhs)=="rdi4p")
# The bootstrap statistic has 60 entries: the first 30 are the column means, the last 30 the column medians
# Mean of "rdi4p": index col_ind
# Median of "rdi4p": index col_ind + 30
confidence_interval_mean= boot.ci(shhs.boot, index = col_ind, conf = 0.95, type = 'norm')
print(confidence_interval_mean)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 3 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = shhs.boot, conf = 0.95, type = "norm", index = col_ind)
##
## Intervals :
## Level Normal
## 95% ( 8.458, 8.734 )
## Calculations and Intervals on Original Scale
confidence_interval_median= boot.ci(shhs.boot, index = (col_ind+30), conf = 0.95, type = 'norm')
print(confidence_interval_median)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 3 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = shhs.boot, conf = 0.95, type = "norm", index = (col_ind +
## 30))
##
## Intervals :
## Level Normal
## 95% ( 3.975, 4.192 )
## Calculations and Intervals on Original Scale
The 95% confidence intervals are (8.458, 8.734) for the mean and (3.975, 4.192) for the median. The intervals are not symmetric about the original sample estimates, consistent with the samples being drawn from a skewed distribution.
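With only 3 replicates the normal-approximation interval is very unstable. As an illustrative sketch only (the R = 1000 and the percentile interval type are choices not made in the original analysis), a larger run with percentile intervals would reflect the skewness of the bootstrap distribution directly:
# Illustrative only: more replicates and a percentile interval for the mean of rdi4p
shhs.boot.large = boot(shhs, cal_mean_med, R = 1000)
boot.ci(shhs.boot.large, index = col_ind, conf = 0.95, type = 'perc')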
Create a boxplot of rdi4p by gender.
ggplot(shhs, aes(y = rdi4p, x = gender, fill = gender, group = gender)) +
  geom_boxplot() + xlab("Gender") + ylab("rdi4p")
We test whether the mean rdi4p of one gender differs significantly from the mean of the other gender. Null hypothesis: there is no difference in means. Alternative hypothesis: there is a significant difference in means.
Test statistic: difference in means (two-sample t-test).
test_results = t.test(shhs$rdi4p~shhs$gender , var.equal=TRUE)
print(test_results)
##
## Two Sample t-test
##
## data: shhs$rdi4p by shhs$gender
## t = -16.438, df = 5802, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.877007 -4.624609
## sample estimates:
## mean in group 0 mean in group 1
## 6.154055 11.404863
print(paste("Pvalue is ", test_results$p.value))
## [1] "Pvalue is 2.20725983679335e-59"
The p-value is very small, so the null hypothesis is rejected: there is a significant difference in mean rdi4p between the two genders.
Print out all possible bootstrap samples (random samples drawn with replacement, where order matters); for a sample of size 3 there are 3^3 = 27 of them.
data_set = c(1.1, 3.4, 8.5)
data_out = expand.grid(data_set,data_set,data_set)
#All possible bootstrap samples
print(data_out)
## Var1 Var2 Var3
## 1 1.1 1.1 1.1
## 2 3.4 1.1 1.1
## 3 8.5 1.1 1.1
## 4 1.1 3.4 1.1
## 5 3.4 3.4 1.1
## 6 8.5 3.4 1.1
## 7 1.1 8.5 1.1
## 8 3.4 8.5 1.1
## 9 8.5 8.5 1.1
## 10 1.1 1.1 3.4
## 11 3.4 1.1 3.4
## 12 8.5 1.1 3.4
## 13 1.1 3.4 3.4
## 14 3.4 3.4 3.4
## 15 8.5 3.4 3.4
## 16 1.1 8.5 3.4
## 17 3.4 8.5 3.4
## 18 8.5 8.5 3.4
## 19 1.1 1.1 8.5
## 20 3.4 1.1 8.5
## 21 8.5 1.1 8.5
## 22 1.1 3.4 8.5
## 23 3.4 3.4 8.5
## 24 8.5 3.4 8.5
## 25 1.1 8.5 8.5
## 26 3.4 8.5 8.5
## 27 8.5 8.5 8.5
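Because every ordered sample is enumerated above, the exact bootstrap distribution of a statistic such as the sample mean can be read off directly; a small sketch:
# Mean of each of the 27 equally likely bootstrap samples
boot_means = rowMeans(data_out)
table(round(boot_means, 2))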
Coin flip: the coin is flipped 15 times, modelled as a draw from a binomial distribution with p = 0.5. Find the value of C (the cutoff number of heads) that gives a type I error of at most 5%.
out = round( rbind(0 : 15,
100 * pbinom((0 : 15) - 1, size = 15, prob = 0.5, lower.tail = FALSE)) )
# P(X>=C) calculated as a percentage
rownames(out) = c("C", "P(X>=C) %")
out
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## C 0 1 2 3 4 5 6 7 8 9 10 11
## P(X>=C) % 100 100 100 100 98 94 85 70 50 30 15 6
## [,13] [,14] [,15] [,16]
## C 12 13 14 15
## P(X>=C) % 2 0 0 0
# Value of C for a type 1 error of 5% (row 1 holds C; which() only gives the column position)
C = out[1, min(which(out[2, ] <= 5))]
print(paste('The required value of C is', C))
## [1] "The required value of C is 12"
p-value for 10 heads
# One-sided p-value: probability of 10 or more heads under the null
pvalue = pbinom(9, size = 15, prob = 0.5, lower.tail = FALSE)
print(paste("pvalue", pvalue))
## [1] "pvalue 0.15087890625"
The p-value is > 0.05 (5% level), so we fail to reject the null hypothesis at the 5% error level.
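The same one-sided p-value can be obtained with binom.test, assuming the alternative of interest is "more heads than expected":
# Exact one-sided binomial test of 10 heads in 15 flips
binom.test(10, 15, p = 0.5, alternative = "greater")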
Compute the weight gain/loss for all valid data samples. Create a data frame with the chick index, weight gain, and diet.
Prepare dataset
chick_data = ChickWeight
chick_gain_id = chick_data$Chick[which(chick_data$Time == 21)]
wt_gain = list()
dt_gain = list()
for (i in 1:length(chick_gain_id)) {
  tmp_data = chick_data[which(chick_data$Chick == chick_gain_id[i]), ]
  weights = tmp_data$weight
  wt_gain = append(wt_gain, (weights[length(weights)] - weights[1]))
  dt_gain = append(dt_gain, tmp_data$Diet[1])
}
tmp_df = as.data.frame(cbind(chick_gain_id, wt_gain, dt_gain))
chick_gain_df = as.data.frame(lapply(tmp_df, unlist))
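An equivalent loop-free sketch using base R subsetting and merge (assuming the standard ChickWeight columns Chick, Time, weight, and Diet) should give the same table for chicks measured at both Time 0 and Time 21:
# Weight gain = final (Time 21) weight minus starting (Time 0) weight
start = subset(ChickWeight, Time == 0)[, c("Chick", "weight", "Diet")]
final = subset(ChickWeight, Time == 21)[, c("Chick", "weight")]
gain  = merge(start, final, by = "Chick", suffixes = c("_start", "_end"))
gain$wt_gain = gain$weight_end - gain$weight_start
head(gain)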
Question 7. Permutation test: pairwise comparisons. Draw random samples of pairs of data, compute the difference in means, and check whether the difference is significant across all samples.
These steps are carried out here with the pairwise.t.test function in R.
pairwise.t.test(chick_gain_df$wt_gain, chick_gain_df$dt_gain, p.adjust.method="none",
paired=FALSE, pool.sd=FALSE)
##
## Pairwise comparisons using t tests with non-pooled SD
##
## data: chick_gain_df$wt_gain and chick_gain_df$dt_gain
##
## 1 2 3
## 2 0.2115 - -
## 3 0.0030 0.1162 -
## 4 0.0075 0.4271 0.2537
##
## P value adjustment method: none
P-values:
Diet 1 vs Diet 2: 0.2115
Diet 1 vs Diet 3: 0.0030
Diet 1 vs Diet 4: 0.0075
Diet 2 vs Diet 3: 0.1162
Diet 2 vs Diet 4: 0.4271
Diet 3 vs Diet 4: 0.2537
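For comparison with the permutation framing described above, a minimal label-shuffling sketch for one pair (Diet 1 vs Diet 2), using chick_gain_df as built earlier; the seed and the 10,000 permutations are arbitrary illustrative choices:
# Permutation test of the difference in mean weight gain, Diet 1 vs Diet 2
set.seed(1)
x = chick_gain_df$wt_gain[chick_gain_df$dt_gain == 1]
y = chick_gain_df$wt_gain[chick_gain_df$dt_gain == 2]
obs_diff = mean(x) - mean(y)
pooled = c(x, y)
perm_diffs = replicate(10000, {
  idx = sample(length(pooled), length(x))  # reshuffle the group labels
  mean(pooled[idx]) - mean(pooled[-idx])
})
mean(abs(perm_diffs) >= abs(obs_diff))     # two-sided permutation p-value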
Question 8: Bootstrap
# Function to compute the difference in medians between diet 2 and diet 3
cal_med_diff = function(data, indices) {
  dat = data[indices, ]
  med2 = median(dat$wt_gain[dat$dt_gain == 2])
  med3 = median(dat$wt_gain[dat$dt_gain == 3])
  med2 - med3
}
# Create 1000 bootstrap samples
chick_boot = boot(chick_gain_df, cal_med_diff, R=1000)
print(chick_boot)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = chick_gain_df, statistic = cal_med_diff, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -66 3.4345 46.03769
# Compute the 95% confidence intervals
confidence_interval_H = boot.ci(chick_boot, index = 1, conf = 0.95, type = 'bca')
print(confidence_interval_H)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = chick_boot, conf = 0.95, type = "bca", index = 1)
##
## Intervals :
## Level BCa
## 95% (-143.71, 28.00 )
## Calculations and Intervals on Original Scale
# Plot the bootstrap distribution
plot(chick_boot)
The bootstrap standard error is 46.03769.
Project Development Timeline:
Tasks to be completed:
1. Data pre-processing to remove noise and potential outliers: April 10 - April 13
2. Training model: develop a predictive model that uses the best set of features. Two predictive models will be developed:
GLM: April 14 - April 20
Random Forest: April 21 - April 26
3. Testing model performance to decide the better model (GLM): April 26 - April 29
4. Running the predictive model in the Shiny app: April 26 - April 29
5. Developing the Shiny app GUI to take user input and show output: April 21 - April 29
6. Testing the app for different inputs: April 29 - May 4
Dataset
The data set to be used [http://archive.ics.uci.edu/ml/index.php] comprises 769 subjects (both diabetic and non-diabetic) with 9 features per subject. The data set is open access and can be obtained from [https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/diabetes.csv]
Mock Interface