library(ggplot2)
library(boot)
library(gtools)
##
## Attaching package: 'gtools'
## The following objects are masked from 'package:boot':
##
## inv.logit, logit
setwd("/Users/archanabalan/Documents/Spring Semester/DS")
Chapter 4: Problem 10
Create bivariate plots
# load shhs data
shhs = read.delim("shhs.txt")
# Vector of labels to create the plots
col_lab = c('bmi_s1','age_s1','time_bed','rdi4p','waist')
# Output all the means
output_mean = list()
for (i in 1:length(col_lab)) {
  output_mean[i] = mean(shhs[[col_lab[i]]], na.rm = TRUE)
  print(paste(col_lab[i], output_mean[i]))
}
## [1] "bmi_s1 28.1618949055459"
## [1] "age_s1 63.1340454858718"
## [1] "time_bed 435.320899379738"
## [1] "rdi4p 8.6555171205491"
## [1] "waist 97.0293715393134"
# Plot bivariate scatter plots
# Choose all possible pairs
# Mark the mean in the plots
pairs = combn(col_lab,2)
for (i in 1:dim(pairs)[2]) {
  current_pair = pairs[, i]
  p1 = ggplot(shhs, aes(x = shhs[[current_pair[1]]], y = shhs[[current_pair[2]]])) +
    geom_point() +
    geom_point(aes(x = mean(shhs[[current_pair[1]]], na.rm = TRUE),
                   y = mean(shhs[[current_pair[2]]], na.rm = TRUE)), colour = 'red') +
    xlab(current_pair[1]) + ylab(current_pair[2])
  print(p1)
}
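For a quick overview of all pairwise relationships, base R's scatterplot-matrix function offers a compact alternative to the loop above (graphics:: is spelled out because the name pairs is reused above for the combn matrix):
# Scatterplot matrix of all five variables in a single call
graphics::pairs(shhs[, col_lab])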
Chapter 4: Problem 11
QQ-Plots
for (i in 1:length(col_lab)) {
  hist(shhs[[col_lab[i]]], main = col_lab[i])
  qqnorm(shhs[[col_lab[i]]], pch = 1, frame = FALSE, main = col_lab[i])
}
Explanation of plots: The histogram shows the distribution of the data. We can read the skewness from where the mean falls relative to the peak: to the right of the peak indicates positive (right) skew, on the peak indicates a roughly symmetric distribution, and to the left of the peak indicates negative (left) skew.
QQ-plots: The closer the points lie to the straight reference line, the more similar the distribution is to a normal distribution.
Distributions that might best fit each variable: waist and age_s1: normal distribution (the QQ-plot is more or less aligned with the normal reference line).
bmi_s1, rdi4p: exponential distribution (the QQ-plot shows a positive skew).
time_bed: a skewed binomial-like distribution.
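The interpretation above refers to a normal reference line, which qqnorm alone does not draw; a small sketch adding qqline to each plot makes that comparison explicit:
# Same QQ-plots with an explicit normal reference line
for (i in 1:length(col_lab)) {
  qqnorm(shhs[[col_lab[i]]], pch = 1, frame = FALSE, main = col_lab[i])
  qqline(shhs[[col_lab[i]]], col = "red")
}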
Problems 12 & 13: Bootstrap
Generate 3 bootstrap samples.
# Bootstrap
# Helper function to calculate the column median
cal_median = function(data) {
  median(data, na.rm = TRUE)
}
# Function to evaluate the bootstrap statistics (column means and medians)
cal_mean_med = function(data, indices) {
  dat = data[indices, ]
  dat_mean = colMeans(dat, na.rm = TRUE)
  dat_median = apply(dat, 2, cal_median)
  out = c(dat_mean, dat_median)
  return(out)
}
# Create 3 bootstrap samples
shhs.boot = boot(shhs, cal_mean_med,R=3)
# Index values of each of the bootstrap samples
boot_index= boot.array(shhs.boot , indices = TRUE)
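boot.array with indices = TRUE returns one row of resampled row indices per bootstrap sample, which is what the plots below rely on; a quick check of its shape:
# R rows (one per bootstrap sample) by n columns (one per observation in shhs)
dim(boot_index)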
Problem 12: Create qq-norm and qq-plot of bootstrap samples for rdi4p
QQ-norm Plots
# Plot histogram and QQ Plots of each of the bootstrap samples for the column: rdi4p
bootstrap_labels = c("Bootstrap_Sample 1", "Bootstrap_Sample 2", "Bootstrap_Sample 3")
for (i in 1:3) {
  hist(shhs[boot_index[i, ], ]$rdi4p, main = bootstrap_labels[i])
  qqnorm(shhs[boot_index[i, ], ]$rdi4p, pch = 1, frame = FALSE, main = bootstrap_labels[i])
}
Interpretation: All the histograms are right-skewed, and the QQ-plots do not follow the normal reference line. Thus the samples appear to come from a right-skewed distribution (such as an exponential distribution).
QQ-Plots
# Plot QQ-plot for one sample vs the other
# Sample 1 vs Sample 2
qqplot(shhs[boot_index[1,],]$rdi4p,shhs[boot_index[2,],]$rdi4p,pch = 1, frame = FALSE)
# Sample 1 vs Sample 3
qqplot(shhs[boot_index[1,],]$rdi4p,shhs[boot_index[3,],]$rdi4p,pch = 1, frame = FALSE)
# Sample 2 vs Sample 3
qqplot(shhs[boot_index[2,],]$rdi4p,shhs[boot_index[3,],]$rdi4p,pch = 1, frame = FALSE)
Interpretation: The lower quantiles of all samples agree more closely than the upper quantiles, which is consistent with the samples coming from a right-skewed distribution. Samples 1 and 2 appear more similar to each other than either is to sample 3.
Problem 13: Bootstrap Confidence Intervals
col_ind = which(colnames(shhs)=="rdi4p")
# The bootstrap statistic has 60 entries: the first 30 are the column means, the last 30 the column medians
# Mean of "rdi4p": index col_ind
# Median of "rdi4p": index col_ind + 30
confidence_interval_mean= boot.ci(shhs.boot, index = col_ind, conf = 0.95, type = 'norm')
print(confidence_interval_mean)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 3 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = shhs.boot, conf = 0.95, type = "norm", index = col_ind)
##
## Intervals :
## Level Normal
## 95% ( 8.458, 8.734 )
## Calculations and Intervals on Original Scale
confidence_interval_median= boot.ci(shhs.boot, index = (col_ind+30), conf = 0.95, type = 'norm')
print(confidence_interval_median)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 3 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = shhs.boot, conf = 0.95, type = "norm", index = (col_ind +
## 30))
##
## Intervals :
## Level Normal
## 95% ( 3.975, 4.192 )
## Calculations and Intervals on Original Scale
The 95% confidence intervals are (8.458, 8.734) for the mean and (3.975, 4.192) for the median. The intervals are not symmetric about the original sample estimates, consistent with the samples being drawn from a skewed distribution.
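With only 3 replicates the normal-approximation interval is very unstable. As an illustrative sketch only (the R = 1000 and the percentile interval type are choices not made in the original analysis), a larger run with percentile intervals would reflect the skewness of the bootstrap distribution directly:
# Illustrative only: more replicates and a percentile interval for the mean of rdi4p
shhs.boot.large = boot(shhs, cal_mean_med, R = 1000)
boot.ci(shhs.boot.large, index = col_ind, conf = 0.95, type = 'perc')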
Create a boxplot of rdi4p by gender.
ggplot(shhs, aes(y = rdi4p, x = gender, fill = gender, group = gender)) +
  geom_boxplot() + xlab("Gender") + ylab("rdi4p")
We test whether the mean rdi4p of one gender differs significantly from the mean of the other gender. Null hypothesis: there is no difference in means. Alternative hypothesis: there is a significant difference in means.
Test statistic: difference in means (two-sample t-test).
test_results = t.test(shhs$rdi4p~shhs$gender , var.equal=TRUE)
print(test_results)
##
## Two Sample t-test
##
## data: shhs$rdi4p by shhs$gender
## t = -16.438, df = 5802, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.877007 -4.624609
## sample estimates:
## mean in group 0 mean in group 1
## 6.154055 11.404863
print(paste("Pvalue is ", test_results$p.value))
## [1] "Pvalue is 2.20725983679335e-59"
The p-value is very small, so the null hypothesis is rejected: there is a significant difference in mean rdi4p between the two genders.
Print out all possible bootstrap samples (random samples drawn with replacement, where order matters); for a sample of size 3 there are 3^3 = 27 of them.
data_set = c(1.1, 3.4, 8.5)
data_out = expand.grid(data_set,data_set,data_set)
#All possible bootstrap samples
print(data_out)
## Var1 Var2 Var3
## 1 1.1 1.1 1.1
## 2 3.4 1.1 1.1
## 3 8.5 1.1 1.1
## 4 1.1 3.4 1.1
## 5 3.4 3.4 1.1
## 6 8.5 3.4 1.1
## 7 1.1 8.5 1.1
## 8 3.4 8.5 1.1
## 9 8.5 8.5 1.1
## 10 1.1 1.1 3.4
## 11 3.4 1.1 3.4
## 12 8.5 1.1 3.4
## 13 1.1 3.4 3.4
## 14 3.4 3.4 3.4
## 15 8.5 3.4 3.4
## 16 1.1 8.5 3.4
## 17 3.4 8.5 3.4
## 18 8.5 8.5 3.4
## 19 1.1 1.1 8.5
## 20 3.4 1.1 8.5
## 21 8.5 1.1 8.5
## 22 1.1 3.4 8.5
## 23 3.4 3.4 8.5
## 24 8.5 3.4 8.5
## 25 1.1 8.5 8.5
## 26 3.4 8.5 8.5
## 27 8.5 8.5 8.5
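Because every ordered sample is enumerated above, the exact bootstrap distribution of a statistic such as the sample mean can be read off directly; a small sketch:
# Mean of each of the 27 equally likely bootstrap samples
boot_means = rowMeans(data_out)
table(round(boot_means, 2))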
Coin flip: the coin is flipped 15 times, modelled as a draw from a binomial distribution with p = 0.5. Find the value of C (the cutoff number of heads) that gives a type I error of at most 5%.
out = round( rbind(0 : 15,
100 * pbinom((0 : 15) - 1, size = 15, prob = 0.5, lower.tail = FALSE)) )
# P(X>=C) calculated as a percentage
rownames(out) = c("C", "P(X>=C) %")
out
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## C 0 1 2 3 4 5 6 7 8 9 10 11
## P(X>=C) % 100 100 100 100 98 94 85 70 50 30 15 6
## [,13] [,14] [,15] [,16]
## C 12 13 14 15
## P(X>=C) % 2 0 0 0
# Value of C for a type 1 error of 5% (row 1 holds C; which() only gives the column position)
C = out[1, min(which(out[2, ] <= 5))]
print(paste('The required value of C is', C))
## [1] "The required value of C is 12"
p-value for 10 heads
# One-sided p-value: probability of 10 or more heads under the null
pvalue = pbinom(9, size = 15, prob = 0.5, lower.tail = FALSE)
print(paste("pvalue", pvalue))
## [1] "pvalue 0.15087890625"
The p-value is > 0.05 (5% level), so we fail to reject the null hypothesis at the 5% error level.
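The same one-sided p-value can be obtained with binom.test, assuming the alternative of interest is "more heads than expected":
# Exact one-sided binomial test of 10 heads in 15 flips
binom.test(10, 15, p = 0.5, alternative = "greater")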
Compute the weight gain/loss for all valid data samples. Create a data frame with the chick index, weight gain, and diet.
Prepare dataset
chick_data = ChickWeight
chick_gain_id = chick_data$Chick[which(chick_data$Time == 21)]
wt_gain = list()
dt_gain = list()
for (i in 1:length(chick_gain_id)) {
  tmp_data = chick_data[which(chick_data$Chick == chick_gain_id[i]), ]
  weights = tmp_data$weight
  wt_gain = append(wt_gain, (weights[length(weights)] - weights[1]))
  dt_gain = append(dt_gain, tmp_data$Diet[1])
}
tmp_df = as.data.frame(cbind(chick_gain_id, wt_gain, dt_gain))
chick_gain_df = as.data.frame(lapply(tmp_df, unlist))
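An equivalent loop-free sketch using base R subsetting and merge (assuming the standard ChickWeight columns Chick, Time, weight, and Diet) should give the same table for chicks measured at both Time 0 and Time 21:
# Weight gain = final (Time 21) weight minus starting (Time 0) weight
start = subset(ChickWeight, Time == 0)[, c("Chick", "weight", "Diet")]
final = subset(ChickWeight, Time == 21)[, c("Chick", "weight")]
gain  = merge(start, final, by = "Chick", suffixes = c("_start", "_end"))
gain$wt_gain = gain$weight_end - gain$weight_start
head(gain)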
Question 7. Permutation test: pairwise comparisons. Draw random samples of pairs of data, compute the difference in means, and check whether the difference is significant across all samples.
These steps are carried out here with the pairwise.t.test function in R.
pairwise.t.test(chick_gain_df$wt_gain, chick_gain_df$dt_gain, p.adjust.method="none",
paired=FALSE, pool.sd=FALSE)
##
## Pairwise comparisons using t tests with non-pooled SD
##
## data: chick_gain_df$wt_gain and chick_gain_df$dt_gain
##
## 1 2 3
## 2 0.2115 - -
## 3 0.0030 0.1162 -
## 4 0.0075 0.4271 0.2537
##
## P value adjustment method: none
P-values:
Diet 1 vs Diet 2: 0.2115
Diet 1 vs Diet 3: 0.0030
Diet 1 vs Diet 4: 0.0075
Diet 2 vs Diet 3: 0.1162
Diet 2 vs Diet 4: 0.4271
Diet 3 vs Diet 4: 0.2537
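For comparison with the permutation framing described above, a minimal label-shuffling sketch for one pair (Diet 1 vs Diet 2), using chick_gain_df as built earlier; the seed and the 10,000 permutations are arbitrary illustrative choices:
# Permutation test of the difference in mean weight gain, Diet 1 vs Diet 2
set.seed(1)
x = chick_gain_df$wt_gain[chick_gain_df$dt_gain == 1]
y = chick_gain_df$wt_gain[chick_gain_df$dt_gain == 2]
obs_diff = mean(x) - mean(y)
pooled = c(x, y)
perm_diffs = replicate(10000, {
  idx = sample(length(pooled), length(x))  # reshuffle the group labels
  mean(pooled[idx]) - mean(pooled[-idx])
})
mean(abs(perm_diffs) >= abs(obs_diff))     # two-sided permutation p-value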
Question 8: Bootstrap
# Function to compute the difference in medians between diet 2 and diet 3
cal_med_diff = function(data, indices) {
  dat = data[indices, ]
  med2 = median(dat$wt_gain[dat$dt_gain == 2])
  med3 = median(dat$wt_gain[dat$dt_gain == 3])
  med2 - med3
}
# Create 1000 bootstrap samples
chick_boot = boot(chick_gain_df, cal_med_diff, R=1000)
print(chick_boot)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = chick_gain_df, statistic = cal_med_diff, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -66 3.4345 46.03769
# Compute the 95% confidence intervals
confidence_interval_H = boot.ci(chick_boot, index = 1, conf = 0.95, type = 'bca')
print(confidence_interval_H)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = chick_boot, conf = 0.95, type = "bca", index = 1)
##
## Intervals :
## Level BCa
## 95% (-143.71, 28.00 )
## Calculations and Intervals on Original Scale
# Plot the bootstrap distribution
plot(chick_boot)
The bootstrap standard error is 46.03769.
Project Development Timeline:
Tasks to be completed:
1. Data pre-processing to remove noise and potential outliers: April 10 - April 13
2. Training model: develop a predictive model that uses the best set of features. Two predictive models will be developed:
GLM: April 14 - April 20
Random Forest: April 21 - April 26
3. Testing model performance to decide the better model (GLM): April 26 - April 29
4. Running the predictive model in the Shiny app: April 26 - April 29
5. Developing the Shiny app GUI to take user input and show output: April 21 - April 29
6. Testing the app for different inputs: April 29 - May 4
Dataset
The data set to be used [http://archive.ics.uci.edu/ml/index.php] comprises 769 subjects (both diabetic and non-diabetic) with 9 features per subject. The data set is open access and can be obtained from [https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/diabetes.csv]
Mock Interface