# Vector: numeric values combined with c()
vec1 <- c(0, 3, 2, 1, 3, 5)
vec1

# Vector: character values
vec2 <- c("A", "B", "C")
vec2

# Matrix: the six values are arranged in two columns (filled column by column)
mat <- matrix(c(0, 3, 2, 1, 3, 5), ncol = 2)
mat

# Fix the random seed so that the rnorm() draw below is reproducible
set.seed(123)

# Data Frame: one numeric column and one categorical column.
# gender is built with factor() explicitly: since R 4.0.0 data.frame() no
# longer converts character vectors to factors, so without it the
# "factor" example further down would report "character".
df <- data.frame(
  age = rnorm(10, mean = 30, sd = 5),
  gender = factor(rep(c("M", "F"), times = 5))
)
df

# List: a container whose named elements may have different types.
# The object is called `ls` like the base function ls(); calls to ls() still
# work because R skips non-function objects when it looks up a function.
ls <- list(vec1 = vec1, vec2 = vec2, mat = mat, df = df)
ls

# numerical
vec1
class(vec1)

# character
vec2
class(vec2)

# factor
df$gender
class(df$gender)
# load a data frame named iris
data(iris)

# file the data is exported to; when the Desktop folder does not exist on
# this machine, fall back to a file in the current working directory
csv_path <- '~/Desktop/irisdata.csv'
if (!dir.exists(dirname(csv_path))) {
  csv_path <- basename(csv_path)
}

# write iris data frame to a csv file
# row.names = FALSE keeps the row numbers out of the file; otherwise they
# come back as an extra column named "X" when the file is read again
write.csv(iris, file = csv_path, row.names = FALSE)

# load the same iris data from the exported csv file
# stringsAsFactors = TRUE restores Species as a factor, as in the original
ir <- read.csv(csv_path, stringsAsFactors = TRUE)

# check the object class
class(ir)
## Loading a data set shipped with a package

# attach the MASS library (functions and datasets)
library(MASS)

# load the data set called Pima.tr
data(Pima.tr)

# number of rows and number of columns of the data frame
dim(Pima.tr)

# first 6 rows of the dataset
head(Pima.tr, n = 6)

# help page with more information about the dataset
help("Pima.tr")

# a shorter name for the dataset, used from here on
p <- Pima.tr

# names of the columns in the data set
names(p)

# class of the object
class(p)

# basic summary of every column
summary(p)
# Measures of location for bmi

# mean
with(p, mean(bmi))

# median
with(p, median(bmi))

# mode: the most frequent value (the first one if several values tie)
bmi_counts <- table(p$bmi)
bmi_counts[which.max(bmi_counts)]

# Measures of spread for bmi

# variance
with(p, var(bmi))

# standard deviation
with(p, sd(bmi))

# interquartile range
with(p, IQR(bmi))

# 5th percentile
quantile(p$bmi, probs = 0.05)

# 95th percentile
quantile(p$bmi, probs = 0.95)
# A frequency histogram (bar heights are counts)
hist(p$skin)

# A density histogram (bar areas add up to 1)
# TRUE is spelled out (T is an ordinary variable that can be reassigned) and
# the argument is given by its full name instead of the abbreviation `prob`
hist(p$skin, probability = TRUE)

# Adding a theoretical normal curve that uses the sample mean and standard
# deviation, drawn from one sd below the minimum to one sd above the maximum
m <- mean(p$skin)
s <- sd(p$skin)
lower <- min(p$skin) - s
upper <- max(p$skin) + s
curve(
  dnorm(x, mean = m, sd = s),
  from = lower, to = upper,
  add = TRUE, col = "red"
)

# draw a sample of n=10 individuals from this normal population
rnorm(10, mean = m, sd = s)

# evaluating the probability of skin thickness less than 10
pnorm(10, mean = m, sd = s)

# evaluating the 90th percentile
qnorm(0.9, mean = m, sd = s)

# evaluating the density of the normal distribution at 10
dnorm(10, mean = m, sd = s)
# Checking whether skin thickness looks normally distributed.
# load the library nortest (used below for ad.test, cvm.test, pearson.test
# and sf.test)
library(nortest)

# q-q plot: a graphic that compares the theoretical normal quantiles to the
# observed quantiles; scale() standardizes the values first
qqnorm(scale(p$skin))

# applying five different tests of normality to the same variable

# Anderson-Darling
ad.test(p$skin)

# Cramer-von Mises
cvm.test(p$skin)

# Pearson chi-square
pearson.test(p$skin)

# Shapiro-Francia
sf.test(p$skin)

# Shapiro-Wilk
shapiro.test(p$skin)
# Absolute frequency of diabetic and non-diabetic
type_counts <- table(p$type)
type_counts

# Relative frequency (proportions adding up to 1)
type_shares <- prop.table(type_counts)
type_shares

# Bar plot of the absolute frequencies
barplot(type_counts)

# Bar plot of the relative frequencies
barplot(type_shares)

# value in the 6th row and 4th column
p[6, 4]

# age and bmi for women older than 60
rows_over_60 <- which(p$age > 60)
p[rows_over_60, c("age", "bmi")]

# age, glucose and blood pressure for diabetic women older than 60
diab.aged <- subset(
  p,
  subset = type == "Yes" & age > 60,
  select = c("age", "glu", "bp")
)
diab.aged
# Grouping bmi into weight classes.
# The cut points 25, 30 and 35 are the WHO limits for overweight, obese
# class I and obese class II, so the labels follow those classes (the
# previous labels were shifted by one class, e.g. bmi below 25 was called
# "underweight"). right = FALSE makes the intervals [25, 30), [30, 35), ...
# so that a bmi of exactly 30 counts as obese, as in the WHO definition.
p$bmi.range <- cut(
  p$bmi,
  breaks = c(-Inf, 25, 30, 35, Inf),
  labels = c(
    "normal or underweight",
    "overweight",
    "obese class I",
    "obese class II+"
  ),
  right = FALSE
)
table(p$bmi.range)
# creating the vector of row indices for the women in the Pima dataset
# seq_len() is used instead of 1:n because 1:n counts down (1, 0) when n is 0
indexes <- seq_len(nrow(p))

# sampling 15 women without replacement
sample.indexes <- sample(indexes, 15, replace = FALSE)
sample.indexes

# Displaying information on the 15 sampled women
p[sample.indexes, ]


# categorized boxplot: skin thickness by diabetic condition (Yes/No)
boxplot(p$skin ~ p$type)

# t-test assuming different variances in the two groups (the default)
t.test(p$skin ~ p$type)

# t-test assuming equal variances
t.test(p$skin ~ p$type, var.equal = TRUE)

# the same comparison written as a linear regression of skin on type
# (assignment uses <-, the conventional R assignment operator)
reg <- lm(skin ~ type, data = p)
summary(reg)
#purl("Rtutorial01.Rmd", output = "Rtutorial01.R", documentation = 1)
# cross-tabulation bmi range vs. diabetic status
tab <- table(p$bmi.range, p$type)
tab

# frequency relative to rows (each row adds up to 1)
prop.table(tab, margin = 1)

# frequency relative to columns (each column adds up to 1)
prop.table(tab, margin = 2)

# Performing the chi-square test and storing the result in an object
Q <- chisq.test(tab)
Q

# The components are accessed by their full names; the abbreviations
# Q$obs and Q$exp only work through partial matching of `$`

# Observed Frequencies
Q$observed

# Expected Frequencies
Q$expected

# Residuals
Q$residuals
# Linear regression with glucose as the dependent variable and age and bmi
# as the independent variables
fit <- lm(formula = glu ~ age + bmi, data = p)

# coefficients, standard errors and fit statistics of the regression
summary(fit)