R can be downloaded from http://cran-r.project.org and RStudio can be downloaded from http://www.rstudio.org/
R commands can be directly typed in R console but RStudio as a code editor will keep a record of all operations in R. The operations in R are highly dependent on the type of object is being manipulated.
Here we list 4 types of objects in an increasing order of complexity.
# Vector
vec1<-c(0,3,2,1,3,5)
vec1
## [1] 0 3 2 1 3 5
vec2<-c('A','B','C')
vec2
## [1] "A" "B" "C"
# Matrix
mat<-matrix(c(0,3,2,1,3,5),ncol=2)
mat
## [,1] [,2]
## [1,] 0 1
## [2,] 3 3
## [3,] 2 5
set.seed(123)# this command controls the random generation
# Data Frame
df<-data.frame(age=rnorm(10,30,5),gender=rep(c('M','F'),5))
df
## age gender
## 1 27.19762 M
## 2 28.84911 F
## 3 37.79354 M
## 4 30.35254 F
## 5 30.64644 M
## 6 38.57532 F
## 7 32.30458 M
## 8 23.67469 F
## 9 26.56574 M
## 10 27.77169 F
# List
ls<-list(vec1 = vec1, vec2 = vec2 ,mat = mat,df = df)
ls
## $vec1
## [1] 0 3 2 1 3 5
##
## $vec2
## [1] "A" "B" "C"
##
## $mat
## [,1] [,2]
## [1,] 0 1
## [2,] 3 3
## [3,] 2 5
##
## $df
## age gender
## 1 27.19762 M
## 2 28.84911 F
## 3 37.79354 M
## 4 30.35254 F
## 5 30.64644 M
## 6 38.57532 F
## 7 32.30458 M
## 8 23.67469 F
## 9 26.56574 M
## 10 27.77169 F
# numerical
vec1
## [1] 0 3 2 1 3 5
class(vec1)
## [1] "numeric"
# character
vec2
## [1] "A" "B" "C"
class(vec2)
## [1] "character"
# factor
df$gender
## [1] M F M F M F M F M F
## Levels: F M
class(df$gender)
## [1] "factor"
First let’s export data from R to csv file
# load a data frame named iris
data(iris)
# write iris data frame to a csv file
write.csv(iris,file='~/Desktop/irisdata.csv')
# if it fails because of the path do simply:
# write.csv(iris,file='irisdata.csv')
# load the same iris data from the exported csv file
ir<-read.csv('~/Desktop/irisdata.csv')
# check the object class
class(ir)
## [1] "data.frame"
In R there is a list of datasets that can be used for learning purposes. To see the list of datasets, just type:
# loading a library with functions and datasets
library(MASS)
# loading a data set called Pima.tr
data(Pima.tr)
# dimensions of the data frame (number of rows / number of columns)
dim(Pima.tr)
## [1] 200 8
# displaying the first 6 rows of the dataset
head(Pima.tr)
## npreg glu bp skin bmi ped age type
## 1 5 86 68 28 30.2 0.364 24 No
## 2 7 195 70 33 25.1 0.163 55 Yes
## 3 5 77 82 41 35.8 0.156 35 No
## 4 0 165 76 43 47.9 0.259 26 No
## 5 0 107 60 25 26.4 0.133 23 No
## 6 5 97 76 27 35.6 0.378 52 Yes
# displaying more information about the dataset
?Pima.tr
# naming the dataset in a simple way
p<-Pima.tr
# displaying the names of the objects in the data set
names(p)
## [1] "npreg" "glu" "bp" "skin" "bmi" "ped" "age" "type"
# showing the class of the object
class(p)
## [1] "data.frame"
# basic summary of the data
summary(p)
## npreg glu bp skin
## Min. : 0.00 Min. : 56.0 Min. : 38.00 Min. : 7.00
## 1st Qu.: 1.00 1st Qu.:100.0 1st Qu.: 64.00 1st Qu.:20.75
## Median : 2.00 Median :120.5 Median : 70.00 Median :29.00
## Mean : 3.57 Mean :124.0 Mean : 71.26 Mean :29.21
## 3rd Qu.: 6.00 3rd Qu.:144.0 3rd Qu.: 78.00 3rd Qu.:36.00
## Max. :14.00 Max. :199.0 Max. :110.00 Max. :99.00
## bmi ped age type
## Min. :18.20 Min. :0.0850 Min. :21.00 No :132
## 1st Qu.:27.57 1st Qu.:0.2535 1st Qu.:23.00 Yes: 68
## Median :32.80 Median :0.3725 Median :28.00
## Mean :32.31 Mean :0.4608 Mean :32.11
## 3rd Qu.:36.50 3rd Qu.:0.6160 3rd Qu.:39.25
## Max. :47.90 Max. :2.2880 Max. :63.00
# mean
mean(p$bmi)
## [1] 32.31
# median
median(p$bmi)
## [1] 32.8
# mode
table(p$bmi)[which.max(table(p$bmi))]
## 32
## 6
# variance
var(p$bmi)
## [1] 37.5795
# standard deviation
sd(p$bmi)
## [1] 6.130212
# IQR
IQR(p$bmi)
## [1] 8.925
# 5th percentile
quantile(p$bmi,0.05)
## 5%
## 22.485
# 95th percentile
quantile(p$bmi,0.95)
## 95%
## 42.615
# A frequency histogram
hist(p$skin)
# A density histogram
hist(p$skin,prob=T)
# Adding a theoretical normal curve
m<-mean(p$skin)
s<-sd(p$skin)
lower<-min(p$skin)-s
upper<-max(p$skin)+s
curve(dnorm(x,m,s),lower,upper,add=T,col='red')
Assuming that skin thickness is a normally distributed random variable with mean and standard deviation as observed in the sample:
# draw a sample of n=10 individuals from this normal population
rnorm(10,m,s)
## [1] 43.566862 33.433671 33.913883 30.512710 22.697988 50.165831 35.052095
## [8] 6.157212 37.438113 23.671713
# evaluating the probability of skin thickness less than 10
pnorm(10,m,s)
## [1] 0.05062093
# evaluating the 90th percentile
qnorm(0.9,m,s)
## [1] 44.24067
# evaluating the density of the normal distribution at 10
dnorm(10,m,s)
## [1] 0.008883473
Is the skin thickness normaly distributed ?
# load the library nortest
library(nortest)
# q-q plot : a graphic that compares the theoretical quantiles to the observed quantiles
qqnorm(scale(p$skin))
# applying 4 different tests of normality
# anderson-darling
ad.test(p$skin)
##
## Anderson-Darling normality test
##
## data: p$skin
## A = 0.80136, p-value = 0.03738
# cramer-von-mises
cvm.test(p$skin)
##
## Cramer-von Mises normality test
##
## data: p$skin
## W = 0.087268, p-value = 0.1658
# pearson chi-square
pearson.test(p$skin)
##
## Pearson chi-square normality test
##
## data: p$skin
## P = 29.67, p-value = 0.008471
# shapiro-francia
sf.test(p$skin)
##
## Shapiro-Francia normality test
##
## data: p$skin
## W = 0.93151, p-value = 3.08e-07
# shapiro-wilkis
shapiro.test(p$skin)
##
## Shapiro-Wilk normality test
##
## data: p$skin
## W = 0.93646, p-value = 1.14e-07
# Frequency (Absolute) of diabetic and non-diabetic
table(p$type)
##
## No Yes
## 132 68
# Frequency (Relative)
prop.table(table(p$type))
##
## No Yes
## 0.66 0.34
Barplot is the graphical representation of a frequency table
# Absolute
barplot(table(p$type))
# Relative
barplot(prop.table(table(p$type)))
Elements in the data-frame can be selected by subsetting rows and/or columns
# 6th row and 4th column
p[6,4]
## [1] 27
# age and bmi for women with more than 60 yrs old
p[which(p$age>60),c('age','bmi')]
## age bmi
## 9 63 32.4
## 80 62 26.5
## 157 62 34.7
# selecting age, glucose and blood pressure for diabetic aged more than 60
diab.aged<-subset(p,type=='Yes' & age>60,select=c('age','glu','bp'))
diab.aged
## age glu bp
## 157 62 197 70
it is possible to create new variables in R, by recoding existing ones
p$bmi.range<-cut(p$bmi,breaks = c(-Inf,25,30,35,Inf),labels = c('underweight','normal','overweight','obese'))
table(p$bmi.range)
##
## underweight normal overweight obese
## 26 42 67 65
The command sample is useful for drawing samples from a vector
# creating a list of index for women in Pima dataset
indexes<-1:dim(p)[1]
# sampling 15 women
sample.indexes<-sample(indexes,15,replace=F)
sample.indexes
## [1] 29 83 82 73 30 28 46 90 52 164 9 84 151 23 105
# Displaying information on 15 sampled women
p[sample.indexes,]
## npreg glu bp skin bmi ped age type bmi.range
## 29 5 108 72 43 36.1 0.263 33 No obese
## 83 6 125 78 31 27.6 0.565 49 Yes normal
## 82 4 154 72 29 31.3 0.338 37 No overweight
## 73 8 181 68 36 30.1 0.615 60 Yes overweight
## 30 2 110 74 29 32.4 0.698 27 No overweight
## 28 0 140 65 26 42.6 0.431 24 Yes obese
## 46 2 125 60 20 33.8 0.088 31 No overweight
## 90 1 130 60 23 28.6 0.692 21 No normal
## 52 2 83 66 23 32.2 0.497 22 No overweight
## 164 4 95 60 32 35.4 0.284 28 No obese
## 9 3 142 80 15 32.4 0.200 63 No overweight
## 84 10 125 70 26 31.1 0.205 41 Yes overweight
## 151 12 88 74 40 35.3 0.378 48 No obese
## 23 4 83 86 19 29.3 0.317 34 No normal
## 105 4 154 62 31 32.8 0.237 23 No overweight
# categorized boxplot: skin thickness by diabetic condition (Yes/No)
boxplot(p$skin~p$type)
Student’s t-test for two independent samples
# t-test assuming different variances
t.test(p$skin~p$type)
##
## Welch Two Sample t-test
##
## data: p$skin by p$type
## t = -3.3421, df = 122.23, p-value = 0.001104
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -9.415489 -2.410715
## sample estimates:
## mean in group No mean in group Yes
## 27.20455 33.11765
# t-test assuming equal variances
t.test(p$skin~p$type,var.equal=TRUE)
##
## Two Sample t-test
##
## data: p$skin by p$type
## t = -3.4712, df = 198, p-value = 0.0006361
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -9.272397 -2.553806
## sample estimates:
## mean in group No mean in group Yes
## 27.20455 33.11765
reg = lm(skin~type,data=p)
summary(reg)
##
## Call:
## lm(formula = skin ~ type, data = p)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.118 -8.205 -0.205 6.817 65.882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.2045 0.9933 27.388 < 2e-16 ***
## typeYes 5.9131 1.7035 3.471 0.000636 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.41 on 198 degrees of freedom
## Multiple R-squared: 0.05736, Adjusted R-squared: 0.0526
## F-statistic: 12.05 on 1 and 198 DF, p-value: 0.0006361
#purl("Rtutorial01.Rmd", output = "Rtutorial01.R", documentation = 1)
# cross-tabulation bmi range vs. diabetic status
tab<-table(p$bmi.range,p$type)
tab
##
## No Yes
## underweight 24 2
## normal 34 8
## overweight 39 28
## obese 35 30
# frequency relative to rows
prop.table(tab,margin=1)
##
## No Yes
## underweight 0.92307692 0.07692308
## normal 0.80952381 0.19047619
## overweight 0.58208955 0.41791045
## obese 0.53846154 0.46153846
# frequency relative to columns
prop.table(tab,margin=2)
##
## No Yes
## underweight 0.18181818 0.02941176
## normal 0.25757576 0.11764706
## overweight 0.29545455 0.41176471
## obese 0.26515152 0.44117647
# Performing chi-square and storing in a object
Q<-chisq.test(tab)
Q
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 18.295, df = 3, p-value = 0.0003824
# Observed Frequencies
Q$obs
##
## No Yes
## underweight 24 2
## normal 34 8
## overweight 39 28
## obese 35 30
# Expected Frequencies
Q$exp
##
## No Yes
## underweight 17.16 8.84
## normal 27.72 14.28
## overweight 44.22 22.78
## obese 42.90 22.10
# Residuals
Q$residuals
##
## No Yes
## underweight 1.6511916 -2.3005410
## normal 1.1927874 -1.6618642
## overweight -0.7849846 1.0936885
## obese -1.2061420 1.6804707
# A linear regression in R that uses Glucose as a dependent variable, Age and BMI as independent variable
fit<-lm(glu~age+bmi,data=p)
# summary of the regression
summary(fit)
##
## Call:
## lm(formula = glu ~ age + bmi, data = p)
##
## Residuals:
## Min 1Q Median 3Q Max
## -77.694 -17.289 -3.253 18.797 82.306
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.1584 12.1151 5.378 2.12e-07 ***
## age 0.9244 0.1914 4.829 2.75e-06 ***
## bmi 0.9016 0.3427 2.630 0.0092 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.38 on 197 degrees of freedom
## Multiple R-squared: 0.1479, Adjusted R-squared: 0.1392
## F-statistic: 17.09 on 2 and 197 DF, p-value: 1.43e-07