Steps for using R with RStudio code editor

The basics steps for programming in R using the RStudio code editor are :

Install R
Install RStudio
Open Rstudio
File > New File > Rscript

R can be downloaded from http://cran-r.project.org and RStudio can be downloaded from http://www.rstudio.org/

Types of Objects in R

R commands can be directly typed in R console but RStudio as a code editor will keep a record of all operations in R. The operations in R are highly dependent on the type of object is being manipulated.

Vector, Matrices, Data-Frames and Lists

Here we list 4 types of objects in an increasing order of complexity.

# Vector
vec1<-c(0,3,2,1,3,5)
vec1

## [1] 0 3 2 1 3 5

vec2<-c('A','B','C')
vec2

## [1] "A" "B" "C"

# Matrix
mat<-matrix(c(0,3,2,1,3,5),ncol=2)
mat

##      [,1] [,2]
## [1,]    0    1
## [2,]    3    3
## [3,]    2    5

set.seed(123)# this command controls the random generation
# Data Frame
df<-data.frame(age=rnorm(10,30,5),gender=rep(c('M','F'),5))
df

##         age gender
## 1  27.19762      M
## 2  28.84911      F
## 3  37.79354      M
## 4  30.35254      F
## 5  30.64644      M
## 6  38.57532      F
## 7  32.30458      M
## 8  23.67469      F
## 9  26.56574      M
## 10 27.77169      F

# List
ls<-list(vec1 = vec1, vec2 = vec2 ,mat = mat,df = df)
ls

## $vec1
## [1] 0 3 2 1 3 5
## 
## $vec2
## [1] "A" "B" "C"
## 
## $mat
##      [,1] [,2]
## [1,]    0    1
## [2,]    3    3
## [3,]    2    5
## 
## $df
##         age gender
## 1  27.19762      M
## 2  28.84911      F
## 3  37.79354      M
## 4  30.35254      F
## 5  30.64644      M
## 6  38.57532      F
## 7  32.30458      M
## 8  23.67469      F
## 9  26.56574      M
## 10 27.77169      F

Numerical, Character and Factor

# numerical
vec1

## [1] 0 3 2 1 3 5

class(vec1)

## [1] "numeric"

# character
vec2

## [1] "A" "B" "C"

class(vec2)

## [1] "character"

# factor
df$gender

##  [1] M F M F M F M F M F
## Levels: F M

class(df$gender)

## [1] "factor"

Import and Exporting Data

First let’s export data from R to csv file

Exporting Data

# load a data frame named iris
data(iris)

# write iris data frame to a csv file
write.csv(iris,file='~/Desktop/irisdata.csv') 

# if it fails because of the path do simply: 
# write.csv(iris,file='irisdata.csv')

Importing Data

# load the same iris data from the exported csv file
ir<-read.csv('~/Desktop/irisdata.csv') 

# check the object class
class(ir)

## [1] "data.frame"

Datasets in R

In R there is a list of datasets that can be used for learning purposes. To see the list of datasets, just type:

Example Dataset: Diabetes in Pima Indian Women

# loading a library with functions and datasets 
library(MASS)

# loading a data set called Pima.tr
data(Pima.tr)

# dimensions of the data frame (number of rows / number of columns)
dim(Pima.tr)

## [1] 200   8

# displaying the first 6 rows of the dataset
head(Pima.tr)

##   npreg glu bp skin  bmi   ped age type
## 1     5  86 68   28 30.2 0.364  24   No
## 2     7 195 70   33 25.1 0.163  55  Yes
## 3     5  77 82   41 35.8 0.156  35   No
## 4     0 165 76   43 47.9 0.259  26   No
## 5     0 107 60   25 26.4 0.133  23   No
## 6     5  97 76   27 35.6 0.378  52  Yes

# displaying more information about the dataset
?Pima.tr

# naming the dataset in a simple way
p<-Pima.tr

# displaying the names of the objects in the data set
names(p)

## [1] "npreg" "glu"   "bp"    "skin"  "bmi"   "ped"   "age"   "type"

# showing the class of the object
class(p)

## [1] "data.frame"

Descriptive Univariate Analysis

# basic summary of the data
summary(p)

##      npreg            glu              bp              skin      
##  Min.   : 0.00   Min.   : 56.0   Min.   : 38.00   Min.   : 7.00  
##  1st Qu.: 1.00   1st Qu.:100.0   1st Qu.: 64.00   1st Qu.:20.75  
##  Median : 2.00   Median :120.5   Median : 70.00   Median :29.00  
##  Mean   : 3.57   Mean   :124.0   Mean   : 71.26   Mean   :29.21  
##  3rd Qu.: 6.00   3rd Qu.:144.0   3rd Qu.: 78.00   3rd Qu.:36.00  
##  Max.   :14.00   Max.   :199.0   Max.   :110.00   Max.   :99.00  
##       bmi             ped              age         type    
##  Min.   :18.20   Min.   :0.0850   Min.   :21.00   No :132  
##  1st Qu.:27.57   1st Qu.:0.2535   1st Qu.:23.00   Yes: 68  
##  Median :32.80   Median :0.3725   Median :28.00            
##  Mean   :32.31   Mean   :0.4608   Mean   :32.11            
##  3rd Qu.:36.50   3rd Qu.:0.6160   3rd Qu.:39.25            
##  Max.   :47.90   Max.   :2.2880   Max.   :63.00

Central Tendency Measures for BMI

# mean
mean(p$bmi)

## [1] 32.31

# median
median(p$bmi)

## [1] 32.8

# mode
table(p$bmi)[which.max(table(p$bmi))]

## 32 
##  6

Dispersion Measures for BMI

# variance
var(p$bmi)

## [1] 37.5795

# standard deviation
sd(p$bmi)

## [1] 6.130212

# IQR
IQR(p$bmi)

## [1] 8.925

Quantiles for BMI

# 5th percentile
quantile(p$bmi,0.05)

##     5% 
## 22.485

# 95th percentile
quantile(p$bmi,0.95)

##    95% 
## 42.615

Histogram of Skin Thickness

# A frequency histogram
hist(p$skin)

# A density histogram
hist(p$skin,prob=T)

# Adding a theoretical normal curve
m<-mean(p$skin)
s<-sd(p$skin)
lower<-min(p$skin)-s
upper<-max(p$skin)+s
curve(dnorm(x,m,s),lower,upper,add=T,col='red')

Assuming that skin thickness is a normally distributed random variable with mean and standard deviation as observed in the sample:

# draw a sample of n=10 individuals from this normal population
rnorm(10,m,s)

##  [1] 43.566862 33.433671 33.913883 30.512710 22.697988 50.165831 35.052095
##  [8]  6.157212 37.438113 23.671713

# evaluating the probability of skin thickness less than 10
pnorm(10,m,s)

## [1] 0.05062093

# evaluating the 90th percentile
qnorm(0.9,m,s)

## [1] 44.24067

# evaluating the density of the normal distribution at 10
dnorm(10,m,s)

## [1] 0.008883473

Is the skin thickness normaly distributed ?

# load the library nortest
library(nortest)

# q-q plot : a graphic that compares the theoretical quantiles to the observed quantiles
qqnorm(scale(p$skin))

# applying 4 different tests of normality

# anderson-darling
ad.test(p$skin)

## 
##  Anderson-Darling normality test
## 
## data:  p$skin
## A = 0.80136, p-value = 0.03738

# cramer-von-mises
cvm.test(p$skin)

## 
##  Cramer-von Mises normality test
## 
## data:  p$skin
## W = 0.087268, p-value = 0.1658

# pearson chi-square
pearson.test(p$skin)

## 
##  Pearson chi-square normality test
## 
## data:  p$skin
## P = 29.67, p-value = 0.008471

# shapiro-francia
sf.test(p$skin)

## 
##  Shapiro-Francia normality test
## 
## data:  p$skin
## W = 0.93151, p-value = 3.08e-07

# shapiro-wilkis
shapiro.test(p$skin)

## 
##  Shapiro-Wilk normality test
## 
## data:  p$skin
## W = 0.93646, p-value = 1.14e-07

Frequency Tables

# Frequency (Absolute) of diabetic and non-diabetic
table(p$type)

## 
##  No Yes 
## 132  68

# Frequency (Relative)
prop.table(table(p$type))

## 
##   No  Yes 
## 0.66 0.34

Barplot is the graphical representation of a frequency table

# Absolute
barplot(table(p$type))

# Relative
barplot(prop.table(table(p$type)))

Subsetting

Elements in the data-frame can be selected by subsetting rows and/or columns

Basic selection

# 6th row and 4th column
p[6,4]

## [1] 27

Subset of rows and columns

# age and bmi for women with more than 60 yrs old
p[which(p$age>60),c('age','bmi')]

##     age  bmi
## 9    63 32.4
## 80   62 26.5
## 157  62 34.7

Extracting a data-frame according to specified criteria

# selecting age, glucose and blood pressure for diabetic aged more than 60
diab.aged<-subset(p,type=='Yes' & age>60,select=c('age','glu','bp'))
diab.aged

##     age glu bp
## 157  62 197 70

Recoding

it is possible to create new variables in R, by recoding existing ones

p$bmi.range<-cut(p$bmi,breaks = c(-Inf,25,30,35,Inf),labels = c('underweight','normal','overweight','obese'))
table(p$bmi.range)

## 
## underweight      normal  overweight       obese 
##          26          42          67          65

Random Sampling

The command sample is useful for drawing samples from a vector

# creating a list of index for women in Pima dataset
indexes<-1:dim(p)[1]

# sampling 15 women
sample.indexes<-sample(indexes,15,replace=F)
sample.indexes

##  [1]  29  83  82  73  30  28  46  90  52 164   9  84 151  23 105

# Displaying information on 15 sampled women
p[sample.indexes,]

##     npreg glu bp skin  bmi   ped age type  bmi.range
## 29      5 108 72   43 36.1 0.263  33   No      obese
## 83      6 125 78   31 27.6 0.565  49  Yes     normal
## 82      4 154 72   29 31.3 0.338  37   No overweight
## 73      8 181 68   36 30.1 0.615  60  Yes overweight
## 30      2 110 74   29 32.4 0.698  27   No overweight
## 28      0 140 65   26 42.6 0.431  24  Yes      obese
## 46      2 125 60   20 33.8 0.088  31   No overweight
## 90      1 130 60   23 28.6 0.692  21   No     normal
## 52      2  83 66   23 32.2 0.497  22   No overweight
## 164     4  95 60   32 35.4 0.284  28   No      obese
## 9       3 142 80   15 32.4 0.200  63   No overweight
## 84     10 125 70   26 31.1 0.205  41  Yes overweight
## 151    12  88 74   40 35.3 0.378  48   No      obese
## 23      4  83 86   19 29.3 0.317  34   No     normal
## 105     4 154 62   31 32.8 0.237  23   No overweight

Bivariate Analysis : Skin Thickness vs. Diabetes

# categorized boxplot: skin thickness by diabetic condition (Yes/No)
boxplot(p$skin~p$type)

Inference

Student’s t-test for two independent samples

Case 1: Different Variances

# t-test assuming different variances
t.test(p$skin~p$type)

## 
##  Welch Two Sample t-test
## 
## data:  p$skin by p$type
## t = -3.3421, df = 122.23, p-value = 0.001104
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -9.415489 -2.410715
## sample estimates:
##  mean in group No mean in group Yes 
##          27.20455          33.11765

Case 2: Equal Variances

# t-test assuming equal variances
t.test(p$skin~p$type,var.equal=TRUE)

## 
##  Two Sample t-test
## 
## data:  p$skin by p$type
## t = -3.4712, df = 198, p-value = 0.0006361
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -9.272397 -2.553806
## sample estimates:
##  mean in group No mean in group Yes 
##          27.20455          33.11765

T-test and Regression

reg = lm(skin~type,data=p)
summary(reg)

## 
## Call:
## lm(formula = skin ~ type, data = p)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.118  -8.205  -0.205   6.817  65.882 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  27.2045     0.9933  27.388  < 2e-16 ***
## typeYes       5.9131     1.7035   3.471 0.000636 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.41 on 198 degrees of freedom
## Multiple R-squared:  0.05736,    Adjusted R-squared:  0.0526 
## F-statistic: 12.05 on 1 and 198 DF,  p-value: 0.0006361

#purl("Rtutorial01.Rmd", output = "Rtutorial01.R", documentation = 1)

Bivariate Analysis : BMI Range vs. Diabetes

Chi-Square Test

# cross-tabulation bmi range vs. diabetic status
tab<-table(p$bmi.range,p$type)
tab

##              
##               No Yes
##   underweight 24   2
##   normal      34   8
##   overweight  39  28
##   obese       35  30

# frequency relative to rows
prop.table(tab,margin=1)

##              
##                       No        Yes
##   underweight 0.92307692 0.07692308
##   normal      0.80952381 0.19047619
##   overweight  0.58208955 0.41791045
##   obese       0.53846154 0.46153846

# frequency relative to columns
prop.table(tab,margin=2)

##              
##                       No        Yes
##   underweight 0.18181818 0.02941176
##   normal      0.25757576 0.11764706
##   overweight  0.29545455 0.41176471
##   obese       0.26515152 0.44117647

# Performing chi-square and storing in a object
Q<-chisq.test(tab)
Q

## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 18.295, df = 3, p-value = 0.0003824

# Observed Frequencies
Q$obs

##              
##               No Yes
##   underweight 24   2
##   normal      34   8
##   overweight  39  28
##   obese       35  30

# Expected Frequencies
Q$exp

##              
##                  No   Yes
##   underweight 17.16  8.84
##   normal      27.72 14.28
##   overweight  44.22 22.78
##   obese       42.90 22.10

# Residuals
Q$residuals

##              
##                       No        Yes
##   underweight  1.6511916 -2.3005410
##   normal       1.1927874 -1.6618642
##   overweight  -0.7849846  1.0936885
##   obese       -1.2061420  1.6804707

Multivariate Analysis : Glucose vs. BMI and Age

# A linear regression in R that uses Glucose as a dependent variable, Age and BMI as independent variable
fit<-lm(glu~age+bmi,data=p)

# summary of the regression
summary(fit)

## 
## Call:
## lm(formula = glu ~ age + bmi, data = p)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -77.694 -17.289  -3.253  18.797  82.306 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  65.1584    12.1151   5.378 2.12e-07 ***
## age           0.9244     0.1914   4.829 2.75e-06 ***
## bmi           0.9016     0.3427   2.630   0.0092 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29.38 on 197 degrees of freedom
## Multiple R-squared:  0.1479, Adjusted R-squared:  0.1392 
## F-statistic: 17.09 on 2 and 197 DF,  p-value: 1.43e-07

R Tutorial Part I

Joel Correa da Rosa

January 25, 2017