Data Science Tools - R

Sushanta Pradhan
13th June 2014

Installation

R
- http://cran.r-project.org/
RStudio
- http://www.rstudio.com/products/rstudio/download/

Data Types

Vector (homogeneous) / List (heterogeneous)

# Atomic vectors are homogeneous: c() combines values of a single type.
numeric_vector <- c(1, 2, 3, 4)
character_vector <- c("a", "b", "c")

Matrix (homogeneous) / Data Frame (heterogeneous)

# 2 x 2 matrix filled column by column; row and column labels are attached
# in the same call through `dimnames`.
m <- matrix(
  1:4,
  nrow = 2,
  ncol = 2,
  dimnames = list(LETTERS[1:2], letters[1:2])
)
m

  a b
A 1 3
B 2 4

Data Types contd ..

factor

# numeric vector with repeated values
x <- c(1,2,3,1,3,2,3)
# convert to a factor: the distinct values (1, 2, 3) become the levels
as.factor(x)

[1] 1 2 3 1 3 2 3
Levels: 1 2 3

Subsetting

indexing

# positional subsetting: R vectors are 1-indexed, so this selects the
# first element
x <- c(2.1, 4.2, 3.3, 5.4)
x[1]

[1] 2.1

logical

# logical subsetting: `x < 4` yields a TRUE/FALSE mask and only the
# elements where it is TRUE are kept
x <- c(2.1, 4.2, 3.3, 5.4)
x[x < 4]

[1] 2.1 3.3

Functional Programming

sapply/lapply

# sapply() calls `+` once per element of x, passing 1 as the second operand,
# and simplifies the results back into a numeric vector
x <- c(2.1, 4.2, 3.3, 5.4)
sapply(x, `+` , 1)

[1] 3.1 5.2 4.3 6.4

# A data frame is a list of columns, so sapply() visits each column and
# returns the per-column totals as a named vector.
x <- data.frame(x = 1:3, y = 4:6)
sapply(x, sum)

 x  y 
 6 15

Functional Programming contd..

tapply

# Twelve rounded random draws (mean 10, sd 2) and a repeating group label
# 1, 2, 3; tapply() then sums x within each group.
x <- round(rnorm(n = 12, mean = 10, sd = 2))
y <- rep(1:3, times = 4)
df <- data.frame(x = x, y = y)
with(df, tapply(x, y, sum))

 1  2  3 
46 36 44

Functional Programming contd..

transform

  x y
1 1 3
2 2 4

# `df` is the two-row data frame printed above; define it so the call
# reproduces the output shown below.
df <- data.frame(x = 1:2, y = 3:4)
# transform() returns a copy of df with column y replaced by its square
transform(df, y = y^2)

  x  y
1 1  9
2 2 16

Debugging

# Function factory: power(k) returns a function that raises its argument
# to the k-th power.
#
# exponent: the power applied by the returned function.
# Returns: a closure taking one argument `n` and computing n^exponent.
power <- function(exponent) {
  # Arguments are evaluated lazily; force() pins the value now so that a
  # later change to the caller's variable cannot alter the closure.
  force(exponent)
  function(n) {
    n^exponent
  }
}
# Flag power() for debugging: each later call to it is stepped through
# in the browser.
debug(power)

square <- power(2)

cube <- power(3)

Getting data using a connection

# Create ONE connection. Each constructor returns a new connection object,
# so reassigning `con` without closing the previous one leaks it.
con <- url("http://www.jhsph.edu", "r")
# con <- file('somefile.txt')    # alternative: local text file
# con <- gzfile("words.gz")      # alternative: gzip-compressed file

# read the first 10 lines from the connection
data <- readLines(con, 10)
# if the connected resource is a csv file, read it with read.csv() INSTEAD of
# readLines(); reading twice from the same open connection would continue
# from where the first read stopped
# data <- read.csv(con)

# always release the connection when done
close(con)

Getting Data from Database

install.packages("RMySQL")
library(RMySQL)

# Open a connection to the hg19 database on the public UCSC MySQL server.
# The database must be selected here because the table reads below run
# against this connection.
hg19 <- dbConnect(MySQL(), user = "genome", dbname = "hg19",
                  host = "genome-mysql.cse.ucsc.edu")

#read the whole table
affyData <- dbReadTable(hg19, "affyU133Plus2")
head(affyData)

#execute query
query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
affyMis <- fetch(query)

# release the server-side result set, then close the connection
dbClearResult(query)
dbDisconnect(hg19)

Getting Data using APIs

# oauth_app(), sign_oauth1.0() and GET() come from the httr package
library(httr)

# register the application credentials ("xxx" values are placeholders)
myapp <- oauth_app("twitter", key = "xxx", secret = "xxx")
# build the OAuth 1.0 signature from the user's token and token secret
sig <- sign_oauth1.0(myapp, token = "xxx", token_secret = "xxx")
# signed GET request for the home timeline
homeTL <- GET("https://api.twitter.com/1.1/statuses/home_timeline.json", sig)

Reading JSON data

library(jsonlite)
# serialise the iris data frame to a pretty-printed JSON string
myjson <- toJSON(iris, pretty=TRUE)
# parse the JSON text back into an R object
jsonData <- fromJSON(myjson)
# list the names of the parsed object
names(jsonData)

Caret Package

Some preprocessing (cleaning)
- preProcess
Data splitting
- createDataPartition
- createResample
- createTimeSlices
Training/testing functions
- train
- predict
Model comparison
- confusionMatrix

Data Splitting

# the spam data set ships with kernlab, so that package must be loaded
# before data(spam)
library(caret); library(kernlab); data(spam)
# row indices for a 75% training split drawn on the outcome spam$type;
# list = FALSE returns the indices as a matrix rather than a list
inTrain <- createDataPartition(y = spam$type,
                               p = 0.75, list = FALSE)
training <- spam[inTrain, ]
# negative indexing keeps every row NOT selected for training
testing <- spam[-inTrain, ]
dim(training)

K-folds

# 10 folds; each list element holds the row indices of the TRAINING part
set.seed(32323)
trainFolds <- createFolds(y = spam$type, k = 10,
                          list = TRUE, returnTrain = TRUE)

# Reset the seed so the same fold assignment is drawn again: each element of
# testFolds is then the held-out complement of the matching trainFolds entry.
set.seed(32323)
testFolds <- createFolds(y = spam$type, k = 10,
                         list = TRUE, returnTrain = FALSE)

data(iris)
# train() takes the resampling specification through `trControl`.
# For method = "cv" the number of folds is `number`; `repeats` only applies
# to "repeatedcv".
model <- train(Species ~ ., data = iris,
               trControl = trainControl(method = "cv", number = 10))

Training Options

# show the formal arguments (and their defaults) accepted by trainControl()
args(trainControl)

method
- boot = bootstrapping
- boot632 = bootstrapping with adjustment
- cv = cross validation
- repeatedcv = repeated cross validation
- LOOCV = leave one out cross validation

Training Options contd ..

number
- For boot/cross validation
- Number of subsamples to take
repeats
- Number of times to repeat subsampling
- If big this can slow things down

Pre Process

library(caret)
# Estimate centering/scaling parameters from every column except the 5th,
# then apply them to those same columns.
preObj <- preProcess(iris[, -5], method = c("center", "scale"))
trainiris <- predict(preObj, newdata = iris[, -5])

# Same preprocessing, requested directly inside train().
model <- train(
  Species ~ .,
  data = iris,
  preProcess = c("center", "scale")
)

Pre Process contd (remove constant features)..

library(caret)
# saveMetrics = TRUE returns the per-column diagnostics instead of just the
# positions of the near-zero-variance columns.
nsv <- nearZeroVar(iris, saveMetrics = TRUE)
# the 3rd and 4th metric columns, selected by name
nsv[, c("zeroVar", "nzv")]

             zeroVar   nzv
Sepal.Length   FALSE FALSE
Sepal.Width    FALSE FALSE
Petal.Length   FALSE FALSE
Petal.Width    FALSE FALSE
Species        FALSE FALSE

Pre Process contd (remove redundant features) ..

# log10(x + 1) transform of the predictors (column 58 is excluded --
# presumably the `type` outcome), reduced to two principal components.
preProc <- preProcess(log10(training[, -58] + 1), method = "pca", pcaComp = 2)
trainPC <- predict(preProc, log10(training[, -58] + 1))
# x/y interface: the outcome is not a column of trainPC, so it is passed
# separately instead of being referenced as `training$type` inside a formula.
modelFit <- train(x = trainPC, y = training$type, method = "glm")

# apply the preprocessing fitted on the training set to the test set
testPC <- predict(preProc, log10(testing[, -58] + 1))
# confusionMatrix() expects the predictions as `data` and the true classes
# as `reference`
confusionMatrix(data = predict(modelFit, testPC), reference = testing$type)

Linear Regression

library(ISLR); library(ggplot2); library(caret);
# drop logwage so the model cannot use a transform of the outcome itself
data(Wage); Wage <- subset(Wage, select = -c(logwage))
# 70/30 split drawn on the outcome
inTrain <- createDataPartition(y = Wage$wage,
                               p = 0.7, list = FALSE)
training <- Wage[inTrain, ]; testing <- Wage[-inTrain, ]
modFit <- train(wage ~ ., method = "lasso", data = training,
                preProcess = c("center", "scale"))
# Compute training residuals from the predictions: this works for any
# method, whereas `finalModel$residuals` only exists for some model types
# (e.g. lm) and is absent for the lasso fit.
trainResiduals <- training$wage - predict(modFit, newdata = training)
hist(trainResiduals)

Types of errors

True positive (TP) = correctly identified
False positive (FP) = incorrectly identified
True negative (TN) = correctly rejected
False negative (FN) = incorrectly rejected

Accuracy metrics

Sensitivity = TP/(TP+FN)
Specificity = TN/(FP+TN)
Positive Predictive Value = TP/(TP+FP)
Negative Predictive Value = TN/(TN+FN)
Total Accuracy = (TP+TN)/(TP+FP+FN+TN)

Accuracy Measures

Mean squared error (or root mean squared error)

Continuous data, sensitive to outliers
Sensitivity (recall)

If you want few missed positives
Specificity

If you want few negatives called positives
Accuracy

Weights false positives/negatives equally

Resources

https://google.github.io/styleguide/Rguide.html

http://adv-r.had.co.nz/

http://www.r-bloggers.com/?s=Web+Scraping

http://cran.r-project.org/