Sushanta Pradhan
13th June 2014
Installation
# Atomic vectors are built with c(): one numeric, one character.
numeric_vector <- c(1, 2, 3, 4)
# NOTE(review): the identifier is spelled "charecter" in the original; kept
# unchanged so any existing reference to it still resolves.
charecter_vector <- c("a", "b", "c")
# 2 x 2 integer matrix filled column-wise, with row names A, B and column
# names a, b supplied at construction time.
m <- matrix(
  1:4,
  nrow = 2,
  ncol = 2,
  dimnames = list(LETTERS[1:2], letters[1:2])
)
m
a b
A 1 3
B 2 4
# Turn a numeric vector with repeated values into a factor; the distinct
# values become the levels.
x <- c(1, 2, 3, 1, 3, 2, 3)
factor(x)
[1] 1 2 3 1 3 2 3
Levels: 1 2 3
# Positional indexing: R vectors are 1-based, so x[1] is the first element.
x <- c(2.1, 4.2, 3.3, 5.4)
x[1]
[1] 2.1
# Logical subsetting: `x < 4` yields a TRUE/FALSE mask, and indexing with it
# keeps only the elements where the mask is TRUE.
x <- c(2.1, 4.2, 3.3, 5.4)
x[x < 4]
[1] 2.1 3.3
# Apply a function to every element of a vector and simplify the result back
# to a vector: here each element is incremented by one.
x <- c(2.1, 4.2, 3.3, 5.4)
sapply(x, function(value) value + 1)
[1] 3.1 5.2 4.3 6.4
# A data frame is a list of columns, so sapply() visits each column in turn;
# this returns one sum per column, named after the column.
x <- data.frame(x = 1:3, y = 4:6)
sapply(x, sum)
x y
6 15
# Twelve rounded draws from a normal distribution (mean 10, sd 2), each tagged
# with a group id cycling through 1, 2, 3.
x <- round(rnorm(12, mean = 10, sd = 2))
y <- rep(1:3, times = 4)
df <- data.frame(x = x, y = y)
# Sum of x within each group of y.
with(df, tapply(x, y, sum))
1 2 3
46 36 44
x y
1 1 3
2 2 4
# The listing just above shows `df` as a two-row frame (x = 1, 2; y = 3, 4),
# but the only `df` built earlier in this file has twelve random rows. Rebuild
# the two-row frame here so the call below reproduces the printed result.
df <- data.frame(x = 1:2, y = 3:4)
# transform() returns a modified copy: y is replaced by its square, x is kept.
transform(df, y = y^2)
x y
1 1 9
2 2 16
Debugging
# Function factory: returns a function that raises its argument to `exponent`.
#
# exponent: the power to raise to (e.g. 2 for squaring).
# Returns a one-argument function(n) computing n^exponent.
power <- function(exponent) {
  # Arguments are evaluated lazily; without force() the closure would pick up
  # whatever the caller's variable holds when the returned function is first
  # called, not the value passed here.
  force(exponent)
  function(n) {
    n^exponent
  }
}
# Flag `power` for debugging: every later call to it runs under the
# step-through browser until undebug(power) is called.
debug(power)
square <- power(2)
cube <- power(3)
# A connection can point at a URL, a plain file or a gzip-compressed file.
# Create only ONE of them: assigning all three in a row discards the earlier
# ones, and the url() connection (opened with "r") would never be closed.
# con <- url("http://www.jhsph.edu", "r")
# con <- file('somefile.txt')
con <- gzfile("words.gz")
# Read the first 10 lines as a character vector.
data <- readLines(con, 10)
# if the connected resource is a csv file, parse it into a data frame instead
data <- read.csv(con)
# Always release the connection when done.
close(con)
# Install RMySQL only when it is missing, then attach it so MySQL(),
# dbConnect() and the other database functions are in scope.
if (!requireNamespace("RMySQL", quietly = TRUE)) {
  install.packages("RMySQL")
}
library(RMySQL)

# Server-level connection to the UCSC genome MySQL host.
ucscDb <- dbConnect(MySQL(), user = "genome",
                    host = "genome-mysql.cse.ucsc.edu")
# The table reads below use `hg19`, which the original never created; open a
# connection bound to the hg19 database for them.
hg19 <- dbConnect(MySQL(), user = "genome", db = "hg19",
                  host = "genome-mysql.cse.ucsc.edu")

# read the whole table
affyData <- dbReadTable(hg19, "affyU133Plus2")
head(affyData)

# execute query
query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
affyMis <- fetch(query)

# Release the pending result set and close both connections.
dbClearResult(query)
dbDisconnect(hg19)
dbDisconnect(ucscDb)
Getting Data using APIs
# oauth_app(), sign_oauth1.0() and GET() come from httr, which was never
# attached in the original snippet.
library(httr)

# Register the application, sign requests with the user's token, then fetch
# the home timeline as JSON. The "xxx" values are placeholders for real
# credentials.
myapp <- oauth_app("twitter", key = "xxx", secret = "xxx")
sig <- sign_oauth1.0(myapp, token = "xxx", token_secret = "xxx")
homeTL <- GET("https://api.twitter.com/1.1/statuses/home_timeline.json", sig)
Reading JSON data
library(jsonlite)

# Round trip: serialise the iris data frame to pretty-printed JSON text, parse
# that text back into an R object, and list its names.
myjson <- toJSON(iris, pretty = TRUE)
jsonData <- fromJSON(myjson)
names(jsonData)
Caret Package
Data Splitting
library(caret)
# The `spam` dataset ships with kernlab, not caret, so load it from that
# package explicitly; a bare data(spam) fails when only caret is attached.
data(spam, package = "kernlab")

# Stratified 75% / 25% split on the outcome; list = FALSE makes inTrain a
# matrix of row indices usable directly for subsetting.
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
dim(training)
K-folds
# 10-fold split, requested twice: once as training-row indices per fold and
# once as held-out-row indices per fold.
set.seed(32323)
trainFolds <- createFolds(y = spam$type, k = 10,
                          list = TRUE, returnTrain = TRUE)
# Re-seed before the second call so both calls start from the same random
# state; otherwise the two lists describe different partitions.
# NOTE(review): this assumes createFolds() draws the same folds for the same
# seed regardless of returnTrain - confirm against the caret version in use.
set.seed(32323)
testFolds <- createFolds(y = spam$type, k = 10,
                         list = TRUE, returnTrain = FALSE)
data(iris)
# 10-fold cross-validation. The resampling specification goes in train()'s
# `trControl` argument (an argument called `trainControl` is not recognised
# and falls through `...`), and the number of folds is trainControl()'s
# `number`; `repeats` only applies to method = "repeatedcv".
model <- train(Species ~ ., data = iris,
               trControl = trainControl(method = "cv", number = 10))
Training Options
# Print trainControl()'s formal arguments and their defaults to see which
# resampling options are available.
args(trainControl)
Training Options (contd.)
Pre Process
library(caret)

# Estimate centring and scaling parameters from the iris columns other than
# column 5, then apply them to those same columns.
preObj <- preProcess(iris[, -5], method = c("center", "scale"))
trainiris <- predict(preObj, newdata = iris[, -5])

# Alternative: ask train() to apply the same centring and scaling as part of
# model fitting.
model <- train(
  Species ~ .,
  data = iris,
  preProcess = c("center", "scale")
)
Pre Process (contd.): remove constant features
library(caret)

# Per-column variance diagnostics; saveMetrics = TRUE returns the metrics
# table rather than just the indices of the flagged columns.
nsv <- nearZeroVar(iris, saveMetrics = TRUE)
# Show only the two flag columns (the 3rd and 4th of the metrics table).
nsv[, c("zeroVar", "nzv")]
zeroVar nzv
Sepal.Length FALSE FALSE
Sepal.Width FALSE FALSE
Petal.Length FALSE FALSE
Petal.Width FALSE FALSE
Species FALSE FALSE
Pre Process (contd.): remove redundant features
# Compress the log-transformed predictors into two principal components.
# Column 58 is dropped as the non-predictor column (presumably the `type`
# outcome used below - confirm against the data).
preProc <- preProcess(log10(training[, -58] + 1), method = "pca", pcaComp = 2)
trainPC <- predict(preProc, log10(training[, -58] + 1))

# The outcome is not a column of trainPC, so pass predictors and outcome
# explicitly through the x/y interface instead of a formula that reaches into
# another data frame.
modelFit <- train(x = trainPC, y = training$type, method = "glm")

# Project the test set with the components learned on the training set.
testPC <- predict(preProc, log10(testing[, -58] + 1))

# confusionMatrix(data, reference): predictions go first, true classes second;
# swapping them transposes the table and mislabels sensitivity/specificity.
confusionMatrix(predict(modelFit, testPC), testing$type)
Linear Regression
library(ISLR)
library(ggplot2)
library(caret)

# Wage data without the log-transformed copy of the outcome.
data(Wage)
Wage <- subset(Wage, select = -c(logwage))

# 70% / 30% train/test split on the outcome.
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training <- Wage[inTrain, ]
testing <- Wage[-inTrain, ]

modFit <- train(wage ~ ., method = "lasso", data = training,
                preProcess = c("center", "scale"))

# Compute training residuals from the model's own predictions rather than
# reading `modFit$finalModel$residuals`.
# NOTE(review): the lasso final model does not appear to expose a `residuals`
# component (unlike an lm fit), in which case the original hist() call
# receives NULL and fails - confirm against the installed model package.
hist(training$wage - predict(modFit, newdata = training),
     main = "Training residuals", xlab = "wage - predicted wage")
Types of errors
Accuracy metrics
Accuracy Measures
Mean squared error (or root mean squared error)
Continuous data, sensitive to outliers
Sensitivity (recall)
If you want few missed positives
Specificity
If you want few negatives called positives
Accuracy
Weights false positives/negatives equally