# Homework 6 ----

# answer 3
# 0.87 / sqrt(140): presumably a standard error s / sqrt(n) with s = 0.87 and
# n = 140 -- confirm against the problem statement.
0.87/sqrt(140)
## [1] 0.07352842
# answer 4
# qnorm(0.975) is the 97.5% quantile of the standard normal distribution,
# i.e. the critical value for a two-sided 95% interval.
qnorm(0.975)
## [1] 1.959964
# Lower / upper bound of 35 -/+ z * 11.6 (11.6 used directly as the spread).
35 - 11.6 * qnorm(0.975)
## [1] 12.26442
35 + 11.6 * qnorm(0.975)
## [1] 57.73558
# Same bounds with 11.6 first divided by sqrt(100) -- presumably the standard
# error of a mean for n = 100; confirm against the problem statement.
35 - ((11.6 / sqrt(100)) * qnorm(0.975))
## [1] 32.72644
35 + ((11.6 / sqrt(100)) * qnorm(0.975))
## [1] 37.27356
# answer 5
# Two numeric samples of unequal size (12 and 10 observations).
a <- c(12.2,11.1,14.0,11.3,10.8,12.5,12.2,11.9,13.6,12.7,13.4,13.7)
b <- c(11.9,10.7,12.3,13.9,11.1,11.2,13.3,11.4,12.0,11.1)
# Side-by-side boxplots of the two samples.
boxplot(a,b)

# Two-sample t-test with default arguments; as the echoed output shows, this
# is the Welch (unequal variance) test with a two-sided alternative.
t.test(a,b)
## 
##  Welch Two Sample t-test
## 
## data:  a and b
## t = 1.2472, df = 19.515, p-value = 0.2271
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3781372  1.4981372
## sample estimates:
## mean of x mean of y 
##     12.45     11.89
# answer 6
# 2x2 contingency table. matrix() fills column-wise, so column 1 is
# (112, 18) and column 2 is (124, 37).
m <- matrix(c(112, 18, 124, 37), ncol = 2)
#m <- matrix(c(50,50, 60, 60), ncol = 2)
# Pearson chi-squared test of independence. correct = FALSE disables the
# continuity correction that chisq.test() applies to 2x2 tables by default.
# (See ?chisq.test interactively; the help lookup is not part of the script.)
chisq.test(m, correct = FALSE)
## 
##  Pearson's Chi-squared test
## 
## data:  m
## X-squared = 3.9158, df = 1, p-value = 0.04783
# answer 7
# Raw data: ';'-separated records of the form "(id)PERCENT%,VALUE".
a <- '(1)21%,6.8; (2)12%,10.3; (3)30%,1.7; (4)8%,14.2; (5)10%,8.8; (6)26%,5.8; (7)42%,2.1; (8)31%,3.3; (9)21%,4.3; (10)15%,9.0; (11)19%,3.2; (12)6%,12.7; (13)18%,8.2; (14)12%,7.0; (15)23%,5.1; (16)34%,4.1'
# Split into records, then split each record at the comma into two fields.
# Fields are picked per record so a malformed record cannot shift later rows.
records <- trimws(strsplit(a, ';')[[1]])
fields <- strsplit(records, ',')
area <- data.frame(
  V1 = vapply(fields, function(e) e[1], character(1)),
  V2 = vapply(fields, function(e) e[2], character(1)),
  stringsAsFactors = FALSE
)

# Keep only the percentage digits: drop the "(id)" prefix and the trailing '%'.
# The capture group uses \\d+ so the whole number is captured explicitly.
area$V1 <- as.integer(gsub('\\(\\d+\\)(\\d+)%', '\\1', area$V1))

# \\w => word characters: letters, digits and underscore
# \\d => digits
# [a-z] => abcdefghijklmnopqrstuvwxyz
# ()  => group

gsub('[a-z]+(\\d+)[a-z]+', '\\1', 'abc123def')
## [1] "123"
gsub('([a-z]+)(\\d+)([a-z]+)', '\\3', 'abc123def')
## [1] "def"
gsub( '\\(\\d+\\)(\\d+)%' , '\\1' ,'(1)21%')
## [1] "21"
area$V2 <- as.numeric(area$V2)
# Correlation between the percentage (V1) and the value (V2).
cor(area)
##            V1         V2
## V1  1.0000000 -0.8477157
## V2 -0.8477157  1.0000000
plot(V1 ~ V2, data = area)
# Simple linear regression of V1 on V2.
fit <- lm(V1 ~ V2, data = area)
fit
## 
## Call:
## lm(formula = V1 ~ V2, data = area)
## 
## Coefficients:
## (Intercept)           V2  
##      36.062       -2.336
# Scatter plot again, this time with the fitted line overlaid.
plot(V1 ~ V2, data = area)
abline(fit, col= 'red')

# Exact string comparison only recognises one specific phone number.
a <- '0912345678'
b <- '0912345555'
a == '0912345678'
## [1] TRUE
b == '0912345678'
## [1] FALSE
## Regular Expression
# grepl(pattern, x) returns TRUE when the pattern matches somewhere in x.
grepl('3', '3')
## [1] TRUE
grepl('5', '3')
## [1] FALSE
# [] => match any character within the []
grepl('[0123456789]', '3')
## [1] TRUE
# \\d => [0123456789]
grepl('\\d', '3')
## [1] TRUE
grepl('[abcdefghijklmnopqrstuvwxyz]', 'w')
## [1] TRUE
grepl('[abcdefghijklmnopqrstuvwxyz]', 'W')
## [1] FALSE
# The upper-case half must list every letter A-Z (including 'M').
grepl('[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]', 'W')
## [1] TRUE
# [a-z] =>[abcdefghijklmnopqrstuvwxyz]
# [a-zA-Z] => [abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]

grepl('[a-zA-Z]', 'W')
## [1] TRUE
grepl('[a-zA-Z]', '3')
## [1] FALSE
# \\w => [a-zA-Z0-9_] (word characters: letters, digits and underscore)
grepl('\\w', '3')
## [1] TRUE
# .   => any single character (letters, digits, punctuation, ...)
grepl('.', '$')
## [1] TRUE
# {n} => match exactly n repetitions of the preceding item
grepl('\\d{3}', '123')
## [1] TRUE
# {m,n} => match at least m and at most n repetitions of the preceding item
grepl('\\d{3,6}', '12345')
## [1] TRUE
grepl('\\d{0,}', '12345')
## [1] TRUE
# {0,} => *
grepl('\\d*', '12345')
## [1] TRUE
grepl('\\d{1,}', '12345')
## [1] TRUE
# {1,} => +
grepl('\\d+', '12345')
## [1] TRUE
# match phone number
grepl('09\\d{8}', '0922333555')
## [1] TRUE
grepl('09\\d{8}', '0922-333555')
## [1] FALSE
grepl('09\\d{2}-{0,1}\\d{6}', '0922-333555')
## [1] TRUE
grepl('09\\d{2}-{0,1}\\d{6}', '0922333555')
## [1] TRUE
# {0,1} => ?
grepl('09\\d{2}-?\\d{6}', '0922-333555')
## [1] TRUE
grepl('09\\d{2}-?\\d{6}', '0922-333-555')
## [1] FALSE
grepl('09\\d{2}-?\\d{3}-?\\d{3}', '0922-333-555')
## [1] TRUE
# Without anchors the pattern only has to match a substring, so an
# over-long digit string is still accepted.
grepl('09\\d{2}-?\\d{3}-?\\d{3}', '092233355529492034902840234')
## [1] TRUE
# ^ => match the beginning
# $ => match the end
grepl('^09\\d{2}-?\\d{3}-?\\d{3}$', '092233355529492034902840234')
## [1] FALSE
grepl('\\(\\d+\\)\\d+%' ,'(1)21%')
## [1] TRUE
# (): group; \\1, \\2, ... in the replacement refer to the 1st, 2nd, ... group
gsub('\\(\\d+\\)(\\d+)%', '\\1' ,'(1)21%')
## [1] "21"
gsub('\\((\\d+)\\)(\\d+)%', '\\1' ,'(1)21%')
## [1] "1"
gsub('\\((\\d+)\\)(\\d+)%', '\\2' ,'(1)21%')
## [1] "21"
gsub('\\((\\d+)\\)(\\d+)%', '\\1' ,'(123)231%')
## [1] "123"
gsub('\\((\\d+)\\)(\\d+)%', '\\2' ,'(123)231%')
## [1] "231"

# Feature Selection ----

# Download the Wisconsin breast cancer data set; the file has no header row.
url <- 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'

bc_data <- read.csv(url, header = FALSE)

class(bc_data)
str(bc_data)

# Attach descriptive column names (11 columns, the last one is the label).
bc_colnames <- c(
  "sample_code_number",
  "clump_thickness",
  "uniformity_of_cell_size",
  "uniformity_of_cell_shape",
  "marginal_adhesion",
  "single_epithelial_cell_size",
  "bare_nuclei",
  "bland_chromatin",
  "normal_nucleoli",
  "mitosis",
  "classes"
)
colnames(bc_data) <- bc_colnames

head(bc_data)
str(bc_data)

# Recode the label: 2 -> "benign", 4 -> "malignant", anything else -> NA,
# then store it as a factor.
class_labels <- c("2" = "benign", "4" = "malignant")
bc_data$classes <- unname(class_labels[as.character(bc_data$classes)])
bc_data$classes <- as.factor(bc_data$classes)

# "?" marks a missing value in the raw file: turn it into NA and count.
bc_data[bc_data == "?"] <- NA
sum(is.na(bc_data))

# Drop every row that contains a missing value; the count should now be 0.
bc_data <- na.omit(bc_data)
sum(is.na(bc_data))


library(tidyr)
library(ggplot2)
# Reshape the nine feature columns to long form (x = feature name,
# y = value) and draw one density panel per feature, split by class.
ggplot(
  gather(bc_data, x, y, clump_thickness:mitosis),
  aes(x = y, color = classes, fill = classes)
) +
  geom_density(alpha = 0.3) +
  facet_wrap(~ x, scales = "free", ncol = 3)


#install.packages("FSelector")
#library(FSelector)


library(caret)
# 10-fold cross-validation repeated 3 times, shared by both models below.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# NOTE(review): `classes ~ .` uses every other column as a predictor,
# including sample_code_number -- confirm that the ID column is intended.

# Seed the RNG so fold assignment (and therefore the importance ranking)
# is reproducible, consistent with the seeded steps elsewhere in this file.
set.seed(42)
# Decision tree (rpart) on scaled predictors.
model <- train(classes ~ ., data = bc_data, method = "rpart",
               preProcess = "scale", trControl = control)

# Unscaled variable importance for the tree model.
importance <- varImp(model, scale = FALSE)
importance


# Random forest on scaled predictors; reseed so this fit is reproducible
# independently of the tree fit above.
set.seed(42)
model <- train(classes ~ ., data = bc_data, method = "rf",
               preProcess = "scale", trControl = control)
importance <- varImp(model, scale = FALSE)
importance
plot(importance)

#install.packages('rminer')
library(rminer)
# Fit an SVM with rminer, using every other column as a predictor.
model <- fit(classes ~ ., bc_data, model = "svm")

# Input importance of the fitted model, computed with method "sensv".
VariableImportance <- Importance(model, bc_data, method = "sensv")

#VariableImportance
# Repackage the importance result in the list layout passed to mgraph().
L <- list(
  runs = 1,
  sen = t(VariableImportance$imp),
  sresponses = VariableImportance$sresponses
)

# Importance bar chart, labelled with the data frame's column names.
mgraph(L, graph = "IMP", leg = names(bc_data), col = "gray", Grid = 10)


# bare_nuclei was read as text because the raw file uses "?" for missing
# values. Convert via as.character() first: if the column is a factor (the
# read.csv default before R 4.0), as.integer() alone would return the internal
# level codes rather than the recorded values.
bc_data$bare_nuclei <- as.integer(as.character(bc_data$bare_nuclei))
# Column 11 is the class label; everything else is now numeric.
str(bc_data[, -11])
cor(bc_data[, -11])


library(corrplot)
# Correlation matrix of all columns except the class label (column 11),
# drawn with variables ordered by hierarchical clustering.
corMatMy <- cor(bc_data[, -11])
corrplot(corMatMy, order = "hclust")

# Indices of columns flagged by caret::findCorrelation() at a 0.7 cutoff.
# The search is run once (verbose output is printed once) and reused.
highlyCorIdx <- findCorrelation(corMatMy, cutoff = 0.7, verbose = TRUE)
highlyCorIdx
highlyCor <- colnames(corMatMy)[highlyCorIdx]

highlyCor
# Keep the columns that were not flagged; drop = FALSE guarantees a data
# frame even if only a single column survives.
bc_data_cor <- bc_data[, !colnames(bc_data) %in% highlyCor, drop = FALSE]
str(bc_data_cor)


library(MASS)
# Logistic regression of the class label on every other column.
model <- glm(
  formula = classes ~ .,
  family = binomial(),
  data = bc_data
)
summary(model)
# Stepwise model selection by AIC, starting from the full model.
model.step <- stepAIC(object = model)
summary(model.step)



# Score a candidate feature subset by k-fold cross-validated accuracy of an
# rpart classification tree predicting `classes` in the global `bc_data`.
#
# subset: character vector of predictor column names.
# Returns the mean accuracy (1 - misclassification rate) over the k folds.
evaluator <- function(subset) {
  k <- 5
  # Fixed seed: every candidate subset is scored on the same fold assignment.
  set.seed(42)
  ind <- sample(k, nrow(bc_data), replace = TRUE)
  # The formula does not depend on the fold, so build it once.
  tree_formula <- as.simple.formula(subset, 'classes')
  results <- vapply(seq_len(k), function(i) {
    train <- bc_data[ind != i, ]
    test  <- bc_data[ind == i, ]
    # Fit on the training fold only; fitting on all of bc_data would let the
    # held-out rows leak into the model and inflate the accuracy estimate.
    tree <- rpart(tree_formula, train)
    error.rate <- sum(test$classes != predict(tree, test, type = "class")) / nrow(test)
    1 - error.rate
  }, numeric(1))
  mean(results)
}

# Load three FSelector source files straight from the CRAN GitHub mirror
# (the library(FSelector) call earlier in this file is commented out).
# NOTE(review): this executes remote code from the moving `master` branch on
# every run and needs network access -- consider pinning a commit or
# installing the package instead.
source('https://raw.githubusercontent.com/cran/FSelector/master/R/search.hill.climbing.R')

source('https://raw.githubusercontent.com/cran/FSelector/master/R/misc.R')

source('https://raw.githubusercontent.com/cran/FSelector/master/R/search.misc.R')

# rpart is used inside evaluator(), so it must be attached before the search.
library(rpart)
# Search over all column names except the label, scoring each candidate
# subset with evaluator().
attr.subset <- hill.climbing.search(names(bc_data)[!names(bc_data) %in% "classes"], evaluator)



# Turn the selected attribute names into a formula with `classes` as response
# (as.simple.formula is presumably defined by the sourced files -- confirm).
f<-as.simple.formula(attr.subset, "classes")
print(f)



library(caret)
# Recursive feature elimination with random-forest helper functions and
# 10-fold cross-validation, trying subset sizes 1 through 9. Column 11 (the
# class label) is removed from the predictors and passed separately as y.
set.seed(42)
results_rfe <- rfe(
  x = bc_data[, -11],
  y = bc_data$classes,
  sizes = 1:9,
  rfeControl = rfeControl(functions = rfFuncs, method = "cv", number = 10)
)
results_rfe

# Principal Component Analysis ----

# PCA on every column except the class label (column 11), with centring and
# unit-variance scaling. The argument is spelled `scale.` in full instead of
# relying on partial matching of `scale`.
bc_data.pca <- prcomp(bc_data[, -11], center = TRUE, scale. = TRUE)
bc_data.pca
summary(bc_data.pca)
# Project the first observation onto the principal components.
predict(bc_data.pca, head(bc_data[-11], 1))

# Scree plots: variance of each component as bars and as a line
# (the type is spelled "lines" in full).
screeplot(bc_data.pca, type = "barplot")
screeplot(bc_data.pca, type = "lines")

# Component standard deviations, their variances, and which components
# have variance greater than 1.
bc_data.pca$sdev
bc_data.pca$sdev ^ 2
which(bc_data.pca$sdev ^ 2 > 1)


# Scree plot again with a reference line at variance = 1.
screeplot(bc_data.pca, type = "lines")
abline(h = 1, col = "red", lty = 3)



# Observations in the plane of the first two components, labelled by row name.
plot(bc_data.pca$x[, 1], bc_data.pca$x[, 2], xlim = c(-4, 4))
text(bc_data.pca$x[, 1], bc_data.pca$x[, 2],
     rownames(bc_data.pca$x), cex = 0.7, pos = 4, col = "red")
biplot(bc_data.pca)


# Scores of all observations on the principal components.
res <- predict(bc_data.pca, bc_data[-11])

# Store the first component score and compare its distribution by class.
bc_data$pc1 <- res[, 1]

boxplot(pc1 ~ classes, data = bc_data)